1 /*
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 2002, 2014 Oracle and/or its affiliates.  All rights reserved.
5  *
6  */
7 
8 package com.sleepycat.je.dbi;
9 
10 import java.util.Arrays;
11 import java.util.ArrayList;
12 import java.util.List;
13 import java.util.Comparator;
14 import java.util.Set;
15 import java.util.logging.Level;
16 
17 import com.sleepycat.je.CacheMode;
18 import com.sleepycat.je.DatabaseEntry;
19 import com.sleepycat.je.DatabaseException;
20 import com.sleepycat.je.DuplicateDataException;
21 import com.sleepycat.je.EnvironmentFailureException;
22 import com.sleepycat.je.LockConflictException;
23 import com.sleepycat.je.OperationStatus;
24 import com.sleepycat.je.evictor.Evictor.EvictionSource;
25 import com.sleepycat.je.latch.LatchSupport;
26 import com.sleepycat.je.log.LogUtils;
27 import com.sleepycat.je.log.ReplicationContext;
28 import com.sleepycat.je.tree.BIN;
29 import com.sleepycat.je.tree.BINBoundary;
30 import com.sleepycat.je.tree.IN;
31 import com.sleepycat.je.tree.Key;
32 import com.sleepycat.je.tree.LN;
33 import com.sleepycat.je.tree.SearchResult;
34 import com.sleepycat.je.tree.TrackingInfo;
35 import com.sleepycat.je.tree.Tree;
36 import com.sleepycat.je.tree.TreeWalkerStatsAccumulator;
37 import com.sleepycat.je.txn.LockGrantType;
38 import com.sleepycat.je.txn.LockInfo;
39 import com.sleepycat.je.txn.LockManager;
40 import com.sleepycat.je.txn.LockResult;
41 import com.sleepycat.je.txn.LockType;
42 import com.sleepycat.je.txn.Locker;
43 import com.sleepycat.je.txn.LockerFactory;
44 import com.sleepycat.je.txn.WriteLockInfo;
45 import com.sleepycat.je.utilint.DbLsn;
46 import com.sleepycat.je.utilint.LoggerUtils;
47 import com.sleepycat.je.utilint.Pair;
48 import com.sleepycat.je.utilint.StatGroup;
49 import com.sleepycat.je.utilint.TestHook;
50 import com.sleepycat.je.utilint.TestHookExecute;
51 import com.sleepycat.je.utilint.ThroughputStatGroup;
52 import com.sleepycat.je.utilint.VLSN;
53 
54 /**
55  * A CursorImpl is the internal implementation of the cursor.
56  */
57 public class CursorImpl implements Cloneable {
58 
59     private static final boolean DEBUG = false;
60 
61     private static final byte CURSOR_NOT_INITIALIZED = 1;
62     private static final byte CURSOR_INITIALIZED = 2;
63     private static final byte CURSOR_CLOSED = 3;
64     private static final String TRACE_DELETE = "Delete";
65     private static final String TRACE_MOD = "Mod:";
66     private static final String TRACE_INSERT = "Ins:";
67 
68     public static final int FOUND = 0x1;
69     /* Exact match on the key portion. */
70     public static final int EXACT_KEY = 0x2;
71     /* Record found is the last one in the databaseImpl. */
72     public static final int FOUND_LAST = 0x4;
73 
74     /*
75      * Allocate hashCode ids from this. [#13896]
76      */
77     private static long lastAllocatedId = 0;
78 
79     /*
80      * Unique id that we can return as a hashCode to prevent calls to
81      * Object.hashCode(). [#13896]
82      */
83     private final int thisId;
84 
85     /* The databaseImpl behind the handle. */
86     private final DatabaseImpl databaseImpl;
87 
88     /* Owning transaction. */
89     private Locker locker;
90 
91     private final boolean retainNonTxnLocks;
92 
93     private final boolean isSecondaryCursor;
94 
95     /*
96      * Cursor location in the databaseImpl, represented by a BIN and an index
97      * in the BIN.  The bin is null if not established, and the index is
98      * negative if not established.
99      */
100     private volatile BIN bin;
101     private volatile int index;
102 
103     /* State of the cursor. See CURSOR_XXX above. */
104     private byte status;
105 
106     private CacheMode cacheMode;
107     private boolean allowEviction;
108 
109     /*
110      * A cache of the record version for the operation at the current position.
111      * Is null if the cursor is uninitialized.  For a secondary cursor, is the
112      * version of the primary record.
113      */
114     private RecordVersion currentRecordVersion;
115 
116     private ThreadLocal<TreeWalkerStatsAccumulator> treeStatsAccumulatorTL;
117 
118     private TestHook testHook;
119 
120     public enum SearchMode {
121         SET(true, false, "SET"),
122         BOTH(true, true, "BOTH"),
123         SET_RANGE(false, false, "SET_RANGE"),
124         BOTH_RANGE(false, true, "BOTH_RANGE");
125 
126         private final boolean exactSearch;
127         private final boolean dataSearch;
128         private final String name;
129 
SearchMode(boolean exactSearch, boolean dataSearch, String name)130         private SearchMode(boolean exactSearch,
131                            boolean dataSearch,
132                            String name) {
133             this.exactSearch = exactSearch;
134             this.dataSearch = dataSearch;
135             this.name = "SearchMode." + name;
136         }
137 
138         /**
139          * Returns true when the key or key/data search is exact, i.e., for SET
140          * and BOTH.
141          */
isExactSearch()142         public final boolean isExactSearch() {
143             return exactSearch;
144         }
145 
146         /**
147          * Returns true when the data value is included in the search, i.e.,
148          * for BOTH and BOTH_RANGE.
149          */
isDataSearch()150         public final boolean isDataSearch() {
151             return dataSearch;
152         }
153 
154         @Override
toString()155         public String toString() {
156             return name;
157         }
158     }
159 
160     /**
161      * Creates a cursor with retainNonTxnLocks=true, isSecondaryCursor=false.
162      * These are the standard settings for an internal cursor.
163      */
CursorImpl(DatabaseImpl database, Locker locker)164     public CursorImpl(DatabaseImpl database, Locker locker) {
165         this(database, locker,
166              true /*retainNonTxnLocks*/,
167              false /*isSecondaryCursor*/);
168     }
169 
170     /**
171      * Creates a cursor.
172      *
173      * A cursor always retains transactional locks when it is reset or closed.
174      * Non-transaction locks may be retained or not, depending on the
175      * retainNonTxnLocks parameter value.
176      *
177      * Normally a user-created non-transactional Cursor releases locks on reset
178      * and close, and a ThreadLocker is normally used.  However, by passing
179      * true for retainNonTxnLocks a ThreadLocker can be made to retain locks;
180      * this capability is used by SecondaryCursor.readPrimaryAfterGet.
181      *
182      * For internal (non-user) cursors, a BasicLocker is often used and locks
183      * are retained. In these internal use cases the caller explicitly calls
184      * BasicLocker.operationEnd() after the cursor is closed, and
185      * retainNonTxnLocks is set to true to prevent the locks acquired by the
186      * BasicLocker from being released when the cursor is closed.
187      *
188      * BasicLocker is also used for NameLN operations while opening a Database
189      * handle.  Database handle locks must be retained, even if the Database is
190      * opened non-transactionally.
191      *
192      * @param retainNonTxnLocks is true if non-transactional locks should be
193      * retained (not released automatically) when the cursor is reset or
194      * closed.
195      *
196      * @param isSecondaryCursor whether to treat this cursor as a secondary
197      * cursor, e.g., secondary records don't have record versions.
198      */
CursorImpl( DatabaseImpl databaseImpl, Locker locker, boolean retainNonTxnLocks, boolean isSecondaryCursor)199     public CursorImpl(
200         DatabaseImpl databaseImpl,
201         Locker locker,
202         boolean retainNonTxnLocks,
203         boolean isSecondaryCursor) {
204 
205         thisId = (int) getNextCursorId();
206         bin = null;
207         index = -1;
208 
209         this.retainNonTxnLocks = retainNonTxnLocks;
210         this.isSecondaryCursor = isSecondaryCursor;
211 
212         /* Associate this cursor with the databaseImpl. */
213         this.databaseImpl = databaseImpl;
214         this.locker = locker;
215         this.locker.registerCursor(this);
216 
217         /*
218          * This default value is used only when the CursorImpl is used directly
219          * (mainly for internal databases).  When the CursorImpl is created by
220          * a Cursor, CursorImpl.setCacheMode will be called.
221          */
222         this.cacheMode = CacheMode.DEFAULT;
223 
224         status = CURSOR_NOT_INITIALIZED;
225 
226         /*
227          * Do not perform eviction here because we may be synchronized on the
228          * Database instance. For example, this happens when we call
229          * Database.openCursor().  Also eviction may be disabled after the
230          * cursor is constructed.
231          */
232     }
233 
234     /**
235      * Performs a shallow copy and returns the new cursor.
236      *
237      * @param samePosition If true, this cursor's position is used for the new
238      * cursor, and addCursor is called on the new cursor to register it with
239      * the current BIN.  If false, the new cursor will be uninitialized.
240      */
cloneCursor(final boolean samePosition)241     public CursorImpl cloneCursor(final boolean samePosition) {
242 
243         assert assertCursorState(
244             false /*mustBeInitialized*/, false /*mustNotBeInitialized*/);
245 
246         CursorImpl ret = null;
247         try {
248             latchBIN();
249 
250             ret = (CursorImpl) super.clone();
251 
252             if (!retainNonTxnLocks) {
253                 ret.locker = locker.newNonTxnLocker();
254             }
255 
256             ret.locker.registerCursor(ret);
257 
258             if (samePosition) {
259                 ret.addCursor();
260             } else {
261                 ret.clear();
262             }
263         } catch (CloneNotSupportedException cannotOccur) {
264             return null;
265         } finally {
266             releaseBIN();
267         }
268 
269         /* Perform eviction before and after each cursor operation. */
270         criticalEviction();
271 
272         return ret;
273     }
274 
275     /*
276      * Allocate a new hashCode id.  Doesn't need to be synchronized since it's
277      * ok for two objects to have the same hashcode.
278      */
getNextCursorId()279     private static long getNextCursorId() {
280         return ++lastAllocatedId;
281     }
282 
283     @Override
hashCode()284     public int hashCode() {
285         return thisId;
286     }
287 
getLocker()288     public Locker getLocker() {
289         return locker;
290     }
291 
292     /**
293      * Called when a cursor has been duplicated prior to being moved.  The new
294      * locker is informed of the old locker, so that a preempted lock taken by
295      * the old locker can be ignored. [#16513]
296      *
297      * @param closingCursor the old cursor that will be closed if the new
298      * cursor is moved successfully.
299      */
setClosingLocker(CursorImpl closingCursor)300     public void setClosingLocker(CursorImpl closingCursor) {
301 
302         /*
303          * If the two lockers are different, then the old locker will be closed
304          * when the operation is complete.  This is currently the case only for
305          * ReadCommitted cursors and non-transactional cursors that do not
306          * retain locks.
307          */
308         if (!retainNonTxnLocks && locker != closingCursor.locker) {
309             locker.setClosingLocker(closingCursor.locker);
310         }
311     }
312 
313     /**
314      * Called when a cursor move operation is complete.  Clears the
315      * closingLocker so that a reference to the old closed locker is not held.
316      */
clearClosingLocker()317     public void clearClosingLocker() {
318         locker.setClosingLocker(null);
319     }
320 
getCacheMode()321     public CacheMode getCacheMode() {
322         return cacheMode;
323     }
324 
325     /**
326      * Sets the effective cache mode to use for the next operation.  The
327      * cacheMode field will never be set to null or DYNAMIC, and can be passed
328      * directly to latching methods.
329      *
330      * @see #performCacheEviction
331      */
setCacheMode(final CacheMode mode)332     public void setCacheMode(final CacheMode mode) {
333         cacheMode = databaseImpl.getEffectiveCacheMode(mode);
334     }
335 
setTreeStatsAccumulator(TreeWalkerStatsAccumulator tSA)336     public void setTreeStatsAccumulator(TreeWalkerStatsAccumulator tSA) {
337         maybeInitTreeStatsAccumulator();
338         treeStatsAccumulatorTL.set(tSA);
339     }
340 
maybeInitTreeStatsAccumulator()341     private void maybeInitTreeStatsAccumulator() {
342         if (treeStatsAccumulatorTL == null) {
343             treeStatsAccumulatorTL =
344                 new ThreadLocal<TreeWalkerStatsAccumulator>();
345         }
346     }
347 
getTreeStatsAccumulator()348     private TreeWalkerStatsAccumulator getTreeStatsAccumulator() {
349         if (EnvironmentImpl.getThreadLocalReferenceCount() > 0) {
350             maybeInitTreeStatsAccumulator();
351             return treeStatsAccumulatorTL.get();
352         } else {
353             return null;
354         }
355     }
356 
incrementLNCount()357     public void incrementLNCount() {
358         TreeWalkerStatsAccumulator treeStatsAccumulator =
359             getTreeStatsAccumulator();
360         if (treeStatsAccumulator != null) {
361             treeStatsAccumulator.incrementLNCount();
362         }
363     }
364 
getIndex()365     public int getIndex() {
366         return index;
367     }
368 
getBIN()369     public BIN getBIN() {
370         return bin;
371     }
372 
setIndex(int idx)373     public void setIndex(int idx) {
374         index = idx;
375     }
376 
setOnFirstSlot()377     public void setOnFirstSlot() {
378         assert(bin.isLatchOwner());
379         index = 0;
380     }
381 
setOnLastSlot()382     public void setOnLastSlot() {
383         assert(bin.isLatchOwner());
384         index = bin.getNEntries() - 1;
385     }
386 
isOnBIN(BIN bin)387     public boolean isOnBIN(BIN bin) {
388         return this.bin == bin;
389     }
390 
assertBIN(BIN bin)391     public void assertBIN(BIN bin) {
392         assert this.bin == bin :
393             "nodeId=" + bin.getNodeId() +
394             " cursor=" + dumpToString(true);
395     }
396 
isOnSamePosition(CursorImpl other)397     public boolean isOnSamePosition(CursorImpl other) {
398         return bin == other.bin && index == other.index;
399     }
400 
setBIN(BIN newBin)401     public void setBIN(BIN newBin) {
402 
403         /*
404          * Historical note. In the past we checked here that the cursor was
405          * removed for the prior BIN by calling BIN.containsCursor [#16280].
406          * Because the containsCursor method takes a latch on the prior BIN,
407          * this causes a rare latch deadlock when newBin is latched (during an
408          * insert, for example), since this thread will latch two BINs in
409          * arbitrary order; so the assertion was removed [#21395].
410          */
411         bin = newBin;
412     }
413 
latchBIN()414     public void latchBIN()
415         throws DatabaseException {
416 
417         while (bin != null) {
418             BIN waitingOn = bin;
419             waitingOn.latch(cacheMode);
420             if (bin == waitingOn) {
421                 return;
422             }
423             waitingOn.releaseLatch();
424         }
425     }
426 
releaseBIN()427     public void releaseBIN() {
428         if (bin != null) {
429             bin.releaseLatchIfOwner();
430         }
431     }
432 
addCursor(BIN bin)433     void addCursor(BIN bin) {
434         if (bin != null) {
435             assert bin.isLatchExclusiveOwner();
436             bin.addCursor(this);
437         }
438     }
439 
440     /**
441      * Add to the current cursor.
442      */
addCursor()443     void addCursor() {
444         if (bin != null) {
445             addCursor(bin);
446         }
447     }
448 
449     /**
450      * Change cursor to point to the given BIN/index.  If the new BIN is
451      * different, then old BIN must be unlatched and the new BIN must be
452      * latched.
453      */
setPosition(BIN newBin, int newIndex)454     private void setPosition(BIN newBin, int newIndex) {
455         if (bin != newBin) {
456             if (bin != null) {
457                 latchBIN();
458                 bin.removeCursor(this);
459                 bin.releaseLatch();
460             }
461             setBIN(newBin);
462             addCursor();
463         }
464         setIndex(newIndex);
465     }
466 
467     /**
468      * Called for creating trace messages without any latching.
469      */
getCurrentNodeId()470     public long getCurrentNodeId() {
471         final BIN b = bin;
472         return (b == null ? -1 : b.getNodeId());
473     }
474 
getCurrentLsn()475     public long getCurrentLsn() {
476 
477         assert(bin != null && bin.isLatchOwner());
478         assert(index >= 0 && index < bin.getNEntries());
479 
480         return bin.getLsn(index);
481     }
482 
getCurrentKey()483     public byte[] getCurrentKey() {
484         return getCurrentKey(false);
485     }
486 
487     /**
488      * Returns the key at the current position, regardless of whether the
489      * record is deleted.  Does not lock. The key returned is not a copy and
490      * may not be returned directly to the user without copying it first.
491      *
492      * The cursor must be initialized.
493      */
getCurrentKey(boolean isLatched)494     public byte[] getCurrentKey(boolean isLatched) {
495 
496         if (!isLatched) {
497             latchBIN();
498         }
499 
500         try {
501             assert(bin != null);
502             assert(index >= 0 && index < bin.getNEntries());
503 
504             return bin.getKey(index);
505         } finally {
506             if (!isLatched) {
507                 releaseBIN();
508             }
509         }
510     }
511 
setInitialized()512     private void setInitialized() {
513         status = CURSOR_INITIALIZED;
514     }
515 
516     /**
517      * @return true if this cursor is closed
518      */
isClosed()519     public boolean isClosed() {
520         return (status == CURSOR_CLOSED);
521     }
522 
523     /**
524      * @return true if this cursor is not initialized
525      */
isNotInitialized()526     public boolean isNotInitialized() {
527         return (status == CURSOR_NOT_INITIALIZED);
528     }
529 
isInternalDbCursor()530     public boolean isInternalDbCursor() {
531         return databaseImpl.isInternalDb();
532     }
533 
hasDuplicates()534     public boolean hasDuplicates() {
535         return databaseImpl.getSortedDuplicates();
536     }
537 
dumpTree()538     public void dumpTree() {
539         databaseImpl.getTree().dump();
540     }
541 
542     /**
543      * For a non-sticky cursor, this is an alternative to closing or resetting
544      * the cursor when it is initialized and an advancing operation
545      * (next/prev/skip) is about to be performed. The cursor position cannot be
546      * reset as it would be if the operation were a search, for example.
547      */
beforeAdvance()548     public void beforeAdvance() {
549 
550         /*
551          * When EVICT_LN is configured we should evict the LN at the current
552          * position before changing the position. [#21973]
553          *
554          * Note that EVICT_BIN cannot be implemented for a non-sticky cursor.
555          * We would have to call removeCursor before the BIN eviction, and then
556          * our position would be unstable. Even if this were possible, since
557          * we can't predict the new position we would evict the BIN at every
558          * operation, even when the BIN remained the same.
559          *
560          * It might be possible to support MAKE_COLD here, but more thought and
561          * testing would be required.
562          *
563          * Therefore, when EVICT_BIN or MAKE_COLD are configured, cursor
564          * cloning is always used.
565          */
566         if (cacheMode == CacheMode.EVICT_LN && bin != null) {
567             evict();
568         }
569 
570         releaseNonTxnLocks();
571 
572         criticalEviction();
573     }
574 
575     /**
576      * Reset a cursor to an uninitialized state, but unlike close(), allow it
577      * to be used further.
578      */
reset()579     public void reset()
580         throws DatabaseException {
581 
582         /* Must remove cursor before evicting BIN and releasing locks. */
583         removeCursorAndPerformCacheEviction(null /*newCursor*/);
584 
585         releaseNonTxnLocks();
586 
587         /* Perform eviction before and after each cursor operation. */
588         criticalEviction();
589     }
590 
clear()591     private void clear() {
592         bin = null;
593         index = -1;
594         status = CURSOR_NOT_INITIALIZED;
595         currentRecordVersion = null;
596     }
597 
releaseNonTxnLocks()598     private void releaseNonTxnLocks() {
599         if (!retainNonTxnLocks) {
600             locker.releaseNonTxnLocks();
601         }
602     }
603 
close()604     public void close()
605         throws DatabaseException {
606 
607         close(null);
608     }
609 
610     /**
611      * Close a cursor.
612      *
613      * @param newCursor is another cursor that is kept open by the parent
614      * Cursor object, or null if no other cursor is kept open.
615      *
616      * @throws DatabaseException if the cursor was previously closed.
617      */
close(final CursorImpl newCursor)618     public void close(final CursorImpl newCursor)
619         throws DatabaseException {
620 
621         assert assertCursorState(
622             false /*mustBeInitialized*/, false /*mustNotBeInitialized*/);
623 
624         /* Must remove cursor before evicting BIN and releasing locks. */
625         removeCursorAndPerformCacheEviction(newCursor);
626 
627         locker.unRegisterCursor(this);
628 
629         if (!retainNonTxnLocks) {
630             locker.nonTxnOperationEnd();
631         }
632 
633         status = CURSOR_CLOSED;
634 
635         /* Perform eviction before and after each cursor operation. */
636         criticalEviction();
637     }
638 
removeCursorAndPerformCacheEviction(CursorImpl newCursor)639     private void removeCursorAndPerformCacheEviction(CursorImpl newCursor) {
640 
641         latchBIN();
642 
643         if (bin == null) {
644             clear(); // ensure that state is uninitialized
645             return;
646         }
647 
648         try {
649             /* Must remove cursor before evicting BIN. */
650             bin.removeCursor(this);
651             performCacheEviction(newCursor); // may release latch
652         } finally {
653             releaseBIN();
654             clear();
655         }
656     }
657 
658     /**
659      * Disables or enables eviction during cursor operations.  For example, a
660      * cursor used to implement eviction (e.g., in some UtilizationProfile and
661      * most DbTree and VLSNIndex methods) should not itself perform eviction,
662      * but eviction should be enabled for user cursors.  Eviction is disabled
663      * by default.
664      */
setAllowEviction(boolean allowed)665     public void setAllowEviction(boolean allowed) {
666         allowEviction = allowed;
667     }
668 
criticalEviction()669     public void criticalEviction() {
670 
671         /*
672          * In addition to disabling critical eviction for internal cursors (see
673          * setAllowEviction above), we do not perform critical eviction when
674          * EVICT_BIN or MAKE_COLD is used.  Operations using MAKE_COLD and
675          * EVICT_BIN generally do not add any net memory to the cache, so they
676          * shouldn't have to perform critical eviction or block while another
677          * thread performs eviction.
678          */
679         if (allowEviction &&
680             cacheMode != CacheMode.MAKE_COLD &&
681             cacheMode != CacheMode.EVICT_BIN) {
682             databaseImpl.getEnv().criticalEviction(false /*backgroundIO*/);
683         }
684     }
685 
686     /**
687      * When multiple operations are performed, CacheMode-based eviction is
688      * performed for a given operation at the end of the next operation, which
689      * calls close() or reset() on the CursorImpl of the previous operation.
690      * Eviction for the last operation (including when only one operation is
691      * performed) also occurs during Cursor.close(), which calls
692      * CursorImpl.close().
693      *
694      * By default, the CacheMode returned by DatabaseImpl.getCacheMode is used,
695      * and the defaults specified by the user for the Database or Environment
696      * are applied.  However, the default mode can be overridden by the user by
697      * calling Cursor.setCacheMode, and the mode may be changed prior to each
698      * operation, if desired.
699      *
700      * To implement a per-operation CacheMode, two CacheMode fields are
701      * maintained.  Cursor.cacheMode is the mode to use for the next operation.
702      * CursorImpl.cacheMode is the mode that was used for the previous
703      * operation, and that is used for eviction when that CursorImpl is closed
704      * or reset.
705      *
706      * This method must be called with the BIN latched but may release it,
707      * namely when the BIN is evicted.
708      */
performCacheEviction(final CursorImpl newCursor)709     private void performCacheEviction(final CursorImpl newCursor) {
710 
711         EnvironmentImpl envImpl = databaseImpl.getEnv();
712 
713         if (cacheMode != CacheMode.EVICT_LN &&
714             cacheMode != CacheMode.EVICT_BIN &&
715             (cacheMode != CacheMode.MAKE_COLD ||
716              !(envImpl.isCacheFull() || envImpl.wasCacheEverFull()))) {
717             /* Eviction not configured, or for MAKE_COLD, cache never full. */
718             return;
719         }
720 
721         final BIN nextBin;
722         final int nextIndex;
723 
724         if (newCursor != null) {
725             nextBin = newCursor.bin;
726             nextIndex = newCursor.index;
727         } else {
728             nextBin = null;
729             nextIndex = -1;
730         }
731 
732         switch (cacheMode) {
733         case EVICT_LN:
734             /* Evict the LN if we've moved to a new record. */
735             if (bin != nextBin || index != nextIndex) {
736                 evict(true);
737             }
738             break;
739         case EVICT_BIN:
740             if (bin == nextBin) {
741                 /* BIN has not changed, do not evict it.  May evict the LN. */
742                 if (index != nextIndex) {
743                     evict(true);
744                 }
745             } else {
746                 /* BIN has changed, evict it. */
747                 evictBIN();
748             }
749             break;
750         case MAKE_COLD:
751             if (bin == nextBin) {
752                 /* BIN has not changed, do not evict it.  May evict the LN. */
753                 if (index != nextIndex) {
754                     evict(true);
755                 }
756             } else if (!envImpl.isCacheFull()) {
757                 /*
758                  * The BIN has changed, but the cache is not full. Just evict
759                  * the LN.
760                  */
761                 evict(true);
762             } else {
763                 /* BIN has changed, evict it. */
764                 evictBIN();
765             }
766             break;
767         default:
768             assert false;
769         }
770     }
771 
772     /**
773      * Evict current BIN.  Must already be latched. The latch will be released
774      * inside the doEvictOneIN() call.
775      */
evictBIN()776     private void evictBIN() {
777         databaseImpl.getEnv().getEvictor().doEvictOneIN(
778             bin, EvictionSource.CACHEMODE);
779     }
780 
781     /**
782      * Evict the LN node at the cursor position.
783      */
evict()784     public void evict()
785         throws DatabaseException {
786 
787         evict(false); // isLatched
788     }
789 
790     /**
791      * Evict the LN node at the cursor position.
792      */
evict(boolean isLatched)793     public void evict(boolean isLatched)
794         throws DatabaseException {
795 
796         try {
797             if (!isLatched) {
798                 latchBIN();
799             }
800             if (index >= 0) {
801                 bin.evictLN(index);
802             }
803         } finally {
804             if (!isLatched) {
805                 releaseBIN();
806             }
807         }
808     }
809 
deleteCurrentRecord( ReplicationContext repContext)810     public OperationStatus deleteCurrentRecord(
811         ReplicationContext repContext)
812         throws DatabaseException {
813         return deleteCurrentRecord(repContext, null);
814     }
815 
816     /**
817      * Delete the item pointed to by the cursor. If the item is already
818      * deleted, return KEYEMPTY. Returns with nothing latched.
819      *
820      * @param opStats Used to collect stats about user ops performed on BIN
821      * deltas. Internal ops should pass null.
822      *
823      * @return 0 on success, appropriate error code otherwise.
824      */
deleteCurrentRecord( ReplicationContext repContext, ThroughputStatGroup opStats)825     public OperationStatus deleteCurrentRecord(
826         ReplicationContext repContext,
827         ThroughputStatGroup opStats)
828         throws DatabaseException {
829 
830         assert assertCursorState(
831             true /*mustBeInitialized*/, false /*mustNotBeInitialized*/);
832         final EnvironmentImpl envImpl = databaseImpl.getEnv();
833         final DbType dbType = databaseImpl.getDbType();
834 
835         boolean success = false;
836         final long oldLsn;
837         final LN.LogResult logResult;
838 
839         latchBIN();
840 
841         try {
842             /*
843              * Get a write lock.  An uncontended lock is permitted because we
844              * will log a new LN before releasing the BIN latch.
845              */
846             final LockStanding lockStanding = lockLN(
847                 LockType.WRITE, true /*allowUncontended*/, false /*noWait*/);
848 
849             if (!lockStanding.recordExists()) {
850                 /* LN was deleted or cleaned. */
851                 revertLock(lockStanding);
852                 success = true;
853                 return OperationStatus.KEYEMPTY;
854             }
855 
856             oldLsn = lockStanding.lsn;
857             assert oldLsn != DbLsn.NULL_LSN;
858 
859             /*
860              * Must fetch LN if any of the following are true:
861              *  - CLEANER_FETCH_OBSOLETE_SIZE is configured and lastLoggedSize
862              *    is unknown
863              *  - this database does not use the standard LN class and we
864              *    cannot call DbType.createdDeletedLN further below
865              * For other cases, we are careful not to fetch, in order to avoid
866              * a random read during a delete operation.
867              *
868              * Note that we checked for a deleted/cleaned LN above, so we know
869              * that fetchLN will not return null.
870              */
871             final int lastLoggedSize = bin.getLastLoggedSize(index);
872             LN ln;
873             if ((lastLoggedSize == 0 &&
874                  envImpl.getCleaner().getFetchObsoleteSize(databaseImpl)) ||
875                 !dbType.mayCreateDeletedLN()) {
876                 /* Must fetch. */
877                 ln = bin.fetchLN(index, cacheMode);
878             } else {
879                 /* If resident, use LN for obsolete size tracking. */
880                 ln = (LN) bin.getTarget(index);
881             }
882 
883             /*
884              * Make the existing LN deleted, if cached, else create a new
885              * deleted LN.
886              */
887             final long oldLNMemSize;
888             if (ln != null) {
889                 /* Leave LN cached. Set ln.data to null and mark it dirty. */
890                 oldLNMemSize = ln.getMemorySizeIncludedByParent();
891                 ln.delete();
892             } else {
893                 /*
894                  * LN is not cached. Create an LN with ln.data == null, and
895                  * mark it dirty. Then attach it to the tree.
896                  */
897                 ln = dbType.createDeletedLN(envImpl);
898                 oldLNMemSize = ln.getMemorySizeIncludedByParent();
899                 bin.attachNode(index, ln, null /*lnSlotKey*/);
900             }
901 
902             /* Get a wli to log. */
903             WriteLockInfo wli = lockStanding.prepareForUpdate();
904 
905             /* Log the deleted record version and lock its new LSN. */
906             logResult = ln.optionalLog(
907                 envImpl, databaseImpl, bin.getKey(index), oldLsn,
908                 lastLoggedSize, locker, wli, repContext);
909 
910             /*
911              * Now update the parent BIN of the LN to correctly reference the
912              * LN and adjust the memory sizing.
913              */
914             bin.updateEntry(
915                 index, oldLNMemSize, logResult.newLsn, logResult.newSize,
916                 null /*lnSlotKey*/);
917             bin.setPendingDeleted(index);
918 
919             /* Cache record version for delete operation. */
920             setCurrentVersion(ln.getVLSNSequence(), logResult.newLsn);
921 
922             /*
923              * It is desirable to evict the LN after deletion because it will
924              * never be fetched again.  This is especially true because slot
925              * compression is deferred until a full BIN is logged.  But for
926              * deferred-write DBs we should not evict a dirty LN since it may
927              * be logged unnecessarily.
928              */
929             if (!databaseImpl.isDeferredWriteMode() &&
930                 bin.getTarget(index) != null) {
931                 bin.evictLN(index);
932             }
933 
934             locker.addDeleteInfo(bin);
935             success = true;
936 
937         } finally {
938 
939             if (success &&
940                 opStats != null &&
941                 bin != null &&
942                 bin.isBINDelta()) {
943                 opStats.increment(
944                     ThroughputStatGroup.BIN_DELTA_DELETES_OFFSET);
945             }
946 
947             releaseBIN();
948         }
949 
950         trace(Level.FINER, TRACE_DELETE, bin, index, oldLsn, logResult.newLsn);
951 
952         return OperationStatus.SUCCESS;
953     }
954 
955     /**
956      * Modify the current record with the given data, and optionally replace
957      * the key.
958      *
959      * @param key The new key value for the BIN slot S to be updated. Cannot
960      * be partial. For a no-dups DB, it is null. For dups DBs it is a 2-part
961      * key combining the current primary key of slot S with the original,
962      * user-provided data. "key" (if not null) must compare equal to S.key
963      * (otherwise DuplicateDataException is thrown), but the 2 keys may not
964      * be identical if custom comparators are used. So, S.key will actually
965      * be replaced by "key".
966      *
967      * @param data The new data to (perhaps partially) replace the data of the
968      * LN associated with the BIN slot. For dups DBs it is EMPTY_DUPS_DATA.
969      * Note: for dups DBs the original, user-provided "data" must not be
970      * partial.
971      *
972      * @param returnOldData To receive the old LN data (before the update).
973      * It is needed only by DBs with indexes/triggers; will be null otherwise.
974      *
975      * @param returnNewData To receive the full data of the updated LN.
976      * It is needed only by DBs with indexes/triggers and only if "data" is
977      * partial; will be null otherwise. Note: "returnNewData" may be different
978      * than "data" only if "data" is partial.
979      *
980      * @param opStats Used to collect stats about user ops performed on BIN
981      * deltas. Internal ops should pass null.
982      */
updateCurrentRecord( DatabaseEntry key, DatabaseEntry data, DatabaseEntry returnOldData, DatabaseEntry returnNewData, ReplicationContext repContext, ThroughputStatGroup opStats)983     public OperationStatus updateCurrentRecord(
984         DatabaseEntry key,
985         DatabaseEntry data,
986         DatabaseEntry returnOldData,
987         DatabaseEntry returnNewData,
988         ReplicationContext repContext,
989         ThroughputStatGroup opStats) {
990 
991         assert assertCursorState(
992             true /*mustBeInitialized*/, false /*mustNotBeInitialized*/);
993 
994         if (returnOldData != null) {
995             returnOldData.setData(null);
996         }
997         if (returnNewData != null) {
998             returnNewData.setData(null);
999         }
1000 
1001         final LockStanding lockStanding;
1002         OperationStatus status = OperationStatus.NOTFOUND;
1003         boolean success = false;
1004 
1005         latchBIN();
1006 
1007         try {
1008             /* Get a write lock. */
1009             lockStanding = lockLN(
1010                 LockType.WRITE, true /*allowUncontended*/, false /*noWait*/);
1011 
1012             if (!lockStanding.recordExists()) {
1013                 /* LN was deleted or cleaned. */
1014                 revertLock(lockStanding);
1015             } else {
1016                 status = updateRecordInternal(
1017                     (key != null ? Key.makeKey(key) : null), data,
1018                     returnOldData, returnNewData, lockStanding, repContext);
1019             }
1020 
1021             success = true;
1022             return status;
1023 
1024         } finally {
1025 
1026             if (success &&
1027                 opStats != null &&
1028                 bin != null &&
1029                 bin.isBINDelta()) {
1030                 opStats.increment(
1031                     ThroughputStatGroup.BIN_DELTA_UPDATES_OFFSET);
1032             }
1033 
1034             releaseBIN();
1035         }
1036     }
1037 
1038     /**
1039      * Insert the given record (key + LN) in the tree or return KEYEXIST if
1040      * the key is already present.
1041      *
1042      * The cursor must initially be uninitialized.
1043      *
1044      * This method is called directly internally for putting tree map LNs
1045      * and file summary LNs.
1046      */
insertRecord( byte[] key, LN ln, boolean blindInsertion, ReplicationContext repContext)1047     public OperationStatus insertRecord(
1048         byte[] key,
1049         LN ln,
1050         boolean blindInsertion,
1051         ReplicationContext repContext)
1052         throws DatabaseException {
1053 
1054         assert assertCursorState(
1055             false /*mustBeInitialized*/, true /*mustNotBeInitialized*/);
1056         if (LatchSupport.TRACK_LATCHES) {
1057             LatchSupport.expectBtreeLatchesHeld(0);
1058         }
1059 
1060         try {
1061             final Pair<LockStanding, Boolean> result = insertRecordInternal(
1062                 key, ln, blindInsertion, null, repContext);
1063 
1064             if (!result.second()) {
1065                 return OperationStatus.KEYEXIST;
1066             }
1067             return OperationStatus.SUCCESS;
1068         } finally {
1069             releaseBIN();
1070         }
1071     }
1072 
1073     /**
1074      * Insert or update a given record. The method searches for the record
1075      * using its key. It will perform an update if the record is found,
1076      * otherwise an insertion.
1077      *
1078      * The cursor must initially be uninitialized.
1079      *
1080      * Called by all the Cursor.putXXX() ops, except putCurrent().
1081      *
1082      * @param key The new key value for the BIN slot S to be inserted/updated.
1083      * Cannot be partial. For dups DBs it is a 2-part key combining the
1084      * original, user-provided key and data. In case of update, "key" must
1085      * compare equal to S.key (otherwise DuplicateDataException is thrown),
1086      * but the 2 keys may not be identical if custom comparators are used.
1087      * So, S.key will actually be replaced by "key".
1088      *
1089      * @param data In case of update, the new data to (perhaps partially)
1090      * replace the data of the LN associated with the BIN slot. For dups DBs
1091      * it is EMPTY_DUPS_DATA. Note: for dups DBs the original, user-provided
1092      * "data" must not be partial.
1093      *
1094      * @param ln is normally a new LN node that is created for insertion, and
1095      * will be discarded if an update occurs.  However, HA will pass an
1096      * existing node.
1097      *
1098      * @param putMode OVERWRITE or NO_OVERWRITE
1099      *
1100      * @param returnOldData To receive, in case of update, the old LN data
1101      * (before the update). It is needed only by DBs with indexes/triggers;
1102      * will be null otherwise.
1103      *
1104      * @param returnNewData To receive the full data of the new or updated LN.
1105      * It is needed only by DBs with indexes/triggers and only if "data" is
1106      * partial; will be null otherwise. Note: "returnNewData" may be different
1107      * than "data" only if "data" is partial.
1108      *
1109      * @param opStats Used to collect stats about user ops performed on BIN
1110      * deltas. Internal ops should pass null.
1111      *
1112      * @return pair of status and 'inserted' boolean.
1113      */
insertOrUpdateRecord( final DatabaseEntry key, final DatabaseEntry data, final LN ln, final PutMode putMode, final DatabaseEntry returnOldData, final DatabaseEntry returnNewData, final ReplicationContext repContext, ThroughputStatGroup opStats)1114     public Pair<OperationStatus, Boolean> insertOrUpdateRecord(
1115         final DatabaseEntry key,
1116         final DatabaseEntry data,
1117         final LN ln,
1118         final PutMode putMode,
1119         final DatabaseEntry returnOldData,
1120         final DatabaseEntry returnNewData,
1121         final ReplicationContext repContext,
1122         ThroughputStatGroup opStats) {
1123 
1124         assert key != null;
1125         assert data != null;
1126         assert ln != null;
1127         assert putMode != null;
1128         assert assertCursorState(
1129             false /*mustBeInitialized*/, true /*mustNotBeInitialized*/);
1130         if (LatchSupport.TRACK_LATCHES) {
1131             LatchSupport.expectBtreeLatchesHeld(0);
1132         }
1133 
1134         if (putMode != PutMode.OVERWRITE &&
1135             putMode != PutMode.NO_OVERWRITE &&
1136             putMode != PutMode.BLIND_INSERTION) {
1137             throw EnvironmentFailureException.unexpectedState(
1138                 putMode.toString());
1139         }
1140 
1141         boolean success = false;
1142         boolean inserted = false;
1143 
1144         byte[] keyCopy = Key.makeKey(key);
1145 
1146         try {
1147 
1148             /*
1149              * Try to insert the key/data pair as a new record. Will succeed if
1150              * the record does not exist in the DB already. Otherwise, the
1151              * insertRecord() returns with the cursor registered on the slot
1152              * whose key is equal to "key", with the LSN of that slot  ocked
1153              * in WRITE mode, and with the containing BIN latched.
1154              */
1155             Pair<LockStanding, Boolean> insertResult = insertRecordInternal(
1156                 keyCopy, ln,
1157                 (putMode == PutMode.BLIND_INSERTION) /*blindInsertion*/,
1158                 returnNewData, repContext);
1159 
1160             if (insertResult.second()) {
1161                 inserted = true;
1162                 success = true;
1163                 return new Pair<>(OperationStatus.SUCCESS, true);
1164             }
1165 
1166             /*
1167              * There is a non-deleted slot whose key is == "key". So, this is
1168              * going to be an update. Note: Cursor has been registered on the
1169              * existing slot by insertRecord()
1170              */
1171             if (putMode == PutMode.NO_OVERWRITE) {
1172                 success = true;
1173                 return new Pair<>(OperationStatus.KEYEXIST, false);
1174             }
1175 
1176             /*
1177              * Update the non-deleted record at the cursor position. We have
1178              * optimized by preferring to take an uncontended lock. The
1179              * lockStanding var is guaranteed to be non-null in this case.
1180              * The BIN must remain latched when calling this method.
1181              */
1182             final OperationStatus status = updateRecordInternal(
1183                 keyCopy, data, returnOldData, returnNewData,
1184                 insertResult.first(), repContext);
1185 
1186             success = true;
1187             return new Pair<>(status, false);
1188 
1189         } finally {
1190 
1191             if (success &&
1192                 opStats != null &&
1193                 bin != null &&
1194                 bin.isBINDelta()) {
1195                 if (inserted) {
1196                     opStats.increment(
1197                         ThroughputStatGroup.BIN_DELTA_INSERTS_OFFSET);
1198                 } else {
1199                     opStats.increment(
1200                         ThroughputStatGroup.BIN_DELTA_UPDATES_OFFSET);
1201                 }
1202             }
1203 
1204             releaseBIN();
1205         }
1206     }
1207 
1208     /*
1209      * Try to insert the key/data pair as a new record. Will succeed if the
1210      * record does not exist already.
1211      *
1212      * The cursor must initially be uninitialized.
1213      *
1214      * On return, this.bin is latched.
1215      */
insertRecordInternal( final byte[] key, final LN ln, boolean blindInsertion, DatabaseEntry returnNewData, final ReplicationContext repContext)1216     private Pair<LockStanding, Boolean> insertRecordInternal(
1217         final byte[] key,
1218         final LN ln,
1219         boolean blindInsertion,
1220         DatabaseEntry returnNewData,
1221         final ReplicationContext repContext) {
1222 
1223         final EnvironmentImpl envImpl = databaseImpl.getEnv();
1224         WriteLockInfo wli;
1225         LockStanding lockStanding = null;
1226         final boolean isSlotReuse;
1227         final Tree tree = databaseImpl.getTree();
1228 
1229         /*
1230          * At this point, this cursor does not have a position so it cannot be
1231          * registered with the BIN that will be used. This is good because it
1232          * allows slot compression to occur before BIN splits (thus avoiding
1233          * splits if compression finds and removes any deleted slots). However,
1234          * if another cursor, including the one from which this was cloned, is
1235          * registered with the BIN, then splits won't be allowed. This is a
1236          * good reason to use non-sticky cursors for insertions, especially
1237          * sequential insertions since they will often end up in the same BIN.
1238          *
1239          * Find and latch the BIN that should contain the "key". On return from
1240          * the tree search, this.bin is latched, but "this" is still not
1241          * registered.
1242          */
1243         bin = tree.findBinForInsert(key, getCacheMode());
1244 
1245         /*
1246          * In the case where logging occurs before locking, allow lockers to
1247          * reject the operation (e.g., if writing on a replica) and also
1248          * prepare to undo in the (very unlikely) event that logging succeeds
1249          * but locking fails. Call this method BEFORE slot insertion, in case
1250          * it throws an exception which would leave the slot with a null LSN.
1251          *
1252          * For Txn, creates the writeInfo map (if not done already), and
1253          * inserts databaseImpl in the undoDatabases map. Noop for other
1254          * non-HA lockers.
1255          */
1256         locker.preLogWithoutLock(databaseImpl);
1257 
1258         /*
1259          * If the key exists already, insertEntry1() does not insert, but
1260          * returns the index of the existing key.
1261          *
1262          * If bin is a delta and it does not contain the key, then:
1263          * (a) if blindInsertion is false, insertEntry1() will mutate it to a
1264          * full BIN and check again if the key exists or not.
1265          * (b) if blindInsertion is true, insertEntry1() will not mutate the
1266          * delta; it will just insert the key into the delta. This is OK,
1267          * because blindInsertion will be true only if we know already that the
1268          * key does not exist in the tree.
1269          */
1270         int insertIndex = bin.insertEntry1(
1271             ln, key, DbLsn.NULL_LSN, blindInsertion);
1272 
1273         if ((insertIndex & IN.INSERT_SUCCESS) == 0) {
1274             /*
1275              * Key exists. Insertion was not successful. Register the cursor on
1276              * the existing slot. If the slot is marked deleted, the key does
1277              * not really exist and the slot can be reused to do an insertion.
1278              */
1279             isSlotReuse = true;
1280             setIndex(insertIndex);
1281             addCursor();
1282             setInitialized();
1283 
1284             /*
1285              * Lock the LSN for the existing LN slot, and check deleted-ness.
1286              * An uncontended lock request is permitted because we are holding
1287              * the bin latch. If no locker holds a lock on the slot, then no
1288              * lock is taken by this cursor either.
1289              */
1290             lockStanding = lockLN(
1291                 LockType.WRITE, true /*allowUncontended*/, false /*noWait*/);
1292 
1293             boolean isDeleted = !lockStanding.recordExists();
1294 
1295             if (!isDeleted)
1296                 return new Pair<>(lockStanding, false);
1297 
1298             /*
1299              * The current slot has its KD or PD flag set. Note: the record of
1300              * the slot may have been deleted by this.locker itself.
1301              */
1302             assert(lockStanding != null);
1303 
1304             /*
1305              * Create a new WriteLockInfo or get an existing one for the LSN
1306              * of the current slot, and set its abortLSN and abortKD fields,
1307              * if needed, i.e, if it is not the current txn the one who created
1308              * this LSN. The abortLSN and abortKD fields of the wli will be
1309              * included in the new logrec.
1310              */
1311             wli = lockStanding.prepareForUpdate();
1312 
1313         } else {
1314             /*
1315              * Register the cursor at the slot that has been successfully
1316              * inserted.
1317              */
1318             isSlotReuse = false;
1319             setIndex(insertIndex &= ~IN.INSERT_SUCCESS);
1320             addCursor();
1321             setInitialized();
1322 
1323             /* Create a new WriteLockInfo */
1324             wli = LockStanding.prepareForInsert();
1325         }
1326 
1327         /*
1328          * Log the new LN and lock the LSN of the new logrec in WRITE mode.
1329          * Note: in case of slot reuse, we pass NULL_LSN for the oldLsn param
1330          * because the old LN was counted obsolete when it was deleted.
1331          */
1332         LN.LogResult logResult = null;
1333         try {
1334             logResult = ln.optionalLog(
1335                 envImpl, databaseImpl, key,
1336                 DbLsn.NULL_LSN /*oldLSN*/, 0 /*oldSize*/,
1337                 locker, wli, repContext);
1338         } finally {
1339             if (logResult == null && !isSlotReuse) {
1340                 /*
1341                  * Possible buffer overflow, out-of-memory, or I/O exception
1342                  * during logging. The BIN entry will contain a NULL_LSN. To
1343                  * prevent an exception during a future fetchLN() call, we
1344                  * set the KD flag. We do not call BIN.deleteEntry because it
1345                  * does not adjust cursors. We do not add this entry to the
1346                  * compressor queue to avoid complexity (this situation is
1347                  * rare).
1348                  */
1349                 bin.setKnownDeleted(index);
1350             }
1351         }
1352 
1353         if (lockStanding == null) {
1354             /*
1355              * No slot reuse; straight insertion. Update LSN in BIN slot.
1356              * The LN is already in the slot.
1357              */
1358             bin.updateEntry(index, logResult.newLsn, logResult.newSize);
1359 
1360             /*
1361              * The following call accounts for extra marshaled memory, i.e.,
1362              * memory that was added to the LN as a side-effect of logging it.
1363              * This can happen for FileSummaryLN's only (it is a noop for
1364              * other kinds of LNs).
1365              *
1366              * To avoid violating assertions (e.g., in IN.changeMemorySize), we
1367              * must must finish the memory adjustment while the BIN is still
1368              * latched. [#20069]
1369              *
1370              * This special handling does not apply to slot reuse, because the
1371              * updateEntry() version used in the slot reuse case will recalc
1372              * the BIN memory from scratch, and as a result, will take into
1373              * account the extra marshaled memory. [#20845]
1374              */
1375             ln.addExtraMarshaledMemorySize(bin);
1376         } else {
1377 
1378             /* For DW DBs, set the slot LSN to NULL. Otherwise, noop. */
1379             bin.prepareForSlotReuse(index);
1380 
1381             /*
1382              * Slot reuse. When reusing a slot, the key is replaced in the BIN
1383              * slot. This ensures that the correct key value is used when the
1384              * new key is non-identical to the key in the slot but is
1385              * considered equal by the btree comparator.
1386              */
1387             bin.updateEntry(index, ln, logResult.newLsn, logResult.newSize, key);
1388             bin.clearKnownDeleted(index);
1389             bin.clearPendingDeleted(index);
1390         }
1391 
1392         if (returnNewData != null) {
1393             returnNewData.setData(null);
1394             ln.setEntry(returnNewData);
1395         }
1396 
1397         /* Cursor is positioned on new record. */
1398         setInitialized();
1399 
1400         /* Cache record version for insertion operation. */
1401         setCurrentVersion(ln.getVLSNSequence(), bin.getLsn(index));
1402 
1403         /*
1404          * It is desirable to evict the LN in a duplicates DB because it will
1405          * never be fetched again. But for deferred-write DBs we should not
1406          * evict a dirty LN since it may be logged unnecessarily.
1407          */
1408         if (databaseImpl.getSortedDuplicates() &&
1409             !databaseImpl.isDeferredWriteMode() &&
1410             bin.getTarget(index) != null) {
1411             bin.evictLN(index);
1412         }
1413 
1414         traceInsert(Level.FINER, bin, logResult.newLsn, index);
1415 
1416         return new Pair<>(lockStanding, true);
1417     }
1418 
1419     /**
1420      * Update the record where the cursor is currently positioned at. The
1421      * cursor is registered with this position, the associated bin is latched,
1422      * the BIN slot is not marked deleted, and it has been locked in WRITE
1423      * mode.
1424      */
updateRecordInternal( byte[] key, DatabaseEntry data, DatabaseEntry returnOldData, DatabaseEntry returnNewData, LockStanding lockStanding, ReplicationContext repContext)1425     private OperationStatus updateRecordInternal(
1426         byte[] key,
1427         DatabaseEntry data,
1428         DatabaseEntry returnOldData,
1429         DatabaseEntry returnNewData,
1430         LockStanding lockStanding,
1431         ReplicationContext repContext) {
1432 
1433         final EnvironmentImpl envImpl = databaseImpl.getEnv();
1434         final DbType dbType = databaseImpl.getDbType();
1435 
1436         assert lockStanding.recordExists();
1437         final long oldLsn = lockStanding.lsn;
1438         assert oldLsn != DbLsn.NULL_LSN;
1439 
1440         final LN.LogResult logResult;
1441 
1442         /*
1443          * Must fetch LN if any of the following are true:
1444          *  - returnOldData is non-null: data needs to be returned
1445          *  - data is a partial entry: needs to be resolved
1446          *  - CLEANER_FETCH_OBSOLETE_SIZE is configured and lastLoggedSize
1447          *    is unknown
1448          *  - this database does not use the standard LN class and we
1449          *    cannot call DbType.createdUpdatedLN further below (this is
1450          *    the case for NameLNs, MapLNs, and FileSummaryLNs).
1451          * For other cases, we are careful not to fetch, in order to avoid
1452          * a random read during an update operation.
1453          *
1454          * Note that we checked for a deleted/cleaned LN above, so we know
1455          * that fetchLN will not return null.
1456          */
1457         final int lastLoggedSize = bin.getLastLoggedSize(index);
1458         LN ln;
1459 
1460         if (returnOldData != null ||
1461             data.getPartial() ||
1462             (lastLoggedSize == 0 &&
1463              envImpl.getCleaner().getFetchObsoleteSize(databaseImpl)) ||
1464             !dbType.mayCreateUpdatedLN()) {
1465             /* Must fetch. */
1466             ln = bin.fetchLN(index, cacheMode);
1467         } else {
1468             /* If resident, use LN for obsolete size tracking. */
1469             ln = (LN) bin.getTarget(index);
1470         }
1471 
1472         final byte[] oldKey = bin.getKey(index);
1473         final byte[] foundDataBytes = (ln != null) ? ln.getData() : null;
1474         final byte[] newData = (data.getPartial() ?
1475                                 LN.resolvePartialEntry(data, foundDataBytes) :
1476                                 LN.copyEntryData(data));
1477 
1478         /*
1479          * If the key is changed (according to the comparator), we assume
1480          * it is actually the data that has changed for a duplicate's DB.
1481          */
1482         if (key != null &&
1483             Key.compareKeys(
1484                 oldKey, key, databaseImpl.getKeyComparator()) != 0) {
1485             throw new DuplicateDataException(
1486                 "Can't replace a duplicate with new data that is not " +
1487                 "equal to the existing data according to the duplicate " +
1488                 " comparator.");
1489         }
1490 
1491         if (returnOldData != null) {
1492             assert foundDataBytes != null;
1493             returnOldData.setData(null);
1494             LN.setEntry(returnOldData, foundDataBytes);
1495         }
1496 
1497         /* Update the existing LN, if cached, else create new LN. */
1498         final long oldLNMemSize;
1499         if (ln != null) {
1500             /* LN is cached: set ln.data to newData and mark ln dirty. */
1501             oldLNMemSize = ln.getMemorySizeIncludedByParent();
1502             ln.modify(newData);
1503         } else {
1504             /* LN is not cached, create new LN for logging. */
1505             ln = dbType.createUpdatedLN(envImpl, newData);
1506             /* Make new LN resident. */
1507             bin.attachNode(index, ln, null /*lnSlotKey*/);
1508             oldLNMemSize = ln.getMemorySizeIncludedByParent();
1509         }
1510 
1511         /*
1512          * Create a new WriteLockInfo or get an existing one for the LSN
1513          * of the current slot, and set its abortLSN and abortKD fields,
1514          * if needed, i.e, if it is not the current txn the one who created
1515          * this LSN. The abortLSN and abortKD fields of the wli will be
1516          * included in the new logrec.
1517          */
1518         WriteLockInfo wli = lockStanding.prepareForUpdate();
1519 
1520         /* Log the new record version and lock its new LSN . */
1521         logResult = ln.optionalLog(
1522             envImpl, databaseImpl, (key != null ? key : oldKey),
1523             oldLsn, lastLoggedSize, locker, wli, repContext);
1524 
1525         /* Return a copy of resulting data, if requested. [#16932] */
1526         if (returnNewData != null) {
1527             returnNewData.setData(null);
1528             ln.setEntry(returnNewData);
1529         }
1530 
1531         /*
1532          * Update the parent BIN. Update the key, if changed. [#15704]
1533          */
1534         bin.updateEntry(
1535             index, oldLNMemSize, logResult.newLsn, logResult.newSize, key);
1536 
1537         /* Cache record version for update operation. */
1538         setCurrentVersion(ln.getVLSNSequence(), logResult.newLsn);
1539 
1540         /*
1541          * It is desirable to evict the LN in a duplicates DB because it
1542          * will never be fetched again.  But for deferred-write DBs we
1543          * should not evict a dirty LN since it may be logged unnecessarily.
1544          */
1545         if (databaseImpl.getSortedDuplicates() &&
1546             !databaseImpl.isDeferredWriteMode() &&
1547             bin.getTarget(index) != null) {
1548             bin.evictLN(index);
1549         }
1550 
1551         trace(Level.FINER, TRACE_MOD, bin, index, oldLsn, logResult.newLsn);
1552 
1553         return OperationStatus.SUCCESS;
1554     }
1555 
1556     /**
1557      * Position the cursor at the first or last record of the databaseImpl.
1558      * It's okay if this record is deleted.
1559      *
1560      * The cursor must initially be uninitialized.
1561      *
1562      * Returns with the target BIN latched!
1563      *
1564      * @return true if a first or last position is found, false if the
1565      * tree being searched is empty.
1566      */
positionFirstOrLast(boolean first)1567     public boolean positionFirstOrLast(boolean first)
1568         throws DatabaseException {
1569 
1570         assert assertCursorState(
1571             false /*mustBeInitialized*/, true /*mustNotBeInitialized*/);
1572 
1573         boolean found = false;
1574 
1575         try {
1576             if (first) {
1577                 bin = databaseImpl.getTree().getFirstNode(cacheMode);
1578             } else {
1579                 bin = databaseImpl.getTree().getLastNode(cacheMode);
1580             }
1581 
1582             if (bin != null) {
1583 
1584                 TreeWalkerStatsAccumulator treeStatsAccumulator =
1585                     getTreeStatsAccumulator();
1586 
1587                 if (bin.getNEntries() == 0) {
1588 
1589                     /*
1590                      * An IN was found. Even if it's empty, let Cursor
1591                      * handle moving to the first non-deleted entry.
1592                      */
1593                     found = true;
1594                     index = -1;
1595                 } else {
1596                     index = (first ? 0 : (bin.getNEntries() - 1));
1597 
1598                     if (treeStatsAccumulator != null &&
1599                         !bin.isEntryKnownDeleted(index) &&
1600                         !bin.isEntryPendingDeleted(index)) {
1601                         treeStatsAccumulator.incrementLNCount();
1602                     }
1603 
1604                     /*
1605                      * Even if the entry is deleted, just leave our
1606                      * position here and return.
1607                      */
1608                     found = true;
1609                 }
1610             }
1611 
1612             addCursor(bin);
1613             setInitialized();
1614 
1615             return found;
1616         } catch (final Throwable e) {
1617             /* Release latch on error. */
1618             releaseBIN();
1619             throw e;
1620         }
1621     }
1622 
1623     /**
1624      * Position this cursor on the slot whose key is the max key less or equal
1625      * to the given search key.
1626      *
1627      * To be more precise, let K1 be search key. The method positions the
1628      * cursor on the BIN that should contain K1. If the BIN does contain K1,
1629      * this.index is set to the containing slot. Otherwise, this.index is
1630      * set to the right-most slot whose key is < K1, or to -1 if K1 is < than
1631      * all keys in the BIN.
1632      *
1633      * The cursor must initially be uninitialized.
1634      *
1635      * The method returns with the BIN latched, unless an exception is raised.
1636      *
1637      * The method returns an integer that encodes the search outcome: If the
1638      * FOUND bit is not set, the tree is completely empty (has no BINs). If
1639      * the FOUND bit is set, the EXACT_KEY bit says whether K1 was found or
1640      * not and the FOUND_LAST bit says whether the cursor is positioned to the
1641      * very last slot of the BTree (note that this state can only be counted
1642      * on as long as the BIN is latched).
1643      *
1644      * Even if the search returns an exact result, the record may be deleted.
1645      * The caller must therefore check  whether the cursor is positioned on a
1646      * deleted record.
1647      *
1648      * This method does not lock the record. The caller is expected to call
1649      * lockAndGetCurrent to perform locking.
1650      */
searchRange( DatabaseEntry searchKey, Comparator<byte[]> comparator)1651     public int searchRange(
1652         DatabaseEntry searchKey,
1653         Comparator<byte[]> comparator) {
1654 
1655         assert assertCursorState(
1656             false /*mustBeInitialized*/, true /*mustNotBeInitialized*/);
1657 
1658         boolean foundSomething = false;
1659         boolean foundExactKey = false;
1660         boolean foundLast = false;
1661         BINBoundary binBoundary = new BINBoundary();
1662 
1663         try {
1664             byte[] key = Key.makeKey(searchKey);
1665 
1666             bin = databaseImpl.getTree().search(
1667                 key, Tree.SearchType.NORMAL, binBoundary, cacheMode,
1668                 comparator);
1669 
1670             if (bin != null) {
1671 
1672                 foundSomething = true;
1673                 if (bin.isBINDelta() && comparator != null) {
1674 
1675                     /*
1676                      * We must mutate a BIN delta if a non-null comparator is
1677                      * used. Otherwise, if we positioned the cursor on the
1678                      * delta using the non-null comparator, we would not be
1679                      * able to adjust its position correctly later when the
1680                      * delta gets mutated for some reason (because at that
1681                      * later time, the comparator used here would not be
1682                      * known).
1683                      */
1684                     bin.mutateToFullBIN();
1685                 }
1686 
1687                 index = bin.findEntry(
1688                     key, true /*indicateIfExact*/, false/*exact*/, comparator);
1689 
1690                 if (bin.isBINDelta() &&
1691                     (index < 0 ||
1692                      (index & IN.EXACT_MATCH) == 0 ||
1693                      binBoundary.isLastBin)) {
1694 
1695                     /*
1696                      * Note: if binBoundary.isLastBin, we must mutate the BIN
1697                      * in order to compute the foundLast flag below.
1698                      */
1699                     bin.mutateToFullBIN();
1700                     index = bin.findEntry(key, true, false, comparator);
1701                 }
1702 
1703                 if (index >= 0) {
1704                     if ((index & IN.EXACT_MATCH) != 0) {
1705                         foundExactKey = true;
1706                         index &= ~IN.EXACT_MATCH;
1707                     }
1708 
1709                     foundLast = (binBoundary.isLastBin &&
1710                                  index == bin.getNEntries() - 1);
1711                 }
1712 
1713                 /*
1714                  * Must call addCursor after mutateToFullBIN() to avoid having
1715                  * to reposition "this" inside mutateToFullBIN(), which would
1716                  * be both unnecessary and wrong given that this.index could
1717                  * have the IN.EXACT_MATCH still on.
1718                  */
1719                 addCursor(bin);
1720             }
1721 
1722             setInitialized();
1723 
1724             /* Return a multi-part status value */
1725             return ((foundSomething ? FOUND : 0) |
1726                     (foundExactKey ? EXACT_KEY : 0) |
1727                     (foundLast ? FOUND_LAST : 0));
1728 
1729         } catch (final Throwable e) {
1730             releaseBIN();
1731             throw e;
1732         }
1733     }
1734 
searchExact(DatabaseEntry searchKey, LockType lockType)1735     public boolean searchExact(DatabaseEntry searchKey, LockType lockType) {
1736         return searchExact(searchKey, lockType, false, false) != null;
1737     }
1738 
1739     /**
1740      * Position this cursor on the slot (if any) whose key matches the given
1741      * search key. If no such slot is found or the slot does not hold a "valid"
1742      * record, return null. Otherwise, lock the found record with the specified
1743      * lock type (which may be NONE) and return the LockStanding obj that was
1744      * created by the locking op. Whether the slot contains a "valid" record or
1745      * not depends on the slot's KD/PD flags and the lockType and dirtyReadAll
1746      * parameters. Four cases are considered; they are described in the
1747      * lockLNAndCheckDeleted() method.
1748      *
1749      * The cursor must initially be uninitialized.
1750      *
1751      * The method returns with the BIN latched, unless an exception is raised.
1752      *
1753      * In all cases, the method registers the cursor with the BIN that contains
1754      * or should contain the search key.
1755      *
1756      * @return the LockStanding for the found record, or null if no record was
1757      * found.
1758      */
searchExact( final DatabaseEntry searchKey, final LockType lockType, final boolean dirtyReadAll, final boolean dataRequested)1759     public LockStanding searchExact(
1760         final DatabaseEntry searchKey,
1761         final LockType lockType,
1762         final boolean dirtyReadAll,
1763         final boolean dataRequested) {
1764 
1765         assert assertCursorState(
1766             false /*mustBeInitialized*/, true /*mustNotBeInitialized*/);
1767 
1768         LockStanding lockStanding = null;
1769 
1770         try {
1771             byte[] key = Key.makeKey(searchKey);
1772 
1773             bin = databaseImpl.getTree().search(key, cacheMode);
1774 
1775             if (bin != null) {
1776 
1777                 index = bin.findEntry(key, false, true /*exact*/);
1778 
1779                 if (index < 0 && bin.isBINDelta()) {
1780 
1781                     if (bin.mayHaveKeyInFullBin(key)) {
1782                         bin.mutateToFullBIN();
1783                         index = bin.findEntry(key, false, true /*exact*/);
1784                     }
1785                 }
1786 
1787                 addCursor(bin);
1788 
1789                 if (index >= 0) {
1790                     lockStanding = lockLNAndCheckDeleted(
1791                         lockType, dirtyReadAll, dataRequested);
1792                 }
1793             }
1794 
1795             setInitialized();
1796             return lockStanding;
1797 
1798         } catch (final Throwable e) {
1799             /* Release latch on error. */
1800             releaseBIN();
1801             throw e;
1802         }
1803     }
1804 
1805     /**
1806      * Lock and copy current record into the key and data DatabaseEntry.
1807      * When calling this method, this.bin should not be latched already.
1808      * On return, this.bin is unlatched.
1809      */
lockAndGetCurrent( DatabaseEntry foundKey, DatabaseEntry foundData, final LockType lockType)1810     public OperationStatus lockAndGetCurrent(
1811         DatabaseEntry foundKey,
1812         DatabaseEntry foundData,
1813         final LockType lockType)
1814         throws DatabaseException {
1815 
1816         return lockAndGetCurrent(
1817             foundKey, foundData, lockType, false, false, true);
1818     }
1819 
1820     /**
1821      * Let S be the slot where this cursor is currently positioned on. If S
1822      * does not hold a "valid" record, return KEYEMPTY. Otherwise, lock the
1823      * record in S with the specified lock type(which may be NONE), copy its
1824      * key and data into the key and data DatabaseEntries, and return SUCCESS.
1825      * Whether the slot contains a "valid" record or not depends on the slot's
1826      * KD/PD flags and the lockType and dirtyReadAll parameters. Four cases are
1827      * considered; they are described in the lockLNAndCheckDeleted() method.
1828      *
1829      * On entry, the isLatched param says whether this.bin is latched or not.
1830      * On return, this.bin is unlatched if the unlatch param is true or an
1831      * exception is thrown.
1832      */
lockAndGetCurrent( DatabaseEntry foundKey, DatabaseEntry foundData, final LockType lockType, final boolean dirtyReadAll, final boolean isLatched, final boolean unlatch)1833     public OperationStatus lockAndGetCurrent(
1834         DatabaseEntry foundKey,
1835         DatabaseEntry foundData,
1836         final LockType lockType,
1837         final boolean dirtyReadAll,
1838         final boolean isLatched,
1839         final boolean unlatch)
1840         throws DatabaseException {
1841 
1842         /* Used in the finally to indicate whether exception was raised. */
1843         boolean success = false;
1844 
1845         try {
1846             assert assertCursorState(
1847                 true /*mustBeInitialized*/, false /*mustNotBeInitialized*/);
1848 
1849             assert checkAlreadyLatched(isLatched) : dumpToString(true);
1850 
1851             if (!isLatched) {
1852                 latchBIN();
1853             }
1854 
1855             assert(bin.getCursorSet().contains(this));
1856 
1857             TreeWalkerStatsAccumulator treeStatsAccumulator =
1858                 getTreeStatsAccumulator();
1859 
1860             /*
1861              * If we encounter a deleted slot, opportunistically add the BIN
1862              * to the compressor queue.
1863              */
1864             if (index >= 0 && index < bin.getNEntries() &&
1865                 (bin.isEntryKnownDeleted(index) ||
1866                  bin.isEntryPendingDeleted(index))) {
1867                 bin.queueSlotDeletion();
1868             }
1869 
1870             /*
1871              * Check the KD flag in the BIN slot and make sure this isn't an
1872              * empty BIN. The BIN could be empty by virtue of the compressor
1873              * running the size of this BIN to 0 but not having yet deleted
1874              * it from the tree.
1875              *
1876              * The index may be negative if we're at an intermediate stage in
1877              * an higher level operation (e.g., the starting search for a range
1878              * scan op), and we expect a higher level method to do a next or
1879              * prev operation after this returns KEYEMPTY. [#11700]
1880              */
1881             if (index < 0 ||
1882                 index >= bin.getNEntries() ||
1883                 bin.isEntryKnownDeleted(index)) {
1884                 /* Node is no longer present. */
1885                 if (treeStatsAccumulator != null) {
1886                     treeStatsAccumulator.incrementDeletedLNCount();
1887                 }
1888 
1889                 success = true;
1890                 return OperationStatus.KEYEMPTY;
1891             }
1892 
1893             assert TestHookExecute.doHookIfSet(testHook);
1894 
1895             final boolean dataRequested =
1896                 (foundData != null &&
1897                 (!foundData.getPartial() ||
1898                  foundData.getPartialLength() != 0));
1899 
1900             if (lockLNAndCheckDeleted(
1901                 lockType, dirtyReadAll, dataRequested) == null) {
1902                 if (treeStatsAccumulator != null) {
1903                     treeStatsAccumulator.incrementDeletedLNCount();
1904                 }
1905                 success = true;
1906                 return OperationStatus.KEYEMPTY;
1907             }
1908 
1909             getCurrent(foundKey, foundData);
1910 
1911             success = true;
1912             return OperationStatus.SUCCESS;
1913 
1914         } finally {
1915             if (unlatch || !success) {
1916                 releaseBIN();
1917             }
1918         }
1919     }
1920 
1921     /**
1922      * Let S be the slot where this cursor is currently positioned on. The
1923      * method locks S (i.e. its LSN), and depending on S's KD/PD flags, it
1924      * returns either null or the LockStanding obj that was created by the
1925      * locking op. The following 4 cases are considered:
1926      *
1927      * 1. If S is not KD/PD, return the LockStanding obj. In this case, we
1928      * know that S holds a valid (non-deleted) record.
1929      *
1930      * 2. If S is KD/PD and the lock type is not NONE, return null. In this
1931      * case, we know that the record that used to be in S has been definitely
1932      * deleted.
1933      *
1934      * 3. If S is KD/PD, the lock kind is NONE, and dirtyReadAll is false,
1935      * return null. This case corresponds to the READ_UNCOMMITTED LockMode.
1936      * The record in S has been deleted, but the deleting txn may be active
1937      * still, and if it aborts later, the record will be restored. To avoid a
1938      * potentially blocking lock, in READ_UNCOMMITTED mode we consider the
1939      * record to be non-existing and return null.
1940      *
1941      * 4. If S is KD/PD, the lock kind is NONE, and dirtyReadAll is true, lock
1942      * the record in READ mode. This case corresponds to the
1943      * READ_UNCOMMITTED_ALL LockMode, which requires that we do not skip
1944      * "provisionally deleted" records. There are two sub-cases:
1945      *
1946      *  4a. If dataRequested is false, we wait until the deleting txn finishes.
1947      *      In this case the READ lock is blocking. If after the lock is
1948      *      granted S is still KD/PD, release the lock and return null.
1949      *      Otherwise, release the lock and return the LockStanding obj.
1950      *
1951      *  4b. If dataRequested is true, then we check whether the deleting txn is
1952      *      still open by requested a non-blocking READ lock. If the lock is
1953      *      granted then the deleting txn is closed or this cursor's locker is
1954      *      the deleter, and we proceed as if the READ lock was granted in 4a.
1955      *      If the lock is denied then the deleting txn is still open, and we
1956      *      return the LockStanding obj so that the record is not skipped.
1957      *
1958      * The BIN must be latched on entry and is latched on exit.
1959      *
1960      * @param dirtyReadAll is true if using LockMode.READ_UNCOMMITTED_ALL.
1961      *
1962      * @param dataRequested is true if the read operation should return the
1963      * record data, meaning that a blocking lock must be used for dirtyReadAll.
1964      * Is ignored if dirtyReadAll is false. Is always false for a dup DB,
1965      * since data is never requested for dup DB ops at the CursorImpl level.
1966      */
lockLNAndCheckDeleted( final LockType lockType, final boolean dirtyReadAll, final boolean dataRequested)1967     private LockStanding lockLNAndCheckDeleted(
1968         final LockType lockType,
1969         final boolean dirtyReadAll,
1970         final boolean dataRequested) {
1971 
1972         assert !(dirtyReadAll && lockType != LockType.NONE);
1973         assert !(dataRequested && databaseImpl.getSortedDuplicates());
1974 
1975         LockStanding standing = lockLN(lockType);
1976 
1977         if (standing.recordExists()) {
1978             return standing;
1979         }
1980 
1981         /* The slot is KD/PD. */
1982 
1983         if (lockType != LockType.NONE) {
1984             revertLock(standing);
1985             /*
1986              * The deletion was committed by another locker, or has been
1987              * performed by this locker.
1988              */
1989             return null;
1990         }
1991 
1992         /* We're using dirty-read.  The lockLN above did not actually lock. */
1993 
1994         if (!dirtyReadAll) {
1995             /* READ_UNCOMMITTED -- skip deleted records without locking. */
1996             return null;
1997         }
1998 
1999         /*
2000          * READ_UNCOMMITTED_ALL -- get a read lock. Whether we can request a
2001          * no-wait or a blocking lock depends on the dataRequested parameter.
2002          *
2003          * Although there is some redundant processing in the sense that lockLN
2004          * is called more than once (above and below), this is not considered a
2005          * performance issue because accessing deleted records is normally
2006          * infrequent. Deleted slots are normally compressed away quickly.
2007          */
2008         standing = lockLN(
2009             LockType.READ, false /*allowUncontended*/,
2010             !dataRequested /*noWait*/);
2011 
2012         if (standing.lockResult.getLockGrant() == LockGrantType.DENIED) {
2013 
2014             /*
2015              * The no-wait lock request was denied, which means the data is not
2016              * needed and the deleting transaction is still open. The deleted
2017              * record should not be skipped in this case, according to the
2018              * definition of READ_UNCOMMITTED_ALL.
2019              */
2020             assert !standing.recordExists();
2021             return standing;
2022         }
2023 
2024         /* We have acquired a temporary read lock. */
2025         revertLock(standing);
2026 
2027         if (standing.recordExists()) {
2028             /* Another txn aborted the deletion while we waited. */
2029             return standing;
2030         }
2031 
2032         /*
2033          * The deletion was committed by another locker, or has been performed
2034          * by this locker.
2035          */
2036         return null;
2037     }
2038 
2039     /**
2040      * Copy current record into the key and data DatabaseEntry.
2041      */
getCurrent( DatabaseEntry foundKey, DatabaseEntry foundData)2042     public void getCurrent(
2043         DatabaseEntry foundKey,
2044         DatabaseEntry foundData) {
2045 
2046         assert(bin.isLatchExclusiveOwner());
2047         assert(index >= 0 && index < bin.getNEntries());
2048         assert(!bin.isEntryKnownDeleted(index));
2049 
2050         /*
2051          * We don't need to fetch the LN if the user has not requested that we
2052          * return the data, or if we know for sure that the LN is empty.
2053          */
2054         final boolean isEmptyLN = databaseImpl.isLNImmediatelyObsolete();
2055 
2056         final boolean dataRequested =
2057             (foundData != null &&
2058              (!foundData.getPartial() || foundData.getPartialLength() != 0));
2059 
2060         final LN ln = (!isEmptyLN && dataRequested ?
2061                        bin.fetchLN(index, cacheMode) :
2062                        null);
2063 
2064         /* Return the data. */
2065         if (dataRequested) {
2066             assert(ln != null || isEmptyLN);
2067 
2068             byte[] lnData =
2069                 (ln != null ? ln.getData() : LogUtils.ZERO_LENGTH_BYTE_ARRAY);
2070 
2071             LN.setEntry(foundData, lnData);
2072         }
2073 
2074         /* Return the key */
2075         if (foundKey != null) {
2076             LN.setEntry(foundKey, bin.getKey(index));
2077         }
2078 
2079         /* Cache record version for fetch operation. */
2080         final long vlsn = (ln != null ?
2081                            ln.getVLSNSequence() :
2082                            bin.getVLSN(index, false /*allowFetch*/, cacheMode));
2083 
2084         setCurrentVersion(vlsn, bin.getLsn(index));
2085     }
2086 
getCurrentLN(final boolean isLatched, final boolean unlatch)2087     public LN getCurrentLN(final boolean isLatched, final boolean unlatch)
2088         throws DatabaseException {
2089 
2090         /* Used in the finally to indicate whether exception was raised. */
2091         boolean success = false;
2092 
2093         try {
2094             assert assertCursorState(
2095                 true /*mustBeInitialized*/, false /*mustNotBeInitialized*/);
2096             assert checkAlreadyLatched(isLatched) : dumpToString(true);
2097 
2098             if (!isLatched) {
2099                 latchBIN();
2100             }
2101 
2102             assert(bin.getCursorSet().contains(this));
2103 
2104             LN ln = bin.fetchLN(index, cacheMode);
2105 
2106             success = true;
2107             return ln;
2108         } finally {
2109             if (unlatch || !success) {
2110                 releaseBIN();
2111             }
2112         }
2113     }
2114 
2115     /**
2116      * Retrieve the current LN, return with the target bin unlatched.
2117      */
lockAndGetCurrentLN(final LockType lockType)2118     public LN lockAndGetCurrentLN(final LockType lockType)
2119         throws DatabaseException {
2120 
2121         return lockAndGetCurrentLN(lockType, false, true);
2122     }
2123 
2124     /**
2125      * Retrieve the current LN. On entry, the isLatched param says whether
2126      * this.bin is latched or not. On return, this.bin is unlatched if the
2127      * unlatch param is true or an exception is thrown.
2128      */
lockAndGetCurrentLN(final LockType lockType, final boolean isLatched, final boolean unlatch)2129     public LN lockAndGetCurrentLN(final LockType lockType,
2130                                   final boolean isLatched,
2131                                   final boolean unlatch)
2132         throws DatabaseException {
2133 
2134         /* Used in the finally to indicate whether exception was raised. */
2135         boolean success = false;
2136 
2137         try {
2138             assert assertCursorState(
2139                 true /*mustBeInitialized*/, false /*mustNotBeInitialized*/);
2140             assert checkAlreadyLatched(isLatched) : dumpToString(true);
2141 
2142             if (!isLatched) {
2143                 latchBIN();
2144             }
2145 
2146             assert(bin.getCursorSet().contains(this));
2147 
2148             LockStanding lockStanding = lockLN(lockType);
2149             if (!lockStanding.recordExists()) {
2150                 revertLock(lockStanding);
2151                 success = true;
2152                 return null;
2153             }
2154 
2155             LN ln = bin.fetchLN(index, cacheMode);
2156 
2157             success = true;
2158             return ln;
2159         } finally {
2160             if (unlatch || !success) {
2161                 releaseBIN();
2162             }
2163         }
2164     }
2165 
2166     /**
2167      * Returns the VLSN and LSN for the record at the current position.  Must
2168      * be called when the cursor is positioned on a record.
2169      *
2170      * If this method is called on a secondary cursor, the version of the
2171      * associated primary record is returned.  In that case, the allowFetch
2172      * parameter is ignored, and the version is available only if the primary
2173      * record was retrieved (see setSecondaryCurrentVersion).
2174      *
2175      * @param allowFetch is true to fetch the LN to get the VLSN, or false to
2176      * return -1 for the VLSN if both the LN and VLSN are not cached.
2177      *
2178      * @throws IllegalStateException if the cursor is closed or uninitialized,
2179      * or this is a secondary cursor and the version is not cached.
2180      */
getCurrentVersion(boolean allowFetch)2181     public RecordVersion getCurrentVersion(boolean allowFetch) {
2182 
2183         /* Ensure cursor is open and initialized. */
2184         checkCursorState(
2185             true /*mustBeInitialized*/, false /*mustNotBeInitialized*/);
2186 
2187         /*
2188          * For a secondary cursor, the cached version is all we have.
2189          * See setSecondaryCurrentVersion.
2190          */
2191         if (isSecondaryCursor) {
2192             if (currentRecordVersion == null) {
2193                 throw new IllegalStateException(
2194                     "Record version is available via a SecondaryCursor only " +
2195                     "if the associated primary record was retrieved.");
2196             }
2197             return currentRecordVersion;
2198         }
2199 
2200         /*
2201          * Use cached version if available.  Do not use cached version if it
2202          * does not contain a VLSN, and VLSNs are preserved, and fetching is
2203          * allowed; instead, try to fetch it below.
2204          */
2205         if (currentRecordVersion != null) {
2206             if ((currentRecordVersion.getVLSN() !=
2207                  VLSN.NULL_VLSN.getSequence()) ||
2208                 !allowFetch ||
2209                 !databaseImpl.getEnv().getPreserveVLSN()) {
2210                 return currentRecordVersion;
2211             }
2212         }
2213 
2214         /* Get the VLSN from the BIN, create the version and cache it. */
2215         latchBIN();
2216         try {
2217             setCurrentVersion(
2218                 bin.getVLSN(index, allowFetch, cacheMode), bin.getLsn(index));
2219         } finally {
2220             releaseBIN();
2221         }
2222         return currentRecordVersion;
2223     }
2224 
setCurrentVersion(long vlsn, long lsn)2225     private void setCurrentVersion(long vlsn, long lsn) {
2226         if (isSecondaryCursor) {
2227             /* NOP.  Version is set by setSecondaryCurrentVersion. */
2228             return;
2229         }
2230         currentRecordVersion = new RecordVersion(vlsn, lsn);
2231     }
2232 
2233     /**
2234      * Returns the cached version without attempting to construct it.
2235      */
getCachedRecordVersion()2236     public RecordVersion getCachedRecordVersion() {
2237         return currentRecordVersion;
2238     }
2239 
2240     /**
2241      * When the primary record is read during a secondary operation, this
2242      * method is called to copy the primary version here. Since secondary
2243      * records do not have a version of their own, the secondary cursor
2244      * returns the version of the primary record.
2245      *
2246      * @param primaryVersion the version retrieved from the primary cursor
2247      * using getCachedRecordVersion.
2248      */
setSecondaryCurrentVersion(RecordVersion primaryVersion)2249     public void setSecondaryCurrentVersion(RecordVersion primaryVersion) {
2250         assert isSecondaryCursor;
2251         currentRecordVersion = primaryVersion;
2252     }
2253 
2254     /**
2255      * Advance a cursor.  Used so that verify can advance a cursor even in the
2256      * face of an exception [12932].
2257      * @param key on return contains the key if available, or null.
2258      * @param data on return contains the data if available, or null.
2259      */
advanceCursor(DatabaseEntry key, DatabaseEntry data)2260     public boolean advanceCursor(DatabaseEntry key, DatabaseEntry data) {
2261 
2262         BIN oldBin = bin;
2263         int oldIndex = index;
2264 
2265         key.setData(null);
2266         data.setData(null);
2267 
2268         try {
2269             getNext(
2270                 key, data, LockType.NONE, false /*dirtyReadAll*/,
2271                 true /*forward*/, false /*isLatched*/,
2272                 null /*rangeConstraint*/);
2273         } catch (DatabaseException ignored) {
2274             /* Klockwork - ok */
2275         }
2276 
2277         /*
2278          * If the position changed, regardless of an exception, then we believe
2279          * that we have advanced the cursor.
2280          */
2281         if (bin != oldBin || index != oldIndex) {
2282 
2283             /*
2284              * Return the key and data from the BIN entries, if we were not
2285              * able to read it above.
2286              */
2287             if (key.getData() == null && bin != null && index > 0) {
2288                 LN.setEntry(key, bin.getKey(index));
2289             }
2290             return true;
2291         } else {
2292             return false;
2293         }
2294     }
2295 
2296     /**
2297      * Move the cursor forward and return the next "valid" record. Whether a
2298      * slot contains a "valid" record or not depends on the slot's KD/PD flags
2299      * and the lockType and dirtyReadAll parameters. Four cases are considered;
2300      * they are described in the lockLNAndCheckDeleted() method.
2301      *
2302      * This will cross BIN boundaries. On return, no latches are held. If no
2303      * exceptions, the cursor is registered with its new location.
2304      *
2305      * @param foundKey DatabaseEntry to use for returning key
2306      *
2307      * @param foundData DatabaseEntry to use for returning data
2308      *
2309      * @param forward if true, move forward, else move backwards
2310      *
2311      * @param isLatched if true, the bin that we're on is already
2312      * latched.
2313      *
2314      * @param rangeConstraint if non-null, is called to determine whether a key
2315      * is out of range.
2316      *
2317      * @return the status.
2318      */
getNext( DatabaseEntry foundKey, DatabaseEntry foundData, LockType lockType, boolean dirtyReadAll, boolean forward, boolean isLatched, RangeConstraint rangeConstraint)2319     public OperationStatus getNext(
2320         DatabaseEntry foundKey,
2321         DatabaseEntry foundData,
2322         LockType lockType,
2323         boolean dirtyReadAll,
2324         boolean forward,
2325         boolean isLatched,
2326         RangeConstraint rangeConstraint)
2327         throws DatabaseException {
2328 
2329         assert assertCursorState(
2330             true /*mustBeInitialized*/, false /*mustNotBeInitialized*/);
2331 
2332         assert checkAlreadyLatched(isLatched) : dumpToString(true);
2333 
2334         OperationStatus result = OperationStatus.NOTFOUND;
2335         BIN anchorBIN = null;
2336 
2337         try {
2338             while (bin != null) {
2339 
2340                 assert checkAlreadyLatched(isLatched) : dumpToString(true);
2341 
2342                 if (!isLatched) {
2343                     latchBIN();
2344                     isLatched = true;
2345                 }
2346 
2347                 if (DEBUG) {
2348                     verifyCursor(bin);
2349                 }
2350 
2351                 bin.mutateToFullBIN();
2352 
2353                 /* Is there anything left on this BIN? */
2354                 if ((forward && ++index < bin.getNEntries()) ||
2355                     (!forward && --index > -1)) {
2356 
2357                     if (rangeConstraint != null &&
2358                         !rangeConstraint.inBounds(bin.getKey(index))) {
2359 
2360                         result = OperationStatus.NOTFOUND;
2361                         releaseBIN();
2362                         break;
2363                     }
2364 
2365                     OperationStatus ret = lockAndGetCurrent(
2366                         foundKey, foundData, lockType, dirtyReadAll,
2367                         true /*isLatched*/, false /*unlatch*/);
2368 
2369                     if (LatchSupport.TRACK_LATCHES) {
2370                         LatchSupport.expectBtreeLatchesHeld(1);
2371                     }
2372 
2373                     if (ret == OperationStatus.SUCCESS) {
2374                         incrementLNCount();
2375                         releaseBIN();
2376                         result = OperationStatus.SUCCESS;
2377                         break;
2378                     } else {
2379                         continue;
2380                     }
2381                 } else {
2382                     /*
2383                      * Make sure that the current BIN will not be pruned away
2384                      * if it is or becomes empty after it gets unlatched by
2385                      * Tree.getNextBin() or Tree.getPrevBin(). The operation
2386                      * of these Tree methods relies on the current BIN not
2387                      * getting pruned.
2388                      */
2389                     anchorBIN = bin;
2390                     anchorBIN.pin();
2391                     bin.removeCursor(this);
2392                     bin = null;
2393 
2394                     final Tree tree = databaseImpl.getTree();
2395 
2396                     /* SR #12736 Try to prune away oldBin */
2397                     assert TestHookExecute.doHookIfSet(testHook);
2398 
2399                     if (forward) {
2400                         bin = tree.getNextBin(anchorBIN, cacheMode);
2401                         index = -1;
2402                     } else {
2403                         bin = tree.getPrevBin(anchorBIN, cacheMode);
2404                         if (bin != null) {
2405                             index = bin.getNEntries();
2406                         }
2407                     }
2408                     isLatched = true;
2409 
2410                     if (bin == null) {
2411                         if (LatchSupport.TRACK_LATCHES) {
2412                             LatchSupport.expectBtreeLatchesHeld(0);
2413                         }
2414                         result = OperationStatus.NOTFOUND;
2415                         break;
2416                     } else {
2417                         if (LatchSupport.TRACK_LATCHES) {
2418                             LatchSupport.expectBtreeLatchesHeld(1);
2419                         }
2420 
2421                         addCursor();
2422                         anchorBIN.unpin();
2423                         anchorBIN = null;
2424                     }
2425                 }
2426             }
2427         } finally {
2428             if (anchorBIN != null) {
2429                 anchorBIN.unpin();
2430             }
2431         }
2432 
2433         if (LatchSupport.TRACK_LATCHES) {
2434             LatchSupport.expectBtreeLatchesHeld(0);
2435         }
2436 
2437         return result;
2438     }
2439 
2440     /**
2441      * Used to detect phantoms during "get next" operations with serializable
2442      * isolation.  If this method returns true, the caller should restart the
2443      * operation from the prior position.
2444      *
2445      * Something may have been added to the original cursor (cursorImpl) while
2446      * we were getting the next BIN.  cursorImpl would have been adjusted
2447      * properly but we would have skipped a BIN in the process.  This can
2448      * happen when all INs are unlatched in Tree.getNextBin.  It can also
2449      * happen without a split, simply due to inserted entries in the previous
2450      * BIN.
2451      *
2452      * @return true if an unaccounted for insertion happened.
2453      *
2454      * TODO:
2455      * Unfortunately, this method doesn't cover all cases where a phantom may
2456      * have been inserted.  Another case is described below.
2457      *
2458      *                        IN-0
2459      *                        ----------------------
2460      *                        |    | 50 | 100 |    |
2461      *                        ----------------------
2462      *                        /    /         \     \
2463      *                            /           \
2464      *                       /----             ----\
2465      *        IN-1          /                       \       IN-2
2466      *        ----------------                   ----------------
2467      *        | 60 | 70 | 80 |                   |    |    |    |
2468      *        ----------------                   ----------------
2469      *         /      |      \                     /
2470      *        /       |       \                   /
2471      *                   ----------------     -----------------
2472      *                   | 81 | 83 | 85 |     | 110 |    |    |
2473      *                   ----------------     -----------------
2474      *                   BIN-3                BIN-4
2475      *
2476      * Initially, the tree looks as above and a cursor (C) is located on the
2477      * last slot of BIN-3. For simplicity, assume no duplicates and no
2478      * serializable isolation. Also assume that C is a sticky cursor.
2479      *
2480      * 1. Thread 1 calls C.getNext(), which calls retrieveNextAllowPhantoms(),
2481      *    which duplicates C's cursorImpl, and calls dup.getNext().
2482      *
2483      *    dup.getNext() latches BIN-3, sets dup.binToBeRemoved to BIN-3 and
2484      *    then calls Tree.getNextBin(BIN-3).
2485      *
2486      *    Tree.getNextBin(BIN-3) does the following:
2487      *    - sets searchKey to 85
2488      *    - calls Tree.getParentINForChildIN(BIN-3).
2489      *    - Tree.getParentINForChildIN(BIN-3) unlatches BIN-3 and searches for
2490      *       BIN-3 parent, thus reaching IN-1.
2491      *    - IN-1.findEntry(85) sets "index" to 2,
2492      *    - "index" is incremented,
2493      *    - "moreEntriesThisIn" is set to false,
2494      *    - "next" is set to IN-1, .
2495      *    - Tree.getParentINForChildIN(IN-1) is called and unlatches IN-1.
2496      *
2497      *    Assume at this point thread 1 looses the cpu.
2498      *
2499      * 2. Thread 2 inserts keys 90 and 95, causing a split of both BIN-3 and
2500      *    IN-1. So the tree now looks like this:
2501      *
2502      *                       IN-0
2503      *                       ---------------------------
2504      *                       |    | 50 | 80 | 100 |    |
2505      *                       ---------------------------
2506      *                       /    /      |      \      \
2507      *                           /       |       \
2508      *                 /---------        |        ----------\
2509      *       IN-1     /                  |                   \       IN-2
2510      *               /              IN-5 |                    \
2511      *       -----------            -----------               ----------------
2512      *       | 60 | 70 |            | 80 | 90 |               |    |    |    |
2513      *       -----------            -----------               ----------------
2514      *        /      |              /         \                /
2515      *       /       |             /           \              /
2516      *                     ----------------  -----------   -----------------
2517      *                     | 81 | 83 | 85 |  | 90 | 95 |   | 110 |    |    |
2518      *                     ----------------  -----------   -----------------
2519      *                     BIN-3             BIN-6         BIN-4
2520      *
2521      *
2522      *    Notice that C.cursorImpl still points to the last slot of BIN-3.
2523      *
2524      * 3. Thread 1 resumes:
2525      *
2526      *    - Tree.getParentINForChildIN(IN-1) reaches IN-0.
2527      *    - IN-0.findEntry(85) sets "index" to 2,
2528      *    - "index" is incremented,
2529      *    - "nextIN" is set to IN-2, which is latched.
2530      *    - Tree.searchSubTree(IN-2, LEFT) is called, and returns BIN-4.
2531      *    - BIN-4 is the result of Tree.getNextBin(BIN-3), i.e., BIN-6 was
2532      *      skipped
2533      *
2534      *      Now we are back in dup.getNext():
2535      *    - dup.bin is set to BIN-4, dup.index to -1, and dup is added to BIN-4
2536      *    - the while loop repeats, dup.index is set to 0, the 1st slot of
2537      *      BIN-4 is locked, and dup.getNext() returns SUCCESS.
2538      *
2539      *      Now we are back in C.retrieveNextAllowPhantoms():
2540      *    - C.checkForInsertion() is called
2541      *    - C.cursorImpl and dup are on different BINs, but the condition:
2542      *      origBIN.getNEntries() - 1 > origCursor.getIndex()
2543      *      is false, so C.checkForInsertion() returns false.
2544      *
2545      * The end result is that BIN-6 has been missed. This is not be a "bug" for
2546      * non-serializable isolation, but the above scenario applies to
2547      * serializable isolation as well, and in that case, BIN-6 should really
2548      * not be missed. This could be solved by re-implementing
2549      * Tree.getNext/PrevBIN() do a more "logical" kind of search.
2550      */
checkForInsertion( final GetMode getMode, final CursorImpl dupCursor)2551     public boolean checkForInsertion(
2552         final GetMode getMode,
2553         final CursorImpl dupCursor) {
2554 
2555         final CursorImpl origCursor = this;
2556         boolean forward = getMode.isForward();
2557         boolean ret = false;
2558 
2559         if (origCursor.bin != dupCursor.bin) {
2560 
2561             /*
2562              * We jumped to the next BIN during getNext().
2563              *
2564              * Be sure to operate on the BIN returned by latchBIN, not a cached
2565              * var [#21121].
2566              *
2567              * Note that a cursor BIN can change after the check above, but
2568              * that's not relevant; what we're trying to detect are BIN changes
2569              * during the operation that has already completed.
2570              *
2571              * Note that we can call isPendingDeleted without locking.  If we
2572              * see a non-committed deleted entry, we'll just iterate around in
2573              * the caller. So a false positive is ok.
2574              */
2575             origCursor.latchBIN();
2576             final BIN origBIN = origCursor.bin;
2577 
2578             origBIN.mutateToFullBIN();
2579 
2580             try {
2581                 if (forward) {
2582                     if (origBIN.getNEntries() - 1 > origCursor.getIndex()) {
2583 
2584                         /*
2585                          * We were adjusted to something other than the
2586                          * last entry so some insertion happened.
2587                          */
2588                         for (int i = origCursor.getIndex() + 1;
2589                              i < origBIN.getNEntries();
2590                              i++) {
2591                             if (!origBIN.isEntryKnownDeleted(i) &&
2592                                 !origBIN.isEntryPendingDeleted(i)) {
2593                                 /* See comment above about locking. */
2594                                 ret = true;
2595                                 break;
2596                             }
2597                         }
2598                     }
2599                 } else {
2600                     if (origCursor.getIndex() > 0) {
2601 
2602                         /*
2603                          * We were adjusted to something other than the
2604                          * first entry so some insertion happened.
2605                          */
2606                         for (int i = 0; i < origCursor.getIndex(); i++) {
2607                             if (!origBIN.isEntryKnownDeleted(i) &&
2608                                 !origBIN.isEntryPendingDeleted(i)) {
2609                                 /* See comment above about locking. */
2610                                 ret = true;
2611                                 break;
2612                             }
2613                         }
2614                     }
2615                 }
2616             } finally {
2617                 origCursor.releaseBIN();
2618             }
2619             return ret;
2620         }
2621         return false;
2622     }
2623 
2624     /**
2625      * Skips over entries until a boundary condition is satisfied, either
2626      * because maxCount is reached or RangeConstraint.inBounds returns false.
2627      *
2628      * If a maxCount is passed, this allows advancing the cursor quickly by N
2629      * entries.  If a rangeConstraint is passed, this allows returning the
2630      * entry count after advancing until the predicate returns false, e.g., the
2631      * number of entries in a key range.  In either case, the number of entries
2632      * advanced is returned.
2633      *
2634      * Optimized to scan using level two of the tree when possible, to avoid
2635      * calling getNextBin/getPrevBin for every BIN of the database.  All BINs
2636      * beneath a level two IN can be skipped quickly, with the level two parent
2637      * IN latched, when all of its children BINs are resident and can be
2638      * latched without waiting.  When a child BIN is not resident or latching
2639      * waits, we revert to the getNextBin/getPrevBin approach, to avoid keeping
2640      * the parent IN latched for long time periods.
2641      *
2642      * Although this method positions the cursor on the last non-deleted entry
2643      * seen (before the boundary condition is satisfied), because it does not
2644      * lock the LN it is possible that it is deleted by another thread after
2645      * the BIN is unlatched.
2646      *
2647      * @param forward is true to skip forward, false to skip backward.
2648      *
2649      * @param maxCount is the maximum number of non-deleted entries to skip,
2650      * and may be LTE zero if no maximum is enforced.
2651      *
2652      * @param rangeConstraint is a predicate that returns false at a position
2653      * where advancement should stop, or null if no predicate is enforced.
2654      *
2655      * @return the number of non-deleted entries that were skipped.
2656      */
skip( boolean forward, long maxCount, RangeConstraint rangeConstraint)2657     public long skip(
2658         boolean forward,
2659         long maxCount,
2660         RangeConstraint rangeConstraint) {
2661 
2662         final CursorImpl c = cloneCursor(true /*samePosition*/);
2663 
2664         try {
2665             return c.skipInternal(forward, maxCount, rangeConstraint, this);
2666         } catch (final Throwable e) {
2667             /*
2668              * Get more info on dbsim duplicate.conf failure when c.close below
2669              * throws because the BIN latch is already held.  It should have
2670              * been released by skipInternal and therefore an unexpected
2671              * exception must have been throw and the error handling must be
2672              * incorrect.
2673              */
2674             e.printStackTrace(System.out);
2675             throw e;
2676         } finally {
2677             c.close();
2678         }
2679     }
2680 
2681 
2682     /**
2683      * Use this cursor to reference the current BIN in the traversal, to
2684      * prevent the current BIN from being compressed away.  But set the given
2685      * finalPositionCursor (the 'user' cursor) position only at non-deleted
2686      * entries, since it should be positioned on a valid entry when this method
2687      * returns.
2688      */
skipInternal( boolean forward, long maxCount, RangeConstraint rangeConstraint, CursorImpl finalPositionCursor)2689     private long skipInternal(
2690         boolean forward,
2691         long maxCount,
2692         RangeConstraint rangeConstraint,
2693         CursorImpl finalPositionCursor) {
2694 
2695         /* Start with the entry at the cursor position. */
2696         final Tree tree = databaseImpl.getTree();
2697 
2698         latchBIN();
2699 
2700         IN parent = null;
2701         BIN prevBin = null;
2702         BIN curBin = bin;
2703         int curIndex = getIndex();
2704         long count = 0;
2705         boolean success = false;
2706 
2707         try {
2708             while (true) {
2709                 curBin.mutateToFullBIN();
2710 
2711                 /* Skip entries in the current BIN. */
2712                 count = skipEntries(
2713                     forward, maxCount, rangeConstraint, finalPositionCursor,
2714                     curBin, curIndex, count);
2715 
2716                 if (count < 0) {
2717                     curBin.releaseLatch();
2718                     success = true;
2719                     return (- count);
2720                 }
2721 
2722                 /*
2723                  * Get the parent IN at level two. The BIN is unlatched by
2724                  * getParentINForChildIN.  Before releasing the BIN latch, get
2725                  * the search key for the last entry.
2726                  */
2727                 final byte[] idKey =
2728                     (curBin.getNEntries() == 0 ?
2729                      curBin.getIdentifierKey() :
2730                      (forward ?
2731                       curBin.getKey(curBin.getNEntries() - 1) :
2732                       curBin.getKey(0)));
2733 
2734                 final SearchResult result = tree.getParentINForChildIN(
2735                     curBin, false, /*useTargetLevel*/
2736                     true, /*doFetch*/ CacheMode.DEFAULT);
2737 
2738                 parent = result.parent;
2739 
2740                 if (!result.exactParentFound) {
2741                     throw EnvironmentFailureException.unexpectedState(
2742                         "Cannot get parent of BIN id=" +
2743                         curBin.getNodeId() + " key=" +
2744                         Arrays.toString(idKey));
2745                 }
2746 
2747                 /*
2748                  * Find and latch previous child BIN by matching idKey rather
2749                  * than using result.index, as in Tree.getNextIN (see comments
2750                  * there).
2751                  */
2752                 int parentIndex = parent.findEntry(idKey, false, false);
2753 
2754                 curBin = (BIN) parent.fetchIN(parentIndex);
2755                 curBin.latch();
2756 
2757                 if (forward ?
2758                     (parentIndex < parent.getNEntries() - 1) :
2759                     (parentIndex > 0)) {
2760 
2761                     /*
2762                      * There are more entries in the parent. Skip entries for
2763                      * child BINs that are resident and can be latched no-wait.
2764                      */
2765                     final int incr = forward ? 1 : (-1);
2766 
2767                     for (parentIndex += incr;; parentIndex += incr) {
2768 
2769                         prevBin = curBin;
2770                         curBin = null;
2771 
2772                         /* Break is no more entries in parent. */
2773                         if ((forward ?
2774                              parentIndex >= parent.getNEntries() :
2775                              parentIndex < 0)) {
2776                             parent.releaseLatch();
2777                             break;
2778                         }
2779 
2780                         /*
2781                          * Latch next child BIN, if cached and unlatched.
2782                          *
2783                          * Note that although 2 BINs are latched here, this
2784                          * can't cause deadlocks because the 2nd latch is
2785                          * no-wait.
2786                          */
2787                         curBin = (BIN) parent.getTarget(parentIndex);
2788 
2789                         if (curBin == null ||
2790                             !curBin.latchNoWait(CacheMode.DEFAULT)) {
2791                             parent.releaseLatch();
2792                             break;
2793                         }
2794 
2795                         /* Unlatch the prev BIN */
2796                         prevBin.releaseLatch();
2797                         prevBin = null;
2798 
2799                         /* Position at new BIN to prevent compression. */
2800                         setPosition(curBin, -1);
2801 
2802                         curBin.mutateToFullBIN();
2803 
2804                         /* Skip entries in new child BIN. */
2805                         count = skipEntries(
2806                             forward, maxCount, rangeConstraint,
2807                             finalPositionCursor, curBin,
2808                             forward ? (-1) : curBin.getNEntries(), count);
2809 
2810                         if (count < 0) {
2811                             parent.releaseLatch();
2812                             curBin.releaseLatch();
2813                             success = true;
2814                             return (- count);
2815                         }
2816                     }
2817                 } else {
2818                     /* No more entries in the parent. */
2819                     parent.releaseLatch();
2820                     prevBin = curBin;
2821                 }
2822 
2823                 /*
2824                  * Only the prevBin is still latched here. Move to the next
2825                  * BIN the "hard" way (i.e., via full tree searches).
2826                  */
2827                 curBin = forward ?
2828                     tree.getNextBin(prevBin, CacheMode.DEFAULT) :
2829                     tree.getPrevBin(prevBin, CacheMode.DEFAULT);
2830 
2831                 assert(!prevBin.isLatchOwner());
2832 
2833                 if (curBin == null) {
2834                     success = true;
2835                     return count;
2836                 }
2837 
2838                 prevBin = null;
2839                 curIndex = forward ? (-1) : curBin.getNEntries();
2840 
2841                 /* Position at new BIN to prevent compression. */
2842                 setPosition(curBin, -1);
2843             }
2844         } finally {
2845             if (curBin != null && !success) {
2846                 curBin.releaseLatchIfOwner();
2847             }
2848             if (prevBin != null && !success) {
2849                 prevBin.releaseLatchIfOwner();
2850             }
2851             if (parent != null && !success) {
2852                 parent.releaseLatchIfOwner();
2853             }
2854 
2855             if (LatchSupport.TRACK_LATCHES) {
2856                 LatchSupport.expectBtreeLatchesHeld(0);
2857             }
2858         }
2859     }
2860 
2861     /**
2862      * Skip entries in curBin from one past curIndex and onward.  Returns
2863      * non-negative count if skipping should continue, or negative count if
2864      * bounds is exceeded.
2865      */
skipEntries( boolean forward, long maxCount, RangeConstraint rangeConstraint, CursorImpl finalPositionCursor, BIN curBin, int curIndex, long count)2866     private long skipEntries(
2867         boolean forward,
2868         long maxCount,
2869         RangeConstraint rangeConstraint,
2870         CursorImpl finalPositionCursor,
2871         BIN curBin,
2872         int curIndex,
2873         long count) {
2874 
2875         assert(!curBin.isBINDelta());
2876 
2877         final int incr = forward ? 1 : (-1);
2878 
2879         for (int i = curIndex + incr;; i += incr) {
2880             if (forward ? (i >= curBin.getNEntries()) : (i < 0)) {
2881                 break;
2882             }
2883             if (rangeConstraint != null &&
2884                 !rangeConstraint.inBounds(curBin.getKey(i))) {
2885                 return (- count);
2886             }
2887             if (!curBin.isEntryKnownDeleted(i) &&
2888                 !curBin.isEntryPendingDeleted(i)) {
2889                 count += 1;
2890                 finalPositionCursor.setPosition(curBin, i);
2891                 if (maxCount > 0 && count >= maxCount) {
2892                     return (- count);
2893                 }
2894             }
2895         }
2896         return count;
2897     }
2898 
2899     /**
2900      * Returns the stack of ancestor TrackingInfo for the BIN at the cursor, or
2901      * null if a split occurs and the information returned would be
2902      * inconsistent.
2903      *
2904      * Used by CountEstimator.
2905      */
getAncestorPath()2906     public List<TrackingInfo> getAncestorPath() {
2907 
2908         /*
2909          * Search for parent of BIN, get TrackingInfo for ancestors.  If the
2910          * exact parent is not found, a split occurred and null is returned.
2911          */
2912         final List<TrackingInfo> trackingList = new ArrayList<TrackingInfo>();
2913 
2914         latchBIN();
2915 
2916         final BIN origBin = bin;
2917         final Tree tree = databaseImpl.getTree();
2918 
2919         final SearchResult result = tree.getParentINForChildIN(
2920             origBin, false, /*useTargetLevel*/
2921             true /*doFetch*/, CacheMode.UNCHANGED, trackingList);
2922 
2923         if (!result.exactParentFound) {
2924             /* Must have been a split. */
2925             return null;
2926         }
2927 
2928         /*
2929          * The parent was found and is now latched. If the child BIN does not
2930          * match the cursor's BIN, then a split occurred and null is returned.
2931          */
2932         final long binLsn;
2933         try {
2934             if (origBin != result.parent.getTarget(result.index) ||
2935                 origBin != bin) {
2936                 /* Must have been a split. */
2937                 return null;
2938             }
2939 
2940             binLsn = result.parent.getLsn(result.index);
2941             bin.latch();
2942 
2943         } finally {
2944             result.parent.releaseLatch();
2945         }
2946 
2947         /*
2948          * The child BIN is now latched. Subtract deleted entries from BIN's
2949          * total entries and adjust the index accordingly.  Add TrackingInfo
2950          * for child BIN.
2951          */
2952         try {
2953             int binEntries = bin.getNEntries();
2954             int binIndex = getIndex();
2955 
2956             for (int i = bin.getNEntries() - 1; i >= 0; i -= 1) {
2957 
2958                 if (bin.isEntryKnownDeleted(i) ||
2959                     bin.isEntryPendingDeleted(i)) {
2960 
2961                     binEntries -= 1;
2962                     if (i < binIndex) {
2963                         binIndex -= 1;
2964                     }
2965                 }
2966             }
2967 
2968             final TrackingInfo info = new TrackingInfo(
2969                 binLsn, bin.getNodeId(), binEntries, binIndex);
2970 
2971             trackingList.add(info);
2972 
2973             return trackingList;
2974 
2975         } finally {
2976             bin.releaseLatch();
2977         }
2978     }
2979 
2980     /**
2981      * Search for the next key following the given key, and acquire a range
2982      * insert lock on it.  If there are no more records following the given
2983      * key, lock the special EOF node for the databaseImpl.
2984      */
lockNextKeyForInsert(DatabaseEntry key)2985     public void lockNextKeyForInsert(DatabaseEntry key)
2986         throws DatabaseException {
2987 
2988         DatabaseEntry tempKey = new DatabaseEntry(
2989             key.getData(), key.getOffset(), key.getSize());
2990 
2991         boolean lockedNextKey = false;
2992         boolean latched = true;
2993 
2994         try {
2995             while (true) {
2996 
2997                 int searchResult = searchRange(tempKey, null /*comparator*/);
2998 
2999                 if ((searchResult & FOUND) != 0 &&
3000                     (searchResult & FOUND_LAST) == 0) {
3001 
3002                     /*
3003                      * The search positioned "this" on the BIN that should
3004                      * contain K1 and this BIN is now latched. If the BIN does
3005                      * contain K1, this.index points to K1's slot. Otherwise,
3006                      * this.index points to the right-most slot whose key is
3007                      * < K1 (or this.index is -1 if K1 is < than all keys in
3008                      * the BIN). Furthermore, "this" is NOT positioned on the
3009                      * very last slot of the BTree.
3010                      *
3011                      * Call getNext() to advance "this" to the next *valid*
3012                      * (i.e., not marked deleted) slot and lock that slot in
3013                      * RANGE_INSERT mode. Normally, getNext() will move the
3014                      * cursor to the 1st slot with a key K2 > K1. However, it
3015                      * is possible that K2 <= K1 (see the comments in
3016                      * Cursor.searchRangeAdvanceAndCheckKey() about how this
3017                      * can happen. We handle this race condition by restarting
3018                      * the search.
3019                      */
3020                     DatabaseEntry tempData = new DatabaseEntry();
3021                     tempData.setPartial(0, 0, true);
3022 
3023                     OperationStatus status = getNext(
3024                         tempKey, tempData, LockType.RANGE_INSERT,
3025                         false, true, true,
3026                         null /*rangeConstraint*/);
3027 
3028                     latched = false;
3029 
3030                     if (status == OperationStatus.SUCCESS) {
3031 
3032                         Comparator<byte[]> comparator =
3033                             databaseImpl.getKeyComparator();
3034 
3035                         int c = Key.compareKeys(tempKey, key, comparator);
3036                         if (c <= 0) {
3037                             tempKey.setData(
3038                                 key.getData(), key.getOffset(), key.getSize());
3039                             continue;
3040                         }
3041 
3042                         lockedNextKey = true;
3043                     }
3044                 }
3045 
3046                 break;
3047             }
3048         } finally {
3049             if (latched) {
3050                 releaseBIN();
3051             }
3052         }
3053 
3054         /* Lock the EOF node if no next key was found. */
3055         if (!lockedNextKey) {
3056             lockEof(LockType.RANGE_INSERT);
3057         }
3058     }
3059 
3060     /*
3061      * Locking
3062      */
3063 
3064     /**
3065      * Holds the result of a lockLN operation.  A lock may not actually be
3066      * held (getLockResult may return null) if an uncontended lock is allowed.
3067      */
3068     public static class LockStanding {
3069         private long lsn;
3070         private boolean nullLsn;
3071         private boolean deleted;
3072         private LockResult lockResult;
3073 
recordExists()3074         public boolean recordExists() {
3075             return !nullLsn && !deleted;
3076         }
3077 
getLockResult()3078         public LockResult getLockResult() {
3079             return lockResult;
3080         }
3081 
3082         /**
3083          * Called by update and delete ops, after lockLN() and before logging
3084          * the LN. It returns a WriteLockInfo that is meant to be passed to
3085          * the LN logging method, where its info will be included in the LN
3086          * log entry and also copied into the new WriteLockInfo that will be
3087          * created for the new LSN.
3088          *
3089          * If the locker is not transactional, or the current LSN has not been
3090          * write-locked before by this locker, a new WriteLockInfo is created
3091          * here and its abortLsn and abortKD fields are set. (note: even though
3092          * lockLN() is called before prepareForUpdate(), it may not actually
3093          * acquire a lock because of the uncontended optimization).
3094          *
3095          * Otherwise, a WriteLockInfo exists already. It may have been created
3096          * by the lockLN() call during the current updating op, or a lockLN()
3097          * call during an earlier updating op by the same txn. In the later
3098          * case, the abortLsn and abortKD have been set already and should not
3099          * be overwriten here.
3100          */
prepareForUpdate()3101         public WriteLockInfo prepareForUpdate() {
3102             final boolean abortKnownDeleted = !recordExists();
3103             WriteLockInfo wri = (lockResult == null ?
3104                                  null :
3105                                  lockResult.getWriteLockInfo());
3106             if (wri == null) {
3107                 wri = new WriteLockInfo();
3108                 wri.setAbortLsn(lsn);
3109                 wri.setAbortKnownDeleted(abortKnownDeleted);
3110             } else {
3111                 lockResult.setAbortLsn(lsn, abortKnownDeleted);
3112             }
3113             return wri;
3114         }
3115 
3116         /**
3117          * Creates WriteLockInfo that is appropriate for a newly inserted slot.
3118          * The return value is meant to be passed to an LN logging method and
3119          * copied into the WriteLockInfo for the new LSN.  This method is
3120          * static because lockLN is never called prior to logging an LN for a
3121          * newly inserted slot.
3122          */
prepareForInsert()3123         public static WriteLockInfo prepareForInsert() {
3124             final WriteLockInfo wri = new WriteLockInfo();
3125             return wri;
3126         }
3127     }
3128 
3129     /** Does not allow uncontended locks.  See lockLN(LockType, boolean). */
lockLN(LockType lockType)3130     public LockStanding lockLN(LockType lockType)
3131         throws LockConflictException {
3132 
3133         return lockLN(lockType, false /*allowUncontended*/, false /*noWait*/);
3134     }
3135 
3136     /**
3137      * Locks the LN at the cursor position.  Attempts to use a non-blocking
3138      * lock to avoid unlatching/relatching.
3139      *
3140      * Retries if necessary, to handle the case where the LSN is changed while
3141      * the BIN is unlatched.  Because it re-latches the BIN to check the LSN,
3142      * this serializes access to the LSN for locking, guaranteeing that two
3143      * lockers cannot obtain conflicting locks on the old and new LSNs.
3144      *
3145      * Preconditions: The BIN must be latched.
3146      *
3147      * Postconditions: The BIN is latched.
3148      *
3149      * LN Locking Rules
3150      * ----------------
3151      * The lock ID for an LN is its LSN in the parent BIN slot.  Because the
3152      * LSN changes when logging the LN, only two methods of locking an LN may
3153      * be used to support concurrent access:
3154      *
3155      * 1. This method may be called to lock the old LSN.  For read operations,
3156      * that is all that is necessary.  For write operations, the new LSN must
3157      * be locked after logging it, which is done by all the LN logging methods.
3158      * Be sure to pass a non-null locker to the LN logging method to lock the
3159      * LN, unless locking is not desired.
3160      *
3161      * 2. A non-blocking lock may be obtained on the old LSN (using
3162      * Locker.nonBlockingLock rather than this method), as long as the lock is
3163      * released before the BIN latch is released.  In this case a non-null
3164      * locker is not passed to the LN logging method; locking the new LSN is
3165      * unnecessary because no other thread can access the new LSN until the BIN
3166      * latch is released.
3167      *
3168      * The first method is used for all user operations.  The second method is
3169      * used by the cleaner, when flushing dirty deferred-write LNs, and by
3170      * certain btree operations.
3171      *
3172      * Uncontended Lock Optimization
3173      * -----------------------------
3174      * The allowUncontended param is passed as true for update and delete
3175      * operations as an optimization for the case where no lock on the old LSN
3176      * is held by any locker.  In this case we don't need to lock the old LSN
3177      * at all, as long as we log the new LSN before releasing the BIN latch.
3178      *
3179      * 1. Latch BIN
3180      * 2. Determine that no lock/waiter exists for oldLsn
3181      * 3. Log LN and get newLsn
3182      * 4. Lock newLsn
3183      * 5. Update BIN
3184      * 6. Release BIN latch
3185      *
3186      * The oldLsn is never locked, saving operations on the lock table.  The
3187      * assumption is that another locker will first have to latch the BIN to
3188      * get oldLsn, before requesting a lock.
3189      *
3190      * A potential problem is that the other locker may release the BIN latch
3191      * before requesting the lock.
3192      *
3193      * This Operation        Another Operation
3194      * --------------        -----------------
3195      *                       Latch BIN, get oldLsn, release BIN latch
3196      * Step 1 and 2
3197      *                       Request lock for oldLsn, granted
3198      * Step 3 and 4
3199      *
3200      * Both operations now believe they have an exclusive lock, but they have
3201      * locks on different LSNs.
3202      *
3203      * However, this problem is handled as long as the other lock is performed
3204      * using a lockLN method in this class, which will release the lock and
3205      * retry if the LSN changes while acquiring the lock.  Because it
3206      * re-latches the BIN to check the LSN, this will serialize access to the
3207      * LSN for locking, guaranteeing that two conflicting locks cannot be
3208      * granted on the old and new LSNs.
3209      *
3210      * Deferred-Write Locking
3211      * ----------------------
3212      * When one of the LN optionalLog methods is called, a deferred-write LN is
3213      * dirtied but not actually logged.  In order to lock an LN that has been
3214      * inserted but not yet assigned a true LSN, a transient LSNs is assigned.
3215      * These LSNs serve to lock the LN but never appear in the log.  See
3216      * LN.assignTransientLsn.
3217      *
3218      * A deferred-write LN is logged when its parent BIN is logged, or when the
3219      * LN is evicted.  This will replace transient LSNs with durable LSNs.  If
3220      * a lock is held by a cursor on a deferred-write LN when it is logged, the
3221      * same lock is acquired on the new LSN by the cursor.  See
3222      * lockAfterLsnChange.
3223      *
3224      * Cleaner Migration Locking
3225      * -------------------------
3226      * The cleaner takes a non-blocking read lock on the old LSN before
3227      * migrating/logging the LN, while holding the BIN latch.  It does not take
3228      * a lock on the new LSN, since it does not need to retain a lock after
3229      * releasing the BIN latch.
3230      *
3231      * Because a read, not write, lock is taken, other read locks may be held
3232      * during migration.  After logging, the cleaner calls lockAfterLsnChange
3233      * to lock the new LSN on behalf of other lockers.
3234      *
3235      * For more info on migration locking, see HandleLocker.
3236      *
3237      * Historical Notes
3238      * ----------------
3239      * In JE 4.1 and earlier, each LN had a node ID that was used for locking,
3240      * rather than using the LSN.  The node ID changed only if a deleted slot
3241      * was reused.  The node ID was stored in the LN, requiring that the LN be
3242      * fetched when locking the LN.  With LSN locking a fetch is not needed.
3243      *
3244      * When LN node IDs were used, deferred-write LNs were not assigned an LSN
3245      * until they were actually logged. Deferred-write LNs were initially
3246      * assigned a null LSN and transient LSNs were not needed.
3247      *
3248      * @param lockType the type of lock requested.
3249      *
3250      * @param allowUncontended is true to return immediately (no lock is taken)
3251      * when no locker holds or waits for the lock.
3252      *
3253      * @param noWait is true to perform a no-wait lock request while keeping
3254      * the BIN latched.  The caller must check the lock result to see whether
3255      * the lock was granted.
3256      *
3257      * @return all information about the lock; see LockStanding.
3258      *
3259      * @throws LockConflictException if the lsn is non-null, the lock is
3260      * contended, and a lock could not be obtained by blocking.
3261      */
lockLN(final LockType lockType, final boolean allowUncontended, final boolean noWait)3262     public LockStanding lockLN(final LockType lockType,
3263                                final boolean allowUncontended,
3264                                final boolean noWait)
3265         throws LockConflictException {
3266 
3267         final LockStanding standing = new LockStanding();
3268         standing.lsn = bin.getLsn(index);
3269         standing.deleted = bin.isEntryKnownDeleted(index) ||
3270                            bin.isEntryPendingDeleted(index);
3271 
3272         /* Ensure that a known-deleted null LSN is not present. */
3273         if (standing.lsn == DbLsn.NULL_LSN) {
3274             assert bin.isEntryKnownDeleted(index);
3275             standing.nullLsn = true;
3276             return standing;
3277         }
3278 
3279         /*
3280          * We can avoid taking a lock if uncontended.  However, we must
3281          * call preLogWithoutLock to prevent logging on a replica, and as
3282          * good measure to prepare for undo.
3283          */
3284         if (allowUncontended &&
3285             databaseImpl.getEnv().getTxnManager().getLockManager().
3286                          isLockUncontended(standing.lsn)) {
3287             locker.preLogWithoutLock(databaseImpl);
3288             assert verifyPendingDeleted(lockType);
3289             return standing;
3290         }
3291 
3292         /*
3293          * Try a non-blocking lock first, to avoid unlatching.  If the default
3294          * is no-wait, use the standard lock method so
3295          * LockNotAvailableException is thrown; there is no need to try a
3296          * non-blocking lock twice.
3297          *
3298          * Even for dirty-read (LockType.NONE) we must call Locker.lock() since
3299          * it checks the locker state and may throw LockPreemptedException.
3300          */
3301         if (locker.getDefaultNoWait()) {
3302             try {
3303                 standing.lockResult = locker.lock(
3304                     standing.lsn, lockType, true /*noWait*/, databaseImpl);
3305             } catch (LockConflictException e) {
3306 
3307                 /*
3308                  * Release all latches.  Note that we catch
3309                  * LockConflictException for simplicity but we expect either
3310                  * LockNotAvailableException or LockNotGrantedException.
3311                  */
3312                 releaseBIN();
3313                 throw e;
3314             }
3315         } else {
3316             standing.lockResult = locker.nonBlockingLock(
3317                 standing.lsn, lockType, false /*jumpAheadOfWaiters*/,
3318                 databaseImpl);
3319         }
3320 
3321         if (noWait ||
3322             standing.lockResult.getLockGrant() != LockGrantType.DENIED) {
3323             /* Lock was granted whiled latched, no need to check LSN. */
3324             assert verifyPendingDeleted(lockType);
3325             return standing;
3326         }
3327 
3328         /*
3329          * Unlatch, get a blocking lock, latch, and get the current LSN from
3330          * the slot.  If the LSN changes while unlatched, revert the lock and
3331          * repeat.
3332          */
3333         while (true) {
3334 
3335             /* Request a blocking lock. */
3336             releaseBIN();
3337             standing.lockResult = locker.lock(
3338                 standing.lsn, lockType, false /*noWait*/, databaseImpl);
3339 
3340             /* Check current LSN after locking. */
3341             latchBIN();
3342             standing.deleted = bin.isEntryKnownDeleted(index) ||
3343                                bin.isEntryPendingDeleted(index);
3344             final long newLsn = bin.getLsn(index);
3345             if (standing.lsn != newLsn) {
3346                 /* The LSN changed, revert the lock and try again. */
3347                 revertLock(standing);
3348                 standing.lsn = newLsn;
3349                 /* Ensure that a known-deleted null LSN is not present. */
3350                 if (newLsn == DbLsn.NULL_LSN) {
3351                     assert bin.isEntryKnownDeleted(index);
3352                     standing.nullLsn = true;
3353                     return standing;
3354                 }
3355                 continue;
3356             } else {
3357                 /* If locked correctly, return the result. */
3358                 assert verifyPendingDeleted(lockType);
3359                 return standing;
3360             }
3361         }
3362     }
3363 
3364     /**
3365      * After logging a deferred-write LN during eviction/checkpoint or a
3366      * migrated LN during cleaning, for every existing lock on the old LSN held
3367      * by another locker, we must lock the new LSN on behalf of that locker.
3368      *
3369      * This is done while holding the BIN latch so that the new LSN does not
3370      * change during the locking process.  The BIN must be latched on entry and
3371      * is left latched by this method.
3372      *
3373      * We release the lock on the oldLsn to prevent locks from accumulating
3374      * over time on a HandleLocker, as the cleaner migrates LNs, because
3375      * Database handle locks are legitimately very long-lived.  It is important
3376      * to first acquire all newLsn locks and then release the oldLsn locks.
3377      * Releasing an oldLsn lock might allow another locker to acquire it, and
3378      * then acquiring another newLsn lock may encounter a conflict. [#20617]
3379      *
3380      * @see com.sleepycat.je.txn.HandleLocker
3381      * @see #lockLN
3382      */
lockAfterLsnChange(DatabaseImpl dbImpl, long oldLsn, long newLsn, Locker excludeLocker)3383     public static void lockAfterLsnChange(DatabaseImpl dbImpl,
3384                                           long oldLsn,
3385                                           long newLsn,
3386                                           Locker excludeLocker) {
3387         final LockManager lockManager =
3388             dbImpl.getEnv().getTxnManager().getLockManager();
3389         final Set<LockInfo> owners = lockManager.getOwners(oldLsn);
3390         if (owners == null) {
3391             return;
3392         }
3393         /* Acquire newLsn locks. */
3394         for (LockInfo lockInfo : owners) {
3395             final Locker locker = lockInfo.getLocker();
3396             if (locker != excludeLocker) {
3397                 locker.lockAfterLsnChange(oldLsn, newLsn, dbImpl);
3398             }
3399         }
3400         /* Release oldLsn locks. */
3401         for (LockInfo lockInfo : owners) {
3402             final Locker locker = lockInfo.getLocker();
3403             if (locker != excludeLocker &&
3404                 locker.allowReleaseLockAfterLsnChange()) {
3405                 locker.releaseLock(oldLsn);
3406             }
3407         }
3408     }
3409 
3410     /**
3411      * For debugging. Verify that a BINs cursor set refers to the BIN.
3412      */
verifyCursor(BIN bin)3413     private void verifyCursor(BIN bin)
3414         throws DatabaseException {
3415 
3416         if (!bin.getCursorSet().contains(this)) {
3417             throw new EnvironmentFailureException
3418                 (databaseImpl.getEnv(),
3419                  EnvironmentFailureReason.UNEXPECTED_STATE,
3420                  "BIN cursorSet is inconsistent");
3421         }
3422     }
3423 
3424     /**
3425      * Calls checkCursorState and asserts false if an exception is thrown.
3426      * Otherwise returns true, so it can be called under an assertion.
3427      */
assertCursorState(boolean mustBeInitialized, boolean mustNotBeInitialized)3428     private boolean assertCursorState(boolean mustBeInitialized,
3429                                       boolean mustNotBeInitialized) {
3430         try {
3431             checkCursorState(mustBeInitialized, mustNotBeInitialized);
3432             return true;
3433         } catch (RuntimeException e) {
3434             assert false : e.toString() + " " + dumpToString(true);
3435             return false; // for compiler
3436         }
3437     }
3438 
3439     /**
3440      * Check that the cursor is open and optionally if it is initialized or
3441      * uninitialized.
3442      *
3443      * @throws IllegalStateException via all Cursor methods that call
3444      * Cursor.checkState (all get and put methods, plus more).
3445      */
checkCursorState(boolean mustBeInitialized, boolean mustNotBeInitialized)3446     public void checkCursorState(boolean mustBeInitialized,
3447                                  boolean mustNotBeInitialized) {
3448         switch (status) {
3449         case CURSOR_NOT_INITIALIZED:
3450             if (mustBeInitialized) {
3451                 throw new IllegalStateException("Cursor not initialized.");
3452             }
3453             break;
3454         case CURSOR_INITIALIZED:
3455             if (mustNotBeInitialized) {
3456                 throw EnvironmentFailureException.unexpectedState(
3457                     "Cursor is initialized.");
3458             }
3459             if (DEBUG) {
3460                 if (bin != null) {
3461                     verifyCursor(bin);
3462                 }
3463             }
3464             break;
3465         case CURSOR_CLOSED:
3466             throw new IllegalStateException("Cursor has been closed.");
3467         default:
3468             throw EnvironmentFailureException.unexpectedState(
3469                 "Unknown cursor status: " + status);
3470         }
3471     }
3472 
3473     /**
3474      * Checks that LN deletedness matches KD/PD flag state, at least when the
3475      * LN is resident.  Should only be called under an assertion.
3476      */
verifyPendingDeleted(LockType lockType)3477     private boolean verifyPendingDeleted(LockType lockType) {
3478 
3479         /* Cannot verify deletedness if LN is not locked. */
3480         if (lockType == LockType.NONE) {
3481             return true;
3482         }
3483 
3484         /* Cannot verify deletedness if cursor is not intialized. */
3485         if (bin == null || index < 0) {
3486             return true;
3487         }
3488 
3489         /* Cannot verify deletedness if LN is not resident. */
3490         final LN ln = (LN) bin.getTarget(index);
3491         if (ln == null) {
3492             return true;
3493         }
3494 
3495         /*
3496          * If the LN is deleted then KD or PD must be set.  If the LN is not
3497          * deleted then PD must not be set, but KD may or may not be set since
3498          * it used for various purposes (see IN.java).
3499          */
3500         final boolean kd = bin.isEntryKnownDeleted(index);
3501         final boolean pd = bin.isEntryPendingDeleted(index);
3502         final boolean lnDeleted = ln.isDeleted();
3503         assert ((lnDeleted && (kd || pd)) || (!lnDeleted && !pd)) :
3504                "Deleted state mismatch LNDeleted = " + lnDeleted +
3505                " PD = " + pd + " KD = " + kd;
3506         return true;
3507     }
3508 
revertLock(LockStanding standing)3509     public void revertLock(LockStanding standing)
3510         throws DatabaseException {
3511 
3512         if (standing.lockResult != null) {
3513             revertLock(standing.lsn, standing.lockResult);
3514             standing.lockResult = null;
3515         }
3516     }
3517 
3518     /**
3519      * Return this lock to its prior status. If the lock was just obtained,
3520      * release it. If it was promoted, demote it.
3521      */
revertLock(long lsn, LockResult lockResult)3522     private void revertLock(long lsn, LockResult lockResult)
3523         throws DatabaseException {
3524 
3525         LockGrantType lockStatus = lockResult.getLockGrant();
3526         if ((lockStatus == LockGrantType.NEW) ||
3527             (lockStatus == LockGrantType.WAIT_NEW)) {
3528             locker.releaseLock(lsn);
3529         } else if ((lockStatus == LockGrantType.PROMOTION) ||
3530                    (lockStatus == LockGrantType.WAIT_PROMOTION)){
3531             locker.demoteLock(lsn);
3532         }
3533     }
3534 
3535     /**
3536      * Locks the logical EOF node for the databaseImpl.
3537      */
lockEof(LockType lockType)3538     public void lockEof(LockType lockType)
3539         throws DatabaseException {
3540 
3541         locker.lock(databaseImpl.getEofLsn(), lockType,
3542                     false /*noWait*/, databaseImpl);
3543     }
3544 
3545     /**
3546      * @throws EnvironmentFailureException if the underlying environment is
3547      * invalid.
3548      */
checkEnv()3549     public void checkEnv() {
3550         databaseImpl.getEnv().checkIfInvalid();
3551     }
3552 
3553     /**
3554      * Callback object for traverseDbWithCursor.
3555      */
3556     public interface WithCursor {
3557 
3558         /**
3559          * Called for each record in the databaseImpl.
3560          * @return true to continue or false to stop the enumeration.
3561          */
withCursor(CursorImpl cursor, DatabaseEntry key, DatabaseEntry data)3562         boolean withCursor(CursorImpl cursor,
3563                            DatabaseEntry key,
3564                            DatabaseEntry data)
3565             throws DatabaseException;
3566     }
3567 
3568     /**
3569      * Enumerates all records in a databaseImpl non-transactionally and calls
3570      * the withCursor method for each record.  Stops the enumeration if the
3571      * callback returns false.
3572      *
3573      * @param db DatabaseImpl to traverse.
3574      *
3575      * @param lockType non-null LockType for reading records.
3576      *
3577      * @param allowEviction should normally be true to evict when performing
3578      * multiple operations, but may be false if eviction is disallowed in a
3579      * particular context.
3580      *
3581      * @param withCursor callback object.
3582      */
traverseDbWithCursor( DatabaseImpl db, LockType lockType, boolean allowEviction, WithCursor withCursor)3583     public static void traverseDbWithCursor(
3584         DatabaseImpl db,
3585         LockType lockType,
3586         boolean allowEviction,
3587         WithCursor withCursor)
3588         throws DatabaseException {
3589 
3590         DatabaseEntry key = new DatabaseEntry();
3591         DatabaseEntry data = new DatabaseEntry();
3592         Locker locker = null;
3593         CursorImpl cursor = null;
3594 
3595         try {
3596             EnvironmentImpl envImpl = db.getEnv();
3597 
3598             locker = LockerFactory.getInternalReadOperationLocker(envImpl);
3599 
3600             cursor = new CursorImpl(db, locker);
3601             cursor.setAllowEviction(allowEviction);
3602 
3603             if (cursor.positionFirstOrLast(true /*first*/)) {
3604 
3605                 OperationStatus status = cursor.lockAndGetCurrent(
3606                     key, data, lockType, false /*dirtyReadAll*/,
3607                     true /*isLatched*/, true /*unlatch*/);
3608 
3609                 boolean done = false;
3610                 while (!done) {
3611 
3612                     /*
3613                      * lockAndGetCurrent may have returned non-SUCCESS
3614                      * if the first record is deleted, but we can call getNext
3615                      * below to move forward.
3616                      */
3617                     if (status == OperationStatus.SUCCESS) {
3618                         if (!withCursor.withCursor(cursor, key, data)) {
3619                             done = true;
3620                         }
3621                     }
3622 
3623                     if (!done) {
3624                         status = cursor.getNext(
3625                             key, data, lockType, false /*dirtyReadAll*/,
3626                             true /*forward*/, false /*isLatched*/,
3627                             null /*rangeConstraint*/);
3628 
3629                         if (status != OperationStatus.SUCCESS) {
3630                             done = true;
3631                         }
3632                     }
3633                 }
3634             }
3635         } finally {
3636             if (cursor != null) {
3637                 cursor.releaseBIN();
3638                 cursor.close();
3639             }
3640             if (locker != null) {
3641                 locker.operationEnd();
3642             }
3643         }
3644     }
3645 
3646     /**
3647      * Dump the cursor for debugging purposes.  Dump the bin that the cursor
3648      * refers to if verbose is true.
3649      */
dump(boolean verbose)3650     public void dump(boolean verbose) {
3651         System.out.println(dumpToString(verbose));
3652     }
3653 
3654     /**
3655      * dump the cursor for debugging purposes.
3656      */
dump()3657     public void dump() {
3658         System.out.println(dumpToString(true));
3659     }
3660 
3661     /*
3662      * dumper
3663      */
statusToString(byte status)3664     private String statusToString(byte status) {
3665         switch(status) {
3666         case CURSOR_NOT_INITIALIZED:
3667             return "CURSOR_NOT_INITIALIZED";
3668         case CURSOR_INITIALIZED:
3669             return "CURSOR_INITIALIZED";
3670         case CURSOR_CLOSED:
3671             return "CURSOR_CLOSED";
3672         default:
3673             return "UNKNOWN (" + Byte.toString(status) + ")";
3674         }
3675     }
3676 
3677     /*
3678      * dumper
3679      */
dumpToString(boolean verbose)3680     public String dumpToString(boolean verbose) {
3681         StringBuilder sb = new StringBuilder();
3682 
3683         sb.append("<Cursor idx=\"").append(index).append("\"");
3684         sb.append(" status=\"").append(statusToString(status)).append("\"");
3685         sb.append(">\n");
3686         if (verbose) {
3687             sb.append((bin == null) ? "" : bin.dumpString(2, true));
3688         }
3689         sb.append("\n</Cursor>");
3690 
3691         return sb.toString();
3692     }
3693 
3694     /*
3695      * For unit tests
3696      */
getLockStats()3697     public StatGroup getLockStats()
3698         throws DatabaseException {
3699 
3700         return locker.collectStats();
3701     }
3702 
3703     /**
3704      * Send trace messages to the java.util.logger. Don't rely on the logger
3705      * alone to conditionalize whether we send this message, we don't even want
3706      * to construct the message if the level is not enabled.
3707      */
trace(Level level, String changeType, BIN theBin, int lnIndex, long oldLsn, long newLsn)3708     private void trace(Level level,
3709                        String changeType,
3710                        BIN theBin,
3711                        int lnIndex,
3712                        long oldLsn,
3713                        long newLsn) {
3714         EnvironmentImpl envImpl = databaseImpl.getEnv();
3715         if (envImpl.getLogger().isLoggable(level)) {
3716             StringBuilder sb = new StringBuilder();
3717             sb.append(changeType);
3718             sb.append(" bin=");
3719             sb.append(theBin.getNodeId());
3720             sb.append(" lnIdx=");
3721             sb.append(lnIndex);
3722             sb.append(" oldLnLsn=");
3723             sb.append(DbLsn.getNoFormatString(oldLsn));
3724             sb.append(" newLnLsn=");
3725             sb.append(DbLsn.getNoFormatString(newLsn));
3726 
3727             LoggerUtils.logMsg
3728                 (envImpl.getLogger(), envImpl, level, sb.toString());
3729         }
3730     }
3731 
3732     /**
3733      * Send trace messages to the java.util.logger. Don't rely on the logger
3734      * alone to conditionalize whether we send this message, we don't even want
3735      * to construct the message if the level is not enabled.
3736      */
traceInsert(Level level, BIN insertingBin, long lnLsn, int index)3737     private void traceInsert(Level level,
3738                              BIN insertingBin,
3739                              long lnLsn,
3740                              int index) {
3741         EnvironmentImpl envImpl = databaseImpl.getEnv();
3742         if (envImpl.getLogger().isLoggable(level)) {
3743             StringBuilder sb = new StringBuilder();
3744             sb.append(TRACE_INSERT);
3745             sb.append(" bin=");
3746             sb.append(insertingBin.getNodeId());
3747             sb.append(" lnLsn=");
3748             sb.append(DbLsn.getNoFormatString(lnLsn));
3749             sb.append(" index=");
3750             sb.append(index);
3751 
3752             LoggerUtils.logMsg(envImpl.getLogger(), envImpl, level,
3753                                sb.toString());
3754         }
3755     }
3756 
3757     /* For unit testing only. */
setTestHook(TestHook hook)3758     public void setTestHook(TestHook hook) {
3759         testHook = hook;
3760     }
3761 
3762     /* Check that the target bin is latched. For use in assertions. */
checkAlreadyLatched(boolean isLatched)3763     private boolean checkAlreadyLatched(boolean isLatched) {
3764         if (isLatched) {
3765             if (bin != null) {
3766                 return bin.isLatchExclusiveOwner();
3767             }
3768         }
3769         return true;
3770     }
3771 }
3772