1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 2002, 2014 Oracle and/or its affiliates.  All rights reserved.
5  *
6  */
7 
8 package com.sleepycat.je.tree;
9 
10 import static com.sleepycat.je.dbi.BTreeStatDefinition.BTREE_RELATCHES_REQUIRED;
11 import static com.sleepycat.je.dbi.BTreeStatDefinition.BTREE_ROOT_SPLITS;
12 import static com.sleepycat.je.dbi.BTreeStatDefinition.GROUP_DESC;
13 import static com.sleepycat.je.dbi.BTreeStatDefinition.GROUP_NAME;
14 
15 import java.nio.ByteBuffer;
16 import java.util.ArrayList;
17 import java.util.Comparator;
18 import java.util.List;
19 import java.util.ListIterator;
20 import java.util.logging.Level;
21 import java.util.logging.Logger;
22 
23 import com.sleepycat.je.BtreeStats;
24 import com.sleepycat.je.CacheMode;
25 import com.sleepycat.je.DatabaseException;
26 import com.sleepycat.je.EnvironmentFailureException;
27 import com.sleepycat.je.StatsConfig;
28 import com.sleepycat.je.cleaner.LocalUtilizationTracker;
29 import com.sleepycat.je.dbi.DatabaseImpl;
30 import com.sleepycat.je.dbi.EnvironmentImpl;
31 import com.sleepycat.je.dbi.INList;
32 import com.sleepycat.je.latch.LatchContext;
33 import com.sleepycat.je.latch.LatchFactory;
34 import com.sleepycat.je.latch.LatchSupport;
35 import com.sleepycat.je.latch.LatchTable;
36 import com.sleepycat.je.latch.SharedLatch;
37 import com.sleepycat.je.log.LogManager;
38 import com.sleepycat.je.log.Loggable;
39 import com.sleepycat.je.recovery.RecoveryManager;
40 import com.sleepycat.je.evictor.Evictor;
41 import com.sleepycat.je.utilint.DbLsn;
42 import com.sleepycat.je.utilint.IntStat;
43 import com.sleepycat.je.utilint.LoggerUtils;
44 import com.sleepycat.je.utilint.LongStat;
45 import com.sleepycat.je.utilint.StatGroup;
46 import com.sleepycat.je.utilint.TestHook;
47 import com.sleepycat.je.utilint.TestHookExecute;
48 
49 /**
50  * Tree implements the JE B+Tree.
51  *
52  * A note on tree search patterns:
53  * There's a set of Tree.search* methods. Some clients of the tree use
54  * those search methods directly, whereas other clients of the tree
55  * tend to use methods built on top of search.
56  *
57  * The semantics of search* are
58  *   they leave you pointing at a BIN or IN
59  *   they don't tell you where the reference of interest is.
60  * The semantics of the get* methods are:
61  *   they leave you pointing at a BIN or IN
62  *   they return the index of the slot of interest
63  *   they traverse down to whatever level is needed
64  *   they are built on top of search* methods.
65  * For the future:
66  * Over time, we need to clarify which methods are to be used by clients
67  * of the tree. Preferably clients that call the tree use get*, although
68  * their are cases where they need visibility into the tree structure.
69  *
70  * Also, search* should return the location of the slot to save us a
71  * second binary search.
72  *
73  * Search Method Call Hierarchy
74  * ----------------------------
75  * getFirst/LastNode
76  *  search
77  *  CALLED BY:
78  *   CursorImpl.getFirstOrLast
79  *
80  * getNext/PrevBin
81  *  getParentINForChildIN
82  *  searchSubTree
83  *  CALLED BY:
84  *   DupConvert
85  *   CursorImpl.getNext
86  *
87  * getParentINForChildIN
88  *  IN.findParent
89  *  does not use shared latching
90  *  CALLED BY:
91  *   Checkpointer.flushIN (doFetch=false, targetLevel=-1)
92  *   FileProcessor.processIN (doFetch=true, targetLevel=LEVEL)
93  *   Evictor.evictIN (doFetch=true, targetLevel=-1)
94  *   RecoveryManager.replaceOrInsertChild (doFetch=true, targetLevel=-1)
95  *   getNext/PrevBin (doFetch=true, targetLevel=-1)
96  *
97  * search
98  *  searchSubTree
99  *  CALLED BY:
100  *   CursorImpl.searchAndPosition
101  *   INCompressor to find BIN
102  *
103  * searchSubTree
104  *  uses shared grandparent latching
105  *
106  * getParentBINForChildLN
107  *  searchSplitsAllowed
108  *   CALLED BY:
109  *    RecoveryManager.redo
110  *    RecoveryManager.recoveryUndo
111  *  search
112  *   CALLED BY:
113  *    RecoveryManager.abortUndo
114  *    RecoveryManager.rollbackUndo
115  *    FileProcessor.processLN
116  *    Cleaner.processPendingLN
117  *    UtilizationProfile.verifyLsnIsObsolete (utility)
118  *
119  * findBinForInsert
120  *  searchSplitsAllowed
121  *  CALLED BY:
122  *   CursorImpl.putInternal
123  *
124  * searchSplitsAllowed
125  *  uses shared non-grandparent latching
126  *  CALLED BY:
127  *   DupConvert (instead of findBinForInsert, which needs a cursor)
128  *
129  * Possible Shared Latching Improvements
130  * -------------------------------------
131  * By implementing shared latching for BINs we would get better concurrency in
132  * these cases:
133  *  Reads when LN is in cache, or LN is not needed (key-only op, e.g., dups)
134  */
135 public final class Tree implements Loggable {
136 
137     /* For debug tracing */
138     private static final String TRACE_ROOT_SPLIT = "RootSplit:";
139 
140     private DatabaseImpl database;
141 
142     private int maxTreeEntriesPerNode;
143 
144     private ChildReference root;
145 
146     /*
147      * Latch that must be held when using/accessing the root node.  Protects
148      * against the root being changed out from underneath us by splitRoot.
149      * After the root IN is latched, the rootLatch can be released.
150      */
151     private SharedLatch rootLatch;
152 
153     /*
154      * We don't need the stack trace on this so always throw a static and
155      * avoid the cost of Throwable.fillInStack() every time it's thrown.
156      * [#13354].
157      */
158     private static SplitRequiredException splitRequiredException =
159         new SplitRequiredException();
160 
161     /* Stats */
162     private StatGroup stats;
163 
164     /* The number of tree root splited. */
165     private IntStat rootSplits;
166     /* The number of latch upgrades from shared to exclusive required. */
167     private LongStat relatchesRequired;
168 
169     private final ThreadLocal<TreeWalkerStatsAccumulator> treeStatsAccumulatorTL =
170         new ThreadLocal<TreeWalkerStatsAccumulator>();
171 
172     /* For unit tests */
173     private TestHook waitHook; // used for generating race conditions
174     private TestHook searchHook; // [#12736]
175     private TestHook ckptHook; // [#13897]
176     private TestHook getParentINHook;
177     private TestHook fetchINHook;
178 
179     /**
180      * Embodies an enum for the type of search being performed.  NORMAL means
181      * do a regular search down the tree.  LEFT/RIGHT means search down the
182      * left/right side to find the first/last node in the tree.
183      */
184     public static class SearchType {
185         /* Search types */
186         public static final SearchType NORMAL = new SearchType();
187         public static final SearchType LEFT   = new SearchType();
188         public static final SearchType RIGHT  = new SearchType();
189 
190         /* No lock types can be defined outside this class. */
SearchType()191         private SearchType() {
192         }
193     }
194 
195     /*
196      * Class that overrides ChildReference methods to enforce rules that apply
197      * to the root.
198      *
199      * Overrides fetchTarget() so that if the rootLatch is not held exclusively
200      * when the root is fetched, we upgrade it to exclusive. Also overrides
201      * setter methods to assert that an exclusive latch is held.
202      *
203      * Overrides setDirty to dirty the DatabaseImpl, so that the MapLN will be
204      * logged during the next checkpoint. This is critical when updating the
205      * root LSN.
206      */
207     private class RootChildReference extends ChildReference {
208 
RootChildReference()209         private RootChildReference() {
210             super();
211         }
212 
RootChildReference(Node target, byte[] key, long lsn)213         private RootChildReference(Node target, byte[] key, long lsn) {
214             super(target, key, lsn);
215         }
216 
217         /* Caller is responsible for releasing rootLatch. */
218         @Override
fetchTarget(DatabaseImpl database, IN in)219         public Node fetchTarget(DatabaseImpl database, IN in)
220             throws DatabaseException {
221 
222             if (getTarget() == null && !rootLatch.isExclusiveOwner()) {
223 
224                 rootLatch.release();
225                 rootLatch.acquireExclusive();
226 
227                 /*
228                  * If the root field changed while unlatched then we have an
229                  * invalid state and cannot continue. [#21686]
230                  */
231                 if (this != root) {
232                     throw EnvironmentFailureException.unexpectedState(
233                         database.getEnv(),
234                         "Root changed while unlatched, dbId=" +
235                         database.getId());
236                 }
237             }
238 
239             return super.fetchTarget(database, in);
240         }
241 
242         @Override
setTarget(Node target)243         public void setTarget(Node target) {
244             assert rootLatch.isExclusiveOwner();
245             super.setTarget(target);
246         }
247 
248         @Override
clearTarget()249         public void clearTarget() {
250             assert rootLatch.isExclusiveOwner();
251             super.clearTarget();
252         }
253 
254         @Override
setLsn(long lsn)255         public void setLsn(long lsn) {
256             assert rootLatch.isExclusiveOwner();
257             super.setLsn(lsn);
258         }
259 
260         @Override
updateLsnAfterOptionalLog(DatabaseImpl dbImpl, long lsn)261         void updateLsnAfterOptionalLog(DatabaseImpl dbImpl, long lsn) {
262             assert rootLatch.isExclusiveOwner();
263             super.updateLsnAfterOptionalLog(dbImpl, lsn);
264         }
265 
266         @Override
setDirty()267         void setDirty() {
268             super.setDirty();
269             database.setDirty();
270         }
271     }
272 
273     /**
274      * Create a new tree.
275      */
Tree(DatabaseImpl database)276     public Tree(DatabaseImpl database) {
277         init(database);
278         setDatabase(database);
279     }
280 
281     /**
282      * Create a tree that's being read in from the log.
283      */
Tree()284     public Tree() {
285         init(null);
286         maxTreeEntriesPerNode = 0;
287     }
288 
289     /**
290      * constructor helper
291      */
init(DatabaseImpl database)292     private void init(DatabaseImpl database) {
293         this.root = null;
294         this.database = database;
295 
296         /* Do the stats definitions. */
297         stats = new StatGroup(GROUP_NAME, GROUP_DESC);
298         relatchesRequired = new LongStat(stats, BTREE_RELATCHES_REQUIRED);
299         rootSplits = new IntStat(stats, BTREE_ROOT_SPLITS);
300     }
301 
302     /**
303      * Set the database for this tree. Used by recovery when recreating an
304      * existing tree.
305      */
setDatabase(DatabaseImpl database)306     public void setDatabase(DatabaseImpl database) {
307         this.database = database;
308 
309         final EnvironmentImpl envImpl = database.getEnv();
310 
311         /*
312          * The LatchContext for the root is special in that it is considered a
313          * Btree latch (the Btree latch table is used), but the context is not
314          * implemented by the IN class.
315          */
316         final LatchContext latchContext = new LatchContext() {
317             @Override
318             public int getLatchTimeoutMs() {
319                 return envImpl.getLatchTimeoutMs();
320             }
321             @Override
322             public String getLatchName() {
323                 return "RootLatch";
324             }
325             @Override
326             public LatchTable getLatchTable() {
327                 return LatchSupport.btreeLatchTable;
328             }
329             @Override
330             public EnvironmentImpl getEnvImplForFatalException() {
331                 return envImpl;
332             }
333         };
334 
335         rootLatch = LatchFactory.createSharedLatch(
336             latchContext, false /*exclusiveOnly*/);
337 
338 
339         maxTreeEntriesPerNode = database.getNodeMaxTreeEntries();
340     }
341 
342     /**
343      * @return the database for this Tree.
344      */
getDatabase()345     public DatabaseImpl getDatabase() {
346         return database;
347     }
348 
latchRootLatchExclusive()349     public void latchRootLatchExclusive()
350         throws DatabaseException {
351 
352         rootLatch.acquireExclusive();
353     }
354 
releaseRootLatch()355     public void releaseRootLatch()
356         throws DatabaseException {
357 
358         rootLatch.release();
359     }
360 
361     /**
362      * Set the root for the tree. Should only be called within the root latch.
363      */
setRoot(ChildReference newRoot, boolean notLatched)364     public void setRoot(ChildReference newRoot, boolean notLatched) {
365         assert (notLatched || rootLatch.isExclusiveOwner());
366         root = newRoot;
367     }
368 
makeRootChildReference( Node target, byte[] key, long lsn)369     public ChildReference makeRootChildReference(
370         Node target,
371         byte[] key,
372         long lsn) {
373 
374         return new RootChildReference(target, key, lsn);
375     }
376 
makeRootChildReference()377     private RootChildReference makeRootChildReference() {
378         return new RootChildReference();
379     }
380 
381     /*
382      * A tree doesn't have a root if (a) the root field is null, or (b) the
383      * root is non-null, but has neither a valid target nor a valid LSN. Case
384      * (b) can happen if the database is or was previously opened in deferred
385      * write mode.
386      *
387      * @return false if there is no real root.
388      */
rootExists()389     public boolean rootExists() {
390         if (root == null) {
391             return false;
392         }
393 
394         if ((root.getTarget() == null) &&
395             (root.getLsn() == DbLsn.NULL_LSN)) {
396             return false;
397         }
398 
399         return true;
400     }
401 
402     /**
403      * Perform a fast check to see if the root IN is resident.  No latching is
404      * performed.  To ensure that the root IN is not loaded by another thread,
405      * this method should be called while holding a write lock on the MapLN.
406      * That will prevent opening the DB in another thread, and potentially
407      * loading the root IN. [#13415]
408      */
isRootResident()409     public boolean isRootResident() {
410         return root != null && root.getTarget() != null;
411     }
412 
413     /**
414      * Helper to obtain the root IN with shared root latching.  Optionally
415      * updates the generation of the root when latching it.
416      */
getRootIN(CacheMode cacheMode)417     public IN getRootIN(CacheMode cacheMode)
418         throws DatabaseException {
419 
420         return getRootINInternal(cacheMode, false/*exclusive*/);
421     }
422 
423     /**
424      * Helper to obtain the root IN with exclusive root latching.  Optionally
425      * updates the generation of the root when latching it.
426      */
getRootINLatchedExclusive(CacheMode cacheMode)427     public IN getRootINLatchedExclusive(CacheMode cacheMode)
428         throws DatabaseException {
429 
430         return getRootINInternal(cacheMode, true/*exclusive*/);
431     }
432 
getRootINInternal(CacheMode cacheMode, boolean exclusive)433     private IN getRootINInternal(CacheMode cacheMode, boolean exclusive)
434         throws DatabaseException {
435 
436         rootLatch.acquireShared();
437         try {
438             return getRootINRootAlreadyLatched(cacheMode, exclusive);
439         } finally {
440             rootLatch.release();
441         }
442     }
443 
444     /**
445      * Helper to obtain the root IN, when the root latch is already held.
446      */
getRootINRootAlreadyLatched( CacheMode cacheMode, boolean exclusive)447     public IN getRootINRootAlreadyLatched(
448         CacheMode cacheMode,
449         boolean exclusive) {
450 
451         if (!rootExists()) {
452             return null;
453         }
454 
455         final IN rootIN = (IN) root.fetchTarget(database, null);
456 
457         if (exclusive) {
458             rootIN.latch(cacheMode);
459         } else {
460             rootIN.latchShared(cacheMode);
461         }
462         return rootIN;
463     }
464 
getResidentRootIN(boolean latched)465     public IN getResidentRootIN(boolean latched)
466         throws DatabaseException {
467 
468         IN rootIN = null;
469         if (rootExists()) {
470             rootIN = (IN) root.getTarget();
471             if (rootIN != null && latched) {
472                 rootIN.latchShared(CacheMode.UNCHANGED);
473             }
474         }
475         return rootIN;
476     }
477 
withRootLatchedExclusive(WithRootLatched wrl)478     public IN withRootLatchedExclusive(WithRootLatched wrl)
479         throws DatabaseException {
480 
481         try {
482             rootLatch.acquireExclusive();
483             return wrl.doWork(root);
484         } finally {
485             rootLatch.release();
486         }
487     }
488 
withRootLatchedShared(WithRootLatched wrl)489     public IN withRootLatchedShared(WithRootLatched wrl)
490         throws DatabaseException {
491 
492         try {
493             rootLatch.acquireShared();
494             return wrl.doWork(root);
495         } finally {
496             rootLatch.release();
497         }
498     }
499 
500     /**
501      * Get LSN of the rootIN. Obtained without latching, should only be
502      * accessed while quiescent.
503      */
getRootLsn()504     public long getRootLsn() {
505         if (root == null) {
506             return DbLsn.NULL_LSN;
507         } else {
508             return root.getLsn();
509         }
510     }
511 
512     /**
513      * Cheaply calculates and returns the maximum possible number of LNs in the
514      * btree.
515      */
getMaxLNs()516     public long getMaxLNs() {
517 
518         final int levels;
519         final int topLevelSlots;
520         rootLatch.acquireShared();
521         try {
522             IN rootIN = (IN) root.fetchTarget(database, null);
523             levels = rootIN.getLevel() & IN.LEVEL_MASK;
524             topLevelSlots = rootIN.getNEntries();
525         } finally {
526             rootLatch.release();
527         }
528         return (long) (topLevelSlots *
529                        Math.pow(database.getNodeMaxTreeEntries(), levels - 1));
530     }
531 
532     /**
533      * Deletes a BIN specified by key from the tree. If the BIN resides in a
534      * subtree that can be pruned away, prune as much as possible, so we
535      * don't leave a branch that has no BINs.
536      *
537      * It's possible that the targeted BIN will now have entries, or will
538      * have resident cursors. Either will prevent deletion.
539      *
540      * Reverse splits cause INs to be logged in all ancestors, including the
541      * root. This is to avoid the "great aunt" problem described in
542      * LevelRecorder.
543      *
544      * INs below the root are logged provisionally; only the root is logged
545      * non-provisionally. Provisional logging is necessary during a checkpoint
546      * for levels less than maxFlushLevel.
547      *
548      * @param idKey - the identifier key of the node to delete.
549      * @param localTracker is used for tracking obsolete node info.
550      */
delete( byte[] idKey, LocalUtilizationTracker localTracker)551     public void delete(
552         byte[] idKey,
553         LocalUtilizationTracker localTracker)
554         throws NodeNotEmptyException, CursorsExistException {
555 
556         final EnvironmentImpl envImpl = database.getEnv();
557         final INList inList = envImpl.getInMemoryINs();
558         final Logger logger = envImpl.getLogger();
559         IN branchRoot = null;
560         IN branchParent = null;
561 
562         /*
563          * A delete is a reverse split that must be propagated up to the root.
564          * [#13501] Keep all nodes from the rootIN to the parent of the
565          * deletable subtree latched as we descend so we can log the
566          * IN deletion and cascade the logging up the tree. The latched
567          * nodes are kept in order in the nodeLadder.
568          */
569         final ArrayList<SplitInfo> nodeLadder = new ArrayList<SplitInfo>();
570 
571         IN rootIN = null;
572 
573         rootLatch.acquireExclusive();
574 
575         try {
576             if (!rootExists()) {
577                 /* no action, tree is deleted or was never persisted. */
578                 return;
579             }
580 
581             rootIN = (IN) root.fetchTarget(database, null);
582             rootIN.latch(CacheMode.UNCHANGED);
583 
584             searchDeletableSubTree(rootIN, idKey, nodeLadder);
585 
586             if (nodeLadder.size() == 0) {
587 
588                 /*
589                  * The tree is empty, so do nothing.  Root compression is no
590                  * longer supported.  Root compression has no impact on memory
591                  * usage now that we evict the root IN.  It reduces log space
592                  * taken by INs for empty (but not removed) databases, yet
593                  * requires logging a INDelete and MapLN; this provides very
594                  * little benefit, if any.  Because it requires extensive
595                  * testing (which has not been done), this minor benefit is not
596                  * worth the cost.  And by removing it we no longer log
597                  * INDelete, which reduces complexity going forward. [#17546]
598                  */
599             } else {
600                 /* Detach this subtree. */
601                 SplitInfo detachPoint = nodeLadder.get(nodeLadder.size() - 1);
602 
603                 branchParent = detachPoint.parent;
604                 branchRoot = detachPoint.child;
605 
606                 if (logger.isLoggable(Level.FINEST)) {
607                     LoggerUtils.envLogMsg(
608                         Level.FINEST, envImpl,
609                         "Tree.delete() " +
610                         Thread.currentThread().getId() + "-" +
611                         Thread.currentThread().getName() + "-" +
612                         envImpl.getName() +
613                         " Deleting child node: " + branchRoot.getNodeId() +
614                         " from parent node: " + branchParent.getNodeId() +
615                         " parent has " + branchParent.getNEntries() +
616                         " children");
617                 }
618 
619                 boolean deleteOk = branchParent.deleteEntry(
620                     detachPoint.index, true);
621                 assert deleteOk;
622 
623                 /*
624                  * For a deferred-write DB, account for deleted INs.  For a
625                  * deferred-write DB, they can't be actually counted obsolete
626                  * until the parent is logged at a later time.  [#21348]
627                  *
628                  * We do this before calling cascadeUpdates, because in the
629                  * future it may also apply to a normal DB, in which case they
630                  * will be actually counted obsolete when the parent is logged
631                  * by cascadeUpdates.
632                  */
633                 if (database.isDeferredWriteMode()) {
634                     branchRoot.accountForDeferredWriteSubtreeRemoval(
635                         inList, detachPoint.parent);
636                 }
637 
638                 /* Cascade updates upward, including writing the root IN. */
639                 cascadeUpdates(nodeLadder);
640             }
641         } finally {
642             releaseNodeLadderLatches(nodeLadder);
643 
644             if (rootIN != null) {
645                 rootIN.releaseLatch();
646             }
647 
648             rootLatch.release();
649         }
650 
651         if (branchRoot != null) {
652 
653             if (!database.isDeferredWriteMode()) {
654 
655                 /*
656                  * Count obsolete nodes after logging the delete. We can do
657                  * this without having the parent of the subtree latched
658                  * because the subtree has been detached from the tree.
659                  */
660                 branchRoot.accountForSubtreeRemoval(inList, localTracker);
661             }
662 
663             if (logger.isLoggable(Level.FINE)) {
664                 LoggerUtils.envLogMsg(
665                     Level.FINE, envImpl,
666                     "SubtreeRemoval: subtreeRoot = " +
667                     branchRoot.getNodeId());
668             }
669         }
670     }
671 
672     /**
673      * Search down the tree using a key, but instead of returning the BIN that
674      * houses that key, find the point where we can detach a deletable
675      * subtree. A deletable subtree is a branch where each IN has one child,
676      * and the bottom BIN has no entries and no resident cursors. That point
677      * can be found by saving a pointer to the lowest node in the path with
678      * more than one entry.
679      *
680      *              INa
681      *             /   \
682      *          INb    INc
683      *          |       |
684      *         INd     ..
685      *         / \
686      *      INe  ..
687      *       |
688      *     BINx (suspected of being empty)
689      *
690      * In this case, we'd like to prune off the subtree headed by INe. INd
691      * is the parent of this deletable subtree. As we descend, we must keep
692      * latches for all the nodes that will be logged. In this case, we
693      * will need to keep INa, INb and INd latched when we return from this
694      * method.
695      *
696      * The method returns a list of parent/child/index structures. In this
697      * example, the list will hold:
698      *  INa/INb/index
699      *  INb/INd/index
700      *  INd/INe/index
701      * Every node is latched, and every node except for the bottom most child
702      * (INe) must be logged.
703      */
searchDeletableSubTree( IN parent, byte[] key, ArrayList<SplitInfo> nodeLadder)704     private void searchDeletableSubTree(
705         IN parent,
706         byte[] key,
707         ArrayList<SplitInfo> nodeLadder)
708         throws NodeNotEmptyException, CursorsExistException {
709 
710         assert (parent!=null);
711         assert (key!= null);
712         assert parent.isLatchExclusiveOwner();
713 
714         int index;
715         IN child = null;
716 
717         /* Save the lowest IN in the path that has multiple entries. */
718         IN lowestMultipleEntryIN = null;
719         IN pinedIN = null;
720 
721         do {
722             if (parent.getNEntries() == 0) {
723                 throw EnvironmentFailureException.unexpectedState(
724                     "Found upper IN with 0 entries");
725             }
726 
727             /* Remember if this is the lowest multiple point. */
728             if (parent.getNEntries() > 1) {
729                 lowestMultipleEntryIN = parent;
730                 pinedIN = null;
731             } else if (parent.isPinned()) {
732                 pinedIN = parent;
733             }
734 
735             index = parent.findEntry(key, false, false);
736             assert index >= 0;
737 
738             /* Get the child node that matches. */
739             child = parent.fetchIN(index);
740 
741             child.latch(CacheMode.UNCHANGED);
742 
743             nodeLadder.add(new SplitInfo(parent, child, index));
744 
745             /* Continue down a level */
746             parent = child;
747         } while (!parent.isBIN());
748 
749         if (pinedIN != null) {
750             throw CursorsExistException.CURSORS_EXIST;
751         }
752 
753         /*
754          * See if there is a reason we can't delete this BIN -- i.e.
755          * new items have been inserted, or a cursor exists on it.
756          */
757         assert(child.isBIN());
758 
759         if (child.getNEntries() != 0) {
760             throw NodeNotEmptyException.NODE_NOT_EMPTY;
761         }
762 
763         if (child.isBINDelta()) {
764             throw EnvironmentFailureException.unexpectedState(
765                 "Found BIN delta with 0 entries");
766         }
767 
768         /*
769          * This case can happen if we are keeping a cursor on an empty
770          * BIN as we traverse.
771          */
772         if (((BIN) child).nCursors() > 0 || child.isPinned()) {
773             throw CursorsExistException.CURSORS_EXIST;
774         }
775 
776         if (lowestMultipleEntryIN != null) {
777 
778             final INList inList = database.getEnv().getInMemoryINs();
779 
780             /*
781              * Release all nodes up to the pair that holds the detach
782              * point. We won't be needing those nodes, since they'll be
783              * pruned and won't need to be updated.
784              */
785             ListIterator<SplitInfo> iter =
786                 nodeLadder.listIterator(nodeLadder.size());
787 
788             while (iter.hasPrevious()) {
789                 SplitInfo info = iter.previous();
790 
791                 if (info.parent == lowestMultipleEntryIN) {
792                     break;
793                 } else {
794                     /* Remove the child from the LRU and the INList */
795                     inList.remove(info.child);
796                     info.child.releaseLatch();
797                     iter.remove();
798                 }
799             }
800         } else {
801 
802             /*
803              * We actually have to prune off the entire tree. Release
804              * all latches, and clear the node ladder.
805              */
806             releaseNodeLadderLatches(nodeLadder);
807             nodeLadder.clear();
808         }
809     }
810 
releaseNodeLadderLatches(ArrayList<SplitInfo> nodeLadder)811     private void releaseNodeLadderLatches(ArrayList<SplitInfo> nodeLadder)
812         throws DatabaseException {
813 
814         /*
815          * Clear any latches left in the node ladder. Release from the
816          * bottom up.
817          */
818         ListIterator<SplitInfo> iter =
819             nodeLadder.listIterator(nodeLadder.size());
820 
821         while (iter.hasPrevious()) {
822             SplitInfo info = iter.previous();
823             info.child.releaseLatch();
824         }
825     }
826 
827     /**
828      * Update nodes for a delete, going upwards. For example, suppose a
829      * node ladder holds:
830      * INa, INb, index for INb in INa
831      * INb, INc, index for INc in INb
832      * INc, BINd, index for BINd in INc
833      *
834      * When we enter this method, BINd has already been removed from INc. We
835      * need to
836      *  - log INc
837      *  - update INb, log INb
838      *  - update INa, log INa
839      *
840      * @param nodeLadder List of SplitInfos describing each node pair on the
841      * downward path
842      */
cascadeUpdates(ArrayList<SplitInfo> nodeLadder)843     private void cascadeUpdates(ArrayList<SplitInfo> nodeLadder)
844         throws DatabaseException {
845 
846         final EnvironmentImpl envImpl = database.getEnv();
847         final LogManager logManager = envImpl.getLogManager();
848 
849         for (int i = nodeLadder.size() - 1; i >= 0; i -= 1) {
850             final SplitInfo info = nodeLadder.get(i);
851 
852             if (info.parent.isDbRoot()) {
853                 assert i == 0;
854                 final long newLsn = info.parent.optionalLog(logManager);
855                 root.updateLsnAfterOptionalLog(database, newLsn);
856             } else {
857                 assert i > 0;
858                 final SplitInfo nextInfo = nodeLadder.get(i - 1);
859 
860                 final long newLsn = info.parent.optionalLogProvisional(
861                     logManager, nextInfo.parent);
862 
863                 nextInfo.parent.updateEntry(
864                     nextInfo.index, newLsn, 0 /*lastLoggedSize*/);
865             }
866         }
867     }
868 
869     /**
870      * Find the leftmost node (IN or BIN) in the tree.
871      *
872      * @return the leftmost node in the tree, null if the tree is empty.  The
873      * returned node is latched and the caller must release it.
874      */
getFirstNode(CacheMode cacheMode)875     public BIN getFirstNode(CacheMode cacheMode)
876         throws DatabaseException {
877 
878         BIN bin = search(
879             null /*key*/, SearchType.LEFT, null /*binBoundary*/,
880             cacheMode, null /*comparator*/);
881 
882         if (bin != null) {
883             bin.mutateToFullBIN();
884         }
885 
886         return bin;
887     }
888 
889     /**
890      * Find the rightmost node (IN or BIN) in the tree.
891      *
892      * @return the rightmost node in the tree, null if the tree is empty.  The
893      * returned node is latched and the caller must release it.
894      */
getLastNode(CacheMode cacheMode)895     public BIN getLastNode(CacheMode cacheMode)
896         throws DatabaseException {
897 
898         BIN bin = search(
899             null /*key*/, SearchType.RIGHT, null /*binBoundary*/,
900             cacheMode, null /*comparator*/);
901 
902         if (bin != null) {
903             bin.mutateToFullBIN();
904         }
905 
906         return bin;
907     }
908 
909     /**
910      * Return a reference to the adjacent BIN.
911      *
912      * @param bin The BIN to find the next BIN for.  This BIN is latched.
913      *
914      * @return The next BIN, or null if there are no more.  The returned node
915      * is latched and the caller must release it.  If null is returned, the
916      * argument BIN remains latched.
917      */
getNextBin(BIN bin, CacheMode cacheMode)918     public BIN getNextBin(BIN bin, CacheMode cacheMode)
919         throws DatabaseException {
920 
921         return (BIN) getNextIN(bin, true, cacheMode);
922     }
923 
924     /**
925      * Return a reference to the previous BIN.
926      *
927      * @param bin The BIN to find the next BIN for.  This BIN is latched.
928      *
929      * @return The previous BIN, or null if there are no more.  The returned
930      * node is latched and the caller must release it.  If null is returned,
931      * the argument bin remains latched.
932      */
getPrevBin(BIN bin, CacheMode cacheMode)933     public BIN getPrevBin(BIN bin, CacheMode cacheMode)
934         throws DatabaseException {
935 
936         return (BIN) getNextIN(bin, false, cacheMode);
937     }
938 
939     /**
940      * Returns the next IN in the tree before/after the given IN, and at the
941      * same level.  For example, if a BIN is passed in the prevIn parameter,
942      * the next BIN will be returned.
943      *
944      * TODO: A possible problem with this method is that we don't know for
945      * certain whether it works properly in the face of splits.  There are
946      * comments below indicating it does.  But the Cursor.checkForInsertion
947      * method was apparently added because getNextBin/getPrevBin didn't work
948      * properly, and may skip a BIN.  So at least it didn't work properly in
949      * the distant past.  Archeology and possibly testing are needed to find
950      * the truth.  Hopefully it does now work, and Cursor.checkForInsertion can
951      * be removed.
952      */
getNextIN(IN prevIn, boolean forward, CacheMode cacheMode)953     public IN getNextIN(IN prevIn, boolean forward, CacheMode cacheMode) {
954 
955         assert prevIn.isLatchExclusiveOwner() : "what?";
956         if (LatchSupport.TRACK_LATCHES) {
957             LatchSupport.expectBtreeLatchesHeld(1);
958         }
959 
960         prevIn.mutateToFullBIN();
961 
962         /*
963          * Use the right most key (for a forward progressing cursor) or the
964          * left most key (for a backward progressing cursor) as the search key.
965          * The reason is that the IN may get split while finding the next IN so
966          * it's not safe to take the IN's identifierKey entry.  If the IN gets
967          * split, then the right (left) most key will still be on the
968          * resultant node.  The exception to this is that if there are no
969          * entries, we just use the identifier key.
970          */
971         final byte[] searchKey;
972 
973         if (prevIn.getNEntries() == 0) {
974             searchKey = prevIn.getIdentifierKey();
975         } else if (forward) {
976             searchKey = prevIn.getKey(prevIn.getNEntries() - 1);
977         } else {
978             searchKey = prevIn.getKey(0);
979         }
980 
981         final int targetLevel = prevIn.getLevel();
982         IN next = prevIn;
983         boolean nextIsLatched = false;
984 
985         /*
986          * Ascend the tree until we find a level that still has nodes to the
987          * right (or left if !forward) of the path that we're on.  If we reach
988          * the root level, we're done.
989          */
990         IN parent = null;
991         IN nextIN = null;
992         boolean nextINIsLatched = false;
993         boolean normalExit = false;
994         try {
995             while (true) {
996 
997                 /*
998                  * Move up a level from where we are now and check to see if we
999                  * reached the top of the tree.
1000                  */
1001                 nextIsLatched = false;
1002 
1003                 if (next.isRoot()) {
1004                     /* We've reached the root of the tree. */
1005                     next.releaseLatch();
1006                     if (LatchSupport.TRACK_LATCHES) {
1007                         LatchSupport.expectBtreeLatchesHeld(0);
1008                     }
1009                     normalExit = true;
1010                     return null;
1011                 }
1012 
1013                 final SearchResult result = getParentINForChildIN(
1014                     next, false, /*useTargetLevel*/
1015                     true, /*doFetch*/ cacheMode);
1016 
1017                 if (result.exactParentFound) {
1018                     if (LatchSupport.TRACK_LATCHES) {
1019                         LatchSupport.expectBtreeLatchesHeld(1);
1020                     }
1021                     parent = result.parent;
1022                 } else {
1023                     throw EnvironmentFailureException.unexpectedState(
1024                         "Failed to find parent for IN");
1025                 }
1026 
1027                 /*
1028                  * Figure out which entry we are in the parent.  Add (subtract)
1029                  * 1 to move to the next (previous) one and check that we're
1030                  * still pointing to a valid child.  Don't just use the result
1031                  * of the parent.findEntry call in getParentNode, because we
1032                  * want to use our explicitly chosen search key.
1033                  */
1034                 int index = parent.findEntry(searchKey, false, false);
1035 
1036                 final boolean moreEntriesThisIn;
1037 
1038                 if (forward) {
1039                     index++;
1040                     moreEntriesThisIn = (index < parent.getNEntries());
1041                 } else {
1042                     moreEntriesThisIn = (index > 0);
1043                     index--;
1044                 }
1045 
1046                 if (moreEntriesThisIn) {
1047 
1048                     /*
1049                      * There are more entries to the right of the current path
1050                      * in parent.  Get the entry, and then descend down the
1051                      * left most path to an IN.
1052                      */
1053                     nextIN = parent.fetchIN(index);
1054 
1055                     nextIN.latch(cacheMode);
1056                     nextINIsLatched = true;
1057 
1058                     nextIN.mutateToFullBIN();
1059 
1060                     if (LatchSupport.TRACK_LATCHES) {
1061                         LatchSupport.expectBtreeLatchesHeld(2);
1062                     }
1063 
1064                     if (nextIN.getLevel() == targetLevel) {
1065                         parent.releaseLatch();
1066                         parent = null; // to avoid falsely unlatching parent
1067 
1068                         final TreeWalkerStatsAccumulator treeStatsAccumulator =
1069                             getTreeStatsAccumulator();
1070                         if (treeStatsAccumulator != null) {
1071                             nextIN.accumulateStats(treeStatsAccumulator);
1072                         }
1073 
1074                         if (LatchSupport.TRACK_LATCHES) {
1075                             LatchSupport.expectBtreeLatchesHeld(1);
1076                         }
1077 
1078                         normalExit = true;
1079                         return nextIN;
1080 
1081                     } else {
1082 
1083                         /*
1084                          * We landed at a higher level than the target level.
1085                          * Descend down to the appropriate level.
1086                          */
1087                         parent.releaseLatch();
1088                         parent = null; // to avoid falsely unlatching parent
1089                         nextINIsLatched = false;
1090 
1091                         final IN ret = searchSubTree(
1092                             nextIN, null, /*key*/
1093                             (forward ? SearchType.LEFT : SearchType.RIGHT),
1094                             targetLevel, cacheMode, null /*comparator*/);
1095 
1096                         if (LatchSupport.TRACK_LATCHES) {
1097                             LatchSupport.expectBtreeLatchesHeld(1);
1098                         }
1099 
1100                         if (ret.getLevel() == targetLevel) {
1101                             normalExit = true;
1102                             return ret;
1103                         } else {
1104                             throw EnvironmentFailureException.unexpectedState(
1105                                 "subtree did not have a IN at level " +
1106                                 targetLevel);
1107                         }
1108                     }
1109                 }
1110 
1111                 /* Nothing at this level.  Ascend to a higher level. */
1112                 next = parent;
1113                 nextIsLatched = true;
1114                 parent = null; // to avoid falsely unlatching parent below
1115             }
1116         } finally {
1117             if (!normalExit) {
1118                 if (next != null && nextIsLatched) {
1119                     next.releaseLatch();
1120                 }
1121 
1122                 if (parent != null) {
1123                     parent.releaseLatch();
1124                 }
1125 
1126                 if (nextIN != null && nextINIsLatched) {
1127                     nextIN.releaseLatch();
1128                 }
1129             }
1130         }
1131     }
1132 
1133     /**
1134      * Search for the parent P of a given IN C (where C is viewed as a logical
1135      * node; not as a java obj). If found, P is returned latched exclusively.
1136      * The method is used when C has been accessed "directly", i.e., not via a
1137      * tree search, and we need to perform an operation on C that requires an
1138      * update to its parent. Such situations arise during eviction (C has been
1139      * accessed via the LRU list), checkpointing, and recovery (C has been read
1140      * from the log and is not attached to the in-memory tree).
1141      *
1142      * The method uses C's identifierKey to search down the tree until:
1143      *
1144      * (a) doFetch is false and we need to access a node that is not cached.
1145      * In this case, we are actually looking for the cached copies of both C
1146      * and its parent, so a cache miss on the path to C is considered a
1147      * failure. This search mode is used by the evictor: to evict C (which has
1148      * been retieved from the LRU), its parent must be found and EX-latched;
1149      * however, if the C has been evicted already by another thread, there is
1150      * nothing to to (C will GC-ed).
1151      * or
1152      * (b) We reach a node whose node id equals the C's node id. In this case,
1153      * we know for sure that C still belongs to the BTree and its parent has
1154      * been found.
1155      * or
1156      * (c) useTargetLevel is true and we reach a node P that is at one level
1157      * above C's level. We know that P contains a slot S whose corresponding
1158      * key range includes C's identifierKey. Since we haven't read the child
1159      * node under S to check its node id, we cannot know for sure that C is
1160      * still in the tree. Nevertheless, we consider this situation a success,
1161      * i.e., P is the parent node we are looking for. In this search mode,
1162      * after this method returns, the caller is expected to take further
1163      * action based on the info in slot S. For example, if C was created
1164      * by reading a log entry at LSN L, and the LSN at slot S is also L, then
1165      * we know P is the real parent (and we have thus saved a possible extra
1166      * I/O to refetch the C node from the log to check its node id). This
1167      * search mode is used by the cleaner.
1168      * or
1169      * (d) None of the above conditions occur and the bottom of the BTree is
1170      * reached. In this case, no parent exists (the child node is an old
1171      * version of a node that has been removed from the BTree).
1172      *
1173      * @param child The child node for which to find the parent.  This node is
1174      * latched by the caller and is unlatched by this function before returning
1175      * to the caller.
1176      *
1177      * @param useTargetLevel If true, the search is considered successful if
1178      * a node P is reached at one level above C's level. P is the parent to
1179      * return to the caller.
1180      *
1181      * @param doFetch if false, stop the search if we run into a non-resident
1182      * child and assume that no parent exists.
1183      *
1184      * @param cacheMode The CacheMode for affecting the hotness of the nodes
1185      * visited during the search.
1186      *
1187      * @return a SearchResult object. If the parent has been found,
1188      * result.foundExactMatch is true, result.parent refers to that node, and
1189      * result.index is the slot for the child IN inside the parent IN.
1190      * Otherwise, result.foundExactMatch is false and  result.parent is null.
1191      */
getParentINForChildIN( IN child, boolean useTargetLevel, boolean doFetch, CacheMode cacheMode)1192     public SearchResult getParentINForChildIN(
1193         IN child,
1194         boolean useTargetLevel,
1195         boolean doFetch,
1196         CacheMode cacheMode)
1197         throws DatabaseException {
1198 
1199         return getParentINForChildIN(
1200             child, useTargetLevel, doFetch,
1201             cacheMode, null /*trackingList*/);
1202     }
1203 
1204 
1205     /*
1206      * This version of getParentINForChildIN does the same thing as the version
1207      * above, but also adds a "trackingList" param. If trackingList is not
1208      * null, the LSNs of the parents visited along the way are added to the
1209      * list, as a debug tracing mechanism. This is meant to stay in production,
1210      * to add information to the log.
1211      */
getParentINForChildIN( IN child, boolean useTargetLevel, boolean doFetch, CacheMode cacheMode, List<TrackingInfo> trackingList)1212     public SearchResult getParentINForChildIN(
1213         IN child,
1214         boolean useTargetLevel,
1215         boolean doFetch,
1216         CacheMode cacheMode,
1217         List<TrackingInfo> trackingList)
1218         throws DatabaseException {
1219 
1220         /* Sanity checks */
1221         if (child == null) {
1222             throw EnvironmentFailureException.unexpectedState(
1223                 "getParentINForChildIN given null child node");
1224         }
1225 
1226         assert child.isLatchOwner();
1227 
1228         /*
1229          * Get information from child before releasing latch.
1230          */
1231         long targetId = child.getNodeId();
1232         byte[] targetKey = child.getIdentifierKey();
1233         int targetLevel = (useTargetLevel ? child.getLevel() : -1);
1234         int exclusiveLevel = child.getLevel() + 1;
1235         boolean requireExactMatch = true;
1236 
1237         child.releaseLatch();
1238 
1239         return getParentINForChildIN(
1240             targetId, targetKey, targetLevel,
1241             exclusiveLevel, requireExactMatch, doFetch,
1242             cacheMode, trackingList);
1243 
1244     }
1245 
1246     /*
1247      * This version of getParentINForChildIN() is the actual implementation
1248      * of the previous 2 versions (read the comments there), but it also
1249      * implements two additional use cases via the extra "requireExactMatch"
1250      * param.
1251      *
1252      * (a) requireExactMatch == false && doFetch == false
1253      * In this case we are actually looking for the lowest cached ancestor
1254      * of the C node. The method will always return a node (considred as the
1255      * "parent") unless the BTree is empty (has no nodes at all). The returned
1256      * node must be latched, but not necessarily in EX mode. This search mode
1257      * is used by the checkpointer.
1258      *
1259      * (b) requireExactMatch == false && doFetch == true
1260      * This search mode is used during recovery. It seems that
1261      * requireExactMatch can actually be true here ???? (and maybe use a
1262      * targetLevel as well ????).
1263      *
1264      * The exclusiveLevel param:
1265      * In general, if exclusiveLevel == L, nodes above L will be SH latched and
1266      * nodes at or below L will be EX-latched. In all current use cases, except
1267      * case (a) above, L is set to 1 + C.level; in case (a) it is set to 0.
1268      * This is because in all use cases except (a), the parent P, if any,
1269      * should be at level 1 + C.level and since P should be returned
1270      * EX-latched, when the search reaches L, it must EX-latch the node at that
1271      * level. In case (a), the "parent" does not need to be EX-latched (and it
1272      * can in fact be at any level).
1273      */
getParentINForChildIN( long targetNodeId, byte[] targetKey, int targetLevel, int exclusiveLevel, boolean requireExactMatch, boolean doFetch, CacheMode cacheMode, List<TrackingInfo> trackingList)1274     public SearchResult getParentINForChildIN(
1275         long targetNodeId,
1276         byte[] targetKey,
1277         int targetLevel,
1278         int exclusiveLevel,
1279         boolean requireExactMatch,
1280         boolean doFetch,
1281         CacheMode cacheMode,
1282         List<TrackingInfo> trackingList)
1283         throws DatabaseException {
1284 
1285         /* Call hook before latching. No latches are held. */
1286         TestHookExecute.doHookIfSet(getParentINHook);
1287 
1288         /*
1289          * SearchResult is initialized as follows:
1290          * exactParentFound = false;
1291          * parent = null; index = -1; childNotResident = false;
1292          */
1293         SearchResult result = new SearchResult();
1294 
1295         /* Get the tree root, SH-latched. */
1296         IN rootIN = getRootIN(cacheMode);
1297 
1298         if (rootIN == null) {
1299             return result;
1300         }
1301 
1302         /* If the root is the target node, there is no parent */
1303         assert(rootIN.getNodeId() != targetNodeId);
1304         assert(rootIN.getLevel() >= exclusiveLevel);
1305 
1306         IN parent = rootIN;
1307         IN child = null;
1308         boolean success = false;
1309 
1310         try {
1311 
1312             if (rootIN.getLevel() <= exclusiveLevel) {
1313                 rootIN.releaseLatch();
1314                 rootIN = getRootINLatchedExclusive(cacheMode);
1315                 assert(rootIN != null);
1316                 parent = rootIN;
1317             }
1318 
1319             while (true) {
1320 
1321                 assert(parent.getNEntries() > 0);
1322 
1323                 result.index = parent.findEntry(targetKey, false, false);
1324 
1325                 if (trackingList != null) {
1326                     trackingList.add(new TrackingInfo(
1327                         parent.getLsn(result.index), parent.getNodeId(),
1328                         parent.getNEntries(), result.index));
1329                 }
1330 
1331                 assert TestHookExecute.doHookIfSet(searchHook);
1332 
1333                 if (targetLevel > 0 && parent.getLevel() == targetLevel + 1) {
1334                     result.exactParentFound = true;
1335                     result.parent = parent;
1336                     break;
1337                 }
1338 
1339                 if (doFetch) {
1340                     child = parent.fetchINWithNoLatch(result, targetKey);
1341 
1342                     if (child == null) {
1343                         if (trackingList != null) {
1344                             trackingList.clear();
1345                         }
1346                         result.reset();
1347 
1348                         TestHookExecute.doHookIfSet(fetchINHook, child);
1349 
1350                         rootIN = getRootIN(cacheMode);
1351                         assert(rootIN != null);
1352 
1353                         if (rootIN.getLevel() <= exclusiveLevel) {
1354                             rootIN.releaseLatch();
1355                             rootIN = getRootINLatchedExclusive(cacheMode);
1356                             assert(rootIN != null);
1357                         }
1358 
1359                         parent = rootIN;
1360                         continue;
1361                     }
1362                 } else {
1363                     child = (IN) parent.getTarget(result.index);
1364                 }
1365 
1366                 assert(child != null || !doFetch);
1367 
1368                 if (child == null) {
1369                     if (requireExactMatch) {
1370                         parent.releaseLatch();
1371                     } else {
1372                         result.parent = parent;
1373                     }
1374                     result.childNotResident = true;
1375                     break;
1376                 }
1377 
1378                 if (child.getNodeId() == targetNodeId) {
1379                     result.exactParentFound = true;
1380                     result.parent = parent;
1381                     break;
1382                 }
1383 
1384                 if (child.isBIN()) {
1385                     if (requireExactMatch) {
1386                         parent.releaseLatch();
1387                     } else {
1388                         result.parent = parent;
1389                     }
1390                     break;
1391                 }
1392 
1393                 /* We can search further down the tree. */
1394                 if (child.getLevel() <= exclusiveLevel) {
1395                     child.latch(cacheMode);
1396                 } else {
1397                     child.latchShared(cacheMode);
1398                 }
1399 
1400                 parent.releaseLatch();
1401                 parent = child;
1402                 child = null;
1403             }
1404 
1405             success = true;
1406 
1407         } finally {
1408 
1409             if (!success) {
1410                 if (parent.isLatchOwner()) {
1411                     parent.releaseLatch();
1412                 }
1413 
1414                 if (child != null && child.isLatchOwner()) {
1415                     child.releaseLatch();
1416                 }
1417             }
1418         }
1419 
1420         if (result.parent != null) {
1421             if (LatchSupport.TRACK_LATCHES) {
1422                 LatchSupport.expectBtreeLatchesHeld(1);
1423             }
1424             assert((!doFetch && !requireExactMatch) ||
1425                    result.parent.isLatchExclusiveOwner());
1426         }
1427 
1428         return result;
1429     }
1430 
1431     /**
1432      * Return a reference to the parent of this LN. This searches through the
1433      * tree and allows splits, if the splitsAllowed param is true. Set the
1434      * tree location to the proper BIN parent whether or not the LN child is
1435      * found. That's because if the LN is not found, recovery or abort will
1436      * need to place it within the tree, and so we must point at the
1437      * appropriate position.
1438      *
1439      * <p>When this method returns with location.bin non-null, the BIN is
1440      * latched and must be unlatched by the caller.  Note that location.bin may
1441      * be non-null even if this method returns false.</p>
1442      *
1443      * @param location a holder class to hold state about the location
1444      * of our search. Sort of an internal cursor.
1445      *
1446      * @param key key to navigate through main key
1447      *
1448      * @param splitsAllowed true if this method is allowed to cause tree splits
1449      * as a side effect. In practice, recovery can cause splits, but abort
1450      * can't.
1451      *
1452      * @param blindDeltaOps Normally, if this method lands on a BIN-delta and
1453      * the search key is not in that delta, it will mutate the delta to a full
1454      * BIN to make sure whether the search key exists in the tree or not.
1455      * However, by passing true for blindDeltaOps, the caller indicates that
1456      * it doesn't really care whether the key is in the tree or not: it is
1457      * going to insert the key in the BIN-delta, if not already there,
1458      * essentially overwritting the slot that may exist in the full BIN. So,
1459      * if blindDeltaOps is true, the method will not mutate a BIN-delta parent
1460      * (unless the BIN-delta has no space for a slot insertion).
1461      *
1462      * @param cacheMode The CacheMode for affecting the hotness of the tree.
1463      *
1464      * @return true if node found in tree.
1465      * If false is returned and there is the possibility that we can insert
1466      * the record into a plausible parent we must also set
1467      * - location.bin (may be null if no possible parent found)
1468      * - location.lnKey (don't need to set if no possible parent).
1469      */
getParentBINForChildLN( TreeLocation location, byte[] key, boolean splitsAllowed, boolean blindDeltaOps, CacheMode cacheMode)1470     public boolean getParentBINForChildLN(
1471         TreeLocation location,
1472         byte[] key,
1473         boolean splitsAllowed,
1474         boolean blindDeltaOps,
1475         CacheMode cacheMode)
1476         throws DatabaseException {
1477 
1478         /*
1479          * Find the BIN that either points to this LN or could be its
1480          * ancestor.
1481          */
1482         location.reset();
1483         BIN bin;
1484         int index;
1485 
1486         if (splitsAllowed) {
1487             bin = searchSplitsAllowed(key, cacheMode, null /*comparator*/);
1488         } else {
1489             bin = search(key, cacheMode);
1490         }
1491 
1492         if (bin == null) {
1493             return false;
1494         }
1495 
1496         try {
1497             while (true) {
1498 
1499                 location.bin = bin;
1500 
1501                 index = bin.findEntry(
1502                     key, true /*indicateIfExact*/, false /*exactSearch*/);
1503 
1504                 boolean match = (index >= 0 &&
1505                                  (index & IN.EXACT_MATCH) != 0);
1506 
1507                 index &= ~IN.EXACT_MATCH;
1508                 location.index = index;
1509                 location.lnKey = key;
1510 
1511                 /*
1512                 if (!match && bin.isBINDelta() && blindDeltaOps) {
1513                       System.out.println(
1514                           "Blind op on BIN-delta : " + bin.getNodeId() +
1515                           " nEntries = " +
1516                           bin.getNEntries() +
1517                           " max entries = " +
1518                           bin.getMaxEntries() +
1519                           " full BIN entries = " +
1520                           bin.getFullBinNEntries() +
1521                           " full BIN max entries = " +
1522                           bin.getFullBinMaxEntries());
1523                 }
1524                 */
1525 
1526                 if (match) {
1527                     location.childLsn = bin.getLsn(index);
1528                     location.childLoggedSize = bin.getLastLoggedSize(index);
1529 
1530                     return true;
1531 
1532                 } else {
1533 
1534                     if (bin.isBINDelta() &&
1535                         (!blindDeltaOps ||
1536                          bin.getNEntries() >= bin.getMaxEntries())) {
1537 
1538                         bin.mutateToFullBIN();
1539                         location.reset();
1540                         continue;
1541                     }
1542 
1543                     return false;
1544                 }
1545             }
1546 
1547         } catch (RuntimeException e) {
1548             bin.releaseLatch();
1549             location.bin = null;
1550             throw e;
1551         }
1552     }
1553 
1554     /**
1555      * Find the BIN that is relevant to the insert.  If the tree doesn't exist
1556      * yet, then create the first IN and BIN.  On return, the cursor is set to
1557      * the BIN that is found or created, and the BIN is latched.
1558      */
findBinForInsert(final byte[] key, final CacheMode cacheMode)1559     public BIN findBinForInsert(final byte[] key, final CacheMode cacheMode) {
1560 
1561         boolean rootLatchIsHeld = false;
1562         BIN bin = null;
1563 
1564         try {
1565             long logLsn;
1566 
1567             /*
1568              * We may have to try several times because of a small
1569              * timing window, explained below.
1570              */
1571             while (true) {
1572 
1573                 rootLatchIsHeld = true;
1574                 rootLatch.acquireShared();
1575 
1576                 if (!rootExists()) {
1577 
1578                     rootLatch.release();
1579                     rootLatch.acquireExclusive();
1580                     if (rootExists()) {
1581                         rootLatch.release();
1582                         rootLatchIsHeld = false;
1583                         continue;
1584                     }
1585 
1586                     final EnvironmentImpl env = database.getEnv();
1587                     final LogManager logManager = env.getLogManager();
1588                     final INList inMemoryINs = env.getInMemoryINs();
1589                     final Evictor evictor = env.getEvictor();
1590 
1591                     /*
1592                      * This is an empty tree, either because it's brand new
1593                      * tree or because everything in it was deleted. Create an
1594                      * IN and a BIN.  We could latch the rootIN here, but
1595                      * there's no reason to since we're just creating the
1596                      * initial tree and we have the rootLatch held. Log the
1597                      * nodes as soon as they're created, but remember that
1598                      * referred-to children must come before any references to
1599                      * their LSNs.
1600                      */
1601 
1602                     /* First BIN in the tree, log provisionally right away. */
1603                     bin = new BIN(database, key, maxTreeEntriesPerNode, 1);
1604                     bin.latch(cacheMode);
1605                     logLsn = bin.optionalLogProvisional(logManager, null);
1606 
1607                     /*
1608                      * Log the root right away. Leave the root dirty, because
1609                      * the MapLN is not being updated, and we want to avoid
1610                      * this scenario from [#13897], where the LN has no
1611                      * possible parent.
1612                      *  provisional BIN
1613                      *  root IN
1614                      *  checkpoint start
1615                      *  LN is logged
1616                      *  checkpoint end
1617                      *  BIN is dirtied, but is not part of checkpoint
1618                      */
1619 
1620                     IN rootIN =
1621                         new IN(database, key, maxTreeEntriesPerNode, 2);
1622 
1623                     /*
1624                      * OK to latch the root after a child BIN because it's
1625                      * during creation.
1626                      */
1627                     rootIN.latch(cacheMode);
1628                     rootIN.setIsRoot(true);
1629 
1630                     boolean insertOk = rootIN.insertEntry(bin, key, logLsn);
1631                     assert insertOk;
1632 
1633                     logLsn = rootIN.optionalLog(logManager);
1634                     rootIN.setDirty(true);  /*force re-logging, see [#13897]*/
1635 
1636                     root = makeRootChildReference(rootIN, new byte[0], logLsn);
1637 
1638                     rootIN.releaseLatch();
1639 
1640                     /* Add the new nodes to the in memory list. */
1641                     inMemoryINs.add(bin);
1642                     inMemoryINs.add(rootIN);
1643                     evictor.addBack(bin);
1644 
1645                     rootLatch.release();
1646                     rootLatchIsHeld = false;
1647 
1648                     break;
1649                 } else {
1650                     rootLatch.release();
1651                     rootLatchIsHeld = false;
1652 
1653                     /*
1654                      * There's a tree here, so search for where we should
1655                      * insert. However, note that a window exists after we
1656                      * release the root latch. We release the latch because the
1657                      * search method expects to take the latch. After the
1658                      * release and before search, the INCompressor may come in
1659                      * and delete the entire tree, so search may return with a
1660                      * null.
1661                      */
1662                     bin = searchSplitsAllowed(key, cacheMode);
1663 
1664                     if (bin == null) {
1665                         /* The tree was deleted by the INCompressor. */
1666                         continue;
1667                     } else {
1668                         /* search() found a BIN where this key belongs. */
1669                         break;
1670                     }
1671                 }
1672             }
1673         } finally {
1674             if (rootLatchIsHeld) {
1675                 rootLatch.release();
1676             }
1677         }
1678 
1679         /* testing hook to insert item into log. */
1680         assert TestHookExecute.doHookIfSet(ckptHook);
1681 
1682         return bin;
1683     }
1684 
1685     /**
1686      * Do a key based search, permitting pre-emptive splits. Returns the
1687      * target node's parent.
1688      */
searchSplitsAllowed(byte[] key, CacheMode cacheMode)1689     public BIN searchSplitsAllowed(byte[] key, CacheMode cacheMode) {
1690 
1691         return searchSplitsAllowed(key, cacheMode, null);
1692     }
1693 
1694 
searchSplitsAllowed( byte[] key, CacheMode cacheMode, Comparator<byte[]> comparator)1695     private BIN searchSplitsAllowed(
1696         byte[] key,
1697         CacheMode cacheMode,
1698         Comparator<byte[]> comparator) {
1699 
1700         BIN insertTarget = null;
1701 
1702         while (insertTarget == null) {
1703 
1704             rootLatch.acquireShared();
1705 
1706             boolean rootLatched = true;
1707             boolean rootINLatched = false;
1708             boolean success = false;
1709             IN rootIN = null;
1710 
1711             /*
1712              * Latch the rootIN, check if it needs splitting. If so split it
1713              * and update the associated MapLN. To update the MapLN, we must
1714              * lock it, which implies that all latches must be released prior
1715              * to the lock, and as a result, the root may require splitting
1716              * again or may be split by another thread. So we must restart
1717              * the loop to get the latest root.
1718              */
1719             try {
1720                 if (!rootExists()) {
1721                     return null;
1722                 }
1723 
1724                 rootIN = (IN) root.fetchTarget(database, null);
1725 
1726                 if (rootIN.needsSplitting()) {
1727 
1728                     rootLatch.release();
1729                     rootLatch.acquireExclusive();
1730 
1731                     if (!rootExists()) {
1732                         return null;
1733                     }
1734 
1735                     rootIN = (IN) root.fetchTarget(database, null);
1736 
1737                     if (rootIN.needsSplitting()) {
1738 
1739                         splitRoot(cacheMode);
1740 
1741                         rootLatch.release();
1742                         rootLatched = false;
1743 
1744                         EnvironmentImpl env = database.getEnv();
1745                         env.getDbTree().optionalModifyDbRoot(database);
1746 
1747                         continue;
1748                     }
1749                 }
1750 
1751                 rootIN.latchShared(cacheMode);
1752                 rootINLatched = true;
1753                 success = true;
1754 
1755             } finally {
1756                 if (!success && rootINLatched) {
1757                     rootIN.releaseLatch();
1758                 }
1759                 if (rootLatched) {
1760                     rootLatch.release();
1761                 }
1762             }
1763 
1764             /*
1765              * Now, search the tree, doing splits if required. The rootIN
1766              * is latched in SH mode, but this.root is not latched. If any
1767              * splits are needed, this.root will first be latched exclusivelly
1768              * and will stay latched until all splits are done.
1769              */
1770             try {
1771                 assert(rootINLatched);
1772 
1773                 insertTarget = searchSplitsAllowed(
1774                     rootIN, key, cacheMode, comparator);
1775 
1776                 if (insertTarget == null) {
1777                     if (LatchSupport.TRACK_LATCHES) {
1778                         LatchSupport.expectBtreeLatchesHeld(0);
1779                     }
1780                     relatchesRequired.increment();
1781                     database.getEnv().incRelatchesRequired();
1782                 }
1783             } catch (SplitRequiredException e) {
1784 
1785                 /*
1786                  * The last slot in the root was used at the point when this
1787                  * thread released the rootIN latch in order to force splits.
1788                  * Retry. SR [#11147].
1789                  */
1790                 continue;
1791             }
1792         }
1793 
1794         return insertTarget;
1795     }
1796 
1797     /**
1798      * Search the tree, permitting preemptive splits.
1799      *
1800      * When this returns, parent will be unlatched unless parent is the
1801      * returned IN.
1802      */
searchSplitsAllowed( IN rootIN, byte[] key, CacheMode cacheMode, Comparator<byte[]> comparator)1803     private BIN searchSplitsAllowed(
1804         IN rootIN,
1805         byte[] key,
1806         CacheMode cacheMode,
1807         Comparator<byte[]> comparator)
1808         throws SplitRequiredException {
1809 
1810         assert(rootIN.isLatchOwner());
1811         if (!rootIN.isRoot()) {
1812             throw EnvironmentFailureException.unexpectedState(
1813                 "A null or non-root IN was given as the parent");
1814         }
1815 
1816         int index;
1817         IN parent = rootIN;
1818         IN child = null;
1819         boolean success = false;
1820 
1821         /*
1822          * Search downward until we hit a node that needs a split. In that
1823          * case, retreat to the top of the tree and force splits downward.
1824          */
1825         try {
1826             do {
1827                 if (parent.getNEntries() == 0) {
1828                     throw EnvironmentFailureException.unexpectedState(
1829                         "Found upper IN with 0 entries");
1830                 }
1831 
1832                 index = parent.findEntry(key, false, false, comparator);
1833                 assert index >= 0;
1834 
1835                 child = parent.fetchINWithNoLatch(index, key);
1836 
1837                 if (child == null) {
1838                     return null; // restart the search
1839                 }
1840 
1841                 /* if child is a BIN, it is actually EX-latched */
1842                 child.latchShared(cacheMode);
1843 
1844                 /*
1845                  * If we need to split, try compressing first and
1846                  * check again. If we still need to split, throw
1847                  * exception.
1848                  */
1849                 if (child.needsSplitting()) {
1850 
1851                     database.getEnv().lazyCompress(child);
1852 
1853                     if (child.needsSplitting()) {
1854 
1855                         child.releaseLatch();
1856                         parent.releaseLatch();
1857 
1858                         /* SR [#11144]*/
1859                         assert TestHookExecute.doHookIfSet(waitHook);
1860 
1861                         /*
1862                          * forceSplit may throw SplitRequiredException if it
1863                          * finds that the root needs splitting. Allow the
1864                          * exception to propagate up to the caller, who will
1865                          * do the root split. Otherwise, restart the search
1866                          * from the root IN again.
1867                          */
1868                         rootIN = forceSplit(key, cacheMode);
1869                         parent = rootIN;
1870 
1871                         assert(rootIN.isLatchOwner());
1872                         if (!rootIN.isRoot()) {
1873                             throw EnvironmentFailureException.unexpectedState(
1874                                 "A null or non-root IN was given as the parent");
1875                         }
1876                         continue;
1877                     }
1878                 }
1879 
1880                 /* Continue down a level */
1881                 parent.releaseLatch();
1882                 parent = child;
1883                 child = null;
1884 
1885             } while (!parent.isBIN());
1886 
1887             success = true;
1888             return (BIN)parent;
1889 
1890         } finally {
1891             if (!success) {
1892                 if (child != null && child.isLatchOwner()) {
1893                     child.releaseLatch();
1894                 }
1895                 if (parent != child && parent.isLatchOwner()) {
1896                     parent.releaseLatch();
1897                 }
1898             }
1899         }
1900     }
1901 
1902     /**
1903      * Do pre-emptive splitting: search down the tree until we get to the BIN
1904      * level, and split any nodes that fit the splittable requirement except
1905      * for the root. If the root needs splitting, a splitRequiredException
1906      * is thrown and the root split is handled at a higher level.
1907      *
1908      * Note that more than one node in the path may be splittable. For example,
1909      * a tree might have a level2 IN and a BIN that are both splittable, and
1910      * would be encountered by the same insert operation.
1911      *
1912      * Splits cause INs to be logged in all ancestors, including the root. This
1913      * is to avoid the "great aunt" problem described in LevelRecorder.
1914      *
1915      * INs below the root are logged provisionally; only the root is logged
1916      * non-provisionally. Provisional logging is necessary during a checkpoint
1917      * for levels less than maxFlushLevel.
1918      *
1919      * This method acquires and holds this.rootLatch in EX mode during its
1920      * whole duration (so splits are serialized). The rootLatch is released
1921      * on return.
1922      *
1923      * @return the tree root node, latched in EX mode. This may be different
1924      * than the tree root when this method was called, because no latches are
1925      * held on entering this method.
1926      *
1927      * All latches are released in case of exception.
1928      */
forceSplit(byte[] key, CacheMode cacheMode)1929     private IN forceSplit(byte[] key, CacheMode cacheMode)
1930         throws DatabaseException, SplitRequiredException {
1931 
1932         final ArrayList<SplitInfo> nodeLadder = new ArrayList<SplitInfo>();
1933 
1934         boolean allLeftSideDescent = true;
1935         boolean allRightSideDescent = true;
1936         int index;
1937         IN parent;
1938         IN child = null;
1939         IN rootIN = null;
1940 
1941         /*
1942          * Latch the root in order to update the root LSN when we're done.
1943          * Latch order must be: root, root IN. We'll leave this method with the
1944          * original parent latched.
1945          */
1946         rootLatch.acquireExclusive();
1947 
1948         boolean success = false;
1949         try {
1950             /* The root IN may have been evicted. [#16173] */
1951             rootIN = (IN) root.fetchTarget(database, null);
1952             parent = rootIN;
1953             parent.latch(cacheMode);
1954 
1955             /*
1956              * Another thread may have crept in and
1957              *  - used the last free slot in the parent, making it impossible
1958              *    to correctly propagate the split.
1959              *  - actually split the root, in which case we may be looking at
1960              *    the wrong subtree for this search.
1961              * If so, throw and retry from above. SR [#11144]
1962              */
1963             if (rootIN.needsSplitting()) {
1964                 throw splitRequiredException;
1965             }
1966 
1967             /*
1968              * Search downward to the BIN level, saving the information
1969              * needed to do a split if necessary.
1970              */
1971             do {
1972                 if (parent.getNEntries() == 0) {
1973                     throw EnvironmentFailureException.unexpectedState(
1974                         "Found upper IN with 0 entries");
1975                 }
1976 
1977                 /* Look for the entry matching key in the current node. */
1978                 index = parent.findEntry(key, false, false);
1979                 assert index >= 0;
1980                 if (index != 0) {
1981                     allLeftSideDescent = false;
1982                 }
1983                 if (index != (parent.getNEntries() - 1)) {
1984                     allRightSideDescent = false;
1985                 }
1986 
1987                 /*
1988                  * Get the child node that matches. We only need to work on
1989                  * nodes in residence.
1990                  */
1991                 if (!parent.isResident(index)) {
1992                     child = null;
1993                     break;
1994                 }
1995 
1996                 child = (IN)parent.getTarget(index);
1997 
1998                 child.latch(cacheMode);
1999 
2000                 nodeLadder.add(new SplitInfo(parent, child, index));
2001 
2002                 /* Continue down a level */
2003                 parent = child;
2004             } while (!parent.isBIN());
2005 
2006             boolean startedSplits = false;
2007             LogManager logManager = database.getEnv().getLogManager();
2008 
2009             /*
2010              * Process the accumulated nodes from the bottom up. Split each
2011              * node if required. If the node should not split, we check if
2012              * there have been any splits on the ladder yet. If there are none,
2013              * we merely release the node, since there is no update.  If splits
2014              * have started, we need to propagate new LSNs upward, so we log
2015              * the node and update its parent.
2016              */
2017             long lastParentForSplit = Node.NULL_NODE_ID;
2018 
2019             for (int i = nodeLadder.size() - 1; i >= 0; i -= 1) {
2020                 final SplitInfo info = nodeLadder.get(i);
2021 
2022                 child = info.child;
2023                 parent = info.parent;
2024                 index = info.index;
2025 
2026                 /* Opportunistically split the node if it is full. */
2027                 if (child.needsSplitting()) {
2028 
2029                     child.mutateToFullBIN();
2030 
2031                     final IN grandParent =
2032                         (i > 0) ? nodeLadder.get(i - 1).parent : null;
2033 
2034                     if (allLeftSideDescent || allRightSideDescent) {
2035                         child.splitSpecial(
2036                             parent, index, grandParent, maxTreeEntriesPerNode,
2037                             key, allLeftSideDescent, cacheMode);
2038                     } else {
2039                         child.split(
2040                             parent, index, grandParent, maxTreeEntriesPerNode,
2041                             cacheMode);
2042                     }
2043 
2044                     lastParentForSplit = parent.getNodeId();
2045                     startedSplits = true;
2046 
2047                     /*
2048                      * If the DB root IN was logged, update the DB tree's child
2049                      * reference. Now the MapLN is logically dirty. Be sure to
2050                      * flush the MapLN if we ever evict the root.
2051                      */
2052                     if (parent.isDbRoot()) {
2053                         root.updateLsnAfterOptionalLog(
2054                             database, parent.getLastLoggedVersion());
2055                     }
2056                 } else {
2057                     if (startedSplits) {
2058                         final long newChildLsn;
2059 
2060                         /*
2061                          * If this child was the parent of a split, it's
2062                          * already logged by the split call. We just need to
2063                          * propagate the logging upwards. If this child is just
2064                          * a link in the chain upwards, log it.
2065                          */
2066                         if (lastParentForSplit == child.getNodeId()) {
2067                             newChildLsn = child.getLastLoggedVersion();
2068                         } else {
2069                             newChildLsn = child.optionalLogProvisional(
2070                                 logManager, parent);
2071                         }
2072 
2073                         parent.updateEntry(
2074                             index, newChildLsn, 0 /*lastLoggedSize*/);
2075 
2076                         /*
2077                          * The root is never a 'child' in nodeLadder so it must
2078                          * be logged separately.
2079                          */
2080                         if (parent.isDbRoot()) {
2081 
2082                             final long newRootLsn =
2083                                 parent.optionalLog(logManager);
2084 
2085                             root.updateLsnAfterOptionalLog(
2086                                 database, newRootLsn);
2087                         }
2088                     }
2089                 }
2090                 child.releaseLatch();
2091                 child = null;
2092             }
2093             success = true;
2094         } finally {
2095             if (!success) {
2096                 if (child != null) {
2097                     child.releaseLatchIfOwner();
2098                 }
2099 
2100                 for (SplitInfo info : nodeLadder) {
2101                     info.child.releaseLatchIfOwner();
2102                 }
2103 
2104                 if (rootIN != null) {
2105                     rootIN.releaseLatchIfOwner();
2106                 }
2107             }
2108 
2109             rootLatch.release();
2110         }
2111 
2112         return rootIN;
2113     }
2114 
2115     /**
2116      * Split the root of the tree.
2117      */
splitRoot(CacheMode cacheMode)2118     private void splitRoot(CacheMode cacheMode)
2119         throws DatabaseException {
2120 
2121         /*
2122          * Create a new root IN, insert the current root IN into it, and then
2123          * call split.
2124          */
2125         EnvironmentImpl env = database.getEnv();
2126         LogManager logManager = env.getLogManager();
2127         INList inMemoryINs = env.getInMemoryINs();
2128 
2129         IN curRoot = null;
2130         curRoot = (IN) root.fetchTarget(database, null);
2131         curRoot.latch(cacheMode);
2132         long curRootLsn = 0;
2133         long logLsn = 0;
2134         IN newRoot = null;
2135         try {
2136 
2137             /*
2138              * Make a new root IN, giving it an id key from the previous root.
2139              */
2140             byte[] rootIdKey = curRoot.getKey(0);
2141             newRoot = new IN(database, rootIdKey, maxTreeEntriesPerNode,
2142                              curRoot.getLevel() + 1);
2143             newRoot.latch(cacheMode);
2144             newRoot.setIsRoot(true);
2145             curRoot.setIsRoot(false);
2146 
2147             /*
2148              * Make the new root IN point to the old root IN. Log the old root
2149              * provisionally, because we modified it so it's not the root
2150              * anymore, then log the new root. We are guaranteed to be able to
2151              * insert entries, since we just made this root.
2152              */
2153             boolean logSuccess = false;
2154             try {
2155                 curRootLsn = curRoot.optionalLogProvisional(
2156                     logManager, newRoot);
2157 
2158                 boolean inserted = newRoot.insertEntry(
2159                     curRoot, rootIdKey, curRootLsn);
2160                 assert inserted;
2161 
2162                 logLsn = newRoot.optionalLog(logManager);
2163                 logSuccess = true;
2164             } finally {
2165                 if (!logSuccess) {
2166                     /* Something went wrong when we tried to log. */
2167                     curRoot.setIsRoot(true);
2168                 }
2169             }
2170 
2171             inMemoryINs.add(newRoot);
2172 
2173             /*
2174              * Don't add the new root into the LRU because it has a cached
2175              * child.
2176              */
2177 
2178             /*
2179              * Make the tree's root reference point to this new node. Now the
2180              * MapLN is logically dirty, but the change hasn't been logged.  Be
2181              * sure to flush the MapLN if we ever evict the root.
2182              */
2183             root.setTarget(newRoot);
2184             root.updateLsnAfterOptionalLog(database, logLsn);
2185             curRoot.split(newRoot, 0, null, maxTreeEntriesPerNode, cacheMode);
2186             root.setLsn(newRoot.getLastLoggedVersion());
2187 
2188         } finally {
2189             /* FindBugs ignore possible null pointer dereference of newRoot. */
2190             newRoot.releaseLatch();
2191             curRoot.releaseLatch();
2192         }
2193         rootSplits.increment();
2194         traceSplitRoot(Level.FINE, TRACE_ROOT_SPLIT, newRoot, logLsn,
2195                        curRoot, curRootLsn);
2196     }
2197 
search(byte[] key, CacheMode cacheMode)2198     public BIN search(byte[] key, CacheMode cacheMode) {
2199 
2200         return search(key, SearchType.NORMAL, null, cacheMode, null);
2201     }
2202 
2203     /**
2204      * Search the tree, starting at the root. Depending on search type either
2205      * (a) search for the BIN that *should* contain a given key, or (b) return
2206      * the right-most or left-most BIN in the tree.
2207      *
2208      * Preemptive splitting is not done during the search.
2209      *
2210      * @param key - the key to search for, or null if searchType is LEFT or
2211      * RIGHT.
2212      *
2213      * @param searchType - The type of tree search to perform.  NORMAL means
2214      * we're searching for key in the tree.  LEFT/RIGHT means we're descending
2215      * down the left or right side, resp.
2216      *
2217      * @param binBoundary - If non-null, information is returned about whether
2218      * the BIN found is the first or last BIN in the database.
2219      *
2220      * @return - the BIN that matches the criteria, if any. Returns null if
2221      * the root is null. BIN is latched (unless it's null) and must be
2222      * unlatched by the caller.  In a NORMAL search, it is the caller's
2223      * responsibility to do the findEntry() call on the key and BIN to locate
2224      * the entry (if any) that matches key.
2225      */
search( byte[] key, SearchType searchType, BINBoundary binBoundary, CacheMode cacheMode, Comparator<byte[]> comparator)2226     public BIN search(
2227         byte[] key,
2228         SearchType searchType,
2229         BINBoundary binBoundary,
2230         CacheMode cacheMode,
2231         Comparator<byte[]> comparator) {
2232 
2233         IN rootIN = getRootIN(cacheMode);
2234 
2235         if (rootIN == null) {
2236             return null;
2237         }
2238 
2239         if ((searchType == SearchType.LEFT ||
2240              searchType == SearchType.RIGHT) &&
2241             key != null) {
2242 
2243             /*
2244              * If caller is asking for a right or left search, they shouldn't
2245              * be passing us a key.
2246              */
2247             throw EnvironmentFailureException.unexpectedState(
2248                 "searchSubTree passed key and left/right search");
2249         }
2250 
2251         if (binBoundary != null) {
2252             binBoundary.isLastBin = true;
2253             binBoundary.isFirstBin = true;
2254         }
2255 
2256         boolean success = false;
2257         int index;
2258         IN parent = rootIN;
2259         IN child = null;
2260 
2261         TreeWalkerStatsAccumulator treeStatsAccumulator =
2262             getTreeStatsAccumulator();
2263 
2264         try {
2265             if (treeStatsAccumulator != null) {
2266                 parent.accumulateStats(treeStatsAccumulator);
2267             }
2268 
2269             do {
2270                 if (parent.getNEntries() == 0) {
2271                     throw EnvironmentFailureException.unexpectedState(
2272                         "Upper IN with 0 entries");
2273                 }
2274 
2275                 if (searchType == SearchType.NORMAL) {
2276                     index = parent.findEntry(key, false, false, comparator);
2277 
2278                 } else if (searchType == SearchType.LEFT) {
2279                     index = 0;
2280 
2281                 } else if (searchType == SearchType.RIGHT) {
2282                     index = parent.getNEntries() - 1;
2283 
2284                 } else {
2285                     throw EnvironmentFailureException.unexpectedState(
2286                         "Invalid value of searchType: " + searchType);
2287                 }
2288 
2289                 assert(index >= 0);
2290 
2291                 if (binBoundary != null) {
2292                     if (index != parent.getNEntries() - 1) {
2293                         binBoundary.isLastBin = false;
2294                     }
2295                     if (index != 0) {
2296                         binBoundary.isFirstBin = false;
2297                     }
2298                 }
2299 
2300                 child = parent.fetchINWithNoLatch(index, key);
2301 
2302                 if (child == null) {
2303                     parent = getRootIN(cacheMode);
2304                     assert(parent != null);
2305                     if (treeStatsAccumulator != null) {
2306                         parent.accumulateStats(treeStatsAccumulator);
2307                     }
2308                     continue;
2309                 }
2310 
2311                 /* Latch the child. Note: BINs are always latched exclusive. */
2312                 child.latchShared(cacheMode);
2313 
2314                 if (treeStatsAccumulator != null) {
2315                     child.accumulateStats(treeStatsAccumulator);
2316                 }
2317 
2318                 parent.releaseLatch();
2319                 parent = child;
2320                 child = null;
2321 
2322             } while (!parent.isBIN());
2323 
2324             success = true;
2325             return (BIN)parent;
2326 
2327         } finally {
2328             if (!success) {
2329 
2330                 /*
2331                  * In [#14903] we encountered a latch exception below and the
2332                  * original exception was lost.  Print the stack trace and
2333                  * allow the original exception to be thrown if this happens
2334                  * again, to get more information about the problem.
2335                  */
2336                 try {
2337                     if (child != null && child.isLatchOwner()) {
2338                         child.releaseLatch();
2339                     }
2340 
2341                     if (parent != child && parent.isLatchOwner()) {
2342                         parent.releaseLatch();
2343                     }
2344                 } catch (Exception e) {
2345                     LoggerUtils.traceAndLogException(
2346                         database.getEnv(), "Tree", "searchSubTreeInternal", "",
2347                         e);
2348                 }
2349             }
2350         }
2351     }
2352 
searchSubTree( IN parent, byte[] key, SearchType searchType, int targetLevel, CacheMode cacheMode, Comparator<byte[]> comparator)2353     private IN searchSubTree(
2354         IN parent,
2355         byte[] key,
2356         SearchType searchType,
2357         int targetLevel,
2358         CacheMode cacheMode,
2359         Comparator<byte[]> comparator) {
2360 
2361         /*
2362          * If a an intermediate IN (e.g., from getNextIN) was
2363          * originally passed, it was latched exclusively.
2364          */
2365         assert(parent != null &&
2366                (parent.isRoot() ||
2367                 parent.isLatchExclusiveOwner()));
2368 
2369         if ((searchType == SearchType.LEFT ||
2370              searchType == SearchType.RIGHT) &&
2371             key != null) {
2372 
2373             /*
2374              * If caller is asking for a right or left search, they shouldn't
2375              * be passing us a key.
2376              */
2377             throw EnvironmentFailureException.unexpectedState(
2378                 "searchSubTree passed key and left/right search");
2379         }
2380 
2381         assert(parent.isUpperIN());
2382         assert(parent.isLatchOwner());
2383 
2384         boolean success = false;
2385         int index;
2386         IN subtreeRoot = parent;
2387         IN child = null;
2388         IN grandParent = null;
2389         boolean childIsLatched = false;
2390         boolean grandParentIsLatched = false;
2391         boolean doGrandparentLatching = !parent.isLatchExclusiveOwner();
2392 
2393         TreeWalkerStatsAccumulator treeStatsAccumulator =
2394             getTreeStatsAccumulator();
2395 
2396         try {
2397             do {
2398                 if (treeStatsAccumulator != null) {
2399                     parent.accumulateStats(treeStatsAccumulator);
2400                 }
2401 
2402                 assert(parent.getNEntries() > 0);
2403 
2404                 if (searchType == SearchType.NORMAL) {
2405                     /* Look for the entry matching key in the current node. */
2406                     index = parent.findEntry(key, false, false, comparator);
2407                 } else if (searchType == SearchType.LEFT) {
2408                     /* Left search, always take the 0th entry. */
2409                     index = 0;
2410                 } else if (searchType == SearchType.RIGHT) {
2411                     /* Right search, always take the highest entry. */
2412                     index = parent.getNEntries() - 1;
2413                 } else {
2414                     throw EnvironmentFailureException.unexpectedState(
2415                         "Invalid value of searchType: " + searchType);
2416                 }
2417 
2418                 assert(index >= 0);
2419 
2420                 /*
2421                  * Get the child IN.
2422                  *
2423                  * If the child is not cached and we are usimg grandparent
2424                  * latching, then:
2425                  *
2426                  * (a) If "parent" is not the subtree root, is is always
2427                  * SH-latched at this point. So, to fetch the child, we need to
2428                  * unlatch the parent and relatch it exclusively. Because we
2429                  * have the grandparent latch (in either SH or EX mode), the
2430                  * parent will not be evicted or detached from the tree and the
2431                  * index of the child within the parent won't change. After
2432                  * the parent is EX-latched, we can release the grandparent so
2433                  *. it won't be held while reading the child from the log.
2434                  *
2435                  * (b) If "parent" is the BTree root, it may be SH-latched. In
2436                  * this case, since there is no grandparent, we must unlatch
2437                  * the parent and relatch it in EX mode under the protection
2438                  * of the rootLatch; then we restart the do-loop.
2439                  *
2440                  * (c) If "parent" is the subtree root, but not the root of
2441                  * the full Btree, then it must be EX-latched already, and
2442                  * we can just fetch the child.
2443                  */
2444                 child = (IN) parent.getTarget(index);
2445 
2446                 if (child == null && doGrandparentLatching) {
2447 
2448                     if (parent != subtreeRoot) {
2449 
2450                         assert(!parent.isLatchExclusiveOwner());
2451                         parent.releaseLatch();
2452                         parent.latch(cacheMode);
2453                         grandParent.releaseLatch();
2454                         grandParentIsLatched = false;
2455                         grandParent = null;
2456                         doGrandparentLatching = false;
2457 
2458                     } else if (parent.isRoot() &&
2459                                !parent.isLatchExclusiveOwner()) {
2460 
2461                         parent.releaseLatch();
2462                         subtreeRoot = getRootINLatchedExclusive(cacheMode);
2463                         parent = subtreeRoot;
2464                         assert(parent != null);
2465                         assert(grandParent == null);
2466                         doGrandparentLatching = false;
2467 
2468                         continue;
2469                     }
2470 
2471                     child = parent.fetchIN(index);
2472 
2473                 } else if (child == null) {
2474 
2475                     child = parent.fetchIN(index);
2476                 }
2477 
2478                 /* After fetching the child we can release the grandparent. */
2479                 if (grandParent != null) {
2480                     grandParent.releaseLatch();
2481                     grandParentIsLatched = false;
2482                 }
2483 
2484                 /* Latch the child. Note: BINs are always latched exclusive. */
2485                 if (doGrandparentLatching && child.getLevel() != targetLevel) {
2486                     child.latchShared(cacheMode);
2487                 } else {
2488                     child.latch(cacheMode);
2489                 }
2490                 childIsLatched = true;
2491 
2492                 child.mutateToFullBIN();
2493 
2494                 if (treeStatsAccumulator != null) {
2495                     child.accumulateStats(treeStatsAccumulator);
2496                 }
2497 
2498                 /* Continue down a level */
2499                 if (doGrandparentLatching) {
2500                     grandParent = parent;
2501                     grandParentIsLatched = true;
2502                 } else {
2503                     parent.releaseLatch();
2504                 }
2505                 parent = child;
2506 
2507             } while (!parent.isBIN() && parent.getLevel() != targetLevel);
2508 
2509             success = true;
2510             return child;
2511 
2512         } finally {
2513             if (!success) {
2514 
2515                 /*
2516                  * In [#14903] we encountered a latch exception below and the
2517                  * original exception was lost.  Print the stack trace and
2518                  * allow the original exception to be thrown if this happens
2519                  * again, to get more information about the problem.
2520                  */
2521                 try {
2522                     if (child != null && childIsLatched) {
2523                         child.releaseLatch();
2524                     }
2525 
2526                     if (parent != child) {
2527                         parent.releaseLatch();
2528                     }
2529                 } catch (Exception e) {
2530                     LoggerUtils.traceAndLogException(
2531                         database.getEnv(), "Tree", "searchSubTreeInternal", "",
2532                         e);
2533                 }
2534             }
2535 
2536             if (grandParent != null && grandParentIsLatched) {
2537                 grandParent.releaseLatch();
2538             }
2539         }
2540     }
2541 
2542     /**
2543      * rebuildINList is used by recovery to add all the resident nodes to the
2544      * IN list.
2545      */
rebuildINList()2546     public void rebuildINList()
2547         throws DatabaseException {
2548 
2549         INList inMemoryList = database.getEnv().getInMemoryINs();
2550 
2551         if (root != null) {
2552             rootLatch.acquireShared();
2553             try {
2554                 Node rootIN = root.getTarget();
2555                 if (rootIN != null) {
2556                     rootIN.rebuildINList(inMemoryList);
2557                 }
2558             } finally {
2559                 rootLatch.release();
2560             }
2561         }
2562     }
2563 
2564     /**
2565      * Debugging check that all resident nodes are on the INList and no stray
2566      * nodes are present in the unused portion of the IN arrays.
2567      */
validateINList(IN parent)2568     public void validateINList(IN parent)
2569         throws DatabaseException {
2570 
2571         if (parent == null) {
2572             parent = (IN) root.getTarget();
2573         }
2574 
2575         if (parent != null) {
2576             INList inList = database.getEnv().getInMemoryINs();
2577 
2578             if (!inList.contains(parent)) {
2579                 throw EnvironmentFailureException.unexpectedState(
2580                     "IN " + parent.getNodeId() + " missing from INList");
2581             }
2582 
2583             for (int i = 0;; i += 1) {
2584                 try {
2585                     Node node = parent.getTarget(i);
2586 
2587                     if (i >= parent.getNEntries()) {
2588                         if (node != null) {
2589                             throw EnvironmentFailureException.unexpectedState(
2590                                 "IN " + parent.getNodeId() +
2591                                 " has stray node " + node +
2592                                 " at index " + i);
2593                         }
2594                         byte[] key = parent.getKey(i);
2595                         if (key != null) {
2596                             throw EnvironmentFailureException.unexpectedState(
2597                                "IN " + parent.getNodeId() +
2598                                " has stray key " + key +
2599                                " at index " + i);
2600                         }
2601                     }
2602 
2603                     if (node instanceof IN) {
2604                         validateINList((IN) node);
2605                     }
2606                 } catch (ArrayIndexOutOfBoundsException e) {
2607                     break;
2608                 }
2609             }
2610         }
2611     }
2612 
2613     /*
2614      * Logging support
2615      */
2616 
2617     /**
2618      * @see Loggable#getLogSize
2619      */
getLogSize()2620     public int getLogSize() {
2621         int size = 1;                          // rootExists
2622         if (root != null) {
2623             size += root.getLogSize();
2624         }
2625         return size;
2626     }
2627 
2628     /**
2629      * @see Loggable#writeToLog
2630      */
writeToLog(ByteBuffer logBuffer)2631     public void writeToLog(ByteBuffer logBuffer) {
2632         byte booleans = (byte) ((root != null) ? 1 : 0);
2633         logBuffer.put(booleans);
2634         if (root != null) {
2635             root.writeToLog(logBuffer);
2636         }
2637     }
2638 
2639     /**
2640      * @see Loggable#readFromLog
2641      */
readFromLog(ByteBuffer itemBuffer, int entryVersion)2642     public void readFromLog(ByteBuffer itemBuffer, int entryVersion) {
2643         boolean rootExists = false;
2644         byte booleans = itemBuffer.get();
2645         rootExists = (booleans & 1) != 0;
2646         if (rootExists) {
2647             root = makeRootChildReference();
2648             root.readFromLog(itemBuffer, entryVersion);
2649         }
2650     }
2651 
2652     /**
2653      * @see Loggable#dumpLog
2654      */
dumpLog(StringBuilder sb, boolean verbose)2655     public void dumpLog(StringBuilder sb, boolean verbose) {
2656         sb.append("<root>");
2657         if (root != null) {
2658             root.dumpLog(sb, verbose);
2659         }
2660         sb.append("</root>");
2661     }
2662 
2663     /**
2664      * @see Loggable#getTransactionId
2665      */
getTransactionId()2666     public long getTransactionId() {
2667         return 0;
2668     }
2669 
2670     /**
2671      * @see Loggable#logicalEquals
2672      * Always return false, this item should never be compared.
2673      */
logicalEquals(Loggable other)2674     public boolean logicalEquals(Loggable other) {
2675         return false;
2676     }
2677 
2678     /**
2679      * @return the TreeStats for this tree.
2680      */
getTreeStats()2681     int getTreeStats() {
2682         return rootSplits.get();
2683     }
2684 
getTreeStatsAccumulator()2685     private TreeWalkerStatsAccumulator getTreeStatsAccumulator() {
2686         if (EnvironmentImpl.getThreadLocalReferenceCount() > 0) {
2687             return treeStatsAccumulatorTL.get();
2688         } else {
2689             return null;
2690         }
2691     }
2692 
setTreeStatsAccumulator(TreeWalkerStatsAccumulator tSA)2693     public void setTreeStatsAccumulator(TreeWalkerStatsAccumulator tSA) {
2694         treeStatsAccumulatorTL.set(tSA);
2695     }
2696 
loadStats(StatsConfig config, BtreeStats btreeStats)2697     public void loadStats(StatsConfig config, BtreeStats btreeStats) {
2698         /* Add the tree statistics to BtreeStats. */
2699         btreeStats.setTreeStats(stats.cloneGroup(false));
2700 
2701         if (config.getClear()) {
2702             relatchesRequired.clear();
2703             rootSplits.clear();
2704         }
2705     }
2706 
2707     /*
2708      * Debugging stuff.
2709      */
dump()2710     public void dump() {
2711         System.out.println(dumpString(0));
2712     }
2713 
dumpString(int nSpaces)2714     public String dumpString(int nSpaces) {
2715         StringBuilder sb = new StringBuilder();
2716         sb.append(TreeUtils.indent(nSpaces));
2717         sb.append("<tree>");
2718         sb.append('\n');
2719         if (root != null) {
2720             sb.append(DbLsn.dumpString(root.getLsn(), nSpaces));
2721             sb.append('\n');
2722             IN rootIN = (IN) root.getTarget();
2723             if (rootIN == null) {
2724                 sb.append("<in/>");
2725             } else {
2726                 sb.append(rootIN.toString());
2727             }
2728             sb.append('\n');
2729         }
2730         sb.append(TreeUtils.indent(nSpaces));
2731         sb.append("</tree>");
2732         return sb.toString();
2733     }
2734 
2735     /**
2736      * Unit test support to validate subtree pruning. Didn't want to make root
2737      * access public.
2738      */
validateDelete(int index)2739     boolean validateDelete(int index)
2740         throws DatabaseException {
2741 
2742         rootLatch.acquireShared();
2743         try {
2744             IN rootIN = (IN) root.fetchTarget(database, null);
2745             rootIN.latch();
2746             try {
2747                 return rootIN.validateSubtreeBeforeDelete(index);
2748             } finally {
2749                 rootIN.releaseLatch();
2750             }
2751         } finally {
2752             rootLatch.release();
2753         }
2754     }
2755 
2756     /* For unit testing only. */
setWaitHook(TestHook hook)2757     public void setWaitHook(TestHook hook) {
2758         waitHook = hook;
2759     }
2760 
2761     /* For unit testing only. */
setSearchHook(TestHook hook)2762     public void setSearchHook(TestHook hook) {
2763         searchHook = hook;
2764     }
2765 
2766     /* For unit testing only. */
setCkptHook(TestHook hook)2767     public void setCkptHook(TestHook hook) {
2768         ckptHook = hook;
2769     }
2770 
2771     /* For unit testing only. */
setGetParentINHook(TestHook hook)2772     public void setGetParentINHook(TestHook hook) {
2773         getParentINHook = hook;
2774     }
2775 
setFetchINHook(TestHook hook)2776     public void setFetchINHook(TestHook hook) {
2777         fetchINHook = hook;
2778     }
2779     /**
2780      * Send trace messages to the java.util.logger. Don't rely on the logger
2781      * alone to conditionalize whether we send this message, we don't even want
2782      * to construct the message if the level is not enabled.
2783      */
traceSplitRoot(Level level, String splitType, IN newRoot, long newRootLsn, IN oldRoot, long oldRootLsn)2784     private void traceSplitRoot(Level level,
2785                                 String splitType,
2786                                 IN newRoot,
2787                                 long newRootLsn,
2788                                 IN oldRoot,
2789                                 long oldRootLsn) {
2790         Logger logger = database.getEnv().getLogger();
2791         if (logger.isLoggable(level)) {
2792             StringBuilder sb = new StringBuilder();
2793             sb.append(splitType);
2794             sb.append(" newRoot=").append(newRoot.getNodeId());
2795             sb.append(" newRootLsn=").
2796                 append(DbLsn.getNoFormatString(newRootLsn));
2797             sb.append(" oldRoot=").append(oldRoot.getNodeId());
2798             sb.append(" oldRootLsn=").
2799                 append(DbLsn.getNoFormatString(oldRootLsn));
2800             LoggerUtils.logMsg(
2801                 logger, database.getEnv(), level, sb.toString());
2802         }
2803     }
2804 
2805     private static class SplitInfo {
2806         IN parent;
2807         IN child;
2808         int index;
2809 
SplitInfo(IN parent, IN child, int index)2810         SplitInfo(IN parent, IN child, int index) {
2811             this.parent = parent;
2812             this.child = child;
2813             this.index = index;
2814         }
2815     }
2816 }
2817