1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2002, 2014 Oracle and/or its affiliates. All rights reserved. 5 * 6 */ 7 8 package com.sleepycat.je.tree; 9 10 import static com.sleepycat.je.EnvironmentFailureException.unexpectedState; 11 12 import java.io.FileNotFoundException; 13 import java.nio.ByteBuffer; 14 import java.util.Arrays; 15 import java.util.Comparator; 16 import java.util.logging.Level; 17 import java.util.logging.Logger; 18 19 import com.sleepycat.je.CacheMode; 20 import com.sleepycat.je.DatabaseException; 21 import com.sleepycat.je.EnvironmentFailureException; 22 import com.sleepycat.je.cleaner.LocalUtilizationTracker; 23 import com.sleepycat.je.cleaner.PackedObsoleteInfo; 24 import com.sleepycat.je.dbi.DatabaseId; 25 import com.sleepycat.je.dbi.DatabaseImpl; 26 import com.sleepycat.je.dbi.DbTree; 27 import com.sleepycat.je.dbi.EnvironmentFailureReason; 28 import com.sleepycat.je.dbi.EnvironmentImpl; 29 import com.sleepycat.je.dbi.INList; 30 import com.sleepycat.je.dbi.MemoryBudget; 31 import com.sleepycat.je.evictor.Evictor; 32 import com.sleepycat.je.latch.LatchContext; 33 import com.sleepycat.je.latch.LatchFactory; 34 import com.sleepycat.je.latch.LatchSupport; 35 import com.sleepycat.je.latch.LatchTable; 36 import com.sleepycat.je.latch.SharedLatch; 37 import com.sleepycat.je.log.LogEntryType; 38 import com.sleepycat.je.log.LogManager; 39 import com.sleepycat.je.log.LogUtils; 40 import com.sleepycat.je.log.Loggable; 41 import com.sleepycat.je.log.Provisional; 42 import com.sleepycat.je.log.ReplicationContext; 43 import com.sleepycat.je.log.WholeEntry; 44 import com.sleepycat.je.log.entry.INLogEntry; 45 import com.sleepycat.je.log.entry.LNLogEntry; 46 import com.sleepycat.je.log.entry.LogEntry; 47 import com.sleepycat.je.tree.dupConvert.DupConvert; 48 import com.sleepycat.je.utilint.DbLsn; 49 import com.sleepycat.je.utilint.LoggerUtils; 50 import com.sleepycat.je.utilint.SizeofMarker; 51 import 
com.sleepycat.je.utilint.TestHook; 52 import com.sleepycat.je.utilint.TestHookExecute; 53 54 /** 55 * An IN represents an Internal Node in the JE tree. 56 * 57 * Explanation of KD (KnownDeleted) and PD (PendingDelete) entry flags 58 * =================================================================== 59 * 60 * PD: set for all LN entries that are deleted, even before the LN is 61 * committed. Is used as an authoritative (transactionally correct) indication 62 * that an LN is deleted. PD will be cleared if the txn for the deleted LN is 63 * aborted. 64 * 65 * KD: set under special conditions for entries containing LNs which are known 66 * to be obsolete. Not used for entries in an active/uncommitted transaction. 67 * 68 * First notice that IN.fetchLN will allow a FileNotFoundException when the 69 * PD or KD flag is set on the entry. And it will allow a NULL_LSN when the KD 70 * flag is set. 71 * 72 * KD was implemented first, and was originally used when the cleaner attempts 73 * to migrate an LN and discovers it is deleted (see Cleaner.migrateLN). We 74 * need KD because the INCompressor may not have run, and may not have 75 * compressed the BIN. There's the danger that we'll try to fetch that entry, 76 * and that the file was deleted by the cleaner. 77 * 78 * KD was used more recently when an unexpected exception occurs while logging 79 * an LN, after inserting the entry. Rather than delete the entry to clean up, 80 * we mark the entry KD so it won't cause a fetch error later. In this case 81 * the entry LSN is NULL_LSN. See Tree.insertNewSlot. 82 * 83 * PD is closely related to the first use of KD above (for cleaned deleted LNs) 84 * and came about because of a cleaner optimization we make. The cleaner 85 * considers all deleted LN log entries to be obsolete, without doing a tree 86 * lookup, and without any record of an obsolete offset. This makes the cost 87 * of cleaning of deleted LNs very low. 
For example, if the log looks like 88 * this: 89 * 90 * 100 LNA 91 * 200 delete of LNA 92 * 93 * then LSN 200 will be considered obsolete when this file is processed by the 94 * cleaner. After all, only two things can happen: (1) the txn commits, and we 95 * don't need LSN 200, because we can wipe this LN out of the tree, or (2) the 96 * txn aborts, and we don't need LSN 200, because we are going to revert to LSN 97 * 100/LNA. 98 * 99 * We set PD for the entry of a deleted LN at the time of the operation, and we 100 * clear PD if the transaction aborts. There is no real danger that this log 101 * entry will be processed by the cleaner before it's committed, because 102 * cleaning can only happen after the first active LSN. 103 * 104 * Just as in the first use of KD above, setting PD is necessary to avoid a 105 * fetch error, when the file is deleted by the cleaner but the entry 106 * containing the deleted LN has not been deleted by the INCompressor. 107 * 108 * PD is also set in replication rollback, when LNs are marked as 109 * invisible. 110 * 111 * When LSN locking was implemented (see CursorImpl.lockLN), the PD flag took 112 * on additional meaning. PD is used to determine whether an LN is deleted 113 * without fetching it, and therefore is relied on to be transactionally 114 * correct. 115 * 116 * In addition to the setting and use of the KD/PD flags described above, the 117 * situation is complicated by the fact that we must restore the state of these 118 * flags during abort, recovery, and set them properly during slot reuse. 119 * 120 * We have been meaning to consider whether PD and KD can be consolidated into 121 * one flag: simply the Deleted flag. The Deleted flag would be set in the 122 * same way as PD is currently set, as well as the second use of KD described 123 * above (when the LSN is NULL_LSN after an insertion error). The use of KD 124 * and PD for invisible entries and recovery rollback should also be 125 * considered. 
 * If we consolidate the two flags and set the Deleted flag during a delete
 * operation (like PD), we'll have to remove optimizations (in CursorImpl for
 * example) that consider a slot deleted when KD is set. Since KD is rarely
 * set currently, this shouldn't have a noticeable performance impact.
 */
public class IN extends Node implements Comparable<IN>, LatchContext {

    /* XML tags used when dumping this node. */
    private static final String BEGIN_TAG = "<in>";
    private static final String END_TAG = "</in>";

    /* Prefixes for trace messages. */
    private static final String TRACE_SPLIT = "Split:";
    private static final String TRACE_DELETE = "Delete:";

    /* Bytes used per LSN in the compact byte[] representation. */
    private static final int BYTES_PER_LSN_ENTRY = 4;

    /* Largest file offset storable in 3 bytes (compact LSN rep). */
    public static final int MAX_FILE_OFFSET = 0xfffffe;

    /* 3-byte pattern that marks DbLsn.NULL_LSN in the compact rep. */
    private static final int THREE_BYTE_NEGATIVE_ONE = 0xffffff;

    /*
     * Levels:
     * The mapping tree has levels in the 0x20000 -> 0x2ffff number space.
     * The main tree has levels in the 0x10000 -> 0x1ffff number space.
     * The duplicate tree levels are in 0 -> 0xffff number space.
     */
    public static final int DBMAP_LEVEL = 0x20000;
    public static final int MAIN_LEVEL = 0x10000;
    public static final int LEVEL_MASK = 0x0ffff;
    public static final int MIN_LEVEL = -1;
    public static final int MAX_LEVEL = Integer.MAX_VALUE;
    public static final int BIN_LEVEL = MAIN_LEVEL | 1;

    /* Used to indicate that an exact match was found in findEntry. */
    public static final int EXACT_MATCH = (1 << 16);

    /* Used to indicate that an insert was successful. */
    public static final int INSERT_SUCCESS = (1 << 17);

    /*
     * A bit flag set in the return value of partialEviction() to indicate
     * whether the IN is evictable or not.
     */
    public static final long NON_EVICTABLE_IN = (1L << 62);

    /*
     * Boolean properties of an IN, encoded as bits inside the flags
     * data member.
     */
    private static final int IN_DIRTY_BIT = 0x1;
    private static final int IN_RECALC_TOGGLE_BIT = 0x2;
    private static final int IN_IS_ROOT_BIT = 0x4;
    private static final int HAS_CACHED_CHILDREN_BIT = 0x8;
    private static final int IN_DIRTY_LRU_BIT = 0x10;
    private static final int IN_DELTA_BIT = 0x20;

    /* Tracing for LRU-related ops */
    private static final boolean traceLRU = false;
    private static final boolean traceDeltas = false;
    private static final Level traceLevel = Level.INFO;

    /* The database owning this IN; null only until init() is called. */
    DatabaseImpl databaseImpl;

    /* Tree level, tagged with DBMAP_LEVEL or MAIN_LEVEL (see above). */
    private int level;

    /* The unique id of this node. */
    long nodeId;

    int flags; // not persistent, except for the root bit

    /*
     * The identifier key is a key that can be used to search for this IN.
     * Initially it is the key of the zeroth slot, but insertions prior to
     * slot zero make this no longer true. It is always equal to some key in
     * the IN, and therefore it is changed by BIN.compress when removing
     * slots.
     */
    private byte[] identifierKey;

    /* Number of slots currently in use. */
    int nEntries;

    /* Per-slot state bits (see EntryStates). */
    byte[] entryStates;

    /*
     * entryKeyVals contains the keys in their entirety if key prefixing is
     * not being used. If prefixing is enabled, then keyPrefix contains the
     * prefix and entryKeyVals contains the suffixes.
     */
    INKeyRep entryKeyVals;
    byte[] keyPrefix;

    /*
     * The following entryLsnXXX fields are used for storing LSNs. There are
     * two possible representations: a byte array based rep, and a long array
     * based one. For compactness, the byte array rep is used initially. A
     * single byte[] that uses four bytes per LSN is used. The baseFileNumber
     * field contains the lowest file number of any LSN in the array. Then
     * for each entry (four bytes each), the first byte contains the offset
     * from the baseFileNumber of that LSN's file number. The remaining three
     * bytes contain the file offset portion of the LSN. Three bytes will
     * hold a maximum offset of 16,777,214 (0xfffffe), so with the default JE
     * log file size of 10,000,000 bytes this works well.
     *
     * If either (1) the difference in file numbers exceeds 127
     * (Byte.MAX_VALUE) or (2) the file offset is greater than 16,777,214,
     * then the byte[] based rep mutates to a long[] based rep.
     *
     * In the byte[] rep, DbLsn.NULL_LSN is represented by setting the file
     * offset bytes for a given entry to -1 (0xffffff).
     *
     * Note: A compact representation will be changed to the non-compact one,
     * if needed, but in the current implementation, the reverse mutation
     * (from long to compact) never takes place.
     */
    long baseFileNumber;
    byte[] entryLsnByteArray;
    long[] entryLsnLongArray;
    public static boolean disableCompactLsns; // DbCacheSize only

    /*
     * The children of this IN. Only the ones that are actually in the cache
     * have non-null entries. Specialized sparse array representations are
     * used to represent the entries. The representation can mutate as
     * modifications are made to it.
     */
    INTargetRep entryTargets;

    /* Budgeted in-memory size of this node, in bytes. */
    long inMemorySize;

    /*
     * Accumulated memory budget delta. Once this exceeds
     * MemoryBudget.ACCUMULATED_LIMIT we inform the MemoryBudget that a
     * change has occurred. See SR 12273.
     */
    private int accumulatedDelta = 0;

    /*
     * Max allowable accumulation of memory budget changes before
     * MemoryBudget should be updated. This allows for consolidating multiple
     * calls to updateXXXMemoryBudget() into one call. Not declared final so
     * that the unit tests can modify it. See SR 12273.
     */
    public static int ACCUMULATED_LIMIT = 1000;

    private boolean inListResident; // true if this IN is on the IN list

    /**
     * References to the next and previous nodes in an LRU list. If the node
     * is not in any LRUList, both of these will be null. If the node is at
     * the front/back of an LRUList, prevLRUNode/nextLRUNode will point to
     * the node itself.
     */
    private IN nextLRUNode = null;
    private IN prevLRUNode = null;

    /*
     * Let L be the most recently written logrec for this IN instance.
     * (a) If this is a UIN, lastFullVersion is the lsn of L.
     * (b) If this is a BIN instance and L is a full-version logrec,
     * lastFullVersion is the lsn of L.
     * (c) If this is a BIN instance and L is a delta logrec, lastFullVersion
     * is the lsn of the most recently written full-version logrec for the
     * same BIN.
     *
     * Notice that this is a persistent field, but except for case (c), when
     * reading a logrec L, it is set not to the value found in L, but to the
     * lsn of L. This is why its read/write is managed by the INLogEntry
     * class rather than the IN readFromLog/writeFromLog methods.
     */
    long lastFullVersion = DbLsn.NULL_LSN;

    /*
     * A sequence of obsolete info that cannot be counted as obsolete until
     * an ancestor IN is logged non-provisionally.
     */
    private PackedObsoleteInfo provisionalObsolete;

    /* See convertDupKeys. */
    private boolean needDupKeyConversion;

    /* Count of pins held on this node (see pin/unpin). */
    private int pinCount = 0;

    protected SharedLatch latch;

    /* Eviction generation; higher means more recently used. */
    private long generation;

    /* Test-only hook invoked when an IN is fetched. */
    private TestHook fetchINHook;

    /**
     * Create an empty IN, with no node ID, to be filled in from the log.
     */
    public IN() {
        init(null, Key.EMPTY_KEY, 0, 0);
    }

    /**
     * Create a new IN.
     */
    public IN(
        DatabaseImpl dbImpl,
        byte[] identifierKey,
        int capacity,
        int level) {

        nodeId = dbImpl.getEnv().getNodeSequence().getNextLocalNodeId();

        init(dbImpl, identifierKey, capacity,
             generateLevel(dbImpl.getId(), level));

        initMemorySize();
    }

    /**
     * For Sizeof.
336 */ IN(@uppressWarningsR) SizeofMarker marker)337 public IN(@SuppressWarnings("unused") SizeofMarker marker) { 338 339 /* 340 * Set all variable fields to null, since they are not part of the 341 * fixed overhead. 342 */ 343 entryTargets = null; 344 entryKeyVals = null; 345 keyPrefix = null; 346 entryLsnByteArray = null; 347 entryLsnLongArray = null; 348 entryStates = null; 349 350 latch = LatchFactory.createSharedLatch( 351 LatchSupport.DUMMY_LATCH_CONTEXT, isAlwaysLatchedExclusively()); 352 353 /* 354 * Use the latch to force it to grow to "runtime size". 355 */ 356 latch.acquireExclusive(); 357 latch.release(); 358 latch.acquireExclusive(); 359 latch.release(); 360 } 361 362 /** 363 * Create a new IN. Need this because we can't call newInstance() without 364 * getting a 0 for nodeId. 365 */ createNewInstance(byte[] identifierKey, int maxEntries, int level)366 protected IN createNewInstance(byte[] identifierKey, 367 int maxEntries, 368 int level) { 369 return new IN(databaseImpl, identifierKey, maxEntries, level); 370 } 371 372 /** 373 * Initialize IN object. 374 */ init(DatabaseImpl db, @SuppressWarnings(R) byte[] identifierKey, int initialCapacity, @SuppressWarnings(R) int level)375 protected void init(DatabaseImpl db, 376 @SuppressWarnings("hiding") 377 byte[] identifierKey, 378 int initialCapacity, 379 @SuppressWarnings("hiding") 380 int level) { 381 setDatabase(db); 382 latch = LatchFactory.createSharedLatch( 383 this, isAlwaysLatchedExclusively()); 384 generation = 0; 385 flags = 0; 386 nEntries = 0; 387 this.identifierKey = identifierKey; 388 entryTargets = INTargetRep.NONE; 389 entryKeyVals = new INKeyRep.Default(initialCapacity); 390 keyPrefix = null; 391 baseFileNumber = -1; 392 393 /* 394 * Normally we start out with the compact LSN rep and then mutate to 395 * the long rep when needed. But for some purposes (DbCacheSize) we 396 * start out with the long rep and never use the compact rep. 
397 */ 398 if (disableCompactLsns) { 399 entryLsnByteArray = null; 400 entryLsnLongArray = new long[initialCapacity]; 401 } else { 402 entryLsnByteArray = new byte[initialCapacity << 2]; 403 entryLsnLongArray = null; 404 } 405 406 entryStates = new byte[initialCapacity]; 407 this.level = level; 408 inListResident = false; 409 } 410 411 @Override isIN()412 public final boolean isIN() { 413 return true; 414 } 415 416 @Override isUpperIN()417 public final boolean isUpperIN() { 418 return !isBIN(); 419 } 420 421 @Override getLatchName()422 public final String getLatchName() { 423 return shortClassName() + getNodeId(); 424 } 425 getLatchString()426 public final String getLatchString() { 427 return latch.toString(); 428 } 429 430 @Override getLatchTimeoutMs()431 public final int getLatchTimeoutMs() { 432 return databaseImpl.getEnv().getLatchTimeoutMs(); 433 } 434 435 @Override getLatchTable()436 public final LatchTable getLatchTable() { 437 return LatchSupport.btreeLatchTable; 438 } 439 440 /* 441 * Return whether the shared latch for this kind of node should be of the 442 * "always exclusive" variety. Presently, only IN's are actually latched 443 * shared. BINs are latched exclusive only. 444 */ isAlwaysLatchedExclusively()445 boolean isAlwaysLatchedExclusively() { 446 return false; 447 } 448 449 /** 450 * Latch this node if it is not latched by another thread, optionally 451 * setting the generation if the latch succeeds. 452 */ latchNoWait(CacheMode cacheMode)453 public final boolean latchNoWait(CacheMode cacheMode) 454 throws DatabaseException { 455 456 if (latch.acquireExclusiveNoWait()) { 457 setGeneration(cacheMode); 458 return true; 459 } else { 460 return false; 461 } 462 } 463 464 /** 465 * Latch this node if it is not latched by another thread, and set the 466 * generation if the latch succeeds. 
467 */ latchNoWait()468 public final boolean latchNoWait() 469 throws DatabaseException { 470 471 return latchNoWait(CacheMode.DEFAULT); 472 } 473 474 /** 475 * Latch this node exclusive, optionally setting the generation. 476 */ latch(CacheMode cacheMode)477 public void latch(CacheMode cacheMode) 478 throws DatabaseException { 479 480 latch.acquireExclusive(); 481 setGeneration(cacheMode); 482 } 483 484 /** 485 * Latch this node exclusive and set the generation. 486 */ latch()487 public final void latch() 488 throws DatabaseException { 489 490 latch(CacheMode.DEFAULT); 491 } 492 493 /** 494 * Latch this node shared, optionally setting the generation. 495 */ 496 @Override latchShared(CacheMode cacheMode)497 public void latchShared(CacheMode cacheMode) 498 throws DatabaseException { 499 500 latch.acquireShared(); 501 setGeneration(cacheMode); 502 } 503 504 /** 505 * Latch this node shared and set the generation. 506 */ 507 @Override latchShared()508 public final void latchShared() 509 throws DatabaseException { 510 511 latchShared(CacheMode.DEFAULT); 512 } 513 514 /** 515 * Release the latch on this node. 516 */ 517 @Override releaseLatch()518 public final void releaseLatch() { 519 latch.release(); 520 } 521 522 /** 523 * Release the latch on this node if it is owned. 524 */ releaseLatchIfOwner()525 public final void releaseLatchIfOwner() { 526 latch.releaseIfOwner(); 527 } 528 529 /** 530 * @return true if this thread holds the IN's latch 531 */ isLatchOwner()532 public final boolean isLatchOwner() { 533 return latch.isOwner(); 534 } 535 isLatchExclusiveOwner()536 public final boolean isLatchExclusiveOwner() { 537 return latch.isExclusiveOwner(); 538 } 539 540 /* For unit testing. 
*/ getLatchNWaiters()541 public final int getLatchNWaiters() { 542 return latch.getNWaiters(); 543 } 544 getGeneration()545 public final long getGeneration() { 546 return generation; 547 } 548 setGeneration(CacheMode cacheMode)549 public final void setGeneration(CacheMode cacheMode) { 550 551 final Evictor evictor = getEvictor(); 552 553 switch (cacheMode) { 554 case DEFAULT: 555 case EVICT_LN: 556 generation = Generation.getNextGeneration(); 557 558 if (isBIN() || !hasCachedChildrenFlag()) { 559 assert(isBIN() || !hasResidentChildren()); 560 if (evictor != null) { 561 evictor.moveBack(this); 562 } 563 } 564 565 break; 566 case UNCHANGED: 567 break; 568 case KEEP_HOT: 569 generation = Long.MAX_VALUE; 570 571 if (isBIN() || !hasCachedChildrenFlag()) { 572 assert(isBIN() || !hasResidentChildren()); 573 if (evictor != null) { 574 evictor.moveBack(this); 575 } 576 } 577 578 break; 579 case MAKE_COLD: 580 case EVICT_BIN: 581 if (isBIN()) { 582 generation = 0L; 583 if (evictor != null) { 584 evictor.moveFront(this); 585 } 586 } 587 break; 588 default: 589 throw unexpectedState( 590 "unknown cacheMode: " + cacheMode); 591 } 592 } 593 setGeneration(long newGeneration)594 public final void setGeneration(long newGeneration) { 595 generation = newGeneration; 596 } 597 pin()598 public final synchronized void pin() { 599 assert(isLatchOwner()); 600 assert(pinCount >= 0); 601 ++pinCount; 602 } 603 unpin()604 public final synchronized void unpin() { 605 assert(pinCount > 0); 606 --pinCount; 607 } 608 isPinned()609 public final synchronized boolean isPinned() { 610 assert(isLatchExclusiveOwner()); 611 assert(pinCount >= 0); 612 return pinCount > 0; 613 } 614 615 /** 616 * Get the database for this IN. 617 */ getDatabase()618 public final DatabaseImpl getDatabase() { 619 return databaseImpl; 620 } 621 622 /** 623 * Set the database reference for this node. 
624 */ setDatabase(DatabaseImpl db)625 public final void setDatabase(DatabaseImpl db) { 626 databaseImpl = db; 627 } 628 629 /* 630 * Get the database id for this node. 631 */ getDatabaseId()632 public final DatabaseId getDatabaseId() { 633 return databaseImpl.getId(); 634 } 635 636 @Override getEnvImplForFatalException()637 public final EnvironmentImpl getEnvImplForFatalException() { 638 return databaseImpl.getEnv(); 639 } 640 getEnv()641 public final EnvironmentImpl getEnv() { 642 return databaseImpl.getEnv(); 643 } 644 getEvictor()645 protected final Evictor getEvictor() { 646 return databaseImpl.getEnv().getEvictor(); 647 } 648 649 /** 650 * Convenience method to return the database key comparator. 651 */ getKeyComparator()652 public final Comparator<byte[]> getKeyComparator() { 653 return databaseImpl.getKeyComparator(); 654 } 655 656 @Override getLevel()657 public final int getLevel() { 658 return level; 659 } 660 getNormalizedLevel()661 public final int getNormalizedLevel() { 662 return level & LEVEL_MASK; 663 } 664 generateLevel(DatabaseId dbId, int newLevel)665 private static int generateLevel(DatabaseId dbId, int newLevel) { 666 if (dbId.equals(DbTree.ID_DB_ID)) { 667 return newLevel | DBMAP_LEVEL; 668 } else { 669 return newLevel | MAIN_LEVEL; 670 } 671 } 672 getNodeId()673 public final long getNodeId() { 674 return nodeId; 675 } 676 677 /* For unit tests only. */ setNodeId(long nid)678 final void setNodeId(long nid) { 679 nodeId = nid; 680 } 681 682 /** 683 * We would like as even a hash distribution as possible so that the 684 * Evictor's LRU is as accurate as possible. ConcurrentHashMap takes the 685 * value returned by this method and runs its own hash algorithm on it. 686 * So a bit complement of the node ID is sufficent as the return value and 687 * is a little better than returning just the node ID. 
If we use a 688 * different container in the future that does not re-hash the return 689 * value, we should probably implement the Wang-Jenkins hash function here. 690 */ 691 @Override hashCode()692 public final int hashCode() { 693 return (int) ~getNodeId(); 694 } 695 696 @Override equals(Object obj)697 public final boolean equals(Object obj) { 698 if (!(obj instanceof IN)) { 699 return false; 700 } 701 IN in = (IN) obj; 702 return (this.getNodeId() == in.getNodeId()); 703 } 704 705 /** 706 * Sort based on equality key. 707 */ compareTo(IN argIN)708 public final int compareTo(IN argIN) { 709 long argNodeId = argIN.getNodeId(); 710 long myNodeId = getNodeId(); 711 712 if (myNodeId < argNodeId) { 713 return -1; 714 } else if (myNodeId > argNodeId) { 715 return 1; 716 } else { 717 return 0; 718 } 719 } 720 getDirty()721 public final boolean getDirty() { 722 return (flags & IN_DIRTY_BIT) != 0; 723 } 724 725 /* public for unit tests */ setDirty(boolean dirty)726 public final void setDirty(boolean dirty) { 727 if (dirty) { 728 flags |= IN_DIRTY_BIT; 729 } else { 730 flags &= ~IN_DIRTY_BIT; 731 } 732 } 733 734 @Override isBINDelta()735 public final boolean isBINDelta() { 736 assert(isUpperIN() || isLatchOwner()); 737 return (flags & IN_DELTA_BIT) != 0; 738 } 739 740 /* 741 * This version of isBINDelta() takes a checkLatched param to allow 742 * for cases where it is ok to call the method without holding the 743 * BIN latch (e.g. in single-threaded tests, or when the BIN is not 744 * attached to the tree (and thus inaccessible from other threads)). 
745 */ 746 @Override isBINDelta(boolean checkLatched)747 public final boolean isBINDelta(boolean checkLatched) { 748 assert(!checkLatched || isUpperIN() || isLatchOwner()); 749 return (flags & IN_DELTA_BIT) != 0; 750 } 751 setBINDelta(boolean delta)752 final void setBINDelta(boolean delta) { 753 if (delta) { 754 flags |= IN_DELTA_BIT; 755 } else { 756 flags &= ~IN_DELTA_BIT; 757 } 758 } 759 getRecalcToggle()760 public final boolean getRecalcToggle() { 761 return (flags & IN_RECALC_TOGGLE_BIT) != 0; 762 } 763 setRecalcToggle(boolean toggle)764 public final void setRecalcToggle(boolean toggle) { 765 if (toggle) { 766 flags |= IN_RECALC_TOGGLE_BIT; 767 } else { 768 flags &= ~IN_RECALC_TOGGLE_BIT; 769 } 770 } 771 isRoot()772 public final boolean isRoot() { 773 return (flags & IN_IS_ROOT_BIT) != 0; 774 } 775 isDbRoot()776 public final boolean isDbRoot() { 777 return (flags & IN_IS_ROOT_BIT) != 0; 778 } 779 setIsRoot(boolean isRoot)780 final void setIsRoot(boolean isRoot) { 781 setIsRootFlag(isRoot); 782 setDirty(true); 783 } 784 setIsRootFlag(boolean isRoot)785 private final void setIsRootFlag(boolean isRoot) { 786 if (isRoot) { 787 flags |= IN_IS_ROOT_BIT; 788 } else { 789 flags &= ~IN_IS_ROOT_BIT; 790 } 791 } 792 hasCachedChildrenFlag()793 public final boolean hasCachedChildrenFlag() { 794 return (flags & HAS_CACHED_CHILDREN_BIT) != 0; 795 } 796 setHasCachedChildrenFlag(boolean value)797 private final void setHasCachedChildrenFlag(boolean value) { 798 if (value) { 799 flags |= HAS_CACHED_CHILDREN_BIT; 800 } else { 801 flags &= ~HAS_CACHED_CHILDREN_BIT; 802 } 803 } 804 isInDirtyLRU()805 public final boolean isInDirtyLRU() { 806 return (flags & IN_DIRTY_LRU_BIT) != 0; 807 } 808 809 /* public for unit tests */ setInDirtyLRU(boolean value)810 public final void setInDirtyLRU(boolean value) { 811 if (value) { 812 flags |= IN_DIRTY_LRU_BIT; 813 } else { 814 flags &= ~IN_DIRTY_LRU_BIT; 815 } 816 } 817 818 /** 819 * @return the identifier key for this node. 
820 */ getIdentifierKey()821 public final byte[] getIdentifierKey() { 822 return identifierKey; 823 } 824 825 /** 826 * Set the identifier key for this node. 827 * 828 * @param key - the new identifier key for this node. 829 */ setIdentifierKey(byte[] key)830 public final void setIdentifierKey(byte[] key) { 831 832 assert(!isBINDelta()); 833 834 /* 835 * The identifierKey is "intentionally" not kept track of in the 836 * memory budget. If we did, then it would look like this: 837 838 int oldIDKeySz = (identifierKey == null) ? 839 0 : 840 MemoryBudget.byteArraySize(identifierKey.length); 841 842 int newIDKeySz = (key == null) ? 843 0 : 844 MemoryBudget.byteArraySize(key.length); 845 updateMemorySize(newIDKeySz - oldIDKeySz); 846 847 */ 848 identifierKey = key; 849 setDirty(true); 850 } 851 852 /** 853 * @return the number of entries in this node. 854 */ getNEntries()855 public final int getNEntries() { 856 return nEntries; 857 } 858 859 /** 860 * @return the maximum number of entries in this node. 861 * 862 * Overriden by TestIN in INEntryTestBase.java 863 */ getMaxEntries()864 public int getMaxEntries() { 865 return entryStates.length; 866 } 867 getState(int idx)868 public final byte getState(int idx) { 869 return entryStates[idx]; 870 } 871 872 /** 873 * @return true if the object is dirty. 874 */ isDirty(int idx)875 final boolean isDirty(int idx) { 876 return ((entryStates[idx] & EntryStates.DIRTY_BIT) != 0); 877 } 878 879 /** 880 * @return true if the idx'th entry has been deleted, although the 881 * transaction that performed the deletion may not be committed. 882 */ isEntryPendingDeleted(int idx)883 public final boolean isEntryPendingDeleted(int idx) { 884 return ((entryStates[idx] & EntryStates.PENDING_DELETED_BIT) != 0); 885 } 886 887 /** 888 * Set pendingDeleted to true. 
889 */ setPendingDeleted(int idx)890 public final void setPendingDeleted(int idx) { 891 892 entryStates[idx] |= EntryStates.PENDING_DELETED_BIT; 893 entryStates[idx] |= EntryStates.DIRTY_BIT; 894 } 895 896 /** 897 * Set pendingDeleted to false. 898 */ clearPendingDeleted(int idx)899 public final void clearPendingDeleted(int idx) { 900 901 entryStates[idx] &= EntryStates.CLEAR_PENDING_DELETED_BIT; 902 entryStates[idx] |= EntryStates.DIRTY_BIT; 903 } 904 905 /** 906 * @return true if the idx'th entry is deleted for sure. If a transaction 907 * performed the deletion, it has been committed. 908 */ isEntryKnownDeleted(int idx)909 public final boolean isEntryKnownDeleted(int idx) { 910 return ((entryStates[idx] & EntryStates.KNOWN_DELETED_BIT) != 0); 911 } 912 913 /** 914 * Set knownDeleted to true. 915 */ setKnownDeleted(int idx)916 void setKnownDeleted(int idx) { 917 918 entryStates[idx] |= EntryStates.KNOWN_DELETED_BIT; 919 entryStates[idx] |= EntryStates.DIRTY_BIT; 920 setDirty(true); 921 } 922 923 /** 924 * Set knownDeleted to false. 925 */ clearKnownDeleted(int idx)926 public final void clearKnownDeleted(int idx) { 927 928 entryStates[idx] &= EntryStates.CLEAR_KNOWN_DELETED_BIT; 929 entryStates[idx] |= EntryStates.DIRTY_BIT; 930 setDirty(true); 931 } 932 933 /* 934 * In the future we may want to move the following static methods to an 935 * EntryState utility class and share all state bit twidling among IN, 936 * ChildReference, and DeltaInfo. 937 */ 938 939 /** 940 * Returns true if the given state is known deleted. 941 */ isStateKnownDeleted(byte state)942 static boolean isStateKnownDeleted(byte state) { 943 return ((state & EntryStates.KNOWN_DELETED_BIT) != 0); 944 } 945 946 /** 947 * Returns true if the given state is pending deleted. 
948 */ isStatePendingDeleted(byte state)949 static boolean isStatePendingDeleted(byte state) { 950 return ((state & EntryStates.PENDING_DELETED_BIT) != 0); 951 } 952 953 /* For unit testing */ getKeyVals()954 public final INKeyRep getKeyVals() { 955 return entryKeyVals; 956 } 957 958 /** 959 * Return the idx'th key. If prefixing is enabled, construct a new byte[] 960 * containing the prefix and suffix. If prefixing is not enabled, just 961 * return the current byte[] in entryKeyVals[]. 962 */ getKey(int idx)963 public final byte[] getKey(int idx) { 964 965 byte[] suffix = entryKeyVals.get(idx); 966 if (suffix == null) { 967 suffix = Key.EMPTY_KEY; 968 } 969 970 if (keyPrefix == null) { 971 return suffix; 972 } 973 974 final int prefixLen = keyPrefix.length; 975 if (prefixLen == 0) { 976 return suffix; 977 } 978 979 final int suffixLen = suffix.length; 980 final byte[] key = new byte[prefixLen + suffixLen]; 981 System.arraycopy(keyPrefix, 0, key, 0, prefixLen); 982 System.arraycopy(suffix, 0, key, prefixLen, suffixLen); 983 return key; 984 } 985 986 /** 987 * Sets the key in the idx-th slot of this node, if this node is a BIN, the 988 * idx-th slot "points" to an LN, and the new key is not identical to the 989 * current key in the slot [#15704] 990 * 991 * This method is called when an LN is fetched in order to ensure the key 992 * slot is transactionally correct. A key can change in 3 circumstances, 993 * when a key comparator is configured that may not compare all bytes of 994 * the key: 995 * 996 * 1) The user calls Cursor.putCurrent to update the data of a duplicate 997 * data item. CursorImpl.putCurrent will call this method indirectly to 998 * update the key. 999 * 1000 * 2) A transaction aborts or a BIN becomes out of date due to the 1001 * non-transactional nature of INs. The Node is set to null during abort 1002 * and recovery. IN.fetchCurrent will call this method indirectly to 1003 * update the key. 1004 * 1005 * 3) A slot for a deleted LN is reused. 
 The key in the slot is updated
     * by IN.updateEntry along with the node and LSN.
     *
     * Note that transaction abort and recovery of BIN entries may cause
     * incorrect keys to be present in the tree, since these entries are
     * non-transactional. However, an incorrect key in a BIN slot may only be
     * present when the node in that slot is null. Undo/redo sets the node to
     * null. When the LN node is fetched, the key in the slot is set to the
     * LN's key, which is the source of truth and is transactionally correct.
     *
     * @param newKey is the key to set in the slot and is the LN key. It may
     * be null if it is known that the key cannot be changed (as in putCurrent
     * in a BIN). It must be null if the node is not an LN.
     *
     * @return true if a multi-slot change was made and the complete IN memory
     * size must be updated.
     */
    private boolean setLNSlotKey(int idx, byte[] newKey) {

        assert(newKey == null || isBIN());

        /*
         * The new key may be null if a dup LN was deleted, in which case there
         * is no need to update it. There is no need to compare keys if there
         * is no comparator configured, since a key cannot be changed when the
         * default comparator is used.
         */
        if (newKey != null &&
            getKeyComparator() != null &&
            !Arrays.equals(newKey, getKey(idx))) {
            /* Dirty the whole IN before updating the slot key. */
            setDirty(true);
            return setKeyAndDirty(idx, newKey);
        } else {
            return false;
        }
    }

    /**
     * Set the idx'th key: marks the slot dirty, then delegates to
     * setKeyAndPrefix for the actual key/prefix bookkeeping.
     *
     * @return true if a multi-slot change was made and the complete IN memory
     * size must be updated.
     */
    private boolean setKeyAndDirty(int idx, byte[] keyVal) {
        entryStates[idx] |= EntryStates.DIRTY_BIT;
        return setKeyAndPrefix(idx, keyVal);
    }

    /*
     * Set the key at idx and adjust the key prefix if necessary.
     *
     * @return true if a multi-slot change was made and the complete IN memory
     * size must be updated.
     */
    private boolean setKeyAndPrefix(int idx, byte[] keyVal) {

        /*
         * Only compute key prefix if prefixing is enabled and there's an
         * existing prefix.
         */
        if (databaseImpl.getKeyPrefixing() && keyPrefix != null) {
            if (!compareToKeyPrefix(keyVal)) {

                /*
                 * The new key doesn't share the current prefix, so recompute
                 * the prefix and readjust all the existing suffixes.
                 */
                byte[] newPrefix = computeKeyPrefix(idx);
                if (newPrefix != null) {
                    /* Take the new key into consideration for new prefix. */
                    newPrefix = Key.createKeyPrefix(newPrefix, keyVal);
                }
                recalcSuffixes(newPrefix, keyVal, idx);
                /* All slots were touched: a multi-slot change. */
                return true;
            }

            /*
             * The key shares the current prefix; store only the suffix.
             * Return true only if the key representation mutated (e.g.
             * compact rep -> default rep), which changes memory size.
             */
            final INKeyRep.Type prevRepType = entryKeyVals.getType();

            entryKeyVals = entryKeyVals.set(
                idx, computeKeySuffix(keyPrefix, keyVal), this);

            return prevRepType != entryKeyVals.getType();
        }

        if (keyPrefix != null) {

            /*
             * Key prefixing has been turned off on this database, but there
             * are existing prefixes. Remove prefixes for this IN.
             */
            recalcSuffixes(new byte[0], keyVal, idx);
            return true;
        } else {
            /* No prefixing in play: store the full key in the slot. */
            final INKeyRep.Type oldRepType = entryKeyVals.getType();
            entryKeyVals = entryKeyVals.set(idx, keyVal, this);
            return oldRepType != entryKeyVals.getType();
        }
    }

    /*
     * Iterate over all keys in this IN and recalculate their suffixes based on
     * newPrefix. If keyVal and idx are supplied, it means that entry[idx] is
     * about to be changed to keyVal so use that instead of
     * entryKeyVals.get(idx) when computing the new suffixes. If idx is < 0,
     * and keyVal is null, then recalculate suffixes for all entries in this.
     */
    private void recalcSuffixes(byte[] newPrefix, byte[] keyVal, int idx) {

        for (int i = 0; i < nEntries; i++) {
            /* Use the pending key for the slot being changed. */
            byte[] curKey = (i == idx ? keyVal : getKey(i));
            entryKeyVals =
                entryKeyVals.set(i, computeKeySuffix(newPrefix, curKey), this);
        }
        setKeyPrefix(newPrefix);
    }

    /*
     * Given a prefix and a key, return the suffix portion of keyVal.
     * Assumes keyVal starts with newPrefix when newPrefix is non-empty
     * (callers guarantee this via compareToKeyPrefix/computeKeyPrefix).
     */
    private byte[] computeKeySuffix(byte[] newPrefix, byte[] keyVal) {
        int prefixLen = (newPrefix == null ? 0 : newPrefix.length);

        if (prefixLen == 0) {
            return keyVal;
        }

        int suffixLen = keyVal.length - prefixLen;
        byte[] ret = new byte[suffixLen];
        System.arraycopy(keyVal, prefixLen, ret, 0, suffixLen);
        return ret;
    }

    /**
     * Forces computation of the key prefix, without requiring a split.
     * Is public for use by DbCacheSize.
     */
    public final void recalcKeyPrefix() {

        assert(!isBINDelta());

        recalcSuffixes(computeKeyPrefix(-1), null, -1);
    }

    /*
     * Computes a key prefix based on all the keys in 'this'. Return null if
     * the IN is empty or prefixing is not enabled or there is no common
     * prefix for the keys.
     *
     * @param excludeIdx slot to skip (e.g. one about to be overwritten);
     * pass -1 to include all slots.
     */
    private byte[] computeKeyPrefix(int excludeIdx) {
        if (!databaseImpl.getKeyPrefixing() ||
            nEntries <= 1) {
            return null;
        }

        int firstIdx = (excludeIdx == 0) ? 1 : 0;
        byte[] curPrefixKey = getKey(firstIdx);
        int prefixLen = curPrefixKey.length;

        /*
         * Only need to look at first and last entries when keys are ordered
         * byte-by-byte. But when there is a comparator, keys are not
         * necessarily ordered byte-by-byte. [#21328]
         */
        boolean byteOrdered;
        if (true) {
            /* Disable optimization for now. Needs testing. */
            /* NOTE(review): deliberate dead branch below — the first/last
             * shortcut is retained for future enablement. */
            byteOrdered = false;
        } else {
            byteOrdered = (databaseImpl.getKeyComparator() == null);
        }
        if (byteOrdered) {
            int lastIdx = nEntries - 1;
            if (lastIdx == excludeIdx) {
                lastIdx -= 1;
            }
            if (lastIdx > firstIdx) {
                byte[] lastKey = getKey(lastIdx);
                int newPrefixLen = Key.getKeyPrefixLength
                    (curPrefixKey, prefixLen, lastKey);
                if (newPrefixLen < prefixLen) {
                    curPrefixKey = lastKey;
                    prefixLen = newPrefixLen;
                }
            }
        } else {
            /* Shrink the candidate prefix against every other key. */
            for (int i = firstIdx + 1; i < nEntries; i++) {
                if (i == excludeIdx) {
                    continue;
                }
                byte[] curKey = getKey(i);
                int newPrefixLen = Key.getKeyPrefixLength
                    (curPrefixKey, prefixLen, curKey);
                if (newPrefixLen < prefixLen) {
                    curPrefixKey = curKey;
                    prefixLen = newPrefixLen;
                }
            }
        }

        byte[] ret = new byte[prefixLen];
        System.arraycopy(curPrefixKey, 0, ret, 0, prefixLen);

        return ret;
    }

    public final byte[] getKeyPrefix() {
        return keyPrefix;
    }

    /*
     * For unit testing only
     */
    public final boolean hasKeyPrefix() {
        return keyPrefix != null;
    }

    /* This has default protection for access by the unit tests. */
    final void setKeyPrefix(byte[] keyPrefix) {

        assert databaseImpl != null;

        int prevLength = (this.keyPrefix == null) ? 0 : this.keyPrefix.length;
        this.keyPrefix = keyPrefix;
        /* Update the memory budgeting to reflect changes in the key prefix. */
        int currLength = (keyPrefix == null) ? 0 : keyPrefix.length;
        updateMemorySize(prevLength, currLength);
    }

    /*
     * Returns whether the current keyPrefix is non-empty and is a prefix of,
     * or equal to, the given newKey — i.e., newKey starts with keyPrefix.
     * Returns false when keyPrefix is null or empty.
     *
     * This has default protection for the unit tests.
     */
    final boolean compareToKeyPrefix(byte[] newKey) {

        if (keyPrefix == null || keyPrefix.length == 0) {
            return false;
        }

        /* Every byte of keyPrefix must match the start of newKey; a newKey
         * shorter than keyPrefix therefore fails. */
        int newKeyLen = newKey.length;
        for (int i = 0; i < keyPrefix.length; i++) {
            if (i < newKeyLen &&
                keyPrefix[i] == newKey[i]) {
                continue;
            } else {
                return false;
            }
        }

        return true;
    }

    /*
     * For debugging. Verifies that the stored prefix is consistent with a
     * freshly computed one (the stored prefix may be shorter but never
     * longer or different in content).
     */
    final boolean verifyKeyPrefix() {

        byte[] computedKeyPrefix = computeKeyPrefix(-1);
        if (keyPrefix == null) {
            return computedKeyPrefix == null;
        }

        if (computedKeyPrefix == null ||
            computedKeyPrefix.length < keyPrefix.length) {
            System.out.println("VerifyKeyPrefix failed");
            System.out.println(dumpString(0, false));
            return false;
        }
        for (int i = 0; i < keyPrefix.length; i++) {
            if (keyPrefix[i] != computedKeyPrefix[i]) {
                System.out.println("VerifyKeyPrefix failed");
                System.out.println(dumpString(0, false));
                return false;
            }
        }
        return true;
    }

    /**
     * Returns whether the given key is greater than or equal to the first key
     * in the IN and less than or equal to the last key in the IN. This method
     * is used to determine whether a key to be inserted belongs in this IN,
     * without doing a tree search. If false is returned it is still possible
     * that the key belongs in this IN, but a tree search must be performed to
     * find out.
     */
    public final boolean isKeyInBounds(byte[] keyVal) {

        assert(!isBINDelta());

        if (nEntries < 2) {
            return false;
        }

        Comparator<byte[]> userCompareToFcn = getKeyComparator();
        int cmp;
        byte[] myKey;

        /* Compare key given to my first key. */
        myKey = getKey(0);
        cmp = Key.compareKeys(keyVal, myKey, userCompareToFcn);
        if (cmp < 0) {
            return false;
        }

        /* Compare key given to my last key. */
        myKey = getKey(nEntries - 1);
        cmp = Key.compareKeys(keyVal, myKey, userCompareToFcn);
        if (cmp > 0) {
            return false;
        }

        return true;
    }

    /**
     * Return the idx'th LSN for this entry.
     *
     * @return the idx'th LSN for this entry.
     */
    public final long getLsn(int idx) {

        if (entryLsnLongArray == null) {
            /* Compact rep: 4 bytes per slot, hence idx << 2. */
            int offset = idx << 2;
            int fileOffset = getFileOffset(offset);
            if (fileOffset == -1) {
                /* -1 file offset is the NULL_LSN sentinel. */
                return DbLsn.NULL_LSN;
            } else {
                return DbLsn.makeLsn((baseFileNumber +
                                      getFileNumberOffset(offset)),
                                     fileOffset);
            }
        } else {
            return entryLsnLongArray[idx];
        }
    }

    /**
     * Set the LSN of the idx'th slot, mark the slot dirty, and update
     * memory consumption. Throw exception if the update is not legitimate.
     *
     * Use default protection for unit tests.
     */
    final void setLsn(int idx, long lsn) {
        setLsn(idx, lsn, true);
    }

    /**
     * Set the LSN of the idx'th slot, mark the slot dirty, and update
     * memory consumption. If "check" is true, throw exception if the
     * update is not legitimate.
     */
    private final void setLsn(int idx, long lsn, boolean check) {

        if (!check || shouldUpdateLsn(getLsn(idx), lsn)) {

            int oldSize = computeLsnOverhead();

            /* setLsnInternal can mutate to an array of longs. */
            setLsnInternal(idx, lsn);

            updateMemorySize(computeLsnOverhead() - oldSize);
            entryStates[idx] |= EntryStates.DIRTY_BIT;
        }
    }

    /*
     * Set the LSN of the idx'th slot. If the current storage for LSNs is the
     * compact one, mutate it to the non-compact, if necessary.
     */
    final void setLsnInternal(int idx, long value) {

        /* Will implement this in the future. Note, don't adjust if mutating.*/
        //maybeAdjustCapacity(offset);
        if (entryLsnLongArray != null) {
            /* Already mutated to the non-compact long[] rep. */
            entryLsnLongArray[idx] = value;
            return;
        }

        /* Compact rep: 4 bytes per slot — a 1-byte file-number delta from
         * baseFileNumber followed by a 3-byte file offset. */
        int offset = idx << 2;

        if (value == DbLsn.NULL_LSN) {
            /* File offset -1 encodes NULL_LSN. */
            setFileNumberOffset(offset, (byte) 0);
            setFileOffset(offset, -1);
            return;
        }

        long thisFileNumber = DbLsn.getFileNumber(value);

        if (baseFileNumber == -1) {
            /* First entry. */
            baseFileNumber = thisFileNumber;
            setFileNumberOffset(offset, (byte) 0);

        } else {

            if (thisFileNumber < baseFileNumber) {
                /* New file number is below the base: try to rebase all
                 * existing deltas; if any delta would overflow a byte,
                 * fall back to the long[] rep. */
                if (!adjustFileNumbers(thisFileNumber)) {
                    mutateToLongArray(idx, value);
                    return;
                }
                baseFileNumber = thisFileNumber;
            }

            long fileNumberDifference = thisFileNumber - baseFileNumber;
            if (fileNumberDifference > Byte.MAX_VALUE) {
                mutateToLongArray(idx, value);
                return;
            }

            setFileNumberOffset(
                offset, (byte) (thisFileNumber - baseFileNumber));
            //assert getFileNumberOffset(offset) >= 0;
        }

        int fileOffset = (int) DbLsn.getFileOffset(value);
        if (fileOffset > MAX_FILE_OFFSET) {
            /* Offset does not fit in 3 bytes: mutate to long[] rep. */
            mutateToLongArray(idx, value);
            return;
        }

        setFileOffset(offset, fileOffset);
        //assert getLsn(offset) == value;
    }

    /*
     * Re-expresses all stored file-number deltas relative to
     * newBaseFileNumber. Returns false (after undoing any partial changes)
     * if some delta would not fit in a byte.
     */
    private boolean adjustFileNumbers(long newBaseFileNumber) {

        long oldBaseFileNumber = baseFileNumber;
        for (int i = 0;
             i < entryLsnByteArray.length;
             i += BYTES_PER_LSN_ENTRY) {
            if (getFileOffset(i) == -1) {
                /* NULL_LSN slot: nothing to adjust. */
                continue;
            }

            long curEntryFileNumber =
                oldBaseFileNumber + getFileNumberOffset(i);
            long newCurEntryFileNumberOffset =
                (curEntryFileNumber - newBaseFileNumber);

            if (newCurEntryFileNumberOffset > Byte.MAX_VALUE) {
                /* Overflow: roll back the slots already adjusted. */
                long undoOffset = oldBaseFileNumber - newBaseFileNumber;
                for (int j = i - BYTES_PER_LSN_ENTRY;
                     j >= 0;
                     j -= BYTES_PER_LSN_ENTRY) {
                    if (getFileOffset(j) == -1) {
                        continue;
                    }
                    setFileNumberOffset
                        (j, (byte) (getFileNumberOffset(j) - undoOffset));
                    //assert getFileNumberOffset(j) >= 0;
                }
                return false;
            }
            setFileNumberOffset(i, (byte) newCurEntryFileNumberOffset);

            //assert getFileNumberOffset(i) >= 0;
        }
        return true;
    }

    /* Writes the 1-byte file-number delta at the slot's base byte offset. */
    private void setFileNumberOffset(int offset, byte fileNumberOffset) {
        entryLsnByteArray[offset] = fileNumberOffset;
    }

    /* Reads the 1-byte file-number delta at the slot's base byte offset. */
    private byte getFileNumberOffset(int offset) {
        return entryLsnByteArray[offset];
    }

    /* Writes the 3-byte file offset following the delta byte. */
    private void setFileOffset(int offset, int fileOffset) {
        put3ByteInt(offset + 1, fileOffset);
    }

    /* Reads the 3-byte file offset following the delta byte. */
    private int getFileOffset(int offset) {
        return get3ByteInt(offset + 1);
    }

    /* Stores the low 3 bytes of value, little-endian. */
    private void put3ByteInt(int offset, int value) {
        entryLsnByteArray[offset++] = (byte) (value >>> 0);
        entryLsnByteArray[offset++] = (byte) (value >>> 8);
        entryLsnByteArray[offset] = (byte) (value >>> 16);
    }

    /* Reads a little-endian 3-byte int; the all-ones pattern decodes to -1
     * (the NULL_LSN sentinel). */
    private int get3ByteInt(int offset) {
        int ret = (entryLsnByteArray[offset++] & 0xFF) << 0;
        ret += (entryLsnByteArray[offset++] & 0xFF) << 8;
        ret += (entryLsnByteArray[offset] & 0xFF) << 16;
        if (ret == THREE_BYTE_NEGATIVE_ONE) {
            ret = -1;
        }

        return ret;
    }

    /*
     * Abandons the compact byte[] rep: copies every decoded LSN into a new
     * long[], stores the new value, and discards the byte array.
     */
    private void mutateToLongArray(int idx, long value) {
        int nElts = entryLsnByteArray.length >> 2;
        long[] newArr = new long[nElts];
        for (int i = 0; i < nElts; i++) {
            newArr[i] = getLsn(i);
        }
        newArr[idx] = value;
        entryLsnLongArray = newArr;
        entryLsnByteArray = null;
    }

    /**
     * For a deferred write database, ensure that information is not lost when
     * a new LSN is assigned. Also ensures that a transient LSN is not
     * accidentally assigned to a persistent entry.
     *
     * Because this method uses strict checking, prepareForSlotReuse must
     * sometimes be called when a new logical entry is being placed in a slot,
     * e.g., during an IN split or an LN slot reuse.
     *
     * The following transition is a NOOP and the LSN is not set:
     *   Any LSN to same value.
     * The following transitions are allowed and cause the LSN to be set:
     *   Null LSN to transient LSN
     *   Null LSN to persistent LSN
     *   Transient LSN to persistent LSN
     *   Persistent LSN to new persistent LSN
     * The following transitions should not occur and throw an exception:
     *   Transient LSN to null LSN
     *   Transient LSN to new transient LSN
     *   Persistent LSN to null LSN
     *   Persistent LSN to transient LSN
     */
    private final boolean shouldUpdateLsn(long oldLsn, long newLsn) {

        /* Save a little computation in packing/updating an unchanged LSN. */
        if (oldLsn == newLsn) {
            return false;
        }
        /* The rules for a new null LSN can be broken in a read-only env. */
        if (newLsn == DbLsn.NULL_LSN && getEnv().isReadOnly()) {
            return true;
        }
        /* Enforce LSN update rules.  Assume newLsn != oldLsn. */
        if (databaseImpl.isDeferredWriteMode()) {
            if (oldLsn != DbLsn.NULL_LSN && DbLsn.isTransientOrNull(newLsn)) {
                throw unexpectedState(
                    "DeferredWrite LSN update not allowed" +
                    " oldLsn = " + DbLsn.getNoFormatString(oldLsn) +
                    " newLsn = " + DbLsn.getNoFormatString(newLsn));
            }
        } else {
            if (DbLsn.isTransientOrNull(newLsn)) {
                throw unexpectedState(
                    "LSN update not allowed" +
                    " oldLsn = " + DbLsn.getNoFormatString(oldLsn) +
                    " newLsn = " + DbLsn.getNoFormatString(newLsn));
            }
        }
        return true;
    }

    /* For unit tests. */
    final long[] getEntryLsnLongArray() {
        return entryLsnLongArray;
    }

    /* For unit tests. */
    final byte[] getEntryLsnByteArray() {
        return entryLsnByteArray;
    }

    /* For unit tests. Resets to the compact rep: 4 bytes per slot, no base
     * file number yet. */
    final void initEntryLsn(int capacity) {
        entryLsnLongArray = null;
        entryLsnByteArray = new byte[capacity << 2];
        baseFileNumber = -1;
    }

    /* Will implement this in the future.
 Note, don't adjust if mutating.*/
    /***
    private void maybeAdjustCapacity(int offset) {
        if (entryLsnLongArray == null) {
            int bytesNeeded = offset + BYTES_PER_LSN_ENTRY;
            int currentBytes = entryLsnByteArray.length;
            if (currentBytes < bytesNeeded) {
                int newBytes = bytesNeeded +
                    (GROWTH_INCREMENT * BYTES_PER_LSN_ENTRY);
                byte[] newArr = new byte[newBytes];
                System.arraycopy(entryLsnByteArray, 0, newArr, 0,
                                 currentBytes);
                entryLsnByteArray = newArr;
                for (int i = currentBytes;
                     i < newBytes;
                     i += BYTES_PER_LSN_ENTRY) {
                    setFileNumberOffset(i, (byte) 0);
                    setFileOffset(i, -1);
                }
            }
        } else {
            int currentEntries = entryLsnLongArray.length;
            int idx = offset >> 2;
            if (currentEntries < idx + 1) {
                int newEntries = idx + GROWTH_INCREMENT;
                long[] newArr = new long[newEntries];
                System.arraycopy(entryLsnLongArray, 0, newArr, 0,
                                 currentEntries);
                entryLsnLongArray = newArr;
                for (int i = currentEntries; i < newEntries; i++) {
                    entryLsnLongArray[i] = DbLsn.NULL_LSN;
                }
            }
        }
    }
    ***/

    /**
     * The last logged size is not stored for UINs.
     */
    boolean isLastLoggedSizeStored() {
        return false;
    }

    /**
     * The last logged size is not stored for UINs.
     */
    public void setLastLoggedSize(int idx, int lastLoggedSize) {
    }

    /**
     * The last logged size is not stored for UINs.
     */
    void setLastLoggedSizeUnconditional(int idx, int lastLoggedSize) {
    }

    /**
     * The last logged size is always zero for UINs.
     */
    public int getLastLoggedSize(int idx) {
        return 0;
    }

    /* For unit testing */
    public final INArrayRep<INTargetRep, INTargetRep.Type, Node> getTargets() {
        return entryTargets;
    }

    /** Returns whether the idx'th child node is cached in memory. */
    public final boolean isResident(int idx) {
        return entryTargets.get(idx) != null;
    }

    /**
     * Sets the idx'th target. No need to make dirty, that state only applies
     * to key and LSN.
     *
     * <p>WARNING: This method does not update the memory budget. The caller
     * must update the budget.</p>
     */
    void setTarget(int idx, Node target) {

        assert isLatchExclusiveOwner() :
            "Not latched for write " + getClass().getName() +
            " id=" + getNodeId();

        final Evictor evictor = getEvictor();

        Node curChild = entryTargets.get(idx);

        if (target == null) {

            /* Detaching a child. */
            if (isUpperIN() && hasCachedChildrenFlag()) {

                entryTargets = entryTargets.set(idx, target, this);

                /*
                 * If this UIN has just lost its last cached child, set its
                 * hasCachedChildren flag to false and put it back to the
                 * LRU list.
                 */
                if (curChild != null && !hasResidentChildren()) {
                    setHasCachedChildrenFlag(false);
                    if (evictor != null && !isDIN()) {
                        if (traceLRU) {
                            /* NOTE(review): "chld" typo in this trace message
                             * is pre-existing; left byte-identical. */
                            LoggerUtils.envLogMsg(
                                traceLevel, getEnv(),
                                Thread.currentThread().getId() + "-" +
                                Thread.currentThread().getName() +
                                "-" + getEnv().getName() +
                                " setTarget(): " +
                                " Adding UIN " + getNodeId() +
                                " to LRU after detaching chld " +
                                ((IN)curChild).getNodeId());
                        }
                        evictor.addBack(this);
                    }
                }
            } else {
                entryTargets = entryTargets.set(idx, target, this);
            }
        } else {
            /* Attaching a child: a UIN gaining its first cached child
             * leaves the evictor's LRU. */
            if (isUpperIN() && !hasCachedChildrenFlag()) {

                setHasCachedChildrenFlag(true);

                if (evictor != null) {
                    if (traceLRU) {
                        LoggerUtils.envLogMsg(
                            traceLevel, getEnv(),
                            Thread.currentThread().getId() + "-" +
                            Thread.currentThread().getName() +
                            "-" + getEnv().getName() +
                            " setTarget(): " +
                            " Removing UIN " + getNodeId() +
                            " after attaching child " +
                            ((IN)target).getNodeId());
                    }
                    evictor.remove(this);
                }
            }

            entryTargets = entryTargets.set(idx, target, this);
        }
    }

    /**
     * Return the idx'th target.
     */
    public final Node getTarget(int idx) {
        return entryTargets.get(idx);
    }

    /*
     * Returns the idx-th child of "this" upper IN, fetching the child from
     * the log and attaching it to its parent if it is not already attached.
     * This method is used during tree searches.
     *
     * On entry, the parent must be latched already.
     *
     * If the child must be fetched from the log, the parent is unlatched.
     * After the disk read is done, the parent is relatched. However, due to
     * splits, it may not be the correct parent anymore. If so, the method
     * will return null, and the caller is expected to restart the tree search.
     *
     * On return, the parent will be latched, unless null is returned or an
     * exception is thrown.
     *
     * The "searchKey" param is the key that the caller is looking for. It is
     * used by this method in determining if, after a disk read, "this" is
     * still the correct parent for the child. "searchKey" may be null if the
     * caller is doing a LEFT or RIGHT search.
     */
    final IN fetchINWithNoLatch(int idx, byte [] searchKey) {
        return fetchINWithNoLatch(idx, searchKey, null);
    }

    /*
     * This variant of fetchIN() takes a SearchResult as a param, instead of
     * an idx (it is used by Tree.getParentINForChildIN()). The ordinal number
     * of the child to fetch is specified by result.index. result.index will
     * be adjusted by this method if, after a disk read, the ordinal number
     * of the child changes due to insertions, deletions or splits in the
     * parent.
     */
    final IN fetchINWithNoLatch(SearchResult result, byte [] searchKey) {
        return fetchINWithNoLatch(result.index, searchKey, result);
    }

    /*
     * Provides the implementation of the above two methods.
     */
    private IN fetchINWithNoLatch(
        int idx,
        byte [] searchKey,
        SearchResult result) {

        assert(isUpperIN());
        assert(isLatchOwner());

        final EnvironmentImpl envImpl = getEnv();
        boolean isMiss = false;
        boolean success = false;

        IN child = (IN)entryTargets.get(idx);

        if (child == null) {

            isMiss = true;

            final long lsn = getLsn(idx);

            if (lsn == DbLsn.NULL_LSN) {
                throw unexpectedState(makeFetchErrorMsg(
                    "NULL_LSN in upper IN", this, lsn, entryStates[idx]));
            }

            try {
                /* Pin keeps this IN from being evicted while unlatched. */
                pin();
                releaseLatch();

                TestHookExecute.doHookIfSet(fetchINHook);

                final WholeEntry wholeEntry =
                    envImpl.getLogManager().
                    getLogEntryAllowInvisibleAtRecovery(lsn);

                final LogEntry logEntry = wholeEntry.getEntry();

                child = (IN) logEntry.getResolvedItem(databaseImpl);

                latch(CacheMode.UNCHANGED);

                /*
                 * The following if statement relies on splits being logged
                 * immediately, or more precisely, the split node and its
                 * new sibling being logged immediately, while both siblings
                 * and their parent are latched exclusively. The reason for
                 * this is as follows:
                 *
                 * Let K be the search key. If we are doing a left-deep or
                 * right-deep search, K is -INF or +INF respectively.
                 *
                 * Let P be the parent IN (i.e., "this") and S be the slot at
                 * the idx position before P was unlatched above. Here, we
                 * view slots as independent objects, not identified by their
                 * position in an IN but by some unique (imaginary) and
                 * immutable id assigned to the slot when it is first inserted
                 * into an IN.
                 *
                 * Before unlatching P, S was the correct slot to follow down
                 * the tree looking for K. After P is unlatched and then
                 * relatched, let S' be the slot at the idx position, if P
                 * still has an idx position. We consider the following 2
                 * cases:
                 *
                 * 1. S' exists and S'.LSN == S.LSN. Then S and S' are the same
                 * (logical) slot (because two different slots can never cover
                 * overlapping ranges of keys, and as a result, can never point
                 * to the same LSN). Then, S is still the correct slot to take
                 * down the tree, unless the range of keys covered by S has
                 * shrunk while P was unlatched. But the only way for S's key
                 * range to shrink is for its child IN to split, which could
                 * not have happened because if it did, the before and after
                 * LSNs of S would be different, given that splits are logged
                 * immediately. We conclude that the set of keys covered by
                 * S after P is relatched is the same or a superset of the keys
                 * covered by S before P was unlatched, and thus S (at the idx
                 * position) is still the correct slot to follow.
                 *
                 * 2. There is no idx position in P or S'.LSN != S.LSN. In
                 * this case we cannot be sure if S' (if it exists) is the
                 * correct slot to follow. So, we (re)search for K in P
                 * to check if P is still the correct parent and find the
                 * correct slot to follow. If this search lands on the 1st or
                 * last slot in P, we may return null because using the key
                 * info contained in P only, we do not know the full range of
                 * keys covered by those two slots. If null is returned, the
                 * caller is expected to restart the tree search from the root.
                 *
                 * Notice that the if conditions are necessary before calling
                 * findEntry(). Without them, we could get into an infinite
                 * loop of search re-tries in the scenario where nothing
                 * changes in the tree and findEntry always lands on the 1st or
                 * last slot in P. The conditions guarantee that we may restart
                 * the tree search only if something changes with S while P is
                 * unlatched (S moves to a different position or a different
                 * IN or it points to a different LSN).
                 *
                 * Notice also that if P does not split while it is unlatched,
                 * the range of keys covered by P does not change either. This
                 * implies that the correct slot to follow *must* be inside P,
                 * and as a result, the 1st and last slots in P can be trusted.
                 * Unfortunately, however, we have no way of detecting reliably
                 * whether P splits or not.
                 *
                 * Special care for DBs in DW mode:
                 *
                 * For DBs in DW mode, special care must be taken because
                 * splits are not immediately logged. So, for DW DBs, to avoid
                 * a call to findEntry() we require that not only S'.LSN ==
                 * S.LSN, but also the child is not cached. These 2
                 * conditions together guarantee that the child did not split
                 * while P was unlatched, because if the child did split, it
                 * was fetched and cached first, so after P is relatched,
                 * either the child would be still cached, or if it was evicted
                 * after the split, S.LSN would have changed.
                 */
                if (idx >= nEntries ||
                    getLsn(idx) != lsn ||
                    (databaseImpl.isDeferredWriteMode() &&
                     entryTargets.get(idx) != null)) {

                    if (searchKey == null) {
                        return null;
                    }

                    idx = findEntry(searchKey, false, false);

                    if ((idx == 0 || idx == nEntries - 1) &&
                        !isKeyInBounds(searchKey)) {
                        return null;
                    }
                }

                if (result != null) {
                    /* Report the (possibly adjusted) slot back. */
                    result.index = idx;
                }

                /*
                 * "this" is still the correct parent and "idx" points to the
                 * correct slot to follow for the search down the tree. But
                 * what we fetched from the log may be out-of-date by now
                 * (because it was fetched and then updated by other threads)
                 * or it may not be the correct child anymore ("idx" was
                 * changed by the findEntry() call above). We check 3 cases:
                 * (a) There is already a cached child at the "idx" position.
                 * In this case, we return whatever is there because it has to
                 * be the most recent version of the appropriate child node.
                 * This is true even when a split or reverse split occurred.
                 * The check for isKeyInBounds above is critical in that case.
                 * (b) There is no cached child at the "idx" slot, but the slot
                 * LSN is not the same as the LSN we read from the log. This is
                 * the case if "idx" was changed by findEntry() or other
                 * threads fetched the same child as this thread, updated it,
                 * and then evicted it. The child we fetched is obsolete
                 * and should not be attached. For simplicity, we just return
                 * null in this (quite rare) case.
                 * (c) If neither (a) nor (b), we attach the fetched child to
                 * the parent.
                 */
                if (entryTargets.get(idx) != null) {
                    child = (IN)entryTargets.get(idx);

                } else if (getLsn(idx) != lsn) {
                    return null;

                } else {
                    child.postFetchInit(databaseImpl, lsn);
                    attachNode(idx, child, null);
                }

                success = true;

            } catch (FileNotFoundException e) {
                throw new EnvironmentFailureException(
                    envImpl, EnvironmentFailureReason.LOG_FILE_NOT_FOUND,
                    makeFetchErrorMsg(null, this, lsn, entryStates[idx]),
                    e);
            } catch (EnvironmentFailureException e) {
                e.addErrorMessage(makeFetchErrorMsg(
                    null, this, lsn, entryStates[idx]));
                throw e;
            } catch (RuntimeException e) {
                throw new EnvironmentFailureException(
                    envImpl, EnvironmentFailureReason.LOG_INTEGRITY,
                    makeFetchErrorMsg(e.toString(), this, lsn,
                                      entryStates[idx]),
                    e);
            } finally {
                /*
                 * Release the parent latch if null is being returned. In this
                 * case, the parent was unlatched earlier during the disk read,
                 * and as a result, the caller cannot make any assumptions
                 * about the state of the parent.
                 *
                 * If we are returning or throwing out of this try block, the
                 * parent may or may not be latched. So, only release the latch
                 * if it is currently held.
                 */
                if (!success) {
                    if (child != null) {
                        child.incFetchStats(envImpl, isMiss);
                    }
                    releaseLatchIfOwner();
                }

                unpin();
            }
        }

        assert(hasResidentChildren() == hasCachedChildrenFlag());

        child.incFetchStats(envImpl, isMiss);

        return child;
    }

    /*
     * Returns the idx-th child of "this" upper IN, fetching the child from
     * the log and attaching it to its parent if it is not already attached.
     *
     * On entry, the parent must be EX-latched already and it stays EX-latched
     * for the duration of this method and on return (even in case of
     * exceptions).
1992 * 1993 * @param idx The slot of the child to fetch. 1994 */ fetchIN(int idx)1995 public IN fetchIN(int idx) { 1996 1997 assert(isUpperIN()); 1998 if (!isLatchExclusiveOwner()) { 1999 throw unexpectedState("EX-latch not held before fetch"); 2000 } 2001 2002 final EnvironmentImpl envImpl = getEnv(); 2003 boolean isMiss = false; 2004 2005 IN child = (IN) entryTargets.get(idx); 2006 2007 if (child == null) { 2008 2009 final long lsn = getLsn(idx); 2010 2011 if (lsn == DbLsn.NULL_LSN) { 2012 throw unexpectedState(makeFetchErrorMsg( 2013 "NULL_LSN in upper IN", this, lsn, entryStates[idx])); 2014 } 2015 2016 try { 2017 final WholeEntry wholeEntry = 2018 envImpl.getLogManager(). 2019 getLogEntryAllowInvisibleAtRecovery(lsn); 2020 2021 final LogEntry logEntry = wholeEntry.getEntry(); 2022 2023 child = (IN) logEntry.getResolvedItem(databaseImpl); 2024 2025 child.postFetchInit(databaseImpl, lsn); 2026 attachNode(idx, child, null); 2027 isMiss = true; 2028 2029 } catch (FileNotFoundException e) { 2030 throw new EnvironmentFailureException( 2031 envImpl, EnvironmentFailureReason.LOG_FILE_NOT_FOUND, 2032 makeFetchErrorMsg(null, this, lsn, entryStates[idx]), 2033 e); 2034 } catch (EnvironmentFailureException e) { 2035 e.addErrorMessage(makeFetchErrorMsg( 2036 null, this, lsn, entryStates[idx])); 2037 throw e; 2038 } catch (RuntimeException e) { 2039 throw new EnvironmentFailureException( 2040 envImpl, EnvironmentFailureReason.LOG_INTEGRITY, 2041 makeFetchErrorMsg(e.toString(), this, lsn, 2042 entryStates[idx]), 2043 e); 2044 } 2045 } 2046 2047 assert(hasResidentChildren() == hasCachedChildrenFlag()); 2048 2049 child.incFetchStats(envImpl, isMiss); 2050 2051 return child; 2052 } 2053 2054 /** 2055 * Returns the target of the idx'th entry or null if a pendingDeleted or 2056 * knownDeleted entry has been cleaned. 
Note that null can only be 2057 * returned for a slot that could contain a deleted LN, not other node 2058 * types and not a DupCountLN since DupCountLNs are never deleted. Null is 2059 * also returned for a KnownDeleted slot with a NULL_LSN. 2060 * 2061 * An exclusive latch must be held on this BIN. 2062 * 2063 * @return the target node or null. 2064 */ fetchLN(int idx, CacheMode cacheMode)2065 public final LN fetchLN(int idx, CacheMode cacheMode) { 2066 2067 return (LN) fetchLN(idx, cacheMode, false); 2068 } 2069 2070 /** 2071 * Must be called only when the LSN being fetched is known to be active 2072 * (non-obsolete) i.e., will always be present in the log even when the LN 2073 * is "immediately obsolete". For example, this is true when the LN is 2074 * part of an active txn during partial rollback, and when DupConvert reads 2075 * a log written prior to the "immediately obsolete" implementation. 2076 * 2077 * Although this method is called "fetchLN", it may fetch a DIN child of 2078 * a BIN when called from DupConvert. 2079 * 2080 * An exclusive latch must be held on this IN. 2081 */ fetchLNKnownActive(int idx, CacheMode cacheMode)2082 public final LN fetchLNKnownActive(int idx, CacheMode cacheMode) { 2083 2084 return (LN) fetchLN(idx, cacheMode, true); 2085 } 2086 2087 /* 2088 * This method is the same as fetchLNKnownActive, but it may return either 2089 * an LN or a DIN child of a BIN. It is meant to be used from DupConvert 2090 * only. 2091 */ fetchLNOrDIN(int idx, CacheMode cacheMode)2092 public final Node fetchLNOrDIN(int idx, CacheMode cacheMode) { 2093 2094 return fetchLN(idx, cacheMode, true); 2095 } 2096 2097 /* 2098 * Underlying implementation of the above fetchLNXXX methods. 
     */
    final Node fetchLN(int idx, CacheMode cacheMode, boolean knownActive) {

        assert(isBIN());

        if (!isLatchExclusiveOwner()) {
            throw unexpectedState("EX-latch not held before fetch");
        }

        /*
         * A KnownDeleted slot may reference an LN whose log file was already
         * cleaned; callers expect null for such slots (see class header).
         */
        if (isEntryKnownDeleted(idx)) {
            return null;
        }

        final EnvironmentImpl envImpl = getEnv();
        boolean isMiss = false;

        Node targetNode = entryTargets.get(idx);

        if (targetNode == null) {

            final long lsn = getLsn(idx);

            if (lsn == DbLsn.NULL_LSN) {
                throw unexpectedState(
                    makeFetchErrorMsg("NULL_LSN without KnownDeleted",
                                      this, lsn, entryStates[idx]));
            }

            /*
             * Fetch of immediately obsolete LN not allowed, unless it is part
             * of an active txn. This guard is
             * only very approximate. We could instead use the firstActiveLsn
             * but that's expensive and probably has lock ordering issues.
             */
            if (databaseImpl.isLNImmediatelyObsolete()) {
                if (!knownActive) {
                    throw unexpectedState(
                        "May not fetch immediately obsolete LN");
                }
                // TODO assert for more expensive check
            }

            try {
                final WholeEntry wholeEntry =
                    envImpl.getLogManager().
                    getLogEntryAllowInvisibleAtRecovery(lsn);

                final LogEntry logEntry = wholeEntry.getEntry();

                /* Ensure keys are transactionally correct. [#15704] */
                byte[] lnSlotKey = null;
                if (logEntry instanceof LNLogEntry) {
                    final LNLogEntry<?> lnEntry = (LNLogEntry<?>) logEntry;
                    lnEntry.postFetchInit(databaseImpl);
                    lnSlotKey = lnEntry.getKey();

                    /*
                     * NOTE(review): presumably fetching an LN makes this BIN
                     * "mixed" for LRU purposes unless the LN is about to be
                     * evicted again (EVICT_LN) — confirm against Evictor docs.
                     */
                    final Evictor evictor = getEvictor();
                    if (evictor != null &&
                        cacheMode != CacheMode.EVICT_LN) {
                        evictor.moveToMixedLRU(this);
                    }
                }

                targetNode = (Node) logEntry.getResolvedItem(databaseImpl);
                targetNode.postFetchInit(databaseImpl, lsn);
                attachNode(idx, targetNode, lnSlotKey);
                /* Last logged size is not present before log version 9. */
                setLastLoggedSize(idx, wholeEntry.getHeader().getEntrySize());
                isMiss = true;

            } catch (FileNotFoundException e) {
                /* Missing file is only fatal for a non-KD/PD slot. */
                if (!isEntryKnownDeleted(idx) &&
                    !isEntryPendingDeleted(idx)) {
                    throw new EnvironmentFailureException(
                        envImpl, EnvironmentFailureReason.LOG_FILE_NOT_FOUND,
                        makeFetchErrorMsg(null, this, lsn, entryStates[idx]),
                        e);
                }

                /*
                 * Cleaner got to the log file, so just return null. It is safe
                 * to ignore a deleted file for a KD or PD entry because files
                 * with active txns will not be cleaned.
                 */
                return null;

            } catch (EnvironmentFailureException e) {
                /* Add slot/LSN context for debugging, then re-throw. */
                e.addErrorMessage(makeFetchErrorMsg(
                    null, this, lsn, entryStates[idx]));
                throw e;

            } catch (RuntimeException e) {
                throw new EnvironmentFailureException(
                    envImpl, EnvironmentFailureReason.LOG_INTEGRITY,
                    makeFetchErrorMsg(e.toString(), this, lsn,
                                      entryStates[idx]),
                    e);
            }
        }

        targetNode.incFetchStats(envImpl, isMiss);

        return targetNode;
    }

    /**
     * Initialize a node that has been read in from the log.
     */
    @Override
    public final void postFetchInit(DatabaseImpl db, long lastLoggedLsn) {

        commonPostFetchInit(db, lastLoggedLsn);
        convertDupKeys(); // must be after initMemorySize

        /* Make the freshly fetched node visible on the INList. */
        db.getEnv().getInMemoryINs().add(this);

        final Evictor evictor = getEvictor();

        /* DINs/DBINs (old dup format) are not tracked by the evictor LRU. */
        if (evictor != null && isIN() && !isDIN() && !isDBIN()) {
            if (isUpperIN() && traceLRU) {
                LoggerUtils.envLogMsg(
                    traceLevel, getEnv(),
                    Thread.currentThread().getId() + "-" +
                    Thread.currentThread().getName() +
                    "-" + getEnv().getName() +
                    " postFetchInit(): " +
                    " Adding UIN to LRU: " + getNodeId());
            }
            evictor.addBack(this);
        }
    }

    /**
     * Initialize a node read in during recovery. Unlike postFetchInit, does
     * not touch the INList or the evictor (both are managed separately during
     * recovery).
     */
    public final void postRecoveryInit(DatabaseImpl db, long lastLoggedLsn) {
        commonPostFetchInit(db, lastLoggedLsn);
    }

    /**
     * Common actions of postFetchInit and postRecoveryInit: bind the node to
     * its database, record the version it was materialized from, and compute
     * its memory footprint.
     */
    private void commonPostFetchInit(DatabaseImpl db, long lastLoggedLsn) {
        setDatabase(db);
        setLastLoggedLsn(lastLoggedLsn);
        initMemorySize(); // compute before adding to IN list
    }

    /**
     * Needed only during duplicates conversion, not during normal operation.
     * The needDupKeyConversion field will only be true when first upgrading
     * from JE 4.1. After the first time an IN is converted, it will be
     * written out in a later file format, so the needDupKeyConversion field
     * will be false and this method will do nothing. See
     * DupConvert.convertInKeys.
     */
    private void convertDupKeys() {
        /* Do not convert more than once.
         */
        if (!needDupKeyConversion) {
            return;
        }
        needDupKeyConversion = false;
        DupConvert.convertInKeys(databaseImpl, this);
    }

    /**
     * Counts this fetch in the evictor's BIN/UIN hit-or-miss statistics.
     *
     * @see Node#incFetchStats
     */
    @Override
    final void incFetchStats(EnvironmentImpl envImpl, boolean isMiss) {
        Evictor e = envImpl.getEvictor();
        /* No evictor (e.g. certain utility environments): nothing to count. */
        if (e == null) {
            return;
        }
        if (isBIN()) {
            e.incBINFetchStats(isMiss, isBINDelta(false/*checkLatched*/));
        } else {
            e.incUINFetchStats(isMiss);
        }
    }

    /**
     * Builds a detailed message for fetch failures, including the problem
     * LSN and the parent slot's state.
     *
     * @param msg optional extra detail appended at the end; may be null.
     * @param in parent IN. Is null when root is fetched.
     * @param lsn the LSN being fetched; may be NULL_LSN.
     * @param state the entryStates byte of the parent slot.
     */
    static String makeFetchErrorMsg(String msg, IN in, long lsn, byte state) {

        /*
         * Bolster the exception with the LSN, which is critical for
         * debugging. Otherwise, the exception propagates upward and loses the
         * problem LSN.
         */
        StringBuilder sb = new StringBuilder();

        if (in == null) {
            sb.append("fetchRoot of ");
        } else if (in.isBIN()) {
            sb.append("fetchLN of ");
        } else {
            sb.append("fetchIN of ");
        }

        if (lsn == DbLsn.NULL_LSN) {
            sb.append("null lsn");
        } else {
            sb.append(DbLsn.getNoFormatString(lsn));
        }

        if (in != null) {
            sb.append(" parent IN=").append(in.getNodeId());
            sb.append(" IN class=").append(in.getClass().getName());
            sb.append(" lastFullVersion=");
            sb.append(DbLsn.getNoFormatString(in.getLastFullVersion()));
            sb.append(" lastLoggedVersion=");
            sb.append(DbLsn.getNoFormatString(in.getLastLoggedVersion()));
            sb.append(" parent.getDirty()=").append(in.getDirty());
        }

        sb.append(" state=").append(state);

        if (msg != null) {
            sb.append(" ").append(msg);
        }

        return sb.toString();
    }

    public final int
findEntry( 2326 byte[] key, 2327 boolean indicateIfDuplicate, 2328 boolean exact) { 2329 2330 return findEntry(key, indicateIfDuplicate, exact, null /*Comparator*/); 2331 } 2332 2333 /** 2334 * Find the entry in this IN for which key is LTE the key arg. 2335 * 2336 * Currently uses a binary search, but eventually, this may use binary or 2337 * linear search depending on key size, number of entries, etc. 2338 * 2339 * This method guarantees that the key parameter, which is the user's key 2340 * parameter in user-initiated search operations, is always the left hand 2341 * parameter to the Comparator.compare method. This allows a comparator 2342 * to perform specialized searches, when passed down from upper layers. 2343 * 2344 * This is public so that DbCursorTest can access it. 2345 * 2346 * Note that the 0'th entry's key is treated specially in an IN. It always 2347 * compares lower than any other key. 2348 * 2349 * @param key - the key to search for. 2350 * @param indicateIfDuplicate - true if EXACT_MATCH should 2351 * be or'd onto the return value if key is already present in this node. 2352 * @param exact - true if an exact match must be found. 2353 * @return offset for the entry that has a key LTE the arg. 0 if key 2354 * is less than the 1st entry. -1 if exact is true and no exact match 2355 * is found. If indicateIfDuplicate is true and an exact match was found 2356 * then EXACT_MATCH is or'd onto the return value. 2357 */ findEntry( byte[] key, boolean indicateIfDuplicate, boolean exact, Comparator<byte[]> comparator)2358 public final int findEntry( 2359 byte[] key, 2360 boolean indicateIfDuplicate, 2361 boolean exact, 2362 Comparator<byte[]> comparator) { 2363 2364 int high = nEntries - 1; 2365 int low = 0; 2366 int middle = 0; 2367 2368 Comparator<byte[]> userCompareToFcn = (comparator != null) ? 
2369 comparator : 2370 databaseImpl.getKeyComparator(); 2371 2372 /* 2373 * Special Treatment of 0th Entry 2374 * ------------------------------ 2375 * IN's are special in that they have a entry[0] where the key is a 2376 * virtual key in that it always compares lower than any other key. 2377 * BIN's don't treat key[0] specially. But if the caller asked for an 2378 * exact match or to indicate duplicates, then use the key[0] and 2379 * forget about the special entry zero comparison. 2380 * 2381 * We always use inexact searching to get down to the BIN, and then 2382 * call findEntry separately on the BIN if necessary. So the behavior 2383 * of findEntry is different for BINs and INs, because it's used in 2384 * different ways. 2385 * 2386 * Consider a tree where the lowest key is "b" and we want to insert 2387 * "a". If we did the comparison (with exact == false), we wouldn't 2388 * find the correct (i.e. the left) path down the tree. So the 2389 * virtual key ensures that "a" gets inserted down the left path. 2390 * 2391 * The insertion case is a good specific example. findBinForInsert 2392 * does inexact searching in the INs only, not the BIN. 2393 * 2394 * There's nothing special about the 0th key itself, only the use of 2395 * the 0th key in the comparison algorithm. 
2396 */ 2397 boolean entryZeroSpecialCompare = 2398 isUpperIN() && !exact && !indicateIfDuplicate; 2399 2400 assert nEntries >= 0; 2401 2402 while (low <= high) { 2403 2404 middle = (high + low) / 2; 2405 int s; 2406 byte[] middleKey = null; 2407 2408 if (middle == 0 && entryZeroSpecialCompare) { 2409 s = 1; 2410 } else { 2411 middleKey = getKey(middle); 2412 s = Key.compareKeys(key, middleKey, userCompareToFcn); 2413 } 2414 2415 if (s < 0) { 2416 high = middle - 1; 2417 } else if (s > 0) { 2418 low = middle + 1; 2419 } else { 2420 int ret; 2421 if (indicateIfDuplicate) { 2422 ret = middle | EXACT_MATCH; 2423 } else { 2424 ret = middle; 2425 } 2426 2427 if ((ret >= 0) && exact && isEntryKnownDeleted(ret & 0xffff)) { 2428 return -1; 2429 } else { 2430 return ret; 2431 } 2432 } 2433 } 2434 2435 /* 2436 * No match found. Either return -1 if caller wanted exact matches 2437 * only, or return entry for which arg key is > entry key. 2438 */ 2439 if (exact) { 2440 return -1; 2441 } else { 2442 return high; 2443 } 2444 } 2445 2446 /** 2447 * Inserts a slot with the given key, lsn and child node into this IN, if 2448 * a slot with the same key does not exist already. The state of the new 2449 * slot is set to DIRTY. Assumes this node is already latched by the 2450 * caller. 2451 * 2452 * @return true if the entry was successfully inserted, false 2453 * if it was a duplicate. 2454 * 2455 * @throws EnvironmentFailureException if the node is full 2456 * (it should have been split earlier). 2457 */ insertEntry( Node child, byte[] key, long childLsn)2458 public final boolean insertEntry( 2459 Node child, 2460 byte[] key, 2461 long childLsn) 2462 throws DatabaseException { 2463 2464 assert(!isBINDelta()); 2465 2466 return 2467 (insertEntry1(child, key, childLsn, EntryStates.DIRTY_BIT, false) & 2468 INSERT_SUCCESS) != 0; 2469 } 2470 2471 /** 2472 * Inserts a slot with the given key, lsn and child node into this IN, if 2473 * a slot with the same key does not exist already. 
The state of the new 2474 * slot is set to DIRTY. Assumes this node is already latched by the 2475 * caller. 2476 * 2477 * @return either (1) the index of location in the IN where the entry was 2478 * inserted |'d with INSERT_SUCCESS, or (2) the index of the duplicate in 2479 * the IN if the entry was found to be a duplicate. 2480 * 2481 * @throws EnvironmentFailureException if the node is full (it should have 2482 * been split earlier). 2483 */ insertEntry1( Node child, byte[] key, long childLsn, boolean blindInsertion)2484 public final int insertEntry1( 2485 Node child, 2486 byte[] key, 2487 long childLsn, 2488 boolean blindInsertion) { 2489 2490 return insertEntry1( 2491 child, key, childLsn, EntryStates.DIRTY_BIT, blindInsertion); 2492 } 2493 2494 /** 2495 * Inserts a slot with the given key, lsn, state, and child node into this 2496 * IN, if a slot with the same key does not exist already. Assumes this 2497 * node is already latched by the caller. 2498 * 2499 * This returns a failure if there's a duplicate match. The caller must do 2500 * the processing to check if the entry is actually deleted and can be 2501 * overwritten. This is foisted upon the caller rather than handled in this 2502 * object because there may be some latch releasing/retaking in order to 2503 * check a child LN. 2504 * 2505 * @return either (1) the index of location in the IN where the entry was 2506 * inserted |'d with INSERT_SUCCESS, or (2) the index of the duplicate in 2507 * the IN if the entry was found to be a duplicate. 2508 * 2509 * @throws EnvironmentFailureException if the node is full (it should have 2510 * been split earlier). 
     */
    public final int insertEntry1(
        Node child,
        byte[] key,
        long childLsn,
        byte state,
        boolean blindInsertion) {

        /*
         * Search without requiring an exact match, but do let us know the
         * index of the match if there is one.
         */
        int index = findEntry(key, true, false);

        if (index >= 0 && (index & EXACT_MATCH) != 0) {

            /*
             * There is an exact match. Don't insert; let the caller decide
             * what to do with this duplicate.
             */
            return index & ~IN.EXACT_MATCH;
        }

        /*
         * There was no key match, but if this is a bin delta, there may be an
         * exact match in the full bin. Mutate to full bin and search again.
         * However, if we know for sure that the key does not exist in the full
         * BIN, then don't mutate, unless there is no space in the delta to do
         * the insertion.
         */
        if (isBINDelta()) {

            BIN bin = (BIN)this;

            boolean doBlindInsertion = (nEntries < getMaxEntries());

            /*
             * A blind insertion is only safe when the caller vouches for it
             * (blindInsertion) or the delta's filter says the key cannot be
             * in the full BIN.
             */
            if (doBlindInsertion &&
                !blindInsertion &&
                bin.mayHaveKeyInFullBin(key)) {

                doBlindInsertion = false;
            }

            if (!doBlindInsertion) {

                mutateToFullBIN();

                /* Re-search: the mutation changed slot positions. */
                index = findEntry(key, true, false);

                if (index >= 0 && (index & EXACT_MATCH) != 0) {
                    return index & ~IN.EXACT_MATCH;
                }
            } else {
                getEvictor().incBinDeltaBlindOps();

                if (traceDeltas) {
                    LoggerUtils.envLogMsg(
                        traceLevel, getEnv(),
                        Thread.currentThread().getId() + "-" +
                        Thread.currentThread().getName() +
                        "-" + getEnv().getName() +
                        (blindInsertion ?
                         " Blind insertion in BIN-delta " :
                         " Blind put in BIN-delta ") +
                        getNodeId() + " nEntries = " +
                        nEntries + " max entries = " +
                        getMaxEntries() +
                        " full BIN entries = " +
                        bin.getFullBinNEntries() +
                        " full BIN max entries = " +
                        bin.getFullBinMaxEntries());
                }
            }
        }

        if (nEntries >= getMaxEntries()) {
            throw unexpectedState(
                getEnv(),
                "Node " + getNodeId() +
                " should have been split before calling insertEntry" +
                " is BIN-delta: " + isBINDelta() +
                " num entries: " + nEntries +
                " max entries: " + getMaxEntries());
        }

        /* There was no key match, so insert to the right of this entry. */
        index++;

        /* We found a spot for insert, shift entries as needed. */
        if (index < nEntries) {
            int oldSize = computeLsnOverhead();

            /* Adding elements to the LSN array can change the space used. */
            shiftEntriesRight(index);

            updateMemorySize(computeLsnOverhead() - oldSize);
        } else {
            nEntries++;
        }

        if (isBINDelta()) {
            ((BIN)this).incFullBinNEntries();
        }

        int oldSize = computeLsnOverhead();

        setTarget(index, child);
        /* setLsnInternal can mutate to an array of longs. */
        setLsnInternal(index, childLsn);
        entryStates[index] = state;
        boolean multiSlotChange = setKeyAndPrefix(index, key);

        adjustCursorsForInsert(index);

        /* Memory accounting: new slot content plus any LSN-array mutation. */
        updateMemorySize(oldSize,
                         getEntryInMemorySize(index) +
                         computeLsnOverhead());

        /* A key-prefix change can resize every slot; recompute from scratch. */
        if (multiSlotChange) {
            updateMemorySize(inMemorySize, computeMemorySize());
        }

        setDirty(true);

        assert(isBIN() || hasResidentChildren() == hasCachedChildrenFlag());

        return (index | INSERT_SUCCESS);
    }

    /**
     * Deletes the ChildReference with the key arg from this IN. Assumes this
     * node is already latched by the caller.
     *
     * This seems to only be used by INTest.
     *
     * @param key The key of the reference to delete from the IN.
     *
     * @param maybeValidate true if assert validation should occur prior to
     * delete. Set this to false during recovery.
     *
     * @return true if the entry was successfully deleted, false if it was not
     * found.
     */
    final boolean deleteEntry(byte[] key, boolean maybeValidate)
        throws DatabaseException {

        assert(!isBINDelta());

        if (nEntries == 0) {
            return false; // caller should put this node on the IN cleaner list
        }

        /* Exact search: only a true key match may be deleted. */
        int index = findEntry(key, false, true);
        if (index < 0) {
            return false;
        }

        return deleteEntry(index, maybeValidate);
    }

    /**
     * Deletes the ChildReference at index from this IN. Assumes this node is
     * already latched by the caller.
     *
     * @param index The index of the entry to delete from the IN.
     *
     * @param maybeValidate true if asserts are enabled.
     *
     * @return true if the entry was successfully deleted, false if it was not
     * found.
     */
    public final boolean deleteEntry(int index, boolean maybeValidate)
        throws DatabaseException {

        assert(!isBINDelta());

        if (nEntries == 0) {
            return false;
        }

        /* Check the subtree validation only if maybeValidate is true. */
        assert maybeValidate ?
            validateSubtreeBeforeDelete(index) :
            true;

        if (index < nEntries) {

            Node child = getTarget(index);

            /* A resident child IN must leave the INList before detaching. */
            if (child != null && child.isIN()) {
                IN childIN = (IN)child;
                getEnv().getInMemoryINs().remove(childIN);
            }

            updateMemorySize(getEntryInMemorySize(index), 0);
            int oldLSNArraySize = computeLsnOverhead();

            /*
             * Do the actual deletion. Note: setTarget() must be called before
             * copyEntries() so that the hasCachedChildrenFlag will be properly
             * maintained.
             */
            setTarget(index, null);
            copyEntries(index + 1, index, nEntries - index - 1);
            nEntries--;

            setDirty(true);
            setProhibitNextDelta();

            /* cleanup what used to be the last entry */
            clearEntry(nEntries);

            /* setLsnInternal can mutate to an array of longs. */
            updateMemorySize(oldLSNArraySize, computeLsnOverhead());

            assert(isBIN() || hasCachedChildrenFlag() == hasResidentChildren());

            /*
             * Note that we don't have to adjust cursors for delete, since
             * there should be nothing pointing at this record.
             */
            traceDelete(Level.FINEST, index);
            return true;
        } else {
            return false;
        }
    }

    /**
     * Resets the idx-th slot to an empty state (no target, no key, NULL_LSN,
     * zero state bits).
     *
     * WARNING: clearEntry() calls entryTargets.set() directly, instead of
     * setTarget(). As a result, the hasCachedChildren flag of the IN is not
     * updated here. The caller is responsible for updating this flag, if
     * needed.
     */
    void clearEntry(int idx) {

        entryTargets = entryTargets.set(idx, null, this);
        entryKeyVals = entryKeyVals.set(idx, null, this);
        setLsnInternal(idx, DbLsn.NULL_LSN);
        entryStates[idx] = 0;
    }

    /**
     * Update the idx'th entry of this node: set its LSN and last-logged size
     * and mark the node dirty. The target and key are left untouched.
     */
    public final void updateEntry(int idx, long lsn, int lastLoggedSize) {

        setLsn(idx, lsn);
        setLastLoggedSize(idx, lastLoggedSize);
        setDirty(true);
    }

    /**
     * Update the idx'th entry of this node.
     *
     * This method is used only by the BIN.applyDelta() method.
     *
     * Unlike other update methods, the LSN may be NULL_LSN if the KD flag is
     * set. This allows applying a BIN-delta with a NULL_LSN and KD, for an
     * invisible log entry for example.
     */
    final void updateEntry(
        int idx,
        Node node,
        long lsn,
        int lastLoggedSize,
        byte state) {

        assert(isBIN());
        assert(!isBINDelta());
        /* NULL_LSN is only legal when the slot is KnownDeleted (see above). */
        assert (lsn != DbLsn.NULL_LSN) ||
            ((state & EntryStates.KNOWN_DELETED_BIT) != 0);

        setLsn(idx, lsn, false/*check*/);
        setLastLoggedSize(idx, lastLoggedSize);
        entryStates[idx] = state;
        setTarget(idx, node);
        setDirty(true);
    }

    /**
     * Update the LSN, key, lastLoggedSize, and cached-child properties of
     * the idx-th slot.
     *
     * This method is used to insert a new record in an existing slot. It is
     * also used in DupConvert, where it is called to convert the keys of an
     * upper IN that has just been fetched from the log and is not attached to
     * in-memory tree yet.
     */
    public final void updateEntry(
        int idx,
        Node node,
        long lsn,
        int lastLoggedSize,
        byte[] key) {

        /*
         * Sanity check: we may only replace a cached child that is an LN,
         * off the INList, or the same node being (re)installed.
         */
        Node child = getTarget(idx);
        assert(child == null ||
               child.isLN() ||
               !((IN)child).getInListResident() ||
               child == node);

        long oldSize = getEntryInMemorySize(idx);

        setLsn(idx, lsn);
        setLastLoggedSize(idx, lastLoggedSize);
        setTarget(idx, node);

        boolean multiSlotChange = setKeyAndDirty(idx, key);

        /* A prefix change can resize all slots; recompute from scratch. */
        if (multiSlotChange) {
            updateMemorySize(inMemorySize, computeMemorySize());
        } else {
            long newSize = getEntryInMemorySize(idx);
            updateMemorySize(oldSize, newSize);
        }

        setDirty(true);

        assert(isBIN() || hasResidentChildren() == hasCachedChildrenFlag());
    }

    /**
     * Update the idx'th entry of this node. This flavor is used in
     * CursorImpl.updateRecordInternal() and CursorImpl.deleteCurrentRecord().
The target is not set by this method, because it has already been set
     * or modified in-place by the caller; instead, the size of the old LN is
     * passed as input in order to do memory counting. This method allows
     * updating the slot without replacing the target to support delete and
     * update without fetching or causing eviction.
     */
    public final void updateEntry(
        int idx,
        long oldNodeSize,
        long lsn,
        int lastLoggedSize,
        byte[] lnSlotKey) {

        assert(isBIN());

        long oldSize = getEntryInMemorySize(idx);

        setLsn(idx, lsn);
        setLastLoggedSize(idx, lastLoggedSize);

        boolean multiSlotChange = setLNSlotKey(idx, lnSlotKey);
        if (multiSlotChange) {
            /* A prefix change can resize all slots; recompute everything. */
            updateMemorySize(inMemorySize, computeMemorySize());
        } else {
            long newSize = getEntryInMemorySize(idx);
            updateMemorySize(oldSize, newSize);

            /* Update mem size for node change. */
            final Node currentNode = entryTargets.get(idx);
            long newNodeSize = (currentNode != null ?
                                currentNode.getMemorySizeIncludedByParent() :
                                0);
            updateMemorySize(oldNodeSize, newNodeSize);
        }

        setDirty(true);
    }

    /**
     * Update the cached-child and LSN properties of the idx-th slot. This
     * method is used by the RecoveryManager to change the version of a
     * child node, either to a later version (in case of redo), or to an
     * earlier version (in case of undo). The child node may or may not be
     * already attached to the tree.
     */
    public final void updateNode(
        int idx,
        Node node,
        long lsn,
        int lastLoggedSize) {

        long oldSize = getEntryInMemorySize(idx);

        /*
         * If we are about to detach a cached child IN, make sure that it is
         * not in the INList. This should be the case indeed, because if the
         * child node is an IN, this method is called during the recovery
         * phase where the INList is disabled,
         */
        Node child = getTarget(idx);
        assert(child == null ||
               child.isLN() ||
               !((IN)child).getInListResident() ||
               child == node/* this is needed by a unit test*/);

        setLsn(idx, lsn);
        setLastLoggedSize(idx, lastLoggedSize);
        setTarget(idx, node);

        long newSize = getEntryInMemorySize(idx);
        updateMemorySize(oldSize, newSize);

        setDirty(true);

        assert(isBIN() || hasResidentChildren() == hasCachedChildrenFlag());
    }

    /**
     * Attach the given node as the idx-th child of "this" node. If the child
     * node is an LN, set the key of the parent slot to the given key value,
     * if the 2 key values do not compare equal (see setLNSlotKey() for the
     * reason why the key may be updated).
     *
     * This method is called after the child node has been either (a) fetched
     * in from disk and is not dirty, or (b) is a newly created instance that
     * will be written out later by something like a checkpoint. In either
     * case, the slot LSN does not need to be updated.
     *
     * Note: does not dirty the node unless the LN slot key is changed.
     */
    public final void attachNode(int idx, Node node, byte[] lnSlotKey) {

        long oldSize = getEntryInMemorySize(idx);

        /* Make sure we are not using this method to detach a cached child */
        assert(getTarget(idx) == null);

        setTarget(idx, node);

        boolean multiSlotChange = setLNSlotKey(idx, lnSlotKey);
        if (multiSlotChange) {
            /* A prefix change can resize all slots; recompute everything. */
            updateMemorySize(inMemorySize, computeMemorySize());
        } else {
            long newSize = getEntryInMemorySize(idx);
            updateMemorySize(oldSize, newSize);
        }

        assert(isBIN() || hasResidentChildren() == hasCachedChildrenFlag());
    }

    /*
     * Detach from the tree the child node at the idx-th slot.
     *
     * The most common caller of this method is the evictor. If the child
     * being evicted was dirty, it has just been logged and the lsn of the
     * slot must be updated.
     */
    public final void detachNode(int idx, boolean updateLsn, long newLsn) {

        long oldSize = getEntryInMemorySize(idx);

        Node child = getTarget(idx);

        if (updateLsn) {
            setLsn(idx, newLsn);
            setDirty(true);
        }
        setTarget(idx, null);

        long newSize = getEntryInMemorySize(idx);
        updateMemorySize(oldSize, newSize);

        /* A detached child IN must also leave the INList. */
        if (child != null && child.isIN()) {
            IN childIN = (IN)child;
            getEnv().getInMemoryINs().remove(childIN);
        }

        assert(isBIN() || hasResidentChildren() == hasCachedChildrenFlag());
    }

    /**
     * Shifts n slots from position "from" to position "to", moving targets,
     * keys, states and LSNs together. Does not adjust memory accounting;
     * callers do that.
     */
    void copyEntries(final int from, final int to, final int n) {

        entryTargets = entryTargets.copy(from, to, n, this);
        entryKeyVals = entryKeyVals.copy(from, to, n, this);

        System.arraycopy(entryStates, from, entryStates, to, n);

        if (entryLsnLongArray == null) {
            /* Compact LSN representation: 4 bytes per slot (hence << 2). */
            final int fromOff = from << 2;
            final int toOff = to << 2;
            final int nBytes = n << 2;
            System.arraycopy(entryLsnByteArray, fromOff,
                             entryLsnByteArray, toOff, nBytes);
        } else {
            System.arraycopy(entryLsnLongArray, from,
                             entryLsnLongArray, to,
                             n);
        }
    }

    /*
     * All methods that modify the entry array must adjust memory sizing.
     */

    /**
     * Set the idx'th entry of this node using the specified entry of another
     * node. Do not validate the LSN, just set it unconditionally. Used for
     * moving entries between BINs during splits.
     *
     * WARNING: This method bumps nEntries if it is less than idx+1,
     * effectively appending the slot.
     */
    void copyEntry(int idx, IN from, int fromIdx) {

        assert(!isBINDelta());

        final Node target = from.entryTargets.get(fromIdx);
        final byte[] keyVal = from.getKey(fromIdx);
        final long lsn = from.getLsn(fromIdx);
        final byte state = from.entryStates[fromIdx];

        long oldSize = computeLsnOverhead();
        int newNEntries = idx + 1;

        if (newNEntries > nEntries) {

            /*
             * If the new entry is going to bump nEntries, then we don't need
             * the entry size accounting included in oldSize.
             */
            nEntries = newNEntries;
        } else {
            oldSize += getEntryInMemorySize(idx);
        }

        setTarget(idx, target);
        boolean multiSlotChange = setKeyAndPrefix(idx, keyVal);

        /* setLsnInternal can mutate to an array of longs. */
        setLsnInternal(idx, lsn);

        entryStates[idx] = state;

        if (multiSlotChange) {
            /* A prefix change can resize all slots; recompute everything. */
            updateMemorySize(inMemorySize, computeMemorySize());
        } else {
            long newSize = getEntryInMemorySize(idx) + computeLsnOverhead();
            updateMemorySize(oldSize, newSize);
        }

        setDirty(true);
    }

    /**
     * Return true if this node needs splitting.
For the moment, needing to be 3051 * split is defined by there being no free entries available. 3052 */ needsSplitting()3053 public final boolean needsSplitting() { 3054 3055 if (isBINDelta()) { 3056 BIN bin = (BIN)this; 3057 int fullBinNEntries = bin.getFullBinNEntries(); 3058 int fullBinMaxEntries = bin.getFullBinMaxEntries(); 3059 3060 if (fullBinNEntries < 0) { 3061 /* fullBinNEntries is unknown in logVersions < 10 */ 3062 mutateToFullBIN(); 3063 } else { 3064 assert(fullBinNEntries > 0); 3065 return ((fullBinMaxEntries - fullBinNEntries) < 1); 3066 } 3067 } 3068 3069 return ((getMaxEntries() - nEntries) < 1); 3070 } 3071 3072 /** 3073 * Split this into two nodes. Parent IN is passed in parent and should be 3074 * latched by the caller. 3075 * 3076 * childIndex is the index in parent of where "this" can be found. 3077 */ split( IN parent, int childIndex, IN grandParent, int maxEntries, CacheMode cacheMode)3078 final void split( 3079 IN parent, 3080 int childIndex, 3081 IN grandParent, 3082 int maxEntries, 3083 CacheMode cacheMode) 3084 throws DatabaseException { 3085 3086 splitInternal( 3087 parent, childIndex, grandParent, maxEntries, -1, cacheMode); 3088 } 3089 3090 /** 3091 * Called when we know we are about to split on behalf of a key that is the 3092 * minimum (leftSide) or maximum (!leftSide) of this node. This is 3093 * achieved by just forcing the split to occur either one element in from 3094 * the left or the right (i.e. splitIndex is 1 or nEntries - 1). 
     */
    void splitSpecial(
        IN parent,
        int parentIndex,
        IN grandParent,
        int maxEntriesPerNode,
        byte[] key,
        boolean leftSide,
        CacheMode cacheMode)
        throws DatabaseException {

        int index = findEntry(key, false, false);

        if (leftSide && index == 0) {
            splitInternal(
                parent, parentIndex, grandParent, maxEntriesPerNode, 1,
                cacheMode);

        } else if (!leftSide && index == (nEntries - 1)) {
            splitInternal(
                parent, parentIndex, grandParent, maxEntriesPerNode,
                nEntries - 1, cacheMode);

        } else {
            /* Key is not at the edge after all: fall back to a mid split. */
            split(
                parent, parentIndex, grandParent, maxEntriesPerNode,
                cacheMode);
        }
    }

    /**
     * Split this node at splitIndex (or at the midpoint when splitIndex is
     * negative), moving one half of the slots into a newly created sibling.
     * The half that keeps the identifierKey stays in this node. The new
     * sibling, this node and the parent are all logged; the caller must hold
     * the parent latch.
     */
    final void splitInternal(
        IN parent,
        int childIndex,
        IN grandParent,
        int maxEntries,
        int splitIndex,
        CacheMode cacheMode)
        throws DatabaseException {

        assert(!isBINDelta());

        /*
         * Find the index of the existing identifierKey so we know which IN
         * (new or old) to put it in.
         */
        if (identifierKey == null) {
            throw unexpectedState();
        }

        int idKeyIndex = findEntry(identifierKey, false, false);

        if (splitIndex < 0) {
            splitIndex = nEntries / 2;
        }

        /* Range of entries to copy to new sibling. */
        final int low, high;

        if (idKeyIndex < splitIndex) {

            /*
             * Current node (this) keeps left half entries. Right half entries
             * will go in the new node.
             */
            low = splitIndex;
            high = nEntries;
        } else {

            /*
             * Current node (this) keeps right half entries. Left half entries
             * will go in the new node.
             */
            low = 0;
            high = splitIndex;
        }

        byte[] newIdKey = getKey(low);
        long parentLsn = DbLsn.NULL_LSN;

        IN newSibling = createNewInstance(newIdKey, maxEntries, level);

        newSibling.latch(CacheMode.UNCHANGED);

        try {
            int toIdx = 0;
            boolean deletedEntrySeen = false;
            BINReference binRef = null;
            int newSiblingNEntries = (high - low);
            boolean haveCachedChildren = hasCachedChildrenFlag();

            assert(isBIN() || haveCachedChildren == hasResidentChildren());

            /**
             * Distribute entries among the split node and the new sibling.
             */
            for (int i = low; i < high; i++) {

                if (isEntryPendingDeleted(i)) {
                    if (!deletedEntrySeen) {
                        deletedEntrySeen = true;
                        assert newSibling.isBIN();
                        binRef = ((BIN) newSibling).createReference();
                    }
                }

                newSibling.copyEntry(toIdx++, this, i);
                clearEntry(i);
            }

            /* If we gave away the left half, close the hole at index 0. */
            if (low == 0) {
                shiftEntriesLeft(newSiblingNEntries);
            }

            newSibling.nEntries = toIdx;
            nEntries -= newSiblingNEntries;
            setDirty(true);

            if (isUpperIN() && haveCachedChildren) {
                setHasCachedChildrenFlag(hasResidentChildren());
            }

            assert(isBIN() ||
                   hasCachedChildrenFlag() == hasResidentChildren());
            assert(isBIN() ||
                   newSibling.hasCachedChildrenFlag() ==
                   newSibling.hasResidentChildren());

            /*
             * Always add to compressor queue since a full BIN is logged below,
             * and never a delta.
             */
            if (deletedEntrySeen) {
                getEnv().addToCompressorQueue(binRef, false);
            }

            adjustCursors(newSibling, low, high);

            /*
             * If this node has no key prefix, calculate it now that it has
             * been split. This must be done before logging, to ensure the
             * prefix information is made persistent [#20799].
             */
            byte[] newKeyPrefix = computeKeyPrefix(-1);
            recalcSuffixes(newKeyPrefix, null, -1);
            /* Apply compaction after prefixing [#20799]. */
            entryKeyVals = entryKeyVals.compact(this);

            /* Only recalc if there are multiple entries in newSibling. */
            if (newSibling.getNEntries() > 1) {
                byte[] newSiblingPrefix = newSibling.getKeyPrefix();
                newSiblingPrefix = newSibling.computeKeyPrefix(-1);
                newSibling.recalcSuffixes(newSiblingPrefix, null, -1);
                /* initMemorySize calls entryKeyVals.compact. */
                newSibling.initMemorySize();
            }

            /*
             * Update size. newSibling and parent are correct, but this IN has
             * had its entries shifted and is not correct.
             *
             * Also, inMemorySize does not reflect changes that may have
             * resulted from key prefixing related changes, it needs to be
             * brought up to date, so update it appropriately for this and the
             * above reason.
             */
            EnvironmentImpl env = getEnv();
            INList inMemoryINs = env.getInMemoryINs();
            long oldMemorySize = inMemorySize;
            long newSize = computeMemorySize();
            updateMemorySize(oldMemorySize, newSize);
            inMemoryINs.add(newSibling);

            /*
             * Parent refers to child through an element of the entries array.
             * Depending on which half of the BIN we copied keys from, we
             * either have to adjust one pointer and add a new one, or we have
             * to just add a new pointer to the new sibling.
             *
             * We must use the provisional logging for two reasons:
             *
             * 1) All three log entries must be read atomically. The parent
             * must get logged last, as all referred-to children must precede
             * it. Provisional entries guarantee that all three are processed
             * as a unit. Recovery skips provisional entries, so the changed
             * children are only used if the parent makes it out to the log.
             *
             * 2) We log all the way to the root to avoid the "great aunt"
             * problem (see LevelRecorder), and provisional logging is
             * necessary during a checkpoint for levels less than
             * maxFlushLevel.
             */
            LogManager logManager = env.getLogManager();

            long newSiblingLsn =
                newSibling.optionalLogProvisional(logManager, parent);

            long myNewLsn = optionalLogProvisional(logManager, parent);

            /*
             * When we update the parent entry, we make sure that we don't
             * replace the parent's key that points at 'this' with a key that
             * is > than the existing one. Replacing the parent's key with
             * something > would effectively render a piece of the subtree
             * inaccessible. So only replace the parent key with something
             * <= the existing one. See tree/SplitTest.java for more details
             * on the scenario.
             */
            if (low == 0) {

                /*
                 * Change the original entry to point to the new child and add
                 * an entry to point to the newly logged version of this
                 * existing child.
                 */
                parent.prepareForSlotReuse(childIndex);

                parent.updateSplitSlot(
                    childIndex, newSibling, newSiblingLsn, newIdKey);

                byte[] ourKey = getKey(0);
                boolean inserted = parent.insertEntry(this, ourKey, myNewLsn);
                assert inserted;
            } else {

                /*
                 * Update the existing child's LSN to reflect the newly logged
                 * version and insert new child into parent.
                 */
                parent.updateSplitSlot(childIndex, this, myNewLsn, getKey(0));

                boolean inserted = parent.insertEntry(
                    newSibling, newIdKey, newSiblingLsn);
                assert inserted;
            }

            /**
             * Log the parent. Note that the root slot or grandparent slot is
             * not updated with the parent's LSN here; this is done by
             * Tree.forceSplit.
             */
            if (parent.isDbRoot()) {
                parentLsn = parent.optionalLog(logManager);
            } else {
                parentLsn = parent.optionalLogProvisional(
                    logManager, grandParent);
            }

            /*
             * Check whether either the old or the new sibling must be added
             * to the LRU (mixed LRUSet).
             */
            assert(!isDIN() && !isDBIN());

            final Evictor evictor = getEvictor();

            if (evictor != null) {
                if (isBIN() || !newSibling.hasCachedChildrenFlag()) {
                    switch (cacheMode) {
                    case DEFAULT:
                    case EVICT_LN:
                    case UNCHANGED:
                    case KEEP_HOT:
                        if (isUpperIN() && traceLRU) {
                            LoggerUtils.envLogMsg(
                                traceLevel, getEnv(),
                                "split-newSibling " +
                                Thread.currentThread().getId() + "-" +
                                Thread.currentThread().getName() +
                                "-" + getEnv().getName() +
                                " Adding UIN to LRU: " +
                                newSibling.getNodeId());
                        }
                        evictor.addBack(newSibling);
                        break;
                    case MAKE_COLD:
                    case EVICT_BIN:
                        if (isBIN()) {
                            evictor.addFront(newSibling);
                        } else {
                            if (traceLRU) {
                                LoggerUtils.envLogMsg(
                                    traceLevel, getEnv(),
                                    "split-newSibling-2 " +
                                    Thread.currentThread().getId() + "-" +
                                    Thread.currentThread().getName() +
                                    "-" + getEnv().getName() +
                                    " Adding UIN to LRU: " +
                                    newSibling.getNodeId());
                            }
                            evictor.addBack(newSibling);
                        }
                        break;
                    default:
                        throw unexpectedState
                            ("unknown cacheMode: " + cacheMode);
                    }
                }

                /*
                 * If this upper IN just lost its last cached child to the
                 * sibling, it becomes LRU-eligible itself.
                 */
                if (isUpperIN() &&
                    haveCachedChildren &&
                    !hasCachedChildrenFlag()) {
                    if (traceLRU) {
                        LoggerUtils.envLogMsg(
                            traceLevel, getEnv(),
                            "split-oldSibling " +
                            Thread.currentThread().getId() + "-" +
                            Thread.currentThread().getName() +
                            "-" + getEnv().getName() +
                            " Adding UIN to LRU: " + getNodeId());
                    }
                    evictor.addBack(this);
                }
            }

            /* Debug log this information. */
            traceSplit(Level.FINE, parent,
                       newSibling, parentLsn, myNewLsn,
                       newSiblingLsn, splitIndex, idKeyIndex, childIndex);
        } finally {
            newSibling.releaseLatch();
        }
    }

    /**
     * Update a slot that is being split. The slot to be updated here is the
     * one that existed before the split.
     *
     * @param child The new child to be placed under the slot. May be the
     * newly created sibling or the pre-existing sibling.
     * @param lsn The new lsn of the child (the child was logged just before
     * calling this method, so its slot lsn must be updated)
     * @param key The new key for the slot. We should not actually update the
     * slot key, because its value is the lower bound of the key range covered
     * by the slot, and this lower bound does not change as a result of the
     * split (the new slot created as a result of the split is placed to the
     * right of the pre-existing slot). There is however one exception: the
     * key can be updated if "idx" is the 0-slot. The 0-slot key is not a true
     * lower bound; the actual lower bound for the 0-slot is the key in the
     * parent slot for this IN. So, in this case, if the given key is less
     * than the current one, it is better to update the key in order to better
     * approximate the real lower bound (and thus make the isKeyInBounds()
     * method more effective).
3437 */ updateSplitSlot( int idx, IN child, long lsn, byte[] key)3438 private void updateSplitSlot( 3439 int idx, 3440 IN child, 3441 long lsn, 3442 byte[] key) { 3443 3444 assert(isUpperIN()); 3445 3446 long oldSize = getEntryInMemorySize(idx); 3447 3448 setLsn(idx, lsn); 3449 setTarget(idx, child); 3450 3451 if (idx == 0) { 3452 byte[] existingKey = getKey(idx); 3453 int s = Key.compareKeys(key, existingKey, getKeyComparator()); 3454 3455 boolean multiSlotChange = false; 3456 if (s < 0) { 3457 multiSlotChange = setKeyAndDirty(idx, key); 3458 } 3459 3460 if (multiSlotChange) { 3461 updateMemorySize(inMemorySize, computeMemorySize()); 3462 } else { 3463 long newSize = getEntryInMemorySize(idx); 3464 updateMemorySize(oldSize, newSize); 3465 } 3466 } else { 3467 long newSize = getEntryInMemorySize(idx); 3468 updateMemorySize(oldSize, newSize); 3469 } 3470 3471 setDirty(true); 3472 3473 assert(hasResidentChildren() == hasCachedChildrenFlag()); 3474 } 3475 3476 /** 3477 * Shift entries to the right by one position, starting with (and 3478 * including) the entry at index. Increment nEntries by 1. Called 3479 * in insertEntry1() 3480 * 3481 * @param index - The position to start shifting from. 3482 */ shiftEntriesRight(int index)3483 private void shiftEntriesRight(int index) { 3484 copyEntries(index, index + 1, nEntries - index); 3485 clearEntry(index); 3486 nEntries++; 3487 setDirty(true); 3488 } 3489 3490 /** 3491 * Shift entries starting at the byHowMuch'th element to the left, thus 3492 * removing the first byHowMuch'th elements of the entries array. This 3493 * always starts at the 0th entry. Caller is responsible for decrementing 3494 * nEntries. 3495 * 3496 * @param byHowMuch - The number of entries to remove from the left side 3497 * of the entries array. 
3498 */ shiftEntriesLeft(int byHowMuch)3499 private void shiftEntriesLeft(int byHowMuch) { 3500 copyEntries(byHowMuch, 0, nEntries - byHowMuch); 3501 for (int i = nEntries - byHowMuch; i < nEntries; i++) { 3502 clearEntry(i); 3503 } 3504 setDirty(true); 3505 } 3506 adjustCursors( IN newSibling, int newSiblingLow, int newSiblingHigh)3507 void adjustCursors( 3508 IN newSibling, 3509 int newSiblingLow, 3510 int newSiblingHigh) { 3511 /* Cursors never refer to IN's. */ 3512 } 3513 adjustCursorsForInsert(int insertIndex)3514 void adjustCursorsForInsert(int insertIndex) { 3515 /* Cursors never refer to IN's. */ 3516 } 3517 3518 /** 3519 * Called prior to changing a slot to contain a different logical node. 3520 * Necessary to support assertions for transient LSNs in shouldUpdateLsn. 3521 * Examples: LN slot reuse, and splits where a new node is placed in an 3522 * existing slot. 3523 */ prepareForSlotReuse(int idx)3524 public void prepareForSlotReuse(int idx) { 3525 3526 if (databaseImpl.isDeferredWriteMode()) { 3527 setLsn(idx, DbLsn.NULL_LSN, false/*check*/); 3528 } 3529 } 3530 3531 /* 3532 * Get the current memory consumption of this node 3533 */ getInMemorySize()3534 public long getInMemorySize() { 3535 return inMemorySize; 3536 } 3537 3538 /** 3539 * Compute the current memory consumption of this node, after putting 3540 * its keys in their compact representation, if possible. 3541 */ initMemorySize()3542 protected void initMemorySize() { 3543 entryKeyVals = entryKeyVals.compact(this); 3544 inMemorySize = computeMemorySize(); 3545 } 3546 3547 /** 3548 * Count up the memory usage attributable to this node alone. LNs children 3549 * are counted by their BIN parents, but INs are not counted by their 3550 * parents because they are resident on the IN list. The identifierKey is 3551 * "intentionally" not kept track of in the memory budget. 
3552 */ computeMemorySize()3553 public long computeMemorySize() { 3554 3555 long calcMemorySize = getFixedMemoryOverhead(); 3556 3557 calcMemorySize += MemoryBudget.byteArraySize(entryStates.length); 3558 3559 calcMemorySize += computeLsnOverhead(); 3560 3561 for (int i = 0; i < nEntries; i++) { 3562 calcMemorySize += getEntryInMemorySize(i); 3563 } 3564 3565 if (keyPrefix != null) { 3566 calcMemorySize += MemoryBudget.byteArraySize(keyPrefix.length); 3567 } 3568 3569 if (provisionalObsolete != null) { 3570 calcMemorySize += provisionalObsolete.getMemorySize(); 3571 } 3572 3573 calcMemorySize += entryTargets.calculateMemorySize(); 3574 calcMemorySize += entryKeyVals.calculateMemorySize(); 3575 3576 return calcMemorySize; 3577 } 3578 3579 /* 3580 * Overridden by subclasses. 3581 */ getFixedMemoryOverhead()3582 protected long getFixedMemoryOverhead() { 3583 return MemoryBudget.IN_FIXED_OVERHEAD; 3584 } 3585 3586 /* 3587 * Compute the memory consumption for storing this node's LSNs 3588 */ computeLsnOverhead()3589 private int computeLsnOverhead() { 3590 return (entryLsnLongArray == null) ? 3591 MemoryBudget.byteArraySize(entryLsnByteArray.length) : 3592 MemoryBudget.ARRAY_OVERHEAD + 3593 (entryLsnLongArray.length * 3594 MemoryBudget.PRIMITIVE_LONG_ARRAY_ITEM_OVERHEAD); 3595 } 3596 getEntryInMemorySize(int idx)3597 private long getEntryInMemorySize(int idx) { 3598 3599 /* 3600 * Do not count state size here, since it is counted as overhead 3601 * during initialization. 3602 */ 3603 long ret = 0; 3604 3605 /* 3606 * Don't count the key size if the representation has already 3607 * accounted for it. 3608 */ 3609 if (!entryKeyVals.accountsForKeyByteMemUsage()) { 3610 3611 /* 3612 * Materialize the key object only if needed, thus avoiding the 3613 * object allocation cost when possible. 
3614 */ 3615 final byte[] key = entryKeyVals.get(idx); 3616 if (key != null) { 3617 ret += MemoryBudget.byteArraySize(key.length); 3618 } 3619 } 3620 3621 final Node target = entryTargets.get(idx); 3622 if (target != null) { 3623 ret += target.getMemorySizeIncludedByParent(); 3624 } 3625 return ret; 3626 } 3627 3628 /** 3629 * Compacts the representation of the BIN, currently just the 3630 * representation used by entryTargets if possible. Typically invoked after 3631 * a BIN has been stripped. 3632 * 3633 * @return number of bytes reclaimed. 3634 */ compactMemory()3635 public long compactMemory() { 3636 3637 final long oldSize = inMemorySize; 3638 final INKeyRep oldKeyRep = entryKeyVals; 3639 3640 entryTargets = entryTargets.compact(this); 3641 entryKeyVals = entryKeyVals.compact(this); 3642 3643 /* 3644 * Note that we only need to account for mem usage changes in the key 3645 * rep here, not the target rep. The target rep, unlike the key rep, 3646 * updates its mem usage internally, and the responsibility for mem 3647 * usage of contained nodes is fixed -- it is always managed by the IN. 3648 * 3649 * When the key rep changes, the accountsForKeyByteMemUsage property 3650 * also changes. Recalc the size of the entire IN, because 3651 * responsibility for managing contained key byte mem usage has shifted 3652 * between the key rep and the IN parent. 3653 */ 3654 if (entryKeyVals != oldKeyRep) { 3655 updateMemorySize(inMemorySize, computeMemorySize()); 3656 return oldSize - inMemorySize; 3657 } 3658 3659 return oldSize - inMemorySize; 3660 } 3661 3662 /** 3663 * Returns the amount of memory currently budgeted for this IN. 3664 */ getBudgetedMemorySize()3665 public long getBudgetedMemorySize() { 3666 return inMemorySize - accumulatedDelta; 3667 } 3668 3669 /** 3670 * Called as part of a memory budget reset (during a checkpoint) to clear 3671 * the accumulated delta and return the total memory size. 
3672 */ resetAndGetMemorySize()3673 public long resetAndGetMemorySize() { 3674 accumulatedDelta = 0; 3675 return inMemorySize; 3676 } 3677 updateMemorySize(long oldSize, long newSize)3678 protected void updateMemorySize(long oldSize, long newSize) { 3679 long delta = newSize - oldSize; 3680 updateMemorySize(delta); 3681 } 3682 3683 /* 3684 * Called when a cached child is replaced by another cached child. 3685 */ updateMemorySize(Node oldNode, Node newNode)3686 void updateMemorySize(Node oldNode, Node newNode) { 3687 long delta = 0; 3688 if (newNode != null) { 3689 delta = newNode.getMemorySizeIncludedByParent(); 3690 } 3691 3692 if (oldNode != null) { 3693 delta -= oldNode.getMemorySizeIncludedByParent(); 3694 } 3695 updateMemorySize(delta); 3696 } 3697 3698 /* 3699 * Change this.onMemorySize by the given delta and update the memory 3700 * budget for the cache, but only if the accummulated delta for this 3701 * node exceeds the ACCUMULATED_LIMIT threshold and this IN is actually 3702 * on the IN list. (For example, when we create new INs, they are 3703 * manipulated off the IN list before being added; if we updated the 3704 * environment wide cache then, we'd end up double counting.) 3705 */ updateMemorySize(long delta)3706 void updateMemorySize(long delta) { 3707 3708 if (delta == 0) { 3709 return; 3710 } 3711 3712 inMemorySize += delta; 3713 3714 if (inListResident) { 3715 3716 /* 3717 * This assertion is disabled if the environment is invalid to 3718 * avoid spurious assertions during testing of IO errors. If the 3719 * environment is invalid, memory budgeting errors are irrelevant. 
3720 * [#21929] 3721 */ 3722 assert 3723 inMemorySize >= getFixedMemoryOverhead() || 3724 !getEnv().isValid(): 3725 "delta: " + delta + " inMemorySize: " + inMemorySize + 3726 " overhead: " + getFixedMemoryOverhead() + 3727 " computed: " + computeMemorySize() + 3728 " dump: " + toString() + assertPrintMemorySize(); 3729 3730 accumulatedDelta += delta; 3731 if (accumulatedDelta > ACCUMULATED_LIMIT || 3732 accumulatedDelta < -ACCUMULATED_LIMIT) { 3733 updateMemoryBudget(); 3734 } 3735 } 3736 } 3737 3738 /** 3739 * Move the accumulated delta to the memory budget. 3740 */ updateMemoryBudget()3741 public void updateMemoryBudget() { 3742 final EnvironmentImpl env = getEnv(); 3743 env.getInMemoryINs().memRecalcUpdate(this, accumulatedDelta); 3744 env.getMemoryBudget().updateTreeMemoryUsage(accumulatedDelta); 3745 accumulatedDelta = 0; 3746 } 3747 3748 /** 3749 * Returns the treeAdmin memory in objects referenced by this IN. 3750 * Specifically, this refers to the DbFileSummaryMap held by 3751 * MapLNs 3752 */ getTreeAdminMemorySize()3753 public long getTreeAdminMemorySize() { 3754 return 0; // by default, no treeAdminMemory 3755 } 3756 3757 /* 3758 * Utility method used during unit testing. 3759 */ printMemorySize()3760 protected long printMemorySize() { 3761 3762 final long inOverhead = getFixedMemoryOverhead(); 3763 3764 final long statesOverhead = 3765 MemoryBudget.byteArraySize(entryStates.length); 3766 3767 final int lsnOverhead = computeLsnOverhead(); 3768 3769 int entryOverhead = 0; 3770 for (int i = 0; i < nEntries; i++) { 3771 entryOverhead += getEntryInMemorySize(i); 3772 } 3773 3774 final int keyPrefixOverhead = (keyPrefix != null) ? 3775 MemoryBudget.byteArraySize(keyPrefix.length) : 0; 3776 3777 final int provisionalOverhead = (provisionalObsolete != null) ? 
3778 provisionalObsolete.getMemorySize() : 0; 3779 3780 final long targetRepOverhead = entryTargets.calculateMemorySize(); 3781 final long keyRepOverhead = entryKeyVals.calculateMemorySize(); 3782 final long total = inOverhead + statesOverhead + lsnOverhead + 3783 entryOverhead + keyPrefixOverhead + provisionalOverhead + 3784 targetRepOverhead + keyRepOverhead; 3785 3786 System.out.println(" nEntries:" + nEntries + 3787 "/" + entryStates.length + 3788 " in: " + inOverhead + 3789 " states: " + statesOverhead + 3790 " entry: " + entryOverhead + 3791 " lsn: " + lsnOverhead + 3792 " keyPrefix: " + keyPrefixOverhead + 3793 " provisional: " + provisionalOverhead + 3794 " targetRep(" + entryTargets.getType() + "): " + 3795 targetRepOverhead + 3796 " keyRep(" + entryKeyVals.getType() +"): " + 3797 keyRepOverhead + 3798 " Total: " + total + 3799 " inMemorySize: " + inMemorySize); 3800 return total; 3801 } 3802 3803 /* Utility method used to print memory size in an assertion. */ assertPrintMemorySize()3804 private boolean assertPrintMemorySize() { 3805 printMemorySize(); 3806 return true; 3807 } 3808 verifyMemorySize()3809 public boolean verifyMemorySize() { 3810 3811 long calcMemorySize = computeMemorySize(); 3812 if (calcMemorySize != inMemorySize) { 3813 3814 String msg = "-Warning: Out of sync. Should be " + 3815 calcMemorySize + " / actual: " + inMemorySize + 3816 " node: " + getNodeId(); 3817 LoggerUtils.envLogMsg(Level.INFO, getEnv(), msg); 3818 3819 System.out.println(msg); 3820 printMemorySize(); 3821 3822 return false; 3823 } 3824 return true; 3825 } 3826 3827 /** 3828 * Adds (increments) or removes (decrements) the cache stats for the key 3829 * and target representations. Used when rep objects are being replaced 3830 * with a new instance, rather than by calling their mutator methods. 3831 * Specifically, it is called when mutating from full bin to bin delta 3832 * or vice-versa. 
3833 */ updateRepCacheStats(boolean increment)3834 protected void updateRepCacheStats(boolean increment) { 3835 assert(isBIN()); 3836 entryKeyVals.updateCacheStats(increment, this); 3837 entryTargets.updateCacheStats(increment, this); 3838 } 3839 getCompactMaxKeyLength()3840 protected int getCompactMaxKeyLength() { 3841 return getEnv().getCompactMaxKeyLength(); 3842 } 3843 3844 /** 3845 * Called when adding/removing this IN to/from the INList. 3846 */ setInListResident(boolean resident)3847 public void setInListResident(boolean resident) { 3848 3849 if (!resident) { 3850 /* Decrement the stats before clearing its residency */ 3851 entryTargets.updateCacheStats(resident, this); 3852 entryKeyVals.updateCacheStats(resident, this); 3853 } 3854 3855 inListResident = resident; 3856 3857 if (resident) { 3858 /* Increment the stats after setting its residency. */ 3859 entryTargets.updateCacheStats(resident, this); 3860 entryKeyVals.updateCacheStats(resident, this); 3861 } 3862 } 3863 3864 /** 3865 * Returns whether this IN is on the INList. 3866 */ getInListResident()3867 public boolean getInListResident() { 3868 return inListResident; 3869 } 3870 getPrevLRUNode()3871 public IN getPrevLRUNode() { 3872 return prevLRUNode; 3873 } 3874 setPrevLRUNode(IN node)3875 public void setPrevLRUNode(IN node) { 3876 prevLRUNode = node; 3877 } 3878 getNextLRUNode()3879 public IN getNextLRUNode() { 3880 return nextLRUNode; 3881 } 3882 setNextLRUNode(IN node)3883 public void setNextLRUNode(IN node) { 3884 nextLRUNode = node; 3885 } 3886 3887 /** 3888 * Try to compact or otherwise reclaim memory in this IN and return the 3889 * number of bytes reclaimed. For example, a BIN should evict LNs, if 3890 * possible. 3891 * 3892 * Used by the evictor to reclaim memory by some means short of evicting 3893 * the entire node. If a positive value is returned, the evictor will 3894 * postpone full eviction of this node. 
3895 */ partialEviction()3896 public long partialEviction() { 3897 return 0; 3898 } 3899 3900 /** 3901 * Returns whether any child is non-null. Is final to indicate it is not 3902 * overridden (unlike hasPinnedChildren, isEvictionProhibited, etc). 3903 * 3904 * Note that the IN may or may not be latched when this method is called. 3905 * Returning the wrong answer is OK in that case (it will be called again 3906 * later when latched), but an exception should not occur. 3907 */ hasResidentChildren()3908 public final boolean hasResidentChildren() { 3909 3910 for (int i = 0; i < getNEntries(); i++) { 3911 if (isResident(i)) { 3912 return true; 3913 } 3914 } 3915 3916 return false; 3917 } 3918 3919 /** 3920 * Do nothing since INs don't support deltas. 3921 */ setProhibitNextDelta()3922 public void setProhibitNextDelta() { 3923 } 3924 3925 /* 3926 * Validate the subtree that we're about to delete. Make sure there aren't 3927 * more than one valid entry on each IN and that the last level of the tree 3928 * is empty. Also check that there are no cursors on any bins in this 3929 * subtree. Assumes caller is holding the latch on this parent node. 3930 * 3931 * While we could latch couple down the tree, rather than hold latches as 3932 * we descend, we are presumably about to delete this subtree so 3933 * concurrency shouldn't be an issue. 3934 * 3935 * @return true if the subtree rooted at the entry specified by "index" is 3936 * ok to delete. 3937 * 3938 * Overriden by BIN class. 3939 */ validateSubtreeBeforeDelete(int index)3940 boolean validateSubtreeBeforeDelete(int index) 3941 throws DatabaseException { 3942 3943 if (index >= nEntries) { 3944 3945 /* 3946 * There's no entry here, so of course this entry is deletable. 
3947 */ 3948 return true; 3949 } else { 3950 IN child = fetchIN(index); 3951 3952 boolean needToLatch = !child.isLatchExclusiveOwner(); 3953 3954 try { 3955 if (needToLatch) { 3956 child.latch(CacheMode.UNCHANGED); 3957 } 3958 return child.isValidForDelete(); 3959 } finally { 3960 if (needToLatch && isLatchOwner()) { 3961 child.releaseLatch(); 3962 } 3963 } 3964 } 3965 } 3966 3967 /** 3968 * Check if this node fits the qualifications for being part of a deletable 3969 * subtree. It can only have one IN child and no LN children. 3970 * 3971 * Note: the method is overwritten by BIN and LN. 3972 * BIN.isValidForDelete() will not fetch any child LNs. 3973 * LN.isValidForDelete() simply returns false. 3974 * 3975 * We assume that this is only called under an assert. 3976 */ 3977 @Override isValidForDelete()3978 boolean isValidForDelete() 3979 throws DatabaseException { 3980 3981 assert(!isBINDelta()); 3982 3983 /* 3984 * Can only have one valid child, and that child should be 3985 * deletable. 3986 */ 3987 if (nEntries > 1) { // more than 1 entry. 3988 return false; 3989 3990 } else if (nEntries == 1) { // 1 entry, check child 3991 IN child = fetchIN(0); 3992 child.latch(CacheMode.UNCHANGED); 3993 3994 boolean ret = false; 3995 try { 3996 if (child.isBINDelta()) { 3997 return false; 3998 } 3999 4000 ret = child.isValidForDelete(); 4001 4002 } finally { 4003 child.releaseLatch(); 4004 } 4005 return ret; 4006 } else { // 0 entries. 4007 return true; 4008 } 4009 } 4010 4011 /** 4012 * Check that the IN is in a valid state. For now, validity means that the 4013 * keys are in sorted order and that there are more than 0 entries. 4014 * maxKey, if non-null specifies that all keys in this node must be less 4015 * than maxKey. 4016 * @throws EnvironmentFailureException when implemented. 
     */
    @Override
    public final void verify(byte[] maxKey)
        throws EnvironmentFailureException {

        /********* never used, but may be used for the basis of a verify()
          method in the future.
        try {
            Comparator<byte[]> userCompareToFcn =
                (databaseImpl == null ? null : getKeyComparator());

            byte[] key1 = null;
            for (int i = 1; i < nEntries; i++) {
                key1 = entryKeyVals.get(i);
                byte[] key2 = entryKeyVals.get(i - 1);

                int s = Key.compareKeys(key1, key2, userCompareToFcn);
                if (s <= 0) {
                    throw EnvironmentFailureException.unexpectedState
                        ("IN " + getNodeId() + " key " + (i-1) +
                         " (" + Key.dumpString(key2, 0) +
                         ") and " +
                         i + " (" + Key.dumpString(key1, 0) +
                         ") are out of order");
                }
            }

            boolean inconsistent = false;
            if (maxKey != null && key1 != null) {
                if (Key.compareKeys(key1, maxKey, userCompareToFcn) >= 0) {
                    inconsistent = true;
                }
            }

            if (inconsistent) {
                throw EnvironmentFailureException.unexpectedState
                    ("IN " + getNodeId() +
                     " has entry larger than next entry in parent.");
            }
        } catch (DatabaseException DE) {
            DE.printStackTrace(System.out);
        }
        *****************/
    }

    /**
     * Add self and children to this in-memory IN list. Called by recovery, can
     * run with no latching.
     */
    @Override
    final void rebuildINList(INList inList)
        throws DatabaseException {

        /*
         * Recompute your in memory size first and then add yourself to the
         * list.
         */
        initMemorySize();

        inList.add(this);

        /* Tracks whether any child was found resident in the cache. */
        boolean hasCachedChildren = false;

        /*
         * Add your children if they're resident. (LNs know how to stop the
         * flow).
         */
        for (int i = 0; i < nEntries; i++) {
            Node n = getTarget(i);
            if (n != null) {
                n.rebuildINList(inList);
                hasCachedChildren = true;
            }
        }

        final Evictor evictor = getEvictor();

        if (isUpperIN()) {
            if (hasCachedChildren) {
                setHasCachedChildrenFlag(true);
            } else {
                setHasCachedChildrenFlag(false);
                /*
                 * A UIN with no resident children is evictable, so it goes
                 * onto the LRU. DINs are excluded here (dup-format nodes).
                 */
                if (evictor != null && !isDIN()) {
                    if (traceLRU) {
                        LoggerUtils.envLogMsg(
                            traceLevel, getEnv(),
                            "rebuildINList " +
                            Thread.currentThread().getId() +
                            "-" +
                            Thread.currentThread().getName() +
                            "-" + getEnv().getName() +
                            " Adding UIN to LRU: " +
                            getNodeId());
                    }
                    evictor.addBack(this);
                }
            }
        } else if (isBIN() && !isDBIN()) {
            /* BINs (except dup-format DBINs) always go onto the LRU. */
            if (evictor != null) {
                evictor.addBack(this);
            }
        }
    }

    /**
     * For a regular (not deferred-write) DB, account for a deleted subtree.
     * <p>
     * Count removed nodes as obsolete in the local tracker. The local tracker
     * will be flushed by the compressor.
     * <p>
     * TODO: Note that we neglect to transfer any accumulated provisional
     * obsolete info from the deleted INs to the tracker. See
     * accountForDeferredWriteSubtreeRemoval for the case where this comes up.
     * Ideally, in a future release we will record all obsolete info for
     * regular as well as deferred-write DBs, including any accumulated
     * provisional info, from the deleted INs to the subtree parent, as we do
     * in accountForDeferredWriteSubtreeRemoval, and then rely on the logging
     * of the subtree parent (for a regular DB) to flush that info. This was
     * not done as part of SR [#21348] because logging the subtree parent does
     * not flush the utilization info all the way to the log, while the
     * compressor does flush the local tracker info all the way to the log.
     * With further thought we may decide this change in behavior is
     * acceptable, but it was too risky for a patch release. See the SR
     * [#21348] for more details.
     */
    @Override
    final void accountForSubtreeRemoval(
        INList inList,
        LocalUtilizationTracker localTracker)
        throws DatabaseException {

        assert(!isBINDelta(false/*checkLatched*/));

        /* A deletable subtree node can have at most one entry. */
        if (nEntries > 1) {
            throw unexpectedState(
                "Found non-deletable IN " + getNodeId() +
                " while flushing INList. nEntries = " + nEntries);
        }

        /* Count full and delta versions as obsolete. */
        if (getLastFullVersion() != DbLsn.NULL_LSN) {
            localTracker.countObsoleteNode(
                getLastFullVersion(), getLogType(), 0, databaseImpl);
        }
        if (getLastDeltaVersion() != DbLsn.NULL_LSN) {
            localTracker.countObsoleteNode(
                getLastDeltaVersion(), getLogType(), 0, databaseImpl);
        }

        /*
         * Remove your child, if any. It should normally be resident.
         * NOTE(review): getTarget(0) is dereferenced without a null check
         * here, unlike the deferred-write variant which asserts non-null --
         * presumably residency is guaranteed by the caller; confirm.
         */
        if (nEntries > 0) {
            if (isBIN()) {
                assert(isEntryKnownDeleted(0));
            } else {
                Node n = getTarget(0);
                n.accountForSubtreeRemoval(inList, localTracker);
            }
        }
    }

    /**
     * For a deferred-write DB, account for a deleted subtree. [#21348]
     * <p>
     * Count removed nodes as provisionally obsolete, recording this info in
     * the parent of the subtree. When the root IN is logged non-provisionally
     * by a checkpoint or Database.sync, the provisional obsolete info will be
     * flushed to the log.
     * <p>
     * When removed nodes are counted obsolete, also transfer their accumulated
     * provisional obsolete to the subtree parent. This accounts for the case
     * where a subtree is removed in the middle of a checkpoint.
4188 * <p> 4189 * For example, say a two level subtree is removed, INa-BINb, and the 4190 * grandparent, INc, is the subtree parent. Say a checkpoint logged BINb 4191 * provisionally, but did not log INa (or INc). The provisional obsolete 4192 * info for BINb will be present in INa. In this method, we transfer that 4193 * info to INc, so it will be flushed when INc is logged non-provisionally. 4194 */ 4195 @Override accountForDeferredWriteSubtreeRemoval( INList inList, IN subtreeParent)4196 final void accountForDeferredWriteSubtreeRemoval( 4197 INList inList, 4198 IN subtreeParent) 4199 throws DatabaseException { 4200 4201 assert(!isBINDelta(false)); 4202 4203 if (nEntries > 1) { 4204 throw unexpectedState( 4205 "Found non-deletable IN " + getNodeId() + 4206 " while flushing INList. nEntries = " + nEntries); 4207 } 4208 4209 final Evictor evictor = getEvictor(); 4210 4211 /* Count full and delta versions as obsolete. */ 4212 if (getLastFullVersion() != DbLsn.NULL_LSN) { 4213 subtreeParent.trackProvisionalObsolete 4214 (this, getLastFullVersion(), false /*isObsoleteLN*/, 0); 4215 } 4216 if (getLastDeltaVersion() != DbLsn.NULL_LSN) { 4217 subtreeParent.trackProvisionalObsolete 4218 (this, getLastDeltaVersion(), false /*isObsoleteLN*/, 0); 4219 } 4220 4221 /* Remove your child, if any. It should normally be resident. */ 4222 if (nEntries > 0) { 4223 if (isBIN()) { 4224 assert(isEntryKnownDeleted(0)); 4225 } else { 4226 Node n = getTarget(0); 4227 assert(n != null && !n.isBINDelta(false)); 4228 n.accountForDeferredWriteSubtreeRemoval(inList, subtreeParent); 4229 } 4230 } 4231 } 4232 4233 /* 4234 * DbStat support. 4235 */ accumulateStats(TreeWalkerStatsAccumulator acc)4236 void accumulateStats(TreeWalkerStatsAccumulator acc) { 4237 acc.processIN(this, Long.valueOf(getNodeId()), getLevel()); 4238 } 4239 4240 /** 4241 * Sets the last logged LSN, which for a BIN may be a delta. 4242 * 4243 * In this class, the last logged version and last full version are the 4244 * same. 
In the BIN class, this method is overridden since they may be 4245 * different. 4246 */ setLastLoggedLsn(long lsn)4247 void setLastLoggedLsn(long lsn) { 4248 setLastFullLsn(lsn); 4249 } 4250 4251 /** 4252 * Returns the last logged LSN, or NULL_LSN if never logged. 4253 * 4254 * In this class, the last logged version and last full version are the 4255 * same. In the BIN class, this method is overridden since they may be 4256 * different. 4257 */ getLastLoggedVersion()4258 public long getLastLoggedVersion() { 4259 return getLastFullVersion(); 4260 } 4261 4262 /** 4263 * Sets the last full version LSN. 4264 */ setLastFullLsn(long lsn)4265 public final void setLastFullLsn(long lsn) { 4266 lastFullVersion = lsn; 4267 } 4268 4269 /** 4270 * Returns the last full version LSN, or NULL_LSN if never logged. 4271 */ getLastFullVersion()4272 public final long getLastFullVersion() { 4273 return lastFullVersion; 4274 } 4275 4276 /** 4277 * Returns the last delta version LSN, or NULL_LSN if a delta was not last 4278 * logged. Public for unit testing. 4279 */ getLastDeltaVersion()4280 public long getLastDeltaVersion() { 4281 return DbLsn.NULL_LSN; 4282 } 4283 4284 /* 4285 * Logging support 4286 */ 4287 4288 /** 4289 * When splits and checkpoints intermingle in a deferred write databases, 4290 * a checkpoint target may appear which has a valid target but a null LSN. 4291 * Deferred write dbs are written out in checkpoint style by either 4292 * Database.sync() or a checkpoint which has cleaned a file containing 4293 * deferred write entries. 
     * For example,
     *  INa
     *   |
     *  BINb
     *
     * A checkpoint or Database.sync starts
     * The INList is traversed, dirty nodes are selected
     * BINb is bypassed on the INList, since it's not dirty
     * BINb is split, creating a new sibling, BINc, and dirtying INa
     * INa is selected as a dirty node for the ckpt
     *
     * If this happens, INa is in the selected dirty set, but not its dirty
     * child BINb and new child BINc.
     *
     * In a durable db, the existence of BINb and BINc are logged
     * anyway. But in a deferred write db, there is an entry that points to
     * BINc, but no logged version.
     *
     * This will not cause problems with eviction, because INa can't be
     * evicted until BINb and BINc are logged, are non-dirty, and are detached.
     * But it can cause problems at recovery, because INa will have a null LSN
     * for a valid entry, and the LN children of BINc will not find a home.
     * To prevent this, search for all dirty children that might have been
     * missed during the selection phase, and write them out. It's not
     * sufficient to write only null-LSN children, because the existing sibling
     * must be logged lest LN children recover twice (once in the new sibling,
     * once in the old existing sibling.
     *
     * Overridden by BIN class.
     */
    public void logDirtyChildren()
        throws DatabaseException {

        assert(!isBINDelta());

        EnvironmentImpl envImpl = getDatabase().getEnv();

        /* Look for targets that are dirty. */
        for (int i = 0; i < getNEntries(); i++) {

            IN child = (IN) getTarget(i);
            if (child != null) {
                child.latch(CacheMode.UNCHANGED);
                try {
                    if (child.getDirty()) {
                        /* Ask descendents to log their children. */
                        child.logDirtyChildren();
                        /* Log the child provisionally and reparent its LSN. */
                        long childLsn =
                            child.log(envImpl.getLogManager(),
                                      false, // allowDeltas
                                      false, // allowCompress
                                      true,  // isProvisional
                                      true,  // backgroundIO
                                      this); // parent
                        updateEntry(i, childLsn, 0 /*lastLoggedSize*/);
                    }
                } finally {
                    child.releaseLatch();
                }
            }
        }
    }

    /**
     * Log this IN and clear the dirty flag.
     */
    public final long log(LogManager logManager)
        throws DatabaseException {

        return logInternal(logManager,
                           false, // allowDeltas
                           false, // allowCompress
                           Provisional.NO,
                           false, // backgroundIO
                           null); // parent
    }

    /**
     * Log this node with all available options.
     */
    public final long log(
        LogManager logManager,
        boolean allowDeltas,
        boolean allowCompress,
        boolean isProvisional,
        boolean backgroundIO,
        IN parent) // for provisional
        throws DatabaseException {

        return logInternal(logManager,
                           allowDeltas,
                           allowCompress,
                           isProvisional ? Provisional.YES : Provisional.NO,
                           backgroundIO,
                           parent);
    }

    /**
     * Log this node, specifying the exact Provisional mode rather than a
     * boolean. See logInternal for parameter semantics.
     */
    public final long log(
        LogManager logManager,
        boolean allowDeltas,
        boolean allowCompress,
        Provisional provisional,
        boolean backgroundIO,
        IN parent) // for provisional
        throws DatabaseException {

        return logInternal(logManager,
                           allowDeltas,
                           allowCompress,
                           provisional,
                           backgroundIO,
                           parent);
    }

    /**
     * Log this IN and clear the dirty flag.
     */
    public final long optionalLog(LogManager logManager)
        throws DatabaseException {

        /*
         * In deferred-write mode nothing is written now; the last logged
         * version stands in for the would-be new LSN.
         */
        if (databaseImpl.isDeferredWriteMode()) {
            return getLastLoggedVersion();
        } else {
            return logInternal(logManager,
                               false, // allowDeltas
                               false, // allowCompress
                               Provisional.NO,
                               false, // backgroundIO
                               null); // parent
        }
    }

    /**
     * Log this node provisionally and clear the dirty flag.
     * @return LSN of the new log entry
     */
    public final long optionalLogProvisional(LogManager logManager, IN parent)
        throws DatabaseException {

        /* As in optionalLog, deferred-write DBs skip the actual write. */
        if (databaseImpl.isDeferredWriteMode()) {
            return getLastLoggedVersion();
        } else {
            return logInternal(logManager,
                               false, // allowDeltas
                               false, // allowCompress
                               Provisional.YES,
                               false, // backgroundIO
                               parent);
        }
    }

    /**
     * Bottleneck method for all single-IN logging. Multi-IN logging uses
     * beforeLog and afterLog instead.
     */
    private long logInternal(
        LogManager logManager,
        boolean allowDeltas,
        boolean allowCompress,
        Provisional provisional,
        boolean backgroundIO,
        IN parent)
        throws DatabaseException {

        /* INs are never replicated; only LNs carry replication context. */
        INLogItem item = new INLogItem();
        item.provisional = provisional;
        item.parent = parent;
        item.repContext = ReplicationContext.NO_REPLICATE;

        INLogContext context = new INLogContext();
        context.nodeDb = getDatabase();
        context.backgroundIO = backgroundIO;
        context.allowDeltas = allowDeltas;
        context.allowCompress = allowCompress;

        /* The before/log/after triple mirrors the multi-IN logging path. */
        beforeLog(logManager, item, context);

        logManager.log(item, context);

        afterLog(logManager, item, context);

        return item.newLsn;
    }

    /**
     * Pre-log processing. Used implicitly for single-item logging and
     * explicitly for multi-item logging. Overridden by subclasses as needed.
     *
     * Decide how to log this node. INs are always logged in full. Cleaner LN
     * migration is never performed since it only applies to BINs.
     *
     * Overridden by BIN class.
     */
    public void beforeLog(
        LogManager logManager,
        INLogItem item,
        INLogContext context) {

        beforeLogCommon(item, context, lastFullVersion, DbLsn.NULL_LSN);
        item.entry = new INLogEntry<IN>(this);
    }

    /**
     * Pre-log processing shared by IN and BIN classes.
     */
    final void beforeLogCommon(
        INLogItem item,
        INLogContext context,
        long oldLsn,
        long auxOldLsn) {

        /* Both this node and (if present) its parent must be EX-latched. */
        assert isLatchExclusiveOwner();
        assert item.parent == null || item.parent.isLatchExclusiveOwner();

        /* Count obsolete info when logging non-provisionally. */
        if (countObsoleteDuringLogging(item.provisional)) {
            item.oldLsn = oldLsn;
            item.auxOldLsn = auxOldLsn;
            context.packedObsoleteInfo = provisionalObsolete;
        }
    }

    /**
     * Post-log processing. Used implicitly for single-item logging and
     * explicitly for multi-item logging. Overridden by subclasses as needed.
     *
     * The last version of this node must be counted obsolete at the correct
     * time. If logging non-provisionally, the last version of this node and
     * any provisionally logged descendants are immediately obsolete and can be
     * flushed. If logging provisionally, the last version isn't obsolete until
     * an ancestor is logged non-provisionally, so propagate obsolete lsns
     * upwards.
     *
     * Overridden by BIN class.
     */
    public void afterLog(
        LogManager logManager,
        INLogItem item,
        INLogContext context) {

        afterLogCommon(logManager, item, context, lastFullVersion,
                       DbLsn.NULL_LSN);

        setLastFullLsn(item.newLsn);
    }

    /**
     * Post-log processing shared by IN and BIN classes.
     */
    final void afterLogCommon(
        LogManager logManager,
        INLogItem item,
        INLogContext context,
        long oldLsn,
        long auxOldLsn) {

        if (countObsoleteDuringLogging(item.provisional)) {
            /* Non-provisional: accumulated info was flushed; discard it. */
            discardProvisionalObsolete(logManager);
        } else {
            /* Provisional: push the prior-version LSNs up to the parent. */
            if (item.parent != null) {
                if (oldLsn != DbLsn.NULL_LSN) {
                    item.parent.trackProvisionalObsolete
                        (this, oldLsn, false /*isObsoleteLN*/, 0);
                }
                if (auxOldLsn != DbLsn.NULL_LSN) {
                    item.parent.trackProvisionalObsolete
                        (this, auxOldLsn, false /*isObsoleteLN*/, 0);
                }
            }
        }

        setDirty(false);

        final Evictor evictor = getEvictor();
        if (evictor != null) {
            /*
             * To capture all cases where a node needs to be moved to the
             * mixed LRUSet after being cleaned, we invoke moveToMixedLRU()
             * from IN.afterLog(). This includes the case where the node is
             * being logged as part of being evicted, in which case we don't
             * really want it to go back to the LRU. However, this is ok
             * because moveToMixedLRU() checks whether the node is actually
             * in the dirty LRUSet before moving it to the mixed LRUSet.
             */

            if (traceLRU && isUpperIN()) {
                LoggerUtils.envLogMsg(
                    traceLevel, getEnv(),
                    Thread.currentThread().getId() + "-" +
                    Thread.currentThread().getName() +
                    "-" + getEnv().getName() +
                    " afterLogCommon(): " +
                    " Moving UIN to mixed LRU: " + getNodeId());
            }
            evictor.moveToMixedLRU(this);
        }

        /*
         * When logging an IN, its parent IN slot should not have KD/PD flags
         * set, since these flags are applicable only to child LNs. However,
         * in the past these flags were sometimes set incorrectly. Clear the
         * flags here to guard against possible problems.
         */
        if (item.parent != null && item.parentIndex >= 0) {
            item.parent.clearKnownDeleted(item.parentIndex);
            item.parent.clearPendingDeleted(item.parentIndex);
        }
    }

    /**
     * Returns whether to count the prior version of an IN (as well as
     * accumulated provisionally obsolete LSNs for child nodes) obsolete when
     * logging the new version.
     *
     * True is returned if we are logging the IN non-provisionally, since the
     * non-provisional version durably replaces the prior version and causes
     * all provisional children to also become durable.
     *
     * True is also returned if the database is temporary. Since we never use a
     * temporary DB past recovery, prior versions of an IN are never used.
     * [#16928]
     */
    private boolean countObsoleteDuringLogging(Provisional provisional) {
        return provisional != Provisional.YES ||
               databaseImpl.isTemporary();
    }

    /**
     * Adds the given obsolete LSN and any tracked obsolete LSNs for the given
     * child IN to this IN's tracking list. This method is called to track
     * obsolete LSNs when a child IN or LN is logged provisionally. Such LSNs
     * cannot be considered obsolete until an ancestor IN is logged
     * non-provisionally.
     */
    final void trackProvisionalObsolete(
        IN childIN,
        long obsoleteLsn,
        boolean isObsoleteLN,
        int obsoleteSize) {

        /* Remember the old footprint so the memory budget can be adjusted. */
        int oldMemSize = (provisionalObsolete != null) ?
                          provisionalObsolete.getMemorySize() :
                          0;

        if (childIN != null && childIN.provisionalObsolete != null) {
            if (provisionalObsolete != null) {
                /* Append child info to parent info. */
                provisionalObsolete.copyObsoleteInfo
                    (childIN.provisionalObsolete);
            } else {
                /* Move reference from child to parent. */
                provisionalObsolete = childIN.provisionalObsolete;
            }
            /* The child no longer owns its info; release its memory. */
            childIN.updateMemorySize(
                0 - childIN.provisionalObsolete.getMemorySize());
            childIN.provisionalObsolete = null;
        }

        if (obsoleteLsn != DbLsn.NULL_LSN) {
            if (provisionalObsolete == null) {
                provisionalObsolete = new PackedObsoleteInfo();
            }
            provisionalObsolete.addObsoleteInfo(obsoleteLsn, isObsoleteLN,
                                                obsoleteSize);
        }

        updateMemorySize(oldMemSize,
                         (provisionalObsolete != null) ?
                         provisionalObsolete.getMemorySize() :
                         0);
    }

    /**
     * Discards the provisional obsolete tracking information in this node
     * after it has been counted in the live tracker. This method is called
     * after this node is logged non-provisionally.
     *
     * NOTE(review): the logManager parameter is unused here -- presumably
     * kept for signature symmetry with related methods; confirm before
     * removing.
     */
    private void discardProvisionalObsolete(LogManager logManager)
        throws DatabaseException {

        if (provisionalObsolete != null) {
            updateMemorySize(0 - provisionalObsolete.getMemorySize());
            provisionalObsolete = null;
        }
    }

    /*
     * NOOP for upper INs. Overridden by BIN class.
     */
    public void mutateToFullBIN() {
    }

    /* Returns the slot count to be written: all, or only dirty for deltas. */
    private int getNEntriesToWrite(boolean deltasOnly) {
        if (!deltasOnly) {
            return nEntries;
        }
        return getNDeltas();
    }

    /* Returns the number of dirty slots (the delta entry count). */
    public final int getNDeltas() {
        int n = 0;
        for (int i = 0; i < nEntries; i++) {
            if (!isDirty(i)) {
                continue;
            }
            n += 1;
        }
        return n;
    }

    /**
     * @see Node#getGenericLogType
     */
    @Override
    public final LogEntryType getGenericLogType() {
        return getLogType();
    }

    /**
     * Get the log type of this node.
     */
    public LogEntryType getLogType() {
        return LogEntryType.LOG_IN;
    }

    /**
     * @see Loggable#getLogSize
     *
     * Overridden by DIN and DBIN classes.
     */
    @Override
    public int getLogSize() {
        return getLogSize(false);
    }

    /**
     * Computes the serialized size of this node. Must mirror writeToLog
     * exactly: any field written there must be counted here.
     *
     * @param deltasOnly if true, size a BIN-delta (dirty slots only); the
     * receiver must be a BIN in that case.
     */
    public final int getLogSize(boolean deltasOnly) {

        int size = super.getLogSize();          // ancestors

        size += LogUtils.getPackedLongLogSize(nodeId);
        size += LogUtils.getByteArrayLogSize(identifierKey); // identifier key

        if (keyPrefix != null) {
            size += LogUtils.getByteArrayLogSize(keyPrefix);
        }

        size += 1; // one byte for boolean flags

        final int nEntriesToWrite = getNEntriesToWrite(deltasOnly);

        final int maxEntriesToWrite =
            (!deltasOnly ?
             getMaxEntries() :
             ((BIN)this).getDeltaCapacity(nEntriesToWrite));

        size += LogUtils.getPackedIntLogSize(nEntriesToWrite);
        size += LogUtils.getPackedIntLogSize(level);
        size += LogUtils.getPackedIntLogSize(maxEntriesToWrite);

        final boolean compactLsnsRep = (entryLsnLongArray == null);
        size += LogUtils.getBooleanLogSize();   // compactLsnsRep
        if (compactLsnsRep) {
            size += LogUtils.INT_BYTES;         // baseFileNumber
        }

        boolean hasLastLoggedSize = isLastLoggedSizeStored();

        for (int i = 0; i < nEntries; i++) {    // entries
            if (deltasOnly && !isDirty(i)) {
                continue;
            }
            size += LogUtils.getByteArrayLogSize(entryKeyVals.get(i)) + // key
                (compactLsnsRep ? LogUtils.INT_BYTES :
                 LogUtils.getLongLogSize()) +                           // LSN
                1;                                                      // state
            if (hasLastLoggedSize) {
                size += LogUtils.getPackedIntLogSize(getLastLoggedSize(i));
            }
        }

        if (deltasOnly) {

            BIN bin = (BIN)this;

            size += LogUtils.getPackedIntLogSize(bin.getFullBinNEntries());
            size += LogUtils.getPackedIntLogSize(bin.getFullBinMaxEntries());

            size += bin.getBloomFilterLogSize();
        }

        return size;
    }

    /*
     * Overridden by DIN and DBIN classes.
     */
    @Override
    public void writeToLog(ByteBuffer logBuffer) {
        writeToLog(logBuffer, false);
    }

    /**
     * Serializes this node into logBuffer. Field order must match
     * readFromLog and the sizing in getLogSize(boolean).
     *
     * @param deltasOnly if true, write a BIN-delta (dirty slots only); the
     * receiver must be a BIN in that case.
     */
    public final void writeToLog(ByteBuffer logBuffer, boolean deltasOnly) {

        super.writeToLog(logBuffer);

        LogUtils.writePackedLong(logBuffer, nodeId);

        LogUtils.writeByteArray(logBuffer, identifierKey);

        boolean hasKeyPrefix = (keyPrefix != null);
        boolean hasLastLoggedSize = isLastLoggedSizeStored();

        /* Flag bits: 1=isRoot, 2=hasKeyPrefix, 4=hasLastLoggedSize,
           8=hasBloomFilter (deltas only). Must match readFromLog. */
        byte booleans = (byte) (isRoot() ? 1 : 0);
        booleans |= (hasKeyPrefix ? 2 : 0);
        booleans |= (hasLastLoggedSize ? 4 : 0);

        byte[] bloomFilter = null;

        if (deltasOnly) {
            BIN bin = (BIN)this;
            bloomFilter = bin.createBloomFilter();

            if (bloomFilter != null) {
                booleans |= 8;
            }
        }

        logBuffer.put(booleans);

        if (hasKeyPrefix) {
            LogUtils.writeByteArray(logBuffer, keyPrefix);
        }

        final int nEntriesToWrite = getNEntriesToWrite(deltasOnly);

        final int maxEntriesToWrite =
            (!deltasOnly ?
             getMaxEntries() :
             ((BIN)this).getDeltaCapacity(nEntriesToWrite));
        /*
        if (deltasOnly) {
            BIN bin = (BIN)this;
            System.out.println(
                "Logging BIN-delta: " + getNodeId() +
                " is delta = " + isBINDelta() +
                " nEntries = " + nEntriesToWrite +
                " max entries = " + maxEntriesToWrite +
                " full BIN entries = " + bin.getFullBinNEntries() +
                " full BIN max entries = " + bin.getFullBinMaxEntries());
        }
        */
        LogUtils.writePackedInt(logBuffer, nEntriesToWrite);
        LogUtils.writePackedInt(logBuffer, level);
        LogUtils.writePackedInt(logBuffer, maxEntriesToWrite);

        /* true if compact representation. */
        boolean compactLsnsRep = (entryLsnLongArray == null);
        LogUtils.writeBoolean(logBuffer, compactLsnsRep);
        if (compactLsnsRep) {
            LogUtils.writeInt(logBuffer, (int) baseFileNumber);
        }

        for (int i = 0; i < nEntries; i++) {

            if (deltasOnly && !isDirty(i)) {
                continue;
            }

            LogUtils.writeByteArray(logBuffer, entryKeyVals.get(i)); // key

            /*
             * A NULL_LSN may be stored when an incomplete insertion occurs,
             * but in that case the KnownDeleted flag must be set. See
             * Tree.insert. [#13126]
             */
            assert checkForNullLSN(i) :
                "logging IN " + getNodeId() + " with null lsn child " +
                " db=" + databaseImpl.getDebugName() +
                " isDeferredWriteMode=" + databaseImpl.isDeferredWriteMode() +
                " isTemporary=" + databaseImpl.isTemporary();

            if (compactLsnsRep) {                            // LSN
                /* Compact LSN: 1-byte file offset + 3-byte (LE) file pos. */
                int offset = i << 2;
                int fileOffset = getFileOffset(offset);
                logBuffer.put(getFileNumberOffset(offset));
                logBuffer.put((byte) ((fileOffset >>> 0) & 0xff));
                logBuffer.put((byte) ((fileOffset >>> 8) & 0xff));
                logBuffer.put((byte) ((fileOffset >>> 16) & 0xff));
            } else {
                LogUtils.writeLong(logBuffer, entryLsnLongArray[i]);
            }

            logBuffer.put(entryStates[i]);                   // state

            /* A full write makes the slot clean; deltas leave it dirty. */
            if (!deltasOnly) {
                entryStates[i] &= EntryStates.CLEAR_DIRTY_BIT;
            }

            if (hasLastLoggedSize) {
                LogUtils.writePackedInt(logBuffer, getLastLoggedSize(i));
            }
        }

        if (deltasOnly) {

            BIN bin = (BIN)this;

            LogUtils.writePackedInt(logBuffer, bin.getFullBinNEntries());
            LogUtils.writePackedInt(logBuffer, bin.getFullBinMaxEntries());

            if (bloomFilter != null) {
                BINDeltaBloomFilter.writeToLog(bloomFilter, logBuffer);
            }
        }
    }

    /*
     * Used for assertion to prevent writing a null lsn to the log.
     * For a BIN, a NULL_LSN is tolerated when the slot is KnownDeleted;
     * for upper INs a NULL_LSN is never legal.
     */
    private boolean checkForNullLSN(int index) {
        boolean ok;
        if (isBIN()) {
            ok = !(getLsn(index) == DbLsn.NULL_LSN &&
                   (entryStates[index] & EntryStates.KNOWN_DELETED_BIT) == 0);
        } else {
            ok = (getLsn(index) != DbLsn.NULL_LSN);
        }
        return ok;
    }

    /*
     * Overridden by DIN and DBIN classes.
4925 */ 4926 @Override readFromLog( ByteBuffer itemBuffer, int entryVersion)4927 public void readFromLog( 4928 ByteBuffer itemBuffer, 4929 int entryVersion) { 4930 4931 readFromLog(itemBuffer, entryVersion, false); 4932 } 4933 readFromLog( ByteBuffer itemBuffer, int entryVersion, boolean deltasOnly)4934 public final void readFromLog( 4935 ByteBuffer itemBuffer, 4936 int entryVersion, 4937 boolean deltasOnly) { 4938 4939 super.readFromLog(itemBuffer, entryVersion); 4940 4941 boolean unpacked = (entryVersion < 6); 4942 4943 nodeId = LogUtils.readLong(itemBuffer, unpacked); 4944 identifierKey = LogUtils.readByteArray(itemBuffer, unpacked); 4945 4946 byte booleans = itemBuffer.get(); 4947 setIsRootFlag((booleans & 1) != 0); 4948 if ((booleans & 2) != 0) { 4949 keyPrefix = LogUtils.readByteArray(itemBuffer, unpacked); 4950 } 4951 4952 boolean hasLastLoggedSize = ((booleans & 4) != 0); 4953 assert !(hasLastLoggedSize && (entryVersion < 9)); 4954 4955 boolean hasBloomFilter = ((booleans & 8) != 0); 4956 assert(!hasBloomFilter || (entryVersion >= 10 && deltasOnly)); 4957 4958 nEntries = LogUtils.readInt(itemBuffer, unpacked); 4959 level = LogUtils.readInt(itemBuffer, unpacked); 4960 int length = LogUtils.readInt(itemBuffer, unpacked); 4961 4962 entryTargets = INTargetRep.NONE; 4963 entryKeyVals = new INKeyRep.Default(length); 4964 baseFileNumber = -1; 4965 long storedBaseFileNumber = -1; 4966 if (disableCompactLsns) { 4967 entryLsnByteArray = null; 4968 entryLsnLongArray = new long[length]; 4969 } else { 4970 entryLsnByteArray = new byte[length << 2]; 4971 entryLsnLongArray = null; 4972 } 4973 entryStates = new byte[length]; 4974 boolean compactLsnsRep = false; 4975 4976 if (entryVersion > 1) { 4977 compactLsnsRep = LogUtils.readBoolean(itemBuffer); 4978 if (compactLsnsRep) { 4979 baseFileNumber = LogUtils.readInt(itemBuffer) & 0xffffffff; 4980 storedBaseFileNumber = baseFileNumber; 4981 } 4982 } 4983 4984 for (int i = 0; i < nEntries; i++) { 4985 4986 entryKeyVals = 
entryKeyVals.set( 4987 i, LogUtils.readByteArray(itemBuffer, unpacked), this); 4988 4989 long lsn; 4990 if (compactLsnsRep) { 4991 /* LSNs in compact form. */ 4992 byte fileNumberOffset = itemBuffer.get(); 4993 int fileOffset = (itemBuffer.get() & 0xff); 4994 fileOffset |= ((itemBuffer.get() & 0xff) << 8); 4995 fileOffset |= ((itemBuffer.get() & 0xff) << 16); 4996 if (fileOffset == THREE_BYTE_NEGATIVE_ONE) { 4997 lsn = DbLsn.NULL_LSN; 4998 } else { 4999 lsn = DbLsn.makeLsn 5000 (storedBaseFileNumber + fileNumberOffset, fileOffset); 5001 } 5002 } else { 5003 /* LSNs in long form. */ 5004 lsn = LogUtils.readLong(itemBuffer); // LSN 5005 } 5006 5007 setLsnInternal(i, lsn); 5008 5009 byte entryState = itemBuffer.get(); // state 5010 if (!deltasOnly) { 5011 entryState &= EntryStates.CLEAR_DIRTY_BIT; 5012 } 5013 entryState &= EntryStates.CLEAR_MIGRATE_BIT; 5014 5015 /* 5016 * A NULL_LSN is the remnant of an incomplete insertion and the 5017 * KnownDeleted flag should be set. But because of bugs in prior 5018 * releases, the KnownDeleted flag may not be set. So set it here. 5019 * See Tree.insert. [#13126] 5020 */ 5021 if (lsn == DbLsn.NULL_LSN) { 5022 entryState |= EntryStates.KNOWN_DELETED_BIT; 5023 } 5024 5025 entryStates[i] = entryState; 5026 5027 if (hasLastLoggedSize) { 5028 setLastLoggedSizeUnconditional( 5029 i, LogUtils.readPackedInt(itemBuffer)); 5030 } 5031 } 5032 5033 if (deltasOnly) { 5034 setBINDelta(true); 5035 5036 if (entryVersion >= 10) { 5037 BIN bin = (BIN)this; 5038 bin.setFullBinNEntries(LogUtils.readPackedInt(itemBuffer)); 5039 bin.setFullBinMaxEntries(LogUtils.readPackedInt(itemBuffer)); 5040 5041 if (hasBloomFilter) { 5042 bin.bloomFilter = BINDeltaBloomFilter.readFromLog( 5043 itemBuffer, entryVersion); 5044 } 5045 } 5046 } 5047 5048 /* Dup conversion will be done by postFetchInit. 
*/ 5049 needDupKeyConversion = (entryVersion < 8); 5050 } 5051 5052 /** 5053 * @see Loggable#logicalEquals 5054 * Always return false, this item should never be compared. 5055 */ 5056 @Override logicalEquals(Loggable other)5057 public final boolean logicalEquals(Loggable other) { 5058 return false; 5059 } 5060 5061 /** 5062 * @see Loggable#dumpLog 5063 */ 5064 @Override dumpLog(StringBuilder sb, boolean verbose)5065 public final void dumpLog(StringBuilder sb, boolean verbose) { 5066 sb.append(beginTag()); 5067 5068 sb.append("<nodeId val=\""); 5069 sb.append(nodeId); 5070 sb.append("\"/>"); 5071 5072 sb.append(Key.dumpString(identifierKey, 0)); 5073 5074 // isRoot 5075 sb.append("<isRoot val=\""); 5076 sb.append(isRoot()); 5077 sb.append("\"/>"); 5078 5079 // level 5080 sb.append("<level val=\""); 5081 sb.append(Integer.toHexString(level)); 5082 sb.append("\"/>"); 5083 5084 if (keyPrefix != null) { 5085 sb.append("<keyPrefix>"); 5086 sb.append(Key.dumpString(keyPrefix, 0)); 5087 sb.append("</keyPrefix>"); 5088 } 5089 5090 // nEntries, length of entries array 5091 sb.append("<entries numEntries=\""); 5092 sb.append(nEntries); 5093 5094 sb.append("\" length=\""); 5095 sb.append(getMaxEntries()); 5096 5097 if (isBINDelta(false)) { 5098 BIN bin = (BIN)this; 5099 5100 sb.append("\" numFullBinEntries=\""); 5101 sb.append(bin.getFullBinNEntries()); 5102 sb.append("\" maxFullBinEntries=\""); 5103 sb.append(bin.getFullBinMaxEntries()); 5104 } 5105 5106 boolean compactLsnsRep = (entryLsnLongArray == null); 5107 if (compactLsnsRep) { 5108 sb.append("\" baseFileNumber=\""); 5109 sb.append(baseFileNumber); 5110 } 5111 sb.append("\">"); 5112 5113 if (verbose) { 5114 for (int i = 0; i < nEntries; i++) { 5115 sb.append("<ref kd=\""). 5116 append(isEntryKnownDeleted(i)); 5117 sb.append("\" pd=\""). 5118 append(isEntryPendingDeleted(i)); 5119 sb.append("\" logSize=\""). 
5120 append(getLastLoggedSize(i)); 5121 sb.append("\">"); 5122 sb.append(Key.dumpString(entryKeyVals.get(i), 0)); 5123 sb.append(DbLsn.toString(getLsn(i))); 5124 sb.append("</ref>"); 5125 } 5126 } 5127 5128 sb.append("</entries>"); 5129 5130 if (isBINDelta(false)) { 5131 BIN bin = (BIN)this; 5132 if (bin.bloomFilter != null) { 5133 BINDeltaBloomFilter.dumpLog(bin.bloomFilter, sb, verbose); 5134 } 5135 } 5136 5137 /* Add on any additional items from subclasses before the end tag. */ 5138 dumpLogAdditional(sb); 5139 5140 sb.append(endTag()); 5141 } 5142 5143 /** 5144 * Allows subclasses to add additional fields before the end tag. If they 5145 * just overload dumpLog, the xml isn't nested. 5146 */ dumpLogAdditional(StringBuilder sb)5147 protected void dumpLogAdditional(StringBuilder sb) { 5148 } 5149 beginTag()5150 public String beginTag() { 5151 return BEGIN_TAG; 5152 } 5153 endTag()5154 public String endTag() { 5155 return END_TAG; 5156 } 5157 dumpKeys()5158 final void dumpKeys() { 5159 for (int i = 0; i < nEntries; i++) { 5160 System.out.println(Key.dumpString(entryKeyVals.get(i), 0)); 5161 } 5162 } 5163 5164 /** 5165 * For unit test support: 5166 * @return a string that dumps information about this IN, without 5167 */ 5168 @Override dumpString(int nSpaces, boolean dumpTags)5169 public String dumpString(int nSpaces, boolean dumpTags) { 5170 StringBuilder sb = new StringBuilder(); 5171 if (dumpTags) { 5172 sb.append(TreeUtils.indent(nSpaces)); 5173 sb.append(beginTag()); 5174 sb.append('\n'); 5175 } 5176 5177 if (dumpTags) { 5178 sb.append("<nodeId val=\""); 5179 sb.append(nodeId); 5180 sb.append("\"/>"); 5181 } else { 5182 sb.append(nodeId); 5183 } 5184 sb.append('\n'); 5185 5186 sb.append(TreeUtils.indent(nSpaces + 2)); 5187 sb.append("<idkey>"); 5188 sb.append(identifierKey == null ? 
5189 "" : 5190 Key.dumpString(identifierKey, 0)); 5191 sb.append("</idkey>"); 5192 sb.append('\n'); 5193 sb.append(TreeUtils.indent(nSpaces + 2)); 5194 sb.append("<prefix>"); 5195 sb.append(keyPrefix == null ? "" : Key.dumpString(keyPrefix, 0)); 5196 sb.append("</prefix>\n"); 5197 sb.append(TreeUtils.indent(nSpaces + 2)); 5198 sb.append("<dirty val=\"").append(getDirty()).append("\"/>"); 5199 sb.append('\n'); 5200 sb.append(TreeUtils.indent(nSpaces + 2)); 5201 sb.append("<generation val=\"").append(generation).append("\"/>"); 5202 sb.append('\n'); 5203 sb.append(TreeUtils.indent(nSpaces + 2)); 5204 sb.append("<level val=\""); 5205 sb.append(Integer.toHexString(level)).append("\"/>"); 5206 sb.append('\n'); 5207 sb.append(TreeUtils.indent(nSpaces + 2)); 5208 sb.append("<isRoot val=\"").append(isRoot()).append("\"/>"); 5209 sb.append('\n'); 5210 sb.append(TreeUtils.indent(nSpaces + 2)); 5211 sb.append("<isBINDelta val=\"").append(isBINDelta(false)).append("\"/>"); 5212 sb.append('\n'); 5213 5214 sb.append(TreeUtils.indent(nSpaces + 2)); 5215 sb.append("<entries nEntries=\""); 5216 sb.append(nEntries); 5217 sb.append("\">"); 5218 sb.append('\n'); 5219 5220 for (int i = 0; i < nEntries; i++) { 5221 sb.append(TreeUtils.indent(nSpaces + 4)); 5222 sb.append("<entry id=\"" + i + "\">"); 5223 sb.append('\n'); 5224 if (getLsn(i) == DbLsn.NULL_LSN) { 5225 sb.append(TreeUtils.indent(nSpaces + 6)); 5226 sb.append("<lsn/>"); 5227 } else { 5228 sb.append(DbLsn.dumpString(getLsn(i), nSpaces + 6)); 5229 } 5230 sb.append('\n'); 5231 if (entryKeyVals.get(i) == null) { 5232 sb.append(TreeUtils.indent(nSpaces + 6)); 5233 sb.append("<key/>"); 5234 } else { 5235 sb.append(Key.dumpString(entryKeyVals.get(i), (nSpaces + 6))); 5236 } 5237 sb.append('\n'); 5238 if (entryTargets.get(i) == null) { 5239 sb.append(TreeUtils.indent(nSpaces + 6)); 5240 sb.append("<target/>"); 5241 } else { 5242 sb.append(entryTargets.get(i).dumpString(nSpaces + 6, true)); 5243 } 5244 sb.append('\n'); 5245 
sb.append(TreeUtils.indent(nSpaces + 6)); 5246 dumpDeletedState(sb, getState(i)); 5247 sb.append("<dirty val=\"").append(isDirty(i)).append("\"/>"); 5248 sb.append('\n'); 5249 sb.append(TreeUtils.indent(nSpaces + 4)); 5250 sb.append("</entry>"); 5251 sb.append('\n'); 5252 } 5253 5254 sb.append(TreeUtils.indent(nSpaces + 2)); 5255 sb.append("</entries>"); 5256 sb.append('\n'); 5257 if (dumpTags) { 5258 sb.append(TreeUtils.indent(nSpaces)); 5259 sb.append(endTag()); 5260 } 5261 return sb.toString(); 5262 } 5263 5264 /** 5265 * Utility method for output of knownDeleted and pendingDelete. 5266 */ dumpDeletedState(StringBuilder sb, byte state)5267 static void dumpDeletedState(StringBuilder sb, byte state) { 5268 sb.append("<knownDeleted val=\""); 5269 sb.append(isStateKnownDeleted(state)).append("\"/>"); 5270 sb.append("<pendingDeleted val=\""); 5271 sb.append(isStatePendingDeleted(state)).append("\"/>"); 5272 } 5273 5274 @Override toString()5275 public String toString() { 5276 return dumpString(0, true); 5277 } 5278 shortClassName()5279 public String shortClassName() { 5280 return "IN"; 5281 } 5282 5283 /** 5284 * Send trace messages to the java.util.logger. Don't rely on the logger 5285 * alone to conditionalize whether we send this message, we don't even want 5286 * to construct the message if the level is not enabled. 
5287 */ traceSplit(Level level, IN parent, IN newSibling, long parentLsn, long myNewLsn, long newSiblingLsn, int splitIndex, int idKeyIndex, int childIndex)5288 private void traceSplit(Level level, 5289 IN parent, 5290 IN newSibling, 5291 long parentLsn, 5292 long myNewLsn, 5293 long newSiblingLsn, 5294 int splitIndex, 5295 int idKeyIndex, 5296 int childIndex) { 5297 Logger logger = getEnv().getLogger(); 5298 if (logger.isLoggable(level)) { 5299 StringBuilder sb = new StringBuilder(); 5300 sb.append(TRACE_SPLIT); 5301 sb.append(" parent="); 5302 sb.append(parent.getNodeId()); 5303 sb.append(" child="); 5304 sb.append(getNodeId()); 5305 sb.append(" newSibling="); 5306 sb.append(newSibling.getNodeId()); 5307 sb.append(" parentLsn = "); 5308 sb.append(DbLsn.getNoFormatString(parentLsn)); 5309 sb.append(" childLsn = "); 5310 sb.append(DbLsn.getNoFormatString(myNewLsn)); 5311 sb.append(" newSiblingLsn = "); 5312 sb.append(DbLsn.getNoFormatString(newSiblingLsn)); 5313 sb.append(" splitIdx="); 5314 sb.append(splitIndex); 5315 sb.append(" idKeyIdx="); 5316 sb.append(idKeyIndex); 5317 sb.append(" childIdx="); 5318 sb.append(childIndex); 5319 LoggerUtils.logMsg(logger, 5320 databaseImpl.getEnv(), 5321 level, 5322 sb.toString()); 5323 } 5324 } 5325 5326 /** 5327 * Send trace messages to the java.util.logger. Don't rely on the logger 5328 * alone to conditionalize whether we send this message, we don't even want 5329 * to construct the message if the level is not enabled. 
5330 */ traceDelete(Level level, int index)5331 private void traceDelete(Level level, int index) { 5332 Logger logger = databaseImpl.getEnv().getLogger(); 5333 if (logger.isLoggable(level)) { 5334 StringBuilder sb = new StringBuilder(); 5335 sb.append(TRACE_DELETE); 5336 sb.append(" in=").append(getNodeId()); 5337 sb.append(" index="); 5338 sb.append(index); 5339 LoggerUtils.logMsg(logger, 5340 databaseImpl.getEnv(), 5341 level, 5342 sb.toString()); 5343 } 5344 } 5345 setFetchINHook(TestHook hook)5346 public final void setFetchINHook(TestHook hook) { 5347 fetchINHook = hook; 5348 } 5349 } 5350