1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2002, 2014 Oracle and/or its affiliates. All rights reserved. 5 * 6 */ 7 8 package com.sleepycat.je.rep.vlsn; 9 10 import java.io.PrintStream; 11 import java.nio.ByteBuffer; 12 import java.util.ArrayList; 13 import java.util.List; 14 15 import com.sleepycat.bind.tuple.LongBinding; 16 import com.sleepycat.bind.tuple.TupleBinding; 17 import com.sleepycat.bind.tuple.TupleInput; 18 import com.sleepycat.bind.tuple.TupleOutput; 19 import com.sleepycat.je.Cursor; 20 import com.sleepycat.je.CursorConfig; 21 import com.sleepycat.je.DatabaseEntry; 22 import com.sleepycat.je.DbInternal; 23 import com.sleepycat.je.EnvironmentFailureException; 24 import com.sleepycat.je.OperationStatus; 25 import com.sleepycat.je.config.EnvironmentParams; 26 import com.sleepycat.je.dbi.DatabaseImpl; 27 import com.sleepycat.je.dbi.EnvironmentImpl; 28 import com.sleepycat.je.log.FileReader; 29 import com.sleepycat.je.txn.Txn; 30 import com.sleepycat.je.utilint.DbLsn; 31 import com.sleepycat.je.utilint.VLSN; 32 33 /** 34 * A VLSNBucket instance represents a set of VLSN->LSN mappings. Buckets are 35 * usually not updated, except at times when the replication stream may have 36 * been reduced in size, by log cleaning or syncup. The VLSNBuckets in the 37 * VLSNIndex's VLSNTracker are written to disk and are persistent. There are 38 * also VLSNBuckets in the temporary recovery-time tracker that are used for 39 * collecting mappings found in the log during recovery. 40 * 41 * VLSNBuckets only hold mappings from a single log file. A single log file 42 * may be mapped by multiple VLSNBuckets though. 43 * 44 * As a tradeoff in space vs time, a VLSNBucket only stores a sparse set of 45 * mappings and the caller must use a VLSNReader to scan the log file and 46 * find any log entries not mapped directly by the bucket. In addition, 47 * the VLSN is not actually stored. 
 * Only the offset portion of the LSN is
 * stored, and the VLSN is intuited by a stride field. Each VLSNBucket
 * only maps a single file, though a single file may be mapped by several
 * VLSNBuckets.
 *
 * For example, suppose a node had these VLSN->LSN mappings:
 *
 * VLSN LSN (file/offset)
 * 9    10/100
 * 10   10/110
 * 11   10/120
 * 12   10/130
 * 13   10/140
 * 14   11/100
 * 15   11/120
 *
 * The mappings in file 10 could be represented by a VLSNBucket with
 * a stride of 4. That means the bucket would hold the mappings for
 *  9 10/100,
 * 13 10/140
 *
 * And since the target log file number and the stride is known, the mappings
 * can be represented by the offset alone in this array: {100, 140}, rather
 * than storing the whole lsn.
 *
 * Each bucket can also provide the mapping for the first and last VLSN it
 * covers, even if the lastVLSN is not divisible by the stride. This is done to
 * support forward and backward scanning. From the example above, the completed
 * bucket can provide 9->10/100 and 13->10/140; and a bucket whose last VLSN
 * fell off the stride boundary (say one ending at VLSN 12) would still provide
 * its exact last mapping, 12->10/130, even though 12 is not a stride's worth
 * away from 9.
 *
 * Because registering a VLSN->LSN mapping is done outside the log write latch,
 * any inserts into the VLSNBucket may not be in order. However, when any
 * VLSN is registered, we can assume that all VLSNs < that value do exist in
 * the log. It's just an accident of timing that they haven't yet been
 * registered. Note that out of order inserts into the buckets can create holes
 * in the bucket's offset array, or cause the array to be shorter than
 * anticipated.
 *
 * For example, if the insertion order into the bucket is vlsns 9, 15, we'll
 * actually only keep an offset array of size 1.
We have to be able to handle 88 * holes in the bucket, and can't count on filling them in when the lagging 89 * vlsn arrives, because it is possible that a reading thread will access the 90 * bucket before the laggard inserter arrives, or that the bucket might be 91 * flushed to disk, and become immutable. 92 */ 93 public class VLSNBucket { 94 95 /* On-disk version. */ 96 private static final int VERSION = 1; 97 98 /* File number for target file. */ 99 private final long fileNumber; 100 101 /* Interval between VLSN values that are mapped. */ 102 private final int stride; 103 104 protected VLSN firstVLSN = VLSN.NULL_VLSN; 105 protected VLSN lastVLSN = VLSN.NULL_VLSN; 106 private long lastLsn = DbLsn.NULL_LSN; 107 108 /* 109 * The file offsets are really unsigned ints. The calls to put() are 110 * implemented to let us assume that the list is fully populated. A 111 * subclass of truncateableList has been used in order to provide access to 112 * the ArrayList.removeFromRange method. 113 */ 114 private TruncateableList<Integer> fileOffsets; 115 116 /* 117 * The max number of offsets and maxDistance help guide when to close the 118 * bucket and start a new one. Not persistent. 119 */ 120 private int maxMappings; 121 private int maxDistance; 122 123 private static final int NO_OFFSET = 0; 124 125 /* True if there are changes to the bucket that are not on disk. */ 126 boolean dirty; 127 128 /* 129 * True if the VLSNBucket will not accept any more modifications; used to 130 * safeguard the bucket while the index is being written to disk. 
131 */ 132 private boolean closed = false; 133 VLSNBucket(long fileNumber, int stride, int maxMappings, int maxDistance, VLSN firstVLSN)134 VLSNBucket(long fileNumber, 135 int stride, 136 int maxMappings, 137 int maxDistance, 138 VLSN firstVLSN) { 139 this.fileNumber = fileNumber; 140 this.stride = stride; 141 this.maxMappings = maxMappings; 142 this.maxDistance = maxDistance; 143 144 /* 145 * The VLSNs in the bucket are initialized to indicate what range 146 * should be covered by this bucket. But there may not be any offsets 147 * recorded either in the lastLsn or the fileOffset. 148 */ 149 this.firstVLSN = firstVLSN; 150 this.lastVLSN = firstVLSN; 151 152 /* 153 * Initialize the file offsets with a -1 value to correspond to 154 * firstVLSN. 155 */ 156 fileOffsets = new TruncateableList<Integer>(); 157 fileOffsets.add(0, NO_OFFSET); 158 } 159 160 /* For reading from disk. */ VLSNBucket(TupleInput ti)161 private VLSNBucket(TupleInput ti) { 162 fileNumber = ti.readPackedLong(); 163 stride = ti.readPackedInt(); 164 firstVLSN = new VLSN(ti.readPackedLong()); 165 lastVLSN = new VLSN(ti.readPackedLong()); 166 lastLsn = ti.readPackedLong(); 167 int size = ti.readPackedInt(); 168 fileOffsets = new TruncateableList<Integer>(size); 169 for (int i = 0; i < size; i++) { 170 fileOffsets.add(i, DbLsn.getFileOffsetAsInt(ti.readUnsignedInt())); 171 } 172 } 173 174 /** 175 * Record the LSN location for this VLSN. 176 * 177 * One key issue is that puts() are not synchronized, and the VLSNs may 178 * arrive out of order. If an out of order VLSN does arrive, we can still 179 * assume that the earlier VLSNs have been successfully logged. If a VLSN 180 * arrives that is divisible by the stride, and should be recorded in the 181 * fileOffsets, but is not the next VLSN that should be recorded, we'll pad 182 * out the fileOffsets list with placeholders. 183 * 184 * For example, suppose the stride is 3, and the first VLSN is 2. Then this 185 * bucket should record VLSN 2, 5, 8, ... etc. 
If VLSN 8 arrives before 186 * VLSN 5, VLSN 8 will be recorded, and VLSN 5 will have an offset 187 * placeholder of NO_OFFSET. It is a non-issue if VLSNs 3, 4, 6, 7 arrive 188 * out of order, because they would not have been recorded anyway. This 189 * should not happen often, because the stride should be fairly large, and 190 * the calls to put() should be close together. If the insertion order is 191 * vlsn 2, 5, 9, then the file offsets array will be a little short, and 192 * will only have 2 elements, instead of 3. 193 * 194 * We follow this policy because we must always have a valid begin and end 195 * point for the range. We must handle placeholders in all cases, and can't 196 * count of later vlsn inserts, because a bucket can become immutable at 197 * any time if it is flushed to disk. 198 * 199 * @return false if this bucket will not accept this VLSN. Generally, a 200 * refusal might happen because the bucket was full or the mapping is too 201 * large a distance away from the previous mapping. In that case, the 202 * tracker will start another bucket. 203 */ put(VLSN vlsn, long lsn)204 synchronized boolean put(VLSN vlsn, long lsn) { 205 206 if (closed) { 207 return false; 208 } 209 210 if (!belongs(vlsn, lsn)) { 211 return false; 212 } 213 214 /* 215 * Add it to the fileOffset if it's on a stride boundary and is the 216 * next mapping in the fileOffset list. 217 */ 218 if (isModulo(vlsn)) { 219 int index = getIndex(vlsn); 220 int listLen = fileOffsets.size(); 221 if (index < listLen) { 222 fileOffsets.set(index, DbLsn.getFileOffsetAsInt(lsn)); 223 } else if (index == listLen) { 224 fileOffsets.add(DbLsn.getFileOffsetAsInt(lsn)); 225 } else { 226 for (int i = listLen; i < index; i++) { 227 fileOffsets.add(NO_OFFSET); 228 } 229 fileOffsets.add(DbLsn.getFileOffsetAsInt(lsn)); 230 } 231 dirty = true; 232 } 233 234 /* If the lastLsn is less, or not initialized, set it to this VLSN. 
*/ 235 if ((lastVLSN.compareTo(vlsn) < 0) || 236 (lastLsn == DbLsn.NULL_LSN)) { 237 lastVLSN = vlsn; 238 lastLsn = lsn; 239 dirty = true; 240 } 241 242 return true; 243 } 244 245 /* 246 * Return true if this VLSN is on a stride boundary. Assumes 247 * !firstVLSN.isNull() 248 */ isModulo(VLSN vlsn)249 private boolean isModulo(VLSN vlsn) { 250 return (((vlsn.getSequence() - firstVLSN.getSequence()) % stride) == 251 0); 252 } 253 getIndex(VLSN vlsn)254 private int getIndex(VLSN vlsn) { 255 assert isModulo(vlsn) : "Don't call getIndex on non-modulo VLSN " + 256 vlsn + " bucket=" + this; 257 258 return (int) ((vlsn.getSequence() - firstVLSN.getSequence()) / stride); 259 } 260 261 /** 262 * @return true if this VLSN->LSN mapping should go into this bucket. 263 */ belongs(VLSN vlsn, long lsn)264 private boolean belongs(VLSN vlsn, long lsn) { 265 assert vlsn.compareTo(firstVLSN) >= 0 : 266 "firstVLSN = " + firstVLSN + " should not be greater than " + vlsn; 267 268 if (DbLsn.getFileNumber(lsn) != fileNumber) { 269 /* Mappings must be for same file. */ 270 return false; 271 } 272 273 if (emptyInternal()) { 274 return true; 275 } 276 277 /* 278 * Some other thread beat us to the put() call and inserted a later 279 * mapping, so we know for sure that we fit in this bucket 280 */ 281 if (lastVLSN.compareTo(vlsn) > 0) { 282 return true; 283 } 284 285 boolean onStrideBoundary = isModulo(vlsn); 286 if (onStrideBoundary && (fileOffsets.size() >= maxMappings)) { 287 /* Too full. */ 288 return false; 289 } 290 291 /* 292 * Will this VLSN be next one recorded in the fileOffsets? If so, 293 * calculate the scan distance. 294 */ 295 if ((onStrideBoundary && (getIndex(vlsn) == fileOffsets.size())) || 296 lastVLSN.compareTo(vlsn) < 0) { 297 /* This VLSN is going in at the tail of the bucket. 
*/ 298 int lastOffset = fileOffsets.get(fileOffsets.size() - 1); 299 if ((DbLsn.getFileOffset(lsn) - 300 DbLsn.convertIntFileOffsetToLong(lastOffset)) > 301 maxDistance) { 302 /* The scan distance is exceeded. */ 303 return false; 304 } 305 } 306 307 return true; 308 } 309 310 /** 311 * @return true if this bucket contains this mapping. 312 */ owns(VLSN vlsn)313 synchronized boolean owns(VLSN vlsn) { 314 if (vlsn.equals(VLSN.NULL_VLSN)) { 315 return false; 316 } else if (firstVLSN.equals(VLSN.NULL_VLSN)) { 317 return false; 318 } else { 319 return (firstVLSN.compareTo(vlsn) <= 0) && 320 (lastVLSN.compareTo(vlsn) >= 0); 321 } 322 } 323 getFirst()324 synchronized VLSN getFirst() { 325 return firstVLSN; 326 } 327 getLast()328 synchronized VLSN getLast() { 329 return lastVLSN; 330 } 331 332 /** 333 * Return a file number that is less or equal to the first lsn mapped 334 * by this bucket. In standard VLSNBuckets, only one file is covered, so 335 * there is only one possible value. In GhostBuckets, multiple files could 336 * be covered. 337 * @return 338 */ getLTEFileNumber()339 long getLTEFileNumber() { 340 return fileNumber; 341 } 342 343 /* 344 * Similar to getLTEFileNumber, for this implementation there's only one 345 * possible file. 346 */ getGTEFileNumber()347 long getGTEFileNumber() { 348 return fileNumber; 349 } 350 empty()351 synchronized boolean empty() { 352 return emptyInternal(); 353 } 354 emptyInternal()355 private boolean emptyInternal() { 356 return (firstVLSN.equals(lastVLSN) && 357 (lastLsn == DbLsn.NULL_LSN)); 358 } 359 follows(VLSN vlsn)360 boolean follows(VLSN vlsn) { 361 return (firstVLSN.compareTo(vlsn) > 0); 362 } 363 precedes(VLSN vlsn)364 boolean precedes(VLSN vlsn) { 365 return (!lastVLSN.equals(VLSN.NULL_VLSN) && 366 (lastVLSN.compareTo(vlsn) < 0)); 367 } 368 369 /** 370 * Returns the mapping whose VLSN is >= the VLSN parameter. 
For example, if 371 * the bucket holds mappings for vlsn 10, 13, 16, 372 * 373 * - the greater than or equal mapping for VLSN 10 is 10/lsn 374 * - the greater than or equal mapping for VLSN 11 is 13/lsn 375 * - the greater than or equal mapping for VLSN 13 is 13/lsn 376 * 377 * File offsets may be null in the middle of the file offsets array because 378 * of out of order mappings. This method must return a non-null lsn, and 379 * must account for null offsets. 380 * 381 * @return the mapping whose VLSN is >= the VLSN parameter. Will never 382 * return NULL_LSN, because the VLSNRange begin and end point are always 383 * mapped. 384 */ getGTELsn(VLSN vlsn)385 public synchronized long getGTELsn(VLSN vlsn) { 386 387 if (lastVLSN.equals(vlsn)) { 388 return lastLsn; 389 } 390 391 int index; 392 if (firstVLSN.compareTo(vlsn) >= 0) { 393 394 /* 395 * It's possible for vlsn to be < the firstVLSN if vlsn 396 * falls between two buckets. For example, if the buckets are: 397 * bucketA = vlsn 10-> 20 398 * bucketB = vlsn 22->30 399 * then vlsn 21 will fall between two buckets, and will get bucketB 400 */ 401 index = 0; 402 } else { 403 index = getGTEIndex(vlsn); 404 } 405 406 /* 407 * This should never happen. Throw this exception to make debugging 408 * info available. 409 */ 410 if (index < 0) { 411 throw EnvironmentFailureException.unexpectedState 412 ("index=" + index + 413 " vlsn=" + vlsn + 414 " bucket=" + this); 415 } 416 417 if (index >= fileOffsets.size()) { 418 return lastLsn; 419 } 420 int useIndex = findPopulatedIndex(index, true /* forward */); 421 int offset = fileOffsets.get(useIndex); 422 return offset == NO_OFFSET ? 423 lastLsn : DbLsn.makeLsn(fileNumber, offset); 424 } 425 426 /** 427 * Return the index for the mapping >= this VLSN. Note that this is just 428 * a stride calculation, and a non-existent file offset index might be 429 * returned. 
430 */ getGTEIndex(VLSN vlsn)431 private int getGTEIndex(VLSN vlsn) { 432 long diff = vlsn.getSequence() - firstVLSN.getSequence(); 433 return (int) ((diff + (stride - 1)) / stride); 434 } 435 436 /** 437 * We'd like to return the mapping at startIndex for the get{LTE, GTE} 438 * Mapping methods, but the offsets may not be populated if put() calls 439 * have come out of order. Search for the next populated offset. 440 */ findPopulatedIndex(int startIndex, boolean forward)441 private int findPopulatedIndex(int startIndex, boolean forward) { 442 if (forward) { 443 for (int i = startIndex; i < fileOffsets.size(); i++) { 444 if (fileOffsets.get(i) != NO_OFFSET) { 445 return i; 446 } 447 } 448 } else { 449 for (int i = startIndex; i >= 0; i--) { 450 if (fileOffsets.get(i) != NO_OFFSET) { 451 return i; 452 } 453 } 454 } 455 return startIndex; 456 } 457 458 /** 459 * Returns the lsn whose VLSN is <= the VLSN parameter. For example, if 460 * the bucket holds mappings for vlsn 10, 13, 16, 461 * 462 * - the less than or equal mapping for VLSN 10 is 10/lsn 463 * - the less than or equal mapping for VLSN 11 is 10/lsn 464 * - the less than or equal mapping for VLSN 13 is 13/lsn 465 * 466 * File offsets may be null in the middle of the file offsets array because 467 * of out of order mappings. This method must return a non-null lsn, and 468 * must account for null offsets. 469 * 470 * @return the lsn whose VLSN is <= the VLSN parameter. Will never return 471 * NULL_LSN, because the VLSNRange begin and end points are always mapped. 472 */ getLTELsn(VLSN vlsn)473 synchronized long getLTELsn(VLSN vlsn) { 474 475 /* 476 * It's possible for vlsn to be greater than lastVLSN if vlsn falls 477 * between two buckets. 
478 * For example, if the buckets are: 479 * bucketA = vlsn 10-> 20 480 * bucketB = vlsn 22->30 481 * then vlsn 21 will fall between two buckets, and will get bucketA 482 */ 483 if (lastVLSN.compareTo(vlsn) <= 0) { 484 return lastLsn; 485 } 486 487 long diff = vlsn.getSequence() - firstVLSN.getSequence(); 488 489 /* 490 * Make sure that the file offset array isn't unexpectedly short due to 491 * out of order inserts. 492 */ 493 int index = (int)(diff / stride); 494 if (index >= fileOffsets.size()) { 495 index = fileOffsets.size() - 1; 496 } 497 498 int useIndex = findPopulatedIndex(index, false /* forward */); 499 int offset = fileOffsets.get(useIndex); 500 501 assert offset != NO_OFFSET : "bucket should always have a non-null " + 502 "first offset. vlsn= " + vlsn + " bucket=" + this; 503 504 return (DbLsn.makeLsn(fileNumber, offset)); 505 } 506 507 /** 508 * @return the lsn whose VLSN is == the VLSN parameter or DbLsn.NULL_LSN if 509 * there is no mapping. Note that because of out of order puts, there may 510 * be missing mappings that appear later on. 511 */ getLsn(VLSN vlsn)512 public synchronized long getLsn(VLSN vlsn) { 513 assert owns(vlsn) : "vlsn=" + vlsn + " " + this; 514 515 if (lastVLSN.equals(vlsn)) { 516 return lastLsn; 517 } 518 519 if (!isModulo(vlsn)) { 520 return DbLsn.NULL_LSN; 521 } 522 523 int index = getIndex(vlsn); 524 if (index >= fileOffsets.size()) { 525 return DbLsn.NULL_LSN; 526 } 527 528 int offset = fileOffsets.get(index); 529 if (offset == NO_OFFSET) { 530 return DbLsn.NULL_LSN; 531 } 532 533 return DbLsn.makeLsn(fileNumber, offset); 534 } 535 getLastLsn()536 synchronized long getLastLsn() { 537 return lastLsn; 538 } 539 540 /** 541 * Remove the mappings from this bucket that are for VLSNs <= 542 * lastDuplicate. If this results in a broken stride interval, package all 543 * those mappings into their own bucket and return it as a remainder 544 * bucket. 545 * 546 * For example, suppose this bucket has a stride of 5 and maps VLSN 10-23. 
547 * Then it has mappings for 10, 15, 20, 23. 548 * 549 * If we need to remove mappings <= 16, we'll end up without a bucket that 550 * serves as a home base for vlsns 17,18,19. Those will be spun out into 551 * their own bucket, and this bucket will be adjusted to start at VLSN 20. 552 * This bucket should end up with 553 * 554 * - firstVLSN = 20 555 * - fileOffset is an array of size 1, for the LSN for VLSN 20 556 * - lastVLSN = 23 557 * - lastLsn = the same as before 558 * 559 * The spun-off bucket should be: 560 * - firstVLSN = 17 561 * - fileOffset is an array of size 1, for the LSN for VLSN 17 562 * - lastVLSN = 19 563 * - lastLsn = lsn for 19 564 * 565 * @return the newly created bucket that holds mappings from a broken 566 * stride interval, or null if there was no need to create such a bucket. 567 */ removeFromHead(EnvironmentImpl envImpl, VLSN lastDuplicate)568 VLSNBucket removeFromHead(EnvironmentImpl envImpl, VLSN lastDuplicate) { 569 570 if (empty()) { 571 return null; 572 } 573 574 /* 575 * No overlap, this bucket owns mappngs that follow the duplicate 576 * range. 577 */ 578 if (lastDuplicate.compareTo(firstVLSN) < 0) { 579 return null; 580 } 581 582 /* 583 * This whole bucket is to be deleted, all its mappings are <= the 584 * lastDuplicate. 585 */ 586 if (lastVLSN.compareTo(lastDuplicate) <= 0) { 587 fileOffsets = null; 588 firstVLSN = VLSN.NULL_VLSN; 589 lastVLSN = VLSN.NULL_VLSN; 590 lastLsn = DbLsn.NULL_LSN; 591 return null; 592 } 593 594 VLSN indexVLSN = firstVLSN; 595 int newFirstIndex = -1; 596 597 /* 598 * Find the mappings that still belong. 
Using the example above, we 599 * should find that we can delete fileOffset[0] and fileOffset[1] and 600 * preserve fileOffset[2] 601 */ 602 for (int i = 0; i < fileOffsets.size(); i++) { 603 if ((indexVLSN.compareTo(lastDuplicate) > 0) && 604 (fileOffsets.get(i) != NO_OFFSET)) { 605 newFirstIndex = i; 606 break; 607 } 608 indexVLSN = new VLSN(indexVLSN.getSequence() + stride); 609 } 610 611 VLSNBucket remainder = null; 612 int lastOffset; 613 if (newFirstIndex == -1) { 614 615 /* 616 * None of the VLSNs represented by the strided file offsets are 617 * needed anymore. This bucket consists solely of the last 618 * VLSN->LSN pair. 619 */ 620 lastOffset = fileOffsets.get(fileOffsets.size() - 1); 621 fileOffsets = new TruncateableList<Integer>(); 622 fileOffsets.add(DbLsn.getFileOffsetAsInt(lastLsn)); 623 firstVLSN = lastVLSN; 624 } else { 625 /* Move the still-valid mappings to a new list. */ 626 assert (newFirstIndex > 0); 627 lastOffset = fileOffsets.get(newFirstIndex - 1); 628 TruncateableList<Integer> newFileOffsets = 629 new TruncateableList<Integer> 630 (fileOffsets.subList(newFirstIndex, fileOffsets.size())); 631 fileOffsets = newFileOffsets; 632 firstVLSN = new VLSN((newFirstIndex * stride) + 633 firstVLSN.getSequence()); 634 } 635 636 if (!firstVLSN.equals(lastDuplicate.getNext())) { 637 638 /* 639 * If lastDuplicate was not on the same stride boundary as our old 640 * bucket, we may have a broken bucket of mappings to preserve. 641 * Using our example numbers above, we still need to make sure 642 * there's a bucket that matches VLSNs 17, 18 19. 643 */ 644 long scanStart = DbLsn.makeLsn(fileNumber, lastOffset); 645 remainder = scanForNewBucket(envImpl, 646 lastDuplicate.getNext(), 647 firstVLSN.getPrev(), 648 scanStart); 649 } 650 651 dirty = true; 652 return remainder; 653 } 654 655 /** 656 * Scan the log fle for VLSN->LSN mappings for creating a new bucket. 
657 */ scanForNewBucket(EnvironmentImpl envImpl, VLSN first, VLSN last, long startLsn)658 private VLSNBucket scanForNewBucket(EnvironmentImpl envImpl, 659 VLSN first, 660 VLSN last, 661 long startLsn) { 662 663 VLSNBucket newBucket = new VLSNBucket(fileNumber, stride, 664 maxMappings, maxDistance, 665 first); 666 int readBufferSize = 667 envImpl.getConfigManager().getInt 668 (EnvironmentParams.LOG_ITERATOR_MAX_SIZE); 669 670 NewBucketReader scanner = 671 new NewBucketReader(newBucket, envImpl, readBufferSize, first, 672 last, startLsn); 673 674 while (!scanner.isDone() && (scanner.readNextEntry())) { 675 } 676 677 assert scanner.isDone(); 678 679 return newBucket; 680 } 681 682 /** 683 * Remove the mappings from this bucket that are for VLSNs >= 684 * startOfDelete. Unlike removing from the head, we need not worry about 685 * breaking a bucket stride interval. 686 * 687 * If prevLsn is NULL_VLSN, we don't have a good value to cap the bucket. 688 * Instead, we'll have to delete the bucket back to whatever was the next 689 * available lsn. For example, suppose the bucket has these mappings. This 690 * strange bucket (stride 25 is missing) is possible if vlsn 26 arrived 691 * early, out of order. 692 * 693 * in fileOffset: 10 -> 101 694 * in fileOffset: 15 -> no offset 695 * in fileOffset: 20 -> 201 696 * lastVLSN->lastnLsn mapping 26 -> 250 697 * 698 * If we have a prevLsn and the startOfDelete is 17, then we can create 699 * a new mapping 700 * in fileOffset: 10 -> 101 701 * in fileOffset: 15 -> no offset 702 * lastVLSN->lastnLsn mapping 17 -> 190 703 * 704 * If we don't have a prevLsn, then we know that we have to cut the bucket 705 * back to the largest known mapping, losing many mappings along the way. 706 * in fileOffset: 10 -> 101 707 * lastVLSN->lastnLsn mapping 10 -> 101 708 * 709 * If we are deleting in the vlsn area between the last stride and the 710 * last offset, (i.e. 
vlsn 23 is the startOfDelete) the with and without 711 * prevLSn cases would look like this: 712 * 713 * (there is a prevLsn, and 23 is startDelete. No need to truncate 714 * anything) 715 * in fileOffset: 10 -> 101 716 * in fileOffset: 15 -> no offset 717 * in fileOffset: 20 -> 201 718 * lastVLSN->lastnLsn mapping 23 -> prevLsn 719 * 720 * (there is no prevLsn, and 23 is startDelete) 721 * in fileOffset: 10 -> 101 722 * in fileOffset: 15 -> no offset 723 * in fileOffset: 20 -> 201 724 * lastVLSN->lastnLsn mapping 20 -> 201 725 * 726 * @param startOfDelete is the VLSN that begins the range to delete, 727 * inclusive 728 * @param prevLsn is the lsn of startOfDelete.getPrev(). We'll be using it 729 * to cap off the end of the bucket, by assigning it to the lastLsn field. 730 */ removeFromTail(VLSN startOfDelete, long prevLsn)731 void removeFromTail(VLSN startOfDelete, long prevLsn) { 732 733 if (empty()) { 734 return; 735 } 736 737 if (lastVLSN.compareTo(startOfDelete) < 0) { 738 return; 739 } 740 741 /* Delete all the mappings. */ 742 if (firstVLSN.compareTo(startOfDelete) >= 0) { 743 lastVLSN = firstVLSN; 744 lastLsn = DbLsn.NULL_LSN; 745 fileOffsets.clear(); 746 return; 747 } 748 749 /* Delete some of the mappings. */ 750 int deleteIndex = getGTEIndex(startOfDelete); 751 752 /* 753 * This should never happen, because the startOfDelete should be a vlsn 754 * that is >= the first vlsn and we handled the case where 755 * startOfDelete == firstVLSN already.) Throw this exception to make 756 * debugging info available. 757 */ 758 if (deleteIndex <= 0) { 759 throw EnvironmentFailureException.unexpectedState 760 ("deleteIndex=" + deleteIndex + 761 " startOfDelete=" + startOfDelete + 762 " bucket=" + this); 763 } 764 765 /* See if there are any fileoffsets to prune off. */ 766 if (deleteIndex < fileOffsets.size()) { 767 768 /* 769 * The startOfDeleteVLSN is a value between the firstVLSN and 770 * the last file offset. 
771 */ 772 if (prevLsn == DbLsn.NULL_LSN) { 773 int lastPopulatedIndex = 774 findPopulatedIndex(deleteIndex-1, false); 775 if (lastPopulatedIndex != (deleteIndex -1)) { 776 deleteIndex = lastPopulatedIndex + 1; 777 } 778 } 779 fileOffsets.truncate(deleteIndex); 780 } else { 781 /* 782 * The startOfDelete vlsn is somewhere between the last file offset 783 * and the lastVLSN. 784 */ 785 if (prevLsn == DbLsn.NULL_LSN) { 786 int lastIndex = fileOffsets.size() - 1; 787 int lastPopulatedIndex = findPopulatedIndex(lastIndex, false); 788 if (lastPopulatedIndex < lastIndex) { 789 fileOffsets.truncate(lastPopulatedIndex); 790 } 791 } 792 } 793 794 /* Now set the lastVLSN -> lastLSN mapping. */ 795 if (prevLsn == DbLsn.NULL_LSN) { 796 lastVLSN = new VLSN(((fileOffsets.size()-1) * stride) + 797 firstVLSN.getSequence()); 798 Integer lastOffset = fileOffsets.get(fileOffsets.size() - 1); 799 assert lastOffset != null; 800 lastLsn = DbLsn.makeLsn(fileNumber, lastOffset); 801 } else { 802 lastVLSN = startOfDelete.getPrev(); 803 lastLsn = prevLsn; 804 } 805 dirty = true; 806 } 807 808 /* For unit tests */ getNumOffsets()809 int getNumOffsets() { 810 return fileOffsets.size(); 811 } 812 close()813 void close() { 814 closed = true; 815 } 816 817 /** 818 * Write this bucket to the mapping database. 819 */ writeToDatabase(EnvironmentImpl envImpl, DatabaseImpl bucketDbImpl, Txn txn)820 void writeToDatabase(EnvironmentImpl envImpl, 821 DatabaseImpl bucketDbImpl, 822 Txn txn) { 823 824 if (!dirty) { 825 return; 826 } 827 828 Cursor c = null; 829 try { 830 c = DbInternal.makeCursor(bucketDbImpl, 831 txn, 832 CursorConfig.DEFAULT); 833 writeToDatabase(envImpl, c); 834 } finally { 835 if (c != null) { 836 c.close(); 837 } 838 } 839 } 840 841 /** 842 * Write this bucket to the mapping database using a cursor. Note that 843 * this method must disable critical eviction. Critical eviction makes the 844 * calling thread search for a target IN node to evict. 
That target IN node 845 * may or may not be in the internal VLSN db. 846 * 847 * For example, when a new, replicated LN is inserted or modified, a 848 * new VLSN is allocated. To do so, the app thread that is executing the 849 * operation 850 * A1. Takes a BIN latch on a BIN in a replicated db 851 * A2. Takes the VLSNINdex mutex 852 * 853 * Anyone calling writeDatabase() has to take these steps: 854 * B1. Take the VLSNIndex mutex 855 * B2. Get a BIN latch for a BIN in the internal vlsn db. 856 * 857 * This difference in locking hierarchy could cause a deadlock except for 858 * the fact that A1 and B2 are guaranteed to be in different databases. If 859 * writeDatabase() also did critical eviction, it would have a step where 860 * it tried to get a BIN latch on a replicated db, and we'd have a 861 * deadlock. [#18475] 862 */ writeToDatabase(EnvironmentImpl envImpl, Cursor cursor)863 void writeToDatabase(EnvironmentImpl envImpl, Cursor cursor) { 864 865 if (!dirty) { 866 return; 867 } 868 869 DatabaseEntry key = new DatabaseEntry(); 870 DatabaseEntry data = new DatabaseEntry(); 871 872 LongBinding.longToEntry(firstVLSN.getSequence(), key); 873 VLSNBucketBinding bucketBinding = new VLSNBucketBinding(); 874 bucketBinding.objectToEntry(this, data); 875 876 DbInternal.getCursorImpl(cursor).setAllowEviction(false); 877 OperationStatus status = cursor.put(key, data); 878 879 if (status != OperationStatus.SUCCESS) { 880 throw EnvironmentFailureException.unexpectedState 881 (envImpl, "Unable to write VLSNBucket for file " + 882 fileNumber + " status=" + status); 883 } 884 dirty = false; 885 } 886 887 /** 888 * Instantiate this from the database. Assumes that this bucket will not be 889 * used for insertion in the future. 
890 */ readFromDatabase(DatabaseEntry data)891 public static VLSNBucket readFromDatabase(DatabaseEntry data) { 892 893 VLSNBucketBinding mapperBinding = new VLSNBucketBinding(); 894 VLSNBucket bucket = mapperBinding.entryToObject(data); 895 return bucket; 896 } 897 fillDataEntry(DatabaseEntry data)898 void fillDataEntry(DatabaseEntry data) { 899 VLSNBucketBinding binding = new VLSNBucketBinding(); 900 binding.objectToEntry(this, data); 901 } 902 903 /** 904 * @see java.lang.Object#toString() 905 */ 906 @Override toString()907 public String toString() { 908 return String.format("<VLSNBucket fileNum=%d(0x%x) numOffsets=%d " + 909 "stride=%d firstVLSN=%s lastVLSN=%s lastLsn=%s/>", 910 fileNumber, fileNumber, 911 (fileOffsets == null) ? 0 : fileOffsets.size(), 912 stride, firstVLSN, lastVLSN, 913 DbLsn.getNoFormatString(lastLsn)); 914 } 915 916 /** 917 * For debugging and tracing. 918 */ dump(PrintStream out)919 public void dump(PrintStream out) { 920 if (fileOffsets == null) { 921 return; 922 } 923 924 long vlsnVal = firstVLSN.getSequence(); 925 int newlineCounter = 0; 926 for (Integer offset : fileOffsets) { 927 out.printf(" [%d 0x%x]", vlsnVal, 928 DbLsn.convertIntFileOffsetToLong(offset)); 929 930 vlsnVal += stride; 931 if (++newlineCounter > 6) { 932 out.println("\n"); 933 newlineCounter = 0; 934 } 935 } 936 937 out.printf("\n---------Last: VLSN=%s LSN=%s", lastVLSN, 938 DbLsn.getNoFormatString(lastLsn)); 939 } 940 isGhost()941 boolean isGhost() { 942 return false; 943 } 944 writeToTupleOutput(TupleOutput to)945 void writeToTupleOutput(TupleOutput to) { 946 947 to.writePackedLong(fileNumber); 948 to.writePackedInt(stride); 949 to.writePackedLong(firstVLSN.getSequence()); 950 to.writePackedLong(lastVLSN.getSequence()); 951 to.writePackedLong(lastLsn); 952 to.writePackedInt(fileOffsets.size()); 953 for (Integer offset: fileOffsets) { 954 to.writeUnsignedInt(DbLsn.convertIntFileOffsetToLong(offset)); 955 } 956 } 957 958 /** 959 * Marshals a VLSNBucket to a byte 
buffer to store in the database. 960 * Doesn't persist the file number, because that's the key of the database. 961 * A number of the fields are transient and are also not stored. 962 */ 963 private static class VLSNBucketBinding extends TupleBinding<VLSNBucket> { 964 965 @Override entryToObject(TupleInput ti)966 public VLSNBucket entryToObject(TupleInput ti) { 967 968 int onDiskVersion = ti.readPackedInt(); 969 if (onDiskVersion != VLSNBucket.VERSION) { 970 throw EnvironmentFailureException.unexpectedState 971 ("Don't expect version diff on_disk=" + onDiskVersion + 972 " source=" + VLSNBucket.VERSION); 973 } 974 boolean isGhost = ti.readBoolean(); 975 VLSNBucket bucket = null; 976 if (isGhost) { 977 bucket = GhostBucket.makeNewInstance(ti); 978 } else { 979 bucket = new VLSNBucket(ti); 980 } 981 return bucket; 982 } 983 984 @Override objectToEntry(VLSNBucket bucket, TupleOutput to)985 public void objectToEntry(VLSNBucket bucket, TupleOutput to) { 986 to.writePackedInt(VLSNBucket.VERSION); 987 to.writeBoolean(bucket.isGhost()); 988 bucket.writeToTupleOutput(to); 989 } 990 } 991 992 /** 993 * Scan a specific section of log and generate a new VLSNBucket for 994 * this section. 
995 */ 996 private static class NewBucketReader extends FileReader { 997 private final VLSNBucket remainderBucket; 998 private boolean done = false; 999 private final VLSN first; 1000 private final VLSN last; 1001 NewBucketReader(VLSNBucket remainderBucket, EnvironmentImpl envImpl, int readBufferSize, VLSN first, VLSN last, long startLsn)1002 public NewBucketReader(VLSNBucket remainderBucket, 1003 EnvironmentImpl envImpl, 1004 int readBufferSize, 1005 VLSN first, 1006 VLSN last, 1007 long startLsn) { 1008 super(envImpl, 1009 readBufferSize, 1010 true, // forward 1011 startLsn, 1012 null, // singleFileNumber 1013 DbLsn.NULL_LSN, // endOfFileLsn 1014 DbLsn.NULL_LSN); // finishLsn 1015 1016 this.remainderBucket = remainderBucket; 1017 this.first = first; 1018 this.last = last; 1019 } 1020 1021 /** 1022 * Return true if this entry is replicated and its VLSN >= the 1023 * firstVLSN and the entry is not invisible. These entries will 1024 * be used to bring the VLSNIndex up to speed. 1025 */ 1026 @Override isTargetEntry()1027 protected boolean isTargetEntry() { 1028 return (!currentEntryHeader.isInvisible() && 1029 entryIsReplicated() && 1030 (currentEntryHeader.getVLSN().compareTo(first) >= 0)); 1031 } 1032 1033 @Override processEntry(ByteBuffer entryBuffer)1034 protected boolean processEntry(ByteBuffer entryBuffer) { 1035 if (currentEntryHeader.getVLSN().compareTo(last) > 0) { 1036 done = true; 1037 } else { 1038 remainderBucket.put(currentEntryHeader.getVLSN(), 1039 getLastLsn()); 1040 } 1041 1042 entryBuffer.position(entryBuffer.position() + 1043 currentEntryHeader.getItemSize()); 1044 return true; 1045 } 1046 isDone()1047 boolean isDone() { 1048 return done; 1049 } 1050 } 1051 1052 @SuppressWarnings("serial") 1053 private static class TruncateableList<T> extends ArrayList<T> { 1054 TruncateableList()1055 TruncateableList() { 1056 super(); 1057 } 1058 TruncateableList(int capacity)1059 TruncateableList(int capacity) { 1060 super(capacity); 1061 } 1062 
TruncateableList(List<T> list)1063 TruncateableList(List<T> list) { 1064 super(list); 1065 } 1066 truncate(int fromIndex)1067 void truncate(int fromIndex) { 1068 removeRange(fromIndex, size()); 1069 } 1070 } 1071 } 1072