1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2002, 2014 Oracle and/or its affiliates. All rights reserved. 5 * 6 */ 7 8 package com.sleepycat.je.rep.vlsn; 9 10 import static com.sleepycat.je.utilint.VLSN.NULL_VLSN; 11 12 import java.util.ArrayList; 13 import java.util.NoSuchElementException; 14 import java.util.SortedMap; 15 import java.util.TreeMap; 16 17 import com.sleepycat.bind.tuple.LongBinding; 18 import com.sleepycat.je.Cursor; 19 import com.sleepycat.je.CursorConfig; 20 import com.sleepycat.je.DatabaseEntry; 21 import com.sleepycat.je.DatabaseException; 22 import com.sleepycat.je.DbInternal; 23 import com.sleepycat.je.EnvironmentFailureException; 24 import com.sleepycat.je.LockMode; 25 import com.sleepycat.je.OperationStatus; 26 import com.sleepycat.je.dbi.DatabaseImpl; 27 import com.sleepycat.je.dbi.EnvironmentImpl; 28 import com.sleepycat.je.rep.vlsn.VLSNRange.VLSNRangeBinding; 29 import com.sleepycat.je.txn.BasicLocker; 30 import com.sleepycat.je.txn.Locker; 31 import com.sleepycat.je.txn.Txn; 32 import com.sleepycat.je.utilint.DbLsn; 33 import com.sleepycat.je.utilint.LongStat; 34 import com.sleepycat.je.utilint.StatGroup; 35 import com.sleepycat.je.utilint.VLSN; 36 37 /** 38 * See @link{VLSNIndex} for an overview of the mapping system. The VLSNTracker 39 * packages the VLSNRange and the cached, in-memory VLSNBuckets. 40 * 41 * The tracker has a notion of the "currentBucket", which is the one receiving 42 * updates. All other cached buckets are finished and are awaiting a write to 43 * the database. Those finished buckets will only be updated in special 44 * circumstances, such as log cleaning or replication stream truncation, when 45 * we can assume that there will no readers viewing the buckets. 46 */ 47 class VLSNTracker { 48 private final EnvironmentImpl envImpl; 49 50 /* The first mapping that is is the tracker cache. */ 51 private VLSN firstTrackedVLSN = NULL_VLSN; 52 53 /* The last VLSN that is stored on disk. */ 54 private VLSN lastOnDiskVLSN = NULL_VLSN; 55 56 /* 57 * A cache of buckets that are not on disk. The map key is the bucket's 58 * first VLSN. 59 */ 60 SortedMap<Long, VLSNBucket> bucketCache; 61 62 /* 63 * The range should always be updated through assignment to a new 64 * VLSNRange, to ensure that the values stay consistent. The current bucket 65 * mutex must be taken in order to update this value, so that the current 66 * bucket is updated before the range is updated. When reading range 67 * fields, the caller must be sure to get a reference to a single range 68 * instance, so that there's no danger of getting inconsistent values. 69 */ 70 protected volatile VLSNRange range; 71 private boolean rangeTruncated; 72 73 /* 74 * In the future, we may want to vary stride, maxMappings and maxDistance 75 * dynamically, in reaction to efficient the mappings look. 76 */ 77 private final int stride; 78 private final int maxMappings; 79 private final int maxDistance; 80 81 private final LongStat nBucketsCreated; 82 83 /* 84 * Create an VLSNTracker, with the range initialized from the mapping db. 85 */ VLSNTracker(EnvironmentImpl envImpl, DatabaseImpl mappingDbImpl, int stride, int maxMappings, int maxDistance, StatGroup statistics)86 VLSNTracker(EnvironmentImpl envImpl, 87 DatabaseImpl mappingDbImpl, 88 int stride, 89 int maxMappings, 90 int maxDistance, 91 StatGroup statistics) 92 throws DatabaseException { 93 94 this.stride = stride; 95 this.maxMappings = maxMappings; 96 this.maxDistance = maxDistance; 97 this.envImpl = envImpl; 98 nBucketsCreated = 99 new LongStat(statistics, VLSNIndexStatDefinition.N_BUCKETS_CREATED); 100 101 bucketCache = new TreeMap<Long, VLSNBucket>(); 102 103 /* Read the current range information. */ 104 DatabaseEntry key = new DatabaseEntry(); 105 DatabaseEntry data = new DatabaseEntry(); 106 LongBinding.longToEntry(VLSNRange.RANGE_KEY, key); 107 108 Cursor cursor = null; 109 Locker locker = null; 110 try { 111 locker = BasicLocker.createBasicLocker(envImpl); 112 cursor = DbInternal.makeCursor(mappingDbImpl, 113 locker, 114 CursorConfig.DEFAULT); 115 DbInternal.getCursorImpl(cursor).setAllowEviction(false); 116 117 OperationStatus status = cursor.getSearchKey(key, data, 118 LockMode.DEFAULT); 119 if (status == OperationStatus.SUCCESS) { 120 /* initialize the range from the database. */ 121 VLSNRangeBinding rangeBinding = new VLSNRangeBinding(); 122 range = rangeBinding.entryToObject(data); 123 lastOnDiskVLSN = range.getLast(); 124 } else if (status == OperationStatus.NOTFOUND) { 125 /* No mappings exist before. */ 126 range = VLSNRange.EMPTY; 127 } else { 128 throw EnvironmentFailureException.unexpectedState 129 ("VLSNTracker init: status=" + status); 130 } 131 } finally { 132 if (cursor != null) { 133 cursor.close(); 134 } 135 136 if (locker != null) { 137 locker.operationEnd(true); 138 } 139 } 140 } 141 142 /* 143 * Create an empty VLSNTracker. Used during recovery. 144 */ VLSNTracker(EnvironmentImpl envImpl, int stride, int maxMappings, int maxDistance)145 VLSNTracker(EnvironmentImpl envImpl, 146 int stride, 147 int maxMappings, 148 int maxDistance) { 149 150 this.envImpl = envImpl; 151 this.stride = stride; 152 this.maxMappings = maxMappings; 153 this.maxDistance = maxDistance; 154 155 /* Set up a temporary stat group for use during recovery */ 156 StatGroup statistics = 157 new StatGroup(VLSNIndexStatDefinition.GROUP_NAME, 158 VLSNIndexStatDefinition.GROUP_DESC); 159 nBucketsCreated = 160 new LongStat(statistics, 161 VLSNIndexStatDefinition.N_BUCKETS_CREATED); 162 163 initEmpty(); 164 } 165 initEmpty()166 void initEmpty() { 167 bucketCache = new TreeMap<Long, VLSNBucket>(); 168 range = VLSNRange.EMPTY; 169 } 170 171 /** 172 * Return a bucket for reading a mapping for this VLSN. If vlsn is > 173 * lastOnDisk, a bucket is guaranteed to be returned. 174 */ getGTEBucket(VLSN vlsn)175 synchronized VLSNBucket getGTEBucket(VLSN vlsn) { 176 177 if (lastOnDiskVLSN.compareTo(vlsn) >= 0) { 178 /* The bucket is in the database, not the cache. */ 179 return null; 180 } 181 182 Long pivot = vlsn.getSequence() + 1; 183 SortedMap<Long, VLSNBucket> head = bucketCache.headMap(pivot); 184 VLSNBucket prevBucket = null; 185 if (head.size() > 0) { 186 prevBucket = head.get(head.lastKey()); 187 if (prevBucket.owns(vlsn)) { 188 return prevBucket; 189 } 190 } 191 192 /* 193 * If the key is not in the headMap, we must return the next bucket 194 * with mappings that follow on. 195 */ 196 SortedMap<Long, VLSNBucket> tail = bucketCache.tailMap(pivot); 197 if (tail.size() > 0) { 198 VLSNBucket bucket = tail.get(tail.firstKey()); 199 assert (bucket.owns(vlsn) || bucket.follows(vlsn)) : 200 "VLSN " + vlsn + " got wrong bucket " + bucket; 201 return bucket; 202 } 203 204 throw EnvironmentFailureException.unexpectedState 205 (envImpl, "VLSN " + vlsn + " should be held within this tracker. " + 206 this + " prevBucket=" + prevBucket); 207 } 208 209 /** 210 * Get the bucket which holds a mapping for this VLSN. If there is no such 211 * bucket, get the one directly preceding it. If this VLSN is >= 212 * firstTrackedVLSN, then we should be able to guarantee that a bucket is 213 * returned. 214 */ getLTEBucket(VLSN vlsn)215 synchronized VLSNBucket getLTEBucket(VLSN vlsn) { 216 217 if (firstTrackedVLSN.equals(NULL_VLSN) || 218 (firstTrackedVLSN.compareTo(vlsn) > 0)) { 219 return null; 220 } 221 222 Long pivot = vlsn.getSequence() + 1; 223 SortedMap<Long, VLSNBucket> head = bucketCache.headMap(pivot); 224 if (head.size() > 0) { 225 return head.get(head.lastKey()); 226 } 227 228 /* 229 * We shouldn't get here. Get the tail purely for creating a debugging 230 * message. 231 */ 232 SortedMap<Long, VLSNBucket> tail = bucketCache.tailMap(pivot); 233 VLSNBucket nextBucket = null; 234 if (tail.size() > 0) { 235 nextBucket = tail.get(tail.firstKey()); 236 } 237 throw EnvironmentFailureException.unexpectedState 238 (envImpl, "VLSN " + vlsn + " should be held within this tracker. " + 239 this + " nextBucket=" + nextBucket); 240 } 241 242 /** 243 * Record a new VLSN->LSN mapping. We guarantee that the first and last 244 * VLSNs in the range have a mapping. If a VLSN comes out of order, we will 245 * discard it, and will not update the range. 246 */ track(VLSN vlsn, long lsn, byte entryTypeNum)247 synchronized void track(VLSN vlsn, long lsn, byte entryTypeNum) { 248 249 VLSNBucket currentBucket = null; 250 251 if (vlsn.compareTo(lastOnDiskVLSN) < 0) { 252 253 /* 254 * This VLSN is a laggard. It belongs to a bucket that has already 255 * gone to disk. Since on disk buckets can't be modified, throw 256 * this mapping away. Do be sure to update the range in case 257 * lastSync or lastTxnEnd should be updated as a result of this 258 * mapping. 259 */ 260 updateRange(vlsn, entryTypeNum); 261 return; 262 } 263 264 if (bucketCache.size() == 0) { 265 /* Nothing in the tracker, add a new bucket. */ 266 currentBucket = new VLSNBucket(DbLsn.getFileNumber(lsn), stride, 267 maxMappings, maxDistance, vlsn); 268 nBucketsCreated.increment(); 269 270 bucketCache.put(currentBucket.getFirst().getSequence(), 271 currentBucket); 272 } else { 273 /* Find the last bucket. Only the last bucket is updateable. */ 274 currentBucket = bucketCache.get(bucketCache.lastKey()); 275 } 276 277 /* 278 * This VLSN is a laggard that was preceded by an earlier mapping which 279 * came unseasonably early. This VLSN can't fit into the current 280 * bucket, and since we only want to update the last bucket, we'll 281 * throw this mapping away. 282 */ 283 if (currentBucket.follows(vlsn)) { 284 updateRange(vlsn, entryTypeNum); 285 return; 286 } 287 288 if (!currentBucket.put(vlsn, lsn)) { 289 290 /* 291 * Couldn't put the mapping in this bucket. Close it and move to a 292 * new one. 293 */ 294 currentBucket = 295 new VLSNBucket(DbLsn.getFileNumber(lsn), stride, 296 maxMappings, maxDistance, vlsn); 297 nBucketsCreated.increment(); 298 bucketCache.put(currentBucket.getFirst().getSequence(), 299 currentBucket); 300 if (!currentBucket.put(vlsn, lsn)) { 301 throw EnvironmentFailureException.unexpectedState 302 (envImpl, "Couldn't put VLSN " + vlsn + " into " + 303 currentBucket); 304 } 305 } 306 307 updateRange(vlsn, entryTypeNum); 308 309 /* 310 * Only update the firstTrackedVLSN if this mapping is really resident 311 * in a bucket. Don't update it if it's an out-of-order, skipped 312 * mapping. 313 */ 314 if (firstTrackedVLSN == NULL_VLSN) { 315 firstTrackedVLSN = vlsn; 316 } 317 } 318 319 /** 320 * Update the range with a newly arrived VLSN. 321 */ updateRange(VLSN vlsn, byte entryTypeNum)322 private void updateRange(VLSN vlsn, byte entryTypeNum) { 323 324 /* 325 * Get a reference to the volatile range field so that we always see 326 * consistent values. 327 */ 328 VLSNRange currentRange = range; 329 range = currentRange.getUpdateForNewMapping(vlsn, entryTypeNum); 330 } 331 332 /** 333 * Flush the tracker cache to disk. 334 * Ideally, we'd allow concurrent VLSN put() calls while a flush to database 335 * is happening. To do that, we need a more sophisticated synchronization 336 * scheme that defines the immutable vs. mutable portion of the tracker 337 * cache. See SR [#17689] 338 */ flushToDatabase(DatabaseImpl mappingDbImpl, Txn txn)339 synchronized void flushToDatabase(DatabaseImpl mappingDbImpl, Txn txn) { 340 341 VLSNRange flushRange = range; 342 343 if (bucketCache.size() == 0) { 344 /* 345 * No mappings to write out, but it's possible that the range 346 * changed due to a truncation, in which case we must update the 347 * database. RangeTruncated is used to reduce the chance that 348 * we are doing unnecessary writes of the range record to the db. 349 */ 350 if (rangeTruncated) { 351 lastOnDiskVLSN = flushRange.writeToDatabase(envImpl, 352 mappingDbImpl, txn); 353 rangeTruncated = false; 354 } 355 return; 356 } 357 358 /* 359 * Save information about the portion of the cache that we are trying 360 * to flush. Close off the last bucket, so the flush portion becomes 361 * immutable. In the past, this was in a synchronization block. 362 * This isn't strictly needed right now, since the method is currently 363 * fully synchronized, but is a good practice, and will be 364 * necessary if future changes make it possible to have concurrent 365 * puts. 366 */ 367 VLSNBucket lastBucket = bucketCache.get(bucketCache.lastKey()); 368 lastBucket.close(); 369 370 /* 371 * Write the flush portion to disk. Ideally, this portion would be done 372 * without synchronization on the tracker, to permit concurrent reads 373 * and writes to the tracker. Note that because there is no 374 * synchronization, the buckets must be flushed to disk in vlsn order, 375 * so that the vlsn index always looks consistent. There should be no 376 * gaps in the bucket range. This assumption permits concurrent access 377 * by VLSNIndex.getGTEBucketFromDatabase. 378 */ 379 VLSN currentLastVLSN = lastOnDiskVLSN; 380 for (Long key : bucketCache.keySet()) { 381 382 VLSNBucket target = bucketCache.get(key); 383 384 /* Do some sanity checking before we write this bucket to disk. */ 385 validateBeforeWrite(target, 386 flushRange, 387 currentLastVLSN, 388 target==lastBucket); 389 target.writeToDatabase(envImpl, mappingDbImpl, txn); 390 currentLastVLSN = target.getLast(); 391 } 392 393 lastOnDiskVLSN = flushRange.writeToDatabase(envImpl, 394 mappingDbImpl, txn); 395 rangeTruncated = false; 396 397 /* 398 * Update the tracker to remove the parts that have been written to 399 * disk. Update firstTrackedVLSN, lastOnDiskVLSN, and clear the bucket 400 * cache. 401 * 402 * In the current implementation, bucketCache is guaranteed not to 403 * change, so we can just clear the cache. If we had written the 404 * buckets out without synchronization, the cache might have gained 405 * mappings, and we wouuld have to be careful to set the 406 * firstTrackedVLSN to be the first VLSN of the remaining buckets, and 407 * to set lastTrackedVLSN to be the last VLSN in flushRange. 408 * 409 * We would also have to explicitly call SortedMap.remove() to remove 410 * the written buckets. Do not use TreeMap.tailSet, which would 411 * inadvertently leave us with the semantics of a SubMap, which has 412 * mandated key ranges. With rollbacks, we may end up adding buckets 413 * that are "repeats" in the future. 414 */ 415 bucketCache.clear(); 416 firstTrackedVLSN = NULL_VLSN; 417 } 418 419 /** 420 * Do some sanity checking before the write, to help prevent any corruption. 421 */ validateBeforeWrite(VLSNBucket target, VLSNRange flushRange, VLSN currentLastVLSN, boolean isLastBucket)422 private void validateBeforeWrite(VLSNBucket target, 423 VLSNRange flushRange, 424 VLSN currentLastVLSN, 425 boolean isLastBucket) { 426 427 if (!(currentLastVLSN.equals(NULL_VLSN)) && 428 (target.getFirst().compareTo(currentLastVLSN) <= 0)) { 429 throw EnvironmentFailureException.unexpectedState 430 (envImpl, "target bucket overlaps previous bucket. " + 431 "currentLastVLSN= " + currentLastVLSN + " target=" + 432 target); 433 } 434 435 if (target.getLast().compareTo(flushRange.getLast()) > 0) { 436 throw EnvironmentFailureException.unexpectedState 437 (envImpl, "target bucket exceeds flush range. " + 438 "range= " + flushRange + " target=" + 439 target); 440 } 441 442 if (isLastBucket) { 443 if (target.getLast().compareTo(flushRange.getLast()) != 0) { 444 throw EnvironmentFailureException.unexpectedState 445 (envImpl, "end of last bucket should match end of range. " + 446 "range= " + flushRange + " target=" + target); 447 } 448 } 449 } 450 truncateFromHead(VLSN deleteEnd, long deleteFileNum)451 synchronized void truncateFromHead(VLSN deleteEnd, long deleteFileNum) { 452 /* 453 * Update range first, to prevent others from attempting to read the 454 * truncated portions. Update the firstTrackedLSN last, after the 455 * buckets are removed; getLTEBucket relies on the firstTrackedLSN 456 * value. 457 */ 458 VLSNRange oldRange = range; 459 range = oldRange.shortenFromHead(deleteEnd); 460 461 /* 462 * Ensure that the range is written to the db even if the resulting 463 * number of buckets is 0. 464 */ 465 rangeTruncated = true; 466 467 /* 468 * afterDelete may not == range.getFirst, becayse range.getFirst may 469 * be NULL_VLSN if the entire vlsn index has been deleted. 470 */ 471 VLSN afterDelete = deleteEnd.getNext(); 472 473 /** 474 * Check if the cached buckets needs to be modified. Suppose vlsns 1-8 475 * are on disk, and vlsns 12->40 are in the tracker cache, with an out 476 * of order gap from 9 - 11. 477 * 478 * case 1: deleteEnd <= 8, Do not need to truncate anything in the 479 * bucket cacke, but we may have to make a ghost bucket to fill the 480 * gap. 481 * case 2: deleteEnd == 11, we don't need to modify the cache or 482 * create a ghost bucket. 483 * case 3: deleteEnd >8 and < 11 is in the gap, we need to create a 484 * ghost bucket to hold the beginning of the range.. 485 * case 4: deleteEnd >= 12, we need to modify the bucket cache. 486 */ 487 if (!lastOnDiskVLSN.equals(VLSN.NULL_VLSN) && 488 (lastOnDiskVLSN.compareTo(deleteEnd) >= 0)) { 489 490 /* 491 * case 1: the buckets in the tracker cache are unaffected, all 492 * the pertinent mappings are on disk. 493 */ 494 if (lastOnDiskVLSN.equals(deleteEnd)) { 495 if (firstTrackedVLSN.compareTo(afterDelete) > 0) { 496 checkForGhostBucket(bucketCache, deleteFileNum); 497 } 498 lastOnDiskVLSN = NULL_VLSN; 499 } 500 return; 501 } 502 503 assert(!firstTrackedVLSN.equals(NULL_VLSN)); 504 505 if (firstTrackedVLSN.equals(afterDelete)) { 506 /* case 2: The tracker is lined up perfectly with the new range. */ 507 lastOnDiskVLSN = NULL_VLSN; 508 return; 509 } 510 511 if (firstTrackedVLSN.compareTo(afterDelete) > 0) { 512 /* 513 * case 3: we have to make a ghost bucket. 514 */ 515 checkForGhostBucket(bucketCache, deleteFileNum); 516 lastOnDiskVLSN = NULL_VLSN; 517 return; 518 } 519 520 /* 521 * case 4: we have to prune the buckets. 522 */ 523 VLSNBucket owningBucket = getLTEBucket(deleteEnd); 524 Long retainBucketKey = owningBucket.getFirst().getNext().getSequence(); 525 SortedMap<Long, VLSNBucket> tail; 526 try { 527 /* 528 * We need to chop off part of the bucket cache. Find the portion 529 * to retain. 530 */ 531 tail = bucketCache.tailMap(retainBucketKey); 532 checkForGhostBucket(tail, deleteFileNum); 533 bucketCache = tail; 534 } catch (NoSuchElementException e) { 535 firstTrackedVLSN = NULL_VLSN; 536 bucketCache = new TreeMap<Long, VLSNBucket>(); 537 } 538 539 lastOnDiskVLSN = NULL_VLSN; 540 } 541 checkForGhostBucket(SortedMap<Long, VLSNBucket> buckets, long deleteFileNum)542 private void checkForGhostBucket(SortedMap<Long, VLSNBucket> buckets, 543 long deleteFileNum) { 544 /* 545 * The range has just been truncated. If the first bucket's first vlsn 546 * != the first in range, insert a GhostBucket. 547 */ 548 Long firstKey = buckets.firstKey(); 549 VLSNBucket firstBucket = buckets.get(firstKey); 550 VLSN firstRangeVLSN = range.getFirst(); 551 VLSN firstBucketVLSN = firstBucket.getFirst(); 552 if (!firstBucketVLSN.equals(firstRangeVLSN)) { 553 long nextFile = 554 envImpl.getFileManager().getFollowingFileNum(deleteFileNum, 555 true /* forward */); 556 long lastPossibleLsn = firstBucket.getLsn(firstBucketVLSN); 557 VLSNBucket placeholder = new GhostBucket(firstRangeVLSN, 558 DbLsn.makeLsn(nextFile, 0), 559 lastPossibleLsn); 560 buckets.put(firstRangeVLSN.getSequence(), placeholder); 561 } 562 } 563 564 /** 565 * Remove the mappings for VLSNs >= deleteStart. This should only be used 566 * at syncup, or recovery. We can assume no new mappings are coming in, but 567 * the VLSNIndex may be read during this time; checkpoints continue to 568 * happen during rollbacks. Since syncup is always at a sync-able log 569 * entry, and since we don't roll back past a commit, we can assume that 570 * the lastSync can be changed to deleteStart.getPrev(), and lastTxnEnd is 571 * unchanged. The VLSNRange itself is updated before the tracker and 572 * on-disk buckets are pruned. 573 * 574 * The VLSNIndex guarantees that the beginning and end vlsns in the range 575 * have mappings so LTE and GTE search operations work. When truncating the 576 * mappings in the tracker, check to see if the new end vlsn has a mapping, 577 * since that may not happen if the buckets are cut in the middle, or have 578 * a gap due to out of order mapping. For example, suppose mapping for vlsn 579 * 20 came before the mappings for vlsn 17, 18 and 19. The buckets would 580 * have this gap: 581 * Bucket A: vlsns 10-16 582 * Bucket B: vlsns 20-25 583 * Bucket C: vlsns 26 - 30 584 * If the new end of vlsn range is 11-14, 17, 18, 19, 21 - 24, or 27-29, 585 * we have to end the vlsn tracker with a new mapping of 586 * (vlsn (deleteStart-1) -> prevLsn) to cap off the buckets. 587 * 588 * Cases: 589 * End of range is in the gap -- either 17, 18, 19 590 * 1. on disk buckets exist, but don't encompass the gap: bucket A on disk 591 * 2. all buckets are in the tracker cache: bucket A in tracker 592 * 3. on disk buckets encompass the gap: bucket A and B on disk. 593 */ truncateFromTail(VLSN deleteStart, long prevLsn)594 synchronized void truncateFromTail(VLSN deleteStart, long prevLsn) { 595 596 /* 597 * Update the range first, it will define the range that the vlsnIndex 598 * covers. Then adjust any mappings held in the bucket cache. Don't 599 * update the lastOnDiskVLSN marker, which says what is on disk. Since 600 * the on-disk portion may also get truncated, the caller will update 601 * the lastOnDiskVLSN field when it inspects the database. 602 */ 603 VLSNRange oldRange = range; 604 range = oldRange.shortenFromEnd(deleteStart); 605 606 /* 607 * Ensure that the range is written to the db even if the resulting 608 * number of buckets is 0. 609 */ 610 rangeTruncated = true; 611 612 if (firstTrackedVLSN.equals(NULL_VLSN)) { 613 /* No mappings in this tracker */ 614 return; 615 } 616 617 if (firstTrackedVLSN.compareTo(deleteStart) >= 0) { 618 619 /* 620 * Everything in this tracker should be removed. In addition, the 621 * caller may be removing some items from the database. 622 */ 623 bucketCache.clear(); 624 firstTrackedVLSN = NULL_VLSN; 625 return; 626 } 627 628 /* 629 * We need to do some pruning of the bucket in the cache. Find the 630 * buckets in the bucketCache that have mappings >= deleteStart, and 631 * remove their mappings. First establish the headset of buckets that 632 * should be preserved. 633 */ 634 VLSNBucket targetBucket = getGTEBucket(deleteStart); 635 Long targetKey = targetBucket.getFirst().getSequence(); 636 637 /* The newCache has buckets that should be preserved. */ 638 SortedMap<Long, VLSNBucket> newCache = 639 new TreeMap<Long, VLSNBucket>(bucketCache.headMap(targetKey)); 640 641 /* 642 * Prune any mappings >= deleteStart out of targetBucket. If it has any 643 * mappings left, add it to newCache. 644 */ 645 targetBucket.removeFromTail(deleteStart, prevLsn); 646 if (!targetBucket.empty()) { 647 newCache.put(targetBucket.getFirst().getSequence(), 648 targetBucket); 649 } 650 651 bucketCache = newCache; 652 653 /* 654 * Now all truncated mappings have been removed from the index. Check 655 * that the end point of the vlsn range is represented in this 656 * tracker. Since vlsn mappings can come out of order, it's 657 * possible that the buckets have a gap, and that truncation will 658 * cleave the bucket cache just at a point where a mapping is 659 * missing. For example, vlsn mappings come in this order: 660 * vlsn 16, vlsn 18, vlsn 17 661 * causing the buckets to have this gap: 662 * bucket N: vlsn 10 - 16 663 * bucket N+1: vlsn 18 - 20 664 * and that the vlsn range is truncated with a deleteStart of vlsn 18. 665 * The new end range becomes 17, but the buckets do not have a 666 * mapping for vlsn 17. If so, create a mapping now. 667 * 668 * If we are at this point, we know that the tracker should contain 669 * a mapping for the last VLSN in the range. Fix it now, so the 670 * datastructure is as tidy as possible. 671 */ 672 boolean needEndMapping = false; 673 if (bucketCache.isEmpty()) { 674 needEndMapping = true; 675 } else { 676 VLSNBucket lastBucket = bucketCache.get(bucketCache.lastKey()); 677 needEndMapping = 678 (lastBucket.getLast().compareTo(range.getLast()) < 0); 679 } 680 681 if (needEndMapping) { 682 addEndMapping(range.getLast(), prevLsn); 683 } 684 } 685 686 /** 687 * Called by the VLSNIndex to see if we have to add a bucket in the tracker 688 * after pruning the on-disk storage. The get{LTE,GTE}Bucket methods 689 * assume that there are mappings for the first and last VLSN in the range. 690 * But since out of order vlsn mappings can cause non-contiguous buckets, 691 * the pruning may have lost the end mapping. [#23491] 692 */ ensureRangeEndIsMapped(VLSN lastVLSN, long lastLsn)693 synchronized void ensureRangeEndIsMapped(VLSN lastVLSN, long lastLsn) { 694 695 /** 696 * if lastOnDiskVLSN < lastVLSN < firstTrackedVLSN or 697 * lastOnDiskVLSN < lastVLSN and firstTrackedVLSN is null 698 * then we need to add a mapping for lastVLSN->lastLsn. Otherwise a 699 * mapping already exists. 700 */ 701 if (lastOnDiskVLSN.compareTo(lastVLSN) < 0) { 702 703 /* 704 * The on-disk vlsn mappings aren't sufficient to cover the 705 * lastVLSN. There needs to be a mapping in the tracker for the 706 * end point of the range. 707 */ 708 if (!firstTrackedVLSN.equals(NULL_VLSN)) { 709 710 /* 711 * There are mappings in the tracker, so we can assume 712 * that they cover the end point, because truncateFromTail() 713 * should have adjusted for that. But assert to be sure! 714 */ 715 if (lastVLSN.compareTo(firstTrackedVLSN) < 0) { 716 throw EnvironmentFailureException.unexpectedState 717 (envImpl, 718 "Expected this tracker to cover vlsn " + lastVLSN + 719 " after truncateFromTail. LastOnDiskVLSN= " + 720 lastOnDiskVLSN + " tracker=" + this); 721 } 722 723 return; 724 } 725 726 /* 727 * There are no mappings in the tracker. The on disk mappings were 728 * pruned, and a gap was created between what was on disk and what 729 * is in the tracker cache. Add a mapping. 730 */ 731 addEndMapping(lastVLSN, lastLsn); 732 } 733 } 734 735 /** 736 * Add a bucket holding a lastVLSN -> lastLsn mapping 737 */ addEndMapping(VLSN lastVLSN, long lastLsn)738 private void addEndMapping(VLSN lastVLSN, long lastLsn) { 739 /* Assert that it's in the range */ 740 assert (lastVLSN.compareTo(range.getLast()) == 0) : 741 "lastVLSN=" + lastVLSN + " lastLsn = " + lastLsn + 742 " range=" + range; 743 744 VLSNBucket addBucket = 745 new VLSNBucket(DbLsn.getFileNumber(lastLsn), stride, 746 maxMappings, maxDistance, lastVLSN); 747 addBucket.put(lastVLSN, lastLsn); 748 nBucketsCreated.increment(); 749 bucketCache.put(addBucket.getFirst().getSequence(), addBucket); 750 if (firstTrackedVLSN.equals(NULL_VLSN)) { 751 firstTrackedVLSN = lastVLSN; 752 } 753 } 754 755 /** 756 * Attempt to replace the mappings in this vlsnIndex for 757 * deleteStart->lastVLSN with those from the recovery mapper. 758 * Since we do not supply a prevLsn argument, we may not be able to "cap" 759 * the truncated vlsn bucket the same what that we can in 760 * truncateFromTail. Because of that, we may need to remove mappings that 761 * are < deleteStart. 762 * 763 * For example, suppose a bucket holds mappings 764 * 10 -> 101 765 * 15 -> 201 766 * 18 -> 301 767 * If deleteStart == 17, we will have to delete all the way to vlsn 15. 768 * 769 * The maintenance of VLSNRange.lastSync and lastTxnEnd are simplified 770 * because of assumptions we can make because we are about to append 771 * mappings found by the recovery tracker that begin at deleteStart. If 772 * lastSync and lastTxnEnd are <= deleteStart, we know that they will 773 * either stay the same, or be replaced by lastSync and lastTxnEnd from the 774 * recoveryTracker. Even we delete mappings that are < deleteStart, we know 775 * that we did not roll back past an abort or commit, so that we do not 776 * have to updated the lastSync or lastTxnEnd value. 777 */ merge(VLSN prunedLastOnDiskVLSN, VLSNRecoveryTracker recoveryTracker)778 void merge(VLSN prunedLastOnDiskVLSN, VLSNRecoveryTracker recoveryTracker) { 779 780 VLSNRange oldRange = range; 781 range = oldRange.merge(recoveryTracker.range); 782 VLSN recoveryFirst = recoveryTracker.getRange().getFirst(); 783 784 lastOnDiskVLSN = prunedLastOnDiskVLSN; 785 786 /* 787 * Find the buckets in the bucketCache that have mappings >= 788 * recoveryFirst, and remove their mappings. First establish the 789 * headset of buckets that should be preserved. At this point, we 790 * have already pruned the database, so the bucket set may not 791 * be fully contiguous -- we may have pruned away mappings that 792 * would normally be in the database. 793 */ 794 if (bucketCache.size() == 0) { 795 /* Just take all the recovery tracker's mappings */ 796 bucketCache = recoveryTracker.bucketCache; 797 } else { 798 VLSNBucket targetBucket = getGTEBucket(recoveryFirst); 799 Long targetKey = targetBucket.getFirst().getSequence(); 800 801 /* The newCache holds buckets that should be preserved. */ 802 SortedMap<Long, VLSNBucket> newCache = 803 new TreeMap<Long, VLSNBucket>(bucketCache.headMap(targetKey)); 804 805 /* 806 * Prune any mappings >= recoveryFirst out of targetBucket. If it 807 * has any mappings left, add it to newCache. 808 */ 809 targetBucket.removeFromTail(recoveryFirst, DbLsn.NULL_LSN); 810 if (!targetBucket.empty()) { 811 newCache.put(targetBucket.getFirst().getSequence(), 812 targetBucket); 813 } 814 815 newCache.putAll(recoveryTracker.bucketCache); 816 bucketCache = newCache; 817 } 818 819 if (bucketCache.size() > 0) { 820 VLSNBucket firstBucket = bucketCache.get(bucketCache.firstKey()); 821 firstTrackedVLSN = firstBucket.getFirst(); 822 } 823 } 824 append(VLSNRecoveryTracker recoveryTracker)825 void append(VLSNRecoveryTracker recoveryTracker) { 826 827 /* 828 * This method assume that there is no overlap between this tracker 829 * and the recovery tracker. Everything in this tracker should precede 830 * the recovery tracker. 831 */ 832 if (!range.getLast().isNull()) { 833 if (range.getLast().compareTo 834 (recoveryTracker.getFirstTracked()) >= 0) { 835 836 throw EnvironmentFailureException.unexpectedState 837 (envImpl, 838 "Expected this tracker to precede recovery tracker. " + 839 "This tracker= " + this + " recoveryTracker = " + 840 recoveryTracker); 841 } 842 } 843 844 bucketCache.putAll(recoveryTracker.bucketCache); 845 VLSNRange currentRange = range; 846 range = currentRange.getUpdate(recoveryTracker.getRange()); 847 if (bucketCache.size() > 0) { 848 VLSNBucket firstBucket = bucketCache.get(bucketCache.firstKey()); 849 firstTrackedVLSN = firstBucket.getFirst(); 850 } 851 } 852 getRange()853 VLSNRange getRange() { 854 return range; 855 } 856 857 @Override toString()858 public String toString() { 859 StringBuilder sb = new StringBuilder(); 860 sb.append(range); 861 sb.append(" firstTracked=").append(firstTrackedVLSN); 862 sb.append(" lastOnDiskVLSN=").append(lastOnDiskVLSN); 863 864 for (VLSNBucket bucket: bucketCache.values()) { 865 sb.append("\n"); 866 sb.append(bucket); 867 } 868 return sb.toString(); 869 } 870 871 /** 872 * For unit test support. 873 */ verify(boolean verbose)874 synchronized boolean verify(boolean verbose) { 875 876 if (!range.verify(verbose)) { 877 return false; 878 } 879 880 /* Check that all the buckets are in order. */ 881 ArrayList<VLSN> firstVLSN = new ArrayList<VLSN>(); 882 ArrayList<VLSN> lastVLSN = new ArrayList<VLSN>(); 883 884 for (VLSNBucket b : bucketCache.values()) { 885 firstVLSN.add(b.getFirst()); 886 lastVLSN.add(b.getLast()); 887 } 888 889 if (!verifyBucketBoundaries(firstVLSN, lastVLSN)) { 890 return false; 891 } 892 893 if (firstVLSN.size() > 0) { 894 if (!firstVLSN.get(0).equals(firstTrackedVLSN)) { 895 if (verbose) { 896 System.err.println("firstBucketVLSN = " + 897 firstVLSN.get(0) + " should equal " + 898 firstTrackedVLSN); 899 } 900 return false; 901 } 902 903 VLSN lastBucketVLSN = lastVLSN.get(lastVLSN.size() -1); 904 if (!lastBucketVLSN.equals(range.getLast())) { 905 if (verbose) { 906 System.err.println("lastBucketVLSN = " + lastBucketVLSN + 907 " should equal " + range.getLast()); 908 } 909 return false; 910 } 911 } 912 913 return true; 914 } 915 916 /* 917 * Used to check bucket boundaries for both the in-memory flush list and 918 * the on-disk database buckets. Buckets may not be adjacent, but must be 919 * in order. 920 */ verifyBucketBoundaries(ArrayList<VLSN> firstVLSN, ArrayList<VLSN> lastVLSN)921 static boolean verifyBucketBoundaries(ArrayList<VLSN> firstVLSN, 922 ArrayList<VLSN> lastVLSN) { 923 for (int i = 1; i < firstVLSN.size(); i++) { 924 VLSN first = firstVLSN.get(i); 925 VLSN prevLast = lastVLSN.get(i-1); 926 if (first.compareTo(prevLast.getNext()) < 0) { 927 System.out.println("Boundary problem: bucket " + i + 928 " first " + first + 929 " follows bucket.last " + prevLast); 930 return false; 931 } 932 } 933 return true; 934 } 935 getFirstTracked()936 VLSN getFirstTracked() { 937 return firstTrackedVLSN; 938 } 939 getLastOnDisk()940 VLSN getLastOnDisk() { 941 return lastOnDiskVLSN; 942 } 943 setLastOnDiskVLSN(VLSN lastOnDisk)944 void setLastOnDiskVLSN(VLSN lastOnDisk) { 945 lastOnDiskVLSN = lastOnDisk; 946 } 947 948 /* 949 * For unit test support only. Can only be called when replication stream 950 * is quiescent. 951 */ isFlushedToDisk()952 boolean isFlushedToDisk() { 953 return lastOnDiskVLSN.equals(range.getLast()); 954 } 955 } 956