1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2002, 2014 Oracle and/or its affiliates. All rights reserved. 5 * 6 */ 7 8 package com.sleepycat.je.recovery; 9 10 import java.util.HashMap; 11 import java.util.HashSet; 12 import java.util.IdentityHashMap; 13 import java.util.Iterator; 14 import java.util.Map; 15 import java.util.Set; 16 import java.util.SortedMap; 17 import java.util.TreeMap; 18 19 import com.sleepycat.je.CacheMode; 20 import com.sleepycat.je.DatabaseException; 21 import com.sleepycat.je.dbi.DatabaseId; 22 import com.sleepycat.je.dbi.DatabaseImpl; 23 import com.sleepycat.je.dbi.DbTree; 24 import com.sleepycat.je.dbi.EnvironmentImpl; 25 import com.sleepycat.je.dbi.INList; 26 import com.sleepycat.je.dbi.MemoryBudget; 27 import com.sleepycat.je.recovery.Checkpointer.CheckpointReference; 28 import com.sleepycat.je.tree.BIN; 29 import com.sleepycat.je.tree.IN; 30 import com.sleepycat.je.tree.MapLN; 31 import com.sleepycat.je.utilint.TestHookExecute; 32 33 /** 34 * Manages the by-level map of checkpoint references that are to be flushed by 35 * a checkpoint or Database.sync, the MapLNs to be flushed, the highest level 36 * by database to be flushed, and the state of the checkpoint. 37 * 38 * An single instance of this class is used for checkpoints and has the same 39 * lifetime as the checkpointer and environment. An instance per Database.sync 40 * is created as needed. Only one checkpoint can occur at a time, but multiple 41 * syncs may occur concurrently with each other and with the checkpoint. 42 * 43 * The methods in this class are synchronized to protect internal state from 44 * concurrent access by the checkpointer and eviction, and to coordinate state 45 * changes between the two. Eviction must participate in the checkpoint so 46 * that INs cascade up properly; see coordinateEvictionWithCheckpoint. 47 * 48 * When INs are latched along with synchronization on a DirtyINMap, the order 49 * must be: 1) IN latches and 2) synchronize on DirtyINMap. For example, 50 * the evictor latches the parent and child IN before calling the synchronized 51 * method coordinateEvictionWithCheckpoint, and selectDirtyINsForCheckpoint 52 * latches the IN before calling the synchronized method selectForCheckpoint. 53 */ 54 class DirtyINMap { 55 56 private final EnvironmentImpl envImpl; 57 private final SortedMap<Integer, Map<Long, CheckpointReference>> levelMap; 58 private int numEntries; 59 private final Set<DatabaseId> mapLNsToFlush; 60 private final Map<DatabaseImpl, Integer> highestFlushLevels; 61 62 enum CkptState { 63 /** No checkpoint in progress, or is used for Database.sync. */ 64 NONE, 65 /** Checkpoint started but dirty map is not yet complete. */ 66 DIRTY_MAP_INCOMPLETE, 67 /** Checkpoint in progress and dirty map is complete. */ 68 DIRTY_MAP_COMPLETE, 69 }; 70 71 private CkptState ckptState; 72 private boolean ckptFlushAll; 73 private boolean ckptFlushExtraLevel; 74 DirtyINMap(EnvironmentImpl envImpl)75 DirtyINMap(EnvironmentImpl envImpl) { 76 this.envImpl = envImpl; 77 levelMap = new TreeMap<Integer, Map<Long, CheckpointReference>>(); 78 numEntries = 0; 79 mapLNsToFlush = new HashSet<DatabaseId>(); 80 highestFlushLevels = new IdentityHashMap<DatabaseImpl, Integer>(); 81 ckptState = CkptState.NONE; 82 } 83 84 /** 85 * Coordinates an eviction with an in-progress checkpoint and returns 86 * whether or not provisional logging is needed. 87 * 88 * @return true if the target must be logged provisionally. 89 */ coordinateEvictionWithCheckpoint(IN target, IN parent)90 synchronized boolean coordinateEvictionWithCheckpoint(IN target, 91 IN parent) { 92 final DatabaseImpl db = target.getDatabase(); 93 94 /* 95 * If the checkpoint is in-progress and has not finished dirty map 96 * construction, we must add the parent to the dirty map. That way the 97 * dirtiness and logging will cascade up in the same way as if the 98 * target were not evicted, and instead were encountered during dirty 99 * map construction. We don't want the evictor's actions to introduce 100 * an IN in the log that has not cascaded up properly. 101 * 102 * Note that we add the parent even if it is not dirty here. It will 103 * become dirty after the target child is logged, but that hasn't 104 * happened yet. 105 * 106 * We do not add the parent if it is null, which is the case when the 107 * root is being evicted. 108 */ 109 if (ckptState == CkptState.DIRTY_MAP_INCOMPLETE && 110 parent != null) { 111 /* Add latched parent IN to dirty map. */ 112 selectForCheckpoint(parent); 113 /* Save dirty/temp DBs for later. */ 114 saveMapLNsToFlush(parent); 115 } 116 117 /* 118 * The evictor has to log provisionally in three cases: 119 * 120 * 1 - The eviction target is part of a deferred write database. 121 */ 122 if (db.isDeferredWriteMode()) { 123 return true; 124 } 125 126 /* 127 * 2 - The checkpoint is in-progress and has not finished dirty map 128 * construction, and the target is not the root. The parent IN has 129 * been added to the dirty map, so we know the child IN is at a 130 * level below the max flush level. 131 */ 132 if (ckptState == CkptState.DIRTY_MAP_INCOMPLETE && 133 parent != null) { 134 return true; 135 } 136 137 /* 138 * 3 - The checkpoint is in-progress and has finished dirty map 139 * construction, and is at a level above the eviction target. 140 */ 141 if (ckptState == CkptState.DIRTY_MAP_COMPLETE && 142 target.getLevel() < getHighestFlushLevel(db)) { 143 return true; 144 } 145 146 /* Otherwise, log non-provisionally. */ 147 return false; 148 } 149 150 /** 151 * Must be called before starting a checkpoint, and must not be called for 152 * Database.sync. Updates memory budget and sets checkpoint state. 153 */ beginCheckpoint(boolean flushAll, boolean flushExtraLevel)154 synchronized void beginCheckpoint(boolean flushAll, 155 boolean flushExtraLevel) { 156 assert levelMap.isEmpty(); 157 assert mapLNsToFlush.isEmpty(); 158 assert highestFlushLevels.isEmpty(); 159 assert numEntries == 0; 160 assert ckptState == CkptState.NONE; 161 ckptState = CkptState.DIRTY_MAP_INCOMPLETE; 162 ckptFlushAll = flushAll; 163 ckptFlushExtraLevel = flushExtraLevel; 164 } 165 166 /** 167 * Must be called after a checkpoint or Database.sync is complete. Updates 168 * memory budget and clears checkpoint state. 169 */ reset()170 synchronized void reset() { 171 removeCostFromMemoryBudget(); 172 levelMap.clear(); 173 mapLNsToFlush.clear(); 174 highestFlushLevels.clear(); 175 numEntries = 0; 176 ckptState = CkptState.NONE; 177 } 178 179 /** 180 * Scan the INList for all dirty INs, excluding temp DB INs. Save them in 181 * a tree-level ordered map for level ordered flushing. 182 * 183 * Take this opportunity to recalculate the memory budget tree usage. 184 * 185 * This method itself is not synchronized to allow concurrent eviction. 186 * Synchronization is performed on a per-IN basis to protect the data 187 * structures here, and eviction can occur in between INs. 188 */ selectDirtyINsForCheckpoint()189 void selectDirtyINsForCheckpoint() 190 throws DatabaseException { 191 192 assert ckptState == CkptState.DIRTY_MAP_INCOMPLETE; 193 194 /* 195 * Opportunistically recalculate the INList memory budget while 196 * traversing the entire INList. 197 */ 198 final INList inMemINs = envImpl.getInMemoryINs(); 199 inMemINs.memRecalcBegin(); 200 boolean completed = false; 201 try { 202 for (IN in : inMemINs) { 203 in.latchShared(CacheMode.UNCHANGED); 204 try { 205 inMemINs.memRecalcIterate(in); 206 /* Add latched IN to dirty map. */ 207 if (in.getDirty()) { 208 selectForCheckpoint(in); 209 } 210 /* Save dirty/temp DBs for later. */ 211 saveMapLNsToFlush(in); 212 } finally { 213 in.releaseLatch(); 214 } 215 216 /* Call test hook after releasing latch. */ 217 TestHookExecute.doHookIfSet 218 (Checkpointer.examineINForCheckpointHook, in); 219 } 220 completed = true; 221 } finally { 222 inMemINs.memRecalcEnd(completed); 223 } 224 225 /* 226 * Finish filling out the highestFlushLevels map. For each entry in 227 * highestFlushLevels that has a null level Integer value (set by 228 * selectForCheckpoint), we call DbTree.getHighestLevel and replace the 229 * null level. We must call DbTree.getHighestLevel, which latches the 230 * root, only when not synchronized, to avoid breaking the 231 * synchronization rules described in the class comment. This must be 232 * done in several steps to follow the sychronization rules, yet 233 * protect the highestFlushLevels using synchronization. 234 */ 235 final Map<DatabaseImpl, Integer> maxFlushDbs = 236 new HashMap<DatabaseImpl, Integer>(); 237 238 /* Copy entries with a null level. */ 239 synchronized (this) { 240 for (DatabaseImpl db : highestFlushLevels.keySet()) { 241 if (highestFlushLevels.get(db) == null) { 242 maxFlushDbs.put(db, null); 243 } 244 } 245 } 246 247 /* Call getHighestLevel without synchronization. */ 248 final DbTree dbTree = envImpl.getDbTree(); 249 for (Map.Entry<DatabaseImpl, Integer> entry : maxFlushDbs.entrySet()) { 250 entry.setValue(dbTree.getHighestLevel(entry.getKey())); 251 } 252 253 /* Fill in levels in highestFlushLevels. */ 254 synchronized (this) { 255 for (Map.Entry<DatabaseImpl, Integer> entry : 256 maxFlushDbs.entrySet()) { 257 highestFlushLevels.put(entry.getKey(), entry.getValue()); 258 } 259 } 260 261 /* Complete this phase of the checkpoint. */ 262 synchronized (this) { 263 addCostToMemoryBudget(); 264 ckptState = CkptState.DIRTY_MAP_COMPLETE; 265 } 266 } 267 268 /** 269 * Adds the IN to the dirty map unless it belongs to a temp DB, whether or 270 * not the IN is dirty. Also updates the highest flush level map. 271 */ selectForCheckpoint(IN in)272 private synchronized void selectForCheckpoint(IN in) { 273 /* Do not checkpoint temporary databases. */ 274 final DatabaseImpl db = in.getDatabase(); 275 if (db.isTemporary()) { 276 return; 277 } 278 279 Integer level = addIN(in, false /*updateMemoryBudget*/); 280 281 /* 282 * IN was added to the dirty map. Update the highest level seen 283 * for the database. Use one level higher when ckptFlushExtraLevel 284 * is set. When ckptFlushAll is set, use the maximum level for the 285 * database. Durable deferred-write databases must be synced, so 286 * also use the maximum level. 287 * 288 * Always flush at least one level above the bottom-most BIN level so 289 * that the BIN level is logged provisionally and the expense of 290 * processing BINs during recovery is avoided. 291 */ 292 if (ckptFlushAll || db.isDurableDeferredWrite()) { 293 if (!highestFlushLevels.containsKey(db)) { 294 295 /* 296 * Null is used as an indicator that getHighestLevel should be 297 * called in selectDirtyINsForCheckpoint, when not 298 * synchronized. 299 */ 300 highestFlushLevels.put(db, null); 301 } 302 } else { 303 int levelVal = level.intValue(); 304 if (ckptFlushExtraLevel || in.isBIN()) { 305 if (in.isRoot()) { 306 /* No reason to go above DB root. */ 307 } else { 308 /* Next level up in the same tree. */ 309 levelVal += 1; 310 level = levelVal; 311 } 312 } 313 final Integer highestLevelSeen = highestFlushLevels.get(db); 314 if (highestLevelSeen == null || 315 levelVal > highestLevelSeen.intValue()) { 316 highestFlushLevels.put(db, level); 317 } 318 } 319 } 320 321 /** 322 * Scan the INList for all dirty INs for a given database. Arrange them in 323 * level sorted map for level ordered flushing. 324 * 325 * This method is not synchronized to allow concurrent eviction. 326 * Coordination between eviction and Database.sync is not required. 327 */ selectDirtyINsForDbSync(DatabaseImpl dbImpl)328 void selectDirtyINsForDbSync(DatabaseImpl dbImpl) 329 throws DatabaseException { 330 331 assert ckptState == CkptState.NONE; 332 333 final DatabaseId dbId = dbImpl.getId(); 334 335 for (IN in : envImpl.getInMemoryINs()) { 336 if (in.getDatabaseId().equals(dbId)) { 337 in.latch(CacheMode.UNCHANGED); 338 try { 339 if (in.getDirty()) { 340 addIN(in, false /*updateMemoryBudget*/); 341 } 342 } finally { 343 in.releaseLatch(); 344 } 345 } 346 } 347 348 /* 349 * Create a single entry map that forces all levels of this DB to 350 * be flushed. 351 */ 352 highestFlushLevels.put 353 (dbImpl, 354 Integer.valueOf(envImpl.getDbTree().getHighestLevel(dbImpl))); 355 356 /* Add the dirty map to the memory budget. */ 357 addCostToMemoryBudget(); 358 } 359 getHighestFlushLevel(DatabaseImpl db)360 synchronized int getHighestFlushLevel(DatabaseImpl db) { 361 362 assert ckptState != CkptState.DIRTY_MAP_INCOMPLETE; 363 364 /* 365 * This method is only called while flushing dirty nodes for a 366 * checkpoint or Database.sync, not for an eviction, so an entry for 367 * this database should normally exist. However, if the DB root (and 368 * DatabaseImpl) have been evicted since the highestFlushLevels was 369 * constructed, the new DatabaseImpl instance will not be present in 370 * the map. In this case, we do not need to checkpoint the IN and 371 * eviction should be non-provisional. 372 */ 373 Integer val = highestFlushLevels.get(db); 374 return (val != null) ? val.intValue() : IN.MIN_LEVEL; 375 } 376 getNumLevels()377 synchronized int getNumLevels() { 378 return levelMap.size(); 379 } 380 addCostToMemoryBudget()381 private synchronized void addCostToMemoryBudget() { 382 final MemoryBudget mb = envImpl.getMemoryBudget(); 383 final int cost = numEntries * MemoryBudget.CHECKPOINT_REFERENCE_SIZE; 384 mb.updateAdminMemoryUsage(cost); 385 } 386 removeCostFromMemoryBudget()387 private synchronized void removeCostFromMemoryBudget() { 388 final MemoryBudget mb = envImpl.getMemoryBudget(); 389 final int cost = numEntries * MemoryBudget.CHECKPOINT_REFERENCE_SIZE; 390 mb.updateAdminMemoryUsage(0 - cost); 391 } 392 393 /** 394 * Add a node unconditionally to the dirty map. The dirty map is keyed by 395 * level (Integers) and holds sets of IN references. 396 * 397 * @param updateMemoryBudget if true then update the memory budget as the 398 * map is changed; if false then addCostToMemoryBudget must be called 399 * later. 400 * 401 * @return level of IN added to the dirty map. The level is returned 402 * rather than a boolean simply to avoid allocating another Integer in the 403 * caller. 404 */ addIN(IN in, boolean updateMemoryBudget)405 synchronized Integer addIN(IN in, boolean updateMemoryBudget) { 406 407 final Integer level = Integer.valueOf(in.getLevel()); 408 final Map<Long, CheckpointReference> nodeMap; 409 if (levelMap.containsKey(level)) { 410 nodeMap = levelMap.get(level); 411 } else { 412 nodeMap = new TreeMap<Long, CheckpointReference>(); 413 levelMap.put(level, nodeMap); 414 } 415 416 nodeMap.put(in.getNodeId(), 417 new CheckpointReference(in.getDatabase().getId(), 418 in.getNodeId(), 419 in.getLevel(), 420 in.isDbRoot(), 421 in.getIdentifierKey())); 422 numEntries++; 423 424 if (updateMemoryBudget) { 425 final MemoryBudget mb = envImpl.getMemoryBudget(); 426 mb.updateAdminMemoryUsage 427 (MemoryBudget.CHECKPOINT_REFERENCE_SIZE); 428 } 429 430 return level; 431 } 432 433 /** 434 * Get the lowest level currently stored in the map. 435 */ getLowestLevelSet()436 synchronized Integer getLowestLevelSet() { 437 return levelMap.firstKey(); 438 } 439 440 /** 441 * Removes the set corresponding to the given level. 442 */ removeLevel(Integer level)443 synchronized void removeLevel(Integer level) { 444 levelMap.remove(level); 445 } 446 containsNode(Integer level, Long nodeId)447 synchronized boolean containsNode(Integer level, Long nodeId) { 448 final Map<Long, CheckpointReference> nodeMap = levelMap.get(level); 449 if (nodeMap != null) { 450 return nodeMap.containsKey(nodeId); 451 } 452 return false; 453 } 454 removeNode(Integer level, Long nodeId)455 synchronized CheckpointReference removeNode(Integer level, Long nodeId) { 456 final Map<Long, CheckpointReference> nodeMap = levelMap.get(level); 457 if (nodeMap != null) { 458 return nodeMap.remove(nodeId); 459 } 460 return null; 461 } 462 removeNextNode(Integer level)463 synchronized CheckpointReference removeNextNode(Integer level) { 464 final Map<Long, CheckpointReference> nodeMap = levelMap.get(level); 465 if (nodeMap != null) { 466 final Iterator<Map.Entry<Long, CheckpointReference>> iter = 467 nodeMap.entrySet().iterator(); 468 if (iter.hasNext()) { 469 final CheckpointReference ref = iter.next().getValue(); 470 iter.remove(); 471 return ref; 472 } 473 } 474 return null; 475 } 476 477 /** 478 * If the given IN is a BIN for the ID mapping database, saves all 479 * dirty/temp MapLNs contained in it. 480 */ saveMapLNsToFlush(IN in)481 private synchronized void saveMapLNsToFlush(IN in) { 482 if (in instanceof BIN && 483 in.getDatabase().getId().equals(DbTree.ID_DB_ID)) { 484 for (int i = 0; i < in.getNEntries(); i += 1) { 485 final MapLN ln = (MapLN) in.getTarget(i); 486 if (ln != null && ln.getDatabase().isCheckpointNeeded()) { 487 mapLNsToFlush.add(ln.getDatabase().getId()); 488 } 489 } 490 } 491 } 492 493 /** 494 * Flushes all saved dirty/temp MapLNs and clears the saved set. 495 * 496 * <p>If dirty, a MapLN must be flushed at each checkpoint to record 497 * updated utilization info in the checkpoint interval. If it is a 498 * temporary DB, the MapLN must be flushed because all temp DBs must be 499 * encountered by recovery so they can be removed if they were not closed 500 * (and removed) by the user.</p> 501 * 502 * This method is not synchronized because it takes the Btree root latch, 503 * and we must never latch something in the Btree after synchronizing on 504 * DirtyINMap; see class comments. Special synchronization is performed 505 * for accessing internal state; see below. 506 * 507 * @param checkpointStart start LSN of the checkpoint in progress. To 508 * reduce unnecessary logging, the MapLN is only flushed if it has not been 509 * written since that LSN. 510 */ flushMapLNs(long checkpointStart)511 void flushMapLNs(long checkpointStart) 512 throws DatabaseException { 513 514 /* 515 * This method is called only while flushing dirty nodes for a 516 * checkpoint or Database.sync, not for an eviction, and mapLNsToFlush 517 * is not changed during the flushing phase. So we don't strictly need 518 * to synchronize while accessing mapLNsToFlush. However, for 519 * consistency and extra safety we always synchronize while accessing 520 * internal state. 521 */ 522 final Set<DatabaseId> mapLNsCopy; 523 synchronized (this) { 524 assert ckptState != CkptState.DIRTY_MAP_INCOMPLETE; 525 if (mapLNsToFlush.isEmpty()) { 526 mapLNsCopy = null; 527 } else { 528 mapLNsCopy = new HashSet<DatabaseId>(mapLNsToFlush); 529 mapLNsToFlush.clear(); 530 } 531 } 532 if (mapLNsCopy != null) { 533 final DbTree dbTree = envImpl.getDbTree(); 534 for (DatabaseId dbId : mapLNsCopy) { 535 final DatabaseImpl db = dbTree.getDb(dbId); 536 try { 537 if (db != null && 538 !db.isDeleted() && 539 db.isCheckpointNeeded()) { 540 dbTree.modifyDbRoot 541 (db, checkpointStart /*ifBeforeLsn*/, 542 true /*mustExist*/); 543 } 544 } finally { 545 dbTree.releaseDb(db); 546 } 547 } 548 } 549 } 550 551 /** 552 * Flushes the DB mapping tree root at the end of the checkpoint, if either 553 * mapping DB is dirty and the root was not flushed previously during the 554 * checkpoint. 555 * 556 * This method is not synchronized because it does not access internal 557 * state. Also, it takes the DbTree root latch and although this latch 558 * should never be held by eviction, for consistency we should not latch 559 * something related to the Btree after synchronizing on DirtyINMap; see 560 * class comments. 561 * 562 * @param checkpointStart start LSN of the checkpoint in progress. To 563 * reduce unnecessary logging, the Root is only flushed if it has not been 564 * written since that LSN. 565 */ flushRoot(long checkpointStart)566 void flushRoot(long checkpointStart) 567 throws DatabaseException { 568 569 final DbTree dbTree = envImpl.getDbTree(); 570 if (dbTree.getDb(DbTree.ID_DB_ID).isCheckpointNeeded() || 571 dbTree.getDb(DbTree.NAME_DB_ID).isCheckpointNeeded()) { 572 envImpl.logMapTreeRoot(checkpointStart); 573 } 574 } 575 getNumEntries()576 synchronized int getNumEntries() { 577 return numEntries; 578 } 579 } 580