1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 2002, 2014 Oracle and/or its affiliates.  All rights reserved.
5  *
6  */
7 
8 package com.sleepycat.je.recovery;
9 
10 import java.util.HashMap;
11 import java.util.HashSet;
12 import java.util.IdentityHashMap;
13 import java.util.Iterator;
14 import java.util.Map;
15 import java.util.Set;
16 import java.util.SortedMap;
17 import java.util.TreeMap;
18 
19 import com.sleepycat.je.CacheMode;
20 import com.sleepycat.je.DatabaseException;
21 import com.sleepycat.je.dbi.DatabaseId;
22 import com.sleepycat.je.dbi.DatabaseImpl;
23 import com.sleepycat.je.dbi.DbTree;
24 import com.sleepycat.je.dbi.EnvironmentImpl;
25 import com.sleepycat.je.dbi.INList;
26 import com.sleepycat.je.dbi.MemoryBudget;
27 import com.sleepycat.je.recovery.Checkpointer.CheckpointReference;
28 import com.sleepycat.je.tree.BIN;
29 import com.sleepycat.je.tree.IN;
30 import com.sleepycat.je.tree.MapLN;
31 import com.sleepycat.je.utilint.TestHookExecute;
32 
33 /**
34  * Manages the by-level map of checkpoint references that are to be flushed by
35  * a checkpoint or Database.sync, the MapLNs to be flushed, the highest level
36  * by database to be flushed, and the state of the checkpoint.
37  *
38  * An single instance of this class is used for checkpoints and has the same
39  * lifetime as the checkpointer and environment.  An instance per Database.sync
40  * is created as needed.  Only one checkpoint can occur at a time, but multiple
41  * syncs may occur concurrently with each other and with the checkpoint.
42  *
43  * The methods in this class are synchronized to protect internal state from
44  * concurrent access by the checkpointer and eviction, and to coordinate state
45  * changes between the two.  Eviction must participate in the checkpoint so
46  * that INs cascade up properly; see coordinateEvictionWithCheckpoint.
47  *
48  * When INs are latched along with synchronization on a DirtyINMap, the order
49  * must be: 1) IN latches and 2) synchronize on DirtyINMap.  For example,
50  * the evictor latches the parent and child IN before calling the synchronized
51  * method coordinateEvictionWithCheckpoint, and selectDirtyINsForCheckpoint
52  * latches the IN before calling the synchronized method selectForCheckpoint.
53  */
54 class DirtyINMap {
55 
56     private final EnvironmentImpl envImpl;
57     private final SortedMap<Integer, Map<Long, CheckpointReference>> levelMap;
58     private int numEntries;
59     private final Set<DatabaseId> mapLNsToFlush;
60     private final Map<DatabaseImpl, Integer> highestFlushLevels;
61 
62     enum CkptState {
63         /** No checkpoint in progress, or is used for Database.sync. */
64         NONE,
65         /** Checkpoint started but dirty map is not yet complete. */
66         DIRTY_MAP_INCOMPLETE,
67         /** Checkpoint in progress and dirty map is complete. */
68         DIRTY_MAP_COMPLETE,
69     };
70 
71     private CkptState ckptState;
72     private boolean ckptFlushAll;
73     private boolean ckptFlushExtraLevel;
74 
DirtyINMap(EnvironmentImpl envImpl)75     DirtyINMap(EnvironmentImpl envImpl) {
76         this.envImpl = envImpl;
77         levelMap = new TreeMap<Integer, Map<Long, CheckpointReference>>();
78         numEntries = 0;
79         mapLNsToFlush = new HashSet<DatabaseId>();
80         highestFlushLevels = new IdentityHashMap<DatabaseImpl, Integer>();
81         ckptState = CkptState.NONE;
82     }
83 
84     /**
85      * Coordinates an eviction with an in-progress checkpoint and returns
86      * whether or not provisional logging is needed.
87      *
88      * @return true if the target must be logged provisionally.
89      */
coordinateEvictionWithCheckpoint(IN target, IN parent)90     synchronized boolean coordinateEvictionWithCheckpoint(IN target,
91                                                           IN parent) {
92         final DatabaseImpl db = target.getDatabase();
93 
94         /*
95          * If the checkpoint is in-progress and has not finished dirty map
96          * construction, we must add the parent to the dirty map.  That way the
97          * dirtiness and logging will cascade up in the same way as if the
98          * target were not evicted, and instead were encountered during dirty
99          * map construction.  We don't want the evictor's actions to introduce
100          * an IN in the log that has not cascaded up properly.
101          *
102          * Note that we add the parent even if it is not dirty here.  It will
103          * become dirty after the target child is logged, but that hasn't
104          * happened yet.
105          *
106          * We do not add the parent if it is null, which is the case when the
107          * root is being evicted.
108          */
109         if (ckptState == CkptState.DIRTY_MAP_INCOMPLETE &&
110             parent != null) {
111             /* Add latched parent IN to dirty map. */
112             selectForCheckpoint(parent);
113             /* Save dirty/temp DBs for later. */
114             saveMapLNsToFlush(parent);
115         }
116 
117         /*
118          * The evictor has to log provisionally in three cases:
119          *
120          * 1 - The eviction target is part of a deferred write database.
121          */
122         if (db.isDeferredWriteMode()) {
123             return true;
124         }
125 
126         /*
127          * 2 - The checkpoint is in-progress and has not finished dirty map
128          *     construction, and the target is not the root. The parent IN has
129          *     been added to the dirty map, so we know the child IN is at a
130          *     level below the max flush level.
131          */
132         if (ckptState == CkptState.DIRTY_MAP_INCOMPLETE &&
133             parent != null) {
134             return true;
135         }
136 
137         /*
138          * 3 - The checkpoint is in-progress and has finished dirty map
139          *     construction, and is at a level above the eviction target.
140          */
141         if (ckptState == CkptState.DIRTY_MAP_COMPLETE &&
142             target.getLevel() < getHighestFlushLevel(db)) {
143             return true;
144         }
145 
146         /* Otherwise, log non-provisionally. */
147         return false;
148     }
149 
150     /**
151      * Must be called before starting a checkpoint, and must not be called for
152      * Database.sync.  Updates memory budget and sets checkpoint state.
153      */
beginCheckpoint(boolean flushAll, boolean flushExtraLevel)154     synchronized void beginCheckpoint(boolean flushAll,
155                                       boolean flushExtraLevel) {
156         assert levelMap.isEmpty();
157         assert mapLNsToFlush.isEmpty();
158         assert highestFlushLevels.isEmpty();
159         assert numEntries == 0;
160         assert ckptState == CkptState.NONE;
161         ckptState = CkptState.DIRTY_MAP_INCOMPLETE;
162         ckptFlushAll = flushAll;
163         ckptFlushExtraLevel = flushExtraLevel;
164     }
165 
166     /**
167      * Must be called after a checkpoint or Database.sync is complete.  Updates
168      * memory budget and clears checkpoint state.
169      */
reset()170     synchronized void reset() {
171         removeCostFromMemoryBudget();
172         levelMap.clear();
173         mapLNsToFlush.clear();
174         highestFlushLevels.clear();
175         numEntries = 0;
176         ckptState = CkptState.NONE;
177     }
178 
179     /**
180      * Scan the INList for all dirty INs, excluding temp DB INs.  Save them in
181      * a tree-level ordered map for level ordered flushing.
182      *
183      * Take this opportunity to recalculate the memory budget tree usage.
184      *
185      * This method itself is not synchronized to allow concurrent eviction.
186      * Synchronization is performed on a per-IN basis to protect the data
187      * structures here, and eviction can occur in between INs.
188      */
selectDirtyINsForCheckpoint()189     void selectDirtyINsForCheckpoint()
190         throws DatabaseException {
191 
192         assert ckptState == CkptState.DIRTY_MAP_INCOMPLETE;
193 
194         /*
195          * Opportunistically recalculate the INList memory budget while
196          * traversing the entire INList.
197          */
198         final INList inMemINs = envImpl.getInMemoryINs();
199         inMemINs.memRecalcBegin();
200         boolean completed = false;
201         try {
202             for (IN in : inMemINs) {
203                 in.latchShared(CacheMode.UNCHANGED);
204                 try {
205                     inMemINs.memRecalcIterate(in);
206                     /* Add latched IN to dirty map. */
207                     if (in.getDirty()) {
208                         selectForCheckpoint(in);
209                     }
210                     /* Save dirty/temp DBs for later. */
211                     saveMapLNsToFlush(in);
212                 } finally {
213                     in.releaseLatch();
214                 }
215 
216                 /* Call test hook after releasing latch. */
217                 TestHookExecute.doHookIfSet
218                     (Checkpointer.examineINForCheckpointHook, in);
219             }
220             completed = true;
221         } finally {
222             inMemINs.memRecalcEnd(completed);
223         }
224 
225         /*
226          * Finish filling out the highestFlushLevels map. For each entry in
227          * highestFlushLevels that has a null level Integer value (set by
228          * selectForCheckpoint), we call DbTree.getHighestLevel and replace the
229          * null level. We must call DbTree.getHighestLevel, which latches the
230          * root, only when not synchronized, to avoid breaking the
231          * synchronization rules described in the class comment.  This must be
232          * done in several steps to follow the sychronization rules, yet
233          * protect the highestFlushLevels using synchronization.
234          */
235         final Map<DatabaseImpl, Integer> maxFlushDbs =
236             new HashMap<DatabaseImpl, Integer>();
237 
238         /* Copy entries with a null level. */
239         synchronized (this) {
240             for (DatabaseImpl db : highestFlushLevels.keySet()) {
241                 if (highestFlushLevels.get(db) == null) {
242                     maxFlushDbs.put(db, null);
243                 }
244             }
245         }
246 
247         /* Call getHighestLevel without synchronization. */
248         final DbTree dbTree = envImpl.getDbTree();
249         for (Map.Entry<DatabaseImpl, Integer> entry : maxFlushDbs.entrySet()) {
250             entry.setValue(dbTree.getHighestLevel(entry.getKey()));
251         }
252 
253         /* Fill in levels in highestFlushLevels. */
254         synchronized (this) {
255             for (Map.Entry<DatabaseImpl, Integer> entry :
256                  maxFlushDbs.entrySet()) {
257                 highestFlushLevels.put(entry.getKey(), entry.getValue());
258             }
259         }
260 
261         /* Complete this phase of the checkpoint. */
262         synchronized (this) {
263             addCostToMemoryBudget();
264             ckptState = CkptState.DIRTY_MAP_COMPLETE;
265         }
266     }
267 
268     /**
269      * Adds the IN to the dirty map unless it belongs to a temp DB, whether or
270      * not the IN is dirty.  Also updates the highest flush level map.
271      */
selectForCheckpoint(IN in)272     private synchronized void selectForCheckpoint(IN in) {
273         /* Do not checkpoint temporary databases. */
274         final DatabaseImpl db = in.getDatabase();
275         if (db.isTemporary()) {
276             return;
277         }
278 
279         Integer level = addIN(in, false /*updateMemoryBudget*/);
280 
281         /*
282          * IN was added to the dirty map.  Update the highest level seen
283          * for the database.  Use one level higher when ckptFlushExtraLevel
284          * is set.  When ckptFlushAll is set, use the maximum level for the
285          * database.  Durable deferred-write databases must be synced, so
286          * also use the maximum level.
287          *
288          * Always flush at least one level above the bottom-most BIN level so
289          * that the BIN level is logged provisionally and the expense of
290          * processing BINs during recovery is avoided.
291          */
292         if (ckptFlushAll || db.isDurableDeferredWrite()) {
293             if (!highestFlushLevels.containsKey(db)) {
294 
295                 /*
296                  * Null is used as an indicator that getHighestLevel should be
297                  * called in selectDirtyINsForCheckpoint, when not
298                  * synchronized.
299                  */
300                 highestFlushLevels.put(db, null);
301             }
302         } else {
303             int levelVal = level.intValue();
304             if (ckptFlushExtraLevel || in.isBIN()) {
305                 if (in.isRoot()) {
306                     /* No reason to go above DB root. */
307                 } else {
308                     /* Next level up in the same tree. */
309                     levelVal += 1;
310                     level = levelVal;
311                 }
312             }
313             final Integer highestLevelSeen = highestFlushLevels.get(db);
314             if (highestLevelSeen == null ||
315                 levelVal > highestLevelSeen.intValue()) {
316                 highestFlushLevels.put(db, level);
317             }
318         }
319     }
320 
321     /**
322      * Scan the INList for all dirty INs for a given database.  Arrange them in
323      * level sorted map for level ordered flushing.
324      *
325      * This method is not synchronized to allow concurrent eviction.
326      * Coordination between eviction and Database.sync is not required.
327      */
selectDirtyINsForDbSync(DatabaseImpl dbImpl)328     void selectDirtyINsForDbSync(DatabaseImpl dbImpl)
329         throws DatabaseException {
330 
331         assert ckptState == CkptState.NONE;
332 
333         final DatabaseId dbId = dbImpl.getId();
334 
335         for (IN in : envImpl.getInMemoryINs()) {
336             if (in.getDatabaseId().equals(dbId)) {
337                 in.latch(CacheMode.UNCHANGED);
338                 try {
339                     if (in.getDirty()) {
340                         addIN(in, false /*updateMemoryBudget*/);
341                     }
342                 } finally {
343                     in.releaseLatch();
344                 }
345             }
346         }
347 
348         /*
349          * Create a single entry map that forces all levels of this DB to
350          * be flushed.
351          */
352         highestFlushLevels.put
353             (dbImpl,
354              Integer.valueOf(envImpl.getDbTree().getHighestLevel(dbImpl)));
355 
356         /* Add the dirty map to the memory budget.  */
357         addCostToMemoryBudget();
358     }
359 
getHighestFlushLevel(DatabaseImpl db)360     synchronized int getHighestFlushLevel(DatabaseImpl db) {
361 
362         assert ckptState != CkptState.DIRTY_MAP_INCOMPLETE;
363 
364         /*
365          * This method is only called while flushing dirty nodes for a
366          * checkpoint or Database.sync, not for an eviction, so an entry for
367          * this database should normally exist.  However, if the DB root (and
368          * DatabaseImpl) have been evicted since the highestFlushLevels was
369          * constructed, the new DatabaseImpl instance will not be present in
370          * the map.  In this case, we do not need to checkpoint the IN and
371          * eviction should be non-provisional.
372          */
373         Integer val = highestFlushLevels.get(db);
374         return (val != null) ? val.intValue() : IN.MIN_LEVEL;
375     }
376 
getNumLevels()377     synchronized int getNumLevels() {
378         return levelMap.size();
379     }
380 
addCostToMemoryBudget()381     private synchronized void addCostToMemoryBudget() {
382         final MemoryBudget mb = envImpl.getMemoryBudget();
383         final int cost = numEntries * MemoryBudget.CHECKPOINT_REFERENCE_SIZE;
384         mb.updateAdminMemoryUsage(cost);
385     }
386 
removeCostFromMemoryBudget()387     private synchronized void removeCostFromMemoryBudget() {
388         final MemoryBudget mb = envImpl.getMemoryBudget();
389         final int cost = numEntries * MemoryBudget.CHECKPOINT_REFERENCE_SIZE;
390         mb.updateAdminMemoryUsage(0 - cost);
391     }
392 
393     /**
394      * Add a node unconditionally to the dirty map. The dirty map is keyed by
395      * level (Integers) and holds sets of IN references.
396      *
397      * @param updateMemoryBudget if true then update the memory budget as the
398      * map is changed; if false then addCostToMemoryBudget must be called
399      * later.
400      *
401      * @return level of IN added to the dirty map.  The level is returned
402      * rather than a boolean simply to avoid allocating another Integer in the
403      * caller.
404      */
addIN(IN in, boolean updateMemoryBudget)405     synchronized Integer addIN(IN in, boolean updateMemoryBudget) {
406 
407         final Integer level = Integer.valueOf(in.getLevel());
408         final Map<Long, CheckpointReference> nodeMap;
409         if (levelMap.containsKey(level)) {
410             nodeMap = levelMap.get(level);
411         } else {
412             nodeMap = new TreeMap<Long, CheckpointReference>();
413             levelMap.put(level, nodeMap);
414         }
415 
416         nodeMap.put(in.getNodeId(),
417                     new CheckpointReference(in.getDatabase().getId(),
418                                             in.getNodeId(),
419                                             in.getLevel(),
420                                             in.isDbRoot(),
421                                             in.getIdentifierKey()));
422         numEntries++;
423 
424         if (updateMemoryBudget) {
425             final MemoryBudget mb = envImpl.getMemoryBudget();
426             mb.updateAdminMemoryUsage
427                 (MemoryBudget.CHECKPOINT_REFERENCE_SIZE);
428         }
429 
430         return level;
431     }
432 
433     /**
434      * Get the lowest level currently stored in the map.
435      */
getLowestLevelSet()436     synchronized Integer getLowestLevelSet() {
437         return levelMap.firstKey();
438     }
439 
440     /**
441      * Removes the set corresponding to the given level.
442      */
removeLevel(Integer level)443     synchronized void removeLevel(Integer level) {
444         levelMap.remove(level);
445     }
446 
containsNode(Integer level, Long nodeId)447     synchronized boolean containsNode(Integer level, Long nodeId) {
448         final Map<Long, CheckpointReference> nodeMap = levelMap.get(level);
449         if (nodeMap != null) {
450             return nodeMap.containsKey(nodeId);
451         }
452         return false;
453     }
454 
removeNode(Integer level, Long nodeId)455     synchronized CheckpointReference removeNode(Integer level, Long nodeId) {
456         final Map<Long, CheckpointReference> nodeMap = levelMap.get(level);
457         if (nodeMap != null) {
458             return nodeMap.remove(nodeId);
459         }
460         return null;
461     }
462 
removeNextNode(Integer level)463     synchronized CheckpointReference removeNextNode(Integer level) {
464         final Map<Long, CheckpointReference> nodeMap = levelMap.get(level);
465         if (nodeMap != null) {
466             final Iterator<Map.Entry<Long, CheckpointReference>> iter =
467                 nodeMap.entrySet().iterator();
468             if (iter.hasNext()) {
469                 final CheckpointReference ref = iter.next().getValue();
470                 iter.remove();
471                 return ref;
472             }
473         }
474         return null;
475     }
476 
477     /**
478      * If the given IN is a BIN for the ID mapping database, saves all
479      * dirty/temp MapLNs contained in it.
480      */
saveMapLNsToFlush(IN in)481     private synchronized void saveMapLNsToFlush(IN in) {
482         if (in instanceof BIN &&
483             in.getDatabase().getId().equals(DbTree.ID_DB_ID)) {
484             for (int i = 0; i < in.getNEntries(); i += 1) {
485                 final MapLN ln = (MapLN) in.getTarget(i);
486                 if (ln != null && ln.getDatabase().isCheckpointNeeded()) {
487                     mapLNsToFlush.add(ln.getDatabase().getId());
488                 }
489             }
490         }
491     }
492 
493     /**
494      * Flushes all saved dirty/temp MapLNs and clears the saved set.
495      *
496      * <p>If dirty, a MapLN must be flushed at each checkpoint to record
497      * updated utilization info in the checkpoint interval.  If it is a
498      * temporary DB, the MapLN must be flushed because all temp DBs must be
499      * encountered by recovery so they can be removed if they were not closed
500      * (and removed) by the user.</p>
501      *
502      * This method is not synchronized because it takes the Btree root latch,
503      * and we must never latch something in the Btree after synchronizing on
504      * DirtyINMap; see class comments.  Special synchronization is performed
505      * for accessing internal state; see below.
506      *
507      * @param checkpointStart start LSN of the checkpoint in progress.  To
508      * reduce unnecessary logging, the MapLN is only flushed if it has not been
509      * written since that LSN.
510      */
flushMapLNs(long checkpointStart)511     void flushMapLNs(long checkpointStart)
512         throws DatabaseException {
513 
514         /*
515          * This method is called only while flushing dirty nodes for a
516          * checkpoint or Database.sync, not for an eviction, and mapLNsToFlush
517          * is not changed during the flushing phase.  So we don't strictly need
518          * to synchronize while accessing mapLNsToFlush.  However, for
519          * consistency and extra safety we always synchronize while accessing
520          * internal state.
521          */
522         final Set<DatabaseId> mapLNsCopy;
523         synchronized (this) {
524             assert ckptState != CkptState.DIRTY_MAP_INCOMPLETE;
525             if (mapLNsToFlush.isEmpty()) {
526                 mapLNsCopy = null;
527             } else {
528                 mapLNsCopy = new HashSet<DatabaseId>(mapLNsToFlush);
529                 mapLNsToFlush.clear();
530             }
531         }
532         if (mapLNsCopy != null) {
533             final DbTree dbTree = envImpl.getDbTree();
534             for (DatabaseId dbId : mapLNsCopy) {
535                 final DatabaseImpl db = dbTree.getDb(dbId);
536                 try {
537                     if (db != null &&
538                         !db.isDeleted() &&
539                         db.isCheckpointNeeded()) {
540                         dbTree.modifyDbRoot
541                             (db, checkpointStart /*ifBeforeLsn*/,
542                              true /*mustExist*/);
543                     }
544                 } finally {
545                     dbTree.releaseDb(db);
546                 }
547             }
548         }
549     }
550 
551     /**
552      * Flushes the DB mapping tree root at the end of the checkpoint, if either
553      * mapping DB is dirty and the root was not flushed previously during the
554      * checkpoint.
555      *
556      * This method is not synchronized because it does not access internal
557      * state.  Also, it takes the DbTree root latch and although this latch
558      * should never be held by eviction, for consistency we should not latch
559      * something related to the Btree after synchronizing on DirtyINMap; see
560      * class comments.
561      *
562      * @param checkpointStart start LSN of the checkpoint in progress.  To
563      * reduce unnecessary logging, the Root is only flushed if it has not been
564      * written since that LSN.
565      */
flushRoot(long checkpointStart)566     void flushRoot(long checkpointStart)
567         throws DatabaseException {
568 
569         final DbTree dbTree = envImpl.getDbTree();
570         if (dbTree.getDb(DbTree.ID_DB_ID).isCheckpointNeeded() ||
571             dbTree.getDb(DbTree.NAME_DB_ID).isCheckpointNeeded()) {
572             envImpl.logMapTreeRoot(checkpointStart);
573         }
574     }
575 
getNumEntries()576     synchronized int getNumEntries() {
577         return numEntries;
578     }
579 }
580