1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
12 static int __ckpt_string(
13 	WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
14 static int __ckpt_update(
15 	WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, bool);
16 
17 /*
18  * __wt_block_ckpt_init --
19  *	Initialize a checkpoint structure.
20  */
21 int
__wt_block_ckpt_init(WT_SESSION_IMPL * session,WT_BLOCK_CKPT * ci,const char * name)22 __wt_block_ckpt_init(
23     WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
24 {
25 	WT_CLEAR(*ci);
26 
27 	ci->version = WT_BM_CHECKPOINT_VERSION;
28 	ci->root_offset = WT_BLOCK_INVALID_OFFSET;
29 
30 	WT_RET(__wt_block_extlist_init(
31 	    session, &ci->alloc, name, "alloc", false));
32 	WT_RET(__wt_block_extlist_init(
33 	    session, &ci->avail, name, "avail", true));
34 	WT_RET(__wt_block_extlist_init(
35 	    session, &ci->discard, name, "discard", false));
36 	WT_RET(__wt_block_extlist_init(
37 	    session, &ci->ckpt_avail, name, "ckpt_avail", true));
38 
39 	return (0);
40 }
41 
42 /*
43  * __wt_block_checkpoint_load --
44  *	Load a checkpoint.
45  */
46 int
__wt_block_checkpoint_load(WT_SESSION_IMPL * session,WT_BLOCK * block,const uint8_t * addr,size_t addr_size,uint8_t * root_addr,size_t * root_addr_sizep,bool checkpoint)47 __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
48     const uint8_t *addr, size_t addr_size,
49     uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
50 {
51 	WT_BLOCK_CKPT *ci, _ci;
52 	WT_DECL_ITEM(tmp);
53 	WT_DECL_RET;
54 	uint8_t *endp;
55 
56 	/*
57 	 * Sometimes we don't find a root page (we weren't given a checkpoint,
58 	 * or the checkpoint was empty).  In that case we return an empty root
59 	 * address, set that up now.
60 	 */
61 	*root_addr_sizep = 0;
62 
63 	ci = NULL;
64 
65 	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
66 		if (addr != NULL) {
67 			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
68 			WT_ERR(__ckpt_string(session, block, addr, tmp));
69 		}
70 		__wt_verbose(session, WT_VERB_CHECKPOINT,
71 		    "%s: load-checkpoint: %s", block->name,
72 		    addr == NULL ? "[Empty]" : (const char *)tmp->data);
73 	}
74 
75 	/*
76 	 * There's a single checkpoint in the file that can be written, all of
77 	 * the others are read-only.  We use the same initialization calls for
78 	 * readonly checkpoints, but the information doesn't persist.
79 	 */
80 	if (checkpoint) {
81 		ci = &_ci;
82 		WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
83 	} else {
84 		/*
85 		 * We depend on the btree level for locking: things will go bad
86 		 * fast if we open the live system in two handles, or salvage,
87 		 * truncate or verify the live/running file.
88 		 */
89 #ifdef HAVE_DIAGNOSTIC
90 		__wt_spin_lock(session, &block->live_lock);
91 		WT_ASSERT(session, block->live_open == false);
92 		block->live_open = true;
93 		__wt_spin_unlock(session, &block->live_lock);
94 #endif
95 		ci = &block->live;
96 		WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
97 	}
98 
99 	/*
100 	 * If the checkpoint has an on-disk root page, load it.  Otherwise, size
101 	 * the file past the description information.
102 	 */
103 	if (addr == NULL || addr_size == 0)
104 		ci->file_size = block->allocsize;
105 	else {
106 		/* Crack the checkpoint cookie. */
107 		WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
108 
109 		/* Verify sets up next. */
110 		if (block->verify)
111 			WT_ERR(__wt_verify_ckpt_load(session, block, ci));
112 
113 		/* Read any root page. */
114 		if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
115 			endp = root_addr;
116 			WT_ERR(__wt_block_addr_to_buffer(block, &endp,
117 			    ci->root_offset, ci->root_size, ci->root_checksum));
118 			*root_addr_sizep = WT_PTRDIFF(endp, root_addr);
119 		}
120 
121 		/*
122 		 * Rolling a checkpoint forward requires the avail list, the
123 		 * blocks from which we can allocate.
124 		 */
125 		if (!checkpoint)
126 			WT_ERR(__wt_block_extlist_read_avail(
127 			    session, block, &ci->avail, ci->file_size));
128 	}
129 
130 	/*
131 	 * If the checkpoint can be written, that means anything written after
132 	 * the checkpoint is no longer interesting, truncate the file.  Don't
133 	 * bother checking the avail list for a block at the end of the file,
134 	 * that was done when the checkpoint was first written (re-writing the
135 	 * checkpoint might possibly make it relevant here, but it's unlikely
136 	 * enough I don't bother).
137 	 */
138 	if (!checkpoint)
139 		WT_ERR(__wt_block_truncate(session, block, ci->file_size));
140 
141 	if (0) {
142 err:		/*
143 		 * Don't call checkpoint-unload: unload does real work including
144 		 * file truncation.  If we fail early enough that the checkpoint
145 		 * information isn't correct, bad things would happen.  The only
146 		 * allocated memory was in the service of verify, clean that up.
147 		 */
148 		if (block->verify)
149 			WT_TRET(__wt_verify_ckpt_unload(session, block));
150 	}
151 
152 	/* Checkpoints don't need the original information, discard it. */
153 	if (checkpoint && ci != NULL)
154 		__wt_block_ckpt_destroy(session, ci);
155 
156 	__wt_scr_free(session, &tmp);
157 	return (ret);
158 }
159 
160 /*
161  * __wt_block_checkpoint_unload --
162  *	Unload a checkpoint.
163  */
164 int
__wt_block_checkpoint_unload(WT_SESSION_IMPL * session,WT_BLOCK * block,bool checkpoint)165 __wt_block_checkpoint_unload(
166     WT_SESSION_IMPL *session, WT_BLOCK *block, bool checkpoint)
167 {
168 	WT_DECL_RET;
169 
170 	/* Verify cleanup. */
171 	if (block->verify)
172 		WT_TRET(__wt_verify_ckpt_unload(session, block));
173 
174 	/*
175 	 * If it's the live system, truncate to discard any extended blocks and
176 	 * discard the active extent lists.  Hold the lock even though we're
177 	 * unloading the live checkpoint, there could be readers active in other
178 	 * checkpoints.
179 	 */
180 	if (!checkpoint) {
181 		WT_TRET(__wt_block_truncate(session, block, block->size));
182 
183 		__wt_spin_lock(session, &block->live_lock);
184 		__wt_block_ckpt_destroy(session, &block->live);
185 #ifdef HAVE_DIAGNOSTIC
186 		block->live_open = false;
187 #endif
188 		__wt_spin_unlock(session, &block->live_lock);
189 	}
190 
191 	return (ret);
192 }
193 
194 /*
195  * __wt_block_ckpt_destroy --
196  *	Clear a checkpoint structure.
197  */
198 void
__wt_block_ckpt_destroy(WT_SESSION_IMPL * session,WT_BLOCK_CKPT * ci)199 __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
200 {
201 	/* Discard the extent lists. */
202 	__wt_block_extlist_free(session, &ci->alloc);
203 	__wt_block_extlist_free(session, &ci->avail);
204 	__wt_block_extlist_free(session, &ci->discard);
205 	__wt_block_extlist_free(session, &ci->ckpt_alloc);
206 	__wt_block_extlist_free(session, &ci->ckpt_avail);
207 	__wt_block_extlist_free(session, &ci->ckpt_discard);
208 }
209 
210 /*
211  * __wt_block_checkpoint_start --
212  *	Start a checkpoint.
213  */
214 int
__wt_block_checkpoint_start(WT_SESSION_IMPL * session,WT_BLOCK * block)215 __wt_block_checkpoint_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
216 {
217 	WT_DECL_RET;
218 
219 	__wt_spin_lock(session, &block->live_lock);
220 	switch (block->ckpt_state) {
221 	case WT_CKPT_INPROGRESS:
222 	case WT_CKPT_PANIC_ON_FAILURE:
223 	case WT_CKPT_SALVAGE:
224 		__wt_err(session, EINVAL,
225 		    "%s: an unexpected checkpoint start: the checkpoint "
226 		    "has already started or was configured for salvage",
227 		    block->name);
228 		ret = __wt_block_panic(session);
229 		break;
230 	case WT_CKPT_NONE:
231 		block->ckpt_state = WT_CKPT_INPROGRESS;
232 		break;
233 	}
234 	__wt_spin_unlock(session, &block->live_lock);
235 	return (ret);
236 }
237 
238 /*
239  * __wt_block_checkpoint --
240  *	Create a new checkpoint.
241  */
242 int
__wt_block_checkpoint(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_ITEM * buf,WT_CKPT * ckptbase,bool data_checksum)243 __wt_block_checkpoint(WT_SESSION_IMPL *session,
244     WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum)
245 {
246 	WT_BLOCK_CKPT *ci;
247 	WT_DECL_RET;
248 
249 	ci = &block->live;
250 
251 	/* Switch to first-fit allocation. */
252 	__wt_block_configure_first_fit(block, true);
253 
254 	/*
255 	 * Write the root page: it's possible for there to be a checkpoint of
256 	 * an empty tree, in which case, we store an illegal root offset.
257 	 *
258 	 * !!!
259 	 * We happen to know that checkpoints are single-threaded above us in
260 	 * the btree engine.  That's probably something we want to guarantee
261 	 * for any WiredTiger block manager.
262 	 */
263 	if (buf == NULL) {
264 		ci->root_offset = WT_BLOCK_INVALID_OFFSET;
265 		ci->root_size = ci->root_checksum = 0;
266 	} else
267 		WT_ERR(__wt_block_write_off(session, block, buf,
268 		    &ci->root_offset, &ci->root_size, &ci->root_checksum,
269 		    data_checksum, true, false));
270 
271 	/*
272 	 * Checkpoints are potentially reading/writing/merging lots of blocks,
273 	 * pre-allocate structures for this thread's use.
274 	 */
275 	WT_ERR(__wt_block_ext_prealloc(session, 250));
276 
277 	/* Process the checkpoint list, deleting and updating as required. */
278 	ret = __ckpt_process(session, block, ckptbase);
279 
280 	/* Discard any excessive memory we've allocated. */
281 	WT_TRET(__wt_block_ext_discard(session, 250));
282 
283 	/* Restore the original allocation plan. */
284 err:	__wt_block_configure_first_fit(block, false);
285 
286 	return (ret);
287 }
288 
289 /*
290  * __ckpt_extlist_read --
291  *	Read a checkpoints extent lists and copy
292  */
293 static int
__ckpt_extlist_read(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_CKPT * ckpt)294 __ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
295 {
296 	WT_BLOCK_CKPT *ci;
297 
298 	/*
299 	 * Allocate a checkpoint structure, crack the cookie and read the
300 	 * checkpoint's extent lists.
301 	 *
302 	 * Ignore the avail list: checkpoint avail lists are only useful if we
303 	 * are rolling forward from the particular checkpoint and they represent
304 	 * our best understanding of what blocks can be allocated.  If we are
305 	 * not operating on the live checkpoint, subsequent checkpoints might
306 	 * have allocated those blocks, and the avail list is useless.  We don't
307 	 * discard it, because it is useful as part of verification, but we
308 	 * don't re-write it either.
309 	 */
310 	WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
311 
312 	ci = ckpt->bpriv;
313 	WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
314 	WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
315 	WT_RET(__wt_block_extlist_read(
316 	    session, block, &ci->alloc, ci->file_size));
317 	WT_RET(__wt_block_extlist_read(
318 	    session, block, &ci->discard, ci->file_size));
319 
320 	return (0);
321 }
322 
323 /*
324  * __ckpt_extlist_fblocks --
325  *	If a checkpoint's extent list is going away, free its blocks.
326  */
327 static int
__ckpt_extlist_fblocks(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_EXTLIST * el)328 __ckpt_extlist_fblocks(
329     WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
330 {
331 	if (el->offset == WT_BLOCK_INVALID_OFFSET)
332 		return (0);
333 
334 	/*
335 	 * Free blocks used to write checkpoint extents into the live system's
336 	 * checkpoint avail list (they were never on any alloc list). Do not
337 	 * use the live system's avail list because that list is used to decide
338 	 * if the file can be truncated, and we can't truncate any part of the
339 	 * file that contains a previous checkpoint's extents.
340 	 */
341 	return (__wt_block_insert_ext(
342 	    session, block, &block->live.ckpt_avail, el->offset, el->size));
343 }
344 
#ifdef HAVE_DIAGNOSTIC
/*
 * __ckpt_verify --
 *	Diagnostic code: confirm the checkpoint array has the expected shape,
 * that is, some number of checkpoints to add, delete or ignore, terminated by
 * a new checkpoint.  Returns 0 if the array looks sane, otherwise the result
 * of reporting the offending flags value as illegal.
 */
static int
__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
{
	WT_CKPT *ckpt;

	WT_CKPT_FOREACH(ckptbase, ckpt) {
		/* Entries to ignore or delete, real or fake, are fine. */
		if (ckpt->flags == 0 ||
		    ckpt->flags == WT_CKPT_DELETE ||
		    ckpt->flags == (WT_CKPT_DELETE | WT_CKPT_FAKE) ||
		    ckpt->flags == WT_CKPT_FAKE)
			continue;

		/* An added checkpoint is fine if it is the final entry. */
		if (ckpt->flags == WT_CKPT_ADD && ckpt[1].name == NULL)
			continue;

		/* Anything else is unexpected. */
		return (__wt_illegal_value(session, ckpt->flags));
	}
	return (0);
}
#endif
381 
382 /*
383  * __ckpt_process --
384  *	Process the list of checkpoints.
385  */
386 static int
__ckpt_process(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_CKPT * ckptbase)387 __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
388 {
389 	WT_BLOCK_CKPT *a, *b, *ci;
390 	WT_CKPT *ckpt, *next_ckpt;
391 	WT_DECL_ITEM(tmp);
392 	WT_DECL_RET;
393 	uint64_t ckpt_size;
394 	bool deleting, fatal, locked;
395 
396 	ci = &block->live;
397 	fatal = locked = false;
398 
399 #ifdef HAVE_DIAGNOSTIC
400 	WT_RET(__ckpt_verify(session, ckptbase));
401 #endif
402 
403 	/*
404 	 * Checkpoints are a two-step process: first, write a new checkpoint to
405 	 * disk (including all the new extent lists for modified checkpoints
406 	 * and the live system).  As part of this, create a list of file blocks
407 	 * newly available for reallocation, based on checkpoints being deleted.
408 	 * We then return the locations of the new checkpoint information to our
409 	 * caller.  Our caller has to write that information into some kind of
410 	 * stable storage, and once that's done, we can actually allocate from
411 	 * that list of newly available file blocks.  (We can't allocate from
412 	 * that list immediately because the allocation might happen before our
413 	 * caller saves the new checkpoint information, and if we crashed before
414 	 * the new checkpoint location was saved, we'd have overwritten blocks
415 	 * still referenced by checkpoints in the system.)  In summary, there is
416 	 * a second step: after our caller saves the checkpoint information, we
417 	 * are called to add the newly available blocks into the live system's
418 	 * available list.
419 	 *
420 	 * This function is the first step, the second step is in the resolve
421 	 * function.
422 	 *
423 	 * If we're called to checkpoint the same file twice (without the second
424 	 * resolution step), or re-entered for any reason, it's an error in our
425 	 * caller, and our choices are all bad: leak blocks or potentially crash
426 	 * with our caller not yet having saved previous checkpoint information
427 	 * to stable storage.
428 	 */
429 	__wt_spin_lock(session, &block->live_lock);
430 	switch (block->ckpt_state) {
431 	case WT_CKPT_INPROGRESS:
432 		block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE;
433 		break;
434 	case WT_CKPT_NONE:
435 	case WT_CKPT_PANIC_ON_FAILURE:
436 		__wt_err(session, EINVAL,
437 		    "%s: an unexpected checkpoint attempt: the checkpoint "
438 		    "was never started or has already completed",
439 		    block->name);
440 		ret = __wt_block_panic(session);
441 		break;
442 	case WT_CKPT_SALVAGE:
443 		/* Salvage doesn't use the standard checkpoint APIs. */
444 		break;
445 	}
446 	__wt_spin_unlock(session, &block->live_lock);
447 	WT_RET(ret);
448 
449 	/*
450 	 * Extents newly available as a result of deleting previous checkpoints
451 	 * are added to a list of extents.  The list should be empty, but as
452 	 * described above, there is no "free the checkpoint information" call
453 	 * into the block manager; if there was an error in an upper level that
454 	 * resulted in some previous checkpoint never being resolved, the list
455 	 * may not be empty.  We should have caught that with the "checkpoint
456 	 * in progress" test, but it doesn't cost us anything to be cautious.
457 	 *
458 	 * We free the checkpoint's allocation and discard extent lists as part
459 	 * of the resolution step, not because they're needed at that time, but
460 	 * because it's potentially a lot of work, and waiting allows the btree
461 	 * layer to continue eviction sooner.  As for the checkpoint-available
462 	 * list, make sure they get cleaned out.
463 	 */
464 	__wt_block_extlist_free(session, &ci->ckpt_avail);
465 	WT_RET(__wt_block_extlist_init(
466 	    session, &ci->ckpt_avail, "live", "ckpt_avail", true));
467 	__wt_block_extlist_free(session, &ci->ckpt_alloc);
468 	__wt_block_extlist_free(session, &ci->ckpt_discard);
469 
470 	/*
471 	 * To delete a checkpoint, we'll need checkpoint information for it and
472 	 * the subsequent checkpoint into which it gets rolled; read them from
473 	 * disk before we lock things down.
474 	 */
475 	deleting = false;
476 	WT_CKPT_FOREACH(ckptbase, ckpt) {
477 		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
478 		    !F_ISSET(ckpt, WT_CKPT_DELETE))
479 			continue;
480 		deleting = true;
481 
482 		/*
483 		 * Read the checkpoint and next checkpoint extent lists if we
484 		 * haven't already read them (we may have already read these
485 		 * extent blocks if there is more than one deleted checkpoint).
486 		 */
487 		if (ckpt->bpriv == NULL)
488 			WT_ERR(__ckpt_extlist_read(session, block, ckpt));
489 
490 		for (next_ckpt = ckpt + 1;; ++next_ckpt)
491 			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
492 				break;
493 
494 		/*
495 		 * The "next" checkpoint may be the live tree which has no
496 		 * extent blocks to read.
497 		 */
498 		if (next_ckpt->bpriv == NULL &&
499 		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
500 			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
501 	}
502 
503 	/*
504 	 * Failures are now fatal: we can't currently back out the merge of any
505 	 * deleted checkpoint extent lists into the live system's extent lists,
506 	 * so continuing after error would leave the live system's extent lists
507 	 * corrupted for any subsequent checkpoint (and potentially, should a
508 	 * subsequent checkpoint succeed, for recovery).
509 	 */
510 	fatal = true;
511 
512 	/*
513 	 * Hold a lock so the live extent lists and the file size can't change
514 	 * underneath us.  I suspect we'll tighten this if checkpoints take too
515 	 * much time away from real work: we read the historic checkpoint
516 	 * information without a lock, but we could also merge and re-write the
517 	 * deleted and merged checkpoint information without a lock, except for
518 	 * the final merge of ranges into the live tree.
519 	 */
520 	__wt_spin_lock(session, &block->live_lock);
521 	locked = true;
522 
523 	/*
524 	 * We've allocated our last page, update the checkpoint size.  We need
525 	 * to calculate the live system's checkpoint size before merging
526 	 * checkpoint allocation and discard information from the checkpoints
527 	 * we're deleting, those operations change the underlying byte counts.
528 	 */
529 	ckpt_size = ci->ckpt_size;
530 	ckpt_size += ci->alloc.bytes;
531 	ckpt_size -= ci->discard.bytes;
532 
533 	/* Skip the additional processing if we aren't deleting checkpoints. */
534 	if (!deleting)
535 		goto live_update;
536 
537 	/*
538 	 * Delete any no-longer-needed checkpoints: we do this first as it frees
539 	 * blocks to the live lists, and the freed blocks will then be included
540 	 * when writing the live extent lists.
541 	 */
542 	WT_CKPT_FOREACH(ckptbase, ckpt) {
543 		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
544 		    !F_ISSET(ckpt, WT_CKPT_DELETE))
545 			continue;
546 
547 		if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
548 			if (tmp == NULL)
549 				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
550 			WT_ERR(__ckpt_string(
551 			    session, block, ckpt->raw.data, tmp));
552 			__wt_verbose(session, WT_VERB_CHECKPOINT,
553 			    "%s: delete-checkpoint: %s: %s",
554 			    block->name, ckpt->name, (const char *)tmp->data);
555 		}
556 		/*
557 		 * Find the checkpoint into which we'll roll this checkpoint's
558 		 * blocks: it's the next real checkpoint in the list, and it
559 		 * better have been read in (if it's not the add slot).
560 		 */
561 		for (next_ckpt = ckpt + 1;; ++next_ckpt)
562 			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
563 				break;
564 
565 		/*
566 		 * Set the from/to checkpoint structures, where the "to" value
567 		 * may be the live tree.
568 		 */
569 		a = ckpt->bpriv;
570 		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
571 			b = &block->live;
572 		else
573 			b = next_ckpt->bpriv;
574 
575 		/*
576 		 * Free the root page: there's nothing special about this free,
577 		 * the root page is allocated using normal rules, that is, it
578 		 * may have been taken from the avail list, and was entered on
579 		 * the live system's alloc list at that time.  We free it into
580 		 * the checkpoint's discard list, however, not the live system's
581 		 * list because it appears on the checkpoint's alloc list and so
582 		 * must be paired in the checkpoint.
583 		 */
584 		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
585 			WT_ERR(__wt_block_insert_ext(session, block,
586 			    &a->discard, a->root_offset, a->root_size));
587 
588 		/*
589 		 * Free the blocks used to hold the "from" checkpoint's extent
590 		 * lists, including the avail list.
591 		 */
592 		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
593 		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
594 		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));
595 
596 		/*
597 		 * Roll the "from" alloc and discard extent lists into the "to"
598 		 * checkpoint's lists.
599 		 */
600 		if (a->alloc.entries != 0)
601 			WT_ERR(__wt_block_extlist_merge(
602 			    session, block, &a->alloc, &b->alloc));
603 		if (a->discard.entries != 0)
604 			WT_ERR(__wt_block_extlist_merge(
605 			    session, block, &a->discard, &b->discard));
606 
607 		/*
608 		 * If the "to" checkpoint is also being deleted, we're done with
609 		 * it, it's merged into some other checkpoint in the next loop.
610 		 * This means the extent lists may aggregate over a number of
611 		 * checkpoints, but that's OK, they're disjoint sets of ranges.
612 		 */
613 		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
614 			continue;
615 
616 		/*
617 		 * Find blocks for re-use: wherever the "to" checkpoint's
618 		 * allocate and discard lists overlap, move the range to
619 		 * the live system's checkpoint available list.
620 		 */
621 		WT_ERR(__wt_block_extlist_overlap(session, block, b));
622 
623 		/*
624 		 * If we're updating the live system's information, we're done.
625 		 */
626 		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
627 			continue;
628 
629 		/*
630 		 * We have to write the "to" checkpoint's extent lists out in
631 		 * new blocks, and update its cookie.
632 		 *
633 		 * Free the blocks used to hold the "to" checkpoint's extent
634 		 * lists; don't include the avail list, it's not changing.
635 		 */
636 		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
637 		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
638 
639 		F_SET(next_ckpt, WT_CKPT_UPDATE);
640 	}
641 
642 	/* Update checkpoints marked for update. */
643 	WT_CKPT_FOREACH(ckptbase, ckpt)
644 		if (F_ISSET(ckpt, WT_CKPT_UPDATE))
645 			WT_ERR(__ckpt_update(
646 			    session, block, ckpt, ckpt->bpriv, false));
647 
648 live_update:
649 	/* Truncate the file if that's possible. */
650 	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));
651 
652 	/* Update the final, added checkpoint based on the live system. */
653 	WT_CKPT_FOREACH(ckptbase, ckpt)
654 		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
655 			/*
656 			 * !!!
657 			 * Our caller wants the final checkpoint size.  Setting
658 			 * the size here violates layering, but the alternative
659 			 * is a call for the btree layer to crack the checkpoint
660 			 * cookie into its components, and that's a fair amount
661 			 * of work.
662 			 */
663 			ckpt->ckpt_size = ckpt_size;
664 
665 			/*
666 			 * Set the rolling checkpoint size for the live system.
667 			 * The current size includes the current checkpoint's
668 			 * root page size (root pages are on the checkpoint's
669 			 * block allocation list as root pages are allocated
670 			 * with the usual block allocation functions). That's
671 			 * correct, but we don't want to include it in the size
672 			 * for the next checkpoint.
673 			 */
674 			ckpt_size -= ci->root_size;
675 
676 			/*
677 			 * Additionally, we had a bug for awhile where the live
678 			 * checkpoint size grew without bound. We can't sanity
679 			 * check the value, that would require walking the tree
680 			 * as part of the checkpoint. Bound any bug at the size
681 			 * of the file.
682 			 * It isn't practical to assert that the value is within
683 			 * bounds since databases created with older versions
684 			 * of WiredTiger (2.8.0) would likely see an error.
685 			 */
686 			ci->ckpt_size =
687 			    WT_MIN(ckpt_size, (uint64_t)block->size);
688 
689 			WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
690 		}
691 
692 	/*
693 	 * Reset the live system's alloc and discard extent lists, leave the
694 	 * avail list alone.  This includes freeing a lot of extents, so do it
695 	 * outside of the system's lock by copying and resetting the original,
696 	 * then doing the work later.
697 	 */
698 	ci->ckpt_alloc = ci->alloc;
699 	WT_ERR(__wt_block_extlist_init(
700 	    session, &ci->alloc, "live", "alloc", false));
701 	ci->ckpt_discard = ci->discard;
702 	WT_ERR(__wt_block_extlist_init(
703 	    session, &ci->discard, "live", "discard", false));
704 
705 #ifdef HAVE_DIAGNOSTIC
706 	/*
707 	 * The first checkpoint in the system should always have an empty
708 	 * discard list.  If we've read that checkpoint and/or created it,
709 	 * check.
710 	 */
711 	WT_CKPT_FOREACH(ckptbase, ckpt)
712 		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
713 			break;
714 	if ((a = ckpt->bpriv) == NULL)
715 		a = &block->live;
716 	if (a->discard.entries != 0)
717 		WT_ERR_MSG(session, WT_ERROR,
718 		    "first checkpoint incorrectly has blocks on the discard "
719 		    "list");
720 #endif
721 
722 err:	if (ret != 0 && fatal) {
723 		__wt_err(session, ret,
724 		    "%s: fatal checkpoint failure", block->name);
725 		ret = __wt_block_panic(session);
726 	}
727 
728 	if (locked)
729 		__wt_spin_unlock(session, &block->live_lock);
730 
731 	/* Discard any checkpoint information we loaded. */
732 	WT_CKPT_FOREACH(ckptbase, ckpt)
733 		if ((ci = ckpt->bpriv) != NULL)
734 			__wt_block_ckpt_destroy(session, ci);
735 
736 	__wt_scr_free(session, &tmp);
737 	return (ret);
738 }
739 
740 /*
741  * __ckpt_update --
742  *	Update a checkpoint.
743  */
744 static int
__ckpt_update(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_CKPT * ckpt,WT_BLOCK_CKPT * ci,bool is_live)745 __ckpt_update(WT_SESSION_IMPL *session,
746     WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, bool is_live)
747 {
748 	WT_DECL_ITEM(tmp);
749 	WT_DECL_RET;
750 	uint8_t *endp;
751 
752 #ifdef HAVE_DIAGNOSTIC
753 	/* Check the extent list combinations for overlaps. */
754 	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
755 	WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
756 	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
757 #endif
758 	/*
759 	 * Write the checkpoint's alloc and discard extent lists.  After each
760 	 * write, remove any allocated blocks from the system's allocation
761 	 * list, checkpoint extent blocks don't appear on any extent lists.
762 	 */
763 	WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
764 	WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
765 
766 	/*
767 	 * We only write an avail list for the live system, other checkpoint's
768 	 * avail lists are static and never change.
769 	 *
770 	 * Write the avail list last so it reflects changes due to allocating
771 	 * blocks for the alloc and discard lists.  Second, when we write the
772 	 * live system's avail list, it's two lists: the current avail list
773 	 * plus the list of blocks to be made available when the new checkpoint
774 	 * completes.  We can't merge that second list into the real list yet,
775 	 * it's not truly available until the new checkpoint locations have been
776 	 * saved to the metadata.
777 	 */
778 	if (is_live)
779 		WT_RET(__wt_block_extlist_write(
780 		    session, block, &ci->avail, &ci->ckpt_avail));
781 
782 	/*
783 	 * Set the file size for the live system.
784 	 *
785 	 * !!!
786 	 * We do NOT set the file size when re-writing checkpoints because we
787 	 * want to test the checkpoint's blocks against a reasonable maximum
788 	 * file size during verification.  This is bad: imagine a checkpoint
789 	 * appearing early in the file, re-written, and then the checkpoint
790 	 * requires blocks at the end of the file, blocks after the listed file
791 	 * size.  If the application opens that checkpoint for writing
792 	 * (discarding subsequent checkpoints), we would truncate the file to
793 	 * the early chunk, discarding the re-written checkpoint information.
794 	 * The alternative, updating the file size has its own problems, in
795 	 * that case we'd work correctly, but we'd lose all of the blocks
796 	 * between the original checkpoint and the re-written checkpoint.
797 	 * Currently, there's no API to roll-forward intermediate checkpoints,
798 	 * if there ever is, this will need to be fixed.
799 	 */
800 	if (is_live)
801 		ci->file_size = block->size;
802 
803 	/*
804 	 * Copy the checkpoint information into the checkpoint array's address
805 	 * cookie.
806 	 */
807 	WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
808 	endp = ckpt->raw.mem;
809 	WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
810 	ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);
811 
812 	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
813 		WT_RET(__wt_scr_alloc(session, 0, &tmp));
814 		WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
815 		__wt_verbose(session, WT_VERB_CHECKPOINT,
816 		    "%s: create-checkpoint: %s: %s",
817 		    block->name, ckpt->name, (const char *)tmp->data);
818 	}
819 
820 err:	__wt_scr_free(session, &tmp);
821 	return (ret);
822 }
823 
824 /*
825  * __wt_block_checkpoint_resolve --
826  *	Resolve a checkpoint.
827  */
828 int
__wt_block_checkpoint_resolve(WT_SESSION_IMPL * session,WT_BLOCK * block,bool failed)829 __wt_block_checkpoint_resolve(
830     WT_SESSION_IMPL *session, WT_BLOCK *block, bool failed)
831 {
832 	WT_BLOCK_CKPT *ci;
833 	WT_DECL_RET;
834 
835 	ci = &block->live;
836 
837 	/*
838 	 * Resolve the checkpoint after our caller has written the checkpoint
839 	 * information to stable storage.
840 	 */
841 	__wt_spin_lock(session, &block->live_lock);
842 	switch (block->ckpt_state) {
843 	case WT_CKPT_INPROGRESS:
844 		/* Something went wrong, but it's recoverable at our level. */
845 		goto done;
846 	case WT_CKPT_NONE:
847 	case WT_CKPT_SALVAGE:
848 		__wt_err(session, EINVAL,
849 		    "%s: an unexpected checkpoint resolution: the checkpoint "
850 		    "was never started or completed, or configured for salvage",
851 		    block->name);
852 		ret = __wt_block_panic(session);
853 		break;
854 	case WT_CKPT_PANIC_ON_FAILURE:
855 		if (!failed)
856 			break;
857 		__wt_err(session, EINVAL,
858 		    "%s: the checkpoint failed, the system must restart",
859 		    block->name);
860 		ret = __wt_block_panic(session);
861 		break;
862 	}
863 	WT_ERR(ret);
864 
865 	if ((ret = __wt_block_extlist_merge(
866 	    session, block, &ci->ckpt_avail, &ci->avail)) != 0) {
867 		__wt_err(session, ret,
868 		    "%s: fatal checkpoint failure during extent list merge",
869 		    block->name);
870 		ret = __wt_block_panic(session);
871 	}
872 	__wt_spin_unlock(session, &block->live_lock);
873 
874 	/* Discard the lists remaining after the checkpoint call. */
875 	__wt_block_extlist_free(session, &ci->ckpt_avail);
876 	__wt_block_extlist_free(session, &ci->ckpt_alloc);
877 	__wt_block_extlist_free(session, &ci->ckpt_discard);
878 
879 	__wt_spin_lock(session, &block->live_lock);
880 done:	block->ckpt_state = WT_CKPT_NONE;
881 err:	__wt_spin_unlock(session, &block->live_lock);
882 
883 	return (ret);
884 }
885 
886 /*
887  * __ckpt_string --
888  *	Return a printable string representation of a checkpoint address cookie.
889  */
890 static int
__ckpt_string(WT_SESSION_IMPL * session,WT_BLOCK * block,const uint8_t * addr,WT_ITEM * buf)891 __ckpt_string(WT_SESSION_IMPL *session,
892     WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
893 {
894 	WT_BLOCK_CKPT *ci, _ci;
895 
896 	/* Initialize the checkpoint, crack the cookie. */
897 	ci = &_ci;
898 	WT_RET(__wt_block_ckpt_init(session, ci, "string"));
899 	WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));
900 
901 	WT_RET(__wt_buf_fmt(session, buf,
902 	    "version=%" PRIu8, ci->version));
903 	if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
904 		WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
905 	else
906 		WT_RET(__wt_buf_catfmt(session, buf,
907 		    ", root=[%"
908 		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
909 		    (uintmax_t)ci->root_offset,
910 		    (uintmax_t)(ci->root_offset + ci->root_size),
911 		    ci->root_size, ci->root_checksum));
912 	if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET)
913 		WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
914 	else
915 		WT_RET(__wt_buf_catfmt(session, buf,
916 		    ", alloc=[%"
917 		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
918 		    (uintmax_t)ci->alloc.offset,
919 		    (uintmax_t)(ci->alloc.offset + ci->alloc.size),
920 		    ci->alloc.size, ci->alloc.checksum));
921 	if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET)
922 		WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
923 	else
924 		WT_RET(__wt_buf_catfmt(session, buf,
925 		    ", avail=[%"
926 		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
927 		    (uintmax_t)ci->avail.offset,
928 		    (uintmax_t)(ci->avail.offset + ci->avail.size),
929 		    ci->avail.size, ci->avail.checksum));
930 	if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET)
931 		WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
932 	else
933 		WT_RET(__wt_buf_catfmt(session, buf,
934 		    ", discard=[%"
935 		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
936 		    (uintmax_t)ci->discard.offset,
937 		    (uintmax_t)(ci->discard.offset + ci->discard.size),
938 		    ci->discard.size, ci->discard.checksum));
939 	WT_RET(__wt_buf_catfmt(session, buf,
940 	    ", file size=%" PRIuMAX, (uintmax_t)ci->file_size));
941 
942 	__wt_block_ckpt_destroy(session, ci);
943 
944 	return (0);
945 }
946