1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
12 static int __ckpt_string(
13 WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
14 static int __ckpt_update(
15 WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, bool);
16
17 /*
18 * __wt_block_ckpt_init --
19 * Initialize a checkpoint structure.
20 */
21 int
__wt_block_ckpt_init(WT_SESSION_IMPL * session,WT_BLOCK_CKPT * ci,const char * name)22 __wt_block_ckpt_init(
23 WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
24 {
25 WT_CLEAR(*ci);
26
27 ci->version = WT_BM_CHECKPOINT_VERSION;
28 ci->root_offset = WT_BLOCK_INVALID_OFFSET;
29
30 WT_RET(__wt_block_extlist_init(
31 session, &ci->alloc, name, "alloc", false));
32 WT_RET(__wt_block_extlist_init(
33 session, &ci->avail, name, "avail", true));
34 WT_RET(__wt_block_extlist_init(
35 session, &ci->discard, name, "discard", false));
36 WT_RET(__wt_block_extlist_init(
37 session, &ci->ckpt_avail, name, "ckpt_avail", true));
38
39 return (0);
40 }
41
42 /*
43 * __wt_block_checkpoint_load --
44 * Load a checkpoint.
45 */
46 int
__wt_block_checkpoint_load(WT_SESSION_IMPL * session,WT_BLOCK * block,const uint8_t * addr,size_t addr_size,uint8_t * root_addr,size_t * root_addr_sizep,bool checkpoint)47 __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
48 const uint8_t *addr, size_t addr_size,
49 uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
50 {
51 WT_BLOCK_CKPT *ci, _ci;
52 WT_DECL_ITEM(tmp);
53 WT_DECL_RET;
54 uint8_t *endp;
55
56 /*
57 * Sometimes we don't find a root page (we weren't given a checkpoint,
58 * or the checkpoint was empty). In that case we return an empty root
59 * address, set that up now.
60 */
61 *root_addr_sizep = 0;
62
63 ci = NULL;
64
65 if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
66 if (addr != NULL) {
67 WT_ERR(__wt_scr_alloc(session, 0, &tmp));
68 WT_ERR(__ckpt_string(session, block, addr, tmp));
69 }
70 __wt_verbose(session, WT_VERB_CHECKPOINT,
71 "%s: load-checkpoint: %s", block->name,
72 addr == NULL ? "[Empty]" : (const char *)tmp->data);
73 }
74
75 /*
76 * There's a single checkpoint in the file that can be written, all of
77 * the others are read-only. We use the same initialization calls for
78 * readonly checkpoints, but the information doesn't persist.
79 */
80 if (checkpoint) {
81 ci = &_ci;
82 WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
83 } else {
84 /*
85 * We depend on the btree level for locking: things will go bad
86 * fast if we open the live system in two handles, or salvage,
87 * truncate or verify the live/running file.
88 */
89 #ifdef HAVE_DIAGNOSTIC
90 __wt_spin_lock(session, &block->live_lock);
91 WT_ASSERT(session, block->live_open == false);
92 block->live_open = true;
93 __wt_spin_unlock(session, &block->live_lock);
94 #endif
95 ci = &block->live;
96 WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
97 }
98
99 /*
100 * If the checkpoint has an on-disk root page, load it. Otherwise, size
101 * the file past the description information.
102 */
103 if (addr == NULL || addr_size == 0)
104 ci->file_size = block->allocsize;
105 else {
106 /* Crack the checkpoint cookie. */
107 WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
108
109 /* Verify sets up next. */
110 if (block->verify)
111 WT_ERR(__wt_verify_ckpt_load(session, block, ci));
112
113 /* Read any root page. */
114 if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
115 endp = root_addr;
116 WT_ERR(__wt_block_addr_to_buffer(block, &endp,
117 ci->root_offset, ci->root_size, ci->root_checksum));
118 *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
119 }
120
121 /*
122 * Rolling a checkpoint forward requires the avail list, the
123 * blocks from which we can allocate.
124 */
125 if (!checkpoint)
126 WT_ERR(__wt_block_extlist_read_avail(
127 session, block, &ci->avail, ci->file_size));
128 }
129
130 /*
131 * If the checkpoint can be written, that means anything written after
132 * the checkpoint is no longer interesting, truncate the file. Don't
133 * bother checking the avail list for a block at the end of the file,
134 * that was done when the checkpoint was first written (re-writing the
135 * checkpoint might possibly make it relevant here, but it's unlikely
136 * enough I don't bother).
137 */
138 if (!checkpoint)
139 WT_ERR(__wt_block_truncate(session, block, ci->file_size));
140
141 if (0) {
142 err: /*
143 * Don't call checkpoint-unload: unload does real work including
144 * file truncation. If we fail early enough that the checkpoint
145 * information isn't correct, bad things would happen. The only
146 * allocated memory was in the service of verify, clean that up.
147 */
148 if (block->verify)
149 WT_TRET(__wt_verify_ckpt_unload(session, block));
150 }
151
152 /* Checkpoints don't need the original information, discard it. */
153 if (checkpoint && ci != NULL)
154 __wt_block_ckpt_destroy(session, ci);
155
156 __wt_scr_free(session, &tmp);
157 return (ret);
158 }
159
160 /*
161 * __wt_block_checkpoint_unload --
162 * Unload a checkpoint.
163 */
164 int
__wt_block_checkpoint_unload(WT_SESSION_IMPL * session,WT_BLOCK * block,bool checkpoint)165 __wt_block_checkpoint_unload(
166 WT_SESSION_IMPL *session, WT_BLOCK *block, bool checkpoint)
167 {
168 WT_DECL_RET;
169
170 /* Verify cleanup. */
171 if (block->verify)
172 WT_TRET(__wt_verify_ckpt_unload(session, block));
173
174 /*
175 * If it's the live system, truncate to discard any extended blocks and
176 * discard the active extent lists. Hold the lock even though we're
177 * unloading the live checkpoint, there could be readers active in other
178 * checkpoints.
179 */
180 if (!checkpoint) {
181 WT_TRET(__wt_block_truncate(session, block, block->size));
182
183 __wt_spin_lock(session, &block->live_lock);
184 __wt_block_ckpt_destroy(session, &block->live);
185 #ifdef HAVE_DIAGNOSTIC
186 block->live_open = false;
187 #endif
188 __wt_spin_unlock(session, &block->live_lock);
189 }
190
191 return (ret);
192 }
193
194 /*
195 * __wt_block_ckpt_destroy --
196 * Clear a checkpoint structure.
197 */
198 void
__wt_block_ckpt_destroy(WT_SESSION_IMPL * session,WT_BLOCK_CKPT * ci)199 __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
200 {
201 /* Discard the extent lists. */
202 __wt_block_extlist_free(session, &ci->alloc);
203 __wt_block_extlist_free(session, &ci->avail);
204 __wt_block_extlist_free(session, &ci->discard);
205 __wt_block_extlist_free(session, &ci->ckpt_alloc);
206 __wt_block_extlist_free(session, &ci->ckpt_avail);
207 __wt_block_extlist_free(session, &ci->ckpt_discard);
208 }
209
210 /*
211 * __wt_block_checkpoint_start --
212 * Start a checkpoint.
213 */
214 int
__wt_block_checkpoint_start(WT_SESSION_IMPL * session,WT_BLOCK * block)215 __wt_block_checkpoint_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
216 {
217 WT_DECL_RET;
218
219 __wt_spin_lock(session, &block->live_lock);
220 switch (block->ckpt_state) {
221 case WT_CKPT_INPROGRESS:
222 case WT_CKPT_PANIC_ON_FAILURE:
223 case WT_CKPT_SALVAGE:
224 __wt_err(session, EINVAL,
225 "%s: an unexpected checkpoint start: the checkpoint "
226 "has already started or was configured for salvage",
227 block->name);
228 ret = __wt_block_panic(session);
229 break;
230 case WT_CKPT_NONE:
231 block->ckpt_state = WT_CKPT_INPROGRESS;
232 break;
233 }
234 __wt_spin_unlock(session, &block->live_lock);
235 return (ret);
236 }
237
238 /*
239 * __wt_block_checkpoint --
240 * Create a new checkpoint.
241 */
242 int
__wt_block_checkpoint(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_ITEM * buf,WT_CKPT * ckptbase,bool data_checksum)243 __wt_block_checkpoint(WT_SESSION_IMPL *session,
244 WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum)
245 {
246 WT_BLOCK_CKPT *ci;
247 WT_DECL_RET;
248
249 ci = &block->live;
250
251 /* Switch to first-fit allocation. */
252 __wt_block_configure_first_fit(block, true);
253
254 /*
255 * Write the root page: it's possible for there to be a checkpoint of
256 * an empty tree, in which case, we store an illegal root offset.
257 *
258 * !!!
259 * We happen to know that checkpoints are single-threaded above us in
260 * the btree engine. That's probably something we want to guarantee
261 * for any WiredTiger block manager.
262 */
263 if (buf == NULL) {
264 ci->root_offset = WT_BLOCK_INVALID_OFFSET;
265 ci->root_size = ci->root_checksum = 0;
266 } else
267 WT_ERR(__wt_block_write_off(session, block, buf,
268 &ci->root_offset, &ci->root_size, &ci->root_checksum,
269 data_checksum, true, false));
270
271 /*
272 * Checkpoints are potentially reading/writing/merging lots of blocks,
273 * pre-allocate structures for this thread's use.
274 */
275 WT_ERR(__wt_block_ext_prealloc(session, 250));
276
277 /* Process the checkpoint list, deleting and updating as required. */
278 ret = __ckpt_process(session, block, ckptbase);
279
280 /* Discard any excessive memory we've allocated. */
281 WT_TRET(__wt_block_ext_discard(session, 250));
282
283 /* Restore the original allocation plan. */
284 err: __wt_block_configure_first_fit(block, false);
285
286 return (ret);
287 }
288
289 /*
290 * __ckpt_extlist_read --
291 * Read a checkpoints extent lists and copy
292 */
293 static int
__ckpt_extlist_read(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_CKPT * ckpt)294 __ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
295 {
296 WT_BLOCK_CKPT *ci;
297
298 /*
299 * Allocate a checkpoint structure, crack the cookie and read the
300 * checkpoint's extent lists.
301 *
302 * Ignore the avail list: checkpoint avail lists are only useful if we
303 * are rolling forward from the particular checkpoint and they represent
304 * our best understanding of what blocks can be allocated. If we are
305 * not operating on the live checkpoint, subsequent checkpoints might
306 * have allocated those blocks, and the avail list is useless. We don't
307 * discard it, because it is useful as part of verification, but we
308 * don't re-write it either.
309 */
310 WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
311
312 ci = ckpt->bpriv;
313 WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
314 WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
315 WT_RET(__wt_block_extlist_read(
316 session, block, &ci->alloc, ci->file_size));
317 WT_RET(__wt_block_extlist_read(
318 session, block, &ci->discard, ci->file_size));
319
320 return (0);
321 }
322
323 /*
324 * __ckpt_extlist_fblocks --
325 * If a checkpoint's extent list is going away, free its blocks.
326 */
327 static int
__ckpt_extlist_fblocks(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_EXTLIST * el)328 __ckpt_extlist_fblocks(
329 WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
330 {
331 if (el->offset == WT_BLOCK_INVALID_OFFSET)
332 return (0);
333
334 /*
335 * Free blocks used to write checkpoint extents into the live system's
336 * checkpoint avail list (they were never on any alloc list). Do not
337 * use the live system's avail list because that list is used to decide
338 * if the file can be truncated, and we can't truncate any part of the
339 * file that contains a previous checkpoint's extents.
340 */
341 return (__wt_block_insert_ext(
342 session, block, &block->live.ckpt_avail, el->offset, el->size));
343 }
344
#ifdef HAVE_DIAGNOSTIC
/*
 * __ckpt_verify --
 *	Diagnostic code, confirm we get what we expect in the checkpoint array.
 *
 * Returns zero if every entry has an expected flag combination, otherwise
 * the result of reporting the first unexpected flags value.
 */
static int
__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
{
	WT_CKPT *ckpt;

	/*
	 * Fast check that we're seeing what we expect to see: some number of
	 * checkpoints to add, delete or ignore, terminated by a new checkpoint.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		/* Entries to ignore or delete, fake or not, are fine. */
		if (ckpt->flags == 0 ||
		    ckpt->flags == WT_CKPT_DELETE ||
		    ckpt->flags == (WT_CKPT_DELETE | WT_CKPT_FAKE) ||
		    ckpt->flags == WT_CKPT_FAKE)
			continue;

		/* An added checkpoint is only legal as the last entry. */
		if (ckpt->flags == WT_CKPT_ADD && ckpt[1].name == NULL)
			continue;

		return (__wt_illegal_value(session, ckpt->flags));
	}
	return (0);
}
#endif
381
382 /*
383 * __ckpt_process --
384 * Process the list of checkpoints.
385 */
386 static int
__ckpt_process(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_CKPT * ckptbase)387 __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
388 {
389 WT_BLOCK_CKPT *a, *b, *ci;
390 WT_CKPT *ckpt, *next_ckpt;
391 WT_DECL_ITEM(tmp);
392 WT_DECL_RET;
393 uint64_t ckpt_size;
394 bool deleting, fatal, locked;
395
396 ci = &block->live;
397 fatal = locked = false;
398
399 #ifdef HAVE_DIAGNOSTIC
400 WT_RET(__ckpt_verify(session, ckptbase));
401 #endif
402
403 /*
404 * Checkpoints are a two-step process: first, write a new checkpoint to
405 * disk (including all the new extent lists for modified checkpoints
406 * and the live system). As part of this, create a list of file blocks
407 * newly available for reallocation, based on checkpoints being deleted.
408 * We then return the locations of the new checkpoint information to our
409 * caller. Our caller has to write that information into some kind of
410 * stable storage, and once that's done, we can actually allocate from
411 * that list of newly available file blocks. (We can't allocate from
412 * that list immediately because the allocation might happen before our
413 * caller saves the new checkpoint information, and if we crashed before
414 * the new checkpoint location was saved, we'd have overwritten blocks
415 * still referenced by checkpoints in the system.) In summary, there is
416 * a second step: after our caller saves the checkpoint information, we
417 * are called to add the newly available blocks into the live system's
418 * available list.
419 *
420 * This function is the first step, the second step is in the resolve
421 * function.
422 *
423 * If we're called to checkpoint the same file twice (without the second
424 * resolution step), or re-entered for any reason, it's an error in our
425 * caller, and our choices are all bad: leak blocks or potentially crash
426 * with our caller not yet having saved previous checkpoint information
427 * to stable storage.
428 */
429 __wt_spin_lock(session, &block->live_lock);
430 switch (block->ckpt_state) {
431 case WT_CKPT_INPROGRESS:
432 block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE;
433 break;
434 case WT_CKPT_NONE:
435 case WT_CKPT_PANIC_ON_FAILURE:
436 __wt_err(session, EINVAL,
437 "%s: an unexpected checkpoint attempt: the checkpoint "
438 "was never started or has already completed",
439 block->name);
440 ret = __wt_block_panic(session);
441 break;
442 case WT_CKPT_SALVAGE:
443 /* Salvage doesn't use the standard checkpoint APIs. */
444 break;
445 }
446 __wt_spin_unlock(session, &block->live_lock);
447 WT_RET(ret);
448
449 /*
450 * Extents newly available as a result of deleting previous checkpoints
451 * are added to a list of extents. The list should be empty, but as
452 * described above, there is no "free the checkpoint information" call
453 * into the block manager; if there was an error in an upper level that
454 * resulted in some previous checkpoint never being resolved, the list
455 * may not be empty. We should have caught that with the "checkpoint
456 * in progress" test, but it doesn't cost us anything to be cautious.
457 *
458 * We free the checkpoint's allocation and discard extent lists as part
459 * of the resolution step, not because they're needed at that time, but
460 * because it's potentially a lot of work, and waiting allows the btree
461 * layer to continue eviction sooner. As for the checkpoint-available
462 * list, make sure they get cleaned out.
463 */
464 __wt_block_extlist_free(session, &ci->ckpt_avail);
465 WT_RET(__wt_block_extlist_init(
466 session, &ci->ckpt_avail, "live", "ckpt_avail", true));
467 __wt_block_extlist_free(session, &ci->ckpt_alloc);
468 __wt_block_extlist_free(session, &ci->ckpt_discard);
469
470 /*
471 * To delete a checkpoint, we'll need checkpoint information for it and
472 * the subsequent checkpoint into which it gets rolled; read them from
473 * disk before we lock things down.
474 */
475 deleting = false;
476 WT_CKPT_FOREACH(ckptbase, ckpt) {
477 if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
478 !F_ISSET(ckpt, WT_CKPT_DELETE))
479 continue;
480 deleting = true;
481
482 /*
483 * Read the checkpoint and next checkpoint extent lists if we
484 * haven't already read them (we may have already read these
485 * extent blocks if there is more than one deleted checkpoint).
486 */
487 if (ckpt->bpriv == NULL)
488 WT_ERR(__ckpt_extlist_read(session, block, ckpt));
489
490 for (next_ckpt = ckpt + 1;; ++next_ckpt)
491 if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
492 break;
493
494 /*
495 * The "next" checkpoint may be the live tree which has no
496 * extent blocks to read.
497 */
498 if (next_ckpt->bpriv == NULL &&
499 !F_ISSET(next_ckpt, WT_CKPT_ADD))
500 WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
501 }
502
503 /*
504 * Failures are now fatal: we can't currently back out the merge of any
505 * deleted checkpoint extent lists into the live system's extent lists,
506 * so continuing after error would leave the live system's extent lists
507 * corrupted for any subsequent checkpoint (and potentially, should a
508 * subsequent checkpoint succeed, for recovery).
509 */
510 fatal = true;
511
512 /*
513 * Hold a lock so the live extent lists and the file size can't change
514 * underneath us. I suspect we'll tighten this if checkpoints take too
515 * much time away from real work: we read the historic checkpoint
516 * information without a lock, but we could also merge and re-write the
517 * deleted and merged checkpoint information without a lock, except for
518 * the final merge of ranges into the live tree.
519 */
520 __wt_spin_lock(session, &block->live_lock);
521 locked = true;
522
523 /*
524 * We've allocated our last page, update the checkpoint size. We need
525 * to calculate the live system's checkpoint size before merging
526 * checkpoint allocation and discard information from the checkpoints
527 * we're deleting, those operations change the underlying byte counts.
528 */
529 ckpt_size = ci->ckpt_size;
530 ckpt_size += ci->alloc.bytes;
531 ckpt_size -= ci->discard.bytes;
532
533 /* Skip the additional processing if we aren't deleting checkpoints. */
534 if (!deleting)
535 goto live_update;
536
537 /*
538 * Delete any no-longer-needed checkpoints: we do this first as it frees
539 * blocks to the live lists, and the freed blocks will then be included
540 * when writing the live extent lists.
541 */
542 WT_CKPT_FOREACH(ckptbase, ckpt) {
543 if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
544 !F_ISSET(ckpt, WT_CKPT_DELETE))
545 continue;
546
547 if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
548 if (tmp == NULL)
549 WT_ERR(__wt_scr_alloc(session, 0, &tmp));
550 WT_ERR(__ckpt_string(
551 session, block, ckpt->raw.data, tmp));
552 __wt_verbose(session, WT_VERB_CHECKPOINT,
553 "%s: delete-checkpoint: %s: %s",
554 block->name, ckpt->name, (const char *)tmp->data);
555 }
556 /*
557 * Find the checkpoint into which we'll roll this checkpoint's
558 * blocks: it's the next real checkpoint in the list, and it
559 * better have been read in (if it's not the add slot).
560 */
561 for (next_ckpt = ckpt + 1;; ++next_ckpt)
562 if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
563 break;
564
565 /*
566 * Set the from/to checkpoint structures, where the "to" value
567 * may be the live tree.
568 */
569 a = ckpt->bpriv;
570 if (F_ISSET(next_ckpt, WT_CKPT_ADD))
571 b = &block->live;
572 else
573 b = next_ckpt->bpriv;
574
575 /*
576 * Free the root page: there's nothing special about this free,
577 * the root page is allocated using normal rules, that is, it
578 * may have been taken from the avail list, and was entered on
579 * the live system's alloc list at that time. We free it into
580 * the checkpoint's discard list, however, not the live system's
581 * list because it appears on the checkpoint's alloc list and so
582 * must be paired in the checkpoint.
583 */
584 if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
585 WT_ERR(__wt_block_insert_ext(session, block,
586 &a->discard, a->root_offset, a->root_size));
587
588 /*
589 * Free the blocks used to hold the "from" checkpoint's extent
590 * lists, including the avail list.
591 */
592 WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
593 WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
594 WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));
595
596 /*
597 * Roll the "from" alloc and discard extent lists into the "to"
598 * checkpoint's lists.
599 */
600 if (a->alloc.entries != 0)
601 WT_ERR(__wt_block_extlist_merge(
602 session, block, &a->alloc, &b->alloc));
603 if (a->discard.entries != 0)
604 WT_ERR(__wt_block_extlist_merge(
605 session, block, &a->discard, &b->discard));
606
607 /*
608 * If the "to" checkpoint is also being deleted, we're done with
609 * it, it's merged into some other checkpoint in the next loop.
610 * This means the extent lists may aggregate over a number of
611 * checkpoints, but that's OK, they're disjoint sets of ranges.
612 */
613 if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
614 continue;
615
616 /*
617 * Find blocks for re-use: wherever the "to" checkpoint's
618 * allocate and discard lists overlap, move the range to
619 * the live system's checkpoint available list.
620 */
621 WT_ERR(__wt_block_extlist_overlap(session, block, b));
622
623 /*
624 * If we're updating the live system's information, we're done.
625 */
626 if (F_ISSET(next_ckpt, WT_CKPT_ADD))
627 continue;
628
629 /*
630 * We have to write the "to" checkpoint's extent lists out in
631 * new blocks, and update its cookie.
632 *
633 * Free the blocks used to hold the "to" checkpoint's extent
634 * lists; don't include the avail list, it's not changing.
635 */
636 WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
637 WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
638
639 F_SET(next_ckpt, WT_CKPT_UPDATE);
640 }
641
642 /* Update checkpoints marked for update. */
643 WT_CKPT_FOREACH(ckptbase, ckpt)
644 if (F_ISSET(ckpt, WT_CKPT_UPDATE))
645 WT_ERR(__ckpt_update(
646 session, block, ckpt, ckpt->bpriv, false));
647
648 live_update:
649 /* Truncate the file if that's possible. */
650 WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));
651
652 /* Update the final, added checkpoint based on the live system. */
653 WT_CKPT_FOREACH(ckptbase, ckpt)
654 if (F_ISSET(ckpt, WT_CKPT_ADD)) {
655 /*
656 * !!!
657 * Our caller wants the final checkpoint size. Setting
658 * the size here violates layering, but the alternative
659 * is a call for the btree layer to crack the checkpoint
660 * cookie into its components, and that's a fair amount
661 * of work.
662 */
663 ckpt->ckpt_size = ckpt_size;
664
665 /*
666 * Set the rolling checkpoint size for the live system.
667 * The current size includes the current checkpoint's
668 * root page size (root pages are on the checkpoint's
669 * block allocation list as root pages are allocated
670 * with the usual block allocation functions). That's
671 * correct, but we don't want to include it in the size
672 * for the next checkpoint.
673 */
674 ckpt_size -= ci->root_size;
675
676 /*
677 * Additionally, we had a bug for awhile where the live
678 * checkpoint size grew without bound. We can't sanity
679 * check the value, that would require walking the tree
680 * as part of the checkpoint. Bound any bug at the size
681 * of the file.
682 * It isn't practical to assert that the value is within
683 * bounds since databases created with older versions
684 * of WiredTiger (2.8.0) would likely see an error.
685 */
686 ci->ckpt_size =
687 WT_MIN(ckpt_size, (uint64_t)block->size);
688
689 WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
690 }
691
692 /*
693 * Reset the live system's alloc and discard extent lists, leave the
694 * avail list alone. This includes freeing a lot of extents, so do it
695 * outside of the system's lock by copying and resetting the original,
696 * then doing the work later.
697 */
698 ci->ckpt_alloc = ci->alloc;
699 WT_ERR(__wt_block_extlist_init(
700 session, &ci->alloc, "live", "alloc", false));
701 ci->ckpt_discard = ci->discard;
702 WT_ERR(__wt_block_extlist_init(
703 session, &ci->discard, "live", "discard", false));
704
705 #ifdef HAVE_DIAGNOSTIC
706 /*
707 * The first checkpoint in the system should always have an empty
708 * discard list. If we've read that checkpoint and/or created it,
709 * check.
710 */
711 WT_CKPT_FOREACH(ckptbase, ckpt)
712 if (!F_ISSET(ckpt, WT_CKPT_DELETE))
713 break;
714 if ((a = ckpt->bpriv) == NULL)
715 a = &block->live;
716 if (a->discard.entries != 0)
717 WT_ERR_MSG(session, WT_ERROR,
718 "first checkpoint incorrectly has blocks on the discard "
719 "list");
720 #endif
721
722 err: if (ret != 0 && fatal) {
723 __wt_err(session, ret,
724 "%s: fatal checkpoint failure", block->name);
725 ret = __wt_block_panic(session);
726 }
727
728 if (locked)
729 __wt_spin_unlock(session, &block->live_lock);
730
731 /* Discard any checkpoint information we loaded. */
732 WT_CKPT_FOREACH(ckptbase, ckpt)
733 if ((ci = ckpt->bpriv) != NULL)
734 __wt_block_ckpt_destroy(session, ci);
735
736 __wt_scr_free(session, &tmp);
737 return (ret);
738 }
739
740 /*
741 * __ckpt_update --
742 * Update a checkpoint.
743 */
744 static int
__ckpt_update(WT_SESSION_IMPL * session,WT_BLOCK * block,WT_CKPT * ckpt,WT_BLOCK_CKPT * ci,bool is_live)745 __ckpt_update(WT_SESSION_IMPL *session,
746 WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, bool is_live)
747 {
748 WT_DECL_ITEM(tmp);
749 WT_DECL_RET;
750 uint8_t *endp;
751
752 #ifdef HAVE_DIAGNOSTIC
753 /* Check the extent list combinations for overlaps. */
754 WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
755 WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
756 WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
757 #endif
758 /*
759 * Write the checkpoint's alloc and discard extent lists. After each
760 * write, remove any allocated blocks from the system's allocation
761 * list, checkpoint extent blocks don't appear on any extent lists.
762 */
763 WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
764 WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
765
766 /*
767 * We only write an avail list for the live system, other checkpoint's
768 * avail lists are static and never change.
769 *
770 * Write the avail list last so it reflects changes due to allocating
771 * blocks for the alloc and discard lists. Second, when we write the
772 * live system's avail list, it's two lists: the current avail list
773 * plus the list of blocks to be made available when the new checkpoint
774 * completes. We can't merge that second list into the real list yet,
775 * it's not truly available until the new checkpoint locations have been
776 * saved to the metadata.
777 */
778 if (is_live)
779 WT_RET(__wt_block_extlist_write(
780 session, block, &ci->avail, &ci->ckpt_avail));
781
782 /*
783 * Set the file size for the live system.
784 *
785 * !!!
786 * We do NOT set the file size when re-writing checkpoints because we
787 * want to test the checkpoint's blocks against a reasonable maximum
788 * file size during verification. This is bad: imagine a checkpoint
789 * appearing early in the file, re-written, and then the checkpoint
790 * requires blocks at the end of the file, blocks after the listed file
791 * size. If the application opens that checkpoint for writing
792 * (discarding subsequent checkpoints), we would truncate the file to
793 * the early chunk, discarding the re-written checkpoint information.
794 * The alternative, updating the file size has its own problems, in
795 * that case we'd work correctly, but we'd lose all of the blocks
796 * between the original checkpoint and the re-written checkpoint.
797 * Currently, there's no API to roll-forward intermediate checkpoints,
798 * if there ever is, this will need to be fixed.
799 */
800 if (is_live)
801 ci->file_size = block->size;
802
803 /*
804 * Copy the checkpoint information into the checkpoint array's address
805 * cookie.
806 */
807 WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
808 endp = ckpt->raw.mem;
809 WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
810 ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);
811
812 if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
813 WT_RET(__wt_scr_alloc(session, 0, &tmp));
814 WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
815 __wt_verbose(session, WT_VERB_CHECKPOINT,
816 "%s: create-checkpoint: %s: %s",
817 block->name, ckpt->name, (const char *)tmp->data);
818 }
819
820 err: __wt_scr_free(session, &tmp);
821 return (ret);
822 }
823
824 /*
825 * __wt_block_checkpoint_resolve --
826 * Resolve a checkpoint.
827 */
828 int
__wt_block_checkpoint_resolve(WT_SESSION_IMPL * session,WT_BLOCK * block,bool failed)829 __wt_block_checkpoint_resolve(
830 WT_SESSION_IMPL *session, WT_BLOCK *block, bool failed)
831 {
832 WT_BLOCK_CKPT *ci;
833 WT_DECL_RET;
834
835 ci = &block->live;
836
837 /*
838 * Resolve the checkpoint after our caller has written the checkpoint
839 * information to stable storage.
840 */
841 __wt_spin_lock(session, &block->live_lock);
842 switch (block->ckpt_state) {
843 case WT_CKPT_INPROGRESS:
844 /* Something went wrong, but it's recoverable at our level. */
845 goto done;
846 case WT_CKPT_NONE:
847 case WT_CKPT_SALVAGE:
848 __wt_err(session, EINVAL,
849 "%s: an unexpected checkpoint resolution: the checkpoint "
850 "was never started or completed, or configured for salvage",
851 block->name);
852 ret = __wt_block_panic(session);
853 break;
854 case WT_CKPT_PANIC_ON_FAILURE:
855 if (!failed)
856 break;
857 __wt_err(session, EINVAL,
858 "%s: the checkpoint failed, the system must restart",
859 block->name);
860 ret = __wt_block_panic(session);
861 break;
862 }
863 WT_ERR(ret);
864
865 if ((ret = __wt_block_extlist_merge(
866 session, block, &ci->ckpt_avail, &ci->avail)) != 0) {
867 __wt_err(session, ret,
868 "%s: fatal checkpoint failure during extent list merge",
869 block->name);
870 ret = __wt_block_panic(session);
871 }
872 __wt_spin_unlock(session, &block->live_lock);
873
874 /* Discard the lists remaining after the checkpoint call. */
875 __wt_block_extlist_free(session, &ci->ckpt_avail);
876 __wt_block_extlist_free(session, &ci->ckpt_alloc);
877 __wt_block_extlist_free(session, &ci->ckpt_discard);
878
879 __wt_spin_lock(session, &block->live_lock);
880 done: block->ckpt_state = WT_CKPT_NONE;
881 err: __wt_spin_unlock(session, &block->live_lock);
882
883 return (ret);
884 }
885
886 /*
887 * __ckpt_string --
888 * Return a printable string representation of a checkpoint address cookie.
889 */
890 static int
__ckpt_string(WT_SESSION_IMPL * session,WT_BLOCK * block,const uint8_t * addr,WT_ITEM * buf)891 __ckpt_string(WT_SESSION_IMPL *session,
892 WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
893 {
894 WT_BLOCK_CKPT *ci, _ci;
895
896 /* Initialize the checkpoint, crack the cookie. */
897 ci = &_ci;
898 WT_RET(__wt_block_ckpt_init(session, ci, "string"));
899 WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));
900
901 WT_RET(__wt_buf_fmt(session, buf,
902 "version=%" PRIu8, ci->version));
903 if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
904 WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
905 else
906 WT_RET(__wt_buf_catfmt(session, buf,
907 ", root=[%"
908 PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
909 (uintmax_t)ci->root_offset,
910 (uintmax_t)(ci->root_offset + ci->root_size),
911 ci->root_size, ci->root_checksum));
912 if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET)
913 WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
914 else
915 WT_RET(__wt_buf_catfmt(session, buf,
916 ", alloc=[%"
917 PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
918 (uintmax_t)ci->alloc.offset,
919 (uintmax_t)(ci->alloc.offset + ci->alloc.size),
920 ci->alloc.size, ci->alloc.checksum));
921 if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET)
922 WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
923 else
924 WT_RET(__wt_buf_catfmt(session, buf,
925 ", avail=[%"
926 PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
927 (uintmax_t)ci->avail.offset,
928 (uintmax_t)(ci->avail.offset + ci->avail.size),
929 ci->avail.size, ci->avail.checksum));
930 if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET)
931 WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
932 else
933 WT_RET(__wt_buf_catfmt(session, buf,
934 ", discard=[%"
935 PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
936 (uintmax_t)ci->discard.offset,
937 (uintmax_t)(ci->discard.offset + ci->discard.size),
938 ci->discard.size, ci->discard.checksum));
939 WT_RET(__wt_buf_catfmt(session, buf,
940 ", file size=%" PRIuMAX, (uintmax_t)ci->file_size));
941
942 __wt_block_ckpt_destroy(session, ci);
943
944 return (0);
945 }
946