1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
12 static int __btree_get_last_recno(WT_SESSION_IMPL *);
13 static int __btree_page_sizes(WT_SESSION_IMPL *);
14 static int __btree_preload(WT_SESSION_IMPL *);
15 static int __btree_tree_open_empty(WT_SESSION_IMPL *, bool);
16
17 /*
18 * __btree_clear --
19 * Clear a Btree, either on handle discard or re-open.
20 */
21 static int
__btree_clear(WT_SESSION_IMPL * session)22 __btree_clear(WT_SESSION_IMPL *session)
23 {
24 WT_BTREE *btree;
25 WT_DECL_RET;
26
27 btree = S2BT(session);
28
29 /*
30 * If the tree hasn't gone through an open/close cycle, there's no
31 * cleanup to be done.
32 */
33 if (!F_ISSET(btree, WT_BTREE_CLOSED))
34 return (0);
35
36 /* Close the Huffman tree. */
37 __wt_btree_huffman_close(session);
38
39 /* Terminate any associated collator. */
40 if (btree->collator_owned && btree->collator->terminate != NULL)
41 WT_TRET(btree->collator->terminate(
42 btree->collator, &session->iface));
43
44 /* Destroy locks. */
45 __wt_rwlock_destroy(session, &btree->ovfl_lock);
46 __wt_spin_destroy(session, &btree->flush_lock);
47
48 /* Free allocated memory. */
49 __wt_free(session, btree->key_format);
50 __wt_free(session, btree->value_format);
51
52 return (ret);
53 }
54
/*
 * __wt_btree_open --
 *	Open a Btree: configure the handle, connect the block manager and
 *	load (or create) the root page.
 */
int
__wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CKPT ckpt;
	WT_CONFIG_ITEM cval;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	size_t root_addr_size;
	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
	const char *filename;
	bool creation, forced_salvage;

	btree = S2BT(session);
	dhandle = session->dhandle;

	/*
	 * This may be a re-open, clean up the btree structure.
	 * Clear the fields that don't persist across a re-open.
	 * Clear all flags other than the operation flags (which are set by the
	 * connection handle software that called us).
	 */
	WT_RET(__btree_clear(session));
	memset(btree, 0, WT_BTREE_CLEAR_SIZE);
	F_CLR(btree, ~WT_BTREE_SPECIAL_FLAGS);

	/* Set the data handle first, our called functions reasonably use it. */
	btree->dhandle = dhandle;

	/* Checkpoint and verify files are readonly. */
	if (dhandle->checkpoint != NULL || F_ISSET(btree, WT_BTREE_VERIFY) ||
	    F_ISSET(S2C(session), WT_CONN_READONLY))
		F_SET(btree, WT_BTREE_READONLY);

	/* Get the checkpoint information for this name/checkpoint pair. */
	WT_CLEAR(ckpt);
	WT_RET(__wt_meta_checkpoint(
	    session, dhandle->name, dhandle->checkpoint, &ckpt));

	/*
	 * Bulk-load is only permitted on newly created files, not any empty
	 * file -- see the checkpoint code for a discussion.
	 */
	creation = ckpt.raw.size == 0;
	if (!creation && F_ISSET(btree, WT_BTREE_BULK))
		WT_ERR_MSG(session, EINVAL,
		    "bulk-load is only supported on newly created objects");

	/* Handle salvage configuration. */
	forced_salvage = false;
	if (F_ISSET(btree, WT_BTREE_SALVAGE)) {
		WT_ERR(__wt_config_gets(session, op_cfg, "force", &cval));
		forced_salvage = cval.val != 0;
	}

	/* Initialize and configure the WT_BTREE structure. */
	WT_ERR(__btree_conf(session, &ckpt));

	/*
	 * We could be a re-open of a table that was put in the lookaside
	 * dropped list. Remove our id from that list.
	 */
	__wt_las_remove_dropped(session);

	/* Connect to the underlying block manager. */
	filename = dhandle->name;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");

	WT_ERR(__wt_block_manager_open(session, filename, dhandle->cfg,
	    forced_salvage, F_ISSET(btree, WT_BTREE_READONLY),
	    btree->allocsize, &btree->bm));
	bm = btree->bm;

	/*
	 * !!!
	 * As part of block-manager configuration, we need to return the maximum
	 * sized address cookie that a block manager will ever return. There's
	 * a limit of WT_BTREE_MAX_ADDR_COOKIE, but at 255B, it's too large for
	 * a Btree with 512B internal pages. The default block manager packs
	 * a wt_off_t and 2 uint32_t's into its cookie, so there's no problem
	 * now, but when we create a block manager extension API, we need some
	 * way to consider the block manager's maximum cookie size versus the
	 * minimum Btree internal node size.
	 */
	btree->block_header = bm->block_header(bm);

	/*
	 * Open the specified checkpoint unless it's a special command (special
	 * commands are responsible for loading their own checkpoints, if any).
	 */
	if (!F_ISSET(btree,
	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
		/*
		 * There are two reasons to load an empty tree rather than a
		 * checkpoint: either there is no checkpoint (the file is
		 * being created), or the load call returns no root page (the
		 * checkpoint is for an empty file).
		 */
		WT_ERR(bm->checkpoint_load(bm, session,
		    ckpt.raw.data, ckpt.raw.size,
		    root_addr, &root_addr_size,
		    F_ISSET(btree, WT_BTREE_READONLY)));
		if (creation || root_addr_size == 0)
			WT_ERR(__btree_tree_open_empty(session, creation));
		else {
			WT_ERR(__wt_btree_tree_open(
			    session, root_addr, root_addr_size));

			/*
			 * Rebalance uses the cache, but only wants the root
			 * page, nothing else.
			 */
			if (!F_ISSET(btree, WT_BTREE_REBALANCE)) {
				/* Warm the cache, if possible. */
				WT_WITH_PAGE_INDEX(session,
				    ret = __btree_preload(session));
				WT_ERR(ret);

				/*
				 * Get the last record number in a column-store
				 * file.
				 */
				if (btree->type != BTREE_ROW)
					WT_ERR(__btree_get_last_recno(session));
			}
		}
	}

	/*
	 * Eviction ignores trees until the handle's open flag is set, configure
	 * eviction before that happens.
	 *
	 * Files that can still be bulk-loaded cannot be evicted.
	 * Permanently cache-resident files can never be evicted.
	 * Special operations don't enable eviction. The underlying commands may
	 * turn on eviction (for example, verify turns on eviction while working
	 * a file to keep from consuming the cache), but it's their decision. If
	 * an underlying command reconfigures eviction, it must either clear the
	 * evict-disabled-open flag or restore the eviction configuration when
	 * finished so that handle close behaves correctly.
	 */
	if (btree->original ||
	    F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE |
	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
		WT_ERR(__wt_evict_file_exclusive_on(session));
		btree->evict_disabled_open = true;
	}

	if (0) {
		/* Error path: close the handle to undo any partial open. */
err:		WT_TRET(__wt_btree_close(session));
	}
	/* The checkpoint information is only needed during open. */
	__wt_meta_checkpoint_free(session, &ckpt);

	return (ret);
}
216
/*
 * __wt_btree_close --
 *	Close a Btree: free backing resources but leave the in-memory
 *	structure intact for a possible re-open or later discard.
 */
int
__wt_btree_close(WT_SESSION_IMPL *session)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;

	btree = S2BT(session);

	/*
	 * The close process isn't the same as discarding the handle: we might
	 * re-open the handle, which isn't a big deal, but the backing blocks
	 * for the handle may not yet have been discarded from the cache, and
	 * eviction uses WT_BTREE structure elements. Free backing resources
	 * but leave the rest alone, and we'll discard the structure when we
	 * discard the data handle.
	 *
	 * Handles can be closed multiple times, ignore all but the first.
	 */
	if (F_ISSET(btree, WT_BTREE_CLOSED))
		return (0);
	F_SET(btree, WT_BTREE_CLOSED);

	/*
	 * If closing a tree let sweep drop lookaside entries for it.
	 */
	if (F_ISSET(S2C(session), WT_CONN_LOOKASIDE_OPEN) &&
	    btree->lookaside_entries) {
		/*
		 * Lookaside entries never belong to the metadata or to the
		 * lookaside table itself.
		 */
		WT_ASSERT(session, !WT_IS_METADATA(btree->dhandle) &&
		    !F_ISSET(btree, WT_BTREE_LOOKASIDE));
		WT_TRET(__wt_las_save_dropped(session));
	}

	/*
	 * If we turned eviction off and never turned it back on, do that now,
	 * otherwise the counter will be off.
	 */
	if (btree->evict_disabled_open) {
		btree->evict_disabled_open = false;
		__wt_evict_file_exclusive_off(session);
	}

	/* Discard any underlying block manager resources. */
	if ((bm = btree->bm) != NULL) {
		btree->bm = NULL;

		/* Unload the checkpoint, unless it's a special command. */
		if (!F_ISSET(btree,
		    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
			WT_TRET(bm->checkpoint_unload(bm, session));

		/* Close the underlying block manager reference. */
		WT_TRET(bm->close(bm, session));
	}

	return (ret);
}
278
/*
 * __wt_btree_discard --
 *	Discard a Btree: final cleanup, freeing the structure and
 *	disconnecting it from the data handle.
 */
int
__wt_btree_discard(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_DECL_RET;

	/* Release any resources still held by the tree. */
	ret = __btree_clear(session);

	/* Overwrite the structure's memory, then free it. */
	btree = S2BT(session);
	__wt_overwrite_and_free(session, btree);
	session->dhandle->handle = NULL;

	return (ret);
}
297
/*
 * __btree_conf --
 *	Configure a WT_BTREE structure from the data handle's configuration
 *	and the checkpoint being opened.
 */
static int
__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
{
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval, enc, keyid, metadata;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	int64_t maj_version, min_version;
	uint32_t bitcnt;
	const char **cfg, *enc_cfg[] = { NULL, NULL };
	bool fixed;

	btree = S2BT(session);
	cfg = btree->dhandle->cfg;
	conn = S2C(session);

	/* Dump out format information. */
	if (WT_VERBOSE_ISSET(session, WT_VERB_VERSION)) {
		WT_RET(__wt_config_gets(session, cfg, "version.major", &cval));
		maj_version = cval.val;
		WT_RET(__wt_config_gets(session, cfg, "version.minor", &cval));
		min_version = cval.val;
		__wt_verbose(session, WT_VERB_VERSION,
		    "%" PRId64 ".%" PRId64, maj_version, min_version);
	}

	/* Get the file ID. */
	WT_RET(__wt_config_gets(session, cfg, "id", &cval));
	btree->id = (uint32_t)cval.val;

	/* Validate file types and check the data format plan. */
	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_RET(__wt_struct_confchk(session, &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		btree->type = BTREE_COL_VAR;
	else
		btree->type = BTREE_ROW;
	WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->key_format));

	WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_RET(__wt_struct_confchk(session, &cval));
	WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format));

	/* Row-store key comparison and key gap for prefix compression. */
	if (btree->type == BTREE_ROW) {
		WT_RET(__wt_config_gets_none(session, cfg, "collator", &cval));
		if (cval.len != 0) {
			WT_RET(__wt_config_gets(
			    session, cfg, "app_metadata", &metadata));
			WT_RET(__wt_collator_config(
			    session, btree->dhandle->name, &cval, &metadata,
			    &btree->collator, &btree->collator_owned));
		}

		WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval));
		btree->key_gap = (uint32_t)cval.val;
	}

	/*
	 * Column-store: check for fixed-size data. Note cval still holds the
	 * value format parsed above.
	 */
	if (btree->type == BTREE_COL_VAR) {
		WT_RET(__wt_struct_check(
		    session, cval.str, cval.len, &fixed, &bitcnt));
		if (fixed) {
			if (bitcnt == 0 || bitcnt > 8)
				WT_RET_MSG(session, EINVAL,
				    "fixed-width field sizes must be greater "
				    "than 0 and less than or equal to 8");
			btree->bitcnt = (uint8_t)bitcnt;
			btree->type = BTREE_COL_FIX;
		}
	}

	/* Page sizes */
	WT_RET(__btree_page_sizes(session));

	WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
	if (cval.val)
		F_SET(btree, WT_BTREE_IN_MEMORY);
	else
		F_CLR(btree, WT_BTREE_IN_MEMORY);

	WT_RET(__wt_config_gets(session,
	    cfg, "ignore_in_memory_cache_size", &cval));
	if (cval.val) {
		if (!F_ISSET(conn, WT_CONN_IN_MEMORY))
			WT_RET_MSG(session, EINVAL,
			    "ignore_in_memory_cache_size setting is only valid "
			    "with databases configured to run in-memory");
		F_SET(btree, WT_BTREE_IGNORE_CACHE);
	} else
		F_CLR(btree, WT_BTREE_IGNORE_CACHE);

	/*
	 * The metadata isn't blocked by in-memory cache limits because metadata
	 * "unroll" is performed by updates that are potentially blocked by the
	 * cache-full checks.
	 */
	if (WT_IS_METADATA(btree->dhandle))
		F_SET(btree, WT_BTREE_IGNORE_CACHE);

	WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
	if (cval.val)
		F_CLR(btree, WT_BTREE_NO_LOGGING);
	else
		F_SET(btree, WT_BTREE_NO_LOGGING);

	/* Checksums */
	WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
	if (WT_STRING_MATCH("on", cval.str, cval.len))
		btree->checksum = CKSUM_ON;
	else if (WT_STRING_MATCH("off", cval.str, cval.len))
		btree->checksum = CKSUM_OFF;
	else
		btree->checksum = CKSUM_UNCOMPRESSED;

	/* Debugging information: optional timestamp-usage assertions. */
	WT_RET(__wt_config_gets(session,
	    cfg, "assert.commit_timestamp", &cval));
	btree->assert_flags = 0;
	if (WT_STRING_MATCH("always", cval.str, cval.len))
		FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS);
	else if (WT_STRING_MATCH("key_consistent", cval.str, cval.len))
		FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_KEYS);
	else if (WT_STRING_MATCH("never", cval.str, cval.len))
		FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER);
	WT_RET(__wt_config_gets(session, cfg, "assert.read_timestamp", &cval));
	if (WT_STRING_MATCH("always", cval.str, cval.len))
		FLD_SET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS);
	else if (WT_STRING_MATCH("never", cval.str, cval.len))
		FLD_SET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER);

	/* Huffman encoding */
	WT_RET(__wt_btree_huffman_open(session));

	/*
	 * Reconciliation configuration:
	 *	Block compression (all)
	 *	Dictionary compression (variable-length column-store, row-store)
	 *	Page-split percentage
	 *	Prefix compression (row-store)
	 *	Suffix compression (row-store)
	 */
	switch (btree->type) {
	case BTREE_COL_FIX:
		break;
	case BTREE_ROW:
		WT_RET(__wt_config_gets(
		    session, cfg, "internal_key_truncate", &cval));
		btree->internal_key_truncate = cval.val != 0;

		WT_RET(__wt_config_gets(
		    session, cfg, "prefix_compression", &cval));
		btree->prefix_compression = cval.val != 0;
		WT_RET(__wt_config_gets(
		    session, cfg, "prefix_compression_min", &cval));
		btree->prefix_compression_min = (u_int)cval.val;
		/* FALLTHROUGH */
	case BTREE_COL_VAR:
		WT_RET(__wt_config_gets(session, cfg, "dictionary", &cval));
		btree->dictionary = (u_int)cval.val;
		break;
	}

	WT_RET(__wt_config_gets_none(session, cfg, "block_compressor", &cval));
	WT_RET(__wt_compressor_config(session, &cval, &btree->compressor));

	/*
	 * Configure compression adjustment.
	 * When doing compression, assume compression rates that will result in
	 * pages larger than the maximum in-memory images allowed. If we're
	 * wrong, we adjust downward (but we're almost certainly correct, the
	 * maximum in-memory images allowed are only 4x the maximum page size,
	 * and compression always gives us more than 4x).
	 * Don't do compression adjustment for fixed-size column store, the
	 * leaf page sizes don't change. (We could adjust internal pages but
	 * not leaf pages, but that seems an unlikely use case.)
	 * XXX
	 * Don't do compression adjustment of snappy-compressed blocks.
	 */
	btree->intlpage_compadjust = false;
	btree->maxintlpage_precomp = btree->maxintlpage;
	btree->leafpage_compadjust = false;
	btree->maxleafpage_precomp = btree->maxleafpage;
	if (btree->compressor != NULL && btree->compressor->compress != NULL &&
	    !WT_STRING_MATCH("snappy", cval.str, cval.len) &&
	    btree->type != BTREE_COL_FIX) {
		/*
		 * Don't do compression adjustment when on-disk page sizes are
		 * less than 16KB. There's not enough compression going on to
		 * fine-tune the size, all we end up doing is hammering shared
		 * memory.
		 *
		 * Don't do compression adjustment when on-disk page sizes are
		 * equal to the maximum in-memory page image, the bytes taken
		 * for compression can't grow past the base value.
		 */
		if (btree->maxintlpage >= 16 * 1024 &&
		    btree->maxmempage_image > btree->maxintlpage) {
			btree->intlpage_compadjust = true;
			btree->maxintlpage_precomp = btree->maxmempage_image;
		}
		if (btree->maxleafpage >= 16 * 1024 &&
		    btree->maxmempage_image > btree->maxleafpage) {
			btree->leafpage_compadjust = true;
			btree->maxleafpage_precomp = btree->maxmempage_image;
		}
	}

	/*
	 * We do not use __wt_config_gets_none here because "none" and the empty
	 * string have different meanings. The empty string means inherit the
	 * system encryption setting and "none" means this table is in the clear
	 * even if the database is encrypted.
	 */
	WT_RET(__wt_config_gets(session, cfg, "encryption.name", &cval));
	if (cval.len == 0)
		btree->kencryptor = conn->kencryptor;
	else if (WT_STRING_MATCH("none", cval.str, cval.len))
		btree->kencryptor = NULL;
	else {
		WT_RET(__wt_config_gets_none(
		    session, cfg, "encryption.keyid", &keyid));
		WT_RET(__wt_config_gets(session, cfg, "encryption", &enc));
		if (enc.len != 0)
			WT_RET(__wt_strndup(session, enc.str, enc.len,
			    &enc_cfg[0]));
		/* Free the copied configuration whether or not setup worked. */
		ret = __wt_encryptor_config(session, &cval, &keyid,
		    (WT_CONFIG_ARG *)enc_cfg, &btree->kencryptor);
		__wt_free(session, enc_cfg[0]);
		WT_RET(ret);
	}

	/* Initialize locks. */
	WT_RET(__wt_rwlock_init(session, &btree->ovfl_lock));
	WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush"));

	btree->modified = false;		/* Clean */

	btree->syncing = WT_BTREE_SYNC_OFF;	/* Not syncing */
	btree->write_gen = ckpt->write_gen;	/* Write generation */
	btree->checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT);

	return (0);
}
546
547 /*
548 * __wt_root_ref_init --
549 * Initialize a tree root reference, and link in the root page.
550 */
551 void
__wt_root_ref_init(WT_REF * root_ref,WT_PAGE * root,bool is_recno)552 __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno)
553 {
554 memset(root_ref, 0, sizeof(*root_ref));
555
556 root_ref->page = root;
557 root_ref->state = WT_REF_MEM;
558
559 root_ref->ref_recno = is_recno ? 1 : WT_RECNO_OOB;
560
561 root->pg_intl_parent_ref = root_ref;
562 }
563
/*
 * __wt_btree_tree_open --
 *	Read in a tree from disk: read and verify the root page, then build
 *	its in-memory version.
 */
int
__wt_btree_tree_open(
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_ITEM dsk;
	WT_PAGE *page;

	btree = S2BT(session);
	bm = btree->bm;

	/*
	 * A buffer into which we read a root page; don't use a scratch buffer,
	 * the buffer's allocated memory becomes the persistent in-memory page.
	 */
	WT_CLEAR(dsk);

	/*
	 * Read and verify the page (verify to catch encrypted objects we can't
	 * decrypt, where we read the object successfully but we can't decrypt
	 * it, and we want to fail gracefully).
	 *
	 * Create a printable version of the address to pass to verify.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &tmp));
	WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));

	/* Suppress "corrupt file" noise, the failure is reported below. */
	F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
	if ((ret = __wt_bt_read(session, &dsk, addr, addr_size)) == 0)
		ret = __wt_verify_dsk(session, tmp->data, &dsk);
	F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
	if (ret != 0)
		__wt_err(session, ret,
		    "unable to read root page from %s", session->dhandle->name);
	/*
	 * Failure to open metadata means that the database is unavailable.
	 * Try to provide a helpful failure message.
	 */
	if (ret != 0 && WT_IS_METADATA(session->dhandle)) {
		__wt_errx(session,
		    "WiredTiger has failed to open its metadata");
		__wt_errx(session, "This may be due to the database"
		    " files being encrypted, being from an older"
		    " version or due to corruption on disk");
		__wt_errx(session, "You should confirm that you have"
		    " opened the database with the correct options including"
		    " all encryption and compression options");
	}
	WT_ERR(ret);

	/*
	 * Build the in-memory version of the page. Clear our local reference to
	 * the allocated copy of the disk image on return, the in-memory object
	 * steals it.
	 */
	WT_ERR(__wt_page_inmem(session, NULL, dsk.data,
	    WT_DATA_IN_ITEM(&dsk) ?
	    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
	dsk.mem = NULL;

	/* Finish initializing the root, root reference links. */
	__wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW);

err:	__wt_buf_free(session, &dsk);
	__wt_scr_free(session, &tmp);

	return (ret);
}
639
/*
 * __btree_tree_open_empty --
 *	Create an empty in-memory tree: a root internal page referencing a
 *	single deleted leaf.
 */
static int
__btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *leaf, *root;
	WT_PAGE_INDEX *pindex;
	WT_REF *ref;

	btree = S2BT(session);
	root = leaf = NULL;
	ref = NULL;

	/*
	 * Newly created objects can be used for cursor inserts or for bulk
	 * loads; set a flag that's cleared when a row is inserted into the
	 * tree.
	 */
	if (creation)
		btree->original = 1;

	/*
	 * A note about empty trees: the initial tree is a single root page.
	 * It has a single reference to a leaf page, marked deleted. The leaf
	 * page will be created by the first update. If the root is evicted
	 * without being modified, that's OK, nothing is ever written.
	 *
	 * !!!
	 * Be cautious about changing the order of updates in this code: to call
	 * __wt_page_out on error, we require a correct page setup at each point
	 * where we might fail.
	 */
	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__wt_page_alloc(
		    session, WT_PAGE_COL_INT, 1, true, &root));
		root->pg_intl_parent_ref = &btree->root;

		/* Mark the single child reference deleted, starting at 1. */
		pindex = WT_INTL_INDEX_GET_SAFE(root);
		ref = pindex->index[0];
		ref->home = root;
		ref->page = NULL;
		ref->addr = NULL;
		ref->state = WT_REF_DELETED;
		ref->ref_recno = 1;
		break;
	case BTREE_ROW:
		WT_ERR(__wt_page_alloc(
		    session, WT_PAGE_ROW_INT, 1, true, &root));
		root->pg_intl_parent_ref = &btree->root;

		/* Mark the single child reference deleted, with an empty key. */
		pindex = WT_INTL_INDEX_GET_SAFE(root);
		ref = pindex->index[0];
		ref->home = root;
		ref->page = NULL;
		ref->addr = NULL;
		ref->state = WT_REF_DELETED;
		WT_ERR(__wt_row_ikey_incr(session, root, 0, "", 1, ref));
		break;
	}

	/* Bulk loads require a leaf page for reconciliation: create it now. */
	if (F_ISSET(btree, WT_BTREE_BULK)) {
		WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
		ref->page = leaf;
		ref->state = WT_REF_MEM;
		WT_ERR(__wt_page_modify_init(session, leaf));
		__wt_page_only_modify_set(session, leaf);
	}

	/* Finish initializing the root, root reference links. */
	__wt_root_ref_init(&btree->root, root, btree->type != BTREE_ROW);

	return (0);

	/* On error, discard whichever pages were allocated. */
err:	if (leaf != NULL)
		__wt_page_out(session, &leaf);
	if (root != NULL)
		__wt_page_out(session, &root);
	return (ret);
}
726
727 /*
728 * __wt_btree_new_leaf_page --
729 * Create an empty leaf page.
730 */
731 int
__wt_btree_new_leaf_page(WT_SESSION_IMPL * session,WT_PAGE ** pagep)732 __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep)
733 {
734 WT_BTREE *btree;
735
736 btree = S2BT(session);
737
738 switch (btree->type) {
739 case BTREE_COL_FIX:
740 WT_RET(__wt_page_alloc(
741 session, WT_PAGE_COL_FIX, 0, false, pagep));
742 break;
743 case BTREE_COL_VAR:
744 WT_RET(__wt_page_alloc(
745 session, WT_PAGE_COL_VAR, 0, false, pagep));
746 break;
747 case BTREE_ROW:
748 WT_RET(__wt_page_alloc(
749 session, WT_PAGE_ROW_LEAF, 0, false, pagep));
750 break;
751 }
752 return (0);
753 }
754
/*
 * __btree_preload --
 *	Pre-load internal pages: ask the block manager to pre-load each
 *	addressed child of the root page.
 */
static int
__btree_preload(WT_SESSION_IMPL *session)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_REF *ref;
	size_t addr_size;
	const uint8_t *addr;

	btree = S2BT(session);
	bm = btree->bm;

	/* Pre-load the second-level internal pages. */
	WT_INTL_FOREACH_BEGIN(session, btree->root.page, ref) {
		/* References with no disk address have nothing to pre-load. */
		__wt_ref_info(ref, &addr, &addr_size, NULL);
		if (addr != NULL)
			WT_RET(bm->preload(bm, session, addr, addr_size));
	} WT_INTL_FOREACH_END;
	return (0);
}
779
780 /*
781 * __btree_get_last_recno --
782 * Set the last record number for a column-store.
783 */
784 static int
__btree_get_last_recno(WT_SESSION_IMPL * session)785 __btree_get_last_recno(WT_SESSION_IMPL *session)
786 {
787 WT_BTREE *btree;
788 WT_PAGE *page;
789 WT_REF *next_walk;
790
791 btree = S2BT(session);
792
793 next_walk = NULL;
794 WT_RET(__wt_tree_walk(session, &next_walk, WT_READ_PREV));
795 if (next_walk == NULL)
796 return (WT_NOTFOUND);
797
798 page = next_walk->page;
799 btree->last_recno = page->type == WT_PAGE_COL_VAR ?
800 __col_var_last_recno(next_walk) : __col_fix_last_recno(next_walk);
801
802 return (__wt_page_release(session, next_walk, 0));
803 }
804
/*
 * __btree_page_sizes --
 *	Verify the page sizes. Some of these sizes are automatically checked
 *	using limits defined in the API, don't duplicate the logic here.
 */
static int
__btree_page_sizes(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	uint64_t cache_size;
	uint32_t intl_split_size, leaf_split_size, max;
	const char **cfg;

	btree = S2BT(session);
	conn = S2C(session);
	cfg = btree->dhandle->cfg;

	/*
	 * Get the allocation size. Allocation sizes must be a power-of-two,
	 * nothing else makes sense.
	 */
	WT_RET(__wt_direct_io_size_check(
	    session, cfg, "allocation_size", &btree->allocsize));
	if (!__wt_ispo2(btree->allocsize))
		WT_RET_MSG(session,
		    EINVAL, "the allocation size must be a power of two");

	/*
	 * Get the internal/leaf page sizes.
	 * All page sizes must be in units of the allocation size.
	 */
	WT_RET(__wt_direct_io_size_check(
	    session, cfg, "internal_page_max", &btree->maxintlpage));
	WT_RET(__wt_direct_io_size_check(
	    session, cfg, "leaf_page_max", &btree->maxleafpage));
	if (btree->maxintlpage < btree->allocsize ||
	    btree->maxintlpage % btree->allocsize != 0 ||
	    btree->maxleafpage < btree->allocsize ||
	    btree->maxleafpage % btree->allocsize != 0)
		WT_RET_MSG(session, EINVAL,
		    "page sizes must be a multiple of the page allocation "
		    "size (%" PRIu32 "B)", btree->allocsize);

	/*
	 * Default in-memory page image size for compression is 4x the maximum
	 * internal or leaf page size, and enforce the on-disk page sizes as a
	 * lower-limit for the in-memory image size.
	 */
	WT_RET(__wt_config_gets(session, cfg, "memory_page_image_max", &cval));
	btree->maxmempage_image = (uint32_t)cval.val;
	max = WT_MAX(btree->maxintlpage, btree->maxleafpage);
	if (btree->maxmempage_image == 0)
		btree->maxmempage_image = 4 * max;
	else if (btree->maxmempage_image < max)
		WT_RET_MSG(session, EINVAL,
		    "in-memory page image size must be larger than the maximum "
		    "page size (%" PRIu32 "B < %" PRIu32 "B)",
		    btree->maxmempage_image, max);

	/*
	 * Don't let pages grow large compared to the cache size or we can end
	 * up in a situation where nothing can be evicted. Make sure at least
	 * 10 pages fit in cache when it is at the dirty trigger where threads
	 * stall.
	 *
	 * Take care getting the cache size: with a shared cache, it may not
	 * have been set. Don't forget to update the API documentation if you
	 * alter the bounds for any of the parameters here.
	 */
	WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
	btree->maxmempage = (uint64_t)cval.val;
	if (!F_ISSET(conn, WT_CONN_CACHE_POOL) &&
	    (cache_size = conn->cache_size) > 0)
		btree->maxmempage = (uint64_t)WT_MIN(btree->maxmempage,
		    (conn->cache->eviction_dirty_trigger * cache_size) / 1000);

	/* Enforce a lower bound of a single disk leaf page */
	btree->maxmempage = WT_MAX(btree->maxmempage, btree->maxleafpage);

	/*
	 * Try in-memory splits once we hit 80% of the maximum in-memory page
	 * size. This gives multi-threaded append workloads a better chance of
	 * not stalling.
	 */
	btree->splitmempage = (8 * btree->maxmempage) / 10;

	/*
	 * Get the split percentage (reconciliation splits pages into smaller
	 * than the maximum page size chunks so we don't split every time a
	 * new entry is added). Determine how large newly split pages will be.
	 * Set to the minimum, if the read value is less than that.
	 */
	WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
	if (cval.val < WT_BTREE_MIN_SPLIT_PCT) {
		btree->split_pct = WT_BTREE_MIN_SPLIT_PCT;
		WT_RET(__wt_msg(session,
		    "Re-setting split_pct for %s to the minimum allowed of "
		    "%d%%.", session->dhandle->name, WT_BTREE_MIN_SPLIT_PCT));
	} else
		btree->split_pct = (int)cval.val;
	intl_split_size = __wt_split_page_size(
	    btree->split_pct, btree->maxintlpage, btree->allocsize);
	leaf_split_size = __wt_split_page_size(
	    btree->split_pct, btree->maxleafpage, btree->allocsize);

	/*
	 * In-memory split configuration: fall back to the defaults if the
	 * keys are missing or set to zero.
	 */
	if (__wt_config_gets(
	    session, cfg, "split_deepen_min_child", &cval) == WT_NOTFOUND ||
	    cval.val == 0)
		btree->split_deepen_min_child = WT_SPLIT_DEEPEN_MIN_CHILD_DEF;
	else
		btree->split_deepen_min_child = (u_int)cval.val;
	if (__wt_config_gets(
	    session, cfg, "split_deepen_per_child", &cval) == WT_NOTFOUND ||
	    cval.val == 0)
		btree->split_deepen_per_child = WT_SPLIT_DEEPEN_PER_CHILD_DEF;
	else
		btree->split_deepen_per_child = (u_int)cval.val;

	/*
	 * Get the maximum internal/leaf page key/value sizes.
	 *
	 * In-memory configuration overrides any key/value sizes, there's no
	 * such thing as an overflow item in an in-memory configuration.
	 */
	if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
		btree->maxintlkey = WT_BTREE_MAX_OBJECT_SIZE;
		btree->maxleafkey = WT_BTREE_MAX_OBJECT_SIZE;
		btree->maxleafvalue = WT_BTREE_MAX_OBJECT_SIZE;
		return (0);
	}

	/*
	 * In historic versions of WiredTiger, the maximum internal/leaf page
	 * key/value sizes were set by the internal_item_max and leaf_item_max
	 * configuration strings. Look for those strings if we don't find the
	 * newer ones.
	 */
	WT_RET(__wt_config_gets(session, cfg, "internal_key_max", &cval));
	btree->maxintlkey = (uint32_t)cval.val;
	if (btree->maxintlkey == 0) {
		WT_RET(
		    __wt_config_gets(session, cfg, "internal_item_max", &cval));
		btree->maxintlkey = (uint32_t)cval.val;
	}
	WT_RET(__wt_config_gets(session, cfg, "leaf_key_max", &cval));
	btree->maxleafkey = (uint32_t)cval.val;
	WT_RET(__wt_config_gets(session, cfg, "leaf_value_max", &cval));
	btree->maxleafvalue = (uint32_t)cval.val;
	if (btree->maxleafkey == 0 && btree->maxleafvalue == 0) {
		WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval));
		btree->maxleafkey = (uint32_t)cval.val;
		btree->maxleafvalue = (uint32_t)cval.val;
	}

	/*
	 * Default/maximum for internal and leaf page keys: split-page / 10.
	 * Default for leaf page values: split-page / 2.
	 *
	 * It's difficult for applications to configure this in any exact way as
	 * they have to duplicate our calculation of how many keys must fit on a
	 * page, and given a split-percentage and page header, that isn't easy
	 * to do. If the maximum internal key value is too large for the page,
	 * reset it to the default.
	 */
	if (btree->maxintlkey == 0 || btree->maxintlkey > intl_split_size / 10)
		btree->maxintlkey = intl_split_size / 10;
	if (btree->maxleafkey == 0)
		btree->maxleafkey = leaf_split_size / 10;
	if (btree->maxleafvalue == 0)
		btree->maxleafvalue = leaf_split_size / 2;

	return (0);
}
983
984 /*
985 * __wt_btree_immediately_durable --
986 * Check whether this btree is configured for immediate durability.
987 */
988 bool
__wt_btree_immediately_durable(WT_SESSION_IMPL * session)989 __wt_btree_immediately_durable(WT_SESSION_IMPL *session)
990 {
991 WT_BTREE *btree;
992
993 btree = S2BT(session);
994
995 /*
996 * This is used to determine whether timestamp updates should
997 * be rolled back for this btree. With in-memory, the logging
998 * setting on tables is still important and when enabled they
999 * should be considered "durable".
1000 */
1001 return ((FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) ||
1002 (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))) &&
1003 !F_ISSET(btree, WT_BTREE_NO_LOGGING));
1004 }
1005