1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
12 static int __btree_get_last_recno(WT_SESSION_IMPL *);
13 static int __btree_page_sizes(WT_SESSION_IMPL *);
14 static int __btree_preload(WT_SESSION_IMPL *);
15 static int __btree_tree_open_empty(WT_SESSION_IMPL *, bool);
16 
17 /*
18  * __btree_clear --
19  *	Clear a Btree, either on handle discard or re-open.
20  */
21 static int
__btree_clear(WT_SESSION_IMPL * session)22 __btree_clear(WT_SESSION_IMPL *session)
23 {
24 	WT_BTREE *btree;
25 	WT_DECL_RET;
26 
27 	btree = S2BT(session);
28 
29 	/*
30 	 * If the tree hasn't gone through an open/close cycle, there's no
31 	 * cleanup to be done.
32 	 */
33 	if (!F_ISSET(btree, WT_BTREE_CLOSED))
34 		return (0);
35 
36 	/* Close the Huffman tree. */
37 	__wt_btree_huffman_close(session);
38 
39 	/* Terminate any associated collator. */
40 	if (btree->collator_owned && btree->collator->terminate != NULL)
41 		WT_TRET(btree->collator->terminate(
42 		    btree->collator, &session->iface));
43 
44 	/* Destroy locks. */
45 	__wt_rwlock_destroy(session, &btree->ovfl_lock);
46 	__wt_spin_destroy(session, &btree->flush_lock);
47 
48 	/* Free allocated memory. */
49 	__wt_free(session, btree->key_format);
50 	__wt_free(session, btree->value_format);
51 
52 	return (ret);
53 }
54 
55 /*
56  * __wt_btree_open --
57  *	Open a Btree.
58  */
59 int
__wt_btree_open(WT_SESSION_IMPL * session,const char * op_cfg[])60 __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
61 {
62 	WT_BM *bm;
63 	WT_BTREE *btree;
64 	WT_CKPT ckpt;
65 	WT_CONFIG_ITEM cval;
66 	WT_DATA_HANDLE *dhandle;
67 	WT_DECL_RET;
68 	size_t root_addr_size;
69 	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
70 	const char *filename;
71 	bool creation, forced_salvage;
72 
73 	btree = S2BT(session);
74 	dhandle = session->dhandle;
75 
76 	/*
77 	 * This may be a re-open, clean up the btree structure.
78 	 * Clear the fields that don't persist across a re-open.
79 	 * Clear all flags other than the operation flags (which are set by the
80 	 * connection handle software that called us).
81 	 */
82 	WT_RET(__btree_clear(session));
83 	memset(btree, 0, WT_BTREE_CLEAR_SIZE);
84 	F_CLR(btree, ~WT_BTREE_SPECIAL_FLAGS);
85 
86 	/* Set the data handle first, our called functions reasonably use it. */
87 	btree->dhandle = dhandle;
88 
89 	/* Checkpoint and verify files are readonly. */
90 	if (dhandle->checkpoint != NULL || F_ISSET(btree, WT_BTREE_VERIFY) ||
91 	    F_ISSET(S2C(session), WT_CONN_READONLY))
92 		F_SET(btree, WT_BTREE_READONLY);
93 
94 	/* Get the checkpoint information for this name/checkpoint pair. */
95 	WT_CLEAR(ckpt);
96 	WT_RET(__wt_meta_checkpoint(
97 	    session, dhandle->name, dhandle->checkpoint, &ckpt));
98 
99 	/*
100 	 * Bulk-load is only permitted on newly created files, not any empty
101 	 * file -- see the checkpoint code for a discussion.
102 	 */
103 	creation = ckpt.raw.size == 0;
104 	if (!creation && F_ISSET(btree, WT_BTREE_BULK))
105 		WT_ERR_MSG(session, EINVAL,
106 		    "bulk-load is only supported on newly created objects");
107 
108 	/* Handle salvage configuration. */
109 	forced_salvage = false;
110 	if (F_ISSET(btree, WT_BTREE_SALVAGE)) {
111 		WT_ERR(__wt_config_gets(session, op_cfg, "force", &cval));
112 		forced_salvage = cval.val != 0;
113 	}
114 
115 	/* Initialize and configure the WT_BTREE structure. */
116 	WT_ERR(__btree_conf(session, &ckpt));
117 
118 	/*
119 	 * We could be a re-open of a table that was put in the lookaside
120 	 * dropped list. Remove our id from that list.
121 	 */
122 	__wt_las_remove_dropped(session);
123 
124 	/* Connect to the underlying block manager. */
125 	filename = dhandle->name;
126 	if (!WT_PREFIX_SKIP(filename, "file:"))
127 		WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");
128 
129 	WT_ERR(__wt_block_manager_open(session, filename, dhandle->cfg,
130 	    forced_salvage, F_ISSET(btree, WT_BTREE_READONLY),
131 	    btree->allocsize, &btree->bm));
132 	bm = btree->bm;
133 
134 	/*
135 	 * !!!
136 	 * As part of block-manager configuration, we need to return the maximum
137 	 * sized address cookie that a block manager will ever return.  There's
138 	 * a limit of WT_BTREE_MAX_ADDR_COOKIE, but at 255B, it's too large for
139 	 * a Btree with 512B internal pages.  The default block manager packs
140 	 * a wt_off_t and 2 uint32_t's into its cookie, so there's no problem
141 	 * now, but when we create a block manager extension API, we need some
142 	 * way to consider the block manager's maximum cookie size versus the
143 	 * minimum Btree internal node size.
144 	 */
145 	btree->block_header = bm->block_header(bm);
146 
147 	/*
148 	 * Open the specified checkpoint unless it's a special command (special
149 	 * commands are responsible for loading their own checkpoints, if any).
150 	 */
151 	if (!F_ISSET(btree,
152 	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
153 		/*
154 		 * There are two reasons to load an empty tree rather than a
155 		 * checkpoint: either there is no checkpoint (the file is
156 		 * being created), or the load call returns no root page (the
157 		 * checkpoint is for an empty file).
158 		 */
159 		WT_ERR(bm->checkpoint_load(bm, session,
160 		    ckpt.raw.data, ckpt.raw.size,
161 		    root_addr, &root_addr_size,
162 		    F_ISSET(btree, WT_BTREE_READONLY)));
163 		if (creation || root_addr_size == 0)
164 			WT_ERR(__btree_tree_open_empty(session, creation));
165 		else {
166 			WT_ERR(__wt_btree_tree_open(
167 			    session, root_addr, root_addr_size));
168 
169 			/*
170 			 * Rebalance uses the cache, but only wants the root
171 			 * page, nothing else.
172 			 */
173 			if (!F_ISSET(btree, WT_BTREE_REBALANCE)) {
174 				/* Warm the cache, if possible. */
175 				WT_WITH_PAGE_INDEX(session,
176 				    ret = __btree_preload(session));
177 				WT_ERR(ret);
178 
179 				/*
180 				 * Get the last record number in a column-store
181 				 * file.
182 				 */
183 				if (btree->type != BTREE_ROW)
184 					WT_ERR(__btree_get_last_recno(session));
185 			}
186 		}
187 	}
188 
189 	/*
190 	 * Eviction ignores trees until the handle's open flag is set, configure
191 	 * eviction before that happens.
192 	 *
193 	 * Files that can still be bulk-loaded cannot be evicted.
194 	 * Permanently cache-resident files can never be evicted.
195 	 * Special operations don't enable eviction. The underlying commands may
196 	 * turn on eviction (for example, verify turns on eviction while working
197 	 * a file to keep from consuming the cache), but it's their decision. If
198 	 * an underlying command reconfigures eviction, it must either clear the
199 	 * evict-disabled-open flag or restore the eviction configuration when
200 	 * finished so that handle close behaves correctly.
201 	 */
202 	if (btree->original ||
203 	    F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE |
204 	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
205 		WT_ERR(__wt_evict_file_exclusive_on(session));
206 		btree->evict_disabled_open = true;
207 	}
208 
209 	if (0) {
210 err:		WT_TRET(__wt_btree_close(session));
211 	}
212 	__wt_meta_checkpoint_free(session, &ckpt);
213 
214 	return (ret);
215 }
216 
217 /*
218  * __wt_btree_close --
219  *	Close a Btree.
220  */
221 int
__wt_btree_close(WT_SESSION_IMPL * session)222 __wt_btree_close(WT_SESSION_IMPL *session)
223 {
224 	WT_BM *bm;
225 	WT_BTREE *btree;
226 	WT_DECL_RET;
227 
228 	btree = S2BT(session);
229 
230 	/*
231 	 * The close process isn't the same as discarding the handle: we might
232 	 * re-open the handle, which isn't a big deal, but the backing blocks
233 	 * for the handle may not yet have been discarded from the cache, and
234 	 * eviction uses WT_BTREE structure elements. Free backing resources
235 	 * but leave the rest alone, and we'll discard the structure when we
236 	 * discard the data handle.
237 	 *
238 	 * Handles can be closed multiple times, ignore all but the first.
239 	 */
240 	if (F_ISSET(btree, WT_BTREE_CLOSED))
241 		return (0);
242 	F_SET(btree, WT_BTREE_CLOSED);
243 
244 	/*
245 	 * If closing a tree let sweep drop lookaside entries for it.
246 	 */
247 	if (F_ISSET(S2C(session), WT_CONN_LOOKASIDE_OPEN) &&
248 	    btree->lookaside_entries) {
249 		WT_ASSERT(session, !WT_IS_METADATA(btree->dhandle) &&
250 		    !F_ISSET(btree, WT_BTREE_LOOKASIDE));
251 		WT_TRET(__wt_las_save_dropped(session));
252 	}
253 
254 	/*
255 	 * If we turned eviction off and never turned it back on, do that now,
256 	 * otherwise the counter will be off.
257 	 */
258 	if (btree->evict_disabled_open) {
259 		btree->evict_disabled_open = false;
260 		__wt_evict_file_exclusive_off(session);
261 	}
262 
263 	/* Discard any underlying block manager resources. */
264 	if ((bm = btree->bm) != NULL) {
265 		btree->bm = NULL;
266 
267 		/* Unload the checkpoint, unless it's a special command. */
268 		if (!F_ISSET(btree,
269 		    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
270 			WT_TRET(bm->checkpoint_unload(bm, session));
271 
272 		/* Close the underlying block manager reference. */
273 		WT_TRET(bm->close(bm, session));
274 	}
275 
276 	return (ret);
277 }
278 
279 /*
280  * __wt_btree_discard --
281  *	Discard a Btree.
282  */
283 int
__wt_btree_discard(WT_SESSION_IMPL * session)284 __wt_btree_discard(WT_SESSION_IMPL *session)
285 {
286 	WT_BTREE *btree;
287 	WT_DECL_RET;
288 
289 	ret = __btree_clear(session);
290 
291 	btree = S2BT(session);
292 	__wt_overwrite_and_free(session, btree);
293 	session->dhandle->handle = NULL;
294 
295 	return (ret);
296 }
297 
298 /*
299  * __btree_conf --
300  *	Configure a WT_BTREE structure.
301  */
302 static int
__btree_conf(WT_SESSION_IMPL * session,WT_CKPT * ckpt)303 __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
304 {
305 	WT_BTREE *btree;
306 	WT_CONFIG_ITEM cval, enc, keyid, metadata;
307 	WT_CONNECTION_IMPL *conn;
308 	WT_DECL_RET;
309 	int64_t maj_version, min_version;
310 	uint32_t bitcnt;
311 	const char **cfg, *enc_cfg[] = { NULL, NULL };
312 	bool fixed;
313 
314 	btree = S2BT(session);
315 	cfg = btree->dhandle->cfg;
316 	conn = S2C(session);
317 
318 	/* Dump out format information. */
319 	if (WT_VERBOSE_ISSET(session, WT_VERB_VERSION)) {
320 		WT_RET(__wt_config_gets(session, cfg, "version.major", &cval));
321 		maj_version = cval.val;
322 		WT_RET(__wt_config_gets(session, cfg, "version.minor", &cval));
323 		min_version = cval.val;
324 		__wt_verbose(session, WT_VERB_VERSION,
325 		    "%" PRId64 ".%" PRId64, maj_version, min_version);
326 	}
327 
328 	/* Get the file ID. */
329 	WT_RET(__wt_config_gets(session, cfg, "id", &cval));
330 	btree->id = (uint32_t)cval.val;
331 
332 	/* Validate file types and check the data format plan. */
333 	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
334 	WT_RET(__wt_struct_confchk(session, &cval));
335 	if (WT_STRING_MATCH("r", cval.str, cval.len))
336 		btree->type = BTREE_COL_VAR;
337 	else
338 		btree->type = BTREE_ROW;
339 	WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->key_format));
340 
341 	WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
342 	WT_RET(__wt_struct_confchk(session, &cval));
343 	WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format));
344 
345 	/* Row-store key comparison and key gap for prefix compression. */
346 	if (btree->type == BTREE_ROW) {
347 		WT_RET(__wt_config_gets_none(session, cfg, "collator", &cval));
348 		if (cval.len != 0) {
349 			WT_RET(__wt_config_gets(
350 			    session, cfg, "app_metadata", &metadata));
351 			WT_RET(__wt_collator_config(
352 			    session, btree->dhandle->name, &cval, &metadata,
353 			    &btree->collator, &btree->collator_owned));
354 		}
355 
356 		WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval));
357 		btree->key_gap = (uint32_t)cval.val;
358 	}
359 
360 	/* Column-store: check for fixed-size data. */
361 	if (btree->type == BTREE_COL_VAR) {
362 		WT_RET(__wt_struct_check(
363 		    session, cval.str, cval.len, &fixed, &bitcnt));
364 		if (fixed) {
365 			if (bitcnt == 0 || bitcnt > 8)
366 				WT_RET_MSG(session, EINVAL,
367 				    "fixed-width field sizes must be greater "
368 				    "than 0 and less than or equal to 8");
369 			btree->bitcnt = (uint8_t)bitcnt;
370 			btree->type = BTREE_COL_FIX;
371 		}
372 	}
373 
374 	/* Page sizes */
375 	WT_RET(__btree_page_sizes(session));
376 
377 	WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
378 	if (cval.val)
379 		F_SET(btree, WT_BTREE_IN_MEMORY);
380 	else
381 		F_CLR(btree, WT_BTREE_IN_MEMORY);
382 
383 	WT_RET(__wt_config_gets(session,
384 	    cfg, "ignore_in_memory_cache_size", &cval));
385 	if (cval.val) {
386 		if (!F_ISSET(conn, WT_CONN_IN_MEMORY))
387 			WT_RET_MSG(session, EINVAL,
388 			    "ignore_in_memory_cache_size setting is only valid "
389 			    "with databases configured to run in-memory");
390 		F_SET(btree, WT_BTREE_IGNORE_CACHE);
391 	} else
392 		F_CLR(btree, WT_BTREE_IGNORE_CACHE);
393 
394 	/*
395 	 * The metadata isn't blocked by in-memory cache limits because metadata
396 	 * "unroll" is performed by updates that are potentially blocked by the
397 	 * cache-full checks.
398 	 */
399 	if (WT_IS_METADATA(btree->dhandle))
400 		F_SET(btree, WT_BTREE_IGNORE_CACHE);
401 
402 	WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
403 	if (cval.val)
404 		F_CLR(btree, WT_BTREE_NO_LOGGING);
405 	else
406 		F_SET(btree, WT_BTREE_NO_LOGGING);
407 
408 	/* Checksums */
409 	WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
410 	if (WT_STRING_MATCH("on", cval.str, cval.len))
411 		btree->checksum = CKSUM_ON;
412 	else if (WT_STRING_MATCH("off", cval.str, cval.len))
413 		btree->checksum = CKSUM_OFF;
414 	else
415 		btree->checksum = CKSUM_UNCOMPRESSED;
416 
417 	/* Debugging information */
418 	WT_RET(__wt_config_gets(session,
419 	    cfg, "assert.commit_timestamp", &cval));
420 	btree->assert_flags = 0;
421 	if (WT_STRING_MATCH("always", cval.str, cval.len))
422 		FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS);
423 	else if (WT_STRING_MATCH("key_consistent", cval.str, cval.len))
424 		FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_KEYS);
425 	else if (WT_STRING_MATCH("never", cval.str, cval.len))
426 		FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER);
427 	WT_RET(__wt_config_gets(session, cfg, "assert.read_timestamp", &cval));
428 	if (WT_STRING_MATCH("always", cval.str, cval.len))
429 		FLD_SET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS);
430 	else if (WT_STRING_MATCH("never", cval.str, cval.len))
431 		FLD_SET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER);
432 
433 	/* Huffman encoding */
434 	WT_RET(__wt_btree_huffman_open(session));
435 
436 	/*
437 	 * Reconciliation configuration:
438 	 *	Block compression (all)
439 	 *	Dictionary compression (variable-length column-store, row-store)
440 	 *	Page-split percentage
441 	 *	Prefix compression (row-store)
442 	 *	Suffix compression (row-store)
443 	 */
444 	switch (btree->type) {
445 	case BTREE_COL_FIX:
446 		break;
447 	case BTREE_ROW:
448 		WT_RET(__wt_config_gets(
449 		    session, cfg, "internal_key_truncate", &cval));
450 		btree->internal_key_truncate = cval.val != 0;
451 
452 		WT_RET(__wt_config_gets(
453 		    session, cfg, "prefix_compression", &cval));
454 		btree->prefix_compression = cval.val != 0;
455 		WT_RET(__wt_config_gets(
456 		    session, cfg, "prefix_compression_min", &cval));
457 		btree->prefix_compression_min = (u_int)cval.val;
458 		/* FALLTHROUGH */
459 	case BTREE_COL_VAR:
460 		WT_RET(__wt_config_gets(session, cfg, "dictionary", &cval));
461 		btree->dictionary = (u_int)cval.val;
462 		break;
463 	}
464 
465 	WT_RET(__wt_config_gets_none(session, cfg, "block_compressor", &cval));
466 	WT_RET(__wt_compressor_config(session, &cval, &btree->compressor));
467 
468 	/*
469 	 * Configure compression adjustment.
470 	 * When doing compression, assume compression rates that will result in
471 	 * pages larger than the maximum in-memory images allowed. If we're
472 	 * wrong, we adjust downward (but we're almost certainly correct, the
473 	 * maximum in-memory images allowed are only 4x the maximum page size,
474 	 * and compression always gives us more than 4x).
475 	 *	Don't do compression adjustment for fixed-size column store, the
476 	 * leaf page sizes don't change. (We could adjust internal pages but not
477 	 * internal pages, but that seems an unlikely use case.)
478 	 *	XXX
479 	 *	Don't do compression adjustment of snappy-compressed blocks.
480 	 */
481 	btree->intlpage_compadjust = false;
482 	btree->maxintlpage_precomp = btree->maxintlpage;
483 	btree->leafpage_compadjust = false;
484 	btree->maxleafpage_precomp = btree->maxleafpage;
485 	if (btree->compressor != NULL && btree->compressor->compress != NULL &&
486 	    !WT_STRING_MATCH("snappy", cval.str, cval.len) &&
487 	    btree->type != BTREE_COL_FIX) {
488 		/*
489 		 * Don't do compression adjustment when on-disk page sizes are
490 		 * less than 16KB. There's not enough compression going on to
491 		 * fine-tune the size, all we end up doing is hammering shared
492 		 * memory.
493 		 *
494 		 * Don't do compression adjustment when on-disk page sizes are
495 		 * equal to the maximum in-memory page image, the bytes taken
496 		 * for compression can't grow past the base value.
497 		 */
498 		if (btree->maxintlpage >= 16 * 1024 &&
499 		    btree->maxmempage_image > btree->maxintlpage) {
500 			btree->intlpage_compadjust = true;
501 			btree->maxintlpage_precomp = btree->maxmempage_image;
502 		}
503 		if (btree->maxleafpage >= 16 * 1024 &&
504 		    btree->maxmempage_image > btree->maxleafpage) {
505 			btree->leafpage_compadjust = true;
506 			btree->maxleafpage_precomp = btree->maxmempage_image;
507 		}
508 	}
509 
510 	/*
511 	 * We do not use __wt_config_gets_none here because "none" and the empty
512 	 * string have different meanings. The empty string means inherit the
513 	 * system encryption setting and "none" means this table is in the clear
514 	 * even if the database is encrypted.
515 	 */
516 	WT_RET(__wt_config_gets(session, cfg, "encryption.name", &cval));
517 	if (cval.len == 0)
518 		btree->kencryptor = conn->kencryptor;
519 	else if (WT_STRING_MATCH("none", cval.str, cval.len))
520 		btree->kencryptor = NULL;
521 	else {
522 		WT_RET(__wt_config_gets_none(
523 		    session, cfg, "encryption.keyid", &keyid));
524 		WT_RET(__wt_config_gets(session, cfg, "encryption", &enc));
525 		if (enc.len != 0)
526 			WT_RET(__wt_strndup(session, enc.str, enc.len,
527 			    &enc_cfg[0]));
528 		ret = __wt_encryptor_config(session, &cval, &keyid,
529 		    (WT_CONFIG_ARG *)enc_cfg, &btree->kencryptor);
530 		__wt_free(session, enc_cfg[0]);
531 		WT_RET(ret);
532 	}
533 
534 	/* Initialize locks. */
535 	WT_RET(__wt_rwlock_init(session, &btree->ovfl_lock));
536 	WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush"));
537 
538 	btree->modified = false;			/* Clean */
539 
540 	btree->syncing = WT_BTREE_SYNC_OFF;	/* Not syncing */
541 	btree->write_gen = ckpt->write_gen;	/* Write generation */
542 	btree->checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT);
543 
544 	return (0);
545 }
546 
547 /*
548  * __wt_root_ref_init --
549  *	Initialize a tree root reference, and link in the root page.
550  */
551 void
__wt_root_ref_init(WT_REF * root_ref,WT_PAGE * root,bool is_recno)552 __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno)
553 {
554 	memset(root_ref, 0, sizeof(*root_ref));
555 
556 	root_ref->page = root;
557 	root_ref->state = WT_REF_MEM;
558 
559 	root_ref->ref_recno = is_recno ? 1 : WT_RECNO_OOB;
560 
561 	root->pg_intl_parent_ref = root_ref;
562 }
563 
564 /*
565  * __wt_btree_tree_open --
566  *	Read in a tree from disk.
567  */
568 int
__wt_btree_tree_open(WT_SESSION_IMPL * session,const uint8_t * addr,size_t addr_size)569 __wt_btree_tree_open(
570     WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
571 {
572 	WT_BM *bm;
573 	WT_BTREE *btree;
574 	WT_DECL_ITEM(tmp);
575 	WT_DECL_RET;
576 	WT_ITEM dsk;
577 	WT_PAGE *page;
578 
579 	btree = S2BT(session);
580 	bm = btree->bm;
581 
582 	/*
583 	 * A buffer into which we read a root page; don't use a scratch buffer,
584 	 * the buffer's allocated memory becomes the persistent in-memory page.
585 	 */
586 	WT_CLEAR(dsk);
587 
588 	/*
589 	 * Read and verify the page (verify to catch encrypted objects we can't
590 	 * decrypt, where we read the object successfully but we can't decrypt
591 	 * it, and we want to fail gracefully).
592 	 *
593 	 * Create a printable version of the address to pass to verify.
594 	 */
595 	WT_ERR(__wt_scr_alloc(session, 0, &tmp));
596 	WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
597 
598 	F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
599 	if ((ret = __wt_bt_read(session, &dsk, addr, addr_size)) == 0)
600 		ret = __wt_verify_dsk(session, tmp->data, &dsk);
601 	F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
602 	if (ret != 0)
603 		__wt_err(session, ret,
604 		    "unable to read root page from %s", session->dhandle->name);
605 	/*
606 	 * Failure to open metadata means that the database is unavailable.
607 	 * Try to provide a helpful failure message.
608 	 */
609 	if (ret != 0 && WT_IS_METADATA(session->dhandle)) {
610 		__wt_errx(session,
611 		    "WiredTiger has failed to open its metadata");
612 		__wt_errx(session, "This may be due to the database"
613 		    " files being encrypted, being from an older"
614 		    " version or due to corruption on disk");
615 		__wt_errx(session, "You should confirm that you have"
616 		    " opened the database with the correct options including"
617 		    " all encryption and compression options");
618 	}
619 	WT_ERR(ret);
620 
621 	/*
622 	 * Build the in-memory version of the page. Clear our local reference to
623 	 * the allocated copy of the disk image on return, the in-memory object
624 	 * steals it.
625 	 */
626 	WT_ERR(__wt_page_inmem(session, NULL, dsk.data,
627 	    WT_DATA_IN_ITEM(&dsk) ?
628 	    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
629 	dsk.mem = NULL;
630 
631 	/* Finish initializing the root, root reference links. */
632 	__wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW);
633 
634 err:	__wt_buf_free(session, &dsk);
635 	__wt_scr_free(session, &tmp);
636 
637 	return (ret);
638 }
639 
640 /*
641  * __btree_tree_open_empty --
642  *	Create an empty in-memory tree.
643  */
644 static int
__btree_tree_open_empty(WT_SESSION_IMPL * session,bool creation)645 __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation)
646 {
647 	WT_BTREE *btree;
648 	WT_DECL_RET;
649 	WT_PAGE *leaf, *root;
650 	WT_PAGE_INDEX *pindex;
651 	WT_REF *ref;
652 
653 	btree = S2BT(session);
654 	root = leaf = NULL;
655 	ref = NULL;
656 
657 	/*
658 	 * Newly created objects can be used for cursor inserts or for bulk
659 	 * loads; set a flag that's cleared when a row is inserted into the
660 	 * tree.
661 	 */
662 	if (creation)
663 		btree->original = 1;
664 
665 	/*
666 	 * A note about empty trees: the initial tree is a single root page.
667 	 * It has a single reference to a leaf page, marked deleted.  The leaf
668 	 * page will be created by the first update.  If the root is evicted
669 	 * without being modified, that's OK, nothing is ever written.
670 	 *
671 	 * !!!
672 	 * Be cautious about changing the order of updates in this code: to call
673 	 * __wt_page_out on error, we require a correct page setup at each point
674 	 * where we might fail.
675 	 */
676 	switch (btree->type) {
677 	case BTREE_COL_FIX:
678 	case BTREE_COL_VAR:
679 		WT_ERR(__wt_page_alloc(
680 		    session, WT_PAGE_COL_INT, 1, true, &root));
681 		root->pg_intl_parent_ref = &btree->root;
682 
683 		pindex = WT_INTL_INDEX_GET_SAFE(root);
684 		ref = pindex->index[0];
685 		ref->home = root;
686 		ref->page = NULL;
687 		ref->addr = NULL;
688 		ref->state = WT_REF_DELETED;
689 		ref->ref_recno = 1;
690 		break;
691 	case BTREE_ROW:
692 		WT_ERR(__wt_page_alloc(
693 		    session, WT_PAGE_ROW_INT, 1, true, &root));
694 		root->pg_intl_parent_ref = &btree->root;
695 
696 		pindex = WT_INTL_INDEX_GET_SAFE(root);
697 		ref = pindex->index[0];
698 		ref->home = root;
699 		ref->page = NULL;
700 		ref->addr = NULL;
701 		ref->state = WT_REF_DELETED;
702 		WT_ERR(__wt_row_ikey_incr(session, root, 0, "", 1, ref));
703 		break;
704 	}
705 
706 	/* Bulk loads require a leaf page for reconciliation: create it now. */
707 	if (F_ISSET(btree, WT_BTREE_BULK)) {
708 		WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
709 		ref->page = leaf;
710 		ref->state = WT_REF_MEM;
711 		WT_ERR(__wt_page_modify_init(session, leaf));
712 		__wt_page_only_modify_set(session, leaf);
713 	}
714 
715 	/* Finish initializing the root, root reference links. */
716 	__wt_root_ref_init(&btree->root, root, btree->type != BTREE_ROW);
717 
718 	return (0);
719 
720 err:	if (leaf != NULL)
721 		__wt_page_out(session, &leaf);
722 	if (root != NULL)
723 		__wt_page_out(session, &root);
724 	return (ret);
725 }
726 
727 /*
728  * __wt_btree_new_leaf_page --
729  *	Create an empty leaf page.
730  */
731 int
__wt_btree_new_leaf_page(WT_SESSION_IMPL * session,WT_PAGE ** pagep)732 __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep)
733 {
734 	WT_BTREE *btree;
735 
736 	btree = S2BT(session);
737 
738 	switch (btree->type) {
739 	case BTREE_COL_FIX:
740 		WT_RET(__wt_page_alloc(
741 		    session, WT_PAGE_COL_FIX, 0, false, pagep));
742 		break;
743 	case BTREE_COL_VAR:
744 		WT_RET(__wt_page_alloc(
745 		    session, WT_PAGE_COL_VAR, 0, false, pagep));
746 		break;
747 	case BTREE_ROW:
748 		WT_RET(__wt_page_alloc(
749 		    session, WT_PAGE_ROW_LEAF, 0, false, pagep));
750 		break;
751 	}
752 	return (0);
753 }
754 
755 /*
756  * __btree_preload --
757  *	Pre-load internal pages.
758  */
759 static int
__btree_preload(WT_SESSION_IMPL * session)760 __btree_preload(WT_SESSION_IMPL *session)
761 {
762 	WT_BM *bm;
763 	WT_BTREE *btree;
764 	WT_REF *ref;
765 	size_t addr_size;
766 	const uint8_t *addr;
767 
768 	btree = S2BT(session);
769 	bm = btree->bm;
770 
771 	/* Pre-load the second-level internal pages. */
772 	WT_INTL_FOREACH_BEGIN(session, btree->root.page, ref) {
773 		__wt_ref_info(ref, &addr, &addr_size, NULL);
774 		if (addr != NULL)
775 			WT_RET(bm->preload(bm, session, addr, addr_size));
776 	} WT_INTL_FOREACH_END;
777 	return (0);
778 }
779 
780 /*
781  * __btree_get_last_recno --
782  *	Set the last record number for a column-store.
783  */
784 static int
__btree_get_last_recno(WT_SESSION_IMPL * session)785 __btree_get_last_recno(WT_SESSION_IMPL *session)
786 {
787 	WT_BTREE *btree;
788 	WT_PAGE *page;
789 	WT_REF *next_walk;
790 
791 	btree = S2BT(session);
792 
793 	next_walk = NULL;
794 	WT_RET(__wt_tree_walk(session, &next_walk, WT_READ_PREV));
795 	if (next_walk == NULL)
796 		return (WT_NOTFOUND);
797 
798 	page = next_walk->page;
799 	btree->last_recno = page->type == WT_PAGE_COL_VAR ?
800 	    __col_var_last_recno(next_walk) : __col_fix_last_recno(next_walk);
801 
802 	return (__wt_page_release(session, next_walk, 0));
803 }
804 
805 /*
806  * __btree_page_sizes --
807  *	Verify the page sizes. Some of these sizes are automatically checked
808  *	using limits defined in the API, don't duplicate the logic here.
809  */
810 static int
__btree_page_sizes(WT_SESSION_IMPL * session)811 __btree_page_sizes(WT_SESSION_IMPL *session)
812 {
813 	WT_BTREE *btree;
814 	WT_CONFIG_ITEM cval;
815 	WT_CONNECTION_IMPL *conn;
816 	uint64_t cache_size;
817 	uint32_t intl_split_size, leaf_split_size, max;
818 	const char **cfg;
819 
820 	btree = S2BT(session);
821 	conn = S2C(session);
822 	cfg = btree->dhandle->cfg;
823 
824 	/*
825 	 * Get the allocation size.  Allocation sizes must be a power-of-two,
826 	 * nothing else makes sense.
827 	 */
828 	WT_RET(__wt_direct_io_size_check(
829 	    session, cfg, "allocation_size", &btree->allocsize));
830 	if (!__wt_ispo2(btree->allocsize))
831 		WT_RET_MSG(session,
832 		    EINVAL, "the allocation size must be a power of two");
833 
834 	/*
835 	 * Get the internal/leaf page sizes.
836 	 * All page sizes must be in units of the allocation size.
837 	 */
838 	WT_RET(__wt_direct_io_size_check(
839 	    session, cfg, "internal_page_max", &btree->maxintlpage));
840 	WT_RET(__wt_direct_io_size_check(
841 	    session, cfg, "leaf_page_max", &btree->maxleafpage));
842 	if (btree->maxintlpage < btree->allocsize ||
843 	    btree->maxintlpage % btree->allocsize != 0 ||
844 	    btree->maxleafpage < btree->allocsize ||
845 	    btree->maxleafpage % btree->allocsize != 0)
846 		WT_RET_MSG(session, EINVAL,
847 		    "page sizes must be a multiple of the page allocation "
848 		    "size (%" PRIu32 "B)", btree->allocsize);
849 
850 	/*
851 	 * Default in-memory page image size for compression is 4x the maximum
852 	 * internal or leaf page size, and enforce the on-disk page sizes as a
853 	 * lower-limit for the in-memory image size.
854 	 */
855 	WT_RET(__wt_config_gets(session, cfg, "memory_page_image_max", &cval));
856 	btree->maxmempage_image = (uint32_t)cval.val;
857 	max = WT_MAX(btree->maxintlpage, btree->maxleafpage);
858 	if (btree->maxmempage_image == 0)
859 		btree->maxmempage_image = 4 * max;
860 	else if (btree->maxmempage_image < max)
861 		WT_RET_MSG(session, EINVAL,
862 		    "in-memory page image size must be larger than the maximum "
863 		    "page size (%" PRIu32 "B < %" PRIu32 "B)",
864 		    btree->maxmempage_image, max);
865 
866 	/*
867 	 * Don't let pages grow large compared to the cache size or we can end
868 	 * up in a situation where nothing can be evicted.  Make sure at least
869 	 * 10 pages fit in cache when it is at the dirty trigger where threads
870 	 * stall.
871 	 *
872 	 * Take care getting the cache size: with a shared cache, it may not
873 	 * have been set.  Don't forget to update the API documentation if you
874 	 * alter the bounds for any of the parameters here.
875 	 */
876 	WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
877 	btree->maxmempage = (uint64_t)cval.val;
878 	if (!F_ISSET(conn, WT_CONN_CACHE_POOL) &&
879 	    (cache_size = conn->cache_size) > 0)
880 		btree->maxmempage = (uint64_t)WT_MIN(btree->maxmempage,
881 		    (conn->cache->eviction_dirty_trigger * cache_size) / 1000);
882 
883 	/* Enforce a lower bound of a single disk leaf page */
884 	btree->maxmempage = WT_MAX(btree->maxmempage, btree->maxleafpage);
885 
886 	/*
887 	 * Try in-memory splits once we hit 80% of the maximum in-memory page
888 	 * size.  This gives multi-threaded append workloads a better chance of
889 	 * not stalling.
890 	 */
891 	btree->splitmempage = (8 * btree->maxmempage) / 10;
892 
893 	/*
894 	 * Get the split percentage (reconciliation splits pages into smaller
895 	 * than the maximum page size chunks so we don't split every time a
896 	 * new entry is added). Determine how large newly split pages will be.
897 	 * Set to the minimum, if the read value is less than that.
898 	 */
899 	WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
900 	if (cval.val < WT_BTREE_MIN_SPLIT_PCT) {
901 		btree->split_pct = WT_BTREE_MIN_SPLIT_PCT;
902 		WT_RET(__wt_msg(session,
903 		    "Re-setting split_pct for %s to the minimum allowed of "
904 		    "%d%%.", session->dhandle->name, WT_BTREE_MIN_SPLIT_PCT));
905 	} else
906 		btree->split_pct = (int)cval.val;
907 	intl_split_size = __wt_split_page_size(
908 	    btree->split_pct, btree->maxintlpage, btree->allocsize);
909 	leaf_split_size = __wt_split_page_size(
910 	    btree->split_pct, btree->maxleafpage, btree->allocsize);
911 
912 	/*
913 	 * In-memory split configuration.
914 	 */
915 	if (__wt_config_gets(
916 	    session, cfg, "split_deepen_min_child", &cval) == WT_NOTFOUND ||
917 	    cval.val == 0)
918 		btree->split_deepen_min_child = WT_SPLIT_DEEPEN_MIN_CHILD_DEF;
919 	else
920 		btree->split_deepen_min_child = (u_int)cval.val;
921 	if (__wt_config_gets(
922 	    session, cfg, "split_deepen_per_child", &cval) == WT_NOTFOUND ||
923 	    cval.val == 0)
924 		btree->split_deepen_per_child = WT_SPLIT_DEEPEN_PER_CHILD_DEF;
925 	else
926 		btree->split_deepen_per_child = (u_int)cval.val;
927 
928 	/*
929 	 * Get the maximum internal/leaf page key/value sizes.
930 	 *
931 	 * In-memory configuration overrides any key/value sizes, there's no
932 	 * such thing as an overflow item in an in-memory configuration.
933 	 */
934 	if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
935 		btree->maxintlkey = WT_BTREE_MAX_OBJECT_SIZE;
936 		btree->maxleafkey = WT_BTREE_MAX_OBJECT_SIZE;
937 		btree->maxleafvalue = WT_BTREE_MAX_OBJECT_SIZE;
938 		return (0);
939 	}
940 
941 	/*
942 	 * In historic versions of WiredTiger, the maximum internal/leaf page
943 	 * key/value sizes were set by the internal_item_max and leaf_item_max
944 	 * configuration strings. Look for those strings if we don't find the
945 	 * newer ones.
946 	 */
947 	WT_RET(__wt_config_gets(session, cfg, "internal_key_max", &cval));
948 	btree->maxintlkey = (uint32_t)cval.val;
949 	if (btree->maxintlkey == 0) {
950 		WT_RET(
951 		    __wt_config_gets(session, cfg, "internal_item_max", &cval));
952 		btree->maxintlkey = (uint32_t)cval.val;
953 	}
954 	WT_RET(__wt_config_gets(session, cfg, "leaf_key_max", &cval));
955 	btree->maxleafkey = (uint32_t)cval.val;
956 	WT_RET(__wt_config_gets(session, cfg, "leaf_value_max", &cval));
957 	btree->maxleafvalue = (uint32_t)cval.val;
958 	if (btree->maxleafkey == 0 && btree->maxleafvalue == 0) {
959 		WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval));
960 		btree->maxleafkey = (uint32_t)cval.val;
961 		btree->maxleafvalue = (uint32_t)cval.val;
962 	}
963 
964 	/*
965 	 * Default/maximum for internal and leaf page keys: split-page / 10.
966 	 * Default for leaf page values: split-page / 2.
967 	 *
968 	 * It's difficult for applications to configure this in any exact way as
969 	 * they have to duplicate our calculation of how many keys must fit on a
970 	 * page, and given a split-percentage and page header, that isn't easy
971 	 * to do. If the maximum internal key value is too large for the page,
972 	 * reset it to the default.
973 	 */
974 	if (btree->maxintlkey == 0 || btree->maxintlkey > intl_split_size / 10)
975 		btree->maxintlkey = intl_split_size / 10;
976 	if (btree->maxleafkey == 0)
977 		btree->maxleafkey = leaf_split_size / 10;
978 	if (btree->maxleafvalue == 0)
979 		btree->maxleafvalue = leaf_split_size / 2;
980 
981 	return (0);
982 }
983 
984 /*
985  * __wt_btree_immediately_durable --
986  *	Check whether this btree is configured for immediate durability.
987  */
988 bool
__wt_btree_immediately_durable(WT_SESSION_IMPL * session)989 __wt_btree_immediately_durable(WT_SESSION_IMPL *session)
990 {
991 	WT_BTREE *btree;
992 
993 	btree = S2BT(session);
994 
995 	/*
996 	 * This is used to determine whether timestamp updates should
997 	 * be rolled back for this btree. With in-memory, the logging
998 	 * setting on tables is still important and when enabled they
999 	 * should be considered "durable".
1000 	 */
1001 	return ((FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) ||
1002 	    (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))) &&
1003 	    !F_ISSET(btree, WT_BTREE_NO_LOGGING));
1004 }
1005