1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 static int __lsm_tree_cleanup_old(WT_SESSION_IMPL *, const char *);
12 static int __lsm_tree_open_check(WT_SESSION_IMPL *, WT_LSM_TREE *);
13 static int __lsm_tree_open(
14     WT_SESSION_IMPL *, const char *, bool, WT_LSM_TREE **);
15 static int __lsm_tree_set_name(WT_SESSION_IMPL *, WT_LSM_TREE *, const char *);
16 
17 /*
18  * __lsm_tree_discard_state --
19  *	Free the metadata configuration state-related LSM tree pointers.
20  */
21 static void
22 __lsm_tree_discard_state(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
23 {
24 	WT_LSM_CHUNK *chunk;
25 	u_int i;
26 
27 	__wt_free(session, lsm_tree->config);
28 	__wt_free(session, lsm_tree->key_format);
29 	__wt_free(session, lsm_tree->value_format);
30 	__wt_free(session, lsm_tree->collator_name);
31 	__wt_free(session, lsm_tree->custom_prefix);
32 	__wt_free(session, lsm_tree->custom_suffix);
33 	__wt_free(session, lsm_tree->bloom_config);
34 	__wt_free(session, lsm_tree->file_config);
35 
36 	for (i = 0; i < lsm_tree->nchunks; i++) {
37 		if ((chunk = lsm_tree->chunk[i]) == NULL)
38 			continue;
39 
40 		__wt_spin_destroy(session, &chunk->timestamp_spinlock);
41 		__wt_free(session, chunk->bloom_uri);
42 		__wt_free(session, chunk->uri);
43 		__wt_free(session, chunk);
44 	}
45 
46 	for (i = 0; i < lsm_tree->nold_chunks; i++) {
47 		chunk = lsm_tree->old_chunks[i];
48 		WT_ASSERT(session, chunk != NULL);
49 
50 		__wt_spin_destroy(session, &chunk->timestamp_spinlock);
51 		__wt_free(session, chunk->bloom_uri);
52 		__wt_free(session, chunk->uri);
53 		__wt_free(session, chunk);
54 	}
55 }
56 
57 /*
58  * __lsm_tree_discard --
59  *	Free an LSM tree structure.
60  */
61 static int
62 __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
63 {
64 	WT_DECL_RET;
65 
66 	WT_UNUSED(final);	/* Only used in diagnostic builds */
67 
68 	WT_ASSERT(session, !lsm_tree->active);
69 	/*
70 	 * The work unit queue should be empty, but it's worth checking
71 	 * since work units use a different locking scheme to regular tree
72 	 * operations.
73 	 */
74 	WT_ASSERT(session, lsm_tree->queue_ref == 0);
75 
76 	/* We may be destroying an lsm_tree before it was added. */
77 	if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
78 		WT_ASSERT(session, final ||
79 		    F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
80 		TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
81 	}
82 
83 	if (lsm_tree->collator_owned &&
84 	    lsm_tree->collator->terminate != NULL)
85 		WT_TRET(lsm_tree->collator->terminate(
86 		    lsm_tree->collator, &session->iface));
87 
88 	__wt_free(session, lsm_tree->name);
89 	__lsm_tree_discard_state(session, lsm_tree);
90 	__wt_free(session, lsm_tree->chunk);
91 	__wt_free(session, lsm_tree->old_chunks);
92 
93 	__wt_rwlock_destroy(session, &lsm_tree->rwlock);
94 
95 	__wt_free(session, lsm_tree);
96 
97 	return (ret);
98 }
99 
100 /*
101  * __lsm_tree_close --
102  *	Close an LSM tree structure.
103  */
104 static void
105 __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
106 {
107 	/*
108 	 * Stop any new work units being added. The barrier is necessary
109 	 * because we rely on the state change being visible before checking
110 	 * the tree queue state.
111 	 */
112 	lsm_tree->active = false;
113 	WT_FULL_BARRIER();
114 
115 	/*
116 	 * Wait for all LSM operations to drain. If WiredTiger is shutting
117 	 * down, also wait for the tree reference count to go to zero; otherwise
118 	 * we know a user is holding a reference to the tree, so exclusive
119 	 * access is not available.
120 	 */
121 	while (lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1)) {
122 		/*
123 		 * Remove any work units from the manager queues. Do this step
124 		 * repeatedly in case a work unit was in the process of being
125 		 * created when we cleared the active flag.
126 		 *
127 		 * !!! Drop the schema and handle list locks whilst completing
128 		 * this step so that we don't block any operations that require
129 		 * the schema lock to complete. This is safe because any
130 		 * operation that is closing the tree should first have gotten
131 		 * exclusive access to the LSM tree via __wt_lsm_tree_get, so
132 		 * other schema level operations will return EBUSY, even though
133 		 * we're dropping the schema lock here.
134 		 */
135 		WT_WITHOUT_LOCKS(session,
136 		    __wt_lsm_manager_clear_tree(session, lsm_tree));
137 	}
138 }
139 
140 /*
141  * __wt_lsm_tree_close_all --
142  *	Close all LSM tree structures.
143  */
144 int
145 __wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
146 {
147 	WT_DECL_RET;
148 	WT_LSM_TREE *lsm_tree, *lsm_tree_tmp;
149 
150 	/* We are shutting down: the handle list lock isn't required. */
151 
152 	WT_TAILQ_SAFE_REMOVE_BEGIN(lsm_tree,
153 	    &S2C(session)->lsmqh, q, lsm_tree_tmp) {
154 		/*
155 		 * Tree close assumes that we have a reference to the tree
156 		 * so it can tell when it's safe to do the close. We could
157 		 * get the tree here, but we short circuit instead. There
158 		 * is no need to decrement the reference count since discard
159 		 * is unconditional.
160 		 */
161 		(void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
162 		__lsm_tree_close(session, lsm_tree, true);
163 		WT_TRET(__lsm_tree_discard(session, lsm_tree, true));
164 	} WT_TAILQ_SAFE_REMOVE_END
165 
166 	return (ret);
167 }
168 
169 /*
170  * __lsm_tree_set_name --
171  *	Set or reset the name of an LSM tree.
172  */
173 static int
174 __lsm_tree_set_name(WT_SESSION_IMPL *session,
175     WT_LSM_TREE *lsm_tree, const char *uri)
176 {
177 	void *p;
178 
179 	WT_RET(__wt_strdup(session, uri, &p));
180 
181 	__wt_free(session, lsm_tree->name);
182 	lsm_tree->name = p;
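	/*
	 * Illustration (hypothetical name): an LSM tree URI of the form
	 * "lsm:example" yields the filename "example" once the "lsm:" prefix
	 * is skipped below.
	 */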
183 	lsm_tree->filename = lsm_tree->name + strlen("lsm:");
184 	return (0);
185 }
186 
187 /*
188  * __wt_lsm_tree_bloom_name --
189  *	Get the URI of the Bloom filter for a given chunk.
190  */
191 int
192 __wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session,
193     WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
194 {
195 	WT_DECL_ITEM(tmp);
196 	WT_DECL_RET;
197 
198 	WT_RET(__wt_scr_alloc(session, 0, &tmp));
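	/*
	 * Illustration with an invented name: chunk id 12 of a tree whose
	 * filename is "example" gets the Bloom filter URI
	 * "file:example-000012.bf".
	 */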
199 	WT_ERR(__wt_buf_fmt(
200 	    session, tmp, "file:%s-%06" PRIu32 ".bf", lsm_tree->filename, id));
201 	WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
202 
203 err:	__wt_scr_free(session, &tmp);
204 	return (ret);
205 }
206 
207 /*
208  * __wt_lsm_tree_chunk_name --
209  *	Get the URI of the file for a given chunk.
210  */
211 int
212 __wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session,
213     WT_LSM_TREE *lsm_tree, uint32_t id, uint32_t generation, const char **retp)
214 {
215 	WT_DECL_ITEM(tmp);
216 	WT_DECL_RET;
217 
218 	WT_RET(__wt_scr_alloc(session, 0, &tmp));
219 
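	/*
	 * Illustration with an invented name: chunk id 7 of a tree whose
	 * filename is "example" normally becomes "file:example-000007.lsm";
	 * if a custom data source applies to this generation, the URI is
	 * built as "<custom_prefix>:example-000007<custom_suffix>" from the
	 * tree's configured prefix and suffix.
	 */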
220 	if (lsm_tree->custom_generation != 0 &&
221 	    generation >= lsm_tree->custom_generation)
222 		WT_ERR(__wt_buf_fmt(session, tmp, "%s:%s-%06" PRIu32 "%s",
223 		    lsm_tree->custom_prefix, lsm_tree->filename, id,
224 		    lsm_tree->custom_suffix));
225 	else
226 		WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-%06" PRIu32 ".lsm",
227 		    lsm_tree->filename, id));
228 
229 	WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
230 
231 err:	__wt_scr_free(session, &tmp);
232 	return (ret);
233 }
234 
235 /*
236  * __wt_lsm_tree_set_chunk_size --
237  *	Set the size of the chunk. Should only be called for chunks that are
238  *	on disk, or about to become on disk.
239  */
240 int
241 __wt_lsm_tree_set_chunk_size(
242     WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
243 {
244 	WT_DATA_SOURCE *dsrc;
245 	wt_off_t size;
246 	const char *filename;
247 
248 	size = 0;
249 	if (lsm_tree->custom_generation != 0 &&
250 	    chunk->generation >= lsm_tree->custom_generation) {
251 		dsrc = __wt_schema_get_source(session, chunk->uri);
252 		/*
253 		 * We can only retrieve a size if the data source exposes the
254 		 * information.
255 		 */
256 		if (dsrc != NULL && dsrc->size != NULL) {
257 			/* Call the callback. */
258 			WT_RET(dsrc->size(
259 			    dsrc, (WT_SESSION*)session, chunk->uri, &size));
260 		}
261 	} else {
262 		filename = chunk->uri;
263 		if (!WT_PREFIX_SKIP(filename, "file:"))
264 			WT_RET_MSG(session, EINVAL,
265 			    "Expected a 'file:' URI: %s", chunk->uri);
266 		WT_RET(__wt_fs_size(session, filename, &size));
267 	}
268 
269 	chunk->size = (uint64_t)size;
270 
271 	return (0);
272 }
273 
274 /*
275  * __lsm_tree_cleanup_old --
276  *	Cleanup any old LSM chunks that might conflict with one we are
277  *	about to create. Sometimes failed LSM metadata operations can
278  *	leave old files and bloom filters behind.
279  */
280 static int
281 __lsm_tree_cleanup_old(WT_SESSION_IMPL *session, const char *uri)
282 {
283 	WT_DECL_RET;
284 	const char *cfg[] =
285 	    { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };
286 	bool exists, is_file;
287 
288 	exists = false;
289 	is_file = WT_PREFIX_MATCH(uri, "file:");
290 	if (is_file)
291 		WT_RET(__wt_fs_exist(session, uri + strlen("file:"), &exists));
292 	if (!is_file || exists)
293 		WT_WITH_SCHEMA_LOCK(session,
294 		    ret = __wt_schema_drop(session, uri, cfg));
295 	return (ret);
296 }
297 
298 /*
299  * __wt_lsm_tree_setup_chunk --
300  *	Initialize a chunk of an LSM tree.
301  */
302 int
303 __wt_lsm_tree_setup_chunk(
304     WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
305 {
306 	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
307 	__wt_epoch(session, &chunk->create_time);
308 
309 	WT_RET(__wt_spin_init(session,
310 	    &chunk->timestamp_spinlock, "LSM chunk timestamp"));
311 	WT_RET(__wt_lsm_tree_chunk_name(
312 	    session, lsm_tree, chunk->id, chunk->generation, &chunk->uri));
313 
314 	/*
315 	 * If the underlying file exists, drop the chunk first - there may be
316 	 * some content hanging over from an aborted merge or checkpoint.
317 	 *
318 	 * Don't do this for the very first chunk: we are called during
319 	 * WT_SESSION::create, and doing a drop inside there does interesting
320 	 * things with handle locks and metadata tracking.  It can never have
321 	 * been the result of an interrupted merge, anyway.
322 	 */
323 	if (chunk->id > 1)
324 		WT_RET(__lsm_tree_cleanup_old(session, chunk->uri));
325 
326 	return (__wt_schema_create(session, chunk->uri, lsm_tree->file_config));
327 }
328 
329 /*
330  * __wt_lsm_tree_setup_bloom --
331  *	Initialize a bloom filter for an LSM tree.
332  */
333 int
334 __wt_lsm_tree_setup_bloom(
335     WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
336 {
337 	/*
338 	 * The Bloom URI can be populated when the chunk is created, but
339 	 * it isn't set yet on open or merge.
340 	 */
341 	if (chunk->bloom_uri == NULL)
342 		WT_RET(__wt_lsm_tree_bloom_name(
343 		    session, lsm_tree, chunk->id, &chunk->bloom_uri));
344 
345 	return (__lsm_tree_cleanup_old(session, chunk->bloom_uri));
346 }
347 
348 /*
349  * __wt_lsm_tree_create --
350  *	Create an LSM tree structure for the given name.
351  */
352 int
353 __wt_lsm_tree_create(WT_SESSION_IMPL *session,
354     const char *uri, bool exclusive, const char *config)
355 {
356 	WT_CONFIG_ITEM cval;
357 	WT_DECL_RET;
358 	WT_LSM_TREE *lsm_tree;
359 	const char *cfg[] =
360 	    { WT_CONFIG_BASE(session, lsm_meta), config, NULL };
361 	const char *metadata;
362 
363 	metadata = NULL;
364 
365 	/* If the tree can be opened, it already exists. */
366 	if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) {
367 		__wt_lsm_tree_release(session, lsm_tree);
368 		return (exclusive ? EEXIST : 0);
369 	}
370 	WT_RET_NOTFOUND_OK(ret);
371 
372 	if (!F_ISSET(S2C(session), WT_CONN_READONLY)) {
373 		/* LSM doesn't yet support the 'r' format. */
374 		WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
375 		if (WT_STRING_MATCH("r", cval.str, cval.len))
376 			WT_ERR_MSG(session, EINVAL,
377 			    "LSM trees do not support a key format of 'r'");
378 
379 		WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata));
380 		WT_ERR(__wt_metadata_insert(session, uri, metadata));
381 	}
382 
383 	/*
384 	 * Open our new tree and add it to the handle cache. Don't discard on
385 	 * error: the returned handle is NULL on error, and the metadata
386 	 * tracking macros handle cleaning up on failure.
387 	 */
388 	WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
389 	    ret = __lsm_tree_open(session, uri, true, &lsm_tree));
390 	if (ret == 0)
391 		__wt_lsm_tree_release(session, lsm_tree);
392 
393 err:	__wt_free(session, metadata);
394 	return (ret);
395 }
396 
397 /*
398  * __lsm_tree_find --
399  *	Find an LSM tree structure for the given name. Optionally get exclusive
400  *	access to the handle. Exclusive access works separately from the LSM
401  *	tree lock, since operations that need exclusive access may also need
402  *	to take the LSM tree lock, for example outstanding work unit operations.
403  */
404 static int
405 __lsm_tree_find(WT_SESSION_IMPL *session,
406     const char *uri, bool exclusive, WT_LSM_TREE **treep)
407 {
408 	WT_LSM_TREE *lsm_tree;
409 
410 	*treep = NULL;
411 	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
412 
413 	/* See if the tree is already open. */
414 	TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
415 		if (strcmp(uri, lsm_tree->name) == 0) {
416 			if (exclusive) {
417 				/*
418 				 * Make sure we win the race to switch on the
419 				 * exclusive flag.
420 				 */
421 				if (!__wt_atomic_cas_ptr(
422 				    &lsm_tree->excl_session, NULL, session))
423 					return (__wt_set_return(
424 					    session, EBUSY));
425 
426 				/*
427 				 * Drain the work queue before checking for
428 				 * open cursors - otherwise we can generate
429 				 * spurious busy returns.
430 				 */
431 				(void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
432 				__lsm_tree_close(session, lsm_tree, false);
433 				if (lsm_tree->refcnt != 1) {
434 					__wt_lsm_tree_release(
435 					    session, lsm_tree);
436 					return (__wt_set_return(
437 					    session, EBUSY));
438 				}
439 			} else {
440 				(void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
441 
442 				/*
443 				 * We got a reference, check if an exclusive
444 				 * lock beat us to it.
445 				 */
446 				if (lsm_tree->excl_session != NULL) {
447 					WT_ASSERT(session,
448 					    lsm_tree->refcnt > 0);
449 					__wt_lsm_tree_release(
450 					    session, lsm_tree);
451 					return (__wt_set_return(
452 					    session, EBUSY));
453 				}
454 			}
455 
456 			*treep = lsm_tree;
457 
458 			WT_ASSERT(session, lsm_tree->excl_session ==
459 			     (exclusive ? session : NULL));
460 			return (0);
461 		}
462 
463 	return (WT_NOTFOUND);
464 }
465 
466 /*
467  * __lsm_tree_open_check --
468  *	Validate the configuration of an LSM tree.
469  */
470 static int
471 __lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
472 {
473 	WT_CONFIG_ITEM cval;
474 	WT_CONNECTION_IMPL *conn;
475 	uint64_t maxleafpage, required;
476 	const char *cfg[] = { WT_CONFIG_BASE(
477 	    session, WT_SESSION_create), lsm_tree->file_config, NULL };
478 
479 	conn = S2C(session);
480 
481 	WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval));
482 	maxleafpage = (uint64_t)cval.val;
483 
484 	required = WT_LSM_TREE_MINIMUM_SIZE(
485 	    lsm_tree->chunk_size, lsm_tree->merge_max, maxleafpage);
486 	if (conn->cache_size < required)
487 		WT_RET_MSG(session, EINVAL,
488 		    "LSM cache size %" PRIu64 " (%" PRIu64 "MB) too small, "
489 		    "must be at least %" PRIu64 " (%" PRIu64 "MB)",
490 		    conn->cache_size, conn->cache_size / WT_MEGABYTE,
491 		    required, (required + (WT_MEGABYTE - 1))/ WT_MEGABYTE);
492 	return (0);
493 }
494 
495 /*
496  * __lsm_tree_open --
497  *	Open an LSM tree structure.
498  */
499 static int
500 __lsm_tree_open(WT_SESSION_IMPL *session,
501     const char *uri, bool exclusive, WT_LSM_TREE **treep)
502 {
503 	WT_CONNECTION_IMPL *conn;
504 	WT_DECL_RET;
505 	WT_LSM_TREE *lsm_tree;
506 
507 	conn = S2C(session);
508 	lsm_tree = NULL;
509 
510 	WT_ASSERT(session,
511 	    F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
512 
513 	/* Start the LSM manager thread if it isn't running. */
514 	WT_RET(__wt_lsm_manager_start(session));
515 
516 	/* Make sure no one beat us to it. */
517 	if ((ret = __lsm_tree_find(
518 	    session, uri, exclusive, treep)) != WT_NOTFOUND)
519 		return (ret);
520 
521 	/* Try to open the tree. */
522 	WT_RET(__wt_calloc_one(session, &lsm_tree));
523 	WT_ERR(__wt_rwlock_init(session, &lsm_tree->rwlock));
524 
525 	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
526 
527 	WT_ERR(__wt_lsm_meta_read(session, lsm_tree));
528 
529 	/*
530 	 * Sanity check the configuration. Do it now since this is the first
531 	 * time we have the LSM tree configuration.
532 	 */
533 	WT_ERR(__lsm_tree_open_check(session, lsm_tree));
534 
535 	/* Set the generation number so cursors are opened on first usage. */
536 	lsm_tree->dsk_gen = 1;
537 
538 	/*
539 	 * Setup reference counting. Use separate reference counts for tree
540 	 * handles and queue entries, so that queue entries don't interfere
541 	 * with getting handles exclusive.
542 	 */
543 	lsm_tree->refcnt = 1;
544 	lsm_tree->excl_session = exclusive ? session : NULL;
545 	lsm_tree->queue_ref = 0;
546 
547 	/* Set a flush timestamp as a baseline. */
548 	__wt_epoch(session, &lsm_tree->last_flush_time);
549 
550 	/* Now the tree is set up, make it visible to others. */
551 	TAILQ_INSERT_HEAD(&conn->lsmqh, lsm_tree, q);
552 	if (!exclusive)
553 		lsm_tree->active = true;
554 	F_SET(lsm_tree, WT_LSM_TREE_OPEN);
555 
556 	*treep = lsm_tree;
557 
558 	if (0) {
559 err:		WT_TRET(__lsm_tree_discard(session, lsm_tree, false));
560 	}
561 	return (ret);
562 }
563 
564 /*
565  * __wt_lsm_tree_get --
566  *	Find an LSM tree handle or open a new one.
567  */
568 int
569 __wt_lsm_tree_get(WT_SESSION_IMPL *session,
570     const char *uri, bool exclusive, WT_LSM_TREE **treep)
571 {
572 	WT_DECL_RET;
573 
574 	/*
575 	 * Dropping and re-acquiring the lock is safe here, since the tree open
576 	 * call checks to see if another thread beat it to opening the tree
577 	 * before proceeding.
578 	 */
579 	if (exclusive)
580 		WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
581 		    ret = __lsm_tree_find(session, uri, exclusive, treep));
582 	else
583 		WT_WITH_HANDLE_LIST_READ_LOCK(session,
584 		    ret = __lsm_tree_find(session, uri, exclusive, treep));
585 	if (ret == WT_NOTFOUND)
586 		WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
587 		    ret = __lsm_tree_open(session, uri, exclusive, treep));
588 
589 	return (ret);
590 }
591 
592 /*
593  * __wt_lsm_tree_release --
594  *	Release an LSM tree structure.
595  */
596 void
597 __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
598 {
599 	WT_ASSERT(session, lsm_tree->refcnt > 0);
600 	if (lsm_tree->excl_session == session) {
601 		/* We cleared the active flag when getting exclusive access. */
602 		lsm_tree->active = true;
603 		lsm_tree->excl_session = NULL;
604 	}
605 	(void)__wt_atomic_sub32(&lsm_tree->refcnt, 1);
606 }
607 
608 /* How aggressively to ramp up or down throttle due to level 0 merging */
609 #define	WT_LSM_MERGE_THROTTLE_BUMP_PCT	(100 / lsm_tree->merge_max)
610 /* Number of level 0 chunks that need to be present to throttle inserts */
611 #define	WT_LSM_MERGE_THROTTLE_THRESHOLD					\
612 	(2 * lsm_tree->merge_min)
613 /* Minimal throttling time */
614 #define	WT_LSM_THROTTLE_START		20
615 
616 #define	WT_LSM_MERGE_THROTTLE_INCREASE(val)	do {			\
617 	(val) += ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100;	\
618 	if ((val) < WT_LSM_THROTTLE_START)				\
619 		(val) = WT_LSM_THROTTLE_START;				\
620 	} while (0)
621 
622 #define	WT_LSM_MERGE_THROTTLE_DECREASE(val)	do {			\
623 	(val) -= ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100;	\
624 	if ((val) < WT_LSM_THROTTLE_START)				\
625 		(val) = 0;						\
626 	} while (0)
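
/*
 * Worked example of the macros above (example value, not a default): with
 * merge_max set to 20, WT_LSM_MERGE_THROTTLE_BUMP_PCT is 5, so each increase
 * or decrease adjusts the current throttle by 5% of its value; a throttle
 * ramping up from zero jumps straight to WT_LSM_THROTTLE_START (20), and a
 * decrease that would drop below that floor clears the throttle to zero.
 */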
627 
628 /*
629  * __wt_lsm_tree_throttle --
630  *	Calculate whether LSM updates need to be throttled. Must be called
631  *	with the LSM tree lock held.
632  */
633 void
634 __wt_lsm_tree_throttle(
635     WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool decrease_only)
636 {
637 	WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
638 	uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
639 	uint32_t in_memory, gen0_chunks;
640 
641 	/* Never throttle in small trees. */
642 	if (lsm_tree->nchunks < 3) {
643 		lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
644 		return;
645 	}
646 
647 	cache_sz = S2C(session)->cache_size;
648 
649 	/*
650 	 * In the steady state, we expect that the checkpoint worker thread
651 	 * will keep up with inserts.  If not, throttle the insert rate to
652 	 * avoid filling the cache with in-memory chunks.  Threads sleep every
653 	 * 100 operations, so take that into account in the calculation.
654 	 *
655 	 * Also throttle based on whether merge threads are keeping up.  If
656 	 * there are enough chunks that have never been merged we slow down
657 	 * inserts so that merges have some chance of keeping up.
658 	 *
659 	 * Count the number of in-memory chunks, the number of unmerged chunks
660 	 * on disk, and find the most recent on-disk chunk (if any).
661 	 */
662 	record_count = 1;
663 	gen0_chunks = in_memory = 0;
664 	ondisk = NULL;
665 	for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
666 	    cp >= lsm_tree->chunk;
667 	    --cp)
668 		if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
669 			record_count += (*cp)->count;
670 			++in_memory;
671 		} else {
672 			/*
673 			 * Assign ondisk to the last chunk that has been
674 			 * flushed since the tree was last opened (i.e. it's on
675 			 * disk and stable is not set).
676 			 */
677 			if (ondisk == NULL &&
678 			    ((*cp)->generation == 0 &&
679 			    !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
680 				ondisk = *cp;
681 
682 			if ((*cp)->generation == 0 &&
683 			    !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
684 				++gen0_chunks;
685 		}
686 
687 	last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
688 
689 	/* Checkpoint throttling, based on the number of in-memory chunks. */
690 	if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
691 		lsm_tree->ckpt_throttle = 0;
692 	else if (decrease_only)
693 		; /* Nothing to do */
694 	else if (ondisk == NULL) {
695 		/*
696 		 * No checkpoint has completed this run.  Keep slowing down
697 		 * inserts until one does.
698 		 */
699 		lsm_tree->ckpt_throttle =
700 		    WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
701 	} else {
702 		WT_ASSERT(session, WT_TIMECMP(
703 		    last_chunk->create_time, ondisk->create_time) >= 0);
704 		timediff = WT_TIMEDIFF_NS(
705 		    last_chunk->create_time, ondisk->create_time);
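		/*
		 * A sketch of the arithmetic below using invented numbers: if
		 * three chunks are in memory holding one million records in
		 * total, and the newest chunk was created 10 seconds (1e10
		 * nanoseconds) after the most recent on-disk chunk, the base
		 * throttle is (3 - 2) * 1e10 / (20 * 1e6) = 500.  It may be
		 * scaled up below and is capped later in this function.
		 */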
706 		lsm_tree->ckpt_throttle =
707 		    (in_memory - 2) * timediff / (20 * record_count);
708 
709 		/*
710 		 * Get more aggressive as the number of in memory chunks
711 		 * consumes a large proportion of the cache. In memory chunks
712 		 * are allowed to grow up to twice as large as the configured
713 		 * value when checkpoints aren't keeping up. That worst case
714 		 * is when this calculation is relevant.
715 		 * There is nothing particularly special about the chosen
716 		 * multipliers.
717 		 */
718 		cache_used = in_memory * lsm_tree->chunk_size * 2;
719 		if (cache_used > cache_sz * 0.8)
720 			lsm_tree->ckpt_throttle *= 5;
721 	}
722 
723 	/*
724 	 * Merge throttling, based on the number of on-disk, level 0 chunks.
725 	 *
726 	 * Don't throttle if the tree has fewer than a single level's number
727 	 * of chunks.
728 	 */
729 	if (F_ISSET(lsm_tree, WT_LSM_TREE_MERGES)) {
730 		if (lsm_tree->nchunks < lsm_tree->merge_max)
731 			lsm_tree->merge_throttle = 0;
732 		else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
733 			WT_LSM_MERGE_THROTTLE_DECREASE(
734 			    lsm_tree->merge_throttle);
735 		else if (!decrease_only)
736 			WT_LSM_MERGE_THROTTLE_INCREASE(
737 			    lsm_tree->merge_throttle);
738 	}
739 
740 	/* Put an upper bound of 1s on both throttle calculations. */
741 	lsm_tree->ckpt_throttle = WT_MIN(WT_MILLION, lsm_tree->ckpt_throttle);
742 	lsm_tree->merge_throttle = WT_MIN(WT_MILLION, lsm_tree->merge_throttle);
743 
744 	/*
745 	 * Update our estimate of how long each in-memory chunk stays active.
746 	 * Filter out some noise by keeping a weighted history of the
747 	 * calculated value.  Wait until we have enough chunks that we can
748 	 * check that the new value is sane: otherwise, after a long idle
749 	 * period, we can calculate a crazy value.
750 	 */
751 	if (in_memory > 1 && ondisk != NULL) {
752 		prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
753 		WT_ASSERT(session, prev_chunk->generation == 0);
754 		WT_ASSERT(session, WT_TIMECMP(
755 		    last_chunk->create_time, prev_chunk->create_time) >= 0);
756 		timediff = WT_TIMEDIFF_NS(
757 		    last_chunk->create_time, prev_chunk->create_time);
758 		WT_ASSERT(session, WT_TIMECMP(
759 		    prev_chunk->create_time, ondisk->create_time) >= 0);
760 		oldtime = WT_TIMEDIFF_NS(
761 		    prev_chunk->create_time, ondisk->create_time);
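		/*
		 * Example of the smoothing below (numbers invented): with a
		 * previous chunk_fill_ms of 4000 and a new chunk-fill time of
		 * 8000ms, the estimate becomes (3 * 4000 + 8000) / 4 = 5000ms,
		 * so a new observation carries a one-quarter weight.
		 */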
762 		if (timediff < 10 * oldtime)
763 			lsm_tree->chunk_fill_ms =
764 			    (3 * lsm_tree->chunk_fill_ms +
765 			    timediff / WT_MILLION) / 4;
766 	}
767 }
768 
769 /*
770  * __wt_lsm_tree_switch --
771  *	Switch to a new in-memory tree.
772  */
773 int
774 __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
775 {
776 	WT_DECL_RET;
777 	WT_LSM_CHUNK *chunk, *last_chunk;
778 	uint32_t chunks_moved, nchunks, new_id;
779 	bool first_switch;
780 
781 	__wt_lsm_tree_writelock(session, lsm_tree);
782 
783 	nchunks = lsm_tree->nchunks;
784 
785 	first_switch = nchunks == 0;
786 
787 	/*
788 	 * Check if a switch is still needed: we may have raced while waiting
789 	 * for a lock.
790 	 */
791 	last_chunk = NULL;
792 	if (!first_switch &&
793 	    (last_chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
794 	    !F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK) &&
795 	    !lsm_tree->need_switch)
796 		goto err;
797 
798 	/* Update the throttle time. */
799 	__wt_lsm_tree_throttle(session, lsm_tree, false);
800 
801 	new_id = __wt_atomic_add32(&lsm_tree->last, 1);
802 
803 	WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
804 	    nchunks + 1, &lsm_tree->chunk));
805 
806 	__wt_verbose(session, WT_VERB_LSM,
807 	    "Tree %s switch to: %" PRIu32 ", checkpoint throttle %" PRIu64
808 	    ", merge throttle %" PRIu64, lsm_tree->name,
809 	    new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle);
810 
811 	WT_ERR(__wt_calloc_one(session, &chunk));
812 	chunk->id = new_id;
813 	chunk->switch_txn = WT_TXN_NONE;
814 	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
815 	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
816 
817 	WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
818 	lsm_tree->need_switch = false;
819 	lsm_tree->modified = true;
820 
821 	/*
822 	 * Ensure the updated disk generation is visible to all other threads
823 	 * before updating the transaction ID.
824 	 */
825 	++lsm_tree->dsk_gen;
826 	WT_FULL_BARRIER();
827 
828 	/*
829 	 * Set the switch transaction in the previous chunk unless this is
830 	 * the first chunk in a new or newly opened tree.
831 	 */
832 	if (last_chunk != NULL && last_chunk->switch_txn == WT_TXN_NONE &&
833 	    !F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK))
834 		last_chunk->switch_txn = __wt_txn_id_alloc(session, false);
835 
836 	/*
837 	 * If a maximum number of chunks is configured, drop any chunks
838 	 * past the limit.
839 	 */
840 	if (lsm_tree->chunk_count_limit != 0 &&
841 	    lsm_tree->nchunks > lsm_tree->chunk_count_limit) {
842 		chunks_moved = lsm_tree->nchunks - lsm_tree->chunk_count_limit;
843 		/* Move the oldest chunks onto the old chunk list. */
844 		WT_ERR(__wt_lsm_tree_retire_chunks(
845 		    session, lsm_tree, 0, chunks_moved));
846 
847 		/* Update the active chunk list. */
848 		lsm_tree->nchunks -= chunks_moved;
849 		/* Move the remaining chunks to the start of the active list */
850 		memmove(lsm_tree->chunk,
851 		    lsm_tree->chunk + chunks_moved,
852 		    lsm_tree->nchunks * sizeof(*lsm_tree->chunk));
853 		/* Clear out the chunks at the end of the tree */
854 		memset(lsm_tree->chunk + lsm_tree->nchunks,
855 		    0, chunks_moved * sizeof(*lsm_tree->chunk));
856 
857 		/* Make sure the manager knows there is work to do. */
858 		WT_ERR(__wt_lsm_manager_push_entry(
859 		    session, WT_LSM_WORK_DROP, 0, lsm_tree));
860 	}
861 
862 err:	__wt_lsm_tree_writeunlock(session, lsm_tree);
863 	/*
864 	 * Errors that happen during a tree switch leave the tree in a state
865 	 * where we can't make progress. Error out of WiredTiger.
866 	 */
867 	if (ret != 0)
868 		WT_PANIC_RET(session, ret, "Failed doing LSM switch");
869 	else if (!first_switch)
870 		WT_RET(__wt_lsm_manager_push_entry(
871 		    session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
872 	return (ret);
873 }
874 
875 /*
876  * __wt_lsm_tree_retire_chunks --
877  *	Move a set of chunks onto the old chunks list.
878  *	It's the caller's responsibility to update the active chunks list.
879  *	Must be called with the LSM lock held.
880  */
881 int
882 __wt_lsm_tree_retire_chunks(WT_SESSION_IMPL *session,
883     WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks)
884 {
885 	u_int i;
886 
887 	WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
888 
889 	/* Setup the array of obsolete chunks. */
890 	WT_RET(__wt_realloc_def(session, &lsm_tree->old_alloc,
891 	    lsm_tree->nold_chunks + nchunks, &lsm_tree->old_chunks));
892 
893 	/* Copy entries one at a time, so we can reuse gaps in the list. */
894 	for (i = 0; i < nchunks; i++)
895 		lsm_tree->old_chunks[lsm_tree->nold_chunks++] =
896 		    lsm_tree->chunk[start_chunk + i];
897 
898 	return (0);
899 }
900 
901 /*
902  * __wt_lsm_tree_drop --
903  *	Drop an LSM tree.
904  */
905 int
906 __wt_lsm_tree_drop(
907     WT_SESSION_IMPL *session, const char *name, const char *cfg[])
908 {
909 	WT_DECL_RET;
910 	WT_LSM_CHUNK *chunk;
911 	WT_LSM_TREE *lsm_tree;
912 	u_int i;
913 	int tret;
914 	bool locked;
915 
916 	WT_NOT_READ(locked, false);
917 
918 	/* Get the LSM tree. */
919 	WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
920 	WT_ASSERT(session, !lsm_tree->active);
921 
922 	/* Prevent any new opens. */
923 	__wt_lsm_tree_writelock(session, lsm_tree);
924 	locked = true;
925 
926 	/* Drop the chunks. */
927 	for (i = 0; i < lsm_tree->nchunks; i++) {
928 		chunk = lsm_tree->chunk[i];
929 		WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
930 		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
931 			WT_ERR(
932 			    __wt_schema_drop(session, chunk->bloom_uri, cfg));
933 	}
934 
935 	/* Drop any chunks on the obsolete list. */
936 	for (i = 0; i < lsm_tree->nold_chunks; i++) {
937 		if ((chunk = lsm_tree->old_chunks[i]) == NULL)
938 			continue;
939 		WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
940 		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
941 			WT_ERR(
942 			    __wt_schema_drop(session, chunk->bloom_uri, cfg));
943 	}
944 
945 	locked = false;
946 	__wt_lsm_tree_writeunlock(session, lsm_tree);
947 	ret = __wt_metadata_remove(session, name);
948 
949 	WT_ASSERT(session, !lsm_tree->active);
950 err:	if (locked)
951 		__wt_lsm_tree_writeunlock(session, lsm_tree);
952 	WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
953 	    tret = __lsm_tree_discard(session, lsm_tree, false));
954 	WT_TRET(tret);
955 	return (ret);
956 }
957 
958 /*
959  * __wt_lsm_tree_rename --
960  *	Rename an LSM tree.
961  */
962 int
963 __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
964     const char *olduri, const char *newuri, const char *cfg[])
965 {
966 	WT_DECL_RET;
967 	WT_LSM_CHUNK *chunk;
968 	WT_LSM_TREE *lsm_tree;
969 	u_int i;
970 	int tret;
971 	const char *old;
972 	bool locked;
973 
974 	old = NULL;
975 	WT_NOT_READ(locked, false);
976 
977 	/* Get the LSM tree. */
978 	WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree));
979 
980 	/* Prevent any new opens. */
981 	__wt_lsm_tree_writelock(session, lsm_tree);
982 	locked = true;
983 
984 	/* Set the new name. */
985 	WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));
986 
987 	/* Rename the chunks. */
988 	for (i = 0; i < lsm_tree->nchunks; i++) {
989 		chunk = lsm_tree->chunk[i];
990 		old = chunk->uri;
991 		chunk->uri = NULL;
992 
993 		WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree,
994 		    chunk->id, chunk->generation, &chunk->uri));
995 		WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
996 		__wt_free(session, old);
997 
998 		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
999 			old = chunk->bloom_uri;
1000 			chunk->bloom_uri = NULL;
1001 			WT_ERR(__wt_lsm_tree_bloom_name(
1002 			    session, lsm_tree, chunk->id, &chunk->bloom_uri));
1003 			F_SET(chunk, WT_LSM_CHUNK_BLOOM);
1004 			WT_ERR(__wt_schema_rename(
1005 			    session, old, chunk->bloom_uri, cfg));
1006 			__wt_free(session, old);
1007 		}
1008 	}
1009 
1010 	WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
1011 	locked = false;
1012 	__wt_lsm_tree_writeunlock(session, lsm_tree);
1013 	WT_ERR(__wt_metadata_remove(session, olduri));
1014 
1015 err:	if (locked)
1016 		__wt_lsm_tree_writeunlock(session, lsm_tree);
1017 	__wt_free(session, old);
1018 
1019 	/*
1020 	 * Discard this LSM tree structure. The first operation on the renamed
1021 	 * tree will create a new one.
1022 	 */
1023 	WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
1024 	    tret = __lsm_tree_discard(session, lsm_tree, false));
1025 	WT_TRET(tret);
1026 	return (ret);
1027 }
1028 
1029 /*
1030  * __wt_lsm_tree_truncate --
1031  *	Truncate an LSM tree.
1032  */
1033 int
1034 __wt_lsm_tree_truncate(
1035     WT_SESSION_IMPL *session, const char *name, const char *cfg[])
1036 {
1037 	WT_DECL_RET;
1038 	WT_LSM_CHUNK *chunk;
1039 	WT_LSM_TREE *lsm_tree;
1040 	int tret;
1041 	bool locked;
1042 
1043 	WT_UNUSED(cfg);
1044 
1045 	chunk = NULL;
1046 	WT_NOT_READ(locked, false);
1047 
1048 	/* Get the LSM tree. */
1049 	WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
1050 
1051 	/* Prevent any new opens. */
1052 	__wt_lsm_tree_writelock(session, lsm_tree);
1053 	locked = true;
1054 
1055 	/* Create the new chunk. */
1056 	WT_ERR(__wt_calloc_one(session, &chunk));
1057 	chunk->id = __wt_atomic_add32(&lsm_tree->last, 1);
1058 	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
1059 
1060 	/* Mark all chunks old. */
1061 	WT_ERR(__wt_lsm_merge_update_tree(
1062 	    session, lsm_tree, 0, lsm_tree->nchunks, chunk));
1063 
1064 	WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
1065 
1066 	locked = false;
1067 	__wt_lsm_tree_writeunlock(session, lsm_tree);
1068 	__wt_lsm_tree_release(session, lsm_tree);
1069 
1070 err:	if (locked)
1071 		__wt_lsm_tree_writeunlock(session, lsm_tree);
1072 	if (ret != 0) {
1073 		if (chunk != NULL) {
1074 			WT_TRET(__wt_schema_drop(session, chunk->uri, NULL));
1075 			__wt_free(session, chunk);
1076 		}
1077 		/*
1078 		 * Discard the LSM tree structure on error. This will force the
1079 		 * LSM tree to be re-opened the next time it is accessed and
1080 		 * the last good version of the metadata will be used, resulting
1081 		 * in a valid (not truncated) tree.
1082 		 */
1083 		WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
1084 		    tret = __lsm_tree_discard(session, lsm_tree, false));
1085 		WT_TRET(tret);
1086 	}
1087 	return (ret);
1088 }
1089 
1090 /*
1091  * __wt_lsm_tree_readlock --
1092  *	Acquire a shared lock on an LSM tree.
1093  */
1094 void
1095 __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
1096 {
1097 	__wt_readlock(session, &lsm_tree->rwlock);
1098 
1099 	/*
1100 	 * Diagnostic: avoid deadlocks with the schema lock: if we need it for
1101 	 * an operation, we should already have it.
1102 	 */
1103 	F_SET(session,
1104 	    WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
1105 }
1106 
1107 /*
1108  * __wt_lsm_tree_readunlock --
1109  *	Release a shared lock on an LSM tree.
1110  */
1111 void
1112 __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
1113 {
1114 	F_CLR(session,
1115 	    WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
1116 
1117 	__wt_readunlock(session, &lsm_tree->rwlock);
1118 }
1119 
1120 /*
1121  * __wt_lsm_tree_writelock --
1122  *	Acquire an exclusive lock on an LSM tree.
1123  */
1124 void
1125 __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
1126 {
1127 	__wt_writelock(session, &lsm_tree->rwlock);
1128 
1129 	/*
1130 	 * Diagnostic: avoid deadlocks with the schema lock: if we need it for
1131 	 * an operation, we should already have it.
1132 	 */
1133 	F_SET(session,
1134 	    WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
1135 }
1136 
1137 /*
1138  * __wt_lsm_tree_writeunlock --
1139  *	Release an exclusive lock on an LSM tree.
1140  */
1141 void
1142 __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
1143 {
1144 	F_CLR(session,
1145 	    WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
1146 
1147 	__wt_writeunlock(session, &lsm_tree->rwlock);
1148 }
1149 
1150 /*
1151  * __wt_lsm_compact --
1152  *	Compact an LSM tree, called via __wt_schema_worker.
1153  */
1154 int
1155 __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
1156 {
1157 	WT_DECL_RET;
1158 	WT_LSM_CHUNK *chunk;
1159 	WT_LSM_TREE *lsm_tree;
1160 	uint64_t progress;
1161 	uint32_t i;
1162 	bool compacting, flushing, locked, push_flush, ref;
1163 
1164 	compacting = flushing = locked = ref = false;
1165 	chunk = NULL;
1166 	/*
1167 	 * This function is applied to all matching sources: ignore anything
1168 	 * that is not an LSM tree.
1169 	 */
1170 	if (!WT_PREFIX_MATCH(name, "lsm:"))
1171 		return (0);
1172 
1173 	/* Tell __wt_schema_worker not to look inside the LSM tree. */
1174 	*skipp = true;
1175 
1176 	WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree));
1177 
1178 	if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
1179 		WT_ERR_MSG(session, EINVAL,
1180 		    "LSM compaction requires active merge threads");
1181 
1182 	/*
1183 	 * There is no work to do if there is only a single chunk in the tree
1184 	 * and it has a bloom filter or is configured to never have a bloom
1185 	 * filter.
1186 	 */
1187 	if (lsm_tree->nchunks == 1 &&
1188 	    (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
1189 	    F_ISSET(lsm_tree->chunk[0], WT_LSM_CHUNK_BLOOM))) {
1190 		__wt_lsm_tree_release(session, lsm_tree);
1191 		return (0);
1192 	}
1193 
1194 	/*
1195 	 * Compacting has two distinct phases.
1196 	 * 1.  All in-memory chunks up to and including the current
1197 	 * chunk must be flushed.  Normally, the flush code
1198 	 * does not flush the last, in-use chunk, so we set a force
1199 	 * flag to include that last chunk.  We monitor the state of the
1200 	 * last chunk and periodically push another forced flush work
1201 	 * unit until it is complete.
1202 	 * 2.  After all flushing is done, we move onto the merging
1203 	 * phase for compaction.  Again, we monitor the state and
1204 	 * continue to push merge work units until all merging is done.
1205 	 */
1206 
1207 	/* Lock the tree: single-thread compaction. */
1208 	__wt_lsm_tree_writelock(session, lsm_tree);
1209 	locked = true;
1210 
1211 	/* Clear any merge throttle: compact throws out that calculation. */
1212 	lsm_tree->merge_throttle = 0;
1213 	lsm_tree->merge_aggressiveness = 0;
1214 	progress = lsm_tree->merge_progressing;
1215 
1216 	/* If another thread started a compact on this tree, we're done. */
1217 	if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
1218 		goto err;
1219 
1220 	/*
1221 	 * Set the switch transaction on the current chunk, if it
1222 	 * hasn't been set before.  This prevents further writes, so it
1223 	 * can be flushed by the checkpoint worker. If this is a newly
1224 	 * opened tree the primary chunk may already be stable. Only
1225 	 * push a flush work unit if necessary.
1226 	 */
1227 	push_flush = false;
1228 	if (lsm_tree->nchunks > 0 &&
1229 	    (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL &&
1230 	    !F_ISSET(chunk, (WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE))) {
1231 		push_flush = true;
1232 		if (chunk->switch_txn == WT_TXN_NONE) {
1233 			/*
1234 			 * Make sure any cursors open on the tree see the
1235 			 * new switch generation before updating.
1236 			 */
1237 			++lsm_tree->dsk_gen;
1238 			WT_FULL_BARRIER();
1239 			chunk->switch_txn = __wt_txn_id_alloc(session, false);
1240 		}
1241 		/*
1242 		 * If we have a chunk, we want to look for it to be on-disk.
1243 		 * So we need to add a reference to keep it available.
1244 		 */
1245 		(void)__wt_atomic_add32(&chunk->refcnt, 1);
1246 		ref = true;
1247 	}
1248 
1249 	if (push_flush) {
1250 		__wt_verbose(session, WT_VERB_LSM,
1251 		    "Compact force flush %s flags 0x%" PRIx32
1252 		    " chunk %" PRIu32 " flags 0x%" PRIx32,
1253 		    name, lsm_tree->flags, chunk->id, chunk->flags);
1254 		flushing = true;
1255 		locked = false;
1256 		__wt_lsm_tree_writeunlock(session, lsm_tree);
1257 		/*
1258 		 * Make sure the in-memory chunk gets flushed, but do not push
1259 		 * a switch, because we don't want to create a new in-memory
1260 		 * chunk if the tree is now being used read-only.
1261 		 */
1262 		WT_ERR(__wt_lsm_manager_push_entry(session,
1263 		    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
1264 	} else {
1265 		/*
1266 		 * If there is no chunk to flush, go straight to the
1267 		 * compacting state.
1268 		 */
1269 		compacting = true;
1270 		progress = lsm_tree->merge_progressing;
1271 		F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
1272 		__wt_verbose(session, WT_VERB_LSM,
1273 		    "COMPACT: Start compacting %s", lsm_tree->name);
1274 		locked = false;
1275 		__wt_lsm_tree_writeunlock(session, lsm_tree);
1276 	}
1277 
1278 	/* Wait for the work unit queues to drain. */
1279 	while (lsm_tree->active) {
1280 		/*
1281 		 * The flush flag is cleared when the chunk has been flushed.
1282 		 * Continue to push forced flushes until the chunk is on disk.
1283 		 * Once it is on disk move to the compacting phase.
1284 		 */
1285 		if (flushing) {
1286 			WT_ASSERT(session, chunk != NULL);
1287 			if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
1288 				__wt_verbose(session,
1289 				    WT_VERB_LSM,
1290 				    "Compact flush done %s chunk %" PRIu32 ". "
1291 				    "Start compacting progress %" PRIu64,
1292 				    name, chunk->id,
1293 				    lsm_tree->merge_progressing);
1294 				(void)__wt_atomic_sub32(&chunk->refcnt, 1);
1295 				flushing = ref = false;
1296 				compacting = true;
1297 				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
1298 				progress = lsm_tree->merge_progressing;
1299 			} else {
1300 				__wt_verbose(session, WT_VERB_LSM,
1301 				    "Compact flush retry %s chunk %" PRIu32,
1302 				    name, chunk->id);
1303 				WT_ERR(__wt_lsm_manager_push_entry(session,
1304 				    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
1305 				    lsm_tree));
1306 			}
1307 		}
1308 
1309 		/*
1310 		 * The compacting flag is cleared when no merges can be done.
1311 		 * Ensure that we push through some aggressive merges before
1312 		 * stopping; otherwise we might not do merges that would
1313 		 * span chunks with different generations.
1314 		 */
1315 		if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
1316 			if (lsm_tree->merge_aggressiveness < 10 ||
1317 			    (progress < lsm_tree->merge_progressing) ||
1318 			    lsm_tree->merge_syncing) {
1319 				progress = lsm_tree->merge_progressing;
1320 				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
1321 				lsm_tree->merge_aggressiveness = 10;
1322 			} else
1323 				break;
1324 		}
1325 
1326 		/*
1327 		 * Periodically check if we've timed out or eviction is stuck.
1328 		 * Quit if eviction is stuck, we're making the problem worse.
1329 		 */
1330 		WT_ERR(__wt_session_compact_check_timeout(session));
1331 		if (__wt_cache_stuck(session))
1332 			WT_ERR(EBUSY);
1333 		__wt_sleep(1, 0);
1334 
1335 		/*
1336 		 * Push merge operations while they are still getting work
1337 		 * done. If we are pushing merges, make sure they are
1338 		 * aggressive, to avoid duplicating effort.
1339 		 */
1340 		if (compacting)
1341 #define	COMPACT_PARALLEL_MERGES	5
1342 			for (i = lsm_tree->queue_ref;
1343 			    i < COMPACT_PARALLEL_MERGES; i++) {
1344 				lsm_tree->merge_aggressiveness = 10;
1345 				WT_ERR(__wt_lsm_manager_push_entry(
1346 				    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
1347 			}
1348 	}
1349 err:
1350 	/* Ensure anything we set is cleared. */
1351 	if (ref)
1352 		(void)__wt_atomic_sub32(&chunk->refcnt, 1);
1353 	if (compacting) {
1354 		F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
1355 		lsm_tree->merge_aggressiveness = 0;
1356 	}
1357 	if (locked)
1358 		__wt_lsm_tree_writeunlock(session, lsm_tree);
1359 
1360 	__wt_verbose(session, WT_VERB_LSM,
1361 	    "Compact %s complete, return %d", name, ret);
1362 
1363 	__wt_lsm_tree_release(session, lsm_tree);
1364 	return (ret);
1365 }
1366 
1367 /*
1368  * __wt_lsm_tree_worker --
1369  *	Run a schema worker operation on each level of an LSM tree.
1370  */
1371 int
1372 __wt_lsm_tree_worker(WT_SESSION_IMPL *session,
1373    const char *uri,
1374    int (*file_func)(WT_SESSION_IMPL *, const char *[]),
1375    int (*name_func)(WT_SESSION_IMPL *, const char *, bool *),
1376    const char *cfg[], uint32_t open_flags)
1377 {
1378 	WT_DECL_RET;
1379 	WT_LSM_CHUNK *chunk;
1380 	WT_LSM_TREE *lsm_tree;
1381 	u_int i;
1382 	bool exclusive, locked, need_release;
1383 
1384 	WT_NOT_READ(locked, false);
1385 	WT_NOT_READ(need_release, false);
1386 	exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE);
1387 
1388 	WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
1389 	need_release = true;
1390 
1391 	/*
1392 	 * We mark that we're busy using the tree to coordinate
1393 	 * with merges so that merging doesn't change the chunk
1394 	 * array out from underneath us.
1395 	 */
1396 	if (exclusive)
1397 		__wt_lsm_tree_writelock(session, lsm_tree);
1398 	else
1399 		__wt_lsm_tree_readlock(session, lsm_tree);
1400 	locked = true;
1401 	for (i = 0; i < lsm_tree->nchunks; i++) {
1402 		chunk = lsm_tree->chunk[i];
1403 		/*
1404 		 * If the chunk is on disk, don't include underlying handles in
1405 		 * the checkpoint. Checking the "get handles" function is all
1406 		 * we need to do, no further checkpoint calls are done if the
1407 		 * handle is not gathered.
1408 		 */
1409 		if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
1410 		    file_func == __wt_checkpoint_get_handles)
1411 			continue;
1412 		WT_ERR(__wt_schema_worker(session, chunk->uri,
1413 		    file_func, name_func, cfg, open_flags));
1414 		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
1415 			WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
1416 			    file_func, name_func, cfg, open_flags));
1417 	}
1418 	/*
1419 	 * If this was an alter operation, we need to alter the configuration
1420 	 * for the overall tree and then reread it so it isn't out of date.
1421 	 * Reread it here so that we update the configuration of the
1422 	 * current tree's structure to any new, altered values.
1423 	 */
1424 	if (FLD_ISSET(open_flags, WT_BTREE_ALTER)) {
1425 		WT_ERR(__wt_lsm_meta_write(session, lsm_tree, cfg[0]));
1426 
1427 		locked = false;
1428 		if (exclusive)
1429 			__wt_lsm_tree_writeunlock(session, lsm_tree);
1430 		else
1431 			__wt_lsm_tree_readunlock(session, lsm_tree);
1432 
1433 		/*
1434 		 * We rewrote the meta-data.  Discard the tree and the next
1435 		 * access will reopen it.
1436 		 */
1437 		need_release = false;
1438 		WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
1439 		    ret = __lsm_tree_discard(session, lsm_tree, false));
1440 		WT_ERR(ret);
1441 	}
1442 
1443 err:	if (locked) {
1444 		if (exclusive)
1445 			__wt_lsm_tree_writeunlock(session, lsm_tree);
1446 		else
1447 			__wt_lsm_tree_readunlock(session, lsm_tree);
1448 	}
1449 	if (need_release)
1450 		__wt_lsm_tree_release(session, lsm_tree);
1451 	return (ret);
1452 }
1453