1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 static int __lsm_tree_cleanup_old(WT_SESSION_IMPL *, const char *);
12 static int __lsm_tree_open_check(WT_SESSION_IMPL *, WT_LSM_TREE *);
13 static int __lsm_tree_open(
14 WT_SESSION_IMPL *, const char *, bool, WT_LSM_TREE **);
15 static int __lsm_tree_set_name(WT_SESSION_IMPL *, WT_LSM_TREE *, const char *);
16
17 /*
18 * __lsm_tree_discard_state --
19 * Free the metadata configuration state-related LSM tree pointers.
20 */
21 static void
__lsm_tree_discard_state(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree)22 __lsm_tree_discard_state(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
23 {
24 WT_LSM_CHUNK *chunk;
25 u_int i;
26
27 __wt_free(session, lsm_tree->config);
28 __wt_free(session, lsm_tree->key_format);
29 __wt_free(session, lsm_tree->value_format);
30 __wt_free(session, lsm_tree->collator_name);
31 __wt_free(session, lsm_tree->custom_prefix);
32 __wt_free(session, lsm_tree->custom_suffix);
33 __wt_free(session, lsm_tree->bloom_config);
34 __wt_free(session, lsm_tree->file_config);
35
36 for (i = 0; i < lsm_tree->nchunks; i++) {
37 if ((chunk = lsm_tree->chunk[i]) == NULL)
38 continue;
39
40 __wt_spin_destroy(session, &chunk->timestamp_spinlock);
41 __wt_free(session, chunk->bloom_uri);
42 __wt_free(session, chunk->uri);
43 __wt_free(session, chunk);
44 }
45
46 for (i = 0; i < lsm_tree->nold_chunks; i++) {
47 chunk = lsm_tree->old_chunks[i];
48 WT_ASSERT(session, chunk != NULL);
49
50 __wt_spin_destroy(session, &chunk->timestamp_spinlock);
51 __wt_free(session, chunk->bloom_uri);
52 __wt_free(session, chunk->uri);
53 __wt_free(session, chunk);
54 }
55 }
56
57 /*
58 * __lsm_tree_discard --
59 * Free an LSM tree structure.
60 */
61 static int
__lsm_tree_discard(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,bool final)62 __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
63 {
64 WT_DECL_RET;
65
66 WT_UNUSED(final); /* Only used in diagnostic builds */
67
68 WT_ASSERT(session, !lsm_tree->active);
69 /*
70 * The work unit queue should be empty, but it's worth checking
71 * since work units use a different locking scheme to regular tree
72 * operations.
73 */
74 WT_ASSERT(session, lsm_tree->queue_ref == 0);
75
76 /* We may be destroying an lsm_tree before it was added. */
77 if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
78 WT_ASSERT(session, final ||
79 F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
80 TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
81 }
82
83 if (lsm_tree->collator_owned &&
84 lsm_tree->collator->terminate != NULL)
85 WT_TRET(lsm_tree->collator->terminate(
86 lsm_tree->collator, &session->iface));
87
88 __wt_free(session, lsm_tree->name);
89 __lsm_tree_discard_state(session, lsm_tree);
90 __wt_free(session, lsm_tree->chunk);
91 __wt_free(session, lsm_tree->old_chunks);
92
93 __wt_rwlock_destroy(session, &lsm_tree->rwlock);
94
95 __wt_free(session, lsm_tree);
96
97 return (ret);
98 }
99
100 /*
101 * __lsm_tree_close --
102 * Close an LSM tree structure.
103 */
104 static void
__lsm_tree_close(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,bool final)105 __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
106 {
107 /*
108 * Stop any new work units being added. The barrier is necessary
109 * because we rely on the state change being visible before checking
110 * the tree queue state.
111 */
112 lsm_tree->active = false;
113 WT_FULL_BARRIER();
114
115 /*
116 * Wait for all LSM operations to drain. If WiredTiger is shutting
117 * down also wait for the tree reference count to go to zero, otherwise
118 * we know a user is holding a reference to the tree, so exclusive
119 * access is not available.
120 */
121 while (lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1)) {
122 /*
123 * Remove any work units from the manager queues. Do this step
124 * repeatedly in case a work unit was in the process of being
125 * created when we cleared the active flag.
126 *
127 * !!! Drop the schema and handle list locks whilst completing
128 * this step so that we don't block any operations that require
129 * the schema lock to complete. This is safe because any
130 * operation that is closing the tree should first have gotten
131 * exclusive access to the LSM tree via __wt_lsm_tree_get, so
132 * other schema level operations will return EBUSY, even though
133 * we're dropping the schema lock here.
134 */
135 WT_WITHOUT_LOCKS(session,
136 __wt_lsm_manager_clear_tree(session, lsm_tree));
137 }
138 }
139
140 /*
141 * __wt_lsm_tree_close_all --
142 * Close all LSM tree structures.
143 */
144 int
__wt_lsm_tree_close_all(WT_SESSION_IMPL * session)145 __wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
146 {
147 WT_DECL_RET;
148 WT_LSM_TREE *lsm_tree, *lsm_tree_tmp;
149
150 /* We are shutting down: the handle list lock isn't required. */
151
152 WT_TAILQ_SAFE_REMOVE_BEGIN(lsm_tree,
153 &S2C(session)->lsmqh, q, lsm_tree_tmp) {
154 /*
155 * Tree close assumes that we have a reference to the tree
156 * so it can tell when it's safe to do the close. We could
157 * get the tree here, but we short circuit instead. There
158 * is no need to decrement the reference count since discard
159 * is unconditional.
160 */
161 (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
162 __lsm_tree_close(session, lsm_tree, true);
163 WT_TRET(__lsm_tree_discard(session, lsm_tree, true));
164 } WT_TAILQ_SAFE_REMOVE_END
165
166 return (ret);
167 }
168
169 /*
170 * __lsm_tree_set_name --
171 * Set or reset the name of an LSM tree
172 */
173 static int
__lsm_tree_set_name(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,const char * uri)174 __lsm_tree_set_name(WT_SESSION_IMPL *session,
175 WT_LSM_TREE *lsm_tree, const char *uri)
176 {
177 void *p;
178
179 WT_RET(__wt_strdup(session, uri, &p));
180
181 __wt_free(session, lsm_tree->name);
182 lsm_tree->name = p;
183 lsm_tree->filename = lsm_tree->name + strlen("lsm:");
184 return (0);
185 }
186
187 /*
188 * __wt_lsm_tree_bloom_name --
189 * Get the URI of the Bloom filter for a given chunk.
190 */
191 int
__wt_lsm_tree_bloom_name(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,uint32_t id,const char ** retp)192 __wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session,
193 WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
194 {
195 WT_DECL_ITEM(tmp);
196 WT_DECL_RET;
197
198 WT_RET(__wt_scr_alloc(session, 0, &tmp));
199 WT_ERR(__wt_buf_fmt(
200 session, tmp, "file:%s-%06" PRIu32 ".bf", lsm_tree->filename, id));
201 WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
202
203 err: __wt_scr_free(session, &tmp);
204 return (ret);
205 }
206
207 /*
208 * __wt_lsm_tree_chunk_name --
209 * Get the URI of the file for a given chunk.
210 */
211 int
__wt_lsm_tree_chunk_name(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,uint32_t id,uint32_t generation,const char ** retp)212 __wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session,
213 WT_LSM_TREE *lsm_tree, uint32_t id, uint32_t generation, const char **retp)
214 {
215 WT_DECL_ITEM(tmp);
216 WT_DECL_RET;
217
218 WT_RET(__wt_scr_alloc(session, 0, &tmp));
219
220 if (lsm_tree->custom_generation != 0 &&
221 generation >= lsm_tree->custom_generation)
222 WT_ERR(__wt_buf_fmt(session, tmp, "%s:%s-%06" PRIu32 "%s",
223 lsm_tree->custom_prefix, lsm_tree->filename, id,
224 lsm_tree->custom_suffix));
225 else
226 WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-%06" PRIu32 ".lsm",
227 lsm_tree->filename, id));
228
229 WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
230
231 err: __wt_scr_free(session, &tmp);
232 return (ret);
233 }
234
235 /*
236 * __wt_lsm_tree_set_chunk_size --
237 * Set the size of the chunk. Should only be called for chunks that are
238 * on disk, or about to become on disk.
239 */
240 int
__wt_lsm_tree_set_chunk_size(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,WT_LSM_CHUNK * chunk)241 __wt_lsm_tree_set_chunk_size(
242 WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
243 {
244 WT_DATA_SOURCE *dsrc;
245 wt_off_t size;
246 const char *filename;
247
248 size = 0;
249 if (lsm_tree->custom_generation != 0 &&
250 chunk->generation >= lsm_tree->custom_generation) {
251 dsrc = __wt_schema_get_source(session, chunk->uri);
252 /*
253 * We can only retrieve a size if the data source exposes the
254 * information.
255 */
256 if (dsrc != NULL && dsrc->size != NULL) {
257 /* Call the callback. */
258 WT_RET(dsrc->size(
259 dsrc, (WT_SESSION*)session, chunk->uri, &size));
260 }
261 } else {
262 filename = chunk->uri;
263 if (!WT_PREFIX_SKIP(filename, "file:"))
264 WT_RET_MSG(session, EINVAL,
265 "Expected a 'file:' URI: %s", chunk->uri);
266 WT_RET(__wt_fs_size(session, filename, &size));
267 }
268
269 chunk->size = (uint64_t)size;
270
271 return (0);
272 }
273
274 /*
275 * __lsm_tree_cleanup_old --
276 * Cleanup any old LSM chunks that might conflict with one we are
277 * about to create. Sometimes failed LSM metadata operations can
278 * leave old files and bloom filters behind.
279 */
280 static int
__lsm_tree_cleanup_old(WT_SESSION_IMPL * session,const char * uri)281 __lsm_tree_cleanup_old(WT_SESSION_IMPL *session, const char *uri)
282 {
283 WT_DECL_RET;
284 const char *cfg[] =
285 { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };
286 bool exists, is_file;
287
288 exists = false;
289 is_file = WT_PREFIX_MATCH(uri, "file:");
290 if (is_file)
291 WT_RET(__wt_fs_exist(session, uri + strlen("file:"), &exists));
292 if (!is_file || exists)
293 WT_WITH_SCHEMA_LOCK(session,
294 ret = __wt_schema_drop(session, uri, cfg));
295 return (ret);
296 }
297
298 /*
299 * __wt_lsm_tree_setup_chunk --
300 * Initialize a chunk of an LSM tree.
301 */
302 int
__wt_lsm_tree_setup_chunk(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,WT_LSM_CHUNK * chunk)303 __wt_lsm_tree_setup_chunk(
304 WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
305 {
306 WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
307 __wt_epoch(session, &chunk->create_time);
308
309 WT_RET(__wt_spin_init(session,
310 &chunk->timestamp_spinlock, "LSM chunk timestamp"));
311 WT_RET(__wt_lsm_tree_chunk_name(
312 session, lsm_tree, chunk->id, chunk->generation, &chunk->uri));
313
314 /*
315 * If the underlying file exists, drop the chunk first - there may be
316 * some content hanging over from an aborted merge or checkpoint.
317 *
318 * Don't do this for the very first chunk: we are called during
319 * WT_SESSION::create, and doing a drop inside there does interesting
320 * things with handle locks and metadata tracking. It can never have
321 * been the result of an interrupted merge, anyway.
322 */
323 if (chunk->id > 1)
324 WT_RET(__lsm_tree_cleanup_old(session, chunk->uri));
325
326 return (__wt_schema_create(session, chunk->uri, lsm_tree->file_config));
327 }
328
329 /*
330 * __wt_lsm_tree_setup_bloom --
331 * Initialize a bloom filter for an LSM tree.
332 */
333 int
__wt_lsm_tree_setup_bloom(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,WT_LSM_CHUNK * chunk)334 __wt_lsm_tree_setup_bloom(
335 WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
336 {
337 /*
338 * The Bloom URI can be populated when the chunk is created, but
339 * it isn't set yet on open or merge.
340 */
341 if (chunk->bloom_uri == NULL)
342 WT_RET(__wt_lsm_tree_bloom_name(
343 session, lsm_tree, chunk->id, &chunk->bloom_uri));
344
345 return (__lsm_tree_cleanup_old(session, chunk->bloom_uri));
346 }
347
348 /*
349 * __wt_lsm_tree_create --
350 * Create an LSM tree structure for the given name.
351 */
352 int
__wt_lsm_tree_create(WT_SESSION_IMPL * session,const char * uri,bool exclusive,const char * config)353 __wt_lsm_tree_create(WT_SESSION_IMPL *session,
354 const char *uri, bool exclusive, const char *config)
355 {
356 WT_CONFIG_ITEM cval;
357 WT_DECL_RET;
358 WT_LSM_TREE *lsm_tree;
359 const char *cfg[] =
360 { WT_CONFIG_BASE(session, lsm_meta), config, NULL };
361 const char *metadata;
362
363 metadata = NULL;
364
365 /* If the tree can be opened, it already exists. */
366 if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) {
367 __wt_lsm_tree_release(session, lsm_tree);
368 return (exclusive ? EEXIST : 0);
369 }
370 WT_RET_NOTFOUND_OK(ret);
371
372 if (!F_ISSET(S2C(session), WT_CONN_READONLY)) {
373 /* LSM doesn't yet support the 'r' format. */
374 WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
375 if (WT_STRING_MATCH("r", cval.str, cval.len))
376 WT_ERR_MSG(session, EINVAL,
377 "LSM trees do not support a key format of 'r'");
378
379 WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata));
380 WT_ERR(__wt_metadata_insert(session, uri, metadata));
381 }
382
383 /*
384 * Open our new tree and add it to the handle cache. Don't discard on
385 * error: the returned handle is NULL on error, and the metadata
386 * tracking macros handle cleaning up on failure.
387 */
388 WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
389 ret = __lsm_tree_open(session, uri, true, &lsm_tree));
390 if (ret == 0)
391 __wt_lsm_tree_release(session, lsm_tree);
392
393 err: __wt_free(session, metadata);
394 return (ret);
395 }
396
397 /*
398 * __lsm_tree_find --
399 * Find an LSM tree structure for the given name. Optionally get exclusive
400 * access to the handle. Exclusive access works separately to the LSM tree
401 * lock - since operations that need exclusive access may also need to
402 * take the LSM tree lock for example outstanding work unit operations.
403 */
404 static int
__lsm_tree_find(WT_SESSION_IMPL * session,const char * uri,bool exclusive,WT_LSM_TREE ** treep)405 __lsm_tree_find(WT_SESSION_IMPL *session,
406 const char *uri, bool exclusive, WT_LSM_TREE **treep)
407 {
408 WT_LSM_TREE *lsm_tree;
409
410 *treep = NULL;
411 WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
412
413 /* See if the tree is already open. */
414 TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
415 if (strcmp(uri, lsm_tree->name) == 0) {
416 if (exclusive) {
417 /*
418 * Make sure we win the race to switch on the
419 * exclusive flag.
420 */
421 if (!__wt_atomic_cas_ptr(
422 &lsm_tree->excl_session, NULL, session))
423 return (__wt_set_return(
424 session, EBUSY));
425
426 /*
427 * Drain the work queue before checking for
428 * open cursors - otherwise we can generate
429 * spurious busy returns.
430 */
431 (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
432 __lsm_tree_close(session, lsm_tree, false);
433 if (lsm_tree->refcnt != 1) {
434 __wt_lsm_tree_release(
435 session, lsm_tree);
436 return (__wt_set_return(
437 session, EBUSY));
438 }
439 } else {
440 (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
441
442 /*
443 * We got a reference, check if an exclusive
444 * lock beat us to it.
445 */
446 if (lsm_tree->excl_session != NULL) {
447 WT_ASSERT(session,
448 lsm_tree->refcnt > 0);
449 __wt_lsm_tree_release(
450 session, lsm_tree);
451 return (__wt_set_return(
452 session, EBUSY));
453 }
454 }
455
456 *treep = lsm_tree;
457
458 WT_ASSERT(session, lsm_tree->excl_session ==
459 (exclusive ? session : NULL));
460 return (0);
461 }
462
463 return (WT_NOTFOUND);
464 }
465
466 /*
467 * __lsm_tree_open_check --
468 * Validate the configuration of an LSM tree.
469 */
470 static int
__lsm_tree_open_check(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree)471 __lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
472 {
473 WT_CONFIG_ITEM cval;
474 WT_CONNECTION_IMPL *conn;
475 uint64_t maxleafpage, required;
476 const char *cfg[] = { WT_CONFIG_BASE(
477 session, WT_SESSION_create), lsm_tree->file_config, NULL };
478
479 conn = S2C(session);
480
481 WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval));
482 maxleafpage = (uint64_t)cval.val;
483
484 required = WT_LSM_TREE_MINIMUM_SIZE(
485 lsm_tree->chunk_size, lsm_tree->merge_max, maxleafpage);
486 if (conn->cache_size < required)
487 WT_RET_MSG(session, EINVAL,
488 "LSM cache size %" PRIu64 " (%" PRIu64 "MB) too small, "
489 "must be at least %" PRIu64 " (%" PRIu64 "MB)",
490 conn->cache_size, conn->cache_size / WT_MEGABYTE,
491 required, (required + (WT_MEGABYTE - 1))/ WT_MEGABYTE);
492 return (0);
493 }
494
495 /*
496 * __lsm_tree_open --
497 * Open an LSM tree structure.
498 */
499 static int
__lsm_tree_open(WT_SESSION_IMPL * session,const char * uri,bool exclusive,WT_LSM_TREE ** treep)500 __lsm_tree_open(WT_SESSION_IMPL *session,
501 const char *uri, bool exclusive, WT_LSM_TREE **treep)
502 {
503 WT_CONNECTION_IMPL *conn;
504 WT_DECL_RET;
505 WT_LSM_TREE *lsm_tree;
506
507 conn = S2C(session);
508 lsm_tree = NULL;
509
510 WT_ASSERT(session,
511 F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
512
513 /* Start the LSM manager thread if it isn't running. */
514 WT_RET(__wt_lsm_manager_start(session));
515
516 /* Make sure no one beat us to it. */
517 if ((ret = __lsm_tree_find(
518 session, uri, exclusive, treep)) != WT_NOTFOUND)
519 return (ret);
520
521 /* Try to open the tree. */
522 WT_RET(__wt_calloc_one(session, &lsm_tree));
523 WT_ERR(__wt_rwlock_init(session, &lsm_tree->rwlock));
524
525 WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
526
527 WT_ERR(__wt_lsm_meta_read(session, lsm_tree));
528
529 /*
530 * Sanity check the configuration. Do it now since this is the first
531 * time we have the LSM tree configuration.
532 */
533 WT_ERR(__lsm_tree_open_check(session, lsm_tree));
534
535 /* Set the generation number so cursors are opened on first usage. */
536 lsm_tree->dsk_gen = 1;
537
538 /*
539 * Setup reference counting. Use separate reference counts for tree
540 * handles and queue entries, so that queue entries don't interfere
541 * with getting handles exclusive.
542 */
543 lsm_tree->refcnt = 1;
544 lsm_tree->excl_session = exclusive ? session : NULL;
545 lsm_tree->queue_ref = 0;
546
547 /* Set a flush timestamp as a baseline. */
548 __wt_epoch(session, &lsm_tree->last_flush_time);
549
550 /* Now the tree is setup, make it visible to others. */
551 TAILQ_INSERT_HEAD(&conn->lsmqh, lsm_tree, q);
552 if (!exclusive)
553 lsm_tree->active = true;
554 F_SET(lsm_tree, WT_LSM_TREE_OPEN);
555
556 *treep = lsm_tree;
557
558 if (0) {
559 err: WT_TRET(__lsm_tree_discard(session, lsm_tree, false));
560 }
561 return (ret);
562 }
563
564 /*
565 * __wt_lsm_tree_get --
566 * Find an LSM tree handle or open a new one.
567 */
568 int
__wt_lsm_tree_get(WT_SESSION_IMPL * session,const char * uri,bool exclusive,WT_LSM_TREE ** treep)569 __wt_lsm_tree_get(WT_SESSION_IMPL *session,
570 const char *uri, bool exclusive, WT_LSM_TREE **treep)
571 {
572 WT_DECL_RET;
573
574 /*
575 * Dropping and re-acquiring the lock is safe here, since the tree open
576 * call checks to see if another thread beat it to opening the tree
577 * before proceeding.
578 */
579 if (exclusive)
580 WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
581 ret = __lsm_tree_find(session, uri, exclusive, treep));
582 else
583 WT_WITH_HANDLE_LIST_READ_LOCK(session,
584 ret = __lsm_tree_find(session, uri, exclusive, treep));
585 if (ret == WT_NOTFOUND)
586 WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
587 ret = __lsm_tree_open(session, uri, exclusive, treep));
588
589 return (ret);
590 }
591
592 /*
593 * __wt_lsm_tree_release --
594 * Release an LSM tree structure.
595 */
596 void
__wt_lsm_tree_release(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree)597 __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
598 {
599 WT_ASSERT(session, lsm_tree->refcnt > 0);
600 if (lsm_tree->excl_session == session) {
601 /* We cleared the active flag when getting exclusive access. */
602 lsm_tree->active = true;
603 lsm_tree->excl_session = NULL;
604 }
605 (void)__wt_atomic_sub32(&lsm_tree->refcnt, 1);
606 }
607
608 /* How aggressively to ramp up or down throttle due to level 0 merging */
609 #define WT_LSM_MERGE_THROTTLE_BUMP_PCT (100 / lsm_tree->merge_max)
610 /* Number of level 0 chunks that need to be present to throttle inserts */
611 #define WT_LSM_MERGE_THROTTLE_THRESHOLD \
612 (2 * lsm_tree->merge_min)
613 /* Minimal throttling time */
614 #define WT_LSM_THROTTLE_START 20
615
616 #define WT_LSM_MERGE_THROTTLE_INCREASE(val) do { \
617 (val) += ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
618 if ((val) < WT_LSM_THROTTLE_START) \
619 (val) = WT_LSM_THROTTLE_START; \
620 } while (0)
621
622 #define WT_LSM_MERGE_THROTTLE_DECREASE(val) do { \
623 (val) -= ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
624 if ((val) < WT_LSM_THROTTLE_START) \
625 (val) = 0; \
626 } while (0)
627
628 /*
629 * __wt_lsm_tree_throttle --
630 * Calculate whether LSM updates need to be throttled. Must be called
631 * with the LSM tree lock held.
632 */
633 void
__wt_lsm_tree_throttle(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,bool decrease_only)634 __wt_lsm_tree_throttle(
635 WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool decrease_only)
636 {
637 WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
638 uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
639 uint32_t in_memory, gen0_chunks;
640
641 /* Never throttle in small trees. */
642 if (lsm_tree->nchunks < 3) {
643 lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
644 return;
645 }
646
647 cache_sz = S2C(session)->cache_size;
648
649 /*
650 * In the steady state, we expect that the checkpoint worker thread
651 * will keep up with inserts. If not, throttle the insert rate to
652 * avoid filling the cache with in-memory chunks. Threads sleep every
653 * 100 operations, so take that into account in the calculation.
654 *
655 * Also throttle based on whether merge threads are keeping up. If
656 * there are enough chunks that have never been merged we slow down
657 * inserts so that merges have some chance of keeping up.
658 *
659 * Count the number of in-memory chunks, the number of unmerged chunk
660 * on disk, and find the most recent on-disk chunk (if any).
661 */
662 record_count = 1;
663 gen0_chunks = in_memory = 0;
664 ondisk = NULL;
665 for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
666 cp >= lsm_tree->chunk;
667 --cp)
668 if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
669 record_count += (*cp)->count;
670 ++in_memory;
671 } else {
672 /*
673 * Assign ondisk to the last chunk that has been
674 * flushed since the tree was last opened (i.e it's on
675 * disk and stable is not set).
676 */
677 if (ondisk == NULL &&
678 ((*cp)->generation == 0 &&
679 !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
680 ondisk = *cp;
681
682 if ((*cp)->generation == 0 &&
683 !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
684 ++gen0_chunks;
685 }
686
687 last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
688
689 /* Checkpoint throttling, based on the number of in-memory chunks. */
690 if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
691 lsm_tree->ckpt_throttle = 0;
692 else if (decrease_only)
693 ; /* Nothing to do */
694 else if (ondisk == NULL) {
695 /*
696 * No checkpoint has completed this run. Keep slowing down
697 * inserts until one does.
698 */
699 lsm_tree->ckpt_throttle =
700 WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
701 } else {
702 WT_ASSERT(session, WT_TIMECMP(
703 last_chunk->create_time, ondisk->create_time) >= 0);
704 timediff = WT_TIMEDIFF_NS(
705 last_chunk->create_time, ondisk->create_time);
706 lsm_tree->ckpt_throttle =
707 (in_memory - 2) * timediff / (20 * record_count);
708
709 /*
710 * Get more aggressive as the number of in memory chunks
711 * consumes a large proportion of the cache. In memory chunks
712 * are allowed to grow up to twice as large as the configured
713 * value when checkpoints aren't keeping up. That worst case
714 * is when this calculation is relevant.
715 * There is nothing particularly special about the chosen
716 * multipliers.
717 */
718 cache_used = in_memory * lsm_tree->chunk_size * 2;
719 if (cache_used > cache_sz * 0.8)
720 lsm_tree->ckpt_throttle *= 5;
721 }
722
723 /*
724 * Merge throttling, based on the number of on-disk, level 0 chunks.
725 *
726 * Don't throttle if the tree has less than a single level's number
727 * of chunks.
728 */
729 if (F_ISSET(lsm_tree, WT_LSM_TREE_MERGES)) {
730 if (lsm_tree->nchunks < lsm_tree->merge_max)
731 lsm_tree->merge_throttle = 0;
732 else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
733 WT_LSM_MERGE_THROTTLE_DECREASE(
734 lsm_tree->merge_throttle);
735 else if (!decrease_only)
736 WT_LSM_MERGE_THROTTLE_INCREASE(
737 lsm_tree->merge_throttle);
738 }
739
740 /* Put an upper bound of 1s on both throttle calculations. */
741 lsm_tree->ckpt_throttle = WT_MIN(WT_MILLION, lsm_tree->ckpt_throttle);
742 lsm_tree->merge_throttle = WT_MIN(WT_MILLION, lsm_tree->merge_throttle);
743
744 /*
745 * Update our estimate of how long each in-memory chunk stays active.
746 * Filter out some noise by keeping a weighted history of the
747 * calculated value. Wait until we have enough chunks that we can
748 * check that the new value is sane: otherwise, after a long idle
749 * period, we can calculate a crazy value.
750 */
751 if (in_memory > 1 && ondisk != NULL) {
752 prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
753 WT_ASSERT(session, prev_chunk->generation == 0);
754 WT_ASSERT(session, WT_TIMECMP(
755 last_chunk->create_time, prev_chunk->create_time) >= 0);
756 timediff = WT_TIMEDIFF_NS(
757 last_chunk->create_time, prev_chunk->create_time);
758 WT_ASSERT(session, WT_TIMECMP(
759 prev_chunk->create_time, ondisk->create_time) >= 0);
760 oldtime = WT_TIMEDIFF_NS(
761 prev_chunk->create_time, ondisk->create_time);
762 if (timediff < 10 * oldtime)
763 lsm_tree->chunk_fill_ms =
764 (3 * lsm_tree->chunk_fill_ms +
765 timediff / WT_MILLION) / 4;
766 }
767 }
768
769 /*
770 * __wt_lsm_tree_switch --
771 * Switch to a new in-memory tree.
772 */
773 int
__wt_lsm_tree_switch(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree)774 __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
775 {
776 WT_DECL_RET;
777 WT_LSM_CHUNK *chunk, *last_chunk;
778 uint32_t chunks_moved, nchunks, new_id;
779 bool first_switch;
780
781 __wt_lsm_tree_writelock(session, lsm_tree);
782
783 nchunks = lsm_tree->nchunks;
784
785 first_switch = nchunks == 0;
786
787 /*
788 * Check if a switch is still needed: we may have raced while waiting
789 * for a lock.
790 */
791 last_chunk = NULL;
792 if (!first_switch &&
793 (last_chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
794 !F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK) &&
795 !lsm_tree->need_switch)
796 goto err;
797
798 /* Update the throttle time. */
799 __wt_lsm_tree_throttle(session, lsm_tree, false);
800
801 new_id = __wt_atomic_add32(&lsm_tree->last, 1);
802
803 WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
804 nchunks + 1, &lsm_tree->chunk));
805
806 __wt_verbose(session, WT_VERB_LSM,
807 "Tree %s switch to: %" PRIu32 ", checkpoint throttle %" PRIu64
808 ", merge throttle %" PRIu64, lsm_tree->name,
809 new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle);
810
811 WT_ERR(__wt_calloc_one(session, &chunk));
812 chunk->id = new_id;
813 chunk->switch_txn = WT_TXN_NONE;
814 lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
815 WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
816
817 WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
818 lsm_tree->need_switch = false;
819 lsm_tree->modified = true;
820
821 /*
822 * Ensure the updated disk generation is visible to all other threads
823 * before updating the transaction ID.
824 */
825 ++lsm_tree->dsk_gen;
826 WT_FULL_BARRIER();
827
828 /*
829 * Set the switch transaction in the previous chunk unless this is
830 * the first chunk in a new or newly opened tree.
831 */
832 if (last_chunk != NULL && last_chunk->switch_txn == WT_TXN_NONE &&
833 !F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK))
834 last_chunk->switch_txn = __wt_txn_id_alloc(session, false);
835
836 /*
837 * If a maximum number of chunks are configured, drop the any chunks
838 * past the limit.
839 */
840 if (lsm_tree->chunk_count_limit != 0 &&
841 lsm_tree->nchunks > lsm_tree->chunk_count_limit) {
842 chunks_moved = lsm_tree->nchunks - lsm_tree->chunk_count_limit;
843 /* Move the last chunk onto the old chunk list. */
844 WT_ERR(__wt_lsm_tree_retire_chunks(
845 session, lsm_tree, 0, chunks_moved));
846
847 /* Update the active chunk list. */
848 lsm_tree->nchunks -= chunks_moved;
849 /* Move the remaining chunks to the start of the active list */
850 memmove(lsm_tree->chunk,
851 lsm_tree->chunk + chunks_moved,
852 lsm_tree->nchunks * sizeof(*lsm_tree->chunk));
853 /* Clear out the chunks at the end of the tree */
854 memset(lsm_tree->chunk + lsm_tree->nchunks,
855 0, chunks_moved * sizeof(*lsm_tree->chunk));
856
857 /* Make sure the manager knows there is work to do. */
858 WT_ERR(__wt_lsm_manager_push_entry(
859 session, WT_LSM_WORK_DROP, 0, lsm_tree));
860 }
861
862 err: __wt_lsm_tree_writeunlock(session, lsm_tree);
863 /*
864 * Errors that happen during a tree switch leave the tree in a state
865 * where we can't make progress. Error out of WiredTiger.
866 */
867 if (ret != 0)
868 WT_PANIC_RET(session, ret, "Failed doing LSM switch");
869 else if (!first_switch)
870 WT_RET(__wt_lsm_manager_push_entry(
871 session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
872 return (ret);
873 }
874
875 /*
876 * __wt_lsm_tree_retire_chunks --
877 * Move a set of chunks onto the old chunks list.
878 * It's the callers responsibility to update the active chunks list.
879 * Must be called with the LSM lock held.
880 */
881 int
__wt_lsm_tree_retire_chunks(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree,u_int start_chunk,u_int nchunks)882 __wt_lsm_tree_retire_chunks(WT_SESSION_IMPL *session,
883 WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks)
884 {
885 u_int i;
886
887 WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
888
889 /* Setup the array of obsolete chunks. */
890 WT_RET(__wt_realloc_def(session, &lsm_tree->old_alloc,
891 lsm_tree->nold_chunks + nchunks, &lsm_tree->old_chunks));
892
893 /* Copy entries one at a time, so we can reuse gaps in the list. */
894 for (i = 0; i < nchunks; i++)
895 lsm_tree->old_chunks[lsm_tree->nold_chunks++] =
896 lsm_tree->chunk[start_chunk + i];
897
898 return (0);
899 }
900
901 /*
902 * __wt_lsm_tree_drop --
903 * Drop an LSM tree.
904 */
905 int
__wt_lsm_tree_drop(WT_SESSION_IMPL * session,const char * name,const char * cfg[])906 __wt_lsm_tree_drop(
907 WT_SESSION_IMPL *session, const char *name, const char *cfg[])
908 {
909 WT_DECL_RET;
910 WT_LSM_CHUNK *chunk;
911 WT_LSM_TREE *lsm_tree;
912 u_int i;
913 int tret;
914 bool locked;
915
916 WT_NOT_READ(locked, false);
917
918 /* Get the LSM tree. */
919 WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
920 WT_ASSERT(session, !lsm_tree->active);
921
922 /* Prevent any new opens. */
923 __wt_lsm_tree_writelock(session, lsm_tree);
924 locked = true;
925
926 /* Drop the chunks. */
927 for (i = 0; i < lsm_tree->nchunks; i++) {
928 chunk = lsm_tree->chunk[i];
929 WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
930 if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
931 WT_ERR(
932 __wt_schema_drop(session, chunk->bloom_uri, cfg));
933 }
934
935 /* Drop any chunks on the obsolete list. */
936 for (i = 0; i < lsm_tree->nold_chunks; i++) {
937 if ((chunk = lsm_tree->old_chunks[i]) == NULL)
938 continue;
939 WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
940 if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
941 WT_ERR(
942 __wt_schema_drop(session, chunk->bloom_uri, cfg));
943 }
944
945 locked = false;
946 __wt_lsm_tree_writeunlock(session, lsm_tree);
947 ret = __wt_metadata_remove(session, name);
948
949 WT_ASSERT(session, !lsm_tree->active);
950 err: if (locked)
951 __wt_lsm_tree_writeunlock(session, lsm_tree);
952 WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
953 tret = __lsm_tree_discard(session, lsm_tree, false));
954 WT_TRET(tret);
955 return (ret);
956 }
957
958 /*
959 * __wt_lsm_tree_rename --
960 * Rename an LSM tree.
961 */
962 int
__wt_lsm_tree_rename(WT_SESSION_IMPL * session,const char * olduri,const char * newuri,const char * cfg[])963 __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
964 const char *olduri, const char *newuri, const char *cfg[])
965 {
966 WT_DECL_RET;
967 WT_LSM_CHUNK *chunk;
968 WT_LSM_TREE *lsm_tree;
969 u_int i;
970 int tret;
971 const char *old;
972 bool locked;
973
974 old = NULL;
975 WT_NOT_READ(locked, false);
976
977 /* Get the LSM tree. */
978 WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree));
979
980 /* Prevent any new opens. */
981 __wt_lsm_tree_writelock(session, lsm_tree);
982 locked = true;
983
984 /* Set the new name. */
985 WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));
986
987 /* Rename the chunks. */
988 for (i = 0; i < lsm_tree->nchunks; i++) {
989 chunk = lsm_tree->chunk[i];
990 old = chunk->uri;
991 chunk->uri = NULL;
992
993 WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree,
994 chunk->id, chunk->generation, &chunk->uri));
995 WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
996 __wt_free(session, old);
997
998 if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
999 old = chunk->bloom_uri;
1000 chunk->bloom_uri = NULL;
1001 WT_ERR(__wt_lsm_tree_bloom_name(
1002 session, lsm_tree, chunk->id, &chunk->bloom_uri));
1003 F_SET(chunk, WT_LSM_CHUNK_BLOOM);
1004 WT_ERR(__wt_schema_rename(
1005 session, old, chunk->uri, cfg));
1006 __wt_free(session, old);
1007 }
1008 }
1009
1010 WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
1011 locked = false;
1012 __wt_lsm_tree_writeunlock(session, lsm_tree);
1013 WT_ERR(__wt_metadata_remove(session, olduri));
1014
1015 err: if (locked)
1016 __wt_lsm_tree_writeunlock(session, lsm_tree);
1017 __wt_free(session, old);
1018
1019 /*
1020 * Discard this LSM tree structure. The first operation on the renamed
1021 * tree will create a new one.
1022 */
1023 WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
1024 tret = __lsm_tree_discard(session, lsm_tree, false));
1025 WT_TRET(tret);
1026 return (ret);
1027 }
1028
1029 /*
1030 * __wt_lsm_tree_truncate --
1031 * Truncate an LSM tree.
1032 */
1033 int
__wt_lsm_tree_truncate(WT_SESSION_IMPL * session,const char * name,const char * cfg[])1034 __wt_lsm_tree_truncate(
1035 WT_SESSION_IMPL *session, const char *name, const char *cfg[])
1036 {
1037 WT_DECL_RET;
1038 WT_LSM_CHUNK *chunk;
1039 WT_LSM_TREE *lsm_tree;
1040 int tret;
1041 bool locked;
1042
1043 WT_UNUSED(cfg);
1044
1045 chunk = NULL;
1046 WT_NOT_READ(locked, false);
1047
1048 /* Get the LSM tree. */
1049 WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
1050
1051 /* Prevent any new opens. */
1052 __wt_lsm_tree_writelock(session, lsm_tree);
1053 locked = true;
1054
1055 /* Create the new chunk. */
1056 WT_ERR(__wt_calloc_one(session, &chunk));
1057 chunk->id = __wt_atomic_add32(&lsm_tree->last, 1);
1058 WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
1059
1060 /* Mark all chunks old. */
1061 WT_ERR(__wt_lsm_merge_update_tree(
1062 session, lsm_tree, 0, lsm_tree->nchunks, chunk));
1063
1064 WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
1065
1066 locked = false;
1067 __wt_lsm_tree_writeunlock(session, lsm_tree);
1068 __wt_lsm_tree_release(session, lsm_tree);
1069
1070 err: if (locked)
1071 __wt_lsm_tree_writeunlock(session, lsm_tree);
1072 if (ret != 0) {
1073 if (chunk != NULL) {
1074 WT_TRET(__wt_schema_drop(session, chunk->uri, NULL));
1075 __wt_free(session, chunk);
1076 }
1077 /*
1078 * Discard the LSM tree structure on error. This will force the
1079 * LSM tree to be re-opened the next time it is accessed and
1080 * the last good version of the metadata will be used, resulting
1081 * in a valid (not truncated) tree.
1082 */
1083 WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
1084 tret = __lsm_tree_discard(session, lsm_tree, false));
1085 WT_TRET(tret);
1086 }
1087 return (ret);
1088 }
1089
1090 /*
1091 * __wt_lsm_tree_readlock --
1092 * Acquire a shared lock on an LSM tree.
1093 */
1094 void
__wt_lsm_tree_readlock(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree)1095 __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
1096 {
1097 __wt_readlock(session, &lsm_tree->rwlock);
1098
1099 /*
1100 * Diagnostic: avoid deadlocks with the schema lock: if we need it for
1101 * an operation, we should already have it.
1102 */
1103 F_SET(session,
1104 WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
1105 }
1106
1107 /*
1108 * __wt_lsm_tree_readunlock --
1109 * Release a shared lock on an LSM tree.
1110 */
1111 void
__wt_lsm_tree_readunlock(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree)1112 __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
1113 {
1114 F_CLR(session,
1115 WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
1116
1117 __wt_readunlock(session, &lsm_tree->rwlock);
1118 }
1119
1120 /*
1121 * __wt_lsm_tree_writelock --
1122 * Acquire an exclusive lock on an LSM tree.
1123 */
1124 void
__wt_lsm_tree_writelock(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree)1125 __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
1126 {
1127 __wt_writelock(session, &lsm_tree->rwlock);
1128
1129 /*
1130 * Diagnostic: avoid deadlocks with the schema lock: if we need it for
1131 * an operation, we should already have it.
1132 */
1133 F_SET(session,
1134 WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
1135 }
1136
1137 /*
1138 * __wt_lsm_tree_writeunlock --
1139 * Release an exclusive lock on an LSM tree.
1140 */
1141 void
__wt_lsm_tree_writeunlock(WT_SESSION_IMPL * session,WT_LSM_TREE * lsm_tree)1142 __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
1143 {
1144 F_CLR(session,
1145 WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
1146
1147 __wt_writeunlock(session, &lsm_tree->rwlock);
1148 }
1149
1150 /*
1151 * __wt_lsm_compact --
1152 * Compact an LSM tree called via __wt_schema_worker.
1153 */
1154 int
__wt_lsm_compact(WT_SESSION_IMPL * session,const char * name,bool * skipp)1155 __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
1156 {
1157 WT_DECL_RET;
1158 WT_LSM_CHUNK *chunk;
1159 WT_LSM_TREE *lsm_tree;
1160 uint64_t progress;
1161 uint32_t i;
1162 bool compacting, flushing, locked, push_flush, ref;
1163
1164 compacting = flushing = locked = ref = false;
1165 chunk = NULL;
1166 /*
1167 * This function is applied to all matching sources: ignore anything
1168 * that is not an LSM tree.
1169 */
1170 if (!WT_PREFIX_MATCH(name, "lsm:"))
1171 return (0);
1172
1173 /* Tell __wt_schema_worker not to look inside the LSM tree. */
1174 *skipp = true;
1175
1176 WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree));
1177
1178 if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
1179 WT_ERR_MSG(session, EINVAL,
1180 "LSM compaction requires active merge threads");
1181
1182 /*
1183 * There is no work to do if there is only a single chunk in the tree
1184 * and it has a bloom filter or is configured to never have a bloom
1185 * filter.
1186 */
1187 if (lsm_tree->nchunks == 1 &&
1188 (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
1189 F_ISSET(lsm_tree->chunk[0], WT_LSM_CHUNK_BLOOM))) {
1190 __wt_lsm_tree_release(session, lsm_tree);
1191 return (0);
1192 }
1193
1194 /*
1195 * Compacting has two distinct phases.
1196 * 1. All in-memory chunks up to and including the current
1197 * current chunk must be flushed. Normally, the flush code
1198 * does not flush the last, in-use chunk, so we set a force
1199 * flag to include that last chunk. We monitor the state of the
1200 * last chunk and periodically push another forced flush work
1201 * unit until it is complete.
1202 * 2. After all flushing is done, we move onto the merging
1203 * phase for compaction. Again, we monitor the state and
1204 * continue to push merge work units until all merging is done.
1205 */
1206
1207 /* Lock the tree: single-thread compaction. */
1208 __wt_lsm_tree_writelock(session, lsm_tree);
1209 locked = true;
1210
1211 /* Clear any merge throttle: compact throws out that calculation. */
1212 lsm_tree->merge_throttle = 0;
1213 lsm_tree->merge_aggressiveness = 0;
1214 progress = lsm_tree->merge_progressing;
1215
1216 /* If another thread started a compact on this tree, we're done. */
1217 if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
1218 goto err;
1219
1220 /*
1221 * Set the switch transaction on the current chunk, if it
1222 * hasn't been set before. This prevents further writes, so it
1223 * can be flushed by the checkpoint worker. If this is a newly
1224 * opened tree the primary chunk may already be stable. Only
1225 * push a flush work unit if necessary.
1226 */
1227 push_flush = false;
1228 if (lsm_tree->nchunks > 0 &&
1229 (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL &&
1230 !F_ISSET(chunk, (WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE))) {
1231 push_flush = true;
1232 if (chunk->switch_txn == WT_TXN_NONE) {
1233 /*
1234 * Make sure any cursors open on the tree see the
1235 * new switch generation before updating.
1236 */
1237 ++lsm_tree->dsk_gen;
1238 WT_FULL_BARRIER();
1239 chunk->switch_txn = __wt_txn_id_alloc(session, false);
1240 }
1241 /*
1242 * If we have a chunk, we want to look for it to be on-disk.
1243 * So we need to add a reference to keep it available.
1244 */
1245 (void)__wt_atomic_add32(&chunk->refcnt, 1);
1246 ref = true;
1247 }
1248
1249 if (push_flush) {
1250 __wt_verbose(session, WT_VERB_LSM,
1251 "Compact force flush %s flags 0x%" PRIx32
1252 " chunk %" PRIu32 " flags 0x%" PRIx32,
1253 name, lsm_tree->flags, chunk->id, chunk->flags);
1254 flushing = true;
1255 locked = false;
1256 __wt_lsm_tree_writeunlock(session, lsm_tree);
1257 /*
1258 * Make sure the in-memory chunk gets flushed do not push a
1259 * switch, because we don't want to create a new in-memory
1260 * chunk if the tree is being used read-only now.
1261 */
1262 WT_ERR(__wt_lsm_manager_push_entry(session,
1263 WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
1264 } else {
1265 /*
1266 * If there is no chunk to flush, go straight to the
1267 * compacting state.
1268 */
1269 compacting = true;
1270 progress = lsm_tree->merge_progressing;
1271 F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
1272 __wt_verbose(session, WT_VERB_LSM,
1273 "COMPACT: Start compacting %s", lsm_tree->name);
1274 locked = false;
1275 __wt_lsm_tree_writeunlock(session, lsm_tree);
1276 }
1277
1278 /* Wait for the work unit queues to drain. */
1279 while (lsm_tree->active) {
1280 /*
1281 * The flush flag is cleared when the chunk has been flushed.
1282 * Continue to push forced flushes until the chunk is on disk.
1283 * Once it is on disk move to the compacting phase.
1284 */
1285 if (flushing) {
1286 WT_ASSERT(session, chunk != NULL);
1287 if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
1288 __wt_verbose(session,
1289 WT_VERB_LSM,
1290 "Compact flush done %s chunk %" PRIu32 ". "
1291 "Start compacting progress %" PRIu64,
1292 name, chunk->id,
1293 lsm_tree->merge_progressing);
1294 (void)__wt_atomic_sub32(&chunk->refcnt, 1);
1295 flushing = ref = false;
1296 compacting = true;
1297 F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
1298 progress = lsm_tree->merge_progressing;
1299 } else {
1300 __wt_verbose(session, WT_VERB_LSM,
1301 "Compact flush retry %s chunk %" PRIu32,
1302 name, chunk->id);
1303 WT_ERR(__wt_lsm_manager_push_entry(session,
1304 WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
1305 lsm_tree));
1306 }
1307 }
1308
1309 /*
1310 * The compacting flag is cleared when no merges can be done.
1311 * Ensure that we push through some aggressive merges before
1312 * stopping otherwise we might not do merges that would
1313 * span chunks with different generations.
1314 */
1315 if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
1316 if (lsm_tree->merge_aggressiveness < 10 ||
1317 (progress < lsm_tree->merge_progressing) ||
1318 lsm_tree->merge_syncing) {
1319 progress = lsm_tree->merge_progressing;
1320 F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
1321 lsm_tree->merge_aggressiveness = 10;
1322 } else
1323 break;
1324 }
1325
1326 /*
1327 * Periodically check if we've timed out or eviction is stuck.
1328 * Quit if eviction is stuck, we're making the problem worse.
1329 */
1330 WT_ERR(__wt_session_compact_check_timeout(session));
1331 if (__wt_cache_stuck(session))
1332 WT_ERR(EBUSY);
1333 __wt_sleep(1, 0);
1334
1335 /*
1336 * Push merge operations while they are still getting work
1337 * done. If we are pushing merges, make sure they are
1338 * aggressive, to avoid duplicating effort.
1339 */
1340 if (compacting)
1341 #define COMPACT_PARALLEL_MERGES 5
1342 for (i = lsm_tree->queue_ref;
1343 i < COMPACT_PARALLEL_MERGES; i++) {
1344 lsm_tree->merge_aggressiveness = 10;
1345 WT_ERR(__wt_lsm_manager_push_entry(
1346 session, WT_LSM_WORK_MERGE, 0, lsm_tree));
1347 }
1348 }
1349 err:
1350 /* Ensure anything we set is cleared. */
1351 if (ref)
1352 (void)__wt_atomic_sub32(&chunk->refcnt, 1);
1353 if (compacting) {
1354 F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
1355 lsm_tree->merge_aggressiveness = 0;
1356 }
1357 if (locked)
1358 __wt_lsm_tree_writeunlock(session, lsm_tree);
1359
1360 __wt_verbose(session, WT_VERB_LSM,
1361 "Compact %s complete, return %d", name, ret);
1362
1363 __wt_lsm_tree_release(session, lsm_tree);
1364 return (ret);
1365 }
1366
1367 /*
1368 * __wt_lsm_tree_worker --
1369 * Run a schema worker operation on each level of a LSM tree.
1370 */
1371 int
__wt_lsm_tree_worker(WT_SESSION_IMPL * session,const char * uri,int (* file_func)(WT_SESSION_IMPL *,const char * []),int (* name_func)(WT_SESSION_IMPL *,const char *,bool *),const char * cfg[],uint32_t open_flags)1372 __wt_lsm_tree_worker(WT_SESSION_IMPL *session,
1373 const char *uri,
1374 int (*file_func)(WT_SESSION_IMPL *, const char *[]),
1375 int (*name_func)(WT_SESSION_IMPL *, const char *, bool *),
1376 const char *cfg[], uint32_t open_flags)
1377 {
1378 WT_DECL_RET;
1379 WT_LSM_CHUNK *chunk;
1380 WT_LSM_TREE *lsm_tree;
1381 u_int i;
1382 bool exclusive, locked, need_release;
1383
1384 WT_NOT_READ(locked, false);
1385 WT_NOT_READ(need_release, false);
1386 exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE);
1387
1388 WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
1389 need_release = true;
1390
1391 /*
1392 * We mark that we're busy using the tree to coordinate
1393 * with merges so that merging doesn't change the chunk
1394 * array out from underneath us.
1395 */
1396 if (exclusive)
1397 __wt_lsm_tree_writelock(session, lsm_tree);
1398 else
1399 __wt_lsm_tree_readlock(session, lsm_tree);
1400 locked = true;
1401 for (i = 0; i < lsm_tree->nchunks; i++) {
1402 chunk = lsm_tree->chunk[i];
1403 /*
1404 * If the chunk is on disk, don't include underlying handles in
1405 * the checkpoint. Checking the "get handles" function is all
1406 * we need to do, no further checkpoint calls are done if the
1407 * handle is not gathered.
1408 */
1409 if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
1410 file_func == __wt_checkpoint_get_handles)
1411 continue;
1412 WT_ERR(__wt_schema_worker(session, chunk->uri,
1413 file_func, name_func, cfg, open_flags));
1414 if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
1415 WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
1416 file_func, name_func, cfg, open_flags));
1417 }
1418 /*
1419 * If this was an alter operation, we need to alter the configuration
1420 * for the overall tree and then reread it so it isn't out of date.
1421 * Reread it here so that we update the configuration of the
1422 * current tree's structure to any new, altered values.
1423 */
1424 if (FLD_ISSET(open_flags, WT_BTREE_ALTER)) {
1425 WT_ERR(__wt_lsm_meta_write(session, lsm_tree, cfg[0]));
1426
1427 locked = false;
1428 if (exclusive)
1429 __wt_lsm_tree_writeunlock(session, lsm_tree);
1430 else
1431 __wt_lsm_tree_readunlock(session, lsm_tree);
1432
1433 /*
1434 * We rewrote the meta-data. Discard the tree and the next
1435 * access will reopen it.
1436 */
1437 need_release = false;
1438 WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
1439 ret = __lsm_tree_discard(session, lsm_tree, false));
1440 WT_ERR(ret);
1441 }
1442
1443 err: if (locked) {
1444 if (exclusive)
1445 __wt_lsm_tree_writeunlock(session, lsm_tree);
1446 else
1447 __wt_lsm_tree_readunlock(session, lsm_tree);
1448 }
1449 if (need_release)
1450 __wt_lsm_tree_release(session, lsm_tree);
1451 return (ret);
1452 }
1453