1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 /*
12 * Compaction is the place where the underlying block manager becomes visible
13 * in the higher engine btree and API layers. As there is currently only one
14 * block manager, this code is written with it in mind: other block managers
15 * may need changes to support compaction, and a smart block manager might need
16 * far less support from the engine.
17 *
18 * First, the default block manager cannot entirely own compaction because it
19 * has no way to find a block after it moves other than a request from the
20 * btree layer with the new address. In other words, if internal page X points
21 * to leaf page Y, and page Y moves, the address of page Y has to be updated in
22 * page X. Generally, this is solved by building a translation layer in the
23 * block manager so internal pages don't require updates to relocate blocks:
24 * however, the translation table must be durable, has its own garbage
25 * collection issues and might be slower, all of which have their own problems.
26 *
27 * Second, the btree layer cannot entirely own compaction because page
28 * addresses are opaque, it cannot know where a page is in the file from the
29 * address cookie.
30 *
31 * For these reasons, compaction is a cooperative process between the btree
32 * layer and the block manager. The btree layer walks files, and asks the
33 * block manager if rewriting a particular block would reduce the file
34 * footprint: if writing the page will help, the page is marked dirty so it
35 * will eventually be written. As pages are written, the original page
36 * potentially becomes available for reuse and if enough pages at the end of
37 * the file are available for reuse, the file can be truncated, and compaction
38 * succeeds.
39 *
40 * However, writing a page is not by itself sufficient to make a page available
41 * for reuse. The original version of the page is still referenced by at least
42 * the most recent checkpoint in the file. To make a page available for reuse,
43 * we have to checkpoint the file so we can discard the checkpoint referencing
44 * the original version of the block; once no checkpoint references a block, it
45 * becomes available for reuse.
46 *
47 * Compaction is not necessarily possible in WiredTiger, even in a file with
48 * lots of available space. If a block at the end of the file is referenced by
49 * a named checkpoint, there is nothing we can do to compact the file, no
50 * matter how many times we rewrite the block, the named checkpoint can't be
51 * discarded and so the reference count on the original block will never go to
52 * zero. What's worse, because the block manager doesn't reference count
53 * blocks, it can't easily know this is the case, and so we'll waste a lot of
54 * effort trying to compact files that can't be compacted.
55 *
56 * Finally, compaction checkpoints are database-wide, otherwise we can corrupt
57 * file relationships, for example, an index checkpointed by compaction could
58 * be out of sync with the primary after a crash.
59 *
60 * Now, to the actual process. First, we checkpoint the database: there are
61 * potentially many dirty blocks in the cache, and we want to write them out
62 * and then discard previous checkpoints so we have as many blocks as possible
63 * on the file's "available for reuse" list when we start compaction.
64 *
65 * Then, we compact the high-level object.
66 *
67 * Compacting the object is done 10% at a time, that is, we try and move blocks
68 * from the last 10% of the file into the beginning of the file (the 10% is
69 * hard coded in the block manager). The reason for this is because we are
70 * walking the file in logical order, not block offset order, and we can fail
71 * to compact a file if we write the wrong blocks first.
72 *
73 * For example, imagine a file with 10 blocks in the first 10% of a file, 1,000
74 * blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the
75 * file. If we were to rewrite blocks from more than the last 10% of the file,
76 * and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy
77 * 10 of them without ever rewriting the blocks from the end of the file which
78 * would allow us to compact the file. So, we compact the last 10% of the
79 * file, and if that works, we compact the last 10% of the file again, and so
80 * on. Note the block manager uses a first-fit block selection algorithm
81 * during compaction to maximize block movement.
82 *
83 * After each 10% compaction, we checkpoint two more times (seriously, twice).
84 * The second and third checkpoints are because the block manager checkpoints
85 * in two steps: blocks made available for reuse during a checkpoint are put on
86 * a special checkpoint-available list and only moved to the real available
87 * list after the metadata has been updated with the new checkpoint's
88 * information. (Otherwise it is possible to allocate a rewritten block, crash
89 * before the metadata is updated, and see corruption.) For this reason,
90 * blocks allocated to write the checkpoint itself cannot be taken from the
91 * blocks made available by the checkpoint.
92 *
93 * To say it another way, the second checkpoint puts the blocks from the end of
94 * the file that were made available by compaction onto the checkpoint-available
95 * list, but then potentially writes the checkpoint itself at the end of the
96 * file, which would prevent any file truncation. When the metadata is updated
97 * for the second checkpoint, the blocks freed by compaction become available
98 * for the third checkpoint, so the third checkpoint's blocks are written
99 * towards the beginning of the file, and then the file can be truncated.
100 */
101
102 /*
103 * __compact_start --
104 * Start object compaction.
105 */
106 static int
__compact_start(WT_SESSION_IMPL * session)107 __compact_start(WT_SESSION_IMPL *session)
108 {
109 WT_BM *bm;
110
111 bm = S2BT(session)->bm;
112 return (bm->compact_start(bm, session));
113 }
114
115 /*
116 * __compact_end --
117 * End object compaction.
118 */
119 static int
__compact_end(WT_SESSION_IMPL * session)120 __compact_end(WT_SESSION_IMPL *session)
121 {
122 WT_BM *bm;
123
124 bm = S2BT(session)->bm;
125 return (bm->compact_end(bm, session));
126 }
127
128 /*
129 * __compact_uri_analyze --
130 * Extract information relevant to deciding what work compact needs to
131 * do from a URI that is part of a table schema.
132 * Called via the schema_worker function.
133 */
134 static int
__compact_uri_analyze(WT_SESSION_IMPL * session,const char * uri,bool * skipp)135 __compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp)
136 {
137 /*
138 * Add references to schema URI objects to the list of objects to be
139 * compacted. Skip over LSM trees or we will get false positives on
140 * the "file:" URIs for the chunks.
141 */
142 if (WT_PREFIX_MATCH(uri, "lsm:")) {
143 session->compact->lsm_count++;
144 *skipp = true;
145 } else if (WT_PREFIX_MATCH(uri, "file:"))
146 session->compact->file_count++;
147
148 return (0);
149 }
150
151 /*
152 * __compact_handle_append --
153 * Gather a file handle to be compacted.
154 * Called via the schema_worker function.
155 */
156 static int
__compact_handle_append(WT_SESSION_IMPL * session,const char * cfg[])157 __compact_handle_append(WT_SESSION_IMPL *session, const char *cfg[])
158 {
159 WT_DECL_RET;
160
161 WT_UNUSED(cfg);
162
163 WT_RET(__wt_session_get_dhandle(
164 session, session->dhandle->name, NULL, NULL, 0));
165
166 /* Set compact active on the handle. */
167 if ((ret = __compact_start(session)) != 0) {
168 WT_TRET(__wt_session_release_dhandle(session));
169 return (ret);
170 }
171
172 /* Make sure there is space for the next entry. */
173 WT_RET(__wt_realloc_def(session, &session->op_handle_allocated,
174 session->op_handle_next + 1, &session->op_handle));
175
176 session->op_handle[session->op_handle_next++] = session->dhandle;
177 return (0);
178 }
179
180 /*
181 * __wt_session_compact_check_timeout --
182 * Check if the timeout has been exceeded.
183 */
184 int
__wt_session_compact_check_timeout(WT_SESSION_IMPL * session)185 __wt_session_compact_check_timeout(WT_SESSION_IMPL *session)
186 {
187 struct timespec end;
188
189 if (session->compact->max_time == 0)
190 return (0);
191
192 __wt_epoch(session, &end);
193 return (session->compact->max_time >
194 WT_TIMEDIFF_SEC(end, session->compact->begin) ? 0 : ETIMEDOUT);
195 }
196
197 /*
198 * __compact_checkpoint --
199 * Perform a checkpoint for compaction.
200 */
201 static int
__compact_checkpoint(WT_SESSION_IMPL * session)202 __compact_checkpoint(WT_SESSION_IMPL *session)
203 {
204 WT_DECL_RET;
205 WT_TXN_GLOBAL *txn_global;
206 uint64_t txn_gen;
207
208 /*
209 * Force compaction checkpoints: we don't want to skip it because the
210 * work we need to have done is done in the underlying block manager.
211 */
212 const char *checkpoint_cfg[] = {
213 WT_CONFIG_BASE(session, WT_SESSION_checkpoint), "force=1", NULL };
214
215 /* Checkpoints take a lot of time, check if we've run out. */
216 WT_RET(__wt_session_compact_check_timeout(session));
217
218 if ((ret = __wt_txn_checkpoint(session, checkpoint_cfg, false)) == 0)
219 return (0);
220 WT_RET_BUSY_OK(ret);
221
222 /*
223 * If there's a checkpoint running, wait for it to complete, checking if
224 * we're out of time. If there's no checkpoint running or the checkpoint
225 * generation number changes, the checkpoint blocking us has completed.
226 */
227 txn_global = &S2C(session)->txn_global;
228 for (txn_gen = __wt_gen(session, WT_GEN_CHECKPOINT);;) {
229 /*
230 * This loop only checks objects that are declared volatile,
231 * therefore no barriers are needed.
232 */
233 if (!txn_global->checkpoint_running ||
234 txn_gen != __wt_gen(session, WT_GEN_CHECKPOINT))
235 break;
236
237 WT_RET(__wt_session_compact_check_timeout(session));
238 __wt_sleep(2, 0);
239 }
240
241 return (0);
242 }
243
244 /*
245 * __compact_worker --
246 * Function to alternate between checkpoints and compaction calls.
247 */
248 static int
__compact_worker(WT_SESSION_IMPL * session)249 __compact_worker(WT_SESSION_IMPL *session)
250 {
251 WT_DECL_RET;
252 u_int i, loop;
253 bool another_pass;
254
255 /*
256 * Reset the handles' compaction skip flag (we don't bother setting
257 * or resetting it when we finish compaction, it's simpler to do it
258 * once, here).
259 */
260 for (i = 0; i < session->op_handle_next; ++i)
261 session->op_handle[i]->compact_skip = false;
262
263 /*
264 * Perform an initial checkpoint (see this file's leading comment for
265 * details).
266 */
267 WT_ERR(__compact_checkpoint(session));
268
269 /*
270 * We compact 10% of a file on each pass (but the overall size of the
271 * file is decreasing each time, so we're not compacting 10% of the
272 * original file each time). Try 100 times (which is clearly more than
273 * we need); quit if we make no progress.
274 */
275 for (loop = 0; loop < 100; ++loop) {
276 /* Step through the list of files being compacted. */
277 for (another_pass = false,
278 i = 0; i < session->op_handle_next; ++i) {
279 /* Skip objects where there's no more work. */
280 if (session->op_handle[i]->compact_skip)
281 continue;
282
283 session->compact_state = WT_COMPACT_RUNNING;
284 WT_WITH_DHANDLE(session,
285 session->op_handle[i], ret = __wt_compact(session));
286
287 /*
288 * If successful and we did work, schedule another pass.
289 * If successful and we did no work, skip this file in
290 * the future.
291 */
292 if (ret == 0) {
293 if (session->
294 compact_state == WT_COMPACT_SUCCESS)
295 another_pass = true;
296 else
297 session->
298 op_handle[i]->compact_skip = true;
299 continue;
300 }
301
302 /*
303 * If compaction failed because checkpoint was running,
304 * continue with the next handle. We might continue to
305 * race with checkpoint on each handle, but that's OK,
306 * we'll step through all the handles, and then we'll
307 * block until a checkpoint completes.
308 *
309 * Just quit if eviction is the problem.
310 */
311 if (ret == EBUSY) {
312 if (__wt_cache_stuck(session)) {
313 WT_ERR_MSG(session, EBUSY,
314 "compaction halted by eviction "
315 "pressure");
316 }
317 ret = 0;
318 another_pass = true;
319 }
320 WT_ERR(ret);
321 }
322 if (!another_pass)
323 break;
324
325 /*
326 * Perform two checkpoints (see this file's leading comment for
327 * details).
328 */
329 WT_ERR(__compact_checkpoint(session));
330 WT_ERR(__compact_checkpoint(session));
331 }
332
333 err: session->compact_state = WT_COMPACT_NONE;
334
335 return (ret);
336 }
337
338 /*
339 * __wt_session_compact --
340 * WT_SESSION.compact method.
341 */
342 int
__wt_session_compact(WT_SESSION * wt_session,const char * uri,const char * config)343 __wt_session_compact(
344 WT_SESSION *wt_session, const char *uri, const char *config)
345 {
346 WT_COMPACT_STATE compact;
347 WT_CONFIG_ITEM cval;
348 WT_DATA_SOURCE *dsrc;
349 WT_DECL_RET;
350 WT_SESSION_IMPL *session;
351 u_int i;
352 bool ignore_cache_size_set;
353
354 ignore_cache_size_set = false;
355
356 session = (WT_SESSION_IMPL *)wt_session;
357 SESSION_API_CALL(session, compact, config, cfg);
358
359 /*
360 * The compaction thread should not block when the cache is full: it is
361 * holding locks blocking checkpoints and once the cache is full, it can
362 * spend a long time doing eviction.
363 */
364 if (!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) {
365 ignore_cache_size_set = true;
366 F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
367 }
368
369 /* In-memory ignores compaction operations. */
370 if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
371 goto err;
372
373 /*
374 * Non-LSM object compaction requires checkpoints, which are impossible
375 * in transactional contexts. Disallow in all contexts (there's no
376 * reason for LSM to allow this, possible or not), and check now so the
377 * error message isn't confusing.
378 */
379 WT_ERR(__wt_txn_context_check(session, false));
380
381 /* Disallow objects in the WiredTiger name space. */
382 WT_ERR(__wt_str_name_check(session, uri));
383
384 if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
385 !WT_PREFIX_MATCH(uri, "file:") &&
386 !WT_PREFIX_MATCH(uri, "index:") &&
387 !WT_PREFIX_MATCH(uri, "lsm:") &&
388 !WT_PREFIX_MATCH(uri, "table:")) {
389 if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
390 ret = dsrc->compact == NULL ?
391 __wt_object_unsupported(session, uri) :
392 dsrc->compact(
393 dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg);
394 else
395 ret = __wt_bad_object_type(session, uri);
396 goto err;
397 }
398
399 /* Setup the session handle's compaction state structure. */
400 memset(&compact, 0, sizeof(WT_COMPACT_STATE));
401 session->compact = &compact;
402
403 /* Compaction can be time-limited. */
404 WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval));
405 session->compact->max_time = (uint64_t)cval.val;
406 __wt_epoch(session, &session->compact->begin);
407
408 /* Find the types of data sources being compacted. */
409 WT_WITH_SCHEMA_LOCK(session,
410 ret = __wt_schema_worker(session, uri,
411 __compact_handle_append, __compact_uri_analyze, cfg, 0));
412 WT_ERR(ret);
413
414 if (session->compact->lsm_count != 0)
415 WT_ERR(__wt_schema_worker(
416 session, uri, NULL, __wt_lsm_compact, cfg, 0));
417 if (session->compact->file_count != 0)
418 WT_ERR(__compact_worker(session));
419
420 err: session->compact = NULL;
421
422 for (i = 0; i < session->op_handle_next; ++i) {
423 WT_WITH_DHANDLE(session, session->op_handle[i],
424 WT_TRET(__compact_end(session)));
425 WT_WITH_DHANDLE(session, session->op_handle[i],
426 WT_TRET(__wt_session_release_dhandle(session)));
427 }
428
429 __wt_free(session, session->op_handle);
430 session->op_handle_allocated = session->op_handle_next = 0;
431
432 /*
433 * Release common session resources (for example, checkpoint may acquire
434 * significant reconciliation structures/memory).
435 */
436 WT_TRET(__wt_session_release_resources(session));
437
438 if (ignore_cache_size_set)
439 F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE);
440
441 if (ret != 0)
442 WT_STAT_CONN_INCR(session, session_table_compact_fail);
443 else
444 WT_STAT_CONN_INCR(session, session_table_compact_success);
445 API_END_RET_NOTFOUND_MAP(session, ret);
446 }
447
448 /*
449 * __wt_session_compact_readonly --
450 * WT_SESSION.compact method; readonly version.
451 */
452 int
__wt_session_compact_readonly(WT_SESSION * wt_session,const char * uri,const char * config)453 __wt_session_compact_readonly(
454 WT_SESSION *wt_session, const char *uri, const char *config)
455 {
456 WT_DECL_RET;
457 WT_SESSION_IMPL *session;
458
459 WT_UNUSED(uri);
460 WT_UNUSED(config);
461
462 session = (WT_SESSION_IMPL *)wt_session;
463 SESSION_API_CALL_NOCONF(session, compact);
464
465 WT_STAT_CONN_INCR(session, session_table_compact_fail);
466 ret = __wt_session_notsup(session);
467 err: API_END_RET(session, ret);
468 }
469