1 /*-
2 * Public Domain 2014-2018 MongoDB, Inc.
3 * Public Domain 2008-2014 WiredTiger, Inc.
4 *
5 * This is free and unencumbered software released into the public domain.
6 *
7 * Anyone is free to copy, modify, publish, use, compile, sell, or
8 * distribute this software, either in source code form or as a compiled
9 * binary, for any purpose, commercial or non-commercial, and by any
10 * means.
11 *
12 * In jurisdictions that recognize copyright laws, the author or authors
13 * of this software dedicate any and all copyright interest in the
14 * software to the public domain. We make this dedication for the benefit
15 * of the public at large and to the detriment of our heirs and
16 * successors. We intend this dedication to be an overt act of
17 * relinquishment in perpetuity of all present and future rights to this
18 * software under copyright law.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
24 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
25 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26 * OTHER DEALINGS IN THE SOFTWARE.
27 */
28 #include "test_util.h"
29
30 /*
31 * JIRA ticket reference: WT-3363
32 *
33 * Test case description: There are a number of operations that we run that we
34 * expect not to conflict with or block against a running checkpoint. This test
35 * aims to run repeated checkpoints in a thread, while running an assortment
36 * of operations that we expect to execute quickly on further threads. To
37 * ensure that we catch any blockages we introduce a very large delay into the
38 * checkpoint and measure that no operation takes 1/2 the length of this delay.
39 *
40 * Failure mode: We monitor the execution time of all operations and if we find
41 * any operation taking longer than 1/2 the delay time, we abort dumping a core
42 * file which can be used to determine what operation was blocked.
43 */
44 static WT_THREAD_RET do_checkpoints(void *);
45 static WT_THREAD_RET do_ops(void *);
46 static WT_THREAD_RET monitor(void *);
47
48 /*
49 * Time delay to introduce into checkpoints in seconds. Should be at-least
50 * double the maximum time that any one of the operations should take. Currently
51 * this is set to 10 seconds and we expect no single operation to take longer
52 * than 5 seconds.
53 */
54 #define MAX_EXECUTION_TIME 10
55 #define N_THREADS 10
56
57 /*
58 * Number of seconds to execute for. Initially set to 15 minutes, as we need to
59 * run long enough to be certain we have captured any blockages. In initial
60 * testing 5 minutes was enough to reproduce the issue, so we run for 3x that
61 * here to ensure we reproduce before declaring success.
62 */
63 #define RUNTIME 900.0
64
65 static WT_EVENT_HANDLER event_handler = {
66 handle_op_error,
67 handle_op_message,
68 NULL,
69 NULL
70 };
71
72 int
main(int argc,char * argv[])73 main(int argc, char *argv[])
74 {
75 TEST_OPTS *opts, _opts;
76 TEST_PER_THREAD_OPTS thread_args[N_THREADS];
77 pthread_t ckpt_thread, mon_thread, threads[N_THREADS];
78 int i;
79
80 /*
81 * This test should not run unless long tests flag is set. The test
82 * runs for 15 minutes.
83 */
84 if (!testutil_is_flag_set("TESTUTIL_ENABLE_TIMING_TESTS"))
85 return (EXIT_SUCCESS);
86
87 opts = &_opts;
88 opts->unique_id = 0;
89 memset(opts, 0, sizeof(*opts));
90
91 testutil_check(testutil_parse_opts(argc, argv, opts));
92 testutil_make_work_dir(opts->home);
93
94 testutil_check(wiredtiger_open(opts->home, &event_handler,
95 "create,cache_size=1G,timing_stress_for_test=[checkpoint_slow]",
96 &opts->conn));
97
98 testutil_check(pthread_create(
99 &ckpt_thread, NULL, do_checkpoints, opts));
100
101 for (i = 0; i < N_THREADS; ++i) {
102 thread_args[i].testopts = opts;
103 thread_args[i].thread_counter = 0;
104 thread_args[i].threadnum = i;
105 testutil_check(pthread_create(
106 &threads[i], NULL, do_ops, &thread_args[i]));
107 }
108
109 /*
110 * Pass the whole array of thread arguments to the monitoring thread.
111 * This thread will need to monitor each threads counter to track if it
112 * is stuck.
113 */
114 testutil_check(pthread_create(&mon_thread, NULL, monitor, thread_args));
115
116 for (i = 0; i < N_THREADS; ++i)
117 testutil_check(pthread_join(threads[i], NULL));
118
119 testutil_check(pthread_join(mon_thread, NULL));
120
121 testutil_check(pthread_join(ckpt_thread, NULL));
122
123 printf("Success\n");
124
125 testutil_cleanup(opts);
126 return (EXIT_SUCCESS);
127 }
128
129 /*
130 * Function for repeatedly running checkpoint operations.
131 */
132 static WT_THREAD_RET
do_checkpoints(void * _opts)133 do_checkpoints(void *_opts)
134 {
135 TEST_OPTS *opts;
136 WT_SESSION *session;
137 time_t now, start;
138 int ret;
139
140 opts = (TEST_OPTS *)_opts;
141 (void)time(&start);
142 (void)time(&now);
143
144 while (difftime(now, start) < RUNTIME) {
145 testutil_check(
146 opts->conn->open_session(opts->conn, NULL, NULL, &session));
147
148 if ((ret = session->checkpoint(session, "force")) != 0)
149 if (ret != EBUSY && ret != ENOENT)
150 testutil_die(ret, "session.checkpoint");
151
152 testutil_check(session->close(session, NULL));
153
154 /*
155 * A short sleep to let operations process and avoid back to
156 * back checkpoints locking up resources.
157 */
158 sleep(1);
159 (void)time(&now);
160 }
161
162 return (WT_THREAD_RET_VALUE);
163 }
164
165 /*
166 * Function to monitor running operations and abort to dump core in the event
167 * that we catch an operation running long.
168 */
169 static WT_THREAD_RET
monitor(void * args)170 monitor(void *args)
171 {
172 TEST_PER_THREAD_OPTS *thread_args;
173 time_t now, start;
174 int ctr, i, last_ops[N_THREADS];
175
176 thread_args = (TEST_PER_THREAD_OPTS *)args;
177
178 (void)time(&start);
179 (void)time(&now);
180
181 memset(last_ops, 0, sizeof(int) + N_THREADS);
182
183 while (difftime(now, start) < RUNTIME) {
184 /*
185 * Checkpoints will run for slightly over MAX_EXECUTION_TIME.
186 * MAX_EXECUTION_TIME should always be long enough that we can
187 * complete any single operation in 1/2 that time.
188 */
189 sleep(MAX_EXECUTION_TIME / 2);
190
191 for (i = 0; i < N_THREADS; i++) {
192 ctr = thread_args[i].thread_counter;
193
194 /* Ignore any threads which may not have started yet. */
195 if (ctr == 0)
196 continue;
197
198 /*
199 * We track how many operations each thread has done. If
200 * we have slept and the counter remains the same for a
201 * thread it is stuck and should drop a core so the
202 * cause of the hang can be investigated.
203 */
204 if (ctr != last_ops[i])
205 last_ops[i] = ctr;
206 else {
207 printf("Thread %d had a task running"
208 " for more than %d seconds\n",
209 i, MAX_EXECUTION_TIME / 2);
210 abort();
211 }
212 }
213 (void)time(&now);
214 }
215
216 return (WT_THREAD_RET_VALUE);
217 }
218
219 /*
220 * Worker thread. Executes random operations from the set of 6.
221 */
222 static WT_THREAD_RET
do_ops(void * args)223 do_ops(void *args)
224 {
225 WT_RAND_STATE rnd;
226 time_t now, start;
227
228 __wt_random_init_seed(NULL, &rnd);
229 (void)time(&start);
230 (void)time(&now);
231
232 while (difftime(now, start) < RUNTIME) {
233 switch (__wt_random(&rnd) % 6) {
234 case 0:
235 op_bulk(args);
236 break;
237 case 1:
238 op_create(args);
239 break;
240 case 2:
241 op_cursor(args);
242 break;
243 case 3:
244 op_drop(args);
245 break;
246 case 4:
247 op_bulk_unique(args);
248 break;
249 case 5:
250 op_create_unique(args);
251 break;
252 }
253 (void)time(&now);
254 }
255
256 return (WT_THREAD_RET_VALUE);
257 }
258