1 /*-
2  * Public Domain 2014-2018 MongoDB, Inc.
3  * Public Domain 2008-2014 WiredTiger, Inc.
4  *
5  * This is free and unencumbered software released into the public domain.
6  *
7  * Anyone is free to copy, modify, publish, use, compile, sell, or
8  * distribute this software, either in source code form or as a compiled
9  * binary, for any purpose, commercial or non-commercial, and by any
10  * means.
11  *
12  * In jurisdictions that recognize copyright laws, the author or authors
13  * of this software dedicate any and all copyright interest in the
14  * software to the public domain. We make this dedication for the benefit
15  * of the public at large and to the detriment of our heirs and
16  * successors. We intend this dedication to be an overt act of
17  * relinquishment in perpetuity of all present and future rights to this
18  * software under copyright law.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
24  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
25  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26  * OTHER DEALINGS IN THE SOFTWARE.
27  */
28 #include "test_util.h"
29 
30 /*
31  * JIRA ticket reference: WT-3363
32  *
33  * Test case description: There are a number of operations that we run that we
34  * expect not to conflict with or block against a running checkpoint. This test
35  * aims to run repeated checkpoints in a thread, while running an assortment
36  * of operations that we expect to execute quickly on further threads. To
37  * ensure that we catch any blockages we introduce a very large delay into the
38  * checkpoint and measure that no operation takes 1/2 the length of this delay.
39  *
40  * Failure mode: We monitor the execution time of all operations and if we find
41  * any operation taking longer than 1/2 the delay time, we abort dumping a core
42  * file which can be used to determine what operation was blocked.
43  */
44 static WT_THREAD_RET do_checkpoints(void *);
45 static WT_THREAD_RET do_ops(void *);
46 static WT_THREAD_RET monitor(void *);
47 
48 /*
49  * Time delay to introduce into checkpoints in seconds. Should be at-least
50  * double the maximum time that any one of the operations should take. Currently
51  * this is set to 10 seconds and we expect no single operation to take longer
52  * than 5 seconds.
53  */
54 #define	MAX_EXECUTION_TIME	10
55 #define	N_THREADS		10
56 
57 /*
58  * Number of seconds to execute for. Initially set to 15 minutes, as we need to
59  * run long enough to be certain we have captured any blockages. In initial
60  * testing 5 minutes was enough to reproduce the issue, so we run for 3x that
61  * here to ensure we reproduce before declaring success.
62  */
63 #define	RUNTIME	900.0
64 
65 static WT_EVENT_HANDLER event_handler = {
66 	handle_op_error,
67 	handle_op_message,
68 	NULL,
69 	NULL
70 };
71 
72 int
main(int argc,char * argv[])73 main(int argc, char *argv[])
74 {
75 	TEST_OPTS *opts, _opts;
76 	TEST_PER_THREAD_OPTS thread_args[N_THREADS];
77 	pthread_t ckpt_thread, mon_thread, threads[N_THREADS];
78 	int i;
79 
80 	/*
81 	 * This test should not run unless long tests flag is set. The test
82 	 * runs for 15 minutes.
83 	 */
84 	if (!testutil_is_flag_set("TESTUTIL_ENABLE_TIMING_TESTS"))
85 		return (EXIT_SUCCESS);
86 
87 	opts = &_opts;
88 	opts->unique_id = 0;
89 	memset(opts, 0, sizeof(*opts));
90 
91 	testutil_check(testutil_parse_opts(argc, argv, opts));
92 	testutil_make_work_dir(opts->home);
93 
94 	testutil_check(wiredtiger_open(opts->home, &event_handler,
95 	    "create,cache_size=1G,timing_stress_for_test=[checkpoint_slow]",
96 	    &opts->conn));
97 
98 	testutil_check(pthread_create(
99 	    &ckpt_thread, NULL, do_checkpoints, opts));
100 
101 	for (i = 0; i < N_THREADS; ++i) {
102 		thread_args[i].testopts = opts;
103 		thread_args[i].thread_counter = 0;
104 		thread_args[i].threadnum = i;
105 		testutil_check(pthread_create(
106 		    &threads[i], NULL, do_ops, &thread_args[i]));
107 	}
108 
109 	/*
110 	 * Pass the whole array of thread arguments to the monitoring thread.
111 	 * This thread will need to monitor each threads counter to track if it
112 	 * is stuck.
113 	 */
114 	testutil_check(pthread_create(&mon_thread, NULL, monitor, thread_args));
115 
116 	for (i = 0; i < N_THREADS; ++i)
117 		testutil_check(pthread_join(threads[i], NULL));
118 
119 	testutil_check(pthread_join(mon_thread, NULL));
120 
121 	testutil_check(pthread_join(ckpt_thread, NULL));
122 
123 	printf("Success\n");
124 
125 	testutil_cleanup(opts);
126 	return (EXIT_SUCCESS);
127 }
128 
129 /*
130  * Function for repeatedly running checkpoint operations.
131  */
132 static WT_THREAD_RET
do_checkpoints(void * _opts)133 do_checkpoints(void *_opts)
134 {
135 	TEST_OPTS *opts;
136 	WT_SESSION *session;
137 	time_t now, start;
138 	int ret;
139 
140 	opts = (TEST_OPTS *)_opts;
141 	(void)time(&start);
142 	(void)time(&now);
143 
144 	while (difftime(now, start) < RUNTIME) {
145 		testutil_check(
146 		    opts->conn->open_session(opts->conn, NULL, NULL, &session));
147 
148 		if ((ret = session->checkpoint(session, "force")) != 0)
149 			if (ret != EBUSY && ret != ENOENT)
150 				testutil_die(ret, "session.checkpoint");
151 
152 		testutil_check(session->close(session, NULL));
153 
154 		/*
155 		 * A short sleep to let operations process and avoid back to
156 		 * back checkpoints locking up resources.
157 		 */
158 		sleep(1);
159 		(void)time(&now);
160 	}
161 
162 	return (WT_THREAD_RET_VALUE);
163 }
164 
165 /*
166  * Function to monitor running operations and abort to dump core in the event
167  * that we catch an operation running long.
168  */
169 static WT_THREAD_RET
monitor(void * args)170 monitor(void *args)
171 {
172 	TEST_PER_THREAD_OPTS *thread_args;
173 	time_t now, start;
174 	int ctr, i, last_ops[N_THREADS];
175 
176 	thread_args = (TEST_PER_THREAD_OPTS *)args;
177 
178 	(void)time(&start);
179 	(void)time(&now);
180 
181 	memset(last_ops, 0, sizeof(int) + N_THREADS);
182 
183 	while (difftime(now, start) < RUNTIME) {
184 		/*
185 		 * Checkpoints will run for slightly over MAX_EXECUTION_TIME.
186 		 * MAX_EXECUTION_TIME should always be long enough that we can
187 		 * complete any single operation in 1/2 that time.
188 		 */
189 		sleep(MAX_EXECUTION_TIME / 2);
190 
191 		for (i = 0; i < N_THREADS; i++) {
192 			ctr = thread_args[i].thread_counter;
193 
194 			/* Ignore any threads which may not have started yet. */
195 			if (ctr == 0)
196 				continue;
197 
198 			/*
199 			 * We track how many operations each thread has done. If
200 			 * we have slept and the counter remains the same for a
201 			 * thread it is stuck and should drop a core so the
202 			 * cause of the hang can be investigated.
203 			 */
204 			if (ctr != last_ops[i])
205 				last_ops[i] = ctr;
206 			else {
207 				printf("Thread %d had a task running"
208 				    " for more than %d seconds\n",
209 				    i, MAX_EXECUTION_TIME / 2);
210 				abort();
211 			}
212 		}
213 		(void)time(&now);
214 	}
215 
216 	return (WT_THREAD_RET_VALUE);
217 }
218 
219 /*
220  * Worker thread. Executes random operations from the set of 6.
221  */
222 static WT_THREAD_RET
do_ops(void * args)223 do_ops(void *args)
224 {
225 	WT_RAND_STATE rnd;
226 	time_t now, start;
227 
228 	__wt_random_init_seed(NULL, &rnd);
229 	(void)time(&start);
230 	(void)time(&now);
231 
232 	while (difftime(now, start) < RUNTIME) {
233 		switch (__wt_random(&rnd) % 6) {
234 		case 0:
235 			op_bulk(args);
236 			break;
237 		case 1:
238 			op_create(args);
239 			break;
240 		case 2:
241 			op_cursor(args);
242 			break;
243 		case 3:
244 			op_drop(args);
245 			break;
246 		case 4:
247 			op_bulk_unique(args);
248 			break;
249 		case 5:
250 			op_create_unique(args);
251 			break;
252 		}
253 		(void)time(&now);
254 	}
255 
256 	return (WT_THREAD_RET_VALUE);
257 }
258