1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
/*
 * __snapsort_partition --
 *	Partition step of the snapshot quick sort. Rearranges array[f..l]
 *	(both ends inclusive) around the pivot value and returns an index j
 *	such that no element in array[f..j] is greater than the pivot and no
 *	element in array[j+1..l] is less than it.
 */
static uint32_t
__snapsort_partition(uint64_t *array, uint32_t f, uint32_t l, uint64_t pivot)
{
	uint64_t swap;
	uint32_t hi, lo;

	/*
	 * Both cursors start one slot outside the range and are moved before
	 * they are used. When f is zero the subtraction wraps, and the first
	 * increment wraps it back to zero: unsigned arithmetic makes that
	 * well defined.
	 */
	lo = f - 1;
	hi = l + 1;

	for (;;) {
		/* Walk down from the top to an element not above the pivot. */
		do {
			--hi;
		} while (pivot < array[hi]);

		/* Walk up from the bottom to an element not below the pivot. */
		do {
			++lo;
		} while (array[lo] < pivot);

		/* Once the cursors meet or cross, the partition is complete. */
		if (lo >= hi)
			return (hi);

		swap = array[lo];
		array[lo] = array[hi];
		array[hi] = swap;
	}
}
35 
/*
 * __snapsort_impl --
 *	Quick sort pass over array[f..l] (both ends inclusive). Ranges where
 *	f + 16 is not less than l are left untouched, so the result is only
 *	coarsely ordered; __snapsort follows this with an insertion sort.
 */
static void
__snapsort_impl(uint64_t *array, uint32_t f, uint32_t l)
{
	uint64_t first, last, middle, pivot;
	uint32_t split;

	while (f + 16 < l) {
		first = array[f];
		last = array[l];
		middle = array[(f + l)/2];

		/* Use the median of the first, middle and last elements. */
		if (first < last) {
			if (middle < first)
				pivot = first;
			else
				pivot = WT_MIN(last, middle);
		} else {
			if (middle < last)
				pivot = last;
			else
				pivot = WT_MIN(first, middle);
		}

		/*
		 * Recurse into the lower part, then loop to handle the upper
		 * part in place rather than making a second recursive call.
		 */
		split = __snapsort_partition(array, f, l, pivot);
		__snapsort_impl(array, f, split);
		f = split + 1;
	}
}
53 
54 /*
55  * __snapsort --
56  *	Sort an array of transaction IDs.
57  */
58 static void
__snapsort(uint64_t * array,uint32_t size)59 __snapsort(uint64_t *array, uint32_t size)
60 {
61 	__snapsort_impl(array, 0, size - 1);
62 	WT_INSERTION_SORT(array, size, uint64_t, WT_TXNID_LT);
63 }
64 
65 /*
66  * __txn_remove_from_global_table --
67  *	Remove the txn id from the global txn table.
68  */
69 static inline void
__txn_remove_from_global_table(WT_SESSION_IMPL * session)70 __txn_remove_from_global_table(WT_SESSION_IMPL *session)
71 {
72 #ifdef HAVE_DIAGNOSTIC
73 	WT_TXN *txn;
74 	WT_TXN_GLOBAL *txn_global;
75 	WT_TXN_STATE *txn_state;
76 
77 	txn = &session->txn;
78 	txn_global = &S2C(session)->txn_global;
79 	txn_state = WT_SESSION_TXN_STATE(session);
80 
81 	WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running));
82 	WT_ASSERT(session,
83 	    txn->id != WT_TXN_NONE && txn_state->id != WT_TXN_NONE);
84 #else
85 	WT_TXN_STATE *txn_state;
86 
87 	txn_state = WT_SESSION_TXN_STATE(session);
88 #endif
89 	WT_PUBLISH(txn_state->id, WT_TXN_NONE);
90 }
91 
92 /*
93  * __txn_sort_snapshot --
94  *	Sort a snapshot for faster searching and set the min/max bounds.
95  */
96 static void
__txn_sort_snapshot(WT_SESSION_IMPL * session,uint32_t n,uint64_t snap_max)97 __txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max)
98 {
99 	WT_TXN *txn;
100 
101 	txn = &session->txn;
102 
103 	if (n > 1)
104 		__snapsort(txn->snapshot, n);
105 
106 	txn->snapshot_count = n;
107 	txn->snap_max = snap_max;
108 	txn->snap_min = (n > 0 && WT_TXNID_LE(txn->snapshot[0], snap_max)) ?
109 	    txn->snapshot[0] : snap_max;
110 	F_SET(txn, WT_TXN_HAS_SNAPSHOT);
111 	WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE);
112 }
113 
114 /*
115  * __wt_txn_release_snapshot --
116  *	Release the snapshot in the current transaction.
117  */
118 void
__wt_txn_release_snapshot(WT_SESSION_IMPL * session)119 __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
120 {
121 	WT_TXN *txn;
122 	WT_TXN_GLOBAL *txn_global;
123 	WT_TXN_STATE *txn_state;
124 
125 	txn = &session->txn;
126 	txn_global = &S2C(session)->txn_global;
127 	txn_state = WT_SESSION_TXN_STATE(session);
128 
129 	WT_ASSERT(session,
130 	    txn_state->pinned_id == WT_TXN_NONE ||
131 	    session->txn.isolation == WT_ISO_READ_UNCOMMITTED ||
132 	    !__wt_txn_visible_all(session, txn_state->pinned_id, NULL));
133 
134 	txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE;
135 	F_CLR(txn, WT_TXN_HAS_SNAPSHOT);
136 
137 	/* Clear a checkpoint's pinned ID. */
138 	if (WT_SESSION_IS_CHECKPOINT(session)) {
139 		txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
140 		__wt_timestamp_set_zero(&txn_global->checkpoint_timestamp);
141 	}
142 
143 	__wt_txn_clear_read_timestamp(session);
144 }
145 
/*
 * __wt_txn_get_snapshot --
 *	Allocate a snapshot.
 *
 *	Collects the IDs found in the global transaction state table into the
 *	session's snapshot array, publishes this session's pinned ID and sets
 *	the snapshot bounds. Returns without rescanning if the session already
 *	has a snapshot taken in the current commit generation.
 */
void
__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *s, *txn_state;
	uint64_t commit_gen, current_id, id, prev_oldest_id, pinned_id;
	uint32_t i, n, session_cnt;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);
	n = 0;

	/* Fast path if we already have the current snapshot. */
	if ((commit_gen = __wt_session_gen(session, WT_GEN_COMMIT)) != 0) {
		if (F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
		    commit_gen == __wt_gen(session, WT_GEN_COMMIT))
			return;
		/* The generation we entered is stale: leave it, re-enter. */
		__wt_session_gen_leave(session, WT_GEN_COMMIT);
	}
	__wt_session_gen_enter(session, WT_GEN_COMMIT);

	/* We're going to scan the table: wait for the lock. */
	__wt_readlock(session, &txn_global->rwlock);

	/*
	 * The current global ID is both the snapshot's upper bound and the
	 * starting value for the pinned ID; the scan below can only lower the
	 * pinned ID.
	 */
	current_id = pinned_id = txn_global->current;
	prev_oldest_id = txn_global->oldest_id;

	/*
	 * Include the checkpoint transaction, if one is running: we should
	 * ignore any uncommitted changes the checkpoint has written to the
	 * metadata.  We don't have to keep the checkpoint's changes pinned so
	 * don't include it in the published pinned ID.
	 */
	if ((id = txn_global->checkpoint_state.id) != WT_TXN_NONE) {
		txn->snapshot[n++] = id;
		txn_state->metadata_pinned = id;
	}

	/* For pure read-only workloads, avoid scanning. */
	if (prev_oldest_id == current_id) {
		txn_state->pinned_id = current_id;
		/* Check that the oldest ID has not moved in the meantime. */
		WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
		goto done;
	}

	/* Walk the array of concurrent transactions. */
	WT_ORDERED_READ(session_cnt, conn->session_cnt);
	for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
		/*
		 * Build our snapshot of any concurrent transaction IDs.
		 *
		 * Ignore:
		 *  - Our own ID: we always read our own updates.
		 *  - The ID if it is older than the oldest ID we saw. This
		 *    can happen if we race with a thread that is allocating
		 *    an ID -- the ID will not be used because the thread will
		 *    keep spinning until it gets a valid one.
		 */
		if (s != txn_state &&
		    (id = s->id) != WT_TXN_NONE &&
		    WT_TXNID_LE(prev_oldest_id, id)) {
			txn->snapshot[n++] = id;
			/* Track the smallest ID added during the scan. */
			if (WT_TXNID_LT(id, pinned_id))
				pinned_id = id;
		}
	}

	/*
	 * If we got a new snapshot, update the published pinned ID for this
	 * session.
	 */
	WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id));
	WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
	txn_state->pinned_id = pinned_id;

	/*
	 * Both paths arrive here holding the read lock. Release it, then sort
	 * the collected IDs and set the bounds, using the current ID read
	 * above as the snapshot maximum.
	 */
done:	__wt_readunlock(session, &txn_global->rwlock);
	__txn_sort_snapshot(session, n, current_id);
}
233 
234 /*
235  * __txn_oldest_scan --
236  *	Sweep the running transactions to calculate the oldest ID required.
237  */
238 static void
__txn_oldest_scan(WT_SESSION_IMPL * session,uint64_t * oldest_idp,uint64_t * last_runningp,uint64_t * metadata_pinnedp,WT_SESSION_IMPL ** oldest_sessionp)239 __txn_oldest_scan(WT_SESSION_IMPL *session,
240     uint64_t *oldest_idp, uint64_t *last_runningp, uint64_t *metadata_pinnedp,
241     WT_SESSION_IMPL **oldest_sessionp)
242 {
243 	WT_CONNECTION_IMPL *conn;
244 	WT_SESSION_IMPL *oldest_session;
245 	WT_TXN_GLOBAL *txn_global;
246 	WT_TXN_STATE *s;
247 	uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id;
248 	uint32_t i, session_cnt;
249 
250 	conn = S2C(session);
251 	txn_global = &conn->txn_global;
252 	oldest_session = NULL;
253 
254 	/* The oldest ID cannot change while we are holding the scan lock. */
255 	prev_oldest_id = txn_global->oldest_id;
256 	last_running = oldest_id = txn_global->current;
257 	if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE)
258 		metadata_pinned = oldest_id;
259 
260 	/* Walk the array of concurrent transactions. */
261 	WT_ORDERED_READ(session_cnt, conn->session_cnt);
262 	for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
263 		/* Update the last running transaction ID. */
264 		if ((id = s->id) != WT_TXN_NONE &&
265 		    WT_TXNID_LE(prev_oldest_id, id) &&
266 		    WT_TXNID_LT(id, last_running))
267 			last_running = id;
268 
269 		/* Update the metadata pinned ID. */
270 		if ((id = s->metadata_pinned) != WT_TXN_NONE &&
271 		    WT_TXNID_LT(id, metadata_pinned))
272 			metadata_pinned = id;
273 
274 		/*
275 		 * !!!
276 		 * Note: Don't ignore pinned ID values older than the previous
277 		 * oldest ID.  Read-uncommitted operations publish pinned ID
278 		 * values without acquiring the scan lock to protect the global
279 		 * table.  See the comment in __wt_txn_cursor_op for more
280 		 * details.
281 		 */
282 		if ((id = s->pinned_id) != WT_TXN_NONE &&
283 		    WT_TXNID_LT(id, oldest_id)) {
284 			oldest_id = id;
285 			oldest_session = &conn->sessions[i];
286 		}
287 	}
288 
289 	if (WT_TXNID_LT(last_running, oldest_id))
290 		oldest_id = last_running;
291 
292 	/* The oldest ID can't move past any named snapshots. */
293 	if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
294 	    WT_TXNID_LT(id, oldest_id))
295 		oldest_id = id;
296 
297 	/* The metadata pinned ID can't move past the oldest ID. */
298 	if (WT_TXNID_LT(oldest_id, metadata_pinned))
299 		metadata_pinned = oldest_id;
300 
301 	*last_runningp = last_running;
302 	*metadata_pinnedp = metadata_pinned;
303 	*oldest_idp = oldest_id;
304 	*oldest_sessionp = oldest_session;
305 }
306 
/*
 * __wt_txn_update_oldest --
 *	Sweep the running transactions to update the oldest ID required.
 *
 *	The flags argument may include WT_TXN_OLDEST_STRICT, which forces the
 *	update even when the IDs have moved by fewer than 100, and
 *	WT_TXN_OLDEST_WAIT, which blocks for the global transaction lock
 *	rather than giving up when it is busy.
 *
 *	Returns zero on success, including when there was nothing to do or,
 *	without WT_TXN_OLDEST_WAIT, when the lock was busy. Otherwise returns
 *	the error from the pinned timestamp update or from the lock attempt.
 */
int
__wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION_IMPL *oldest_session;
	WT_TXN_GLOBAL *txn_global;
	uint64_t current_id, last_running, metadata_pinned, oldest_id;
	uint64_t prev_last_running, prev_metadata_pinned, prev_oldest_id;
	bool strict, wait;

	conn = S2C(session);
	txn_global = &conn->txn_global;
	strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
	wait = LF_ISSET(WT_TXN_OLDEST_WAIT);

	/* Sample the global values, without locking, for the quick checks. */
	current_id = last_running = metadata_pinned = txn_global->current;
	prev_last_running = txn_global->last_running;
	prev_metadata_pinned = txn_global->metadata_pinned;
	prev_oldest_id = txn_global->oldest_id;

#ifdef HAVE_TIMESTAMPS
	/* Try to move the pinned timestamp forward. */
	if (strict)
		WT_RET(__wt_txn_update_pinned_timestamp(session, false));
#endif

	/*
	 * For pure read-only workloads, or if the update isn't forced and the
	 * oldest ID isn't too far behind, avoid scanning.
	 */
	if ((prev_oldest_id == current_id &&
	    prev_metadata_pinned == current_id) ||
	    (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
		return (0);

	/* First do a read-only scan. */
	if (wait)
		__wt_readlock(session, &txn_global->rwlock);
	else if ((ret =
	    __wt_try_readlock(session, &txn_global->rwlock)) != 0)
		/* A busy lock is not an error, we just skip the update. */
		return (ret == EBUSY ? 0 : ret);
	__txn_oldest_scan(session,
	    &oldest_id, &last_running, &metadata_pinned, &oldest_session);
	__wt_readunlock(session, &txn_global->rwlock);

	/*
	 * If the state hasn't changed (or hasn't moved far enough for
	 * non-forced updates), give up.
	 */
	if ((oldest_id == prev_oldest_id ||
	    (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
	    ((last_running == prev_last_running) ||
	    (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))) &&
	    metadata_pinned == prev_metadata_pinned)
		return (0);

	/* It looks like an update is necessary, wait for exclusive access. */
	if (wait)
		__wt_writelock(session, &txn_global->rwlock);
	else if ((ret =
	    __wt_try_writelock(session, &txn_global->rwlock)) != 0)
		/* As above, a busy lock means skip the update. */
		return (ret == EBUSY ? 0 : ret);

	/*
	 * If the oldest ID has been updated while we waited, don't bother
	 * scanning.
	 */
	if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
	    WT_TXNID_LE(last_running, txn_global->last_running) &&
	    WT_TXNID_LE(metadata_pinned, txn_global->metadata_pinned))
		goto done;

	/*
	 * Re-scan now that we have exclusive access.  This is necessary because
	 * threads get transaction snapshots with read locks, and we have to be
	 * sure that there isn't a thread that has got a snapshot locally but
	 * not yet published its snap_min.
	 */
	__txn_oldest_scan(session,
	    &oldest_id, &last_running, &metadata_pinned, &oldest_session);

#ifdef HAVE_DIAGNOSTIC
	{
	/*
	 * Make sure the ID doesn't move past any named snapshots.
	 *
	 * Don't include the read/assignment in the assert statement.  Coverity
	 * complains if there are assignments only done in diagnostic builds,
	 * and when the read is from a volatile.
	 */
	uint64_t id = txn_global->nsnap_oldest_id;
	WT_ASSERT(session,
	    id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
	}
#endif
	/* Update the public IDs: each one is only ever moved forward. */
	if (WT_TXNID_LT(txn_global->metadata_pinned, metadata_pinned))
		txn_global->metadata_pinned = metadata_pinned;
	if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
		txn_global->oldest_id = oldest_id;
	if (WT_TXNID_LT(txn_global->last_running, last_running)) {
		txn_global->last_running = last_running;

		/*
		 * Output a verbose message about long-running transactions,
		 * but only when some progress is being made.
		 */
		if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
		    current_id - oldest_id > 10000 && oldest_session != NULL) {
			__wt_verbose(session, WT_VERB_TRANSACTION,
			    "old snapshot %" PRIu64
			    " pinned in session %" PRIu32 " [%s]"
			    " with snap_min %" PRIu64,
			    oldest_id, oldest_session->id,
			    oldest_session->lastop,
			    oldest_session->txn.snap_min);
		}
	}

	/* Only reached with the write lock held; ret is zero here. */
done:	__wt_writeunlock(session, &txn_global->rwlock);
	return (ret);
}
432 
433 /*
434  * __wt_txn_config --
435  *	Configure a transaction.
436  */
437 int
__wt_txn_config(WT_SESSION_IMPL * session,const char * cfg[])438 __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
439 {
440 	WT_CONFIG_ITEM cval;
441 	WT_TXN *txn;
442 
443 	txn = &session->txn;
444 
445 	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
446 	if (cval.len != 0)
447 		txn->isolation =
448 		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
449 		    WT_ISO_SNAPSHOT :
450 		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
451 		    WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED;
452 
453 	/*
454 	 * The default sync setting is inherited from the connection, but can
455 	 * be overridden by an explicit "sync" setting for this transaction.
456 	 *
457 	 * We want to distinguish between inheriting implicitly and explicitly.
458 	 */
459 	F_CLR(txn, WT_TXN_SYNC_SET);
460 	WT_RET(__wt_config_gets_def(
461 	    session, cfg, "sync", (int)UINT_MAX, &cval));
462 	if (cval.val == 0 || cval.val == 1)
463 		/*
464 		 * This is an explicit setting of sync.  Set the flag so
465 		 * that we know not to overwrite it in commit_transaction.
466 		 */
467 		F_SET(txn, WT_TXN_SYNC_SET);
468 
469 	/*
470 	 * If sync is turned off explicitly, clear the transaction's sync field.
471 	 */
472 	if (cval.val == 0)
473 		txn->txn_logsync = 0;
474 
475 	WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
476 	if (cval.len > 0)
477 		/*
478 		 * The layering here isn't ideal - the named snapshot get
479 		 * function does both validation and setup. Otherwise we'd
480 		 * need to walk the list of named snapshots twice during
481 		 * transaction open.
482 		 */
483 		WT_RET(__wt_txn_named_snapshot_get(session, &cval));
484 
485 	/* Check if prepared updates should be ignored during reads. */
486 	WT_RET(__wt_config_gets_def(session, cfg, "ignore_prepare", 0, &cval));
487 	if (cval.val)
488 		F_SET(txn, WT_TXN_IGNORE_PREPARE);
489 
490 	WT_RET(__wt_txn_parse_read_timestamp(session, cfg));
491 
492 	return (0);
493 }
494 
495 /*
496  * __wt_txn_reconfigure --
497  *	WT_SESSION::reconfigure for transactions.
498  */
499 int
__wt_txn_reconfigure(WT_SESSION_IMPL * session,const char * config)500 __wt_txn_reconfigure(WT_SESSION_IMPL *session, const char *config)
501 {
502 	WT_CONFIG_ITEM cval;
503 	WT_DECL_RET;
504 	WT_TXN *txn;
505 
506 	txn = &session->txn;
507 
508 	ret = __wt_config_getones(session, config, "isolation", &cval);
509 	if (ret == 0 && cval.len != 0) {
510 		session->isolation = txn->isolation =
511 		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
512 		    WT_ISO_SNAPSHOT :
513 		    WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ?
514 		    WT_ISO_READ_UNCOMMITTED : WT_ISO_READ_COMMITTED;
515 	}
516 	WT_RET_NOTFOUND_OK(ret);
517 
518 	return (0);
519 }
520 
/*
 * __wt_txn_release --
 *	Release the resources associated with the current transaction.
 *
 *	Clears the transaction's ID (and, for a checkpoint session, the global
 *	checkpoint state), clears the commit timestamp, frees the logging
 *	scratch buffer, releases the snapshot and resets the transaction's
 *	isolation level, rollback reason and flags.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;

	txn = &session->txn;
	txn_global = &S2C(session)->txn_global;

	/* All modifications must have been resolved before we get here. */
	WT_ASSERT(session, txn->mod_count == 0);
	txn->notify = NULL;

	/* Clear the transaction's ID from the global table. */
	if (WT_SESSION_IS_CHECKPOINT(session)) {
		WT_ASSERT(session,
		    WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE);
		txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE;

		/*
		 * Be extra careful to cleanup everything for checkpoints: once
		 * the global checkpoint ID is cleared, we can no longer tell
		 * if this session is doing a checkpoint.
		 */
		txn_global->checkpoint_id = 0;
	} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
		/*
		 * If transaction is prepared, this would have been done in
		 * prepare.
		 */
		if (!F_ISSET(txn, WT_TXN_PREPARE))
			__txn_remove_from_global_table(session);
		txn->id = WT_TXN_NONE;
	}

	__wt_txn_clear_commit_timestamp(session);

	/* Free the scratch buffer allocated for logging. */
	__wt_logrec_free(session, &txn->logrec);

	/* Discard any memory from the session's stash that we can. */
	WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0);
	__wt_stash_discard(session);

	/*
	 * Reset the transaction state to not running and release the snapshot.
	 */
	__wt_txn_release_snapshot(session);
	/* Go back to the session's configured isolation level. */
	txn->isolation = session->isolation;

	txn->rollback_reason = NULL;

	/* Ensure the transaction flags are cleared on exit */
	txn->flags = 0;
}
579 
580 #ifdef	HAVE_TIMESTAMPS
581 /*
582  * __txn_commit_timestamp_validate --
583  *	Validate that timestamp provided to commit is legal.
584  */
585 static inline int
__txn_commit_timestamp_validate(WT_SESSION_IMPL * session)586 __txn_commit_timestamp_validate(WT_SESSION_IMPL *session)
587 {
588 	WT_DECL_TIMESTAMP(op_timestamp)
589 	WT_TXN *txn;
590 	WT_TXN_OP *op;
591 	WT_UPDATE *upd;
592 	u_int i;
593 	bool op_zero_ts, upd_zero_ts;
594 
595 	txn = &session->txn;
596 
597 	/*
598 	 * Debugging checks on timestamps, if user requested them.
599 	 */
600 	if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
601 	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
602 	    txn->mod_count != 0)
603 		WT_RET_MSG(session, EINVAL, "commit_timestamp required and "
604 		    "none set on this transaction");
605 	if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
606 	    F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
607 	    txn->mod_count != 0)
608 		WT_RET_MSG(session, EINVAL, "no commit_timestamp required and "
609 		    "timestamp set on this transaction");
610 
611 	/*
612 	 * If we're not doing any key consistency checking, we're done.
613 	 */
614 	if (!F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS))
615 		return (0);
616 
617 	/*
618 	 * Error on any valid update structures for the same key that
619 	 * are at a later timestamp or use timestamps inconsistently.
620 	 */
621 	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
622 		if (op->type == WT_TXN_OP_BASIC_COL ||
623 		    op->type == WT_TXN_OP_BASIC_ROW) {
624 			/*
625 			 * Skip over any aborted update structures or ones
626 			 * from our own transaction.
627 			 */
628 			upd = op->u.op_upd->next;
629 			while (upd != NULL && (upd->txnid == WT_TXN_ABORTED ||
630 			    upd->txnid == txn->id))
631 				upd = upd->next;
632 
633 			/*
634 			 * Check the timestamp on this update with the
635 			 * first valid update in the chain. They're in
636 			 * most recent order.
637 			 */
638 			if (upd == NULL)
639 				continue;
640 			/*
641 			 * Check for consistent per-key timestamp usage.
642 			 * If timestamps are or are not used originally then
643 			 * they should be used the same way always. For this
644 			 * transaction, timestamps are in use anytime the
645 			 * commit timestamp is set.
646 			 * Check timestamps are used in order.
647 			 */
648 			op_zero_ts = !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
649 			upd_zero_ts = __wt_timestamp_iszero(&upd->timestamp);
650 			if (op_zero_ts != upd_zero_ts)
651 				WT_RET_MSG(session, EINVAL,
652 				    "per-key timestamps used inconsistently");
653 			/*
654 			 * If we aren't using timestamps for this transaction
655 			 * then we are done checking. Don't check the timestamp
656 			 * because the one in the transaction is not cleared.
657 			 */
658 			if (op_zero_ts)
659 				continue;
660 
661 			op_timestamp = op->u.op_upd->timestamp;
662 			/*
663 			 * Only if the update structure doesn't have a timestamp
664 			 * then use the one in the transaction structure.
665 			 */
666 			if (__wt_timestamp_iszero(&op_timestamp))
667 				op_timestamp = txn->commit_timestamp;
668 			if (__wt_timestamp_cmp(&op_timestamp,
669 			    &upd->timestamp) < 0)
670 				WT_RET_MSG(session, EINVAL,
671 				    "out of order timestamps");
672 		}
673 	return (0);
674 }
675 #endif
676 
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 *
 *	The cfg argument is the configuration stack; "commit_timestamp" and
 *	"sync" are read from it, and it is passed on to the log commit and to
 *	rollback.
 *
 *	Returns zero on success. If anything fails before the updates are
 *	processed, the transaction is rolled back and the error is returned.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_OP *op;
	WT_UPDATE *upd;
	uint32_t fileid;
	u_int i;
	bool locked, readonly;
#ifdef HAVE_TIMESTAMPS
	wt_timestamp_t prev_commit_timestamp, ts;
	bool update_timestamp;
#endif

	txn = &session->txn;
	conn = S2C(session);
	txn_global = &conn->txn_global;
	/* Set once we hold the visibility lock, so every exit releases it. */
	locked = false;

	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
	    txn->mod_count == 0);

	/*
	 * Remember whether there were any modifications: the count is reset
	 * below, before the places that need to know.
	 */
	readonly = txn->mod_count == 0;
	/*
	 * Look for a commit timestamp.
	 */
	WT_ERR(
	    __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
	if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
		WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
		WT_ERR(__wt_timestamp_validate(session, "commit", &ts, &cval));
		__wt_timestamp_set(&txn->commit_timestamp, &ts);
		__wt_txn_set_commit_timestamp(session);
#else
		WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}
	if (F_ISSET(txn, WT_TXN_PREPARE) &&
	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
		WT_ERR_MSG(session, EINVAL,
		    "commit_timestamp is required for a prepared transaction");

#ifdef HAVE_TIMESTAMPS
	WT_ERR(__txn_commit_timestamp_validate(session));
#endif

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_ERR_MSG(session, EINVAL,
			    "Sync already set during begin_transaction");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_ERR(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a snapshot.
	 * If this transaction is prepared, then copying values would have been
	 * done during prepare.
	 */
	if (session->ncursors > 0 && !F_ISSET(txn, WT_TXN_PREPARE)) {
		WT_DIAGNOSTIC_YIELD;
		WT_ERR(__wt_session_copy_values(session));
	}

	/* If we are logging, write a commit log record. */
	if (txn->logrec != NULL &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		/*
		 * We hold the visibility lock for reading from the time
		 * we write our log record until the time we release our
		 * transaction so that the LSN any checkpoint gets will
		 * always reflect visible data.
		 */
		__wt_readlock(session, &txn_global->visibility_rwlock);
		locked = true;
		WT_ERR(__wt_txn_log_commit(session, cfg));
	}

	/* Note: we're going to commit: nothing can fail after this point. */

	/* Process and free updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
		fileid = op->btree->id;
		switch (op->type) {
		case WT_TXN_OP_NONE:
			break;
		case WT_TXN_OP_BASIC_COL:
		case WT_TXN_OP_BASIC_ROW:
		case WT_TXN_OP_INMEM_COL:
		case WT_TXN_OP_INMEM_ROW:
			upd = op->u.op_upd;

			/*
			 * Switch reserved operations to abort to
			 * simplify obsolete update list truncation.
			 */
			if (upd->type == WT_UPDATE_RESERVE) {
				upd->txnid = WT_TXN_ABORTED;
				break;
			}

			/*
			 * Writes to the lookaside file can be evicted as soon
			 * as they commit.
			 */
			if (conn->cache->las_fileid != 0 &&
			    fileid == conn->cache->las_fileid) {
				upd->txnid = WT_TXN_NONE;
				break;
			}
			/* FALLTHROUGH */
		case WT_TXN_OP_REF_DELETE:
#ifdef HAVE_TIMESTAMPS
			__wt_txn_op_set_timestamp(session, op);
#endif
			break;
		case WT_TXN_OP_TRUNCATE_COL:
		case WT_TXN_OP_TRUNCATE_ROW:
			/* Other operations don't need timestamps. */
			break;
		}

		__wt_txn_op_free(session, op);
	}
	txn->mod_count = 0;

#ifdef HAVE_TIMESTAMPS
	/*
	 * Track the largest commit timestamp we have seen.
	 *
	 * We don't actually clear the local commit timestamp, just the flag.
	 * That said, we can't update the global commit timestamp until this
	 * transaction is visible, which happens when we release it.
	 */
	update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
#endif

	__wt_txn_release(session);
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);

	/*
	 * If we have made some updates visible, start a new commit generation:
	 * any cached snapshots have to be refreshed.
	 */
	if (!readonly)
		(void)__wt_gen_next(session, WT_GEN_COMMIT);

#ifdef HAVE_TIMESTAMPS
	/* First check if we've already committed something in the future. */
	if (update_timestamp) {
		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp));
		update_timestamp = __wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0;
	}

	/*
	 * If it looks like we need to move the global commit timestamp,
	 * write lock and re-check.
	 */
	if (update_timestamp) {
#if WT_TIMESTAMP_SIZE == 8
		/*
		 * With 8-byte timestamps, a compare-and-swap loop is used
		 * instead of the write lock: retry with the latest global
		 * value until we install ours or the global is no longer
		 * behind ours.
		 */
		while (__wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
			if (__wt_atomic_cas64(
			    &txn_global->commit_timestamp.val,
			    prev_commit_timestamp.val,
			    txn->commit_timestamp.val)) {
				txn_global->has_commit_timestamp = true;
				break;
			}
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp);
		}
#else
		__wt_writelock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(&txn->commit_timestamp,
		    &txn_global->commit_timestamp) > 0) {
			__wt_timestamp_set(&txn_global->commit_timestamp,
			    &txn->commit_timestamp);
			txn_global->has_commit_timestamp = true;
		}
		__wt_writeunlock(session, &txn_global->rwlock);
#endif
	}
#endif

	/*
	 * We're between transactions, if we need to block for eviction, it's
	 * a good time to do so.  Note that we must ignore any error return
	 * because the user's data is committed.
	 */
	if (!readonly)
		(void)__wt_cache_eviction_check(session, false, false, NULL);
	return (0);

err:	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);
	WT_TRET(__wt_txn_rollback(session, cfg));
	return (ret);
}
943 
944 /*
945  * __wt_txn_prepare --
946  *	Prepare the current transaction.
947  */
948 int
__wt_txn_prepare(WT_SESSION_IMPL * session,const char * cfg[])949 __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
950 {
951 #ifdef HAVE_TIMESTAMPS
952 	WT_TXN *txn;
953 	WT_TXN_OP *op;
954 	WT_UPDATE *upd;
955 	wt_timestamp_t ts;
956 	u_int i;
957 
958 	txn = &session->txn;
959 
960 	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
961 	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);
962 	/* Transaction should not have updated any of the logged tables. */
963 	WT_ASSERT(session, txn->logrec == NULL);
964 
965 	WT_RET(__wt_txn_context_check(session, true));
966 
967 	/* Parse and validate the prepare timestamp.  */
968 	WT_RET(__wt_txn_parse_prepare_timestamp(session, cfg, &ts));
969 	__wt_timestamp_set(&txn->prepare_timestamp, &ts);
970 
971 	/*
972 	 * We are about to release the snapshot: copy values into any
973 	 * positioned cursors so they don't point to updates that could be
974 	 * freed once we don't have a snapshot.
975 	 */
976 	if (session->ncursors > 0) {
977 		WT_DIAGNOSTIC_YIELD;
978 		WT_RET(__wt_session_copy_values(session));
979 	}
980 
981 	/* Prepare updates. */
982 	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
983 		/* Assert it's not an update to the lookaside file. */
984 		WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 ||
985 		    !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
986 
987 		/* Metadata updates are never prepared. */
988 		if (WT_IS_METADATA(op->btree->dhandle))
989 			continue;
990 
991 		upd = op->u.op_upd;
992 
993 		switch (op->type) {
994 		case WT_TXN_OP_NONE:
995 			break;
996 		case WT_TXN_OP_BASIC_COL:
997 		case WT_TXN_OP_BASIC_ROW:
998 		case WT_TXN_OP_INMEM_COL:
999 		case WT_TXN_OP_INMEM_ROW:
1000 			/*
1001 			 * Switch reserved operation to abort to simplify
1002 			 * obsolete update list truncation. The object free
1003 			 * function clears the operation type so we don't
1004 			 * try to visit this update again: it can be evicted.
1005 			 */
1006 			if (upd->type == WT_UPDATE_RESERVE) {
1007 				upd->txnid = WT_TXN_ABORTED;
1008 				__wt_txn_op_free(session, op);
1009 				break;
1010 			}
1011 
1012 			/* Set prepare timestamp. */
1013 			__wt_timestamp_set(&upd->timestamp, &ts);
1014 
1015 			WT_PUBLISH(upd->prepare_state, WT_PREPARE_INPROGRESS);
1016 			break;
1017 		case WT_TXN_OP_REF_DELETE:
1018 			__wt_timestamp_set(
1019 			    &op->u.ref->page_del->timestamp, &ts);
1020 			WT_PUBLISH(op->u.ref->page_del->prepare_state,
1021 			    WT_PREPARE_INPROGRESS);
1022 			break;
1023 		case WT_TXN_OP_TRUNCATE_COL:
1024 		case WT_TXN_OP_TRUNCATE_ROW:
1025 			/* Other operations don't need timestamps. */
1026 			break;
1027 		}
1028 	}
1029 
1030 	/* Set transaction state to prepare. */
1031 	F_SET(&session->txn, WT_TXN_PREPARE);
1032 
1033 	/* Release our snapshot in case it is keeping data pinned. */
1034 	__wt_txn_release_snapshot(session);
1035 
1036 	/*
1037 	 * Clear the transaction's ID from the global table, to facilitate
1038 	 * prepared data visibility, but not from local txn structure.
1039 	 */
1040 	if (F_ISSET(txn, WT_TXN_HAS_ID))
1041 		__txn_remove_from_global_table(session);
1042 
1043 	return (0);
1044 #else
1045 	WT_UNUSED(cfg);
1046 	WT_RET_MSG(session, ENOTSUP, "prepare_transaction requires a version "
1047 	    "of WiredTiger built with timestamp support");
1048 #endif
1049 }
1050 
1051 /*
1052  * __wt_txn_rollback --
1053  *	Roll back the current transaction.
1054  */
1055 int
__wt_txn_rollback(WT_SESSION_IMPL * session,const char * cfg[])1056 __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
1057 {
1058 	WT_DECL_RET;
1059 	WT_TXN *txn;
1060 	WT_TXN_OP *op;
1061 	WT_UPDATE *upd;
1062 	u_int i;
1063 	bool readonly;
1064 
1065 	WT_UNUSED(cfg);
1066 
1067 	txn = &session->txn;
1068 	readonly = txn->mod_count == 0;
1069 	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
1070 
1071 	/* Rollback notification. */
1072 	if (txn->notify != NULL)
1073 		WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session,
1074 		    txn->id, 0));
1075 
1076 	/* Rollback updates. */
1077 	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
1078 		/* Assert it's not an update to the lookaside file. */
1079 		WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 ||
1080 		    !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
1081 
1082 		/* Metadata updates are never rolled back. */
1083 		if (WT_IS_METADATA(op->btree->dhandle))
1084 			continue;
1085 
1086 		upd = op->u.op_upd;
1087 
1088 		switch (op->type) {
1089 		case WT_TXN_OP_NONE:
1090 			break;
1091 		case WT_TXN_OP_BASIC_COL:
1092 		case WT_TXN_OP_BASIC_ROW:
1093 		case WT_TXN_OP_INMEM_COL:
1094 		case WT_TXN_OP_INMEM_ROW:
1095 			WT_ASSERT(session,
1096 			    upd->txnid == txn->id ||
1097 			    upd->txnid == WT_TXN_ABORTED);
1098 			upd->txnid = WT_TXN_ABORTED;
1099 			break;
1100 		case WT_TXN_OP_REF_DELETE:
1101 			WT_TRET(__wt_delete_page_rollback(session, op->u.ref));
1102 			break;
1103 		case WT_TXN_OP_TRUNCATE_COL:
1104 		case WT_TXN_OP_TRUNCATE_ROW:
1105 			/*
1106 			 * Nothing to do: these operations are only logged for
1107 			 * recovery.  The in-memory changes will be rolled back
1108 			 * with a combination of WT_TXN_OP_REF_DELETE and
1109 			 * WT_TXN_OP_INMEM operations.
1110 			 */
1111 			break;
1112 		}
1113 
1114 		__wt_txn_op_free(session, op);
1115 	}
1116 	txn->mod_count = 0;
1117 
1118 	__wt_txn_release(session);
1119 	/*
1120 	 * We're between transactions, if we need to block for eviction, it's
1121 	 * a good time to do so.  Note that we must ignore any error return
1122 	 * because the user's data is committed.
1123 	 */
1124 	if (!readonly)
1125 		(void)__wt_cache_eviction_check(session, false, false, NULL);
1126 	return (ret);
1127 }
1128 
1129 /*
1130  * __wt_txn_rollback_required --
1131  *	Prepare to log a reason if the user attempts to use the transaction to
1132  * do anything other than rollback.
1133  */
1134 int
__wt_txn_rollback_required(WT_SESSION_IMPL * session,const char * reason)1135 __wt_txn_rollback_required(WT_SESSION_IMPL *session, const char *reason)
1136 {
1137 	session->txn.rollback_reason = reason;
1138 	return (WT_ROLLBACK);
1139 }
1140 
1141 /*
1142  * __wt_txn_init --
1143  *	Initialize a session's transaction data.
1144  */
1145 int
__wt_txn_init(WT_SESSION_IMPL * session,WT_SESSION_IMPL * session_ret)1146 __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret)
1147 {
1148 	WT_TXN *txn;
1149 
1150 	txn = &session_ret->txn;
1151 	txn->id = WT_TXN_NONE;
1152 
1153 	WT_RET(__wt_calloc_def(session,
1154 	    S2C(session_ret)->session_size, &txn->snapshot));
1155 
1156 #ifdef HAVE_DIAGNOSTIC
1157 	if (S2C(session_ret)->txn_global.states != NULL) {
1158 		WT_TXN_STATE *txn_state;
1159 		txn_state = WT_SESSION_TXN_STATE(session_ret);
1160 		WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE);
1161 	}
1162 #endif
1163 
1164 	/*
1165 	 * Take care to clean these out in case we are reusing the transaction
1166 	 * for eviction.
1167 	 */
1168 	txn->mod = NULL;
1169 
1170 	txn->isolation = session_ret->isolation;
1171 	return (0);
1172 }
1173 
1174 /*
1175  * __wt_txn_stats_update --
1176  *	Update the transaction statistics for return to the application.
1177  */
1178 void
__wt_txn_stats_update(WT_SESSION_IMPL * session)1179 __wt_txn_stats_update(WT_SESSION_IMPL *session)
1180 {
1181 	WT_CONNECTION_IMPL *conn;
1182 	WT_CONNECTION_STATS **stats;
1183 	WT_TXN_GLOBAL *txn_global;
1184 	uint64_t checkpoint_pinned, snapshot_pinned;
1185 
1186 	conn = S2C(session);
1187 	txn_global = &conn->txn_global;
1188 	stats = conn->stats;
1189 	checkpoint_pinned = txn_global->checkpoint_state.pinned_id;
1190 	snapshot_pinned = txn_global->nsnap_oldest_id;
1191 
1192 	WT_STAT_SET(session, stats, txn_pinned_range,
1193 	    txn_global->current - txn_global->oldest_id);
1194 
1195 #if WT_TIMESTAMP_SIZE == 8
1196 	{
1197 	WT_DECL_TIMESTAMP(checkpoint_timestamp)
1198 	WT_DECL_TIMESTAMP(commit_timestamp)
1199 	WT_DECL_TIMESTAMP(pinned_timestamp)
1200 
1201 	checkpoint_timestamp = txn_global->checkpoint_timestamp;
1202 	commit_timestamp = txn_global->commit_timestamp;
1203 	pinned_timestamp = txn_global->pinned_timestamp;
1204 	if (checkpoint_timestamp.val != 0 &&
1205 	    checkpoint_timestamp.val < pinned_timestamp.val)
1206 		pinned_timestamp = checkpoint_timestamp;
1207 	WT_STAT_SET(session, stats, txn_pinned_timestamp,
1208 	    commit_timestamp.val - pinned_timestamp.val);
1209 	WT_STAT_SET(session, stats, txn_pinned_timestamp_checkpoint,
1210 	    commit_timestamp.val - checkpoint_timestamp.val);
1211 	WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest,
1212 	    commit_timestamp.val - txn_global->oldest_timestamp.val);
1213 	}
1214 #endif
1215 
1216 	WT_STAT_SET(session, stats, txn_pinned_snapshot_range,
1217 	    snapshot_pinned == WT_TXN_NONE ?
1218 	    0 : txn_global->current - snapshot_pinned);
1219 
1220 	WT_STAT_SET(session, stats, txn_pinned_checkpoint_range,
1221 	    checkpoint_pinned == WT_TXN_NONE ?
1222 	    0 : txn_global->current - checkpoint_pinned);
1223 
1224 	WT_STAT_SET(
1225 	    session, stats, txn_checkpoint_time_max, conn->ckpt_time_max);
1226 	WT_STAT_SET(
1227 	    session, stats, txn_checkpoint_time_min, conn->ckpt_time_min);
1228 	WT_STAT_SET(
1229 	    session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent);
1230 	WT_STAT_SET(
1231 	    session, stats, txn_checkpoint_time_total, conn->ckpt_time_total);
1232 	WT_STAT_SET(session,
1233 	    stats, txn_commit_queue_len, txn_global->commit_timestampq_len);
1234 	WT_STAT_SET(session,
1235 	    stats, txn_read_queue_len, txn_global->read_timestampq_len);
1236 }
1237 
1238 /*
1239  * __wt_txn_release_resources --
1240  *	Release resources for a session's transaction data.
1241  */
1242 void
__wt_txn_release_resources(WT_SESSION_IMPL * session)1243 __wt_txn_release_resources(WT_SESSION_IMPL *session)
1244 {
1245 	WT_TXN *txn;
1246 
1247 	txn = &session->txn;
1248 
1249 	WT_ASSERT(session, txn->mod_count == 0);
1250 	__wt_free(session, txn->mod);
1251 	txn->mod_alloc = 0;
1252 	txn->mod_count = 0;
1253 }
1254 
1255 /*
1256  * __wt_txn_destroy --
1257  *	Destroy a session's transaction data.
1258  */
1259 void
__wt_txn_destroy(WT_SESSION_IMPL * session)1260 __wt_txn_destroy(WT_SESSION_IMPL *session)
1261 {
1262 	__wt_txn_release_resources(session);
1263 	__wt_free(session, session->txn.snapshot);
1264 }
1265 
1266 /*
1267  * __wt_txn_global_init --
1268  *	Initialize the global transaction state.
1269  */
1270 int
__wt_txn_global_init(WT_SESSION_IMPL * session,const char * cfg[])1271 __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
1272 {
1273 	WT_CONNECTION_IMPL *conn;
1274 	WT_TXN_GLOBAL *txn_global;
1275 	WT_TXN_STATE *s;
1276 	u_int i;
1277 
1278 	WT_UNUSED(cfg);
1279 	conn = S2C(session);
1280 
1281 	txn_global = &conn->txn_global;
1282 	txn_global->current = txn_global->last_running =
1283 	    txn_global->metadata_pinned = txn_global->oldest_id = WT_TXN_FIRST;
1284 
1285 	WT_RET(__wt_spin_init(
1286 	    session, &txn_global->id_lock, "transaction id lock"));
1287 	WT_RWLOCK_INIT_TRACKED(session, &txn_global->rwlock, txn_global);
1288 	WT_RET(__wt_rwlock_init(session, &txn_global->visibility_rwlock));
1289 
1290 	WT_RWLOCK_INIT_TRACKED(session,
1291 	    &txn_global->commit_timestamp_rwlock, commit_timestamp);
1292 	TAILQ_INIT(&txn_global->commit_timestamph);
1293 
1294 	WT_RWLOCK_INIT_TRACKED(session,
1295 	    &txn_global->read_timestamp_rwlock, read_timestamp);
1296 	TAILQ_INIT(&txn_global->read_timestamph);
1297 
1298 	WT_RET(__wt_rwlock_init(session, &txn_global->nsnap_rwlock));
1299 	txn_global->nsnap_oldest_id = WT_TXN_NONE;
1300 	TAILQ_INIT(&txn_global->nsnaph);
1301 
1302 	WT_RET(__wt_calloc_def(
1303 	    session, conn->session_size, &txn_global->states));
1304 
1305 	for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
1306 		s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE;
1307 
1308 	return (0);
1309 }
1310 
1311 /*
1312  * __wt_txn_global_destroy --
1313  *	Destroy the global transaction state.
1314  */
1315 void
__wt_txn_global_destroy(WT_SESSION_IMPL * session)1316 __wt_txn_global_destroy(WT_SESSION_IMPL *session)
1317 {
1318 	WT_CONNECTION_IMPL *conn;
1319 	WT_TXN_GLOBAL *txn_global;
1320 
1321 	conn = S2C(session);
1322 	txn_global = &conn->txn_global;
1323 
1324 	if (txn_global == NULL)
1325 		return;
1326 
1327 	__wt_spin_destroy(session, &txn_global->id_lock);
1328 	__wt_rwlock_destroy(session, &txn_global->rwlock);
1329 	__wt_rwlock_destroy(session, &txn_global->commit_timestamp_rwlock);
1330 	__wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock);
1331 	__wt_rwlock_destroy(session, &txn_global->nsnap_rwlock);
1332 	__wt_rwlock_destroy(session, &txn_global->visibility_rwlock);
1333 	__wt_free(session, txn_global->states);
1334 }
1335 
1336 /*
1337  * __wt_txn_activity_drain --
1338  *	Wait for transactions to quiesce.
1339  */
1340 int
__wt_txn_activity_drain(WT_SESSION_IMPL * session)1341 __wt_txn_activity_drain(WT_SESSION_IMPL *session)
1342 {
1343 	bool txn_active;
1344 
1345 	/*
1346 	 * It's possible that the eviction server is in the middle of a long
1347 	 * operation, with a transaction ID pinned.  In that case, we will loop
1348 	 * here until the transaction ID is released, when the oldest
1349 	 * transaction ID will catch up with the current ID.
1350 	 */
1351 	for (;;) {
1352 		WT_RET(__wt_txn_activity_check(session, &txn_active));
1353 		if (!txn_active)
1354 			break;
1355 
1356 		WT_STAT_CONN_INCR(session, txn_release_blocked);
1357 		__wt_yield();
1358 	}
1359 
1360 	return (0);
1361 }
1362 
1363 /*
1364  * __wt_txn_global_shutdown --
1365  *	Shut down the global transaction state.
1366  */
1367 void
__wt_txn_global_shutdown(WT_SESSION_IMPL * session)1368 __wt_txn_global_shutdown(WT_SESSION_IMPL *session)
1369 {
1370 #ifdef HAVE_TIMESTAMPS
1371 	/*
1372 	 * All application transactions have completed, ignore the pinned
1373 	 * timestamp so that updates can be evicted from the cache during
1374 	 * connection close.
1375 	 *
1376 	 * Note that we are relying on a special case in __wt_txn_visible_all
1377 	 * that returns true during close when there is no pinned timestamp
1378 	 * set.
1379 	 */
1380 	S2C(session)->txn_global.has_pinned_timestamp = false;
1381 #else
1382 	WT_UNUSED(session);
1383 #endif
1384 }
1385 
1386 /*
1387  * __wt_verbose_dump_txn_one --
1388  *	Output diagnostic information about a transaction structure.
1389  */
1390 int
__wt_verbose_dump_txn_one(WT_SESSION_IMPL * session,WT_TXN * txn)1391 __wt_verbose_dump_txn_one(WT_SESSION_IMPL *session, WT_TXN *txn)
1392 {
1393 #ifdef HAVE_TIMESTAMPS
1394 	char hex_timestamp[3][2 * WT_TIMESTAMP_SIZE + 1];
1395 #endif
1396 	const char *iso_tag;
1397 
1398 	WT_NOT_READ(iso_tag, "INVALID");
1399 	switch (txn->isolation) {
1400 	case WT_ISO_READ_COMMITTED:
1401 		iso_tag = "WT_ISO_READ_COMMITTED";
1402 		break;
1403 	case WT_ISO_READ_UNCOMMITTED:
1404 		iso_tag = "WT_ISO_READ_UNCOMMITTED";
1405 		break;
1406 	case WT_ISO_SNAPSHOT:
1407 		iso_tag = "WT_ISO_SNAPSHOT";
1408 		break;
1409 	}
1410 #ifdef HAVE_TIMESTAMPS
1411 	WT_RET(__wt_timestamp_to_hex_string(
1412 	    session, hex_timestamp[0], &txn->commit_timestamp));
1413 	WT_RET(__wt_timestamp_to_hex_string(
1414 	    session, hex_timestamp[1], &txn->first_commit_timestamp));
1415 	WT_RET(__wt_timestamp_to_hex_string(
1416 	    session, hex_timestamp[2], &txn->read_timestamp));
1417 	WT_RET(__wt_msg(session,
1418 	    "mod count: %u"
1419 	    ", snap min: %" PRIu64
1420 	    ", snap max: %" PRIu64
1421 	    ", commit_timestamp: %s"
1422 	    ", first_commit_timestamp: %s"
1423 	    ", read_timestamp: %s"
1424 	    ", flags: 0x%08" PRIx32
1425 	    ", isolation: %s",
1426 	    txn->mod_count,
1427 	    txn->snap_min,
1428 	    txn->snap_max,
1429 	    hex_timestamp[0],
1430 	    hex_timestamp[1],
1431 	    hex_timestamp[2],
1432 	    txn->flags,
1433 	    iso_tag));
1434 #else
1435 	WT_RET(__wt_msg(session,
1436 	    "mod count: %u"
1437 	    ", snap min: %" PRIu64
1438 	    ", snap max: %" PRIu64
1439 	    ", flags: 0x%08" PRIx32
1440 	    ", isolation: %s",
1441 	    txn->mod_count,
1442 	    txn->snap_min,
1443 	    txn->snap_max,
1444 	    txn->flags,
1445 	    iso_tag));
1446 #endif
1447 	return (0);
1448 }
1449 
1450 /*
1451  * __wt_verbose_dump_txn --
1452  *	Output diagnostic information about the global transaction state.
1453  */
1454 int
__wt_verbose_dump_txn(WT_SESSION_IMPL * session)1455 __wt_verbose_dump_txn(WT_SESSION_IMPL *session)
1456 {
1457 	WT_CONNECTION_IMPL *conn;
1458 	WT_SESSION_IMPL *sess;
1459 	WT_TXN_GLOBAL *txn_global;
1460 	WT_TXN_STATE *s;
1461 	uint64_t id;
1462 	uint32_t i, session_cnt;
1463 #ifdef HAVE_TIMESTAMPS
1464 	char hex_timestamp[3][2 * WT_TIMESTAMP_SIZE + 1];
1465 #endif
1466 
1467 	conn = S2C(session);
1468 	txn_global = &conn->txn_global;
1469 
1470 	WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
1471 	WT_RET(__wt_msg(session, "transaction state dump"));
1472 
1473 	WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current));
1474 	WT_RET(__wt_msg(session,
1475 	    "last running ID: %" PRIu64, txn_global->last_running));
1476 	WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id));
1477 
1478 #ifdef HAVE_TIMESTAMPS
1479 	WT_RET(__wt_timestamp_to_hex_string(
1480 	    session, hex_timestamp[0], &txn_global->commit_timestamp));
1481 	WT_RET(__wt_msg(session, "commit timestamp: %s", hex_timestamp[0]));
1482 	WT_RET(__wt_timestamp_to_hex_string(
1483 	    session, hex_timestamp[0], &txn_global->oldest_timestamp));
1484 	WT_RET(__wt_msg(session, "oldest timestamp: %s", hex_timestamp[0]));
1485 	WT_RET(__wt_timestamp_to_hex_string(
1486 	    session, hex_timestamp[0], &txn_global->pinned_timestamp));
1487 	WT_RET(__wt_msg(session, "pinned timestamp: %s", hex_timestamp[0]));
1488 	WT_RET(__wt_timestamp_to_hex_string(
1489 	    session, hex_timestamp[0], &txn_global->stable_timestamp));
1490 	WT_RET(__wt_msg(session, "stable timestamp: %s", hex_timestamp[0]));
1491 	WT_RET(__wt_msg(session, "has_commit_timestamp: %s",
1492 	    txn_global->has_commit_timestamp ? "yes" : "no"));
1493 	WT_RET(__wt_msg(session, "has_oldest_timestamp: %s",
1494 	    txn_global->has_oldest_timestamp ? "yes" : "no"));
1495 	WT_RET(__wt_msg(session, "has_pinned_timestamp: %s",
1496 	    txn_global->has_pinned_timestamp ? "yes" : "no"));
1497 	WT_RET(__wt_msg(session, "has_stable_timestamp: %s",
1498 	    txn_global->has_stable_timestamp ? "yes" : "no"));
1499 	WT_RET(__wt_msg(session, "oldest_is_pinned: %s",
1500 	    txn_global->oldest_is_pinned ? "yes" : "no"));
1501 	WT_RET(__wt_msg(session, "stable_is_pinned: %s",
1502 	    txn_global->stable_is_pinned ? "yes" : "no"));
1503 #endif
1504 
1505 	WT_RET(__wt_msg(session, "checkpoint running: %s",
1506 	    txn_global->checkpoint_running ? "yes" : "no"));
1507 	WT_RET(__wt_msg(session, "checkpoint generation: %" PRIu64,
1508 	    __wt_gen(session, WT_GEN_CHECKPOINT)));
1509 	WT_RET(__wt_msg(session, "checkpoint pinned ID: %" PRIu64,
1510 	    txn_global->checkpoint_state.pinned_id));
1511 	WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64,
1512 	    txn_global->checkpoint_state.id));
1513 
1514 	WT_RET(__wt_msg(session,
1515 	    "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id));
1516 
1517 	WT_ORDERED_READ(session_cnt, conn->session_cnt);
1518 	WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt));
1519 	WT_RET(__wt_msg(session, "Transaction state of active sessions:"));
1520 
1521 	/*
1522 	 * Walk each session transaction state and dump information. Accessing
1523 	 * the content of session handles is not thread safe, so some
1524 	 * information may change while traversing if other threads are active
1525 	 * at the same time, which is OK since this is diagnostic code.
1526 	 */
1527 	for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
1528 		/* Skip sessions with no active transaction */
1529 		if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
1530 			continue;
1531 		sess = &conn->sessions[i];
1532 		WT_RET(__wt_msg(session,
1533 		    "ID: %" PRIu64
1534 		    ", pinned ID: %" PRIu64
1535 		    ", metadata pinned ID: %" PRIu64
1536 		    ", name: %s",
1537 		    id, s->pinned_id, s->metadata_pinned,
1538 		    sess->name == NULL ?
1539 		    "EMPTY" : sess->name));
1540 		WT_RET(__wt_verbose_dump_txn_one(sess, &sess->txn));
1541 	}
1542 
1543 	return (0);
1544 }
1545