/*-
 * Copyright (c) 2014-2018 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */
8
9 #include "wt_internal.h"
10
/*
 * __snapsort_partition --
 *	Custom quick sort partitioning for snapshots.
 *
 *	Partition array[f..l] (both bounds inclusive) around pivot using two
 *	converging indices, and return the final position of the right-hand
 *	index. On return, every element in array[f..ret] is <= pivot and
 *	every element in array[ret + 1..l] is >= pivot.
 *
 *	The inner scans have no bounds checks: they rely on the range holding
 *	at least one value >= pivot and one value <= pivot to stop them. The
 *	caller in this file passes the median of three elements taken from
 *	the range.
 */
static uint32_t
__snapsort_partition(uint64_t *array, uint32_t f, uint32_t l, uint64_t pivot)
{
	uint64_t tmp;
	uint32_t i, j;

	/*
	 * Start one slot outside the range on each side: both scans step
	 * before they read. When f is 0, i wraps to UINT32_MAX and the first
	 * increment wraps it back to 0 (unsigned arithmetic, so the wrap is
	 * well-defined).
	 */
	i = f - 1;
	j = l + 1;
	for (;;) {
		/* Scan down from the right for an element <= pivot. */
		while (pivot < array[--j])
			;
		/* Scan up from the left for an element >= pivot. */
		while (array[++i] < pivot)
			;

		/* The indices met or crossed: the partition is complete. */
		if (i >= j)
			return (j);

		/* Both elements are on the wrong side: exchange them. */
		tmp = array[i];
		array[i] = array[j];
		array[j] = tmp;
	}
}
35
/*
 * __snapsort_impl --
 *	Custom quick sort implementation for snapshots.
 *
 *	Coarsely order array[f..l] (both bounds inclusive). Ranges where
 *	l - f is 16 or less are left untouched; __snapsort runs an insertion
 *	sort over the whole array afterwards.
 */
static void
__snapsort_impl(uint64_t *array, uint32_t f, uint32_t l)
{
	while (f + 16 < l) {
		/* Pick the median of the first, last and middle elements. */
		uint64_t v1 = array[f], v2 = array[l], v3 = array[(f + l)/2];
		uint64_t median = v1 < v2 ?
		    (v3 < v1 ? v1 : WT_MIN(v2, v3)) :
		    (v3 < v2 ? v2 : WT_MIN(v1, v3));
		uint32_t m = __snapsort_partition(array, f, l, median);

		/* Recurse on the left part, loop on the right part. */
		__snapsort_impl(array, f, m);
		f = m + 1;
	}
}
53
54 /*
55 * __snapsort --
56 * Sort an array of transaction IDs.
57 */
58 static void
__snapsort(uint64_t * array,uint32_t size)59 __snapsort(uint64_t *array, uint32_t size)
60 {
61 __snapsort_impl(array, 0, size - 1);
62 WT_INSERTION_SORT(array, size, uint64_t, WT_TXNID_LT);
63 }
64
65 /*
66 * __txn_remove_from_global_table --
67 * Remove the txn id from the global txn table.
68 */
69 static inline void
__txn_remove_from_global_table(WT_SESSION_IMPL * session)70 __txn_remove_from_global_table(WT_SESSION_IMPL *session)
71 {
72 #ifdef HAVE_DIAGNOSTIC
73 WT_TXN *txn;
74 WT_TXN_GLOBAL *txn_global;
75 WT_TXN_STATE *txn_state;
76
77 txn = &session->txn;
78 txn_global = &S2C(session)->txn_global;
79 txn_state = WT_SESSION_TXN_STATE(session);
80
81 WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running));
82 WT_ASSERT(session,
83 txn->id != WT_TXN_NONE && txn_state->id != WT_TXN_NONE);
84 #else
85 WT_TXN_STATE *txn_state;
86
87 txn_state = WT_SESSION_TXN_STATE(session);
88 #endif
89 WT_PUBLISH(txn_state->id, WT_TXN_NONE);
90 }
91
92 /*
93 * __txn_sort_snapshot --
94 * Sort a snapshot for faster searching and set the min/max bounds.
95 */
96 static void
__txn_sort_snapshot(WT_SESSION_IMPL * session,uint32_t n,uint64_t snap_max)97 __txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max)
98 {
99 WT_TXN *txn;
100
101 txn = &session->txn;
102
103 if (n > 1)
104 __snapsort(txn->snapshot, n);
105
106 txn->snapshot_count = n;
107 txn->snap_max = snap_max;
108 txn->snap_min = (n > 0 && WT_TXNID_LE(txn->snapshot[0], snap_max)) ?
109 txn->snapshot[0] : snap_max;
110 F_SET(txn, WT_TXN_HAS_SNAPSHOT);
111 WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE);
112 }
113
114 /*
115 * __wt_txn_release_snapshot --
116 * Release the snapshot in the current transaction.
117 */
118 void
__wt_txn_release_snapshot(WT_SESSION_IMPL * session)119 __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
120 {
121 WT_TXN *txn;
122 WT_TXN_GLOBAL *txn_global;
123 WT_TXN_STATE *txn_state;
124
125 txn = &session->txn;
126 txn_global = &S2C(session)->txn_global;
127 txn_state = WT_SESSION_TXN_STATE(session);
128
129 WT_ASSERT(session,
130 txn_state->pinned_id == WT_TXN_NONE ||
131 session->txn.isolation == WT_ISO_READ_UNCOMMITTED ||
132 !__wt_txn_visible_all(session, txn_state->pinned_id, NULL));
133
134 txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE;
135 F_CLR(txn, WT_TXN_HAS_SNAPSHOT);
136
137 /* Clear a checkpoint's pinned ID. */
138 if (WT_SESSION_IS_CHECKPOINT(session)) {
139 txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
140 __wt_timestamp_set_zero(&txn_global->checkpoint_timestamp);
141 }
142
143 __wt_txn_clear_read_timestamp(session);
144 }
145
146 /*
147 * __wt_txn_get_snapshot --
148 * Allocate a snapshot.
149 */
150 void
__wt_txn_get_snapshot(WT_SESSION_IMPL * session)151 __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
152 {
153 WT_CONNECTION_IMPL *conn;
154 WT_TXN *txn;
155 WT_TXN_GLOBAL *txn_global;
156 WT_TXN_STATE *s, *txn_state;
157 uint64_t commit_gen, current_id, id, prev_oldest_id, pinned_id;
158 uint32_t i, n, session_cnt;
159
160 conn = S2C(session);
161 txn = &session->txn;
162 txn_global = &conn->txn_global;
163 txn_state = WT_SESSION_TXN_STATE(session);
164 n = 0;
165
166 /* Fast path if we already have the current snapshot. */
167 if ((commit_gen = __wt_session_gen(session, WT_GEN_COMMIT)) != 0) {
168 if (F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
169 commit_gen == __wt_gen(session, WT_GEN_COMMIT))
170 return;
171 __wt_session_gen_leave(session, WT_GEN_COMMIT);
172 }
173 __wt_session_gen_enter(session, WT_GEN_COMMIT);
174
175 /* We're going to scan the table: wait for the lock. */
176 __wt_readlock(session, &txn_global->rwlock);
177
178 current_id = pinned_id = txn_global->current;
179 prev_oldest_id = txn_global->oldest_id;
180
181 /*
182 * Include the checkpoint transaction, if one is running: we should
183 * ignore any uncommitted changes the checkpoint has written to the
184 * metadata. We don't have to keep the checkpoint's changes pinned so
185 * don't including it in the published pinned ID.
186 */
187 if ((id = txn_global->checkpoint_state.id) != WT_TXN_NONE) {
188 txn->snapshot[n++] = id;
189 txn_state->metadata_pinned = id;
190 }
191
192 /* For pure read-only workloads, avoid scanning. */
193 if (prev_oldest_id == current_id) {
194 txn_state->pinned_id = current_id;
195 /* Check that the oldest ID has not moved in the meantime. */
196 WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
197 goto done;
198 }
199
200 /* Walk the array of concurrent transactions. */
201 WT_ORDERED_READ(session_cnt, conn->session_cnt);
202 for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
203 /*
204 * Build our snapshot of any concurrent transaction IDs.
205 *
206 * Ignore:
207 * - Our own ID: we always read our own updates.
208 * - The ID if it is older than the oldest ID we saw. This
209 * can happen if we race with a thread that is allocating
210 * an ID -- the ID will not be used because the thread will
211 * keep spinning until it gets a valid one.
212 */
213 if (s != txn_state &&
214 (id = s->id) != WT_TXN_NONE &&
215 WT_TXNID_LE(prev_oldest_id, id)) {
216 txn->snapshot[n++] = id;
217 if (WT_TXNID_LT(id, pinned_id))
218 pinned_id = id;
219 }
220 }
221
222 /*
223 * If we got a new snapshot, update the published pinned ID for this
224 * session.
225 */
226 WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id));
227 WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
228 txn_state->pinned_id = pinned_id;
229
230 done: __wt_readunlock(session, &txn_global->rwlock);
231 __txn_sort_snapshot(session, n, current_id);
232 }
233
234 /*
235 * __txn_oldest_scan --
236 * Sweep the running transactions to calculate the oldest ID required.
237 */
238 static void
__txn_oldest_scan(WT_SESSION_IMPL * session,uint64_t * oldest_idp,uint64_t * last_runningp,uint64_t * metadata_pinnedp,WT_SESSION_IMPL ** oldest_sessionp)239 __txn_oldest_scan(WT_SESSION_IMPL *session,
240 uint64_t *oldest_idp, uint64_t *last_runningp, uint64_t *metadata_pinnedp,
241 WT_SESSION_IMPL **oldest_sessionp)
242 {
243 WT_CONNECTION_IMPL *conn;
244 WT_SESSION_IMPL *oldest_session;
245 WT_TXN_GLOBAL *txn_global;
246 WT_TXN_STATE *s;
247 uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id;
248 uint32_t i, session_cnt;
249
250 conn = S2C(session);
251 txn_global = &conn->txn_global;
252 oldest_session = NULL;
253
254 /* The oldest ID cannot change while we are holding the scan lock. */
255 prev_oldest_id = txn_global->oldest_id;
256 last_running = oldest_id = txn_global->current;
257 if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE)
258 metadata_pinned = oldest_id;
259
260 /* Walk the array of concurrent transactions. */
261 WT_ORDERED_READ(session_cnt, conn->session_cnt);
262 for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
263 /* Update the last running transaction ID. */
264 if ((id = s->id) != WT_TXN_NONE &&
265 WT_TXNID_LE(prev_oldest_id, id) &&
266 WT_TXNID_LT(id, last_running))
267 last_running = id;
268
269 /* Update the metadata pinned ID. */
270 if ((id = s->metadata_pinned) != WT_TXN_NONE &&
271 WT_TXNID_LT(id, metadata_pinned))
272 metadata_pinned = id;
273
274 /*
275 * !!!
276 * Note: Don't ignore pinned ID values older than the previous
277 * oldest ID. Read-uncommitted operations publish pinned ID
278 * values without acquiring the scan lock to protect the global
279 * table. See the comment in __wt_txn_cursor_op for more
280 * details.
281 */
282 if ((id = s->pinned_id) != WT_TXN_NONE &&
283 WT_TXNID_LT(id, oldest_id)) {
284 oldest_id = id;
285 oldest_session = &conn->sessions[i];
286 }
287 }
288
289 if (WT_TXNID_LT(last_running, oldest_id))
290 oldest_id = last_running;
291
292 /* The oldest ID can't move past any named snapshots. */
293 if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
294 WT_TXNID_LT(id, oldest_id))
295 oldest_id = id;
296
297 /* The metadata pinned ID can't move past the oldest ID. */
298 if (WT_TXNID_LT(oldest_id, metadata_pinned))
299 metadata_pinned = oldest_id;
300
301 *last_runningp = last_running;
302 *metadata_pinnedp = metadata_pinned;
303 *oldest_idp = oldest_id;
304 *oldest_sessionp = oldest_session;
305 }
306
307 /*
308 * __wt_txn_update_oldest --
309 * Sweep the running transactions to update the oldest ID required.
310 */
311 int
__wt_txn_update_oldest(WT_SESSION_IMPL * session,uint32_t flags)312 __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
313 {
314 WT_CONNECTION_IMPL *conn;
315 WT_DECL_RET;
316 WT_SESSION_IMPL *oldest_session;
317 WT_TXN_GLOBAL *txn_global;
318 uint64_t current_id, last_running, metadata_pinned, oldest_id;
319 uint64_t prev_last_running, prev_metadata_pinned, prev_oldest_id;
320 bool strict, wait;
321
322 conn = S2C(session);
323 txn_global = &conn->txn_global;
324 strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
325 wait = LF_ISSET(WT_TXN_OLDEST_WAIT);
326
327 current_id = last_running = metadata_pinned = txn_global->current;
328 prev_last_running = txn_global->last_running;
329 prev_metadata_pinned = txn_global->metadata_pinned;
330 prev_oldest_id = txn_global->oldest_id;
331
332 #ifdef HAVE_TIMESTAMPS
333 /* Try to move the pinned timestamp forward. */
334 if (strict)
335 WT_RET(__wt_txn_update_pinned_timestamp(session, false));
336 #endif
337
338 /*
339 * For pure read-only workloads, or if the update isn't forced and the
340 * oldest ID isn't too far behind, avoid scanning.
341 */
342 if ((prev_oldest_id == current_id &&
343 prev_metadata_pinned == current_id) ||
344 (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
345 return (0);
346
347 /* First do a read-only scan. */
348 if (wait)
349 __wt_readlock(session, &txn_global->rwlock);
350 else if ((ret =
351 __wt_try_readlock(session, &txn_global->rwlock)) != 0)
352 return (ret == EBUSY ? 0 : ret);
353 __txn_oldest_scan(session,
354 &oldest_id, &last_running, &metadata_pinned, &oldest_session);
355 __wt_readunlock(session, &txn_global->rwlock);
356
357 /*
358 * If the state hasn't changed (or hasn't moved far enough for
359 * non-forced updates), give up.
360 */
361 if ((oldest_id == prev_oldest_id ||
362 (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
363 ((last_running == prev_last_running) ||
364 (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))) &&
365 metadata_pinned == prev_metadata_pinned)
366 return (0);
367
368 /* It looks like an update is necessary, wait for exclusive access. */
369 if (wait)
370 __wt_writelock(session, &txn_global->rwlock);
371 else if ((ret =
372 __wt_try_writelock(session, &txn_global->rwlock)) != 0)
373 return (ret == EBUSY ? 0 : ret);
374
375 /*
376 * If the oldest ID has been updated while we waited, don't bother
377 * scanning.
378 */
379 if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
380 WT_TXNID_LE(last_running, txn_global->last_running) &&
381 WT_TXNID_LE(metadata_pinned, txn_global->metadata_pinned))
382 goto done;
383
384 /*
385 * Re-scan now that we have exclusive access. This is necessary because
386 * threads get transaction snapshots with read locks, and we have to be
387 * sure that there isn't a thread that has got a snapshot locally but
388 * not yet published its snap_min.
389 */
390 __txn_oldest_scan(session,
391 &oldest_id, &last_running, &metadata_pinned, &oldest_session);
392
393 #ifdef HAVE_DIAGNOSTIC
394 {
395 /*
396 * Make sure the ID doesn't move past any named snapshots.
397 *
398 * Don't include the read/assignment in the assert statement. Coverity
399 * complains if there are assignments only done in diagnostic builds,
400 * and when the read is from a volatile.
401 */
402 uint64_t id = txn_global->nsnap_oldest_id;
403 WT_ASSERT(session,
404 id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
405 }
406 #endif
407 /* Update the public IDs. */
408 if (WT_TXNID_LT(txn_global->metadata_pinned, metadata_pinned))
409 txn_global->metadata_pinned = metadata_pinned;
410 if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
411 txn_global->oldest_id = oldest_id;
412 if (WT_TXNID_LT(txn_global->last_running, last_running)) {
413 txn_global->last_running = last_running;
414
415 /* Output a verbose message about long-running transactions,
416 * but only when some progress is being made. */
417 if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
418 current_id - oldest_id > 10000 && oldest_session != NULL) {
419 __wt_verbose(session, WT_VERB_TRANSACTION,
420 "old snapshot %" PRIu64
421 " pinned in session %" PRIu32 " [%s]"
422 " with snap_min %" PRIu64,
423 oldest_id, oldest_session->id,
424 oldest_session->lastop,
425 oldest_session->txn.snap_min);
426 }
427 }
428
429 done: __wt_writeunlock(session, &txn_global->rwlock);
430 return (ret);
431 }
432
433 /*
434 * __wt_txn_config --
435 * Configure a transaction.
436 */
437 int
__wt_txn_config(WT_SESSION_IMPL * session,const char * cfg[])438 __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
439 {
440 WT_CONFIG_ITEM cval;
441 WT_TXN *txn;
442
443 txn = &session->txn;
444
445 WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
446 if (cval.len != 0)
447 txn->isolation =
448 WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
449 WT_ISO_SNAPSHOT :
450 WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
451 WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED;
452
453 /*
454 * The default sync setting is inherited from the connection, but can
455 * be overridden by an explicit "sync" setting for this transaction.
456 *
457 * We want to distinguish between inheriting implicitly and explicitly.
458 */
459 F_CLR(txn, WT_TXN_SYNC_SET);
460 WT_RET(__wt_config_gets_def(
461 session, cfg, "sync", (int)UINT_MAX, &cval));
462 if (cval.val == 0 || cval.val == 1)
463 /*
464 * This is an explicit setting of sync. Set the flag so
465 * that we know not to overwrite it in commit_transaction.
466 */
467 F_SET(txn, WT_TXN_SYNC_SET);
468
469 /*
470 * If sync is turned off explicitly, clear the transaction's sync field.
471 */
472 if (cval.val == 0)
473 txn->txn_logsync = 0;
474
475 WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
476 if (cval.len > 0)
477 /*
478 * The layering here isn't ideal - the named snapshot get
479 * function does both validation and setup. Otherwise we'd
480 * need to walk the list of named snapshots twice during
481 * transaction open.
482 */
483 WT_RET(__wt_txn_named_snapshot_get(session, &cval));
484
485 /* Check if prepared updates should be ignored during reads. */
486 WT_RET(__wt_config_gets_def(session, cfg, "ignore_prepare", 0, &cval));
487 if (cval.val)
488 F_SET(txn, WT_TXN_IGNORE_PREPARE);
489
490 WT_RET(__wt_txn_parse_read_timestamp(session, cfg));
491
492 return (0);
493 }
494
495 /*
496 * __wt_txn_reconfigure --
497 * WT_SESSION::reconfigure for transactions.
498 */
499 int
__wt_txn_reconfigure(WT_SESSION_IMPL * session,const char * config)500 __wt_txn_reconfigure(WT_SESSION_IMPL *session, const char *config)
501 {
502 WT_CONFIG_ITEM cval;
503 WT_DECL_RET;
504 WT_TXN *txn;
505
506 txn = &session->txn;
507
508 ret = __wt_config_getones(session, config, "isolation", &cval);
509 if (ret == 0 && cval.len != 0) {
510 session->isolation = txn->isolation =
511 WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
512 WT_ISO_SNAPSHOT :
513 WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ?
514 WT_ISO_READ_UNCOMMITTED : WT_ISO_READ_COMMITTED;
515 }
516 WT_RET_NOTFOUND_OK(ret);
517
518 return (0);
519 }
520
521 /*
522 * __wt_txn_release --
523 * Release the resources associated with the current transaction.
524 */
525 void
__wt_txn_release(WT_SESSION_IMPL * session)526 __wt_txn_release(WT_SESSION_IMPL *session)
527 {
528 WT_TXN *txn;
529 WT_TXN_GLOBAL *txn_global;
530
531 txn = &session->txn;
532 txn_global = &S2C(session)->txn_global;
533
534 WT_ASSERT(session, txn->mod_count == 0);
535 txn->notify = NULL;
536
537 /* Clear the transaction's ID from the global table. */
538 if (WT_SESSION_IS_CHECKPOINT(session)) {
539 WT_ASSERT(session,
540 WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE);
541 txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE;
542
543 /*
544 * Be extra careful to cleanup everything for checkpoints: once
545 * the global checkpoint ID is cleared, we can no longer tell
546 * if this session is doing a checkpoint.
547 */
548 txn_global->checkpoint_id = 0;
549 } else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
550 /*
551 * If transaction is prepared, this would have been done in
552 * prepare.
553 */
554 if (!F_ISSET(txn, WT_TXN_PREPARE))
555 __txn_remove_from_global_table(session);
556 txn->id = WT_TXN_NONE;
557 }
558
559 __wt_txn_clear_commit_timestamp(session);
560
561 /* Free the scratch buffer allocated for logging. */
562 __wt_logrec_free(session, &txn->logrec);
563
564 /* Discard any memory from the session's stash that we can. */
565 WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0);
566 __wt_stash_discard(session);
567
568 /*
569 * Reset the transaction state to not running and release the snapshot.
570 */
571 __wt_txn_release_snapshot(session);
572 txn->isolation = session->isolation;
573
574 txn->rollback_reason = NULL;
575
576 /* Ensure the transaction flags are cleared on exit */
577 txn->flags = 0;
578 }
579
#ifdef HAVE_TIMESTAMPS
/*
 * __txn_commit_timestamp_validate --
 *	Validate that timestamp provided to commit is legal.
 *
 *	Returns 0 if the checks pass (or are not enabled on this
 *	transaction), otherwise EINVAL with an error message.
 */
static inline int
__txn_commit_timestamp_validate(WT_SESSION_IMPL *session)
{
	WT_DECL_TIMESTAMP(op_timestamp)
	WT_TXN *txn;
	WT_TXN_OP *op;
	WT_UPDATE *upd;
	u_int i;
	bool op_zero_ts, upd_zero_ts;

	txn = &session->txn;

	/*
	 * Debugging checks on timestamps, if user requested them. Both checks
	 * only apply to transactions that made modifications.
	 */
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_RET_MSG(session, EINVAL, "commit_timestamp required and "
		    "none set on this transaction");
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
	    F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_RET_MSG(session, EINVAL, "no commit_timestamp required and "
		    "timestamp set on this transaction");

	/*
	 * If we're not doing any key consistency checking, we're done.
	 */
	if (!F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS))
		return (0);

	/*
	 * Error on any valid update structures for the same key that
	 * are at a later timestamp or use timestamps inconsistently.
	 */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
		if (op->type == WT_TXN_OP_BASIC_COL ||
		    op->type == WT_TXN_OP_BASIC_ROW) {
			/*
			 * Skip over any aborted update structures or ones
			 * from our own transaction.
			 */
			upd = op->u.op_upd->next;
			while (upd != NULL && (upd->txnid == WT_TXN_ABORTED ||
			    upd->txnid == txn->id))
				upd = upd->next;

			/*
			 * Check the timestamp on this update with the
			 * first valid update in the chain. They're in
			 * most recent order.
			 */
			if (upd == NULL)
				continue;
			/*
			 * Check for consistent per-key timestamp usage.
			 * If timestamps are or are not used originally then
			 * they should be used the same way always. For this
			 * transaction, timestamps are in use anytime the
			 * commit timestamp is set.
			 * Check timestamps are used in order.
			 */
			op_zero_ts = !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
			upd_zero_ts = __wt_timestamp_iszero(&upd->timestamp);
			if (op_zero_ts != upd_zero_ts)
				WT_RET_MSG(session, EINVAL,
				    "per-key timestamps used inconsistently");
			/*
			 * If we aren't using timestamps for this transaction
			 * then we are done checking. Don't check the timestamp
			 * because the one in the transaction is not cleared.
			 */
			if (op_zero_ts)
				continue;

			op_timestamp = op->u.op_upd->timestamp;
			/*
			 * Only if the update structure doesn't have a timestamp
			 * then use the one in the transaction structure.
			 */
			if (__wt_timestamp_iszero(&op_timestamp))
				op_timestamp = txn->commit_timestamp;
			if (__wt_timestamp_cmp(&op_timestamp,
			    &upd->timestamp) < 0)
				WT_RET_MSG(session, EINVAL,
				    "out of order timestamps");
		}
	return (0);
}
#endif
676
677 /*
678 * __wt_txn_commit --
679 * Commit the current transaction.
680 */
681 int
__wt_txn_commit(WT_SESSION_IMPL * session,const char * cfg[])682 __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
683 {
684 WT_CONFIG_ITEM cval;
685 WT_CONNECTION_IMPL *conn;
686 WT_DECL_RET;
687 WT_TXN *txn;
688 WT_TXN_GLOBAL *txn_global;
689 WT_TXN_OP *op;
690 WT_UPDATE *upd;
691 uint32_t fileid;
692 u_int i;
693 bool locked, readonly;
694 #ifdef HAVE_TIMESTAMPS
695 wt_timestamp_t prev_commit_timestamp, ts;
696 bool update_timestamp;
697 #endif
698
699 txn = &session->txn;
700 conn = S2C(session);
701 txn_global = &conn->txn_global;
702 locked = false;
703
704 WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
705 WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
706 txn->mod_count == 0);
707
708 readonly = txn->mod_count == 0;
709 /*
710 * Look for a commit timestamp.
711 */
712 WT_ERR(
713 __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
714 if (cval.len != 0) {
715 #ifdef HAVE_TIMESTAMPS
716 WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
717 WT_ERR(__wt_timestamp_validate(session, "commit", &ts, &cval));
718 __wt_timestamp_set(&txn->commit_timestamp, &ts);
719 __wt_txn_set_commit_timestamp(session);
720 #else
721 WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
722 "version of WiredTiger built with timestamp support");
723 #endif
724 }
725 if (F_ISSET(txn, WT_TXN_PREPARE) &&
726 !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
727 WT_ERR_MSG(session, EINVAL,
728 "commit_timestamp is required for a prepared transaction");
729
730 #ifdef HAVE_TIMESTAMPS
731 WT_ERR(__txn_commit_timestamp_validate(session));
732 #endif
733
734 /*
735 * The default sync setting is inherited from the connection, but can
736 * be overridden by an explicit "sync" setting for this transaction.
737 */
738 WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));
739
740 /*
741 * If the user chose the default setting, check whether sync is enabled
742 * for this transaction (either inherited or via begin_transaction).
743 * If sync is disabled, clear the field to avoid the log write being
744 * flushed.
745 *
746 * Otherwise check for specific settings. We don't need to check for
747 * "on" because that is the default inherited from the connection. If
748 * the user set anything in begin_transaction, we only override with an
749 * explicit setting.
750 */
751 if (cval.len == 0) {
752 if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
753 !F_ISSET(txn, WT_TXN_SYNC_SET))
754 txn->txn_logsync = 0;
755 } else {
756 /*
757 * If the caller already set sync on begin_transaction then
758 * they should not be using sync on commit_transaction.
759 * Flag that as an error.
760 */
761 if (F_ISSET(txn, WT_TXN_SYNC_SET))
762 WT_ERR_MSG(session, EINVAL,
763 "Sync already set during begin_transaction");
764 if (WT_STRING_MATCH("background", cval.str, cval.len))
765 txn->txn_logsync = WT_LOG_BACKGROUND;
766 else if (WT_STRING_MATCH("off", cval.str, cval.len))
767 txn->txn_logsync = 0;
768 /*
769 * We don't need to check for "on" here because that is the
770 * default to inherit from the connection setting.
771 */
772 }
773
774 /* Commit notification. */
775 if (txn->notify != NULL)
776 WT_ERR(txn->notify->notify(txn->notify,
777 (WT_SESSION *)session, txn->id, 1));
778
779 /*
780 * We are about to release the snapshot: copy values into any
781 * positioned cursors so they don't point to updates that could be
782 * freed once we don't have a snapshot.
783 * If this transaction is prepared, then copying values would have been
784 * done during prepare.
785 */
786 if (session->ncursors > 0 && !F_ISSET(txn, WT_TXN_PREPARE)) {
787 WT_DIAGNOSTIC_YIELD;
788 WT_ERR(__wt_session_copy_values(session));
789 }
790
791 /* If we are logging, write a commit log record. */
792 if (txn->logrec != NULL &&
793 FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
794 !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
795 /*
796 * We are about to block on I/O writing the log.
797 * Release our snapshot in case it is keeping data pinned.
798 * This is particularly important for checkpoints.
799 */
800 __wt_txn_release_snapshot(session);
801 /*
802 * We hold the visibility lock for reading from the time
803 * we write our log record until the time we release our
804 * transaction so that the LSN any checkpoint gets will
805 * always reflect visible data.
806 */
807 __wt_readlock(session, &txn_global->visibility_rwlock);
808 locked = true;
809 WT_ERR(__wt_txn_log_commit(session, cfg));
810 }
811
812 /* Note: we're going to commit: nothing can fail after this point. */
813
814 /* Process and free updates. */
815 for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
816 fileid = op->btree->id;
817 switch (op->type) {
818 case WT_TXN_OP_NONE:
819 break;
820 case WT_TXN_OP_BASIC_COL:
821 case WT_TXN_OP_BASIC_ROW:
822 case WT_TXN_OP_INMEM_COL:
823 case WT_TXN_OP_INMEM_ROW:
824 upd = op->u.op_upd;
825
826 /*
827 * Switch reserved operations to abort to
828 * simplify obsolete update list truncation.
829 */
830 if (upd->type == WT_UPDATE_RESERVE) {
831 upd->txnid = WT_TXN_ABORTED;
832 break;
833 }
834
835 /*
836 * Writes to the lookaside file can be evicted as soon
837 * as they commit.
838 */
839 if (conn->cache->las_fileid != 0 &&
840 fileid == conn->cache->las_fileid) {
841 upd->txnid = WT_TXN_NONE;
842 break;
843 }
844 /* FALLTHROUGH */
845 case WT_TXN_OP_REF_DELETE:
846 #ifdef HAVE_TIMESTAMPS
847 __wt_txn_op_set_timestamp(session, op);
848 #endif
849 break;
850 case WT_TXN_OP_TRUNCATE_COL:
851 case WT_TXN_OP_TRUNCATE_ROW:
852 /* Other operations don't need timestamps. */
853 break;
854 }
855
856 __wt_txn_op_free(session, op);
857 }
858 txn->mod_count = 0;
859
860 #ifdef HAVE_TIMESTAMPS
861 /*
862 * Track the largest commit timestamp we have seen.
863 *
864 * We don't actually clear the local commit timestamp, just the flag.
865 * That said, we can't update the global commit timestamp until this
866 * transaction is visible, which happens when we release it.
867 */
868 update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
869 #endif
870
871 __wt_txn_release(session);
872 if (locked)
873 __wt_readunlock(session, &txn_global->visibility_rwlock);
874
875 /*
876 * If we have made some updates visible, start a new commit generation:
877 * any cached snapshots have to be refreshed.
878 */
879 if (!readonly)
880 (void)__wt_gen_next(session, WT_GEN_COMMIT);
881
882 #ifdef HAVE_TIMESTAMPS
883 /* First check if we've already committed something in the future. */
884 if (update_timestamp) {
885 WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
886 __wt_timestamp_set(
887 &prev_commit_timestamp, &txn_global->commit_timestamp));
888 update_timestamp = __wt_timestamp_cmp(
889 &txn->commit_timestamp, &prev_commit_timestamp) > 0;
890 }
891
892 /*
893 * If it looks like we need to move the global commit timestamp,
894 * write lock and re-check.
895 */
896 if (update_timestamp) {
897 #if WT_TIMESTAMP_SIZE == 8
898 while (__wt_timestamp_cmp(
899 &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
900 if (__wt_atomic_cas64(
901 &txn_global->commit_timestamp.val,
902 prev_commit_timestamp.val,
903 txn->commit_timestamp.val)) {
904 txn_global->has_commit_timestamp = true;
905 break;
906 }
907 __wt_timestamp_set(
908 &prev_commit_timestamp, &txn_global->commit_timestamp);
909 }
910 #else
911 __wt_writelock(session, &txn_global->rwlock);
912 if (__wt_timestamp_cmp(&txn->commit_timestamp,
913 &txn_global->commit_timestamp) > 0) {
914 __wt_timestamp_set(&txn_global->commit_timestamp,
915 &txn->commit_timestamp);
916 txn_global->has_commit_timestamp = true;
917 }
918 __wt_writeunlock(session, &txn_global->rwlock);
919 #endif
920 }
921 #endif
922
923 /*
924 * We're between transactions, if we need to block for eviction, it's
925 * a good time to do so. Note that we must ignore any error return
926 * because the user's data is committed.
927 */
928 if (!readonly)
929 (void)__wt_cache_eviction_check(session, false, false, NULL);
930 return (0);
931
932 err: /*
933 * If anything went wrong, roll back.
934 *
935 * !!!
936 * Nothing can fail after this point.
937 */
938 if (locked)
939 __wt_readunlock(session, &txn_global->visibility_rwlock);
940 WT_TRET(__wt_txn_rollback(session, cfg));
941 return (ret);
942 }
943
944 /*
945 * __wt_txn_prepare --
946 * Prepare the current transaction.
947 */
948 int
__wt_txn_prepare(WT_SESSION_IMPL * session,const char * cfg[])949 __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
950 {
951 #ifdef HAVE_TIMESTAMPS
952 WT_TXN *txn;
953 WT_TXN_OP *op;
954 WT_UPDATE *upd;
955 wt_timestamp_t ts;
956 u_int i;
957
958 txn = &session->txn;
959
960 WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
961 WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);
962 /* Transaction should not have updated any of the logged tables. */
963 WT_ASSERT(session, txn->logrec == NULL);
964
965 WT_RET(__wt_txn_context_check(session, true));
966
967 /* Parse and validate the prepare timestamp. */
968 WT_RET(__wt_txn_parse_prepare_timestamp(session, cfg, &ts));
969 __wt_timestamp_set(&txn->prepare_timestamp, &ts);
970
971 /*
972 * We are about to release the snapshot: copy values into any
973 * positioned cursors so they don't point to updates that could be
974 * freed once we don't have a snapshot.
975 */
976 if (session->ncursors > 0) {
977 WT_DIAGNOSTIC_YIELD;
978 WT_RET(__wt_session_copy_values(session));
979 }
980
981 /* Prepare updates. */
982 for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
983 /* Assert it's not an update to the lookaside file. */
984 WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 ||
985 !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
986
987 /* Metadata updates are never prepared. */
988 if (WT_IS_METADATA(op->btree->dhandle))
989 continue;
990
991 upd = op->u.op_upd;
992
993 switch (op->type) {
994 case WT_TXN_OP_NONE:
995 break;
996 case WT_TXN_OP_BASIC_COL:
997 case WT_TXN_OP_BASIC_ROW:
998 case WT_TXN_OP_INMEM_COL:
999 case WT_TXN_OP_INMEM_ROW:
1000 /*
1001 * Switch reserved operation to abort to simplify
1002 * obsolete update list truncation. The object free
1003 * function clears the operation type so we don't
1004 * try to visit this update again: it can be evicted.
1005 */
1006 if (upd->type == WT_UPDATE_RESERVE) {
1007 upd->txnid = WT_TXN_ABORTED;
1008 __wt_txn_op_free(session, op);
1009 break;
1010 }
1011
1012 /* Set prepare timestamp. */
1013 __wt_timestamp_set(&upd->timestamp, &ts);
1014
1015 WT_PUBLISH(upd->prepare_state, WT_PREPARE_INPROGRESS);
1016 break;
1017 case WT_TXN_OP_REF_DELETE:
1018 __wt_timestamp_set(
1019 &op->u.ref->page_del->timestamp, &ts);
1020 WT_PUBLISH(op->u.ref->page_del->prepare_state,
1021 WT_PREPARE_INPROGRESS);
1022 break;
1023 case WT_TXN_OP_TRUNCATE_COL:
1024 case WT_TXN_OP_TRUNCATE_ROW:
1025 /* Other operations don't need timestamps. */
1026 break;
1027 }
1028 }
1029
1030 /* Set transaction state to prepare. */
1031 F_SET(&session->txn, WT_TXN_PREPARE);
1032
1033 /* Release our snapshot in case it is keeping data pinned. */
1034 __wt_txn_release_snapshot(session);
1035
1036 /*
1037 * Clear the transaction's ID from the global table, to facilitate
1038 * prepared data visibility, but not from local txn structure.
1039 */
1040 if (F_ISSET(txn, WT_TXN_HAS_ID))
1041 __txn_remove_from_global_table(session);
1042
1043 return (0);
1044 #else
1045 WT_UNUSED(cfg);
1046 WT_RET_MSG(session, ENOTSUP, "prepare_transaction requires a version "
1047 "of WiredTiger built with timestamp support");
1048 #endif
1049 }
1050
1051 /*
1052 * __wt_txn_rollback --
1053 * Roll back the current transaction.
1054 */
1055 int
__wt_txn_rollback(WT_SESSION_IMPL * session,const char * cfg[])1056 __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
1057 {
1058 WT_DECL_RET;
1059 WT_TXN *txn;
1060 WT_TXN_OP *op;
1061 WT_UPDATE *upd;
1062 u_int i;
1063 bool readonly;
1064
1065 WT_UNUSED(cfg);
1066
1067 txn = &session->txn;
1068 readonly = txn->mod_count == 0;
1069 WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
1070
1071 /* Rollback notification. */
1072 if (txn->notify != NULL)
1073 WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session,
1074 txn->id, 0));
1075
1076 /* Rollback updates. */
1077 for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
1078 /* Assert it's not an update to the lookaside file. */
1079 WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 ||
1080 !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
1081
1082 /* Metadata updates are never rolled back. */
1083 if (WT_IS_METADATA(op->btree->dhandle))
1084 continue;
1085
1086 upd = op->u.op_upd;
1087
1088 switch (op->type) {
1089 case WT_TXN_OP_NONE:
1090 break;
1091 case WT_TXN_OP_BASIC_COL:
1092 case WT_TXN_OP_BASIC_ROW:
1093 case WT_TXN_OP_INMEM_COL:
1094 case WT_TXN_OP_INMEM_ROW:
1095 WT_ASSERT(session,
1096 upd->txnid == txn->id ||
1097 upd->txnid == WT_TXN_ABORTED);
1098 upd->txnid = WT_TXN_ABORTED;
1099 break;
1100 case WT_TXN_OP_REF_DELETE:
1101 WT_TRET(__wt_delete_page_rollback(session, op->u.ref));
1102 break;
1103 case WT_TXN_OP_TRUNCATE_COL:
1104 case WT_TXN_OP_TRUNCATE_ROW:
1105 /*
1106 * Nothing to do: these operations are only logged for
1107 * recovery. The in-memory changes will be rolled back
1108 * with a combination of WT_TXN_OP_REF_DELETE and
1109 * WT_TXN_OP_INMEM operations.
1110 */
1111 break;
1112 }
1113
1114 __wt_txn_op_free(session, op);
1115 }
1116 txn->mod_count = 0;
1117
1118 __wt_txn_release(session);
1119 /*
1120 * We're between transactions, if we need to block for eviction, it's
1121 * a good time to do so. Note that we must ignore any error return
1122 * because the user's data is committed.
1123 */
1124 if (!readonly)
1125 (void)__wt_cache_eviction_check(session, false, false, NULL);
1126 return (ret);
1127 }
1128
1129 /*
1130 * __wt_txn_rollback_required --
1131 * Prepare to log a reason if the user attempts to use the transaction to
1132 * do anything other than rollback.
1133 */
1134 int
__wt_txn_rollback_required(WT_SESSION_IMPL * session,const char * reason)1135 __wt_txn_rollback_required(WT_SESSION_IMPL *session, const char *reason)
1136 {
1137 session->txn.rollback_reason = reason;
1138 return (WT_ROLLBACK);
1139 }
1140
1141 /*
1142 * __wt_txn_init --
1143 * Initialize a session's transaction data.
1144 */
1145 int
__wt_txn_init(WT_SESSION_IMPL * session,WT_SESSION_IMPL * session_ret)1146 __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret)
1147 {
1148 WT_TXN *txn;
1149
1150 txn = &session_ret->txn;
1151 txn->id = WT_TXN_NONE;
1152
1153 WT_RET(__wt_calloc_def(session,
1154 S2C(session_ret)->session_size, &txn->snapshot));
1155
1156 #ifdef HAVE_DIAGNOSTIC
1157 if (S2C(session_ret)->txn_global.states != NULL) {
1158 WT_TXN_STATE *txn_state;
1159 txn_state = WT_SESSION_TXN_STATE(session_ret);
1160 WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE);
1161 }
1162 #endif
1163
1164 /*
1165 * Take care to clean these out in case we are reusing the transaction
1166 * for eviction.
1167 */
1168 txn->mod = NULL;
1169
1170 txn->isolation = session_ret->isolation;
1171 return (0);
1172 }
1173
1174 /*
1175 * __wt_txn_stats_update --
1176 * Update the transaction statistics for return to the application.
1177 */
1178 void
__wt_txn_stats_update(WT_SESSION_IMPL * session)1179 __wt_txn_stats_update(WT_SESSION_IMPL *session)
1180 {
1181 WT_CONNECTION_IMPL *conn;
1182 WT_CONNECTION_STATS **stats;
1183 WT_TXN_GLOBAL *txn_global;
1184 uint64_t checkpoint_pinned, snapshot_pinned;
1185
1186 conn = S2C(session);
1187 txn_global = &conn->txn_global;
1188 stats = conn->stats;
1189 checkpoint_pinned = txn_global->checkpoint_state.pinned_id;
1190 snapshot_pinned = txn_global->nsnap_oldest_id;
1191
1192 WT_STAT_SET(session, stats, txn_pinned_range,
1193 txn_global->current - txn_global->oldest_id);
1194
1195 #if WT_TIMESTAMP_SIZE == 8
1196 {
1197 WT_DECL_TIMESTAMP(checkpoint_timestamp)
1198 WT_DECL_TIMESTAMP(commit_timestamp)
1199 WT_DECL_TIMESTAMP(pinned_timestamp)
1200
1201 checkpoint_timestamp = txn_global->checkpoint_timestamp;
1202 commit_timestamp = txn_global->commit_timestamp;
1203 pinned_timestamp = txn_global->pinned_timestamp;
1204 if (checkpoint_timestamp.val != 0 &&
1205 checkpoint_timestamp.val < pinned_timestamp.val)
1206 pinned_timestamp = checkpoint_timestamp;
1207 WT_STAT_SET(session, stats, txn_pinned_timestamp,
1208 commit_timestamp.val - pinned_timestamp.val);
1209 WT_STAT_SET(session, stats, txn_pinned_timestamp_checkpoint,
1210 commit_timestamp.val - checkpoint_timestamp.val);
1211 WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest,
1212 commit_timestamp.val - txn_global->oldest_timestamp.val);
1213 }
1214 #endif
1215
1216 WT_STAT_SET(session, stats, txn_pinned_snapshot_range,
1217 snapshot_pinned == WT_TXN_NONE ?
1218 0 : txn_global->current - snapshot_pinned);
1219
1220 WT_STAT_SET(session, stats, txn_pinned_checkpoint_range,
1221 checkpoint_pinned == WT_TXN_NONE ?
1222 0 : txn_global->current - checkpoint_pinned);
1223
1224 WT_STAT_SET(
1225 session, stats, txn_checkpoint_time_max, conn->ckpt_time_max);
1226 WT_STAT_SET(
1227 session, stats, txn_checkpoint_time_min, conn->ckpt_time_min);
1228 WT_STAT_SET(
1229 session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent);
1230 WT_STAT_SET(
1231 session, stats, txn_checkpoint_time_total, conn->ckpt_time_total);
1232 WT_STAT_SET(session,
1233 stats, txn_commit_queue_len, txn_global->commit_timestampq_len);
1234 WT_STAT_SET(session,
1235 stats, txn_read_queue_len, txn_global->read_timestampq_len);
1236 }
1237
1238 /*
1239 * __wt_txn_release_resources --
1240 * Release resources for a session's transaction data.
1241 */
1242 void
__wt_txn_release_resources(WT_SESSION_IMPL * session)1243 __wt_txn_release_resources(WT_SESSION_IMPL *session)
1244 {
1245 WT_TXN *txn;
1246
1247 txn = &session->txn;
1248
1249 WT_ASSERT(session, txn->mod_count == 0);
1250 __wt_free(session, txn->mod);
1251 txn->mod_alloc = 0;
1252 txn->mod_count = 0;
1253 }
1254
1255 /*
1256 * __wt_txn_destroy --
1257 * Destroy a session's transaction data.
1258 */
1259 void
__wt_txn_destroy(WT_SESSION_IMPL * session)1260 __wt_txn_destroy(WT_SESSION_IMPL *session)
1261 {
1262 __wt_txn_release_resources(session);
1263 __wt_free(session, session->txn.snapshot);
1264 }
1265
1266 /*
1267 * __wt_txn_global_init --
1268 * Initialize the global transaction state.
1269 */
1270 int
__wt_txn_global_init(WT_SESSION_IMPL * session,const char * cfg[])1271 __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
1272 {
1273 WT_CONNECTION_IMPL *conn;
1274 WT_TXN_GLOBAL *txn_global;
1275 WT_TXN_STATE *s;
1276 u_int i;
1277
1278 WT_UNUSED(cfg);
1279 conn = S2C(session);
1280
1281 txn_global = &conn->txn_global;
1282 txn_global->current = txn_global->last_running =
1283 txn_global->metadata_pinned = txn_global->oldest_id = WT_TXN_FIRST;
1284
1285 WT_RET(__wt_spin_init(
1286 session, &txn_global->id_lock, "transaction id lock"));
1287 WT_RWLOCK_INIT_TRACKED(session, &txn_global->rwlock, txn_global);
1288 WT_RET(__wt_rwlock_init(session, &txn_global->visibility_rwlock));
1289
1290 WT_RWLOCK_INIT_TRACKED(session,
1291 &txn_global->commit_timestamp_rwlock, commit_timestamp);
1292 TAILQ_INIT(&txn_global->commit_timestamph);
1293
1294 WT_RWLOCK_INIT_TRACKED(session,
1295 &txn_global->read_timestamp_rwlock, read_timestamp);
1296 TAILQ_INIT(&txn_global->read_timestamph);
1297
1298 WT_RET(__wt_rwlock_init(session, &txn_global->nsnap_rwlock));
1299 txn_global->nsnap_oldest_id = WT_TXN_NONE;
1300 TAILQ_INIT(&txn_global->nsnaph);
1301
1302 WT_RET(__wt_calloc_def(
1303 session, conn->session_size, &txn_global->states));
1304
1305 for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
1306 s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE;
1307
1308 return (0);
1309 }
1310
1311 /*
1312 * __wt_txn_global_destroy --
1313 * Destroy the global transaction state.
1314 */
1315 void
__wt_txn_global_destroy(WT_SESSION_IMPL * session)1316 __wt_txn_global_destroy(WT_SESSION_IMPL *session)
1317 {
1318 WT_CONNECTION_IMPL *conn;
1319 WT_TXN_GLOBAL *txn_global;
1320
1321 conn = S2C(session);
1322 txn_global = &conn->txn_global;
1323
1324 if (txn_global == NULL)
1325 return;
1326
1327 __wt_spin_destroy(session, &txn_global->id_lock);
1328 __wt_rwlock_destroy(session, &txn_global->rwlock);
1329 __wt_rwlock_destroy(session, &txn_global->commit_timestamp_rwlock);
1330 __wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock);
1331 __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock);
1332 __wt_rwlock_destroy(session, &txn_global->visibility_rwlock);
1333 __wt_free(session, txn_global->states);
1334 }
1335
1336 /*
1337 * __wt_txn_activity_drain --
1338 * Wait for transactions to quiesce.
1339 */
1340 int
__wt_txn_activity_drain(WT_SESSION_IMPL * session)1341 __wt_txn_activity_drain(WT_SESSION_IMPL *session)
1342 {
1343 bool txn_active;
1344
1345 /*
1346 * It's possible that the eviction server is in the middle of a long
1347 * operation, with a transaction ID pinned. In that case, we will loop
1348 * here until the transaction ID is released, when the oldest
1349 * transaction ID will catch up with the current ID.
1350 */
1351 for (;;) {
1352 WT_RET(__wt_txn_activity_check(session, &txn_active));
1353 if (!txn_active)
1354 break;
1355
1356 WT_STAT_CONN_INCR(session, txn_release_blocked);
1357 __wt_yield();
1358 }
1359
1360 return (0);
1361 }
1362
1363 /*
1364 * __wt_txn_global_shutdown --
1365 * Shut down the global transaction state.
1366 */
1367 void
__wt_txn_global_shutdown(WT_SESSION_IMPL * session)1368 __wt_txn_global_shutdown(WT_SESSION_IMPL *session)
1369 {
1370 #ifdef HAVE_TIMESTAMPS
1371 /*
1372 * All application transactions have completed, ignore the pinned
1373 * timestamp so that updates can be evicted from the cache during
1374 * connection close.
1375 *
1376 * Note that we are relying on a special case in __wt_txn_visible_all
1377 * that returns true during close when there is no pinned timestamp
1378 * set.
1379 */
1380 S2C(session)->txn_global.has_pinned_timestamp = false;
1381 #else
1382 WT_UNUSED(session);
1383 #endif
1384 }
1385
1386 /*
1387 * __wt_verbose_dump_txn_one --
1388 * Output diagnostic information about a transaction structure.
1389 */
1390 int
__wt_verbose_dump_txn_one(WT_SESSION_IMPL * session,WT_TXN * txn)1391 __wt_verbose_dump_txn_one(WT_SESSION_IMPL *session, WT_TXN *txn)
1392 {
1393 #ifdef HAVE_TIMESTAMPS
1394 char hex_timestamp[3][2 * WT_TIMESTAMP_SIZE + 1];
1395 #endif
1396 const char *iso_tag;
1397
1398 WT_NOT_READ(iso_tag, "INVALID");
1399 switch (txn->isolation) {
1400 case WT_ISO_READ_COMMITTED:
1401 iso_tag = "WT_ISO_READ_COMMITTED";
1402 break;
1403 case WT_ISO_READ_UNCOMMITTED:
1404 iso_tag = "WT_ISO_READ_UNCOMMITTED";
1405 break;
1406 case WT_ISO_SNAPSHOT:
1407 iso_tag = "WT_ISO_SNAPSHOT";
1408 break;
1409 }
1410 #ifdef HAVE_TIMESTAMPS
1411 WT_RET(__wt_timestamp_to_hex_string(
1412 session, hex_timestamp[0], &txn->commit_timestamp));
1413 WT_RET(__wt_timestamp_to_hex_string(
1414 session, hex_timestamp[1], &txn->first_commit_timestamp));
1415 WT_RET(__wt_timestamp_to_hex_string(
1416 session, hex_timestamp[2], &txn->read_timestamp));
1417 WT_RET(__wt_msg(session,
1418 "mod count: %u"
1419 ", snap min: %" PRIu64
1420 ", snap max: %" PRIu64
1421 ", commit_timestamp: %s"
1422 ", first_commit_timestamp: %s"
1423 ", read_timestamp: %s"
1424 ", flags: 0x%08" PRIx32
1425 ", isolation: %s",
1426 txn->mod_count,
1427 txn->snap_min,
1428 txn->snap_max,
1429 hex_timestamp[0],
1430 hex_timestamp[1],
1431 hex_timestamp[2],
1432 txn->flags,
1433 iso_tag));
1434 #else
1435 WT_RET(__wt_msg(session,
1436 "mod count: %u"
1437 ", snap min: %" PRIu64
1438 ", snap max: %" PRIu64
1439 ", flags: 0x%08" PRIx32
1440 ", isolation: %s",
1441 txn->mod_count,
1442 txn->snap_min,
1443 txn->snap_max,
1444 txn->flags,
1445 iso_tag));
1446 #endif
1447 return (0);
1448 }
1449
1450 /*
1451 * __wt_verbose_dump_txn --
1452 * Output diagnostic information about the global transaction state.
1453 */
1454 int
__wt_verbose_dump_txn(WT_SESSION_IMPL * session)1455 __wt_verbose_dump_txn(WT_SESSION_IMPL *session)
1456 {
1457 WT_CONNECTION_IMPL *conn;
1458 WT_SESSION_IMPL *sess;
1459 WT_TXN_GLOBAL *txn_global;
1460 WT_TXN_STATE *s;
1461 uint64_t id;
1462 uint32_t i, session_cnt;
1463 #ifdef HAVE_TIMESTAMPS
1464 char hex_timestamp[3][2 * WT_TIMESTAMP_SIZE + 1];
1465 #endif
1466
1467 conn = S2C(session);
1468 txn_global = &conn->txn_global;
1469
1470 WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
1471 WT_RET(__wt_msg(session, "transaction state dump"));
1472
1473 WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current));
1474 WT_RET(__wt_msg(session,
1475 "last running ID: %" PRIu64, txn_global->last_running));
1476 WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id));
1477
1478 #ifdef HAVE_TIMESTAMPS
1479 WT_RET(__wt_timestamp_to_hex_string(
1480 session, hex_timestamp[0], &txn_global->commit_timestamp));
1481 WT_RET(__wt_msg(session, "commit timestamp: %s", hex_timestamp[0]));
1482 WT_RET(__wt_timestamp_to_hex_string(
1483 session, hex_timestamp[0], &txn_global->oldest_timestamp));
1484 WT_RET(__wt_msg(session, "oldest timestamp: %s", hex_timestamp[0]));
1485 WT_RET(__wt_timestamp_to_hex_string(
1486 session, hex_timestamp[0], &txn_global->pinned_timestamp));
1487 WT_RET(__wt_msg(session, "pinned timestamp: %s", hex_timestamp[0]));
1488 WT_RET(__wt_timestamp_to_hex_string(
1489 session, hex_timestamp[0], &txn_global->stable_timestamp));
1490 WT_RET(__wt_msg(session, "stable timestamp: %s", hex_timestamp[0]));
1491 WT_RET(__wt_msg(session, "has_commit_timestamp: %s",
1492 txn_global->has_commit_timestamp ? "yes" : "no"));
1493 WT_RET(__wt_msg(session, "has_oldest_timestamp: %s",
1494 txn_global->has_oldest_timestamp ? "yes" : "no"));
1495 WT_RET(__wt_msg(session, "has_pinned_timestamp: %s",
1496 txn_global->has_pinned_timestamp ? "yes" : "no"));
1497 WT_RET(__wt_msg(session, "has_stable_timestamp: %s",
1498 txn_global->has_stable_timestamp ? "yes" : "no"));
1499 WT_RET(__wt_msg(session, "oldest_is_pinned: %s",
1500 txn_global->oldest_is_pinned ? "yes" : "no"));
1501 WT_RET(__wt_msg(session, "stable_is_pinned: %s",
1502 txn_global->stable_is_pinned ? "yes" : "no"));
1503 #endif
1504
1505 WT_RET(__wt_msg(session, "checkpoint running: %s",
1506 txn_global->checkpoint_running ? "yes" : "no"));
1507 WT_RET(__wt_msg(session, "checkpoint generation: %" PRIu64,
1508 __wt_gen(session, WT_GEN_CHECKPOINT)));
1509 WT_RET(__wt_msg(session, "checkpoint pinned ID: %" PRIu64,
1510 txn_global->checkpoint_state.pinned_id));
1511 WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64,
1512 txn_global->checkpoint_state.id));
1513
1514 WT_RET(__wt_msg(session,
1515 "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id));
1516
1517 WT_ORDERED_READ(session_cnt, conn->session_cnt);
1518 WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt));
1519 WT_RET(__wt_msg(session, "Transaction state of active sessions:"));
1520
1521 /*
1522 * Walk each session transaction state and dump information. Accessing
1523 * the content of session handles is not thread safe, so some
1524 * information may change while traversing if other threads are active
1525 * at the same time, which is OK since this is diagnostic code.
1526 */
1527 for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
1528 /* Skip sessions with no active transaction */
1529 if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
1530 continue;
1531 sess = &conn->sessions[i];
1532 WT_RET(__wt_msg(session,
1533 "ID: %" PRIu64
1534 ", pinned ID: %" PRIu64
1535 ", metadata pinned ID: %" PRIu64
1536 ", name: %s",
1537 id, s->pinned_id, s->metadata_pinned,
1538 sess->name == NULL ?
1539 "EMPTY" : sess->name));
1540 WT_RET(__wt_verbose_dump_txn_one(sess, &sess->txn));
1541 }
1542
1543 return (0);
1544 }
1545