1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 static inline int __wt_txn_id_check(WT_SESSION_IMPL *session);
10 static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);
11
/* Outcome of checking whether an update is visible. */
typedef enum {
	/* Not a visible update */
	WT_VISIBLE_FALSE = 0,
	/* Prepared update */
	WT_VISIBLE_PREPARE = 1,
	/* A visible update */
	WT_VISIBLE_TRUE = 2
} WT_VISIBLE_TYPE;
17 #ifdef HAVE_TIMESTAMPS
18 /*
19 * __wt_txn_timestamp_flags --
20 * Set txn related timestamp flags.
21 */
22 static inline void
__wt_txn_timestamp_flags(WT_SESSION_IMPL * session)23 __wt_txn_timestamp_flags(WT_SESSION_IMPL *session)
24 {
25 WT_BTREE *btree;
26
27 if (session->dhandle == NULL)
28 return;
29 btree = S2BT(session);
30 if (btree == NULL)
31 return;
32 if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS))
33 F_SET(&session->txn, WT_TXN_TS_COMMIT_ALWAYS);
34 if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_KEYS))
35 F_SET(&session->txn, WT_TXN_TS_COMMIT_KEYS);
36 if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER))
37 F_SET(&session->txn, WT_TXN_TS_COMMIT_NEVER);
38 }
39
40 #if WT_TIMESTAMP_SIZE == 8
/*
 * WT_WITH_TIMESTAMP_READLOCK --
 *	Run "e" without taking any lock: in this configuration the session and
 *	lock arguments are accepted but never evaluated. The expansion is a
 *	single do/while(0) statement, matching the locking variant of this
 *	macro, so a use that compiles in one configuration compiles in the
 *	other.
 */
#define	WT_WITH_TIMESTAMP_READLOCK(session, l, e) do {			\
	e;								\
} while (0)
42
43 /*
44 * __wt_timestamp_cmp --
45 * Compare two timestamps.
46 */
47 static inline int
__wt_timestamp_cmp(const wt_timestamp_t * ts1,const wt_timestamp_t * ts2)48 __wt_timestamp_cmp(const wt_timestamp_t *ts1, const wt_timestamp_t *ts2)
49 {
50 return (ts1->val == ts2->val ? 0 : (ts1->val > ts2->val ? 1 : -1));
51 }
52
53 /*
54 * __wt_timestamp_set --
55 * Set a timestamp.
56 */
57 static inline void
__wt_timestamp_set(wt_timestamp_t * dest,const wt_timestamp_t * src)58 __wt_timestamp_set(wt_timestamp_t *dest, const wt_timestamp_t *src)
59 {
60 dest->val = src->val;
61 }
62
63 /*
64 * __wt_timestamp_subone --
65 * Subtract one from a timestamp.
66 */
67 static inline void
__wt_timestamp_subone(wt_timestamp_t * ts)68 __wt_timestamp_subone(wt_timestamp_t *ts)
69 {
70 ts->val -= 1;
71 }
72
73 /*
74 * __wt_timestamp_iszero --
75 * Check if a timestamp is equal to the special "zero" time.
76 */
77 static inline bool
__wt_timestamp_iszero(const wt_timestamp_t * ts)78 __wt_timestamp_iszero(const wt_timestamp_t *ts)
79 {
80 return (ts->val == 0);
81 }
82
83 /*
84 * __wt_timestamp_set_inf --
85 * Set a timestamp to the maximum value.
86 */
87 static inline void
__wt_timestamp_set_inf(wt_timestamp_t * ts)88 __wt_timestamp_set_inf(wt_timestamp_t *ts)
89 {
90 ts->val = UINT64_MAX;
91 }
92
93 /*
94 * __wt_timestamp_set_zero --
95 * Zero out a timestamp.
96 */
97 static inline void
__wt_timestamp_set_zero(wt_timestamp_t * ts)98 __wt_timestamp_set_zero(wt_timestamp_t *ts)
99 {
100 ts->val = 0;
101 }
102
103 #else /* WT_TIMESTAMP_SIZE != 8 */
104
/*
 * WT_WITH_TIMESTAMP_READLOCK --
 *	Run "expr" between a read-lock and a read-unlock of "lock" on behalf of
 *	"session". The session and lock arguments are each expanded twice.
 */
#define	WT_WITH_TIMESTAMP_READLOCK(session, lock, expr) do {		\
	__wt_readlock((session), (lock));				\
	expr;								\
	__wt_readunlock((session), (lock));				\
} while (0)
110
111 /*
112 * __wt_timestamp_cmp --
113 * Compare two timestamps.
114 */
115 static inline int
__wt_timestamp_cmp(const wt_timestamp_t * ts1,const wt_timestamp_t * ts2)116 __wt_timestamp_cmp(const wt_timestamp_t *ts1, const wt_timestamp_t *ts2)
117 {
118 return (memcmp(ts1->ts, ts2->ts, WT_TIMESTAMP_SIZE));
119 }
120
121 /*
122 * __wt_timestamp_set --
123 * Set a timestamp.
124 */
125 static inline void
__wt_timestamp_set(wt_timestamp_t * dest,const wt_timestamp_t * src)126 __wt_timestamp_set(wt_timestamp_t *dest, const wt_timestamp_t *src)
127 {
128 (void)memcpy(dest->ts, src->ts, WT_TIMESTAMP_SIZE);
129 }
130
131 /*
132 * __wt_timestamp_iszero --
133 * Check if a timestamp is equal to the special "zero" time.
134 */
135 static inline bool
__wt_timestamp_iszero(const wt_timestamp_t * ts)136 __wt_timestamp_iszero(const wt_timestamp_t *ts)
137 {
138 static const wt_timestamp_t zero_timestamp;
139
140 return (memcmp(ts->ts, &zero_timestamp, WT_TIMESTAMP_SIZE) == 0);
141 }
142
143 /*
144 * __wt_timestamp_set_inf --
145 * Set a timestamp to the maximum value.
146 */
147 static inline void
__wt_timestamp_set_inf(wt_timestamp_t * ts)148 __wt_timestamp_set_inf(wt_timestamp_t *ts)
149 {
150 memset(ts->ts, 0xff, WT_TIMESTAMP_SIZE);
151 }
152
153 /*
154 * __wt_timestamp_set_zero --
155 * Zero out a timestamp.
156 */
157 static inline void
__wt_timestamp_set_zero(wt_timestamp_t * ts)158 __wt_timestamp_set_zero(wt_timestamp_t *ts)
159 {
160 memset(ts->ts, 0x00, WT_TIMESTAMP_SIZE);
161 }
162
163 /*
164 * __wt_timestamp_subone --
165 * Subtract one from a timestamp.
166 */
167 static inline void
__wt_timestamp_subone(wt_timestamp_t * ts)168 __wt_timestamp_subone(wt_timestamp_t *ts)
169 {
170 uint8_t *tsb;
171
172 /*
173 * Complicated path for arbitrary-sized timestamps: start with the
174 * least significant byte, subtract one, continue to more significant
175 * bytes on underflow.
176 */
177 for (tsb = ts->ts + WT_TIMESTAMP_SIZE - 1; tsb >= ts->ts; --tsb)
178 if (--*tsb != 0xff)
179 break;
180 }
181
182 #endif /* WT_TIMESTAMP_SIZE == 8 */
183
184 #else /* !HAVE_TIMESTAMPS */
185
/*
 * Builds without timestamp support: these operations expand to nothing, so
 * their arguments are never evaluated.
 */
#define	__wt_timestamp_set(d, s)
#define	__wt_timestamp_set_inf(t)
#define	__wt_timestamp_set_zero(t)
#define	__wt_timestamp_subone(t)
#define	__wt_txn_clear_commit_timestamp(s)
#define	__wt_txn_clear_read_timestamp(s)
#define	__wt_txn_timestamp_flags(s)
193
194 #endif /* HAVE_TIMESTAMPS */
195
196 /*
197 * __txn_next_op --
198 * Mark a WT_UPDATE object modified by the current transaction.
199 */
200 static inline int
__txn_next_op(WT_SESSION_IMPL * session,WT_TXN_OP ** opp)201 __txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
202 {
203 WT_TXN *txn;
204 WT_TXN_OP *op;
205
206 *opp = NULL;
207
208 txn = &session->txn;
209
210 /*
211 * We're about to perform an update.
212 * Make sure we have allocated a transaction ID.
213 */
214 WT_RET(__wt_txn_id_check(session));
215 WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_ID));
216
217 WT_RET(__wt_realloc_def(session, &txn->mod_alloc,
218 txn->mod_count + 1, &txn->mod));
219
220 op = &txn->mod[txn->mod_count++];
221 WT_CLEAR(*op);
222 op->btree = S2BT(session);
223 (void)__wt_atomic_addi32(&session->dhandle->session_inuse, 1);
224 *opp = op;
225 return (0);
226 }
227
228 /*
229 * __wt_txn_unmodify --
230 * If threads race making updates, they may discard the last referenced
231 * WT_UPDATE item while the transaction is still active. This function
232 * removes the last update item from the "log".
233 */
234 static inline void
__wt_txn_unmodify(WT_SESSION_IMPL * session)235 __wt_txn_unmodify(WT_SESSION_IMPL *session)
236 {
237 WT_TXN *txn;
238 WT_TXN_OP *op;
239
240 txn = &session->txn;
241 if (F_ISSET(txn, WT_TXN_HAS_ID)) {
242 WT_ASSERT(session, txn->mod_count > 0);
243 --txn->mod_count;
244 op = txn->mod + txn->mod_count;
245 __wt_txn_op_free(session, op);
246 }
247 }
248
249 #ifdef HAVE_TIMESTAMPS
250 /*
251 * __wt_txn_op_commit_page_del --
252 * Make the transaction ID and timestamp updates necessary to a ref that
253 * was created by a fast delete truncate operation.
254 */
255 static inline void
__wt_txn_op_commit_page_del(WT_SESSION_IMPL * session,WT_REF * ref)256 __wt_txn_op_commit_page_del(WT_SESSION_IMPL *session, WT_REF *ref)
257 {
258 WT_TXN *txn;
259 WT_UPDATE **updp;
260 uint32_t previous_state;
261
262 txn = &session->txn;
263
264 /* Avoid locking the page if a previous eviction already cleaned up. */
265 if (ref->page_del->update_list == NULL)
266 return;
267
268 /*
269 * Lock the ref to ensure we don't race with eviction freeing the
270 * page deleted update list.
271 */
272 for (;; __wt_yield()) {
273 previous_state = ref->state;
274 if (previous_state != WT_REF_LOCKED &&
275 __wt_atomic_casv32(
276 &ref->state, previous_state, WT_REF_LOCKED))
277 break;
278 }
279
280 for (updp = ref->page_del->update_list;
281 updp != NULL && *updp != NULL; ++updp) {
282 __wt_timestamp_set(&(*updp)->timestamp, &txn->commit_timestamp);
283 if (F_ISSET(txn, WT_TXN_PREPARE))
284 /*
285 * Holding the ref locked means we have exclusive
286 * access, so don't need to use the prepare locked
287 * transition state.
288 */
289 (*updp)->prepare_state = WT_PREPARE_RESOLVED;
290 }
291
292 /*
293 * Publish to ensure we don't let the page be evicted and the updates
294 * discarded before being written.
295 */
296 WT_PUBLISH(ref->state, previous_state);
297 }
298
299 /*
300 * __wt_txn_op_set_timestamp --
301 * Decide whether to copy a commit timestamp into an update. If the op
302 * structure doesn't have a populated update or ref field or in prepared
303 * state there won't be any check for an existing timestamp.
304 */
305 static inline void
__wt_txn_op_set_timestamp(WT_SESSION_IMPL * session,WT_TXN_OP * op)306 __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
307 {
308 WT_TXN *txn;
309 WT_UPDATE *upd;
310 wt_timestamp_t *timestamp;
311
312 txn = &session->txn;
313
314 /*
315 * Updates in the metadata never get timestamps (either now or at
316 * commit): metadata cannot be read at a point in time, only the most
317 * recently committed data matches files on disk.
318 */
319 if (WT_IS_METADATA(op->btree->dhandle) ||
320 !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
321 return;
322
323 if (F_ISSET(txn, WT_TXN_PREPARE)) {
324 if (op->type == WT_TXN_OP_REF_DELETE)
325 __wt_txn_op_commit_page_del(session, op->u.ref);
326 else {
327 /*
328 * In case of a prepared transaction, the order of
329 * modification of the prepare timestamp to the commit
330 * timestamp in the update chain will not affect the
331 * data visibility, a reader will encounter a prepared
332 * update resulting in prepare conflict.
333 *
334 * As updating timestamp might not be an atomic
335 * operation, we will manage using state.
336 */
337 upd = op->u.op_upd;
338 upd->prepare_state = WT_PREPARE_LOCKED;
339 WT_WRITE_BARRIER();
340 __wt_timestamp_set(
341 &upd->timestamp, &txn->commit_timestamp);
342 WT_PUBLISH(upd->prepare_state, WT_PREPARE_RESOLVED);
343 }
344 } else {
345 /*
346 * The timestamp is in the page deleted structure for
347 * truncates, or in the update for other operations.
348 */
349 timestamp = op->type == WT_TXN_OP_REF_DELETE ?
350 &op->u.ref->page_del->timestamp : &op->u.op_upd->timestamp;
351 if (__wt_timestamp_iszero(timestamp))
352 __wt_timestamp_set(timestamp, &txn->commit_timestamp);
353 }
354 }
355 #endif
356
357 /*
358 * __wt_txn_modify --
359 * Mark a WT_UPDATE object modified by the current transaction.
360 */
361 static inline int
__wt_txn_modify(WT_SESSION_IMPL * session,WT_CURSOR_BTREE * cbt,WT_UPDATE * upd)362 __wt_txn_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
363 {
364 WT_BTREE *btree;
365 WT_ITEM key;
366 WT_TXN *txn;
367 WT_TXN_OP *op;
368
369 btree = S2BT(session);
370 txn = &session->txn;
371
372 if (F_ISSET(txn, WT_TXN_READONLY))
373 WT_RET_MSG(session, WT_ROLLBACK,
374 "Attempt to update in a read-only transaction");
375
376 WT_RET(__txn_next_op(session, &op));
377 if (F_ISSET(session, WT_SESSION_LOGGING_INMEM)) {
378 if (btree->type == BTREE_ROW)
379 op->type = WT_TXN_OP_INMEM_ROW;
380 else
381 op->type = WT_TXN_OP_INMEM_COL;
382 } else {
383 if (btree->type == BTREE_ROW)
384 op->type = WT_TXN_OP_BASIC_ROW;
385 else
386 op->type = WT_TXN_OP_BASIC_COL;
387 }
388 op->u.op_upd = upd;
389 upd->txnid = session->txn.id;
390
391 #ifdef HAVE_TIMESTAMPS
392 __wt_txn_op_set_timestamp(session, op);
393
394 /*
395 * TODO:
396 * Following code block is under #ifdef temporarily, to avoid
397 * performance penalty. This block will be enabled, once an alternative
398 * is figured out, or we have to live with this penalty.
399 */
400 #ifdef HAVE_LONG_RUNNING_PREPARE
401 /*
402 * Transaction operation with timestamp cannot be prepared.
403 * Copy the key into the transaction op structure, so the update
404 * can be evicted to lookaside, and we have a chance of finding it
405 * again. This is only possible for transactions that are in the
406 * prepared state, but we don't know at this stage if a transaction
407 * will be prepared or not.
408 */
409 if (!WT_SESSION_IS_CHECKPOINT(session) &&
410 !F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
411 !WT_IS_METADATA(op->btree->dhandle)) {
412 /*
413 * Store the key, to search the prepared update in case of
414 * prepared transaction.
415 */
416 if (btree->type == BTREE_ROW) {
417 WT_RET(__wt_cursor_get_raw_key(&cbt->iface, &key));
418 WT_RET(__wt_buf_set(session,
419 &op->u.op_row.key, key.data, key.size));
420 } else
421 op->u.op_col.recno = cbt->recno;
422 }
423 #endif
424
425 #endif
426 WT_UNUSED(btree);
427 WT_UNUSED(cbt);
428 WT_UNUSED(key);
429
430 return (0);
431 }
432
433 /*
434 * __wt_txn_modify_page_delete --
435 * Remember a page truncated by the current transaction.
436 */
437 static inline int
__wt_txn_modify_page_delete(WT_SESSION_IMPL * session,WT_REF * ref)438 __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref)
439 {
440 WT_DECL_RET;
441 WT_TXN *txn;
442 WT_TXN_OP *op;
443
444 txn = &session->txn;
445
446 WT_RET(__txn_next_op(session, &op));
447 op->type = WT_TXN_OP_REF_DELETE;
448
449 op->u.ref = ref;
450 ref->page_del->txnid = txn->id;
451 #ifdef HAVE_TIMESTAMPS
452 __wt_txn_op_set_timestamp(session, op);
453 #endif
454
455 WT_ERR(__wt_txn_log_op(session, NULL));
456 return (0);
457
458 err: __wt_txn_unmodify(session);
459 return (ret);
460 }
461
462 /*
463 * __wt_txn_oldest_id --
464 * Return the oldest transaction ID that has to be kept for the current
465 * tree.
466 */
467 static inline uint64_t
__wt_txn_oldest_id(WT_SESSION_IMPL * session)468 __wt_txn_oldest_id(WT_SESSION_IMPL *session)
469 {
470 WT_BTREE *btree;
471 WT_TXN_GLOBAL *txn_global;
472 uint64_t checkpoint_pinned, oldest_id;
473 bool include_checkpoint_txn;
474
475 txn_global = &S2C(session)->txn_global;
476 btree = S2BT_SAFE(session);
477
478 /*
479 * The metadata is tracked specially because of optimizations for
480 * checkpoints.
481 */
482 if (session->dhandle != NULL && WT_IS_METADATA(session->dhandle))
483 return (txn_global->metadata_pinned);
484
485 /*
486 * Take a local copy of these IDs in case they are updated while we are
487 * checking visibility.
488 */
489 oldest_id = txn_global->oldest_id;
490 include_checkpoint_txn = btree == NULL ||
491 (!F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
492 btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT));
493 if (!include_checkpoint_txn)
494 return (oldest_id);
495
496 /*
497 * The read of the transaction ID pinned by a checkpoint needs to be
498 * carefully ordered: if a checkpoint is starting and we have to start
499 * checking the pinned ID, we take the minimum of it with the oldest
500 * ID, which is what we want.
501 */
502 WT_READ_BARRIER();
503
504 /*
505 * Checkpoint transactions often fall behind ordinary application
506 * threads. Take special effort to not keep changes pinned in cache
507 * if they are only required for the checkpoint and it has already
508 * seen them.
509 *
510 * If there is no active checkpoint or this handle is up to date with
511 * the active checkpoint then it's safe to ignore the checkpoint ID in
512 * the visibility check.
513 */
514 checkpoint_pinned = txn_global->checkpoint_state.pinned_id;
515 if (checkpoint_pinned == WT_TXN_NONE ||
516 WT_TXNID_LT(oldest_id, checkpoint_pinned))
517 return (oldest_id);
518
519 return (checkpoint_pinned);
520 }
521
522 #ifdef HAVE_TIMESTAMPS
523 /*
524 * __wt_txn_pinned_timestamp --
525 * Get the first timestamp that has to be kept for the current tree.
526 */
527 static inline void
__wt_txn_pinned_timestamp(WT_SESSION_IMPL * session,wt_timestamp_t * pinned_tsp)528 __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
529 {
530 WT_BTREE *btree;
531 WT_TXN_GLOBAL *txn_global;
532 wt_timestamp_t checkpoint_ts, pinned_ts;
533 bool include_checkpoint_txn;
534
535 btree = S2BT_SAFE(session);
536 txn_global = &S2C(session)->txn_global;
537
538 WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
539 __wt_timestamp_set(&pinned_ts, &txn_global->pinned_timestamp));
540 __wt_timestamp_set(pinned_tsp, &pinned_ts);
541
542 /*
543 * Checkpoint transactions often fall behind ordinary application
544 * threads. Take special effort to not keep changes pinned in cache if
545 * they are only required for the checkpoint and it has already seen
546 * them.
547 *
548 * If there is no active checkpoint or this handle is up to date with
549 * the active checkpoint then it's safe to ignore the checkpoint ID in
550 * the visibility check.
551 */
552 include_checkpoint_txn = btree == NULL ||
553 (!F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
554 btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT));
555 if (!include_checkpoint_txn)
556 return;
557
558 /*
559 * The read of the timestamp pinned by a checkpoint needs to be
560 * carefully ordered: if a checkpoint is starting and we have to use
561 * the checkpoint timestamp, we take the minimum of it with the oldest
562 * timestamp, which is what we want.
563 */
564 WT_READ_BARRIER();
565
566 WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
567 __wt_timestamp_set(&checkpoint_ts,
568 &txn_global->checkpoint_timestamp));
569
570 if (!__wt_timestamp_iszero(&checkpoint_ts) &&
571 __wt_timestamp_cmp(&checkpoint_ts, &pinned_ts) < 0)
572 __wt_timestamp_set(pinned_tsp, &checkpoint_ts);
573 }
574 #endif
575
576 /*
577 * __txn_visible_all_id --
578 * Check if a given transaction ID is "globally visible". This is, if
579 * all sessions in the system will see the transaction ID including the
580 * ID that belongs to a running checkpoint.
581 */
582 static inline bool
__txn_visible_all_id(WT_SESSION_IMPL * session,uint64_t id)583 __txn_visible_all_id(WT_SESSION_IMPL *session, uint64_t id)
584 {
585 uint64_t oldest_id;
586
587 oldest_id = __wt_txn_oldest_id(session);
588
589 return (WT_TXNID_LT(id, oldest_id));
590 }
591
592 /*
593 * __wt_txn_visible_all --
594 * Check if a given transaction is "globally visible". This is, if all
595 * sessions in the system will see the transaction ID including the ID
596 * that belongs to a running checkpoint.
597 */
598 static inline bool
__wt_txn_visible_all(WT_SESSION_IMPL * session,uint64_t id,const wt_timestamp_t * timestamp)599 __wt_txn_visible_all(
600 WT_SESSION_IMPL *session, uint64_t id, const wt_timestamp_t *timestamp)
601 {
602 if (!__txn_visible_all_id(session, id))
603 return (false);
604
605 #ifdef HAVE_TIMESTAMPS
606 {
607 wt_timestamp_t pinned_ts;
608
609 /* Timestamp check. */
610 if (timestamp == NULL || __wt_timestamp_iszero(timestamp))
611 return (true);
612
613 /*
614 * If no oldest timestamp has been supplied, updates have to stay in
615 * cache until we are shutting down.
616 */
617 if (!S2C(session)->txn_global.has_pinned_timestamp)
618 return (F_ISSET(S2C(session), WT_CONN_CLOSING));
619
620 __wt_txn_pinned_timestamp(session, &pinned_ts);
621 return (__wt_timestamp_cmp(timestamp, &pinned_ts) <= 0);
622 }
623 #else
624 WT_UNUSED(timestamp);
625 return (true);
626 #endif
627 }
628
629 /*
630 * __wt_txn_upd_visible_all --
631 * Is the given update visible to all (possible) readers?
632 */
633 static inline bool
__wt_txn_upd_visible_all(WT_SESSION_IMPL * session,WT_UPDATE * upd)634 __wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd)
635 {
636 if (upd->prepare_state == WT_PREPARE_LOCKED ||
637 upd->prepare_state == WT_PREPARE_INPROGRESS)
638 return (false);
639
640 return (__wt_txn_visible_all(
641 session, upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp)));
642 }
643
644 /*
645 * __txn_visible_id --
646 * Can the current transaction see the given ID?
647 */
648 static inline bool
__txn_visible_id(WT_SESSION_IMPL * session,uint64_t id)649 __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
650 {
651 WT_TXN *txn;
652 bool found;
653
654 txn = &session->txn;
655
656 /* Changes with no associated transaction are always visible. */
657 if (id == WT_TXN_NONE)
658 return (true);
659
660 /* Nobody sees the results of aborted transactions. */
661 if (id == WT_TXN_ABORTED)
662 return (false);
663
664 /* Read-uncommitted transactions see all other changes. */
665 if (txn->isolation == WT_ISO_READ_UNCOMMITTED)
666 return (true);
667
668 /*
669 * If we don't have a transactional snapshot, only make stable updates
670 * visible.
671 */
672 if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
673 return (__txn_visible_all_id(session, id));
674
675 /* Transactions see their own changes. */
676 if (id == txn->id)
677 return (true);
678
679 /*
680 * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is
681 * not the result of a concurrent transaction, that is, if was
682 * committed before the snapshot was taken.
683 *
684 * The order here is important: anything newer than the maximum ID we
685 * saw when taking the snapshot should be invisible, even if the
686 * snapshot is empty.
687 */
688 if (WT_TXNID_LE(txn->snap_max, id))
689 return (false);
690 if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min))
691 return (true);
692
693 WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found);
694 return (!found);
695 }
696
697 /*
698 * __wt_txn_visible --
699 * Can the current transaction see the given ID / timestamp?
700 */
701 static inline bool
__wt_txn_visible(WT_SESSION_IMPL * session,uint64_t id,const wt_timestamp_t * timestamp)702 __wt_txn_visible(
703 WT_SESSION_IMPL *session, uint64_t id, const wt_timestamp_t *timestamp)
704 {
705 if (!__txn_visible_id(session, id))
706 return (false);
707
708 /* Transactions read their writes, regardless of timestamps. */
709 if (F_ISSET(&session->txn, WT_TXN_HAS_ID) && id == session->txn.id)
710 return (true);
711
712 #ifdef HAVE_TIMESTAMPS
713 {
714 WT_TXN *txn = &session->txn;
715
716 /* Timestamp check. */
717 if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) || timestamp == NULL)
718 return (true);
719
720 return (__wt_timestamp_cmp(timestamp, &txn->read_timestamp) <= 0);
721 }
722 #else
723 WT_UNUSED(timestamp);
724 return (true);
725 #endif
726 }
727
728 /*
729 * __wt_txn_upd_visible_type --
730 * Visible type of given update for the current transaction.
731 */
732 static inline WT_VISIBLE_TYPE
__wt_txn_upd_visible_type(WT_SESSION_IMPL * session,WT_UPDATE * upd)733 __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
734 {
735 uint8_t prepare_state, previous_state;
736 bool upd_visible;
737
738 for (;;__wt_yield()) {
739 /* Prepare state change is in progress, yield and try again. */
740 WT_ORDERED_READ(prepare_state, upd->prepare_state);
741 if (prepare_state == WT_PREPARE_LOCKED)
742 continue;
743
744 upd_visible = __wt_txn_visible(
745 session, upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp));
746
747 /*
748 * The visibility check is only valid if the update does not
749 * change state. If the state does change, recheck visibility.
750 */
751 previous_state = prepare_state;
752 WT_ORDERED_READ(prepare_state, upd->prepare_state);
753 if (previous_state == prepare_state)
754 break;
755
756 WT_STAT_CONN_INCR(session, prepared_transition_blocked_page);
757 }
758
759 if (!upd_visible)
760 return (WT_VISIBLE_FALSE);
761
762 /* Ignore the prepared update, if transaction configuration says so. */
763 if (prepare_state == WT_PREPARE_INPROGRESS)
764 return (F_ISSET(&session->txn, WT_TXN_IGNORE_PREPARE) ?
765 WT_VISIBLE_FALSE : WT_VISIBLE_PREPARE);
766
767 return (WT_VISIBLE_TRUE);
768 }
769
770 /*
771 * __wt_txn_upd_visible --
772 * Can the current transaction see the given update.
773 */
774 static inline bool
__wt_txn_upd_visible(WT_SESSION_IMPL * session,WT_UPDATE * upd)775 __wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd)
776 {
777 return (__wt_txn_upd_visible_type(session, upd) == WT_VISIBLE_TRUE);
778 }
779
780 /*
781 * __wt_txn_read --
782 * Get the first visible update in a list (or NULL if none are visible).
783 */
784 static inline int
__wt_txn_read(WT_SESSION_IMPL * session,WT_UPDATE * upd,WT_UPDATE ** updp)785 __wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **updp)
786 {
787 static WT_UPDATE tombstone = {
788 .txnid = WT_TXN_NONE, .type = WT_UPDATE_TOMBSTONE
789 };
790 WT_VISIBLE_TYPE upd_visible;
791 uint8_t type;
792 bool skipped_birthmark;
793
794 *updp = NULL;
795
796 type = WT_UPDATE_INVALID; /* [-Wconditional-uninitialized] */
797 for (skipped_birthmark = false; upd != NULL; upd = upd->next) {
798 WT_ORDERED_READ(type, upd->type);
799
800 /* Skip reserved place-holders, they're never visible. */
801 if (type != WT_UPDATE_RESERVE) {
802 upd_visible = __wt_txn_upd_visible_type(session, upd);
803 if (upd_visible == WT_VISIBLE_TRUE)
804 break;
805 if (upd_visible == WT_VISIBLE_PREPARE)
806 return (WT_PREPARE_CONFLICT);
807 }
808 /* An invisible birthmark is equivalent to a tombstone. */
809 if (type == WT_UPDATE_BIRTHMARK)
810 skipped_birthmark = true;
811 }
812
813 if (upd == NULL && skipped_birthmark) {
814 upd = &tombstone;
815 type = upd->type;
816 }
817
818 *updp = upd == NULL || type == WT_UPDATE_BIRTHMARK ? NULL : upd;
819 return (0);
820 }
821
822 /*
823 * __wt_txn_begin --
824 * Begin a transaction.
825 */
826 static inline int
__wt_txn_begin(WT_SESSION_IMPL * session,const char * cfg[])827 __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
828 {
829 WT_TXN *txn;
830
831 txn = &session->txn;
832 txn->isolation = session->isolation;
833 txn->txn_logsync = S2C(session)->txn_logsync;
834
835 if (cfg != NULL)
836 WT_RET(__wt_txn_config(session, cfg));
837
838 /*
839 * Allocate a snapshot if required. Named snapshot transactions already
840 * have an ID setup.
841 */
842 if (txn->isolation == WT_ISO_SNAPSHOT &&
843 !F_ISSET(txn, WT_TXN_NAMED_SNAPSHOT)) {
844 if (session->ncursors > 0)
845 WT_RET(__wt_session_copy_values(session));
846
847 /* Stall here if the cache is completely full. */
848 WT_RET(__wt_cache_eviction_check(session, false, true, NULL));
849
850 __wt_txn_get_snapshot(session);
851 }
852
853 F_SET(txn, WT_TXN_RUNNING);
854 if (F_ISSET(S2C(session), WT_CONN_READONLY))
855 F_SET(txn, WT_TXN_READONLY);
856
857 return (0);
858 }
859
860 /*
861 * __wt_txn_autocommit_check --
862 * If an auto-commit transaction is required, start one.
863 */
864 static inline int
__wt_txn_autocommit_check(WT_SESSION_IMPL * session)865 __wt_txn_autocommit_check(WT_SESSION_IMPL *session)
866 {
867 WT_TXN *txn;
868
869 txn = &session->txn;
870 if (F_ISSET(txn, WT_TXN_AUTOCOMMIT)) {
871 F_CLR(txn, WT_TXN_AUTOCOMMIT);
872 return (__wt_txn_begin(session, NULL));
873 }
874 return (0);
875 }
876
877 /*
878 * __wt_txn_idle_cache_check --
879 * If there is no transaction active in this thread and we haven't checked
880 * if the cache is full, do it now. If we have to block for eviction,
881 * this is the best time to do it.
882 */
883 static inline int
__wt_txn_idle_cache_check(WT_SESSION_IMPL * session)884 __wt_txn_idle_cache_check(WT_SESSION_IMPL *session)
885 {
886 WT_TXN *txn;
887 WT_TXN_STATE *txn_state;
888
889 txn = &session->txn;
890 txn_state = WT_SESSION_TXN_STATE(session);
891
892 /*
893 * Check the published snap_min because read-uncommitted never sets
894 * WT_TXN_HAS_SNAPSHOT. We don't have any transaction information at
895 * this point, so assume the transaction will be read-only. The dirty
896 * cache check will be performed when the transaction completes, if
897 * necessary.
898 */
899 if (F_ISSET(txn, WT_TXN_RUNNING) &&
900 !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->pinned_id == WT_TXN_NONE)
901 WT_RET(__wt_cache_eviction_check(session, false, true, NULL));
902
903 return (0);
904 }
905
906 /*
907 * __wt_txn_id_alloc --
908 * Allocate a new transaction ID.
909 */
910 static inline uint64_t
__wt_txn_id_alloc(WT_SESSION_IMPL * session,bool publish)911 __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
912 {
913 WT_TXN_GLOBAL *txn_global;
914 WT_TXN_STATE *txn_state;
915 uint64_t id;
916
917 txn_global = &S2C(session)->txn_global;
918 txn_state = WT_SESSION_TXN_STATE(session);
919
920 /*
921 * Allocating transaction IDs involves several steps.
922 *
923 * Firstly, we do an atomic increment to allocate a unique ID. The
924 * field we increment is not used anywhere else.
925 *
926 * Then we optionally publish the allocated ID into the global
927 * transaction table. It is critical that this becomes visible before
928 * the global current value moves past our ID, or some concurrent
929 * reader could get a snapshot that makes our changes visible before we
930 * commit.
931 *
932 * We want the global value to lead the allocated values, so that any
933 * allocated transaction ID eventually becomes globally visible. When
934 * there are no transactions running, the oldest_id will reach the
935 * global current ID, so we want post-increment semantics. Our atomic
936 * add primitive does pre-increment, so adjust the result here.
937 *
938 * We rely on atomic reads of the current ID to create snapshots, so
939 * for unlocked reads to be well defined, we must use an atomic
940 * increment here.
941 */
942 __wt_spin_lock(session, &txn_global->id_lock);
943 id = txn_global->current;
944
945 if (publish) {
946 session->txn.id = id;
947 WT_PUBLISH(txn_state->id, id);
948 }
949
950 /*
951 * Even though we are in a spinlock, readers are not. We rely on
952 * atomic reads of the current ID to create snapshots, so for unlocked
953 * reads to be well defined, we must use an atomic increment here.
954 */
955 (void)__wt_atomic_addv64(&txn_global->current, 1);
956 __wt_spin_unlock(session, &txn_global->id_lock);
957 return (id);
958 }
959
960 /*
961 * __wt_txn_id_check --
962 * A transaction is going to do an update, allocate a transaction ID.
963 */
964 static inline int
__wt_txn_id_check(WT_SESSION_IMPL * session)965 __wt_txn_id_check(WT_SESSION_IMPL *session)
966 {
967 WT_TXN *txn;
968
969 txn = &session->txn;
970
971 WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
972
973 if (F_ISSET(txn, WT_TXN_HAS_ID))
974 return (0);
975
976 /* If the transaction is idle, check that the cache isn't full. */
977 WT_RET(__wt_txn_idle_cache_check(session));
978
979 (void)__wt_txn_id_alloc(session, true);
980
981 /*
982 * If we have used 64-bits of transaction IDs, there is nothing
983 * more we can do.
984 */
985 if (txn->id == WT_TXN_ABORTED)
986 WT_RET_MSG(session, WT_ERROR, "out of transaction IDs");
987 F_SET(txn, WT_TXN_HAS_ID);
988
989 return (0);
990 }
991
992 /*
993 * __wt_txn_search_check --
994 * Check if the current transaction can search.
995 */
996 static inline int
__wt_txn_search_check(WT_SESSION_IMPL * session)997 __wt_txn_search_check(WT_SESSION_IMPL *session)
998 {
999 #ifdef HAVE_TIMESTAMPS
1000 WT_BTREE *btree;
1001 WT_TXN *txn;
1002
1003 txn = &session->txn;
1004 btree = S2BT(session);
1005 /*
1006 * If the user says a table should always use a read timestamp,
1007 * verify this transaction has one. Same if it should never have
1008 * a read timestamp.
1009 */
1010 if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS) &&
1011 !F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
1012 WT_RET_MSG(session, EINVAL, "read_timestamp required and "
1013 "none set on this transaction");
1014 if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER) &&
1015 F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
1016 WT_RET_MSG(session, EINVAL, "no read_timestamp required and "
1017 "timestamp set on this transaction");
1018 #endif
1019 WT_UNUSED(session);
1020 return (0);
1021 }
1022
1023 /*
1024 * __wt_txn_update_check --
1025 * Check if the current transaction can update an item.
1026 */
1027 static inline int
__wt_txn_update_check(WT_SESSION_IMPL * session,WT_UPDATE * upd)1028 __wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
1029 {
1030 WT_TXN *txn;
1031 bool ignore_prepare_set;
1032
1033 txn = &session->txn;
1034
1035 if (txn->isolation != WT_ISO_SNAPSHOT)
1036 return (0);
1037
1038 /*
1039 * Always include prepared transactions in this check: they are not
1040 * supposed to affect visibility for update operations.
1041 */
1042 ignore_prepare_set = F_ISSET(txn, WT_TXN_IGNORE_PREPARE);
1043 F_CLR(txn, WT_TXN_IGNORE_PREPARE);
1044 for (;upd != NULL && !__wt_txn_upd_visible(session, upd);
1045 upd = upd->next) {
1046 if (upd->txnid != WT_TXN_ABORTED) {
1047 if (ignore_prepare_set)
1048 F_SET(txn, WT_TXN_IGNORE_PREPARE);
1049 WT_STAT_CONN_INCR(session, txn_update_conflict);
1050 WT_STAT_DATA_INCR(session, txn_update_conflict);
1051 return (__wt_txn_rollback_required(session,
1052 "conflict between concurrent operations"));
1053 }
1054 }
1055
1056 if (ignore_prepare_set)
1057 F_SET(txn, WT_TXN_IGNORE_PREPARE);
1058 return (0);
1059 }
1060
1061 /*
1062 * __wt_txn_read_last --
1063 * Called when the last page for a session is released.
1064 */
1065 static inline void
__wt_txn_read_last(WT_SESSION_IMPL * session)1066 __wt_txn_read_last(WT_SESSION_IMPL *session)
1067 {
1068 WT_TXN *txn;
1069
1070 txn = &session->txn;
1071
1072 /*
1073 * Release the snap_min ID we put in the global table.
1074 *
1075 * If the isolation has been temporarily forced, don't touch the
1076 * snapshot here: it will be restored by WT_WITH_TXN_ISOLATION.
1077 */
1078 if ((!F_ISSET(txn, WT_TXN_RUNNING) ||
1079 txn->isolation != WT_ISO_SNAPSHOT) && txn->forced_iso == 0)
1080 __wt_txn_release_snapshot(session);
1081 }
1082
1083 /*
1084 * __wt_txn_cursor_op --
1085 * Called for each cursor operation.
1086 */
1087 static inline void
__wt_txn_cursor_op(WT_SESSION_IMPL * session)1088 __wt_txn_cursor_op(WT_SESSION_IMPL *session)
1089 {
1090 WT_TXN *txn;
1091 WT_TXN_GLOBAL *txn_global;
1092 WT_TXN_STATE *txn_state;
1093
1094 txn = &session->txn;
1095 txn_global = &S2C(session)->txn_global;
1096 txn_state = WT_SESSION_TXN_STATE(session);
1097
1098 /*
1099 * We are about to read data, which means we need to protect against
1100 * updates being freed from underneath this cursor. Read-uncommitted
1101 * isolation protects values by putting a transaction ID in the global
1102 * table to prevent any update that we are reading from being freed.
1103 * Other isolation levels get a snapshot to protect their reads.
1104 *
1105 * !!!
1106 * Note: We are updating the global table unprotected, so the global
1107 * oldest_id may move past our snap_min if a scan races with this value
1108 * being published. That said, read-uncommitted operations always see
1109 * the most recent update for each record that has not been aborted
1110 * regardless of the snap_min value published here. Even if there is a
1111 * race while publishing this ID, it prevents the oldest ID from moving
1112 * further forward, so that once a read-uncommitted cursor is
1113 * positioned on a value, it can't be freed.
1114 */
1115 if (txn->isolation == WT_ISO_READ_UNCOMMITTED) {
1116 if (txn_state->pinned_id == WT_TXN_NONE)
1117 txn_state->pinned_id = txn_global->last_running;
1118 if (txn_state->metadata_pinned == WT_TXN_NONE)
1119 txn_state->metadata_pinned = txn_state->pinned_id;
1120 } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
1121 __wt_txn_get_snapshot(session);
1122 }
1123
1124 /*
1125 * __wt_txn_am_oldest --
1126 * Am I the oldest transaction in the system?
1127 */
1128 static inline bool
__wt_txn_am_oldest(WT_SESSION_IMPL * session)1129 __wt_txn_am_oldest(WT_SESSION_IMPL *session)
1130 {
1131 WT_CONNECTION_IMPL *conn;
1132 WT_TXN *txn;
1133 WT_TXN_GLOBAL *txn_global;
1134 WT_TXN_STATE *s;
1135 uint64_t id;
1136 uint32_t i, session_cnt;
1137
1138 conn = S2C(session);
1139 txn = &session->txn;
1140 txn_global = &conn->txn_global;
1141
1142 if (txn->id == WT_TXN_NONE)
1143 return (false);
1144
1145 WT_ORDERED_READ(session_cnt, conn->session_cnt);
1146 for (i = 0, s = txn_global->states; i < session_cnt; i++, s++)
1147 if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, txn->id))
1148 return (false);
1149
1150 return (true);
1151 }
1152
1153 /*
1154 * __wt_txn_activity_check --
1155 * Check whether there are any running transactions.
1156 */
1157 static inline int
__wt_txn_activity_check(WT_SESSION_IMPL * session,bool * txn_active)1158 __wt_txn_activity_check(WT_SESSION_IMPL *session, bool *txn_active)
1159 {
1160 WT_TXN_GLOBAL *txn_global;
1161
1162 txn_global = &S2C(session)->txn_global;
1163
1164 /*
1165 * Ensure the oldest ID is as up to date as possible so we can use a
1166 * simple check to find if there are any running transactions.
1167 */
1168 WT_RET(__wt_txn_update_oldest(session,
1169 WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
1170
1171 *txn_active = (txn_global->oldest_id != txn_global->current ||
1172 txn_global->metadata_pinned != txn_global->current);
1173
1174 return (0);
1175 }
1176