1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 static inline int __wt_txn_id_check(WT_SESSION_IMPL *session);
10 static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);
11 
/*
 * WT_VISIBLE_TYPE --
 *	Result of an update visibility check.
 */
typedef enum {
	WT_VISIBLE_FALSE = 0,		/* Update is not visible */
	WT_VISIBLE_PREPARE = 1,		/* Update is a prepared update */
	WT_VISIBLE_TRUE = 2		/* Update is visible */
} WT_VISIBLE_TYPE;
17 #ifdef HAVE_TIMESTAMPS
18 /*
19  * __wt_txn_timestamp_flags --
20  *	Set txn related timestamp flags.
21  */
22 static inline void
__wt_txn_timestamp_flags(WT_SESSION_IMPL * session)23 __wt_txn_timestamp_flags(WT_SESSION_IMPL *session)
24 {
25 	WT_BTREE *btree;
26 
27 	if (session->dhandle == NULL)
28 		return;
29 	btree = S2BT(session);
30 	if (btree == NULL)
31 		return;
32 	if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS))
33 		F_SET(&session->txn, WT_TXN_TS_COMMIT_ALWAYS);
34 	if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_KEYS))
35 		F_SET(&session->txn, WT_TXN_TS_COMMIT_KEYS);
36 	if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER))
37 		F_SET(&session->txn, WT_TXN_TS_COMMIT_NEVER);
38 }
39 
40 #if WT_TIMESTAMP_SIZE == 8
/*
 * WT_WITH_TIMESTAMP_READLOCK --
 *	With 8-byte timestamps no lock is taken: the session and lock
 *	arguments are ignored and the expression is evaluated as written.
 */
#define	WT_WITH_TIMESTAMP_READLOCK(s, l, e)	e
42 
43 /*
44  * __wt_timestamp_cmp --
45  *	Compare two timestamps.
46  */
47 static inline int
__wt_timestamp_cmp(const wt_timestamp_t * ts1,const wt_timestamp_t * ts2)48 __wt_timestamp_cmp(const wt_timestamp_t *ts1, const wt_timestamp_t *ts2)
49 {
50 	return (ts1->val == ts2->val ? 0 : (ts1->val > ts2->val ? 1 : -1));
51 }
52 
53 /*
54  * __wt_timestamp_set --
55  *	Set a timestamp.
56  */
57 static inline void
__wt_timestamp_set(wt_timestamp_t * dest,const wt_timestamp_t * src)58 __wt_timestamp_set(wt_timestamp_t *dest, const wt_timestamp_t *src)
59 {
60 	dest->val = src->val;
61 }
62 
63 /*
64  * __wt_timestamp_subone --
65  *	Subtract one from a timestamp.
66  */
67 static inline void
__wt_timestamp_subone(wt_timestamp_t * ts)68 __wt_timestamp_subone(wt_timestamp_t *ts)
69 {
70 	ts->val -= 1;
71 }
72 
73 /*
74  * __wt_timestamp_iszero --
75  *	Check if a timestamp is equal to the special "zero" time.
76  */
77 static inline bool
__wt_timestamp_iszero(const wt_timestamp_t * ts)78 __wt_timestamp_iszero(const wt_timestamp_t *ts)
79 {
80 	return (ts->val == 0);
81 }
82 
83 /*
84  * __wt_timestamp_set_inf --
85  *	Set a timestamp to the maximum value.
86  */
87 static inline void
__wt_timestamp_set_inf(wt_timestamp_t * ts)88 __wt_timestamp_set_inf(wt_timestamp_t *ts)
89 {
90 	ts->val = UINT64_MAX;
91 }
92 
93 /*
94  * __wt_timestamp_set_zero --
95  *	Zero out a timestamp.
96  */
97 static inline void
__wt_timestamp_set_zero(wt_timestamp_t * ts)98 __wt_timestamp_set_zero(wt_timestamp_t *ts)
99 {
100 	ts->val = 0;
101 }
102 
103 #else /* WT_TIMESTAMP_SIZE != 8 */
104 
/*
 * WT_WITH_TIMESTAMP_READLOCK --
 *	Evaluate an expression between __wt_readlock and __wt_readunlock calls
 *	on the given lock.
 */
#define	WT_WITH_TIMESTAMP_READLOCK(s, l, e)	do {			\
	__wt_readlock((s), (l));					\
	e;								\
	__wt_readunlock((s), (l));					\
} while (0)
110 
111 /*
112  * __wt_timestamp_cmp --
113  *	Compare two timestamps.
114  */
115 static inline int
__wt_timestamp_cmp(const wt_timestamp_t * ts1,const wt_timestamp_t * ts2)116 __wt_timestamp_cmp(const wt_timestamp_t *ts1, const wt_timestamp_t *ts2)
117 {
118 	return (memcmp(ts1->ts, ts2->ts, WT_TIMESTAMP_SIZE));
119 }
120 
121 /*
122  * __wt_timestamp_set --
123  *	Set a timestamp.
124  */
125 static inline void
__wt_timestamp_set(wt_timestamp_t * dest,const wt_timestamp_t * src)126 __wt_timestamp_set(wt_timestamp_t *dest, const wt_timestamp_t *src)
127 {
128 	(void)memcpy(dest->ts, src->ts, WT_TIMESTAMP_SIZE);
129 }
130 
131 /*
132  * __wt_timestamp_iszero --
133  *	Check if a timestamp is equal to the special "zero" time.
134  */
135 static inline bool
__wt_timestamp_iszero(const wt_timestamp_t * ts)136 __wt_timestamp_iszero(const wt_timestamp_t *ts)
137 {
138 	static const wt_timestamp_t zero_timestamp;
139 
140 	return (memcmp(ts->ts, &zero_timestamp, WT_TIMESTAMP_SIZE) == 0);
141 }
142 
143 /*
144  * __wt_timestamp_set_inf --
145  *	Set a timestamp to the maximum value.
146  */
147 static inline void
__wt_timestamp_set_inf(wt_timestamp_t * ts)148 __wt_timestamp_set_inf(wt_timestamp_t *ts)
149 {
150 	memset(ts->ts, 0xff, WT_TIMESTAMP_SIZE);
151 }
152 
153 /*
154  * __wt_timestamp_set_zero --
155  *	Zero out a timestamp.
156  */
157 static inline void
__wt_timestamp_set_zero(wt_timestamp_t * ts)158 __wt_timestamp_set_zero(wt_timestamp_t *ts)
159 {
160 	memset(ts->ts, 0x00, WT_TIMESTAMP_SIZE);
161 }
162 
163 /*
164  * __wt_timestamp_subone --
165  *	Subtract one from a timestamp.
166  */
167 static inline void
__wt_timestamp_subone(wt_timestamp_t * ts)168 __wt_timestamp_subone(wt_timestamp_t *ts)
169 {
170 	uint8_t *tsb;
171 
172 	/*
173 	 * Complicated path for arbitrary-sized timestamps: start with the
174 	 * least significant byte, subtract one, continue to more significant
175 	 * bytes on underflow.
176 	 */
177 	for (tsb = ts->ts + WT_TIMESTAMP_SIZE - 1; tsb >= ts->ts; --tsb)
178 		if (--*tsb != 0xff)
179 			break;
180 }
181 
182 #endif /* WT_TIMESTAMP_SIZE == 8 */
183 
184 #else /* !HAVE_TIMESTAMPS */
185 
/*
 * Without timestamp support these operations expand to nothing; their
 * arguments are discarded without being evaluated.
 */
#define	__wt_timestamp_set(dest, src)
#define	__wt_timestamp_set_inf(ts)
#define	__wt_timestamp_set_zero(ts)
#define	__wt_timestamp_subone(ts)
#define	__wt_txn_clear_commit_timestamp(session)
#define	__wt_txn_clear_read_timestamp(session)
#define	__wt_txn_timestamp_flags(session)
193 
194 #endif /* HAVE_TIMESTAMPS */
195 
196 /*
197  * __txn_next_op --
198  *	Mark a WT_UPDATE object modified by the current transaction.
199  */
200 static inline int
__txn_next_op(WT_SESSION_IMPL * session,WT_TXN_OP ** opp)201 __txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
202 {
203 	WT_TXN *txn;
204 	WT_TXN_OP *op;
205 
206 	*opp = NULL;
207 
208 	txn = &session->txn;
209 
210 	/*
211 	 * We're about to perform an update.
212 	 * Make sure we have allocated a transaction ID.
213 	 */
214 	WT_RET(__wt_txn_id_check(session));
215 	WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_ID));
216 
217 	WT_RET(__wt_realloc_def(session, &txn->mod_alloc,
218 	    txn->mod_count + 1, &txn->mod));
219 
220 	op = &txn->mod[txn->mod_count++];
221 	WT_CLEAR(*op);
222 	op->btree = S2BT(session);
223 	(void)__wt_atomic_addi32(&session->dhandle->session_inuse, 1);
224 	*opp = op;
225 	return (0);
226 }
227 
228 /*
229  * __wt_txn_unmodify --
230  *	If threads race making updates, they may discard the last referenced
231  *	WT_UPDATE item while the transaction is still active.  This function
232  *	removes the last update item from the "log".
233  */
234 static inline void
__wt_txn_unmodify(WT_SESSION_IMPL * session)235 __wt_txn_unmodify(WT_SESSION_IMPL *session)
236 {
237 	WT_TXN *txn;
238 	WT_TXN_OP *op;
239 
240 	txn = &session->txn;
241 	if (F_ISSET(txn, WT_TXN_HAS_ID)) {
242 		WT_ASSERT(session, txn->mod_count > 0);
243 		--txn->mod_count;
244 		op = txn->mod + txn->mod_count;
245 		__wt_txn_op_free(session, op);
246 	}
247 }
248 
249 #ifdef HAVE_TIMESTAMPS
250 /*
251  * __wt_txn_op_commit_page_del --
252  *	Make the transaction ID and timestamp updates necessary to a ref that
253  *      was created by a fast delete truncate operation.
254  */
255 static inline void
__wt_txn_op_commit_page_del(WT_SESSION_IMPL * session,WT_REF * ref)256 __wt_txn_op_commit_page_del(WT_SESSION_IMPL *session, WT_REF *ref)
257 {
258 	WT_TXN *txn;
259 	WT_UPDATE **updp;
260 	uint32_t previous_state;
261 
262 	txn = &session->txn;
263 
264 	/* Avoid locking the page if a previous eviction already cleaned up. */
265 	if (ref->page_del->update_list == NULL)
266 		return;
267 
268 	/*
269 	 * Lock the ref to ensure we don't race with eviction freeing the
270 	 * page deleted update list.
271 	 */
272 	for (;; __wt_yield()) {
273 		previous_state = ref->state;
274 		if (previous_state != WT_REF_LOCKED &&
275 		    __wt_atomic_casv32(
276 		    &ref->state, previous_state, WT_REF_LOCKED))
277 			break;
278 	}
279 
280 	for (updp = ref->page_del->update_list;
281 	    updp != NULL && *updp != NULL; ++updp) {
282 		__wt_timestamp_set(&(*updp)->timestamp, &txn->commit_timestamp);
283 		if (F_ISSET(txn, WT_TXN_PREPARE))
284 			/*
285 			 * Holding the ref locked means we have exclusive
286 			 * access, so don't need to use the prepare locked
287 			 * transition state.
288 			 */
289 			(*updp)->prepare_state = WT_PREPARE_RESOLVED;
290 	}
291 
292 	/*
293 	 * Publish to ensure we don't let the page be evicted and the updates
294 	 * discarded before being written.
295 	 */
296 	WT_PUBLISH(ref->state, previous_state);
297 }
298 
299 /*
300  * __wt_txn_op_set_timestamp --
301  *	Decide whether to copy a commit timestamp into an update. If the op
302  *	structure doesn't have a populated update or ref field or in prepared
303  *      state there won't be any check for an existing timestamp.
304  */
305 static inline void
__wt_txn_op_set_timestamp(WT_SESSION_IMPL * session,WT_TXN_OP * op)306 __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
307 {
308 	WT_TXN *txn;
309 	WT_UPDATE *upd;
310 	wt_timestamp_t *timestamp;
311 
312 	txn = &session->txn;
313 
314 	/*
315 	 * Updates in the metadata never get timestamps (either now or at
316 	 * commit): metadata cannot be read at a point in time, only the most
317 	 * recently committed data matches files on disk.
318 	 */
319 	if (WT_IS_METADATA(op->btree->dhandle) ||
320 	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
321 		return;
322 
323 	if (F_ISSET(txn, WT_TXN_PREPARE)) {
324 		if (op->type == WT_TXN_OP_REF_DELETE)
325 			__wt_txn_op_commit_page_del(session, op->u.ref);
326 		else {
327 			/*
328 			 * In case of a prepared transaction, the order of
329 			 * modification of the prepare timestamp to the commit
330 			 * timestamp in the update chain will not affect the
331 			 * data visibility, a reader will encounter a prepared
332 			 * update resulting in prepare conflict.
333 			 *
334 			 * As updating timestamp might not be an atomic
335 			 * operation, we will manage using state.
336 			 */
337 			upd = op->u.op_upd;
338 			upd->prepare_state = WT_PREPARE_LOCKED;
339 			WT_WRITE_BARRIER();
340 			__wt_timestamp_set(
341 			    &upd->timestamp, &txn->commit_timestamp);
342 			WT_PUBLISH(upd->prepare_state, WT_PREPARE_RESOLVED);
343 		}
344 	} else {
345 		/*
346 		 * The timestamp is in the page deleted structure for
347 		 * truncates, or in the update for other operations.
348 		 */
349 		timestamp = op->type == WT_TXN_OP_REF_DELETE ?
350 		    &op->u.ref->page_del->timestamp : &op->u.op_upd->timestamp;
351 		if (__wt_timestamp_iszero(timestamp))
352 			__wt_timestamp_set(timestamp, &txn->commit_timestamp);
353 	}
354 }
355 #endif
356 
357 /*
358  * __wt_txn_modify --
359  *	Mark a WT_UPDATE object modified by the current transaction.
360  */
361 static inline int
__wt_txn_modify(WT_SESSION_IMPL * session,WT_CURSOR_BTREE * cbt,WT_UPDATE * upd)362 __wt_txn_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
363 {
364 	WT_BTREE *btree;
365 	WT_ITEM key;
366 	WT_TXN *txn;
367 	WT_TXN_OP *op;
368 
369 	btree = S2BT(session);
370 	txn = &session->txn;
371 
372 	if (F_ISSET(txn, WT_TXN_READONLY))
373 		WT_RET_MSG(session, WT_ROLLBACK,
374 		    "Attempt to update in a read-only transaction");
375 
376 	WT_RET(__txn_next_op(session, &op));
377 	if (F_ISSET(session, WT_SESSION_LOGGING_INMEM)) {
378 		if (btree->type == BTREE_ROW)
379 			op->type = WT_TXN_OP_INMEM_ROW;
380 		else
381 			op->type = WT_TXN_OP_INMEM_COL;
382 	} else {
383 		if (btree->type == BTREE_ROW)
384 			op->type = WT_TXN_OP_BASIC_ROW;
385 		else
386 			op->type = WT_TXN_OP_BASIC_COL;
387 	}
388 	op->u.op_upd = upd;
389 	upd->txnid = session->txn.id;
390 
391 #ifdef HAVE_TIMESTAMPS
392 	__wt_txn_op_set_timestamp(session, op);
393 
394 	/*
395 	 * TODO:
396 	 * Following code block is under #ifdef temporarily, to avoid
397 	 * performance penalty. This block will be enabled, once an alternative
398 	 * is figured out, or we have to live with this penalty.
399 	 */
400 #ifdef HAVE_LONG_RUNNING_PREPARE
401 	/*
402 	 * Transaction operation with timestamp cannot be prepared.
403 	 * Copy the key into the transaction op structure, so the update
404 	 * can be evicted to lookaside, and we have a chance of finding it
405 	 * again. This is only possible for transactions that are in the
406 	 * prepared state, but we don't know at this stage if a transaction
407 	 * will be prepared or not.
408 	 */
409 	if (!WT_SESSION_IS_CHECKPOINT(session) &&
410 	    !F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
411 	    !WT_IS_METADATA(op->btree->dhandle)) {
412 		/*
413 		 * Store the key, to search the prepared update in case of
414 		 * prepared transaction.
415 		 */
416 		if (btree->type == BTREE_ROW) {
417 			WT_RET(__wt_cursor_get_raw_key(&cbt->iface, &key));
418 			WT_RET(__wt_buf_set(session,
419 			    &op->u.op_row.key, key.data, key.size));
420 		} else
421 			op->u.op_col.recno = cbt->recno;
422 	}
423 #endif
424 
425 #endif
426 	WT_UNUSED(btree);
427 	WT_UNUSED(cbt);
428 	WT_UNUSED(key);
429 
430 	return (0);
431 }
432 
433 /*
434  * __wt_txn_modify_page_delete --
435  *	Remember a page truncated by the current transaction.
436  */
437 static inline int
__wt_txn_modify_page_delete(WT_SESSION_IMPL * session,WT_REF * ref)438 __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref)
439 {
440 	WT_DECL_RET;
441 	WT_TXN *txn;
442 	WT_TXN_OP *op;
443 
444 	txn = &session->txn;
445 
446 	WT_RET(__txn_next_op(session, &op));
447 	op->type = WT_TXN_OP_REF_DELETE;
448 
449 	op->u.ref = ref;
450 	ref->page_del->txnid = txn->id;
451 #ifdef HAVE_TIMESTAMPS
452 	__wt_txn_op_set_timestamp(session, op);
453 #endif
454 
455 	WT_ERR(__wt_txn_log_op(session, NULL));
456 	return (0);
457 
458 err:	__wt_txn_unmodify(session);
459 	return (ret);
460 }
461 
462 /*
463  * __wt_txn_oldest_id --
464  *	Return the oldest transaction ID that has to be kept for the current
465  *	tree.
466  */
467 static inline uint64_t
__wt_txn_oldest_id(WT_SESSION_IMPL * session)468 __wt_txn_oldest_id(WT_SESSION_IMPL *session)
469 {
470 	WT_BTREE *btree;
471 	WT_TXN_GLOBAL *txn_global;
472 	uint64_t checkpoint_pinned, oldest_id;
473 	bool include_checkpoint_txn;
474 
475 	txn_global = &S2C(session)->txn_global;
476 	btree = S2BT_SAFE(session);
477 
478 	/*
479 	 * The metadata is tracked specially because of optimizations for
480 	 * checkpoints.
481 	 */
482 	if (session->dhandle != NULL && WT_IS_METADATA(session->dhandle))
483 		return (txn_global->metadata_pinned);
484 
485 	/*
486 	 * Take a local copy of these IDs in case they are updated while we are
487 	 * checking visibility.
488 	 */
489 	oldest_id = txn_global->oldest_id;
490 	include_checkpoint_txn = btree == NULL ||
491 	    (!F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
492 	    btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT));
493 	if (!include_checkpoint_txn)
494 		return (oldest_id);
495 
496 	/*
497 	 * The read of the transaction ID pinned by a checkpoint needs to be
498 	 * carefully ordered: if a checkpoint is starting and we have to start
499 	 * checking the pinned ID, we take the minimum of it with the oldest
500 	 * ID, which is what we want.
501 	 */
502 	WT_READ_BARRIER();
503 
504 	/*
505 	 * Checkpoint transactions often fall behind ordinary application
506 	 * threads.  Take special effort to not keep changes pinned in cache
507 	 * if they are only required for the checkpoint and it has already
508 	 * seen them.
509 	 *
510 	 * If there is no active checkpoint or this handle is up to date with
511 	 * the active checkpoint then it's safe to ignore the checkpoint ID in
512 	 * the visibility check.
513 	 */
514 	checkpoint_pinned = txn_global->checkpoint_state.pinned_id;
515 	if (checkpoint_pinned == WT_TXN_NONE ||
516 	    WT_TXNID_LT(oldest_id, checkpoint_pinned))
517 		return (oldest_id);
518 
519 	return (checkpoint_pinned);
520 }
521 
522 #ifdef HAVE_TIMESTAMPS
523 /*
524  * __wt_txn_pinned_timestamp --
525  *	Get the first timestamp that has to be kept for the current tree.
526  */
527 static inline void
__wt_txn_pinned_timestamp(WT_SESSION_IMPL * session,wt_timestamp_t * pinned_tsp)528 __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
529 {
530 	WT_BTREE *btree;
531 	WT_TXN_GLOBAL *txn_global;
532 	wt_timestamp_t checkpoint_ts, pinned_ts;
533 	bool include_checkpoint_txn;
534 
535 	btree = S2BT_SAFE(session);
536 	txn_global = &S2C(session)->txn_global;
537 
538 	WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
539 	    __wt_timestamp_set(&pinned_ts, &txn_global->pinned_timestamp));
540 	__wt_timestamp_set(pinned_tsp, &pinned_ts);
541 
542 	/*
543 	 * Checkpoint transactions often fall behind ordinary application
544 	 * threads.  Take special effort to not keep changes pinned in cache if
545 	 * they are only required for the checkpoint and it has already seen
546 	 * them.
547 	 *
548 	 * If there is no active checkpoint or this handle is up to date with
549 	 * the active checkpoint then it's safe to ignore the checkpoint ID in
550 	 * the visibility check.
551 	 */
552 	include_checkpoint_txn = btree == NULL ||
553 	    (!F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
554 	    btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT));
555 	if (!include_checkpoint_txn)
556 		return;
557 
558 	/*
559 	 * The read of the timestamp pinned by a checkpoint needs to be
560 	 * carefully ordered: if a checkpoint is starting and we have to use
561 	 * the checkpoint timestamp, we take the minimum of it with the oldest
562 	 * timestamp, which is what we want.
563 	 */
564 	WT_READ_BARRIER();
565 
566 	WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
567 	    __wt_timestamp_set(&checkpoint_ts,
568 	    &txn_global->checkpoint_timestamp));
569 
570 	if (!__wt_timestamp_iszero(&checkpoint_ts) &&
571 	    __wt_timestamp_cmp(&checkpoint_ts, &pinned_ts) < 0)
572 		__wt_timestamp_set(pinned_tsp, &checkpoint_ts);
573 }
574 #endif
575 
576 /*
577  * __txn_visible_all_id --
578  *	Check if a given transaction ID is "globally visible".	This is, if
579  *	all sessions in the system will see the transaction ID including the
580  *	ID that belongs to a running checkpoint.
581  */
582 static inline bool
__txn_visible_all_id(WT_SESSION_IMPL * session,uint64_t id)583 __txn_visible_all_id(WT_SESSION_IMPL *session, uint64_t id)
584 {
585 	uint64_t oldest_id;
586 
587 	oldest_id = __wt_txn_oldest_id(session);
588 
589 	return (WT_TXNID_LT(id, oldest_id));
590 }
591 
592 /*
593  * __wt_txn_visible_all --
594  *	Check if a given transaction is "globally visible". This is, if all
595  *	sessions in the system will see the transaction ID including the ID
596  *	that belongs to a running checkpoint.
597  */
598 static inline bool
__wt_txn_visible_all(WT_SESSION_IMPL * session,uint64_t id,const wt_timestamp_t * timestamp)599 __wt_txn_visible_all(
600     WT_SESSION_IMPL *session, uint64_t id, const wt_timestamp_t *timestamp)
601 {
602 	if (!__txn_visible_all_id(session, id))
603 		return (false);
604 
605 #ifdef HAVE_TIMESTAMPS
606 	{
607 	wt_timestamp_t pinned_ts;
608 
609 	/* Timestamp check. */
610 	if (timestamp == NULL || __wt_timestamp_iszero(timestamp))
611 		return (true);
612 
613 	/*
614 	 * If no oldest timestamp has been supplied, updates have to stay in
615 	 * cache until we are shutting down.
616 	 */
617 	if (!S2C(session)->txn_global.has_pinned_timestamp)
618 		return (F_ISSET(S2C(session), WT_CONN_CLOSING));
619 
620 	__wt_txn_pinned_timestamp(session, &pinned_ts);
621 	return (__wt_timestamp_cmp(timestamp, &pinned_ts) <= 0);
622 	}
623 #else
624 	WT_UNUSED(timestamp);
625 	return (true);
626 #endif
627 }
628 
629 /*
630  * __wt_txn_upd_visible_all --
631  *	Is the given update visible to all (possible) readers?
632  */
633 static inline bool
__wt_txn_upd_visible_all(WT_SESSION_IMPL * session,WT_UPDATE * upd)634 __wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd)
635 {
636 	if (upd->prepare_state == WT_PREPARE_LOCKED ||
637 	    upd->prepare_state == WT_PREPARE_INPROGRESS)
638 		return (false);
639 
640 	return (__wt_txn_visible_all(
641 	    session, upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp)));
642 }
643 
644 /*
645  * __txn_visible_id --
646  *	Can the current transaction see the given ID?
647  */
648 static inline bool
__txn_visible_id(WT_SESSION_IMPL * session,uint64_t id)649 __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
650 {
651 	WT_TXN *txn;
652 	bool found;
653 
654 	txn = &session->txn;
655 
656 	/* Changes with no associated transaction are always visible. */
657 	if (id == WT_TXN_NONE)
658 		return (true);
659 
660 	/* Nobody sees the results of aborted transactions. */
661 	if (id == WT_TXN_ABORTED)
662 		return (false);
663 
664 	/* Read-uncommitted transactions see all other changes. */
665 	if (txn->isolation == WT_ISO_READ_UNCOMMITTED)
666 		return (true);
667 
668 	/*
669 	 * If we don't have a transactional snapshot, only make stable updates
670 	 * visible.
671 	 */
672 	if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
673 		return (__txn_visible_all_id(session, id));
674 
675 	/* Transactions see their own changes. */
676 	if (id == txn->id)
677 		return (true);
678 
679 	/*
680 	 * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is
681 	 * not the result of a concurrent transaction, that is, if was
682 	 * committed before the snapshot was taken.
683 	 *
684 	 * The order here is important: anything newer than the maximum ID we
685 	 * saw when taking the snapshot should be invisible, even if the
686 	 * snapshot is empty.
687 	 */
688 	if (WT_TXNID_LE(txn->snap_max, id))
689 		return (false);
690 	if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min))
691 		return (true);
692 
693 	WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found);
694 	return (!found);
695 }
696 
697 /*
698  * __wt_txn_visible --
699  *	Can the current transaction see the given ID / timestamp?
700  */
701 static inline bool
__wt_txn_visible(WT_SESSION_IMPL * session,uint64_t id,const wt_timestamp_t * timestamp)702 __wt_txn_visible(
703     WT_SESSION_IMPL *session, uint64_t id, const wt_timestamp_t *timestamp)
704 {
705 	if (!__txn_visible_id(session, id))
706 		return (false);
707 
708 	/* Transactions read their writes, regardless of timestamps. */
709 	if (F_ISSET(&session->txn, WT_TXN_HAS_ID) && id == session->txn.id)
710 		return (true);
711 
712 #ifdef HAVE_TIMESTAMPS
713 	{
714 	WT_TXN *txn = &session->txn;
715 
716 	/* Timestamp check. */
717 	if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) || timestamp == NULL)
718 		return (true);
719 
720 	return (__wt_timestamp_cmp(timestamp, &txn->read_timestamp) <= 0);
721 	}
722 #else
723 	WT_UNUSED(timestamp);
724 	return (true);
725 #endif
726 }
727 
728 /*
729  * __wt_txn_upd_visible_type --
730  *      Visible type of given update for the current transaction.
731  */
732 static inline WT_VISIBLE_TYPE
__wt_txn_upd_visible_type(WT_SESSION_IMPL * session,WT_UPDATE * upd)733 __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
734 {
735 	uint8_t prepare_state, previous_state;
736 	bool upd_visible;
737 
738 	for (;;__wt_yield()) {
739 		/* Prepare state change is in progress, yield and try again. */
740 		WT_ORDERED_READ(prepare_state, upd->prepare_state);
741 		if (prepare_state == WT_PREPARE_LOCKED)
742 			continue;
743 
744 		upd_visible = __wt_txn_visible(
745 		    session, upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp));
746 
747 		/*
748 		 * The visibility check is only valid if the update does not
749 		 * change state.  If the state does change, recheck visibility.
750 		 */
751 		previous_state = prepare_state;
752 		WT_ORDERED_READ(prepare_state, upd->prepare_state);
753 		if (previous_state == prepare_state)
754 			break;
755 
756 		WT_STAT_CONN_INCR(session, prepared_transition_blocked_page);
757 	}
758 
759 	if (!upd_visible)
760 		return (WT_VISIBLE_FALSE);
761 
762 	/* Ignore the prepared update, if transaction configuration says so. */
763 	if (prepare_state == WT_PREPARE_INPROGRESS)
764 		return (F_ISSET(&session->txn, WT_TXN_IGNORE_PREPARE) ?
765 		    WT_VISIBLE_FALSE : WT_VISIBLE_PREPARE);
766 
767 	return (WT_VISIBLE_TRUE);
768 }
769 
770 /*
771  * __wt_txn_upd_visible --
772  *	Can the current transaction see the given update.
773  */
774 static inline bool
__wt_txn_upd_visible(WT_SESSION_IMPL * session,WT_UPDATE * upd)775 __wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd)
776 {
777 	return (__wt_txn_upd_visible_type(session, upd) == WT_VISIBLE_TRUE);
778 }
779 
780 /*
781  * __wt_txn_read --
782  *	Get the first visible update in a list (or NULL if none are visible).
783  */
784 static inline int
__wt_txn_read(WT_SESSION_IMPL * session,WT_UPDATE * upd,WT_UPDATE ** updp)785 __wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **updp)
786 {
787     static WT_UPDATE tombstone = {
788 		.txnid = WT_TXN_NONE, .type = WT_UPDATE_TOMBSTONE
789 	};
790     WT_VISIBLE_TYPE upd_visible;
791     uint8_t type;
792     bool skipped_birthmark;
793 
794     *updp = NULL;
795 
796     type = WT_UPDATE_INVALID; /* [-Wconditional-uninitialized] */
797     for (skipped_birthmark = false; upd != NULL; upd = upd->next) {
798 	WT_ORDERED_READ(type, upd->type);
799 
800 	/* Skip reserved place-holders, they're never visible. */
801 	if (type != WT_UPDATE_RESERVE) {
802 	    upd_visible = __wt_txn_upd_visible_type(session, upd);
803 	    if (upd_visible == WT_VISIBLE_TRUE)
804 		break;
805 	    if (upd_visible == WT_VISIBLE_PREPARE)
806 		return (WT_PREPARE_CONFLICT);
807 	}
808 	/* An invisible birthmark is equivalent to a tombstone. */
809 	if (type == WT_UPDATE_BIRTHMARK)
810 	    skipped_birthmark = true;
811     }
812 
813     if (upd == NULL && skipped_birthmark) {
814 	upd = &tombstone;
815 	type = upd->type;
816     }
817 
818     *updp = upd == NULL || type == WT_UPDATE_BIRTHMARK ? NULL : upd;
819     return (0);
820 }
821 
822 /*
823  * __wt_txn_begin --
824  *	Begin a transaction.
825  */
826 static inline int
__wt_txn_begin(WT_SESSION_IMPL * session,const char * cfg[])827 __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
828 {
829 	WT_TXN *txn;
830 
831 	txn = &session->txn;
832 	txn->isolation = session->isolation;
833 	txn->txn_logsync = S2C(session)->txn_logsync;
834 
835 	if (cfg != NULL)
836 		WT_RET(__wt_txn_config(session, cfg));
837 
838 	/*
839 	 * Allocate a snapshot if required. Named snapshot transactions already
840 	 * have an ID setup.
841 	 */
842 	if (txn->isolation == WT_ISO_SNAPSHOT &&
843 	    !F_ISSET(txn, WT_TXN_NAMED_SNAPSHOT)) {
844 		if (session->ncursors > 0)
845 			WT_RET(__wt_session_copy_values(session));
846 
847 		/* Stall here if the cache is completely full. */
848 		WT_RET(__wt_cache_eviction_check(session, false, true, NULL));
849 
850 		__wt_txn_get_snapshot(session);
851 	}
852 
853 	F_SET(txn, WT_TXN_RUNNING);
854 	if (F_ISSET(S2C(session), WT_CONN_READONLY))
855 		F_SET(txn, WT_TXN_READONLY);
856 
857 	return (0);
858 }
859 
860 /*
861  * __wt_txn_autocommit_check --
862  *	If an auto-commit transaction is required, start one.
863  */
864 static inline int
__wt_txn_autocommit_check(WT_SESSION_IMPL * session)865 __wt_txn_autocommit_check(WT_SESSION_IMPL *session)
866 {
867 	WT_TXN *txn;
868 
869 	txn = &session->txn;
870 	if (F_ISSET(txn, WT_TXN_AUTOCOMMIT)) {
871 		F_CLR(txn, WT_TXN_AUTOCOMMIT);
872 		return (__wt_txn_begin(session, NULL));
873 	}
874 	return (0);
875 }
876 
877 /*
878  * __wt_txn_idle_cache_check --
879  *	If there is no transaction active in this thread and we haven't checked
880  *	if the cache is full, do it now.  If we have to block for eviction,
881  *	this is the best time to do it.
882  */
883 static inline int
__wt_txn_idle_cache_check(WT_SESSION_IMPL * session)884 __wt_txn_idle_cache_check(WT_SESSION_IMPL *session)
885 {
886 	WT_TXN *txn;
887 	WT_TXN_STATE *txn_state;
888 
889 	txn = &session->txn;
890 	txn_state = WT_SESSION_TXN_STATE(session);
891 
892 	/*
893 	 * Check the published snap_min because read-uncommitted never sets
894 	 * WT_TXN_HAS_SNAPSHOT.  We don't have any transaction information at
895 	 * this point, so assume the transaction will be read-only.  The dirty
896 	 * cache check will be performed when the transaction completes, if
897 	 * necessary.
898 	 */
899 	if (F_ISSET(txn, WT_TXN_RUNNING) &&
900 	    !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->pinned_id == WT_TXN_NONE)
901 		WT_RET(__wt_cache_eviction_check(session, false, true, NULL));
902 
903 	return (0);
904 }
905 
906 /*
907  * __wt_txn_id_alloc --
908  *	Allocate a new transaction ID.
909  */
910 static inline uint64_t
__wt_txn_id_alloc(WT_SESSION_IMPL * session,bool publish)911 __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
912 {
913 	WT_TXN_GLOBAL *txn_global;
914 	WT_TXN_STATE *txn_state;
915 	uint64_t id;
916 
917 	txn_global = &S2C(session)->txn_global;
918 	txn_state = WT_SESSION_TXN_STATE(session);
919 
920 	/*
921 	 * Allocating transaction IDs involves several steps.
922 	 *
923 	 * Firstly, we do an atomic increment to allocate a unique ID.  The
924 	 * field we increment is not used anywhere else.
925 	 *
926 	 * Then we optionally publish the allocated ID into the global
927 	 * transaction table.  It is critical that this becomes visible before
928 	 * the global current value moves past our ID, or some concurrent
929 	 * reader could get a snapshot that makes our changes visible before we
930 	 * commit.
931 	 *
932 	 * We want the global value to lead the allocated values, so that any
933 	 * allocated transaction ID eventually becomes globally visible.  When
934 	 * there are no transactions running, the oldest_id will reach the
935 	 * global current ID, so we want post-increment semantics.  Our atomic
936 	 * add primitive does pre-increment, so adjust the result here.
937 	 *
938 	 * We rely on atomic reads of the current ID to create snapshots, so
939 	 * for unlocked reads to be well defined, we must use an atomic
940 	 * increment here.
941 	 */
942 	__wt_spin_lock(session, &txn_global->id_lock);
943 	id = txn_global->current;
944 
945 	if (publish) {
946 		session->txn.id = id;
947 		WT_PUBLISH(txn_state->id, id);
948 	}
949 
950 	/*
951 	 * Even though we are in a spinlock, readers are not.  We rely on
952 	 * atomic reads of the current ID to create snapshots, so for unlocked
953 	 * reads to be well defined, we must use an atomic increment here.
954 	 */
955 	(void)__wt_atomic_addv64(&txn_global->current, 1);
956 	__wt_spin_unlock(session, &txn_global->id_lock);
957 	return (id);
958 }
959 
960 /*
961  * __wt_txn_id_check --
962  *	A transaction is going to do an update, allocate a transaction ID.
963  */
964 static inline int
__wt_txn_id_check(WT_SESSION_IMPL * session)965 __wt_txn_id_check(WT_SESSION_IMPL *session)
966 {
967 	WT_TXN *txn;
968 
969 	txn = &session->txn;
970 
971 	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
972 
973 	if (F_ISSET(txn, WT_TXN_HAS_ID))
974 		return (0);
975 
976 	/* If the transaction is idle, check that the cache isn't full. */
977 	WT_RET(__wt_txn_idle_cache_check(session));
978 
979 	(void)__wt_txn_id_alloc(session, true);
980 
981 	/*
982 	 * If we have used 64-bits of transaction IDs, there is nothing
983 	 * more we can do.
984 	 */
985 	if (txn->id == WT_TXN_ABORTED)
986 		WT_RET_MSG(session, WT_ERROR, "out of transaction IDs");
987 	F_SET(txn, WT_TXN_HAS_ID);
988 
989 	return (0);
990 }
991 
992 /*
993  * __wt_txn_search_check --
994  *	Check if the current transaction can search.
995  */
996 static inline int
__wt_txn_search_check(WT_SESSION_IMPL * session)997 __wt_txn_search_check(WT_SESSION_IMPL *session)
998 {
999 #ifdef  HAVE_TIMESTAMPS
1000 	WT_BTREE *btree;
1001 	WT_TXN *txn;
1002 
1003 	txn = &session->txn;
1004 	btree = S2BT(session);
1005 	/*
1006 	 * If the user says a table should always use a read timestamp,
1007 	 * verify this transaction has one.  Same if it should never have
1008 	 * a read timestamp.
1009 	 */
1010 	if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS) &&
1011 	    !F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
1012 		WT_RET_MSG(session, EINVAL, "read_timestamp required and "
1013 		    "none set on this transaction");
1014 	if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER) &&
1015 	    F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
1016 		WT_RET_MSG(session, EINVAL, "no read_timestamp required and "
1017 		    "timestamp set on this transaction");
1018 #endif
1019 	WT_UNUSED(session);
1020 	return (0);
1021 }
1022 
1023 /*
1024  * __wt_txn_update_check --
1025  *	Check if the current transaction can update an item.
1026  */
1027 static inline int
__wt_txn_update_check(WT_SESSION_IMPL * session,WT_UPDATE * upd)1028 __wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
1029 {
1030 	WT_TXN *txn;
1031 	bool ignore_prepare_set;
1032 
1033 	txn = &session->txn;
1034 
1035 	if (txn->isolation != WT_ISO_SNAPSHOT)
1036 		return (0);
1037 
1038 	/*
1039 	 * Always include prepared transactions in this check: they are not
1040 	 * supposed to affect visibility for update operations.
1041 	 */
1042 	ignore_prepare_set = F_ISSET(txn, WT_TXN_IGNORE_PREPARE);
1043 	F_CLR(txn, WT_TXN_IGNORE_PREPARE);
1044 	for (;upd != NULL && !__wt_txn_upd_visible(session, upd);
1045 	    upd = upd->next) {
1046 		if (upd->txnid != WT_TXN_ABORTED) {
1047 			if (ignore_prepare_set)
1048 				F_SET(txn, WT_TXN_IGNORE_PREPARE);
1049 			WT_STAT_CONN_INCR(session, txn_update_conflict);
1050 			WT_STAT_DATA_INCR(session, txn_update_conflict);
1051 			return (__wt_txn_rollback_required(session,
1052 			    "conflict between concurrent operations"));
1053 		}
1054 	}
1055 
1056 	if (ignore_prepare_set)
1057 		F_SET(txn, WT_TXN_IGNORE_PREPARE);
1058 	return (0);
1059 }
1060 
1061 /*
1062  * __wt_txn_read_last --
1063  *	Called when the last page for a session is released.
1064  */
1065 static inline void
__wt_txn_read_last(WT_SESSION_IMPL * session)1066 __wt_txn_read_last(WT_SESSION_IMPL *session)
1067 {
1068 	WT_TXN *txn;
1069 
1070 	txn = &session->txn;
1071 
1072 	/*
1073 	 * Release the snap_min ID we put in the global table.
1074 	 *
1075 	 * If the isolation has been temporarily forced, don't touch the
1076 	 * snapshot here: it will be restored by WT_WITH_TXN_ISOLATION.
1077 	 */
1078 	if ((!F_ISSET(txn, WT_TXN_RUNNING) ||
1079 	    txn->isolation != WT_ISO_SNAPSHOT) && txn->forced_iso == 0)
1080 		__wt_txn_release_snapshot(session);
1081 }
1082 
1083 /*
1084  * __wt_txn_cursor_op --
1085  *	Called for each cursor operation.
1086  */
1087 static inline void
__wt_txn_cursor_op(WT_SESSION_IMPL * session)1088 __wt_txn_cursor_op(WT_SESSION_IMPL *session)
1089 {
1090 	WT_TXN *txn;
1091 	WT_TXN_GLOBAL *txn_global;
1092 	WT_TXN_STATE *txn_state;
1093 
1094 	txn = &session->txn;
1095 	txn_global = &S2C(session)->txn_global;
1096 	txn_state = WT_SESSION_TXN_STATE(session);
1097 
1098 	/*
1099 	 * We are about to read data, which means we need to protect against
1100 	 * updates being freed from underneath this cursor. Read-uncommitted
1101 	 * isolation protects values by putting a transaction ID in the global
1102 	 * table to prevent any update that we are reading from being freed.
1103 	 * Other isolation levels get a snapshot to protect their reads.
1104 	 *
1105 	 * !!!
1106 	 * Note:  We are updating the global table unprotected, so the global
1107 	 * oldest_id may move past our snap_min if a scan races with this value
1108 	 * being published. That said, read-uncommitted operations always see
1109 	 * the most recent update for each record that has not been aborted
1110 	 * regardless of the snap_min value published here.  Even if there is a
1111 	 * race while publishing this ID, it prevents the oldest ID from moving
1112 	 * further forward, so that once a read-uncommitted cursor is
1113 	 * positioned on a value, it can't be freed.
1114 	 */
1115 	if (txn->isolation == WT_ISO_READ_UNCOMMITTED) {
1116 		if (txn_state->pinned_id == WT_TXN_NONE)
1117 			txn_state->pinned_id = txn_global->last_running;
1118 		if (txn_state->metadata_pinned == WT_TXN_NONE)
1119 			txn_state->metadata_pinned = txn_state->pinned_id;
1120 	} else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
1121 		__wt_txn_get_snapshot(session);
1122 }
1123 
1124 /*
1125  * __wt_txn_am_oldest --
1126  *	Am I the oldest transaction in the system?
1127  */
1128 static inline bool
__wt_txn_am_oldest(WT_SESSION_IMPL * session)1129 __wt_txn_am_oldest(WT_SESSION_IMPL *session)
1130 {
1131 	WT_CONNECTION_IMPL *conn;
1132 	WT_TXN *txn;
1133 	WT_TXN_GLOBAL *txn_global;
1134 	WT_TXN_STATE *s;
1135 	uint64_t id;
1136 	uint32_t i, session_cnt;
1137 
1138 	conn = S2C(session);
1139 	txn = &session->txn;
1140 	txn_global = &conn->txn_global;
1141 
1142 	if (txn->id == WT_TXN_NONE)
1143 		return (false);
1144 
1145 	WT_ORDERED_READ(session_cnt, conn->session_cnt);
1146 	for (i = 0, s = txn_global->states; i < session_cnt; i++, s++)
1147 		if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, txn->id))
1148 			return (false);
1149 
1150 	return (true);
1151 }
1152 
1153 /*
1154  * __wt_txn_activity_check --
1155  *	Check whether there are any running transactions.
1156  */
1157 static inline int
__wt_txn_activity_check(WT_SESSION_IMPL * session,bool * txn_active)1158 __wt_txn_activity_check(WT_SESSION_IMPL *session, bool *txn_active)
1159 {
1160 	WT_TXN_GLOBAL *txn_global;
1161 
1162 	txn_global = &S2C(session)->txn_global;
1163 
1164 	/*
1165 	 * Ensure the oldest ID is as up to date as possible so we can use a
1166 	 * simple check to find if there are any running transactions.
1167 	 */
1168 	WT_RET(__wt_txn_update_oldest(session,
1169 	    WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
1170 
1171 	*txn_active = (txn_global->oldest_id != txn_global->current ||
1172 	    txn_global->metadata_pinned != txn_global->current);
1173 
1174 	return (0);
1175 }
1176