1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 #ifdef HAVE_TIMESTAMPS
12 /*
13  * __wt_timestamp_to_hex_string --
14  *	Convert a timestamp to hex string representation.
15  */
16 int
__wt_timestamp_to_hex_string(WT_SESSION_IMPL * session,char * hex_timestamp,const wt_timestamp_t * ts_src)17 __wt_timestamp_to_hex_string(
18     WT_SESSION_IMPL *session, char *hex_timestamp, const wt_timestamp_t *ts_src)
19 {
20 	wt_timestamp_t ts;
21 
22 	__wt_timestamp_set(&ts, ts_src);
23 
24 	if (__wt_timestamp_iszero(&ts)) {
25 		hex_timestamp[0] = '0';
26 		hex_timestamp[1] = '\0';
27 		return (0);
28 	}
29 
30 #if WT_TIMESTAMP_SIZE == 8
31 	{
32 	char *p, v;
33 
34 	for (p = hex_timestamp; ts.val != 0; ts.val >>= 4)
35 		*p++ = (char)__wt_hex((u_char)(ts.val & 0x0f));
36 	*p = '\0';
37 
38 	/* Reverse the string. */
39 	for (--p; p > hex_timestamp;) {
40 		v = *p;
41 		*p-- = *hex_timestamp;
42 		*hex_timestamp++ = v;
43 	}
44 	WT_UNUSED(session);
45 	}
46 #else
47 	{
48 	WT_ITEM hexts;
49 	size_t len;
50 	uint8_t *tsp;
51 
52 	/* Avoid memory allocation: set up an item guaranteed large enough. */
53 	hexts.data = hexts.mem = hex_timestamp;
54 	hexts.memsize = 2 * WT_TIMESTAMP_SIZE + 1;
55 	/* Trim leading zeros. */
56 	for (tsp = ts.ts, len = WT_TIMESTAMP_SIZE;
57 	    len > 0 && *tsp == 0;
58 	    ++tsp, --len)
59 		;
60 	WT_RET(__wt_raw_to_hex(session, tsp, len, &hexts));
61 	}
62 #endif
63 	return (0);
64 }
65 
66 /*
67  * __wt_verbose_timestamp --
68  *	Output a verbose message along with the specified timestamp.
69  */
70 void
__wt_verbose_timestamp(WT_SESSION_IMPL * session,const wt_timestamp_t * ts,const char * msg)71 __wt_verbose_timestamp(WT_SESSION_IMPL *session,
72     const wt_timestamp_t *ts, const char *msg)
73 {
74 	char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1];
75 
76 	if (!WT_VERBOSE_ISSET(session, WT_VERB_TIMESTAMP) ||
77 	    (__wt_timestamp_to_hex_string(session, timestamp_buf, ts) != 0))
78 	       return;
79 
80 	__wt_verbose(session,
81 	    WT_VERB_TIMESTAMP, "Timestamp %s : %s", timestamp_buf, msg);
82 }
83 
84 /*
85  * __wt_txn_parse_timestamp_raw --
86  *	Decodes and sets a timestamp. Don't do any checking.
87  */
88 int
__wt_txn_parse_timestamp_raw(WT_SESSION_IMPL * session,const char * name,wt_timestamp_t * timestamp,WT_CONFIG_ITEM * cval)89 __wt_txn_parse_timestamp_raw(WT_SESSION_IMPL *session, const char *name,
90     wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval)
91 {
92 	__wt_timestamp_set_zero(timestamp);
93 
94 	if (cval->len == 0)
95 		return (0);
96 
97 	/* Protect against unexpectedly long hex strings. */
98 	if (cval->len > 2 * WT_TIMESTAMP_SIZE)
99 		WT_RET_MSG(session, EINVAL,
100 		    "%s timestamp too long '%.*s'",
101 		    name, (int)cval->len, cval->str);
102 
103 #if WT_TIMESTAMP_SIZE == 8
104 	{
105 	static const int8_t hextable[] = {
106 	    -1, -1,  -1,  -1,  -1,  -1,  -1,  -1,
107 	    -1, -1,  -1,  -1,  -1,  -1,  -1,  -1,
108 	    -1, -1,  -1,  -1,  -1,  -1,  -1,  -1,
109 	    -1, -1,  -1,  -1,  -1,  -1,  -1,  -1,
110 	    -1, -1,  -1,  -1,  -1,  -1,  -1,  -1,
111 	    -1, -1,  -1,  -1,  -1,  -1,  -1,  -1,
112 	     0,  1,   2,   3,   4,   5,   6,   7,
113 	     8,  9,  -1,  -1,  -1,  -1,  -1,  -1,
114 	    -1, 10,  11,  12,  13,  14,  15,  -1,
115 	    -1, -1,  -1,  -1,  -1,  -1,  -1,  -1,
116 	    -1, -1,  -1,  -1,  -1,  -1,  -1,  -1,
117 	    -1, -1,  -1,  -1,  -1,  -1,  -1,  -1,
118 	    -1, 10,  11,  12,  13,  14,  15,  -1
119 	};
120 	wt_timestamp_t ts;
121 	size_t len;
122 	int hex_val;
123 	const char *hex_itr;
124 
125 	for (ts.val = 0, hex_itr = cval->str, len = cval->len; len > 0; --len) {
126 		if ((size_t)*hex_itr < WT_ELEMENTS(hextable))
127 			hex_val = hextable[(size_t)*hex_itr++];
128 		else
129 			hex_val = -1;
130 		if (hex_val < 0)
131 			WT_RET_MSG(session, EINVAL,
132 			    "Failed to parse %s timestamp '%.*s'",
133 			    name, (int)cval->len, cval->str);
134 		ts.val = (ts.val << 4) | (uint64_t)hex_val;
135 	}
136 	__wt_timestamp_set(timestamp, &ts);
137 	}
138 #else
139 	{
140 	WT_DECL_RET;
141 	WT_ITEM ts;
142 	wt_timestamp_t tsbuf;
143 	size_t hexlen;
144 	const char *hexts;
145 	char padbuf[2 * WT_TIMESTAMP_SIZE + 1];
146 
147 	/*
148 	 * The decoding function assumes it is decoding data produced by dump
149 	 * and so requires an even number of hex digits.
150 	 */
151 	if ((cval->len & 1) == 0) {
152 		hexts = cval->str;
153 		hexlen = cval->len;
154 	} else {
155 		padbuf[0] = '0';
156 		memcpy(padbuf + 1, cval->str, cval->len);
157 		hexts = padbuf;
158 		hexlen = cval->len + 1;
159 	}
160 
161 	/* Avoid memory allocation to decode timestamps. */
162 	ts.data = ts.mem = tsbuf.ts;
163 	ts.memsize = sizeof(tsbuf.ts);
164 
165 	if ((ret = __wt_nhex_to_raw(session, hexts, hexlen, &ts)) != 0)
166 		WT_RET_MSG(session, ret, "Failed to parse %s timestamp '%.*s'",
167 		    name, (int)cval->len, cval->str);
168 	WT_ASSERT(session, ts.size <= WT_TIMESTAMP_SIZE);
169 
170 	/* Copy the raw value to the end of the timestamp. */
171 	memcpy(timestamp->ts + WT_TIMESTAMP_SIZE - ts.size,
172 	    ts.data, ts.size);
173 	}
174 #endif
175 	return (0);
176 }
177 
178 /*
179  * __wt_txn_parse_timestamp --
180  *	Decodes and sets a timestamp checking it is non-zero.
181  */
182 int
__wt_txn_parse_timestamp(WT_SESSION_IMPL * session,const char * name,wt_timestamp_t * timestamp,WT_CONFIG_ITEM * cval)183 __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name,
184     wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval)
185 {
186 	WT_RET(__wt_txn_parse_timestamp_raw(session, name, timestamp, cval));
187 	if (cval->len != 0 && __wt_timestamp_iszero(timestamp))
188 		WT_RET_MSG(session, EINVAL,
189 		    "Failed to parse %s timestamp '%.*s': zero not permitted",
190 		    name, (int)cval->len, cval->str);
191 
192 	return (0);
193 }
194 
195 /*
196  * __txn_get_pinned_timestamp --
197  *	Calculate the current pinned timestamp.
198  */
199 static int
__txn_get_pinned_timestamp(WT_SESSION_IMPL * session,wt_timestamp_t * tsp,bool include_checkpoint,bool include_oldest)200 __txn_get_pinned_timestamp(
201    WT_SESSION_IMPL *session, wt_timestamp_t *tsp, bool include_checkpoint,
202    bool include_oldest)
203 {
204 	WT_CONNECTION_IMPL *conn;
205 	WT_DECL_TIMESTAMP(tmp_ts)
206 	WT_TXN *txn;
207 	WT_TXN_GLOBAL *txn_global;
208 
209 	conn = S2C(session);
210 	txn_global = &conn->txn_global;
211 
212 	if (include_oldest && !txn_global->has_oldest_timestamp)
213 		return (WT_NOTFOUND);
214 
215 	__wt_readlock(session, &txn_global->rwlock);
216 	if (include_oldest)
217 		__wt_timestamp_set(&tmp_ts, &txn_global->oldest_timestamp);
218 	else
219 		__wt_timestamp_set_zero(&tmp_ts);
220 
221 	/* Check for a running checkpoint */
222 	if (include_checkpoint &&
223 	    !__wt_timestamp_iszero(&txn_global->checkpoint_timestamp) &&
224 	    (__wt_timestamp_iszero(&tmp_ts) ||
225 	    __wt_timestamp_cmp(&txn_global->checkpoint_timestamp, &tmp_ts) <
226 	    0))
227 		__wt_timestamp_set(&tmp_ts, &txn_global->checkpoint_timestamp);
228 	__wt_readunlock(session, &txn_global->rwlock);
229 
230 	/* Look for the oldest ordinary reader. */
231 	__wt_readlock(session, &txn_global->read_timestamp_rwlock);
232 	TAILQ_FOREACH(txn, &txn_global->read_timestamph, read_timestampq) {
233 		/*
234 		 * Skip any transactions on the queue that are not active.
235 		 */
236 		if (txn->clear_read_q)
237 			continue;
238 		/*
239 		 * A zero timestamp is possible here only when the oldest
240 		 * timestamp is not accounted for.
241 		 */
242 		if (__wt_timestamp_iszero(&tmp_ts) ||
243 		    __wt_timestamp_cmp(&txn->read_timestamp, &tmp_ts) < 0)
244 			__wt_timestamp_set(&tmp_ts, &txn->read_timestamp);
245 		/*
246 		 * We break on the first active txn on the list.
247 		 */
248 		break;
249 	}
250 	__wt_readunlock(session, &txn_global->read_timestamp_rwlock);
251 
252 	if (!include_oldest && __wt_timestamp_iszero(&tmp_ts))
253 		return (WT_NOTFOUND);
254 	__wt_timestamp_set(tsp, &tmp_ts);
255 
256 	return (0);
257 }
258 
259 /*
260  * __txn_global_query_timestamp --
261  *	Query a timestamp on the global transaction.
262  */
263 static int
__txn_global_query_timestamp(WT_SESSION_IMPL * session,wt_timestamp_t * tsp,const char * cfg[])264 __txn_global_query_timestamp(
265     WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[])
266 {
267 	WT_CONFIG_ITEM cval;
268 	WT_CONNECTION_IMPL *conn;
269 	WT_TXN *txn;
270 	WT_TXN_GLOBAL *txn_global;
271 	wt_timestamp_t ts, tmpts;
272 
273 	conn = S2C(session);
274 	txn_global = &conn->txn_global;
275 
276 	WT_STAT_CONN_INCR(session, txn_query_ts);
277 	WT_RET(__wt_config_gets(session, cfg, "get", &cval));
278 	if (WT_STRING_MATCH("all_committed", cval.str, cval.len)) {
279 		if (!txn_global->has_commit_timestamp)
280 			return (WT_NOTFOUND);
281 		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
282 		    __wt_timestamp_set(&ts, &txn_global->commit_timestamp));
283 		WT_ASSERT(session, !__wt_timestamp_iszero(&ts));
284 
285 		/* Skip the lock if there are no running transactions. */
286 		if (TAILQ_EMPTY(&txn_global->commit_timestamph))
287 			goto done;
288 
289 		/* Compare with the oldest running transaction. */
290 		__wt_readlock(session, &txn_global->commit_timestamp_rwlock);
291 		TAILQ_FOREACH(txn, &txn_global->commit_timestamph,
292 		    commit_timestampq) {
293 			if (txn->clear_commit_q)
294 				continue;
295 
296 			__wt_timestamp_set(
297 			    &tmpts, &txn->first_commit_timestamp);
298 			WT_ASSERT(session, !__wt_timestamp_iszero(&tmpts));
299 			__wt_timestamp_subone(&tmpts);
300 
301 			if (__wt_timestamp_cmp(&tmpts, &ts) < 0)
302 				__wt_timestamp_set(&ts, &tmpts);
303 			break;
304 		}
305 		__wt_readunlock(session, &txn_global->commit_timestamp_rwlock);
306 	} else if (WT_STRING_MATCH("last_checkpoint", cval.str, cval.len))
307 		/* Read-only value forever. No lock needed. */
308 		__wt_timestamp_set(&ts, &txn_global->last_ckpt_timestamp);
309 	else if (WT_STRING_MATCH("oldest", cval.str, cval.len)) {
310 		if (!txn_global->has_oldest_timestamp)
311 			return (WT_NOTFOUND);
312 		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
313 		    __wt_timestamp_set(&ts, &txn_global->oldest_timestamp));
314 	} else if (WT_STRING_MATCH("oldest_reader", cval.str, cval.len))
315 		WT_RET(__txn_get_pinned_timestamp(session, &ts, true, false));
316 	else if (WT_STRING_MATCH("pinned", cval.str, cval.len))
317 		WT_RET(__txn_get_pinned_timestamp(session, &ts, true, true));
318 	else if (WT_STRING_MATCH("recovery", cval.str, cval.len))
319 		/* Read-only value forever. No lock needed. */
320 		__wt_timestamp_set(&ts, &txn_global->recovery_timestamp);
321 	else if (WT_STRING_MATCH("stable", cval.str, cval.len)) {
322 		if (!txn_global->has_stable_timestamp)
323 			return (WT_NOTFOUND);
324 		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
325 		    __wt_timestamp_set(&ts, &txn_global->stable_timestamp));
326 	} else
327 		WT_RET_MSG(session, EINVAL,
328 		    "unknown timestamp query %.*s", (int)cval.len, cval.str);
329 
330 done:	__wt_timestamp_set(tsp, &ts);
331 	return (0);
332 }
333 
334 /*
335  * __txn_query_timestamp --
336  *	Query a timestamp within this session's transaction.
337  */
338 static int
__txn_query_timestamp(WT_SESSION_IMPL * session,wt_timestamp_t * tsp,const char * cfg[])339 __txn_query_timestamp(
340     WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[])
341 {
342 	WT_CONFIG_ITEM cval;
343 	WT_TXN *txn;
344 
345 	txn = &session->txn;
346 
347 	WT_STAT_CONN_INCR(session, session_query_ts);
348 	if (!F_ISSET(txn, WT_TXN_RUNNING))
349 		return (WT_NOTFOUND);
350 
351 	WT_RET(__wt_config_gets(session, cfg, "get", &cval));
352 	if (WT_STRING_MATCH("commit", cval.str, cval.len))
353 		__wt_timestamp_set(tsp, &txn->commit_timestamp);
354 	else if (WT_STRING_MATCH("first_commit", cval.str, cval.len))
355 		__wt_timestamp_set(tsp, &txn->first_commit_timestamp);
356 	else if (WT_STRING_MATCH("prepare", cval.str, cval.len))
357 		__wt_timestamp_set(tsp, &txn->prepare_timestamp);
358 	else if (WT_STRING_MATCH("read", cval.str, cval.len))
359 		__wt_timestamp_set(tsp, &txn->read_timestamp);
360 	else
361 		WT_RET_MSG(session, EINVAL,
362 		    "unknown timestamp query %.*s", (int)cval.len, cval.str);
363 
364 	return (0);
365 }
366 #endif
367 
368 /*
369  * __wt_txn_query_timestamp --
370  *	Query a timestamp. The caller may query the global transaction or the
371  *      session's transaction.
372  */
373 int
__wt_txn_query_timestamp(WT_SESSION_IMPL * session,char * hex_timestamp,const char * cfg[],bool global_txn)374 __wt_txn_query_timestamp(WT_SESSION_IMPL *session,
375     char *hex_timestamp, const char *cfg[], bool global_txn)
376 {
377 #ifdef HAVE_TIMESTAMPS
378 	wt_timestamp_t ts;
379 
380 	if (global_txn)
381 		WT_RET(__txn_global_query_timestamp(session, &ts, cfg));
382 	else
383 		WT_RET(__txn_query_timestamp(session, &ts, cfg));
384 
385 	return (__wt_timestamp_to_hex_string(session, hex_timestamp, &ts));
386 #else
387 	WT_UNUSED(hex_timestamp);
388 	WT_UNUSED(cfg);
389 	WT_UNUSED(global_txn);
390 
391 	WT_RET_MSG(session, ENOTSUP,
392 	    "requires a version of WiredTiger built with timestamp support");
393 #endif
394 }
395 
396 #ifdef HAVE_TIMESTAMPS
397 /*
398  * __wt_txn_update_pinned_timestamp --
399  *	Update the pinned timestamp (the oldest timestamp that has to be
400  *	maintained for current or future readers).
401  */
402 int
__wt_txn_update_pinned_timestamp(WT_SESSION_IMPL * session,bool force)403 __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force)
404 {
405 	WT_DECL_RET;
406 	WT_TXN_GLOBAL *txn_global;
407 	wt_timestamp_t active_timestamp, last_pinned_timestamp;
408 	wt_timestamp_t oldest_timestamp, pinned_timestamp;
409 
410 	txn_global = &S2C(session)->txn_global;
411 
412 	/* Skip locking and scanning when the oldest timestamp is pinned. */
413 	if (txn_global->oldest_is_pinned)
414 		return (0);
415 
416 	WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
417 	    __wt_timestamp_set(
418 	    &oldest_timestamp, &txn_global->oldest_timestamp));
419 
420 	/* Scan to find the global pinned timestamp. */
421 	if ((ret = __txn_get_pinned_timestamp(
422 	    session, &active_timestamp, false, true)) != 0)
423 		return (ret == WT_NOTFOUND ? 0 : ret);
424 
425 	if (__wt_timestamp_cmp(&oldest_timestamp, &active_timestamp) < 0)
426 		__wt_timestamp_set(&pinned_timestamp, &oldest_timestamp);
427 	else
428 		__wt_timestamp_set(&pinned_timestamp, &active_timestamp);
429 
430 	if (txn_global->has_pinned_timestamp && !force) {
431 		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
432 		    __wt_timestamp_set(
433 		    &last_pinned_timestamp, &txn_global->pinned_timestamp));
434 
435 		if (__wt_timestamp_cmp(
436 		    &pinned_timestamp, &last_pinned_timestamp) <= 0)
437 			return (0);
438 	}
439 
440 	__wt_writelock(session, &txn_global->rwlock);
441 	if (!txn_global->has_pinned_timestamp || force || __wt_timestamp_cmp(
442 	    &txn_global->pinned_timestamp, &pinned_timestamp) < 0) {
443 		__wt_timestamp_set(
444 		    &txn_global->pinned_timestamp, &pinned_timestamp);
445 		txn_global->has_pinned_timestamp = true;
446 		txn_global->oldest_is_pinned = __wt_timestamp_cmp(
447 		    &txn_global->pinned_timestamp,
448 		    &txn_global->oldest_timestamp) == 0;
449 		txn_global->stable_is_pinned = __wt_timestamp_cmp(
450 		    &txn_global->pinned_timestamp,
451 		    &txn_global->stable_timestamp) == 0;
452 		__wt_verbose_timestamp(session,
453 		    &pinned_timestamp, "Updated pinned timestamp");
454 	}
455 	__wt_writeunlock(session, &txn_global->rwlock);
456 
457 	return (0);
458 }
459 #endif
460 
461 /*
462  * __wt_txn_global_set_timestamp --
463  *	Set a global transaction timestamp.
464  */
465 int
__wt_txn_global_set_timestamp(WT_SESSION_IMPL * session,const char * cfg[])466 __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
467 {
468 	WT_CONFIG_ITEM commit_cval, oldest_cval, stable_cval;
469 	bool has_commit, has_oldest, has_stable;
470 
471 	WT_STAT_CONN_INCR(session, txn_set_ts);
472 	WT_RET(__wt_config_gets_def(session,
473 	    cfg, "commit_timestamp", 0, &commit_cval));
474 	has_commit = commit_cval.len != 0;
475 	if (has_commit)
476 		WT_STAT_CONN_INCR(session, txn_set_ts_commit);
477 
478 	WT_RET(__wt_config_gets_def(session,
479 	    cfg, "oldest_timestamp", 0, &oldest_cval));
480 	has_oldest = oldest_cval.len != 0;
481 	if (has_oldest)
482 		WT_STAT_CONN_INCR(session, txn_set_ts_oldest);
483 
484 	WT_RET(__wt_config_gets_def(session,
485 	    cfg, "stable_timestamp", 0, &stable_cval));
486 	has_stable = stable_cval.len != 0;
487 	if (has_stable)
488 		WT_STAT_CONN_INCR(session, txn_set_ts_stable);
489 
490 	/* If no timestamp was supplied, there's nothing to do. */
491 	if (!has_commit && !has_oldest && !has_stable)
492 		return (0);
493 
494 #ifdef HAVE_TIMESTAMPS
495 	{
496 	WT_CONFIG_ITEM cval;
497 	WT_TXN_GLOBAL *txn_global;
498 	wt_timestamp_t commit_ts, oldest_ts, stable_ts;
499 	wt_timestamp_t last_oldest_ts, last_stable_ts;
500 	char hex_timestamp[2][2 * WT_TIMESTAMP_SIZE + 1];
501 	bool force;
502 
503 	txn_global = &S2C(session)->txn_global;
504 
505 	/*
506 	 * Parsing will initialize the timestamp to zero even if
507 	 * it is not configured.
508 	 */
509 	WT_RET(__wt_txn_parse_timestamp(
510 	    session, "commit", &commit_ts, &commit_cval));
511 	WT_RET(__wt_txn_parse_timestamp(
512 	    session, "oldest", &oldest_ts, &oldest_cval));
513 	WT_RET(__wt_txn_parse_timestamp(
514 	    session, "stable", &stable_ts, &stable_cval));
515 
516 	WT_RET(__wt_config_gets_def(session,
517 	    cfg, "force", 0, &cval));
518 	force = cval.val != 0;
519 
520 	if (force)
521 		goto set;
522 
523 	__wt_readlock(session, &txn_global->rwlock);
524 
525 	__wt_timestamp_set(&last_oldest_ts, &txn_global->oldest_timestamp);
526 	__wt_timestamp_set(&last_stable_ts, &txn_global->stable_timestamp);
527 
528 	/*
529 	 * First do error checking on the timestamp values.  The
530 	 * oldest timestamp must always be less than or equal to
531 	 * the stable timestamp.  If we're only setting one
532 	 * then compare against the system timestamp.  If we're
533 	 * setting both then compare the passed in values.
534 	 */
535 	if (!has_commit && txn_global->has_commit_timestamp)
536 		__wt_timestamp_set(&commit_ts, &txn_global->commit_timestamp);
537 	if (!has_oldest && txn_global->has_oldest_timestamp)
538 		__wt_timestamp_set(&oldest_ts, &last_oldest_ts);
539 	if (!has_stable && txn_global->has_stable_timestamp)
540 		__wt_timestamp_set(&stable_ts, &last_stable_ts);
541 
542 	/*
543 	 * If a commit timestamp was supplied, check that it is no older than
544 	 * either the stable timestamp or the oldest timestamp.
545 	 */
546 	if (has_commit && (has_oldest || txn_global->has_oldest_timestamp) &&
547 	    __wt_timestamp_cmp(&oldest_ts, &commit_ts) > 0) {
548 		__wt_readunlock(session, &txn_global->rwlock);
549 		WT_RET(__wt_timestamp_to_hex_string(
550 		    session, hex_timestamp[0], &oldest_ts));
551 		WT_RET(__wt_timestamp_to_hex_string(
552 		    session, hex_timestamp[1], &commit_ts));
553 		WT_RET_MSG(session, EINVAL,
554 		    "set_timestamp: oldest timestamp %s must not be later than "
555 		    "commit timestamp %s", hex_timestamp[0], hex_timestamp[1]);
556 	}
557 
558 	if (has_commit && (has_stable || txn_global->has_stable_timestamp) &&
559 	    __wt_timestamp_cmp(&stable_ts, &commit_ts) > 0) {
560 		__wt_readunlock(session, &txn_global->rwlock);
561 		WT_RET(__wt_timestamp_to_hex_string(
562 		    session, hex_timestamp[0], &stable_ts));
563 		WT_RET(__wt_timestamp_to_hex_string(
564 		    session, hex_timestamp[1], &commit_ts));
565 		WT_RET_MSG(session, EINVAL,
566 		    "set_timestamp: stable timestamp %s must not be later than "
567 		    "commit timestamp %s", hex_timestamp[0], hex_timestamp[1]);
568 	}
569 
570 	/*
571 	 * The oldest and stable timestamps must always satisfy the condition
572 	 * that oldest <= stable.
573 	 */
574 	if ((has_oldest || has_stable) &&
575 	    (has_oldest || txn_global->has_oldest_timestamp) &&
576 	    (has_stable || txn_global->has_stable_timestamp) &&
577 	    __wt_timestamp_cmp(&oldest_ts, &stable_ts) > 0) {
578 		__wt_readunlock(session, &txn_global->rwlock);
579 		WT_RET(__wt_timestamp_to_hex_string(
580 		    session, hex_timestamp[0], &oldest_ts));
581 		WT_RET(__wt_timestamp_to_hex_string(
582 		    session, hex_timestamp[1], &stable_ts));
583 		WT_RET_MSG(session, EINVAL,
584 		    "set_timestamp: oldest timestamp %s must not be later than "
585 		    "stable timestamp %s", hex_timestamp[0], hex_timestamp[1]);
586 	}
587 
588 	__wt_readunlock(session, &txn_global->rwlock);
589 
590 	/* Check if we are actually updating anything. */
591 	if (has_oldest && txn_global->has_oldest_timestamp &&
592 	    __wt_timestamp_cmp(&oldest_ts, &last_oldest_ts) <= 0)
593 		has_oldest = false;
594 
595 	if (has_stable && txn_global->has_stable_timestamp &&
596 	    __wt_timestamp_cmp(&stable_ts, &last_stable_ts) <= 0)
597 		has_stable = false;
598 
599 	if (!has_commit && !has_oldest && !has_stable)
600 		return (0);
601 
602 set:	__wt_writelock(session, &txn_global->rwlock);
603 	/*
604 	 * This method can be called from multiple threads, check that we are
605 	 * moving the global timestamps forwards.
606 	 *
607 	 * The exception is the commit timestamp, where the application can
608 	 * move it backwards (in fact, it only really makes sense to explicitly
609 	 * move it backwards because it otherwise tracks the largest
610 	 * commit_timestamp so it moves forward whenever transactions are
611 	 * assigned timestamps).
612 	 */
613 	if (has_commit) {
614 		__wt_timestamp_set(&txn_global->commit_timestamp, &commit_ts);
615 		txn_global->has_commit_timestamp = true;
616 		WT_STAT_CONN_INCR(session, txn_set_ts_commit_upd);
617 		__wt_verbose_timestamp(session, &commit_ts,
618 		    "Updated global commit timestamp");
619 	}
620 
621 	if (has_oldest && (!txn_global->has_oldest_timestamp ||
622 	    force || __wt_timestamp_cmp(
623 	    &oldest_ts, &txn_global->oldest_timestamp) > 0)) {
624 		__wt_timestamp_set(&txn_global->oldest_timestamp, &oldest_ts);
625 		WT_STAT_CONN_INCR(session, txn_set_ts_oldest_upd);
626 		txn_global->has_oldest_timestamp = true;
627 		txn_global->oldest_is_pinned = false;
628 		__wt_verbose_timestamp(session, &oldest_ts,
629 		    "Updated global oldest timestamp");
630 	}
631 
632 	if (has_stable && (!txn_global->has_stable_timestamp ||
633 	    force || __wt_timestamp_cmp(
634 	    &stable_ts, &txn_global->stable_timestamp) > 0)) {
635 		__wt_timestamp_set(&txn_global->stable_timestamp, &stable_ts);
636 		WT_STAT_CONN_INCR(session, txn_set_ts_stable_upd);
637 		txn_global->has_stable_timestamp = true;
638 		txn_global->stable_is_pinned = false;
639 		__wt_verbose_timestamp(session, &stable_ts,
640 		    "Updated global stable timestamp");
641 	}
642 	__wt_writeunlock(session, &txn_global->rwlock);
643 
644 	if (has_oldest || has_stable)
645 		WT_RET(__wt_txn_update_pinned_timestamp(session, force));
646 	}
647 	return (0);
648 #else
649 	WT_RET_MSG(session, ENOTSUP, "set_timestamp requires a "
650 	    "version of WiredTiger built with timestamp support");
651 #endif
652 }
653 
654 #ifdef HAVE_TIMESTAMPS
655 /*
656  * __wt_timestamp_validate --
657  *	Validate a timestamp to be not older than the global oldest and global
658  *	stable and running transaction commit timestamp and running transaction
659  *	prepare timestamp.
660  */
661 int
__wt_timestamp_validate(WT_SESSION_IMPL * session,const char * name,wt_timestamp_t * ts,WT_CONFIG_ITEM * cval)662 __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name,
663     wt_timestamp_t *ts, WT_CONFIG_ITEM *cval)
664 {
665 	WT_TXN *txn = &session->txn;
666 	WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
667 	wt_timestamp_t oldest_ts, stable_ts;
668 	char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1];
669 	bool has_oldest_ts, has_stable_ts;
670 
671 	/*
672 	 * Added this redundant initialization to circumvent build failure.
673 	 */
674 	__wt_timestamp_set_zero(&oldest_ts);
675 	__wt_timestamp_set_zero(&stable_ts);
676 	/*
677 	 * Compare against the oldest and the stable timestamp. Return an error
678 	 * if the given timestamp is older than oldest and/or stable timestamp.
679 	 */
680 	WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
681 	    if ((has_oldest_ts = txn_global->has_oldest_timestamp))
682 		__wt_timestamp_set(&oldest_ts, &txn_global->oldest_timestamp);
683 	    if ((has_stable_ts = txn_global->has_stable_timestamp))
684 		__wt_timestamp_set(&stable_ts, &txn_global->stable_timestamp));
685 
686 	if (has_oldest_ts && __wt_timestamp_cmp(ts, &oldest_ts) < 0) {
687 		WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp,
688 		    &oldest_ts));
689 		WT_RET_MSG(session, EINVAL,
690 		    "%s timestamp %.*s older than oldest timestamp %s",
691 		    name, (int)cval->len, cval->str, hex_timestamp);
692 	}
693 	if (has_stable_ts && __wt_timestamp_cmp(ts, &stable_ts) < 0) {
694 		WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp,
695 		    &stable_ts));
696 		WT_RET_MSG(session, EINVAL,
697 		    "%s timestamp %.*s older than stable timestamp %s",
698 		    name, (int)cval->len, cval->str, hex_timestamp);
699 	}
700 
701 	/*
702 	 * Compare against the commit timestamp of the current transaction.
703 	 * Return an error if the given timestamp is older than the first
704 	 * commit timestamp.
705 	 */
706 	if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
707 	    __wt_timestamp_cmp(ts, &txn->first_commit_timestamp) < 0) {
708 		WT_RET(__wt_timestamp_to_hex_string(
709 		    session, hex_timestamp, &txn->first_commit_timestamp));
710 		WT_RET_MSG(session, EINVAL,
711 		    "%s timestamp %.*s older than the first "
712 		    "commit timestamp %s for this transaction",
713 		    name, (int)cval->len, cval->str, hex_timestamp);
714 	}
715 
716 	/*
717 	 * Compare against the prepare timestamp of the current transaction.
718 	 * Return an error if the given timestamp is older than the prepare
719 	 * timestamp.
720 	 */
721 	if (F_ISSET(txn, WT_TXN_PREPARE) &&
722 	    __wt_timestamp_cmp(ts, &txn->prepare_timestamp) < 0) {
723 		WT_RET(__wt_timestamp_to_hex_string(
724 		    session, hex_timestamp, &txn->prepare_timestamp));
725 		WT_RET_MSG(session, EINVAL,
726 		    "%s timestamp %.*s older than the prepare timestamp %s "
727 		    "for this transaction",
728 		    name, (int)cval->len, cval->str, hex_timestamp);
729 	}
730 
731 	return (0);
732 }
733 #endif
734 
735 /*
736  * __wt_txn_set_timestamp --
737  *	Parse a request to set a timestamp in a transaction.
738  */
739 int
__wt_txn_set_timestamp(WT_SESSION_IMPL * session,const char * cfg[])740 __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
741 {
742 	WT_CONFIG_ITEM cval;
743 	WT_DECL_RET;
744 
745 	/* Look for a commit timestamp. */
746 	ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval);
747 	WT_RET_NOTFOUND_OK(ret);
748 	if (ret == 0 && cval.len != 0) {
749 #ifdef HAVE_TIMESTAMPS
750 		WT_TXN *txn = &session->txn;
751 		wt_timestamp_t ts;
752 
753 		WT_TRET(__wt_txn_context_check(session, true));
754 		WT_RET(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
755 		WT_RET(__wt_timestamp_validate(session, "commit", &ts, &cval));
756 		__wt_timestamp_set(&txn->commit_timestamp, &ts);
757 		__wt_txn_set_commit_timestamp(session);
758 #else
759 		WT_RET_MSG(session, ENOTSUP, "commit_timestamp requires a "
760 		    "version of WiredTiger built with timestamp support");
761 #endif
762 	} else
763 		/*
764 		 * We allow setting the commit timestamp after a prepare
765 		 * but no other timestamp.
766 		 */
767 		WT_RET(__wt_txn_context_prepare_check(session));
768 
769 	/* Look for a read timestamp. */
770 	WT_RET(__wt_txn_parse_read_timestamp(session, cfg));
771 
772 	return (0);
773 }
774 
775 /*
776  * __wt_txn_parse_prepare_timestamp --
777  *	Parse a request to set a transaction's prepare_timestamp.
778  */
779 int
__wt_txn_parse_prepare_timestamp(WT_SESSION_IMPL * session,const char * cfg[],wt_timestamp_t * timestamp)780 __wt_txn_parse_prepare_timestamp(
781     WT_SESSION_IMPL *session, const char *cfg[], wt_timestamp_t *timestamp)
782 {
783 	WT_CONFIG_ITEM cval;
784 
785 	WT_RET(__wt_config_gets_def(session,
786 	    cfg, "prepare_timestamp", 0, &cval));
787 	if (cval.len > 0) {
788 #ifdef HAVE_TIMESTAMPS
789 		WT_TXN *prev;
790 		WT_TXN_GLOBAL *txn_global;
791 		wt_timestamp_t oldest_ts;
792 		char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1];
793 
794 		txn_global = &S2C(session)->txn_global;
795 
796 		if (F_ISSET(&session->txn, WT_TXN_HAS_TS_COMMIT))
797 			WT_RET_MSG(session, EINVAL,
798 			    "commit timestamp should not have been set before "
799 			    "prepare transaction");
800 
801 		WT_RET(__wt_txn_parse_timestamp(
802 		    session, "prepare", timestamp, &cval));
803 
804 		/*
805 		 * Prepare timestamp must be later/greater than latest active
806 		 * read timestamp.
807 		 */
808 		__wt_readlock(session, &txn_global->read_timestamp_rwlock);
809 		prev = TAILQ_LAST(&txn_global->read_timestamph,
810 		    __wt_txn_rts_qh);
811 		while (prev != NULL) {
812 			/*
813 			 * Skip any transactions that are not active.
814 			 */
815 			if (prev->clear_read_q) {
816 				prev = TAILQ_PREV(
817 				    prev, __wt_txn_rts_qh, read_timestampq);
818 				continue;
819 			}
820 			if (__wt_timestamp_cmp(
821 			    &prev->read_timestamp, timestamp) >= 0) {
822 				__wt_readunlock(session,
823 				    &txn_global->read_timestamp_rwlock);
824 				WT_RET(__wt_timestamp_to_hex_string(session,
825 				    hex_timestamp, &prev->read_timestamp));
826 				WT_RET_MSG(session, EINVAL,
827 				    "prepare timestamp %.*s not later than "
828 				    "an active read timestamp %s ",
829 				    (int)cval.len, cval.str, hex_timestamp);
830 			}
831 			break;
832 		}
833 		__wt_readunlock(session, &txn_global->read_timestamp_rwlock);
834 
835 		/*
836 		 * If there are no active readers, prepare timestamp must not
837 		 * be older than oldest timestamp.
838 		 */
839 		if (prev == NULL) {
840 			WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
841 			    __wt_timestamp_set(&oldest_ts,
842 			    &txn_global->oldest_timestamp));
843 
844 			if (__wt_timestamp_cmp(timestamp, &oldest_ts) < 0) {
845 				WT_RET(__wt_timestamp_to_hex_string(session,
846 				    hex_timestamp, &oldest_ts));
847 				WT_RET_MSG(session, EINVAL,
848 				    "prepare timestamp %.*s is older than the "
849 				    "oldest timestamp %s ", (int)cval.len,
850 				    cval.str, hex_timestamp);
851 			}
852 		 }
853 #else
854 		WT_UNUSED(timestamp);
855 		WT_RET_MSG(session, EINVAL, "prepare_timestamp requires a "
856 		    "version of WiredTiger built with timestamp support");
857 #endif
858 	} else
859 		WT_RET_MSG(session, EINVAL, "prepare timestamp is required");
860 
861 	return (0);
862 }
863 /*
864  * __wt_txn_parse_read_timestamp --
865  *	Parse a request to set a transaction's read_timestamp.
866  */
867 int
__wt_txn_parse_read_timestamp(WT_SESSION_IMPL * session,const char * cfg[])868 __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
869 {
870 	WT_CONFIG_ITEM cval;
871 	WT_TXN *txn;
872 
873 	txn = &session->txn;
874 
875 	WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
876 	if (cval.len > 0) {
877 #ifdef HAVE_TIMESTAMPS
878 		wt_timestamp_t ts;
879 		WT_TXN_GLOBAL *txn_global;
880 		char hex_timestamp[2][2 * WT_TIMESTAMP_SIZE + 1];
881 		bool round_to_oldest;
882 
883 		txn_global = &S2C(session)->txn_global;
884 		WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval));
885 
886 		/* Read timestamps imply / require snapshot isolation. */
887 		if (!F_ISSET(txn, WT_TXN_RUNNING))
888 			txn->isolation = WT_ISO_SNAPSHOT;
889 		else if (txn->isolation != WT_ISO_SNAPSHOT)
890 			WT_RET_MSG(session, EINVAL, "setting a read_timestamp"
891 			    " requires a transaction running at snapshot"
892 			    " isolation");
893 
894 		/* Read timestamps can't change once set. */
895 		if (F_ISSET(txn, WT_TXN_HAS_TS_READ))
896 			WT_RET_MSG(session, EINVAL, "a read_timestamp"
897 			    " may only be set once per transaction");
898 
899 		/*
900 		 * Read the configuration here to reduce the span of the
901 		 * critical section.
902 		 */
903 		WT_RET(__wt_config_gets_def(session,
904 		    cfg, "round_to_oldest", 0, &cval));
905 		round_to_oldest = cval.val;
906 		/*
907 		 * This code is not using the timestamp validate function to
908 		 * avoid a race between checking and setting transaction
909 		 * timestamp.
910 		 */
911 		WT_RET(__wt_timestamp_to_hex_string(session,
912 		    hex_timestamp[0], &ts));
913 		__wt_readlock(session, &txn_global->rwlock);
914 		if (__wt_timestamp_cmp(
915 		    &ts, &txn_global->oldest_timestamp) < 0) {
916 			WT_RET(__wt_timestamp_to_hex_string(session,
917 			    hex_timestamp[1], &txn_global->oldest_timestamp));
918 			/*
919 			 * If given read timestamp is earlier than oldest
920 			 * timestamp then round the read timestamp to
921 			 * oldest timestamp.
922 			 */
923 			if (round_to_oldest)
924 				__wt_timestamp_set(&txn->read_timestamp,
925 				    &txn_global->oldest_timestamp);
926 			else {
927 				__wt_readunlock(session, &txn_global->rwlock);
928 				WT_RET_MSG(session, EINVAL, "read timestamp "
929 				    "%s older than oldest timestamp %s",
930 				    hex_timestamp[0], hex_timestamp[1]);
931 			}
932 		} else {
933 			__wt_timestamp_set(&txn->read_timestamp, &ts);
934 			/*
935 			 * Reset to avoid a verbose message as read
936 			 * timestamp is not rounded to oldest timestamp.
937 			 */
938 			round_to_oldest = false;
939 		}
940 
941 		__wt_txn_set_read_timestamp(session);
942 		__wt_readunlock(session, &txn_global->rwlock);
943 		if (round_to_oldest) {
944 			/*
945 			 * This message is generated here to reduce the span of
946 			 * critical section.
947 			 */
948 			__wt_verbose(session, WT_VERB_TIMESTAMP, "Read "
949 			    "timestamp %s : Rounded to oldest timestamp %s",
950 			    hex_timestamp[0], hex_timestamp[1]);
951 		}
952 
953 		/*
954 		 * If we already have a snapshot, it may be too early to match
955 		 * the timestamp (including the one we just read, if rounding
956 		 * to oldest).  Get a new one.
957 		 */
958 		if (F_ISSET(txn, WT_TXN_RUNNING))
959 			__wt_txn_get_snapshot(session);
960 
961 #else
962 		WT_UNUSED(txn);
963 		WT_RET_MSG(session, EINVAL, "read_timestamp requires a "
964 		    "version of WiredTiger built with timestamp support");
965 #endif
966 	}
967 
968 	return (0);
969 }
970 
971 #ifdef HAVE_TIMESTAMPS
972 /*
973  * __wt_txn_set_commit_timestamp --
974  *	Publish a transaction's commit timestamp.
975  */
976 void
__wt_txn_set_commit_timestamp(WT_SESSION_IMPL * session)977 __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session)
978 {
979 	WT_TXN *qtxn, *txn, *txn_tmp;
980 	WT_TXN_GLOBAL *txn_global;
981 	wt_timestamp_t ts;
982 	uint64_t walked;
983 
984 	txn = &session->txn;
985 	txn_global = &S2C(session)->txn_global;
986 
987 	if (F_ISSET(txn, WT_TXN_PUBLIC_TS_COMMIT))
988 		return;
989 
990 	/*
991 	 * Copy the current commit timestamp (which can change while the
992 	 * transaction is running) into the first_commit_timestamp, which is
993 	 * fixed.
994 	 */
995 	__wt_timestamp_set(&ts, &txn->commit_timestamp);
996 
997 	__wt_writelock(session, &txn_global->commit_timestamp_rwlock);
998 	/*
999 	 * If our transaction is on the queue remove it first. The timestamp
1000 	 * may move earlier so we otherwise might not remove ourselves before
1001 	 * finding where to insert ourselves (which would result in a list
1002 	 * loop) and we don't want to walk more of the list than needed.
1003 	 */
1004 	if (txn->clear_commit_q) {
1005 		TAILQ_REMOVE(&txn_global->commit_timestamph,
1006 		    txn, commit_timestampq);
1007 		WT_PUBLISH(txn->clear_commit_q, false);
1008 		--txn_global->commit_timestampq_len;
1009 	}
1010 	/*
1011 	 * Walk the list to look for where to insert our own transaction
1012 	 * and remove any transactions that are not active.  We stop when
1013 	 * we get to the location where we want to insert.
1014 	 */
1015 	if (TAILQ_EMPTY(&txn_global->commit_timestamph)) {
1016 		TAILQ_INSERT_HEAD(
1017 		    &txn_global->commit_timestamph, txn, commit_timestampq);
1018 		WT_STAT_CONN_INCR(session, txn_commit_queue_empty);
1019 	} else {
1020 		/* Walk from the start, removing cleared entries. */
1021 		walked = 0;
1022 		TAILQ_FOREACH_SAFE(qtxn, &txn_global->commit_timestamph,
1023 		    commit_timestampq, txn_tmp) {
1024 			++walked;
1025 			/*
1026 			 * Stop on the first entry that we cannot clear.
1027 			 */
1028 			if (!qtxn->clear_commit_q)
1029 				break;
1030 
1031 			TAILQ_REMOVE(&txn_global->commit_timestamph,
1032 			    qtxn, commit_timestampq);
1033 			WT_PUBLISH(qtxn->clear_commit_q, false);
1034 			--txn_global->commit_timestampq_len;
1035 		}
1036 
1037 		/*
1038 		 * Now walk backwards from the end to find the correct position
1039 		 * for the insert.
1040 		 */
1041 		qtxn = TAILQ_LAST(
1042 		     &txn_global->commit_timestamph, __wt_txn_cts_qh);
1043 		while (qtxn != NULL && __wt_timestamp_cmp(
1044 		    &qtxn->first_commit_timestamp, &ts) > 0) {
1045 			++walked;
1046 			qtxn = TAILQ_PREV(
1047 			    qtxn, __wt_txn_cts_qh, commit_timestampq);
1048 		}
1049 		if (qtxn == NULL) {
1050 			TAILQ_INSERT_HEAD(&txn_global->commit_timestamph,
1051 			   txn, commit_timestampq);
1052 			WT_STAT_CONN_INCR(session, txn_commit_queue_head);
1053 		} else
1054 			TAILQ_INSERT_AFTER(&txn_global->commit_timestamph,
1055 			    qtxn, txn, commit_timestampq);
1056 		WT_STAT_CONN_INCRV(session, txn_commit_queue_walked, walked);
1057 	}
1058 	__wt_timestamp_set(&txn->first_commit_timestamp, &ts);
1059 	++txn_global->commit_timestampq_len;
1060 	WT_STAT_CONN_INCR(session, txn_commit_queue_inserts);
1061 	txn->clear_commit_q = false;
1062 	F_SET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_PUBLIC_TS_COMMIT);
1063 	__wt_writeunlock(session, &txn_global->commit_timestamp_rwlock);
1064 }
1065 
1066 /*
1067  * __wt_txn_clear_commit_timestamp --
1068  *	Clear a transaction's published commit timestamp.
1069  */
1070 void
__wt_txn_clear_commit_timestamp(WT_SESSION_IMPL * session)1071 __wt_txn_clear_commit_timestamp(WT_SESSION_IMPL *session)
1072 {
1073 	WT_TXN *txn;
1074 	uint32_t flags;
1075 
1076 	txn = &session->txn;
1077 
1078 	if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_COMMIT))
1079 		return;
1080 	flags = txn->flags;
1081 	LF_CLR(WT_TXN_PUBLIC_TS_COMMIT);
1082 
1083 	/*
1084 	 * Notify other threads that our transaction is inactive and can be
1085 	 * cleaned up safely from the commit timestamp queue whenever the next
1086 	 * thread walks the queue. We do not need to remove it now.
1087 	 */
1088 	WT_PUBLISH(txn->clear_commit_q, true);
1089 	WT_PUBLISH(txn->flags, flags);
1090 }
1091 
1092 /*
1093  * __wt_txn_set_read_timestamp --
1094  *	Publish a transaction's read timestamp.
1095  */
1096 void
__wt_txn_set_read_timestamp(WT_SESSION_IMPL * session)1097 __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session)
1098 {
1099 	WT_TXN *qtxn, *txn, *txn_tmp;
1100 	WT_TXN_GLOBAL *txn_global;
1101 	uint64_t walked;
1102 
1103 	txn = &session->txn;
1104 	txn_global = &S2C(session)->txn_global;
1105 
1106 	if (F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
1107 		return;
1108 
1109 	__wt_writelock(session, &txn_global->read_timestamp_rwlock);
1110 	/*
1111 	 * If our transaction is on the queue remove it first. The timestamp
1112 	 * may move earlier so we otherwise might not remove ourselves before
1113 	 * finding where to insert ourselves (which would result in a list
1114 	 * loop) and we don't want to walk more of the list than needed.
1115 	 */
1116 	if (txn->clear_read_q) {
1117 		TAILQ_REMOVE(&txn_global->read_timestamph,
1118 		    txn, read_timestampq);
1119 		WT_PUBLISH(txn->clear_read_q, false);
1120 		--txn_global->read_timestampq_len;
1121 	}
1122 	/*
1123 	 * Walk the list to look for where to insert our own transaction
1124 	 * and remove any transactions that are not active.  We stop when
1125 	 * we get to the location where we want to insert.
1126 	 */
1127 	if (TAILQ_EMPTY(&txn_global->read_timestamph)) {
1128 		TAILQ_INSERT_HEAD(
1129 		    &txn_global->read_timestamph, txn, read_timestampq);
1130 		WT_STAT_CONN_INCR(session, txn_read_queue_empty);
1131 	} else {
1132 		/* Walk from the start, removing cleared entries. */
1133 		walked = 0;
1134 		TAILQ_FOREACH_SAFE(qtxn, &txn_global->read_timestamph,
1135 		    read_timestampq, txn_tmp) {
1136 			++walked;
1137 			if (!qtxn->clear_read_q)
1138 				break;
1139 
1140 			TAILQ_REMOVE(&txn_global->read_timestamph,
1141 			    qtxn, read_timestampq);
1142 			WT_PUBLISH(qtxn->clear_read_q, false);
1143 			--txn_global->read_timestampq_len;
1144 		}
1145 
1146 		/*
1147 		 * Now walk backwards from the end to find the correct position
1148 		 * for the insert.
1149 		 */
1150 		qtxn = TAILQ_LAST(
1151 		     &txn_global->read_timestamph, __wt_txn_rts_qh);
1152 		while (qtxn != NULL &&
1153 		    __wt_timestamp_cmp(&qtxn->read_timestamp,
1154 		    &txn->read_timestamp) > 0) {
1155 			++walked;
1156 			qtxn = TAILQ_PREV(
1157 			    qtxn, __wt_txn_rts_qh, read_timestampq);
1158 		}
1159 		if (qtxn == NULL) {
1160 			TAILQ_INSERT_HEAD(&txn_global->read_timestamph,
1161 			   txn, read_timestampq);
1162 			WT_STAT_CONN_INCR(session, txn_read_queue_head);
1163 		} else
1164 			TAILQ_INSERT_AFTER(&txn_global->read_timestamph,
1165 			    qtxn, txn, read_timestampq);
1166 		WT_STAT_CONN_INCRV(session, txn_read_queue_walked, walked);
1167 	}
1168 	/*
1169 	 * We do not set the read timestamp here. It has been set in the caller
1170 	 * because special processing for round to oldest.
1171 	 */
1172 	++txn_global->read_timestampq_len;
1173 	WT_STAT_CONN_INCR(session, txn_read_queue_inserts);
1174 	txn->clear_read_q = false;
1175 	F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_PUBLIC_TS_READ);
1176 	__wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
1177 }
1178 
1179 /*
1180  * __wt_txn_clear_read_timestamp --
1181  *	Clear a transaction's published read timestamp.
1182  */
1183 void
__wt_txn_clear_read_timestamp(WT_SESSION_IMPL * session)1184 __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session)
1185 {
1186 	WT_TXN *txn;
1187 	uint32_t flags;
1188 
1189 	txn = &session->txn;
1190 
1191 	if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
1192 		return;
1193 
1194 #ifdef HAVE_DIAGNOSTIC
1195 	{
1196 	WT_TXN_GLOBAL *txn_global;
1197 	wt_timestamp_t pinned_ts;
1198 
1199 	txn_global = &S2C(session)->txn_global;
1200 	WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
1201 	    __wt_timestamp_set(&pinned_ts, &txn_global->pinned_timestamp));
1202 	WT_ASSERT(session,
1203 	    __wt_timestamp_cmp(&txn->read_timestamp, &pinned_ts) >= 0);
1204 	}
1205 #endif
1206 	flags = txn->flags;
1207 	LF_CLR(WT_TXN_PUBLIC_TS_READ);
1208 
1209 	/*
1210 	 * Notify other threads that our transaction is inactive and can be
1211 	 * cleaned up safely from the read timestamp queue whenever the
1212 	 * next thread walks the queue. We do not need to remove it now.
1213 	 */
1214 	WT_PUBLISH(txn->clear_read_q, true);
1215 	WT_PUBLISH(txn->flags, flags);
1216 }
1217 #endif
1218 
1219 /*
1220  * __wt_txn_clear_timestamp_queues --
1221  *	We're about to clear the session and overwrite the txn structure.
1222  *	Remove ourselves from the commit timestamp queue and the read
1223  *	timestamp queue if we're on either of them.
1224  */
1225 void
__wt_txn_clear_timestamp_queues(WT_SESSION_IMPL * session)1226 __wt_txn_clear_timestamp_queues(WT_SESSION_IMPL *session)
1227 {
1228 	WT_TXN *txn;
1229 	WT_TXN_GLOBAL *txn_global;
1230 
1231 	txn = &session->txn;
1232 	txn_global = &S2C(session)->txn_global;
1233 
1234 	if (!txn->clear_commit_q && !txn->clear_read_q)
1235 		return;
1236 
1237 	if (txn->clear_commit_q) {
1238 		__wt_writelock(session, &txn_global->commit_timestamp_rwlock);
1239 		/*
1240 		 * Recheck after acquiring the lock.
1241 		 */
1242 		if (txn->clear_commit_q) {
1243 			TAILQ_REMOVE(&txn_global->commit_timestamph,
1244 			    txn, commit_timestampq);
1245 			--txn_global->commit_timestampq_len;
1246 			txn->clear_commit_q = false;
1247 		}
1248 		__wt_writeunlock(session, &txn_global->commit_timestamp_rwlock);
1249 	}
1250 	if (txn->clear_read_q) {
1251 		__wt_writelock(session, &txn_global->read_timestamp_rwlock);
1252 		/*
1253 		 * Recheck after acquiring the lock.
1254 		 */
1255 		if (txn->clear_read_q) {
1256 			TAILQ_REMOVE(
1257 			    &txn_global->read_timestamph, txn, read_timestampq);
1258 			--txn_global->read_timestampq_len;
1259 			txn->clear_read_q = false;
1260 		}
1261 		__wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
1262 	}
1263 }
1264