1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 #ifdef HAVE_TIMESTAMPS
12 /*
13  * __txn_rollback_to_stable_lookaside_fixup --
14  *	Remove any updates that need to be rolled back from the lookaside file.
15  */
16 static int
__txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL * session)17 __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
18 {
19 	WT_CONNECTION_IMPL *conn;
20 	WT_CURSOR *cursor;
21 	WT_DECL_RET;
22 	WT_DECL_TIMESTAMP(rollback_timestamp)
23 	WT_DECL_TIMESTAMP(upd_timestamp)
24 	WT_ITEM las_key, las_timestamp, las_value;
25 	WT_TXN_GLOBAL *txn_global;
26 	uint64_t las_counter, las_pageid, las_total, las_txnid;
27 	uint32_t las_id, session_flags;
28 	uint8_t prepare_state, upd_type;
29 
30 	conn = S2C(session);
31 	cursor = NULL;
32 	las_total = 0;
33 	session_flags = 0;		/* [-Werror=maybe-uninitialized] */
34 	WT_CLEAR(las_timestamp);
35 
36 	/*
37 	 * Copy the stable timestamp, otherwise we'd need to lock it each time
38 	 * it's accessed. Even though the stable timestamp isn't supposed to be
39 	 * updated while rolling back, accessing it without a lock would
40 	 * violate protocol.
41 	 */
42 	txn_global = &conn->txn_global;
43 	WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
44 	    __wt_timestamp_set(
45 	    &rollback_timestamp, &txn_global->stable_timestamp));
46 
47 	__wt_las_cursor(session, &cursor, &session_flags);
48 
49 	/* Discard pages we read as soon as we're done with them. */
50 	F_SET(session, WT_SESSION_READ_WONT_NEED);
51 
52 	/* Walk the file. */
53 	__wt_writelock(session, &conn->cache->las_sweepwalk_lock);
54 	while ((ret = cursor->next(cursor)) == 0) {
55 		++las_total;
56 		WT_ERR(cursor->get_key(cursor,
57 		    &las_pageid, &las_id, &las_counter, &las_key));
58 
59 		/* Check the file ID so we can skip durable tables */
60 		if (las_id >= conn->stable_rollback_maxfile)
61 			WT_PANIC_RET(session, EINVAL, "file ID %" PRIu32
62 			    " in lookaside table larger than max %" PRIu32,
63 			    las_id, conn->stable_rollback_maxfile);
64 		if (__bit_test(conn->stable_rollback_bitstring, las_id))
65 			continue;
66 
67 		WT_ERR(cursor->get_value(cursor, &las_txnid,
68 		    &las_timestamp, &prepare_state, &upd_type, &las_value));
69 		WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
70 		memcpy(&upd_timestamp, las_timestamp.data, las_timestamp.size);
71 
72 		/*
73 		 * Entries with no timestamp will have a timestamp of zero,
74 		 * which will fail the following check and cause them to never
75 		 * be removed.
76 		 */
77 		if (__wt_timestamp_cmp(
78 		    &rollback_timestamp, &upd_timestamp) < 0) {
79 			WT_ERR(cursor->remove(cursor));
80 			WT_STAT_CONN_INCR(session, txn_rollback_las_removed);
81 			--las_total;
82 		}
83 	}
84 	WT_ERR_NOTFOUND_OK(ret);
85 err:	if (ret == 0) {
86 		conn->cache->las_insert_count = las_total;
87 		conn->cache->las_remove_count = 0;
88 	}
89 	__wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
90 	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
91 
92 	F_CLR(session, WT_SESSION_READ_WONT_NEED);
93 
94 	return (ret);
95 }
96 
97 /*
98  * __txn_abort_newer_update --
99  *	Abort updates in an update change with timestamps newer than the
100  *	rollback timestamp.
101  */
102 static void
__txn_abort_newer_update(WT_SESSION_IMPL * session,WT_UPDATE * first_upd,wt_timestamp_t * rollback_timestamp)103 __txn_abort_newer_update(WT_SESSION_IMPL *session,
104     WT_UPDATE *first_upd, wt_timestamp_t *rollback_timestamp)
105 {
106 	WT_UPDATE *upd;
107 	bool skip_zero_timestamps;
108 
109 	skip_zero_timestamps = !FLD_ISSET(S2BT(session)->assert_flags,
110 	    WT_ASSERT_COMMIT_TS_ALWAYS | WT_ASSERT_COMMIT_TS_KEYS);
111 
112 	for (upd = first_upd; upd != NULL; upd = upd->next) {
113 		/*
114 		 * Updates with no timestamp will have a timestamp of zero and
115 		 * will never be rolled back.  If the table is configured for
116 		 * strict timestamp checking, assert that all more recent
117 		 * updates were also rolled back.
118 		 */
119 		if (upd->txnid == WT_TXN_ABORTED && upd == first_upd)
120 			first_upd = upd->next;
121 		else if (__wt_timestamp_iszero(&upd->timestamp)) {
122 			if (skip_zero_timestamps && upd == first_upd)
123 				first_upd = upd->next;
124 		} else if (__wt_timestamp_cmp(
125 		    rollback_timestamp, &upd->timestamp) < 0) {
126 			upd->txnid = WT_TXN_ABORTED;
127 			WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted);
128 			__wt_timestamp_set_zero(&upd->timestamp);
129 
130 			/*
131 			 * If any updates are aborted, all newer updates
132 			 * better be aborted as well.
133 			 */
134 			WT_ASSERT(session, upd == first_upd);
135 			first_upd = upd->next;
136 		}
137 	}
138 }
139 
140 /*
141  * __txn_abort_newer_insert --
142  *	Apply the update abort check to each entry in an insert skip list
143  */
144 static void
__txn_abort_newer_insert(WT_SESSION_IMPL * session,WT_INSERT_HEAD * head,wt_timestamp_t * rollback_timestamp)145 __txn_abort_newer_insert(WT_SESSION_IMPL *session,
146     WT_INSERT_HEAD *head, wt_timestamp_t *rollback_timestamp)
147 {
148 	WT_INSERT *ins;
149 
150 	WT_SKIP_FOREACH(ins, head)
151 		__txn_abort_newer_update(session, ins->upd, rollback_timestamp);
152 }
153 
154 /*
155  * __txn_abort_newer_col_var --
156  *	Abort updates on a variable length col leaf page with timestamps newer
157  *	than the rollback timestamp.
158  */
159 static void
__txn_abort_newer_col_var(WT_SESSION_IMPL * session,WT_PAGE * page,wt_timestamp_t * rollback_timestamp)160 __txn_abort_newer_col_var(
161     WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp)
162 {
163 	WT_COL *cip;
164 	WT_INSERT_HEAD *ins;
165 	uint32_t i;
166 
167 	/* Review the changes to the original on-page data items */
168 	WT_COL_FOREACH(page, cip, i)
169 		if ((ins = WT_COL_UPDATE(page, cip)) != NULL)
170 			__txn_abort_newer_insert(session,
171 			    ins, rollback_timestamp);
172 
173 	/* Review the append list */
174 	if ((ins = WT_COL_APPEND(page)) != NULL)
175 		__txn_abort_newer_insert(session, ins, rollback_timestamp);
176 }
177 
178 /*
179  * __txn_abort_newer_col_fix --
180  *	Abort updates on a fixed length col leaf page with timestamps newer than
181  *	the rollback timestamp.
182  */
183 static void
__txn_abort_newer_col_fix(WT_SESSION_IMPL * session,WT_PAGE * page,wt_timestamp_t * rollback_timestamp)184 __txn_abort_newer_col_fix(
185     WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp)
186 {
187 	WT_INSERT_HEAD *ins;
188 
189 	/* Review the changes to the original on-page data items */
190 	if ((ins = WT_COL_UPDATE_SINGLE(page)) != NULL)
191 		__txn_abort_newer_insert(session, ins, rollback_timestamp);
192 
193 	/* Review the append list */
194 	if ((ins = WT_COL_APPEND(page)) != NULL)
195 		__txn_abort_newer_insert(session, ins, rollback_timestamp);
196 }
197 
198 /*
199  * __txn_abort_newer_row_leaf --
200  *	Abort updates on a row leaf page with timestamps newer than the
201  *	rollback timestamp.
202  */
203 static void
__txn_abort_newer_row_leaf(WT_SESSION_IMPL * session,WT_PAGE * page,wt_timestamp_t * rollback_timestamp)204 __txn_abort_newer_row_leaf(
205     WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp)
206 {
207 	WT_INSERT_HEAD *insert;
208 	WT_ROW *rip;
209 	WT_UPDATE *upd;
210 	uint32_t i;
211 
212 	/*
213 	 * Review the insert list for keys before the first entry on the disk
214 	 * page.
215 	 */
216 	if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
217 		__txn_abort_newer_insert(session, insert, rollback_timestamp);
218 
219 	/*
220 	 * Review updates that belong to keys that are on the disk image,
221 	 * as well as for keys inserted since the page was read from disk.
222 	 */
223 	WT_ROW_FOREACH(page, rip, i) {
224 		if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
225 			__txn_abort_newer_update(
226 			    session, upd, rollback_timestamp);
227 
228 		if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
229 			__txn_abort_newer_insert(
230 			    session, insert, rollback_timestamp);
231 	}
232 }
233 
234 /*
235  * __txn_abort_newer_updates --
236  *	Abort updates on this page newer than the timestamp.
237  */
238 static int
__txn_abort_newer_updates(WT_SESSION_IMPL * session,WT_REF * ref,wt_timestamp_t * rollback_timestamp)239 __txn_abort_newer_updates(
240     WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t *rollback_timestamp)
241 {
242 	WT_DECL_RET;
243 	WT_PAGE *page;
244 	uint32_t read_flags;
245 	bool local_read;
246 
247 	/*
248 	 * If we created a page image with updates the need to be rolled back,
249 	 * read the history into cache now and make sure the page is marked
250 	 * dirty.  Otherwise, the history we need could be swept from the
251 	 * lookaside table before the page is read because the lookaside sweep
252 	 * code has no way to tell that the page image is invalid.
253 	 *
254 	 * So, if there is lookaside history for a page, first check if the
255 	 * history needs to be rolled back make sure that history is loaded
256 	 * into cache.  That is, if skew_newest is true, so the disk image
257 	 * potentially contained unstable updates, and the history is more
258 	 * recent than the rollback timestamp.
259 	 *
260 	 * Also, we have separately discarded any lookaside history more recent
261 	 * than the rollback timestamp.  For page_las structures in cache,
262 	 * reset any future timestamps back to the rollback timestamp.  This
263 	 * allows those structures to be discarded once the rollback timestamp
264 	 * is stable (crucially for tests, they can be discarded if the
265 	 * connection is closed right after a rollback_to_stable call).
266 	 */
267 	local_read = false;
268 	read_flags = WT_READ_WONT_NEED;
269 	if (ref->page_las != NULL) {
270 		if (ref->page_las->skew_newest &&
271 		    __wt_timestamp_cmp(rollback_timestamp,
272 		    &ref->page_las->unstable_timestamp) < 0) {
273 			/*
274 			 * Make sure we get back a page with history, not a
275 			 * limbo page.
276 			 */
277 			WT_ASSERT(session,
278 			    !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
279 			WT_RET(__wt_page_in(session, ref, read_flags));
280 			WT_ASSERT(session, ref->state != WT_REF_LIMBO &&
281 			    ref->page != NULL &&
282 			    __wt_page_is_modified(ref->page));
283 			local_read = true;
284 		}
285 		if (__wt_timestamp_cmp(&ref->page_las->max_timestamp,
286 		    rollback_timestamp) > 0)
287 			ref->page_las->max_timestamp = *rollback_timestamp;
288 		if (__wt_timestamp_cmp(&ref->page_las->unstable_timestamp,
289 		    rollback_timestamp) > 0)
290 			ref->page_las->unstable_timestamp = *rollback_timestamp;
291 		if (__wt_timestamp_cmp(&ref->page_las->unstable_timestamp,
292 		    rollback_timestamp) > 0)
293 			ref->page_las->unstable_timestamp = *rollback_timestamp;
294 	}
295 
296 	/* Review deleted page saved to the ref */
297 	if (ref->page_del != NULL && __wt_timestamp_cmp(
298 	    rollback_timestamp, &ref->page_del->timestamp) < 0)
299 		WT_ERR(__wt_delete_page_rollback(session, ref));
300 
301 	/*
302 	 * If we have a ref with no page, or the page is clean, there is
303 	 * nothing to roll back.
304 	 *
305 	 * This check for a clean page is partly an optimization (checkpoint
306 	 * only marks pages clean when they have no unwritten updates so
307 	 * there's no point visiting them again), but also covers a corner case
308 	 * of a checkpoint with use_timestamp=false.  Such a checkpoint
309 	 * effectively moves the stable timestamp forward, because changes that
310 	 * are written in the checkpoint cannot be reliably rolled back.  The
311 	 * actual stable timestamp doesn't change, though, so if we try to roll
312 	 * back clean pages the in-memory tree can get out of sync with the
313 	 * on-disk tree.
314 	 */
315 	if ((page = ref->page) == NULL || !__wt_page_is_modified(page))
316 		goto err;
317 
318 	switch (page->type) {
319 	case WT_PAGE_COL_FIX:
320 		__txn_abort_newer_col_fix(session, page, rollback_timestamp);
321 		break;
322 	case WT_PAGE_COL_VAR:
323 		__txn_abort_newer_col_var(session, page, rollback_timestamp);
324 		break;
325 	case WT_PAGE_COL_INT:
326 	case WT_PAGE_ROW_INT:
327 		/*
328 		 * There is nothing to do for internal pages, since we aren't
329 		 * rolling back far enough to potentially include reconciled
330 		 * changes - and thus won't need to roll back structure
331 		 * changes on internal pages.
332 		 */
333 		break;
334 	case WT_PAGE_ROW_LEAF:
335 		__txn_abort_newer_row_leaf(session, page, rollback_timestamp);
336 		break;
337 	WT_ILLEGAL_VALUE_ERR(session, page->type);
338 	}
339 
340 err:	if (local_read)
341 		WT_TRET(__wt_page_release(session, ref, read_flags));
342 	return (ret);
343 }
344 
345 /*
346  * __txn_rollback_to_stable_btree_walk --
347  *	Called for each open handle - choose to either skip or wipe the commits
348  */
349 static int
__txn_rollback_to_stable_btree_walk(WT_SESSION_IMPL * session,wt_timestamp_t * rollback_timestamp)350 __txn_rollback_to_stable_btree_walk(
351     WT_SESSION_IMPL *session, wt_timestamp_t *rollback_timestamp)
352 {
353 	WT_DECL_RET;
354 	WT_REF *child_ref, *ref;
355 
356 	/* Walk the tree, marking commits aborted where appropriate. */
357 	ref = NULL;
358 	while ((ret = __wt_tree_walk(session, &ref,
359 	    WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 &&
360 	    ref != NULL) {
361 		if (WT_PAGE_IS_INTERNAL(ref->page)) {
362 			WT_INTL_FOREACH_BEGIN(session, ref->page, child_ref) {
363 				WT_RET(__txn_abort_newer_updates(
364 				    session, child_ref, rollback_timestamp));
365 			} WT_INTL_FOREACH_END;
366 		} else
367 			WT_RET(__txn_abort_newer_updates(
368 			    session, ref, rollback_timestamp));
369 	}
370 	return (ret);
371 }
372 
373 /*
374  * __txn_rollback_eviction_drain --
375  *	Wait for eviction to drain from a tree.
376  */
377 static int
__txn_rollback_eviction_drain(WT_SESSION_IMPL * session,const char * cfg[])378 __txn_rollback_eviction_drain(WT_SESSION_IMPL *session, const char *cfg[])
379 {
380 	WT_UNUSED(cfg);
381 
382 	WT_RET(__wt_evict_file_exclusive_on(session));
383 	__wt_evict_file_exclusive_off(session);
384 	return (0);
385 }
386 
387 /*
388  * __txn_rollback_to_stable_btree --
389  *	Called for each open handle - choose to either skip or wipe the commits
390  */
391 static int
__txn_rollback_to_stable_btree(WT_SESSION_IMPL * session,const char * cfg[])392 __txn_rollback_to_stable_btree(WT_SESSION_IMPL *session, const char *cfg[])
393 {
394 	WT_BTREE *btree;
395 	WT_CONNECTION_IMPL *conn;
396 	WT_DECL_RET;
397 	WT_DECL_TIMESTAMP(rollback_timestamp)
398 	WT_TXN_GLOBAL *txn_global;
399 
400 	WT_UNUSED(cfg);
401 
402 	btree = S2BT(session);
403 	conn = S2C(session);
404 	txn_global = &conn->txn_global;
405 
406 	/*
407 	 * Immediately durable files don't get their commits wiped. This case
408 	 * mostly exists to support the semantic required for the oplog in
409 	 * MongoDB - updates that have been made to the oplog should not be
410 	 * aborted. It also wouldn't be safe to roll back updates for any
411 	 * table that had it's records logged, since those updates would be
412 	 * recovered after a crash making them inconsistent.
413 	 */
414 	if (__wt_btree_immediately_durable(session)) {
415 		/*
416 		 * Add the btree ID to the bitstring, so we can exclude any
417 		 * lookaside entries for this btree.
418 		 */
419 		if (btree->id >= conn->stable_rollback_maxfile)
420 			WT_PANIC_RET(session, EINVAL, "btree file ID %" PRIu32
421 			    " larger than max %" PRIu32,
422 			    btree->id, conn->stable_rollback_maxfile);
423 		__bit_set(conn->stable_rollback_bitstring, btree->id);
424 		return (0);
425 	}
426 
427 	/* There is never anything to do for checkpoint handles */
428 	if (session->dhandle->checkpoint != NULL)
429 		return (0);
430 
431 	/* There is nothing to do on an empty tree. */
432 	if (btree->root.page == NULL)
433 		return (0);
434 
435 	/*
436 	 * Copy the stable timestamp, otherwise we'd need to lock it each time
437 	 * it's accessed. Even though the stable timestamp isn't supposed to be
438 	 * updated while rolling back, accessing it without a lock would
439 	 * violate protocol.
440 	 */
441 	WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
442 	    __wt_timestamp_set(
443 	    &rollback_timestamp, &txn_global->stable_timestamp));
444 
445 	/*
446 	 * Ensure the eviction server is out of the file - we don't
447 	 * want it messing with us. This step shouldn't be required, but
448 	 * it simplifies some of the reasoning about what state trees can
449 	 * be in.
450 	 */
451 	WT_RET(__wt_evict_file_exclusive_on(session));
452 	WT_WITH_PAGE_INDEX(session, ret = __txn_rollback_to_stable_btree_walk(
453 	    session, &rollback_timestamp));
454 	__wt_evict_file_exclusive_off(session);
455 
456 	return (ret);
457 }
458 
459 /*
460  * __txn_rollback_to_stable_check --
461  *	Ensure the rollback request is reasonable.
462  */
463 static int
__txn_rollback_to_stable_check(WT_SESSION_IMPL * session)464 __txn_rollback_to_stable_check(WT_SESSION_IMPL *session)
465 {
466 	WT_TXN_GLOBAL *txn_global;
467 	bool txn_active;
468 
469 	txn_global = &S2C(session)->txn_global;
470 	if (!txn_global->has_stable_timestamp)
471 		WT_RET_MSG(session, EINVAL,
472 		    "rollback_to_stable requires a stable timestamp");
473 
474 	/*
475 	 * Help the user - see if they have any active transactions. I'd
476 	 * like to check the transaction running flag, but that would
477 	 * require peeking into all open sessions, which isn't really
478 	 * kosher.
479 	 */
480 	WT_RET(__wt_txn_activity_check(session, &txn_active));
481 	if (txn_active)
482 		WT_RET_MSG(session, EINVAL,
483 		    "rollback_to_stable illegal with active transactions");
484 
485 	return (0);
486 }
487 #endif
488 
489 /*
490  * __wt_txn_rollback_to_stable --
491  *	Rollback all in-memory state related to timestamps more recent than
492  *	the passed in timestamp.
493  */
494 int
__wt_txn_rollback_to_stable(WT_SESSION_IMPL * session,const char * cfg[])495 __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
496 {
497 #ifndef HAVE_TIMESTAMPS
498 	WT_UNUSED(cfg);
499 
500 	WT_RET_MSG(session, ENOTSUP, "rollback_to_stable "
501 	    "requires a version of WiredTiger built with timestamp support");
502 #else
503 	WT_CONNECTION_IMPL *conn;
504 	WT_DECL_RET;
505 
506 	conn = S2C(session);
507 
508 	WT_STAT_CONN_INCR(session, txn_rollback_to_stable);
509 	/*
510 	 * Mark that a rollback operation is in progress and wait for eviction
511 	 * to drain.  This is necessary because lookaside eviction uses
512 	 * transactions and causes the check for a quiescent system to fail.
513 	 *
514 	 * Configuring lookaside eviction off isn't atomic, safe because the
515 	 * flag is only otherwise set when closing down the database. Assert
516 	 * to avoid confusion in the future.
517 	 */
518 	WT_ASSERT(session, !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE));
519 	F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
520 
521 	WT_ERR(__wt_conn_btree_apply(session,
522 	    NULL, __txn_rollback_eviction_drain, NULL, cfg));
523 
524 	WT_ERR(__txn_rollback_to_stable_check(session));
525 
526 	F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
527 
528 	/*
529 	 * Allocate a non-durable btree bitstring.  We increment the global
530 	 * value before using it, so the current value is already in use, and
531 	 * hence we need to add one here.
532 	 */
533 	conn->stable_rollback_maxfile = conn->next_file_id + 1;
534 	WT_ERR(__bit_alloc(session,
535 	    conn->stable_rollback_maxfile, &conn->stable_rollback_bitstring));
536 	WT_ERR(__wt_conn_btree_apply(session,
537 	    NULL, __txn_rollback_to_stable_btree, NULL, cfg));
538 
539 	/*
540 	 * Clear any offending content from the lookaside file. This must be
541 	 * done after the in-memory application, since the process of walking
542 	 * trees in cache populates a list that is used to check which
543 	 * lookaside records should be removed.
544 	 */
545 	if (!F_ISSET(conn, WT_CONN_IN_MEMORY))
546 		WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session));
547 
548 err:	F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
549 	__wt_free(session, conn->stable_rollback_bitstring);
550 	return (ret);
551 #endif
552 }
553