1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 #ifdef HAVE_TIMESTAMPS
12 /*
13 * __txn_rollback_to_stable_lookaside_fixup --
14 * Remove any updates that need to be rolled back from the lookaside file.
15 */
16 static int
__txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL * session)17 __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
18 {
19 WT_CONNECTION_IMPL *conn;
20 WT_CURSOR *cursor;
21 WT_DECL_RET;
22 WT_DECL_TIMESTAMP(rollback_timestamp)
23 WT_DECL_TIMESTAMP(upd_timestamp)
24 WT_ITEM las_key, las_timestamp, las_value;
25 WT_TXN_GLOBAL *txn_global;
26 uint64_t las_counter, las_pageid, las_total, las_txnid;
27 uint32_t las_id, session_flags;
28 uint8_t prepare_state, upd_type;
29
30 conn = S2C(session);
31 cursor = NULL;
32 las_total = 0;
33 session_flags = 0; /* [-Werror=maybe-uninitialized] */
34 WT_CLEAR(las_timestamp);
35
36 /*
37 * Copy the stable timestamp, otherwise we'd need to lock it each time
38 * it's accessed. Even though the stable timestamp isn't supposed to be
39 * updated while rolling back, accessing it without a lock would
40 * violate protocol.
41 */
42 txn_global = &conn->txn_global;
43 WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
44 __wt_timestamp_set(
45 &rollback_timestamp, &txn_global->stable_timestamp));
46
47 __wt_las_cursor(session, &cursor, &session_flags);
48
49 /* Discard pages we read as soon as we're done with them. */
50 F_SET(session, WT_SESSION_READ_WONT_NEED);
51
52 /* Walk the file. */
53 __wt_writelock(session, &conn->cache->las_sweepwalk_lock);
54 while ((ret = cursor->next(cursor)) == 0) {
55 ++las_total;
56 WT_ERR(cursor->get_key(cursor,
57 &las_pageid, &las_id, &las_counter, &las_key));
58
59 /* Check the file ID so we can skip durable tables */
60 if (las_id >= conn->stable_rollback_maxfile)
61 WT_PANIC_RET(session, EINVAL, "file ID %" PRIu32
62 " in lookaside table larger than max %" PRIu32,
63 las_id, conn->stable_rollback_maxfile);
64 if (__bit_test(conn->stable_rollback_bitstring, las_id))
65 continue;
66
67 WT_ERR(cursor->get_value(cursor, &las_txnid,
68 &las_timestamp, &prepare_state, &upd_type, &las_value));
69 WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
70 memcpy(&upd_timestamp, las_timestamp.data, las_timestamp.size);
71
72 /*
73 * Entries with no timestamp will have a timestamp of zero,
74 * which will fail the following check and cause them to never
75 * be removed.
76 */
77 if (__wt_timestamp_cmp(
78 &rollback_timestamp, &upd_timestamp) < 0) {
79 WT_ERR(cursor->remove(cursor));
80 WT_STAT_CONN_INCR(session, txn_rollback_las_removed);
81 --las_total;
82 }
83 }
84 WT_ERR_NOTFOUND_OK(ret);
85 err: if (ret == 0) {
86 conn->cache->las_insert_count = las_total;
87 conn->cache->las_remove_count = 0;
88 }
89 __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
90 WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
91
92 F_CLR(session, WT_SESSION_READ_WONT_NEED);
93
94 return (ret);
95 }
96
97 /*
98 * __txn_abort_newer_update --
99 * Abort updates in an update change with timestamps newer than the
100 * rollback timestamp.
101 */
102 static void
__txn_abort_newer_update(WT_SESSION_IMPL * session,WT_UPDATE * first_upd,wt_timestamp_t * rollback_timestamp)103 __txn_abort_newer_update(WT_SESSION_IMPL *session,
104 WT_UPDATE *first_upd, wt_timestamp_t *rollback_timestamp)
105 {
106 WT_UPDATE *upd;
107 bool skip_zero_timestamps;
108
109 skip_zero_timestamps = !FLD_ISSET(S2BT(session)->assert_flags,
110 WT_ASSERT_COMMIT_TS_ALWAYS | WT_ASSERT_COMMIT_TS_KEYS);
111
112 for (upd = first_upd; upd != NULL; upd = upd->next) {
113 /*
114 * Updates with no timestamp will have a timestamp of zero and
115 * will never be rolled back. If the table is configured for
116 * strict timestamp checking, assert that all more recent
117 * updates were also rolled back.
118 */
119 if (upd->txnid == WT_TXN_ABORTED && upd == first_upd)
120 first_upd = upd->next;
121 else if (__wt_timestamp_iszero(&upd->timestamp)) {
122 if (skip_zero_timestamps && upd == first_upd)
123 first_upd = upd->next;
124 } else if (__wt_timestamp_cmp(
125 rollback_timestamp, &upd->timestamp) < 0) {
126 upd->txnid = WT_TXN_ABORTED;
127 WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted);
128 __wt_timestamp_set_zero(&upd->timestamp);
129
130 /*
131 * If any updates are aborted, all newer updates
132 * better be aborted as well.
133 */
134 WT_ASSERT(session, upd == first_upd);
135 first_upd = upd->next;
136 }
137 }
138 }
139
140 /*
141 * __txn_abort_newer_insert --
142 * Apply the update abort check to each entry in an insert skip list
143 */
144 static void
__txn_abort_newer_insert(WT_SESSION_IMPL * session,WT_INSERT_HEAD * head,wt_timestamp_t * rollback_timestamp)145 __txn_abort_newer_insert(WT_SESSION_IMPL *session,
146 WT_INSERT_HEAD *head, wt_timestamp_t *rollback_timestamp)
147 {
148 WT_INSERT *ins;
149
150 WT_SKIP_FOREACH(ins, head)
151 __txn_abort_newer_update(session, ins->upd, rollback_timestamp);
152 }
153
154 /*
155 * __txn_abort_newer_col_var --
156 * Abort updates on a variable length col leaf page with timestamps newer
157 * than the rollback timestamp.
158 */
159 static void
__txn_abort_newer_col_var(WT_SESSION_IMPL * session,WT_PAGE * page,wt_timestamp_t * rollback_timestamp)160 __txn_abort_newer_col_var(
161 WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp)
162 {
163 WT_COL *cip;
164 WT_INSERT_HEAD *ins;
165 uint32_t i;
166
167 /* Review the changes to the original on-page data items */
168 WT_COL_FOREACH(page, cip, i)
169 if ((ins = WT_COL_UPDATE(page, cip)) != NULL)
170 __txn_abort_newer_insert(session,
171 ins, rollback_timestamp);
172
173 /* Review the append list */
174 if ((ins = WT_COL_APPEND(page)) != NULL)
175 __txn_abort_newer_insert(session, ins, rollback_timestamp);
176 }
177
178 /*
179 * __txn_abort_newer_col_fix --
180 * Abort updates on a fixed length col leaf page with timestamps newer than
181 * the rollback timestamp.
182 */
183 static void
__txn_abort_newer_col_fix(WT_SESSION_IMPL * session,WT_PAGE * page,wt_timestamp_t * rollback_timestamp)184 __txn_abort_newer_col_fix(
185 WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp)
186 {
187 WT_INSERT_HEAD *ins;
188
189 /* Review the changes to the original on-page data items */
190 if ((ins = WT_COL_UPDATE_SINGLE(page)) != NULL)
191 __txn_abort_newer_insert(session, ins, rollback_timestamp);
192
193 /* Review the append list */
194 if ((ins = WT_COL_APPEND(page)) != NULL)
195 __txn_abort_newer_insert(session, ins, rollback_timestamp);
196 }
197
198 /*
199 * __txn_abort_newer_row_leaf --
200 * Abort updates on a row leaf page with timestamps newer than the
201 * rollback timestamp.
202 */
203 static void
__txn_abort_newer_row_leaf(WT_SESSION_IMPL * session,WT_PAGE * page,wt_timestamp_t * rollback_timestamp)204 __txn_abort_newer_row_leaf(
205 WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp)
206 {
207 WT_INSERT_HEAD *insert;
208 WT_ROW *rip;
209 WT_UPDATE *upd;
210 uint32_t i;
211
212 /*
213 * Review the insert list for keys before the first entry on the disk
214 * page.
215 */
216 if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
217 __txn_abort_newer_insert(session, insert, rollback_timestamp);
218
219 /*
220 * Review updates that belong to keys that are on the disk image,
221 * as well as for keys inserted since the page was read from disk.
222 */
223 WT_ROW_FOREACH(page, rip, i) {
224 if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
225 __txn_abort_newer_update(
226 session, upd, rollback_timestamp);
227
228 if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
229 __txn_abort_newer_insert(
230 session, insert, rollback_timestamp);
231 }
232 }
233
234 /*
235 * __txn_abort_newer_updates --
236 * Abort updates on this page newer than the timestamp.
237 */
238 static int
__txn_abort_newer_updates(WT_SESSION_IMPL * session,WT_REF * ref,wt_timestamp_t * rollback_timestamp)239 __txn_abort_newer_updates(
240 WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t *rollback_timestamp)
241 {
242 WT_DECL_RET;
243 WT_PAGE *page;
244 uint32_t read_flags;
245 bool local_read;
246
247 /*
248 * If we created a page image with updates the need to be rolled back,
249 * read the history into cache now and make sure the page is marked
250 * dirty. Otherwise, the history we need could be swept from the
251 * lookaside table before the page is read because the lookaside sweep
252 * code has no way to tell that the page image is invalid.
253 *
254 * So, if there is lookaside history for a page, first check if the
255 * history needs to be rolled back make sure that history is loaded
256 * into cache. That is, if skew_newest is true, so the disk image
257 * potentially contained unstable updates, and the history is more
258 * recent than the rollback timestamp.
259 *
260 * Also, we have separately discarded any lookaside history more recent
261 * than the rollback timestamp. For page_las structures in cache,
262 * reset any future timestamps back to the rollback timestamp. This
263 * allows those structures to be discarded once the rollback timestamp
264 * is stable (crucially for tests, they can be discarded if the
265 * connection is closed right after a rollback_to_stable call).
266 */
267 local_read = false;
268 read_flags = WT_READ_WONT_NEED;
269 if (ref->page_las != NULL) {
270 if (ref->page_las->skew_newest &&
271 __wt_timestamp_cmp(rollback_timestamp,
272 &ref->page_las->unstable_timestamp) < 0) {
273 /*
274 * Make sure we get back a page with history, not a
275 * limbo page.
276 */
277 WT_ASSERT(session,
278 !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
279 WT_RET(__wt_page_in(session, ref, read_flags));
280 WT_ASSERT(session, ref->state != WT_REF_LIMBO &&
281 ref->page != NULL &&
282 __wt_page_is_modified(ref->page));
283 local_read = true;
284 }
285 if (__wt_timestamp_cmp(&ref->page_las->max_timestamp,
286 rollback_timestamp) > 0)
287 ref->page_las->max_timestamp = *rollback_timestamp;
288 if (__wt_timestamp_cmp(&ref->page_las->unstable_timestamp,
289 rollback_timestamp) > 0)
290 ref->page_las->unstable_timestamp = *rollback_timestamp;
291 if (__wt_timestamp_cmp(&ref->page_las->unstable_timestamp,
292 rollback_timestamp) > 0)
293 ref->page_las->unstable_timestamp = *rollback_timestamp;
294 }
295
296 /* Review deleted page saved to the ref */
297 if (ref->page_del != NULL && __wt_timestamp_cmp(
298 rollback_timestamp, &ref->page_del->timestamp) < 0)
299 WT_ERR(__wt_delete_page_rollback(session, ref));
300
301 /*
302 * If we have a ref with no page, or the page is clean, there is
303 * nothing to roll back.
304 *
305 * This check for a clean page is partly an optimization (checkpoint
306 * only marks pages clean when they have no unwritten updates so
307 * there's no point visiting them again), but also covers a corner case
308 * of a checkpoint with use_timestamp=false. Such a checkpoint
309 * effectively moves the stable timestamp forward, because changes that
310 * are written in the checkpoint cannot be reliably rolled back. The
311 * actual stable timestamp doesn't change, though, so if we try to roll
312 * back clean pages the in-memory tree can get out of sync with the
313 * on-disk tree.
314 */
315 if ((page = ref->page) == NULL || !__wt_page_is_modified(page))
316 goto err;
317
318 switch (page->type) {
319 case WT_PAGE_COL_FIX:
320 __txn_abort_newer_col_fix(session, page, rollback_timestamp);
321 break;
322 case WT_PAGE_COL_VAR:
323 __txn_abort_newer_col_var(session, page, rollback_timestamp);
324 break;
325 case WT_PAGE_COL_INT:
326 case WT_PAGE_ROW_INT:
327 /*
328 * There is nothing to do for internal pages, since we aren't
329 * rolling back far enough to potentially include reconciled
330 * changes - and thus won't need to roll back structure
331 * changes on internal pages.
332 */
333 break;
334 case WT_PAGE_ROW_LEAF:
335 __txn_abort_newer_row_leaf(session, page, rollback_timestamp);
336 break;
337 WT_ILLEGAL_VALUE_ERR(session, page->type);
338 }
339
340 err: if (local_read)
341 WT_TRET(__wt_page_release(session, ref, read_flags));
342 return (ret);
343 }
344
345 /*
346 * __txn_rollback_to_stable_btree_walk --
347 * Called for each open handle - choose to either skip or wipe the commits
348 */
349 static int
__txn_rollback_to_stable_btree_walk(WT_SESSION_IMPL * session,wt_timestamp_t * rollback_timestamp)350 __txn_rollback_to_stable_btree_walk(
351 WT_SESSION_IMPL *session, wt_timestamp_t *rollback_timestamp)
352 {
353 WT_DECL_RET;
354 WT_REF *child_ref, *ref;
355
356 /* Walk the tree, marking commits aborted where appropriate. */
357 ref = NULL;
358 while ((ret = __wt_tree_walk(session, &ref,
359 WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 &&
360 ref != NULL) {
361 if (WT_PAGE_IS_INTERNAL(ref->page)) {
362 WT_INTL_FOREACH_BEGIN(session, ref->page, child_ref) {
363 WT_RET(__txn_abort_newer_updates(
364 session, child_ref, rollback_timestamp));
365 } WT_INTL_FOREACH_END;
366 } else
367 WT_RET(__txn_abort_newer_updates(
368 session, ref, rollback_timestamp));
369 }
370 return (ret);
371 }
372
373 /*
374 * __txn_rollback_eviction_drain --
375 * Wait for eviction to drain from a tree.
376 */
377 static int
__txn_rollback_eviction_drain(WT_SESSION_IMPL * session,const char * cfg[])378 __txn_rollback_eviction_drain(WT_SESSION_IMPL *session, const char *cfg[])
379 {
380 WT_UNUSED(cfg);
381
382 WT_RET(__wt_evict_file_exclusive_on(session));
383 __wt_evict_file_exclusive_off(session);
384 return (0);
385 }
386
387 /*
388 * __txn_rollback_to_stable_btree --
389 * Called for each open handle - choose to either skip or wipe the commits
390 */
391 static int
__txn_rollback_to_stable_btree(WT_SESSION_IMPL * session,const char * cfg[])392 __txn_rollback_to_stable_btree(WT_SESSION_IMPL *session, const char *cfg[])
393 {
394 WT_BTREE *btree;
395 WT_CONNECTION_IMPL *conn;
396 WT_DECL_RET;
397 WT_DECL_TIMESTAMP(rollback_timestamp)
398 WT_TXN_GLOBAL *txn_global;
399
400 WT_UNUSED(cfg);
401
402 btree = S2BT(session);
403 conn = S2C(session);
404 txn_global = &conn->txn_global;
405
406 /*
407 * Immediately durable files don't get their commits wiped. This case
408 * mostly exists to support the semantic required for the oplog in
409 * MongoDB - updates that have been made to the oplog should not be
410 * aborted. It also wouldn't be safe to roll back updates for any
411 * table that had it's records logged, since those updates would be
412 * recovered after a crash making them inconsistent.
413 */
414 if (__wt_btree_immediately_durable(session)) {
415 /*
416 * Add the btree ID to the bitstring, so we can exclude any
417 * lookaside entries for this btree.
418 */
419 if (btree->id >= conn->stable_rollback_maxfile)
420 WT_PANIC_RET(session, EINVAL, "btree file ID %" PRIu32
421 " larger than max %" PRIu32,
422 btree->id, conn->stable_rollback_maxfile);
423 __bit_set(conn->stable_rollback_bitstring, btree->id);
424 return (0);
425 }
426
427 /* There is never anything to do for checkpoint handles */
428 if (session->dhandle->checkpoint != NULL)
429 return (0);
430
431 /* There is nothing to do on an empty tree. */
432 if (btree->root.page == NULL)
433 return (0);
434
435 /*
436 * Copy the stable timestamp, otherwise we'd need to lock it each time
437 * it's accessed. Even though the stable timestamp isn't supposed to be
438 * updated while rolling back, accessing it without a lock would
439 * violate protocol.
440 */
441 WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
442 __wt_timestamp_set(
443 &rollback_timestamp, &txn_global->stable_timestamp));
444
445 /*
446 * Ensure the eviction server is out of the file - we don't
447 * want it messing with us. This step shouldn't be required, but
448 * it simplifies some of the reasoning about what state trees can
449 * be in.
450 */
451 WT_RET(__wt_evict_file_exclusive_on(session));
452 WT_WITH_PAGE_INDEX(session, ret = __txn_rollback_to_stable_btree_walk(
453 session, &rollback_timestamp));
454 __wt_evict_file_exclusive_off(session);
455
456 return (ret);
457 }
458
459 /*
460 * __txn_rollback_to_stable_check --
461 * Ensure the rollback request is reasonable.
462 */
463 static int
__txn_rollback_to_stable_check(WT_SESSION_IMPL * session)464 __txn_rollback_to_stable_check(WT_SESSION_IMPL *session)
465 {
466 WT_TXN_GLOBAL *txn_global;
467 bool txn_active;
468
469 txn_global = &S2C(session)->txn_global;
470 if (!txn_global->has_stable_timestamp)
471 WT_RET_MSG(session, EINVAL,
472 "rollback_to_stable requires a stable timestamp");
473
474 /*
475 * Help the user - see if they have any active transactions. I'd
476 * like to check the transaction running flag, but that would
477 * require peeking into all open sessions, which isn't really
478 * kosher.
479 */
480 WT_RET(__wt_txn_activity_check(session, &txn_active));
481 if (txn_active)
482 WT_RET_MSG(session, EINVAL,
483 "rollback_to_stable illegal with active transactions");
484
485 return (0);
486 }
487 #endif
488
489 /*
490 * __wt_txn_rollback_to_stable --
491 * Rollback all in-memory state related to timestamps more recent than
492 * the passed in timestamp.
493 */
494 int
__wt_txn_rollback_to_stable(WT_SESSION_IMPL * session,const char * cfg[])495 __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
496 {
497 #ifndef HAVE_TIMESTAMPS
498 WT_UNUSED(cfg);
499
500 WT_RET_MSG(session, ENOTSUP, "rollback_to_stable "
501 "requires a version of WiredTiger built with timestamp support");
502 #else
503 WT_CONNECTION_IMPL *conn;
504 WT_DECL_RET;
505
506 conn = S2C(session);
507
508 WT_STAT_CONN_INCR(session, txn_rollback_to_stable);
509 /*
510 * Mark that a rollback operation is in progress and wait for eviction
511 * to drain. This is necessary because lookaside eviction uses
512 * transactions and causes the check for a quiescent system to fail.
513 *
514 * Configuring lookaside eviction off isn't atomic, safe because the
515 * flag is only otherwise set when closing down the database. Assert
516 * to avoid confusion in the future.
517 */
518 WT_ASSERT(session, !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE));
519 F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
520
521 WT_ERR(__wt_conn_btree_apply(session,
522 NULL, __txn_rollback_eviction_drain, NULL, cfg));
523
524 WT_ERR(__txn_rollback_to_stable_check(session));
525
526 F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
527
528 /*
529 * Allocate a non-durable btree bitstring. We increment the global
530 * value before using it, so the current value is already in use, and
531 * hence we need to add one here.
532 */
533 conn->stable_rollback_maxfile = conn->next_file_id + 1;
534 WT_ERR(__bit_alloc(session,
535 conn->stable_rollback_maxfile, &conn->stable_rollback_bitstring));
536 WT_ERR(__wt_conn_btree_apply(session,
537 NULL, __txn_rollback_to_stable_btree, NULL, cfg));
538
539 /*
540 * Clear any offending content from the lookaside file. This must be
541 * done after the in-memory application, since the process of walking
542 * trees in cache populates a list that is used to check which
543 * lookaside records should be removed.
544 */
545 if (!F_ISSET(conn, WT_CONN_IN_MEMORY))
546 WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session));
547
548 err: F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
549 __wt_free(session, conn->stable_rollback_bitstring);
550 return (ret);
551 #endif
552 }
553