1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 /*
12 * Fast-delete support.
13 *
14 * This file contains most of the code that allows WiredTiger to delete pages
15 * of data without reading them into the cache. (This feature is currently
16 * only available for row-store objects.)
17 *
18 * The way cursor truncate works in a row-store object is it explicitly reads
19 * the first and last pages of the truncate range, then walks the tree with a
20 * flag so the tree walk code skips reading eligible pages within the range
21 * and instead just marks them as deleted, by changing their WT_REF state to
 * WT_REF_DELETED. Pages ineligible for this fast path include pages already
 * in the cache, pages with overflow items, and pages requiring lookaside
 * records. Ineligible pages are read and have their rows updated/deleted
 * individually.
25 * The transaction for the delete operation is stored in memory referenced by
26 * the WT_REF.page_del field.
27 *
28 * Future cursor walks of the tree will skip the deleted page based on the
29 * transaction stored for the delete, but it gets more complicated if a read is
30 * done using a random key, or a cursor walk is done with a transaction where
31 * the delete is not visible. In those cases, we read the original contents of
32 * the page. The page-read code notices a deleted page is being read, and as
33 * part of the read instantiates the contents of the page, creating a WT_UPDATE
34 * with a deleted operation, in the same transaction as deleted the page. In
35 * other words, the read process makes it appear as if the page was read and
36 * each individual row deleted, exactly as would have happened if the page had
37 * been in the cache all along.
38 *
39 * There's an additional complication to support rollback of the page delete.
40 * When the page was marked deleted, a pointer to the WT_REF was saved in the
41 * deleting session's transaction list and the delete is unrolled by resetting
42 * the WT_REF_DELETED state back to WT_REF_DISK. However, if the page has been
43 * instantiated by some reading thread, that's not enough, each individual row
44 * on the page must have the delete operation reset. If the page split, the
45 * WT_UPDATE lists might have been saved/restored during reconciliation and
46 * appear on multiple pages, and the WT_REF stored in the deleting session's
47 * transaction list is no longer useful. For this reason, when the page is
48 * instantiated by a read, a list of the WT_UPDATE structures on the page is
49 * stored in the WT_REF.page_del field, with the transaction ID, that way the
50 * session committing/unrolling the delete can find all WT_UPDATE structures
51 * that require update.
52 *
53 * One final note: pages can also be marked deleted if emptied and evicted. In
54 * that case, the WT_REF state will be set to WT_REF_DELETED but there will not
55 * be any associated WT_REF.page_del field. These pages are always skipped
56 * during cursor traversal (the page could not have been evicted if there were
57 * updates that weren't globally visible), and if read is forced to instantiate
58 * such a page, it simply creates an empty page from scratch.
59 */
60
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 *	On success, sets *skipp to true and leaves the WT_REF in the
 *	WT_REF_DELETED state; otherwise *skipp is false and the caller falls
 *	back to reading the page and deleting rows individually.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_ADDR *ref_addr;
	WT_DECL_RET;
	uint32_t previous_state;

	*skipp = false;

	/*
	 * If we have a clean page in memory, attempt to evict it so the page
	 * returns to a state eligible for fast-delete. Lock the ref (CAS to
	 * WT_REF_LOCKED) before looking at the page: a dirty page can't be
	 * fast-deleted, restore the state and return.
	 */
	previous_state = ref->state;
	if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) &&
	    __wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) {
		if (__wt_page_is_modified(ref->page)) {
			ref->state = previous_state;
			return (0);
		}

		/*
		 * Bump the tree's eviction-busy count so eviction doesn't
		 * interfere while we evict the page ourselves. EBUSY is
		 * expected (the state check below fails if eviction didn't
		 * happen); any other error returns to the caller. Clear ret
		 * so a mapped EBUSY doesn't leak into the error path.
		 */
		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict(session, ref, false, previous_state);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);
		ret = 0;
	}

	/*
	 * Fast check to see if it's worth locking, then atomically switch the
	 * page's state to lock it. Only on-disk and lookaside pages are
	 * eligible for fast-delete; anything else takes the slow path.
	 */
	previous_state = ref->state;
	switch (previous_state) {
	case WT_REF_DISK:
	case WT_REF_LOOKASIDE:
		break;
	default:
		return (0);
	}
	if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED))
		return (0);

	/*
	 * If this WT_REF was previously part of a truncate operation, there
	 * may be existing page-delete information. The structure is only read
	 * while the state is locked, free the previous version.
	 *
	 * Note: changes have been made, we must publish any state change from
	 * this point on.
	 */
	if (ref->page_del != NULL) {
		/* Leftover page-delete info must be from an aborted delete. */
		WT_ASSERT(session, ref->page_del->txnid == WT_TXN_ABORTED);
		__wt_free(session, ref->page_del->update_list);
		__wt_free(session, ref->page_del);
	}

	/*
	 * We cannot truncate pages that have overflow key/value items as the
	 * overflow blocks have to be discarded. The way we figure that out is
	 * to check the page's cell type, cells for leaf pages without overflow
	 * items are special.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree. We can look at
	 * the parent page itself because the page can't change underneath us.
	 * However, if the parent page splits, our reference address can change;
	 * we don't care what version of it we read, as long as we don't read
	 * it twice.
	 */
	WT_ORDERED_READ(ref_addr, ref->addr);
	if (ref_addr != NULL &&
	    (__wt_off_page(ref->home, ref_addr) ?
	    ref_addr->type != WT_ADDR_LEAF_NO :
	    __wt_cell_type_raw((WT_CELL *)ref_addr) != WT_CELL_ADDR_LEAF_NO))
		goto err;

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/* Allocate and initialize the page-deleted structure. */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->previous_state = previous_state;

	/* Record the delete in the session's transaction for commit/abort. */
	WT_ERR(__wt_txn_modify_page_delete(session, ref));

	*skipp = true;
	WT_STAT_CONN_INCR(session, rec_page_delete_fast);
	WT_STAT_DATA_INCR(session, rec_page_delete_fast);

	/* Publish the page to its new state, ensuring visibility. */
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/* Publish the page to its previous state, ensuring visibility. */
	WT_PUBLISH(ref->state, previous_state);
	return (ret);
}
167
/*
 * __wt_delete_page_rollback --
 *	Abort pages that were deleted without being instantiated. Called when
 *	unrolling a truncate transaction; always returns 0 (the illegal-value
 *	macro is the only failure path).
 */
int
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_UPDATE **updp;
	uint64_t sleep_usecs, yield_count;
	uint32_t current_state;
	bool locked;

	/*
	 * If the page is still "deleted", it's as we left it, reset the state
	 * to on-disk and we're done. Otherwise, we expect the page is either
	 * instantiated or being instantiated. Loop because it's possible for
	 * the page to return to the deleted state if instantiation fails.
	 */
	for (locked = false, sleep_usecs = yield_count = 0;;) {
		switch (current_state = ref->state) {
		case WT_REF_DELETED:
			/*
			 * If the page is still "deleted", it's as we left it,
			 * reset the state (previous_state was saved when the
			 * page was fast-deleted).
			 */
			if (__wt_atomic_casv32(&ref->state,
			    WT_REF_DELETED, ref->page_del->previous_state))
				goto done;
			break;
		case WT_REF_LOCKED:
			/*
			 * A possible state, the page is being instantiated;
			 * wait for that to finish.
			 */
			break;
		case WT_REF_MEM:
		case WT_REF_SPLIT:
			/*
			 * The page was instantiated: lock it so we can walk
			 * and abort its update list below.
			 */
			if (__wt_atomic_casv32(
			    &ref->state, current_state, WT_REF_LOCKED))
				locked = true;
			break;
		case WT_REF_DISK:
		case WT_REF_LIMBO:
		case WT_REF_LOOKASIDE:
		case WT_REF_READING:
			/* None of these can follow an unresolved delete. */
			WT_ILLEGAL_VALUE(session, current_state);
		}

		if (locked)
			break;

		/*
		 * We wait for the change in page state, yield before retrying,
		 * and if we've yielded enough times, start sleeping so we
		 * don't burn CPU to no purpose.
		 */
		__wt_spin_backoff(&yield_count, &sleep_usecs);
		WT_STAT_CONN_INCRV(session,
		    page_del_rollback_blocked, sleep_usecs);
	}

	/*
	 * We can't use the normal read path to get a copy of the page
	 * because the session may have closed the cursor, we no longer
	 * have the reference to the tree required for a hazard
	 * pointer. We're safe because with unresolved transactions,
	 * the page isn't going anywhere.
	 *
	 * The page is in an in-memory state, which means it
	 * was instantiated at some point. Walk any list of
	 * update structures and abort them.
	 */
	WT_ASSERT(session, locked);
	if ((updp = ref->page_del->update_list) != NULL)
		for (; *updp != NULL; ++updp)
			(*updp)->txnid = WT_TXN_ABORTED;

	/* Unlock the ref, restoring the in-memory state we found. */
	ref->state = current_state;

done:	/*
	 * Now mark the truncate aborted: this must come last because after
	 * this point there is nothing preventing the page from being evicted.
	 */
	WT_PUBLISH(ref->page_del->txnid, WT_TXN_ABORTED);
	return (0);
}
253
/*
 * __wt_delete_page_skip --
 *	If iterating a cursor, skip deleted pages that are either visible to
 *	us or globally visible. Returns true if the page can be skipped
 *	without being read into the cache.
 */
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
	bool skip;

	/*
	 * Deleted pages come from two sources: either it's a truncate as
	 * described above, or the page has been emptied by other operations
	 * and eviction deleted it.
	 *
	 * In both cases, the WT_REF state will be WT_REF_DELETED. In the case
	 * of a truncated page, there will be a WT_PAGE_DELETED structure with
	 * the transaction ID of the transaction that deleted the page, and the
	 * page is visible if that transaction ID is visible. In the case of an
	 * empty page, there will be no WT_PAGE_DELETED structure and the delete
	 * is by definition visible, eviction could not have deleted the page if
	 * there were changes on it that were not globally visible.
	 *
	 * We're here because we found a WT_REF state set to WT_REF_DELETED. It
	 * is possible the page is being read into memory right now, though, and
	 * the page could switch to an in-memory state at any time. Lock down
	 * the structure, just to be safe.
	 */
	if (ref->page_del == NULL && ref->page_las == NULL)
		return (true);

	/* If we can't lock the ref, it's being read in; don't skip it. */
	if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		return (false);

	/*
	 * Skip only if neither the page-delete nor any lookaside records are
	 * still active, i.e., potentially visible to some reader.
	 */
	skip = !__wt_page_del_active(session, ref, visible_all) &&
	    !__wt_page_las_active(session, ref);

	/*
	 * The page_del structure can be freed as soon as the delete is stable:
	 * it is only read when the ref state is locked. It is worth checking
	 * every time we come through because once this is freed, we no longer
	 * need synchronization to check the ref.
	 */
	if (skip && ref->page_del != NULL && (visible_all ||
	    __wt_txn_visible_all(session, ref->page_del->txnid,
	    WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) {
		__wt_free(session, ref->page_del->update_list);
		__wt_free(session, ref->page_del);
	}

	/* Unlock the ref, restoring the deleted state. */
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (skip);
}
307
308 /*
309 * __tombstone_update_alloc --
310 * Allocate and initialize a page-deleted tombstone update structure.
311 */
312 static int
__tombstone_update_alloc(WT_SESSION_IMPL * session,WT_PAGE_DELETED * page_del,WT_UPDATE ** updp,size_t * sizep)313 __tombstone_update_alloc(WT_SESSION_IMPL *session,
314 WT_PAGE_DELETED *page_del, WT_UPDATE **updp, size_t *sizep)
315 {
316 WT_UPDATE *upd;
317
318 WT_RET(
319 __wt_update_alloc(session, NULL, &upd, sizep, WT_UPDATE_TOMBSTONE));
320
321 /*
322 * Cleared memory matches the lowest possible transaction ID and
323 * timestamp, do nothing.
324 */
325 if (page_del != NULL) {
326 upd->txnid = page_del->txnid;
327 __wt_timestamp_set(&upd->timestamp, &page_del->timestamp);
328 upd->prepare_state = page_del->prepare_state;
329 }
330 *updp = upd;
331 return (0);
332 }
333
/*
 * __wt_delete_page_instantiate --
 *	Instantiate an entirely deleted row-store leaf page: make the page
 *	look as if every row had been individually removed by attaching a
 *	tombstone update to each entry.
 */
int
__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *insert;
	WT_PAGE *page;
	WT_PAGE_DELETED *page_del;
	WT_ROW *rip;
	WT_UPDATE **upd_array, *upd;
	size_t size;
	uint32_t count, i;

	btree = S2BT(session);
	page = ref->page;

	WT_STAT_CONN_INCR(session, cache_read_deleted);
	WT_STAT_DATA_INCR(session, cache_read_deleted);

	/*
	 * Give the page a modify structure.
	 *
	 * Mark tree dirty, unless the handle is read-only.
	 * (We'd like to free the deleted pages, but if the handle is read-only,
	 * we're not able to do so.)
	 */
	WT_RET(__wt_page_modify_init(session, page));
	if (!F_ISSET(btree, WT_BTREE_READONLY))
		__wt_page_modify_set(session, page);

	/* Count instantiations of prepared deletes separately. */
	if (ref->page_del != NULL &&
	    ref->page_del->prepare_state != WT_PREPARE_INIT) {
		WT_STAT_CONN_INCR(session, cache_read_deleted_prepared);
		WT_STAT_DATA_INCR(session, cache_read_deleted_prepared);
	}

	/*
	 * An operation is accessing a "deleted" page, and we're building an
	 * in-memory version of the page (making it look like all entries in
	 * the page were individually updated by a remove operation). There
	 * are two cases where we end up here:
	 *
	 * First, a running transaction used a truncate call to delete the page
	 * without reading it, in which case the page reference includes a
	 * structure with a transaction ID; the page we're building might split
	 * in the future, so we update that structure to include references to
	 * all of the update structures we create, so the transaction can abort.
	 *
	 * Second, a truncate call deleted a page and the truncate committed,
	 * but an older transaction in the system forced us to keep the old
	 * version of the page around, then we crashed and recovered or we're
	 * running inside a checkpoint, and now we're being forced to read that
	 * page.
	 *
	 * Expect a page-deleted structure if there's a running transaction that
	 * needs to be resolved, otherwise, there may not be one (and, if the
	 * transaction has resolved, we can ignore the page-deleted structure).
	 */
	page_del = __wt_page_del_active(session, ref, true) ?
	    ref->page_del : NULL;

	/*
	 * Allocate the per-page update array if one doesn't already exist. (It
	 * might already exist because deletes are instantiated after lookaside
	 * table updates.)
	 */
	if (page->entries != 0 && page->modify->mod_row_update == NULL)
		WT_RET(__wt_calloc_def(
		    session, page->entries, &page->modify->mod_row_update));

	/*
	 * Allocate the per-reference update array; in the case of instantiating
	 * a page deleted in a running transaction, we need a list of the update
	 * structures for the eventual commit or abort.
	 *
	 * First pass: count every row and every inserted key so the list can
	 * be sized exactly (plus one slot for the NULL terminator).
	 */
	if (page_del != NULL) {
		count = 0;
		if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
			WT_SKIP_FOREACH(ins, insert)
				++count;
		WT_ROW_FOREACH(page, rip, i) {
			++count;
			if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
				WT_SKIP_FOREACH(ins, insert)
					++count;
		}
		WT_RET(__wt_calloc_def(
		    session, count + 1, &page_del->update_list));
	}

	/*
	 * Walk the page entries, giving each one a tombstone. Each tombstone
	 * is pushed onto the front of its entry's update chain, and (when
	 * there's an unresolved truncate) also recorded in the update list.
	 */
	size = 0;
	count = 0;
	upd_array = page->modify->mod_row_update;
	if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
		WT_SKIP_FOREACH(ins, insert) {
			WT_ERR(__tombstone_update_alloc(
			    session, page_del, &upd, &size));
			upd->next = ins->upd;
			ins->upd = upd;

			if (page_del != NULL)
				page_del->update_list[count++] = upd;
		}
	WT_ROW_FOREACH(page, rip, i) {
		WT_ERR(__tombstone_update_alloc(
		    session, page_del, &upd, &size));
		upd->next = upd_array[WT_ROW_SLOT(page, rip)];
		upd_array[WT_ROW_SLOT(page, rip)] = upd;

		if (page_del != NULL)
			page_del->update_list[count++] = upd;

		if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
			WT_SKIP_FOREACH(ins, insert) {
				WT_ERR(__tombstone_update_alloc(
				    session, page_del, &upd, &size));
				upd->next = ins->upd;
				ins->upd = upd;

				if (page_del != NULL)
					page_del->update_list[count++] = upd;
			}
	}

	/* Account for the tombstones in the cache's memory usage. */
	__wt_cache_page_inmem_incr(session, page, size);

	return (0);

err:	/*
	 * The page-delete update structure may have existed before we were
	 * called, and presumably might be in use by a running transaction.
	 * The list of update structures cannot have been created before we
	 * were called, and should not exist if we exit with an error.
	 * (Tombstones already attached to update chains stay on the page and
	 * are presumably discarded with it — NOTE(review): confirm against
	 * the page-discard path.)
	 */
	if (page_del != NULL)
		__wt_free(session, page_del->update_list);
	return (ret);
}
478