1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 /*
12  * Fast-delete support.
13  *
14  * This file contains most of the code that allows WiredTiger to delete pages
15  * of data without reading them into the cache.  (This feature is currently
16  * only available for row-store objects.)
17  *
18  * The way cursor truncate works in a row-store object is it explicitly reads
19  * the first and last pages of the truncate range, then walks the tree with a
20  * flag so the tree walk code skips reading eligible pages within the range
21  * and instead just marks them as deleted, by changing their WT_REF state to
22  * WT_REF_DELETED. Pages ineligible for this fast path include pages already
23  * in the cache, having overflow items, or requiring lookaside records.
24  * Ineligible pages are read and have their rows updated/deleted individually.
25  * The transaction for the delete operation is stored in memory referenced by
26  * the WT_REF.page_del field.
27  *
28  * Future cursor walks of the tree will skip the deleted page based on the
29  * transaction stored for the delete, but it gets more complicated if a read is
30  * done using a random key, or a cursor walk is done with a transaction where
31  * the delete is not visible.  In those cases, we read the original contents of
32  * the page.  The page-read code notices a deleted page is being read, and as
33  * part of the read instantiates the contents of the page, creating a WT_UPDATE
34  * with a deleted operation, in the same transaction as deleted the page.  In
35  * other words, the read process makes it appear as if the page was read and
36  * each individual row deleted, exactly as would have happened if the page had
37  * been in the cache all along.
38  *
39  * There's an additional complication to support rollback of the page delete.
40  * When the page was marked deleted, a pointer to the WT_REF was saved in the
41  * deleting session's transaction list and the delete is unrolled by resetting
42  * the WT_REF_DELETED state back to WT_REF_DISK.  However, if the page has been
43  * instantiated by some reading thread, that's not enough, each individual row
44  * on the page must have the delete operation reset.  If the page split, the
45  * WT_UPDATE lists might have been saved/restored during reconciliation and
46  * appear on multiple pages, and the WT_REF stored in the deleting session's
47  * transaction list is no longer useful.  For this reason, when the page is
48  * instantiated by a read, a list of the WT_UPDATE structures on the page is
49  * stored in the WT_REF.page_del field, with the transaction ID, that way the
50  * session committing/unrolling the delete can find all WT_UPDATE structures
51  * that require update.
52  *
53  * One final note: pages can also be marked deleted if emptied and evicted.  In
54  * that case, the WT_REF state will be set to WT_REF_DELETED but there will not
55  * be any associated WT_REF.page_del field.  These pages are always skipped
56  * during cursor traversal (the page could not have been evicted if there were
57  * updates that weren't globally visible), and if read is forced to instantiate
58  * such a page, it simply creates an empty page from scratch.
59  */
60 
61 /*
62  * __wt_delete_page --
63  *	If deleting a range, try to delete the page without instantiating it.
64  */
65 int
__wt_delete_page(WT_SESSION_IMPL * session,WT_REF * ref,bool * skipp)66 __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
67 {
68 	WT_ADDR *ref_addr;
69 	WT_DECL_RET;
70 	uint32_t previous_state;
71 
72 	*skipp = false;
73 
74 	/* If we have a clean page in memory, attempt to evict it. */
75 	previous_state = ref->state;
76 	if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) &&
77 	    __wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) {
78 		if (__wt_page_is_modified(ref->page)) {
79 			ref->state = previous_state;
80 			return (0);
81 		}
82 
83 		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
84 		ret = __wt_evict(session, ref, false, previous_state);
85 		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
86 		WT_RET_BUSY_OK(ret);
87 		ret = 0;
88 	}
89 
90 	/*
91 	 * Fast check to see if it's worth locking, then atomically switch the
92 	 * page's state to lock it.
93 	 */
94 	previous_state = ref->state;
95 	switch (previous_state) {
96 	case WT_REF_DISK:
97 	case WT_REF_LOOKASIDE:
98 		break;
99 	default:
100 		return (0);
101 	}
102 	if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED))
103 		return (0);
104 
105 	/*
106 	 * If this WT_REF was previously part of a truncate operation, there
107 	 * may be existing page-delete information. The structure is only read
108 	 * while the state is locked, free the previous version.
109 	 *
110 	 * Note: changes have been made, we must publish any state change from
111 	 * this point on.
112 	 */
113 	if (ref->page_del != NULL) {
114 		WT_ASSERT(session, ref->page_del->txnid == WT_TXN_ABORTED);
115 		__wt_free(session, ref->page_del->update_list);
116 		__wt_free(session, ref->page_del);
117 	}
118 
119 	/*
120 	 * We cannot truncate pages that have overflow key/value items as the
121 	 * overflow blocks have to be discarded.  The way we figure that out is
122 	 * to check the page's cell type, cells for leaf pages without overflow
123 	 * items are special.
124 	 *
125 	 * To look at an on-page cell, we need to look at the parent page, and
126 	 * that's dangerous, our parent page could change without warning if
127 	 * the parent page were to split, deepening the tree. We can look at
128 	 * the parent page itself because the page can't change underneath us.
129 	 * However, if the parent page splits, our reference address can change;
130 	 * we don't care what version of it we read, as long as we don't read
131 	 * it twice.
132 	 */
133 	WT_ORDERED_READ(ref_addr, ref->addr);
134 	if (ref_addr != NULL &&
135 	    (__wt_off_page(ref->home, ref_addr) ?
136 	    ref_addr->type != WT_ADDR_LEAF_NO :
137 	    __wt_cell_type_raw((WT_CELL *)ref_addr) != WT_CELL_ADDR_LEAF_NO))
138 		goto err;
139 
140 	/*
141 	 * This action dirties the parent page: mark it dirty now, there's no
142 	 * future reconciliation of the child leaf page that will dirty it as
143 	 * we write the tree.
144 	 */
145 	WT_ERR(__wt_page_parent_modify_set(session, ref, false));
146 
147 	/* Allocate and initialize the page-deleted structure. */
148 	WT_ERR(__wt_calloc_one(session, &ref->page_del));
149 	ref->page_del->previous_state = previous_state;
150 
151 	WT_ERR(__wt_txn_modify_page_delete(session, ref));
152 
153 	*skipp = true;
154 	WT_STAT_CONN_INCR(session, rec_page_delete_fast);
155 	WT_STAT_DATA_INCR(session, rec_page_delete_fast);
156 
157 	/* Publish the page to its new state, ensuring visibility. */
158 	WT_PUBLISH(ref->state, WT_REF_DELETED);
159 	return (0);
160 
161 err:	__wt_free(session, ref->page_del);
162 
163 	/* Publish the page to its previous state, ensuring visibility. */
164 	WT_PUBLISH(ref->state, previous_state);
165 	return (ret);
166 }
167 
168 /*
169  * __wt_delete_page_rollback --
170  *	Abort pages that were deleted without being instantiated.
171  */
172 int
__wt_delete_page_rollback(WT_SESSION_IMPL * session,WT_REF * ref)173 __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
174 {
175 	WT_UPDATE **updp;
176 	uint64_t sleep_usecs, yield_count;
177 	uint32_t current_state;
178 	bool locked;
179 
180 	/*
181 	 * If the page is still "deleted", it's as we left it, reset the state
182 	 * to on-disk and we're done.  Otherwise, we expect the page is either
183 	 * instantiated or being instantiated.  Loop because it's possible for
184 	 * the page to return to the deleted state if instantiation fails.
185 	 */
186 	for (locked = false, sleep_usecs = yield_count = 0;;) {
187 		switch (current_state = ref->state) {
188 		case WT_REF_DELETED:
189 			/*
190 			 * If the page is still "deleted", it's as we left it,
191 			 * reset the state.
192 			 */
193 			if (__wt_atomic_casv32(&ref->state,
194 			    WT_REF_DELETED, ref->page_del->previous_state))
195 				goto done;
196 			break;
197 		case WT_REF_LOCKED:
198 			/*
199 			 * A possible state, the page is being instantiated.
200 			 */
201 			break;
202 		case WT_REF_MEM:
203 		case WT_REF_SPLIT:
204 			if (__wt_atomic_casv32(
205 			    &ref->state, current_state, WT_REF_LOCKED))
206 				locked = true;
207 			break;
208 		case WT_REF_DISK:
209 		case WT_REF_LIMBO:
210 		case WT_REF_LOOKASIDE:
211 		case WT_REF_READING:
212 		WT_ILLEGAL_VALUE(session, current_state);
213 		}
214 
215 		if (locked)
216 			break;
217 
218 		/*
219 		 * We wait for the change in page state, yield before retrying,
220 		 * and if we've yielded enough times, start sleeping so we
221 		 * don't burn CPU to no purpose.
222 		 */
223 		__wt_spin_backoff(&yield_count, &sleep_usecs);
224 		WT_STAT_CONN_INCRV(session,
225 		    page_del_rollback_blocked, sleep_usecs);
226 	}
227 
228 	/*
229 	 * We can't use the normal read path to get a copy of the page
230 	 * because the session may have closed the cursor, we no longer
231 	 * have the reference to the tree required for a hazard
232 	 * pointer.  We're safe because with unresolved transactions,
233 	 * the page isn't going anywhere.
234 	 *
235 	 * The page is in an in-memory state, which means it
236 	 * was instantiated at some point. Walk any list of
237 	 * update structures and abort them.
238 	 */
239 	WT_ASSERT(session, locked);
240 	if ((updp = ref->page_del->update_list) != NULL)
241 		for (; *updp != NULL; ++updp)
242 			(*updp)->txnid = WT_TXN_ABORTED;
243 
244 	ref->state = current_state;
245 
246 done:	/*
247 	 * Now mark the truncate aborted: this must come last because after
248 	 * this point there is nothing preventing the page from being evicted.
249 	 */
250 	WT_PUBLISH(ref->page_del->txnid, WT_TXN_ABORTED);
251 	return (0);
252 }
253 
254 /*
255  * __wt_delete_page_skip --
256  *	If iterating a cursor, skip deleted pages that are either visible to
257  * us or globally visible.
258  */
259 bool
__wt_delete_page_skip(WT_SESSION_IMPL * session,WT_REF * ref,bool visible_all)260 __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
261 {
262 	bool skip;
263 
264 	/*
265 	 * Deleted pages come from two sources: either it's a truncate as
266 	 * described above, or the page has been emptied by other operations
267 	 * and eviction deleted it.
268 	 *
269 	 * In both cases, the WT_REF state will be WT_REF_DELETED.  In the case
270 	 * of a truncated page, there will be a WT_PAGE_DELETED structure with
271 	 * the transaction ID of the transaction that deleted the page, and the
272 	 * page is visible if that transaction ID is visible.  In the case of an
273 	 * empty page, there will be no WT_PAGE_DELETED structure and the delete
274 	 * is by definition visible, eviction could not have deleted the page if
275 	 * there were changes on it that were not globally visible.
276 	 *
277 	 * We're here because we found a WT_REF state set to WT_REF_DELETED.  It
278 	 * is possible the page is being read into memory right now, though, and
279 	 * the page could switch to an in-memory state at any time.  Lock down
280 	 * the structure, just to be safe.
281 	 */
282 	if (ref->page_del == NULL && ref->page_las == NULL)
283 		return (true);
284 
285 	if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
286 		return (false);
287 
288 	skip = !__wt_page_del_active(session, ref, visible_all) &&
289 	    !__wt_page_las_active(session, ref);
290 
291 	/*
292 	 * The page_del structure can be freed as soon as the delete is stable:
293 	 * it is only read when the ref state is locked. It is worth checking
294 	 * every time we come through because once this is freed, we no longer
295 	 * need synchronization to check the ref.
296 	 */
297 	if (skip && ref->page_del != NULL && (visible_all ||
298 	    __wt_txn_visible_all(session, ref->page_del->txnid,
299 	    WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) {
300 		__wt_free(session, ref->page_del->update_list);
301 		__wt_free(session, ref->page_del);
302 	}
303 
304 	WT_PUBLISH(ref->state, WT_REF_DELETED);
305 	return (skip);
306 }
307 
308 /*
309  * __tombstone_update_alloc --
310  *	Allocate and initialize a page-deleted tombstone update structure.
311  */
312 static int
__tombstone_update_alloc(WT_SESSION_IMPL * session,WT_PAGE_DELETED * page_del,WT_UPDATE ** updp,size_t * sizep)313 __tombstone_update_alloc(WT_SESSION_IMPL *session,
314     WT_PAGE_DELETED *page_del, WT_UPDATE **updp, size_t *sizep)
315 {
316 	WT_UPDATE *upd;
317 
318 	WT_RET(
319 	    __wt_update_alloc(session, NULL, &upd, sizep, WT_UPDATE_TOMBSTONE));
320 
321 	/*
322 	 * Cleared memory matches the lowest possible transaction ID and
323 	 * timestamp, do nothing.
324 	 */
325 	if (page_del != NULL) {
326 		upd->txnid = page_del->txnid;
327 		__wt_timestamp_set(&upd->timestamp, &page_del->timestamp);
328 		upd->prepare_state = page_del->prepare_state;
329 	}
330 	*updp = upd;
331 	return (0);
332 }
333 
334 /*
335  * __wt_delete_page_instantiate --
336  *	Instantiate an entirely deleted row-store leaf page.
337  */
338 int
__wt_delete_page_instantiate(WT_SESSION_IMPL * session,WT_REF * ref)339 __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
340 {
341 	WT_BTREE *btree;
342 	WT_DECL_RET;
343 	WT_INSERT *ins;
344 	WT_INSERT_HEAD *insert;
345 	WT_PAGE *page;
346 	WT_PAGE_DELETED *page_del;
347 	WT_ROW *rip;
348 	WT_UPDATE **upd_array, *upd;
349 	size_t size;
350 	uint32_t count, i;
351 
352 	btree = S2BT(session);
353 	page = ref->page;
354 
355 	WT_STAT_CONN_INCR(session, cache_read_deleted);
356 	WT_STAT_DATA_INCR(session, cache_read_deleted);
357 
358 	/*
359 	 * Give the page a modify structure.
360 	 *
361 	 * Mark tree dirty, unless the handle is read-only.
362 	 * (We'd like to free the deleted pages, but if the handle is read-only,
363 	 * we're not able to do so.)
364 	 */
365 	WT_RET(__wt_page_modify_init(session, page));
366 	if (!F_ISSET(btree, WT_BTREE_READONLY))
367 		__wt_page_modify_set(session, page);
368 
369 	if (ref->page_del != NULL &&
370 	    ref->page_del->prepare_state != WT_PREPARE_INIT) {
371 		WT_STAT_CONN_INCR(session, cache_read_deleted_prepared);
372 		WT_STAT_DATA_INCR(session, cache_read_deleted_prepared);
373 	}
374 
375 	/*
376 	 * An operation is accessing a "deleted" page, and we're building an
377 	 * in-memory version of the page (making it look like all entries in
378 	 * the page were individually updated by a remove operation).  There
379 	 * are two cases where we end up here:
380 	 *
381 	 * First, a running transaction used a truncate call to delete the page
382 	 * without reading it, in which case the page reference includes a
383 	 * structure with a transaction ID; the page we're building might split
384 	 * in the future, so we update that structure to include references to
385 	 * all of the update structures we create, so the transaction can abort.
386 	 *
387 	 * Second, a truncate call deleted a page and the truncate committed,
388 	 * but an older transaction in the system forced us to keep the old
389 	 * version of the page around, then we crashed and recovered or we're
390 	 * running inside a checkpoint, and now we're being forced to read that
391 	 * page.
392 	 *
393 	 * Expect a page-deleted structure if there's a running transaction that
394 	 * needs to be resolved, otherwise, there may not be one (and, if the
395 	 * transaction has resolved, we can ignore the page-deleted structure).
396 	 */
397 	page_del = __wt_page_del_active(session, ref, true) ?
398 	    ref->page_del : NULL;
399 
400 	/*
401 	 * Allocate the per-page update array if one doesn't already exist. (It
402 	 * might already exist because deletes are instantiated after lookaside
403 	 * table updates.)
404 	 */
405 	if (page->entries != 0 && page->modify->mod_row_update == NULL)
406 		WT_RET(__wt_calloc_def(
407 		    session, page->entries, &page->modify->mod_row_update));
408 
409 	/*
410 	 * Allocate the per-reference update array; in the case of instantiating
411 	 * a page deleted in a running transaction, we need a list of the update
412 	 * structures for the eventual commit or abort.
413 	 */
414 	if (page_del != NULL) {
415 		count = 0;
416 		if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
417 			WT_SKIP_FOREACH(ins, insert)
418 				++count;
419 		WT_ROW_FOREACH(page, rip, i) {
420 			++count;
421 			if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
422 				WT_SKIP_FOREACH(ins, insert)
423 					++count;
424 		}
425 		WT_RET(__wt_calloc_def(
426 		    session, count + 1, &page_del->update_list));
427 	}
428 
429 	/* Walk the page entries, giving each one a tombstone. */
430 	size = 0;
431 	count = 0;
432 	upd_array = page->modify->mod_row_update;
433 	if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
434 		WT_SKIP_FOREACH(ins, insert) {
435 			WT_ERR(__tombstone_update_alloc(
436 			    session, page_del, &upd, &size));
437 			upd->next = ins->upd;
438 			ins->upd = upd;
439 
440 			if (page_del != NULL)
441 				page_del->update_list[count++] = upd;
442 		}
443 	WT_ROW_FOREACH(page, rip, i) {
444 		WT_ERR(__tombstone_update_alloc(
445 		    session, page_del, &upd, &size));
446 		upd->next = upd_array[WT_ROW_SLOT(page, rip)];
447 		upd_array[WT_ROW_SLOT(page, rip)] = upd;
448 
449 		if (page_del != NULL)
450 			page_del->update_list[count++] = upd;
451 
452 		if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
453 			WT_SKIP_FOREACH(ins, insert) {
454 				WT_ERR(__tombstone_update_alloc(
455 				    session, page_del, &upd, &size));
456 				upd->next = ins->upd;
457 				ins->upd = upd;
458 
459 				if (page_del != NULL)
460 					page_del->update_list[count++] = upd;
461 			}
462 	}
463 
464 	__wt_cache_page_inmem_incr(session, page, size);
465 
466 	return (0);
467 
468 err:	/*
469 	 * The page-delete update structure may have existed before we were
470 	 * called, and presumably might be in use by a running transaction.
471 	 * The list of update structures cannot have been created before we
472 	 * were called, and should not exist if we exit with an error.
473 	 */
474 	if (page_del != NULL)
475 		__wt_free(session, page_del->update_list);
476 	return (ret);
477 }
478