1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 /*
10  * __wt_ref_is_root --
11  *	Return if the page reference is for the root page.
12  */
13 static inline bool
14 __wt_ref_is_root(WT_REF *ref)
15 {
16 	return (ref->home == NULL);
17 }
18 
19 /*
20  * __wt_page_is_empty --
21  *	Return if the page is empty.
22  */
23 static inline bool
24 __wt_page_is_empty(WT_PAGE *page)
25 {
26 	return (page->modify != NULL &&
27 	    page->modify->rec_result == WT_PM_REC_EMPTY);
28 }
29 
30 /*
31  * __wt_page_evict_clean --
32  *	Return if the page can be evicted without dirtying the tree.
33  */
34 static inline bool
35 __wt_page_evict_clean(WT_PAGE *page)
36 {
37 	return (page->modify == NULL ||
38 	    (page->modify->page_state == WT_PAGE_CLEAN &&
39 	    page->modify->rec_result == 0));
40 }
41 
42 /*
43  * __wt_page_is_modified --
44  *	Return if the page is dirty.
45  */
46 static inline bool
47 __wt_page_is_modified(WT_PAGE *page)
48 {
49 	return (page->modify != NULL &&
50 	    page->modify->page_state != WT_PAGE_CLEAN);
51 }
52 
53 /*
54  * __wt_btree_block_free --
55  *	Helper function to free a block from the current tree.
56  */
57 static inline int
58 __wt_btree_block_free(
59     WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
60 {
61 	WT_BM *bm;
62 	WT_BTREE *btree;
63 
64 	btree = S2BT(session);
65 	bm = btree->bm;
66 
67 	return (bm->free(bm, session, addr, addr_size));
68 }
69 
70 /*
71  * __wt_btree_bytes_inuse --
72  *	Return the number of bytes in use.
73  */
74 static inline uint64_t
75 __wt_btree_bytes_inuse(WT_SESSION_IMPL *session)
76 {
77 	WT_BTREE *btree;
78 	WT_CACHE *cache;
79 
80 	btree = S2BT(session);
81 	cache = S2C(session)->cache;
82 
83 	return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_inmem));
84 }
85 
86 /*
87  * __wt_btree_bytes_evictable --
88  *	Return the number of bytes that can be evicted (i.e. bytes apart from
89  *	the pinned root page).
90  */
91 static inline uint64_t
92 __wt_btree_bytes_evictable(WT_SESSION_IMPL *session)
93 {
94 	WT_BTREE *btree;
95 	WT_CACHE *cache;
96 	WT_PAGE *root_page;
97 	uint64_t bytes_inmem, bytes_root;
98 
99 	btree = S2BT(session);
100 	cache = S2C(session)->cache;
101 	root_page = btree->root.page;
102 
103 	bytes_inmem = btree->bytes_inmem;
104 	bytes_root = root_page == NULL ? 0 : root_page->memory_footprint;
105 
106 	return (bytes_inmem <= bytes_root ? 0 :
107 	    __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root));
108 }
109 
110 /*
111  * __wt_btree_dirty_inuse --
112  *	Return the number of dirty bytes in use.
113  */
114 static inline uint64_t
115 __wt_btree_dirty_inuse(WT_SESSION_IMPL *session)
116 {
117 	WT_BTREE *btree;
118 	WT_CACHE *cache;
119 
120 	btree = S2BT(session);
121 	cache = S2C(session)->cache;
122 
123 	return (__wt_cache_bytes_plus_overhead(cache,
124 	    btree->bytes_dirty_intl + btree->bytes_dirty_leaf));
125 }
126 
127 /*
128  * __wt_btree_dirty_leaf_inuse --
129  *	Return the number of bytes in use by dirty leaf pages.
130  */
131 static inline uint64_t
132 __wt_btree_dirty_leaf_inuse(WT_SESSION_IMPL *session)
133 {
134 	WT_BTREE *btree;
135 	WT_CACHE *cache;
136 
137 	btree = S2BT(session);
138 	cache = S2C(session)->cache;
139 
140 	return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_dirty_leaf));
141 }
142 
143 /*
144  * __wt_cache_page_inmem_incr --
145  *	Increment a page's memory footprint in the cache.
146  */
147 static inline void
148 __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
149 {
150 	WT_BTREE *btree;
151 	WT_CACHE *cache;
152 
153 	WT_ASSERT(session, size < WT_EXABYTE);
154 	btree = S2BT(session);
155 	cache = S2C(session)->cache;
156 
157 	(void)__wt_atomic_add64(&btree->bytes_inmem, size);
158 	(void)__wt_atomic_add64(&cache->bytes_inmem, size);
159 	(void)__wt_atomic_addsize(&page->memory_footprint, size);
160 	if (__wt_page_is_modified(page)) {
161 		(void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
162 		if (WT_PAGE_IS_INTERNAL(page)) {
163 			(void)__wt_atomic_add64(&btree->bytes_dirty_intl, size);
164 			(void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
165 		} else if (!btree->lsm_primary) {
166 			(void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
167 			(void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
168 		}
169 	}
170 	/* Track internal size in cache. */
171 	if (WT_PAGE_IS_INTERNAL(page))
172 		(void)__wt_atomic_add64(&cache->bytes_internal, size);
173 }
174 
175 /*
176  * __wt_cache_decr_check_size --
177  *	Decrement a size_t cache value and check for underflow.
178  */
179 static inline void
180 __wt_cache_decr_check_size(
181     WT_SESSION_IMPL *session, size_t *vp, size_t v, const char *fld)
182 {
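	/*
	 * Editorial note (not from the original source): the underflow test
	 * relies on unsigned wrap-around.  For example, on a 64-bit build, if
	 * *vp were 10 and v were 20, the atomic subtraction would wrap to
	 * 2^64 - 10, far above WT_EXABYTE, so the test below fails and we
	 * fall through to the error path; legitimate cache byte counts are
	 * assumed to stay well under WT_EXABYTE.
	 */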
183 	if (v == 0 || __wt_atomic_subsize(vp, v) < WT_EXABYTE)
184 		return;
185 
186 	/*
187 	 * It's a bug if this accounting underflowed but allow the application
188 	 * to proceed - the consequence is we use more cache than configured.
189 	 */
190 	*vp = 0;
191 	__wt_errx(session,
192 	    "%s went negative with decrement of %" WT_SIZET_FMT, fld, v);
193 
194 #ifdef HAVE_DIAGNOSTIC
195 	__wt_abort(session);
196 #endif
197 }
198 
199 /*
200  * __wt_cache_decr_check_uint64 --
201  *	Decrement a uint64_t cache value and check for underflow.
202  */
203 static inline void
204 __wt_cache_decr_check_uint64(
205     WT_SESSION_IMPL *session, uint64_t *vp, uint64_t v, const char *fld)
206 {
207 	uint64_t orig = *vp;
208 
209 	if (v == 0 || __wt_atomic_sub64(vp, v) < WT_EXABYTE)
210 		return;
211 
212 	/*
213 	 * It's a bug if this accounting underflowed but allow the application
214 	 * to proceed - the consequence is we use more cache than configured.
215 	 */
216 	*vp = 0;
217 	__wt_errx(session,
218 	    "%s was %" PRIu64 ", went negative with decrement of %" PRIu64, fld,
219 	    orig, v);
220 
221 #ifdef HAVE_DIAGNOSTIC
222 	__wt_abort(session);
223 #endif
224 }
225 
226 /*
227  * __wt_cache_page_byte_dirty_decr --
228  *	Decrement the page's dirty byte count, guarding from underflow.
229  */
230 static inline void
231 __wt_cache_page_byte_dirty_decr(
232     WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
233 {
234 	WT_BTREE *btree;
235 	WT_CACHE *cache;
236 	size_t decr, orig;
237 	int i;
238 
239 	btree = S2BT(session);
240 	cache = S2C(session)->cache;
241 	decr = 0;			/* [-Wconditional-uninitialized] */
242 
243 	/*
244 	 * We don't have exclusive access and there are ways of decrementing the
245 	 * page's dirty byte count by a too-large value. For example:
246 	 *	T1: __wt_cache_page_inmem_incr(page, size)
247 	 *		page is clean, don't increment dirty byte count
248 	 *	T2: mark page dirty
249 	 *	T1: __wt_cache_page_inmem_decr(page, size)
250 	 *		page is dirty, decrement dirty byte count
251 	 * and, of course, the reverse where the page is dirty at the increment
252 	 * and clean at the decrement.
253 	 *
254 	 * The page's dirty-byte value always reflects bytes represented in the
255 	 * cache's dirty-byte count; decrement the page/cache as much as we can
256 	 * without underflow. If we can't decrement the dirty byte counts after
257 	 * a few tries, give up: the cache's value will be wrong, but consistent,
258 	 * and we'll fix it the next time this page is marked clean, or evicted.
259 	 */
260 	for (i = 0; i < 5; ++i) {
261 		/*
262 		 * Take care to read the dirty-byte count only once in case
263 		 * we're racing with updates.
264 		 */
265 		WT_ORDERED_READ(orig, page->modify->bytes_dirty);
266 		decr = WT_MIN(size, orig);
267 		if (__wt_atomic_cassize(
268 		    &page->modify->bytes_dirty, orig, orig - decr))
269 			break;
270 	}
271 
272 	if (i == 5)
273 		return;
274 
275 	if (WT_PAGE_IS_INTERNAL(page)) {
276 		__wt_cache_decr_check_uint64(session, &btree->bytes_dirty_intl,
277 		    decr, "WT_BTREE.bytes_dirty_intl");
278 		__wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl,
279 		    decr, "WT_CACHE.bytes_dirty_intl");
280 	} else if (!btree->lsm_primary) {
281 		__wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf,
282 		    decr, "WT_BTREE.bytes_dirty_leaf");
283 		__wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf,
284 		    decr, "WT_CACHE.bytes_dirty_leaf");
285 	}
286 }
287 
288 /*
289  * __wt_cache_page_inmem_decr --
290  *	Decrement a page's memory footprint in the cache.
291  */
292 static inline void
293 __wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
294 {
295 	WT_CACHE *cache;
296 
297 	cache = S2C(session)->cache;
298 
299 	WT_ASSERT(session, size < WT_EXABYTE);
300 
301 	__wt_cache_decr_check_uint64(
302 	    session, &S2BT(session)->bytes_inmem, size, "WT_BTREE.bytes_inmem");
303 	__wt_cache_decr_check_uint64(
304 	    session, &cache->bytes_inmem, size, "WT_CACHE.bytes_inmem");
305 	__wt_cache_decr_check_size(
306 	    session, &page->memory_footprint, size, "WT_PAGE.memory_footprint");
307 	if (__wt_page_is_modified(page))
308 		__wt_cache_page_byte_dirty_decr(session, page, size);
309 	/* Track internal size in cache. */
310 	if (WT_PAGE_IS_INTERNAL(page))
311 		__wt_cache_decr_check_uint64(session,
312 		    &cache->bytes_internal, size, "WT_CACHE.bytes_internal");
313 }
314 
315 /*
316  * __wt_cache_dirty_incr --
317  *	Page switch from clean to dirty: increment the cache dirty page/byte
318  * counts.
319  */
320 static inline void
321 __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
322 {
323 	WT_BTREE *btree;
324 	WT_CACHE *cache;
325 	size_t size;
326 
327 	btree = S2BT(session);
328 	cache = S2C(session)->cache;
329 
330 	/*
331 	 * Take care to read the memory_footprint once in case we are racing
332 	 * with updates.
333 	 */
334 	size = page->memory_footprint;
335 	if (WT_PAGE_IS_INTERNAL(page)) {
336 		(void)__wt_atomic_add64(&btree->bytes_dirty_intl, size);
337 		(void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
338 		(void)__wt_atomic_add64(&cache->pages_dirty_intl, 1);
339 	} else {
340 		if (!btree->lsm_primary) {
341 			(void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
342 			(void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
343 		}
344 		(void)__wt_atomic_add64(&cache->pages_dirty_leaf, 1);
345 	}
346 	(void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
347 }
348 
349 /*
350  * __wt_cache_dirty_decr --
351  *	Page switch from dirty to clean: decrement the cache dirty page/byte
352  * counts.
353  */
354 static inline void
355 __wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page)
356 {
357 	WT_CACHE *cache;
358 	WT_PAGE_MODIFY *modify;
359 
360 	cache = S2C(session)->cache;
361 
362 	if (WT_PAGE_IS_INTERNAL(page))
363 		__wt_cache_decr_check_uint64(session,
364 		    &cache->pages_dirty_intl, 1, "dirty internal page count");
365 	else
366 		__wt_cache_decr_check_uint64(session,
367 		    &cache->pages_dirty_leaf, 1, "dirty leaf page count");
368 
369 	modify = page->modify;
370 	if (modify != NULL && modify->bytes_dirty != 0)
371 		__wt_cache_page_byte_dirty_decr(
372 		    session, page, modify->bytes_dirty);
373 }
374 
375 /*
376  * __wt_cache_page_image_decr --
377  *	Decrement a page image's size from the cache.
378  */
379 static inline void
380 __wt_cache_page_image_decr(WT_SESSION_IMPL *session, uint32_t size)
381 {
382 	WT_CACHE *cache;
383 
384 	cache = S2C(session)->cache;
385 
386 	__wt_cache_decr_check_uint64(
387 	    session, &cache->bytes_image, size, "WT_CACHE.image_inmem");
388 }
389 
390 /*
391  * __wt_cache_page_image_incr --
392  *	Increment a page image's size to the cache.
393  */
394 static inline void
395 __wt_cache_page_image_incr(WT_SESSION_IMPL *session, uint32_t size)
396 {
397 	WT_CACHE *cache;
398 
399 	cache = S2C(session)->cache;
400 	(void)__wt_atomic_add64(&cache->bytes_image, size);
401 }
402 
403 /*
404  * __wt_cache_page_evict --
405  *	Evict pages from the cache.
406  */
407 static inline void
408 __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
409 {
410 	WT_BTREE *btree;
411 	WT_CACHE *cache;
412 	WT_PAGE_MODIFY *modify;
413 
414 	btree = S2BT(session);
415 	cache = S2C(session)->cache;
416 	modify = page->modify;
417 
418 	/* Update the bytes in-memory to reflect the eviction. */
419 	__wt_cache_decr_check_uint64(session, &btree->bytes_inmem,
420 	    page->memory_footprint, "WT_BTREE.bytes_inmem");
421 	__wt_cache_decr_check_uint64(session, &cache->bytes_inmem,
422 	    page->memory_footprint, "WT_CACHE.bytes_inmem");
423 
424 	/* Update the bytes_internal value to reflect the eviction */
425 	if (WT_PAGE_IS_INTERNAL(page))
426 		__wt_cache_decr_check_uint64(session,
427 		    &cache->bytes_internal,
428 		    page->memory_footprint, "WT_CACHE.bytes_internal");
429 
430 	/* Update the cache's dirty-byte count. */
431 	if (modify != NULL && modify->bytes_dirty != 0) {
432 		if (WT_PAGE_IS_INTERNAL(page)) {
433 			__wt_cache_decr_check_uint64(session,
434 			    &btree->bytes_dirty_intl,
435 			    modify->bytes_dirty, "WT_BTREE.bytes_dirty_intl");
436 			__wt_cache_decr_check_uint64(session,
437 			    &cache->bytes_dirty_intl,
438 			    modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl");
439 		} else if (!btree->lsm_primary) {
440 			__wt_cache_decr_check_uint64(session,
441 			    &btree->bytes_dirty_leaf,
442 			    modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf");
443 			__wt_cache_decr_check_uint64(session,
444 			    &cache->bytes_dirty_leaf,
445 			    modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf");
446 		}
447 	}
448 
449 	/* Update bytes and pages evicted. */
450 	(void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint);
451 	(void)__wt_atomic_addv64(&cache->pages_evicted, 1);
452 
453 	/*
454 	 * Track if eviction makes progress.  This is used in various places to
455 	 * determine whether eviction is stuck.
456 	 */
457 	if (!F_ISSET_ATOMIC(page, WT_PAGE_EVICT_NO_PROGRESS))
458 		(void)__wt_atomic_addv64(&cache->eviction_progress, 1);
459 }
460 
461 /*
462  * __wt_update_list_memsize --
463  *      The size in memory of a list of updates.
464  */
465 static inline size_t
466 __wt_update_list_memsize(WT_UPDATE *upd)
467 {
468 	size_t upd_size;
469 
470 	for (upd_size = 0; upd != NULL; upd = upd->next)
471 		upd_size += WT_UPDATE_MEMSIZE(upd);
472 
473 	return (upd_size);
474 }
475 
476 /*
477  * __wt_page_modify_init --
478  *	A page is about to be modified, allocate the modification structure.
479  */
480 static inline int
481 __wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
482 {
483 	return (page->modify == NULL ?
484 	    __wt_page_modify_alloc(session, page) : 0);
485 }
486 
487 /*
488  * __wt_page_only_modify_set --
489  *	Mark the page (but only the page) dirty.
490  */
491 static inline void
492 __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
493 {
494 	uint64_t last_running;
495 
496 	WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD));
497 
498 	last_running = 0;
499 	if (page->modify->page_state == WT_PAGE_CLEAN)
500 		last_running = S2C(session)->txn_global.last_running;
501 
502 	/*
503 	 * We depend on the atomic operation being a write barrier, that is, a
504 	 * barrier to ensure all changes to the page are flushed before updating
505 	 * the page state and/or marking the tree dirty, otherwise checkpoints
506 	 * and/or page reconciliation might be looking at a clean page/tree.
507 	 *
508 	 * Every time the page transitions from clean to dirty, update the cache
509 	 * and transactional information.
510 	 *
511 	 * The page state can only ever be incremented above dirty by the number
512 	 * of concurrently running threads, so the counter will never approach
513 	 * the point where it would wrap.
514 	 */
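	/*
	 * Editorial walk-through (illustrative): if two threads race to dirty
	 * a clean page, both increment page_state, but only the thread whose
	 * increment lands exactly on WT_PAGE_DIRTY_FIRST does the cache
	 * accounting and consumes last_running; the other thread sees a
	 * larger result and skips that work, so the clean-to-dirty transition
	 * is accounted for exactly once.
	 */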
515 	if (page->modify->page_state < WT_PAGE_DIRTY &&
516 	    __wt_atomic_add32(&page->modify->page_state, 1) ==
517 	    WT_PAGE_DIRTY_FIRST) {
518 		__wt_cache_dirty_incr(session, page);
519 		/*
520 		 * In the event we dirty a page which is flagged for eviction
521 		 * soon, we update its read generation to avoid evicting a
522 		 * dirty page prematurely.
523 		 */
524 		if (page->read_gen == WT_READGEN_WONT_NEED)
525 			__wt_cache_read_gen_new(session, page);
526 
527 		/*
528 		 * We won the race to dirty the page, but another thread could
529 		 * have committed in the meantime, and the last_running field
530 		 * may have been updated past it.  That is all very unlikely, but not
531 		 * impossible, so we take care to read the global state before
532 		 * the atomic increment.
533 		 *
534 		 * If the page was dirty on entry, then last_running == 0. The
535 		 * page could have become clean since then, if reconciliation
536 		 * completed. In that case, we leave the previous value for
537 		 * first_dirty_txn rather than potentially racing to update it,
538 		 * at worst, we'll unnecessarily write a page in a checkpoint.
539 		 */
540 		if (last_running != 0)
541 			page->modify->first_dirty_txn = last_running;
542 	}
543 
544 	/* Check if this is the largest transaction ID to update the page. */
545 	if (WT_TXNID_LT(page->modify->update_txn, session->txn.id))
546 		page->modify->update_txn = session->txn.id;
547 }
548 
549 /*
550  * __wt_tree_modify_set --
551  *	Mark the tree dirty.
552  */
553 static inline void
554 __wt_tree_modify_set(WT_SESSION_IMPL *session)
555 {
556 	/*
557 	 * Test before setting the dirty flag, it's a hot cache line.
558 	 *
559 	 * The tree's modified flag is cleared by the checkpoint thread: set it
560 	 * and insert a barrier before dirtying the page.  (I don't think it's
561 	 * a problem if the tree is marked dirty with all the pages clean, it
562 	 * might result in an extra checkpoint that doesn't do any work but it
563 	 * shouldn't cause problems; regardless, let's play it safe.)
564 	 */
565 	if (!S2BT(session)->modified) {
566 		/* Assert we never dirty a checkpoint handle. */
567 		WT_ASSERT(session, session->dhandle->checkpoint == NULL);
568 
569 		S2BT(session)->modified = true;
570 		WT_FULL_BARRIER();
571 	}
572 
573 	/*
574 	 * The btree may already be marked dirty while the connection is still
575 	 * clean; mark the connection dirty outside the test of the btree state.
576 	 */
577 	if (!S2C(session)->modified)
578 		S2C(session)->modified = true;
579 }
580 
581 /*
582  * __wt_page_modify_clear --
583  *	Clean a modified page.
584  */
585 static inline void
586 __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
587 {
588 	/*
589 	 * The page must be held exclusive when this call is made: this call
590 	 * can only be used when the page is owned by a single thread.
591 	 *
592 	 * Allow the call to be made on clean pages.
593 	 */
594 	if (__wt_page_is_modified(page)) {
595 		/*
596 		 * The only part where ordering matters is during
597 		 * reconciliation where updates on other threads are performing
598 		 * writes to the page state that need to be visible to the
599 		 * reconciliation thread.
600 		 *
601 		 * Since clearing of the page state is not going to be happening
602 		 * during reconciliation on a separate thread, there's no write
603 		 * barrier needed here.
604 		 */
605 		page->modify->page_state = WT_PAGE_CLEAN;
606 		__wt_cache_dirty_decr(session, page);
607 	}
608 }
609 
610 /*
611  * __wt_page_modify_set --
612  *	Mark the page and tree dirty.
613  */
614 static inline void
615 __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
616 {
617 	/*
618 	 * Mark the tree dirty (even if the page is already marked dirty): newly
619 	 * created pages to support "empty" files are dirty, but the file isn't
620 	 * marked dirty until there's a real change needing to be written.
621 	 */
622 	__wt_tree_modify_set(session);
623 
624 	__wt_page_only_modify_set(session, page);
625 }
626 
627 /*
628  * __wt_page_parent_modify_set --
629  *	Mark the parent page, and optionally the tree, dirty.
630  */
631 static inline int
632 __wt_page_parent_modify_set(
633     WT_SESSION_IMPL *session, WT_REF *ref, bool page_only)
634 {
635 	WT_PAGE *parent;
636 
637 	/*
638 	 * This function exists as a place to stash this comment.  There are a
639 	 * few places where we need to dirty a page's parent.  The trick is the
640 	 * page's parent might split at any point, and the page parent might be
641 	 * the wrong parent at any particular time.  We ignore this and dirty
642 	 * whatever page the page's reference structure points to.  This is safe
643 	 * because if we're pointing to the wrong parent, that parent must have
644 	 * split, deepening the tree, which implies marking the original parent
645 	 * and all of the newly-created children as dirty.  In other words, if
646 	 * we have the wrong parent page, everything was marked dirty already.
647 	 */
648 	parent = ref->home;
649 	WT_RET(__wt_page_modify_init(session, parent));
650 	if (page_only)
651 		__wt_page_only_modify_set(session, parent);
652 	else
653 		__wt_page_modify_set(session, parent);
654 	return (0);
655 }
656 
657 /*
658  * __wt_off_page --
659  *	Return if a pointer references off-page data.
660  */
661 static inline bool
662 __wt_off_page(WT_PAGE *page, const void *p)
663 {
664 	/*
665 	 * There may be no underlying page, in which case the reference is
666 	 * off-page by definition.
667 	 */
668 	return (page->dsk == NULL ||
669 	    p < (void *)page->dsk ||
670 	    p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size));
671 }
672 
673 /*
674  * __wt_ref_addr_free --
675  *	Free the address in a reference, if necessary.
676  */
677 static inline void
678 __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref)
679 {
680 	if (ref->addr == NULL)
681 		return;
682 
683 	if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) {
684 		__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
685 		__wt_free(session, ref->addr);
686 	}
687 	ref->addr = NULL;
688 }
689 
690 /*
691  * __wt_ref_key --
692  *	Return a reference to a row-store internal page key as cheaply as
693  * possible.
694  */
695 static inline void
696 __wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep)
697 {
698 	uintptr_t v;
699 
700 	/*
701 	 * An internal page key is in one of two places: if we instantiated the
702 	 * key (for example, when reading the page), WT_REF.ref_ikey references
703 	 * a WT_IKEY structure, otherwise WT_REF.ref_ikey references an on-page
704 	 * key offset/length pair.
705 	 *
706 	 * Now the magic: allocated memory must be aligned to store any standard
707 	 * type, and we expect some standard type to require at least quad-byte
708 	 * alignment, so allocated memory should have some clear low-order bits.
709 	 * On-page objects consist of an offset/length pair: the maximum page
710 	 * size currently fits into 29 bits, so we use the low-order bits of the
711 	 * pointer to mark the other bits of the pointer as encoding the key's
712 	 * location and length.  This breaks if allocated memory isn't aligned,
713 	 * of course.
714 	 *
715 	 * In this specific case, we use bit 0x01 to mark an on-page key, else
716 	 * it's a WT_IKEY reference.  The bit pattern for internal row-store
717 	 * on-page keys is:
718 	 *	32 bits		key length
719 	 *	31 bits		page offset of the key's bytes,
720 	 *	 1 bit		flags
721 	 */
722 #define	WT_IK_FLAG			0x01
723 #define	WT_IK_ENCODE_KEY_LEN(v)		((uintptr_t)(v) << 32)
724 #define	WT_IK_DECODE_KEY_LEN(v)		((v) >> 32)
725 #define	WT_IK_ENCODE_KEY_OFFSET(v)	((uintptr_t)(v) << 1)
726 #define	WT_IK_DECODE_KEY_OFFSET(v)	(((v) & 0xFFFFFFFF) >> 1)
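	/*
	 * Worked example (editorial, values chosen for illustration only): an
	 * on-page key of length 12 starting at page offset 100 is stored as
	 *	v = WT_IK_ENCODE_KEY_LEN(12) |
	 *	    WT_IK_ENCODE_KEY_OFFSET(100) | WT_IK_FLAG;
	 * that is, 12 in the upper 32 bits, 100 shifted left by one and the
	 * low bit set; WT_IK_DECODE_KEY_LEN(v) and WT_IK_DECODE_KEY_OFFSET(v)
	 * recover 12 and 100.  A genuine WT_IKEY pointer never has the low
	 * bit set because allocated memory is aligned, which is what makes
	 * the tag test safe.
	 */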
727 	v = (uintptr_t)ref->ref_ikey;
728 	if (v & WT_IK_FLAG) {
729 		*(void **)keyp =
730 		    WT_PAGE_REF_OFFSET(page, WT_IK_DECODE_KEY_OFFSET(v));
731 		*sizep = WT_IK_DECODE_KEY_LEN(v);
732 	} else {
733 		*(void **)keyp = WT_IKEY_DATA(ref->ref_ikey);
734 		*sizep = ((WT_IKEY *)ref->ref_ikey)->size;
735 	}
736 }
737 
738 /*
739  * __wt_ref_key_onpage_set --
740  *	Set a WT_REF to reference an on-page key.
741  */
742 static inline void
743 __wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack)
744 {
745 	uintptr_t v;
746 
747 	/*
748 	 * See the comment in __wt_ref_key for an explanation of the magic.
749 	 */
750 	v = WT_IK_ENCODE_KEY_LEN(unpack->size) |
751 	    WT_IK_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
752 	    WT_IK_FLAG;
753 	ref->ref_ikey = (void *)v;
754 }
755 
756 /*
757  * __wt_ref_key_instantiated --
758  *	Return if a WT_REF key is instantiated.
759  */
760 static inline WT_IKEY *
761 __wt_ref_key_instantiated(WT_REF *ref)
762 {
763 	uintptr_t v;
764 
765 	/*
766 	 * See the comment in __wt_ref_key for an explanation of the magic.
767 	 */
768 	v = (uintptr_t)ref->ref_ikey;
769 	return (v & WT_IK_FLAG ? NULL : ref->ref_ikey);
770 }
771 
772 /*
773  * __wt_ref_key_clear --
774  *	Clear a WT_REF key.
775  */
776 static inline void
777 __wt_ref_key_clear(WT_REF *ref)
778 {
779 	/*
780 	 * The key union has 2 8B fields; this is equivalent to:
781 	 *
782 	 *	ref->ref_recno = WT_RECNO_OOB;
783 	 *	ref->ref_ikey = NULL;
784 	 */
785 	ref->ref_recno = 0;
786 }
787 
788 /*
789  * __wt_row_leaf_key_info --
790  *	Return a row-store leaf page key referenced by a WT_ROW if it can be
791  * had without unpacking a cell, and information about the cell, if the key
792  * isn't cheaply available.
793  */
794 static inline bool
795 __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
796     WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
797 {
798 	WT_IKEY *ikey;
799 	uintptr_t v;
800 
801 	v = (uintptr_t)copy;
802 
803 	/*
804 	 * A row-store leaf page key is in one of two places: if instantiated,
805 	 * the WT_ROW pointer references a WT_IKEY structure, otherwise, it
806 	 * references an on-page offset.  Further, on-page keys are in one of
807 	 * two states: if the key is a simple key (not an overflow key, prefix
808 	 * compressed or Huffman encoded, all of which are unlikely), the key's
809 	 * offset/size is encoded in the pointer.  Otherwise, the offset is to
810 	 * the key's on-page cell.
811 	 *
812 	 * Now the magic: allocated memory must be aligned to store any standard
813 	 * type, and we expect some standard type to require at least quad-byte
814 	 * alignment, so allocated memory should have some clear low-order bits.
815 	 * On-page objects consist of an offset/length pair: the maximum page
816 	 * size currently fits into 29 bits, so we use the low-order bits of the
817 	 * pointer to mark the other bits of the pointer as encoding the key's
818 	 * location and length.  This breaks if allocated memory isn't aligned,
819 	 * of course.
820 	 *
821 	 * In this specific case, we use bit 0x01 to mark an on-page cell, bit
822 	 * 0x02 to mark an on-page key, 0x03 to mark an on-page key/value pair,
823 	 * otherwise it's a WT_IKEY reference. The bit pattern for on-page cells
824 	 * is:
825 	 *	29 bits		page offset of the key's cell,
826 	 *	 2 bits		flags
827 	 *
828 	 * The bit pattern for on-page keys is:
829 	 *	32 bits		key length,
830 	 *	29 bits		page offset of the key's bytes,
831 	 *	 2 bits		flags
832 	 *
833 	 * But, while that allows us to skip decoding simple key cells, we also
834 	 * want to skip decoding the value cell in the case where the value cell
835 	 * is also simple/short.  We use bit 0x03 to mark an encoded on-page key
836 	 * and value pair.  The bit pattern for on-page key/value pairs is:
837 	 *	 9 bits		key length,
838 	 *	13 bits		value length,
839 	 *	20 bits		page offset of the key's bytes,
840 	 *	20 bits		page offset of the value's bytes,
841 	 *	 2 bits		flags
842 	 *
843 	 * These bit patterns are in-memory only, of course, so can be modified
844 	 * (we could even tune for specific workloads).  Generally, the fields
845 	 * are larger than the anticipated values being stored (512B keys, 8KB
846 	 * values, 1MB pages), hopefully that won't be necessary.
847 	 *
848 	 * This function returns a list of things about the key (instantiation
849 	 * reference, cell reference and key/length pair).  Our callers know
850 	 * the order in which we look things up and the information returned;
851 	 * for example, the cell will never be returned if we are working with
852 	 * an on-page key.
853 	 */
854 #define	WT_CELL_FLAG			0x01
855 #define	WT_CELL_ENCODE_OFFSET(v)	((uintptr_t)(v) << 2)
856 #define	WT_CELL_DECODE_OFFSET(v)	(((v) & 0xFFFFFFFF) >> 2)
857 
858 #define	WT_K_FLAG			0x02
859 #define	WT_K_ENCODE_KEY_LEN(v)		((uintptr_t)(v) << 32)
860 #define	WT_K_DECODE_KEY_LEN(v)		((v) >> 32)
861 #define	WT_K_ENCODE_KEY_OFFSET(v)	((uintptr_t)(v) << 2)
862 #define	WT_K_DECODE_KEY_OFFSET(v)	(((v) & 0xFFFFFFFF) >> 2)
863 
864 #define	WT_KV_FLAG			0x03
865 #define	WT_KV_ENCODE_KEY_LEN(v)		((uintptr_t)(v) << 55)
866 #define	WT_KV_DECODE_KEY_LEN(v)		((v) >> 55)
867 #define	WT_KV_MAX_KEY_LEN		(0x200 - 1)
868 #define	WT_KV_ENCODE_VALUE_LEN(v)	((uintptr_t)(v) << 42)
869 #define	WT_KV_DECODE_VALUE_LEN(v)	(((v) & 0x007FFC0000000000) >> 42)
870 #define	WT_KV_MAX_VALUE_LEN		(0x2000 - 1)
871 #define	WT_KV_ENCODE_KEY_OFFSET(v)	((uintptr_t)(v) << 22)
872 #define	WT_KV_DECODE_KEY_OFFSET(v)	(((v) & 0x000003FFFFC00000) >> 22)
873 #define	WT_KV_MAX_KEY_OFFSET		(0x100000 - 1)
874 #define	WT_KV_ENCODE_VALUE_OFFSET(v)	((uintptr_t)(v) << 2)
875 #define	WT_KV_DECODE_VALUE_OFFSET(v)	(((v) & 0x00000000003FFFFC) >> 2)
876 #define	WT_KV_MAX_VALUE_OFFSET		(0x100000 - 1)
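	/*
	 * Worked example (editorial, values chosen for illustration only): a
	 * 16-byte key at page offset 64 followed by a 100-byte value at page
	 * offset 80 can be encoded as
	 *	v = WT_KV_ENCODE_KEY_LEN(16) | WT_KV_ENCODE_VALUE_LEN(100) |
	 *	    WT_KV_ENCODE_KEY_OFFSET(64) |
	 *	    WT_KV_ENCODE_VALUE_OFFSET(80) | WT_KV_FLAG;
	 * because all four values fit their fields (key length <= 511, value
	 * length <= 8191, offsets <= 1MB - 1); the matching decode macros
	 * recover 16, 100, 64 and 80.  Anything larger is left in the
	 * WT_K_FLAG or WT_CELL_FLAG form, or as an instantiated WT_IKEY.
	 */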
877 	switch (v & 0x03) {
878 	case WT_CELL_FLAG:
879 		/* On-page cell: no instantiated key. */
880 		if (ikeyp != NULL)
881 			*ikeyp = NULL;
882 		if (cellp != NULL)
883 			*cellp =
884 			    WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
885 		return (false);
886 	case WT_K_FLAG:
887 		/* Encoded key: no instantiated key, no cell. */
888 		if (cellp != NULL)
889 			*cellp = NULL;
890 		if (ikeyp != NULL)
891 			*ikeyp = NULL;
892 		if (datap != NULL) {
893 			*(void **)datap =
894 			    WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
895 			*sizep = WT_K_DECODE_KEY_LEN(v);
896 			return (true);
897 		}
898 		return (false);
899 	case WT_KV_FLAG:
900 		/* Encoded key/value pair: no instantiated key, no cell. */
901 		if (cellp != NULL)
902 			*cellp = NULL;
903 		if (ikeyp != NULL)
904 			*ikeyp = NULL;
905 		if (datap != NULL) {
906 			*(void **)datap = WT_PAGE_REF_OFFSET(
907 			    page, WT_KV_DECODE_KEY_OFFSET(v));
908 			*sizep = WT_KV_DECODE_KEY_LEN(v);
909 			return (true);
910 		}
911 		return (false);
912 
913 	}
914 
915 	/* Instantiated key. */
916 	ikey = copy;
917 	if (ikeyp != NULL)
918 		*ikeyp = copy;
919 	if (cellp != NULL)
920 		*cellp = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
921 	if (datap != NULL) {
922 		*(void **)datap = WT_IKEY_DATA(ikey);
923 		*sizep = ikey->size;
924 		return (true);
925 	}
926 	return (false);
927 }
928 
929 /*
930  * __wt_row_leaf_key_set_cell --
931  *	Set a WT_ROW to reference an on-page row-store leaf cell.
932  */
933 static inline void
934 __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell)
935 {
936 	uintptr_t v;
937 
938 	/*
939 	 * See the comment in __wt_row_leaf_key_info for an explanation of the
940 	 * magic.
941 	 */
942 	v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) |
943 	    WT_CELL_FLAG;
944 	WT_ROW_KEY_SET(rip, v);
945 }
946 
947 /*
948  * __wt_row_leaf_key_set --
949  *	Set a WT_ROW to reference an on-page row-store leaf key.
950  */
951 static inline void
952 __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
953 {
954 	uintptr_t v;
955 
956 	/*
957 	 * See the comment in __wt_row_leaf_key_info for an explanation of the
958 	 * magic.
959 	 */
960 	v = WT_K_ENCODE_KEY_LEN(unpack->size) |
961 	    WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
962 	    WT_K_FLAG;
963 	WT_ROW_KEY_SET(rip, v);
964 }
965 
966 /*
967  * __wt_row_leaf_value_set --
968  *	Set a WT_ROW to reference an on-page row-store leaf value.
969  */
970 static inline void
971 __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
972 {
973 	uintptr_t key_len, key_offset, value_offset, v;
974 
975 	v = (uintptr_t)WT_ROW_KEY_COPY(rip);
976 
977 	/*
978 	 * See the comment in __wt_row_leaf_key_info for an explanation of the
979 	 * magic.
980 	 */
981 	if (!(v & WT_K_FLAG))			/* Already an encoded key */
982 		return;
983 
984 	key_len = WT_K_DECODE_KEY_LEN(v);	/* Key length */
985 	if (key_len > WT_KV_MAX_KEY_LEN)
986 		return;
987 	if (unpack->size > WT_KV_MAX_VALUE_LEN)	/* Value length */
988 		return;
989 
990 	key_offset = WT_K_DECODE_KEY_OFFSET(v);	/* Page offsets */
991 	if (key_offset > WT_KV_MAX_KEY_OFFSET)
992 		return;
993 	value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data);
994 	if (value_offset > WT_KV_MAX_VALUE_OFFSET)
995 		return;
996 
997 	v = WT_KV_ENCODE_KEY_LEN(key_len) |
998 	    WT_KV_ENCODE_VALUE_LEN(unpack->size) |
999 	    WT_KV_ENCODE_KEY_OFFSET(key_offset) |
1000 	    WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG;
1001 	WT_ROW_KEY_SET(rip, v);
1002 }
1003 
1004 /*
1005  * __wt_row_leaf_key --
1006  *	Set a buffer to reference a row-store leaf page key as cheaply as
1007  * possible.
1008  */
1009 static inline int
1010 __wt_row_leaf_key(WT_SESSION_IMPL *session,
1011     WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, bool instantiate)
1012 {
1013 	void *copy;
1014 
1015 	/*
1016 	 * A front-end for __wt_row_leaf_key_work, here to inline fast paths.
1017 	 *
1018 	 * The row-store key can change underfoot; explicitly take a copy.
1019 	 */
1020 	copy = WT_ROW_KEY_COPY(rip);
1021 
1022 	/*
1023 	 * All we handle here are on-page keys (which should be a common case),
1024 	 * and instantiated keys (which start out rare, but become more common
1025 	 * as a leaf page is searched, instantiating prefix-compressed keys).
1026 	 */
1027 	if (__wt_row_leaf_key_info(
1028 	    page, copy, NULL, NULL, &key->data, &key->size))
1029 		return (0);
1030 
1031 	/*
1032 	 * The alternative is an on-page cell with some kind of compressed or
1033 	 * overflow key that's never been instantiated.  Call the underlying
1034 	 * worker function to figure it out.
1035 	 */
1036 	return (__wt_row_leaf_key_work(session, page, rip, key, instantiate));
1037 }
1038 
1039 /*
1040  * __wt_row_leaf_value_cell --
1041  *	Return the unpacked value for a row-store leaf page key.
1042  */
1043 static inline void
1044 __wt_row_leaf_value_cell(
1045     WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack, WT_CELL_UNPACK *vpack)
1046 {
1047 	WT_CELL *kcell, *vcell;
1048 	WT_CELL_UNPACK unpack;
1049 	size_t size;
1050 	void *copy, *key;
1051 
1052 	/* If we already have an unpacked key cell, use it. */
1053 	if (kpack != NULL)
1054 		vcell = (WT_CELL *)
1055 		    ((uint8_t *)kpack->cell + __wt_cell_total_len(kpack));
1056 	else {
1057 		/*
1058 		 * The row-store key can change underfoot; explicitly take a
1059 		 * copy.
1060 		 */
1061 		copy = WT_ROW_KEY_COPY(rip);
1062 
1063 		/*
1064 		 * Figure out where the key is, step past it to the value cell.
1065 		 * The test for a cell not being set tells us that we have an
1066 		 * on-page key, otherwise we're looking at an instantiated key
1067 		 * or on-page cell, both of which require an unpack of the key's
1068 		 * cell to find the value cell that follows.
1069 		 */
1070 		if (__wt_row_leaf_key_info(
1071 		    page, copy, NULL, &kcell, &key, &size) && kcell == NULL)
1072 			vcell = (WT_CELL *)((uint8_t *)key + size);
1073 		else {
1074 			__wt_cell_unpack(kcell, &unpack);
1075 			vcell = (WT_CELL *)((uint8_t *)
1076 			    unpack.cell + __wt_cell_total_len(&unpack));
1077 		}
1078 	}
1079 
1080 	__wt_cell_unpack(__wt_cell_leaf_value_parse(page, vcell), vpack);
1081 }
1082 
1083 /*
1084  * __wt_row_leaf_value --
1085  *	Return the value for a row-store leaf page encoded key/value pair.
1086  */
1087 static inline bool
1088 __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
1089 {
1090 	uintptr_t v;
1091 
1092 	/* The row-store key can change underfoot; explicitly take a copy. */
1093 	v = (uintptr_t)WT_ROW_KEY_COPY(rip);
1094 
1095 	/*
1096 	 * See the comment in __wt_row_leaf_key_info for an explanation of the
1097 	 * magic.
1098 	 */
1099 	if ((v & 0x03) == WT_KV_FLAG) {
1100 		value->data =
1101 		    WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
1102 		value->size = WT_KV_DECODE_VALUE_LEN(v);
1103 		return (true);
1104 	}
1105 	return (false);
1106 }
1107 
1108 /*
1109  * __wt_ref_info --
1110  *	Return the addr/size and type triplet for a reference.
1111  */
1112 static inline void
1113 __wt_ref_info(WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
1114 {
1115 	WT_ADDR *addr;
1116 	WT_CELL_UNPACK *unpack, _unpack;
1117 
1118 	addr = ref->addr;
1119 	unpack = &_unpack;
1120 
1121 	/*
1122 	 * If NULL, there is no location.
1123 	 * If off-page, the pointer references a WT_ADDR structure.
1124 	 * If on-page, the pointer references a cell.
1125 	 *
1126 	 * The type is of a limited set: internal, leaf or no-overflow leaf.
1127 	 */
1128 	if (addr == NULL) {
1129 		*addrp = NULL;
1130 		*sizep = 0;
1131 		if (typep != NULL)
1132 			*typep = 0;
1133 	} else if (__wt_off_page(ref->home, addr)) {
1134 		*addrp = addr->addr;
1135 		*sizep = addr->size;
1136 		if (typep != NULL)
1137 			switch (addr->type) {
1138 			case WT_ADDR_INT:
1139 				*typep = WT_CELL_ADDR_INT;
1140 				break;
1141 			case WT_ADDR_LEAF:
1142 				*typep = WT_CELL_ADDR_LEAF;
1143 				break;
1144 			case WT_ADDR_LEAF_NO:
1145 				*typep = WT_CELL_ADDR_LEAF_NO;
1146 				break;
1147 			default:
1148 				*typep = 0;
1149 				break;
1150 			}
1151 	} else {
1152 		__wt_cell_unpack((WT_CELL *)addr, unpack);
1153 		*addrp = unpack->data;
1154 		*sizep = unpack->size;
1155 		if (typep != NULL)
1156 			*typep = unpack->type;
1157 	}
1158 }
1159 
1160 /*
1161  * __wt_ref_block_free --
1162  *	Free the on-disk block for a reference and clear the address.
1163  */
1164 static inline int
1165 __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
1166 {
1167 	size_t addr_size;
1168 	const uint8_t *addr;
1169 
1170 	if (ref->addr == NULL)
1171 		return (0);
1172 
1173 	__wt_ref_info(ref, &addr, &addr_size, NULL);
1174 	WT_RET(__wt_btree_block_free(session, addr, addr_size));
1175 
1176 	/* Clear the address (so we don't free it twice). */
1177 	__wt_ref_addr_free(session, ref);
1178 	return (0);
1179 }
1180 
1181 /*
1182  * __wt_page_del_active --
1183  *	Return if a truncate operation is active.
1184  */
1185 static inline bool
1186 __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
1187 {
1188 	WT_PAGE_DELETED *page_del;
1189 	uint8_t prepare_state;
1190 
1191 	if ((page_del = ref->page_del) == NULL)
1192 		return (false);
1193 	if (page_del->txnid == WT_TXN_ABORTED)
1194 		return (false);
1195 	WT_ORDERED_READ(prepare_state, page_del->prepare_state);
1196 	if (prepare_state == WT_PREPARE_INPROGRESS ||
1197 	    prepare_state == WT_PREPARE_LOCKED)
1198 		return (true);
1199 	return (visible_all ?
1200 	    !__wt_txn_visible_all(session,
1201 	    page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp)) :
1202 	    !__wt_txn_visible(session,
1203 	    page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp)));
1204 }
1205 
1206 /*
1207  * __wt_page_las_active --
1208  *	Return if lookaside data for a page is still required.
1209  */
1210 static inline bool
1211 __wt_page_las_active(WT_SESSION_IMPL *session, WT_REF *ref)
1212 {
1213 	WT_PAGE_LOOKASIDE *page_las;
1214 
1215 	if ((page_las = ref->page_las) == NULL)
1216 		return (false);
1217 	if (page_las->resolved)
1218 		return (false);
1219 	if (!page_las->skew_newest)
1220 		return (true);
1221 	if (__wt_txn_visible_all(session, page_las->max_txn,
1222 	    WT_TIMESTAMP_NULL(&page_las->max_timestamp)))
1223 		return (false);
1224 
1225 	return (true);
1226 }
1227 
1228 /*
1229  * __wt_btree_can_evict_dirty --
1230  *	Check whether eviction of dirty pages or splits are permitted in the
1231  *	current tree.
1232  *
1233  *      We cannot evict dirty pages or split while a checkpoint is in progress,
1234  *      unless the checkpoint thread is doing the work.
1235  *
1236  *	Also, during connection close, if we take a checkpoint as of a
1237  *	timestamp, eviction should not write dirty pages to avoid updates newer
1238  *	than the checkpoint timestamp leaking to disk.
1239  */
1240 static inline bool
1241 __wt_btree_can_evict_dirty(WT_SESSION_IMPL *session)
1242 {
1243 	WT_BTREE *btree;
1244 
1245 	btree = S2BT(session);
1246 	return ((!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) &&
1247 	    !F_ISSET(S2C(session), WT_CONN_CLOSING_TIMESTAMP));
1248 }
1249 
1250 /*
1251  * __wt_leaf_page_can_split --
1252  *	Check whether a page can be split in memory.
1253  */
1254 static inline bool
1255 __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
1256 {
1257 	WT_BTREE *btree;
1258 	WT_INSERT *ins;
1259 	WT_INSERT_HEAD *ins_head;
1260 	size_t size;
1261 	int count;
1262 
1263 	btree = S2BT(session);
1264 
1265 	/*
1266 	 * Checkpoints can't do in-memory splits in the tree they are walking:
1267 	 * that can lead to corruption when the parent internal page is
1268 	 * updated.
1269 	 */
1270 	if (WT_SESSION_BTREE_SYNC(session))
1271 		return (false);
1272 
1273 	/*
1274 	 * Only split a page once, otherwise workloads that update in the middle
1275 	 * of the page could continually split without benefit.
1276 	 */
1277 	if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
1278 		return (false);
1279 
1280 	/*
1281 	 * Check for pages with append-only workloads. A common application
1282 	 * pattern is to have multiple threads frantically appending to the
1283 	 * tree. We want to reconcile and evict this page, but we'd like to
1284 	 * do it without making the appending threads wait. See if it's worth
1285 	 * doing a split to let the threads continue before doing eviction.
1286 	 *
1287 	 * Ignore anything other than large, dirty leaf pages. We depend on the
1288 	 * page being dirty for correctness (the page must be reconciled again
1289 	 * before being evicted after the split, information from a previous
1290 	 * reconciliation will be wrong, so we can't evict immediately).
1291 	 */
1292 	if (page->memory_footprint < btree->splitmempage)
1293 		return (false);
1294 	if (WT_PAGE_IS_INTERNAL(page))
1295 		return (false);
1296 	if (!__wt_page_is_modified(page))
1297 		return (false);
1298 
1299 	/*
1300 	 * There is no point doing an in-memory split unless there is a lot of
1301 	 * data in the last skiplist on the page.  Split if there are enough
1302 	 * items and the skiplist does not fit within a single disk page.
1303 	 */
1304 	ins_head = page->type == WT_PAGE_ROW_LEAF ?
1305 	    (page->entries == 0 ?
1306 	    WT_ROW_INSERT_SMALLEST(page) :
1307 	    WT_ROW_INSERT_SLOT(page, page->entries - 1)) :
1308 	    WT_COL_APPEND(page);
1309 	if (ins_head == NULL)
1310 		return (false);
1311 
1312 	/*
1313 	 * In the extreme case, where the page is much larger than the maximum
1314 	 * size, split as soon as there are 5 items on the page.
1315 	 */
1316 #define	WT_MAX_SPLIT_COUNT	5
1317 	if (page->memory_footprint > (size_t)btree->maxleafpage * 2) {
1318 		for (count = 0, ins = ins_head->head[0];
1319 		    ins != NULL;
1320 		    ins = ins->next[0]) {
1321 			if (++count < WT_MAX_SPLIT_COUNT)
1322 				continue;
1323 
1324 			WT_STAT_CONN_INCR(session, cache_inmem_splittable);
1325 			WT_STAT_DATA_INCR(session, cache_inmem_splittable);
1326 			return (true);
1327 		}
1328 
1329 		return (false);
1330 	}
1331 
1332 	/*
1333 	 * Rather than scanning the whole list, walk a higher level, which
1334 	 * gives a sample of the items -- at level 0 we have all the items, at
1335 	 * level 1 we have 1/4 and at level 2 we have 1/16th.  If we see more
1336 	 * than 30 items and more data than would fit in a disk page, split.
1337 	 */
1338 #define	WT_MIN_SPLIT_DEPTH	2
1339 #define	WT_MIN_SPLIT_COUNT	30
1340 #define	WT_MIN_SPLIT_MULTIPLIER 16      /* At level 2, we see 1/16th entries */
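	/*
	 * Editorial example of the estimate (illustrative numbers): walking
	 * level WT_MIN_SPLIT_DEPTH, each entry seen stands in for roughly
	 * WT_MIN_SPLIT_MULTIPLIER items, so 4 sampled entries whose keys and
	 * update lists average 1KB suggest about 64 items and 64KB of data;
	 * with a 32KB maxleafpage that clears both the WT_MIN_SPLIT_COUNT and
	 * size thresholds below.
	 */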
1341 
1342 	for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH];
1343 	    ins != NULL;
1344 	    ins = ins->next[WT_MIN_SPLIT_DEPTH]) {
1345 		count += WT_MIN_SPLIT_MULTIPLIER;
1346 		size += WT_MIN_SPLIT_MULTIPLIER *
1347 		    (WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd));
1348 		if (count > WT_MIN_SPLIT_COUNT &&
1349 		    size > (size_t)btree->maxleafpage) {
1350 			WT_STAT_CONN_INCR(session, cache_inmem_splittable);
1351 			WT_STAT_DATA_INCR(session, cache_inmem_splittable);
1352 			return (true);
1353 		}
1354 	}
1355 	return (false);
1356 }
1357 
1358 /*
1359  * __wt_page_evict_retry --
1360  *	Avoid busy-spinning attempting to evict the same page all the time.
1361  */
1362 static inline bool
1363 __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page)
1364 {
1365 	WT_DECL_TIMESTAMP(pinned_ts)
1366 	WT_PAGE_MODIFY *mod;
1367 	WT_TXN_GLOBAL *txn_global;
1368 
1369 	txn_global = &S2C(session)->txn_global;
1370 
1371 	/*
1372 	 * If the page hasn't been through one round of update/restore, give it
1373 	 * a try.
1374 	 */
1375 	if ((mod = page->modify) == NULL ||
1376 	    !FLD_ISSET(mod->restore_state, WT_PAGE_RS_RESTORED))
1377 		return (true);
1378 
1379 	/*
1380 	 * Retry if a reasonable amount of eviction time has passed, the
1381 	 * Retry if a reasonable amount of eviction time has passed; the
1382 	 * currently pretty arbitrary.
1383 	 */
1384 	if (__wt_cache_aggressive(session) ||
1385 	    mod->last_evict_pass_gen + 5 < S2C(session)->cache->evict_pass_gen)
1386 		return (true);
1387 
1388 	/* Retry if the global transaction state has moved forward. */
1389 	if (txn_global->current == txn_global->oldest_id ||
1390 	    mod->last_eviction_id != __wt_txn_oldest_id(session))
1391 		return (true);
1392 
1393 #ifdef HAVE_TIMESTAMPS
1394 	if (__wt_timestamp_iszero(&mod->last_eviction_timestamp))
1395 		return (true);
1396 
1397 	__wt_txn_pinned_timestamp(session, &pinned_ts);
1398 	if (__wt_timestamp_cmp(&pinned_ts, &mod->last_eviction_timestamp) > 0)
1399 		return (true);
1400 #endif
1401 
1402 	return (false);
1403 }
1404 
1405 /*
1406  * __wt_page_can_evict --
1407  *	Check whether a page can be evicted.
1408  */
1409 static inline bool
1410 __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
1411 {
1412 	WT_PAGE *page;
1413 	WT_PAGE_MODIFY *mod;
1414 	bool modified;
1415 
1416 	if (inmem_splitp != NULL)
1417 		*inmem_splitp = false;
1418 
1419 	page = ref->page;
1420 	mod = page->modify;
1421 
1422 	/* A truncated page can't be evicted until the truncate completes. */
1423 	if (__wt_page_del_active(session, ref, true))
1424 		return (false);
1425 
1426 	/* Otherwise, never modified pages can always be evicted. */
1427 	if (mod == NULL)
1428 		return (true);
1429 
1430 	/*
1431 	 * We can't split or evict multiblock row-store pages where the parent's
1432 	 * key for the page is an overflow item, because the split into the
1433 	 * parent frees the backing blocks for any no-longer-used overflow keys,
1434 	 * which will corrupt the checkpoint's block management.
1435 	 */
1436 	if (!__wt_btree_can_evict_dirty(session) &&
1437 	    F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS))
1438 		return (false);
1439 
1440 	/*
1441 	 * Check for in-memory splits before other eviction tests. If the page
1442 	 * should split in-memory, return success immediately and skip more
1443 	 * detailed eviction tests. We don't need further tests since the page
1444 	 * won't be written or discarded from the cache.
1445 	 */
1446 	if (__wt_leaf_page_can_split(session, page)) {
1447 		if (inmem_splitp != NULL)
1448 			*inmem_splitp = true;
1449 		return (true);
1450 	}
1451 
1452 	modified = __wt_page_is_modified(page);
1453 
1454 	/*
1455 	 * If the file is being checkpointed, other threads can't evict dirty
1456 	 * pages: if a page is written and the previous version freed, that
1457 	 * previous version might be referenced by an internal page already
1458 	 * written in the checkpoint, leaving the checkpoint inconsistent.
1459 	 */
1460 	if (modified && !__wt_btree_can_evict_dirty(session)) {
1461 		WT_STAT_CONN_INCR(session, cache_eviction_checkpoint);
1462 		WT_STAT_DATA_INCR(session, cache_eviction_checkpoint);
1463 		return (false);
1464 	}
1465 
1466 	/*
1467 	 * If a split created new internal pages, those newly created internal
1468 	 * pages cannot be evicted until all threads are known to have exited
1469 	 * the original parent page's index, because evicting an internal page
1470 	 * discards its WT_REF array, and a thread traversing the original
1471 	 * parent page index might see a freed WT_REF.
1472 	 *
1473 	 * One special case where we know this is safe is if the handle is
1474 	 * locked exclusive (e.g., when the whole tree is being evicted).  In
1475 	 * that case, no readers can be looking at an old index.
1476 	 */
1477 	if (WT_PAGE_IS_INTERNAL(page) &&
1478 	    !F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE) &&
1479 	    __wt_gen_active(session, WT_GEN_SPLIT, page->pg_intl_split_gen))
1480 		return (false);
1481 
1482 	/*
1483 	 * If the page is clean but has modifications that appear too new to
1484 	 * evict, skip it.
1485 	 */
1486 	if (!modified && !__wt_txn_visible_all(session,
1487 	    mod->rec_max_txn, WT_TIMESTAMP_NULL(&mod->rec_max_timestamp)))
1488 		return (false);
1489 
1490 	return (true);
1491 }
1492 
1493 /*
1494  * __wt_page_release --
1495  *	Release a reference to a page.
1496  */
1497 static inline int
1498 __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
1499 {
1500 	WT_BTREE *btree;
1501 	WT_PAGE *page;
1502 	bool inmem_split;
1503 
1504 	btree = S2BT(session);
1505 
1506 	/*
1507 	 * Discard our hazard pointer.  Ignore pages we don't have and the root
1508 	 * page, which sticks in memory, regardless.
1509 	 */
1510 	if (ref == NULL || ref->page == NULL || __wt_ref_is_root(ref))
1511 		return (0);
1512 
1513 	/*
1514 	 * If hazard pointers aren't necessary for this file, we can't be
1515 	 * evicting, we're done.
1516 	 */
1517 	if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
1518 		return (0);
1519 
1520 	/*
1521 	 * Attempt to evict pages with the special "oldest" read generation.
1522 	 * This is set for pages that grow larger than the configured
1523 	 * memory_page_max setting, when we see many deleted items, and when we
1524 	 * are attempting to scan without trashing the cache.
1525 	 *
1526 	 * Checkpoint should not queue pages for urgent eviction if they require
1527 	 * dirty eviction: there is a special exemption that allows checkpoint
1528 	 * to evict dirty pages in a tree that is being checkpointed, and no
1529 	 * other thread can help with that. Checkpoints don't rely on this code
1530 	 * for dirty eviction: that is handled explicitly in __wt_sync_file.
1531 	 *
1532 	 * If the operation has disabled eviction or splitting, or the session
1533 	 * is prevented from reconciling, then just queue the page for urgent
1534 	 * eviction.  Otherwise, attempt to release and evict it.
1535 	 */
1536 	page = ref->page;
1537 	if (WT_READGEN_EVICT_SOON(page->read_gen) &&
1538 	    btree->evict_disabled == 0 &&
1539 	    __wt_page_can_evict(session, ref, &inmem_split) &&
1540 	    (!WT_SESSION_IS_CHECKPOINT(session) ||
1541 	    __wt_page_evict_clean(page))) {
1542 		if (LF_ISSET(WT_READ_NO_EVICT) ||
1543 		    (inmem_split ? LF_ISSET(WT_READ_NO_SPLIT) :
1544 		    F_ISSET(session, WT_SESSION_NO_RECONCILE)))
1545 			WT_IGNORE_RET(
1546 			    __wt_page_evict_urgent(session, ref));
1547 		else {
1548 			WT_RET_BUSY_OK(
1549 			    __wt_page_release_evict(session, ref));
1550 			return (0);
1551 		}
1552 	}
1553 
1554 	return (__wt_hazard_clear(session, ref));
1555 }
1556 
1557 /*
1558  * __wt_skip_choose_depth --
1559  *	Randomly choose a depth for a skiplist insert.
1560  */
1561 static inline u_int
1562 __wt_skip_choose_depth(WT_SESSION_IMPL *session)
1563 {
1564 	u_int d;
1565 
1566 	for (d = 1; d < WT_SKIP_MAXDEPTH &&
1567 	    __wt_random(&session->rnd) < WT_SKIP_PROBABILITY; d++)
1568 		;
1569 	return (d);
1570 }
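
/*
 * Editorial note on the distribution (the exact WT_SKIP_PROBABILITY value is
 * an assumption here): if the constant corresponds to roughly a one-in-four
 * chance, each additional level is kept with probability ~0.25, so depth d is
 * chosen with probability about 0.75 * 0.25^(d-1), for an expected depth of
 * ~1.33; depths near WT_SKIP_MAXDEPTH are vanishingly rare even without the
 * explicit bound.
 */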
1571 
1572 /*
1573  * __wt_btree_lsm_over_size --
1574  *	Return if the size of an in-memory tree with a single leaf page is over
1575  * a specified maximum.  If called on anything other than a simple tree with a
1576  * single leaf page, returns true so our LSM caller will switch to a new tree.
1577  */
1578 static inline bool
1579 __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
1580 {
1581 	WT_BTREE *btree;
1582 	WT_PAGE *child, *root;
1583 	WT_PAGE_INDEX *pindex;
1584 	WT_REF *first;
1585 
1586 	btree = S2BT(session);
1587 	root = btree->root.page;
1588 
1589 	/* Check for a non-existent tree. */
1590 	if (root == NULL)
1591 		return (false);
1592 
1593 	/* A tree that can be evicted always requires a switch. */
1594 	if (btree->evict_disabled == 0)
1595 		return (true);
1596 
1597 	/* Check for a tree with a single leaf page. */
1598 	WT_INTL_INDEX_GET(session, root, pindex);
1599 	if (pindex->entries != 1)		/* > 1 child page, switch */
1600 		return (true);
1601 
1602 	first = pindex->index[0];
1603 	if (first->state != WT_REF_MEM)		/* no child page, ignore */
1604 		return (false);
1605 
1606 	/*
1607 	 * We're reaching down into the page without a hazard pointer, but
1608 	 * that's OK because we know that no-eviction is set and so the page
1609 	 * cannot disappear.
1610 	 */
1611 	child = first->page;
1612 	if (child->type != WT_PAGE_ROW_LEAF)	/* not a single leaf page */
1613 		return (true);
1614 
1615 	return (child->memory_footprint > maxsize);
1616 }
1617 
1618 /*
1619  * __wt_split_descent_race --
1620  *	Return if we raced with an internal page split when descending the tree.
1621  */
1622 static inline bool
1623 __wt_split_descent_race(
1624     WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX *saved_pindex)
1625 {
1626 	WT_PAGE_INDEX *pindex;
1627 
1628 	/* No test when starting the descent (there's no home to check). */
1629 	if (__wt_ref_is_root(ref))
1630 		return (false);
1631 
1632 	/*
1633 	 * A place to hang this comment...
1634 	 *
1635 	 * There's a page-split race when we walk the tree: if we're splitting
1636 	 * an internal page into its parent, we update the parent's page index
1637 	 * before updating the split page's page index, and it's not an atomic
1638 	 * update. A thread can read the parent page's original page index and
1639 	 * then read the split page's replacement index.
1640 	 *
1641 	 * For example, imagine a search descending the tree.
1642 	 *
1643 	 * Because internal page splits work by truncating the original page to
1644 	 * the initial part of the original page, the result of this race is we
1645 	 * will have a search key that points past the end of the current page.
1646 	 * This is only an issue when we search past the end of the page, if we
1647 	 * find a WT_REF in the page with the namespace we're searching for, we
1648 	 * don't care if the WT_REF moved or not while we were searching, we
1649 	 * have the correct page.
1650 	 *
1651 	 * For example, imagine an internal page with 3 child pages, with the
1652 	 * namespaces a-f, g-h and i-j; the first child page splits. The parent
1653 	 * starts out with the following page-index:
1654 	 *
1655 	 *	| ... | a | g | i | ... |
1656 	 *
1657 	 * which changes to this:
1658 	 *
1659 	 *	| ... | a | c | e | g | i | ... |
1660 	 *
1661 	 * The child starts out with the following page-index:
1662 	 *
1663 	 *	| a | b | c | d | e | f |
1664 	 *
1665 	 * which changes to this:
1666 	 *
1667 	 *	| a | b |
1668 	 *
1669 	 * The thread searches the original parent page index for the key "cat",
1670 	 * it couples to the "a" child page; if it uses the replacement child
1671 	 * page index, it will search past the end of the page and couple to the
1672 	 * "b" page, which is wrong.
1673 	 *
1674 	 * To detect the problem, we remember the parent page's page index used
1675 	 * to descend the tree. Whenever we search past the end of a page, we
1676 	 * check to see if the parent's page index has changed since our use of
1677 	 * it during descent. As the problem only appears if we read the split
1678 	 * page's replacement index, the parent page's index must already have
1679 	 * changed, ensuring we detect the problem.
1680 	 *
1681 	 * It's possible for the opposite race to happen (a thread could read
1682 	 * the parent page's replacement page index and then read the split
1683 	 * page's original index). This isn't a problem because internal splits
1684 	 * work by truncating the split page, so the split page search is for
1685 	 * content the split page retains after the split, and we ignore this
1686 	 * race.
1687 	 *
1688 	 * This code is a general purpose check for a descent race and we call
1689 	 * it in other cases, for example, a cursor traversing backwards through
1690 	 * the tree.
1691 	 *
1692 	 * Presumably we acquired a page index on the child page before calling
1693 	 * this code, don't re-order that acquisition with this check.
1694 	 */
1695 	WT_BARRIER();
1696 	WT_INTL_INDEX_GET(session, ref->home, pindex);
1697 	return (pindex != saved_pindex);
1698 }
1699 
1700 /*
1701  * __wt_page_swap_func --
1702  *	Swap one page's hazard pointer for another one when hazard pointer
1703  * coupling up/down the tree.
1704  */
1705 static inline int
1706 __wt_page_swap_func(
1707     WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags
1708 #ifdef HAVE_DIAGNOSTIC
1709     , const char *func, int line
1710 #endif
1711     )
1712 {
1713 	WT_DECL_RET;
1714 	bool acquired;
1715 
1716 	/*
1717 	 * This function is here to simplify the error handling during hazard
1718 	 * pointer coupling so we never leave a hazard pointer dangling.  The
1719 	 * assumption is we're holding a hazard pointer on "held", and want to
1720 	 * acquire a hazard pointer on "want", releasing the hazard pointer on
1721 	 * "held" when we're done.
1722 	 *
1723 	 * When walking the tree, we sometimes swap to the same page. Fast-path
1724 	 * that to avoid thinking about error handling.
1725 	 */
1726 	if (held == want)
1727 		return (0);
1728 
1729 	/* Get the wanted page. */
1730 	ret = __wt_page_in_func(session, want, flags
1731 #ifdef HAVE_DIAGNOSTIC
1732 	    , func, line
1733 #endif
1734 	    );
1735 
1736 	/*
1737 	 * Expected failures: page not found or restart. Our callers list the
1738 	 * errors they're expecting to handle.
1739 	 */
1740 	if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
1741 		return (WT_NOTFOUND);
1742 	if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
1743 		return (WT_RESTART);
1744 
1745 	/* Discard the original held page on either success or error. */
1746 	acquired = ret == 0;
1747 	WT_TRET(__wt_page_release(session, held, flags));
1748 
1749 	/* Fast-path expected success. */
1750 	if (ret == 0)
1751 		return (0);
1752 
1753 	/*
1754 	 * If there was an error at any point that our caller isn't prepared to
1755 	 * handle, discard any page we acquired.
1756 	 */
1757 	if (acquired)
1758 		WT_TRET(__wt_page_release(session, want, flags));
1759 
1760 	/*
1761 	 * If we're returning an error, don't let it be one our caller expects
1762 	 * to handle as returned by page-in: the expectation includes the held
1763 	 * page not having been released, and that's not the case.
1764 	 */
1765 	if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
1766 		WT_RET_MSG(session,
1767 		    EINVAL, "page-release WT_NOTFOUND error mapped to EINVAL");
1768 	if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
1769 		WT_RET_MSG(session,
1770 		    EINVAL, "page-release WT_RESTART error mapped to EINVAL");
1771 
1772 	return (ret);
1773 }
1774