1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 /*
12  * __ref_index_slot --
13  *      Return the page's index and slot for a reference.
14  */
15 static inline void
__ref_index_slot(WT_SESSION_IMPL * session,WT_REF * ref,WT_PAGE_INDEX ** pindexp,uint32_t * slotp)16 __ref_index_slot(WT_SESSION_IMPL *session,
17     WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
18 {
19 	WT_PAGE_INDEX *pindex;
20 	WT_REF **start, **stop, **p, **t;
21 	uint64_t sleep_usecs, yield_count;
22 	uint32_t entries, slot;
23 
24 	/*
25 	 * If we don't find our reference, the page split and our home
26 	 * pointer references the wrong page. When internal pages
27 	 * split, their WT_REF structure home values are updated; yield
28 	 * and wait for that to happen.
29 	 */
30 	for (sleep_usecs = yield_count = 0;;) {
31 		/*
32 		 * Copy the parent page's index value: the page can split at
33 		 * any time, but the index's value is always valid, even if
34 		 * it's not up-to-date.
35 		 */
36 		WT_INTL_INDEX_GET(session, ref->home, pindex);
37 		entries = pindex->entries;
38 
39 		/*
40 		 * Use the page's reference hint: it should be correct unless
41 		 * there was a split or delete in the parent before our slot.
42 		 * If the hint is wrong, it can be either too big or too small,
43 		 * but often only by a small amount.  Search up and down the
44 		 * index starting from the hint.
45 		 *
46 		 * It's not an error for the reference hint to be wrong, it
47 		 * just means the first retrieval (which sets the hint for
48 		 * subsequent retrievals), is slower.
49 		 */
50 		slot = ref->pindex_hint;
51 		if (slot >= entries)
52 			slot = entries - 1;
53 		if (pindex->index[slot] == ref)
54 			goto found;
55 		for (start = &pindex->index[0],
56 		    stop = &pindex->index[entries - 1],
57 		    p = t = &pindex->index[slot];
58 		    p > start || t < stop;) {
59 			if (p > start && *--p == ref) {
60 				slot = (uint32_t)(p - start);
61 				goto found;
62 			}
63 			if (t < stop && *++t == ref) {
64 				slot = (uint32_t)(t - start);
65 				goto found;
66 			}
67 		}
68 		/*
69 		 * We failed to get the page index and slot reference, yield
70 		 * before retrying, and if we've yielded enough times, start
71 		 * sleeping so we don't burn CPU to no purpose.
72 		 */
73 		__wt_spin_backoff(&yield_count, &sleep_usecs);
74 		WT_STAT_CONN_INCRV(session, page_index_slot_ref_blocked,
75 		    sleep_usecs);
76 	}
77 
78 found:	WT_ASSERT(session, pindex->index[slot] == ref);
79 	*pindexp = pindex;
80 	*slotp = slot;
81 }
82 
83 /*
84  * __ref_is_leaf --
85  *	Check if a reference is for a leaf page.
86  */
87 static inline bool
__ref_is_leaf(WT_REF * ref)88 __ref_is_leaf(WT_REF *ref)
89 {
90 	size_t addr_size;
91 	const uint8_t *addr;
92 	u_int type;
93 
94 	/*
95 	 * If the page has a disk address, we can crack it to figure out if
96 	 * this page is a leaf page or not. If there's no address, the page
97 	 * isn't on disk and we don't know the page type.
98 	 */
99 	__wt_ref_info(ref, &addr, &addr_size, &type);
100 	return (addr == NULL ?
101 	    false : type == WT_CELL_ADDR_LEAF || type == WT_CELL_ADDR_LEAF_NO);
102 }
103 
104 /*
105  * __ref_ascend --
106  *	Ascend the tree one level.
107  */
108 static inline void
__ref_ascend(WT_SESSION_IMPL * session,WT_REF ** refp,WT_PAGE_INDEX ** pindexp,uint32_t * slotp)109 __ref_ascend(WT_SESSION_IMPL *session,
110     WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
111 {
112 	WT_REF *parent_ref, *ref;
113 
114 	/*
115 	 * Ref points to the first/last slot on an internal page from which we
116 	 * are ascending the tree, moving to the parent page. This is tricky
117 	 * because the internal page we're on may be splitting into its parent.
118 	 * Find a stable configuration where the page we start from and the
119 	 * page we're moving to are connected. The tree eventually stabilizes
120 	 * into that configuration, keep trying until we succeed.
121 	 */
122 	for (ref = *refp;;) {
123 		/*
124 		 * Find our parent slot on the next higher internal page, the
125 		 * slot from which we move to a next/prev slot, checking that
126 		 * we haven't reached the root.
127 		 */
128 		parent_ref = ref->home->pg_intl_parent_ref;
129 		if (__wt_ref_is_root(parent_ref))
130 			break;
131 		__ref_index_slot(session, parent_ref, pindexp, slotp);
132 
133 		/*
134 		 * There's a split race when a cursor moving forwards through
135 		 * the tree ascends the tree. If we're splitting an internal
136 		 * page into its parent, we move the WT_REF structures and
137 		 * then update the parent's page index before updating the split
138 		 * page's page index, and it's not an atomic update. A thread
139 		 * can read the split page's original page index and then read
140 		 * the parent page's replacement index.
141 		 *
142 		 * This can create a race for next-cursor movements.
143 		 *
144 		 * For example, imagine an internal page with 3 child pages,
145 		 * with the namespaces a-f, g-h and i-j; the first child page
146 		 * splits. The parent starts out with the following page-index:
147 		 *
148 		 *	| ... | a | g | i | ... |
149 		 *
150 		 * which changes to this:
151 		 *
152 		 *	| ... | a | c | e | g | i | ... |
153 		 *
154 		 * The split page starts out with the following page-index:
155 		 *
156 		 *	| a | b | c | d | e | f |
157 		 *
158 		 * Imagine a cursor finishing the 'f' part of the namespace that
159 		 * starts its ascent to the parent's 'a' slot. Then the page
160 		 * splits and the parent page's page index is replaced. If the
161 		 * cursor then searches the parent's replacement page index for
162 		 * the 'a' slot, it finds it and then increments to the slot
163 		 * after the 'a' slot, the 'c' slot, and then it incorrectly
164 		 * repeats its traversal of part of the namespace.
165 		 *
166 		 * This function takes a WT_REF argument which is the page from
167 		 * which we start our ascent. If the parent's slot we find in
168 		 * our search doesn't point to the same page as that initial
169 		 * WT_REF, there's a race and we start over again.
170 		 */
171 		if (ref->home == parent_ref->page)
172 			break;
173 	}
174 
175 	*refp = parent_ref;
176 }
177 
178 /*
179  * __split_prev_race --
180  *	Check for races when descending the tree during a previous-cursor walk.
181  */
182 static inline bool
__split_prev_race(WT_SESSION_IMPL * session,WT_REF * ref,WT_PAGE_INDEX ** pindexp)183 __split_prev_race(
184     WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
185 {
186 	WT_PAGE_INDEX *pindex;
187 
188 	/*
189 	 * Handle a cursor moving backwards through the tree or setting up at
190 	 * the end of the tree. We're passed the child page into which we're
191 	 * descending, and the parent page's page-index we used to find that
192 	 * child page.
193 	 *
194 	 * When splitting an internal page into its parent, we move the split
195 	 * pages WT_REF structures, then update the parent's page index, then
196 	 * update the split page's page index, and nothing is atomic. A thread
197 	 * can read the parent page's replacement page index and then the split
198 	 * page's original index, or vice-versa, and either change can cause a
199 	 * cursor moving backwards through the tree to skip pages.
200 	 *
201 	 * This isn't a problem for a cursor setting up at the start of the tree
202 	 * or moving forward through the tree because we do right-hand splits on
203 	 * internal pages and the initial part of the split page's namespace
204 	 * won't change as part of a split (in other words, a thread reading the
205 	 * parent page's and split page's indexes will move to the same slot no
206 	 * matter what order of indexes are read.
207 	 *
208 	 * Acquire the child's page index, then confirm the parent's page index
209 	 * hasn't changed, to check for reading an old version of the parent's
210 	 * page index and then reading a new version of the child's page index.
211 	 */
212 	WT_INTL_INDEX_GET(session, ref->page, pindex);
213 	if (__wt_split_descent_race(session, ref, *pindexp))
214 		return (true);
215 
216 	/*
217 	 * That doesn't check if we read a new version of parent's page index
218 	 * and then an old version of the child's page index. For example, if
219 	 * a thread were in a newly created split page subtree, the split
220 	 * completes into the parent before the thread reads it and descends
221 	 * into the child (where the split hasn't yet completed).
222 	 *
223 	 * Imagine an internal page with 3 child pages, with the namespaces a-f,
224 	 * g-h and i-j; the first child page splits. The parent starts out with
225 	 * the following page-index:
226 	 *
227 	 *	| ... | a | g | i | ... |
228 	 *
229 	 * The split page starts out with the following page-index:
230 	 *
231 	 *	| a | b | c | d | e | f |
232 	 *
233 	 * The first step is to move the c-f ranges into a new subtree, so, for
234 	 * example we might have two new internal pages 'c' and 'e', where the
235 	 * new 'c' page references the c-d namespace and the new 'e' page
236 	 * references the e-f namespace. The top of the subtree references the
237 	 * parent page, but until the parent's page index is updated, threads in
238 	 * the subtree won't be able to ascend out of the subtree. However, once
239 	 * the parent page's page index is updated to this:
240 	 *
241 	 *	| ... | a | c | e | g | i | ... |
242 	 *
243 	 * threads in the subtree can ascend into the parent. Imagine a cursor
244 	 * in the c-d part of the namespace that ascends to the parent's 'c'
245 	 * slot. It would then decrement to the slot before the 'c' slot, the
246 	 * 'a' slot.
247 	 *
248 	 * The previous-cursor movement selects the last slot in the 'a' page;
249 	 * if the split page's page-index hasn't been updated yet, it selects
250 	 * the 'f' slot, which is incorrect. Once the split page's page index is
251 	 * updated to this:
252 	 *
253 	 *	| a | b |
254 	 *
255 	 * the previous-cursor movement will select the 'b' slot, which is
256 	 * correct.
257 	 *
258 	 * If the last slot on the page no longer points to the current page as
259 	 * its "home", the page is being split and part of its namespace moved,
260 	 * restart. (We probably don't have to restart, I think we could spin
261 	 * until the page-index is updated, but I'm not willing to debug that
262 	 * one if I'm wrong.)
263 	 */
264 	if (pindex->index[pindex->entries - 1]->home != ref->page)
265 		return (true);
266 
267 	*pindexp = pindex;
268 	return (false);
269 }
270 
271 /*
272  * __tree_walk_internal --
273  *	Move to the next/previous page in the tree.
274  */
275 static inline int
__tree_walk_internal(WT_SESSION_IMPL * session,WT_REF ** refp,uint64_t * walkcntp,int (* skip_func)(WT_SESSION_IMPL *,WT_REF *,void *,bool *),void * func_cookie,uint32_t flags)276 __tree_walk_internal(WT_SESSION_IMPL *session,
277     WT_REF **refp, uint64_t *walkcntp,
278     int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *),
279     void *func_cookie, uint32_t flags)
280 {
281 	WT_BTREE *btree;
282 	WT_DECL_RET;
283 	WT_PAGE_INDEX *pindex;
284 	WT_REF *couple, *ref, *ref_orig;
285 	uint64_t restart_sleep, restart_yield, swap_sleep, swap_yield;
286 	uint32_t current_state, slot;
287 	bool empty_internal, prev, skip;
288 
289 	btree = S2BT(session);
290 	pindex = NULL;
291 	restart_sleep = restart_yield = swap_sleep = swap_yield = 0;
292 	empty_internal = false;
293 
294 	/*
295 	 * We're not supposed to walk trees without root pages. As this has not
296 	 * always been the case, assert to debug that change.
297 	 */
298 	WT_ASSERT(session, btree->root.page != NULL);
299 
300 	/* Check whether deleted pages can be skipped. */
301 	if (!LF_ISSET(WT_READ_DELETED_SKIP))
302 		LF_SET(WT_READ_DELETED_CHECK);
303 
304 	/*
305 	 * !!!
306 	 * Fast-truncate currently only works on row-store trees.
307 	 */
308 	if (btree->type != BTREE_ROW)
309 		LF_CLR(WT_READ_TRUNCATE);
310 
311 	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;
312 
313 	/*
314 	 * There are multiple reasons and approaches to walking the in-memory
315 	 * tree:
316 	 *
317 	 * (1) finding pages to evict (the eviction server);
318 	 * (2) writing just dirty leaves or internal nodes (checkpoint);
319 	 * (3) discarding pages (close);
320 	 * (4) truncating pages in a range (fast truncate);
321 	 * (5) skipping pages based on outside information (compaction);
322 	 * (6) cursor scans (applications).
323 	 *
324 	 * Except for cursor scans and compaction, the walk is limited to the
325 	 * cache, no pages are read.  In all cases, hazard pointers protect the
326 	 * walked pages from eviction.
327 	 *
328 	 * Walks use hazard-pointer coupling through the tree and that's OK
329 	 * (hazard pointers can't deadlock, so there's none of the usual
330 	 * problems found when logically locking up a btree).  If the eviction
331 	 * thread tries to evict the active page, it fails because of our
332 	 * hazard pointer.  If eviction tries to evict our parent, that fails
333 	 * because the parent has a child page that can't be discarded.  We do
334 	 * play one game: don't couple up to our parent and then back down to a
335 	 * new leaf, couple to the next page to which we're descending, it
336 	 * saves a hazard-pointer swap for each cursor page movement.
337 	 *
338 	 * The hazard pointer on the original location is held until the end of
339 	 * the movement, in case we have to restart the movement. Take a copy
340 	 * of any held page and clear the return value (it makes future error
341 	 * handling easier).
342 	 */
343 	couple = NULL;
344 	ref_orig = *refp;
345 	*refp = NULL;
346 
347 	/*
348 	 * Tree walks are special: they look inside page structures that splits
349 	 * may want to free. Publish the tree is active during this window.
350 	 */
351 	WT_ENTER_PAGE_INDEX(session);
352 
353 	/* If no page is active, begin a walk from the start/end of the tree. */
354 	if ((ref = ref_orig) == NULL) {
355 		if (0) {
356 restart:		/*
357 			 * Yield before retrying, and if we've yielded enough
358 			 * times, start sleeping so we don't burn CPU to no
359 			 * purpose.
360 			 */
361 			__wt_spin_backoff(&restart_yield, &restart_sleep);
362 
363 			WT_ERR(__wt_page_release(session, couple, flags));
364 			couple = NULL;
365 		}
366 
367 		if ((ref = ref_orig) == NULL) {
368 			ref = &btree->root;
369 			WT_INTL_INDEX_GET(session, ref->page, pindex);
370 			slot = prev ? pindex->entries - 1 : 0;
371 			goto descend;
372 		}
373 	}
374 
375 	/*
376 	 * If the active page was the root, we've reached the walk's end; we
377 	 * only get here if we've returned the root to our caller, so we're
378 	 * holding no hazard pointers.
379 	 */
380 	if (__wt_ref_is_root(ref))
381 		goto done;
382 
383 	/* Figure out the current slot in the WT_REF array. */
384 	__ref_index_slot(session, ref, &pindex, &slot);
385 
386 	for (;;) {
387 		/*
388 		 * If we're at the last/first slot on the internal page, return
389 		 * it in post-order traversal. Otherwise move to the next/prev
390 		 * slot and left/right-most element in that subtree.
391 		 */
392 		while ((prev && slot == 0) ||
393 		    (!prev && slot == pindex->entries - 1)) {
394 			/* Ascend to the parent. */
395 			__ref_ascend(session, &ref, &pindex, &slot);
396 
397 			/*
398 			 * If at the root and returning internal pages, return
399 			 * the root page, otherwise we're done.
400 			 */
401 			if (__wt_ref_is_root(ref)) {
402 				if (!LF_ISSET(WT_READ_SKIP_INTL))
403 					*refp = ref;
404 				goto done;
405 			}
406 
407 			/*
408 			 * If we got all the way through an internal page and
409 			 * all of the child pages were deleted, mark it for
410 			 * eviction.
411 			 */
412 			if (empty_internal) {
413 				__wt_page_evict_soon(session, ref);
414 				empty_internal = false;
415 			}
416 
417 			/* Encourage races. */
418 			__wt_timing_stress(session, WT_TIMING_STRESS_SPLIT_8);
419 
420 			/* Optionally return internal pages. */
421 			if (LF_ISSET(WT_READ_SKIP_INTL))
422 				continue;
423 
424 			for (;;) {
425 				/*
426 				 * Swap our previous hazard pointer for the page
427 				 * we'll return.
428 				 *
429 				 * Not-found is an expected return, as eviction
430 				 * might have been attempted. The page can't be
431 				 * evicted, we're holding a hazard pointer on a
432 				 * child, spin until we're successful.
433 				 *
434 				 * Restart is not expected, our parent WT_REF
435 				 * should not have split.
436 				 */
437 				ret = __wt_page_swap(session,
438 				    couple, ref, WT_READ_NOTFOUND_OK | flags);
439 				if (ret == 0) {
440 					/* Success, "couple" released. */
441 					couple = NULL;
442 					*refp = ref;
443 					goto done;
444 				}
445 
446 				WT_ASSERT(session, ret == WT_NOTFOUND);
447 				WT_ERR_NOTFOUND_OK(ret);
448 
449 				__wt_spin_backoff(&swap_yield, &swap_sleep);
450 			}
451 			/* NOTREACHED */
452 		}
453 
454 		if (prev)
455 			--slot;
456 		else
457 			++slot;
458 
459 		if (walkcntp != NULL)
460 			++*walkcntp;
461 
462 		for (;;) {
463 descend:		/*
464 			 * Get a reference, setting the reference hint if it's
465 			 * wrong (used when we continue the walk). We don't
466 			 * always update the hints when splitting, it's expected
467 			 * for them to be incorrect in some workloads.
468 			 */
469 			ref = pindex->index[slot];
470 			if (ref->pindex_hint != slot)
471 				ref->pindex_hint = slot;
472 
473 			/*
474 			 * If we see any child states other than deleted, the
475 			 * page isn't empty.
476 			 */
477 			current_state = ref->state;
478 			if (current_state != WT_REF_DELETED &&
479 			    !LF_ISSET(WT_READ_TRUNCATE))
480 				empty_internal = false;
481 
482 			if (LF_ISSET(WT_READ_CACHE)) {
483 				/*
484 				 * Only look at unlocked pages in memory:
485 				 * fast-path some common cases.
486 				 */
487 				if (LF_ISSET(WT_READ_NO_WAIT) &&
488 				    current_state != WT_REF_MEM &&
489 				    current_state != WT_REF_LIMBO)
490 					break;
491 
492 				/* Skip lookaside pages if not requested. */
493 				if (current_state == WT_REF_LOOKASIDE &&
494 				    !LF_ISSET(WT_READ_LOOKASIDE))
495 					break;
496 			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
497 				/*
498 				 * Avoid pulling a deleted page back in to try
499 				 * to delete it again.
500 				 */
501 				if (current_state == WT_REF_DELETED &&
502 				    __wt_delete_page_skip(session, ref, false))
503 					break;
504 				/*
505 				 * If deleting a range, try to delete the page
506 				 * without instantiating it.
507 				 */
508 				WT_ERR(__wt_delete_page(session, ref, &skip));
509 				if (skip)
510 					break;
511 				empty_internal = false;
512 			} else if (skip_func != NULL) {
513 				WT_ERR(skip_func(session,
514 				    ref, func_cookie, &skip));
515 				if (skip)
516 					break;
517 			} else {
518 				/*
519 				 * Try to skip deleted pages visible to us.
520 				 */
521 				if (current_state == WT_REF_DELETED &&
522 				    __wt_delete_page_skip(session, ref, false))
523 					break;
524 			}
525 
526 			ret = __wt_page_swap(session, couple, ref,
527 			    WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);
528 			if (ret == 0) {
529 				/* Success, so "couple" has been released. */
530 				couple = NULL;
531 
532 				/* Return leaf pages to our caller. */
533 				if (!WT_PAGE_IS_INTERNAL(ref->page)) {
534 					*refp = ref;
535 					goto done;
536 				}
537 
538 				/* Set the new "couple" value. */
539 				couple = ref;
540 
541 				/* Configure traversal of any internal page. */
542 				empty_internal = true;
543 				if (prev) {
544 					if (__split_prev_race(
545 					    session, ref, &pindex))
546 						goto restart;
547 					slot = pindex->entries - 1;
548 				} else {
549 					WT_INTL_INDEX_GET(
550 					    session, ref->page, pindex);
551 					slot = 0;
552 				}
553 				continue;
554 			}
555 
556 			/*
557 			 * Not-found is an expected return when walking only
558 			 * in-cache pages, or if we see a deleted page.
559 			 *
560 			 * An expected error, so "couple" is unchanged.
561 			 */
562 			if (ret == WT_NOTFOUND) {
563 				WT_NOT_READ(ret, 0);
564 				break;
565 			}
566 
567 			/*
568 			 * The page we're moving to might have split, in which
569 			 * case restart the movement.
570 			 *
571 			 * An expected error, so "couple" is unchanged.
572 			 */
573 			if (ret == WT_RESTART)
574 				goto restart;
575 
576 			/* Unexpected error, so "couple" was released. */
577 			couple = NULL;
578 			goto err;
579 		}
580 	}
581 
582 done:
583 err:
584 	WT_TRET(__wt_page_release(session, couple, flags));
585 	WT_TRET(__wt_page_release(session, ref_orig, flags));
586 	WT_LEAVE_PAGE_INDEX(session);
587 	return (ret);
588 }
589 
590 /*
591  * __wt_tree_walk --
592  *	Move to the next/previous page in the tree.
593  */
594 int
__wt_tree_walk(WT_SESSION_IMPL * session,WT_REF ** refp,uint32_t flags)595 __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
596 {
597 	return (__tree_walk_internal(session, refp, NULL, NULL, NULL, flags));
598 }
599 
600 /*
601  * __wt_tree_walk_count --
602  *	Move to the next/previous page in the tree, tracking how many
603  * references were visited to get there.
604  */
605 int
__wt_tree_walk_count(WT_SESSION_IMPL * session,WT_REF ** refp,uint64_t * walkcntp,uint32_t flags)606 __wt_tree_walk_count(WT_SESSION_IMPL *session,
607     WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
608 {
609 	return (__tree_walk_internal(
610 	    session, refp, walkcntp, NULL, NULL, flags));
611 }
612 
613 /*
614  * __wt_tree_walk_custom_skip --
615  *	Walk the tree calling a custom function to decide whether to skip refs.
616  */
617 int
__wt_tree_walk_custom_skip(WT_SESSION_IMPL * session,WT_REF ** refp,int (* skip_func)(WT_SESSION_IMPL *,WT_REF *,void *,bool *),void * func_cookie,uint32_t flags)618 __wt_tree_walk_custom_skip(
619     WT_SESSION_IMPL *session, WT_REF **refp,
620     int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *),
621     void *func_cookie, uint32_t flags)
622 {
623 	return (__tree_walk_internal(
624 	    session, refp, NULL, skip_func, func_cookie, flags));
625 }
626 
627 /*
628  * __tree_walk_skip_count_callback --
629  *	Optionally skip leaf pages.
630  * When the skip-leaf-count variable is non-zero, skip some count of leaf
631  * pages, then take the next leaf page we can.
632  *
633  * The reason to do some of this work here, is because we can look at the cell
634  * and know it's a leaf page without reading it into memory. If this page is
635  * disk-based, crack the cell to figure out it's a leaf page without reading
636  * it.
637  */
638 static int
__tree_walk_skip_count_callback(WT_SESSION_IMPL * session,WT_REF * ref,void * context,bool * skipp)639 __tree_walk_skip_count_callback(
640     WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
641 {
642 	uint64_t *skipleafcntp;
643 
644 	skipleafcntp = (uint64_t *)context;
645 	WT_ASSERT(session, skipleafcntp != NULL);
646 
647 	/*
648 	 * Skip deleted pages visible to us.
649 	 */
650 	if (ref->state == WT_REF_DELETED &&
651 	    __wt_delete_page_skip(session, ref, false))
652 		*skipp = true;
653 	else if (*skipleafcntp > 0 && __ref_is_leaf(ref)) {
654 		--*skipleafcntp;
655 		*skipp = true;
656 	} else
657 		*skipp = false;
658 	return (0);
659 }
660 
661 /*
662  * __wt_tree_walk_skip --
663  *	Move to the next/previous page in the tree, skipping a certain number
664  *	of leaf pages before returning.
665  */
666 int
__wt_tree_walk_skip(WT_SESSION_IMPL * session,WT_REF ** refp,uint64_t * skipleafcntp)667 __wt_tree_walk_skip(
668     WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp)
669 {
670 	/*
671 	 * Optionally skip leaf pages, the second half. The tree-walk function
672 	 * didn't have an on-page cell it could use to figure out if the page
673 	 * was a leaf page or not, it had to acquire the hazard pointer and look
674 	 * at the page. The tree-walk code never acquires a hazard pointer on a
675 	 * leaf page without returning it, and it's not trivial to change that.
676 	 * So, the tree-walk code returns all leaf pages here and we deal with
677 	 * decrementing the count.
678 	 */
679 	do {
680 		WT_RET(__tree_walk_internal(session, refp, NULL,
681 		    __tree_walk_skip_count_callback, skipleafcntp,
682 		    WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
683 
684 		/*
685 		 * The walk skipped internal pages, any page returned must be a
686 		 * leaf page.
687 		 */
688 		if (*skipleafcntp > 0)
689 			--*skipleafcntp;
690 	} while (*skipleafcntp > 0);
691 
692 	return (0);
693 }
694