1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 /*
12  * __wt_row_random_leaf --
13  *	Return a random key from a row-store leaf page.
14  */
15 int
__wt_row_random_leaf(WT_CURSOR_BTREE * cbt)16 __wt_row_random_leaf(WT_CURSOR_BTREE *cbt)
17 {
18 	WT_INSERT *ins, **start, **stop;
19 	WT_INSERT_HEAD *ins_head;
20 	WT_PAGE *page;
21 	WT_SESSION_IMPL *session;
22 	uint64_t samples;
23 	uint32_t choice, entries, i;
24 	int level;
25 
26 	page = cbt->ref->page;
27 	session = (WT_SESSION_IMPL *)cbt->iface.session;
28 	start = stop = NULL;		/* [-Wconditional-uninitialized] */
29 	entries = 0;			/* [-Wconditional-uninitialized] */
30 
31 	__cursor_pos_clear(cbt);
32 
33 	/* If the page has disk-based entries, select from them. */
34 	if (page->entries != 0) {
35 		cbt->compare = 0;
36 		cbt->slot = __wt_random(&session->rnd) % page->entries;
37 
38 		/*
39 		 * The real row-store search function builds the key, so we
40 		 * have to as well.
41 		 */
42 		return (__wt_row_leaf_key(session,
43 		    page, page->pg_row + cbt->slot, cbt->tmp, false));
44 	}
45 
46 	/*
47 	 * If the tree is new (and not empty), it might have a large insert
48 	 * list.
49 	 *
50 	 * Walk down the list until we find a level with at least 50 entries,
51 	 * that's where we'll start rolling random numbers. The value 50 is
52 	 * used to ignore levels with only a few entries, that is, levels which
53 	 * are potentially badly skewed.
54 	 */
55 	F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
56 	if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
57 		return (WT_NOTFOUND);
58 	for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
59 		start = &ins_head->head[level];
60 		for (entries = 0, stop = start;
61 		    *stop != NULL; stop = &(*stop)->next[level])
62 			++entries;
63 
64 		if (entries > 50)
65 			break;
66 	}
67 
68 	/*
69 	 * If it's a tiny list and we went all the way to level 0, correct the
70 	 * level; entries is correctly set.
71 	 */
72 	if (level < 0)
73 		level = 0;
74 
75 	/*
76 	 * Step down the skip list levels, selecting a random chunk of the name
77 	 * space at each level.
78 	 */
79 	for (samples = entries; level > 0; samples += entries) {
80 		/*
81 		 * There are (entries) or (entries + 1) chunks of the name space
82 		 * considered at each level. They are: between start and the 1st
83 		 * element, between the 1st and 2nd elements, and so on to the
84 		 * last chunk which is the name space after the stop element on
85 		 * the current level. This last chunk of name space may or may
86 		 * not be there: as we descend the levels of the skip list, this
87 		 * chunk may appear, depending if the next level down has
88 		 * entries logically after the stop point in the current level.
89 		 * We can't ignore those entries: because of the algorithm used
90 		 * to determine the depth of a skiplist, there may be a large
91 		 * number of entries "revealed" by descending a level.
92 		 *
93 		 * If the next level down has more items after the current stop
94 		 * point, there are (entries + 1) chunks to consider, else there
95 		 * are (entries) chunks.
96 		 */
97 		if (*(stop - 1) == NULL)
98 			choice = __wt_random(&session->rnd) % entries;
99 		else
100 			choice = __wt_random(&session->rnd) % (entries + 1);
101 
102 		if (choice == entries) {
103 			/*
104 			 * We selected the name space after the stop element on
105 			 * this level. Set the start point to the current stop
106 			 * point, descend a level and move the stop element to
107 			 * the end of the list, that is, the end of the newly
108 			 * discovered name space, counting entries as we go.
109 			 */
110 			start = stop;
111 			--start;
112 			--level;
113 			for (entries = 0, stop = start;
114 			    *stop != NULL; stop = &(*stop)->next[level])
115 				++entries;
116 		} else {
117 			/*
118 			 * We selected another name space on the level. Move the
119 			 * start pointer the selected number of entries forward
120 			 * to the start of the selected chunk (if the selected
121 			 * number is 0, start won't move). Set the stop pointer
122 			 * to the next element in the list and drop both start
123 			 * and stop down a level.
124 			 */
125 			for (i = 0; i < choice; ++i)
126 				start = &(*start)->next[level];
127 			stop = &(*start)->next[level];
128 
129 			--start;
130 			--stop;
131 			--level;
132 
133 			/* Count the entries in the selected name space. */
134 			for (entries = 0,
135 			    ins = *start; ins != *stop; ins = ins->next[level])
136 				++entries;
137 		}
138 	}
139 
140 	/*
141 	 * When we reach the bottom level, entries will already be set. Select
142 	 * a random entry from the name space and return it.
143 	 *
144 	 * It should be impossible for the entries count to be 0 at this point,
145 	 * but check for it out of paranoia and to quiet static testing tools.
146 	 */
147 	if (entries > 0)
148 		entries = __wt_random(&session->rnd) % entries;
149 	for (ins = *start; entries > 0; --entries)
150 		ins = ins->next[0];
151 
152 	cbt->ins = ins;
153 	cbt->ins_head = ins_head;
154 	cbt->compare = 0;
155 
156 	/*
157 	 * Random lookups in newly created collections can be slow if a page
158 	 * consists of a large skiplist. Schedule the page for eviction if we
159 	 * encounter a large skiplist. This worthwhile because applications
160 	 * that take a sample often take many samples, so the overhead of
161 	 * traversing the skip list each time accumulates to real time.
162 	 */
163 	if (samples > 5000)
164 		__wt_page_evict_soon(session, cbt->ref);
165 
166 	return (0);
167 }
168 
169 /*
170  * __wt_random_descent --
171  *	Find a random page in a tree for either sampling or eviction.
172  */
173 int
__wt_random_descent(WT_SESSION_IMPL * session,WT_REF ** refp,uint32_t flags)174 __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
175 {
176 	WT_BTREE *btree;
177 	WT_DECL_RET;
178 	WT_PAGE *page;
179 	WT_PAGE_INDEX *pindex;
180 	WT_REF *current, *descent;
181 	uint32_t i, entries, retry;
182 	bool eviction;
183 
184 	*refp = NULL;
185 
186 	btree = S2BT(session);
187 	current = NULL;
188 	retry = 100;
189 	/*
190 	 * This function is called by eviction to find a random page in the
191 	 * cache. That case is indicated by the WT_READ_CACHE flag. Ordinary
192 	 * lookups in a tree will read pages into cache as needed.
193 	 */
194 	eviction = LF_ISSET(WT_READ_CACHE);
195 
196 	if (0) {
197 restart:	/*
198 		 * Discard the currently held page and restart the search from
199 		 * the root.
200 		 */
201 		WT_RET(__wt_page_release(session, current, flags));
202 	}
203 
204 	/* Search the internal pages of the tree. */
205 	current = &btree->root;
206 	for (;;) {
207 		page = current->page;
208 		if (!WT_PAGE_IS_INTERNAL(page))
209 			break;
210 
211 		WT_INTL_INDEX_GET(session, page, pindex);
212 		entries = pindex->entries;
213 
214 		/* Eviction just wants any random child. */
215 		if (eviction) {
216 			descent = pindex->index[
217 			    __wt_random(&session->rnd) % entries];
218 			goto descend;
219 		}
220 
221 		/*
222 		 * There may be empty pages in the tree, and they're useless to
223 		 * us. If we don't find a non-empty page in "entries" random
224 		 * guesses, take the first non-empty page in the tree. If the
225 		 * search page contains nothing other than empty pages, restart
226 		 * from the root some number of times before giving up.
227 		 *
228 		 * Random sampling is looking for a key/value pair on a random
229 		 * leaf page, and so will accept any page that contains a valid
230 		 * key/value pair, so on-disk is fine, but deleted is not.
231 		 */
232 		descent = NULL;
233 		for (i = 0; i < entries; ++i) {
234 			descent =
235 			    pindex->index[__wt_random(&session->rnd) % entries];
236 			if (descent->state == WT_REF_DISK ||
237 			    descent->state == WT_REF_LIMBO ||
238 			    descent->state == WT_REF_LOOKASIDE ||
239 			    descent->state == WT_REF_MEM)
240 				break;
241 		}
242 		if (i == entries)
243 			for (i = 0; i < entries; ++i) {
244 				descent = pindex->index[i];
245 				if (descent->state == WT_REF_DISK ||
246 				    descent->state == WT_REF_LIMBO ||
247 				    descent->state == WT_REF_LOOKASIDE ||
248 				    descent->state == WT_REF_MEM)
249 					break;
250 			}
251 		if (i == entries || descent == NULL) {
252 			if (--retry > 0)
253 				goto restart;
254 
255 			WT_RET(__wt_page_release(session, current, flags));
256 			return (WT_NOTFOUND);
257 		}
258 
259 		/*
260 		 * Swap the current page for the child page. If the page splits
261 		 * while we're retrieving it, restart the search at the root.
262 		 *
263 		 * On other error, simply return, the swap call ensures we're
264 		 * holding nothing on failure.
265 		 */
266 descend:	if ((ret = __wt_page_swap(
267 		    session, current, descent, flags)) == 0) {
268 			current = descent;
269 			continue;
270 		}
271 		if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART))
272 			break;
273 		if (ret == WT_RESTART)
274 			goto restart;
275 		return (ret);
276 	}
277 
278 	/*
279 	 * There is no point starting with the root page: the walk will exit
280 	 * immediately.  In that case we aren't holding a hazard pointer so
281 	 * there is nothing to release.
282 	 */
283 	if (!eviction || !__wt_ref_is_root(current))
284 		*refp = current;
285 	return (0);
286 }
287 
288 /*
289  * __wt_btcur_next_random --
290  *	Move to a random record in the tree. There are two algorithms, one
291  *	where we select a record at random from the whole tree on each
292  *	retrieval and one where we first select a record at random from the
293  *	whole tree, and then subsequently sample forward from that location.
294  *	The sampling approach allows us to select reasonably uniform random
295  *	points from unbalanced trees.
296  */
297 int
__wt_btcur_next_random(WT_CURSOR_BTREE * cbt)298 __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
299 {
300 	WT_BTREE *btree;
301 	WT_CURSOR *cursor;
302 	WT_DECL_RET;
303 	WT_SESSION_IMPL *session;
304 	WT_UPDATE *upd;
305 	wt_off_t size;
306 	uint64_t n, skip;
307 	uint32_t read_flags;
308 	bool valid;
309 
310 	btree = cbt->btree;
311 	cursor = &cbt->iface;
312 	session = (WT_SESSION_IMPL *)cbt->iface.session;
313 	read_flags = WT_READ_RESTART_OK;
314 	if (F_ISSET(cbt, WT_CBT_READ_ONCE))
315 		FLD_SET(read_flags, WT_READ_WONT_NEED);
316 
317 	/*
318 	 * Only supports row-store: applications can trivially select a random
319 	 * value from a column-store, if there were any reason to do so.
320 	 */
321 	if (btree->type != BTREE_ROW)
322 		WT_RET_MSG(session, ENOTSUP,
323 		    "WT_CURSOR.next_random only supported by row-store tables");
324 
325 	WT_STAT_CONN_INCR(session, cursor_next);
326 	WT_STAT_DATA_INCR(session, cursor_next);
327 
328 	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
329 
330 #ifdef HAVE_DIAGNOSTIC
331 	/*
332 	 * Under some conditions we end up using the underlying cursor.next to
333 	 * walk through the object. Since there are multiple calls, we can hit
334 	 * the cursor-order checks, turn them off.
335 	 */
336 	__wt_cursor_key_order_reset(cbt);
337 #endif
338 	/*
339 	 * If we don't have a current position in the tree, or if retrieving
340 	 * random values without sampling, pick a roughly random leaf page in
341 	 * the tree and return an entry from it.
342 	 */
343 	if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
344 		WT_ERR(__cursor_func_init(cbt, true));
345 		WT_WITH_PAGE_INDEX(session,
346 		    ret = __wt_random_descent(session, &cbt->ref, read_flags));
347 		if (ret == 0)
348 			goto random_page_entry;
349 
350 		/*
351 		 * Random descent may return not-found: the tree might be empty
352 		 * or have so many deleted items we didn't find any valid pages.
353 		 * We can't return WT_NOTFOUND to the application unless a tree
354 		 * is really empty, fallback to skipping through tree pages.
355 		 */
356 		WT_ERR_NOTFOUND_OK(ret);
357 	}
358 
359 	/*
360 	 * Cursor through the tree, skipping past the sample size of the leaf
361 	 * pages in the tree between each random key return to compensate for
362 	 * unbalanced trees.
363 	 *
364 	 * If the random descent attempt failed, we don't have a configured
365 	 * sample size, use 100 for no particular reason.
366 	 */
367 	if (cbt->next_random_sample_size == 0)
368 		cbt->next_random_sample_size = 100;
369 
370 	/*
371 	 * If the random descent attempt failed, or it's our first skip attempt,
372 	 * we haven't yet set the pages to skip, do it now.
373 	 *
374 	 * Use the underlying file size divided by its block allocation size as
375 	 * our guess of leaf pages in the file (this can be entirely wrong, as
376 	 * it depends on how many pages are in this particular checkpoint, how
377 	 * large the leaf and internal pages really are, and other factors).
378 	 * Then, divide that value by the configured sample size and increment
379 	 * the final result to make sure tiny files don't leave us with a skip
380 	 * value of 0.
381 	 *
382 	 * !!!
383 	 * Ideally, the number would be prime to avoid restart issues.
384 	 */
385 	if (cbt->next_random_leaf_skip == 0) {
386 		WT_ERR(btree->bm->size(btree->bm, session, &size));
387 		cbt->next_random_leaf_skip = (uint64_t)
388 		    ((size / btree->allocsize) /
389 		    cbt->next_random_sample_size) + 1;
390 	}
391 
392 	/*
393 	 * Be paranoid about loop termination: first, if the last leaf page
394 	 * skipped was also the last leaf page in the tree, skip may be set to
395 	 * zero on return along with the NULL WT_REF end-of-walk condition.
396 	 * Second, if a tree has no valid pages at all (the condition after
397 	 * initial creation), we might make no progress at all, or finally, if
398 	 * a tree has only deleted pages, we'll make progress, but never get a
399 	 * useful WT_REF. And, of course, the tree can switch from one of these
400 	 * states to another without warning. Decrement skip regardless of what
401 	 * is happening in the search, guarantee we eventually quit.
402 	 *
403 	 * Pages read for data sampling aren't "useful"; don't update the read
404 	 * generation of pages already in memory, and if a page is read, set
405 	 * its generation to a low value so it is evicted quickly.
406 	 */
407 	for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
408 		n = skip;
409 		WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip));
410 		if (n == skip) {
411 			if (skip == 0)
412 				break;
413 			--skip;
414 		}
415 	}
416 
417 	/*
418 	 * We can't return WT_NOTFOUND to the application unless a tree is
419 	 * really empty, fallback to a random entry from the first page in the
420 	 * tree that has anything at all.
421 	 */
422 	if (cbt->ref == NULL)
423 		WT_ERR(__wt_btcur_next(cbt, false));
424 
425 random_page_entry:
426 	/*
427 	 * Select a random entry from the leaf page. If it's not valid, move to
428 	 * the next entry, if that doesn't work, move to the previous entry.
429 	 */
430 	WT_ERR(__wt_row_random_leaf(cbt));
431 	WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
432 	if (valid) {
433 		WT_ERR(__wt_key_return(cbt));
434 		WT_ERR(__wt_value_return(cbt, upd));
435 	} else {
436 		if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
437 			ret = __wt_btcur_prev(cbt, false);
438 		WT_ERR(ret);
439 	}
440 	return (0);
441 
442 err:	WT_TRET(__cursor_reset(cbt));
443 	return (ret);
444 }
445