1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 /*
12 * __wt_row_random_leaf --
13 * Return a random key from a row-store leaf page.
14 */
15 int
__wt_row_random_leaf(WT_CURSOR_BTREE * cbt)16 __wt_row_random_leaf(WT_CURSOR_BTREE *cbt)
17 {
18 WT_INSERT *ins, **start, **stop;
19 WT_INSERT_HEAD *ins_head;
20 WT_PAGE *page;
21 WT_SESSION_IMPL *session;
22 uint64_t samples;
23 uint32_t choice, entries, i;
24 int level;
25
26 page = cbt->ref->page;
27 session = (WT_SESSION_IMPL *)cbt->iface.session;
28 start = stop = NULL; /* [-Wconditional-uninitialized] */
29 entries = 0; /* [-Wconditional-uninitialized] */
30
31 __cursor_pos_clear(cbt);
32
33 /* If the page has disk-based entries, select from them. */
34 if (page->entries != 0) {
35 cbt->compare = 0;
36 cbt->slot = __wt_random(&session->rnd) % page->entries;
37
38 /*
39 * The real row-store search function builds the key, so we
40 * have to as well.
41 */
42 return (__wt_row_leaf_key(session,
43 page, page->pg_row + cbt->slot, cbt->tmp, false));
44 }
45
46 /*
47 * If the tree is new (and not empty), it might have a large insert
48 * list.
49 *
50 * Walk down the list until we find a level with at least 50 entries,
51 * that's where we'll start rolling random numbers. The value 50 is
52 * used to ignore levels with only a few entries, that is, levels which
53 * are potentially badly skewed.
54 */
55 F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
56 if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
57 return (WT_NOTFOUND);
58 for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
59 start = &ins_head->head[level];
60 for (entries = 0, stop = start;
61 *stop != NULL; stop = &(*stop)->next[level])
62 ++entries;
63
64 if (entries > 50)
65 break;
66 }
67
68 /*
69 * If it's a tiny list and we went all the way to level 0, correct the
70 * level; entries is correctly set.
71 */
72 if (level < 0)
73 level = 0;
74
75 /*
76 * Step down the skip list levels, selecting a random chunk of the name
77 * space at each level.
78 */
79 for (samples = entries; level > 0; samples += entries) {
80 /*
81 * There are (entries) or (entries + 1) chunks of the name space
82 * considered at each level. They are: between start and the 1st
83 * element, between the 1st and 2nd elements, and so on to the
84 * last chunk which is the name space after the stop element on
85 * the current level. This last chunk of name space may or may
86 * not be there: as we descend the levels of the skip list, this
87 * chunk may appear, depending if the next level down has
88 * entries logically after the stop point in the current level.
89 * We can't ignore those entries: because of the algorithm used
90 * to determine the depth of a skiplist, there may be a large
91 * number of entries "revealed" by descending a level.
92 *
93 * If the next level down has more items after the current stop
94 * point, there are (entries + 1) chunks to consider, else there
95 * are (entries) chunks.
96 */
97 if (*(stop - 1) == NULL)
98 choice = __wt_random(&session->rnd) % entries;
99 else
100 choice = __wt_random(&session->rnd) % (entries + 1);
101
102 if (choice == entries) {
103 /*
104 * We selected the name space after the stop element on
105 * this level. Set the start point to the current stop
106 * point, descend a level and move the stop element to
107 * the end of the list, that is, the end of the newly
108 * discovered name space, counting entries as we go.
109 */
110 start = stop;
111 --start;
112 --level;
113 for (entries = 0, stop = start;
114 *stop != NULL; stop = &(*stop)->next[level])
115 ++entries;
116 } else {
117 /*
118 * We selected another name space on the level. Move the
119 * start pointer the selected number of entries forward
120 * to the start of the selected chunk (if the selected
121 * number is 0, start won't move). Set the stop pointer
122 * to the next element in the list and drop both start
123 * and stop down a level.
124 */
125 for (i = 0; i < choice; ++i)
126 start = &(*start)->next[level];
127 stop = &(*start)->next[level];
128
129 --start;
130 --stop;
131 --level;
132
133 /* Count the entries in the selected name space. */
134 for (entries = 0,
135 ins = *start; ins != *stop; ins = ins->next[level])
136 ++entries;
137 }
138 }
139
140 /*
141 * When we reach the bottom level, entries will already be set. Select
142 * a random entry from the name space and return it.
143 *
144 * It should be impossible for the entries count to be 0 at this point,
145 * but check for it out of paranoia and to quiet static testing tools.
146 */
147 if (entries > 0)
148 entries = __wt_random(&session->rnd) % entries;
149 for (ins = *start; entries > 0; --entries)
150 ins = ins->next[0];
151
152 cbt->ins = ins;
153 cbt->ins_head = ins_head;
154 cbt->compare = 0;
155
156 /*
157 * Random lookups in newly created collections can be slow if a page
158 * consists of a large skiplist. Schedule the page for eviction if we
159 * encounter a large skiplist. This worthwhile because applications
160 * that take a sample often take many samples, so the overhead of
161 * traversing the skip list each time accumulates to real time.
162 */
163 if (samples > 5000)
164 __wt_page_evict_soon(session, cbt->ref);
165
166 return (0);
167 }
168
169 /*
170 * __wt_random_descent --
171 * Find a random page in a tree for either sampling or eviction.
172 */
173 int
__wt_random_descent(WT_SESSION_IMPL * session,WT_REF ** refp,uint32_t flags)174 __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
175 {
176 WT_BTREE *btree;
177 WT_DECL_RET;
178 WT_PAGE *page;
179 WT_PAGE_INDEX *pindex;
180 WT_REF *current, *descent;
181 uint32_t i, entries, retry;
182 bool eviction;
183
184 *refp = NULL;
185
186 btree = S2BT(session);
187 current = NULL;
188 retry = 100;
189 /*
190 * This function is called by eviction to find a random page in the
191 * cache. That case is indicated by the WT_READ_CACHE flag. Ordinary
192 * lookups in a tree will read pages into cache as needed.
193 */
194 eviction = LF_ISSET(WT_READ_CACHE);
195
196 if (0) {
197 restart: /*
198 * Discard the currently held page and restart the search from
199 * the root.
200 */
201 WT_RET(__wt_page_release(session, current, flags));
202 }
203
204 /* Search the internal pages of the tree. */
205 current = &btree->root;
206 for (;;) {
207 page = current->page;
208 if (!WT_PAGE_IS_INTERNAL(page))
209 break;
210
211 WT_INTL_INDEX_GET(session, page, pindex);
212 entries = pindex->entries;
213
214 /* Eviction just wants any random child. */
215 if (eviction) {
216 descent = pindex->index[
217 __wt_random(&session->rnd) % entries];
218 goto descend;
219 }
220
221 /*
222 * There may be empty pages in the tree, and they're useless to
223 * us. If we don't find a non-empty page in "entries" random
224 * guesses, take the first non-empty page in the tree. If the
225 * search page contains nothing other than empty pages, restart
226 * from the root some number of times before giving up.
227 *
228 * Random sampling is looking for a key/value pair on a random
229 * leaf page, and so will accept any page that contains a valid
230 * key/value pair, so on-disk is fine, but deleted is not.
231 */
232 descent = NULL;
233 for (i = 0; i < entries; ++i) {
234 descent =
235 pindex->index[__wt_random(&session->rnd) % entries];
236 if (descent->state == WT_REF_DISK ||
237 descent->state == WT_REF_LIMBO ||
238 descent->state == WT_REF_LOOKASIDE ||
239 descent->state == WT_REF_MEM)
240 break;
241 }
242 if (i == entries)
243 for (i = 0; i < entries; ++i) {
244 descent = pindex->index[i];
245 if (descent->state == WT_REF_DISK ||
246 descent->state == WT_REF_LIMBO ||
247 descent->state == WT_REF_LOOKASIDE ||
248 descent->state == WT_REF_MEM)
249 break;
250 }
251 if (i == entries || descent == NULL) {
252 if (--retry > 0)
253 goto restart;
254
255 WT_RET(__wt_page_release(session, current, flags));
256 return (WT_NOTFOUND);
257 }
258
259 /*
260 * Swap the current page for the child page. If the page splits
261 * while we're retrieving it, restart the search at the root.
262 *
263 * On other error, simply return, the swap call ensures we're
264 * holding nothing on failure.
265 */
266 descend: if ((ret = __wt_page_swap(
267 session, current, descent, flags)) == 0) {
268 current = descent;
269 continue;
270 }
271 if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART))
272 break;
273 if (ret == WT_RESTART)
274 goto restart;
275 return (ret);
276 }
277
278 /*
279 * There is no point starting with the root page: the walk will exit
280 * immediately. In that case we aren't holding a hazard pointer so
281 * there is nothing to release.
282 */
283 if (!eviction || !__wt_ref_is_root(current))
284 *refp = current;
285 return (0);
286 }
287
288 /*
289 * __wt_btcur_next_random --
290 * Move to a random record in the tree. There are two algorithms, one
291 * where we select a record at random from the whole tree on each
292 * retrieval and one where we first select a record at random from the
293 * whole tree, and then subsequently sample forward from that location.
294 * The sampling approach allows us to select reasonably uniform random
295 * points from unbalanced trees.
296 */
297 int
__wt_btcur_next_random(WT_CURSOR_BTREE * cbt)298 __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
299 {
300 WT_BTREE *btree;
301 WT_CURSOR *cursor;
302 WT_DECL_RET;
303 WT_SESSION_IMPL *session;
304 WT_UPDATE *upd;
305 wt_off_t size;
306 uint64_t n, skip;
307 uint32_t read_flags;
308 bool valid;
309
310 btree = cbt->btree;
311 cursor = &cbt->iface;
312 session = (WT_SESSION_IMPL *)cbt->iface.session;
313 read_flags = WT_READ_RESTART_OK;
314 if (F_ISSET(cbt, WT_CBT_READ_ONCE))
315 FLD_SET(read_flags, WT_READ_WONT_NEED);
316
317 /*
318 * Only supports row-store: applications can trivially select a random
319 * value from a column-store, if there were any reason to do so.
320 */
321 if (btree->type != BTREE_ROW)
322 WT_RET_MSG(session, ENOTSUP,
323 "WT_CURSOR.next_random only supported by row-store tables");
324
325 WT_STAT_CONN_INCR(session, cursor_next);
326 WT_STAT_DATA_INCR(session, cursor_next);
327
328 F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
329
330 #ifdef HAVE_DIAGNOSTIC
331 /*
332 * Under some conditions we end up using the underlying cursor.next to
333 * walk through the object. Since there are multiple calls, we can hit
334 * the cursor-order checks, turn them off.
335 */
336 __wt_cursor_key_order_reset(cbt);
337 #endif
338 /*
339 * If we don't have a current position in the tree, or if retrieving
340 * random values without sampling, pick a roughly random leaf page in
341 * the tree and return an entry from it.
342 */
343 if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
344 WT_ERR(__cursor_func_init(cbt, true));
345 WT_WITH_PAGE_INDEX(session,
346 ret = __wt_random_descent(session, &cbt->ref, read_flags));
347 if (ret == 0)
348 goto random_page_entry;
349
350 /*
351 * Random descent may return not-found: the tree might be empty
352 * or have so many deleted items we didn't find any valid pages.
353 * We can't return WT_NOTFOUND to the application unless a tree
354 * is really empty, fallback to skipping through tree pages.
355 */
356 WT_ERR_NOTFOUND_OK(ret);
357 }
358
359 /*
360 * Cursor through the tree, skipping past the sample size of the leaf
361 * pages in the tree between each random key return to compensate for
362 * unbalanced trees.
363 *
364 * If the random descent attempt failed, we don't have a configured
365 * sample size, use 100 for no particular reason.
366 */
367 if (cbt->next_random_sample_size == 0)
368 cbt->next_random_sample_size = 100;
369
370 /*
371 * If the random descent attempt failed, or it's our first skip attempt,
372 * we haven't yet set the pages to skip, do it now.
373 *
374 * Use the underlying file size divided by its block allocation size as
375 * our guess of leaf pages in the file (this can be entirely wrong, as
376 * it depends on how many pages are in this particular checkpoint, how
377 * large the leaf and internal pages really are, and other factors).
378 * Then, divide that value by the configured sample size and increment
379 * the final result to make sure tiny files don't leave us with a skip
380 * value of 0.
381 *
382 * !!!
383 * Ideally, the number would be prime to avoid restart issues.
384 */
385 if (cbt->next_random_leaf_skip == 0) {
386 WT_ERR(btree->bm->size(btree->bm, session, &size));
387 cbt->next_random_leaf_skip = (uint64_t)
388 ((size / btree->allocsize) /
389 cbt->next_random_sample_size) + 1;
390 }
391
392 /*
393 * Be paranoid about loop termination: first, if the last leaf page
394 * skipped was also the last leaf page in the tree, skip may be set to
395 * zero on return along with the NULL WT_REF end-of-walk condition.
396 * Second, if a tree has no valid pages at all (the condition after
397 * initial creation), we might make no progress at all, or finally, if
398 * a tree has only deleted pages, we'll make progress, but never get a
399 * useful WT_REF. And, of course, the tree can switch from one of these
400 * states to another without warning. Decrement skip regardless of what
401 * is happening in the search, guarantee we eventually quit.
402 *
403 * Pages read for data sampling aren't "useful"; don't update the read
404 * generation of pages already in memory, and if a page is read, set
405 * its generation to a low value so it is evicted quickly.
406 */
407 for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
408 n = skip;
409 WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip));
410 if (n == skip) {
411 if (skip == 0)
412 break;
413 --skip;
414 }
415 }
416
417 /*
418 * We can't return WT_NOTFOUND to the application unless a tree is
419 * really empty, fallback to a random entry from the first page in the
420 * tree that has anything at all.
421 */
422 if (cbt->ref == NULL)
423 WT_ERR(__wt_btcur_next(cbt, false));
424
425 random_page_entry:
426 /*
427 * Select a random entry from the leaf page. If it's not valid, move to
428 * the next entry, if that doesn't work, move to the previous entry.
429 */
430 WT_ERR(__wt_row_random_leaf(cbt));
431 WT_ERR(__wt_cursor_valid(cbt, &upd, &valid));
432 if (valid) {
433 WT_ERR(__wt_key_return(cbt));
434 WT_ERR(__wt_value_return(cbt, upd));
435 } else {
436 if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
437 ret = __wt_btcur_prev(cbt, false);
438 WT_ERR(ret);
439 }
440 return (0);
441
442 err: WT_TRET(__cursor_reset(cbt));
443 return (ret);
444 }
445