/*-
 * Copyright (c) 2014-2018 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

11 /*
12 * __col_instantiate --
13 * Update a column-store page entry based on a lookaside table update list.
14 */
15 static int
__col_instantiate(WT_SESSION_IMPL * session,uint64_t recno,WT_REF * ref,WT_CURSOR_BTREE * cbt,WT_UPDATE * updlist)16 __col_instantiate(WT_SESSION_IMPL *session,
17 uint64_t recno, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *updlist)
18 {
19 WT_PAGE *page;
20 WT_UPDATE *upd;
21
22 page = ref->page;
23
24 /*
25 * Discard any of the updates we don't need.
26 *
27 * Just free the memory: it hasn't been accounted for on the page yet.
28 */
29 if (updlist->next != NULL &&
30 (upd = __wt_update_obsolete_check(session, page, updlist)) != NULL)
31 __wt_free_update_list(session, upd);
32
33 /* Search the page and add updates. */
34 WT_RET(__wt_col_search(cbt, recno, ref, true, NULL));
35 WT_RET(__wt_col_modify(
36 cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false));
37 return (0);
38 }
39
40 /*
41 * __row_instantiate --
42 * Update a row-store page entry based on a lookaside table update list.
43 */
44 static int
__row_instantiate(WT_SESSION_IMPL * session,WT_ITEM * key,WT_REF * ref,WT_CURSOR_BTREE * cbt,WT_UPDATE * updlist)45 __row_instantiate(WT_SESSION_IMPL *session,
46 WT_ITEM *key, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *updlist)
47 {
48 WT_PAGE *page;
49 WT_UPDATE *upd;
50
51 page = ref->page;
52
53 /*
54 * Discard any of the updates we don't need.
55 *
56 * Just free the memory: it hasn't been accounted for on the page yet.
57 */
58 if (updlist->next != NULL &&
59 (upd = __wt_update_obsolete_check(session, page, updlist)) != NULL)
60 __wt_free_update_list(session, upd);
61
62 /* Search the page and add updates. */
63 WT_RET(__wt_row_search(cbt, key, true, ref, true, NULL));
64 WT_RET(__wt_row_modify(
65 cbt, key, NULL, updlist, WT_UPDATE_INVALID, false));
66 return (0);
67 }
68
69 /*
70 * __las_page_instantiate_verbose --
71 * Create a verbose message to display at most once per checkpoint when
72 * performing a lookaside table read.
73 */
74 static void
__las_page_instantiate_verbose(WT_SESSION_IMPL * session,uint64_t las_pageid)75 __las_page_instantiate_verbose(WT_SESSION_IMPL *session, uint64_t las_pageid)
76 {
77 WT_CACHE *cache;
78 uint64_t ckpt_gen_current, ckpt_gen_last;
79
80 if (!WT_VERBOSE_ISSET(session,
81 WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY))
82 return;
83
84 cache = S2C(session)->cache;
85 ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT);
86 ckpt_gen_last = cache->las_verb_gen_read;
87
88 /*
89 * This message is throttled to one per checkpoint. To do this we
90 * track the generation of the last checkpoint for which the message
91 * was printed and check against the current checkpoint generation.
92 */
93 if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE) ||
94 ckpt_gen_current > ckpt_gen_last) {
95 /*
96 * Attempt to atomically replace the last checkpoint generation
97 * for which this message was printed. If the atomic swap fails
98 * we have raced and the winning thread will print the message.
99 */
100 if (__wt_atomic_casv64(&cache->las_verb_gen_read,
101 ckpt_gen_last, ckpt_gen_current)) {
102 __wt_verbose(session,
103 WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY,
104 "Read from lookaside file triggered for "
105 "file ID %" PRIu32 ", page ID %" PRIu64,
106 S2BT(session)->id, las_pageid);
107 }
108 }
109 }
110
111 /*
112 * __las_page_instantiate --
113 * Instantiate lookaside update records in a recently read page.
114 */
115 static int
__las_page_instantiate(WT_SESSION_IMPL * session,WT_REF * ref)116 __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
117 {
118 WT_CACHE *cache;
119 WT_CURSOR *cursor;
120 WT_CURSOR_BTREE cbt;
121 WT_DECL_ITEM(current_key);
122 WT_DECL_RET;
123 WT_ITEM las_key, las_timestamp, las_value;
124 WT_PAGE *page;
125 WT_UPDATE *first_upd, *last_upd, *upd;
126 size_t incr, total_incr;
127 uint64_t current_recno, las_counter, las_pageid, las_txnid, recno;
128 uint32_t las_id, session_flags;
129 const uint8_t *p;
130 uint8_t prepare_state, upd_type;
131 bool locked;
132
133 cursor = NULL;
134 page = ref->page;
135 first_upd = last_upd = upd = NULL;
136 locked = false;
137 total_incr = 0;
138 current_recno = recno = WT_RECNO_OOB;
139 las_pageid = ref->page_las->las_pageid;
140 session_flags = 0; /* [-Werror=maybe-uninitialized] */
141 WT_CLEAR(las_key);
142
143 cache = S2C(session)->cache;
144 __las_page_instantiate_verbose(session, las_pageid);
145 WT_STAT_CONN_INCR(session, cache_read_lookaside);
146 WT_STAT_DATA_INCR(session, cache_read_lookaside);
147 if (WT_SESSION_IS_CHECKPOINT(session))
148 WT_STAT_CONN_INCR(session, cache_read_lookaside_checkpoint);
149
150 __wt_btcur_init(session, &cbt);
151 __wt_btcur_open(&cbt);
152
153 WT_ERR(__wt_scr_alloc(session, 0, ¤t_key));
154
155 /* Open a lookaside table cursor. */
156 __wt_las_cursor(session, &cursor, &session_flags);
157
158 /*
159 * The lookaside records are in key and update order, that is, there
160 * will be a set of in-order updates for a key, then another set of
161 * in-order updates for a subsequent key. We process all of the updates
162 * for a key and then insert those updates into the page, then all the
163 * updates for the next key, and so on.
164 */
165 WT_PUBLISH(cache->las_reader, true);
166 __wt_readlock(session, &cache->las_sweepwalk_lock);
167 WT_PUBLISH(cache->las_reader, false);
168 locked = true;
169 for (ret = __wt_las_cursor_position(cursor, las_pageid);
170 ret == 0;
171 ret = cursor->next(cursor)) {
172 WT_ERR(cursor->get_key(cursor,
173 &las_pageid, &las_id, &las_counter, &las_key));
174
175 /*
176 * Confirm the search using the unique prefix; if not a match,
177 * we're done searching for records for this page.
178 */
179 if (las_pageid != ref->page_las->las_pageid)
180 break;
181
182 /* Allocate the WT_UPDATE structure. */
183 WT_ERR(cursor->get_value(
184 cursor, &las_txnid, &las_timestamp,
185 &prepare_state, &upd_type, &las_value));
186 WT_ERR(__wt_update_alloc(
187 session, &las_value, &upd, &incr, upd_type));
188 total_incr += incr;
189 upd->txnid = las_txnid;
190 upd->prepare_state = prepare_state;
191 #ifdef HAVE_TIMESTAMPS
192 WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
193 memcpy(&upd->timestamp, las_timestamp.data, las_timestamp.size);
194 #endif
195
196 switch (page->type) {
197 case WT_PAGE_COL_FIX:
198 case WT_PAGE_COL_VAR:
199 p = las_key.data;
200 WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
201 if (current_recno == recno)
202 break;
203 WT_ASSERT(session, current_recno < recno);
204
205 if (first_upd != NULL) {
206 WT_ERR(__col_instantiate(session,
207 current_recno, ref, &cbt, first_upd));
208 first_upd = NULL;
209 }
210 current_recno = recno;
211 break;
212 case WT_PAGE_ROW_LEAF:
213 if (current_key->size == las_key.size &&
214 memcmp(current_key->data,
215 las_key.data, las_key.size) == 0)
216 break;
217
218 if (first_upd != NULL) {
219 WT_ERR(__row_instantiate(session,
220 current_key, ref, &cbt, first_upd));
221 first_upd = NULL;
222 }
223 WT_ERR(__wt_buf_set(session,
224 current_key, las_key.data, las_key.size));
225 break;
226 WT_ILLEGAL_VALUE_ERR(session, page->type);
227 }
228
229 /* Append the latest update to the list. */
230 if (first_upd == NULL)
231 first_upd = last_upd = upd;
232 else {
233 last_upd->next = upd;
234 last_upd = upd;
235 }
236 upd = NULL;
237 }
238 __wt_readunlock(session, &cache->las_sweepwalk_lock);
239 locked = false;
240 WT_ERR_NOTFOUND_OK(ret);
241
242 /* Insert the last set of updates, if any. */
243 if (first_upd != NULL)
244 switch (page->type) {
245 case WT_PAGE_COL_FIX:
246 case WT_PAGE_COL_VAR:
247 WT_ERR(__col_instantiate(session,
248 current_recno, ref, &cbt, first_upd));
249 first_upd = NULL;
250 break;
251 case WT_PAGE_ROW_LEAF:
252 WT_ERR(__row_instantiate(session,
253 current_key, ref, &cbt, first_upd));
254 first_upd = NULL;
255 break;
256 WT_ILLEGAL_VALUE_ERR(session, page->type);
257 }
258
259 /* Discard the cursor. */
260 WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));
261
262 if (total_incr != 0) {
263 __wt_cache_page_inmem_incr(session, page, total_incr);
264
265 /*
266 * If the updates in lookaside are newer than the versions on
267 * the page, it must be included in the next checkpoint.
268 *
269 * Otherwise, the page image contained the newest versions of
270 * data so the updates are all older and we could consider
271 * marking it clean (i.e., the next checkpoint can use the
272 * version already on disk).
273 *
274 * This needs care because (a) it creates pages with history
275 * that can't be evicted until they are marked dirty again, and
276 * (b) checkpoints may need to visit these pages to resolve
277 * changes evicted while a checkpoint is running.
278 */
279 page->modify->first_dirty_txn = WT_TXN_FIRST;
280
281 FLD_SET(page->modify->restore_state, WT_PAGE_RS_LOOKASIDE);
282
283 if (ref->page_las->skew_newest &&
284 !S2C(session)->txn_global.has_stable_timestamp &&
285 __wt_txn_visible_all(session, ref->page_las->unstable_txn,
286 WT_TIMESTAMP_NULL(&ref->page_las->unstable_timestamp))) {
287 page->modify->rec_max_txn = ref->page_las->max_txn;
288 __wt_timestamp_set(&page->modify->rec_max_timestamp,
289 &ref->page_las->max_timestamp);
290 __wt_page_modify_clear(session, page);
291 }
292 }
293
294 /*
295 * Now the lookaside history has been read into cache there is no
296 * further need to maintain a reference to it.
297 */
298 ref->page_las->eviction_to_lookaside = false;
299 ref->page_las->resolved = true;
300
301 err: if (locked)
302 __wt_readunlock(session, &cache->las_sweepwalk_lock);
303 WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
304 WT_TRET(__wt_btcur_close(&cbt, true));
305
306 /*
307 * On error, upd points to a single unlinked WT_UPDATE structure,
308 * first_upd points to a list.
309 */
310 __wt_free(session, upd);
311 __wt_free_update_list(session, first_upd);
312
313 __wt_scr_free(session, ¤t_key);
314
315 return (ret);
316 }
317
318 /*
319 * __evict_force_check --
320 * Check if a page matches the criteria for forced eviction.
321 */
322 static bool
__evict_force_check(WT_SESSION_IMPL * session,WT_REF * ref)323 __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
324 {
325 WT_BTREE *btree;
326 WT_PAGE *page;
327 size_t footprint;
328
329 btree = S2BT(session);
330 page = ref->page;
331
332 /* Leaf pages only. */
333 if (WT_PAGE_IS_INTERNAL(page))
334 return (false);
335
336 /*
337 * It's hard to imagine a page with a huge memory footprint that has
338 * never been modified, but check to be sure.
339 */
340 if (__wt_page_evict_clean(page))
341 return (false);
342
343 /*
344 * Exclude the disk image size from the footprint checks. Usually the
345 * disk image size is small compared with the in-memory limit (e.g.
346 * 16KB vs 5MB), so this doesn't make a big difference. Where it is
347 * important is for pages with a small number of large values, where
348 * the disk image size takes into account large values that have
349 * already been written and should not trigger forced eviction.
350 */
351 footprint = page->memory_footprint;
352 if (page->dsk != NULL)
353 footprint -= page->dsk->mem_size;
354
355 /* Pages are usually small enough, check that first. */
356 if (footprint < btree->splitmempage)
357 return (false);
358
359 /*
360 * If this session has more than one hazard pointer, eviction will fail
361 * and there is no point trying.
362 */
363 if (__wt_hazard_count(session, ref) > 1)
364 return (false);
365
366 /* If we can do an in-memory split, do it. */
367 if (__wt_leaf_page_can_split(session, page))
368 return (true);
369 if (footprint < btree->maxmempage)
370 return (false);
371
372 /* Bump the oldest ID, we're about to do some visibility checks. */
373 WT_IGNORE_RET(__wt_txn_update_oldest(session, 0));
374
375 /*
376 * Allow some leeway if the transaction ID isn't moving forward since
377 * it is unlikely eviction will be able to evict the page. Don't keep
378 * skipping the page indefinitely or large records can lead to
379 * extremely large memory footprints.
380 */
381 if (!__wt_page_evict_retry(session, page))
382 return (false);
383
384 /* Trigger eviction on the next page release. */
385 __wt_page_evict_soon(session, ref);
386
387 /* If eviction cannot succeed, don't try. */
388 return (__wt_page_can_evict(session, ref, NULL));
389 }
390
391 /*
392 * __page_read_lookaside --
393 * Figure out whether to instantiate content from lookaside on
394 * page access.
395 */
396 static inline int
__page_read_lookaside(WT_SESSION_IMPL * session,WT_REF * ref,uint32_t previous_state,uint32_t * final_statep)397 __page_read_lookaside(WT_SESSION_IMPL *session,
398 WT_REF *ref, uint32_t previous_state, uint32_t *final_statep)
399 {
400 /*
401 * Reading a lookaside ref for the first time, and not requiring the
402 * history triggers a transition to WT_REF_LIMBO, if we are already
403 * in limbo and still don't need the history - we are done.
404 */
405 if (__wt_las_page_skip_locked(session, ref)) {
406 if (previous_state == WT_REF_LOOKASIDE) {
407 WT_STAT_CONN_INCR(
408 session, cache_read_lookaside_skipped);
409 ref->page_las->eviction_to_lookaside = true;
410 *final_statep = WT_REF_LIMBO;
411 }
412 return (0);
413 }
414
415 /* Instantiate updates from the database's lookaside table. */
416 if (previous_state == WT_REF_LIMBO) {
417 WT_STAT_CONN_INCR(session, cache_read_lookaside_delay);
418 if (WT_SESSION_IS_CHECKPOINT(session))
419 WT_STAT_CONN_INCR(session,
420 cache_read_lookaside_delay_checkpoint);
421 }
422
423 WT_RET(__las_page_instantiate(session, ref));
424 return (0);
425 }
426
427 /*
428 * __page_read --
429 * Read a page from the file.
430 */
431 static int
__page_read(WT_SESSION_IMPL * session,WT_REF * ref,uint32_t flags)432 __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
433 {
434 WT_DECL_RET;
435 WT_ITEM tmp;
436 WT_PAGE *notused;
437 size_t addr_size;
438 uint64_t time_start, time_stop;
439 uint32_t page_flags, final_state, new_state, previous_state;
440 const uint8_t *addr;
441 bool timer;
442
443 time_start = time_stop = 0;
444
445 /*
446 * Don't pass an allocated buffer to the underlying block read function,
447 * force allocation of new memory of the appropriate size.
448 */
449 WT_CLEAR(tmp);
450
451 /*
452 * Attempt to set the state to WT_REF_READING for normal reads, or
453 * WT_REF_LOCKED, for deleted pages or pages with lookaside entries.
454 * The difference is that checkpoints can skip over clean pages that
455 * are being read into cache, but need to wait for deletes or lookaside
456 * updates to be resolved (in order for checkpoint to write the correct
457 * version of the page).
458 *
459 * If successful, we've won the race, read the page.
460 */
461 switch (previous_state = ref->state) {
462 case WT_REF_DISK:
463 new_state = WT_REF_READING;
464 break;
465 case WT_REF_DELETED:
466 case WT_REF_LIMBO:
467 case WT_REF_LOOKASIDE:
468 new_state = WT_REF_LOCKED;
469 break;
470 default:
471 return (0);
472 }
473 if (!__wt_atomic_casv32(&ref->state, previous_state, new_state))
474 return (0);
475
476 final_state = WT_REF_MEM;
477
478 /* If we already have the page image, just instantiate the history. */
479 if (previous_state == WT_REF_LIMBO)
480 goto skip_read;
481
482 /*
483 * Get the address: if there is no address, the page was deleted or had
484 * only lookaside entries, and a subsequent search or insert is forcing
485 * re-creation of the name space.
486 */
487 __wt_ref_info(ref, &addr, &addr_size, NULL);
488 if (addr == NULL) {
489 WT_ASSERT(session, previous_state != WT_REF_DISK);
490
491 WT_ERR(__wt_btree_new_leaf_page(session, &ref->page));
492 goto skip_read;
493 }
494
495 /*
496 * There's an address, read or map the backing disk page and build an
497 * in-memory version of the page.
498 */
499 timer = !F_ISSET(session, WT_SESSION_INTERNAL);
500 if (timer)
501 time_start = __wt_clock(session);
502 WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
503 if (timer) {
504 time_stop = __wt_clock(session);
505 WT_STAT_CONN_INCR(session, cache_read_app_count);
506 WT_STAT_CONN_INCRV(session, cache_read_app_time,
507 WT_CLOCKDIFF_US(time_stop, time_start));
508 }
509
510 /*
511 * Build the in-memory version of the page. Clear our local reference to
512 * the allocated copy of the disk image on return, the in-memory object
513 * steals it.
514 *
515 * If a page is read with eviction disabled, we don't count evicting it
516 * as progress. Since disabling eviction allows pages to be read even
517 * when the cache is full, we want to avoid workloads repeatedly reading
518 * a page with eviction disabled (e.g., a metadata page), then evicting
519 * that page and deciding that is a sign that eviction is unstuck.
520 */
521 page_flags =
522 WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED;
523 if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE))
524 FLD_SET(page_flags, WT_PAGE_EVICT_NO_PROGRESS);
525 WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, ¬used));
526 tmp.mem = NULL;
527
528 /*
529 * The WT_REF lookaside state should match the page-header state of
530 * any page we read.
531 */
532 WT_ASSERT(session,
533 (previous_state != WT_REF_LIMBO &&
534 previous_state != WT_REF_LOOKASIDE) ||
535 ref->page->dsk == NULL ||
536 F_ISSET(ref->page->dsk, WT_PAGE_LAS_UPDATE));
537
538 skip_read:
539 switch (previous_state) {
540 case WT_REF_DELETED:
541 /*
542 * A truncated page may also have lookaside information. The
543 * delete happened after page eviction (writing the lookaside
544 * information), first update based on the lookaside table and
545 * then apply the delete.
546 */
547 if (ref->page_las != NULL)
548 WT_ERR(__las_page_instantiate(session, ref));
549
550 /* Move all records to a deleted state. */
551 WT_ERR(__wt_delete_page_instantiate(session, ref));
552 break;
553 case WT_REF_LIMBO:
554 case WT_REF_LOOKASIDE:
555 WT_ERR(__page_read_lookaside(
556 session, ref, previous_state, &final_state));
557 break;
558 }
559
560 /*
561 * Once the page is instantiated, we no longer need the history in
562 * lookaside. We leave the lookaside sweep thread to do most cleanup,
563 * but it can only remove keys that skew newest (if there are entries
564 * in the lookaside newer than the page, they need to be read back into
565 * cache or they will be lost).
566 *
567 * There is no reason for the lookaside remove should fail, but ignore
568 * it if for some reason it fails, we've got a valid page.
569 *
570 * Don't free WT_REF.page_las, there may be concurrent readers.
571 */
572 if (final_state == WT_REF_MEM &&
573 ref->page_las != NULL && !ref->page_las->skew_newest)
574 WT_IGNORE_RET(__wt_las_remove_block(
575 session, ref->page_las->las_pageid, false));
576
577 WT_PUBLISH(ref->state, final_state);
578 return (ret);
579
580 err: /*
581 * If the function building an in-memory version of the page failed,
582 * it discarded the page, but not the disk image. Discard the page
583 * and separately discard the disk image in all cases.
584 */
585 if (ref->page != NULL && previous_state != WT_REF_LIMBO)
586 __wt_ref_out(session, ref);
587 WT_PUBLISH(ref->state, previous_state);
588
589 __wt_buf_free(session, &tmp);
590
591 return (ret);
592 }
593
594 /*
595 * __wt_page_in_func --
596 * Acquire a hazard pointer to a page; if the page is not in-memory,
597 * read it from the disk and build an in-memory version.
598 */
599 int
__wt_page_in_func(WT_SESSION_IMPL * session,WT_REF * ref,uint32_t flags,const char * func,int line)600 __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
601 #ifdef HAVE_DIAGNOSTIC
602 , const char *func, int line
603 #endif
604 )
605 {
606 WT_BTREE *btree;
607 WT_DECL_RET;
608 WT_PAGE *page;
609 uint64_t sleep_usecs, yield_cnt;
610 uint32_t current_state;
611 int force_attempts;
612 bool busy, cache_work, did_read, stalled, wont_need;
613
614 btree = S2BT(session);
615
616 if (F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE))
617 LF_SET(WT_READ_IGNORE_CACHE_SIZE);
618
619 /* Sanity check flag combinations. */
620 WT_ASSERT(session, !LF_ISSET(
621 WT_READ_DELETED_SKIP | WT_READ_NO_WAIT | WT_READ_LOOKASIDE) ||
622 LF_ISSET(WT_READ_CACHE));
623 WT_ASSERT(session, !LF_ISSET(WT_READ_DELETED_CHECK) ||
624 !LF_ISSET(WT_READ_DELETED_SKIP));
625
626 /*
627 * Ignore reads of pages already known to be in cache, otherwise the
628 * eviction server can dominate these statistics.
629 */
630 if (!LF_ISSET(WT_READ_CACHE)) {
631 WT_STAT_CONN_INCR(session, cache_pages_requested);
632 WT_STAT_DATA_INCR(session, cache_pages_requested);
633 }
634
635 for (did_read = wont_need = stalled = false,
636 force_attempts = 0, sleep_usecs = yield_cnt = 0;;) {
637 switch (current_state = ref->state) {
638 case WT_REF_DELETED:
639 if (LF_ISSET(WT_READ_DELETED_SKIP | WT_READ_NO_WAIT))
640 return (WT_NOTFOUND);
641 if (LF_ISSET(WT_READ_DELETED_CHECK) &&
642 __wt_delete_page_skip(session, ref, false))
643 return (WT_NOTFOUND);
644 goto read;
645 case WT_REF_LOOKASIDE:
646 if (LF_ISSET(WT_READ_CACHE)) {
647 if (!LF_ISSET(WT_READ_LOOKASIDE))
648 return (WT_NOTFOUND);
649 /*
650 * If we skip a lookaside page, the tree
651 * cannot be left clean: lookaside entries
652 * must be resolved before the tree can be
653 * discarded.
654 */
655 if (__wt_las_page_skip(session, ref)) {
656 __wt_tree_modify_set(session);
657 return (WT_NOTFOUND);
658 }
659 }
660 goto read;
661 case WT_REF_DISK:
662 if (LF_ISSET(WT_READ_CACHE))
663 return (WT_NOTFOUND);
664
665 read: /*
666 * The page isn't in memory, read it. If this thread
667 * respects the cache size, check for space in the
668 * cache.
669 */
670 if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE))
671 WT_RET(__wt_cache_eviction_check(
672 session, true,
673 !F_ISSET(&session->txn, WT_TXN_HAS_ID),
674 NULL));
675 WT_RET(__page_read(session, ref, flags));
676
677 /*
678 * We just read a page, don't evict it before we have a
679 * chance to use it.
680 */
681 did_read = true;
682
683 /*
684 * If configured to not trash the cache, leave the page
685 * generation unset, we'll set it before returning to
686 * the oldest read generation, so the page is forcibly
687 * evicted as soon as possible. We don't do that set
688 * here because we don't want to evict the page before
689 * we "acquire" it.
690 */
691 wont_need = LF_ISSET(WT_READ_WONT_NEED) ||
692 F_ISSET(session, WT_SESSION_READ_WONT_NEED) ||
693 F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_NOKEEP);
694 continue;
695 case WT_REF_READING:
696 if (LF_ISSET(WT_READ_CACHE))
697 return (WT_NOTFOUND);
698 if (LF_ISSET(WT_READ_NO_WAIT))
699 return (WT_NOTFOUND);
700
701 /* Waiting on another thread's read, stall. */
702 WT_STAT_CONN_INCR(session, page_read_blocked);
703 stalled = true;
704 break;
705 case WT_REF_LOCKED:
706 if (LF_ISSET(WT_READ_NO_WAIT))
707 return (WT_NOTFOUND);
708
709 /* Waiting on eviction, stall. */
710 WT_STAT_CONN_INCR(session, page_locked_blocked);
711 stalled = true;
712 break;
713 case WT_REF_SPLIT:
714 return (WT_RESTART);
715 case WT_REF_LIMBO:
716 case WT_REF_MEM:
717 /*
718 * The page is in memory.
719 *
720 * Get a hazard pointer if one is required. We cannot
721 * be evicting if no hazard pointer is required, we're
722 * done.
723 */
724 if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
725 goto skip_evict;
726
727 /*
728 * The expected reason we can't get a hazard pointer is
729 * because the page is being evicted, yield, try again.
730 */
731 #ifdef HAVE_DIAGNOSTIC
732 WT_RET(
733 __wt_hazard_set(session, ref, &busy, func, line));
734 #else
735 WT_RET(__wt_hazard_set(session, ref, &busy));
736 #endif
737 if (busy) {
738 WT_STAT_CONN_INCR(session, page_busy_blocked);
739 break;
740 }
741 /*
742 * If we are a limbo page check whether we need to
743 * instantiate the history. By having a hazard pointer
744 * we can use the locked version.
745 */
746 if (current_state == WT_REF_LIMBO &&
747 ((!LF_ISSET(WT_READ_CACHE) ||
748 LF_ISSET(WT_READ_LOOKASIDE)) &&
749 !__wt_las_page_skip_locked(session, ref))) {
750 WT_RET(__wt_hazard_clear(session, ref));
751 goto read;
752 }
753 if (current_state == WT_REF_LIMBO &&
754 LF_ISSET(WT_READ_CACHE) &&
755 LF_ISSET(WT_READ_LOOKASIDE))
756 __wt_tree_modify_set(session);
757
758 /*
759 * Check if the page requires forced eviction.
760 */
761 if (did_read || LF_ISSET(WT_READ_NO_SPLIT) ||
762 btree->evict_disabled > 0 || btree->lsm_primary)
763 goto skip_evict;
764
765 /*
766 * If reconciliation is disabled (e.g., when inserting
767 * into the lookaside table), skip forced eviction if
768 * the page can't split.
769 */
770 if (F_ISSET(session, WT_SESSION_NO_RECONCILE) &&
771 !__wt_leaf_page_can_split(session, ref->page))
772 goto skip_evict;
773
774 /*
775 * Forcibly evict pages that are too big.
776 */
777 if (force_attempts < 10 &&
778 __evict_force_check(session, ref)) {
779 ++force_attempts;
780 ret = __wt_page_release_evict(session, ref);
781 /* If forced eviction fails, stall. */
782 if (ret == EBUSY) {
783 WT_NOT_READ(ret, 0);
784 WT_STAT_CONN_INCR(session,
785 page_forcible_evict_blocked);
786 stalled = true;
787 break;
788 }
789 WT_RET(ret);
790
791 /*
792 * The result of a successful forced eviction
793 * is a page-state transition (potentially to
794 * an in-memory page we can use, or a restart
795 * return for our caller), continue the outer
796 * page-acquisition loop.
797 */
798 continue;
799 }
800
801 skip_evict: /*
802 * If we read the page and are configured to not trash
803 * the cache, and no other thread has already used the
804 * page, set the read generation so the page is evicted
805 * soon.
806 *
807 * Otherwise, if we read the page, or, if configured to
808 * update the page's read generation and the page isn't
809 * already flagged for forced eviction, update the page
810 * read generation.
811 */
812 page = ref->page;
813 if (page->read_gen == WT_READGEN_NOTSET) {
814 if (wont_need)
815 page->read_gen = WT_READGEN_WONT_NEED;
816 else
817 __wt_cache_read_gen_new(session, page);
818 } else if (!LF_ISSET(WT_READ_NO_GEN))
819 __wt_cache_read_gen_bump(session, page);
820
821 /*
822 * Check if we need an autocommit transaction.
823 * Starting a transaction can trigger eviction, so skip
824 * it if eviction isn't permitted.
825 *
826 * The logic here is a little weird: some code paths do
827 * a blanket ban on checking the cache size in
828 * sessions, but still require a transaction (e.g.,
829 * when updating metadata or lookaside). If
830 * WT_READ_IGNORE_CACHE_SIZE was passed in explicitly,
831 * we're done. If we set WT_READ_IGNORE_CACHE_SIZE
832 * because it was set in the session then make sure we
833 * start a transaction.
834 */
835 return (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE) &&
836 !F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE) ?
837 0 : __wt_txn_autocommit_check(session));
838 WT_ILLEGAL_VALUE(session, current_state);
839 }
840
841 /*
842 * We failed to get the page -- yield before retrying, and if
843 * we've yielded enough times, start sleeping so we don't burn
844 * CPU to no purpose.
845 */
846 if (yield_cnt < WT_THOUSAND) {
847 if (!stalled) {
848 ++yield_cnt;
849 __wt_yield();
850 continue;
851 }
852 yield_cnt = WT_THOUSAND;
853 }
854
855 /*
856 * If stalling and this thread is allowed to do eviction work,
857 * check if the cache needs help. If we do work for the cache,
858 * substitute that for a sleep.
859 */
860 if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) {
861 WT_RET(__wt_cache_eviction_check(
862 session, true,
863 !F_ISSET(&session->txn, WT_TXN_HAS_ID),
864 &cache_work));
865 if (cache_work)
866 continue;
867 }
868 __wt_spin_backoff(&yield_cnt, &sleep_usecs);
869 WT_STAT_CONN_INCRV(session, page_sleep, sleep_usecs);
870 }
871 }
872