1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
11 /*
12  * __col_instantiate --
13  *	Update a column-store page entry based on a lookaside table update list.
14  */
15 static int
__col_instantiate(WT_SESSION_IMPL * session,uint64_t recno,WT_REF * ref,WT_CURSOR_BTREE * cbt,WT_UPDATE * updlist)16 __col_instantiate(WT_SESSION_IMPL *session,
17     uint64_t recno, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *updlist)
18 {
19 	WT_PAGE *page;
20 	WT_UPDATE *upd;
21 
22 	page = ref->page;
23 
24 	/*
25 	 * Discard any of the updates we don't need.
26 	 *
27 	 * Just free the memory: it hasn't been accounted for on the page yet.
28 	 */
29 	if (updlist->next != NULL &&
30 	    (upd = __wt_update_obsolete_check(session, page, updlist)) != NULL)
31 		__wt_free_update_list(session, upd);
32 
33 	/* Search the page and add updates. */
34 	WT_RET(__wt_col_search(cbt, recno, ref, true, NULL));
35 	WT_RET(__wt_col_modify(
36 	    cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false));
37 	return (0);
38 }
39 
40 /*
41  * __row_instantiate --
42  *	Update a row-store page entry based on a lookaside table update list.
43  */
44 static int
__row_instantiate(WT_SESSION_IMPL * session,WT_ITEM * key,WT_REF * ref,WT_CURSOR_BTREE * cbt,WT_UPDATE * updlist)45 __row_instantiate(WT_SESSION_IMPL *session,
46     WT_ITEM *key, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *updlist)
47 {
48 	WT_PAGE *page;
49 	WT_UPDATE *upd;
50 
51 	page = ref->page;
52 
53 	/*
54 	 * Discard any of the updates we don't need.
55 	 *
56 	 * Just free the memory: it hasn't been accounted for on the page yet.
57 	 */
58 	if (updlist->next != NULL &&
59 	    (upd = __wt_update_obsolete_check(session, page, updlist)) != NULL)
60 		__wt_free_update_list(session, upd);
61 
62 	/* Search the page and add updates. */
63 	WT_RET(__wt_row_search(cbt, key, true, ref, true, NULL));
64 	WT_RET(__wt_row_modify(
65 	    cbt, key, NULL, updlist, WT_UPDATE_INVALID, false));
66 	return (0);
67 }
68 
69 /*
70  * __las_page_instantiate_verbose --
71  *	Create a verbose message to display at most once per checkpoint when
72  *	performing a lookaside table read.
73  */
74 static void
__las_page_instantiate_verbose(WT_SESSION_IMPL * session,uint64_t las_pageid)75 __las_page_instantiate_verbose(WT_SESSION_IMPL *session, uint64_t las_pageid)
76 {
77 	WT_CACHE *cache;
78 	uint64_t ckpt_gen_current, ckpt_gen_last;
79 
80 	if (!WT_VERBOSE_ISSET(session,
81 	    WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY))
82 		return;
83 
84 	cache = S2C(session)->cache;
85 	ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT);
86 	ckpt_gen_last = cache->las_verb_gen_read;
87 
88 	/*
89 	 * This message is throttled to one per checkpoint. To do this we
90 	 * track the generation of the last checkpoint for which the message
91 	 * was printed and check against the current checkpoint generation.
92 	 */
93 	if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE) ||
94 	    ckpt_gen_current > ckpt_gen_last) {
95 		/*
96 		 * Attempt to atomically replace the last checkpoint generation
97 		 * for which this message was printed. If the atomic swap fails
98 		 * we have raced and the winning thread will print the message.
99 		 */
100 		if (__wt_atomic_casv64(&cache->las_verb_gen_read,
101 			ckpt_gen_last, ckpt_gen_current)) {
102 			__wt_verbose(session,
103 			    WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY,
104 			    "Read from lookaside file triggered for "
105 			    "file ID %" PRIu32 ", page ID %" PRIu64,
106 			    S2BT(session)->id, las_pageid);
107 		}
108 	}
109 }
110 
111 /*
112  * __las_page_instantiate --
113  *	Instantiate lookaside update records in a recently read page.
114  */
115 static int
__las_page_instantiate(WT_SESSION_IMPL * session,WT_REF * ref)116 __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
117 {
118 	WT_CACHE *cache;
119 	WT_CURSOR *cursor;
120 	WT_CURSOR_BTREE cbt;
121 	WT_DECL_ITEM(current_key);
122 	WT_DECL_RET;
123 	WT_ITEM las_key, las_timestamp, las_value;
124 	WT_PAGE *page;
125 	WT_UPDATE *first_upd, *last_upd, *upd;
126 	size_t incr, total_incr;
127 	uint64_t current_recno, las_counter, las_pageid, las_txnid, recno;
128 	uint32_t las_id, session_flags;
129 	const uint8_t *p;
130 	uint8_t prepare_state, upd_type;
131 	bool locked;
132 
133 	cursor = NULL;
134 	page = ref->page;
135 	first_upd = last_upd = upd = NULL;
136 	locked = false;
137 	total_incr = 0;
138 	current_recno = recno = WT_RECNO_OOB;
139 	las_pageid = ref->page_las->las_pageid;
140 	session_flags = 0;		/* [-Werror=maybe-uninitialized] */
141 	WT_CLEAR(las_key);
142 
143 	cache = S2C(session)->cache;
144 	__las_page_instantiate_verbose(session, las_pageid);
145 	WT_STAT_CONN_INCR(session, cache_read_lookaside);
146 	WT_STAT_DATA_INCR(session, cache_read_lookaside);
147 	if (WT_SESSION_IS_CHECKPOINT(session))
148 		WT_STAT_CONN_INCR(session, cache_read_lookaside_checkpoint);
149 
150 	__wt_btcur_init(session, &cbt);
151 	__wt_btcur_open(&cbt);
152 
153 	WT_ERR(__wt_scr_alloc(session, 0, &current_key));
154 
155 	/* Open a lookaside table cursor. */
156 	__wt_las_cursor(session, &cursor, &session_flags);
157 
158 	/*
159 	 * The lookaside records are in key and update order, that is, there
160 	 * will be a set of in-order updates for a key, then another set of
161 	 * in-order updates for a subsequent key. We process all of the updates
162 	 * for a key and then insert those updates into the page, then all the
163 	 * updates for the next key, and so on.
164 	 */
165 	WT_PUBLISH(cache->las_reader, true);
166 	__wt_readlock(session, &cache->las_sweepwalk_lock);
167 	WT_PUBLISH(cache->las_reader, false);
168 	locked = true;
169 	for (ret = __wt_las_cursor_position(cursor, las_pageid);
170 	    ret == 0;
171 	    ret = cursor->next(cursor)) {
172 		WT_ERR(cursor->get_key(cursor,
173 		    &las_pageid, &las_id, &las_counter, &las_key));
174 
175 		/*
176 		 * Confirm the search using the unique prefix; if not a match,
177 		 * we're done searching for records for this page.
178 		 */
179 		if (las_pageid != ref->page_las->las_pageid)
180 			break;
181 
182 		/* Allocate the WT_UPDATE structure. */
183 		WT_ERR(cursor->get_value(
184 		    cursor, &las_txnid, &las_timestamp,
185 		    &prepare_state, &upd_type, &las_value));
186 		WT_ERR(__wt_update_alloc(
187 		    session, &las_value, &upd, &incr, upd_type));
188 		total_incr += incr;
189 		upd->txnid = las_txnid;
190 		upd->prepare_state = prepare_state;
191 #ifdef HAVE_TIMESTAMPS
192 		WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
193 		memcpy(&upd->timestamp, las_timestamp.data, las_timestamp.size);
194 #endif
195 
196 		switch (page->type) {
197 		case WT_PAGE_COL_FIX:
198 		case WT_PAGE_COL_VAR:
199 			p = las_key.data;
200 			WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
201 			if (current_recno == recno)
202 				break;
203 			WT_ASSERT(session, current_recno < recno);
204 
205 			if (first_upd != NULL) {
206 				WT_ERR(__col_instantiate(session,
207 				    current_recno, ref, &cbt, first_upd));
208 				first_upd = NULL;
209 			}
210 			current_recno = recno;
211 			break;
212 		case WT_PAGE_ROW_LEAF:
213 			if (current_key->size == las_key.size &&
214 			    memcmp(current_key->data,
215 			    las_key.data, las_key.size) == 0)
216 				break;
217 
218 			if (first_upd != NULL) {
219 				WT_ERR(__row_instantiate(session,
220 				    current_key, ref, &cbt, first_upd));
221 				first_upd = NULL;
222 			}
223 			WT_ERR(__wt_buf_set(session,
224 			    current_key, las_key.data, las_key.size));
225 			break;
226 		WT_ILLEGAL_VALUE_ERR(session, page->type);
227 		}
228 
229 		/* Append the latest update to the list. */
230 		if (first_upd == NULL)
231 			first_upd = last_upd = upd;
232 		else {
233 			last_upd->next = upd;
234 			last_upd = upd;
235 		}
236 		upd = NULL;
237 	}
238 	__wt_readunlock(session, &cache->las_sweepwalk_lock);
239 	locked = false;
240 	WT_ERR_NOTFOUND_OK(ret);
241 
242 	/* Insert the last set of updates, if any. */
243 	if (first_upd != NULL)
244 		switch (page->type) {
245 		case WT_PAGE_COL_FIX:
246 		case WT_PAGE_COL_VAR:
247 			WT_ERR(__col_instantiate(session,
248 			    current_recno, ref, &cbt, first_upd));
249 			first_upd = NULL;
250 			break;
251 		case WT_PAGE_ROW_LEAF:
252 			WT_ERR(__row_instantiate(session,
253 			    current_key, ref, &cbt, first_upd));
254 			first_upd = NULL;
255 			break;
256 		WT_ILLEGAL_VALUE_ERR(session, page->type);
257 		}
258 
259 	/* Discard the cursor. */
260 	WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));
261 
262 	if (total_incr != 0) {
263 		__wt_cache_page_inmem_incr(session, page, total_incr);
264 
265 		/*
266 		 * If the updates in lookaside are newer than the versions on
267 		 * the page, it must be included in the next checkpoint.
268 		 *
269 		 * Otherwise, the page image contained the newest versions of
270 		 * data so the updates are all older and we could consider
271 		 * marking it clean (i.e., the next checkpoint can use the
272 		 * version already on disk).
273 		 *
274 		 * This needs care because (a) it creates pages with history
275 		 * that can't be evicted until they are marked dirty again, and
276 		 * (b) checkpoints may need to visit these pages to resolve
277 		 * changes evicted while a checkpoint is running.
278 		 */
279 		page->modify->first_dirty_txn = WT_TXN_FIRST;
280 
281 		FLD_SET(page->modify->restore_state, WT_PAGE_RS_LOOKASIDE);
282 
283 		if (ref->page_las->skew_newest &&
284 		    !S2C(session)->txn_global.has_stable_timestamp &&
285 		    __wt_txn_visible_all(session, ref->page_las->unstable_txn,
286 		    WT_TIMESTAMP_NULL(&ref->page_las->unstable_timestamp))) {
287 			page->modify->rec_max_txn = ref->page_las->max_txn;
288 			__wt_timestamp_set(&page->modify->rec_max_timestamp,
289 			    &ref->page_las->max_timestamp);
290 			__wt_page_modify_clear(session, page);
291 		}
292 	}
293 
294 	/*
295 	 * Now the lookaside history has been read into cache there is no
296 	 * further need to maintain a reference to it.
297 	 */
298 	ref->page_las->eviction_to_lookaside = false;
299 	ref->page_las->resolved = true;
300 
301 err:	if (locked)
302 		__wt_readunlock(session, &cache->las_sweepwalk_lock);
303 	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
304 	WT_TRET(__wt_btcur_close(&cbt, true));
305 
306 	/*
307 	 * On error, upd points to a single unlinked WT_UPDATE structure,
308 	 * first_upd points to a list.
309 	 */
310 	__wt_free(session, upd);
311 	__wt_free_update_list(session, first_upd);
312 
313 	__wt_scr_free(session, &current_key);
314 
315 	return (ret);
316 }
317 
318 /*
319  * __evict_force_check --
320  *	Check if a page matches the criteria for forced eviction.
321  */
322 static bool
__evict_force_check(WT_SESSION_IMPL * session,WT_REF * ref)323 __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
324 {
325 	WT_BTREE *btree;
326 	WT_PAGE *page;
327 	size_t footprint;
328 
329 	btree = S2BT(session);
330 	page = ref->page;
331 
332 	/* Leaf pages only. */
333 	if (WT_PAGE_IS_INTERNAL(page))
334 		return (false);
335 
336 	/*
337 	 * It's hard to imagine a page with a huge memory footprint that has
338 	 * never been modified, but check to be sure.
339 	 */
340 	if (__wt_page_evict_clean(page))
341 		return (false);
342 
343 	/*
344 	 * Exclude the disk image size from the footprint checks.  Usually the
345 	 * disk image size is small compared with the in-memory limit (e.g.
346 	 * 16KB vs 5MB), so this doesn't make a big difference.  Where it is
347 	 * important is for pages with a small number of large values, where
348 	 * the disk image size takes into account large values that have
349 	 * already been written and should not trigger forced eviction.
350 	 */
351 	footprint = page->memory_footprint;
352 	if (page->dsk != NULL)
353 		footprint -= page->dsk->mem_size;
354 
355 	/* Pages are usually small enough, check that first. */
356 	if (footprint < btree->splitmempage)
357 		return (false);
358 
359 	/*
360 	 * If this session has more than one hazard pointer, eviction will fail
361 	 * and there is no point trying.
362 	 */
363 	if (__wt_hazard_count(session, ref) > 1)
364 		return (false);
365 
366 	/* If we can do an in-memory split, do it. */
367 	if (__wt_leaf_page_can_split(session, page))
368 		return (true);
369 	if (footprint < btree->maxmempage)
370 		return (false);
371 
372 	/* Bump the oldest ID, we're about to do some visibility checks. */
373 	WT_IGNORE_RET(__wt_txn_update_oldest(session, 0));
374 
375 	/*
376 	 * Allow some leeway if the transaction ID isn't moving forward since
377 	 * it is unlikely eviction will be able to evict the page. Don't keep
378 	 * skipping the page indefinitely or large records can lead to
379 	 * extremely large memory footprints.
380 	 */
381 	if (!__wt_page_evict_retry(session, page))
382 		return (false);
383 
384 	/* Trigger eviction on the next page release. */
385 	__wt_page_evict_soon(session, ref);
386 
387 	/* If eviction cannot succeed, don't try. */
388 	return (__wt_page_can_evict(session, ref, NULL));
389 }
390 
391 /*
392  * __page_read_lookaside --
393  *	Figure out whether to instantiate content from lookaside on
394  *	page access.
395  */
396 static inline int
__page_read_lookaside(WT_SESSION_IMPL * session,WT_REF * ref,uint32_t previous_state,uint32_t * final_statep)397 __page_read_lookaside(WT_SESSION_IMPL *session,
398     WT_REF *ref, uint32_t previous_state, uint32_t *final_statep)
399 {
400 	/*
401 	 * Reading a lookaside ref for the first time, and not requiring the
402 	 * history triggers a transition to WT_REF_LIMBO, if we are already
403 	 * in limbo and still don't need the history - we are done.
404 	 */
405 	if (__wt_las_page_skip_locked(session, ref)) {
406 		if (previous_state == WT_REF_LOOKASIDE) {
407 			WT_STAT_CONN_INCR(
408 			    session, cache_read_lookaside_skipped);
409 			ref->page_las->eviction_to_lookaside = true;
410 			*final_statep = WT_REF_LIMBO;
411 		}
412 		return (0);
413 	}
414 
415 	/* Instantiate updates from the database's lookaside table. */
416 	if (previous_state == WT_REF_LIMBO) {
417 		WT_STAT_CONN_INCR(session, cache_read_lookaside_delay);
418 		if (WT_SESSION_IS_CHECKPOINT(session))
419 			WT_STAT_CONN_INCR(session,
420 			    cache_read_lookaside_delay_checkpoint);
421 	}
422 
423 	WT_RET(__las_page_instantiate(session, ref));
424 	return (0);
425 }
426 
427 /*
428  * __page_read --
429  *	Read a page from the file.
430  */
431 static int
__page_read(WT_SESSION_IMPL * session,WT_REF * ref,uint32_t flags)432 __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
433 {
434 	WT_DECL_RET;
435 	WT_ITEM tmp;
436 	WT_PAGE *notused;
437 	size_t addr_size;
438 	uint64_t time_start, time_stop;
439 	uint32_t page_flags, final_state, new_state, previous_state;
440 	const uint8_t *addr;
441 	bool timer;
442 
443 	time_start = time_stop = 0;
444 
445 	/*
446 	 * Don't pass an allocated buffer to the underlying block read function,
447 	 * force allocation of new memory of the appropriate size.
448 	 */
449 	WT_CLEAR(tmp);
450 
451 	/*
452 	 * Attempt to set the state to WT_REF_READING for normal reads, or
453 	 * WT_REF_LOCKED, for deleted pages or pages with lookaside entries.
454 	 * The difference is that checkpoints can skip over clean pages that
455 	 * are being read into cache, but need to wait for deletes or lookaside
456 	 * updates to be resolved (in order for checkpoint to write the correct
457 	 * version of the page).
458 	 *
459 	 * If successful, we've won the race, read the page.
460 	 */
461 	switch (previous_state = ref->state) {
462 	case WT_REF_DISK:
463 		new_state = WT_REF_READING;
464 		break;
465 	case WT_REF_DELETED:
466 	case WT_REF_LIMBO:
467 	case WT_REF_LOOKASIDE:
468 		new_state = WT_REF_LOCKED;
469 		break;
470 	default:
471 		return (0);
472 	}
473 	if (!__wt_atomic_casv32(&ref->state, previous_state, new_state))
474 		return (0);
475 
476 	final_state = WT_REF_MEM;
477 
478 	/* If we already have the page image, just instantiate the history. */
479 	if (previous_state == WT_REF_LIMBO)
480 		goto skip_read;
481 
482 	/*
483 	 * Get the address: if there is no address, the page was deleted or had
484 	 * only lookaside entries, and a subsequent search or insert is forcing
485 	 * re-creation of the name space.
486 	 */
487 	__wt_ref_info(ref, &addr, &addr_size, NULL);
488 	if (addr == NULL) {
489 		WT_ASSERT(session, previous_state != WT_REF_DISK);
490 
491 		WT_ERR(__wt_btree_new_leaf_page(session, &ref->page));
492 		goto skip_read;
493 	}
494 
495 	/*
496 	 * There's an address, read or map the backing disk page and build an
497 	 * in-memory version of the page.
498 	 */
499 	timer = !F_ISSET(session, WT_SESSION_INTERNAL);
500 	if (timer)
501 		time_start = __wt_clock(session);
502 	WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
503 	if (timer) {
504 		time_stop = __wt_clock(session);
505 		WT_STAT_CONN_INCR(session, cache_read_app_count);
506 		WT_STAT_CONN_INCRV(session, cache_read_app_time,
507 		    WT_CLOCKDIFF_US(time_stop, time_start));
508 	}
509 
510 	/*
511 	 * Build the in-memory version of the page. Clear our local reference to
512 	 * the allocated copy of the disk image on return, the in-memory object
513 	 * steals it.
514 	 *
515 	 * If a page is read with eviction disabled, we don't count evicting it
516 	 * as progress. Since disabling eviction allows pages to be read even
517 	 * when the cache is full, we want to avoid workloads repeatedly reading
518 	 * a page with eviction disabled (e.g., a metadata page), then evicting
519 	 * that page and deciding that is a sign that eviction is unstuck.
520 	 */
521 	page_flags =
522 	    WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED;
523 	if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE))
524 		FLD_SET(page_flags, WT_PAGE_EVICT_NO_PROGRESS);
525 	WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &notused));
526 	tmp.mem = NULL;
527 
528 	/*
529 	 * The WT_REF lookaside state should match the page-header state of
530 	 * any page we read.
531 	 */
532 	WT_ASSERT(session,
533 	    (previous_state != WT_REF_LIMBO &&
534 	    previous_state != WT_REF_LOOKASIDE) ||
535 	    ref->page->dsk == NULL ||
536 	    F_ISSET(ref->page->dsk, WT_PAGE_LAS_UPDATE));
537 
538 skip_read:
539 	switch (previous_state) {
540 	case WT_REF_DELETED:
541 		/*
542 		 * A truncated page may also have lookaside information. The
543 		 * delete happened after page eviction (writing the lookaside
544 		 * information), first update based on the lookaside table and
545 		 * then apply the delete.
546 		 */
547 		if (ref->page_las != NULL)
548 			WT_ERR(__las_page_instantiate(session, ref));
549 
550 		/* Move all records to a deleted state. */
551 		WT_ERR(__wt_delete_page_instantiate(session, ref));
552 		break;
553 	case WT_REF_LIMBO:
554 	case WT_REF_LOOKASIDE:
555 		WT_ERR(__page_read_lookaside(
556 		    session, ref, previous_state, &final_state));
557 		break;
558 	}
559 
560 	/*
561 	 * Once the page is instantiated, we no longer need the history in
562 	 * lookaside.  We leave the lookaside sweep thread to do most cleanup,
563 	 * but it can only remove keys that skew newest (if there are entries
564 	 * in the lookaside newer than the page, they need to be read back into
565 	 * cache or they will be lost).
566 	 *
567 	 * There is no reason for the lookaside remove should fail, but ignore
568 	 * it if for some reason it fails, we've got a valid page.
569 	 *
570 	 * Don't free WT_REF.page_las, there may be concurrent readers.
571 	 */
572 	if (final_state == WT_REF_MEM &&
573 	    ref->page_las != NULL && !ref->page_las->skew_newest)
574 		WT_IGNORE_RET(__wt_las_remove_block(
575 		    session, ref->page_las->las_pageid, false));
576 
577 	WT_PUBLISH(ref->state, final_state);
578 	return (ret);
579 
580 err:	/*
581 	 * If the function building an in-memory version of the page failed,
582 	 * it discarded the page, but not the disk image.  Discard the page
583 	 * and separately discard the disk image in all cases.
584 	 */
585 	if (ref->page != NULL && previous_state != WT_REF_LIMBO)
586 		__wt_ref_out(session, ref);
587 	WT_PUBLISH(ref->state, previous_state);
588 
589 	__wt_buf_free(session, &tmp);
590 
591 	return (ret);
592 }
593 
594 /*
595  * __wt_page_in_func --
596  *	Acquire a hazard pointer to a page; if the page is not in-memory,
597  *	read it from the disk and build an in-memory version.
598  */
599 int
__wt_page_in_func(WT_SESSION_IMPL * session,WT_REF * ref,uint32_t flags,const char * func,int line)600 __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
601 #ifdef HAVE_DIAGNOSTIC
602     , const char *func, int line
603 #endif
604     )
605 {
606 	WT_BTREE *btree;
607 	WT_DECL_RET;
608 	WT_PAGE *page;
609 	uint64_t sleep_usecs, yield_cnt;
610 	uint32_t current_state;
611 	int force_attempts;
612 	bool busy, cache_work, did_read, stalled, wont_need;
613 
614 	btree = S2BT(session);
615 
616 	if (F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE))
617 		LF_SET(WT_READ_IGNORE_CACHE_SIZE);
618 
619 	/* Sanity check flag combinations. */
620 	WT_ASSERT(session, !LF_ISSET(
621 	    WT_READ_DELETED_SKIP | WT_READ_NO_WAIT | WT_READ_LOOKASIDE) ||
622 	    LF_ISSET(WT_READ_CACHE));
623 	WT_ASSERT(session, !LF_ISSET(WT_READ_DELETED_CHECK) ||
624 	    !LF_ISSET(WT_READ_DELETED_SKIP));
625 
626 	/*
627 	 * Ignore reads of pages already known to be in cache, otherwise the
628 	 * eviction server can dominate these statistics.
629 	 */
630 	if (!LF_ISSET(WT_READ_CACHE)) {
631 		WT_STAT_CONN_INCR(session, cache_pages_requested);
632 		WT_STAT_DATA_INCR(session, cache_pages_requested);
633 	}
634 
635 	for (did_read = wont_need = stalled = false,
636 	    force_attempts = 0, sleep_usecs = yield_cnt = 0;;) {
637 		switch (current_state = ref->state) {
638 		case WT_REF_DELETED:
639 			if (LF_ISSET(WT_READ_DELETED_SKIP | WT_READ_NO_WAIT))
640 				return (WT_NOTFOUND);
641 			if (LF_ISSET(WT_READ_DELETED_CHECK) &&
642 			    __wt_delete_page_skip(session, ref, false))
643 				return (WT_NOTFOUND);
644 			goto read;
645 		case WT_REF_LOOKASIDE:
646 			if (LF_ISSET(WT_READ_CACHE)) {
647 				if (!LF_ISSET(WT_READ_LOOKASIDE))
648 					return (WT_NOTFOUND);
649 				/*
650 				 * If we skip a lookaside page, the tree
651 				 * cannot be left clean: lookaside entries
652 				 * must be resolved before the tree can be
653 				 * discarded.
654 				 */
655 				if (__wt_las_page_skip(session, ref)) {
656 					__wt_tree_modify_set(session);
657 					return (WT_NOTFOUND);
658 				}
659 			}
660 			goto read;
661 		case WT_REF_DISK:
662 			if (LF_ISSET(WT_READ_CACHE))
663 				return (WT_NOTFOUND);
664 
665 read:			/*
666 			 * The page isn't in memory, read it. If this thread
667 			 * respects the cache size, check for space in the
668 			 * cache.
669 			 */
670 			if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE))
671 				WT_RET(__wt_cache_eviction_check(
672 				    session, true,
673 				    !F_ISSET(&session->txn, WT_TXN_HAS_ID),
674 				    NULL));
675 			WT_RET(__page_read(session, ref, flags));
676 
677 			/*
678 			 * We just read a page, don't evict it before we have a
679 			 * chance to use it.
680 			 */
681 			did_read = true;
682 
683 			/*
684 			 * If configured to not trash the cache, leave the page
685 			 * generation unset, we'll set it before returning to
686 			 * the oldest read generation, so the page is forcibly
687 			 * evicted as soon as possible. We don't do that set
688 			 * here because we don't want to evict the page before
689 			 * we "acquire" it.
690 			 */
691 			wont_need = LF_ISSET(WT_READ_WONT_NEED) ||
692 			    F_ISSET(session, WT_SESSION_READ_WONT_NEED) ||
693 			    F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_NOKEEP);
694 			continue;
695 		case WT_REF_READING:
696 			if (LF_ISSET(WT_READ_CACHE))
697 				return (WT_NOTFOUND);
698 			if (LF_ISSET(WT_READ_NO_WAIT))
699 				return (WT_NOTFOUND);
700 
701 			/* Waiting on another thread's read, stall. */
702 			WT_STAT_CONN_INCR(session, page_read_blocked);
703 			stalled = true;
704 			break;
705 		case WT_REF_LOCKED:
706 			if (LF_ISSET(WT_READ_NO_WAIT))
707 				return (WT_NOTFOUND);
708 
709 			/* Waiting on eviction, stall. */
710 			WT_STAT_CONN_INCR(session, page_locked_blocked);
711 			stalled = true;
712 			break;
713 		case WT_REF_SPLIT:
714 			return (WT_RESTART);
715 		case WT_REF_LIMBO:
716 		case WT_REF_MEM:
717 			/*
718 			 * The page is in memory.
719 			 *
720 			 * Get a hazard pointer if one is required. We cannot
721 			 * be evicting if no hazard pointer is required, we're
722 			 * done.
723 			 */
724 			if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
725 				goto skip_evict;
726 
727 			/*
728 			 * The expected reason we can't get a hazard pointer is
729 			 * because the page is being evicted, yield, try again.
730 			 */
731 #ifdef HAVE_DIAGNOSTIC
732 			WT_RET(
733 			    __wt_hazard_set(session, ref, &busy, func, line));
734 #else
735 			WT_RET(__wt_hazard_set(session, ref, &busy));
736 #endif
737 			if (busy) {
738 				WT_STAT_CONN_INCR(session, page_busy_blocked);
739 				break;
740 			}
741 			/*
742 			 * If we are a limbo page check whether we need to
743 			 * instantiate the history. By having a hazard pointer
744 			 * we can use the locked version.
745 			 */
746 			if (current_state == WT_REF_LIMBO &&
747 			    ((!LF_ISSET(WT_READ_CACHE) ||
748 			    LF_ISSET(WT_READ_LOOKASIDE)) &&
749 			    !__wt_las_page_skip_locked(session, ref))) {
750 				WT_RET(__wt_hazard_clear(session, ref));
751 				goto read;
752 			}
753 			if (current_state == WT_REF_LIMBO &&
754 			    LF_ISSET(WT_READ_CACHE) &&
755 			    LF_ISSET(WT_READ_LOOKASIDE))
756 				__wt_tree_modify_set(session);
757 
758 			/*
759 			 * Check if the page requires forced eviction.
760 			 */
761 			if (did_read || LF_ISSET(WT_READ_NO_SPLIT) ||
762 			    btree->evict_disabled > 0 || btree->lsm_primary)
763 				goto skip_evict;
764 
765 			/*
766 			 * If reconciliation is disabled (e.g., when inserting
767 			 * into the lookaside table), skip forced eviction if
768 			 * the page can't split.
769 			 */
770 			if (F_ISSET(session, WT_SESSION_NO_RECONCILE) &&
771 			    !__wt_leaf_page_can_split(session, ref->page))
772 				goto skip_evict;
773 
774 			/*
775 			 * Forcibly evict pages that are too big.
776 			 */
777 			if (force_attempts < 10 &&
778 			    __evict_force_check(session, ref)) {
779 				++force_attempts;
780 				ret = __wt_page_release_evict(session, ref);
781 				/* If forced eviction fails, stall. */
782 				if (ret == EBUSY) {
783 					WT_NOT_READ(ret, 0);
784 					WT_STAT_CONN_INCR(session,
785 					    page_forcible_evict_blocked);
786 					stalled = true;
787 					break;
788 				}
789 				WT_RET(ret);
790 
791 				/*
792 				 * The result of a successful forced eviction
793 				 * is a page-state transition (potentially to
794 				 * an in-memory page we can use, or a restart
795 				 * return for our caller), continue the outer
796 				 * page-acquisition loop.
797 				 */
798 				continue;
799 			}
800 
801 skip_evict:		/*
802 			 * If we read the page and are configured to not trash
803 			 * the cache, and no other thread has already used the
804 			 * page, set the read generation so the page is evicted
805 			 * soon.
806 			 *
807 			 * Otherwise, if we read the page, or, if configured to
808 			 * update the page's read generation and the page isn't
809 			 * already flagged for forced eviction, update the page
810 			 * read generation.
811 			 */
812 			page = ref->page;
813 			if (page->read_gen == WT_READGEN_NOTSET) {
814 				if (wont_need)
815 					page->read_gen = WT_READGEN_WONT_NEED;
816 				else
817 					__wt_cache_read_gen_new(session, page);
818 			} else if (!LF_ISSET(WT_READ_NO_GEN))
819 				__wt_cache_read_gen_bump(session, page);
820 
821 			/*
822 			 * Check if we need an autocommit transaction.
823 			 * Starting a transaction can trigger eviction, so skip
824 			 * it if eviction isn't permitted.
825 			 *
826 			 * The logic here is a little weird: some code paths do
827 			 * a blanket ban on checking the cache size in
828 			 * sessions, but still require a transaction (e.g.,
829 			 * when updating metadata or lookaside).  If
830 			 * WT_READ_IGNORE_CACHE_SIZE was passed in explicitly,
831 			 * we're done. If we set WT_READ_IGNORE_CACHE_SIZE
832 			 * because it was set in the session then make sure we
833 			 * start a transaction.
834 			 */
835 			return (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE) &&
836 			    !F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE) ?
837 			    0 : __wt_txn_autocommit_check(session));
838 		WT_ILLEGAL_VALUE(session, current_state);
839 		}
840 
841 		/*
842 		 * We failed to get the page -- yield before retrying, and if
843 		 * we've yielded enough times, start sleeping so we don't burn
844 		 * CPU to no purpose.
845 		 */
846 		if (yield_cnt < WT_THOUSAND) {
847 			if (!stalled) {
848 				++yield_cnt;
849 				__wt_yield();
850 				continue;
851 			}
852 			yield_cnt = WT_THOUSAND;
853 		}
854 
855 		/*
856 		 * If stalling and this thread is allowed to do eviction work,
857 		 * check if the cache needs help. If we do work for the cache,
858 		 * substitute that for a sleep.
859 		 */
860 		if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) {
861 			WT_RET(__wt_cache_eviction_check(
862 			    session, true,
863 			    !F_ISSET(&session->txn, WT_TXN_HAS_ID),
864 			    &cache_work));
865 			if (cache_work)
866 				continue;
867 		}
868 		__wt_spin_backoff(&yield_cnt, &sleep_usecs);
869 		WT_STAT_CONN_INCRV(session, page_sleep, sleep_usecs);
870 	}
871 }
872