/*-
 * Copyright (c) 2014-2018 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int  __evict_clear_all_walks(WT_SESSION_IMPL *);
static int  WT_CDECL __evict_lru_cmp(const void *, const void *);
static int  __evict_lru_pages(WT_SESSION_IMPL *, bool);
static int  __evict_lru_walk(WT_SESSION_IMPL *);
static int  __evict_page(WT_SESSION_IMPL *, bool);
static int  __evict_pass(WT_SESSION_IMPL *);
static int  __evict_server(WT_SESSION_IMPL *, bool *);
static void __evict_tune_workers(WT_SESSION_IMPL *session);
static int  __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *);
static int  __evict_walk_tree(
    WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *);

#define	WT_EVICT_HAS_WORKERS(s)				\
	(S2C(s)->evict_threads.current_threads > 1)

/*
 * __evict_lock_handle_list --
 *	Try to get the handle list lock, with yield and sleep back off.
 *	Keep timing statistics overall.
 */
static int
__evict_lock_handle_list(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_RWLOCK *dh_lock;
	u_int spins;

	conn = S2C(session);
	cache = conn->cache;
	dh_lock = &conn->dhandle_lock;

	/*
	 * Use a custom lock acquisition back off loop so the eviction server
	 * notices any interrupt quickly.
	 */
	for (spins = 0;
	    (ret = __wt_try_readlock(session, dh_lock)) == EBUSY &&
	    cache->pass_intr == 0; spins++) {
		if (spins < WT_THOUSAND)
			__wt_yield();
		else
			__wt_sleep(0, WT_THOUSAND);
	}
	return (ret);
}
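
/*
 * Illustrative sketch (not part of the original source): the same
 * yield-then-sleep back off pattern applied to a generic spinlock.  The
 * helper name and the WT_EVICT_EXAMPLES guard are hypothetical; the block
 * is compiled out and only demonstrates the technique used above.
 */
#ifdef WT_EVICT_EXAMPLES
static void
__example_backoff_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *lock)
{
	u_int spins;

	/* Yield the CPU at first; after ~1000 attempts, sleep instead. */
	for (spins = 0; __wt_spin_trylock(session, lock) != 0; spins++) {
		if (spins < WT_THOUSAND)
			__wt_yield();
		else
			__wt_sleep(0, WT_THOUSAND);
	}
	/* Lock acquired. */
	__wt_spin_unlock(session, lock);
}
#endif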

/*
 * __evict_entry_priority --
 *	Get the adjusted read generation for an eviction entry.
 */
static inline uint64_t
__evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_BTREE *btree;
	WT_PAGE *page;
	uint64_t read_gen;

	btree = S2BT(session);
	page = ref->page;

	/* Any page set to the oldest generation should be discarded. */
	if (WT_READGEN_EVICT_SOON(page->read_gen))
		return (WT_READGEN_OLDEST);

	/* Any page from a dead tree is a great choice. */
	if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD))
		return (WT_READGEN_OLDEST);

	/* Any empty page (leaf or internal) is a good choice. */
	if (__wt_page_is_empty(page))
		return (WT_READGEN_OLDEST);

	/* Any large page in memory is likewise a good choice. */
	if (page->memory_footprint > btree->splitmempage)
		return (WT_READGEN_OLDEST);

	/*
	 * The base read-generation is skewed by the eviction priority.
	 * Internal pages are also adjusted: we prefer to evict leaf pages.
	 */
	if (page->modify != NULL &&
	    F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DIRTY) &&
	    !F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_CLEAN))
		read_gen = page->modify->update_txn;
	else
		read_gen = page->read_gen;

	read_gen += btree->evict_priority;

#define	WT_EVICT_INTL_SKEW 1000
	if (WT_PAGE_IS_INTERNAL(page))
		read_gen += WT_EVICT_INTL_SKEW;

	return (read_gen);
}
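
/*
 * Worked example (illustrative): a lower score means a better eviction
 * candidate.  With WT_EVICT_INTL_SKEW of 1000, a leaf page and an internal
 * page that share read generation 5000 score 5000 and 6000 respectively,
 * so the leaf page sorts first and is evicted first.
 */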

/*
 * __evict_lru_cmp --
 *	Qsort function: sort the eviction array.
 */
static int WT_CDECL
__evict_lru_cmp(const void *a_arg, const void *b_arg)
{
	const WT_EVICT_ENTRY *a, *b;
	uint64_t a_score, b_score;

	a = a_arg;
	b = b_arg;
	a_score = (a->ref == NULL ? UINT64_MAX : a->score);
	b_score = (b->ref == NULL ? UINT64_MAX : b->score);

	return ((a_score < b_score) ? -1 : (a_score == b_score) ? 0 : 1);
}
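
/*
 * Usage sketch (mirrors the call in __evict_lru_walk below): the queue is
 * sorted into ascending score order, so the best candidates come first and
 * empty entries (ref == NULL, treated as UINT64_MAX) sink to the end:
 *
 *	__wt_qsort(queue->evict_queue,
 *	    entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
 */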

/*
 * __evict_list_clear --
 *	Clear an entry in the LRU eviction list.
 */
static inline void
__evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e)
{
	if (e->ref != NULL) {
		WT_ASSERT(session,
		    F_ISSET_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU));
		F_CLR_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU);
	}
	e->ref = NULL;
	e->btree = WT_DEBUG_POINT;
}

/*
 * __wt_evict_list_clear_page --
 *	Make sure a page is not in the LRU eviction list.  This is called
 *	from the page eviction code to make sure there is no attempt to
 *	evict a child page multiple times.
 */
void
__wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *evict;
	uint32_t i, elem, q;
	bool found;

	WT_ASSERT(session,
	    __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED);

	/* Fast path: if the page isn't on the queue, don't bother searching. */
	if (!F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU))
		return;

	cache = S2C(session)->cache;
	__wt_spin_lock(session, &cache->evict_queue_lock);

	found = false;
	for (q = 0; q < WT_EVICT_QUEUE_MAX && !found; q++) {
		__wt_spin_lock(session, &cache->evict_queues[q].evict_lock);
		elem = cache->evict_queues[q].evict_max;
		for (i = 0, evict = cache->evict_queues[q].evict_queue;
		    i < elem; i++, evict++)
			if (evict->ref == ref) {
				found = true;
				__evict_list_clear(session, evict);
				break;
			}
		__wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
	}
	WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));

	__wt_spin_unlock(session, &cache->evict_queue_lock);
}

/*
 * __evict_queue_empty --
 *	Is the queue empty?
 *
 *	Note that the eviction server is pessimistic and treats a half-full
 *	queue as empty.
 */
static inline bool
__evict_queue_empty(WT_EVICT_QUEUE *queue, bool server_check)
{
	uint32_t candidates, used;

	if (queue->evict_current == NULL)
		return (true);

	/* The eviction server only considers half of the candidates. */
	candidates = queue->evict_candidates;
	if (server_check && candidates > 1)
		candidates /= 2;
	used = (uint32_t)(queue->evict_current - queue->evict_queue);
	return (used >= candidates);
}
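
/*
 * Worked example (illustrative): with 10 candidates queued and server_check
 * set, candidates is halved to 5; once evict_current has advanced 5 entries
 * past evict_queue, the server treats the queue as empty and refills it.
 */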

/*
 * __evict_queue_full --
 *	Is the queue full (i.e., it has been populated with candidates and none
 *	of them have been evicted yet)?
 */
static inline bool
__evict_queue_full(WT_EVICT_QUEUE *queue)
{
	return (queue->evict_current == queue->evict_queue &&
	    queue->evict_candidates != 0);
}

/*
 * __wt_evict_server_wake --
 *	Wake the eviction server thread.
 */
void
__wt_evict_server_wake(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);
	cache = conn->cache;

	if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) {
		uint64_t bytes_inuse, bytes_max;

		bytes_inuse = __wt_cache_bytes_inuse(cache);
		bytes_max = conn->cache_size;
		__wt_verbose(session, WT_VERB_EVICTSERVER,
		    "waking, bytes inuse %s max (%" PRIu64
		    "MB %s %" PRIu64 "MB)",
		    bytes_inuse <= bytes_max ? "<=" : ">",
		    bytes_inuse / WT_MEGABYTE,
		    bytes_inuse <= bytes_max ? "<=" : ">",
		    bytes_max / WT_MEGABYTE);
	}

	__wt_cond_signal(session, cache->evict_cond);
}

/*
 * __wt_evict_thread_chk --
 *	Check to decide if the eviction thread should continue running.
 */
bool
__wt_evict_thread_chk(WT_SESSION_IMPL *session)
{
	return (F_ISSET(S2C(session), WT_CONN_EVICTION_RUN));
}

/*
 * __wt_evict_thread_run --
 *	Entry function for an eviction thread.  This is called repeatedly
 *	from the thread group code so it does not need to loop itself.
 */
int
__wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	bool did_work, was_intr;

	conn = S2C(session);
	cache = conn->cache;

	/*
	 * The thread group code calls us repeatedly.  So each call is one pass
	 * through eviction.
	 */
	if (conn->evict_server_running &&
	    __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) {
		/*
		 * Cannot use WT_WITH_PASS_LOCK because this is a try lock.
		 * Fix when that is supported.  We set the flag on both sessions
		 * because we may call clear_walk when we are walking with
		 * the walk session, locked.
		 */
		F_SET(session, WT_SESSION_LOCKED_PASS);
		F_SET(cache->walk_session, WT_SESSION_LOCKED_PASS);
		ret = __evict_server(session, &did_work);
		F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS);
		F_CLR(session, WT_SESSION_LOCKED_PASS);
		was_intr = cache->pass_intr != 0;
		__wt_spin_unlock(session, &cache->evict_pass_lock);
		WT_ERR(ret);

		/*
		 * If the eviction server was interrupted, wait until requests
		 * have been processed: the system may otherwise be busy so
		 * don't go to sleep.
		 */
		if (was_intr)
			while (cache->pass_intr != 0 &&
			    F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
			    F_ISSET(thread, WT_THREAD_RUN))
				__wt_yield();
		else {
			__wt_verbose(session,
			    WT_VERB_EVICTSERVER, "%s", "sleeping");

			/* Don't rely on signals: check periodically. */
			__wt_cond_auto_wait(session,
			    cache->evict_cond, did_work, NULL);
			__wt_verbose(session,
			    WT_VERB_EVICTSERVER, "%s", "waking");
		}
	} else
		WT_ERR(__evict_lru_pages(session, false));

	if (0) {
err:		WT_PANIC_RET(session, ret, "cache eviction thread error");
	}
	return (ret);
}

/*
 * __wt_evict_thread_stop --
 *	Shutdown function for an eviction thread.
 */
int
__wt_evict_thread_stop(WT_SESSION_IMPL *session, WT_THREAD *thread)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	if (thread->id != 0)
		return (0);

	conn = S2C(session);
	cache = conn->cache;
	/*
	 * The only time the first eviction thread is stopped is on shutdown:
	 * in case any trees are still open, clear all walks now so that they
	 * can be closed.
	 */
	WT_WITH_PASS_LOCK(session, ret = __evict_clear_all_walks(session));
	WT_ERR(ret);
	/*
	 * The only two cases when the eviction server is expected to
	 * stop are when recovery is finished or when the connection is
	 * closing.
	 */
	WT_ASSERT(session, F_ISSET(conn, WT_CONN_CLOSING | WT_CONN_RECOVERING));

	__wt_verbose(session,
	    WT_VERB_EVICTSERVER, "%s", "cache eviction thread exiting");

	if (0) {
err:		WT_PANIC_RET(session, ret, "cache eviction thread error");
	}
	return (ret);
}

/*
 * __evict_server --
 *	Thread to evict pages from the cache.
 */
static int
__evict_server(WT_SESSION_IMPL *session, bool *did_work)
{
	struct timespec now;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	/* Assume there has been no progress. */
	*did_work = false;

	conn = S2C(session);
	cache = conn->cache;

	/* Evict pages from the cache as needed. */
	WT_RET(__evict_pass(session));

	if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) || cache->pass_intr != 0)
		return (0);

	if (!__wt_cache_stuck(session)) {
		/*
		 * Try to get the handle list lock: if we give up, that
		 * indicates a session is waiting for us to clear walks.  Do
		 * that as part of a normal pass (without the handle list
		 * lock) to avoid deadlock.
		 */
		if ((ret = __evict_lock_handle_list(session)) == EBUSY)
			return (0);
		WT_RET(ret);

		/*
		 * Clear the walks so we don't pin pages while asleep,
		 * otherwise we can block applications evicting large pages.
		 */
		ret = __evict_clear_all_walks(session);

		__wt_readunlock(session, &conn->dhandle_lock);
		WT_RET(ret);

		/* Make sure we'll notice next time we're stuck. */
		cache->last_eviction_progress = 0;
		return (0);
	}

	/* Track if work was done. */
	*did_work = cache->eviction_progress != cache->last_eviction_progress;
	cache->last_eviction_progress = cache->eviction_progress;

	/* Eviction is stuck, check if we have made progress. */
	if (*did_work) {
#if !defined(HAVE_DIAGNOSTIC)
		/* Need verbose check only if not in diagnostic build */
		if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK))
#endif
			__wt_epoch(session, &cache->stuck_time);
		return (0);
	}

#if !defined(HAVE_DIAGNOSTIC)
	/* Need verbose check only if not in diagnostic build */
	if (!WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK))
		return (0);
#endif
	/*
	 * If we're stuck for 5 minutes in diagnostic mode, or the verbose
	 * evict_stuck flag is configured, log the cache and transaction state.
	 *
	 * In diagnostic mode, additionally give up after those 5 minutes.
	 *
	 * We don't do this check for in-memory workloads because application
	 * threads are not blocked by the cache being full. If the cache becomes
	 * full of clean pages, we can be servicing reads while the cache
	 * appears stuck to eviction.
	 */
	if (F_ISSET(conn, WT_CONN_IN_MEMORY))
		return (0);

	__wt_epoch(session, &now);
	if (WT_TIMEDIFF_SEC(now, cache->stuck_time) > WT_MINUTE * 5) {
#if defined(HAVE_DIAGNOSTIC)
		__wt_err(session, ETIMEDOUT,
		    "Cache stuck for too long, giving up");
		WT_RET(__wt_verbose_dump_txn(session));
		WT_RET(__wt_verbose_dump_cache(session));
		return (__wt_set_return(session, ETIMEDOUT));
#else
		if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) {
			WT_RET(__wt_verbose_dump_txn(session));
			WT_RET(__wt_verbose_dump_cache(session));

			/* Reset the timer. */
			__wt_epoch(session, &cache->stuck_time);
		}
#endif
	}
	return (0);
}

/*
 * __wt_evict_create --
 *	Start the eviction server.
 */
int
__wt_evict_create(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	uint32_t session_flags;

	conn = S2C(session);

	WT_ASSERT(session, conn->evict_threads_min > 0);
	/* Set first, the thread might run before we finish up. */
	F_SET(conn, WT_CONN_EVICTION_RUN);

	/*
	 * Create the eviction thread group.
	 * Set the group size to the maximum allowed sessions.
	 */
	session_flags = WT_THREAD_CAN_WAIT |
	    WT_THREAD_LOOKASIDE | WT_THREAD_PANIC_FAIL;
	WT_RET(__wt_thread_group_create(session, &conn->evict_threads,
	    "eviction-server", conn->evict_threads_min, conn->evict_threads_max,
	    session_flags, __wt_evict_thread_chk, __wt_evict_thread_run,
	    __wt_evict_thread_stop));

	/*
	 * Ensure the cache stuck timer is initialized when starting eviction.
	 */
#if !defined(HAVE_DIAGNOSTIC)
	/* Need verbose check only if not in diagnostic build */
	if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER))
#endif
		__wt_epoch(session, &conn->cache->stuck_time);

	/*
	 * Allow queues to be populated now that the eviction threads
	 * are running.
	 */
	conn->evict_server_running = true;

	return (0);
}

/*
 * __wt_evict_destroy --
 *	Destroy the eviction threads.
 */
int
__wt_evict_destroy(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/* We are done if the eviction server didn't start successfully. */
	if (!conn->evict_server_running)
		return (0);

	/* Wait for any eviction thread group changes to stabilize. */
	__wt_writelock(session, &conn->evict_threads.lock);

	/*
	 * Signal the threads to finish and stop populating the queue.
	 */
	F_CLR(conn, WT_CONN_EVICTION_RUN);
	conn->evict_server_running = false;
	__wt_evict_server_wake(session);

	__wt_verbose(
	    session, WT_VERB_EVICTSERVER, "%s", "waiting for helper threads");

	/*
	 * We call the destroy function still holding the write lock.
	 * It assumes it is called locked.
	 */
	WT_RET(__wt_thread_group_destroy(session, &conn->evict_threads));

	return (0);
}

/*
 * __evict_update_work --
 *	Configure eviction work state.
 */
static bool
__evict_update_work(WT_SESSION_IMPL *session)
{
	WT_BTREE *las_tree;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	double dirty_target, dirty_trigger, target, trigger;
	uint64_t bytes_inuse, bytes_max, dirty_inuse;
	uint32_t flags;

	conn = S2C(session);
	cache = conn->cache;

	dirty_target = __wt_eviction_dirty_target(cache);
	dirty_trigger = cache->eviction_dirty_trigger;
	target = cache->eviction_target;
	trigger = cache->eviction_trigger;

	/* Build up the new state. */
	flags = 0;

	if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) {
		cache->flags = 0;
		return (false);
	}

	if (!__evict_queue_empty(cache->evict_urgent_queue, false))
		LF_SET(WT_CACHE_EVICT_URGENT);

	if (F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN)) {
		WT_ASSERT(session,
		    F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR));

		las_tree = ((WT_CURSOR_BTREE *)session->las_cursor)->btree;
		cache->bytes_lookaside = las_tree->bytes_inmem;
	}

	/*
	 * If we need space in the cache, try to find clean pages to evict.
	 *
	 * Avoid division by zero if the cache size has not yet been set in a
	 * shared cache.
	 */
	bytes_max = conn->cache_size + 1;
	bytes_inuse = __wt_cache_bytes_inuse(cache);
	if (__wt_eviction_clean_needed(session, NULL))
		LF_SET(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
	else if (bytes_inuse > (target * bytes_max) / 100)
		LF_SET(WT_CACHE_EVICT_CLEAN);

	dirty_inuse = __wt_cache_dirty_leaf_inuse(cache);
	if (__wt_eviction_dirty_needed(session, NULL))
		LF_SET(WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD);
	else if (dirty_inuse > (uint64_t)(dirty_target * bytes_max) / 100)
		LF_SET(WT_CACHE_EVICT_DIRTY);

	/*
	 * If application threads are blocked by the total volume of data in
	 * cache, try dirty pages as well.
	 */
	if (__wt_cache_aggressive(session) &&
	    LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD))
		LF_SET(WT_CACHE_EVICT_DIRTY);

	/* When we stop looking for dirty pages, reduce the lookaside score. */
	if (!LF_ISSET(WT_CACHE_EVICT_DIRTY))
		__wt_cache_update_lookaside_score(session, 1, 0);

	/*
	 * Scrub dirty pages and keep them in cache if we are less than half
	 * way to the clean or dirty trigger.
	 */
	if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200) {
		if (dirty_inuse < (uint64_t)
		    ((dirty_target + dirty_trigger) * bytes_max) / 200)
			LF_SET(WT_CACHE_EVICT_SCRUB);
	} else
		LF_SET(WT_CACHE_EVICT_NOKEEP);
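
	/*
	 * Worked example (illustrative, assuming the default clean target of
	 * 80% and trigger of 95%): (target + trigger) * bytes_max / 200 puts
	 * the scrub cutoff at 87.5% of the cache size, half way between the
	 * target and the trigger.
	 */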

	/*
	 * Try lookaside evict when:
	 * (1) the cache is stuck; OR
	 * (2) the lookaside score goes over 80; and
	 * (3) the cache is more than half way from the dirty target to the
	 *     dirty trigger.
	 */
	if (__wt_cache_stuck(session) ||
	    (__wt_cache_lookaside_score(cache) > 80 &&
	    dirty_inuse >
	    (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200))
		LF_SET(WT_CACHE_EVICT_LOOKASIDE);

	/*
	 * With an in-memory cache, we only do dirty eviction in order to scrub
	 * pages.
	 */
	if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
		if (LF_ISSET(WT_CACHE_EVICT_CLEAN))
			LF_SET(WT_CACHE_EVICT_DIRTY);
		if (LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD))
			LF_SET(WT_CACHE_EVICT_DIRTY_HARD);
		LF_CLR(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
	}

	/* Update the global eviction state. */
	cache->flags = flags;

	return (F_ISSET(cache, WT_CACHE_EVICT_ALL | WT_CACHE_EVICT_URGENT));
}

/*
 * __evict_pass --
 *	Evict pages from memory.
 */
static int
__evict_pass(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_TXN_GLOBAL *txn_global;
	uint64_t eviction_progress, oldest_id, prev_oldest_id;
	uint64_t time_now, time_prev;
	u_int loop;

	conn = S2C(session);
	cache = conn->cache;
	txn_global = &conn->txn_global;
	time_prev = 0;			/* [-Wconditional-uninitialized] */

	/* Track whether pages are being evicted and progress is made. */
	eviction_progress = cache->eviction_progress;
	prev_oldest_id = txn_global->oldest_id;

	/* Evict pages from the cache. */
	for (loop = 0; cache->pass_intr == 0; loop++) {
		time_now = __wt_clock(session);
		if (loop == 0)
			time_prev = time_now;

		__evict_tune_workers(session);
		/*
		 * Increment the shared read generation. Do this occasionally
		 * even if eviction is not currently required, so that pages
		 * have some relative read generation when the eviction server
		 * does need to do some work.
		 */
		__wt_cache_read_gen_incr(session);
		++cache->evict_pass_gen;

		/*
		 * Update the oldest ID: we use it to decide whether pages are
		 * candidates for eviction.  Without this, if all threads are
		 * blocked after a long-running transaction (such as a
		 * checkpoint) completes, we may never start evicting again.
		 *
		 * Do this every time the eviction server wakes up, regardless
		 * of whether the cache is full, to prevent the oldest ID
		 * falling too far behind.  Don't wait to lock the table: with
		 * highly threaded workloads, that creates a bottleneck.
		 */
		WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT));

		if (!__evict_update_work(session))
			break;

		__wt_verbose(session, WT_VERB_EVICTSERVER,
		    "Eviction pass with: Max: %" PRIu64
		    " In use: %" PRIu64 " Dirty: %" PRIu64,
		    conn->cache_size, cache->bytes_inmem,
		    cache->bytes_dirty_intl + cache->bytes_dirty_leaf);

		if (F_ISSET(cache, WT_CACHE_EVICT_ALL))
			WT_RET(__evict_lru_walk(session));

		/*
		 * If the queue has been empty recently, keep queuing more
		 * pages to evict.  If the rate of queuing pages is high
		 * enough, this score will go to zero, in which case the
		 * eviction server might as well help out with eviction.
		 *
		 * Also, if there is a single eviction server thread with no
		 * workers, it must service the urgent queue in case all
		 * application threads are busy.
		 */
		if (!WT_EVICT_HAS_WORKERS(session) &&
		    (cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF ||
		    !__evict_queue_empty(cache->evict_urgent_queue, false)))
			WT_RET(__evict_lru_pages(session, true));

		if (cache->pass_intr != 0)
			break;

		/*
		 * If we're making progress, keep going; if we're not making
		 * any progress at all, mark the cache "stuck" and go back to
		 * sleep, it's not something we can fix.
		 *
		 * We check for progress every 20ms, the idea being that the
		 * aggressive score will reach 10 after 200ms if we aren't
		 * making progress and eviction will start considering more
		 * pages.  If there is still no progress after 2s, we will
		 * treat the cache as stuck and start rolling back
		 * transactions and writing updates to the lookaside table.
		 */
		if (eviction_progress == cache->eviction_progress) {
			if (WT_CLOCKDIFF_MS(time_now, time_prev) >= 20 &&
			    F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD |
			    WT_CACHE_EVICT_DIRTY_HARD)) {
				if (cache->evict_aggressive_score < 100)
					++cache->evict_aggressive_score;
				oldest_id = txn_global->oldest_id;
				if (prev_oldest_id == oldest_id &&
				    txn_global->current != oldest_id &&
				    cache->evict_aggressive_score < 100)
					++cache->evict_aggressive_score;
				time_prev = time_now;
				prev_oldest_id = oldest_id;
			}

			/*
			 * Keep trying for long enough that we should be able
			 * to evict a page if the server isn't interfering.
			 */
			if (loop < 100 || cache->evict_aggressive_score < 100) {
				/*
				 * Back off if we aren't making progress: walks
				 * hold the handle list lock, blocking other
				 * operations that can free space in cache,
				 * such as LSM discarding handles.
				 *
				 * Allow this wait to be interrupted (e.g. if a
				 * checkpoint completes): make sure we wait for
				 * a non-zero number of microseconds.
				 */
				WT_STAT_CONN_INCR(session,
				    cache_eviction_server_slept);
				__wt_cond_wait(session,
				    cache->evict_cond, WT_THOUSAND, NULL);
				continue;
			}

			WT_STAT_CONN_INCR(session, cache_eviction_slow);
			__wt_verbose(session, WT_VERB_EVICTSERVER,
			    "%s", "unable to reach eviction goal");
			break;
		}
		if (cache->evict_aggressive_score > 0)
			--cache->evict_aggressive_score;
		loop = 0;
		eviction_progress = cache->eviction_progress;
	}
	return (0);
}

/*
 * __evict_clear_walk --
 *	Clear a single walk point.
 */
static int
__evict_clear_walk(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_REF *ref;

	btree = S2BT(session);
	cache = S2C(session)->cache;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_PASS));
	if (session->dhandle == cache->walk_tree)
		cache->walk_tree = NULL;

	if ((ref = btree->evict_ref) == NULL)
		return (0);

	WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
	WT_STAT_DATA_INCR(session, cache_eviction_walks_abandoned);

	/*
	 * Clear evict_ref before releasing it in case that forces eviction (we
	 * assert that we never try to evict the current eviction walk point).
	 */
	btree->evict_ref = NULL;

	WT_WITH_DHANDLE(cache->walk_session, session->dhandle,
	    (ret = __wt_page_release(cache->walk_session,
	    ref, WT_READ_NO_EVICT)));
	return (ret);
}

/*
 * __evict_clear_all_walks --
 *	Clear the eviction walk points for all files a session is waiting on.
 */
static int
__evict_clear_all_walks(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;

	conn = S2C(session);

	TAILQ_FOREACH(dhandle, &conn->dhqh, q)
		if (dhandle->type == WT_DHANDLE_TYPE_BTREE)
			WT_WITH_DHANDLE(session, dhandle,
			    WT_TRET(__evict_clear_walk(session)));
	return (ret);
}

/*
 * __wt_evict_file_exclusive_on --
 *	Get exclusive eviction access to a file and discard any of the file's
 *	blocks queued for eviction.
 */
int
__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_EVICT_ENTRY *evict;
	u_int i, elem, q;

	btree = S2BT(session);
	cache = S2C(session)->cache;

	/* Hold the walk lock to turn off eviction. */
	__wt_spin_lock(session, &cache->evict_walk_lock);
	if (++btree->evict_disabled > 1) {
		__wt_spin_unlock(session, &cache->evict_walk_lock);
		return (0);
	}

	/*
	 * Ensure no new pages from the file will be queued for eviction after
	 * this point, then clear any existing LRU eviction walk for the file.
	 */
	(void)__wt_atomic_addv32(&cache->pass_intr, 1);
	WT_WITH_PASS_LOCK(session, ret = __evict_clear_walk(session));
	(void)__wt_atomic_subv32(&cache->pass_intr, 1);
	WT_ERR(ret);

	/*
	 * The eviction candidate list might reference pages from the file,
	 * clear it. Hold the evict lock to remove queued pages from a file.
	 */
	__wt_spin_lock(session, &cache->evict_queue_lock);

	for (q = 0; q < WT_EVICT_QUEUE_MAX; q++) {
		__wt_spin_lock(session, &cache->evict_queues[q].evict_lock);
		elem = cache->evict_queues[q].evict_max;
		for (i = 0, evict = cache->evict_queues[q].evict_queue;
		    i < elem; i++, evict++)
			if (evict->btree == btree)
				__evict_list_clear(session, evict);
		__wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
	}

	__wt_spin_unlock(session, &cache->evict_queue_lock);

	/*
	 * We have disabled further eviction: wait for concurrent LRU eviction
	 * activity to drain.
	 */
	while (btree->evict_busy > 0)
		__wt_yield();

	if (0) {
err:		--btree->evict_disabled;
	}
	__wt_spin_unlock(session, &cache->evict_walk_lock);
	return (ret);
}

/*
 * __wt_evict_file_exclusive_off --
 *	Release exclusive eviction access to a file.
 */
void
__wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;

	btree = S2BT(session);

	/*
	 * We have seen subtle bugs with multiple threads racing to turn
	 * eviction on/off.  Make races more likely in diagnostic builds.
	 */
	WT_DIAGNOSTIC_YIELD;

	/*
	 * Atomically decrement the evict-disabled count, without acquiring the
	 * eviction walk-lock. We can't acquire that lock here because there's
	 * a potential deadlock. When acquiring exclusive eviction access, we
	 * acquire the eviction walk-lock and then the cache's pass-intr lock.
	 * The current eviction implementation can hold the pass-intr lock and
	 * call into this function (see WT-3303 for the details), which might
	 * deadlock with another thread trying to get exclusive eviction access.
	 */
#if defined(HAVE_DIAGNOSTIC)
	{
	int32_t v;

	WT_ASSERT(session, btree->evict_ref == NULL);
	v = __wt_atomic_subi32(&btree->evict_disabled, 1);
	WT_ASSERT(session, v >= 0);
	}
#else
	(void)__wt_atomic_subi32(&btree->evict_disabled, 1);
#endif
}
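
/*
 * Usage sketch (illustrative, hypothetical caller): operations that need a
 * quiesced file, such as closing it, bracket their work with the on/off
 * calls.  The calls nest safely because evict_disabled is a count:
 *
 *	WT_RET(__wt_evict_file_exclusive_on(session));
 *	... operate on the file with eviction disabled ...
 *	__wt_evict_file_exclusive_off(session);
 */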

#define	EVICT_TUNE_BATCH	1	/* Max workers to add each period */
/*
 * Data points needed before deciding if we should keep adding workers or
 * settle on an earlier value.
 */
#define	EVICT_TUNE_DATAPT_MIN	8
#define	EVICT_TUNE_PERIOD	60	/* Tune period in milliseconds */

/*
 * We will do a fresh re-tune every EVICT_FORCE_RETUNE milliseconds to adjust
 * to significant phase changes.
 */
#define	EVICT_FORCE_RETUNE	25000

/*
 * __evict_tune_workers --
 *	Find the right number of eviction workers. Gradually ramp up the
 *	number of workers, increasing the number in batches indicated by the
 *	setting above.  Store the number of workers that gave us the best
 *	throughput so far, and the number of data points we have tried.
 *
 *	Every once in a while, when we have the minimum number of data points,
 *	we check whether the eviction throughput achieved with the current
 *	number of workers is the best we have seen so far.  If so, we will
 *	keep increasing the number of workers.  If not, we are past the
 *	inflection point on the eviction throughput curve.  In that case, we
 *	will set the number of workers to the best observed so far and settle
 *	into a stable state.
 */
static void
__evict_tune_workers(WT_SESSION_IMPL *session)
{
	struct timespec current_time;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	uint64_t delta_msec, delta_pages;
	uint64_t eviction_progress, eviction_progress_rate, time_diff;
	int32_t cur_threads, i, target_threads, thread_surplus;

	conn = S2C(session);
	cache = conn->cache;

	/*
	 * If we have a fixed number of eviction threads, there is no value in
	 * calculating if we should do any tuning.
	 */
	if (conn->evict_threads_max == conn->evict_threads_min)
		return;

	__wt_epoch(session, &current_time);
	time_diff = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time);

	/*
	 * If we have reached the stable state and have not run long enough to
	 * surpass the forced re-tuning threshold, return.
	 */
	if (cache->evict_tune_stable) {
		if (time_diff < EVICT_FORCE_RETUNE)
			return;

		/*
		 * Stable state was reached a long time ago. Let's re-tune.
		 * Reset all the state.
		 */
		cache->evict_tune_stable = false;
		cache->evict_tune_last_action_time.tv_sec = 0;
		cache->evict_tune_progress_last = 0;
		cache->evict_tune_num_points = 0;
		cache->evict_tune_progress_rate_max = 0;

		/* Reduce the number of eviction workers by one. */
		thread_surplus =
		    (int32_t)conn->evict_threads.current_threads -
		    (int32_t)conn->evict_threads_min;

		if (thread_surplus > 0) {
			__wt_thread_group_stop_one(
			    session, &conn->evict_threads);
			WT_STAT_CONN_INCR(session,
			    cache_eviction_worker_removed);
		}
		WT_STAT_CONN_INCR(session, cache_eviction_force_retune);
	} else
		if (time_diff < EVICT_TUNE_PERIOD)
			/*
			 * If we have not reached stable state, don't do
			 * anything unless enough time has passed since the last
			 * time we have taken any action in this function.
			 */
			return;

	/*
	 * Measure the evicted progress so far. Eviction rate correlates to
	 * performance, so this is our metric of success.
	 */
	eviction_progress = cache->eviction_progress;

	/*
	 * If we have recorded the number of pages evicted at the end of
	 * the previous measurement interval, we can compute the eviction
	 * rate in evicted pages per second achieved during the current
	 * measurement interval.
	 * Otherwise, we just record the number of evicted pages and return.
	 */
	if (cache->evict_tune_progress_last == 0)
		goto done;

	delta_msec = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time);
	delta_pages = eviction_progress - cache->evict_tune_progress_last;
	eviction_progress_rate = (delta_pages * WT_THOUSAND) / delta_msec;
	cache->evict_tune_num_points++;

	/*
	 * Keep track of the maximum eviction throughput seen and the number
	 * of workers corresponding to that throughput.
	 */
	if (eviction_progress_rate > cache->evict_tune_progress_rate_max) {
		cache->evict_tune_progress_rate_max = eviction_progress_rate;
		cache->evict_tune_workers_best =
		    conn->evict_threads.current_threads;
	}

	/*
	 * Compare the current number of data points with the number of data
	 * points needed.  If they are equal, we will check whether we are
	 * still going up on the performance curve, in which case we will
	 * increase the number of needed data points, to provide an
	 * opportunity for further increasing the number of workers.  Or we
	 * are past the inflection point on the curve, in which case we will
	 * go back to the best observed number of workers and settle into a
	 * stable state.
	 */
	if (cache->evict_tune_num_points >= cache->evict_tune_datapts_needed) {
		if (cache->evict_tune_workers_best ==
		    conn->evict_threads.current_threads &&
		    conn->evict_threads.current_threads <
		    conn->evict_threads_max) {
			/*
			 * Keep adding workers. We will check again
			 * at the next check point.
			 */
			cache->evict_tune_datapts_needed += WT_MIN(
			    EVICT_TUNE_DATAPT_MIN,
			    (conn->evict_threads_max -
			    conn->evict_threads.current_threads) /
			    EVICT_TUNE_BATCH);
		} else {
			/*
			 * We are past the inflection point. Choose the
			 * best number of eviction workers observed and
			 * settle into a stable state.
			 */
			thread_surplus =
			    (int32_t)conn->evict_threads.current_threads -
			    (int32_t)cache->evict_tune_workers_best;

			for (i = 0; i < thread_surplus; i++) {
				__wt_thread_group_stop_one(
				    session, &conn->evict_threads);
				WT_STAT_CONN_INCR(session,
				    cache_eviction_worker_removed);
			}
			cache->evict_tune_stable = true;
			goto done;
		}
	}

	/*
	 * If we have not added any worker threads in the past, we set the
	 * number of data points needed equal to the number of data points that
	 * we must accumulate before deciding if we should keep adding workers
	 * or settle on a previously tried stable number of workers.
	 */
	if (cache->evict_tune_last_action_time.tv_sec == 0)
		cache->evict_tune_datapts_needed = EVICT_TUNE_DATAPT_MIN;

	if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) {
		cur_threads = (int32_t)conn->evict_threads.current_threads;
		target_threads = WT_MIN(cur_threads + EVICT_TUNE_BATCH,
		    (int32_t)conn->evict_threads_max);
		/*
		 * Start the new threads.
		 */
		for (i = cur_threads; i < target_threads; ++i) {
			__wt_thread_group_start_one(session,
			    &conn->evict_threads, false);
			WT_STAT_CONN_INCR(session,
			    cache_eviction_worker_created);
			__wt_verbose(session,
			    WT_VERB_EVICTSERVER, "%s", "added worker thread");
		}
		cache->evict_tune_last_action_time = current_time;
	}

done:	cache->evict_tune_last_time = current_time;
	cache->evict_tune_progress_last = eviction_progress;
}
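
/*
 * Worked example (illustrative, hypothetical configuration): with
 * evict_threads_min=1 and evict_threads_max=8, the tuner samples the
 * eviction rate every EVICT_TUNE_PERIOD (60ms) and, while eviction is
 * active, adds EVICT_TUNE_BATCH (1) worker per period.  Once it has
 * evict_tune_datapts_needed samples it either keeps ramping (if the
 * current worker count is still the best observed) or stops the surplus
 * workers and settles on the best count, until the EVICT_FORCE_RETUNE
 * (25s) re-tune resets the process.
 */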

/*
 * __evict_lru_pages --
 *	Get pages from the LRU queue to evict.
 */
static int
__evict_lru_pages(WT_SESSION_IMPL *session, bool is_server)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TRACK_OP_DECL;

	WT_TRACK_OP_INIT(session);
	conn = S2C(session);

	/*
	 * Reconcile and discard some pages: EBUSY is returned if a page fails
	 * eviction because it's unavailable, continue in that case.
	 */
	while (F_ISSET(conn, WT_CONN_EVICTION_RUN) && ret == 0)
		if ((ret = __evict_page(session, is_server)) == EBUSY)
			ret = 0;

	/* If a worker thread found the queue empty, pause. */
	if (ret == WT_NOTFOUND && !is_server &&
	    F_ISSET(conn, WT_CONN_EVICTION_RUN))
		__wt_cond_wait(
		    session, conn->evict_threads.wait_cond, 10000, NULL);

	WT_TRACK_OP_END(session);
	return (ret == WT_NOTFOUND ? 0 : ret);
}

/*
 * __evict_lru_walk --
 *	Add pages to the LRU queue to be evicted from cache.
 */
static int
__evict_lru_walk(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_EVICT_QUEUE *queue, *other_queue;
	WT_TRACK_OP_DECL;
	uint64_t read_gen_oldest;
	uint32_t candidates, entries;

	WT_TRACK_OP_INIT(session);
	cache = S2C(session)->cache;

	/* Age out the score of how much the queue has been empty recently. */
	if (cache->evict_empty_score > 0)
		--cache->evict_empty_score;

	/* Fill the next queue (that isn't the urgent queue). */
	queue = cache->evict_fill_queue;
	other_queue = cache->evict_queues + (1 - (queue - cache->evict_queues));
	cache->evict_fill_queue = other_queue;

	/* If this queue is full, try the other one. */
	if (__evict_queue_full(queue) && !__evict_queue_full(other_queue))
		queue = other_queue;

	/*
	 * If both queues are full and haven't been empty on recent refills,
	 * we're done.
	 */
	if (__evict_queue_full(queue) &&
	    cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF)
		goto err;

	/*
	 * If the queue we are filling is empty, pages are being requested
	 * faster than they are being queued.
	 */
	if (__evict_queue_empty(queue, false)) {
		if (F_ISSET(cache,
		    WT_CACHE_EVICT_CLEAN_HARD | WT_CACHE_EVICT_DIRTY_HARD))
			cache->evict_empty_score = WT_MIN(
			    cache->evict_empty_score + WT_EVICT_SCORE_BUMP,
			    WT_EVICT_SCORE_MAX);
		WT_STAT_CONN_INCR(session, cache_eviction_queue_empty);
	} else
		WT_STAT_CONN_INCR(session, cache_eviction_queue_not_empty);

	/*
	 * Get some more pages to consider for eviction.
	 *
	 * If the walk is interrupted, we still need to sort the queue: the
	 * next walk assumes there are no entries beyond WT_EVICT_WALK_BASE.
	 */
	if ((ret = __evict_walk(cache->walk_session, queue)) == EBUSY)
		ret = 0;
	WT_ERR_NOTFOUND_OK(ret);

	/* Sort the list into LRU order and restart. */
	__wt_spin_lock(session, &queue->evict_lock);

	/*
	 * We have locked the queue: in the (unusual) case where we are filling
	 * the current queue, mark it empty so that subsequent requests switch
	 * to the other queue.
	 */
	if (queue == cache->evict_current_queue)
		queue->evict_current = NULL;

	entries = queue->evict_entries;
	__wt_qsort(queue->evict_queue,
	    entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);

	/* Trim empty entries from the end. */
	while (entries > 0 && queue->evict_queue[entries - 1].ref == NULL)
		--entries;

	/*
	 * If we have more entries than the maximum tracked between walks,
	 * clear them.  Do this before figuring out how many of the entries are
	 * candidates so we never end up with more candidates than entries.
	 */
	while (entries > WT_EVICT_WALK_BASE)
		__evict_list_clear(session, &queue->evict_queue[--entries]);

	queue->evict_entries = entries;

	if (entries == 0) {
		/*
		 * If there are no entries, there cannot be any candidates.
		 * Make sure application threads don't read past the end of the
		 * candidate list, or they may race with the next walk.
		 */
		queue->evict_candidates = 0;
		queue->evict_current = NULL;
		__wt_spin_unlock(session, &queue->evict_lock);
		goto err;
	}

	/* Decide how many of the candidates we're going to try and evict. */
	if (__wt_cache_aggressive(session))
		queue->evict_candidates = entries;
	else {
		/*
		 * Find the oldest read generation we have in the queue, used
		 * to set the initial value for pages read into the system.
		 * The queue is sorted, find the first "normal" generation.
		 */
		read_gen_oldest = WT_READGEN_START_VALUE;
		for (candidates = 0; candidates < entries; ++candidates) {
			read_gen_oldest = queue->evict_queue[candidates].score;
			if (!WT_READGEN_EVICT_SOON(read_gen_oldest))
				break;
		}

		/*
		 * Take all candidates if we only gathered pages with an oldest
		 * read generation set.
		 *
		 * We normally never take more than 50% of the entries but if
		 * 50% of the entries were at the oldest read generation, take
		 * all of them.
		 */
		if (WT_READGEN_EVICT_SOON(read_gen_oldest))
			queue->evict_candidates = entries;
		else if (candidates > entries / 2)
			queue->evict_candidates = candidates;
		else {
			/*
			 * Take all of the urgent pages plus a third of
			 * ordinary candidates (which could be expressed as
			 * WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE).  In the
			 * steady state, we want to get as many candidates as
			 * the eviction walk adds to the queue.
			 *
			 * That said, if there is only one entry, which is
			 * normal when populating an empty file, don't exclude
			 * it.
			 */
			queue->evict_candidates =
			    1 + candidates + ((entries - candidates) - 1) / 3;
			cache->read_gen_oldest = read_gen_oldest;
		}
	}
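
	/*
	 * Worked example (illustrative): with 40 entries of which 4 are at
	 * the oldest read generation, candidates is 4 after the loop above,
	 * and the queue keeps 1 + 4 + ((40 - 4) - 1) / 3 = 16 candidates.
	 */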

	queue->evict_current = queue->evict_queue;
	__wt_spin_unlock(session, &queue->evict_lock);

	/*
	 * Signal any application or helper threads that may be waiting
	 * to help with eviction.
	 */
	__wt_cond_signal(session, S2C(session)->evict_threads.wait_cond);

err:	WT_TRACK_OP_END(session);
	return (ret);
}

/*
 * __evict_walk --
 *	Fill in the array by walking the next set of pages.
 */
static int
__evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	WT_TRACK_OP_DECL;
	u_int max_entries, retries, slot, start_slot, total_candidates;
	bool dhandle_locked, incr;

	WT_TRACK_OP_INIT(session);

	conn = S2C(session);
	cache = conn->cache;
	btree = NULL;
	dhandle = NULL;
	dhandle_locked = incr = false;
	retries = 0;

	/*
	 * Set the starting slot in the queue and the maximum pages added
	 * per walk.
	 */
	start_slot = slot = queue->evict_entries;
	max_entries = WT_MIN(slot + WT_EVICT_WALK_INCR, cache->evict_slots);

	/*
	 * Another pathological case: if there are only a tiny number of
	 * candidate pages in cache, don't put all of them on one queue.
	 */
	total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ?
	    __wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf);
	max_entries = WT_MIN(max_entries, 1 + total_candidates / 2);

retry:	while (slot < max_entries) {
		/*
		 * If another thread is waiting on the eviction server to clear
		 * the walk point in a tree, give up.
		 */
		if (cache->pass_intr != 0)
			WT_ERR(EBUSY);

		/*
		 * Lock the dhandle list to find the next handle and bump its
		 * reference count to keep it alive while we sweep.
		 */
		if (!dhandle_locked) {
			WT_ERR(__evict_lock_handle_list(session));
			dhandle_locked = true;
		}

		if (dhandle == NULL) {
			/*
			 * On entry, continue from wherever we got to in the
			 * scan last time through.  If we don't have a saved
			 * handle, start from the beginning of the list.
			 */
			if ((dhandle = cache->walk_tree) != NULL)
				cache->walk_tree = NULL;
			else
				dhandle = TAILQ_FIRST(&conn->dhqh);
		} else {
			if (incr) {
				WT_ASSERT(session, dhandle->session_inuse > 0);
				(void)__wt_atomic_subi32(
				    &dhandle->session_inuse, 1);
				incr = false;
				cache->walk_tree = NULL;
			}
			dhandle = TAILQ_NEXT(dhandle, q);
		}

		/* If we reach the end of the list, we're done. */
		if (dhandle == NULL)
			break;

		/* Ignore non-btree handles, or handles that aren't open. */
		if (dhandle->type != WT_DHANDLE_TYPE_BTREE ||
		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
			continue;

		/* Skip files that don't allow eviction. */
		btree = dhandle->handle;
		if (btree->evict_disabled > 0)
			continue;

		/*
		 * Skip files that are checkpointing if we are only looking for
		 * dirty pages.
		 */
		if (WT_BTREE_SYNCING(btree) &&
		    !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
			continue;

		/*
		 * Skip files that are configured to stick in cache until we
		 * become aggressive.
		 */
		if (btree->evict_priority != 0 &&
		    !__wt_cache_aggressive(session))
			continue;

		/*
		 * Skip files if we have too many active walks.
		 *
		 * This used to be limited by the configured maximum number of
		 * hazard pointers per session.  Even though that ceiling has
		 * been removed, we need to test eviction with huge numbers of
		 * active trees before allowing larger numbers of hazard
		 * pointers in the walk session.
		 */
		if (btree->evict_ref == NULL &&
		    session->nhazard > WT_EVICT_MAX_TREES)
			continue;

		/*
		 * If we are filling the queue, skip files that haven't been
		 * useful in the past.
		 */
		if (btree->evict_walk_period != 0 &&
		    btree->evict_walk_skips++ < btree->evict_walk_period)
			continue;
		btree->evict_walk_skips = 0;

		(void)__wt_atomic_addi32(&dhandle->session_inuse, 1);
		incr = true;
		__wt_readunlock(session, &conn->dhandle_lock);
		dhandle_locked = false;

		/*
		 * Re-check the "no eviction" flag, used to enforce exclusive
		 * access when a handle is being closed.
		 *
		 * Only try to acquire the lock and simply continue if we fail;
		 * the lock is held while the thread turning off eviction clears
		 * the tree's current eviction point, and part of the process is
		 * waiting on this thread to acknowledge that action.
		 *
		 * If a handle is being discarded, it will still be marked open,
		 * but won't have a root page.
		 */
		if (btree->evict_disabled == 0 &&
		    !__wt_spin_trylock(session, &cache->evict_walk_lock)) {
			if (btree->evict_disabled == 0 &&
			    btree->root.page != NULL) {
				/*
				 * Remember the file to visit first, next loop.
				 */
				cache->walk_tree = dhandle;
				WT_WITH_DHANDLE(session, dhandle,
				    ret = __evict_walk_tree(
				    session, queue, max_entries, &slot));

				WT_ASSERT(session, __wt_session_gen(
				    session, WT_GEN_SPLIT) == 0);
			}
			__wt_spin_unlock(session, &cache->evict_walk_lock);
			WT_ERR(ret);
		}
	}

	if (incr) {
		WT_ASSERT(session, dhandle->session_inuse > 0);
		(void)__wt_atomic_subi32(&dhandle->session_inuse, 1);
		incr = false;
	}

	/*
	 * Walk the list of files a few times if we don't find enough pages.
	 * Try two passes through all the files, give up when we have some
	 * candidates and we aren't finding more.
	 */
	if (slot < max_entries && (retries < 2 ||
	    (retries < WT_RETRY_MAX &&
	    (slot == queue->evict_entries || slot > start_slot)))) {
		start_slot = slot;
		++retries;
		goto retry;
	}

err:	if (dhandle_locked)
		__wt_readunlock(session, &conn->dhandle_lock);

	/*
	 * If we didn't find any entries on a walk when we weren't interrupted,
	 * let our caller know.
	 */
	if (queue->evict_entries == slot && cache->pass_intr == 0)
		ret = WT_NOTFOUND;

	queue->evict_entries = slot;
	WT_TRACK_OP_END(session);
	return (ret);
}

/*
 * __evict_push_candidate --
 *	Initialize a WT_EVICT_ENTRY structure with a given page.
 */
static bool
__evict_push_candidate(WT_SESSION_IMPL *session,
    WT_EVICT_QUEUE *queue, WT_EVICT_ENTRY *evict, WT_REF *ref)
{
	uint8_t orig_flags, new_flags;
	u_int slot;

	/*
	 * Threads can race to queue a page (e.g., an ordinary LRU walk can
	 * race with a page being queued for urgent eviction).
	 */
	orig_flags = new_flags = ref->page->flags_atomic;
	FLD_SET(new_flags, WT_PAGE_EVICT_LRU);
	if (orig_flags == new_flags ||
	    !__wt_atomic_cas8(&ref->page->flags_atomic, orig_flags, new_flags))
		return (false);

	/* Keep track of the maximum slot we are using. */
	slot = (u_int)(evict - queue->evict_queue);
	if (slot >= queue->evict_max)
		queue->evict_max = slot + 1;

	if (evict->ref != NULL)
		__evict_list_clear(session, evict);

	evict->btree = S2BT(session);
	evict->ref = ref;
	evict->score = __evict_entry_priority(session, ref);

	/* Adjust for size when doing dirty eviction. */
	if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DIRTY) &&
	    evict->score != WT_READGEN_OLDEST && evict->score != UINT64_MAX &&
	    !__wt_page_is_modified(ref->page))
		evict->score += WT_MEGABYTE -
		    WT_MIN(WT_MEGABYTE, ref->page->memory_footprint);

	return (true);
}
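
/*
 * Worked example (illustrative): during dirty eviction, a clean 4KB page
 * has roughly WT_MEGABYTE - 4KB added to its score while a clean page of
 * 1MB or more gets no penalty, so small clean pages sort behind the dirty
 * and large clean pages we would rather evict.
 */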

/*
 * __evict_walk_target --
 *	Calculate how many pages to queue for a given tree.
 */
static uint32_t
__evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
{
	WT_CACHE *cache;
	uint64_t btree_inuse, bytes_per_slot, cache_inuse;
	uint32_t target_pages_clean, target_pages_dirty, target_pages;
	uint32_t total_slots;

	cache = S2C(session)->cache;
	target_pages_clean = target_pages_dirty = 0;
	total_slots = max_entries;

	/*
	 * The number of times we should fill the queue by the end of
	 * considering all trees.
	 */
#define	QUEUE_FILLS_PER_PASS	10

	/*
	 * The minimum number of pages we should consider per tree.
	 */
#define	MIN_PAGES_PER_TREE	10

	/*
	 * The target number of pages for this tree is proportional to the
	 * space it is taking up in cache.  Round to the nearest number of
	 * slots so we assign all of the slots to a tree filling 99+% of the
	 * cache (and only have to walk it once).
	 */
	if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
		btree_inuse = __wt_btree_bytes_evictable(session);
		cache_inuse = __wt_cache_bytes_inuse(cache);
		bytes_per_slot = 1 + cache_inuse / total_slots;
		target_pages_clean = (uint32_t)(
		    (btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
	}

	if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) {
		btree_inuse = __wt_btree_dirty_leaf_inuse(session);
		cache_inuse = __wt_cache_dirty_leaf_inuse(cache);
		bytes_per_slot = 1 + cache_inuse / total_slots;
		target_pages_dirty = (uint32_t)(
		    (btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
	}

	/*
	 * Weight the number of target pages by the number of times we want to
	 * fill the cache per pass through all the trees.  Note that we don't
	 * build this into the calculation above because we don't want to favor
	 * small trees, so round to a whole number of slots (zero for small
	 * trees) before multiplying.
	 */
	target_pages = WT_MAX(target_pages_clean, target_pages_dirty) *
	    QUEUE_FILLS_PER_PASS;

	/*
	 * Walk trees with a small fraction of the cache in case there are so
	 * many trees that none of them use enough of the cache to be allocated
	 * slots.  Only skip a tree if it has no bytes of interest.
	 */
	if (target_pages == 0) {
		btree_inuse = F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ?
		    __wt_btree_bytes_evictable(session) :
		    __wt_btree_dirty_leaf_inuse(session);

		if (btree_inuse == 0)
			return (0);
	}

	/*
	 * There is some cost associated with walking a tree.  If we're going
	 * to visit this tree, always look for a minimum number of pages.
	 */
	if (target_pages < MIN_PAGES_PER_TREE)
		target_pages = MIN_PAGES_PER_TREE;

	/* If the tree is dead, take a lot of pages.  */
	if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
		target_pages *= 10;

	return (target_pages);
}
1691 
1692 /*
1693  * __evict_walk_tree --
1694  *	Get a few page eviction candidates from a single underlying file.
1695  */
1696 static int
1697 __evict_walk_tree(WT_SESSION_IMPL *session,
1698     WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp)
1699 {
1700 	WT_BTREE *btree;
1701 	WT_CACHE *cache;
1702 	WT_CONNECTION_IMPL *conn;
1703 	WT_DECL_RET;
1704 	WT_EVICT_ENTRY *end, *evict, *start;
1705 	WT_PAGE *last_parent, *page;
1706 	WT_REF *ref;
1707 	uint64_t min_pages, pages_seen, pages_queued, refs_walked;
1708 	uint32_t read_flags, remaining_slots, target_pages, walk_flags;
1709 	int restarts;
1710 	bool give_up, modified, urgent_queued;
1711 
1712 	conn = S2C(session);
1713 	btree = S2BT(session);
1714 	cache = conn->cache;
1715 	last_parent = NULL;
1716 	restarts = 0;
1717 	give_up = urgent_queued = false;
1718 
1719 	/*
1720 	 * Figure out how many slots to fill from this tree.
1721 	 * Note that some care is taken in the calculation to avoid overflow.
1722 	 */
1723 	start = queue->evict_queue + *slotp;
1724 	remaining_slots = max_entries - *slotp;
1725 	if (btree->evict_walk_progress >= btree->evict_walk_target) {
1726 		btree->evict_walk_target =
1727 		    __evict_walk_target(session, max_entries);
1728 		btree->evict_walk_progress = 0;
1729 	}
1730 	target_pages = WT_MIN(btree->evict_walk_target / QUEUE_FILLS_PER_PASS,
1731 	    btree->evict_walk_target - btree->evict_walk_progress);
1732 
1733 	if (target_pages > remaining_slots)
1734 		target_pages = remaining_slots;
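	/*
	 * For example, with an overall walk target of 100 pages and 95 pages
	 * of progress already made, this walk asks for WT_MIN(100 / 10,
	 * 100 - 95), that is 5 pages, further capped by the queue slots
	 * remaining.
	 */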
1735 
1736 	/* If we don't want any pages from this tree, move on. */
1737 	if (target_pages == 0)
1738 		return (0);
1739 
1740 	/*
1741 	 * These statistics generate a histogram of the number of pages targeted
1742 	 * for eviction each round. The range of values here starts at
1743 	 * MIN_PAGES_PER_TREE as this is the smallest number of pages we can
1744 	 * target, unless there are fewer slots available. The aim is to cover
1745 	 * the likely ranges of target pages in as few statistics as possible to
1746 	 * reduce the overall overhead.
1747 	 */
1748 	if (target_pages < MIN_PAGES_PER_TREE) {
1749 		WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt10);
1750 		WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt10);
1751 	} else if (target_pages < 32) {
1752 		WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt32);
1753 		WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt32);
1754 	} else if (target_pages < 64) {
1755 		WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt64);
1756 		WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt64);
1757 	} else if (target_pages < 128) {
1758 		WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt128);
1759 		WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt128);
1760 	} else {
1761 		WT_STAT_CONN_INCR(session, cache_eviction_target_page_ge128);
1762 		WT_STAT_DATA_INCR(session, cache_eviction_target_page_ge128);
1763 	}
1764 
1765 	end = start + target_pages;
1766 
1767 	/*
1768 	 * Examine at least a reasonable number of pages before deciding
1769 	 * whether to give up.  When we are only looking for dirty pages,
1770 	 * search the tree for longer.
1771 	 */
1772 	min_pages = 10 * (uint64_t)target_pages;
1773 	if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY) &&
1774 	    !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
1775 		min_pages *= 10;
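	/*
	 * For example, a target of 10 pages means at least 100 pages are
	 * examined before giving up, or 1,000 when only dirty pages are of
	 * interest.
	 */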
1776 
1777 	if (btree->evict_ref == NULL) {
1778 		WT_STAT_CONN_INCR(session, cache_eviction_walk_from_root);
1779 		WT_STAT_DATA_INCR(session, cache_eviction_walk_from_root);
1780 	} else {
1781 		WT_STAT_CONN_INCR(session, cache_eviction_walk_saved_pos);
1782 		WT_STAT_DATA_INCR(session, cache_eviction_walk_saved_pos);
1783 	}
1784 
1785 	walk_flags =
1786 	    WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
1787 
1788 	/*
1789 	 * Choose a random point in the tree when looking for candidates and
1790 	 * no starting point is set. This is mostly aimed at ensuring
1791 	 * eviction fairly visits all pages in trees with a lot of in-cache
1792 	 * content.
1793 	 */
1794 	switch (btree->evict_start_type) {
1795 	case WT_EVICT_WALK_NEXT:
1796 		break;
1797 	case WT_EVICT_WALK_PREV:
1798 		FLD_SET(walk_flags, WT_READ_PREV);
1799 		break;
1800 	case WT_EVICT_WALK_RAND_PREV:
1801 		FLD_SET(walk_flags, WT_READ_PREV);
1802 		/* FALLTHROUGH */
1803 	case WT_EVICT_WALK_RAND_NEXT:
1804 		read_flags = WT_READ_CACHE | WT_READ_NO_EVICT |
1805 			WT_READ_NO_GEN | WT_READ_NO_WAIT |
1806 			WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK;
1807 		if (btree->evict_ref == NULL) {
1808 			/* Ensure internal page indexes remain valid. */
1809 			WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent(
1810 			    session, &btree->evict_ref, read_flags));
1811 			WT_RET_NOTFOUND_OK(ret);
1812 		}
1813 		break;
1814 	}
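	/*
	 * For both random start types, __wt_random_descent chooses the
	 * initial position; the two differ only in the direction the walk
	 * then proceeds.
	 */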
1815 
1816 	/*
1817 	 * Get some more eviction candidate pages, starting at the last saved
1818 	 * point. Clear the saved point immediately: when discarding pages we
1819 	 * assert that we're not discarding an eviction point, so this clear
1820 	 * must be complete before the page is released.
1821 	 */
1822 	ref = btree->evict_ref;
1823 	btree->evict_ref = NULL;
1824 
1825 	/*
1826 	 * !!! Take care terminating this loop.
1827 	 *
1828 	 * Don't make an extra call to __wt_tree_walk after we hit the end of a
1829 	 * tree: that will leave a page pinned, which may prevent any work from
1830 	 * being done.
1831 	 *
1832 	 * Once we hit the page limit, do one more step through the walk in
1833 	 * case we are appending and only the last page in the file is live.
1834 	 */
1835 	for (evict = start, pages_queued = pages_seen = refs_walked = 0;
1836 	    evict < end && (ret == 0 || ret == WT_NOTFOUND);
1837 	    last_parent = ref == NULL ? NULL : ref->home,
1838 	    ret = __wt_tree_walk_count(
1839 	    session, &ref, &refs_walked, walk_flags)) {
1840 		/*
1841 		 * Check whether we're finding a good ratio of candidates vs
1842 		 * pages seen.  Some workloads create "deserts" in trees where
1843 		 * no good eviction candidates can be found.  Abandon the walk
1844 		 * if we get into that situation.
1845 		 */
1846 		give_up = !__wt_cache_aggressive(session) &&
1847 		    !F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
1848 		    pages_seen > min_pages &&
1849 		    (pages_queued == 0 || (pages_seen / pages_queued) >
1850 		    (min_pages / target_pages));
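		/*
		 * With the values above, min_pages / target_pages is 10 (100
		 * in dirty-only mode), so, for example, a walk is abandoned
		 * once more than min_pages have been seen and fewer than one
		 * page in ten seen has been queued.
		 */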
1851 		if (give_up) {
1852 			/*
1853 			 * Try a different walk start point next time if a
1854 			 * walk gave up.
1855 			 */
1856 			switch (btree->evict_start_type) {
1857 			case WT_EVICT_WALK_NEXT:
1858 				btree->evict_start_type = WT_EVICT_WALK_PREV;
1859 				break;
1860 			case WT_EVICT_WALK_PREV:
1861 				btree->evict_start_type =
1862 				    WT_EVICT_WALK_RAND_PREV;
1863 				break;
1864 			case WT_EVICT_WALK_RAND_PREV:
1865 				btree->evict_start_type =
1866 				    WT_EVICT_WALK_RAND_NEXT;
1867 				break;
1868 			case WT_EVICT_WALK_RAND_NEXT:
1869 				btree->evict_start_type = WT_EVICT_WALK_NEXT;
1870 				break;
1871 			}
1872 
1873 			/*
1874 			 * We differentiate the reasons we gave up on this walk
1875 			 * and increment the stats accordingly.
1876 			 */
1877 			if (pages_queued == 0) {
1878 				WT_STAT_CONN_INCR(session,
1879 				    cache_eviction_walks_gave_up_no_targets);
1880 				WT_STAT_DATA_INCR(session,
1881 				    cache_eviction_walks_gave_up_no_targets);
1882 			} else {
1883 				WT_STAT_CONN_INCR(session,
1884 				    cache_eviction_walks_gave_up_ratio);
1885 				WT_STAT_DATA_INCR(session,
1886 				    cache_eviction_walks_gave_up_ratio);
1887 			}
1888 			break;
1889 		}
1890 
1891 		if (ref == NULL) {
1892 			WT_STAT_CONN_INCR(session, cache_eviction_walks_ended);
1893 			WT_STAT_DATA_INCR(session, cache_eviction_walks_ended);
1894 
1895 			if (++restarts == 2) {
1896 				WT_STAT_CONN_INCR(
1897 				    session, cache_eviction_walks_stopped);
1898 				WT_STAT_DATA_INCR(
1899 				    session, cache_eviction_walks_stopped);
1900 				break;
1901 			}
1902 			WT_STAT_CONN_INCR(
1903 			    session, cache_eviction_walks_started);
1904 			continue;
1905 		}
1906 
1907 		++pages_seen;
1908 
1909 		/* Ignore root pages entirely. */
1910 		if (__wt_ref_is_root(ref))
1911 			continue;
1912 
1913 		page = ref->page;
1914 		modified = __wt_page_is_modified(page);
1915 		page->evict_pass_gen = cache->evict_pass_gen;
1916 
1917 		/*
1918 		 * Use the EVICT_LRU flag to avoid putting pages onto the list
1919 		 * multiple times.
1920 		 */
1921 		if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
1922 			continue;
1923 
1924 		/* Don't queue dirty pages while their tree is being checkpointed. */
1925 		if (modified && WT_BTREE_SYNCING(btree))
1926 			continue;
1927 
1928 		/*
1929 		 * It's possible (but unlikely) to visit a page without a read
1930 		 * generation, if we race with the read instantiating the page.
1931 		 * Set the page's read generation here to ensure a bug doesn't
1932 		 * somehow leave a page without a read generation.
1933 		 */
1934 		if (page->read_gen == WT_READGEN_NOTSET)
1935 			__wt_cache_read_gen_new(session, page);
1936 
1937 		/* Pages being forcibly evicted go on the urgent queue. */
1938 		if (page->read_gen == WT_READGEN_OLDEST ||
1939 		    page->memory_footprint >= btree->splitmempage) {
1940 			WT_STAT_CONN_INCR(
1941 			    session, cache_eviction_pages_queued_oldest);
1942 			if (__wt_page_evict_urgent(session, ref))
1943 				urgent_queued = true;
1944 			continue;
1945 		}
1946 
1947 		/*
1948 		 * Pages that are empty or from dead trees are fast-tracked.
1949 		 *
1950 		 * Also evict lookaside table pages without further filtering:
1951 		 * the cache is under pressure by definition and we want to
1952 		 * free space.
1953 		 */
1954 		if (__wt_page_is_empty(page) ||
1955 		    F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
1956 		    F_ISSET(btree, WT_BTREE_LOOKASIDE))
1957 			goto fast;
1958 
1959 		/*
1960 		 * If application threads are blocked on eviction of clean
1961 		 * pages, and the only thing preventing a clean leaf page from
1962 		 * being evicted is it contains historical data, mark it dirty
1963 		 * so we can do lookaside eviction.  We also mark the tree
1964 		 * dirty to avoid an assertion that we don't discard dirty
1965 		 * pages from a clean tree.
1966 		 */
1967 		if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD) &&
1968 		    !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) &&
1969 		    !WT_PAGE_IS_INTERNAL(page) &&
1970 		    !modified && page->modify != NULL &&
1971 		    !__wt_txn_visible_all(session, page->modify->rec_max_txn,
1972 		    WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp))) {
1973 			__wt_page_modify_set(session, page);
1974 			goto fast;
1975 		}
1976 
1977 		/* Skip clean pages if appropriate. */
1978 		if (!modified && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
1979 			continue;
1980 
1981 		/* Skip dirty pages if appropriate. */
1982 		if (modified && !F_ISSET(cache, WT_CACHE_EVICT_DIRTY))
1983 			continue;
1984 
1985 		/*
1986 		 * Don't attempt eviction of internal pages with children in
1987 		 * cache (indicated by seeing an internal page that is the
1988 		 * parent of the last page we saw).
1989 		 *
1990 		 * Also skip internal pages unless we get aggressive or the tree
1991 		 * is idle (indicated by the tree being skipped for walks).
1992 		 * The goal here is that if trees become completely idle, we
1993 		 * eventually push them out of cache completely.
1994 		 */
1995 		if (WT_PAGE_IS_INTERNAL(page)) {
1996 			if (page == last_parent)
1997 				continue;
1998 			if (btree->evict_walk_period == 0 &&
1999 			    !__wt_cache_aggressive(session))
2000 				continue;
2001 		}
2002 
2003 		/* If eviction gets aggressive, anything else is fair game. */
2004 		if (__wt_cache_aggressive(session))
2005 			goto fast;
2006 
2007 		/*
2008 		 * If the global transaction state hasn't changed since the
2009 		 * last time we tried eviction, it's unlikely we can make
2010 		 * progress.  Similarly, if the most recent update on the page
2011 		 * is not yet globally visible, eviction will fail.  This
2012 		 * heuristic avoids repeated attempts to evict the same page.
2013 		 */
2014 		if (!__wt_page_evict_retry(session, page) || (modified &&
2015 		    !__txn_visible_all_id(session, page->modify->update_txn)))
2016 			continue;
2017 
2018 fast:		/* If the page can't be evicted, give up. */
2019 		if (!__wt_page_can_evict(session, ref, NULL))
2020 			continue;
2021 
2022 		WT_ASSERT(session, evict->ref == NULL);
2023 		if (!__evict_push_candidate(session, queue, evict, ref))
2024 			continue;
2025 		++evict;
2026 		++pages_queued;
2027 		++btree->evict_walk_progress;
2028 
2029 		__wt_verbose(session, WT_VERB_EVICTSERVER,
2030 		    "select: %p, size %" WT_SIZET_FMT,
2031 		    (void *)page, page->memory_footprint);
2032 	}
2033 	WT_RET_NOTFOUND_OK(ret);
2034 
2035 	*slotp += (u_int)(evict - start);
2036 	WT_STAT_CONN_INCRV(
2037 	    session, cache_eviction_pages_queued, (u_int)(evict - start));
2038 
2039 	__wt_verbose(session, WT_VERB_EVICTSERVER,
2040 	    "%s walk: seen %" PRIu64 ", queued %" PRIu64,
2041 	    session->dhandle->name, pages_seen, pages_queued);
2042 
2043 	/*
2044 	 * If we couldn't find the number of pages we were looking for, skip
2045 	 * the tree next time.
2046 	 */
2047 	if (pages_queued < target_pages / 2 && !urgent_queued)
2048 		btree->evict_walk_period = WT_MIN(
2049 		    WT_MAX(1, 2 * btree->evict_walk_period), 100);
2050 	else if (pages_queued == target_pages)
2051 		btree->evict_walk_period = 0;
2052 	else if (btree->evict_walk_period > 0)
2053 		btree->evict_walk_period /= 2;
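	/*
	 * In effect, a tree that repeatedly queues fewer than half its target
	 * has its walk period doubled (1, 2, 4, ..., capped at 100) so it is
	 * visited less often; meeting the target exactly resets the period to
	 * zero and partial success halves it.
	 */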
2054 
2055 	/*
2056 	 * Give up the walk occasionally.
2057 	 *
2058 	 * If we happen to end up on the root page or a page requiring urgent
2059 	 * eviction, clear it.  We have to track hazard pointers, and the root
2060 	 * page complicates that calculation.
2061 	 *
2062 	 * Likewise if we found no new candidates during the walk: there is no
2063 	 * point keeping a page pinned, since it may be the only candidate in
2064 	 * an idle tree.
2065 	 *
2066 	 * If we land on a page requiring forced eviction, or that isn't an
2067 	 * ordinary in-memory page (e.g., WT_REF_LIMBO), move until we find an
2068 	 * ordinary page: we should not prevent exclusive access to the page
2069 	 * until the next walk.
2070 	 */
2071 	if (ref != NULL) {
2072 		if (__wt_ref_is_root(ref) || evict == start || give_up ||
2073 		    ref->page->memory_footprint >= btree->splitmempage) {
2074 			if (restarts == 0)
2075 				WT_STAT_CONN_INCR(
2076 				    session, cache_eviction_walks_abandoned);
2077 			WT_RET(__wt_page_release(
2078 			    cache->walk_session, ref, walk_flags));
2079 			ref = NULL;
2080 		} else
2081 			while (ref != NULL && (ref->state != WT_REF_MEM ||
2082 			    WT_READGEN_EVICT_SOON(ref->page->read_gen)))
2083 				WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
2084 				    session, &ref, &refs_walked, walk_flags));
2085 		btree->evict_ref = ref;
2086 	}
2087 
2088 	WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
2089 	WT_STAT_CONN_INCRV(session, cache_eviction_pages_seen, pages_seen);
2090 	WT_STAT_DATA_INCRV(session, cache_eviction_pages_seen, pages_seen);
2091 	WT_STAT_CONN_INCRV(session, cache_eviction_walk_passes, 1);
2092 	WT_STAT_DATA_INCRV(session, cache_eviction_walk_passes, 1);
2093 
2094 	return (0);
2095 }
2096 
2097 /*
2098  * __evict_get_ref --
2099  *	Get a page for eviction.
2100  */
2101 static int
2102 __evict_get_ref(WT_SESSION_IMPL *session,
2103     bool is_server, WT_BTREE **btreep, WT_REF **refp, uint32_t *previous_statep)
2104 {
2105 	WT_CACHE *cache;
2106 	WT_EVICT_ENTRY *evict;
2107 	WT_EVICT_QUEUE *queue, *other_queue, *urgent_queue;
2108 	uint32_t candidates, previous_state;
2109 	bool is_app, server_only, urgent_ok;
2110 
2111 	*btreep = NULL;
2112 	/*
2113 	 * It is polite to initialize output variables, but it isn't safe for
2114 	 * callers to use the previous state if we don't return a locked ref.
2115 	 */
2116 	*previous_statep = WT_REF_MEM;
2117 	*refp = NULL;
2118 
2119 	cache = S2C(session)->cache;
2120 	is_app = !F_ISSET(session, WT_SESSION_INTERNAL);
2121 	server_only = is_server && !WT_EVICT_HAS_WORKERS(session);
2122 	/* Application threads do eviction when the cache is full of dirty data. */
2123 	urgent_ok = (!is_app && !is_server) ||
2124 	    !WT_EVICT_HAS_WORKERS(session) ||
2125 	    (is_app && F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD));
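	/*
	 * That is: worker threads always service the urgent queue, any thread
	 * services it when there are no workers, and application threads also
	 * pitch in once eviction of dirty data becomes urgent.
	 */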
2126 	urgent_queue = cache->evict_urgent_queue;
2127 
2128 	WT_STAT_CONN_INCR(session, cache_eviction_get_ref);
2129 
2130 	/* Avoid the LRU lock if no pages are available. */
2131 	if (__evict_queue_empty(cache->evict_current_queue, is_server) &&
2132 	    __evict_queue_empty(cache->evict_other_queue, is_server) &&
2133 	    (!urgent_ok || __evict_queue_empty(urgent_queue, false))) {
2134 		WT_STAT_CONN_INCR(session, cache_eviction_get_ref_empty);
2135 		return (WT_NOTFOUND);
2136 	}
2137 
2138 	/*
2139 	 * The server repopulates whenever the other queue is not full, as long
2140 	 * as at least one page has been evicted out of the current queue.
2141 	 *
2142 	 * Note that there are pathological cases where there are only enough
2143 	 * eviction candidates in the cache to fill one queue.  In that case,
2144 	 * we will continually evict one page and attempt to refill the queues.
2145 	 * Such cases are extremely rare in real applications.
2146 	 */
2147 	if (is_server &&
2148 	    (!urgent_ok || __evict_queue_empty(urgent_queue, false)) &&
2149 	    !__evict_queue_full(cache->evict_current_queue) &&
2150 	    !__evict_queue_full(cache->evict_fill_queue) &&
2151 	    (cache->evict_empty_score > WT_EVICT_SCORE_CUTOFF ||
2152 	    __evict_queue_empty(cache->evict_fill_queue, false)))
2153 		return (WT_NOTFOUND);
2154 
2155 	__wt_spin_lock(session, &cache->evict_queue_lock);
2156 
2157 	/* Check the urgent queue first. */
2158 	if (urgent_ok && !__evict_queue_empty(urgent_queue, false))
2159 		queue = urgent_queue;
2160 	else {
2161 		/*
2162 		 * Check if the current queue needs to change.
2163 		 *
2164 		 * The server will only evict half of the pages before looking
2165 		 * for more, but should only switch queues if there are no
2166 		 * other eviction workers.
2167 		 */
2168 		queue = cache->evict_current_queue;
2169 		other_queue = cache->evict_other_queue;
2170 		if (__evict_queue_empty(queue, server_only) &&
2171 		    !__evict_queue_empty(other_queue, server_only)) {
2172 			cache->evict_current_queue = other_queue;
2173 			cache->evict_other_queue = queue;
2174 		}
2175 	}
2176 
2177 	__wt_spin_unlock(session, &cache->evict_queue_lock);
2178 
2179 	/*
2180 	 * We got the queue lock, which should be fast, and chose a queue.
2181 	 * Now we want to get the lock on the individual queue.
2182 	 */
2183 	for (;;) {
2184 		/* Verify there are still pages available. */
2185 		if (__evict_queue_empty(
2186 		    queue, is_server && queue != urgent_queue)) {
2187 			WT_STAT_CONN_INCR(
2188 			    session, cache_eviction_get_ref_empty2);
2189 			return (WT_NOTFOUND);
2190 		}
2191 		if (!is_server)
2192 			__wt_spin_lock(session, &queue->evict_lock);
2193 		else if (__wt_spin_trylock(session, &queue->evict_lock) != 0)
2194 			continue;
2195 		break;
2196 	}
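	/*
	 * The server trylocks so it never blocks behind another thread; if it
	 * loses the race it re-checks that the queue is still non-empty
	 * before spinning again.
	 */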
2197 
2198 	/*
2199 	 * Only evict half of the pages before looking for more. The remainder
2200 	 * are left to eviction workers (if configured), or to application
2201 	 * threads if necessary.
2202 	 */
2203 	candidates = queue->evict_candidates;
2204 	if (is_server && queue != urgent_queue && candidates > 1)
2205 		candidates /= 2;
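	/*
	 * For example, if an ordinary queue holds 10 candidates, the server
	 * takes pages from the first 5 slots only, leaving the remainder for
	 * worker or application threads.
	 */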
2206 
2207 	/* Get the next page queued for eviction. */
2208 	for (evict = queue->evict_current;
2209 	    evict >= queue->evict_queue &&
2210 	    evict < queue->evict_queue + candidates;
2211 	    ++evict) {
2212 		if (evict->ref == NULL)
2213 			continue;
2214 		WT_ASSERT(session, evict->btree != NULL);
2215 
2216 		/*
2217 		 * Evicting a dirty page in the server thread could stall
2218 		 * during a write and prevent eviction from finding new work.
2219 		 *
2220 		 * However, we can't skip entries in the urgent queue or they
2221 		 * may never be found again.
2222 		 *
2223 		 * Don't force application threads to evict dirty pages if they
2224 		 * aren't stalled by the amount of dirty data in cache.
2225 		 */
2226 		if (!urgent_ok && (is_server ||
2227 		    !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)) &&
2228 		    __wt_page_is_modified(evict->ref->page)) {
2229 			--evict;
2230 			break;
2231 		}
2232 
2233 		/*
2234 		 * Lock the page while holding the eviction mutex to prevent
2235 		 * multiple attempts to evict it.  For pages that are already
2236 		 * being evicted, this operation will fail and we will move on.
2237 		 */
2238 		if (((previous_state = evict->ref->state) != WT_REF_MEM &&
2239 		    previous_state != WT_REF_LIMBO) ||
2240 		    !__wt_atomic_casv32(
2241 		    &evict->ref->state, previous_state, WT_REF_LOCKED)) {
2242 			__evict_list_clear(session, evict);
2243 			continue;
2244 		}
2245 
2246 		/*
2247 		 * Increment the busy count in the btree handle to prevent it
2248 		 * from being closed under us.
2249 		 */
2250 		(void)__wt_atomic_addv32(&evict->btree->evict_busy, 1);
2251 
2252 		*btreep = evict->btree;
2253 		*refp = evict->ref;
2254 		*previous_statep = previous_state;
2255 
2256 		/*
2257 		 * Remove the entry so that, on reconciliation error, we never
2258 		 * try to reconcile the same page again.
2259 		 */
2260 		__evict_list_clear(session, evict);
2261 		break;
2262 	}
2263 
2264 	/* Move to the next item. */
2265 	if (evict != NULL &&
2266 	    evict + 1 < queue->evict_queue + queue->evict_candidates)
2267 		queue->evict_current = evict + 1;
2268 	else /* Clear the current pointer if there are no more candidates. */
2269 		queue->evict_current = NULL;
2270 
2271 	__wt_spin_unlock(session, &queue->evict_lock);
2272 
2273 	return (*refp == NULL ? WT_NOTFOUND : 0);
2274 }
2275 
2276 /*
2277  * __evict_page --
2278  *	Called by both eviction and application threads to evict a page.
2279  */
2280 static int
2281 __evict_page(WT_SESSION_IMPL *session, bool is_server)
2282 {
2283 	WT_BTREE *btree;
2284 	WT_CACHE *cache;
2285 	WT_DECL_RET;
2286 	WT_REF *ref;
2287 	WT_TRACK_OP_DECL;
2288 	uint64_t time_start, time_stop;
2289 	uint32_t previous_state;
2290 	bool app_timer;
2291 
2292 	WT_TRACK_OP_INIT(session);
2293 
2294 	WT_RET_TRACK(__evict_get_ref(
2295 	    session, is_server, &btree, &ref, &previous_state));
2296 	WT_ASSERT(session, ref->state == WT_REF_LOCKED);
2297 
2298 	app_timer = false;
2299 	cache = S2C(session)->cache;
2300 	time_start = time_stop = 0;
2301 
2302 	/*
2303 	 * A session with the internal flag set is either the eviction server
2304 	 * itself or an eviction worker thread.
2305 	 */
2306 	if (is_server) {
2307 		WT_STAT_CONN_INCR(session, cache_eviction_server_evicting);
2308 		cache->server_evicts++;
2309 	} else if (F_ISSET(session, WT_SESSION_INTERNAL)) {
2310 		WT_STAT_CONN_INCR(session, cache_eviction_worker_evicting);
2311 		cache->worker_evicts++;
2312 	} else {
2313 		if (__wt_page_is_modified(ref->page))
2314 			WT_STAT_CONN_INCR(session, cache_eviction_app_dirty);
2315 		WT_STAT_CONN_INCR(session, cache_eviction_app);
2316 		cache->app_evicts++;
2317 		if (WT_STAT_ENABLED(session)) {
2318 			app_timer = true;
2319 			time_start = __wt_clock(session);
2320 		}
2321 	}
2322 
2323 	/*
2324 	 * In case something goes wrong, don't pick the same set of pages every
2325 	 * time.
2326 	 *
2327 	 * We used to bump the page's read generation only if eviction failed,
2328 	 * but that isn't safe: at that point, eviction has already unlocked
2329 	 * the page and some other thread may have evicted it by the time we
2330 	 * look at it.
2331 	 */
2332 	__wt_cache_read_gen_bump(session, ref->page);
2333 
2334 	WT_WITH_BTREE(session, btree,
2335 	     ret = __wt_evict(session, ref, false, previous_state));
2336 
2337 	(void)__wt_atomic_subv32(&btree->evict_busy, 1);
2338 
2339 	if (app_timer) {
2340 		time_stop = __wt_clock(session);
2341 		WT_STAT_CONN_INCRV(session,
2342 		    application_evict_time,
2343 		    WT_CLOCKDIFF_US(time_stop, time_start));
2344 	}
2345 	WT_TRACK_OP_END(session);
2346 	return (ret);
2347 }
2348 
2349 /*
2350  * __wt_cache_eviction_worker --
2351  *	Worker function for __wt_cache_eviction_check: evict pages if the cache
2352  * crosses its boundaries.
2353  */
2354 int
2355 __wt_cache_eviction_worker(
2356     WT_SESSION_IMPL *session, bool busy, bool readonly, double pct_full)
2357 {
2358 	WT_CACHE *cache;
2359 	WT_CONNECTION_IMPL *conn;
2360 	WT_DECL_RET;
2361 	WT_TRACK_OP_DECL;
2362 	WT_TXN_GLOBAL *txn_global;
2363 	WT_TXN_STATE *txn_state;
2364 	uint64_t elapsed, time_start, time_stop;
2365 	uint64_t initial_progress, max_progress;
2366 	bool timer;
2367 
2368 	WT_TRACK_OP_INIT(session);
2369 
2370 	conn = S2C(session);
2371 	cache = conn->cache;
2372 	time_start = time_stop = 0;
2373 	txn_global = &conn->txn_global;
2374 	txn_state = WT_SESSION_TXN_STATE(session);
2375 
2376 	/*
2377 	 * It is not safe to proceed if the eviction server threads aren't
2378 	 * set up yet.
2379 	 */
2380 	if (!conn->evict_server_running || (busy && pct_full < 100.0))
2381 		goto done;
2382 
2383 	/* Wake the eviction server if we need to do work. */
2384 	__wt_evict_server_wake(session);
2385 
2386 	/* Track how long application threads spend doing eviction. */
2387 	timer = !F_ISSET(session, WT_SESSION_INTERNAL);
2388 	if (timer)
2389 		time_start = __wt_clock(session);
2390 
2391 	for (initial_progress = cache->eviction_progress;; ret = 0) {
2392 		/*
2393 		 * A pathological case: if we're the oldest transaction in the
2394 		 * system and the eviction server is stuck trying to find space
2395 		 * (and we're not in recovery, because those transactions can't
2396 		 * be rolled back), abort the transaction to give up all hazard
2397 		 * pointers before trying again.
2398 		 */
2399 		if (__wt_cache_stuck(session) &&
2400 		    __wt_txn_am_oldest(session) &&
2401 		    !F_ISSET(conn, WT_CONN_RECOVERING)) {
2402 			--cache->evict_aggressive_score;
2403 			WT_STAT_CONN_INCR(session, txn_fail_cache);
2404 			WT_ERR(__wt_txn_rollback_required(session,
2405 			    "oldest transaction rolled back for eviction"));
2406 		}
2407 
2408 		/*
2409 		 * Check if we have become busy.
2410 		 *
2411 		 * If we're busy (because of the transaction check we just did
2412 		 * or because our caller is waiting on a longer-than-usual event
2413 		 * such as a page read), and the cache level drops below 100%,
2414 		 * limit the work to 5 evictions and return. If that's not the
2415 		 * case, we can do more.
2416 		 */
2417 		if (!busy && txn_state->pinned_id != WT_TXN_NONE &&
2418 		    txn_global->current != txn_global->oldest_id)
2419 			busy = true;
2420 		max_progress = busy ? 5 : 20;
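		/*
		 * For example, once the cache has dropped below 100% full, a
		 * busy thread stops after the global eviction progress
		 * counter advances 5 past the snapshot taken on entry to
		 * this loop; an unconstrained thread allows 20.
		 */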
2421 
2422 		/* See if eviction is still needed. */
2423 		if (!__wt_eviction_needed(session, busy, readonly, &pct_full) ||
2424 		    (pct_full < 100.0 && (cache->eviction_progress >
2425 		    initial_progress + max_progress)))
2426 			break;
2427 
2428 		/* Evict a page. */
2429 		switch (ret = __evict_page(session, false)) {
2430 		case 0:
2431 			if (busy)
2432 				goto err;
2433 			/* FALLTHROUGH */
2434 		case EBUSY:
2435 			break;
2436 		case WT_NOTFOUND:
2437 			/* Allow the queue to re-populate before retrying. */
2438 			__wt_cond_wait(session,
2439 			    conn->evict_threads.wait_cond, 10000, NULL);
2440 			cache->app_waits++;
2441 			break;
2442 		default:
2443 			goto err;
2444 		}
2445 		/* Stop if we've exceeded the timeout. */
2446 		if (timer && cache->cache_max_wait_us != 0) {
2447 			time_stop = __wt_clock(session);
2448 			if (session->cache_wait_us +
2449 			    WT_CLOCKDIFF_US(time_stop, time_start) >
2450 			    cache->cache_max_wait_us)
2451 				goto err;
2452 		}
2453 	}
2454 
2455 err:	if (timer) {
2456 		time_stop = __wt_clock(session);
2457 		elapsed = WT_CLOCKDIFF_US(time_stop, time_start);
2458 		WT_STAT_CONN_INCRV(session, application_cache_time, elapsed);
2459 		session->cache_wait_us += elapsed;
2460 		if (cache->cache_max_wait_us != 0 &&
2461 		    session->cache_wait_us > cache->cache_max_wait_us) {
2462 			WT_TRET(WT_CACHE_FULL);
2463 			WT_STAT_CONN_INCR(session, cache_timed_out_ops);
2464 		}
2465 	}
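	/*
	 * Once a session's cumulative eviction wait exceeds
	 * cache_max_wait_us, WT_CACHE_FULL is folded into the return value
	 * so the operation fails rather than stalling in eviction
	 * indefinitely.
	 */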
2466 
2467 done:	WT_TRACK_OP_END(session);
2468 	return (ret);
2469 }
2470 
2471 /*
2472  * __wt_page_evict_urgent --
2473  *	Set a page to be evicted as soon as possible.
2474  */
2475 bool
2476 __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref)
2477 {
2478 	WT_CACHE *cache;
2479 	WT_EVICT_ENTRY *evict;
2480 	WT_EVICT_QUEUE *urgent_queue;
2481 	WT_PAGE *page;
2482 	bool queued;
2483 
2484 	/* Root pages should never be evicted via LRU. */
2485 	WT_ASSERT(session, !__wt_ref_is_root(ref));
2486 
2487 	page = ref->page;
2488 	if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) ||
2489 	    S2BT(session)->evict_disabled > 0)
2490 		return (false);
2491 
2492 	/* Append to the urgent queue if we can. */
2493 	cache = S2C(session)->cache;
2494 	urgent_queue = &cache->evict_queues[WT_EVICT_URGENT_QUEUE];
2495 	queued = false;
2496 
2497 	__wt_spin_lock(session, &cache->evict_queue_lock);
2498 	if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) ||
2499 	    S2BT(session)->evict_disabled > 0)
2500 		goto done;
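	/*
	 * Note the check just above repeats the unlocked test under the
	 * queue lock: the racy early test avoids the lock in the common
	 * case, and the locked test makes the final decision.
	 */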
2501 
2502 	__wt_spin_lock(session, &urgent_queue->evict_lock);
2503 	if (__evict_queue_empty(urgent_queue, false)) {
2504 		urgent_queue->evict_current = urgent_queue->evict_queue;
2505 		urgent_queue->evict_candidates = 0;
2506 	}
2507 	evict = urgent_queue->evict_queue + urgent_queue->evict_candidates;
2508 	if (evict < urgent_queue->evict_queue + cache->evict_slots &&
2509 	    __evict_push_candidate(session, urgent_queue, evict, ref)) {
2510 		++urgent_queue->evict_candidates;
2511 		queued = true;
2512 	}
2513 	__wt_spin_unlock(session, &urgent_queue->evict_lock);
2514 
2515 done:	__wt_spin_unlock(session, &cache->evict_queue_lock);
2516 	if (queued) {
2517 		WT_STAT_CONN_INCR(session, cache_eviction_pages_queued_urgent);
2518 		if (WT_EVICT_HAS_WORKERS(session))
2519 			__wt_cond_signal(session,
2520 			    S2C(session)->evict_threads.wait_cond);
2521 		else
2522 			__wt_evict_server_wake(session);
2523 	}
2524 
2525 	return (queued);
2526 }
2527 
2528 /*
2529  * __wt_evict_priority_set --
2530  *	Set a tree's eviction priority.
2531  */
2532 void
2533 __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v)
2534 {
2535 	S2BT(session)->evict_priority = v;
2536 }
2537 
2538 /*
2539  * __wt_evict_priority_clear --
2540  *	Clear a tree's eviction priority.
2541  */
2542 void
2543 __wt_evict_priority_clear(WT_SESSION_IMPL *session)
2544 {
2545 	S2BT(session)->evict_priority = 0;
2546 }
2547 
2548 /*
2549  * __verbose_dump_cache_single --
2550  *	Output diagnostic information about a single file in the cache.
2551  */
2552 static int
2553 __verbose_dump_cache_single(WT_SESSION_IMPL *session,
2554     uint64_t *total_bytesp, uint64_t *total_dirty_bytesp)
2555 {
2556 	WT_BTREE *btree;
2557 	WT_DATA_HANDLE *dhandle;
2558 	WT_PAGE *page;
2559 	WT_REF *next_walk;
2560 	size_t size;
2561 	uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes;
2562 	uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages;
2563 	uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes;
2564 	uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages;
2565 
2566 	intl_bytes = intl_bytes_max = intl_dirty_bytes = 0;
2567 	intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0;
2568 	leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0;
2569 	leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0;
2570 
2571 	next_walk = NULL;
2572 	while (__wt_tree_walk(session, &next_walk,
2573 	    WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
2574 	    next_walk != NULL) {
2575 		page = next_walk->page;
2576 		size = page->memory_footprint;
2577 
2578 		if (WT_PAGE_IS_INTERNAL(page)) {
2579 			++intl_pages;
2580 			intl_bytes += size;
2581 			intl_bytes_max = WT_MAX(intl_bytes_max, size);
2582 			if (__wt_page_is_modified(page)) {
2583 				++intl_dirty_pages;
2584 				intl_dirty_bytes += size;
2585 				intl_dirty_bytes_max =
2586 				    WT_MAX(intl_dirty_bytes_max, size);
2587 			}
2588 		} else {
2589 			++leaf_pages;
2590 			leaf_bytes += size;
2591 			leaf_bytes_max = WT_MAX(leaf_bytes_max, size);
2592 			if (__wt_page_is_modified(page)) {
2593 				++leaf_dirty_pages;
2594 				leaf_dirty_bytes += size;
2595 				leaf_dirty_bytes_max =
2596 				    WT_MAX(leaf_dirty_bytes_max, size);
2597 			}
2598 		}
2599 	}
2600 
2601 	dhandle = session->dhandle;
2602 	btree = dhandle->handle;
2603 	WT_RET(__wt_msg(session, "%s(%s%s)%s%s:",
2604 	    dhandle->name, dhandle->checkpoint != NULL ? "checkpoint=" : "",
2605 	    dhandle->checkpoint != NULL ? dhandle->checkpoint : "<live>",
2606 	    btree->evict_disabled != 0 ?  "eviction disabled" : "",
2607 	    btree->evict_disabled_open ? " at open" : ""));
2608 	if (intl_pages == 0)
2609 		WT_RET(__wt_msg(session, "internal: 0 pages"));
2610 	else
2611 		WT_RET(__wt_msg(session,
2612 		    "internal: "
2613 		    "%" PRIu64 " pages, "
2614 		    "%" PRIu64 "MB, "
2615 		    "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
2616 		    "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
2617 		    "%" PRIu64 "MB max page, "
2618 		    "%" PRIu64 "MB max dirty page",
2619 		    intl_pages,
2620 		    intl_bytes / WT_MEGABYTE,
2621 		    intl_pages - intl_dirty_pages,
2622 		    intl_dirty_pages,
2623 		    (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE,
2624 		    intl_dirty_bytes / WT_MEGABYTE,
2625 		    intl_bytes_max / WT_MEGABYTE,
2626 		    intl_dirty_bytes_max / WT_MEGABYTE));
2627 	if (leaf_pages == 0)
2628 		WT_RET(__wt_msg(session, "leaf: 0 pages"));
2629 	else
2630 		WT_RET(__wt_msg(session,
2631 		    "leaf: "
2632 		    "%" PRIu64 " pages, "
2633 		    "%" PRIu64 "MB, "
2634 		    "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
2635 		    "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
2636 		    "%" PRIu64 "MB max page, "
2637 		    "%" PRIu64 "MB max dirty page",
2638 		    leaf_pages,
2639 		    leaf_bytes / WT_MEGABYTE,
2640 		    leaf_pages - leaf_dirty_pages,
2641 		    leaf_dirty_pages,
2642 		    (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE,
2643 		    leaf_dirty_bytes / WT_MEGABYTE,
2644 		    leaf_bytes_max / WT_MEGABYTE,
2645 		    leaf_dirty_bytes_max / WT_MEGABYTE));
2646 
2647 	*total_bytesp += intl_bytes + leaf_bytes;
2648 	*total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes;
2649 
2650 	return (0);
2651 }
2652 
2653 /*
2654  * __wt_verbose_dump_cache --
2655  *	Output diagnostic information about the cache.
2656  */
2657 int
2658 __wt_verbose_dump_cache(WT_SESSION_IMPL *session)
2659 {
2660 	WT_CONNECTION_IMPL *conn;
2661 	WT_DATA_HANDLE *dhandle;
2662 	WT_DECL_RET;
2663 	double pct;
2664 	uint64_t total_bytes, total_dirty_bytes;
2665 	bool needed;
2666 
2667 	conn = S2C(session);
2668 	total_bytes = total_dirty_bytes = 0;
2669 	pct = 0.0;				/* [-Werror=uninitialized] */
2670 
2671 	WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
2672 	WT_RET(__wt_msg(session, "cache dump"));
2673 
2674 	WT_RET(__wt_msg(session,
2675 	    "cache full: %s", __wt_cache_full(session) ? "yes" : "no"));
2676 	needed = __wt_eviction_clean_needed(session, &pct);
2677 	WT_RET(__wt_msg(session,
2678 	    "cache clean check: %s (%2.3f%%)", needed ? "yes" : "no", pct));
2679 	needed = __wt_eviction_dirty_needed(session, &pct);
2680 	WT_RET(__wt_msg(session,
2681 	    "cache dirty check: %s (%2.3f%%)", needed ? "yes" : "no", pct));
2682 
2683 	for (dhandle = NULL;;) {
2684 		WT_WITH_HANDLE_LIST_READ_LOCK(session,
2685 		    WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
2686 		if (dhandle == NULL)
2687 			break;
2688 		if (dhandle->type != WT_DHANDLE_TYPE_BTREE ||
2689 		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
2690 			continue;
2691 
2692 		WT_WITH_DHANDLE(session, dhandle,
2693 		    ret = __verbose_dump_cache_single(
2694 		    session, &total_bytes, &total_dirty_bytes));
2695 		if (ret != 0)
2696 			break;
2697 	}
2698 	WT_RET(ret);
2699 
2700 	/*
2701 	 * Apply the overhead percentage so our total bytes are comparable with
2702 	 * the tracked value.
2703 	 */
2704 	total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes);
2705 
2706 	WT_RET(__wt_msg(session,
2707 	    "cache dump: "
2708 	    "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB",
2709 	    total_bytes / WT_MEGABYTE,
2710 	    __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE));
2711 	WT_RET(__wt_msg(session,
2712 	    "total dirty bytes: %" PRIu64 "MB",
2713 	    total_dirty_bytes / WT_MEGABYTE));
2714 
2715 	return (0);
2716 }
2717