1 /*-
2 * Copyright (c) 2014-2018 MongoDB, Inc.
3 * Copyright (c) 2008-2014 WiredTiger, Inc.
4 * All rights reserved.
5 *
6 * See the file LICENSE for redistribution information.
7 */
8
9 #include "wt_internal.h"
10
11 static int __evict_clear_all_walks(WT_SESSION_IMPL *);
12 static int WT_CDECL __evict_lru_cmp(const void *, const void *);
13 static int __evict_lru_pages(WT_SESSION_IMPL *, bool);
14 static int __evict_lru_walk(WT_SESSION_IMPL *);
15 static int __evict_page(WT_SESSION_IMPL *, bool);
16 static int __evict_pass(WT_SESSION_IMPL *);
17 static int __evict_server(WT_SESSION_IMPL *, bool *);
18 static void __evict_tune_workers(WT_SESSION_IMPL *session);
19 static int __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *);
20 static int __evict_walk_tree(
21 WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *);
22
23 #define WT_EVICT_HAS_WORKERS(s) \
24 (S2C(s)->evict_threads.current_threads > 1)
25
26 /*
27 * __evict_lock_handle_list --
28 * Try to get the handle list lock, with yield and sleep back off.
29 * Keep timing statistics overall.
30 */
31 static int
__evict_lock_handle_list(WT_SESSION_IMPL * session)32 __evict_lock_handle_list(WT_SESSION_IMPL *session)
33 {
34 WT_CACHE *cache;
35 WT_CONNECTION_IMPL *conn;
36 WT_DECL_RET;
37 WT_RWLOCK *dh_lock;
38 u_int spins;
39
40 conn = S2C(session);
41 cache = conn->cache;
42 dh_lock = &conn->dhandle_lock;
43
44 /*
45 * Use a custom lock acquisition back off loop so the eviction server
46 * notices any interrupt quickly.
47 */
48 for (spins = 0;
49 (ret = __wt_try_readlock(session, dh_lock)) == EBUSY &&
50 cache->pass_intr == 0; spins++) {
51 if (spins < WT_THOUSAND)
52 __wt_yield();
53 else
54 __wt_sleep(0, WT_THOUSAND);
55 }
56 return (ret);
57 }
58
59 /*
60 * __evict_entry_priority --
61 * Get the adjusted read generation for an eviction entry.
62 */
63 static inline uint64_t
__evict_entry_priority(WT_SESSION_IMPL * session,WT_REF * ref)64 __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref)
65 {
66 WT_BTREE *btree;
67 WT_PAGE *page;
68 uint64_t read_gen;
69
70 btree = S2BT(session);
71 page = ref->page;
72
73 /* Any page set to the oldest generation should be discarded. */
74 if (WT_READGEN_EVICT_SOON(page->read_gen))
75 return (WT_READGEN_OLDEST);
76
77 /* Any page from a dead tree is a great choice. */
78 if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD))
79 return (WT_READGEN_OLDEST);
80
81 /* Any empty page (leaf or internal), is a good choice. */
82 if (__wt_page_is_empty(page))
83 return (WT_READGEN_OLDEST);
84
85 /* Any large page in memory is likewise a good choice. */
86 if (page->memory_footprint > btree->splitmempage)
87 return (WT_READGEN_OLDEST);
88
89 /*
90 * The base read-generation is skewed by the eviction priority.
91 * Internal pages are also adjusted, we prefer to evict leaf pages.
92 */
93 if (page->modify != NULL &&
94 F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DIRTY) &&
95 !F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_CLEAN))
96 read_gen = page->modify->update_txn;
97 else
98 read_gen = page->read_gen;
99
100 read_gen += btree->evict_priority;
101
102 #define WT_EVICT_INTL_SKEW 1000
103 if (WT_PAGE_IS_INTERNAL(page))
104 read_gen += WT_EVICT_INTL_SKEW;
105
106 return (read_gen);
107 }
108
109 /*
110 * __evict_lru_cmp --
111 * Qsort function: sort the eviction array.
112 */
113 static int WT_CDECL
__evict_lru_cmp(const void * a_arg,const void * b_arg)114 __evict_lru_cmp(const void *a_arg, const void *b_arg)
115 {
116 const WT_EVICT_ENTRY *a, *b;
117 uint64_t a_score, b_score;
118
119 a = a_arg;
120 b = b_arg;
121 a_score = (a->ref == NULL ? UINT64_MAX : a->score);
122 b_score = (b->ref == NULL ? UINT64_MAX : b->score);
123
124 return ((a_score < b_score) ? -1 : (a_score == b_score) ? 0 : 1);
125 }
126
127 /*
128 * __evict_list_clear --
129 * Clear an entry in the LRU eviction list.
130 */
131 static inline void
__evict_list_clear(WT_SESSION_IMPL * session,WT_EVICT_ENTRY * e)132 __evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e)
133 {
134 if (e->ref != NULL) {
135 WT_ASSERT(session,
136 F_ISSET_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU));
137 F_CLR_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU);
138 }
139 e->ref = NULL;
140 e->btree = WT_DEBUG_POINT;
141 }
142
/*
 * __wt_evict_list_clear_page --
 *	Make sure a page is not in the LRU eviction list.  This is called from
 *	the page eviction code to make sure there is no attempt to evict a
 *	child page multiple times.
 */
void
__wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *evict;
	uint32_t i, elem, q;
	bool found;

	/* The caller must hold the ref locked (or own the root page). */
	WT_ASSERT(session,
	    __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED);

	/* Fast path: if the page isn't on the queue, don't bother searching. */
	if (!F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU))
		return;

	cache = S2C(session)->cache;
	__wt_spin_lock(session, &cache->evict_queue_lock);

	/*
	 * Search each queue under its lock; stop as soon as the page is
	 * found and its entry cleared.
	 */
	found = false;
	for (q = 0; q < WT_EVICT_QUEUE_MAX && !found; q++) {
		__wt_spin_lock(session, &cache->evict_queues[q].evict_lock);
		elem = cache->evict_queues[q].evict_max;
		for (i = 0, evict = cache->evict_queues[q].evict_queue;
		    i < elem; i++, evict++)
			if (evict->ref == ref) {
				found = true;
				__evict_list_clear(session, evict);
				break;
			}
		__wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
	}

	/* Clearing the entry must also have cleared the page's flag. */
	WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));

	__wt_spin_unlock(session, &cache->evict_queue_lock);
}
184
185 /*
186 * __evict_queue_empty --
187 * Is the queue empty?
188 *
189 * Note that the eviction server is pessimistic and treats a half full
190 * queue as empty.
191 */
192 static inline bool
__evict_queue_empty(WT_EVICT_QUEUE * queue,bool server_check)193 __evict_queue_empty(WT_EVICT_QUEUE *queue, bool server_check)
194 {
195 uint32_t candidates, used;
196
197 if (queue->evict_current == NULL)
198 return (true);
199
200 /* The eviction server only considers half of the candidates. */
201 candidates = queue->evict_candidates;
202 if (server_check && candidates > 1)
203 candidates /= 2;
204 used = (uint32_t)(queue->evict_current - queue->evict_queue);
205 return (used >= candidates);
206 }
207
208 /*
209 * __evict_queue_full --
210 * Is the queue full (i.e., it has been populated with candidates and none
211 * of them have been evicted yet)?
212 */
213 static inline bool
__evict_queue_full(WT_EVICT_QUEUE * queue)214 __evict_queue_full(WT_EVICT_QUEUE *queue)
215 {
216 return (queue->evict_current == queue->evict_queue &&
217 queue->evict_candidates != 0);
218 }
219
220 /*
221 * __wt_evict_server_wake --
222 * Wake the eviction server thread.
223 */
224 void
__wt_evict_server_wake(WT_SESSION_IMPL * session)225 __wt_evict_server_wake(WT_SESSION_IMPL *session)
226 {
227 WT_CACHE *cache;
228 WT_CONNECTION_IMPL *conn;
229
230 conn = S2C(session);
231 cache = conn->cache;
232
233 if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) {
234 uint64_t bytes_inuse, bytes_max;
235
236 bytes_inuse = __wt_cache_bytes_inuse(cache);
237 bytes_max = conn->cache_size;
238 __wt_verbose(session, WT_VERB_EVICTSERVER,
239 "waking, bytes inuse %s max (%" PRIu64
240 "MB %s %" PRIu64 "MB)",
241 bytes_inuse <= bytes_max ? "<=" : ">",
242 bytes_inuse / WT_MEGABYTE,
243 bytes_inuse <= bytes_max ? "<=" : ">",
244 bytes_max / WT_MEGABYTE);
245 }
246
247 __wt_cond_signal(session, cache->evict_cond);
248 }
249
250 /*
251 * __wt_evict_thread_chk --
252 * Check to decide if the eviction thread should continue running.
253 */
254 bool
__wt_evict_thread_chk(WT_SESSION_IMPL * session)255 __wt_evict_thread_chk(WT_SESSION_IMPL *session)
256 {
257 return (F_ISSET(S2C(session), WT_CONN_EVICTION_RUN));
258 }
259
/*
 * __wt_evict_thread_run --
 *	Entry function for an eviction thread.  This is called repeatedly
 *	from the thread group code so it does not need to loop itself.
 */
int
__wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	bool did_work, was_intr;

	conn = S2C(session);
	cache = conn->cache;

	/*
	 * The thread group code calls us repeatedly.  So each call is one pass
	 * through eviction.  Only one thread at a time runs the server pass
	 * (whoever wins the try-lock); the rest act as eviction workers.
	 */
	if (conn->evict_server_running &&
	    __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) {
		/*
		 * Cannot use WT_WITH_PASS_LOCK because this is a try lock.
		 * Fix when that is supported.  We set the flag on both sessions
		 * because we may call clear_walk when we are walking with
		 * the walk session, locked.
		 */
		F_SET(session, WT_SESSION_LOCKED_PASS);
		F_SET(cache->walk_session, WT_SESSION_LOCKED_PASS);
		ret = __evict_server(session, &did_work);
		F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS);
		F_CLR(session, WT_SESSION_LOCKED_PASS);
		/* Snapshot the interrupt state before dropping the lock. */
		was_intr = cache->pass_intr != 0;
		__wt_spin_unlock(session, &cache->evict_pass_lock);
		WT_ERR(ret);

		/*
		 * If the eviction server was interrupted, wait until requests
		 * have been processed: the system may otherwise be busy so
		 * don't go to sleep.
		 */
		if (was_intr)
			while (cache->pass_intr != 0 &&
			    F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
			    F_ISSET(thread, WT_THREAD_RUN))
				__wt_yield();
		else {
			__wt_verbose(session,
			    WT_VERB_EVICTSERVER, "%s", "sleeping");

			/* Don't rely on signals: check periodically. */
			__wt_cond_auto_wait(session,
			    cache->evict_cond, did_work, NULL);
			__wt_verbose(session,
			    WT_VERB_EVICTSERVER, "%s", "waking");
		}
	} else
		/* Not the server this time: help by evicting queued pages. */
		WT_ERR(__evict_lru_pages(session, false));

	if (0) {
err:		WT_PANIC_RET(session, ret, "cache eviction thread error");
	}
	return (ret);
}
325
/*
 * __wt_evict_thread_stop --
 *	Shutdown function for an eviction thread.
 */
int
__wt_evict_thread_stop(WT_SESSION_IMPL *session, WT_THREAD *thread)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	/* Only the first (server) eviction thread performs shutdown work. */
	if (thread->id != 0)
		return (0);

	conn = S2C(session);
	/* NOTE(review): this local appears unused below — confirm. */
	cache = conn->cache;
	/*
	 * The only time the first eviction thread is stopped is on shutdown:
	 * in case any trees are still open, clear all walks now so that they
	 * can be closed.
	 */
	WT_WITH_PASS_LOCK(session, ret = __evict_clear_all_walks(session));
	WT_ERR(ret);
	/*
	 * The only two cases when the eviction server is expected to
	 * stop are when recovery is finished or when the connection is
	 * closing.
	 */
	WT_ASSERT(session, F_ISSET(conn, WT_CONN_CLOSING | WT_CONN_RECOVERING));

	__wt_verbose(session,
	    WT_VERB_EVICTSERVER, "%s", "cache eviction thread exiting");

	if (0) {
err:		WT_PANIC_RET(session, ret, "cache eviction thread error");
	}
	return (ret);
}
364
/*
 * __evict_server --
 *	Thread to evict pages from the cache.  Performs one eviction pass,
 *	then handles the idle and stuck-cache cases.  Sets did_work when
 *	eviction made progress since the previous call.
 */
static int
__evict_server(WT_SESSION_IMPL *session, bool *did_work)
{
	struct timespec now;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	/* Assume there has been no progress. */
	*did_work = false;

	conn = S2C(session);
	cache = conn->cache;

	/* Evict pages from the cache as needed. */
	WT_RET(__evict_pass(session));

	/* Quit early if eviction was turned off or a pass was interrupted. */
	if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) || cache->pass_intr != 0)
		return (0);

	if (!__wt_cache_stuck(session)) {
		/*
		 * Try to get the handle list lock: if we give up, that
		 * indicates a session is waiting for us to clear walks.  Do
		 * that as part of a normal pass (without the handle list
		 * lock) to avoid deadlock.
		 */
		if ((ret = __evict_lock_handle_list(session)) == EBUSY)
			return (0);
		WT_RET(ret);

		/*
		 * Clear the walks so we don't pin pages while asleep,
		 * otherwise we can block applications evicting large pages.
		 */
		ret = __evict_clear_all_walks(session);

		__wt_readunlock(session, &conn->dhandle_lock);
		WT_RET(ret);

		/* Make sure we'll notice next time we're stuck. */
		cache->last_eviction_progress = 0;
		return (0);
	}

	/* Track if work was done. */
	*did_work = cache->eviction_progress != cache->last_eviction_progress;
	cache->last_eviction_progress = cache->eviction_progress;

	/* Eviction is stuck, check if we have made progress. */
	if (*did_work) {
#if !defined(HAVE_DIAGNOSTIC)
		/* Need verbose check only if not in diagnostic build */
		if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK))
#endif
			/* Progress resets the stuck timer. */
			__wt_epoch(session, &cache->stuck_time);
		return (0);
	}

#if !defined(HAVE_DIAGNOSTIC)
	/* Need verbose check only if not in diagnostic build */
	if (!WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK))
		return (0);
#endif
	/*
	 * If we're stuck for 5 minutes in diagnostic mode, or the verbose
	 * evict_stuck flag is configured, log the cache and transaction state.
	 *
	 * If we're stuck for 5 minutes in diagnostic mode, give up.
	 *
	 * We don't do this check for in-memory workloads because application
	 * threads are not blocked by the cache being full.  If the cache
	 * becomes full of clean pages, we can be servicing reads while the
	 * cache appears stuck to eviction.
	 */
	if (F_ISSET(conn, WT_CONN_IN_MEMORY))
		return (0);

	__wt_epoch(session, &now);
	if (WT_TIMEDIFF_SEC(now, cache->stuck_time) > WT_MINUTE * 5) {
#if defined(HAVE_DIAGNOSTIC)
		/* Diagnostic builds dump state and fail hard. */
		__wt_err(session, ETIMEDOUT,
		    "Cache stuck for too long, giving up");
		WT_RET(__wt_verbose_dump_txn(session));
		WT_RET(__wt_verbose_dump_cache(session));
		return (__wt_set_return(session, ETIMEDOUT));
#else
		/* Production builds only log the state when asked to. */
		if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) {
			WT_RET(__wt_verbose_dump_txn(session));
			WT_RET(__wt_verbose_dump_cache(session));

			/* Reset the timer. */
			__wt_epoch(session, &cache->stuck_time);
		}
#endif
	}
	return (0);
}
467
/*
 * __wt_evict_create --
 *	Start the eviction server: create the eviction thread group and
 *	enable queue population.
 */
int
__wt_evict_create(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	uint32_t session_flags;

	conn = S2C(session);

	WT_ASSERT(session, conn->evict_threads_min > 0);
	/* Set first, the thread might run before we finish up. */
	F_SET(conn, WT_CONN_EVICTION_RUN);

	/*
	 * Create the eviction thread group.
	 * Set the group size to the maximum allowed sessions.
	 */
	session_flags = WT_THREAD_CAN_WAIT |
	    WT_THREAD_LOOKASIDE | WT_THREAD_PANIC_FAIL;
	WT_RET(__wt_thread_group_create(session, &conn->evict_threads,
	    "eviction-server", conn->evict_threads_min, conn->evict_threads_max,
	    session_flags, __wt_evict_thread_chk, __wt_evict_thread_run,
	    __wt_evict_thread_stop));

	/*
	 * Ensure the cache stuck timer is initialized when starting eviction.
	 */
#if !defined(HAVE_DIAGNOSTIC)
	/* Need verbose check only if not in diagnostic build */
	if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER))
#endif
		__wt_epoch(session, &conn->cache->stuck_time);

	/*
	 * Allow queues to be populated now that the eviction threads
	 * are running.
	 */
	conn->evict_server_running = true;

	return (0);
}
512
/*
 * __wt_evict_destroy --
 *	Destroy the eviction threads.
 */
int
__wt_evict_destroy(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/* We are done if the eviction server didn't start successfully. */
	if (!conn->evict_server_running)
		return (0);

	/* Wait for any eviction thread group changes to stabilize. */
	__wt_writelock(session, &conn->evict_threads.lock);

	/*
	 * Signal the threads to finish and stop populating the queue.
	 */
	F_CLR(conn, WT_CONN_EVICTION_RUN);
	conn->evict_server_running = false;
	__wt_evict_server_wake(session);

	__wt_verbose(
	    session, WT_VERB_EVICTSERVER, "%s", "waiting for helper threads");

	/*
	 * We call the destroy function still holding the write lock.
	 * It assumes it is called locked.
	 */
	WT_RET(__wt_thread_group_destroy(session, &conn->evict_threads));

	return (0);
}
549
/*
 * __evict_update_work --
 *	Configure eviction work state: recompute the cache's eviction flags
 *	from current cache usage.  Returns true when there is eviction work
 *	to do.
 */
static bool
__evict_update_work(WT_SESSION_IMPL *session)
{
	WT_BTREE *las_tree;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	double dirty_target, dirty_trigger, target, trigger;
	uint64_t bytes_inuse, bytes_max, dirty_inuse;
	uint32_t flags;

	conn = S2C(session);
	cache = conn->cache;

	dirty_target = __wt_eviction_dirty_target(cache);
	dirty_trigger = cache->eviction_dirty_trigger;
	target = cache->eviction_target;
	trigger = cache->eviction_trigger;

	/* Build up the new state. */
	flags = 0;

	/* Eviction is disabled: clear all work flags. */
	if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) {
		cache->flags = 0;
		return (false);
	}

	/* Urgent requests take priority regardless of cache usage. */
	if (!__evict_queue_empty(cache->evict_urgent_queue, false))
		LF_SET(WT_CACHE_EVICT_URGENT);

	/* Refresh the cached lookaside-table footprint. */
	if (F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN)) {
		WT_ASSERT(session,
		    F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR));

		las_tree = ((WT_CURSOR_BTREE *)session->las_cursor)->btree;
		cache->bytes_lookaside = las_tree->bytes_inmem;
	}

	/*
	 * If we need space in the cache, try to find clean pages to evict.
	 *
	 * Avoid division by zero if the cache size has not yet been set in a
	 * shared cache.
	 */
	bytes_max = conn->cache_size + 1;
	bytes_inuse = __wt_cache_bytes_inuse(cache);
	if (__wt_eviction_clean_needed(session, NULL))
		LF_SET(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
	else if (bytes_inuse > (target * bytes_max) / 100)
		LF_SET(WT_CACHE_EVICT_CLEAN);

	/* Same logic for dirty content, against the dirty targets. */
	dirty_inuse = __wt_cache_dirty_leaf_inuse(cache);
	if (__wt_eviction_dirty_needed(session, NULL))
		LF_SET(WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD);
	else if (dirty_inuse > (uint64_t)(dirty_target * bytes_max) / 100)
		LF_SET(WT_CACHE_EVICT_DIRTY);

	/*
	 * If application threads are blocked by the total volume of data in
	 * cache, try dirty pages as well.
	 */
	if (__wt_cache_aggressive(session) &&
	    LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD))
		LF_SET(WT_CACHE_EVICT_DIRTY);

	/* When we stop looking for dirty pages, reduce the lookaside score. */
	if (!LF_ISSET(WT_CACHE_EVICT_DIRTY))
		__wt_cache_update_lookaside_score(session, 1, 0);

	/*
	 * Scrub dirty pages and keep them in cache if we are less than half
	 * way to the clean or dirty trigger.
	 */
	if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200) {
		if (dirty_inuse < (uint64_t)
		    ((dirty_target + dirty_trigger) * bytes_max) / 200)
			LF_SET(WT_CACHE_EVICT_SCRUB);
	} else
		LF_SET(WT_CACHE_EVICT_NOKEEP);

	/*
	 * Try lookaside evict when:
	 * (1) the cache is stuck; OR
	 * (2) the lookaside score goes over 80; and
	 * (3) the cache is more than half way from the dirty target to the
	 *     dirty trigger.
	 */
	if (__wt_cache_stuck(session) ||
	    (__wt_cache_lookaside_score(cache) > 80 &&
	    dirty_inuse >
	    (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200))
		LF_SET(WT_CACHE_EVICT_LOOKASIDE);

	/*
	 * With an in-memory cache, we only do dirty eviction in order to
	 * scrub pages.
	 */
	if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
		if (LF_ISSET(WT_CACHE_EVICT_CLEAN))
			LF_SET(WT_CACHE_EVICT_DIRTY);
		if (LF_ISSET(WT_CACHE_EVICT_CLEAN_HARD))
			LF_SET(WT_CACHE_EVICT_DIRTY_HARD);
		LF_CLR(WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD);
	}

	/* Update the global eviction state. */
	cache->flags = flags;

	return (F_ISSET(cache, WT_CACHE_EVICT_ALL | WT_CACHE_EVICT_URGENT));
}
663
/*
 * __evict_pass --
 *	Evict pages from memory.  The eviction server's main loop: repeat
 *	until there is no work to do, an interrupt is requested, or no
 *	progress can be made.
 */
static int
__evict_pass(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_TXN_GLOBAL *txn_global;
	uint64_t eviction_progress, oldest_id, prev_oldest_id;
	uint64_t time_now, time_prev;
	u_int loop;

	conn = S2C(session);
	cache = conn->cache;
	txn_global = &conn->txn_global;
	time_prev = 0;			/* [-Wconditional-uninitialized] */

	/* Track whether pages are being evicted and progress is made. */
	eviction_progress = cache->eviction_progress;
	prev_oldest_id = txn_global->oldest_id;

	/* Evict pages from the cache. */
	for (loop = 0; cache->pass_intr == 0; loop++) {
		time_now = __wt_clock(session);
		if (loop == 0)
			time_prev = time_now;

		/* Adjust the number of eviction worker threads. */
		__evict_tune_workers(session);
		/*
		 * Increment the shared read generation.  Do this occasionally
		 * even if eviction is not currently required, so that pages
		 * have some relative read generation when the eviction server
		 * does need to do some work.
		 */
		__wt_cache_read_gen_incr(session);
		++cache->evict_pass_gen;

		/*
		 * Update the oldest ID: we use it to decide whether pages are
		 * candidates for eviction.  Without this, if all threads are
		 * blocked after a long-running transaction (such as a
		 * checkpoint) completes, we may never start evicting again.
		 *
		 * Do this every time the eviction server wakes up, regardless
		 * of whether the cache is full, to prevent the oldest ID
		 * falling too far behind.  Don't wait to lock the table: with
		 * highly threaded workloads, that creates a bottleneck.
		 */
		WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT));

		/* No work flags were set: the pass is done. */
		if (!__evict_update_work(session))
			break;

		__wt_verbose(session, WT_VERB_EVICTSERVER,
		    "Eviction pass with: Max: %" PRIu64
		    " In use: %" PRIu64 " Dirty: %" PRIu64,
		    conn->cache_size, cache->bytes_inmem,
		    cache->bytes_dirty_intl + cache->bytes_dirty_leaf);

		/* Queue candidate pages for the worker threads. */
		if (F_ISSET(cache, WT_CACHE_EVICT_ALL))
			WT_RET(__evict_lru_walk(session));

		/*
		 * If the queue has been empty recently, keep queuing more
		 * pages to evict.  If the rate of queuing pages is high
		 * enough, this score will go to zero, in which case the
		 * eviction server might as well help out with eviction.
		 *
		 * Also, if there is a single eviction server thread with no
		 * workers, it must service the urgent queue in case all
		 * application threads are busy.
		 */
		if (!WT_EVICT_HAS_WORKERS(session) &&
		    (cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF ||
		    !__evict_queue_empty(cache->evict_urgent_queue, false)))
			WT_RET(__evict_lru_pages(session, true));

		/* A pass interrupt was requested: stop immediately. */
		if (cache->pass_intr != 0)
			break;

		/*
		 * If we're making progress, keep going; if we're not making
		 * any progress at all, mark the cache "stuck" and go back to
		 * sleep, it's not something we can fix.
		 *
		 * We check for progress every 20ms, the idea being that the
		 * aggressive score will reach 10 after 200ms if we aren't
		 * making progress and eviction will start considering more
		 * pages.  If there is still no progress after 2s, we will
		 * treat the cache as stuck and start rolling back
		 * transactions and writing updates to the lookaside table.
		 */
		if (eviction_progress == cache->eviction_progress) {
			if (WT_CLOCKDIFF_MS(time_now, time_prev) >= 20 &&
			    F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD |
			    WT_CACHE_EVICT_DIRTY_HARD)) {
				if (cache->evict_aggressive_score < 100)
					++cache->evict_aggressive_score;
				/* A stalled oldest ID also counts against us. */
				oldest_id = txn_global->oldest_id;
				if (prev_oldest_id == oldest_id &&
				    txn_global->current != oldest_id &&
				    cache->evict_aggressive_score < 100)
					++cache->evict_aggressive_score;
				time_prev = time_now;
				prev_oldest_id = oldest_id;
			}

			/*
			 * Keep trying for long enough that we should be able
			 * to evict a page if the server isn't interfering.
			 */
			if (loop < 100 || cache->evict_aggressive_score < 100) {
				/*
				 * Back off if we aren't making progress: walks
				 * hold the handle list lock, blocking other
				 * operations that can free space in cache,
				 * such as LSM discarding handles.
				 *
				 * Allow this wait to be interrupted (e.g. if a
				 * checkpoint completes): make sure we wait for
				 * a non-zero number of microseconds).
				 */
				WT_STAT_CONN_INCR(session,
				    cache_eviction_server_slept);
				__wt_cond_wait(session,
				    cache->evict_cond, WT_THOUSAND, NULL);
				continue;
			}

			/* Give up: the cache can't reach its goal. */
			WT_STAT_CONN_INCR(session, cache_eviction_slow);
			__wt_verbose(session, WT_VERB_EVICTSERVER,
			    "%s", "unable to reach eviction goal");
			break;
		}
		/* Progress was made: relax aggression and restart the count. */
		if (cache->evict_aggressive_score > 0)
			--cache->evict_aggressive_score;
		loop = 0;
		eviction_progress = cache->eviction_progress;
	}
	return (0);
}
807
/*
 * __evict_clear_walk --
 *	Clear a single walk point: release the page pinned by the current
 *	tree's eviction walk.  The caller must hold the pass lock and have
 *	the tree's data handle set in the session.
 */
static int
__evict_clear_walk(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_REF *ref;

	btree = S2BT(session);
	cache = S2C(session)->cache;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_PASS));
	/* Forget this tree as the cache's current walk tree. */
	if (session->dhandle == cache->walk_tree)
		cache->walk_tree = NULL;

	/* Nothing to do if the tree has no walk point. */
	if ((ref = btree->evict_ref) == NULL)
		return (0);

	WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned);
	WT_STAT_DATA_INCR(session, cache_eviction_walks_abandoned);

	/*
	 * Clear evict_ref before releasing it in case that forces eviction (we
	 * assert that we never try to evict the current eviction walk point).
	 */
	btree->evict_ref = NULL;

	/* Release through the walk session, which owns the walk's hazard. */
	WT_WITH_DHANDLE(cache->walk_session, session->dhandle,
	    (ret = __wt_page_release(cache->walk_session,
	    ref, WT_READ_NO_EVICT)));
	return (ret);
}
844
845 /*
846 * __evict_clear_all_walks --
847 * Clear the eviction walk points for all files a session is waiting on.
848 */
849 static int
__evict_clear_all_walks(WT_SESSION_IMPL * session)850 __evict_clear_all_walks(WT_SESSION_IMPL *session)
851 {
852 WT_CONNECTION_IMPL *conn;
853 WT_DATA_HANDLE *dhandle;
854 WT_DECL_RET;
855
856 conn = S2C(session);
857
858 TAILQ_FOREACH(dhandle, &conn->dhqh, q)
859 if (dhandle->type == WT_DHANDLE_TYPE_BTREE)
860 WT_WITH_DHANDLE(session, dhandle,
861 WT_TRET(__evict_clear_walk(session)));
862 return (ret);
863 }
864
/*
 * __wt_evict_file_exclusive_on --
 *	Get exclusive eviction access to a file and discard any of the file's
 *	blocks queued for eviction.
 */
int
__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_EVICT_ENTRY *evict;
	u_int i, elem, q;

	btree = S2BT(session);
	cache = S2C(session)->cache;

	/* Hold the walk lock to turn off eviction. */
	__wt_spin_lock(session, &cache->evict_walk_lock);
	/* If eviction is already disabled for the file, just count it. */
	if (++btree->evict_disabled > 1) {
		__wt_spin_unlock(session, &cache->evict_walk_lock);
		return (0);
	}

	/*
	 * Ensure no new pages from the file will be queued for eviction after
	 * this point, then clear any existing LRU eviction walk for the file.
	 * The pass-intr counter interrupts any in-progress eviction pass.
	 */
	(void)__wt_atomic_addv32(&cache->pass_intr, 1);
	WT_WITH_PASS_LOCK(session, ret = __evict_clear_walk(session));
	(void)__wt_atomic_subv32(&cache->pass_intr, 1);
	WT_ERR(ret);

	/*
	 * The eviction candidate list might reference pages from the file,
	 * clear it.  Hold the evict lock to remove queued pages from a file.
	 */
	__wt_spin_lock(session, &cache->evict_queue_lock);

	for (q = 0; q < WT_EVICT_QUEUE_MAX; q++) {
		__wt_spin_lock(session, &cache->evict_queues[q].evict_lock);
		elem = cache->evict_queues[q].evict_max;
		for (i = 0, evict = cache->evict_queues[q].evict_queue;
		    i < elem; i++, evict++)
			if (evict->btree == btree)
				__evict_list_clear(session, evict);
		__wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
	}

	__wt_spin_unlock(session, &cache->evict_queue_lock);

	/*
	 * We have disabled further eviction: wait for concurrent LRU eviction
	 * activity to drain.
	 */
	while (btree->evict_busy > 0)
		__wt_yield();

	if (0) {
		/* On error, undo the disable count before returning. */
err:		--btree->evict_disabled;
	}
	__wt_spin_unlock(session, &cache->evict_walk_lock);
	return (ret);
}
929
/*
 * __wt_evict_file_exclusive_off --
 *	Release exclusive eviction access to a file.
 */
void
__wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;

	btree = S2BT(session);

	/*
	 * We have seen subtle bugs with multiple threads racing to turn
	 * eviction on/off.  Make races more likely in diagnostic builds.
	 */
	WT_DIAGNOSTIC_YIELD;

	/*
	 * Atomically decrement the evict-disabled count, without acquiring the
	 * eviction walk-lock.  We can't acquire that lock here because there's
	 * a potential deadlock.  When acquiring exclusive eviction access, we
	 * acquire the eviction walk-lock and then the cache's pass-intr lock.
	 * The current eviction implementation can hold the pass-intr lock and
	 * call into this function (see WT-3303 for the details), which might
	 * deadlock with another thread trying to get exclusive eviction access.
	 */
#if defined(HAVE_DIAGNOSTIC)
	{
	int32_t v;

	/* No walk point may survive while eviction is disabled. */
	WT_ASSERT(session, btree->evict_ref == NULL);
	v = __wt_atomic_subi32(&btree->evict_disabled, 1);
	/* The count must never go negative: enable/disable must pair up. */
	WT_ASSERT(session, v >= 0);
	}
#else
	(void)__wt_atomic_subi32(&btree->evict_disabled, 1);
#endif
}
968
969 #define EVICT_TUNE_BATCH 1 /* Max workers to add each period */
970 /*
971 * Data points needed before deciding if we should keep adding workers or settle
972 * on an earlier value.
973 */
974 #define EVICT_TUNE_DATAPT_MIN 8
975 #define EVICT_TUNE_PERIOD 60 /* Tune period in milliseconds */
976
977 /*
978 * We will do a fresh re-tune every that many milliseconds to adjust to
979 * significant phase changes.
980 */
981 #define EVICT_FORCE_RETUNE 25000
982
983 /*
984 * __evict_tune_workers --
985 * Find the right number of eviction workers. Gradually ramp up the number of
986 * workers increasing the number in batches indicated by the setting above.
987 * Store the number of workers that gave us the best throughput so far and the
988 * number of data points we have tried.
989 *
990 * Every once in a while when we have the minimum number of data points we check
991 * whether the eviction throughput achieved with the current number of workers
992 * is the best we have seen so far. If so, we will keep increasing the number of
993 * workers. If not, we are past the infliction point on the eviction throughput
994 * curve. In that case, we will set the number of workers to the best observed
995 * so far and settle into a stable state.
996 */
997 static void
__evict_tune_workers(WT_SESSION_IMPL * session)998 __evict_tune_workers(WT_SESSION_IMPL *session)
999 {
1000 struct timespec current_time;
1001 WT_CACHE *cache;
1002 WT_CONNECTION_IMPL *conn;
1003 uint64_t delta_msec, delta_pages;
1004 uint64_t eviction_progress, eviction_progress_rate, time_diff;
1005 int32_t cur_threads, i, target_threads, thread_surplus;
1006
1007 conn = S2C(session);
1008 cache = conn->cache;
1009
1010 /*
1011 * If we have a fixed number of eviction threads, there is no value in
1012 * calculating if we should do any tuning.
1013 */
1014 if (conn->evict_threads_max == conn->evict_threads_min)
1015 return;
1016
1017 __wt_epoch(session, ¤t_time);
1018 time_diff = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time);
1019
1020 /*
1021 * If we have reached the stable state and have not run long enough to
1022 * surpass the forced re-tuning threshold, return.
1023 */
1024 if (cache->evict_tune_stable) {
1025 if (time_diff < EVICT_FORCE_RETUNE)
1026 return;
1027
1028 /*
1029 * Stable state was reached a long time ago. Let's re-tune.
1030 * Reset all the state.
1031 */
1032 cache->evict_tune_stable = false;
1033 cache->evict_tune_last_action_time.tv_sec = 0;
1034 cache->evict_tune_progress_last = 0;
1035 cache->evict_tune_num_points = 0;
1036 cache->evict_tune_progress_rate_max = 0;
1037
1038 /* Reduce the number of eviction workers by one */
1039 thread_surplus =
1040 (int32_t)conn->evict_threads.current_threads -
1041 (int32_t)conn->evict_threads_min;
1042
1043 if (thread_surplus > 0) {
1044 __wt_thread_group_stop_one(
1045 session, &conn->evict_threads);
1046 WT_STAT_CONN_INCR(session,
1047 cache_eviction_worker_removed);
1048 }
1049 WT_STAT_CONN_INCR(session, cache_eviction_force_retune);
1050 } else
1051 if (time_diff < EVICT_TUNE_PERIOD)
1052 /*
1053 * If we have not reached stable state, don't do
1054 * anything unless enough time has passed since the last
1055 * time we have taken any action in this function.
1056 */
1057 return;
1058
1059 /*
1060 * Measure the evicted progress so far. Eviction rate correlates to
1061 * performance, so this is our metric of success.
1062 */
1063 eviction_progress = cache->eviction_progress;
1064
1065 /*
1066 * If we have recorded the number of pages evicted at the end of
1067 * the previous measurement interval, we can compute the eviction
1068 * rate in evicted pages per second achieved during the current
1069 * measurement interval.
1070 * Otherwise, we just record the number of evicted pages and return.
1071 */
1072 if (cache->evict_tune_progress_last == 0)
1073 goto done;
1074
1075 delta_msec = WT_TIMEDIFF_MS(current_time, cache->evict_tune_last_time);
1076 delta_pages = eviction_progress - cache->evict_tune_progress_last;
1077 eviction_progress_rate = (delta_pages * WT_THOUSAND) / delta_msec;
1078 cache->evict_tune_num_points++;
1079
1080 /*
1081 * Keep track of the maximum eviction throughput seen and the number
1082 * of workers corresponding to that throughput.
1083 */
1084 if (eviction_progress_rate > cache->evict_tune_progress_rate_max) {
1085 cache->evict_tune_progress_rate_max = eviction_progress_rate;
1086 cache->evict_tune_workers_best =
1087 conn->evict_threads.current_threads;
1088 }
1089
1090 /*
1091 * Compare the current number of data points with the number
1092 * needed variable. If they are equal, we will check whether
1093 * we are still going up on the performance curve, in which
1094 * case we will increase the number of needed data points, to provide
1095 * opportunity for further increasing the number of workers. Or
1096 * we are past the inflection point on the curve, in which case
1097 * we will go back to the best observed number of workers and
1098 * settle into a stable state.
1099 */
1100 if (cache->evict_tune_num_points >= cache->evict_tune_datapts_needed) {
1101 if (cache->evict_tune_workers_best ==
1102 conn->evict_threads.current_threads &&
1103 conn->evict_threads.current_threads <
1104 conn->evict_threads_max) {
1105 /*
1106 * Keep adding workers. We will check again
1107 * at the next check point.
1108 */
1109 cache->evict_tune_datapts_needed += WT_MIN(
1110 EVICT_TUNE_DATAPT_MIN,
1111 (conn->evict_threads_max -
1112 conn->evict_threads.current_threads) /
1113 EVICT_TUNE_BATCH);
1114 } else {
1115 /*
1116 * We are past the inflection point. Choose the
1117 * best number of eviction workers observed and
1118 * settle into a stable state.
1119 */
1120 thread_surplus =
1121 (int32_t)conn->evict_threads.current_threads -
1122 (int32_t)cache->evict_tune_workers_best;
1123
1124 for (i = 0; i < thread_surplus; i++) {
1125 __wt_thread_group_stop_one(
1126 session, &conn->evict_threads);
1127 WT_STAT_CONN_INCR(session,
1128 cache_eviction_worker_removed);
1129 }
1130 cache->evict_tune_stable = true;
1131 goto done;
1132 }
1133 }
1134
1135 /*
1136 * If we have not added any worker threads in the past, we set the
1137 * number of data points needed equal to the number of data points that
1138 * we must accumulate before deciding if we should keep adding workers
1139 * or settle on a previously tried stable number of workers.
1140 */
1141 if (cache->evict_tune_last_action_time.tv_sec == 0)
1142 cache->evict_tune_datapts_needed = EVICT_TUNE_DATAPT_MIN;
1143
1144 if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) {
1145 cur_threads = (int32_t)conn->evict_threads.current_threads;
1146 target_threads = WT_MIN(cur_threads + EVICT_TUNE_BATCH,
1147 (int32_t)conn->evict_threads_max);
1148 /*
1149 * Start the new threads.
1150 */
1151 for (i = cur_threads; i < target_threads; ++i) {
1152 __wt_thread_group_start_one(session,
1153 &conn->evict_threads, false);
1154 WT_STAT_CONN_INCR(session,
1155 cache_eviction_worker_created);
1156 __wt_verbose(session,
1157 WT_VERB_EVICTSERVER, "%s", "added worker thread");
1158 }
1159 cache->evict_tune_last_action_time = current_time;
1160 }
1161
1162 done: cache->evict_tune_last_time = current_time;
1163 cache->evict_tune_progress_last = eviction_progress;
1164 }
1165
1166 /*
1167 * __evict_lru_pages --
1168 * Get pages from the LRU queue to evict.
1169 */
1170 static int
__evict_lru_pages(WT_SESSION_IMPL * session,bool is_server)1171 __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server)
1172 {
1173 WT_CONNECTION_IMPL *conn;
1174 WT_DECL_RET;
1175 WT_TRACK_OP_DECL;
1176
1177 WT_TRACK_OP_INIT(session);
1178 conn = S2C(session);
1179
1180 /*
1181 * Reconcile and discard some pages: EBUSY is returned if a page fails
1182 * eviction because it's unavailable, continue in that case.
1183 */
1184 while (F_ISSET(conn, WT_CONN_EVICTION_RUN) && ret == 0)
1185 if ((ret = __evict_page(session, is_server)) == EBUSY)
1186 ret = 0;
1187
1188 /* If a worker thread found the queue empty, pause. */
1189 if (ret == WT_NOTFOUND && !is_server &&
1190 F_ISSET(conn, WT_CONN_EVICTION_RUN))
1191 __wt_cond_wait(
1192 session, conn->evict_threads.wait_cond, 10000, NULL);
1193
1194 WT_TRACK_OP_END(session);
1195 return (ret == WT_NOTFOUND ? 0 : ret);
1196 }
1197
1198 /*
1199 * __evict_lru_walk --
1200 * Add pages to the LRU queue to be evicted from cache.
1201 */
static int
__evict_lru_walk(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_EVICT_QUEUE *queue, *other_queue;
	WT_TRACK_OP_DECL;
	uint64_t read_gen_oldest;
	uint32_t candidates, entries;

	WT_TRACK_OP_INIT(session);
	cache = S2C(session)->cache;

	/* Age out the score of how much the queue has been empty recently. */
	if (cache->evict_empty_score > 0)
		--cache->evict_empty_score;

	/* Fill the next queue (that isn't the urgent queue). */
	queue = cache->evict_fill_queue;
	other_queue = cache->evict_queues + (1 - (queue - cache->evict_queues));
	cache->evict_fill_queue = other_queue;

	/* If this queue is full, try the other one. */
	if (__evict_queue_full(queue) && !__evict_queue_full(other_queue))
		queue = other_queue;

	/*
	 * If both queues are full and haven't been empty on recent refills,
	 * we're done.
	 */
	if (__evict_queue_full(queue) &&
	    cache->evict_empty_score < WT_EVICT_SCORE_CUTOFF)
		goto err;

	/*
	 * If the queue we are filling is empty, pages are being requested
	 * faster than they are being queued: bump the "empty" score when
	 * eviction is under pressure (the HARD flags), so full queues are
	 * still refilled by the check above.
	 */
	if (__evict_queue_empty(queue, false)) {
		if (F_ISSET(cache,
		    WT_CACHE_EVICT_CLEAN_HARD | WT_CACHE_EVICT_DIRTY_HARD))
			cache->evict_empty_score = WT_MIN(
			    cache->evict_empty_score + WT_EVICT_SCORE_BUMP,
			    WT_EVICT_SCORE_MAX);
		WT_STAT_CONN_INCR(session, cache_eviction_queue_empty);
	} else
		WT_STAT_CONN_INCR(session, cache_eviction_queue_not_empty);

	/*
	 * Get some more pages to consider for eviction.
	 *
	 * If the walk is interrupted, we still need to sort the queue: the
	 * next walk assumes there are no entries beyond WT_EVICT_WALK_BASE.
	 */
	if ((ret = __evict_walk(cache->walk_session, queue)) == EBUSY)
		ret = 0;
	WT_ERR_NOTFOUND_OK(ret);

	/* Sort the list into LRU order and restart. */
	__wt_spin_lock(session, &queue->evict_lock);

	/*
	 * We have locked the queue: in the (unusual) case where we are filling
	 * the current queue, mark it empty so that subsequent requests switch
	 * to the other queue.
	 */
	if (queue == cache->evict_current_queue)
		queue->evict_current = NULL;

	entries = queue->evict_entries;
	__wt_qsort(queue->evict_queue,
	    entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);

	/* Trim empty entries from the end. */
	while (entries > 0 && queue->evict_queue[entries - 1].ref == NULL)
		--entries;

	/*
	 * If we have more entries than the maximum tracked between walks,
	 * clear them. Do this before figuring out how many of the entries are
	 * candidates so we never end up with more candidates than entries.
	 */
	while (entries > WT_EVICT_WALK_BASE)
		__evict_list_clear(session, &queue->evict_queue[--entries]);

	queue->evict_entries = entries;

	if (entries == 0) {
		/*
		 * If there are no entries, there cannot be any candidates.
		 * Make sure application threads don't read past the end of the
		 * candidate list, or they may race with the next walk.
		 */
		queue->evict_candidates = 0;
		queue->evict_current = NULL;
		__wt_spin_unlock(session, &queue->evict_lock);
		goto err;
	}

	/* Decide how many of the candidates we're going to try and evict. */
	if (__wt_cache_aggressive(session))
		queue->evict_candidates = entries;
	else {
		/*
		 * Find the oldest read generation apart that we have in the
		 * queue, used to set the initial value for pages read into the
		 * system. The queue is sorted, find the first "normal"
		 * generation.
		 */
		read_gen_oldest = WT_READGEN_START_VALUE;
		for (candidates = 0; candidates < entries; ++candidates) {
			read_gen_oldest = queue->evict_queue[candidates].score;
			if (!WT_READGEN_EVICT_SOON(read_gen_oldest))
				break;
		}

		/*
		 * Take all candidates if we only gathered pages with an oldest
		 * read generation set.
		 *
		 * We normally never take more than 50% of the entries but if
		 * 50% of the entries were at the oldest read generation, take
		 * all of them.
		 */
		if (WT_READGEN_EVICT_SOON(read_gen_oldest))
			queue->evict_candidates = entries;
		else if (candidates > entries / 2)
			queue->evict_candidates = candidates;
		else {
			/*
			 * Take all of the urgent pages plus a third of
			 * ordinary candidates (which could be expressed as
			 * WT_EVICT_WALK_INCR / WT_EVICT_WALK_BASE). In the
			 * steady state, we want to get as many candidates as
			 * the eviction walk adds to the queue.
			 *
			 * That said, if there is only one entry, which is
			 * normal when populating an empty file, don't exclude
			 * it.
			 */
			queue->evict_candidates =
			    1 + candidates + ((entries - candidates) - 1) / 3;
			cache->read_gen_oldest = read_gen_oldest;
		}
	}

	/* Publish the refilled queue: consumers restart from the head. */
	queue->evict_current = queue->evict_queue;
	__wt_spin_unlock(session, &queue->evict_lock);

	/*
	 * Signal any application or helper threads that may be waiting
	 * to help with eviction.
	 */
	__wt_cond_signal(session, S2C(session)->evict_threads.wait_cond);

err:	WT_TRACK_OP_END(session);
	return (ret);
}
1360
1361 /*
1362 * __evict_walk --
1363 * Fill in the array by walking the next set of pages.
1364 */
static int
__evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	WT_TRACK_OP_DECL;
	u_int max_entries, retries, slot, start_slot, total_candidates;
	bool dhandle_locked, incr;

	WT_TRACK_OP_INIT(session);

	conn = S2C(session);
	cache = conn->cache;
	btree = NULL;
	dhandle = NULL;
	dhandle_locked = incr = false;
	retries = 0;

	/*
	 * Set the starting slot in the queue and the maximum pages added
	 * per walk.
	 */
	start_slot = slot = queue->evict_entries;
	max_entries = WT_MIN(slot + WT_EVICT_WALK_INCR, cache->evict_slots);

	/*
	 * Another pathological case: if there are only a tiny number of
	 * candidate pages in cache, don't put all of them on one queue.
	 */
	total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ?
	    __wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf);
	max_entries = WT_MIN(max_entries, 1 + total_candidates / 2);

retry:	while (slot < max_entries) {
		/*
		 * If another thread is waiting on the eviction server to clear
		 * the walk point in a tree, give up.
		 */
		if (cache->pass_intr != 0)
			WT_ERR(EBUSY);

		/*
		 * Lock the dhandle list to find the next handle and bump its
		 * reference count to keep it alive while we sweep.
		 */
		if (!dhandle_locked) {
			WT_ERR(__evict_lock_handle_list(session));
			dhandle_locked = true;
		}

		if (dhandle == NULL) {
			/*
			 * On entry, continue from wherever we got to in the
			 * scan last time through. If we don't have a saved
			 * handle, start from the beginning of the list.
			 */
			if ((dhandle = cache->walk_tree) != NULL)
				cache->walk_tree = NULL;
			else
				dhandle = TAILQ_FIRST(&conn->dhqh);
		} else {
			/* Release the previously pinned handle. */
			if (incr) {
				WT_ASSERT(session, dhandle->session_inuse > 0);
				(void)__wt_atomic_subi32(
				    &dhandle->session_inuse, 1);
				incr = false;
				cache->walk_tree = NULL;
			}
			dhandle = TAILQ_NEXT(dhandle, q);
		}

		/* If we reach the end of the list, we're done. */
		if (dhandle == NULL)
			break;

		/* Ignore non-btree handles, or handles that aren't open. */
		if (dhandle->type != WT_DHANDLE_TYPE_BTREE ||
		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
			continue;

		/* Skip files that don't allow eviction. */
		btree = dhandle->handle;
		if (btree->evict_disabled > 0)
			continue;

		/*
		 * Skip files that are checkpointing if we are only looking for
		 * dirty pages.
		 */
		if (WT_BTREE_SYNCING(btree) &&
		    !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
			continue;

		/*
		 * Skip files that are configured to stick in cache until we
		 * become aggressive.
		 */
		if (btree->evict_priority != 0 &&
		    !__wt_cache_aggressive(session))
			continue;

		/*
		 * Skip files if we have too many active walks.
		 *
		 * This used to be limited by the configured maximum number of
		 * hazard pointers per session. Even though that ceiling has
		 * been removed, we need to test eviction with huge numbers of
		 * active trees before allowing larger numbers of hazard
		 * pointers in the walk session.
		 */
		if (btree->evict_ref == NULL &&
		    session->nhazard > WT_EVICT_MAX_TREES)
			continue;

		/*
		 * If we are filling the queue, skip files that haven't been
		 * useful in the past.
		 */
		if (btree->evict_walk_period != 0 &&
		    btree->evict_walk_skips++ < btree->evict_walk_period)
			continue;
		btree->evict_walk_skips = 0;

		/* Pin the handle, then drop the list lock for the walk. */
		(void)__wt_atomic_addi32(&dhandle->session_inuse, 1);
		incr = true;
		__wt_readunlock(session, &conn->dhandle_lock);
		dhandle_locked = false;

		/*
		 * Re-check the "no eviction" flag, used to enforce exclusive
		 * access when a handle is being closed.
		 *
		 * Only try to acquire the lock and simply continue if we fail;
		 * the lock is held while the thread turning off eviction clears
		 * the tree's current eviction point, and part of the process is
		 * waiting on this thread to acknowledge that action.
		 *
		 * If a handle is being discarded, it will still be marked open,
		 * but won't have a root page.
		 *
		 * NOTE(review): this takes the branch when the trylock call
		 * returns zero, i.e. presumably when the lock was acquired —
		 * confirm against the spinlock implementation.
		 */
		if (btree->evict_disabled == 0 &&
		    !__wt_spin_trylock(session, &cache->evict_walk_lock)) {
			if (btree->evict_disabled == 0 &&
			    btree->root.page != NULL) {
				/*
				 * Remember the file to visit first, next loop.
				 */
				cache->walk_tree = dhandle;
				WT_WITH_DHANDLE(session, dhandle,
				    ret = __evict_walk_tree(
				    session, queue, max_entries, &slot));

				WT_ASSERT(session, __wt_session_gen(
				    session, WT_GEN_SPLIT) == 0);
			}
			__wt_spin_unlock(session, &cache->evict_walk_lock);
			WT_ERR(ret);
		}
	}

	/* Release the last handle visited, if one is still pinned. */
	if (incr) {
		WT_ASSERT(session, dhandle->session_inuse > 0);
		(void)__wt_atomic_subi32(&dhandle->session_inuse, 1);
		incr = false;
	}

	/*
	 * Walk the list of files a few times if we don't find enough pages.
	 * Try two passes through all the files, give up when we have some
	 * candidates and we aren't finding more.
	 */
	if (slot < max_entries && (retries < 2 ||
	    (retries < WT_RETRY_MAX &&
	    (slot == queue->evict_entries || slot > start_slot)))) {
		start_slot = slot;
		++retries;
		goto retry;
	}

err:	if (dhandle_locked)
		__wt_readunlock(session, &conn->dhandle_lock);

	/*
	 * If we didn't find any entries on a walk when we weren't interrupted,
	 * let our caller know.
	 */
	if (queue->evict_entries == slot && cache->pass_intr == 0)
		ret = WT_NOTFOUND;

	queue->evict_entries = slot;
	WT_TRACK_OP_END(session);
	return (ret);
}
1561
1562 /*
1563 * __evict_push_candidate --
1564 * Initialize a WT_EVICT_ENTRY structure with a given page.
1565 */
1566 static bool
__evict_push_candidate(WT_SESSION_IMPL * session,WT_EVICT_QUEUE * queue,WT_EVICT_ENTRY * evict,WT_REF * ref)1567 __evict_push_candidate(WT_SESSION_IMPL *session,
1568 WT_EVICT_QUEUE *queue, WT_EVICT_ENTRY *evict, WT_REF *ref)
1569 {
1570 uint8_t orig_flags, new_flags;
1571 u_int slot;
1572
1573 /*
1574 * Threads can race to queue a page (e.g., an ordinary LRU walk can
1575 * race with a page being queued for urgent eviction).
1576 */
1577 orig_flags = new_flags = ref->page->flags_atomic;
1578 FLD_SET(new_flags, WT_PAGE_EVICT_LRU);
1579 if (orig_flags == new_flags ||
1580 !__wt_atomic_cas8(&ref->page->flags_atomic, orig_flags, new_flags))
1581 return (false);
1582
1583 /* Keep track of the maximum slot we are using. */
1584 slot = (u_int)(evict - queue->evict_queue);
1585 if (slot >= queue->evict_max)
1586 queue->evict_max = slot + 1;
1587
1588 if (evict->ref != NULL)
1589 __evict_list_clear(session, evict);
1590
1591 evict->btree = S2BT(session);
1592 evict->ref = ref;
1593 evict->score = __evict_entry_priority(session, ref);
1594
1595 /* Adjust for size when doing dirty eviction. */
1596 if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DIRTY) &&
1597 evict->score != WT_READGEN_OLDEST && evict->score != UINT64_MAX &&
1598 !__wt_page_is_modified(ref->page))
1599 evict->score += WT_MEGABYTE -
1600 WT_MIN(WT_MEGABYTE, ref->page->memory_footprint);
1601
1602 return (true);
1603 }
1604
1605 /*
1606 * __evict_walk_target --
1607 * Calculate how many pages to queue for a given tree.
1608 */
1609 static uint32_t
__evict_walk_target(WT_SESSION_IMPL * session,u_int max_entries)1610 __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries)
1611 {
1612 WT_CACHE *cache;
1613 uint64_t btree_inuse, bytes_per_slot, cache_inuse;
1614 uint32_t target_pages_clean, target_pages_dirty, target_pages;
1615 uint32_t total_slots;
1616
1617 cache = S2C(session)->cache;
1618 target_pages_clean = target_pages_dirty = 0;
1619 total_slots = max_entries;
1620
1621 /*
1622 * The number of times we should fill the queue by the end of
1623 * considering all trees.
1624 */
1625 #define QUEUE_FILLS_PER_PASS 10
1626
1627 /*
1628 * The minimum number of pages we should consider per tree.
1629 */
1630 #define MIN_PAGES_PER_TREE 10
1631
1632 /*
1633 * The target number of pages for this tree is proportional to the
1634 * space it is taking up in cache. Round to the nearest number of
1635 * slots so we assign all of the slots to a tree filling 99+% of the
1636 * cache (and only have to walk it once).
1637 */
1638 if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) {
1639 btree_inuse = __wt_btree_bytes_evictable(session);
1640 cache_inuse = __wt_cache_bytes_inuse(cache);
1641 bytes_per_slot = 1 + cache_inuse / total_slots;
1642 target_pages_clean = (uint32_t)(
1643 (btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
1644 }
1645
1646 if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) {
1647 btree_inuse = __wt_btree_dirty_leaf_inuse(session);
1648 cache_inuse = __wt_cache_dirty_leaf_inuse(cache);
1649 bytes_per_slot = 1 + cache_inuse / total_slots;
1650 target_pages_dirty = (uint32_t)(
1651 (btree_inuse + bytes_per_slot / 2) / bytes_per_slot);
1652 }
1653
1654 /*
1655 * Weight the number of target pages by the number of times we want to
1656 * fill the cache per pass through all the trees. Note that we don't
1657 * build this into the calculation above because we don't want to favor
1658 * small trees, so round to a whole number of slots (zero for small
1659 * trees) before multiplying.
1660 */
1661 target_pages = WT_MAX(target_pages_clean, target_pages_dirty) *
1662 QUEUE_FILLS_PER_PASS;
1663
1664 /*
1665 * Walk trees with a small fraction of the cache in case there are so
1666 * many trees that none of them use enough of the cache to be allocated
1667 * slots. Only skip a tree if it has no bytes of interest.
1668 */
1669 if (target_pages == 0) {
1670 btree_inuse = F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ?
1671 __wt_btree_bytes_evictable(session) :
1672 __wt_btree_dirty_leaf_inuse(session);
1673
1674 if (btree_inuse == 0)
1675 return (0);
1676 }
1677
1678 /*
1679 * There is some cost associated with walking a tree. If we're going
1680 * to visit this tree, always look for a minimum number of pages.
1681 */
1682 if (target_pages < MIN_PAGES_PER_TREE)
1683 target_pages = MIN_PAGES_PER_TREE;
1684
1685 /* If the tree is dead, take a lot of pages. */
1686 if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
1687 target_pages *= 10;
1688
1689 return (target_pages);
1690 }
1691
1692 /*
1693 * __evict_walk_tree --
1694 * Get a few page eviction candidates from a single underlying file.
1695 */
1696 static int
__evict_walk_tree(WT_SESSION_IMPL * session,WT_EVICT_QUEUE * queue,u_int max_entries,u_int * slotp)1697 __evict_walk_tree(WT_SESSION_IMPL *session,
1698 WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp)
1699 {
1700 WT_BTREE *btree;
1701 WT_CACHE *cache;
1702 WT_CONNECTION_IMPL *conn;
1703 WT_DECL_RET;
1704 WT_EVICT_ENTRY *end, *evict, *start;
1705 WT_PAGE *last_parent, *page;
1706 WT_REF *ref;
1707 uint64_t min_pages, pages_seen, pages_queued, refs_walked;
1708 uint32_t read_flags, remaining_slots, target_pages, walk_flags;
1709 int restarts;
1710 bool give_up, modified, urgent_queued;
1711
1712 conn = S2C(session);
1713 btree = S2BT(session);
1714 cache = conn->cache;
1715 last_parent = NULL;
1716 restarts = 0;
1717 give_up = urgent_queued = false;
1718
1719 /*
1720 * Figure out how many slots to fill from this tree.
1721 * Note that some care is taken in the calculation to avoid overflow.
1722 */
1723 start = queue->evict_queue + *slotp;
1724 remaining_slots = max_entries - *slotp;
1725 if (btree->evict_walk_progress >= btree->evict_walk_target) {
1726 btree->evict_walk_target =
1727 __evict_walk_target(session, max_entries);
1728 btree->evict_walk_progress = 0;
1729 }
1730 target_pages = WT_MIN(btree->evict_walk_target / QUEUE_FILLS_PER_PASS,
1731 btree->evict_walk_target - btree->evict_walk_progress);
1732
1733 if (target_pages > remaining_slots)
1734 target_pages = remaining_slots;
1735
1736 /* If we don't want any pages from this tree, move on. */
1737 if (target_pages == 0)
1738 return (0);
1739
1740 /*
1741 * These statistics generate a histogram of the number of pages targeted
1742 * for eviction each round. The range of values here start at
1743 * MIN_PAGES_PER_TREE as this is the smallest number of pages we can
1744 * target, unless there are fewer slots available. The aim is to cover
1745 * the likely ranges of target pages in as few statistics as possible to
1746 * reduce the overall overhead.
1747 */
1748 if (target_pages < MIN_PAGES_PER_TREE) {
1749 WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt10);
1750 WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt10);
1751 } else if (target_pages < 32) {
1752 WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt32);
1753 WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt32);
1754 } else if (target_pages < 64) {
1755 WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt64);
1756 WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt64);
1757 } else if (target_pages < 128) {
1758 WT_STAT_CONN_INCR(session, cache_eviction_target_page_lt128);
1759 WT_STAT_DATA_INCR(session, cache_eviction_target_page_lt128);
1760 } else {
1761 WT_STAT_CONN_INCR(session, cache_eviction_target_page_ge128);
1762 WT_STAT_DATA_INCR(session, cache_eviction_target_page_ge128);
1763 }
1764
1765 end = start + target_pages;
1766
1767 /*
1768 * Examine at least a reasonable number of pages before deciding
1769 * whether to give up. When we are only looking for dirty pages,
1770 * search the tree for longer.
1771 */
1772 min_pages = 10 * (uint64_t)target_pages;
1773 if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY) &&
1774 !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
1775 min_pages *= 10;
1776
1777 if (btree->evict_ref == NULL) {
1778 WT_STAT_CONN_INCR(session, cache_eviction_walk_from_root);
1779 WT_STAT_DATA_INCR(session, cache_eviction_walk_from_root);
1780 } else {
1781 WT_STAT_CONN_INCR(session, cache_eviction_walk_saved_pos);
1782 WT_STAT_DATA_INCR(session, cache_eviction_walk_saved_pos);
1783 }
1784
1785 walk_flags =
1786 WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
1787
1788 /*
1789 * Choose a random point in the tree if looking for candidates in a
1790 * tree with no starting point set. This is mostly aimed at ensuring
1791 * eviction fairly visits all pages in trees with a lot of in-cache
1792 * content.
1793 */
1794 switch (btree->evict_start_type) {
1795 case WT_EVICT_WALK_NEXT:
1796 break;
1797 case WT_EVICT_WALK_PREV:
1798 FLD_SET(walk_flags, WT_READ_PREV);
1799 break;
1800 case WT_EVICT_WALK_RAND_PREV:
1801 FLD_SET(walk_flags, WT_READ_PREV);
1802 /* FALLTHROUGH */
1803 case WT_EVICT_WALK_RAND_NEXT:
1804 read_flags = WT_READ_CACHE | WT_READ_NO_EVICT |
1805 WT_READ_NO_GEN | WT_READ_NO_WAIT |
1806 WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK;
1807 if (btree->evict_ref == NULL) {
1808 /* Ensure internal pages indexes remain valid */
1809 WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent(
1810 session, &btree->evict_ref, read_flags));
1811 WT_RET_NOTFOUND_OK(ret);
1812 }
1813 break;
1814 }
1815
1816 /*
1817 * Get some more eviction candidate pages, starting at the last saved
1818 * point. Clear the saved point immediately, we assert when discarding
1819 * pages we're not discarding an eviction point, so this clear must be
1820 * complete before the page is released.
1821 */
1822 ref = btree->evict_ref;
1823 btree->evict_ref = NULL;
1824
1825 /*
1826 * !!! Take care terminating this loop.
1827 *
1828 * Don't make an extra call to __wt_tree_walk after we hit the end of a
1829 * tree: that will leave a page pinned, which may prevent any work from
1830 * being done.
1831 *
1832 * Once we hit the page limit, do one more step through the walk in
1833 * case we are appending and only the last page in the file is live.
1834 */
1835 for (evict = start, pages_queued = pages_seen = refs_walked = 0;
1836 evict < end && (ret == 0 || ret == WT_NOTFOUND);
1837 last_parent = ref == NULL ? NULL : ref->home,
1838 ret = __wt_tree_walk_count(
1839 session, &ref, &refs_walked, walk_flags)) {
1840 /*
1841 * Check whether we're finding a good ratio of candidates vs
1842 * pages seen. Some workloads create "deserts" in trees where
1843 * no good eviction candidates can be found. Abandon the walk
1844 * if we get into that situation.
1845 */
1846 give_up = !__wt_cache_aggressive(session) &&
1847 !F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
1848 pages_seen > min_pages &&
1849 (pages_queued == 0 || (pages_seen / pages_queued) >
1850 (min_pages / target_pages));
1851 if (give_up) {
1852 /*
1853 * Try a different walk start point next time if a
1854 * walk gave up.
1855 */
1856 switch (btree->evict_start_type) {
1857 case WT_EVICT_WALK_NEXT:
1858 btree->evict_start_type = WT_EVICT_WALK_PREV;
1859 break;
1860 case WT_EVICT_WALK_PREV:
1861 btree->evict_start_type =
1862 WT_EVICT_WALK_RAND_PREV;
1863 break;
1864 case WT_EVICT_WALK_RAND_PREV:
1865 btree->evict_start_type =
1866 WT_EVICT_WALK_RAND_NEXT;
1867 break;
1868 case WT_EVICT_WALK_RAND_NEXT:
1869 btree->evict_start_type = WT_EVICT_WALK_NEXT;
1870 break;
1871 }
1872
1873 /*
1874 * We differentiate the reasons we gave up on this walk
1875 * and increment the stats accordingly.
1876 */
1877 if (pages_queued == 0) {
1878 WT_STAT_CONN_INCR(session,
1879 cache_eviction_walks_gave_up_no_targets);
1880 WT_STAT_DATA_INCR(session,
1881 cache_eviction_walks_gave_up_no_targets);
1882 } else {
1883 WT_STAT_CONN_INCR(session,
1884 cache_eviction_walks_gave_up_ratio);
1885 WT_STAT_DATA_INCR(session,
1886 cache_eviction_walks_gave_up_ratio);
1887 }
1888 break;
1889 }
1890
1891 if (ref == NULL) {
1892 WT_STAT_CONN_INCR(session, cache_eviction_walks_ended);
1893 WT_STAT_DATA_INCR(session, cache_eviction_walks_ended);
1894
1895 if (++restarts == 2) {
1896 WT_STAT_CONN_INCR(
1897 session, cache_eviction_walks_stopped);
1898 WT_STAT_DATA_INCR(
1899 session, cache_eviction_walks_stopped);
1900 break;
1901 }
1902 WT_STAT_CONN_INCR(
1903 session, cache_eviction_walks_started);
1904 continue;
1905 }
1906
1907 ++pages_seen;
1908
1909 /* Ignore root pages entirely. */
1910 if (__wt_ref_is_root(ref))
1911 continue;
1912
1913 page = ref->page;
1914 modified = __wt_page_is_modified(page);
1915 page->evict_pass_gen = cache->evict_pass_gen;
1916
1917 /*
1918 * Use the EVICT_LRU flag to avoid putting pages onto the list
1919 * multiple times.
1920 */
1921 if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
1922 continue;
1923
1924 /* Don't queue dirty pages in trees during checkpoints. */
1925 if (modified && WT_BTREE_SYNCING(btree))
1926 continue;
1927
1928 /*
1929 * It's possible (but unlikely) to visit a page without a read
1930 * generation, if we race with the read instantiating the page.
1931 * Set the page's read generation here to ensure a bug doesn't
1932 * somehow leave a page without a read generation.
1933 */
1934 if (page->read_gen == WT_READGEN_NOTSET)
1935 __wt_cache_read_gen_new(session, page);
1936
1937 /* Pages being forcibly evicted go on the urgent queue. */
1938 if (page->read_gen == WT_READGEN_OLDEST ||
1939 page->memory_footprint >= btree->splitmempage) {
1940 WT_STAT_CONN_INCR(
1941 session, cache_eviction_pages_queued_oldest);
1942 if (__wt_page_evict_urgent(session, ref))
1943 urgent_queued = true;
1944 continue;
1945 }
1946
1947 /*
1948 * Pages that are empty or from dead trees are fast-tracked.
1949 *
1950 * Also evict lookaside table pages without further filtering:
1951 * the cache is under pressure by definition and we want to
1952 * free space.
1953 */
1954 if (__wt_page_is_empty(page) ||
1955 F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
1956 F_ISSET(btree, WT_BTREE_LOOKASIDE))
1957 goto fast;
1958
1959 /*
1960 * If application threads are blocked on eviction of clean
1961 * pages, and the only thing preventing a clean leaf page from
1962 * being evicted is it contains historical data, mark it dirty
1963 * so we can do lookaside eviction. We also mark the tree
1964 * dirty to avoid an assertion that we don't discard dirty
1965 * pages from a clean tree.
1966 */
1967 if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD) &&
1968 !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) &&
1969 !WT_PAGE_IS_INTERNAL(page) &&
1970 !modified && page->modify != NULL &&
1971 !__wt_txn_visible_all(session, page->modify->rec_max_txn,
1972 WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp))) {
1973 __wt_page_modify_set(session, page);
1974 goto fast;
1975 }
1976
1977 /* Skip clean pages if appropriate. */
1978 if (!modified && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
1979 continue;
1980
1981 /* Skip dirty pages if appropriate. */
1982 if (modified && !F_ISSET(cache, WT_CACHE_EVICT_DIRTY))
1983 continue;
1984
1985 /*
1986 * Don't attempt eviction of internal pages with children in
1987 * cache (indicated by seeing an internal page that is the
1988 * parent of the last page we saw).
1989 *
1990 * Also skip internal page unless we get aggressive or the tree
1991 * is idle (indicated by the tree being skipped for walks).
1992 * The goal here is that if trees become completely idle, we
1993 * eventually push them out of cache completely.
1994 */
1995 if (WT_PAGE_IS_INTERNAL(page)) {
1996 if (page == last_parent)
1997 continue;
1998 if (btree->evict_walk_period == 0 &&
1999 !__wt_cache_aggressive(session))
2000 continue;
2001 }
2002
2003 /* If eviction gets aggressive, anything else is fair game. */
2004 if (__wt_cache_aggressive(session))
2005 goto fast;
2006
2007 /*
2008 * If the global transaction state hasn't changed since the
2009 * last time we tried eviction, it's unlikely we can make
2010 * progress. Similarly, if the most recent update on the page
2011 * is not yet globally visible, eviction will fail. This
2012 * heuristic avoids repeated attempts to evict the same page.
2013 */
2014 if (!__wt_page_evict_retry(session, page) || (modified &&
2015 !__txn_visible_all_id(session, page->modify->update_txn)))
2016 continue;
2017
2018 fast: /* If the page can't be evicted, give up. */
2019 if (!__wt_page_can_evict(session, ref, NULL))
2020 continue;
2021
2022 WT_ASSERT(session, evict->ref == NULL);
2023 if (!__evict_push_candidate(session, queue, evict, ref))
2024 continue;
2025 ++evict;
2026 ++pages_queued;
2027 ++btree->evict_walk_progress;
2028
2029 __wt_verbose(session, WT_VERB_EVICTSERVER,
2030 "select: %p, size %" WT_SIZET_FMT,
2031 (void *)page, page->memory_footprint);
2032 }
2033 WT_RET_NOTFOUND_OK(ret);
2034
2035 *slotp += (u_int)(evict - start);
2036 WT_STAT_CONN_INCRV(
2037 session, cache_eviction_pages_queued, (u_int)(evict - start));
2038
2039 __wt_verbose(session, WT_VERB_EVICTSERVER,
2040 "%s walk: seen %" PRIu64 ", queued %" PRIu64,
2041 session->dhandle->name, pages_seen, pages_queued);
2042
2043 /*
2044 * If we couldn't find the number of pages we were looking for, skip
2045 * the tree next time.
2046 */
2047 if (pages_queued < target_pages / 2 && !urgent_queued)
2048 btree->evict_walk_period = WT_MIN(
2049 WT_MAX(1, 2 * btree->evict_walk_period), 100);
2050 else if (pages_queued == target_pages)
2051 btree->evict_walk_period = 0;
2052 else if (btree->evict_walk_period > 0)
2053 btree->evict_walk_period /= 2;
2054
2055 /*
2056 * Give up the walk occasionally.
2057 *
2058 * If we happen to end up on the root page or a page requiring urgent
2059 * eviction, clear it. We have to track hazard pointers, and the root
2060 * page complicates that calculation.
2061 *
2062 * Likewise if we found no new candidates during the walk: there is no
2063 * point keeping a page pinned, since it may be the only candidate in
2064 * an idle tree.
2065 *
2066 * If we land on a page requiring forced eviction, or that isn't an
2067 * ordinary in-memory page (e.g., WT_REF_LIMBO), move until we find an
2068 * ordinary page: we should not prevent exclusive access to the page
2069 * until the next walk.
2070 */
2071 if (ref != NULL) {
2072 if (__wt_ref_is_root(ref) || evict == start || give_up ||
2073 ref->page->memory_footprint >= btree->splitmempage) {
2074 if (restarts == 0)
2075 WT_STAT_CONN_INCR(
2076 session, cache_eviction_walks_abandoned);
2077 WT_RET(__wt_page_release(
2078 cache->walk_session, ref, walk_flags));
2079 ref = NULL;
2080 } else
2081 while (ref != NULL && (ref->state != WT_REF_MEM ||
2082 WT_READGEN_EVICT_SOON(ref->page->read_gen)))
2083 WT_RET_NOTFOUND_OK(__wt_tree_walk_count(
2084 session, &ref, &refs_walked, walk_flags));
2085 btree->evict_ref = ref;
2086 }
2087
2088 WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked);
2089 WT_STAT_CONN_INCRV(session, cache_eviction_pages_seen, pages_seen);
2090 WT_STAT_DATA_INCRV(session, cache_eviction_pages_seen, pages_seen);
2091 WT_STAT_CONN_INCRV(session, cache_eviction_walk_passes, 1);
2092 WT_STAT_DATA_INCRV(session, cache_eviction_walk_passes, 1);
2093
2094 return (0);
2095 }
2096
2097 /*
2098 * __evict_get_ref --
2099 * Get a page for eviction.
2100 */
static int
__evict_get_ref(WT_SESSION_IMPL *session,
    bool is_server, WT_BTREE **btreep, WT_REF **refp, uint32_t *previous_statep)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *evict;
	WT_EVICT_QUEUE *queue, *other_queue, *urgent_queue;
	uint32_t candidates, previous_state;
	bool is_app, server_only, urgent_ok;

	*btreep = NULL;
	/*
	 * It is polite to initialize output variables, but it isn't safe for
	 * callers to use the previous state if we don't return a locked ref.
	 */
	*previous_statep = WT_REF_MEM;
	*refp = NULL;

	cache = S2C(session)->cache;
	is_app = !F_ISSET(session, WT_SESSION_INTERNAL);
	/*
	 * The server only consumes queue entries itself when there are no
	 * worker threads configured to do the work.
	 */
	server_only = is_server && !WT_EVICT_HAS_WORKERS(session);
	/* Application threads do eviction when cache is full of dirty data */
	urgent_ok = (!is_app && !is_server) ||
	    !WT_EVICT_HAS_WORKERS(session) ||
	    (is_app && F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD));
	urgent_queue = cache->evict_urgent_queue;

	WT_STAT_CONN_INCR(session, cache_eviction_get_ref);

	/* Avoid the LRU lock if no pages are available. */
	if (__evict_queue_empty(cache->evict_current_queue, is_server) &&
	    __evict_queue_empty(cache->evict_other_queue, is_server) &&
	    (!urgent_ok || __evict_queue_empty(urgent_queue, false))) {
		WT_STAT_CONN_INCR(session, cache_eviction_get_ref_empty);
		return (WT_NOTFOUND);
	}

	/*
	 * The server repopulates whenever the other queue is not full, as long
	 * as at least one page has been evicted out of the current queue.
	 *
	 * Note that there are pathological cases where there are only enough
	 * eviction candidates in the cache to fill one queue. In that case,
	 * we will continually evict one page and attempt to refill the queues.
	 * Such cases are extremely rare in real applications.
	 */
	if (is_server &&
	    (!urgent_ok || __evict_queue_empty(urgent_queue, false)) &&
	    !__evict_queue_full(cache->evict_current_queue) &&
	    !__evict_queue_full(cache->evict_fill_queue) &&
	    (cache->evict_empty_score > WT_EVICT_SCORE_CUTOFF ||
	    __evict_queue_empty(cache->evict_fill_queue, false)))
		return (WT_NOTFOUND);

	/*
	 * Lock ordering: the cache-wide queue lock is taken first to choose a
	 * queue, then released before acquiring the individual queue's lock.
	 */
	__wt_spin_lock(session, &cache->evict_queue_lock);

	/* Check the urgent queue first. */
	if (urgent_ok && !__evict_queue_empty(urgent_queue, false))
		queue = urgent_queue;
	else {
		/*
		 * Check if the current queue needs to change.
		 *
		 * The server will only evict half of the pages before looking
		 * for more, but should only switch queues if there are no
		 * other eviction workers.
		 */
		queue = cache->evict_current_queue;
		other_queue = cache->evict_other_queue;
		if (__evict_queue_empty(queue, server_only) &&
		    !__evict_queue_empty(other_queue, server_only)) {
			cache->evict_current_queue = other_queue;
			cache->evict_other_queue = queue;
		}
	}

	__wt_spin_unlock(session, &cache->evict_queue_lock);

	/*
	 * We got the queue lock, which should be fast, and chose a queue.
	 * Now we want to get the lock on the individual queue.
	 */
	for (;;) {
		/* Verify there are still pages available. */
		if (__evict_queue_empty(
		    queue, is_server && queue != urgent_queue)) {
			WT_STAT_CONN_INCR(
			    session, cache_eviction_get_ref_empty2);
			return (WT_NOTFOUND);
		}
		/*
		 * The server never blocks on the queue lock: it retries so it
		 * can re-check whether candidates remain while waiting.
		 */
		if (!is_server)
			__wt_spin_lock(session, &queue->evict_lock);
		else if (__wt_spin_trylock(session, &queue->evict_lock) != 0)
			continue;
		break;
	}

	/*
	 * Only evict half of the pages before looking for more. The remainder
	 * are left to eviction workers (if configured), or application thread
	 * if necessary.
	 */
	candidates = queue->evict_candidates;
	if (is_server && queue != urgent_queue && candidates > 1)
		candidates /= 2;

	/* Get the next page queued for eviction. */
	for (evict = queue->evict_current;
	    evict >= queue->evict_queue &&
	    evict < queue->evict_queue + candidates;
	    ++evict) {
		/* A cleared entry: skip it, it was evicted or invalidated. */
		if (evict->ref == NULL)
			continue;
		WT_ASSERT(session, evict->btree != NULL);

		/*
		 * Evicting a dirty page in the server thread could stall
		 * during a write and prevent eviction from finding new work.
		 *
		 * However, we can't skip entries in the urgent queue or they
		 * may never be found again.
		 *
		 * Don't force application threads to evict dirty pages if they
		 * aren't stalled by the amount of dirty data in cache.
		 */
		if (!urgent_ok && (is_server ||
		    !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)) &&
		    __wt_page_is_modified(evict->ref->page)) {
			/* Back up so the dirty entry stays current. */
			--evict;
			break;
		}

		/*
		 * Lock the page while holding the eviction mutex to prevent
		 * multiple attempts to evict it. For pages that are already
		 * being evicted, this operation will fail and we will move on.
		 */
		if (((previous_state = evict->ref->state) != WT_REF_MEM &&
		    previous_state != WT_REF_LIMBO) ||
		    !__wt_atomic_casv32(
		    &evict->ref->state, previous_state, WT_REF_LOCKED)) {
			__evict_list_clear(session, evict);
			continue;
		}

		/*
		 * Increment the busy count in the btree handle to prevent it
		 * from being closed under us.
		 */
		(void)__wt_atomic_addv32(&evict->btree->evict_busy, 1);

		*btreep = evict->btree;
		*refp = evict->ref;
		*previous_statep = previous_state;

		/*
		 * Remove the entry so we never try to reconcile the same page
		 * on reconciliation error.
		 */
		__evict_list_clear(session, evict);
		break;
	}

	/* Move to the next item. */
	if (evict != NULL &&
	    evict + 1 < queue->evict_queue + queue->evict_candidates)
		queue->evict_current = evict + 1;
	else /* Clear the current pointer if there are no more candidates. */
		queue->evict_current = NULL;

	__wt_spin_unlock(session, &queue->evict_lock);

	return (*refp == NULL ? WT_NOTFOUND : 0);
}
2275
2276 /*
2277 * __evict_page --
2278 * Called by both eviction and application threads to evict a page.
2279 */
2280 static int
__evict_page(WT_SESSION_IMPL * session,bool is_server)2281 __evict_page(WT_SESSION_IMPL *session, bool is_server)
2282 {
2283 WT_BTREE *btree;
2284 WT_CACHE *cache;
2285 WT_DECL_RET;
2286 WT_REF *ref;
2287 WT_TRACK_OP_DECL;
2288 uint64_t time_start, time_stop;
2289 uint32_t previous_state;
2290 bool app_timer;
2291
2292 WT_TRACK_OP_INIT(session);
2293
2294 WT_RET_TRACK(__evict_get_ref(
2295 session, is_server, &btree, &ref, &previous_state));
2296 WT_ASSERT(session, ref->state == WT_REF_LOCKED);
2297
2298 app_timer = false;
2299 cache = S2C(session)->cache;
2300 time_start = time_stop = 0;
2301
2302 /*
2303 * An internal session flags either the server itself or an eviction
2304 * worker thread.
2305 */
2306 if (is_server) {
2307 WT_STAT_CONN_INCR(session, cache_eviction_server_evicting);
2308 cache->server_evicts++;
2309 } else if (F_ISSET(session, WT_SESSION_INTERNAL)) {
2310 WT_STAT_CONN_INCR(session, cache_eviction_worker_evicting);
2311 cache->worker_evicts++;
2312 } else {
2313 if (__wt_page_is_modified(ref->page))
2314 WT_STAT_CONN_INCR(session, cache_eviction_app_dirty);
2315 WT_STAT_CONN_INCR(session, cache_eviction_app);
2316 cache->app_evicts++;
2317 if (WT_STAT_ENABLED(session)) {
2318 app_timer = true;
2319 time_start = __wt_clock(session);
2320 }
2321 }
2322
2323 /*
2324 * In case something goes wrong, don't pick the same set of pages every
2325 * time.
2326 *
2327 * We used to bump the page's read generation only if eviction failed,
2328 * but that isn't safe: at that point, eviction has already unlocked
2329 * the page and some other thread may have evicted it by the time we
2330 * look at it.
2331 */
2332 __wt_cache_read_gen_bump(session, ref->page);
2333
2334 WT_WITH_BTREE(session, btree,
2335 ret = __wt_evict(session, ref, false, previous_state));
2336
2337 (void)__wt_atomic_subv32(&btree->evict_busy, 1);
2338
2339 if (app_timer) {
2340 time_stop = __wt_clock(session);
2341 WT_STAT_CONN_INCRV(session,
2342 application_evict_time,
2343 WT_CLOCKDIFF_US(time_stop, time_start));
2344 }
2345 WT_TRACK_OP_END(session);
2346 return (ret);
2347 }
2348
2349 /*
2350 * __wt_cache_eviction_worker --
2351 * Worker function for __wt_cache_eviction_check: evict pages if the cache
2352 * crosses its boundaries.
2353 */
int
__wt_cache_eviction_worker(
    WT_SESSION_IMPL *session, bool busy, bool readonly, double pct_full)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TRACK_OP_DECL;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;
	uint64_t elapsed, time_start, time_stop;
	uint64_t initial_progress, max_progress;
	bool timer;

	WT_TRACK_OP_INIT(session);

	conn = S2C(session);
	cache = conn->cache;
	time_start = time_stop = 0;
	txn_global = &conn->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);

	/*
	 * It is not safe to proceed if the eviction server threads aren't
	 * setup yet. Busy threads also don't help out with eviction unless
	 * the cache is entirely full.
	 */
	if (!conn->evict_server_running || (busy && pct_full < 100.0))
		goto done;

	/* Wake the eviction server if we need to do work. */
	__wt_evict_server_wake(session);

	/* Track how long application threads spend doing eviction. */
	timer = !F_ISSET(session, WT_SESSION_INTERNAL);
	if (timer)
		time_start = __wt_clock(session);

	/* Reset ret at the top of each pass so transient errors don't stick. */
	for (initial_progress = cache->eviction_progress;; ret = 0) {
		/*
		 * A pathological case: if we're the oldest transaction in the
		 * system and the eviction server is stuck trying to find space
		 * (and we're not in recovery, because those transactions can't
		 * be rolled back), abort the transaction to give up all hazard
		 * pointers before trying again.
		 */
		if (__wt_cache_stuck(session) &&
		    __wt_txn_am_oldest(session) &&
		    !F_ISSET(conn, WT_CONN_RECOVERING)) {
			--cache->evict_aggressive_score;
			WT_STAT_CONN_INCR(session, txn_fail_cache);
			WT_ERR(__wt_txn_rollback_required(session,
			    "oldest transaction rolled back for eviction"));
		}

		/*
		 * Check if we have become busy.
		 *
		 * If we're busy (because of the transaction check we just did
		 * or because our caller is waiting on a longer-than-usual event
		 * such as a page read), and the cache level drops below 100%,
		 * limit the work to 5 evictions and return. If that's not the
		 * case, we can do more.
		 */
		if (!busy && txn_state->pinned_id != WT_TXN_NONE &&
		    txn_global->current != txn_global->oldest_id)
			busy = true;
		max_progress = busy ? 5 : 20;

		/* See if eviction is still needed. */
		if (!__wt_eviction_needed(session, busy, readonly, &pct_full) ||
		    (pct_full < 100.0 && (cache->eviction_progress >
		    initial_progress + max_progress)))
			break;

		/* Evict a page. */
		switch (ret = __evict_page(session, false)) {
		case 0:
			/* Busy threads stop after a single successful evict. */
			if (busy)
				goto err;
			/* FALLTHROUGH */
		case EBUSY:
			break;
		case WT_NOTFOUND:
			/* Allow the queue to re-populate before retrying. */
			__wt_cond_wait(session,
			    conn->evict_threads.wait_cond, 10000, NULL);
			cache->app_waits++;
			break;
		default:
			goto err;
		}
		/* Stop if we've exceeded the time out. */
		if (timer && cache->cache_max_wait_us != 0) {
			time_stop = __wt_clock(session);
			if (session->cache_wait_us +
			    WT_CLOCKDIFF_US(time_stop, time_start) >
			    cache->cache_max_wait_us)
				goto err;
		}
	}

	/*
	 * On the error/exit path, charge the elapsed time to this session and
	 * fail with WT_CACHE_FULL if it has exceeded the configured maximum
	 * wait. Note: timer is always initialized on any path reaching here.
	 */
err:	if (timer) {
		time_stop = __wt_clock(session);
		elapsed = WT_CLOCKDIFF_US(time_stop, time_start);
		WT_STAT_CONN_INCRV(session, application_cache_time, elapsed);
		session->cache_wait_us += elapsed;
		if (cache->cache_max_wait_us != 0 &&
		    session->cache_wait_us > cache->cache_max_wait_us) {
			WT_TRET(WT_CACHE_FULL);
			WT_STAT_CONN_INCR(session, cache_timed_out_ops);
		}
	}

done:	WT_TRACK_OP_END(session);
	return (ret);
}
2470
2471 /*
2472 * __wt_page_evict_urgent --
2473 * Set a page to be evicted as soon as possible.
2474 */
bool
__wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_CACHE *cache;
	WT_EVICT_ENTRY *evict;
	WT_EVICT_QUEUE *urgent_queue;
	WT_PAGE *page;
	bool queued;

	/* Root pages should never be evicted via LRU. */
	WT_ASSERT(session, !__wt_ref_is_root(ref));

	/*
	 * Cheap check without the lock: give up if the page is already on an
	 * eviction queue or if eviction is disabled for the tree.
	 */
	page = ref->page;
	if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) ||
	    S2BT(session)->evict_disabled > 0)
		return (false);

	/* Append to the urgent queue if we can. */
	cache = S2C(session)->cache;
	urgent_queue = &cache->evict_queues[WT_EVICT_URGENT_QUEUE];
	queued = false;

	__wt_spin_lock(session, &cache->evict_queue_lock);
	/* Re-check under the queue lock: the state may have changed. */
	if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) ||
	    S2BT(session)->evict_disabled > 0)
		goto done;

	/*
	 * Lock ordering: the cache-wide queue lock is held across the
	 * per-queue lock acquisition.
	 */
	__wt_spin_lock(session, &urgent_queue->evict_lock);
	/* An empty urgent queue is reset before appending. */
	if (__evict_queue_empty(urgent_queue, false)) {
		urgent_queue->evict_current = urgent_queue->evict_queue;
		urgent_queue->evict_candidates = 0;
	}
	/* Append only if a queue slot is free and the push succeeds. */
	evict = urgent_queue->evict_queue + urgent_queue->evict_candidates;
	if (evict < urgent_queue->evict_queue + cache->evict_slots &&
	    __evict_push_candidate(session, urgent_queue, evict, ref)) {
		++urgent_queue->evict_candidates;
		queued = true;
	}
	__wt_spin_unlock(session, &urgent_queue->evict_lock);

done:	__wt_spin_unlock(session, &cache->evict_queue_lock);
	/* Wake whoever will evict the page: workers if any, else the server. */
	if (queued) {
		WT_STAT_CONN_INCR(session, cache_eviction_pages_queued_urgent);
		if (WT_EVICT_HAS_WORKERS(session))
			__wt_cond_signal(session,
			    S2C(session)->evict_threads.wait_cond);
		else
			__wt_evict_server_wake(session);
	}

	return (queued);
}
2527
2528 /*
2529 * __wt_evict_priority_set --
2530 * Set a tree's eviction priority.
2531 */
2532 void
__wt_evict_priority_set(WT_SESSION_IMPL * session,uint64_t v)2533 __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v)
2534 {
2535 S2BT(session)->evict_priority = v;
2536 }
2537
2538 /*
2539 * __wt_evict_priority_clear --
2540 * Clear a tree's eviction priority.
2541 */
2542 void
__wt_evict_priority_clear(WT_SESSION_IMPL * session)2543 __wt_evict_priority_clear(WT_SESSION_IMPL *session)
2544 {
2545 S2BT(session)->evict_priority = 0;
2546 }
2547
2548 /*
2549 * __verbose_dump_cache_single --
2550 * Output diagnostic information about a single file in the cache.
2551 */
2552 static int
__verbose_dump_cache_single(WT_SESSION_IMPL * session,uint64_t * total_bytesp,uint64_t * total_dirty_bytesp)2553 __verbose_dump_cache_single(WT_SESSION_IMPL *session,
2554 uint64_t *total_bytesp, uint64_t *total_dirty_bytesp)
2555 {
2556 WT_BTREE *btree;
2557 WT_DATA_HANDLE *dhandle;
2558 WT_PAGE *page;
2559 WT_REF *next_walk;
2560 size_t size;
2561 uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes;
2562 uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages;
2563 uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes;
2564 uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages;
2565
2566 intl_bytes = intl_bytes_max = intl_dirty_bytes = 0;
2567 intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0;
2568 leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0;
2569 leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0;
2570
2571 next_walk = NULL;
2572 while (__wt_tree_walk(session, &next_walk,
2573 WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
2574 next_walk != NULL) {
2575 page = next_walk->page;
2576 size = page->memory_footprint;
2577
2578 if (WT_PAGE_IS_INTERNAL(page)) {
2579 ++intl_pages;
2580 intl_bytes += size;
2581 intl_bytes_max = WT_MAX(intl_bytes_max, size);
2582 if (__wt_page_is_modified(page)) {
2583 ++intl_dirty_pages;
2584 intl_dirty_bytes += size;
2585 intl_dirty_bytes_max =
2586 WT_MAX(intl_dirty_bytes_max, size);
2587 }
2588 } else {
2589 ++leaf_pages;
2590 leaf_bytes += size;
2591 leaf_bytes_max = WT_MAX(leaf_bytes_max, size);
2592 if (__wt_page_is_modified(page)) {
2593 ++leaf_dirty_pages;
2594 leaf_dirty_bytes += size;
2595 leaf_dirty_bytes_max =
2596 WT_MAX(leaf_dirty_bytes_max, size);
2597 }
2598 }
2599 }
2600
2601 dhandle = session->dhandle;
2602 btree = dhandle->handle;
2603 WT_RET(__wt_msg(session, "%s(%s%s)%s%s:",
2604 dhandle->name, dhandle->checkpoint != NULL ? "checkpoint=" : "",
2605 dhandle->checkpoint != NULL ? dhandle->checkpoint : "<live>",
2606 btree->evict_disabled != 0 ? "eviction disabled" : "",
2607 btree->evict_disabled_open ? " at open" : ""));
2608 if (intl_pages == 0)
2609 WT_RET(__wt_msg(session, "internal: 0 pages"));
2610 else
2611 WT_RET(__wt_msg(session,
2612 "internal: "
2613 "%" PRIu64 " pages, "
2614 "%" PRIu64 "MB, "
2615 "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
2616 "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
2617 "%" PRIu64 "MB max page, "
2618 "%" PRIu64 "MB max dirty page",
2619 intl_pages,
2620 intl_bytes / WT_MEGABYTE,
2621 intl_pages - intl_dirty_pages,
2622 intl_dirty_pages,
2623 (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE,
2624 intl_dirty_bytes / WT_MEGABYTE,
2625 intl_bytes_max / WT_MEGABYTE,
2626 intl_dirty_bytes_max / WT_MEGABYTE));
2627 if (leaf_pages == 0)
2628 WT_RET(__wt_msg(session, "leaf: 0 pages"));
2629 else
2630 WT_RET(__wt_msg(session,
2631 "leaf: "
2632 "%" PRIu64 " pages, "
2633 "%" PRIu64 "MB, "
2634 "%" PRIu64 "/%" PRIu64 " clean/dirty pages, "
2635 "%" PRIu64 "/%" PRIu64 " clean/dirty MB, "
2636 "%" PRIu64 "MB max page, "
2637 "%" PRIu64 "MB max dirty page",
2638 leaf_pages,
2639 leaf_bytes / WT_MEGABYTE,
2640 leaf_pages - leaf_dirty_pages,
2641 leaf_dirty_pages,
2642 (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE,
2643 leaf_dirty_bytes / WT_MEGABYTE,
2644 leaf_bytes_max / WT_MEGABYTE,
2645 leaf_dirty_bytes_max / WT_MEGABYTE));
2646
2647 *total_bytesp += intl_bytes + leaf_bytes;
2648 *total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes;
2649
2650 return (0);
2651 }
2652
2653 /*
2654 * __wt_verbose_dump_cache --
2655 * Output diagnostic information about the cache.
2656 */
int
__wt_verbose_dump_cache(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	double pct;
	uint64_t total_bytes, total_dirty_bytes;
	bool needed;

	conn = S2C(session);
	total_bytes = total_dirty_bytes = 0;
	pct = 0.0;			/* [-Werror=uninitialized] */

	WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
	WT_RET(__wt_msg(session, "cache dump"));

	/* Report overall cache state: fullness and clean/dirty triggers. */
	WT_RET(__wt_msg(session,
	    "cache full: %s", __wt_cache_full(session) ? "yes" : "no"));
	needed = __wt_eviction_clean_needed(session, &pct);
	WT_RET(__wt_msg(session,
	    "cache clean check: %s (%2.3f%%)", needed ? "yes" : "no", pct));
	needed = __wt_eviction_dirty_needed(session, &pct);
	WT_RET(__wt_msg(session,
	    "cache dirty check: %s (%2.3f%%)", needed ? "yes" : "no", pct));

	/*
	 * Walk every open btree handle, dumping per-file details. Only the
	 * step to the next handle is done under the handle-list read lock;
	 * the per-file dump runs with the session pointed at that handle.
	 */
	for (dhandle = NULL;;) {
		WT_WITH_HANDLE_LIST_READ_LOCK(session,
		    WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
		if (dhandle == NULL)
			break;
		if (dhandle->type != WT_DHANDLE_TYPE_BTREE ||
		    !F_ISSET(dhandle, WT_DHANDLE_OPEN))
			continue;

		WT_WITH_DHANDLE(session, dhandle,
		    ret = __verbose_dump_cache_single(
		    session, &total_bytes, &total_dirty_bytes));
		if (ret != 0)
			break;
	}
	WT_RET(ret);

	/*
	 * Apply the overhead percentage so our total bytes are comparable with
	 * the tracked value.
	 */
	total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes);

	WT_RET(__wt_msg(session,
	    "cache dump: "
	    "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB",
	    total_bytes / WT_MEGABYTE,
	    __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE));
	WT_RET(__wt_msg(session,
	    "total dirty bytes: %" PRIu64 "MB",
	    total_dirty_bytes / WT_MEGABYTE));

	return (0);
}
2717