1 /*****************************************************************************
2
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file buf/buf0flu.cc
29 The database buffer buf_pool flush algorithm
30
31 Created 11/11/1995 Heikki Tuuri
32 *******************************************************/
33
34 #include "buf0flu.h"
35
36 #ifdef UNIV_NONINL
37 #include "buf0flu.ic"
38 #endif
39
40 #include "buf0buf.h"
41 #include "buf0checksum.h"
42 #include "srv0start.h"
43 #include "srv0srv.h"
44 #include "page0zip.h"
45 #ifndef UNIV_HOTBACKUP
46 #include "ut0byte.h"
47 #include "ut0lst.h"
48 #include "page0page.h"
49 #include "fil0fil.h"
50 #include "buf0lru.h"
51 #include "buf0rea.h"
52 #include "ibuf0ibuf.h"
53 #include "log0log.h"
54 #include "os0file.h"
55 #include "trx0sys.h"
56 #include "srv0mon.h"
57 #include "mysql/plugin.h"
58 #include "mysql/service_thd_wait.h"
59
60 /** Number of pages flushed through non flush_list flushes. */
61 // static ulint buf_lru_flush_page_count = 0;
62
63 /** Flag indicating if the page_cleaner is in active state. This flag
64 is set to TRUE by the page_cleaner thread when it is spawned and is set
65 back to FALSE at shutdown by the page_cleaner as well. Therefore no
66 need to protect it by a mutex. It is only ever read by the thread
67 doing the shutdown */
68 UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE;
69
70 /** Flag indicating if the lru_manager is in active state. */
71 UNIV_INTERN bool buf_lru_manager_is_active = false;
72
73 #ifdef UNIV_PFS_THREAD
74 UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key;
75 UNIV_INTERN mysql_pfs_key_t buf_lru_manager_thread_key;
76 #endif /* UNIV_PFS_THREAD */
77
78 /* @} */
79
80 /** Handled page counters for a single flush */
81 struct flush_counters_t {
82 ulint flushed; /*!< number of dirty pages flushed */
83 ulint evicted; /*!< number of clean pages evicted, including
84 evicted uncompressed page images */
85 ulint unzip_LRU_evicted;/*!< number of uncompressed page images
86 evicted */
87 };
88
89 /******************************************************************//**
90 Increases flush_list size in bytes by zip_size for a compressed page,
91 or by UNIV_PAGE_SIZE for an uncompressed page. */
92 static inline
93 void
94 incr_flush_list_size_in_bytes(
95 /*==========================*/
96 buf_block_t* block, /*!< in: control block */
97 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
98 {
99 ut_ad(buf_flush_list_mutex_own(buf_pool));
100 ulint zip_size = page_zip_get_size(&block->page.zip);
101 buf_pool->stat.flush_list_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
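	/* For example, assuming the default 16 KB UNIV_PAGE_SIZE: adding an
	8 KB compressed page grows flush_list_bytes by 8192, while adding an
	uncompressed page grows it by 16384. */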
102 ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
103 }
104
105 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
106 /******************************************************************//**
107 Validates the flush list.
108 @return TRUE if ok */
109 static
110 ibool
111 buf_flush_validate_low(
112 /*===================*/
113 buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
114
115 /******************************************************************//**
116 Validates the flush list some of the time.
117 @return TRUE if ok or the check was skipped */
118 static
119 ibool
120 buf_flush_validate_skip(
121 /*====================*/
122 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
123 {
124 /** Try buf_flush_validate_low() every this many times */
125 # define BUF_FLUSH_VALIDATE_SKIP 23
126
127 /** The buf_flush_validate_low() call skip counter.
128 Use a signed type because of the race condition below. */
129 static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
130
131 /* There is a race condition below, but it does not matter,
132 because this call is only for heuristic purposes. We want to
133 reduce the call frequency of the costly buf_flush_validate_low()
134 check in debug builds. */
135 if (--buf_flush_validate_count > 0) {
136 return(TRUE);
137 }
138
139 buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
140 return(buf_flush_validate_low(buf_pool));
141 }
142 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
143
144 /*******************************************************************//**
145 Sets hazard pointer during flush_list iteration. */
146 UNIV_INLINE
147 void
148 buf_flush_set_hp(
149 /*=============*/
150 buf_pool_t* buf_pool,/*!< in/out: buffer pool instance */
151 const buf_page_t* bpage) /*!< in: buffer control block */
152 {
153 ut_ad(buf_flush_list_mutex_own(buf_pool));
154 ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL);
155 ut_ad(!bpage || buf_page_in_file(bpage)
156 || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
157 ut_ad(!bpage || bpage->in_flush_list);
158 ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool);
159
160 buf_pool->flush_list_hp = bpage;
161 }
162
163 /*******************************************************************//**
164 Checks if the given block is a hazard pointer
165 @return true if bpage is hazard pointer */
166 UNIV_INLINE
167 bool
168 buf_flush_is_hp(
169 /*============*/
170 buf_pool_t* buf_pool,/*!< in: buffer pool instance */
171 const buf_page_t* bpage) /*!< in: buffer control block */
172 {
173 ut_ad(buf_flush_list_mutex_own(buf_pool));
174
175 return(buf_pool->flush_list_hp == bpage);
176 }
177
178 /*******************************************************************//**
179 Whenever we move a block in flush_list (either to remove it or to
180 relocate it) we check the hazard pointer set by some other thread
181 doing the flush list scan. If the hazard pointer is the same as the
182 one we are about to move, then we set it to NULL to force a rescan
183 in the thread doing the batch. */
184 UNIV_INLINE
185 void
186 buf_flush_update_hp(
187 /*================*/
188 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
189 buf_page_t* bpage) /*!< in: buffer control block */
190 {
191 ut_ad(buf_flush_list_mutex_own(buf_pool));
192
193 if (buf_flush_is_hp(buf_pool, bpage)) {
194 buf_flush_set_hp(buf_pool, NULL);
195 MONITOR_INC(MONITOR_FLUSH_HP_RESCAN);
196 }
197 }
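/* Typical interaction (informal sketch): the flush_list scan in
buf_do_flush_list_batch() saves its predecessor with buf_flush_set_hp()
before it releases the flush list mutex to flush a page;
buf_flush_remove() and buf_flush_relocate_on_flush_list() call
buf_flush_update_hp(), so a scan whose saved predecessor has been removed
or relocated restarts from the tail of the list. */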
198
199 /******************************************************************//**
200 Inserts a block into the flush_rbt and returns a pointer to its
201 predecessor, or NULL if there is no predecessor. The ordering is maintained
202 on the basis of the <oldest_modification, space, offset> key.
203 @return pointer to the predecessor or NULL if no predecessor. */
204 static
205 buf_page_t*
206 buf_flush_insert_in_flush_rbt(
207 /*==========================*/
208 buf_page_t* bpage) /*!< in: bpage to be inserted. */
209 {
210 const ib_rbt_node_t* c_node;
211 const ib_rbt_node_t* p_node;
212 buf_page_t* prev = NULL;
213 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
214
215 ut_ad(buf_flush_list_mutex_own(buf_pool));
216
217 /* Insert this buffer into the rbt. */
218 c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
219 ut_a(c_node != NULL);
220
221 /* Get the predecessor. */
222 p_node = rbt_prev(buf_pool->flush_rbt, c_node);
223
224 if (p_node != NULL) {
225 buf_page_t** value;
226 value = rbt_value(buf_page_t*, p_node);
227 prev = *value;
228 ut_a(prev != NULL);
229 }
230
231 return(prev);
232 }
233
234 /*********************************************************//**
235 Delete a bpage from the flush_rbt. */
236 static
237 void
238 buf_flush_delete_from_flush_rbt(
239 /*============================*/
240 buf_page_t* bpage) /*!< in: bpage to be removed. */
241 {
242 #ifdef UNIV_DEBUG
243 ibool ret = FALSE;
244 #endif /* UNIV_DEBUG */
245 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
246
247 ut_ad(buf_flush_list_mutex_own(buf_pool));
248
249 #ifdef UNIV_DEBUG
250 ret =
251 #endif /* UNIV_DEBUG */
252 rbt_delete(buf_pool->flush_rbt, &bpage);
253
254 ut_ad(ret);
255 }
256
257 /*****************************************************************//**
258 Compare two modified blocks in the buffer pool. The key for comparison
259 is:
260 key = <oldest_modification, space, offset>
261 This comparison is used to maintain ordering of blocks in the
262 buf_pool->flush_rbt.
263 Note that for the purpose of flush_rbt, we only need to order blocks
264 on the oldest_modification. The other two fields are used to uniquely
265 identify the blocks.
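For example, two blocks dirtied at the same LSN, say
<oldest_modification=100, space=0, offset=7> and <100, 0, 9>, compare as
distinct keys, so both can be kept in the flush_rbt, even though only the
oldest_modification ordering matters for the flushing order.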
266 @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
267 static
268 int
269 buf_flush_block_cmp(
270 /*================*/
271 const void* p1, /*!< in: block1 */
272 const void* p2) /*!< in: block2 */
273 {
274 int ret;
275 const buf_page_t* b1 = *(const buf_page_t**) p1;
276 const buf_page_t* b2 = *(const buf_page_t**) p2;
277 #ifdef UNIV_DEBUG
278 buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
279 #endif /* UNIV_DEBUG */
280
281 ut_ad(b1 != NULL);
282 ut_ad(b2 != NULL);
283
284 ut_ad(buf_flush_list_mutex_own(buf_pool));
285
286 ut_ad(b1->in_flush_list);
287 ut_ad(b2->in_flush_list);
288
289 if (b2->oldest_modification > b1->oldest_modification) {
290 return(1);
291 } else if (b2->oldest_modification < b1->oldest_modification) {
292 return(-1);
293 }
294
295 /* If oldest_modification is same then decide on the space. */
296 ret = (int)(b2->space - b1->space);
297
298 /* Or else decide ordering on the offset field. */
299 return(ret ? ret : (int)(b2->offset - b1->offset));
300 }
301
302 /********************************************************************//**
303 Initialize the red-black tree to speed up insertions into the flush_list
304 during the recovery process. Should be called at the start of the recovery
305 process, before any page has been read/written. */
306 UNIV_INTERN
307 void
308 buf_flush_init_flush_rbt(void)
309 /*==========================*/
310 {
311 ulint i;
312
313 for (i = 0; i < srv_buf_pool_instances; i++) {
314 buf_pool_t* buf_pool;
315
316 buf_pool = buf_pool_from_array(i);
317
318 buf_flush_list_mutex_enter(buf_pool);
319
320 ut_ad(buf_pool->flush_rbt == NULL);
321
322 /* Create a red-black tree for speedy insertions into the flush list. */
323 buf_pool->flush_rbt = rbt_create(
324 sizeof(buf_page_t*), buf_flush_block_cmp);
325
326 buf_flush_list_mutex_exit(buf_pool);
327 }
328 }
329
330 /********************************************************************//**
331 Frees up the red-black tree. */
332 UNIV_INTERN
333 void
334 buf_flush_free_flush_rbt(void)
335 /*==========================*/
336 {
337 ulint i;
338
339 for (i = 0; i < srv_buf_pool_instances; i++) {
340 buf_pool_t* buf_pool;
341
342 buf_pool = buf_pool_from_array(i);
343
344 buf_flush_list_mutex_enter(buf_pool);
345
346 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
347 ut_a(buf_flush_validate_low(buf_pool));
348 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
349
350 rbt_free(buf_pool->flush_rbt);
351 buf_pool->flush_rbt = NULL;
352
353 buf_flush_list_mutex_exit(buf_pool);
354 }
355 }
356
357 /********************************************************************//**
358 Inserts a modified block into the flush list. */
359 UNIV_INTERN
360 void
361 buf_flush_insert_into_flush_list(
362 /*=============================*/
363 buf_pool_t* buf_pool, /*!< buffer pool instance */
364 buf_block_t* block, /*!< in/out: block which is modified */
365 lsn_t lsn) /*!< in: oldest modification */
366 {
367 ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
368 ut_ad(log_flush_order_mutex_own());
369 ut_ad(mutex_own(&block->mutex));
370
371 buf_flush_list_mutex_enter(buf_pool);
372
373 ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
374 || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
375 <= lsn));
376
377 /* If we are in the recovery then we need to update the flush
378 red-black tree as well. */
379 if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
380 buf_flush_list_mutex_exit(buf_pool);
381 buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
382 return;
383 }
384
385 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
386 ut_ad(!block->page.in_flush_list);
387
388 ut_d(block->page.in_flush_list = TRUE);
389 block->page.oldest_modification = lsn;
390 UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
391 incr_flush_list_size_in_bytes(block, buf_pool);
392
393 #ifdef UNIV_DEBUG_VALGRIND
394 {
395 ulint zip_size = buf_block_get_zip_size(block);
396
397 if (zip_size) {
398 UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
399 } else {
400 UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
401 }
402 }
403 #endif /* UNIV_DEBUG_VALGRIND */
404 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
405 ut_a(buf_flush_validate_skip(buf_pool));
406 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
407
408 buf_flush_list_mutex_exit(buf_pool);
409 }
410
411 /********************************************************************//**
412 Inserts a modified block into the flush list in the right sorted position.
413 This function is used by recovery, because there the modifications do not
414 necessarily come in the order of lsn's. */
415 UNIV_INTERN
416 void
417 buf_flush_insert_sorted_into_flush_list(
418 /*====================================*/
419 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
420 buf_block_t* block, /*!< in/out: block which is modified */
421 lsn_t lsn) /*!< in: oldest modification */
422 {
423 buf_page_t* prev_b;
424 buf_page_t* b;
425
426 ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
427 ut_ad(log_flush_order_mutex_own());
428 ut_ad(mutex_own(&block->mutex));
429 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
430
431 buf_flush_list_mutex_enter(buf_pool);
432
433 /* The field in_LRU_list is protected by buf_pool->LRU_list_mutex,
434 which we are not holding. However, while a block is in the flush
435 list, it is dirty and cannot be discarded, neither from the
436 page_hash nor from the LRU list. At most, the uncompressed
437 page frame of a compressed block may be discarded or created
438 (copying the block->page to or from a buf_page_t that is
439 dynamically allocated from buf_buddy_alloc()). Because those
440 transitions hold block->mutex and the flush list mutex (via
441 buf_flush_relocate_on_flush_list()), there is no possibility
442 of a race condition in the assertions below. */
443 ut_ad(block->page.in_LRU_list);
444 ut_ad(block->page.in_page_hash);
445 /* buf_buddy_block_register() will take a block in the
446 BUF_BLOCK_MEMORY state, not a file page. */
447 ut_ad(!block->page.in_zip_hash);
448
449 ut_ad(!block->page.in_flush_list);
450 ut_d(block->page.in_flush_list = TRUE);
451 block->page.oldest_modification = lsn;
452
453 #ifdef UNIV_DEBUG_VALGRIND
454 {
455 ulint zip_size = buf_block_get_zip_size(block);
456
457 if (zip_size) {
458 UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
459 } else {
460 UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
461 }
462 }
463 #endif /* UNIV_DEBUG_VALGRIND */
464
465 prev_b = NULL;
466
467 /* For the most part when this function is called the flush_rbt
468 should not be NULL. In a very rare boundary case it is possible
469 that the flush_rbt has already been freed by the recovery thread
470 before the last page was hooked up in the flush_list by the
471 io-handler thread. In that case we'll just do a simple
472 linear search in the else block. */
473 if (buf_pool->flush_rbt) {
474
475 prev_b = buf_flush_insert_in_flush_rbt(&block->page);
476
477 } else {
478
479 b = UT_LIST_GET_FIRST(buf_pool->flush_list);
480
481 while (b && b->oldest_modification
482 > block->page.oldest_modification) {
483 ut_ad(b->in_flush_list);
484 prev_b = b;
485 b = UT_LIST_GET_NEXT(list, b);
486 }
487 }
488
489 if (prev_b == NULL) {
490 UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
491 } else {
492 UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
493 prev_b, &block->page);
494 }
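	/* Either way the flush_list stays ordered by oldest_modification:
	the most recently modified pages are at the head and the oldest
	modifications are at the tail, which is where
	buf_do_flush_list_batch() starts its scan. */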
495
496 incr_flush_list_size_in_bytes(block, buf_pool);
497
498 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
499 ut_a(buf_flush_validate_low(buf_pool));
500 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
501
502 buf_flush_list_mutex_exit(buf_pool);
503 }
504
505 /********************************************************************//**
506 Returns TRUE if the file page block is immediately suitable for replacement,
507 i.e., the transition FILE_PAGE => NOT_USED is allowed.
508 @return TRUE if can replace immediately */
509 UNIV_INTERN
510 ibool
511 buf_flush_ready_for_replace(
512 /*========================*/
513 buf_page_t* bpage) /*!< in: buffer control block, must be
514 buf_page_in_file(bpage) and in the LRU list */
515 {
516 #ifdef UNIV_DEBUG
517 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
518 ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
519 #endif /* UNIV_DEBUG */
520 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
521 ut_ad(bpage->in_LRU_list);
522
523 if (buf_page_in_file(bpage)) {
524
525 return(bpage->oldest_modification == 0
526 && bpage->buf_fix_count == 0
527 && buf_page_get_io_fix(bpage) == BUF_IO_NONE);
528 }
529
530 ut_print_timestamp(stderr);
531 fprintf(stderr,
532 " InnoDB: Error: buffer block state %lu"
533 " in the LRU list!\n",
534 (ulong) buf_page_get_state(bpage));
535 ut_print_buf(stderr, bpage, sizeof(buf_page_t));
536 putc('\n', stderr);
537
538 return(FALSE);
539 }
540
541 /********************************************************************//**
542 Returns true if the block is modified and ready for flushing.
543 @return true if can flush immediately */
544 UNIV_INTERN
545 bool
546 buf_flush_ready_for_flush(
547 /*======================*/
548 buf_page_t* bpage, /*!< in: buffer control block, must be
549 buf_page_in_file(bpage) */
550 buf_flush_t flush_type)/*!< in: type of flush */
551 {
552 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
553 ut_ad(mutex_own(buf_page_get_mutex(bpage))
554 || flush_type == BUF_FLUSH_LIST);
555 ut_a(buf_page_in_file(bpage)
556 || (buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH
557 #ifdef UNIV_DEBUG
558 && !mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex)
559 #endif
560 ));
561
562 if (bpage->oldest_modification == 0
563 || buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) {
564 return(false);
565 }
566
567 ut_ad(bpage->in_flush_list);
568
569 switch (flush_type) {
570 case BUF_FLUSH_LIST:
571 return(buf_page_get_state(bpage) != BUF_BLOCK_REMOVE_HASH);
572 case BUF_FLUSH_LRU:
573 case BUF_FLUSH_SINGLE_PAGE:
574 return(true);
575
576 case BUF_FLUSH_N_TYPES:
577 break;
578 }
579
580 ut_error;
581 return(false);
582 }
583
584 /********************************************************************//**
585 Remove a block from the flush list of modified blocks. */
586 UNIV_INTERN
587 void
588 buf_flush_remove(
589 /*=============*/
590 buf_page_t* bpage) /*!< in: pointer to the block in question */
591 {
592 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
593 ulint zip_size;
594
595 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
596 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
597 ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_DIRTY
598 || mutex_own(&buf_pool->LRU_list_mutex));
599 #endif
600 ut_ad(bpage->in_flush_list);
601
602 buf_flush_list_mutex_enter(buf_pool);
603
604 switch (buf_page_get_state(bpage)) {
605 case BUF_BLOCK_POOL_WATCH:
606 case BUF_BLOCK_ZIP_PAGE:
607 /* Clean compressed pages should not be on the flush list */
608 case BUF_BLOCK_NOT_USED:
609 case BUF_BLOCK_READY_FOR_USE:
610 case BUF_BLOCK_MEMORY:
611 case BUF_BLOCK_REMOVE_HASH:
612 ut_error;
613 return;
614 case BUF_BLOCK_ZIP_DIRTY:
615 buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
616 UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
617 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
618 buf_LRU_insert_zip_clean(bpage);
619 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
620 break;
621 case BUF_BLOCK_FILE_PAGE:
622 UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
623 break;
624 }
625
626 /* If the flush_rbt is active then delete from there as well. */
627 if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
628 buf_flush_delete_from_flush_rbt(bpage);
629 }
630
631 /* Must be done after we have removed it from the flush_rbt
632 because we assert on in_flush_list in comparison function. */
633 ut_d(bpage->in_flush_list = FALSE);
634
635 zip_size = page_zip_get_size(&bpage->zip);
636 buf_pool->stat.flush_list_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
637
638 bpage->oldest_modification = 0;
639
640 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
641 ut_a(buf_flush_validate_skip(buf_pool));
642 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
643
644 buf_flush_update_hp(buf_pool, bpage);
645 buf_flush_list_mutex_exit(buf_pool);
646 }
647
648 /*******************************************************************//**
649 Relocates a buffer control block on the flush_list.
650 Note that it is assumed that the contents of bpage have already been
651 copied to dpage.
652 IMPORTANT: When this function is called bpage and dpage are not
653 exact copies of each other. For example, they both will have different
654 ::state. Also the ::list pointers in dpage may be stale. We need to
655 use the current list node (bpage) to do the list manipulation because
656 the list pointers could have changed between the time that we copied
657 the contents of bpage to the dpage and the flush list manipulation
658 below. */
659 UNIV_INTERN
660 void
661 buf_flush_relocate_on_flush_list(
662 /*=============================*/
663 buf_page_t* bpage, /*!< in/out: control block being moved */
664 buf_page_t* dpage) /*!< in/out: destination block */
665 {
666 buf_page_t* prev;
667 buf_page_t* prev_b = NULL;
668 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
669
670 /* Must reside in the same buffer pool. */
671 ut_ad(buf_pool == buf_pool_from_bpage(dpage));
672
673 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
674
675 buf_flush_list_mutex_enter(buf_pool);
676
677 ut_ad(bpage->in_flush_list);
678 ut_ad(dpage->in_flush_list);
679
680 /* If recovery is active we must swap the control blocks in
681 the flush_rbt as well. */
682 if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
683 buf_flush_delete_from_flush_rbt(bpage);
684 prev_b = buf_flush_insert_in_flush_rbt(dpage);
685 }
686
687 /* Must be done after we have removed it from the flush_rbt
688 because we assert on in_flush_list in comparison function. */
689 ut_d(bpage->in_flush_list = FALSE);
690
691 prev = UT_LIST_GET_PREV(list, bpage);
692 UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
693
694 if (prev) {
695 ut_ad(prev->in_flush_list);
696 UT_LIST_INSERT_AFTER(
697 list,
698 buf_pool->flush_list,
699 prev, dpage);
700 } else {
701 UT_LIST_ADD_FIRST(
702 list,
703 buf_pool->flush_list,
704 dpage);
705 }
706
707 /* Just an extra check. Previous in flush_list
708 should be the same control block as in flush_rbt. */
709 ut_a(!buf_pool->flush_rbt || prev_b == prev);
710
711 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
712 ut_a(buf_flush_validate_low(buf_pool));
713 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
714
715 buf_flush_update_hp(buf_pool, bpage);
716 buf_flush_list_mutex_exit(buf_pool);
717 }
718
719 /********************************************************************//**
720 Updates the flush system data structures when a write is completed. */
721 UNIV_INTERN
722 void
723 buf_flush_write_complete(
724 /*=====================*/
725 buf_page_t* bpage) /*!< in: pointer to the block in question */
726 {
727 buf_flush_t flush_type = buf_page_get_flush_type(bpage);
728 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
729
730 mutex_enter(&buf_pool->flush_state_mutex);
731
732 buf_flush_remove(bpage);
733
734 buf_page_set_io_fix(bpage, BUF_IO_NONE);
735
736 buf_pool->n_flush[flush_type]--;
737 ut_ad(buf_pool->n_flush[flush_type] != ULINT_MAX);
738
739 /* fprintf(stderr, "n pending flush %lu\n",
740 buf_pool->n_flush[flush_type]); */
741
742 if (buf_pool->n_flush[flush_type] == 0
743 && buf_pool->init_flush[flush_type] == FALSE) {
744
745 /* The running flush batch has ended */
746
747 os_event_set(buf_pool->no_flush[flush_type]);
748 }
749
750 buf_dblwr_update(bpage, flush_type);
751
752 mutex_exit(&buf_pool->flush_state_mutex);
753 }
754 #endif /* !UNIV_HOTBACKUP */
755
756 /********************************************************************//**
757 Calculates the checksum of a page from a compressed table and updates the page. */
758 UNIV_INTERN
759 void
760 buf_flush_update_zip_checksum(
761 /*==========================*/
762 buf_frame_t* page, /*!< in/out: Page to update */
763 ulint zip_size, /*!< in: Compressed page size */
764 lsn_t lsn) /*!< in: Lsn to stamp on the page */
765 {
766 ut_a(zip_size > 0);
767
768 ib_uint32_t checksum = static_cast<ib_uint32_t>(
769 page_zip_calc_checksum(
770 page, zip_size,
771 static_cast<srv_checksum_algorithm_t>(
772 srv_checksum_algorithm)));
773
774 mach_write_to_8(page + FIL_PAGE_LSN, lsn);
775 memset(page + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
776 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
777 }
778
779 /********************************************************************//**
780 Initializes a page for writing to the tablespace. */
781 UNIV_INTERN
782 void
783 buf_flush_init_for_writing(
784 /*=======================*/
785 byte* page, /*!< in/out: page */
786 void* page_zip_, /*!< in/out: compressed page, or NULL */
787 lsn_t newest_lsn) /*!< in: newest modification lsn
788 to the page */
789 {
790 ib_uint32_t checksum = 0 /* silence bogus gcc warning */;
791
792 ut_ad(page);
793
794 if (page_zip_) {
795 page_zip_des_t* page_zip;
796 ulint zip_size;
797
798 page_zip = static_cast<page_zip_des_t*>(page_zip_);
799 zip_size = page_zip_get_size(page_zip);
800
801 ut_ad(zip_size);
802 ut_ad(ut_is_2pow(zip_size));
803 ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
804
805 switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
806 case FIL_PAGE_TYPE_ALLOCATED:
807 case FIL_PAGE_INODE:
808 case FIL_PAGE_IBUF_BITMAP:
809 case FIL_PAGE_TYPE_FSP_HDR:
810 case FIL_PAGE_TYPE_XDES:
811 /* These are essentially uncompressed pages. */
812 memcpy(page_zip->data, page, zip_size);
813 /* fall through */
814 case FIL_PAGE_TYPE_ZBLOB:
815 case FIL_PAGE_TYPE_ZBLOB2:
816 case FIL_PAGE_INDEX:
817
818 buf_flush_update_zip_checksum(
819 page_zip->data, zip_size, newest_lsn);
820
821 return;
822 }
823
824 ut_print_timestamp(stderr);
825 fputs(" InnoDB: ERROR: The compressed page to be written"
826 " seems corrupt:", stderr);
827 ut_print_buf(stderr, page, zip_size);
828 fputs("\nInnoDB: Possibly older version of the page:", stderr);
829 ut_print_buf(stderr, page_zip->data, zip_size);
830 putc('\n', stderr);
831 ut_error;
832 }
833
834 /* Write the newest modification lsn to the page header and trailer */
835 mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
836
837 mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
838 newest_lsn);
839
840 /* Store the new formula checksum */
841
842 switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
843 case SRV_CHECKSUM_ALGORITHM_CRC32:
844 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
845 checksum = buf_calc_page_crc32(page);
846 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
847 break;
848 case SRV_CHECKSUM_ALGORITHM_INNODB:
849 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
850 checksum = (ib_uint32_t) buf_calc_page_new_checksum(page);
851 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
852 checksum = (ib_uint32_t) buf_calc_page_old_checksum(page);
853 break;
854 case SRV_CHECKSUM_ALGORITHM_NONE:
855 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
856 checksum = BUF_NO_CHECKSUM_MAGIC;
857 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
858 break;
859 /* no default so the compiler will emit a warning if new enum
860 is added and not handled here */
861 }
862
863 /* With the InnoDB checksum, we overwrite the first 4 bytes of
864 the end lsn field to store the old formula checksum. Since it
865 depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
866 be calculated after storing the new formula checksum.
867
868 In other cases we write the same value to both fields.
869 If CRC32 is used then it is faster to use that checksum
870 (calculated above) instead of calculating another one.
871 We can afford to store something other than
872 buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
873 this field because the file will not be readable by old
874 versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
875
876 mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
877 checksum);
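	/* At this point the page carries:
	- FIL_PAGE_SPACE_OR_CHKSUM (header): a CRC32, the new-formula InnoDB
	  checksum, or BUF_NO_CHECKSUM_MAGIC, depending on srv_checksum_algorithm;
	- the first 4 bytes of FIL_PAGE_END_LSN_OLD_CHKSUM (trailer): the
	  old-formula checksum for the InnoDB algorithms, otherwise the same
	  value as the header field. */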
878 }
879
880 #ifndef UNIV_HOTBACKUP
881 /********************************************************************//**
882 Does an asynchronous write of a buffer page. NOTE: in simulated aio and
883 also when the doublewrite buffer is used, we must call
884 buf_dblwr_flush_buffered_writes after we have posted a batch of
885 writes! */
886 static
887 void
888 buf_flush_write_block_low(
889 /*======================*/
890 buf_page_t* bpage, /*!< in: buffer block to write */
891 buf_flush_t flush_type, /*!< in: type of flush */
892 bool sync) /*!< in: true if sync IO request */
893 {
894 ulint zip_size = buf_page_get_zip_size(bpage);
895 page_t* frame = NULL;
896
897 #ifdef UNIV_DEBUG
898 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
899 ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
900 #endif
901
902 #ifdef UNIV_LOG_DEBUG
903 static ibool univ_log_debug_warned;
904 #endif /* UNIV_LOG_DEBUG */
905
906 ut_ad(buf_page_in_file(bpage));
907
908 /* We are not holding block_mutex here.
909 Nevertheless, it is safe to access bpage, because it is
910 io_fixed and oldest_modification != 0. Thus, it cannot be
911 relocated in the buffer pool or removed from flush_list or
912 LRU_list. */
913 ut_ad(!buf_flush_list_mutex_own(buf_pool));
914 ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
915 ut_ad(buf_page_get_io_fix_unlocked(bpage) == BUF_IO_WRITE);
916 ut_ad(bpage->oldest_modification != 0);
917
918 #ifdef UNIV_IBUF_COUNT_DEBUG
919 ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
920 #endif
921 ut_ad(bpage->newest_modification != 0);
922
923 #ifdef UNIV_LOG_DEBUG
924 if (!univ_log_debug_warned) {
925 univ_log_debug_warned = TRUE;
926 fputs("Warning: cannot force log to disk if"
927 " UNIV_LOG_DEBUG is defined!\n"
928 "Crash recovery will not work!\n",
929 stderr);
930 }
931 #else
932 /* Force the log to the disk before writing the modified block */
933 log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
934 #endif
935 switch (buf_page_get_state(bpage)) {
936 case BUF_BLOCK_POOL_WATCH:
937 case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
938 case BUF_BLOCK_NOT_USED:
939 case BUF_BLOCK_READY_FOR_USE:
940 case BUF_BLOCK_MEMORY:
941 case BUF_BLOCK_REMOVE_HASH:
942 ut_error;
943 break;
944 case BUF_BLOCK_ZIP_DIRTY:
945 frame = bpage->zip.data;
946 mach_write_to_8(frame + FIL_PAGE_LSN,
947 bpage->newest_modification);
948
949 ut_a(page_zip_verify_checksum(frame, zip_size));
950
951 memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
952 break;
953 case BUF_BLOCK_FILE_PAGE:
954 frame = bpage->zip.data;
955 if (!frame) {
956 frame = ((buf_block_t*) bpage)->frame;
957 }
958
959 buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
960 bpage->zip.data
961 ? &bpage->zip : NULL,
962 bpage->newest_modification);
963 break;
964 }
965
966 if (!srv_use_doublewrite_buf || !buf_dblwr) {
967 fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
968 sync, buf_page_get_space(bpage), zip_size,
969 buf_page_get_page_no(bpage), 0,
970 zip_size ? zip_size : UNIV_PAGE_SIZE,
971 frame, bpage);
972 } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
973 buf_dblwr_write_single_page(bpage, sync);
974 } else {
975 ut_ad(!sync);
976 buf_dblwr_add_to_batch(bpage);
977 }
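	/* Note that in the batched doublewrite case the page has only been
	buffered at this point; the actual datafile write is posted later,
	when the caller invokes buf_dblwr_flush_buffered_writes() (see the
	note at the top of this function). */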
978
979 /* When doing single page flushing the IO is done synchronously
980 and we flush the changes to disk only for the tablespace we
981 are working on. */
982 if (sync) {
983 ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
984 fil_flush(buf_page_get_space(bpage));
985 buf_page_io_complete(bpage);
986 }
987
988 /* Increment the counter of I/O operations used
989 for selecting LRU policy. */
990 buf_LRU_stat_inc_io();
991 }
992
993 /********************************************************************//**
994 Writes a flushable page asynchronously from the buffer pool to a file.
995 NOTE: in simulated aio we must call
996 os_aio_simulated_wake_handler_threads after we have posted a batch of
997 writes! NOTE: buf_page_get_mutex(bpage) must be held upon entering this
998 function, and it will be released by this function if it returns true.
999 LRU_list_mutex must be held iff performing a single page flush and will be
1000 released by the function if it returns true.
1001 @return TRUE if the page was flushed */
1002 UNIV_INTERN
1003 bool
1004 buf_flush_page(
1005 /*===========*/
1006 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1007 buf_page_t* bpage, /*!< in: buffer control block */
1008 buf_flush_t flush_type, /*!< in: type of flush */
1009 bool sync) /*!< in: true if sync IO request */
1010 {
1011 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1012 /* Hold the LRU list mutex iff called for a single page LRU
1013 flush. A single page LRU flush is already non-performant, and holding
1014 the LRU list mutex allows us to avoid having to store the previous LRU
1015 list page or to restart the LRU scan in
1016 buf_flush_single_page_from_LRU(). */
1017 ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE ||
1018 !mutex_own(&buf_pool->LRU_list_mutex));
1019 ut_ad(flush_type != BUF_FLUSH_SINGLE_PAGE ||
1020 mutex_own(&buf_pool->LRU_list_mutex));
1021 ut_ad(buf_page_in_file(bpage));
1022 ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1023
1024 ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
1025
1026 ut_ad(mutex_own(block_mutex));
1027
1028 ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1029
1030 bool is_uncompressed;
1031
1032 is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1033 ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1034
1035 ibool flush;
1036 rw_lock_t* rw_lock;
1037 bool no_fix_count = bpage->buf_fix_count == 0;
1038
1039 if (!is_uncompressed) {
1040 flush = TRUE;
1041 rw_lock = NULL;
1042
1043 } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)) {
1044 /* This is a heuristic, to avoid expensive S attempts. */
1045 flush = FALSE;
1046 } else {
1047
1048 rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1049
1050 if (flush_type != BUF_FLUSH_LIST) {
1051 flush = rw_lock_s_lock_gen_nowait(
1052 rw_lock, BUF_IO_WRITE);
1053 } else {
1054 /* Will S lock later */
1055 flush = TRUE;
1056 }
1057 }
1058
1059 if (flush) {
1060
1061 /* We are committed to flushing by the time we get here */
1062
1063 mutex_enter(&buf_pool->flush_state_mutex);
1064
1065 buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1066
1067 buf_page_set_flush_type(bpage, flush_type);
1068
1069 if (buf_pool->n_flush[flush_type] == 0) {
1070
1071 os_event_reset(buf_pool->no_flush[flush_type]);
1072 }
1073
1074 ++buf_pool->n_flush[flush_type];
1075 ut_ad(buf_pool->n_flush[flush_type] != 0);
1076
1077 mutex_exit(&buf_pool->flush_state_mutex);
1078
1079 mutex_exit(block_mutex);
1080
1081 if (flush_type == BUF_FLUSH_SINGLE_PAGE)
1082 mutex_exit(&buf_pool->LRU_list_mutex);
1083
1084 if (flush_type == BUF_FLUSH_LIST
1085 && is_uncompressed
1086 && !rw_lock_s_lock_gen_nowait(rw_lock, BUF_IO_WRITE)) {
1087 /* To avoid a possible deadlock involving the doublewrite
1088 buffer, flush it first, because it might be holding
1089 another block->lock. */
1090 buf_dblwr_flush_buffered_writes();
1091
1092 rw_lock_s_lock_gen(rw_lock, BUF_IO_WRITE);
1093 }
1094
1095 /* Even though bpage is not protected by any mutex at this
1096 point, it is safe to access bpage, because it is io_fixed and
1097 oldest_modification != 0. Thus, it cannot be relocated in the
1098 buffer pool or removed from flush_list or LRU_list. */
1099
1100 buf_flush_write_block_low(bpage, flush_type, sync);
1101 }
1102
1103 return(flush);
1104 }
1105
1106 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1107 /********************************************************************//**
1108 Writes a flushable page asynchronously from the buffer pool to a file.
1109 NOTE: block and LRU list mutexes must be held upon entering this function, and
1110 they will be released by this function after flushing. This is loosely based on
1111 buf_flush_batch() and buf_flush_page().
1112 @return TRUE if the page was flushed and the mutexes released */
1113 UNIV_INTERN
1114 ibool
1115 buf_flush_page_try(
1116 /*===============*/
1117 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
1118 buf_block_t* block) /*!< in/out: buffer control block */
1119 {
1120 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1121 ut_ad(mutex_own(&block->mutex));
1122 ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1123
1124 if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1125 return(FALSE);
1126 }
1127
1128 /* The following call will release the LRU list and
1129 block mutex if successful. */
1130 return(buf_flush_page(
1131 buf_pool, &block->page, BUF_FLUSH_SINGLE_PAGE, true));
1132 }
1133 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1134 /***********************************************************//**
1135 Checks whether the page is in the buffer pool and can be flushed.
1136 @return true if the page can be flushed. */
1137 static
1138 bool
1139 buf_flush_check_neighbor(
1140 /*=====================*/
1141 ulint space, /*!< in: space id */
1142 ulint offset, /*!< in: page offset */
1143 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU or
1144 BUF_FLUSH_LIST */
1145 {
1146 buf_page_t* bpage;
1147 buf_pool_t* buf_pool = buf_pool_get(space, offset);
1148 bool ret;
1149 prio_rw_lock_t* hash_lock;
1150 ib_mutex_t* block_mutex;
1151
1152 ut_ad(flush_type == BUF_FLUSH_LRU
1153 || flush_type == BUF_FLUSH_LIST);
1154
1155 /* We only want to flush pages from this buffer pool. */
1156 bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
1157 &hash_lock);
1158
1159 if (!bpage) {
1160
1161 return(false);
1162 }
1163
1164 block_mutex = buf_page_get_mutex(bpage);
1165
1166 mutex_enter(block_mutex);
1167
1168 rw_lock_s_unlock(hash_lock);
1169
1170 ut_a(buf_page_in_file(bpage));
1171
1172 /* We avoid flushing 'non-old' blocks in an LRU flush,
1173 because the flushed blocks are soon freed */
1174
1175 ret = false;
1176 if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1177
1178 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1179 ret = true;
1180 }
1181 }
1182
1183 mutex_exit(block_mutex);
1184
1185 return(ret);
1186 }
1187
1188 /***********************************************************//**
1189 Flushes to disk all flushable pages within the flush area.
1190 @return number of pages flushed */
1191 static
1192 ulint
1193 buf_flush_try_neighbors(
1194 /*====================*/
1195 ulint space, /*!< in: space id */
1196 ulint offset, /*!< in: page offset */
1197 buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or
1198 BUF_FLUSH_LIST */
1199 ulint n_flushed, /*!< in: number of pages
1200 flushed so far in this batch */
1201 ulint n_to_flush) /*!< in: maximum number of pages
1202 we are allowed to flush */
1203 {
1204 ulint i;
1205 ulint low;
1206 ulint high;
1207 buf_pool_t* buf_pool = buf_pool_get(space, offset);
1208
1209 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1210 ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
1211 ut_ad(!buf_flush_list_mutex_own(buf_pool));
1212
1213 if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1214 || srv_flush_neighbors == 0) {
1215 /* If there is little space or neighbor flushing is
1216 not enabled then just flush the victim. */
1217 low = offset;
1218 high = offset + 1;
1219 } else {
1220 /* When flushed, dirty blocks are searched in
1221 neighborhoods of this size, and flushed along with the
1222 original page. */
1223
1224 ulint buf_flush_area;
1225
1226 buf_flush_area = ut_min(
1227 BUF_READ_AHEAD_AREA(buf_pool),
1228 buf_pool->curr_size / 16);
1229
1230 low = (offset / buf_flush_area) * buf_flush_area;
1231 high = (offset / buf_flush_area + 1) * buf_flush_area;
1232
1233 if (srv_flush_neighbors == 1) {
1234 /* adjust 'low' and 'high' to limit
1235 for contiguous dirty area */
1236 if (offset > low) {
1237 for (i = offset - 1;
1238 i >= low
1239 && buf_flush_check_neighbor(
1240 space, i, flush_type);
1241 i--) {
1242 /* do nothing */
1243 }
1244 low = i + 1;
1245 }
1246
1247 for (i = offset + 1;
1248 i < high
1249 && buf_flush_check_neighbor(
1250 space, i, flush_type);
1251 i++) {
1252 /* do nothing */
1253 }
1254 high = i;
1255 }
1256 }
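	/* Worked example (assuming BUF_READ_AHEAD_AREA yields 64 pages and
	curr_size / 16 does not limit it): for offset = 130 we get
	buf_flush_area = 64, low = 128 and high = 192; with
	srv_flush_neighbors == 1 the range is then shrunk to the contiguous
	run of flushable neighbors around page 130. */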
1257
1258 /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
1259
1260 if (high > fil_space_get_size(space)) {
1261 high = fil_space_get_size(space);
1262 }
1263
1264 ulint count = 0;
1265
1266 for (i = low; i < high; i++) {
1267
1268 prio_rw_lock_t* hash_lock;
1269 ib_mutex_t* block_mutex;
1270
1271 if ((count + n_flushed) >= n_to_flush) {
1272
1273 /* We have already flushed enough pages and
1274 should call it a day. There is, however, one
1275 exception. If the page whose neighbors we
1276 are flushing has not been flushed yet then
1277 we'll try to flush the victim that we
1278 selected originally. */
1279 if (i <= offset) {
1280 i = offset;
1281 } else {
1282 break;
1283 }
1284 }
1285
1286 buf_pool = buf_pool_get(space, i);
1287
1288 /* We only want to flush pages from this buffer pool. */
1289 buf_page_t* bpage = buf_page_hash_get_s_locked(buf_pool,
1290 space, i, &hash_lock);
1291
1292 if (bpage == NULL) {
1293
1294 continue;
1295 }
1296
1297 block_mutex = buf_page_get_mutex(bpage);
1298
1299 mutex_enter(block_mutex);
1300
1301 rw_lock_s_unlock(hash_lock);
1302
1303 ut_a(buf_page_in_file(bpage));
1304
1305 /* We avoid flushing 'non-old' blocks in an LRU flush,
1306 because the flushed blocks are soon freed */
1307
1308 if (flush_type != BUF_FLUSH_LRU
1309 || i == offset
1310 || buf_page_is_old(bpage)) {
1311
1312 if (buf_flush_ready_for_flush(bpage, flush_type)
1313 && (i == offset || bpage->buf_fix_count == 0)
1314 && buf_flush_page(
1315 buf_pool, bpage, flush_type, false)) {
1316
1317 ++count;
1318
1319 continue;
1320 }
1321 }
1322
1323 mutex_exit(block_mutex);
1324 }
1325
1326 if (count > 0) {
1327 MONITOR_INC_VALUE_CUMULATIVE(
1328 MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1329 MONITOR_FLUSH_NEIGHBOR_COUNT,
1330 MONITOR_FLUSH_NEIGHBOR_PAGES,
1331 (count - 1));
1332 }
1333
1334 return(count);
1335 }
1336
1337 /********************************************************************//**
1338 Check if the block is modified and ready for flushing. If the block
1339 is ready to flush then flush the page and try to flush its neighbors.
1340
1341 @return TRUE if, depending on the flush type, either LRU or flush list
1342 mutex was released during this function. This does not guarantee that some
1343 pages were written as well.
1344 The number of pages written is added to *count. */
1345 static
1346 ibool
1347 buf_flush_page_and_try_neighbors(
1348 /*=============================*/
1349 buf_page_t* bpage, /*!< in: buffer control block,
1350 must be
1351 buf_page_in_file(bpage) */
1352 buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU
1353 or BUF_FLUSH_LIST */
1354 ulint n_to_flush, /*!< in: number of pages to
1355 flush */
1356 ulint* count) /*!< in/out: number of pages
1357 flushed */
1358 {
1359 ibool flushed;
1360 ib_mutex_t* block_mutex = NULL;
1361 #ifdef UNIV_DEBUG
1362 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1363 #endif /* UNIV_DEBUG */
1364
1365 ut_ad((flush_type == BUF_FLUSH_LRU
1366 && mutex_own(&buf_pool->LRU_list_mutex))
1367 || (flush_type == BUF_FLUSH_LIST
1368 && buf_flush_list_mutex_own(buf_pool)));
1369
1370 if (flush_type == BUF_FLUSH_LRU) {
1371 block_mutex = buf_page_get_mutex(bpage);
1372 mutex_enter(block_mutex);
1373 }
1374
1375 ut_a(buf_page_in_file(bpage)
1376 || (buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH
1377 #ifdef UNIV_DEBUG
1378 && !mutex_own(&buf_pool->LRU_list_mutex)
1379 #endif
1380 ));
1381
1382 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1383 buf_pool_t* buf_pool;
1384
1385 buf_pool = buf_pool_from_bpage(bpage);
1386
1387 if (flush_type == BUF_FLUSH_LRU) {
1388 mutex_exit(&buf_pool->LRU_list_mutex);
1389 }
1390
1391 /* These fields are protected by the buf_page_get_mutex()
1392 mutex. */
1393 /* Read the fields directly in order to avoid asserting on
1394 BUF_BLOCK_REMOVE_HASH pages. */
1395 ulint space = bpage->space;
1396 ulint offset = bpage->offset;
1397
1398 if (flush_type == BUF_FLUSH_LRU) {
1399 mutex_exit(block_mutex);
1400 } else {
1401 buf_flush_list_mutex_exit(buf_pool);
1402 }
1403
1404 /* Try to flush also all the neighbors */
1405 *count += buf_flush_try_neighbors(
1406 space, offset, flush_type, *count, n_to_flush);
1407
1408 if (flush_type == BUF_FLUSH_LRU) {
1409 mutex_enter(&buf_pool->LRU_list_mutex);
1410 } else {
1411 buf_flush_list_mutex_enter(buf_pool);
1412 }
1413 flushed = TRUE;
1414
1415 } else if (flush_type == BUF_FLUSH_LRU) {
1416 mutex_exit(block_mutex);
1417 flushed = FALSE;
1418 } else {
1419 flushed = FALSE;
1420 }
1421
1422 ut_ad((flush_type == BUF_FLUSH_LRU
1423 && mutex_own(&buf_pool->LRU_list_mutex))
1424 || (flush_type == BUF_FLUSH_LIST
1425 && buf_flush_list_mutex_own(buf_pool)));
1426
1427 return(flushed);
1428 }
1429
1430 /*******************************************************************//**
1431 This utility moves the uncompressed frames of pages to the free list.
1432 Note that this function does not actually flush any data to disk. It
1433 just detaches the uncompressed frames from the compressed pages at the
1434 tail of the unzip_LRU and puts those freed frames in the free list.
1435 Note that it is a best effort attempt and it is not guaranteed that
1436 after a call to this function there will be 'max' blocks in the free
1437 list.
1438 @return number of blocks moved to the free list. */
1439 static
1440 ulint
1441 buf_free_from_unzip_LRU_list_batch(
1442 /*===============================*/
1443 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1444 ulint max) /*!< in: desired number of
1445 blocks in the free_list */
1446 {
1447 buf_block_t* block;
1448 ulint scanned = 0;
1449 ulint count = 0;
1450 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1451 ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1452
1453 ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1454
1455 block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1456 while (block != NULL && count < max
1457 && free_len < srv_LRU_scan_depth
1458 && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1459
1460 ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page);
1461
1462 ++scanned;
1463
1464 mutex_enter(block_mutex);
1465
1466 if (buf_LRU_free_page(&block->page, false)) {
1467
1468 mutex_exit(block_mutex);
1469 /* Block was freed. LRU list mutex potentially
1470 released and reacquired */
1471 ++count;
1472 mutex_enter(&buf_pool->LRU_list_mutex);
1473 block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1474
1475 } else {
1476
1477 mutex_exit(block_mutex);
1478 block = UT_LIST_GET_PREV(unzip_LRU, block);
1479 }
1480
1481 free_len = UT_LIST_GET_LEN(buf_pool->free);
1482 lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1483 }
1484
1485 ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1486
1487 if (scanned) {
1488 MONITOR_INC_VALUE_CUMULATIVE(
1489 MONITOR_LRU_BATCH_SCANNED,
1490 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1491 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1492 scanned);
1493 }
1494
1495 return(count);
1496 }
1497
1498 /*******************************************************************//**
1499 This utility flushes dirty blocks from the end of the LRU list.
1500 The calling thread is not allowed to own any latches on pages!
1501 It attempts to make 'max' blocks available in the free list. Note that
1502 it is a best effort attempt and it is not guaranteed that after a call
1503 to this function there will be 'max' blocks in the free list.
1504 The numbers of flushed and evicted pages are returned in *n. */
1505 MY_ATTRIBUTE((nonnull))
1506 static
1507 void
1508 buf_flush_LRU_list_batch(
1509 /*=====================*/
1510 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1511 ulint max, /*!< in: desired number of
1512 blocks in the free_list */
1513 bool limited_scan, /*!< in: if true, allow to scan only up
1514 to srv_LRU_scan_depth pages in total */
1515 flush_counters_t* n) /*!< out: flushed/evicted page
1516 counts */
1517 {
1518 buf_page_t* bpage;
1519 ulint scanned = 0;
1520 ulint lru_position = 0;
1521 ulint max_lru_position;
1522 ulint max_scanned_pages;
1523 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1524 ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1525
1526 n->flushed = 0;
1527 n->evicted = 0;
1528 n->unzip_LRU_evicted = 0;
1529
1530 ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1531
1532 max_scanned_pages = limited_scan ? srv_LRU_scan_depth : lru_len * max;
1533 max_lru_position = ut_min(srv_LRU_scan_depth, lru_len);
1534
1535 bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1536 while (bpage != NULL
1537 && (srv_cleaner_eviction_factor ? n->evicted : n->flushed) < max
1538 && free_len < srv_LRU_scan_depth
1539 && lru_len > BUF_LRU_MIN_LEN
1540 && lru_position < max_lru_position
1541 && scanned < max_scanned_pages) {
1542
1543 ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
1544 ibool evict;
1545 ulint failed_acquire;
1546
1547 ++scanned;
1548 ++lru_position;
1549
1550 failed_acquire = mutex_enter_nowait(block_mutex);
1551
1552 evict = UNIV_LIKELY(!failed_acquire)
1553 && buf_flush_ready_for_replace(bpage);
1554
1555 if (UNIV_LIKELY(!failed_acquire) && !evict) {
1556
1557 mutex_exit(block_mutex);
1558 }
1559
1560 /* If the block is ready to be replaced we try to
1561 free it, i.e., put it on the free list.
1562 Otherwise we try to flush the block and its
1563 neighbors. In this case we'll put it on the
1564 free list in the next pass. We do this extra work
1565 of putting blocks to the free list instead of
1566 just flushing them because after every flush
1567 we have to restart the scan from the tail of
1568 the LRU list and if we don't clear the tail
1569 of the flushed pages then the scan becomes
1570 O(n*n). */
1571 if (evict) {
1572
1573 if (buf_LRU_free_page(bpage, true)) {
1574
1575 mutex_exit(block_mutex);
1576 n->evicted++;
1577 lru_position = 0;
1578 mutex_enter(&buf_pool->LRU_list_mutex);
1579 bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1580 } else {
1581
1582 bpage = UT_LIST_GET_PREV(LRU, bpage);
1583 mutex_exit(block_mutex);
1584 }
1585 } else if (UNIV_LIKELY(!failed_acquire)) {
1586
1587 ulint space;
1588 ulint offset;
1589 buf_page_t* prev_bpage;
1590
1591 prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
1592
1593 /* Save the previous bpage */
1594
1595 if (prev_bpage != NULL) {
1596 space = prev_bpage->space;
1597 offset = prev_bpage->offset;
1598 } else {
1599 space = ULINT_UNDEFINED;
1600 offset = ULINT_UNDEFINED;
1601 }
1602
1603 if (buf_flush_page_and_try_neighbors(
1604 bpage,
1605 BUF_FLUSH_LRU, max, &n->flushed)) {
1606
1607 /* LRU list mutex was released.
1608 reposition the iterator. Note: the
1609 prev block could have been repositioned
1610 too but that should be rare. */
1611
1612 if (prev_bpage != NULL) {
1613
1614 ut_ad(space != ULINT_UNDEFINED);
1615 ut_ad(offset != ULINT_UNDEFINED);
1616
1617 prev_bpage = buf_page_hash_get(
1618 buf_pool, space, offset);
1619 }
1620 }
1621
1622 bpage = prev_bpage;
1623 }
1624
1625 free_len = UT_LIST_GET_LEN(buf_pool->free);
1626 lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1627 }
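	/* Note that when srv_cleaner_eviction_factor is set, the loop above
	is bounded by the number of evicted pages rather than the number of
	flushed pages (see the first loop condition). */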
1628
1629 ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1630
1631 /* We keep track of all flushes happening as part of LRU
1632 flush. When estimating the desired rate at which flush_list
1633 should be flushed, we factor in this value. */
1634 buf_pool->stat.buf_lru_flush_page_count += n->flushed;
1635
1636 if (scanned) {
1637 MONITOR_INC_VALUE_CUMULATIVE(
1638 MONITOR_LRU_BATCH_SCANNED,
1639 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1640 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1641 scanned);
1642 }
1643 }
1644
1645 /*******************************************************************//**
1646 Flush and move pages from LRU or unzip_LRU list to the free list.
1647 Whether LRU or unzip_LRU is used depends on the state of the system.
1648 The number of blocks for which the write request was queued, or, in the
1649 case of unzip_LRU, the number of blocks actually moved to the free list,
1650 is returned in *n. */
1651 MY_ATTRIBUTE((nonnull))
1652 static
1653 void
1654 buf_do_LRU_batch(
1655 /*=============*/
1656 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1657 ulint max, /*!< in: desired number of
1658 blocks in the free_list */
1659 bool limited_scan, /*!< in: if true, allow to scan only up
1660 to srv_LRU_scan_depth pages in total */
1661 flush_counters_t* n) /*!< out: flushed/evicted page
1662 counts */
1663 {
1664 ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1665
1666 if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1667 n->unzip_LRU_evicted
1668 = buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1669 } else {
1670 n->unzip_LRU_evicted = 0;
1671 }
1672
1673 if (max > n->unzip_LRU_evicted) {
1674 buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted,
1675 limited_scan, n);
1676 } else {
1677 n->evicted = 0;
1678 n->flushed = 0;
1679 }
1680
1681 n->evicted += n->unzip_LRU_evicted;
1682 }
1683
1684 /*******************************************************************//**
1685 This utility flushes dirty blocks from the end of the flush_list.
1686 The calling thread is not allowed to own any latches on pages!
1687 @return number of blocks for which the write request was queued;
1688 ULINT_UNDEFINED if there was a flush of the same type already
1689 running */
1690 static
1691 ulint
1692 buf_do_flush_list_batch(
1693 /*====================*/
1694 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1695 ulint min_n, /*!< in: wished minimum number
1696 of blocks flushed (it is not
1697 guaranteed that the actual
1698 number is that big, though) */
1699 lsn_t lsn_limit) /*!< all blocks whose
1700 oldest_modification is smaller
1701 than this should be flushed (if
1702 their number does not exceed
1703 min_n) */
1704 {
1705 ulint count = 0;
1706 ulint scanned = 0;
1707
1708 /* Start from the end of the list looking for a suitable
1709 block to be flushed. */
1710 buf_flush_list_mutex_enter(buf_pool);
1711 ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1712
1713 /* In order not to degenerate this scan to O(n*n) we attempt
1714 to preserve pointer of previous block in the flush list. To do
1715 so we declare it a hazard pointer. Any thread working on the
1716 flush list must check the hazard pointer and if it is removing
1717 the same block then it must reset it. */
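/* Illustration: suppose this thread saves prev = P as the hazard
pointer and the flush list mutex may then be released inside
buf_flush_page_and_try_neighbors() below. If another thread removes P
from the flush_list in the meantime, that thread resets the hazard
pointer; the buf_flush_is_hp() check afterwards then fails and the
scan restarts from the list tail instead of following a stale pointer. */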
1718 for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1719 count < min_n && bpage != NULL && len > 0
1720 && bpage->oldest_modification < lsn_limit;
1721 ++scanned) {
1722
1723 buf_page_t* prev;
1724
1725 ut_a(bpage->oldest_modification > 0);
1726 ut_ad(bpage->in_flush_list);
1727
1728 prev = UT_LIST_GET_PREV(list, bpage);
1729 buf_flush_set_hp(buf_pool, prev);
1730
1731 #ifdef UNIV_DEBUG
1732 bool flushed =
1733 #endif /* UNIV_DEBUG */
1734 buf_flush_page_and_try_neighbors(
1735 bpage, BUF_FLUSH_LIST, min_n, &count);
1736
1737 ut_ad(flushed || buf_flush_is_hp(buf_pool, prev));
1738
1739 if (!buf_flush_is_hp(buf_pool, prev)) {
1740 /* The hazard pointer was reset by some other
1741 thread. Restart the scan. */
1742 ut_ad(buf_flush_is_hp(buf_pool, NULL));
1743 bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1744 len = UT_LIST_GET_LEN(buf_pool->flush_list);
1745 } else {
1746 bpage = prev;
1747 --len;
1748 buf_flush_set_hp(buf_pool, NULL);
1749 }
1750
1751 ut_ad(!bpage || bpage->in_flush_list);
1752 }
1753
1754 buf_flush_list_mutex_exit(buf_pool);
1755
1756 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
1757 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1758 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1759 scanned);
1760
1761 return(count);
1762 }
1763
1764 /*******************************************************************//**
1765 This utility flushes dirty blocks from the end of the LRU list or flush_list.
1766 NOTE 1: in the case of an LRU flush the calling thread may own latches to
1767 pages: to avoid deadlocks, this function must be written so that it cannot
1768 end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1769 the calling thread is not allowed to own any latches on pages!
1770 The number of blocks for which the write request was queued is returned in *n. */
1771 MY_ATTRIBUTE((nonnull))
1772 static
1773 void
1774 buf_flush_batch(
1775 /*============*/
1776 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1777 buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or
1778 BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
1779 then the caller must not own any
1780 latches on pages */
1781 ulint min_n, /*!< in: wished minimum number of blocks
1782 flushed (it is not guaranteed that the
1783 actual number is that big, though) */
1784 lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST
1785 all blocks whose oldest_modification is
1786 smaller than this should be flushed
1787 (if their number does not exceed
1788 min_n), otherwise ignored */
1789 bool limited_lru_scan,/*!< in: for LRU flushes, if true,
1790 allow to scan only up to
1791 srv_LRU_scan_depth pages in total */
1792 flush_counters_t* n) /*!< out: flushed/evicted page
1793 counts */
1794 {
1795 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1796 #ifdef UNIV_SYNC_DEBUG
1797 ut_ad((flush_type != BUF_FLUSH_LIST)
1798 || sync_thread_levels_empty_except_dict());
1799 #endif /* UNIV_SYNC_DEBUG */
1800
1801 /* Note: The buffer pool mutexes are released and reacquired within
1802 the flush functions. */
1803 switch (flush_type) {
1804 case BUF_FLUSH_LRU:
1805 mutex_enter(&buf_pool->LRU_list_mutex);
1806 buf_do_LRU_batch(buf_pool, min_n, limited_lru_scan, n);
1807 mutex_exit(&buf_pool->LRU_list_mutex);
1808 break;
1809 case BUF_FLUSH_LIST:
1810 ut_ad(!limited_lru_scan);
1811 n->flushed = buf_do_flush_list_batch(buf_pool, min_n,
1812 lsn_limit);
1813 n->evicted = 0;
1814 break;
1815 default:
1816 ut_error;
1817 }
1818
1819 #ifdef UNIV_DEBUG
1820 if (buf_debug_prints && n->flushed > 0) {
1821 fprintf(stderr, flush_type == BUF_FLUSH_LRU
1822 ? "Flushed %lu pages in LRU flush\n"
1823 : "Flushed %lu pages in flush list flush\n",
1824 (ulong) n->flushed);
1825 }
1826 #endif /* UNIV_DEBUG */
1827 }
1828
1829 /******************************************************************//**
1830 Gather the aggregated stats for both flush list and LRU list flushing */
1831 static
1832 void
1833 buf_flush_common(
1834 /*=============*/
1835 buf_flush_t flush_type, /*!< in: type of flush */
1836 ulint page_count) /*!< in: number of pages flushed */
1837 {
1838 if (page_count) {
1839 buf_dblwr_flush_buffered_writes();
1840 }
1841
1842 ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1843
1844 #ifdef UNIV_DEBUG
1845 if (buf_debug_prints && page_count > 0) {
1846 fprintf(stderr, flush_type == BUF_FLUSH_LRU
1847 ? "Flushed %lu pages in LRU flush\n"
1848 : "Flushed %lu pages in flush list flush\n",
1849 (ulong) page_count);
1850 }
1851 #endif /* UNIV_DEBUG */
1852
1853 srv_stats.buf_pool_flushed.add(page_count);
1854 }
1855
1856 /******************************************************************//**
1857 Start a buffer flush batch for LRU or flush list */
1858 static
1859 ibool
1860 buf_flush_start(
1861 /*============*/
1862 buf_pool_t* buf_pool, /*!< buffer pool instance */
1863 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1864 or BUF_FLUSH_LIST */
1865 {
1866 mutex_enter(&buf_pool->flush_state_mutex);
1867
1868 if (buf_pool->n_flush[flush_type] > 0
1869 || buf_pool->init_flush[flush_type] == TRUE) {
1870
1871 /* There is already a flush batch of the same type running */
1872
1873 mutex_exit(&buf_pool->flush_state_mutex);
1874
1875 return(FALSE);
1876 }
1877
1878 buf_pool->init_flush[flush_type] = TRUE;
1879
1880 mutex_exit(&buf_pool->flush_state_mutex);
1881
1882 return(TRUE);
1883 }
1884
1885 /******************************************************************//**
1886 End a buffer flush batch for LRU or flush list */
1887 static
1888 void
1889 buf_flush_end(
1890 /*==========*/
1891 buf_pool_t* buf_pool, /*!< buffer pool instance */
1892 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1893 or BUF_FLUSH_LIST */
1894 {
1895 mutex_enter(&buf_pool->flush_state_mutex);
1896
1897 buf_pool->init_flush[flush_type] = FALSE;
1898
1899 buf_pool->try_LRU_scan = TRUE;
1900
1901 if (buf_pool->n_flush[flush_type] == 0) {
1902
1903 /* The running flush batch has ended */
1904
1905 os_event_set(buf_pool->no_flush[flush_type]);
1906 }
1907
1908 mutex_exit(&buf_pool->flush_state_mutex);
1909 }
1910
1911 /******************************************************************//**
1912 Waits until a flush batch of the given type ends */
1913 UNIV_INTERN
1914 void
1915 buf_flush_wait_batch_end(
1916 /*=====================*/
1917 buf_pool_t* buf_pool, /*!< buffer pool instance */
1918 buf_flush_t type) /*!< in: BUF_FLUSH_LRU
1919 or BUF_FLUSH_LIST */
1920 {
1921 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1922
1923 if (buf_pool == NULL) {
1924 ulint i;
1925
1926 for (i = 0; i < srv_buf_pool_instances; ++i) {
1927 buf_pool_t* buf_pool;
1928
1929 buf_pool = buf_pool_from_array(i);
1930
1931 thd_wait_begin(NULL, THD_WAIT_DISKIO);
1932 os_event_wait(buf_pool->no_flush[type]);
1933 thd_wait_end(NULL);
1934 }
1935 } else {
1936 thd_wait_begin(NULL, THD_WAIT_DISKIO);
1937 os_event_wait(buf_pool->no_flush[type]);
1938 thd_wait_end(NULL);
1939 }
1940 }
1941
1942 /*******************************************************************//**
1943 This utility flushes dirty blocks from the end of the LRU list and also
1944 puts replaceable clean pages from the end of the LRU list to the free
1945 list.
1946 NOTE: The calling thread is not allowed to own any latches on pages!
1947 @return true if a batch was queued successfully. false if another batch
1948 of the same type was already running. */
1949 MY_ATTRIBUTE((nonnull))
1950 static
1951 bool
1952 buf_flush_LRU(
1953 /*==========*/
1954 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
1955 ulint min_n, /*!< in: wished minimum number of blocks
1956 flushed (it is not guaranteed that the
1957 actual number is that big, though) */
1958 bool limited_scan, /*!< in: if true, allow to scan
1959 only up to srv_LRU_scan_depth
1960 pages in total */
1961 flush_counters_t *n) /*!< out: flushed/evicted page
1962 counts */
1963 {
1964 if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
1965 n->flushed = 0;
1966 n->evicted = 0;
1967 n->unzip_LRU_evicted = 0;
1968 return(false);
1969 }
1970
1971 buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, limited_scan, n);
1972
1973 buf_flush_end(buf_pool, BUF_FLUSH_LRU);
1974
1975 buf_flush_common(BUF_FLUSH_LRU, n->flushed);
1976
1977 return(true);
1978 }
1979
1980 /*******************************************************************//**
1981 This utility flushes dirty blocks from the end of the flush list of
1982 all buffer pool instances.
1983 NOTE: The calling thread is not allowed to own any latches on pages!
1984 @return true if a batch was queued successfully for each buffer pool
1985 instance. false if another batch of the same type was already running in
1986 at least one of the buffer pool instances */
1987 UNIV_INTERN
1988 bool
1989 buf_flush_list(
1990 /*===========*/
1991 ulint min_n, /*!< in: wished minimum number of blocks
1992 flushed (it is not guaranteed that the
1993 actual number is that big, though) */
1994 lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST all
1995 blocks whose oldest_modification is
1996 smaller than this should be flushed
1997 (if their number does not exceed
1998 min_n), otherwise ignored */
1999 ulint* n_processed) /*!< out: the number of pages
2000 which were processed is passed
2001 back to caller. Ignored if NULL */
2002
2003 {
2004 ulint i;
2005
2006 ulint requested_pages[MAX_BUFFER_POOLS];
2007 bool active_instance[MAX_BUFFER_POOLS];
2008 ulint remaining_instances = srv_buf_pool_instances;
2009 bool timeout = false;
2010 ulint flush_start_time = 0;
2011
2012 for (i = 0; i < srv_buf_pool_instances; i++) {
2013 requested_pages[i] = 0;
2014 active_instance[i] = true;
2015 }
2016
2017 if (n_processed) {
2018 *n_processed = 0;
2019 }
2020
2021 if (min_n != ULINT_MAX) {
2022 /* Ensure that flushing is spread evenly amongst the
2023 buffer pool instances. When min_n is ULINT_MAX
2024 we need to flush everything up to the lsn limit
2025 so no limit here. */
2026 min_n = (min_n + srv_buf_pool_instances - 1)
2027 / srv_buf_pool_instances;
2028 if (lsn_limit != LSN_MAX) {
2029 flush_start_time = ut_time_ms();
2030 }
2031 }
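/* Illustration (hypothetical values): with min_n = 1000 and
srv_buf_pool_instances = 8, the rounding-up division above yields
(1000 + 7) / 8 = 125, so each instance is asked to flush at most
125 pages rather than the full 1000. */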
2032
2033 /* Flush to lsn_limit in all buffer pool instances */
2034 while (remaining_instances && !timeout) {
2035
2036 ulint flush_common_batch = 0;
2037
2038 for (i = 0; i < srv_buf_pool_instances; i++) {
2039
2040 if (flush_start_time
2041 && (ut_time_ms() - flush_start_time
2042 >= srv_cleaner_max_flush_time)) {
2043
2044 timeout = true;
2045 break;
2046 }
2047
2048 if (active_instance[i]) {
2049
2050 buf_pool_t* buf_pool;
2051 ulint chunk_size;
2052 flush_counters_t n;
2053
2054 chunk_size = ut_min(
2055 srv_cleaner_flush_chunk_size,
2056 min_n - requested_pages[i]);
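/* Illustration (hypothetical values): with a per-instance
min_n of 125 and srv_cleaner_flush_chunk_size = 100, the
first pass over this instance requests a chunk of
min(100, 125 - 0) = 100 pages and the next pass the
remaining min(100, 125 - 100) = 25 pages. */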
2057
2058 buf_pool = buf_pool_from_array(i);
2059
2060 if (!buf_flush_start(buf_pool,
2061 BUF_FLUSH_LIST)) {
2062
2063 continue;
2064 }
2065
2066 buf_flush_batch(buf_pool, BUF_FLUSH_LIST,
2067 chunk_size, lsn_limit, false,
2068 &n);
2069
2070 buf_flush_end(buf_pool, BUF_FLUSH_LIST);
2071
2072 flush_common_batch += n.flushed;
2073
2074 if (n_processed) {
2075 *n_processed += n.flushed;
2076 }
2077
2078 requested_pages[i] += chunk_size;
2079
2080 if (requested_pages[i] >= min_n
2081 || !n.flushed) {
2082
2083 active_instance[i] = false;
2084 remaining_instances--;
2085 }
2086
2087 if (n.flushed) {
2088 MONITOR_INC_VALUE_CUMULATIVE(
2089 MONITOR_FLUSH_BATCH_TOTAL_PAGE,
2090 MONITOR_FLUSH_BATCH_COUNT,
2091 MONITOR_FLUSH_BATCH_PAGES,
2092 n.flushed);
2093 }
2094 }
2095 }
2096
2097 buf_flush_common(BUF_FLUSH_LIST, flush_common_batch);
2098 }
2099
2100 /* If we haven't flushed all the instances due to timeout or a repeat
2101 failure to start a flush, return failure */
2102 for (i = 0; i < srv_buf_pool_instances; i++) {
2103 if (active_instance[i]) {
2104 return(false);
2105 }
2106 }
2107
2108 return(true);
2109 }
2110
2111 /******************************************************************//**
2112 This function picks up a single dirty page from the tail of the LRU
2113 list, flushes it, removes it from page_hash and LRU list and puts
2114 it on the free list. It is called from user threads when they are
2115 unable to find a replaceable page at the tail of the LRU list i.e.:
2116 when the background LRU flushing in the page_cleaner thread is not
2117 fast enough to keep pace with the workload.
2118 @return TRUE if success. */
2119 UNIV_INTERN
2120 ibool
2121 buf_flush_single_page_from_LRU(
2122 /*===========================*/
2123 buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
2124 {
2125 ulint scanned;
2126 buf_page_t* bpage;
2127 ibool flushed = FALSE;
2128
2129 mutex_enter(&buf_pool->LRU_list_mutex);
2130
2131 for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
2132 bpage != NULL;
2133 bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
2134
2135 ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
2136
2137 mutex_enter(block_mutex);
2138
2139 if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) {
2140
2141 /* The following call will release the LRU list
2142 and block mutex. */
2143
2144 flushed = buf_flush_page(buf_pool, bpage,
2145 BUF_FLUSH_SINGLE_PAGE, true);
2146
2147 if (flushed) {
2148 /* buf_flush_page() will release the
2149 block mutex */
2150 break;
2151 }
2152 }
2153
2154 mutex_exit(block_mutex);
2155 }
2156
2157 if (!flushed)
2158 mutex_exit(&buf_pool->LRU_list_mutex);
2159
2160 MONITOR_INC_VALUE_CUMULATIVE(
2161 MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2162 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2163 MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2164 scanned);
2165
2166 if (bpage == NULL) {
2167 /* Can't find a single flushable page. */
2168 return(FALSE);
2169 }
2170
2171
2172 ibool freed = FALSE;
2173
2174 /* At this point the page has been written to the disk.
2175 Since we are not holding the LRU list mutex or buf_page_get_mutex(),
2176 we cannot use the bpage safely. It may have been plucked out
2177 of the LRU list by some other thread or it may even have
2178 relocated in case of a compressed page. We need to start
2179 the scan of LRU list again to remove the block from the LRU
2180 list and put it on the free list. */
2181 mutex_enter(&buf_pool->LRU_list_mutex);
2182
2183 for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
2184 bpage != NULL;
2185 bpage = UT_LIST_GET_PREV(LRU, bpage)) {
2186
2187 ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
2188
2189 mutex_enter(block_mutex);
2190
2191 ibool ready = buf_flush_ready_for_replace(bpage);
2192
2193 if (ready) {
2194 bool evict_zip;
2195
2196 evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);
2197
2198 freed = buf_LRU_free_page(bpage, evict_zip);
2199
2200 mutex_exit(block_mutex);
2201
2202 break;
2203 }
2204
2205 mutex_exit(block_mutex);
2206
2207 }
2208
2209 if (!freed)
2210 mutex_exit(&buf_pool->LRU_list_mutex);
2211
2212 return(freed);
2213 }
2214
2215 /*********************************************************************//**
2216 Clears up tail of the LRU lists:
2217 * Put replaceable pages at the tail of LRU to the free list
2218 * Flush dirty pages at the tail of LRU to the disk
2219 The depth to which we scan each buffer pool is controlled by dynamic
2220 config parameter innodb_LRU_scan_depth.
2221 @return number of flushed and evicted pages */
2222 UNIV_INTERN
2223 ulint
2224 buf_flush_LRU_tail(void)
2225 /*====================*/
2226 {
2227 ulint total_flushed = 0;
2228 ulint total_evicted = 0;
2229 ulint start_time = ut_time_ms();
2230 ulint scan_depth[MAX_BUFFER_POOLS];
2231 ulint requested_pages[MAX_BUFFER_POOLS];
2232 bool active_instance[MAX_BUFFER_POOLS];
2233 bool limited_scan[MAX_BUFFER_POOLS];
2234 ulint previous_evicted[MAX_BUFFER_POOLS];
2235 ulint remaining_instances = srv_buf_pool_instances;
2236 ulint lru_chunk_size = srv_cleaner_lru_chunk_size;
2237 ulint free_list_lwm = srv_LRU_scan_depth / 100
2238 * srv_cleaner_free_list_lwm;
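/* Illustration (hypothetical values): with srv_LRU_scan_depth = 1024
and srv_cleaner_free_list_lwm = 10 (percent), the low water mark above
is 1024 / 100 * 10 = 10 * 10 = 100 free pages per instance; note that
the integer division by 100 happens first, so the result is slightly
below an exact 10% of 1024. */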
2239
2240 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2241
2242 const buf_pool_t* buf_pool = buf_pool_from_array(i);
2243
2244 scan_depth[i] = ut_min(srv_LRU_scan_depth,
2245 UT_LIST_GET_LEN(buf_pool->LRU));
2246 requested_pages[i] = 0;
2247 active_instance[i] = true;
2248 limited_scan[i] = true;
2249 previous_evicted[i] = 0;
2250 }
2251
2252 while (remaining_instances) {
2253
2254 if (ut_time_ms() - start_time >= srv_cleaner_max_lru_time) {
2255
2256 break;
2257 }
2258
2259 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2260
2261 if (!active_instance[i]) {
2262 continue;
2263 }
2264
2265 ulint free_len = free_list_lwm;
2266 buf_pool_t* buf_pool = buf_pool_from_array(i);
2267
2268 do {
2269 flush_counters_t n;
2270
2271 ut_ad(requested_pages[i] <= scan_depth[i]);
2272
2273 /* Currently page_cleaner is the only thread
2274 that can trigger an LRU flush. It is possible
2275 that a batch triggered during the last iteration is
2276 still running. */
2277 if (buf_flush_LRU(buf_pool, lru_chunk_size,
2278 limited_scan[i], &n)) {
2279
2280 /* Allowed only one batch per
2281 buffer pool instance. */
2282 buf_flush_wait_batch_end(
2283 buf_pool, BUF_FLUSH_LRU);
2284 }
2285
2286 total_flushed += n.flushed;
2287
2288 /* When we evict fewer pages than we did on a
2289 previous try we relax the LRU scan limit in
2290 order to attempt to evict more */
2291 limited_scan[i]
2292 = (previous_evicted[i] > n.evicted);
2293 previous_evicted[i] = n.evicted;
2294 total_evicted += n.evicted;
2295
2296 requested_pages[i] += lru_chunk_size;
2297
2298 /* If we failed to flush or evict this
2299 instance, do not bother anymore. But take into
2300 account that we might have zero flushed pages
2301 because the flushing request was fully
2302 satisfied by unzip_LRU evictions. */
2303 if (requested_pages[i] >= scan_depth[i]
2304 || !(srv_cleaner_eviction_factor
2305 ? n.evicted
2306 : (n.flushed + n.unzip_LRU_evicted))) {
2307
2308 active_instance[i] = false;
2309 remaining_instances--;
2310 } else {
2311
2312 free_len = UT_LIST_GET_LEN(
2313 buf_pool->free);
2314 }
2315 } while (active_instance[i]
2316 && free_len <= free_list_lwm);
2317 }
2318 }
2319
2320 if (total_flushed) {
2321 MONITOR_INC_VALUE_CUMULATIVE(
2322 MONITOR_LRU_BATCH_TOTAL_PAGE,
2323 MONITOR_LRU_BATCH_COUNT,
2324 MONITOR_LRU_BATCH_PAGES,
2325 total_flushed);
2326 }
2327 return(total_flushed + total_evicted);
2328 }
2329
2330 /*********************************************************************//**
2331 Wait for any possible LRU flushes that are in progress to end. */
2332 UNIV_INTERN
2333 void
2334 buf_flush_wait_LRU_batch_end(void)
2335 /*==============================*/
2336 {
2337 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2338 buf_pool_t* buf_pool;
2339
2340 buf_pool = buf_pool_from_array(i);
2341
2342 mutex_enter(&buf_pool->flush_state_mutex);
2343
2344 if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2345 || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2346
2347 mutex_exit(&buf_pool->flush_state_mutex);
2348 buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2349 } else {
2350 mutex_exit(&buf_pool->flush_state_mutex);
2351 }
2352 }
2353 }
2354
2355 /*********************************************************************//**
2356 Flush a batch of dirty pages from the flush list
2357 @return number of pages flushed, 0 if no page is flushed or if another
2358 flush_list type batch is running */
2359 static
2360 ulint
2361 page_cleaner_do_flush_batch(
2362 /*========================*/
2363 ulint n_to_flush, /*!< in: number of pages that
2364 we should attempt to flush. */
2365 lsn_t lsn_limit) /*!< in: LSN up to which flushing
2366 must happen */
2367 {
2368 ulint n_flushed;
2369
2370 buf_flush_list(n_to_flush, lsn_limit, &n_flushed);
2371
2372 return(n_flushed);
2373 }
2374
2375 /*********************************************************************//**
2376 Calculates if flushing is required based on number of dirty pages in
2377 the buffer pool.
2378 @return percent of io_capacity to flush to manage dirty page ratio */
2379 static
2380 ulint
2381 af_get_pct_for_dirty()
2382 /*==================*/
2383 {
2384 ulint dirty_pct = buf_get_modified_ratio_pct();
2385
2386 if (dirty_pct > 0 && srv_max_buf_pool_modified_pct == 0) {
2387 return(100);
2388 }
2389
2390 ut_a(srv_max_dirty_pages_pct_lwm
2391 <= srv_max_buf_pool_modified_pct);
2392
2393 if (srv_max_dirty_pages_pct_lwm == 0) {
2394 /* The user has not set the option to preflush dirty
2395 pages as we approach the high water mark. */
2396 if (dirty_pct > srv_max_buf_pool_modified_pct) {
2397 /* We have crossed the high water mark of dirty
2398 pages. In this case we start flushing at 100% of
2399 innodb_io_capacity. */
2400 return(100);
2401 }
2402 } else if (dirty_pct > srv_max_dirty_pages_pct_lwm) {
2403 /* We should start flushing pages gradually. */
2404 return((dirty_pct * 100)
2405 / (srv_max_buf_pool_modified_pct + 1));
2406 }
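/* Illustration (hypothetical values): with dirty_pct = 50,
srv_max_dirty_pages_pct_lwm = 10 and srv_max_buf_pool_modified_pct = 75,
the branch above returns (50 * 100) / (75 + 1) = 65, i.e. flush at 65%
of innodb_io_capacity; the + 1 in the divisor simply guards against a
zero divisor. */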
2407
2408 return(0);
2409 }
2410
2411 /*********************************************************************//**
2412 Calculates if flushing is required based on redo generation rate.
2413 @return percent of io_capacity to flush to manage redo space */
2414 static
2415 ulint
2416 af_get_pct_for_lsn(
2417 /*===============*/
2418 lsn_t age) /*!< in: current age of LSN. */
2419 {
2420 lsn_t max_async_age;
2421 lsn_t lsn_age_factor;
2422 lsn_t af_lwm = (srv_adaptive_flushing_lwm
2423 * log_get_capacity()) / 100;
2424
2425 if (age < af_lwm) {
2426 /* No adaptive flushing. */
2427 return(0);
2428 }
2429
2430 max_async_age = log_get_max_modified_age_async();
2431
2432 if (age < max_async_age && !srv_adaptive_flushing) {
2433 /* We have still not reached the max_async point and
2434 the user has disabled adaptive flushing. */
2435 return(0);
2436 }
2437
2438 /* If we are here then we know that either:
2439 1) User has enabled adaptive flushing
2440 2) User may have disabled adaptive flushing but we have reached
2441 max_async_age. */
2442 lsn_age_factor = (age * 100) / max_async_age;
2443
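/* Illustration of the two formulas below (hypothetical values): with
srv_max_io_capacity = 2000 and srv_io_capacity = 200 (integer ratio 10)
and lsn_age_factor = 50, the legacy formula gives roughly
10 * 50 * sqrt(50) / 7.5 ~= 471, while the high-checkpoint variant gives
roughly 10 * 50 * 50 * sqrt(50) / 700.5 ~= 252, both expressed as a
percentage of innodb_io_capacity and capped later by the caller. */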
2444 ut_ad(srv_max_io_capacity >= srv_io_capacity);
2445 switch ((srv_cleaner_lsn_age_factor_t)srv_cleaner_lsn_age_factor) {
2446 case SRV_CLEANER_LSN_AGE_FACTOR_LEGACY:
2447 return(static_cast<ulint>(
2448 ((srv_max_io_capacity / srv_io_capacity)
2449 * (lsn_age_factor
2450 * sqrt((double)lsn_age_factor)))
2451 / 7.5));
2452 case SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT:
2453 return(static_cast<ulint>(
2454 ((srv_max_io_capacity / srv_io_capacity)
2455 * (lsn_age_factor * lsn_age_factor
2456 * sqrt((double)lsn_age_factor)))
2457 / 700.5));
2458 default:
2459 ut_error;
2460 }
2461 }
2462
2463 /*********************************************************************//**
2464 This function is called approximately once every second by the
2465 page_cleaner thread. Based on various factors it decides if there is a
2466 need to do flushing. If flushing is needed it is performed and the
2467 number of pages flushed is returned.
2468 @return number of pages flushed */
2469 static
2470 ulint
2471 page_cleaner_flush_pages_if_needed(void)
2472 /*====================================*/
2473 {
2474 static lsn_t lsn_avg_rate = 0;
2475 static lsn_t prev_lsn = 0;
2476 static lsn_t last_lsn = 0;
2477 static ulint sum_pages = 0;
2478 static ulint last_pages = 0;
2479 static ulint prev_pages = 0;
2480 static ulint avg_page_rate = 0;
2481 static ulint n_iterations = 0;
2482 lsn_t oldest_lsn;
2483 lsn_t cur_lsn;
2484 lsn_t age;
2485 lsn_t lsn_rate;
2486 ulint n_pages = 0;
2487 ulint pct_for_dirty = 0;
2488 ulint pct_for_lsn = 0;
2489 ulint pct_total = 0;
2490 int age_factor = 0;
2491
2492 cur_lsn = log_get_lsn();
2493
2494 if (prev_lsn == 0) {
2495 /* First time around. */
2496 prev_lsn = cur_lsn;
2497 return(0);
2498 }
2499
2500 if (prev_lsn == cur_lsn) {
2501 return(0);
2502 }
2503
2504 /* We update our variables every srv_flushing_avg_loops
2505 iterations to smooth out transition in workload. */
2506 if (++n_iterations >= srv_flushing_avg_loops) {
2507
2508 avg_page_rate = ((sum_pages / srv_flushing_avg_loops)
2509 + avg_page_rate) / 2;
2510
2511 /* How much LSN we have generated since last call. */
2512 lsn_rate = (cur_lsn - prev_lsn) / srv_flushing_avg_loops;
2513
2514 lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2515
2516 prev_lsn = cur_lsn;
2517
2518 n_iterations = 0;
2519
2520 sum_pages = 0;
2521 }
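/* Illustration (hypothetical values): with srv_flushing_avg_loops = 30,
sum_pages = 3000 over the last window and a previous avg_page_rate of 80,
the block above computes (3000 / 30 + 80) / 2 = 90; the newest window
receives half of the total weight, with older windows decaying
geometrically. */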
2522
2523 oldest_lsn = buf_pool_get_oldest_modification();
2524
2525 ut_ad(oldest_lsn <= log_get_lsn());
2526
2527 age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2528
2529 pct_for_dirty = af_get_pct_for_dirty();
2530 pct_for_lsn = af_get_pct_for_lsn(age);
2531
2532 pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2533
2534 /* Cap the maximum IO capacity that we are going to use by
2535 max_io_capacity. */
2536 n_pages = PCT_IO(pct_total);
2537 if (age < log_get_max_modified_age_async())
2538 n_pages = (n_pages + avg_page_rate) / 2;
2539
2540 if (n_pages > srv_max_io_capacity) {
2541 n_pages = srv_max_io_capacity;
2542 }
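/* Illustration (hypothetical values): PCT_IO(65), roughly 65% of
innodb_io_capacity, is about 130 pages when innodb_io_capacity = 200;
while we are still below the max_modified_age_async point that is
averaged with avg_page_rate = 90 to give (130 + 90) / 2 = 110, and the
result is finally capped at srv_max_io_capacity above. */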
2543
2544 if (last_pages && cur_lsn - last_lsn > lsn_avg_rate / 2) {
2545 age_factor = static_cast<int>(prev_pages / last_pages);
2546 }
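/* Illustration: with age_factor = 0 the batch below is given an
lsn_limit of oldest_lsn + lsn_avg_rate, i.e. it targets roughly one
iteration's (about one second's) worth of average redo growth beyond
the current oldest dirty page; a larger age_factor pushes the limit
proportionally further ahead. */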
2547
2548 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2549
2550 prev_pages = n_pages;
2551 n_pages = page_cleaner_do_flush_batch(
2552 n_pages, oldest_lsn + lsn_avg_rate * (age_factor + 1));
2553
2554 last_lsn= cur_lsn;
2555 last_pages= n_pages + 1;
2556
2557 MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2558 MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2559 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2560 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2561
2562 if (n_pages) {
2563 MONITOR_INC_VALUE_CUMULATIVE(
2564 MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
2565 MONITOR_FLUSH_ADAPTIVE_COUNT,
2566 MONITOR_FLUSH_ADAPTIVE_PAGES,
2567 n_pages);
2568
2569 sum_pages += n_pages;
2570 }
2571
2572 return(n_pages);
2573 }
2574
2575 /*********************************************************************//**
2576 Puts the page_cleaner thread to sleep if it has finished work in less
2577 than a second */
2578 static void
2579 page_cleaner_sleep_if_needed(ut_monotonic_time next_loop_time) {
2580 /* No sleep if we are cleaning the buffer pool during the shutdown
2581 with everything else finished */
2582 if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)
2583 return;
2584
2585 const ut_monotonic_time cur_time = ut_monotonic_time_ms();
2586
2587 if (next_loop_time.ms > cur_time.ms) {
2588 /* Get sleep interval in micro seconds. We use
2589 ut_min() to avoid long sleep in case of
2590 wrap around. */
2591 os_thread_sleep(ut_min(
2592 1000000, (next_loop_time.ms - cur_time.ms) * 1000));
2593 }
2594 }
2595
2596 /*********************************************************************//**
2597 Returns the aggregate free list length over all buffer pool instances.
2598 @return total free list length. */
2599 MY_ATTRIBUTE((warn_unused_result))
2600 static
2601 ulint
2602 buf_get_total_free_list_length(void)
2603 /*================================*/
2604 {
2605 ulint result = 0;
2606
2607 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2608
2609 result += UT_LIST_GET_LEN(buf_pool_from_array(i)->free);
2610 }
2611
2612 return result;
2613 }
2614
2615 /** Returns the aggregate LRU list length over all buffer pool instances.
2616 @return total LRU list length. */
2617 MY_ATTRIBUTE((warn_unused_result))
2618 static
2619 ulint
2620 buf_get_total_LRU_list_length(void)
2621 {
2622 ulint result = 0;
2623
2624 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2625
2626 result += UT_LIST_GET_LEN(buf_pool_from_array(i)->LRU);
2627 }
2628
2629 return result;
2630 }
2631
2632 /*********************************************************************//**
2633 Adjust the desired page cleaner thread sleep time for LRU flushes. */
2634 MY_ATTRIBUTE((nonnull))
2635 static void
2636 page_cleaner_adapt_lru_sleep_time(
2637 ut_monotonic_time * lru_sleep_time, /*!< in/out: desired page cleaner thread
2638 sleep time for LRU flushes */
2639 ulint lru_n_flushed) /*!< in: number of pages flushed in the previous batch */
2640 {
2641 ulint free_len = buf_get_total_free_list_length();
2642 ulint max_free_len = ut_min(buf_get_total_LRU_list_length(),
2643 srv_LRU_scan_depth * srv_buf_pool_instances);
2644
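/* Illustration (hypothetical values): with srv_LRU_scan_depth = 1024
and 8 buffer pool instances, max_free_len is at most 8192 pages, so the
branches below compare free_len against roughly 81 (1%), 409 (5%) and
1638 (20%) pages: below 1% while still flushing means no sleep; above
20%, or below 1% with nothing flushed, lengthens the sleep by 1 ms up
to srv_cleaner_max_lru_time; below 5% with a sleep of at least 50 ms
shortens it by 50 ms. */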
2645 if (free_len < max_free_len / 100 && lru_n_flushed) {
2646
2647 /* Free lists filled less than 1%
2648 and iteration was able to flush, no sleep */
2649 lru_sleep_time->ms = 0;
2650 } else if (free_len > max_free_len / 5
2651 || (free_len < max_free_len / 100 && lru_n_flushed == 0)) {
2652
2653 /* Free lists filled more than 20%
2654 or no pages flushed in previous batch, sleep a bit more */
2655 lru_sleep_time->ms += 1;
2656 if (lru_sleep_time->ms > srv_cleaner_max_lru_time)
2657 lru_sleep_time->ms = srv_cleaner_max_lru_time;
2658 } else if (free_len < max_free_len / 20 && lru_sleep_time->ms >= 50) {
2659 /* Free lists filled less than 5%, sleep a bit less */
2660 lru_sleep_time->ms -= 50;
2661 } else {
2662
2663 /* Free lists filled between 5% and 20%, no change */
2664 }
2665 }
2666
2667 /*********************************************************************//**
2668 Get the desired page cleaner thread sleep time for flush list flushes.
2669 @return desired sleep time */
2670 MY_ATTRIBUTE((warn_unused_result))
2671 static
2672 ulint
2673 page_cleaner_adapt_flush_sleep_time(void)
2674 /*=====================================*/
2675 {
2676 lsn_t age = log_get_lsn() - log_sys->last_checkpoint_lsn;
2677
2678 if (age > log_sys->max_modified_age_sync) {
2679
2680 /* No sleep if in sync preflush zone */
2681 return(0);
2682 }
2683
2684 /* In all other cases flush list factors do not influence the page
2685 cleaner sleep time */
2686 return(srv_cleaner_max_flush_time);
2687 }
2688
2689 /******************************************************************//**
2690 page_cleaner thread tasked with flushing dirty pages from the buffer
2691 pool flush lists. As of now we'll have only one instance of this thread.
2692 @return a dummy parameter */
2693 extern "C" UNIV_INTERN
2694 os_thread_ret_t
2695 DECLARE_THREAD(buf_flush_page_cleaner_thread)(
2696 /*==========================================*/
2697 void* arg MY_ATTRIBUTE((unused)))
2698 /*!< in: a dummy parameter required by
2699 os_thread_create */
2700 {
2701 my_thread_init();
2702 ut_monotonic_time next_loop_time = ut_monotonic_time_ms();
2703 next_loop_time.ms += 1000;
2704 ulint n_flushed = 0;
2705 ulint last_activity = srv_get_activity_count();
2706 ut_monotonic_time last_activity_time = ut_monotonic_time_ms();
2707
2708 ut_ad(!srv_read_only_mode);
2709
2710 #ifdef UNIV_PFS_THREAD
2711 pfs_register_thread(buf_page_cleaner_thread_key);
2712 #endif /* UNIV_PFS_THREAD */
2713
2714 srv_cleaner_tid = os_thread_get_tid();
2715
2716 os_thread_set_priority(srv_cleaner_tid, srv_sched_priority_cleaner);
2717
2718 #ifdef UNIV_DEBUG_THREAD_CREATION
2719 fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n",
2720 os_thread_pf(os_thread_get_curr_id()));
2721 #endif /* UNIV_DEBUG_THREAD_CREATION */
2722
2723 buf_page_cleaner_is_active = TRUE;
2724
2725 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
2726 ulint page_cleaner_sleep_time;
2727 ibool server_active;
2728
2729 srv_current_thread_priority = srv_cleaner_thread_priority;
2730
2731 /* The page_cleaner skips sleep if the server is
2732 idle and there are no pending IOs in the buffer pool
2733 and there is work to do. */
2734 if (srv_check_activity(last_activity)
2735 || buf_get_n_pending_read_ios()
2736 || n_flushed == 0) {
2737 page_cleaner_sleep_if_needed(next_loop_time);
2738 }
2739
2740 page_cleaner_sleep_time =
2741 page_cleaner_adapt_flush_sleep_time();
2742
2743 next_loop_time.ms =
2744 ut_monotonic_time_ms().ms + page_cleaner_sleep_time;
2745
2746 server_active = srv_check_activity(last_activity);
2747 if (server_active ||
2748 ut_monotonic_time_ms().ms - last_activity_time.ms <
2749 1000) {
2750 if (server_active) {
2751
2752 last_activity = srv_get_activity_count();
2753 last_activity_time = ut_monotonic_time_ms();
2754 }
2755
2756 /* Flush pages from flush_list if required */
2757 n_flushed = page_cleaner_flush_pages_if_needed();
2758
2759 } else {
2760 n_flushed = page_cleaner_do_flush_batch(
2761 PCT_IO(100),
2762 LSN_MAX);
2763
2764 if (n_flushed) {
2765 MONITOR_INC_VALUE_CUMULATIVE(
2766 MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
2767 MONITOR_FLUSH_BACKGROUND_COUNT,
2768 MONITOR_FLUSH_BACKGROUND_PAGES,
2769 n_flushed);
2770 }
2771 }
2772 }
2773
2774 ut_ad(srv_shutdown_state > 0);
2775 if (srv_fast_shutdown == 2) {
2776 /* In very fast shutdown we simulate a crash of
2777 buffer pool. We are not required to do any flushing */
2778 goto thread_exit;
2779 }
2780
2781 /* In case of normal and slow shutdown the page_cleaner thread
2782 must wait for all other activity in the server to die down.
2783 Note that we can start flushing the buffer pool as soon as the
2784 server enters shutdown phase but we must stay alive long enough
2785 to ensure that any work done by the master or purge threads is
2786 also flushed.
2787 During shutdown we pass through two stages. In the first stage,
2788 when SRV_SHUTDOWN_CLEANUP is set other threads like the master
2789 and the purge threads may be working as well. We start flushing
2790 the buffer pool but can't be sure that no new pages are being
2791 dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. Because
2792 the LRU manager thread is also flushing at SRV_SHUTDOWN_CLEANUP
2793 but not SRV_SHUTDOWN_FLUSH_PHASE, we only leave the
2794 SRV_SHUTDOWN_CLEANUP loop when the LRU manager quits. */
2795
2796 do {
2797 n_flushed = page_cleaner_do_flush_batch(PCT_IO(100), LSN_MAX);
2798
2799 /* We sleep only if there are no pages to flush */
2800 if (n_flushed == 0) {
2801 os_thread_sleep(100000);
2802 }
2803
2804 os_rmb;
2805 } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP
2806 || buf_lru_manager_is_active);
2807
2808 /* At this point all threads including the master and the purge
2809 thread must have been suspended. */
2810 ut_a(srv_get_active_thread_type() == SRV_NONE);
2811 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
2812
2813 /* We can now make a final sweep on flushing the buffer pool
2814 and exit after we have cleaned the whole buffer pool.
2815 It is important that we wait for any running batch that has
2816 been triggered by us to finish. Otherwise we can end up
2817 considering end of that batch as a finish of our final
2818 sweep and we'll come out of the loop leaving behind dirty pages
2819 in the flush_list */
2820 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
2821 buf_flush_wait_LRU_batch_end();
2822
2823 #ifdef UNIV_DEBUG
2824 os_rmb;
2825 ut_ad(!buf_lru_manager_is_active);
2826 #endif
2827
2828 bool success;
2829
2830 do {
2831
2832 success = buf_flush_list(PCT_IO(100), LSN_MAX, &n_flushed);
2833 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
2834
2835 } while (!success || n_flushed > 0);
2836
2837 /* Some sanity checks */
2838 ut_a(srv_get_active_thread_type() == SRV_NONE);
2839 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
2840 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2841 buf_pool_t* buf_pool = buf_pool_from_array(i);
2842 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
2843 }
2844
2845 /* We have lived our life. Time to die. */
2846
2847 thread_exit:
2848 buf_page_cleaner_is_active = FALSE;
2849
2850 my_thread_end();
2851 /* We count the number of threads in os_thread_exit(). A created
2852 thread should always use that to exit and not use return() to exit. */
2853 os_thread_exit(NULL);
2854
2855 OS_THREAD_DUMMY_RETURN;
2856 }
2857
2858 /******************************************************************//**
2859 lru_manager thread tasked with performing LRU flushes and evictions to refill
2860 the buffer pool free lists. As of now we'll have only one instance of this
2861 thread.
2862 @return a dummy parameter */
2863 extern "C" UNIV_INTERN
2864 os_thread_ret_t
2865 DECLARE_THREAD(buf_flush_lru_manager_thread)(
2866 /*==========================================*/
2867 void* arg MY_ATTRIBUTE((unused)))
2868 /*!< in: a dummy parameter required by
2869 os_thread_create */
2870 {
2871 ut_monotonic_time next_loop_time = ut_monotonic_time_ms();
2872 next_loop_time.ms += 1000;
2873 ut_monotonic_time lru_sleep_time;
2874 lru_sleep_time.ms = srv_cleaner_max_lru_time;
2875 ulint lru_n_flushed = 1;
2876
2877 #ifdef UNIV_PFS_THREAD
2878 pfs_register_thread(buf_lru_manager_thread_key);
2879 #endif /* UNIV_PFS_THREAD */
2880
2881 srv_lru_manager_tid = os_thread_get_tid();
2882
2883 os_thread_set_priority(srv_lru_manager_tid,
2884 srv_sched_priority_cleaner);
2885
2886 #ifdef UNIV_DEBUG_THREAD_CREATION
2887 fprintf(stderr, "InnoDB: lru_manager thread running, id %lu\n",
2888 os_thread_pf(os_thread_get_curr_id()));
2889 #endif /* UNIV_DEBUG_THREAD_CREATION */
2890
2891 buf_lru_manager_is_active = true;
2892 os_wmb;
2893
2894 /* On server shutdown, the LRU manager thread runs through cleanup
2895 phase to provide free pages for the master and purge threads. */
2896 while (srv_shutdown_state == SRV_SHUTDOWN_NONE
2897 || srv_shutdown_state == SRV_SHUTDOWN_CLEANUP) {
2898
2899 srv_current_thread_priority = srv_cleaner_thread_priority;
2900
2901 page_cleaner_sleep_if_needed(next_loop_time);
2902
2903 page_cleaner_adapt_lru_sleep_time(&lru_sleep_time, lru_n_flushed);
2904
2905 next_loop_time.ms =
2906 ut_monotonic_time_ms().ms + lru_sleep_time.ms;
2907
2908 lru_n_flushed = buf_flush_LRU_tail();
2909 }
2910
2911 buf_lru_manager_is_active = false;
2912 os_wmb;
2913
2914 /* We count the number of threads in os_thread_exit(). A created
2915 thread should always use that to exit and not use return() to exit. */
2916 os_thread_exit(NULL);
2917
2918 OS_THREAD_DUMMY_RETURN;
2919 }
2920
2921 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2922
2923 /** Functor to validate the flush list. */
2924 struct Check {
2925 void operator()(const buf_page_t* elem)
2926 {
2927 ut_a(elem->in_flush_list);
2928 }
2929 };
2930
2931 /******************************************************************//**
2932 Validates the flush list.
2933 @return TRUE if ok */
2934 static
2935 ibool
2936 buf_flush_validate_low(
2937 /*===================*/
2938 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
2939 {
2940 buf_page_t* bpage;
2941 const ib_rbt_node_t* rnode = NULL;
2942
2943 ut_ad(buf_flush_list_mutex_own(buf_pool));
2944
2945 UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, Check());
2946
2947 bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
2948
2949 /* If we are in recovery mode i.e.: flush_rbt != NULL
2950 then each block in the flush_list must also be present
2951 in the flush_rbt. */
2952 if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
2953 rnode = rbt_first(buf_pool->flush_rbt);
2954 }
2955
2956 while (bpage != NULL) {
2957 const lsn_t om = bpage->oldest_modification;
2958
2959 ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
2960
2961 ut_ad(bpage->in_flush_list);
2962
2963 /* A page in buf_pool->flush_list can be in
2964 BUF_BLOCK_REMOVE_HASH state. This happens when a page
2965 is in the middle of being relocated. In that case the
2966 original descriptor can have this state and still be
2967 in the flush list waiting to acquire the
2968 buf_pool->flush_list_mutex to complete the relocation. */
2969 ut_a(buf_page_in_file(bpage)
2970 || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
2971 ut_a(om > 0);
2972
2973 if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
2974 buf_page_t** prpage;
2975
2976 ut_a(rnode);
2977 prpage = rbt_value(buf_page_t*, rnode);
2978
2979 ut_a(*prpage);
2980 ut_a(*prpage == bpage);
2981 rnode = rbt_next(buf_pool->flush_rbt, rnode);
2982 }
2983
2984 bpage = UT_LIST_GET_NEXT(list, bpage);
2985
2986 ut_a(!bpage || om >= bpage->oldest_modification);
2987 }
2988
2989 /* By this time we must have exhausted the traversal of
2990 flush_rbt (if active) as well. */
2991 ut_a(rnode == NULL);
2992
2993 return(TRUE);
2994 }
2995
2996 /******************************************************************//**
2997 Validates the flush list.
2998 @return TRUE if ok */
2999 UNIV_INTERN
3000 ibool
3001 buf_flush_validate(
3002 /*===============*/
3003 buf_pool_t* buf_pool) /*!< buffer pool instance */
3004 {
3005 ibool ret;
3006
3007 buf_flush_list_mutex_enter(buf_pool);
3008
3009 ret = buf_flush_validate_low(buf_pool);
3010
3011 buf_flush_list_mutex_exit(buf_pool);
3012
3013 return(ret);
3014 }
3015 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3016 #endif /* !UNIV_HOTBACKUP */
3017
3018 #ifdef UNIV_DEBUG
3019 /******************************************************************//**
3020 Check if there are any dirty pages that belong to a space id in the flush
3021 list in a particular buffer pool.
3022 @return number of dirty pages present in a single buffer pool */
3023 UNIV_INTERN
3024 ulint
3025 buf_pool_get_dirty_pages_count(
3026 /*===========================*/
3027 buf_pool_t* buf_pool, /*!< in: buffer pool */
3028 ulint id) /*!< in: space id to check */
3029
3030 {
3031 ulint count = 0;
3032
3033 buf_flush_list_mutex_enter(buf_pool);
3034
3035 buf_page_t* bpage;
3036
3037 for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3038 bpage != 0;
3039 bpage = UT_LIST_GET_NEXT(list, bpage)) {
3040
3041 ut_ad(buf_page_in_file(bpage)
3042 || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
3043 ut_ad(bpage->in_flush_list);
3044 ut_ad(bpage->oldest_modification > 0);
3045
3046 if (bpage->space == id) {
3047 ++count;
3048 }
3049 }
3050
3051 buf_flush_list_mutex_exit(buf_pool);
3052
3053 return(count);
3054 }
3055
3056 /******************************************************************//**
3057 Check if there are any dirty pages that belong to a space id in the flush list.
3058 @return number of dirty pages present in all the buffer pools */
3059 UNIV_INTERN
3060 ulint
3061 buf_flush_get_dirty_pages_count(
3062 /*============================*/
3063 ulint id) /*!< in: space id to check */
3064
3065 {
3066 ulint count = 0;
3067
3068 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3069 buf_pool_t* buf_pool;
3070
3071 buf_pool = buf_pool_from_array(i);
3072
3073 count += buf_pool_get_dirty_pages_count(buf_pool, id);
3074 }
3075
3076 return(count);
3077 }
3078 #endif /* UNIV_DEBUG */
3079