1 /*****************************************************************************
2
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file buf/buf0flu.cc
29 The database buffer buf_pool flush algorithm
30
31 Created 11/11/1995 Heikki Tuuri
32 *******************************************************/
33
34 #include "ha_prototypes.h"
35 #include <mysql/service_thd_wait.h>
36 #include <my_dbug.h>
37
38 #include "buf0flu.h"
39
40 #ifdef UNIV_NONINL
41 #include "buf0flu.ic"
42 #endif
43
44 #include "buf0buf.h"
45 #include "buf0checksum.h"
46 #include "srv0start.h"
47 #include "srv0srv.h"
48 #include "page0zip.h"
49 #ifndef UNIV_HOTBACKUP
50 #include "ut0byte.h"
51 #include "page0page.h"
52 #include "fil0fil.h"
53 #include "buf0lru.h"
54 #include "buf0rea.h"
55 #include "ibuf0ibuf.h"
56 #include "log0log.h"
57 #include "os0file.h"
58 #include "trx0sys.h"
59 #include "srv0mon.h"
60 #include "fsp0sysspace.h"
61 #include "ut0stage.h"
62
63 #ifdef UNIV_LINUX
64 /* include defs for CPU time priority settings */
65 #include <unistd.h>
66 #include <sys/syscall.h>
67 #include <sys/time.h>
68 #include <sys/resource.h>
69 static const int buf_flush_page_cleaner_priority = -20;
70 #endif /* UNIV_LINUX */
71
72 /** Sleep time in microseconds for loop waiting for the oldest
73 modification lsn */
74 static const ulint buf_flush_wait_flushed_sleep_time = 10000;
75
76 /** Number of pages flushed through non-flush_list flushes. */
77 static ulint buf_lru_flush_page_count = 0;
78
79 /** Flag indicating whether the page_cleaner is in an active state. This flag
80 is set to true by the page_cleaner thread when it is spawned and is set
81 back to false at shutdown by the page_cleaner as well. Therefore there is no
82 need to protect it with a mutex. It is only ever read by the thread
83 performing the shutdown. */
84 bool buf_page_cleaner_is_active = false;
85
86 /** Factor for scan length to determine n_pages for intended oldest LSN
87 progress */
88 static ulint buf_flush_lsn_scan_factor = 3;
89
90 /** Average redo generation rate */
91 static lsn_t lsn_avg_rate = 0;
92
93 /** Target oldest LSN for the requested flush_sync */
94 static lsn_t buf_flush_sync_lsn = 0;
95
96 #ifdef UNIV_PFS_THREAD
97 mysql_pfs_key_t page_cleaner_thread_key;
98 #endif /* UNIV_PFS_THREAD */
99
100 /** Event to synchronise with the flushing. */
101 os_event_t buf_flush_event;
102
103 /** State for page cleaner array slot */
104 enum page_cleaner_state_t {
105 /** No flushing requested yet.
106 Moved from FINISHED by the coordinator. */
107 PAGE_CLEANER_STATE_NONE = 0,
108 /** Requested but not started flushing.
109 Moved from NONE by the coordinator. */
110 PAGE_CLEANER_STATE_REQUESTED,
111 /** Flushing is ongoing.
112 Moved from REQUESTED by the worker. */
113 PAGE_CLEANER_STATE_FLUSHING,
114 /** Flushing was finished.
115 Moved from FLUSHING by the worker. */
116 PAGE_CLEANER_STATE_FINISHED
117 };
118
119 /** Page cleaner request state for each buffer pool instance */
120 struct page_cleaner_slot_t {
121 page_cleaner_state_t state; /*!< state of the request,
122 protected by page_cleaner_t::mutex.
123 Once the worker thread has taken the slot
124 and set it to PAGE_CLEANER_STATE_FLUSHING,
125 n_flushed_lru and n_flushed_list can be
126 updated only by that worker thread */
127 /* This value is set during state==PAGE_CLEANER_STATE_NONE */
128 ulint n_pages_requested;
129 /*!< number of requested pages
130 for the slot */
131 /* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
132 and committed with state==PAGE_CLEANER_STATE_FINISHED.
133 The consistency is protected by the 'state' */
134 ulint n_flushed_lru;
135 /*!< number of flushed pages
136 by LRU scan flushing */
137 ulint n_flushed_list;
138 /*!< number of flushed pages
139 by flush_list flushing */
140 bool succeeded_list;
141 /*!< true if flush_list flushing
142 succeeded. */
143 uint64_t flush_lru_time;
144 /*!< elapsed time for LRU flushing */
145 uint64_t flush_list_time;
146 /*!< elapsed time for flush_list
147 flushing */
148 ulint flush_lru_pass;
149 /*!< number of LRU flushing passes */
150 ulint flush_list_pass;
151 /*!< number of flush_list flushing
152 passes */
153 };
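/* A sketch of one request cycle for a slot, based only on the transitions
and fields documented above (illustrative, not normative):
  coordinator: NONE      -> REQUESTED  (n_pages_requested has been set)
  worker:      REQUESTED -> FLUSHING   (does the LRU/flush_list flushing)
  worker:      FLUSHING  -> FINISHED   (commits n_flushed_lru, n_flushed_list)
  coordinator: FINISHED  -> NONE       (collects the per-slot results) */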
154
155 /** Page cleaner structure common for all threads */
156 struct page_cleaner_t {
157 ib_mutex_t mutex; /*!< mutex to protect whole of
158 page_cleaner_t struct and
159 page_cleaner_slot_t slots. */
160 os_event_t is_requested; /*!< event to activate worker
161 threads. */
162 os_event_t is_finished; /*!< event to signal that all
163 slots were finished. */
164 volatile ulint n_workers; /*!< number of worker threads
165 in existence */
166 bool requested; /*!< true if flushing of pages
167 has been requested */
168 lsn_t lsn_limit; /*!< upper limit of LSN to be
169 flushed */
170 ulint n_slots; /*!< total number of slots */
171 ulint n_slots_requested;
172 /*!< number of slots
173 in the state
174 PAGE_CLEANER_STATE_REQUESTED */
175 ulint n_slots_flushing;
176 /*!< number of slots
177 in the state
178 PAGE_CLEANER_STATE_FLUSHING */
179 ulint n_slots_finished;
180 /*!< number of slots
181 in the state
182 PAGE_CLEANER_STATE_FINISHED */
183 uint64_t flush_time; /*!< elapsed time to flush
184 requests for all slots */
185 ulint flush_pass; /*!< number of passes needed to finish
186 flushing the requests for all slots */
187 page_cleaner_slot_t* slots; /*!< pointer to the slots */
188 bool is_running; /*!< false if a shutdown
189 has been initiated */
190
191 #ifdef UNIV_DEBUG
192 ulint n_disabled_debug;
193 /*!< how many page cleaner threads
194 have been disabled */
195 #endif /* UNIV_DEBUG */
196 };
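/* A rough sketch of how the coordinator and the workers are expected to use
this structure, inferred from the fields above (an illustration only): the
coordinator, holding 'mutex', marks the slots PAGE_CLEANER_STATE_REQUESTED
and sets the 'is_requested' event; each worker claims a slot, flushes it and
marks it PAGE_CLEANER_STATE_FINISHED; once n_slots_finished reaches n_slots,
the 'is_finished' event is set and the coordinator gathers the per-slot
counters. */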
197
198 static page_cleaner_t* page_cleaner = NULL;
199
200 #ifdef UNIV_DEBUG
201 my_bool innodb_page_cleaner_disabled_debug;
202 #endif /* UNIV_DEBUG */
203
204 /** If LRU list of a buf_pool is less than this size then LRU eviction
205 should not happen. This is because when we do LRU flushing we also put
206 the blocks on free list. If LRU list is very small then we can end up
207 in thrashing. */
208 #define BUF_LRU_MIN_LEN 256
209
210 /* @} */
211
212 /******************************************************************//**
213 Increases the flush_list size in bytes by the physical page size. */
214 static inline
215 void
216 incr_flush_list_size_in_bytes(
217 /*==========================*/
218 buf_block_t* block, /*!< in: control block */
219 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
220 {
221 ut_ad(buf_flush_list_mutex_own(buf_pool));
222
223 buf_pool->stat.flush_list_bytes += block->page.size.physical();
224
225 ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
226 }
227
228 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
229 /******************************************************************//**
230 Validates the flush list.
231 @return TRUE if ok */
232 static
233 ibool
234 buf_flush_validate_low(
235 /*===================*/
236 buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
237
238 /******************************************************************//**
239 Validates the flush list some of the time.
240 @return TRUE if ok or the check was skipped */
241 static
242 ibool
243 buf_flush_validate_skip(
244 /*====================*/
245 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
246 {
247 /** Try buf_flush_validate_low() every this many times */
248 # define BUF_FLUSH_VALIDATE_SKIP 23
249
250 /** The buf_flush_validate_low() call skip counter.
251 Use a signed type because of the race condition below. */
252 static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
253
254 /* There is a race condition below, but it does not matter,
255 because this call is only for heuristic purposes. We want to
256 reduce the call frequency of the costly buf_flush_validate_low()
257 check in debug builds. */
258 if (--buf_flush_validate_count > 0) {
259 return(TRUE);
260 }
261
262 buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
263 return(buf_flush_validate_low(buf_pool));
264 }
265 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
266
267 /******************************************************************//**
268 Inserts a block into the flush_rbt and returns a pointer to its
269 predecessor or NULL if no predecessor. The ordering is maintained
270 on the basis of the <oldest_modification, space, offset> key.
271 @return pointer to the predecessor or NULL if no predecessor. */
272 static
273 buf_page_t*
274 buf_flush_insert_in_flush_rbt(
275 /*==========================*/
276 buf_page_t* bpage) /*!< in: bpage to be inserted. */
277 {
278 const ib_rbt_node_t* c_node;
279 const ib_rbt_node_t* p_node;
280 buf_page_t* prev = NULL;
281 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
282
283 ut_ad(buf_flush_list_mutex_own(buf_pool));
284
285 /* Insert this buffer into the rbt. */
286 c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
287 ut_a(c_node != NULL);
288
289 /* Get the predecessor. */
290 p_node = rbt_prev(buf_pool->flush_rbt, c_node);
291
292 if (p_node != NULL) {
293 buf_page_t** value;
294 value = rbt_value(buf_page_t*, p_node);
295 prev = *value;
296 ut_a(prev != NULL);
297 }
298
299 return(prev);
300 }
301
302 /*********************************************************//**
303 Delete a bpage from the flush_rbt. */
304 static
305 void
306 buf_flush_delete_from_flush_rbt(
307 /*============================*/
308 buf_page_t* bpage) /*!< in: bpage to be removed. */
309 {
310 #ifdef UNIV_DEBUG
311 ibool ret = FALSE;
312 #endif /* UNIV_DEBUG */
313 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
314
315 ut_ad(buf_flush_list_mutex_own(buf_pool));
316
317 #ifdef UNIV_DEBUG
318 ret =
319 #endif /* UNIV_DEBUG */
320 rbt_delete(buf_pool->flush_rbt, &bpage);
321
322 ut_ad(ret);
323 }
324
325 /*****************************************************************//**
326 Compare two modified blocks in the buffer pool. The key for comparison
327 is:
328 key = <oldest_modification, space, offset>
329 This comparison is used to maintain ordering of blocks in the
330 buf_pool->flush_rbt.
331 Note that for the purpose of flush_rbt, we only need to order blocks
332 on the oldest_modification. The other two fields are used to uniquely
333 identify the blocks.
334 @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
335 static
336 int
337 buf_flush_block_cmp(
338 /*================*/
339 const void* p1, /*!< in: block1 */
340 const void* p2) /*!< in: block2 */
341 {
342 int ret;
343 const buf_page_t* b1 = *(const buf_page_t**) p1;
344 const buf_page_t* b2 = *(const buf_page_t**) p2;
345
346 ut_ad(b1 != NULL);
347 ut_ad(b2 != NULL);
348
349 #ifdef UNIV_DEBUG
350 buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
351 #endif /* UNIV_DEBUG */
352
353 ut_ad(buf_flush_list_mutex_own(buf_pool));
354
355 ut_ad(b1->in_flush_list);
356 ut_ad(b2->in_flush_list);
357
358 if (b2->oldest_modification > b1->oldest_modification) {
359 return(1);
360 } else if (b2->oldest_modification < b1->oldest_modification) {
361 return(-1);
362 }
363
364 /* If oldest_modification is the same then order on the space id. */
365 ret = (int)(b2->id.space() - b1->id.space());
366
367 /* Or else decide ordering on the page number. */
368 return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no()));
369 }
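/* An example of the key described above (illustrative only): two dirty pages
with oldest_modification 100 and 200 are ordered by the LSN alone; if both
have oldest_modification == 100, the space id decides, and if the space ids
are also equal the page number decides, which makes the
<oldest_modification, space, offset> key unique for distinct pages. */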
370
371 /********************************************************************//**
372 Initialize the red-black tree to speed up insertions into the flush_list
373 during recovery process. Should be called at the start of recovery
374 process before any page has been read/written. */
375 void
376 buf_flush_init_flush_rbt(void)
377 /*==========================*/
378 {
379 ulint i;
380
381 for (i = 0; i < srv_buf_pool_instances; i++) {
382 buf_pool_t* buf_pool;
383
384 buf_pool = buf_pool_from_array(i);
385
386 buf_flush_list_mutex_enter(buf_pool);
387
388 ut_ad(buf_pool->flush_rbt == NULL);
389
390 /* Create red black tree for speedy insertions in flush list. */
391 buf_pool->flush_rbt = rbt_create(
392 sizeof(buf_page_t*), buf_flush_block_cmp);
393
394 buf_flush_list_mutex_exit(buf_pool);
395 }
396 }
397
398 /********************************************************************//**
399 Frees up the red-black tree. */
400 void
401 buf_flush_free_flush_rbt(void)
402 /*==========================*/
403 {
404 ulint i;
405
406 for (i = 0; i < srv_buf_pool_instances; i++) {
407 buf_pool_t* buf_pool;
408
409 buf_pool = buf_pool_from_array(i);
410
411 buf_flush_list_mutex_enter(buf_pool);
412
413 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
414 ut_a(buf_flush_validate_low(buf_pool));
415 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
416
417 rbt_free(buf_pool->flush_rbt);
418 buf_pool->flush_rbt = NULL;
419
420 buf_flush_list_mutex_exit(buf_pool);
421 }
422 }
423
424 /********************************************************************//**
425 Inserts a modified block into the flush list. */
426 void
427 buf_flush_insert_into_flush_list(
428 /*=============================*/
429 buf_pool_t* buf_pool, /*!< buffer pool instance */
430 buf_block_t* block, /*!< in/out: block which is modified */
431 lsn_t lsn) /*!< in: oldest modification */
432 {
433 ut_ad(!buf_pool_mutex_own(buf_pool));
434 ut_ad(log_flush_order_mutex_own());
435 ut_ad(buf_page_mutex_own(block));
436
437 buf_flush_list_mutex_enter(buf_pool);
438
439 ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
440 || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
441 <= lsn));
442
443 /* If we are in the recovery then we need to update the flush
444 red-black tree as well. */
445 if (buf_pool->flush_rbt != NULL) {
446 buf_flush_list_mutex_exit(buf_pool);
447 buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
448 return;
449 }
450
451 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
452 ut_ad(!block->page.in_flush_list);
453
454 ut_d(block->page.in_flush_list = TRUE);
455 block->page.oldest_modification = lsn;
456
457 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
458
459 incr_flush_list_size_in_bytes(block, buf_pool);
460
461 #ifdef UNIV_DEBUG_VALGRIND
462 void* p;
463
464 if (block->page.size.is_compressed()) {
465 p = block->page.zip.data;
466 } else {
467 p = block->frame;
468 }
469
470 UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
471 #endif /* UNIV_DEBUG_VALGRIND */
472
473 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
474 ut_a(buf_flush_validate_skip(buf_pool));
475 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
476
477 buf_flush_list_mutex_exit(buf_pool);
478 }
479
480 /********************************************************************//**
481 Inserts a modified block into the flush list in the right sorted position.
482 This function is used by recovery, because there the modifications do not
483 necessarily come in the order of lsn's. */
484 void
485 buf_flush_insert_sorted_into_flush_list(
486 /*====================================*/
487 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
488 buf_block_t* block, /*!< in/out: block which is modified */
489 lsn_t lsn) /*!< in: oldest modification */
490 {
491 buf_page_t* prev_b;
492 buf_page_t* b;
493
494 ut_ad(!buf_pool_mutex_own(buf_pool));
495 ut_ad(log_flush_order_mutex_own());
496 ut_ad(buf_page_mutex_own(block));
497 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
498
499 buf_flush_list_mutex_enter(buf_pool);
500
501 /* The field in_LRU_list is protected by buf_pool->mutex, which
502 we are not holding. However, while a block is in the flush
503 list, it is dirty and cannot be discarded, not from the
504 page_hash or from the LRU list. At most, the uncompressed
505 page frame of a compressed block may be discarded or created
506 (copying the block->page to or from a buf_page_t that is
507 dynamically allocated from buf_buddy_alloc()). Because those
508 transitions hold block->mutex and the flush list mutex (via
509 buf_flush_relocate_on_flush_list()), there is no possibility
510 of a race condition in the assertions below. */
511 ut_ad(block->page.in_LRU_list);
512 ut_ad(block->page.in_page_hash);
513 /* buf_buddy_block_register() will take a block in the
514 BUF_BLOCK_MEMORY state, not a file page. */
515 ut_ad(!block->page.in_zip_hash);
516
517 ut_ad(!block->page.in_flush_list);
518 ut_d(block->page.in_flush_list = TRUE);
519 block->page.oldest_modification = lsn;
520
521 #ifdef UNIV_DEBUG_VALGRIND
522 void* p;
523
524 if (block->page.size.is_compressed()) {
525 p = block->page.zip.data;
526 } else {
527 p = block->frame;
528 }
529
530 UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
531 #endif /* UNIV_DEBUG_VALGRIND */
532
533 prev_b = NULL;
534
535 /* For the most part when this function is called the flush_rbt
536 should not be NULL. In a very rare boundary case it is possible
537 that the flush_rbt has already been freed by the recovery thread
538 before the last page was hooked up in the flush_list by the
539 io-handler thread. In that case we'll just do a simple
540 linear search in the else block. */
541 if (buf_pool->flush_rbt != NULL) {
542
543 prev_b = buf_flush_insert_in_flush_rbt(&block->page);
544
545 } else {
546
547 b = UT_LIST_GET_FIRST(buf_pool->flush_list);
548
549 while (b != NULL && b->oldest_modification
550 > block->page.oldest_modification) {
551
552 ut_ad(b->in_flush_list);
553 prev_b = b;
554 b = UT_LIST_GET_NEXT(list, b);
555 }
556 }
557
558 if (prev_b == NULL) {
559 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
560 } else {
561 UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page);
562 }
563
564 if (buf_pool->oldest_hp.get() != NULL) {
565 /* clear oldest_hp */
566 buf_pool->oldest_hp.set(NULL);
567 }
568
569 incr_flush_list_size_in_bytes(block, buf_pool);
570
571 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
572 ut_a(buf_flush_validate_low(buf_pool));
573 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
574
575 buf_flush_list_mutex_exit(buf_pool);
576 }
577
578 /********************************************************************//**
579 Returns TRUE if the file page block is immediately suitable for replacement,
580 i.e., the transition FILE_PAGE => NOT_USED is allowed.
581 @return TRUE if can replace immediately */
582 ibool
583 buf_flush_ready_for_replace(
584 /*========================*/
585 buf_page_t* bpage) /*!< in: buffer control block, must be
586 buf_page_in_file(bpage) and in the LRU list */
587 {
588 #ifdef UNIV_DEBUG
589 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
590 ut_ad(buf_pool_mutex_own(buf_pool));
591 #endif /* UNIV_DEBUG */
592 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
593 ut_ad(bpage->in_LRU_list);
594
595 if (buf_page_in_file(bpage)) {
596
597 return(bpage->oldest_modification == 0
598 && bpage->buf_fix_count == 0
599 && buf_page_get_io_fix(bpage) == BUF_IO_NONE);
600 }
601
602 ib::fatal() << "Buffer block " << bpage << " state " << bpage->state
603 << " in the LRU list!";
604
605 return(FALSE);
606 }
607
608 /********************************************************************//**
609 Returns true if the block is modified and ready for flushing.
610 @return true if can flush immediately */
611 bool
612 buf_flush_ready_for_flush(
613 /*======================*/
614 buf_page_t* bpage, /*!< in: buffer control block, must be
615 buf_page_in_file(bpage) */
616 buf_flush_t flush_type)/*!< in: type of flush */
617 {
618 #ifdef UNIV_DEBUG
619 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
620 ut_ad(buf_pool_mutex_own(buf_pool));
621 #endif /* UNIV_DEBUG */
622
623 ut_a(buf_page_in_file(bpage));
624 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
625 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
626
627 if (bpage->oldest_modification == 0
628 || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
629 return(false);
630 }
631
632 ut_ad(bpage->in_flush_list);
633
634 switch (flush_type) {
635 case BUF_FLUSH_LIST:
636 case BUF_FLUSH_LRU:
637 case BUF_FLUSH_SINGLE_PAGE:
638 return(true);
639
640 case BUF_FLUSH_N_TYPES:
641 break;
642 }
643
644 ut_error;
645 return(false);
646 }
647
648 /********************************************************************//**
649 Remove a block from the flush list of modified blocks. */
650 void
651 buf_flush_remove(
652 /*=============*/
653 buf_page_t* bpage) /*!< in: pointer to the block in question */
654 {
655 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
656
657 ut_ad(buf_pool_mutex_own(buf_pool));
658 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
659 ut_ad(bpage->in_flush_list);
660
661 buf_flush_list_mutex_enter(buf_pool);
662
663 /* Important that we adjust the hazard pointer before removing
664 the bpage from flush list. */
665 buf_pool->flush_hp.adjust(bpage);
666 buf_pool->oldest_hp.adjust(bpage);
667
668 switch (buf_page_get_state(bpage)) {
669 case BUF_BLOCK_POOL_WATCH:
670 case BUF_BLOCK_ZIP_PAGE:
671 /* Clean compressed pages should not be on the flush list */
672 case BUF_BLOCK_NOT_USED:
673 case BUF_BLOCK_READY_FOR_USE:
674 case BUF_BLOCK_MEMORY:
675 case BUF_BLOCK_REMOVE_HASH:
676 ut_error;
677 return;
678 case BUF_BLOCK_ZIP_DIRTY:
679 buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
680 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
681 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
682 buf_LRU_insert_zip_clean(bpage);
683 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
684 break;
685 case BUF_BLOCK_FILE_PAGE:
686 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
687 break;
688 }
689
690 /* If the flush_rbt is active then delete from there as well. */
691 if (buf_pool->flush_rbt != NULL) {
692 buf_flush_delete_from_flush_rbt(bpage);
693 }
694
695 /* Must be done after we have removed it from the flush_rbt
696 because we assert on in_flush_list in comparison function. */
697 ut_d(bpage->in_flush_list = FALSE);
698
699 buf_pool->stat.flush_list_bytes -= bpage->size.physical();
700
701 bpage->oldest_modification = 0;
702
703 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
704 ut_a(buf_flush_validate_skip(buf_pool));
705 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
706
707 /* If there is an observer that wants to know when the asynchronous
708 flushing is done then notify it. */
709 if (bpage->flush_observer != NULL) {
710 bpage->flush_observer->notify_remove(buf_pool, bpage);
711
712 bpage->flush_observer = NULL;
713 }
714
715 buf_flush_list_mutex_exit(buf_pool);
716 }
717
718 /*******************************************************************//**
719 Relocates a buffer control block on the flush_list.
720 Note that it is assumed that the contents of bpage have already been
721 copied to dpage.
722 IMPORTANT: When this function is called bpage and dpage are not
723 exact copies of each other. For example, they both will have different
724 ::state. Also the ::list pointers in dpage may be stale. We need to
725 use the current list node (bpage) to do the list manipulation because
726 the list pointers could have changed between the time that we copied
727 the contents of bpage to the dpage and the flush list manipulation
728 below. */
729 void
730 buf_flush_relocate_on_flush_list(
731 /*=============================*/
732 buf_page_t* bpage, /*!< in/out: control block being moved */
733 buf_page_t* dpage) /*!< in/out: destination block */
734 {
735 buf_page_t* prev;
736 buf_page_t* prev_b = NULL;
737 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
738
739 ut_ad(buf_pool_mutex_own(buf_pool));
740 /* Must reside in the same buffer pool. */
741 ut_ad(buf_pool == buf_pool_from_bpage(dpage));
742
743 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
744
745 buf_flush_list_mutex_enter(buf_pool);
746
747 /* FIXME: At this point we have both buf_pool and flush_list
748 mutexes. Theoretically removal of a block from flush list is
749 only covered by flush_list mutex but currently we do
750 have buf_pool mutex in buf_flush_remove() therefore this block
751 is guaranteed to be in the flush list. We need to check if
752 this will work without the assumption of block removing code
753 having the buf_pool mutex. */
754 ut_ad(bpage->in_flush_list);
755 ut_ad(dpage->in_flush_list);
756
757 /* If recovery is active we must swap the control blocks in
758 the flush_rbt as well. */
759 if (buf_pool->flush_rbt != NULL) {
760 buf_flush_delete_from_flush_rbt(bpage);
761 prev_b = buf_flush_insert_in_flush_rbt(dpage);
762 }
763
764 /* Important that we adjust the hazard pointer before removing
765 the bpage from the flush list. */
766 buf_pool->flush_hp.move(bpage, dpage);
767 buf_pool->oldest_hp.move(bpage, dpage);
768
769 /* Must be done after we have removed it from the flush_rbt
770 because we assert on in_flush_list in comparison function. */
771 ut_d(bpage->in_flush_list = FALSE);
772
773 prev = UT_LIST_GET_PREV(list, bpage);
774 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
775
776 if (prev) {
777 ut_ad(prev->in_flush_list);
778 UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage);
779 } else {
780 UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage);
781 }
782
783 /* Just an extra check. Previous in flush_list
784 should be the same control block as in flush_rbt. */
785 ut_a(buf_pool->flush_rbt == NULL || prev_b == prev);
786
787 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
788 ut_a(buf_flush_validate_low(buf_pool));
789 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
790
791 buf_flush_list_mutex_exit(buf_pool);
792 }
793
794 /********************************************************************//**
795 Updates the flush system data structures when a write is completed. */
796 void
797 buf_flush_write_complete(
798 /*=====================*/
799 buf_page_t* bpage) /*!< in: pointer to the block in question */
800 {
801 buf_flush_t flush_type;
802 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
803
804 ut_ad(bpage);
805
806 buf_flush_remove(bpage);
807
808 flush_type = buf_page_get_flush_type(bpage);
809 buf_pool->n_flush[flush_type]--;
810
811 if (buf_pool->n_flush[flush_type] == 0
812 && buf_pool->init_flush[flush_type] == FALSE) {
813
814 /* The running flush batch has ended */
815
816 os_event_set(buf_pool->no_flush[flush_type]);
817 }
818
819 buf_dblwr_update(bpage, flush_type);
820 }
821 #endif /* !UNIV_HOTBACKUP */
822
823 /** Calculate the checksum of a page from a compressed table and update
824 the page.
825 @param[in,out] page page to update
826 @param[in] size compressed page size
827 @param[in] lsn LSN to stamp on the page */
828 void
829 buf_flush_update_zip_checksum(
830 buf_frame_t* page,
831 ulint size,
832 lsn_t lsn)
833 {
834 ut_a(size > 0);
835
836 const uint32_t checksum = page_zip_calc_checksum(
837 page, size,
838 static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
839
840 mach_write_to_8(page + FIL_PAGE_LSN, lsn);
841 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
842 }
843
844 /** Initialize a page for writing to the tablespace.
845 @param[in] block buffer block; NULL if bypassing the buffer pool
846 @param[in,out] page page frame
847 @param[in,out] page_zip_ compressed page, or NULL if uncompressed
848 @param[in] newest_lsn newest modification LSN to the page
849 @param[in] skip_checksum whether to disable the page checksum */
850 void
851 buf_flush_init_for_writing(
852 const buf_block_t* block,
853 byte* page,
854 void* page_zip_,
855 lsn_t newest_lsn,
856 bool skip_checksum)
857 {
858 ib_uint32_t checksum = BUF_NO_CHECKSUM_MAGIC;
859
860 ut_ad(block == NULL || block->frame == page);
861 ut_ad(block == NULL || page_zip_ == NULL
862 || &block->page.zip == page_zip_);
863 ut_ad(page);
864
865 if (page_zip_) {
866 page_zip_des_t* page_zip;
867 ulint size;
868
869 page_zip = static_cast<page_zip_des_t*>(page_zip_);
870 size = page_zip_get_size(page_zip);
871
872 ut_ad(size);
873 ut_ad(ut_is_2pow(size));
874 ut_ad(size <= UNIV_ZIP_SIZE_MAX);
875
876 switch (fil_page_get_type(page)) {
877 case FIL_PAGE_TYPE_ALLOCATED:
878 case FIL_PAGE_INODE:
879 case FIL_PAGE_IBUF_BITMAP:
880 case FIL_PAGE_TYPE_FSP_HDR:
881 case FIL_PAGE_TYPE_XDES:
882 /* These are essentially uncompressed pages. */
883 memcpy(page_zip->data, page, size);
884 /* fall through */
885 case FIL_PAGE_TYPE_ZBLOB:
886 case FIL_PAGE_TYPE_ZBLOB2:
887 case FIL_PAGE_INDEX:
888 case FIL_PAGE_RTREE:
889
890 buf_flush_update_zip_checksum(
891 page_zip->data, size, newest_lsn);
892
893 return;
894 }
895
896 ib::error() << "The compressed page to be written"
897 " seems corrupt:";
898 ut_print_buf(stderr, page, size);
899 fputs("\nInnoDB: Possibly older version of the page:", stderr);
900 ut_print_buf(stderr, page_zip->data, size);
901 putc('\n', stderr);
902 ut_error;
903 }
904
905 /* Write the newest modification lsn to the page header and trailer */
906 mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
907
908 mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
909 newest_lsn);
910
911 if (skip_checksum) {
912 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
913 } else {
914 if (block != NULL && UNIV_PAGE_SIZE == 16384) {
915 /* The page type could be garbage in old files
916 created before MySQL 5.5. Such files always
917 had a page size of 16 kilobytes. */
918 ulint page_type = fil_page_get_type(page);
919 ulint reset_type = page_type;
920
921 switch (block->page.id.page_no() % 16384) {
922 case 0:
923 reset_type = block->page.id.page_no() == 0
924 ? FIL_PAGE_TYPE_FSP_HDR
925 : FIL_PAGE_TYPE_XDES;
926 break;
927 case 1:
928 reset_type = FIL_PAGE_IBUF_BITMAP;
929 break;
930 default:
931 switch (page_type) {
932 case FIL_PAGE_INDEX:
933 case FIL_PAGE_RTREE:
934 case FIL_PAGE_UNDO_LOG:
935 case FIL_PAGE_INODE:
936 case FIL_PAGE_IBUF_FREE_LIST:
937 case FIL_PAGE_TYPE_ALLOCATED:
938 case FIL_PAGE_TYPE_SYS:
939 case FIL_PAGE_TYPE_TRX_SYS:
940 case FIL_PAGE_TYPE_BLOB:
941 case FIL_PAGE_TYPE_ZBLOB:
942 case FIL_PAGE_TYPE_ZBLOB2:
943 break;
944 case FIL_PAGE_TYPE_FSP_HDR:
945 case FIL_PAGE_TYPE_XDES:
946 case FIL_PAGE_IBUF_BITMAP:
947 /* These pages should have
948 predetermined page numbers
949 (see above). */
950 default:
951 reset_type = FIL_PAGE_TYPE_UNKNOWN;
952 break;
953 }
954 }
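/* As a worked example of the mapping above (with the 16k page size
checked in the enclosing condition): page 0 must be
FIL_PAGE_TYPE_FSP_HDR, page 16384 must be FIL_PAGE_TYPE_XDES, and
pages 1 and 16385 must be FIL_PAGE_IBUF_BITMAP; any other page
carrying a type outside the expected set is reset to
FIL_PAGE_TYPE_UNKNOWN below. */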
955
956 if (UNIV_UNLIKELY(page_type != reset_type)) {
957 ib::info()
958 << "Resetting invalid page "
959 << block->page.id << " type "
960 << page_type << " to "
961 << reset_type << " when flushing.";
962 fil_page_set_type(page, reset_type);
963 }
964 }
965
966 switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
967 case SRV_CHECKSUM_ALGORITHM_CRC32:
968 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
969 checksum = buf_calc_page_crc32(page);
970 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
971 checksum);
972 break;
973 case SRV_CHECKSUM_ALGORITHM_INNODB:
974 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
975 checksum = (ib_uint32_t) buf_calc_page_new_checksum(
976 page);
977 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
978 checksum);
979 checksum = (ib_uint32_t) buf_calc_page_old_checksum(
980 page);
981 break;
982 case SRV_CHECKSUM_ALGORITHM_NONE:
983 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
984 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
985 checksum);
986 break;
987 /* no default so the compiler will emit a warning if
988 new enum is added and not handled here */
989 }
990 }
991
992 /* With the InnoDB checksum, we overwrite the first 4 bytes of
993 the end lsn field to store the old formula checksum. Since it
994 depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
995 be calculated after storing the new formula checksum.
996
997 In other cases we write the same value to both fields.
998 If CRC32 is used then it is faster to use that checksum
999 (calculated above) instead of calculating another one.
1000 We can afford to store something other than
1001 buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
1002 this field because the file will not be readable by old
1003 versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
1004
1005 mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
1006 checksum);
1007 }
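/* A sketch of where buf_flush_init_for_writing() leaves its stamps on an
uncompressed page (a summary of the writes above, for orientation only):
  header  FIL_PAGE_LSN                  = newest_lsn (8 bytes)
  header  FIL_PAGE_SPACE_OR_CHKSUM      = CRC32, new-formula checksum, or
                                          BUF_NO_CHECKSUM_MAGIC
  trailer UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM:
          first 4 bytes = old-formula checksum (INNODB algorithms), the same
                          value as the header checksum (CRC32), or the magic
          last 4 bytes  = low 32 bits of newest_lsn */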
1008
1009 #ifndef UNIV_HOTBACKUP
1010 /********************************************************************//**
1011 Does an asynchronous write of a buffer page. NOTE: in simulated aio and
1012 also when the doublewrite buffer is used, we must call
1013 buf_dblwr_flush_buffered_writes after we have posted a batch of
1014 writes! */
1015 static
1016 void
1017 buf_flush_write_block_low(
1018 /*======================*/
1019 buf_page_t* bpage, /*!< in: buffer block to write */
1020 buf_flush_t flush_type, /*!< in: type of flush */
1021 bool sync) /*!< in: true if sync IO request */
1022 {
1023 page_t* frame = NULL;
1024
1025 #ifdef UNIV_DEBUG
1026 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1027 ut_ad(!buf_pool_mutex_own(buf_pool));
1028 #endif /* UNIV_DEBUG */
1029
1030 DBUG_PRINT("ib_buf", ("flush %s %u page " UINT32PF ":" UINT32PF,
1031 sync ? "sync" : "async", (unsigned) flush_type,
1032 bpage->id.space(), bpage->id.page_no()));
1033
1034 ut_ad(buf_page_in_file(bpage));
1035
1036 /* We are not holding buf_pool->mutex or block_mutex here.
1037 Nevertheless, it is safe to access bpage, because it is
1038 io_fixed and oldest_modification != 0. Thus, it cannot be
1039 relocated in the buffer pool or removed from flush_list or
1040 LRU_list. */
1041 ut_ad(!buf_pool_mutex_own(buf_pool));
1042 ut_ad(!buf_flush_list_mutex_own(buf_pool));
1043 ut_ad(!buf_page_get_mutex(bpage)->is_owned());
1044 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1045 ut_ad(bpage->oldest_modification != 0);
1046
1047 #ifdef UNIV_IBUF_COUNT_DEBUG
1048 ut_a(ibuf_count_get(bpage->id) == 0);
1049 #endif /* UNIV_IBUF_COUNT_DEBUG */
1050
1051 ut_ad(bpage->newest_modification != 0);
1052
1053 /* Force the log to the disk before writing the modified block */
1054 if (!srv_read_only_mode) {
1055 log_write_up_to(bpage->newest_modification, true);
1056 }
1057
1058 switch (buf_page_get_state(bpage)) {
1059 case BUF_BLOCK_POOL_WATCH:
1060 case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1061 case BUF_BLOCK_NOT_USED:
1062 case BUF_BLOCK_READY_FOR_USE:
1063 case BUF_BLOCK_MEMORY:
1064 case BUF_BLOCK_REMOVE_HASH:
1065 ut_error;
1066 break;
1067 case BUF_BLOCK_ZIP_DIRTY:
1068 frame = bpage->zip.data;
1069
1070 mach_write_to_8(frame + FIL_PAGE_LSN,
1071 bpage->newest_modification);
1072
1073 ut_a(page_zip_verify_checksum(frame, bpage->size.physical()));
1074 break;
1075 case BUF_BLOCK_FILE_PAGE:
1076 frame = bpage->zip.data;
1077 if (!frame) {
1078 frame = ((buf_block_t*) bpage)->frame;
1079 }
1080
1081 buf_flush_init_for_writing(
1082 reinterpret_cast<const buf_block_t*>(bpage),
1083 reinterpret_cast<const buf_block_t*>(bpage)->frame,
1084 bpage->zip.data ? &bpage->zip : NULL,
1085 bpage->newest_modification,
1086 fsp_is_checksum_disabled(bpage->id.space()));
1087 break;
1088 }
1089
1090 /* Disable use of the doublewrite buffer for the temporary tablespace.
1091 Given the nature and load of the temporary tablespace, the doublewrite
1092 buffer only adds overhead during flushing. */
1093
1094 if (!srv_use_doublewrite_buf
1095 || buf_dblwr == NULL
1096 || srv_read_only_mode
1097 || fsp_is_system_temporary(bpage->id.space())) {
1098
1099 ut_ad(!srv_read_only_mode
1100 || fsp_is_system_temporary(bpage->id.space()));
1101
1102 ulint type = IORequest::WRITE | IORequest::DO_NOT_WAKE;
1103
1104 IORequest request(type);
1105
1106 fil_io(request,
1107 sync, bpage->id, bpage->size, 0, bpage->size.physical(),
1108 frame, bpage);
1109
1110 } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
1111 buf_dblwr_write_single_page(bpage, sync);
1112 } else {
1113 ut_ad(!sync);
1114 buf_dblwr_add_to_batch(bpage);
1115 }
1116
1117 /* When doing single page flushing the IO is done synchronously
1118 and we flush the changes to disk only for the tablespace we
1119 are working on. */
1120 if (sync) {
1121 ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
1122 fil_flush(bpage->id.space());
1123
1124 /* true means we want to evict this page from the
1125 LRU list as well. */
1126 buf_page_io_complete(bpage, true);
1127 }
1128
1129 /* Increment the counter of I/O operations used
1130 for selecting LRU policy. */
1131 buf_LRU_stat_inc_io();
1132 }
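/* To summarize the write paths above (a sketch, not a specification):
pages of the temporary tablespace, read-only mode, and servers with the
doublewrite buffer disabled are written directly with fil_io(); a
BUF_FLUSH_SINGLE_PAGE flush goes through buf_dblwr_write_single_page();
everything else is queued with buf_dblwr_add_to_batch() and reaches the
data file only after buf_dblwr_flush_buffered_writes() is called, as the
function comment above notes. */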
1133
1134 /********************************************************************//**
1135 Writes a flushable page asynchronously from the buffer pool to a file.
1136 NOTE: in simulated aio we must call
1137 os_aio_simulated_wake_handler_threads after we have posted a batch of
1138 writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
1139 held upon entering this function, and they will be released by this
1140 function if it returns true.
1141 @return TRUE if the page was flushed */
1142 ibool
1143 buf_flush_page(
1144 /*===========*/
1145 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1146 buf_page_t* bpage, /*!< in: buffer control block */
1147 buf_flush_t flush_type, /*!< in: type of flush */
1148 bool sync) /*!< in: true if sync IO request */
1149 {
1150 BPageMutex* block_mutex;
1151
1152 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1153 ut_ad(buf_pool_mutex_own(buf_pool));
1154 ut_ad(buf_page_in_file(bpage));
1155 ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1156
1157 block_mutex = buf_page_get_mutex(bpage);
1158 ut_ad(mutex_own(block_mutex));
1159
1160 ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1161
1162 bool is_uncompressed;
1163
1164 is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1165 ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1166
1167 ibool flush;
1168 rw_lock_t* rw_lock;
1169 bool no_fix_count = bpage->buf_fix_count == 0;
1170
1171 if (!is_uncompressed) {
1172 flush = TRUE;
1173 rw_lock = NULL;
1174 } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)
1175 || (!no_fix_count
1176 && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP
1177 && fsp_is_system_temporary(bpage->id.space()))) {
1178 /* This is a heuristic, to avoid expensive SX attempts. */
1179 /* For tables residing in the temporary tablespace, sync is done
1180 using IO_FIX, so before scheduling the page for flushing ensure
1181 that it is not buffer-fixed. */
1182 flush = FALSE;
1183 } else {
1184 rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1185 if (flush_type != BUF_FLUSH_LIST) {
1186 flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
1187 } else {
1188 /* Will SX lock later */
1189 flush = TRUE;
1190 }
1191 }
1192
1193 if (flush) {
1194
1195 /* We are committed to flushing by the time we get here */
1196
1197 buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1198
1199 buf_page_set_flush_type(bpage, flush_type);
1200
1201 if (buf_pool->n_flush[flush_type] == 0) {
1202 os_event_reset(buf_pool->no_flush[flush_type]);
1203 }
1204
1205 ++buf_pool->n_flush[flush_type];
1206
1207 mutex_exit(block_mutex);
1208
1209 buf_pool_mutex_exit(buf_pool);
1210
1211 if (flush_type == BUF_FLUSH_LIST
1212 && is_uncompressed
1213 && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
1214
1215 if (!fsp_is_system_temporary(bpage->id.space())) {
1216 /* To avoid a possible deadlock involving the
1217 doublewrite buffer, flush it now, because it
1218 might be holding another block->lock. */
1219 buf_dblwr_flush_buffered_writes();
1220 } else {
1221 buf_dblwr_sync_datafiles();
1222 }
1223
1224 rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
1225 }
1226
1227 /* If there is an observer that wants to know when the asynchronous
1228 flushing has been dispatched then notify it.
1229 Note: we set the flush observer on a page with the x-latch held, so we
1230 can guarantee that notify_flush and notify_remove are called in pair
1231 with the s-latch held on an uncompressed page. */
1232 if (bpage->flush_observer != NULL) {
1233 buf_pool_mutex_enter(buf_pool);
1234
1235 bpage->flush_observer->notify_flush(buf_pool, bpage);
1236
1237 buf_pool_mutex_exit(buf_pool);
1238 }
1239
1240 /* Even though bpage is not protected by any mutex at this
1241 point, it is safe to access bpage, because it is io_fixed and
1242 oldest_modification != 0. Thus, it cannot be relocated in the
1243 buffer pool or removed from flush_list or LRU_list. */
1244
1245 buf_flush_write_block_low(bpage, flush_type, sync);
1246 }
1247
1248 return(flush);
1249 }
1250
1251 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1252 /********************************************************************//**
1253 Writes a flushable page asynchronously from the buffer pool to a file.
1254 NOTE: buf_pool->mutex and block->mutex must be held upon entering this
1255 function, and they will be released by this function after flushing.
1256 This is loosely based on buf_flush_batch() and buf_flush_page().
1257 @return TRUE if the page was flushed and the mutexes released */
1258 ibool
1259 buf_flush_page_try(
1260 /*===============*/
1261 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
1262 buf_block_t* block) /*!< in/out: buffer control block */
1263 {
1264 ut_ad(buf_pool_mutex_own(buf_pool));
1265 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1266 ut_ad(buf_page_mutex_own(block));
1267
1268 if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1269 return(FALSE);
1270 }
1271
1272 /* The following call will release the buffer pool and
1273 block mutex. */
1274 return(buf_flush_page(
1275 buf_pool, &block->page,
1276 BUF_FLUSH_SINGLE_PAGE, true));
1277 }
1278 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1279
1280 /** Check whether the page is in the buffer pool and can be flushed.
1281 @param[in] page_id page id
1282 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1283 @return true if the page can be flushed. */
1284 static
1285 bool
1286 buf_flush_check_neighbor(
1287 const page_id_t& page_id,
1288 buf_flush_t flush_type)
1289 {
1290 buf_page_t* bpage;
1291 buf_pool_t* buf_pool = buf_pool_get(page_id);
1292 bool ret;
1293
1294 ut_ad(flush_type == BUF_FLUSH_LRU
1295 || flush_type == BUF_FLUSH_LIST);
1296
1297 buf_pool_mutex_enter(buf_pool);
1298
1299 /* We only want to flush pages from this buffer pool. */
1300 bpage = buf_page_hash_get(buf_pool, page_id);
1301
1302 if (!bpage) {
1303
1304 buf_pool_mutex_exit(buf_pool);
1305 return(false);
1306 }
1307
1308 ut_a(buf_page_in_file(bpage));
1309
1310 /* We avoid flushing 'non-old' blocks in an LRU flush,
1311 because the flushed blocks are soon freed */
1312
1313 ret = false;
1314 if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1315 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1316
1317 mutex_enter(block_mutex);
1318 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1319 ret = true;
1320 }
1321 mutex_exit(block_mutex);
1322 }
1323 buf_pool_mutex_exit(buf_pool);
1324
1325 return(ret);
1326 }
1327
1328 /** Flushes to disk all flushable pages within the flush area.
1329 @param[in] page_id page id
1330 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1331 @param[in] n_flushed number of pages flushed so far in this batch
1332 @param[in] n_to_flush maximum number of pages we are allowed to flush
1333 @return number of pages flushed */
1334 static
1335 ulint
1336 buf_flush_try_neighbors(
1337 const page_id_t& page_id,
1338 buf_flush_t flush_type,
1339 ulint n_flushed,
1340 ulint n_to_flush)
1341 {
1342 ulint i;
1343 ulint low;
1344 ulint high;
1345 ulint count = 0;
1346 buf_pool_t* buf_pool = buf_pool_get(page_id);
1347
1348 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1349
1350 if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1351 || srv_flush_neighbors == 0) {
1352 /* If there is little space or neighbor flushing is
1353 not enabled then just flush the victim. */
1354 low = page_id.page_no();
1355 high = page_id.page_no() + 1;
1356 } else {
1357 /* When flushed, dirty blocks are searched in
1358 neighborhoods of this size, and flushed along with the
1359 original page. */
1360
1361 ulint buf_flush_area;
1362
1363 buf_flush_area = ut_min(
1364 BUF_READ_AHEAD_AREA(buf_pool),
1365 buf_pool->curr_size / 16);
1366
1367 low = (page_id.page_no() / buf_flush_area) * buf_flush_area;
1368 high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area;
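/* A worked example with illustrative numbers: if buf_flush_area
is 128 and the victim page_no is 150, then low = (150 / 128) * 128
= 128 and high = 256, i.e. the candidate neighborhood is pages
[128, 256) of the same tablespace, before the trimming below. */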
1369
1370 if (srv_flush_neighbors == 1) {
1371 /* adjust 'low' and 'high' to the limits
1372 of the contiguous dirty area */
1373 if (page_id.page_no() > low) {
1374 for (i = page_id.page_no() - 1; i >= low; i--) {
1375 if (!buf_flush_check_neighbor(
1376 page_id_t(page_id.space(), i),
1377 flush_type)) {
1378
1379 break;
1380 }
1381
1382 if (i == low) {
1383 /* Avoid wrap-around when low == 0,
1384 which would call
1385 buf_flush_check_neighbor() with
1386 i == (ulint) -1 */
1387 i--;
1388 break;
1389 }
1390 }
1391 low = i + 1;
1392 }
1393
1394 for (i = page_id.page_no() + 1;
1395 i < high
1396 && buf_flush_check_neighbor(
1397 page_id_t(page_id.space(), i),
1398 flush_type);
1399 i++) {
1400 /* do nothing */
1401 }
1402 high = i;
1403 }
1404 }
1405
1406 const ulint space_size = fil_space_get_size(page_id.space());
1407 if (high > space_size) {
1408 high = space_size;
1409 }
1410
1411 DBUG_PRINT("ib_buf", ("flush " UINT32PF ":%u..%u",
1412 page_id.space(),
1413 (unsigned) low, (unsigned) high));
1414
1415 for (ulint i = low; i < high; i++) {
1416 buf_page_t* bpage;
1417
1418 if ((count + n_flushed) >= n_to_flush) {
1419
1420 /* We have already flushed enough pages and
1421 should call it a day. There is, however, one
1422 exception. If the page whose neighbors we
1423 are flushing has not been flushed yet then
1424 we'll try to flush the victim that we
1425 selected originally. */
1426 if (i <= page_id.page_no()) {
1427 i = page_id.page_no();
1428 } else {
1429 break;
1430 }
1431 }
1432
1433 const page_id_t cur_page_id(page_id.space(), i);
1434
1435 buf_pool = buf_pool_get(cur_page_id);
1436
1437 buf_pool_mutex_enter(buf_pool);
1438
1439 /* We only want to flush pages from this buffer pool. */
1440 bpage = buf_page_hash_get(buf_pool, cur_page_id);
1441
1442 if (bpage == NULL) {
1443
1444 buf_pool_mutex_exit(buf_pool);
1445 continue;
1446 }
1447
1448 ut_a(buf_page_in_file(bpage));
1449
1450 /* We avoid flushing 'non-old' blocks in an LRU flush,
1451 because the flushed blocks are soon freed */
1452
1453 if (flush_type != BUF_FLUSH_LRU
1454 || i == page_id.page_no()
1455 || buf_page_is_old(bpage)) {
1456
1457 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1458
1459 mutex_enter(block_mutex);
1460
1461 if (buf_flush_ready_for_flush(bpage, flush_type)
1462 && (i == page_id.page_no()
1463 || bpage->buf_fix_count == 0)) {
1464
1465 /* We also try to flush those
1466 neighbors != offset */
1467
1468 if (buf_flush_page(
1469 buf_pool, bpage, flush_type, false)) {
1470
1471 ++count;
1472 } else {
1473 mutex_exit(block_mutex);
1474 buf_pool_mutex_exit(buf_pool);
1475 }
1476
1477 continue;
1478 } else {
1479 mutex_exit(block_mutex);
1480 }
1481 }
1482 buf_pool_mutex_exit(buf_pool);
1483 }
1484
1485 if (count > 1) {
1486 MONITOR_INC_VALUE_CUMULATIVE(
1487 MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1488 MONITOR_FLUSH_NEIGHBOR_COUNT,
1489 MONITOR_FLUSH_NEIGHBOR_PAGES,
1490 (count - 1));
1491 }
1492
1493 return(count);
1494 }
1495
1496 /** Check if the block is modified and ready for flushing.
1497 If the block is ready to flush then flush the page and try to flush
1498 its neighbors.
1499 @param[in] bpage buffer control block,
1500 must be buf_page_in_file(bpage)
1501 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1502 @param[in] n_to_flush number of pages to flush
1503 @param[in,out] count number of pages flushed
1504 @return TRUE if buf_pool mutex was released during this function.
1505 This does not guarantee that some pages were written as well.
1506 The number of pages written is added to *count. */
1507 static
1508 bool
1509 buf_flush_page_and_try_neighbors(
1510 buf_page_t* bpage,
1511 buf_flush_t flush_type,
1512 ulint n_to_flush,
1513 ulint* count)
1514 {
1515 #ifdef UNIV_DEBUG
1516 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1517
1518 ut_ad(buf_pool_mutex_own(buf_pool));
1519 #endif /* UNIV_DEBUG */
1520
1521 bool flushed;
1522 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1523
1524 mutex_enter(block_mutex);
1525
1526 ut_a(buf_page_in_file(bpage));
1527
1528 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1529 buf_pool_t* buf_pool;
1530
1531 buf_pool = buf_pool_from_bpage(bpage);
1532
1533 const page_id_t page_id = bpage->id;
1534
1535 mutex_exit(block_mutex);
1536
1537 buf_pool_mutex_exit(buf_pool);
1538
1539 /* Try to flush also all the neighbors */
1540 *count += buf_flush_try_neighbors(
1541 page_id, flush_type, *count, n_to_flush);
1542
1543 buf_pool_mutex_enter(buf_pool);
1544 flushed = TRUE;
1545 } else {
1546 mutex_exit(block_mutex);
1547
1548 flushed = false;
1549 }
1550
1551 ut_ad(buf_pool_mutex_own(buf_pool));
1552
1553 return(flushed);
1554 }
1555
1556 /*******************************************************************//**
1557 This utility moves the uncompressed frames of pages to the free list.
1558 Note that this function does not actually flush any data to disk. It
1559 just detaches the uncompressed frames from the compressed pages at the
1560 tail of the unzip_LRU and puts those freed frames in the free list.
1561 Note that it is a best effort attempt and it is not guaranteed that
1562 after a call to this function there will be 'max' blocks in the free
1563 list.
1564 @return number of blocks moved to the free list. */
1565 static
1566 ulint
1567 buf_free_from_unzip_LRU_list_batch(
1568 /*===============================*/
1569 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1570 ulint max) /*!< in: desired number of
1571 blocks in the free_list */
1572 {
1573 ulint scanned = 0;
1574 ulint count = 0;
1575 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1576 ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1577
1578 ut_ad(buf_pool_mutex_own(buf_pool));
1579
1580 buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1581
1582 while (block != NULL
1583 && count < max
1584 && free_len < srv_LRU_scan_depth
1585 && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1586
1587 ++scanned;
1588 if (buf_LRU_free_page(&block->page, false)) {
1589 /* Block was freed. buf_pool->mutex potentially
1590 released and reacquired */
1591 ++count;
1592 block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1593
1594 } else {
1595
1596 block = UT_LIST_GET_PREV(unzip_LRU, block);
1597 }
1598
1599 free_len = UT_LIST_GET_LEN(buf_pool->free);
1600 lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1601 }
1602
1603 ut_ad(buf_pool_mutex_own(buf_pool));
1604
1605 if (scanned) {
1606 MONITOR_INC_VALUE_CUMULATIVE(
1607 MONITOR_LRU_BATCH_SCANNED,
1608 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1609 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1610 scanned);
1611 }
1612
1613 return(count);
1614 }
1615
1616 /*******************************************************************//**
1617 This utility flushes dirty blocks from the end of the LRU list.
1618 The calling thread is not allowed to own any latches on pages!
1619 It attempts to make 'max' blocks available in the free list. Note that
1620 it is a best effort attempt and it is not guaranteed that after a call
1621 to this function there will be 'max' blocks in the free list.
1622 @return number of blocks for which the write request was queued. */
1623 static
1624 ulint
1625 buf_flush_LRU_list_batch(
1626 /*=====================*/
1627 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1628 ulint max) /*!< in: desired number of
1629 blocks in the free_list */
1630 {
1631 buf_page_t* bpage;
1632 ulint scanned = 0;
1633 ulint evict_count = 0;
1634 ulint count = 0;
1635 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1636 ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1637 ulint withdraw_depth = 0;
1638
1639 ut_ad(buf_pool_mutex_own(buf_pool));
1640
1641 if (buf_pool->curr_size < buf_pool->old_size
1642 && buf_pool->withdraw_target > 0) {
1643 withdraw_depth = buf_pool->withdraw_target
1644 - UT_LIST_GET_LEN(buf_pool->withdraw);
1645 }
1646
1647 for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1648 bpage != NULL && count + evict_count < max
1649 && free_len < srv_LRU_scan_depth + withdraw_depth
1650 && lru_len > BUF_LRU_MIN_LEN;
1651 ++scanned,
1652 bpage = buf_pool->lru_hp.get()) {
1653
1654 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
1655 buf_pool->lru_hp.set(prev);
1656
1657 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1658
1659 mutex_enter(block_mutex);
1660
1661 if (buf_flush_ready_for_replace(bpage)) {
1662 /* block is ready for eviction i.e., it is
1663 clean and is not IO-fixed or buffer fixed. */
1664 mutex_exit(block_mutex);
1665 if (buf_LRU_free_page(bpage, true)) {
1666 ++evict_count;
1667 }
1668 } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) {
1669 /* Block is ready for flush. Dispatch an IO
1670 request. The IO helper thread will put it on
1671 free list in IO completion routine. */
1672 mutex_exit(block_mutex);
1673 buf_flush_page_and_try_neighbors(
1674 bpage, BUF_FLUSH_LRU, max, &count);
1675 } else {
1676 /* Can't evict or dispatch this block. Go to
1677 previous. */
1678 ut_ad(buf_pool->lru_hp.is_hp(prev));
1679 mutex_exit(block_mutex);
1680 }
1681
1682 ut_ad(!mutex_own(block_mutex));
1683 ut_ad(buf_pool_mutex_own(buf_pool));
1684
1685 free_len = UT_LIST_GET_LEN(buf_pool->free);
1686 lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1687 }
1688
1689 buf_pool->lru_hp.set(NULL);
1690
1691 /* We keep track of all flushes happening as part of LRU
1692 flush. When estimating the desired rate at which flush_list
1693 should be flushed, we factor in this value. */
1694 buf_lru_flush_page_count += count;
1695
1696 ut_ad(buf_pool_mutex_own(buf_pool));
1697
1698 if (evict_count) {
1699 MONITOR_INC_VALUE_CUMULATIVE(
1700 MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
1701 MONITOR_LRU_BATCH_EVICT_COUNT,
1702 MONITOR_LRU_BATCH_EVICT_PAGES,
1703 evict_count);
1704 }
1705
1706 if (scanned) {
1707 MONITOR_INC_VALUE_CUMULATIVE(
1708 MONITOR_LRU_BATCH_SCANNED,
1709 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1710 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1711 scanned);
1712 }
1713
1714 return(count);
1715 }
1716
1717 /*******************************************************************//**
1718 Flush and move pages from LRU or unzip_LRU list to the free list.
1719 Whether LRU or unzip_LRU is used depends on the state of the system.
1720 @return number of blocks for which either the write request was queued
1721 or in case of unzip_LRU the number of blocks actually moved to the
1722 free list */
1723 static
1724 ulint
1725 buf_do_LRU_batch(
1726 /*=============*/
1727 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1728 ulint max) /*!< in: desired number of
1729 blocks in the free_list */
1730 {
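	/* First try to evict uncompressed frames from the unzip_LRU
	list; any remaining quota is then spent flushing or evicting
	pages from the LRU list itself. */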
1731 ulint count = 0;
1732
1733 if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1734 count += buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1735 }
1736
1737 if (max > count) {
1738 count += buf_flush_LRU_list_batch(buf_pool, max - count);
1739 }
1740
1741 return(count);
1742 }
1743
1744 /** This utility flushes dirty blocks from the end of the flush_list.
1745 The calling thread is not allowed to own any latches on pages!
1746 @param[in] buf_pool buffer pool instance
1747 @param[in]	min_n		wished minimum number of blocks flushed (it is
1748 not guaranteed that the actual number is that big, though)
1749 @param[in] lsn_limit all blocks whose oldest_modification is smaller
1750 than this should be flushed (if their number does not exceed min_n)
1751 @return number of blocks for which the write request was queued;
1752 ULINT_UNDEFINED if there was a flush of the same type already
1753 running */
1754 static
1755 ulint
1756 buf_do_flush_list_batch(
1757 buf_pool_t* buf_pool,
1758 ulint min_n,
1759 lsn_t lsn_limit)
1760 {
1761 ulint count = 0;
1762 ulint scanned = 0;
1763
1764 ut_ad(buf_pool_mutex_own(buf_pool));
1765
1766 /* Start from the end of the list looking for a suitable
1767 block to be flushed. */
1768 buf_flush_list_mutex_enter(buf_pool);
1769 ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1770
1771 	/* In order not to degenerate this scan into O(n*n) we attempt
1772 	to preserve the pointer to the previous block in the flush list.
1773 	To do so we declare it a hazard pointer. Any thread working on
1774 	the flush list must check the hazard pointer and, if it is
1775 	removing the same block, must reset it. */
1776 for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1777 count < min_n && bpage != NULL && len > 0
1778 && bpage->oldest_modification < lsn_limit;
1779 bpage = buf_pool->flush_hp.get(),
1780 ++scanned) {
1781
1782 buf_page_t* prev;
1783
1784 ut_a(bpage->oldest_modification > 0);
1785 ut_ad(bpage->in_flush_list);
1786
1787 prev = UT_LIST_GET_PREV(list, bpage);
1788 buf_pool->flush_hp.set(prev);
1789 buf_flush_list_mutex_exit(buf_pool);
1790
1791 #ifdef UNIV_DEBUG
1792 bool flushed =
1793 #endif /* UNIV_DEBUG */
1794 buf_flush_page_and_try_neighbors(
1795 bpage, BUF_FLUSH_LIST, min_n, &count);
1796
1797 buf_flush_list_mutex_enter(buf_pool);
1798
1799 ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
1800
1801 --len;
1802 }
1803
1804 buf_pool->flush_hp.set(NULL);
1805 buf_flush_list_mutex_exit(buf_pool);
1806
1807 if (scanned) {
1808 MONITOR_INC_VALUE_CUMULATIVE(
1809 MONITOR_FLUSH_BATCH_SCANNED,
1810 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1811 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1812 scanned);
1813 }
1814
1815 if (count) {
1816 MONITOR_INC_VALUE_CUMULATIVE(
1817 MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1818 MONITOR_FLUSH_BATCH_COUNT,
1819 MONITOR_FLUSH_BATCH_PAGES,
1820 count);
1821 }
1822
1823 ut_ad(buf_pool_mutex_own(buf_pool));
1824
1825 return(count);
1826 }
1827
1828 /** This utility flushes dirty blocks from the end of the LRU list or
1829 flush_list.
1830 NOTE 1: in the case of an LRU flush the calling thread may own latches to
1831 pages: to avoid deadlocks, this function must be written so that it cannot
1832 end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1833 the calling thread is not allowed to own any latches on pages!
1834 @param[in] buf_pool buffer pool instance
1835 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
1836 BUF_FLUSH_LIST, then the caller must not own any latches on pages
1837 @param[in]	min_n		wished minimum number of blocks flushed (it is
1838 not guaranteed that the actual number is that big, though)
1839 @param[in] lsn_limit in the case of BUF_FLUSH_LIST all blocks whose
1840 oldest_modification is smaller than this should be flushed (if their number
1841 does not exceed min_n), otherwise ignored
1842 @return number of blocks for which the write request was queued */
1843 static
1844 ulint
1845 buf_flush_batch(
1846 buf_pool_t* buf_pool,
1847 buf_flush_t flush_type,
1848 ulint min_n,
1849 lsn_t lsn_limit)
1850 {
1851 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1852
1853 #ifdef UNIV_DEBUG
1854 {
1855 dict_sync_check check(true);
1856
1857 ut_ad(flush_type != BUF_FLUSH_LIST
1858 || !sync_check_iterate(check));
1859 }
1860 #endif /* UNIV_DEBUG */
1861
1862 buf_pool_mutex_enter(buf_pool);
1863
1864 ulint count = 0;
1865
1866 /* Note: The buffer pool mutex is released and reacquired within
1867 the flush functions. */
1868 switch (flush_type) {
1869 case BUF_FLUSH_LRU:
1870 count = buf_do_LRU_batch(buf_pool, min_n);
1871 break;
1872 case BUF_FLUSH_LIST:
1873 count = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
1874 break;
1875 default:
1876 ut_error;
1877 }
1878
1879 buf_pool_mutex_exit(buf_pool);
1880
1881 DBUG_PRINT("ib_buf", ("flush %u completed, %u pages",
1882 unsigned(flush_type), unsigned(count)));
1883
1884 return(count);
1885 }
1886
1887 /******************************************************************//**
1888 Gather the aggregated stats for both flush list and LRU list flushing.
1889 @param page_count_flush number of pages flushed from the end of the flush_list
1890 @param page_count_LRU number of pages flushed from the end of the LRU list
1891 */
1892 static
1893 void
1894 buf_flush_stats(
1895 /*============*/
1896 ulint page_count_flush,
1897 ulint page_count_LRU)
1898 {
1899 DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
1900 "from LRU_list %u pages",
1901 unsigned(page_count_flush),
1902 unsigned(page_count_LRU)));
1903
1904 srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
1905 }
1906
1907 /******************************************************************//**
1908 Start a buffer flush batch for LRU or flush list */
1909 static
1910 ibool
1911 buf_flush_start(
1912 /*============*/
1913 buf_pool_t* buf_pool, /*!< buffer pool instance */
1914 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1915 or BUF_FLUSH_LIST */
1916 {
1917 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1918
1919 buf_pool_mutex_enter(buf_pool);
1920
1921 if (buf_pool->n_flush[flush_type] > 0
1922 || buf_pool->init_flush[flush_type] == TRUE) {
1923
1924 /* There is already a flush batch of the same type running */
1925
1926 buf_pool_mutex_exit(buf_pool);
1927
1928 return(FALSE);
1929 }
1930
1931 buf_pool->init_flush[flush_type] = TRUE;
1932
1933 os_event_reset(buf_pool->no_flush[flush_type]);
1934
1935 buf_pool_mutex_exit(buf_pool);
1936
1937 return(TRUE);
1938 }
1939
1940 /******************************************************************//**
1941 End a buffer flush batch for LRU or flush list */
1942 static
1943 void
1944 buf_flush_end(
1945 /*==========*/
1946 buf_pool_t* buf_pool, /*!< buffer pool instance */
1947 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1948 or BUF_FLUSH_LIST */
1949 {
1950 buf_pool_mutex_enter(buf_pool);
1951
1952 buf_pool->init_flush[flush_type] = FALSE;
1953
1954 buf_pool->try_LRU_scan = TRUE;
1955
1956 if (buf_pool->n_flush[flush_type] == 0) {
1957
1958 /* The running flush batch has ended */
1959
1960 os_event_set(buf_pool->no_flush[flush_type]);
1961 }
1962
1963 buf_pool_mutex_exit(buf_pool);
1964
1965 if (!srv_read_only_mode) {
1966 buf_dblwr_flush_buffered_writes();
1967 } else {
1968 os_aio_simulated_wake_handler_threads();
1969 }
1970 }
1971
1972 /******************************************************************//**
1973 Waits until a flush batch of the given type ends */
1974 void
1975 buf_flush_wait_batch_end(
1976 /*=====================*/
1977 buf_pool_t* buf_pool, /*!< buffer pool instance */
1978 buf_flush_t type) /*!< in: BUF_FLUSH_LRU
1979 or BUF_FLUSH_LIST */
1980 {
1981 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1982
1983 if (buf_pool == NULL) {
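		/* A NULL buf_pool means: wait for the ongoing batch of
		this type to end in every buffer pool instance. */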
1984 ulint i;
1985
1986 for (i = 0; i < srv_buf_pool_instances; ++i) {
1987 buf_pool_t* buf_pool;
1988
1989 buf_pool = buf_pool_from_array(i);
1990
1991 thd_wait_begin(NULL, THD_WAIT_DISKIO);
1992 os_event_wait(buf_pool->no_flush[type]);
1993 thd_wait_end(NULL);
1994 }
1995 } else {
1996 thd_wait_begin(NULL, THD_WAIT_DISKIO);
1997 os_event_wait(buf_pool->no_flush[type]);
1998 thd_wait_end(NULL);
1999 }
2000 }
2001
2002 /** Do a flushing batch of a given type.
2003 NOTE: The calling thread is not allowed to own any latches on pages!
2004 @param[in,out] buf_pool buffer pool instance
2005 @param[in] type flush type
2006 @param[in]	min_n		wished minimum number of blocks flushed
2007 (it is not guaranteed that the actual number is that big, though)
2008 @param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2009 oldest_modification is smaller than this should be flushed (if their number
2010 does not exceed min_n), otherwise ignored
2011 @param[out] n_processed the number of pages which were processed is
2012 passed back to caller. Ignored if NULL
2013 @retval true if a batch was queued successfully.
2014 @retval false if another batch of same type was already running. */
2015 bool
2016 buf_flush_do_batch(
2017 buf_pool_t* buf_pool,
2018 buf_flush_t type,
2019 ulint min_n,
2020 lsn_t lsn_limit,
2021 ulint* n_processed)
2022 {
2023 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
2024
2025 if (n_processed != NULL) {
2026 *n_processed = 0;
2027 }
2028
2029 if (!buf_flush_start(buf_pool, type)) {
2030 return(false);
2031 }
2032
2033 ulint page_count = buf_flush_batch(buf_pool, type, min_n, lsn_limit);
2034
2035 buf_flush_end(buf_pool, type);
2036
2037 if (n_processed != NULL) {
2038 *n_processed = page_count;
2039 }
2040
2041 return(true);
2042 }
2043
2044 /**
2045 Waits until all dirty pages whose oldest_modification is smaller than the given lsn have been flushed.
2046 @param[in] new_oldest target oldest_modified_lsn to wait for */
2047
2048 void
2049 buf_flush_wait_flushed(
2050 lsn_t new_oldest)
2051 {
2052 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2053 buf_pool_t* buf_pool;
2054 lsn_t oldest;
2055
2056 buf_pool = buf_pool_from_array(i);
2057
2058 for (;;) {
2059 /* We don't need to wait for fsync of the flushed
2060 			blocks, because we need an fsync to make a checkpoint anyway.
2061 So, we don't need to wait for the batch end here. */
2062
2063 buf_flush_list_mutex_enter(buf_pool);
2064
2065 buf_page_t* bpage;
2066
2067 /* We don't need to wait for system temporary pages */
2068 for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
2069 bpage != NULL
2070 && fsp_is_system_temporary(bpage->id.space());
2071 bpage = UT_LIST_GET_PREV(list, bpage)) {
2072 /* Do nothing. */
2073 }
2074
2075 if (bpage != NULL) {
2076 ut_ad(bpage->in_flush_list);
2077 oldest = bpage->oldest_modification;
2078 } else {
2079 oldest = 0;
2080 }
2081
2082 buf_flush_list_mutex_exit(buf_pool);
2083
2084 if (oldest == 0 || oldest >= new_oldest) {
2085 break;
2086 }
2087
2088 /* sleep and retry */
2089 os_thread_sleep(buf_flush_wait_flushed_sleep_time);
2090
2091 MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
2092 }
2093 }
2094 }
2095
2096 /** This utility flushes dirty blocks from the end of the flush list of all
2097 buffer pool instances.
2098 NOTE: The calling thread is not allowed to own any latches on pages!
2099 @param[in]	min_n		wished minimum number of blocks flushed (it is
2100 not guaranteed that the actual number is that big, though)
2101 @param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2102 oldest_modification is smaller than this should be flushed (if their number
2103 does not exceed min_n), otherwise ignored
2104 @param[out] n_processed the number of pages which were processed is
2105 passed back to caller. Ignored if NULL.
2106 @return true if a batch was queued successfully for each buffer pool
2107 instance. false if another batch of the same type was already running in
2108 at least one of the buffer pool instances. */
2109 bool
2110 buf_flush_lists(
2111 ulint min_n,
2112 lsn_t lsn_limit,
2113 ulint* n_processed)
2114 {
2115 ulint i;
2116 ulint n_flushed = 0;
2117 bool success = true;
2118
2119 if (n_processed) {
2120 *n_processed = 0;
2121 }
2122
2123 if (min_n != ULINT_MAX) {
2124 /* Ensure that flushing is spread evenly amongst the
2125 buffer pool instances. When min_n is ULINT_MAX
2126 we need to flush everything up to the lsn limit
2127 so no limit here. */
2128 min_n = (min_n + srv_buf_pool_instances - 1)
2129 / srv_buf_pool_instances;
2130 }
2131
2132 /* Flush to lsn_limit in all buffer pool instances */
2133 for (i = 0; i < srv_buf_pool_instances; i++) {
2134 buf_pool_t* buf_pool;
2135 ulint page_count = 0;
2136
2137 buf_pool = buf_pool_from_array(i);
2138
2139 if (!buf_flush_do_batch(buf_pool,
2140 BUF_FLUSH_LIST,
2141 min_n,
2142 lsn_limit,
2143 &page_count)) {
2144 /* We have two choices here. If lsn_limit was
2145 specified then skipping an instance of buffer
2146 pool means we cannot guarantee that all pages
2147 			up to lsn_limit have been flushed. We can
2148 return right now with failure or we can try
2149 to flush remaining buffer pools up to the
2150 lsn_limit. We attempt to flush other buffer
2151 pools based on the assumption that it will
2152 help in the retry which will follow the
2153 failure. */
2154 success = false;
2155
2156 continue;
2157 }
2158
2159 n_flushed += page_count;
2160 }
2161
2162 if (n_flushed) {
2163 buf_flush_stats(n_flushed, 0);
2164 }
2165
2166 if (n_processed) {
2167 *n_processed = n_flushed;
2168 }
2169
2170 return(success);
2171 }
2172
2173 /******************************************************************//**
2174 This function picks up a single page from the tail of the LRU
2175 list, flushes it (if it is dirty), removes it from page_hash and LRU
2176 list and puts it on the free list. It is called from user threads when
2177 they are unable to find a replaceable page at the tail of the LRU
2178 list i.e.: when the background LRU flushing in the page_cleaner thread
2179 is not fast enough to keep pace with the workload.
2180 @return true if success. */
2181 bool
2182 buf_flush_single_page_from_LRU(
2183 /*===========================*/
2184 buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
2185 {
2186 ulint scanned;
2187 buf_page_t* bpage;
2188 ibool freed;
2189
2190 buf_pool_mutex_enter(buf_pool);
2191
2192 for (bpage = buf_pool->single_scan_itr.start(), scanned = 0,
2193 freed = false;
2194 bpage != NULL;
2195 ++scanned, bpage = buf_pool->single_scan_itr.get()) {
2196
2197 ut_ad(buf_pool_mutex_own(buf_pool));
2198
2199 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
2200
2201 buf_pool->single_scan_itr.set(prev);
2202
2203 BPageMutex* block_mutex;
2204
2205 block_mutex = buf_page_get_mutex(bpage);
2206
2207 mutex_enter(block_mutex);
2208
2209 if (buf_flush_ready_for_replace(bpage)) {
2210 /* block is ready for eviction i.e., it is
2211 clean and is not IO-fixed or buffer fixed. */
2212 mutex_exit(block_mutex);
2213
2214 if (buf_LRU_free_page(bpage, true)) {
2215 buf_pool_mutex_exit(buf_pool);
2216 freed = true;
2217 break;
2218 }
2219
2220 } else if (buf_flush_ready_for_flush(
2221 bpage, BUF_FLUSH_SINGLE_PAGE)) {
2222
2223 /* Block is ready for flush. Try and dispatch an IO
2224 request. We'll put it on free list in IO completion
2225 routine if it is not buffer fixed. The following call
2226 will release the buffer pool and block mutex.
2227
2228 Note: There is no guarantee that this page has actually
2229 been freed, only that it has been flushed to disk */
2230
2231 freed = buf_flush_page(
2232 buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
2233
2234 if (freed) {
2235 break;
2236 }
2237
2238 mutex_exit(block_mutex);
2239 } else {
2240 mutex_exit(block_mutex);
2241 }
2242
2243 ut_ad(!mutex_own(block_mutex));
2244 }
2245
2246 if (!freed) {
2247 /* Can't find a single flushable page. */
2248 ut_ad(!bpage);
2249 buf_pool_mutex_exit(buf_pool);
2250 }
2251
2252 if (scanned) {
2253 MONITOR_INC_VALUE_CUMULATIVE(
2254 MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2255 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2256 MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2257 scanned);
2258 }
2259
2260 ut_ad(!buf_pool_mutex_own(buf_pool));
2261
2262 return(freed);
2263 }
2264
2265 /**
2266 Clears up tail of the LRU list of a given buffer pool instance:
2267 * Put replaceable pages at the tail of LRU to the free list
2268 * Flush dirty pages at the tail of LRU to the disk
2269 The depth to which we scan each buffer pool is controlled by dynamic
2270 config parameter innodb_LRU_scan_depth.
2271 @param buf_pool buffer pool instance
2272 @return total pages flushed */
2273 static
2274 ulint
2275 buf_flush_LRU_list(
2276 buf_pool_t* buf_pool)
2277 {
2278 ulint scan_depth, withdraw_depth;
2279 ulint n_flushed = 0;
2280
2281 ut_ad(buf_pool);
2282
2283 /* srv_LRU_scan_depth can be arbitrarily large value.
2284 We cap it with current LRU size. */
2285 buf_pool_mutex_enter(buf_pool);
2286 scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2287 if (buf_pool->curr_size < buf_pool->old_size
2288 && buf_pool->withdraw_target > 0) {
2289 withdraw_depth = buf_pool->withdraw_target
2290 - UT_LIST_GET_LEN(buf_pool->withdraw);
2291 } else {
2292 withdraw_depth = 0;
2293 }
2294 buf_pool_mutex_exit(buf_pool);
2295
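	/* While the buffer pool is being shrunk, scan deep enough to
	free the blocks that still have to be withdrawn; otherwise
	honour the configured innodb_LRU_scan_depth. Either way the
	depth is capped by the current LRU length computed above. */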
2296 if (withdraw_depth > srv_LRU_scan_depth) {
2297 scan_depth = ut_min(withdraw_depth, scan_depth);
2298 } else {
2299 scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
2300 scan_depth);
2301 }
2302
2303 	/* Currently, only one of the page_cleaner threads can trigger
2304 	an LRU flush for a given buffer pool instance at a time.
2305 	So, it is not possible that a batch triggered during the
2306 	last iteration is still running. */
2307 buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth,
2308 0, &n_flushed);
2309
2310 return(n_flushed);
2311 }
2312
2313 /*********************************************************************//**
2314 Clears up tail of the LRU lists:
2315 * Put replaceable pages at the tail of LRU to the free list
2316 * Flush dirty pages at the tail of LRU to the disk
2317 The depth to which we scan each buffer pool is controlled by dynamic
2318 config parameter innodb_LRU_scan_depth.
2319 @return total pages flushed */
2320 ulint
2321 buf_flush_LRU_lists(void)
2322 /*=====================*/
2323 {
2324 ulint n_flushed = 0;
2325
2326 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2327
2328 n_flushed += buf_flush_LRU_list(buf_pool_from_array(i));
2329 }
2330
2331 if (n_flushed) {
2332 buf_flush_stats(0, n_flushed);
2333 }
2334
2335 return(n_flushed);
2336 }
2337
2338 /*********************************************************************//**
2339 Wait for any possible LRU flushes that are in progress to end. */
2340 void
2341 buf_flush_wait_LRU_batch_end(void)
2342 /*==============================*/
2343 {
2344 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2345 buf_pool_t* buf_pool;
2346
2347 buf_pool = buf_pool_from_array(i);
2348
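		/* Wait only if an LRU batch is running, or about to be
		started, in this buffer pool instance. */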
2349 buf_pool_mutex_enter(buf_pool);
2350
2351 if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2352 || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2353
2354 buf_pool_mutex_exit(buf_pool);
2355 buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2356 } else {
2357 buf_pool_mutex_exit(buf_pool);
2358 }
2359 }
2360 }
2361
2362 /*********************************************************************//**
2363 Calculates if flushing is required based on number of dirty pages in
2364 the buffer pool.
2365 @return percent of io_capacity to flush to manage dirty page ratio */
2366 static
2367 ulint
2368 af_get_pct_for_dirty()
2369 /*==================*/
2370 {
2371 double dirty_pct = buf_get_modified_ratio_pct();
2372
2373 if (dirty_pct == 0.0) {
2374 /* No pages modified */
2375 return(0);
2376 }
2377
2378 ut_a(srv_max_dirty_pages_pct_lwm
2379 <= srv_max_buf_pool_modified_pct);
2380
2381 if (srv_max_dirty_pages_pct_lwm == 0) {
2382 /* The user has not set the option to preflush dirty
2383 pages as we approach the high water mark. */
2384 if (dirty_pct >= srv_max_buf_pool_modified_pct) {
2385 /* We have crossed the high water mark of dirty
2386 			pages. In this case we start flushing at 100% of
2387 innodb_io_capacity. */
2388 return(100);
2389 }
2390 } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
2391 /* We should start flushing pages gradually. */
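		/* Illustrative example (hypothetical values): with
		dirty_pct = 50 and srv_max_buf_pool_modified_pct = 75,
		this requests roughly 50 * 100 / 76 ~= 65% of
		innodb_io_capacity. */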
2392 return(static_cast<ulint>((dirty_pct * 100)
2393 / (srv_max_buf_pool_modified_pct + 1)));
2394 }
2395
2396 return(0);
2397 }
2398
2399 /*********************************************************************//**
2400 Calculates if flushing is required based on redo generation rate.
2401 @return percent of io_capacity to flush to manage redo space */
2402 static
2403 ulint
2404 af_get_pct_for_lsn(
2405 /*===============*/
2406 lsn_t age) /*!< in: current age of LSN. */
2407 {
2408 lsn_t max_async_age;
2409 lsn_t lsn_age_factor;
2410 lsn_t af_lwm = (srv_adaptive_flushing_lwm
2411 * log_get_capacity()) / 100;
2412
2413 if (age < af_lwm) {
2414 /* No adaptive flushing. */
2415 return(0);
2416 }
2417
2418 max_async_age = log_get_max_modified_age_async();
2419
2420 if (age < max_async_age && !srv_adaptive_flushing) {
2421 /* We have still not reached the max_async point and
2422 the user has disabled adaptive flushing. */
2423 return(0);
2424 }
2425
2426 /* If we are here then we know that either:
2427 1) User has enabled adaptive flushing
2428 2) User may have disabled adaptive flushing but we have reached
2429 max_async_age. */
2430 lsn_age_factor = (age * 100) / max_async_age;
2431
2432 ut_ad(srv_max_io_capacity >= srv_io_capacity);
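	/* Illustrative example (hypothetical values): with
	srv_max_io_capacity / srv_io_capacity = 2 and
	lsn_age_factor = 50, this returns about
	2 * (50 * sqrt(50)) / 7.5 ~= 94% of innodb_io_capacity. */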
2433 return(static_cast<ulint>(
2434 ((srv_max_io_capacity / srv_io_capacity)
2435 * (lsn_age_factor * sqrt((double)lsn_age_factor)))
2436 / 7.5));
2437 }
2438
2439 /*********************************************************************//**
2440 This function is called approximately once every second by the
2441 page_cleaner thread. Based on various factors it decides if there is a
2442 need to do flushing.
2443 @return number of pages recommended to be flushed
2444 @param lsn_limit pointer to return LSN up to which flushing must happen
2445 @param last_pages_in the number of pages flushed by the last flush_list
2446 flushing. */
2447 static
2448 ulint
2449 page_cleaner_flush_pages_recommendation(
2450 /*====================================*/
2451 lsn_t* lsn_limit,
2452 ulint last_pages_in)
2453 {
2454 static lsn_t prev_lsn = 0;
2455 static ulint sum_pages = 0;
2456 static ulint avg_page_rate = 0;
2457 static ulint n_iterations = 0;
2458 static ib_time_monotonic_t prev_time;
2459 lsn_t oldest_lsn;
2460 lsn_t cur_lsn;
2461 lsn_t age;
2462 lsn_t lsn_rate;
2463 ulint n_pages = 0;
2464 ulint pct_for_dirty = 0;
2465 ulint pct_for_lsn = 0;
2466 ulint pct_total = 0;
2467
2468 cur_lsn = log_get_lsn();
2469
2470 if (prev_lsn == 0) {
2471 /* First time around. */
2472 prev_lsn = cur_lsn;
2473 prev_time = ut_time_monotonic();
2474 return(0);
2475 }
2476
2477 if (prev_lsn == cur_lsn) {
2478 return(0);
2479 }
2480
2481 sum_pages += last_pages_in;
2482
2483 ib_time_monotonic_t curr_time = ut_time_monotonic();
2484 uint64_t time_elapsed = curr_time - prev_time;
2485 const ulong avg_loop = srv_flushing_avg_loops;
2486
2487 /* We update our variables every srv_flushing_avg_loops
2488 iterations to smooth out transition in workload. */
2489 if (++n_iterations >= avg_loop
2490 || time_elapsed >= (uint64_t)avg_loop) {
2491
2492 if (time_elapsed < 1) {
2493 time_elapsed = 1;
2494 }
2495
2496 avg_page_rate = static_cast<ulint>(
2497 ((static_cast<double>(sum_pages)
2498 / time_elapsed)
2499 + avg_page_rate) / 2);
2500
2501 /* How much LSN we have generated since last call. */
2502 lsn_rate = static_cast<lsn_t>(
2503 static_cast<double>(cur_lsn - prev_lsn)
2504 / time_elapsed);
2505
2506 lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2507
2508
2509 /* aggregate stats of all slots */
2510 mutex_enter(&page_cleaner->mutex);
2511
2512 uint64_t flush_tm = page_cleaner->flush_time;
2513 ulint flush_pass = page_cleaner->flush_pass;
2514
2515 page_cleaner->flush_time = 0;
2516 page_cleaner->flush_pass = 0;
2517
2518 uint64_t lru_tm = 0;
2519 uint64_t list_tm = 0;
2520 ulint lru_pass = 0;
2521 ulint list_pass = 0;
2522
2523 for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2524 page_cleaner_slot_t* slot;
2525
2526 slot = &page_cleaner->slots[i];
2527
2528 lru_tm += slot->flush_lru_time;
2529 lru_pass += slot->flush_lru_pass;
2530 list_tm += slot->flush_list_time;
2531 list_pass += slot->flush_list_pass;
2532
2533 slot->flush_lru_time = 0;
2534 slot->flush_lru_pass = 0;
2535 slot->flush_list_time = 0;
2536 slot->flush_list_pass = 0;
2537 }
2538
2539 mutex_exit(&page_cleaner->mutex);
2540
2541 /* minimum values are 1, to avoid dividing by zero. */
2542 if (lru_tm < 1) {
2543 lru_tm = 1;
2544 }
2545 if (list_tm < 1) {
2546 list_tm = 1;
2547 }
2548 if (flush_tm < 1) {
2549 flush_tm = 1;
2550 }
2551
2552 if (lru_pass < 1) {
2553 lru_pass = 1;
2554 }
2555 if (list_pass < 1) {
2556 list_pass = 1;
2557 }
2558 if (flush_pass < 1) {
2559 flush_pass = 1;
2560 }
2561
2562 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
2563 list_tm / list_pass);
2564 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
2565 lru_tm / lru_pass);
2566
2567 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
2568 list_tm / (srv_n_page_cleaners * flush_pass));
2569 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
2570 lru_tm / (srv_n_page_cleaners * flush_pass));
2571 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
2572 flush_tm * list_tm / flush_pass
2573 / (list_tm + lru_tm));
2574 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
2575 flush_tm * lru_tm / flush_pass
2576 / (list_tm + lru_tm));
2577 MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
2578
2579 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
2580 list_pass / page_cleaner->n_slots);
2581 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
2582 lru_pass / page_cleaner->n_slots);
2583 MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
2584
2585 prev_lsn = cur_lsn;
2586 prev_time = curr_time;
2587
2588 n_iterations = 0;
2589
2590 sum_pages = 0;
2591 }
2592
2593 oldest_lsn = buf_pool_get_oldest_modification();
2594
2595 ut_ad(oldest_lsn <= log_get_lsn());
2596
2597 age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2598
2599 pct_for_dirty = af_get_pct_for_dirty();
2600 pct_for_lsn = af_get_pct_for_lsn(age);
2601
2602 pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2603
2604 /* Estimate pages to be flushed for the lsn progress */
2605 ulint sum_pages_for_lsn = 0;
2606 lsn_t target_lsn = oldest_lsn
2607 + lsn_avg_rate * buf_flush_lsn_scan_factor;
2608
2609 /* Cap the maximum IO capacity that we are going to use by
2610 max_io_capacity. Limit the value to avoid too quick increase */
2611 const ulint sum_pages_max = srv_max_io_capacity * 2;
2612
2613 /* Limit individual BP scan based on overall capacity. */
2614 const ulint pages_for_lsn_max =
2615 (sum_pages_max / srv_buf_pool_instances) *
2616 buf_flush_lsn_scan_factor * 2;
2617
2618 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2619 buf_pool_t* buf_pool = buf_pool_from_array(i);
2620 ulint pages_for_lsn = 0;
2621
2622 buf_flush_list_mutex_enter(buf_pool);
2623 for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list);
2624 b != NULL;
2625 b = UT_LIST_GET_PREV(list, b)) {
2626 if (b->oldest_modification > target_lsn) {
2627 break;
2628 }
2629 ++pages_for_lsn;
2630 if (pages_for_lsn >= pages_for_lsn_max) {
2631 break;
2632 }
2633 }
2634 buf_flush_list_mutex_exit(buf_pool);
2635
2636 sum_pages_for_lsn += pages_for_lsn;
2637
2638 mutex_enter(&page_cleaner->mutex);
2639 ut_ad(page_cleaner->slots[i].state
2640 == PAGE_CLEANER_STATE_NONE);
2641 page_cleaner->slots[i].n_pages_requested
2642 = pages_for_lsn / buf_flush_lsn_scan_factor + 1;
2643 mutex_exit(&page_cleaner->mutex);
2644 }
2645
2646 sum_pages_for_lsn /= buf_flush_lsn_scan_factor;
2647 	if (sum_pages_for_lsn < 1) {
2648 sum_pages_for_lsn = 1;
2649 }
2650
2651 /* Cap the maximum IO capacity that we are going to use by
2652 max_io_capacity. Limit the value to avoid too quick increase */
2653 ulint pages_for_lsn =
2654 std::min<ulint>(sum_pages_for_lsn, sum_pages_max);
2655
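	/* The recommendation is the mean of three signals: the
	percentage of innodb_io_capacity suggested by the dirty-page
	and redo heuristics, the recent average page flush rate, and
	the number of pages needed for the intended LSN progress. */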
2656 n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3;
2657
2658 if (n_pages > srv_max_io_capacity) {
2659 n_pages = srv_max_io_capacity;
2660 }
2661
2662 /* Normalize request for each instance */
2663 mutex_enter(&page_cleaner->mutex);
2664 ut_ad(page_cleaner->n_slots_requested == 0);
2665 ut_ad(page_cleaner->n_slots_flushing == 0);
2666 ut_ad(page_cleaner->n_slots_finished == 0);
2667
2668 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2669 		/* If the redo log has enough free space, do not
2670 		consider the age distribution of the pages. */
2671 page_cleaner->slots[i].n_pages_requested = pct_for_lsn > 30 ?
2672 page_cleaner->slots[i].n_pages_requested
2673 * n_pages / sum_pages_for_lsn + 1
2674 : n_pages / srv_buf_pool_instances;
2675 }
2676 mutex_exit(&page_cleaner->mutex);
2677
2678 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2679
2680 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn);
2681
2682 MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2683 MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2684 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2685 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2686
2687 *lsn_limit = LSN_MAX;
2688
2689 return(n_pages);
2690 }
2691
2692 /*********************************************************************//**
2693 Puts the page_cleaner thread to sleep if it has finished work in less
2694 than a second
2695 @retval 0 wake up by event set,
2696 @retval OS_SYNC_TIME_EXCEEDED if timeout was exceeded
2697 @param next_loop_time time when next loop iteration should start
2698 @param sig_count zero or the value returned by previous call of
2699 os_event_reset() */
2700 static
2701 ulint
2702 pc_sleep_if_needed(
2703 /*===============*/
2704 ib_time_monotonic_ms_t next_loop_time,
2705 int64_t sig_count)
2706 {
2707 ib_time_monotonic_ms_t cur_time = ut_time_monotonic_ms();
2708
2709 if (next_loop_time > cur_time) {
2710 		/* Get the sleep interval in microseconds. We use
2711 ut_min() to avoid long sleep in case of wrap around. */
2712 int64_t sleep_us;
2713
2714 sleep_us = ut_min(int64_t(1000000),
2715 (next_loop_time - cur_time) * int64_t(1000));
2716 ut_a(sleep_us > 0);
2717
2718 return(os_event_wait_time_low(buf_flush_event,
2719 sleep_us, sig_count));
2720 }
2721
2722 return(OS_SYNC_TIME_EXCEEDED);
2723 }
2724
2725 /******************************************************************//**
2726 Initialize page_cleaner. */
2727 void
2728 buf_flush_page_cleaner_init(void)
2729 /*=============================*/
2730 {
2731 ut_ad(page_cleaner == NULL);
2732
2733 page_cleaner = static_cast<page_cleaner_t*>(
2734 ut_zalloc_nokey(sizeof(*page_cleaner)));
2735
2736 mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner->mutex);
2737
2738 page_cleaner->is_requested = os_event_create("pc_is_requested");
2739 page_cleaner->is_finished = os_event_create("pc_is_finished");
2740
2741 page_cleaner->n_slots = static_cast<ulint>(srv_buf_pool_instances);
2742
2743 page_cleaner->slots = static_cast<page_cleaner_slot_t*>(
2744 ut_zalloc_nokey(page_cleaner->n_slots
2745 * sizeof(*page_cleaner->slots)));
2746
2747 ut_d(page_cleaner->n_disabled_debug = 0);
2748
2749 page_cleaner->is_running = true;
2750 }
2751
2752 /**
2753 Close page_cleaner. */
2754 static
2755 void
2756 buf_flush_page_cleaner_close(void)
2757 {
2758 	/* Wait for all worker threads to exit. */
2759 while (page_cleaner->n_workers > 0) {
2760 os_thread_sleep(10000);
2761 }
2762
2763 mutex_destroy(&page_cleaner->mutex);
2764
2765 ut_free(page_cleaner->slots);
2766
2767 os_event_destroy(page_cleaner->is_finished);
2768 os_event_destroy(page_cleaner->is_requested);
2769
2770 ut_free(page_cleaner);
2771
2772 page_cleaner = NULL;
2773 }
2774
2775 /**
2776 Requests that all slots flush their buffer pool instances.
2777 @param min_n	wished minimum number of blocks flushed
2778 (it is not guaranteed that the actual number is that big)
2779 @param lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2780 oldest_modification is smaller than this should be flushed
2781 (if their number does not exceed min_n), otherwise ignored
2782 */
2783 static
2784 void
2785 pc_request(
2786 ulint min_n,
2787 lsn_t lsn_limit)
2788 {
2789 if (min_n != ULINT_MAX) {
2790 /* Ensure that flushing is spread evenly amongst the
2791 buffer pool instances. When min_n is ULINT_MAX
2792 we need to flush everything up to the lsn limit
2793 so no limit here. */
2794 min_n = (min_n + srv_buf_pool_instances - 1)
2795 / srv_buf_pool_instances;
2796 }
2797
2798 mutex_enter(&page_cleaner->mutex);
2799
2800 ut_ad(page_cleaner->n_slots_requested == 0);
2801 ut_ad(page_cleaner->n_slots_flushing == 0);
2802 ut_ad(page_cleaner->n_slots_finished == 0);
2803
2804 page_cleaner->requested = (min_n > 0);
2805 page_cleaner->lsn_limit = lsn_limit;
2806
2807 for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2808 page_cleaner_slot_t* slot = &page_cleaner->slots[i];
2809
2810 ut_ad(slot->state == PAGE_CLEANER_STATE_NONE);
2811
2812 if (min_n == ULINT_MAX) {
2813 slot->n_pages_requested = ULINT_MAX;
2814 } else if (min_n == 0) {
2815 slot->n_pages_requested = 0;
2816 }
2817
2818 /* slot->n_pages_requested was already set by
2819 page_cleaner_flush_pages_recommendation() */
2820
2821 slot->state = PAGE_CLEANER_STATE_REQUESTED;
2822 }
2823
2824 page_cleaner->n_slots_requested = page_cleaner->n_slots;
2825 page_cleaner->n_slots_flushing = 0;
2826 page_cleaner->n_slots_finished = 0;
2827
2828 os_event_set(page_cleaner->is_requested);
2829
2830 mutex_exit(&page_cleaner->mutex);
2831 }
2832
2833 /**
2834 Do flush for one slot.
2835 @return the number of slots which have not been processed yet. */
2836 static
2837 ulint
2838 pc_flush_slot(void)
2839 {
2840 ib_time_monotonic_ms_t lru_tm = 0;
2841 ib_time_monotonic_ms_t list_tm = 0;
2842 int lru_pass = 0;
2843 int list_pass = 0;
2844
2845 mutex_enter(&page_cleaner->mutex);
2846
2847 if (page_cleaner->n_slots_requested > 0) {
2848 page_cleaner_slot_t* slot = NULL;
2849 ulint i;
2850
2851 for (i = 0; i < page_cleaner->n_slots; i++) {
2852 slot = &page_cleaner->slots[i];
2853
2854 if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
2855 break;
2856 }
2857 }
2858
2859 /* slot should be found because
2860 page_cleaner->n_slots_requested > 0 */
2861 ut_a(i < page_cleaner->n_slots);
2862
2863 buf_pool_t* buf_pool = buf_pool_from_array(i);
2864
2865 page_cleaner->n_slots_requested--;
2866 page_cleaner->n_slots_flushing++;
2867 slot->state = PAGE_CLEANER_STATE_FLUSHING;
2868
2869 if (page_cleaner->n_slots_requested == 0) {
2870 os_event_reset(page_cleaner->is_requested);
2871 }
2872
2873 if (!page_cleaner->is_running) {
2874 slot->n_flushed_lru = 0;
2875 slot->n_flushed_list = 0;
2876 goto finish_mutex;
2877 }
2878
2879 mutex_exit(&page_cleaner->mutex);
2880
2881 lru_tm = ut_time_monotonic_ms();
2882
2883 /* Flush pages from end of LRU if required */
2884 slot->n_flushed_lru = buf_flush_LRU_list(buf_pool);
2885
2886 lru_tm = ut_time_monotonic_ms() - lru_tm;
2887 lru_pass++;
2888
2889 if (!page_cleaner->is_running) {
2890 slot->n_flushed_list = 0;
2891 goto finish;
2892 }
2893
2894 /* Flush pages from flush_list if required */
2895 if (page_cleaner->requested) {
2896
2897 list_tm = ut_time_monotonic_ms();
2898
2899 slot->succeeded_list = buf_flush_do_batch(
2900 buf_pool, BUF_FLUSH_LIST,
2901 slot->n_pages_requested,
2902 page_cleaner->lsn_limit,
2903 &slot->n_flushed_list);
2904
2905 list_tm = ut_time_monotonic_ms() - list_tm;
2906 list_pass++;
2907 } else {
2908 slot->n_flushed_list = 0;
2909 slot->succeeded_list = true;
2910 }
2911 finish:
2912 mutex_enter(&page_cleaner->mutex);
2913 finish_mutex:
2914 page_cleaner->n_slots_flushing--;
2915 page_cleaner->n_slots_finished++;
2916 slot->state = PAGE_CLEANER_STATE_FINISHED;
2917
2918 slot->flush_lru_time += lru_tm;
2919 slot->flush_list_time += list_tm;
2920 slot->flush_lru_pass += lru_pass;
2921 slot->flush_list_pass += list_pass;
2922
2923 if (page_cleaner->n_slots_requested == 0
2924 && page_cleaner->n_slots_flushing == 0) {
2925 os_event_set(page_cleaner->is_finished);
2926 }
2927 }
2928
2929 ulint ret = page_cleaner->n_slots_requested;
2930
2931 mutex_exit(&page_cleaner->mutex);
2932
2933 return(ret);
2934 }
2935
2936 /**
2937 Wait until all flush requests are finished.
2938 @param n_flushed_lru number of pages flushed from the end of the LRU list.
2939 @param n_flushed_list number of pages flushed from the end of the
2940 flush_list.
2941 @return true if all flush_list flushing batch were success. */
2942 static
2943 bool
2944 pc_wait_finished(
2945 ulint* n_flushed_lru,
2946 ulint* n_flushed_list)
2947 {
2948 bool all_succeeded = true;
2949
2950 *n_flushed_lru = 0;
2951 *n_flushed_list = 0;
2952
2953 os_event_wait(page_cleaner->is_finished);
2954
2955 mutex_enter(&page_cleaner->mutex);
2956
2957 ut_ad(page_cleaner->n_slots_requested == 0);
2958 ut_ad(page_cleaner->n_slots_flushing == 0);
2959 ut_ad(page_cleaner->n_slots_finished == page_cleaner->n_slots);
2960
2961 for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2962 page_cleaner_slot_t* slot = &page_cleaner->slots[i];
2963
2964 ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED);
2965
2966 *n_flushed_lru += slot->n_flushed_lru;
2967 *n_flushed_list += slot->n_flushed_list;
2968 all_succeeded &= slot->succeeded_list;
2969
2970 slot->state = PAGE_CLEANER_STATE_NONE;
2971
2972 slot->n_pages_requested = 0;
2973 }
2974
2975 page_cleaner->n_slots_finished = 0;
2976
2977 os_event_reset(page_cleaner->is_finished);
2978
2979 mutex_exit(&page_cleaner->mutex);
2980
2981 return(all_succeeded);
2982 }
2983
2984 #ifdef UNIV_LINUX
2985 /**
2986 Set priority for page_cleaner threads.
2987 @param[in] priority priority intended to set
2988 @return true if set as intended */
2989 static
2990 bool
2991 buf_flush_page_cleaner_set_priority(
2992 int priority)
2993 {
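	/* Renice only the calling thread, addressed by its Linux
	thread id, and then read the priority back to confirm that
	the kernel accepted the request. */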
2994 setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
2995 priority);
2996 return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
2997 == priority);
2998 }
2999 #endif /* UNIV_LINUX */
3000
3001 #ifdef UNIV_DEBUG
3002 /** Loop used to disable page cleaner threads. */
3003 static
3004 void
3005 buf_flush_page_cleaner_disabled_loop(void)
3006 {
3007 ut_ad(page_cleaner != NULL);
3008
3009 if (!innodb_page_cleaner_disabled_debug) {
3010 /* We return to avoid entering and exiting mutex. */
3011 return;
3012 }
3013
3014 mutex_enter(&page_cleaner->mutex);
3015 page_cleaner->n_disabled_debug++;
3016 mutex_exit(&page_cleaner->mutex);
3017
3018 while (innodb_page_cleaner_disabled_debug
3019 && srv_shutdown_state == SRV_SHUTDOWN_NONE
3020 && page_cleaner->is_running) {
3021
3022 os_thread_sleep(100000); /* [A] */
3023 }
3024
3025 /* We need to wait for threads exiting here, otherwise we would
3026 encounter problem when we quickly perform following steps:
3027 1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
3028 2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0;
3029 3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
3030 That's because after step 1 this thread could still be sleeping
3031 inside the loop above at [A] and steps 2, 3 could happen before
3032 this thread wakes up from [A]. In such case this thread would
3033 not re-increment n_disabled_debug and we would be waiting for
3034 	it forever in buf_flush_page_cleaner_disabled_debug_update(...).
3035
3036 	Therefore, in step 2, we wait for this thread to exit here. */
3037
3038 mutex_enter(&page_cleaner->mutex);
3039 page_cleaner->n_disabled_debug--;
3040 mutex_exit(&page_cleaner->mutex);
3041 }
3042
3043 /** Disables page cleaner threads (coordinator and workers).
3044 It's used by: SET GLOBAL innodb_page_cleaner_disabled_debug = 1 (0).
3045 @param[in] thd thread handle
3046 @param[in] var pointer to system variable
3047 @param[out] var_ptr where the formal string goes
3048 @param[in] save immediate result from check function */
3049 void
3050 buf_flush_page_cleaner_disabled_debug_update(
3051 THD* thd,
3052 struct st_mysql_sys_var* var,
3053 void* var_ptr,
3054 const void* save)
3055 {
3056 if (page_cleaner == NULL) {
3057 return;
3058 }
3059
3060 if (!*static_cast<const my_bool*>(save)) {
3061 if (!innodb_page_cleaner_disabled_debug) {
3062 return;
3063 }
3064
3065 innodb_page_cleaner_disabled_debug = false;
3066
3067 /* Enable page cleaner threads. */
3068 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3069 mutex_enter(&page_cleaner->mutex);
3070 const ulint n = page_cleaner->n_disabled_debug;
3071 mutex_exit(&page_cleaner->mutex);
3072 /* Check if all threads have been enabled, to avoid
3073 problem when we decide to re-disable them soon. */
3074 if (n == 0) {
3075 break;
3076 }
3077 }
3078 return;
3079 }
3080
3081 if (innodb_page_cleaner_disabled_debug) {
3082 return;
3083 }
3084
3085 innodb_page_cleaner_disabled_debug = true;
3086
3087 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3088 /* Workers are possibly sleeping on is_requested.
3089
3090 We have to wake them, otherwise they could possibly
3091 have never noticed, that they should be disabled,
3092 and we would wait for them here forever.
3093
3094 That's why we have sleep-loop instead of simply
3095 waiting on some disabled_debug_event. */
3096 os_event_set(page_cleaner->is_requested);
3097
3098 mutex_enter(&page_cleaner->mutex);
3099
3100 ut_ad(page_cleaner->n_disabled_debug
3101 <= srv_n_page_cleaners);
3102
3103 if (page_cleaner->n_disabled_debug
3104 == srv_n_page_cleaners) {
3105
3106 mutex_exit(&page_cleaner->mutex);
3107 break;
3108 }
3109
3110 mutex_exit(&page_cleaner->mutex);
3111
3112 os_thread_sleep(100000);
3113 }
3114 }
3115 #endif /* UNIV_DEBUG */
3116
3117 /******************************************************************//**
3118 page_cleaner thread tasked with flushing dirty pages from the buffer
3119 pools. As of now we'll have only one coordinator.
3120 @return a dummy parameter */
3121 extern "C"
3122 os_thread_ret_t
3123 DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(
3124 /*===============================================*/
3125 void* arg MY_ATTRIBUTE((unused)))
3126 /*!< in: a dummy parameter required by
3127 os_thread_create */
3128 {
3129 ib_time_monotonic_t next_loop_time = ut_time_monotonic_ms() + 1000;
3130 ulint n_flushed = 0;
3131 ulint last_activity = srv_get_activity_count();
3132 ulint last_pages = 0;
3133
3134 my_thread_init();
3135
3136 #ifdef UNIV_PFS_THREAD
3137 pfs_register_thread(page_cleaner_thread_key);
3138 #endif /* UNIV_PFS_THREAD */
3139
3140 #ifdef UNIV_DEBUG_THREAD_CREATION
3141 ib::info() << "page_cleaner thread running, id "
3142 << os_thread_pf(os_thread_get_curr_id());
3143 #endif /* UNIV_DEBUG_THREAD_CREATION */
3144
3145 #ifdef UNIV_LINUX
3146 	/* On Linux, a different priority can be set for each thread.
3147 	It is worth trying to set a high priority for the page cleaner threads. */
3148 if (buf_flush_page_cleaner_set_priority(
3149 buf_flush_page_cleaner_priority)) {
3150
3151 ib::info() << "page_cleaner coordinator priority: "
3152 << buf_flush_page_cleaner_priority;
3153 } else {
3154 ib::info() << "If the mysqld execution user is authorized,"
3155 " page cleaner thread priority can be changed."
3156 " See the man page of setpriority().";
3157 }
3158 #endif /* UNIV_LINUX */
3159
3160 buf_page_cleaner_is_active = true;
3161
3162 while (!srv_read_only_mode
3163 && srv_shutdown_state == SRV_SHUTDOWN_NONE
3164 && recv_sys->heap != NULL) {
3165 		/* Handle flushing requests during recovery. */
3166 ulint n_flushed_lru = 0;
3167 ulint n_flushed_list = 0;
3168
3169 os_event_wait(recv_sys->flush_start);
3170
3171 if (srv_shutdown_state != SRV_SHUTDOWN_NONE
3172 || recv_sys->heap == NULL) {
3173 break;
3174 }
3175
3176 switch (recv_sys->flush_type) {
3177 case BUF_FLUSH_LRU:
3178 /* Flush pages from end of LRU if required */
3179 pc_request(0, LSN_MAX);
3180 while (pc_flush_slot() > 0) {}
3181 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3182 break;
3183
3184 case BUF_FLUSH_LIST:
3185 /* Flush all pages */
3186 do {
3187 pc_request(ULINT_MAX, LSN_MAX);
3188 while (pc_flush_slot() > 0) {}
3189 } while (!pc_wait_finished(&n_flushed_lru,
3190 &n_flushed_list));
3191 break;
3192
3193 default:
3194 ut_ad(0);
3195 }
3196
3197 os_event_reset(recv_sys->flush_start);
3198 os_event_set(recv_sys->flush_end);
3199 }
3200
3201 os_event_wait(buf_flush_event);
3202
3203 ulint ret_sleep = 0;
3204 ulint n_evicted = 0;
3205 ulint n_flushed_last = 0;
3206 ulint warn_interval = 1;
3207 ulint warn_count = 0;
3208 int64_t sig_count = os_event_reset(buf_flush_event);
3209
3210 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3211
3212 /* The page_cleaner skips sleep if the server is
3213 idle and there are no pending IOs in the buffer pool
3214 and there is work to do. */
3215 if (srv_check_activity(last_activity)
3216 || buf_get_n_pending_read_ios()
3217 || n_flushed == 0) {
3218
3219 ret_sleep = pc_sleep_if_needed(
3220 next_loop_time, sig_count);
3221
3222 if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
3223 break;
3224 }
3225 } else if (ut_time_monotonic_ms() > next_loop_time) {
3226 ret_sleep = OS_SYNC_TIME_EXCEEDED;
3227 } else {
3228 ret_sleep = 0;
3229 }
3230
3231 sig_count = os_event_reset(buf_flush_event);
3232
3233 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3234 ib_time_monotonic_ms_t curr_time =
3235 ut_time_monotonic_ms();
3236
3237 if (curr_time > next_loop_time + 3000) {
3238 if (warn_count == 0) {
3239 ib::info() << "page_cleaner: 1000ms"
3240 " intended loop took "
3241 << 1000 + curr_time
3242 - next_loop_time
3243 << "ms. The settings might not"
3244 " be optimal. (flushed="
3245 << n_flushed_last
3246 << " and evicted="
3247 << n_evicted
3248 << ", during the time.)";
3249 if (warn_interval > 300) {
3250 warn_interval = 600;
3251 } else {
3252 warn_interval *= 2;
3253 }
3254
3255 warn_count = warn_interval;
3256 } else {
3257 --warn_count;
3258 }
3259 } else {
3260 /* reset counter */
3261 warn_interval = 1;
3262 warn_count = 0;
3263 }
3264
3265 next_loop_time = curr_time + 1000;
3266 n_flushed_last = n_evicted = 0;
3267 }
3268
3269 if (ret_sleep != OS_SYNC_TIME_EXCEEDED
3270 && srv_flush_sync
3271 && buf_flush_sync_lsn > 0) {
3272 /* woke up for flush_sync */
3273 mutex_enter(&page_cleaner->mutex);
3274 lsn_t lsn_limit = buf_flush_sync_lsn;
3275 buf_flush_sync_lsn = 0;
3276 mutex_exit(&page_cleaner->mutex);
3277
3278 /* Request flushing for threads */
3279 pc_request(ULINT_MAX, lsn_limit);
3280
3281 ib_time_monotonic_ms_t tm = ut_time_monotonic_ms();
3282
3283 			/* The coordinator also handles requests. */
3284 while (pc_flush_slot() > 0) {}
3285
3286 /* only coordinator is using these counters,
3287 so no need to protect by lock. */
3288 page_cleaner->flush_time += ut_time_monotonic_ms() - tm;
3289 page_cleaner->flush_pass++;
3290
3291 /* Wait for all slots to be finished */
3292 ulint n_flushed_lru = 0;
3293 ulint n_flushed_list = 0;
3294 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3295
3296 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3297 buf_flush_stats(n_flushed_list, n_flushed_lru);
3298
3299 MONITOR_INC_VALUE_CUMULATIVE(
3300 MONITOR_FLUSH_SYNC_TOTAL_PAGE,
3301 MONITOR_FLUSH_SYNC_COUNT,
3302 MONITOR_FLUSH_SYNC_PAGES,
3303 n_flushed_lru + n_flushed_list);
3304 }
3305
3306 n_flushed = n_flushed_lru + n_flushed_list;
3307
3308 } else if (srv_check_activity(last_activity)) {
3309 ulint n_to_flush;
3310 lsn_t lsn_limit = 0;
3311
3312 /* Estimate pages from flush_list to be flushed */
3313 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3314 last_activity = srv_get_activity_count();
3315 n_to_flush =
3316 page_cleaner_flush_pages_recommendation(
3317 &lsn_limit, last_pages);
3318 } else {
3319 n_to_flush = 0;
3320 }
3321
3322 /* Request flushing for threads */
3323 pc_request(n_to_flush, lsn_limit);
3324
3325 ib_time_monotonic_ms_t tm = ut_time_monotonic_ms();
3326
3327 			/* The coordinator also handles requests. */
3328 while (pc_flush_slot() > 0) {
3329 /* No op */
3330 }
3331
3332 /* only coordinator is using these counters,
3333 so no need to protect by lock. */
3334 page_cleaner->flush_time += ut_time_monotonic_ms() - tm;
3335 			page_cleaner->flush_pass++;
3336
3337 /* Wait for all slots to be finished */
3338 ulint n_flushed_lru = 0;
3339 ulint n_flushed_list = 0;
3340
3341 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3342
3343 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3344 buf_flush_stats(n_flushed_list, n_flushed_lru);
3345 }
3346
3347 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3348 last_pages = n_flushed_list;
3349 }
3350
3351 n_evicted += n_flushed_lru;
3352 n_flushed_last += n_flushed_list;
3353
3354 n_flushed = n_flushed_lru + n_flushed_list;
3355
3356 if (n_flushed_lru) {
3357 MONITOR_INC_VALUE_CUMULATIVE(
3358 MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
3359 MONITOR_LRU_BATCH_FLUSH_COUNT,
3360 MONITOR_LRU_BATCH_FLUSH_PAGES,
3361 n_flushed_lru);
3362 }
3363
3364 if (n_flushed_list) {
3365 MONITOR_INC_VALUE_CUMULATIVE(
3366 MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
3367 MONITOR_FLUSH_ADAPTIVE_COUNT,
3368 MONITOR_FLUSH_ADAPTIVE_PAGES,
3369 n_flushed_list);
3370 }
3371
3372 } else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3373 /* no activity, slept enough */
3374 buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed);
3375
3376 n_flushed_last += n_flushed;
3377
3378 if (n_flushed) {
3379 MONITOR_INC_VALUE_CUMULATIVE(
3380 MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
3381 MONITOR_FLUSH_BACKGROUND_COUNT,
3382 MONITOR_FLUSH_BACKGROUND_PAGES,
3383 n_flushed);
3384
3385 }
3386
3387 } else {
3388 /* no activity, but woken up by event */
3389 n_flushed = 0;
3390 }
3391
3392 ut_d(buf_flush_page_cleaner_disabled_loop());
3393 }
3394
3395 ut_ad(srv_shutdown_state > 0);
3396 if (srv_fast_shutdown == 2
3397 || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
3398 /* In very fast shutdown or when innodb failed to start, we
3399 simulate a crash of the buffer pool. We are not required to do
3400 any flushing. */
3401 goto thread_exit;
3402 }
3403
3404 /* In case of normal and slow shutdown the page_cleaner thread
3405 must wait for all other activity in the server to die down.
3406 Note that we can start flushing the buffer pool as soon as the
3407 server enters shutdown phase but we must stay alive long enough
3408 to ensure that any work done by the master or purge threads is
3409 also flushed.
3410 During shutdown we pass through two stages. In the first stage,
3411 when SRV_SHUTDOWN_CLEANUP is set other threads like the master
3412 and the purge threads may be working as well. We start flushing
3413 the buffer pool but can't be sure that no new pages are being
3414 dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
3415
3416 do {
3417 pc_request(ULINT_MAX, LSN_MAX);
3418
3419 while (pc_flush_slot() > 0) {}
3420
3421 ulint n_flushed_lru = 0;
3422 ulint n_flushed_list = 0;
3423 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3424
3425 n_flushed = n_flushed_lru + n_flushed_list;
3426
3427 /* We sleep only if there are no pages to flush */
3428 if (n_flushed == 0) {
3429 os_thread_sleep(100000);
3430 }
3431 } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
3432
3433 /* At this point all threads including the master and the purge
3434 thread must have been suspended. */
3435 ut_a(srv_get_active_thread_type() == SRV_NONE);
3436 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3437
3438 /* We can now make a final sweep on flushing the buffer pool
3439 and exit after we have cleaned the whole buffer pool.
3440 It is important that we wait for any running batch that has
3441 been triggered by us to finish. Otherwise we can end up
3442 considering end of that batch as a finish of our final
3443 sweep and we'll come out of the loop leaving behind dirty pages
3444 in the flush_list */
3445 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3446 buf_flush_wait_LRU_batch_end();
3447
3448 bool success;
3449
3450 do {
3451 pc_request(ULINT_MAX, LSN_MAX);
3452
3453 while (pc_flush_slot() > 0) {}
3454
3455 ulint n_flushed_lru = 0;
3456 ulint n_flushed_list = 0;
3457 success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3458
3459 n_flushed = n_flushed_lru + n_flushed_list;
3460
3461 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3462 buf_flush_wait_LRU_batch_end();
3463
3464 } while (!success || n_flushed > 0);
3465
3466 /* Some sanity checks */
3467 ut_a(srv_get_active_thread_type() == SRV_NONE);
3468 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3469
3470 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3471 buf_pool_t* buf_pool = buf_pool_from_array(i);
3472 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
3473 }
3474
3475 /* We have lived our life. Time to die. */
3476
3477 thread_exit:
3478 	/* All worker threads are waiting on the is_requested event
3479 	here and no longer access the page_cleaner structure.
3480 	Wake the worker threads up just so that they exit. */
3481 page_cleaner->is_running = false;
3482 os_event_set(page_cleaner->is_requested);
3483
3484 buf_flush_page_cleaner_close();
3485
3486 buf_page_cleaner_is_active = false;
3487
3488 my_thread_end();
3489
3490 /* We count the number of threads in os_thread_exit(). A created
3491 thread should always use that to exit and not use return() to exit. */
3492 os_thread_exit();
3493
3494 OS_THREAD_DUMMY_RETURN;
3495 }
3496
3497 /******************************************************************//**
3498 Worker thread of page_cleaner.
3499 @return a dummy parameter */
3500 extern "C"
3501 os_thread_ret_t
3502 DECLARE_THREAD(buf_flush_page_cleaner_worker)(
3503 /*==========================================*/
3504 void* arg MY_ATTRIBUTE((unused)))
3505 /*!< in: a dummy parameter required by
3506 os_thread_create */
3507 {
3508 my_thread_init();
3509
3510 mutex_enter(&page_cleaner->mutex);
3511 page_cleaner->n_workers++;
3512 mutex_exit(&page_cleaner->mutex);
3513
3514 #ifdef UNIV_LINUX
3515 	/* On Linux, a different priority can be set for each thread.
3516 	It is worth trying to set a high priority for the page cleaner threads. */
3517 if (buf_flush_page_cleaner_set_priority(
3518 buf_flush_page_cleaner_priority)) {
3519
3520 ib::info() << "page_cleaner worker priority: "
3521 << buf_flush_page_cleaner_priority;
3522 }
3523 #endif /* UNIV_LINUX */
3524
3525 while (true) {
3526 os_event_wait(page_cleaner->is_requested);
3527
3528 ut_d(buf_flush_page_cleaner_disabled_loop());
3529
3530 if (!page_cleaner->is_running) {
3531 break;
3532 }
3533
3534 pc_flush_slot();
3535 }
3536
3537 mutex_enter(&page_cleaner->mutex);
3538 page_cleaner->n_workers--;
3539 mutex_exit(&page_cleaner->mutex);
3540
3541 my_thread_end();
3542
3543 os_thread_exit();
3544
3545 OS_THREAD_DUMMY_RETURN;
3546 }
3547
3548 /*******************************************************************//**
3549 Synchronously flush dirty blocks from the end of the flush list of all buffer
3550 pool instances.
3551 NOTE: The calling thread is not allowed to own any latches on pages! */
3552 void
3553 buf_flush_sync_all_buf_pools(void)
3554 /*==============================*/
3555 {
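	/* Keep issuing flush_list batches until every buffer pool
	instance accepts one; a false return means that some instance
	already had a batch of the same type running, so retry. */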
3556 bool success;
3557 do {
3558 success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
3559 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3560 } while (!success);
3561
3562 ut_a(success);
3563 }
3564
3565 /** Request IO burst and wake page_cleaner up.
3566 @param[in] lsn_limit upper limit of LSN to be flushed */
3567 void
3568 buf_flush_request_force(
3569 lsn_t lsn_limit)
3570 {
3571 	/* Advance the target by a multiple of lsn_avg_rate so that it does not become stale. */
3572 lsn_t lsn_target = lsn_limit + lsn_avg_rate * 3;
3573
3574 mutex_enter(&page_cleaner->mutex);
3575 if (lsn_target > buf_flush_sync_lsn) {
3576 buf_flush_sync_lsn = lsn_target;
3577 }
3578 mutex_exit(&page_cleaner->mutex);
3579
3580 os_event_set(buf_flush_event);
3581 }
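/* NOTE: a worked example of the adjustment above: lsn_avg_rate tracks
the recent rate of redo generation (it is maintained by the page cleaner
elsewhere in this file), so with lsn_avg_rate == 2 MB and
lsn_limit == L, the coordinator is asked to flush up to L + 6 MB.
Overshooting by roughly three periods of average redo keeps the target
from being overtaken by new log records before the sync flush finishes. */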
3582 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3583
3584 /** Functor to validate the flush list. */
3585 struct Check {
3586 void operator()(const buf_page_t* elem)
3587 {
3588 ut_a(elem->in_flush_list);
3589 }
3590 };
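/* NOTE: Check is handed to ut_list_validate() below, which is assumed
to invoke the functor once per node while walking buf_pool->flush_list,
so any node that is not flagged in_flush_list fails the assertion in
operator(). */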
3591
3592 /******************************************************************//**
3593 Validates the flush list.
3594 @return TRUE if ok */
3595 static
3596 ibool
3597 buf_flush_validate_low(
3598 /*===================*/
3599 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
3600 {
3601 buf_page_t* bpage;
3602 const ib_rbt_node_t* rnode = NULL;
3603 Check check;
3604
3605 ut_ad(buf_flush_list_mutex_own(buf_pool));
3606
3607 ut_list_validate(buf_pool->flush_list, check);
3608
3609 bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3610
3611 /* If we are in recovery mode i.e.: flush_rbt != NULL
3612 then each block in the flush_list must also be present
3613 in the flush_rbt. */
3614 if (buf_pool->flush_rbt != NULL) {
3615 rnode = rbt_first(buf_pool->flush_rbt);
3616 }
3617
3618 while (bpage != NULL) {
3619 const lsn_t om = bpage->oldest_modification;
3620
3621 ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
3622
3623 ut_ad(bpage->in_flush_list);
3624
3625 /* A page in buf_pool->flush_list can be in
3626 BUF_BLOCK_REMOVE_HASH state. This happens when a page
3627 is in the middle of being relocated. In that case the
3628 original descriptor can have this state and still be
3629 in the flush list waiting to acquire the
3630 buf_pool->flush_list_mutex to complete the relocation. */
3631 ut_a(buf_page_in_file(bpage)
3632 || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
3633 ut_a(om > 0);
3634
3635 if (buf_pool->flush_rbt != NULL) {
3636 buf_page_t** prpage;
3637
3638 ut_a(rnode != NULL);
3639 prpage = rbt_value(buf_page_t*, rnode);
3640
3641 ut_a(*prpage != NULL);
3642 ut_a(*prpage == bpage);
3643 rnode = rbt_next(buf_pool->flush_rbt, rnode);
3644 }
3645
3646 bpage = UT_LIST_GET_NEXT(list, bpage);
3647
3648 ut_a(bpage == NULL || om >= bpage->oldest_modification);
3649 }
3650
3651 /* By this time we must have exhausted the traversal of
3652 flush_rbt (if active) as well. */
3653 ut_a(rnode == NULL);
3654
3655 return(TRUE);
3656 }
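/* NOTE: in summary, buf_flush_validate_low() asserts that every node
(1) is flagged in_flush_list and belongs to this buffer pool instance,
(2) is either a file page or in BUF_BLOCK_REMOVE_HASH while being
relocated, (3) has a non-zero oldest_modification that does not increase
from the list head towards the tail, and (4) during recovery is visited
in exactly the same order by the flush_rbt. */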
3657
3658 /******************************************************************//**
3659 Validates the flush list.
3660 @return TRUE if ok */
3661 ibool
3662 buf_flush_validate(
3663 /*===============*/
3664 buf_pool_t* buf_pool) /*!< buffer pool instance */
3665 {
3666 ibool ret;
3667
3668 buf_flush_list_mutex_enter(buf_pool);
3669
3670 ret = buf_flush_validate_low(buf_pool);
3671
3672 buf_flush_list_mutex_exit(buf_pool);
3673
3674 return(ret);
3675 }
3676 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3677 #endif /* !UNIV_HOTBACKUP */
3678
3679 /******************************************************************//**
3680 Check if there are any dirty pages that belong to a space id in the flush
3681 list in a particular buffer pool.
3682 @return number of dirty pages present in a single buffer pool */
3683 ulint
3684 buf_pool_get_dirty_pages_count(
3685 /*===========================*/
3686 buf_pool_t* buf_pool, /*!< in: buffer pool */
3687 ulint id, /*!< in: space id to check */
3688 FlushObserver* observer) /*!< in: flush observer to check */
3689
3690 {
3691 ulint count = 0;
3692
3693 buf_pool_mutex_enter(buf_pool);
3694 buf_flush_list_mutex_enter(buf_pool);
3695
3696 buf_page_t* bpage;
3697
3698 for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3699 bpage != 0;
3700 bpage = UT_LIST_GET_NEXT(list, bpage)) {
3701
3702 ut_ad(buf_page_in_file(bpage));
3703 ut_ad(bpage->in_flush_list);
3704 ut_ad(bpage->oldest_modification > 0);
3705
3706 if ((observer != NULL
3707 && observer == bpage->flush_observer)
3708 || (observer == NULL
3709 && id == bpage->id.space())) {
3710 ++count;
3711 }
3712 }
3713
3714 buf_flush_list_mutex_exit(buf_pool);
3715 buf_pool_mutex_exit(buf_pool);
3716
3717 return(count);
3718 }
3719
3720 /******************************************************************//**
3721 Check if there are any dirty pages that belong to a space id in the flush list.
3722 @return number of dirty pages present in all the buffer pools */
3723 ulint
3724 buf_flush_get_dirty_pages_count(
3725 /*============================*/
3726 ulint id, /*!< in: space id to check */
3727 FlushObserver* observer) /*!< in: flush observer to check */
3728 {
3729 ulint count = 0;
3730
3731 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3732 buf_pool_t* buf_pool;
3733
3734 buf_pool = buf_pool_from_array(i);
3735
3736 count += buf_pool_get_dirty_pages_count(buf_pool, id, observer);
3737 }
3738
3739 return(count);
3740 }
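/* NOTE on the matching rule used by the two functions above: when an
observer is supplied the space id is ignored and only pages tagged with
that observer are counted; with observer == NULL the count is by space
id alone.  A hypothetical caller counting the dirty pages of space 10
across all instances would therefore pass NULL:

	ulint	n_dirty = buf_flush_get_dirty_pages_count(10, NULL);
*/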
3741
3742 /** FlushObserver constructor
3743 @param[in] space_id table space id
3744 @param[in] trx trx instance
3745 @param[in] stage performance schema accounting object,
3746 used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
3747 for accounting. */
3748 FlushObserver::FlushObserver(
3749 ulint space_id,
3750 trx_t* trx,
3751 ut_stage_alter_t* stage)
3752 :
3753 m_space_id(space_id),
3754 m_trx(trx),
3755 m_stage(stage),
3756 m_interrupted(false)
3757 {
3758 m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
3759 m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
3760
3761 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3762 m_flushed->at(i) = 0;
3763 m_removed->at(i) = 0;
3764 }
3765
3766 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3767 ib::info() << "FlushObserver constructor: " << m_trx->id;
3768 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3769 }
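/* NOTE: a sketch of the assumed lifecycle of a FlushObserver, as used
by bulk loads such as online ALTER TABLE; trx_set_flush_observer() and
the UT_NEW/UT_DELETE pairing shown here are assumptions for
illustration, not code quoted from this file:

	FlushObserver*	observer = UT_NEW_NOKEY(
		FlushObserver(space_id, trx, stage));

	trx_set_flush_observer(trx, observer);	// tag pages dirtied by trx

	//			... bulk load work that dirties pages ...

	observer->flush();	// write out or discard the tagged pages

	trx_set_flush_observer(trx, NULL);
	UT_DELETE(observer);
*/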
3770
3771 /** FlushObserver destructor */
3772 FlushObserver::~FlushObserver()
3773 {
3774 ut_ad(buf_flush_get_dirty_pages_count(m_space_id, this) == 0);
3775
3776 UT_DELETE(m_flushed);
3777 UT_DELETE(m_removed);
3778
3779 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3780 ib::info() << "FlushObserver destructor: " << m_trx->id;
3781 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3782 }
3783
3784 /** Check whether trx is interrupted
3785 @return true if trx is interrupted */
3786 bool
3787 FlushObserver::check_interrupted()
3788 {
3789 if (trx_is_interrupted(m_trx)) {
3790 interrupted();
3791
3792 return(true);
3793 }
3794
3795 return(false);
3796 }
3797
3798 /** Notify observer of a flush
3799 @param[in] buf_pool buffer pool instance
3800 @param[in] bpage buffer page to flush */
3801 void
3802 FlushObserver::notify_flush(
3803 buf_pool_t* buf_pool,
3804 buf_page_t* bpage)
3805 {
3806 ut_ad(buf_pool_mutex_own(buf_pool));
3807
3808 m_flushed->at(buf_pool->instance_no)++;
3809
3810 if (m_stage != NULL) {
3811 m_stage->inc();
3812 }
3813
3814 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3815 ib::info() << "Flush <" << bpage->id.space()
3816 << ", " << bpage->id.page_no() << ">";
3817 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3818 }
3819
3820 /** Notify observer of a remove
3821 @param[in] buf_pool buffer pool instance
3822 @param[in] bpage buffer page flushed */
3823 void
3824 FlushObserver::notify_remove(
3825 buf_pool_t* buf_pool,
3826 buf_page_t* bpage)
3827 {
3828 ut_ad(buf_pool_mutex_own(buf_pool));
3829
3830 m_removed->at(buf_pool->instance_no)++;
3831
3832 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3833 ib::info() << "Remove <" << bpage->id.space()
3834 << ", " << bpage->id.page_no() << ">";
3835 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3836 }
3837
3838 /** Flush dirty pages and wait. */
3839 void
3840 FlushObserver::flush()
3841 {
3842 buf_remove_t buf_remove;
3843
3844 if (m_interrupted) {
3845 buf_remove = BUF_REMOVE_FLUSH_NO_WRITE;
3846 } else {
3847 buf_remove = BUF_REMOVE_FLUSH_WRITE;
3848
3849 if (m_stage != NULL) {
3850 ulint pages_to_flush =
3851 buf_flush_get_dirty_pages_count(
3852 m_space_id, this);
3853
3854 m_stage->begin_phase_flush(pages_to_flush);
3855 }
3856 }
3857
3858 /* Flush or remove dirty pages. */
3859 buf_LRU_flush_or_remove_pages(m_space_id, buf_remove, m_trx);
3860
3861 /* Wait until all dirty pages have been flushed. */
3862 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3863 while (!is_complete(i)) {
3864
3865 os_thread_sleep(2000);
3866 }
3867 }
3868 }
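/* NOTE: the wait loop above simply polls each buffer pool instance
every 2 milliseconds (os_thread_sleep(2000)) until is_complete()
returns true for that instance; the bookkeeping behind is_complete()
is declared in buf0flu.h and is not repeated here.  When the observer
was interrupted, BUF_REMOVE_FLUSH_NO_WRITE is passed above, so the
tagged pages are dropped from the flush lists without being written. */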
3869