1 /*****************************************************************************
2
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2013, 2020, MariaDB Corporation.
5 Copyright (c) 2013, 2014, Fusion-io
6
7 This program is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free Software
9 Foundation; version 2 of the License.
10
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License along with
16 this program; if not, write to the Free Software Foundation, Inc.,
17 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
18
19 *****************************************************************************/
20
21 /**************************************************//**
22 @file buf/buf0flu.cc
23 The database buffer buf_pool flush algorithm
24
25 Created 11/11/1995 Heikki Tuuri
26 *******************************************************/
27
28 #include "univ.i"
29 #include <mysql/service_thd_wait.h>
30 #include <sql_class.h>
31
32 #include "buf0flu.h"
33 #include "buf0buf.h"
34 #include "buf0checksum.h"
35 #include "srv0start.h"
36 #include "srv0srv.h"
37 #include "page0zip.h"
38 #include "ut0byte.h"
39 #include "page0page.h"
40 #include "fil0fil.h"
41 #include "buf0lru.h"
42 #include "buf0rea.h"
43 #include "ibuf0ibuf.h"
44 #include "log0log.h"
45 #include "os0file.h"
46 #include "trx0sys.h"
47 #include "srv0mon.h"
48 #include "ut0stage.h"
49 #include "fil0pagecompress.h"
50 #ifdef UNIV_LINUX
51 /* include defs for CPU time priority settings */
52 #include <unistd.h>
53 #include <sys/syscall.h>
54 #include <sys/time.h>
55 #include <sys/resource.h>
56 static const int buf_flush_page_cleaner_priority = -20;
57 #endif /* UNIV_LINUX */
58
59 /** Sleep time in microseconds for loop waiting for the oldest
60 modification lsn */
61 static const ulint buf_flush_wait_flushed_sleep_time = 10000;
62
63 #include <my_service_manager.h>
64
65 /** Number of pages flushed through non flush_list flushes. */
66 static ulint buf_lru_flush_page_count = 0;
67
68 /** Flag indicating if the page_cleaner is in active state. This flag
69 is set to TRUE by the page_cleaner thread when it is spawned and is set
70 back to FALSE at shutdown by the page_cleaner as well. Therefore no
71 need to protect it by a mutex. It is only ever read by the thread
72 doing the shutdown */
73 bool buf_page_cleaner_is_active;
74
75 /** Factor for scan length to determine n_pages for intended oldest LSN
76 progress */
77 static ulint buf_flush_lsn_scan_factor = 3;
78
79 /** Average redo generation rate */
80 static lsn_t lsn_avg_rate = 0;
81
82 /** Target oldest LSN for the requested flush_sync */
83 static lsn_t buf_flush_sync_lsn = 0;
84
85 #ifdef UNIV_PFS_THREAD
86 mysql_pfs_key_t page_cleaner_thread_key;
87 #endif /* UNIV_PFS_THREAD */
88
89 /** Event to synchronise with the flushing. */
90 os_event_t buf_flush_event;
91
92 /** State for page cleaner array slot */
93 enum page_cleaner_state_t {
94 /** Not requested any yet.
95 Moved from FINISHED by the coordinator. */
96 PAGE_CLEANER_STATE_NONE = 0,
97 /** Requested but not started flushing.
98 Moved from NONE by the coordinator. */
99 PAGE_CLEANER_STATE_REQUESTED,
100 /** Flushing is on going.
101 Moved from REQUESTED by the worker. */
102 PAGE_CLEANER_STATE_FLUSHING,
103 /** Flushing was finished.
104 Moved from FLUSHING by the worker. */
105 PAGE_CLEANER_STATE_FINISHED
106 };
107
108 /** Page cleaner request state for each buffer pool instance */
109 struct page_cleaner_slot_t {
110 page_cleaner_state_t state; /*!< state of the request.
111 protected by page_cleaner_t::mutex
112 if the worker thread got the slot and
113 set to PAGE_CLEANER_STATE_FLUSHING,
114 n_flushed_lru and n_flushed_list can be
115 updated only by the worker thread */
116 /* This value is set during state==PAGE_CLEANER_STATE_NONE */
117 ulint n_pages_requested;
118 /*!< number of requested pages
119 for the slot */
120 /* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
121 and commited with state==PAGE_CLEANER_STATE_FINISHED.
122 The consistency is protected by the 'state' */
123 ulint n_flushed_lru;
124 /*!< number of flushed pages
125 by LRU scan flushing */
126 ulint n_flushed_list;
127 /*!< number of flushed pages
128 by flush_list flushing */
129 bool succeeded_list;
130 /*!< true if flush_list flushing
131 succeeded. */
132 ulint flush_lru_time;
133 /*!< elapsed time for LRU flushing */
134 ulint flush_list_time;
135 /*!< elapsed time for flush_list
136 flushing */
137 ulint flush_lru_pass;
138 /*!< count to attempt LRU flushing */
139 ulint flush_list_pass;
140 /*!< count to attempt flush_list
141 flushing */
142 };
143
144 /** Page cleaner structure common for all threads */
145 struct page_cleaner_t {
146 ib_mutex_t mutex; /*!< mutex to protect whole of
147 page_cleaner_t struct and
148 page_cleaner_slot_t slots. */
149 os_event_t is_requested; /*!< event to activate worker
150 threads. */
151 os_event_t is_finished; /*!< event to signal that all
152 slots were finished. */
153 os_event_t is_started; /*!< event to signal that
154 thread is started/exiting */
155 volatile ulint n_workers; /*!< number of worker threads
156 in existence */
157 bool requested; /*!< true if requested pages
158 to flush */
159 lsn_t lsn_limit; /*!< upper limit of LSN to be
160 flushed */
161 ulint n_slots; /*!< total number of slots */
162 ulint n_slots_requested;
163 /*!< number of slots
164 in the state
165 PAGE_CLEANER_STATE_REQUESTED */
166 ulint n_slots_flushing;
167 /*!< number of slots
168 in the state
169 PAGE_CLEANER_STATE_FLUSHING */
170 ulint n_slots_finished;
171 /*!< number of slots
172 in the state
173 PAGE_CLEANER_STATE_FINISHED */
174 ulint flush_time; /*!< elapsed time to flush
175 requests for all slots */
176 ulint flush_pass; /*!< count to finish to flush
177 requests for all slots */
178 page_cleaner_slot_t slots[MAX_BUFFER_POOLS];
179 bool is_running; /*!< false if attempt
180 to shutdown */
181
182 #ifdef UNIV_DEBUG
183 ulint n_disabled_debug;
184 /*<! how many of pc threads
185 have been disabled */
186 #endif /* UNIV_DEBUG */
187 };
188
189 static page_cleaner_t page_cleaner;
190
191 #ifdef UNIV_DEBUG
192 my_bool innodb_page_cleaner_disabled_debug;
193 #endif /* UNIV_DEBUG */
194
195 /* @} */
196
197 /******************************************************************//**
198 Increases flush_list size in bytes with the page size in inline function */
199 static inline
200 void
incr_flush_list_size_in_bytes(buf_block_t * block,buf_pool_t * buf_pool)201 incr_flush_list_size_in_bytes(
202 /*==========================*/
203 buf_block_t* block, /*!< in: control block */
204 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
205 {
206 ut_ad(buf_flush_list_mutex_own(buf_pool));
207
208 buf_pool->stat.flush_list_bytes += block->page.size.physical();
209
210 ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
211 }
212
213 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
214 /******************************************************************//**
215 Validates the flush list.
216 @return TRUE if ok */
217 static
218 ibool
219 buf_flush_validate_low(
220 /*===================*/
221 buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
222
223 /******************************************************************//**
224 Validates the flush list some of the time.
225 @return TRUE if ok or the check was skipped */
226 static
227 ibool
buf_flush_validate_skip(buf_pool_t * buf_pool)228 buf_flush_validate_skip(
229 /*====================*/
230 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
231 {
232 /** Try buf_flush_validate_low() every this many times */
233 # define BUF_FLUSH_VALIDATE_SKIP 23
234
235 /** The buf_flush_validate_low() call skip counter.
236 Use a signed type because of the race condition below. */
237 static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
238
239 /* There is a race condition below, but it does not matter,
240 because this call is only for heuristic purposes. We want to
241 reduce the call frequency of the costly buf_flush_validate_low()
242 check in debug builds. */
243 if (--buf_flush_validate_count > 0) {
244 return(TRUE);
245 }
246
247 buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
248 return(buf_flush_validate_low(buf_pool));
249 }
250 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
251
252 /******************************************************************//**
253 Insert a block in the flush_rbt and returns a pointer to its
254 predecessor or NULL if no predecessor. The ordering is maintained
255 on the basis of the <oldest_modification, space, offset> key.
256 @return pointer to the predecessor or NULL if no predecessor. */
257 static
258 buf_page_t*
buf_flush_insert_in_flush_rbt(buf_page_t * bpage)259 buf_flush_insert_in_flush_rbt(
260 /*==========================*/
261 buf_page_t* bpage) /*!< in: bpage to be inserted. */
262 {
263 const ib_rbt_node_t* c_node;
264 const ib_rbt_node_t* p_node;
265 buf_page_t* prev = NULL;
266 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
267
268 ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
269 ut_ad(buf_flush_list_mutex_own(buf_pool));
270
271 /* Insert this buffer into the rbt. */
272 c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
273 ut_a(c_node != NULL);
274
275 /* Get the predecessor. */
276 p_node = rbt_prev(buf_pool->flush_rbt, c_node);
277
278 if (p_node != NULL) {
279 buf_page_t** value;
280 value = rbt_value(buf_page_t*, p_node);
281 prev = *value;
282 ut_a(prev != NULL);
283 }
284
285 return(prev);
286 }
287
288 /*********************************************************//**
289 Delete a bpage from the flush_rbt. */
290 static
291 void
buf_flush_delete_from_flush_rbt(buf_page_t * bpage)292 buf_flush_delete_from_flush_rbt(
293 /*============================*/
294 buf_page_t* bpage) /*!< in: bpage to be removed. */
295 {
296 #ifdef UNIV_DEBUG
297 ibool ret = FALSE;
298 #endif /* UNIV_DEBUG */
299 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
300
301 ut_ad(buf_flush_list_mutex_own(buf_pool));
302
303 #ifdef UNIV_DEBUG
304 ret =
305 #endif /* UNIV_DEBUG */
306 rbt_delete(buf_pool->flush_rbt, &bpage);
307
308 ut_ad(ret);
309 }
310
311 /*****************************************************************//**
312 Compare two modified blocks in the buffer pool. The key for comparison
313 is:
314 key = <oldest_modification, space, offset>
315 This comparison is used to maintian ordering of blocks in the
316 buf_pool->flush_rbt.
317 Note that for the purpose of flush_rbt, we only need to order blocks
318 on the oldest_modification. The other two fields are used to uniquely
319 identify the blocks.
320 @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
321 static
322 int
buf_flush_block_cmp(const void * p1,const void * p2)323 buf_flush_block_cmp(
324 /*================*/
325 const void* p1, /*!< in: block1 */
326 const void* p2) /*!< in: block2 */
327 {
328 int ret;
329 const buf_page_t* b1 = *(const buf_page_t**) p1;
330 const buf_page_t* b2 = *(const buf_page_t**) p2;
331
332 ut_ad(b1 != NULL);
333 ut_ad(b2 != NULL);
334
335 #ifdef UNIV_DEBUG
336 buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
337 #endif /* UNIV_DEBUG */
338
339 ut_ad(buf_flush_list_mutex_own(buf_pool));
340
341 ut_ad(b1->in_flush_list);
342 ut_ad(b2->in_flush_list);
343
344 if (b2->oldest_modification > b1->oldest_modification) {
345 return(1);
346 } else if (b2->oldest_modification < b1->oldest_modification) {
347 return(-1);
348 }
349
350 /* If oldest_modification is same then decide on the space. */
351 ret = (int)(b2->id.space() - b1->id.space());
352
353 /* Or else decide ordering on the page number. */
354 return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no()));
355 }
356
357 /********************************************************************//**
358 Initialize the red-black tree to speed up insertions into the flush_list
359 during recovery process. Should be called at the start of recovery
360 process before any page has been read/written. */
361 void
buf_flush_init_flush_rbt(void)362 buf_flush_init_flush_rbt(void)
363 /*==========================*/
364 {
365 ulint i;
366
367 for (i = 0; i < srv_buf_pool_instances; i++) {
368 buf_pool_t* buf_pool;
369
370 buf_pool = buf_pool_from_array(i);
371
372 buf_flush_list_mutex_enter(buf_pool);
373
374 ut_ad(buf_pool->flush_rbt == NULL);
375
376 /* Create red black tree for speedy insertions in flush list. */
377 buf_pool->flush_rbt = rbt_create(
378 sizeof(buf_page_t*), buf_flush_block_cmp);
379
380 buf_flush_list_mutex_exit(buf_pool);
381 }
382 }
383
384 /********************************************************************//**
385 Frees up the red-black tree. */
386 void
buf_flush_free_flush_rbt(void)387 buf_flush_free_flush_rbt(void)
388 /*==========================*/
389 {
390 ulint i;
391
392 for (i = 0; i < srv_buf_pool_instances; i++) {
393 buf_pool_t* buf_pool;
394
395 buf_pool = buf_pool_from_array(i);
396
397 buf_flush_list_mutex_enter(buf_pool);
398
399 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
400 ut_a(buf_flush_validate_low(buf_pool));
401 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
402
403 rbt_free(buf_pool->flush_rbt);
404 buf_pool->flush_rbt = NULL;
405
406 buf_flush_list_mutex_exit(buf_pool);
407 }
408 }
409
410 /********************************************************************//**
411 Inserts a modified block into the flush list. */
412 void
buf_flush_insert_into_flush_list(buf_pool_t * buf_pool,buf_block_t * block,lsn_t lsn)413 buf_flush_insert_into_flush_list(
414 /*=============================*/
415 buf_pool_t* buf_pool, /*!< buffer pool instance */
416 buf_block_t* block, /*!< in/out: block which is modified */
417 lsn_t lsn) /*!< in: oldest modification */
418 {
419 ut_ad(!buf_pool_mutex_own(buf_pool));
420 ut_ad(log_flush_order_mutex_own());
421 ut_ad(buf_page_mutex_own(block));
422
423 buf_flush_list_mutex_enter(buf_pool);
424
425 ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
426 || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
427 <= lsn));
428
429 /* If we are in the recovery then we need to update the flush
430 red-black tree as well. */
431 if (buf_pool->flush_rbt != NULL) {
432 buf_flush_list_mutex_exit(buf_pool);
433 buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
434 return;
435 }
436
437 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
438 ut_ad(!block->page.in_flush_list);
439
440 ut_d(block->page.in_flush_list = TRUE);
441 block->page.oldest_modification = lsn;
442
443 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
444
445 incr_flush_list_size_in_bytes(block, buf_pool);
446
447 MEM_CHECK_DEFINED(block->page.size.is_compressed()
448 ? block->page.zip.data : block->frame,
449 block->page.size.physical());
450 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
451 ut_a(buf_flush_validate_skip(buf_pool));
452 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
453
454 buf_flush_list_mutex_exit(buf_pool);
455 }
456
457 /********************************************************************//**
458 Inserts a modified block into the flush list in the right sorted position.
459 This function is used by recovery, because there the modifications do not
460 necessarily come in the order of lsn's. */
461 void
buf_flush_insert_sorted_into_flush_list(buf_pool_t * buf_pool,buf_block_t * block,lsn_t lsn)462 buf_flush_insert_sorted_into_flush_list(
463 /*====================================*/
464 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
465 buf_block_t* block, /*!< in/out: block which is modified */
466 lsn_t lsn) /*!< in: oldest modification */
467 {
468 buf_page_t* prev_b;
469 buf_page_t* b;
470
471 ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
472 ut_ad(!buf_pool_mutex_own(buf_pool));
473 ut_ad(log_flush_order_mutex_own());
474 ut_ad(buf_page_mutex_own(block));
475 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
476
477 buf_flush_list_mutex_enter(buf_pool);
478
479 /* The field in_LRU_list is protected by buf_pool->mutex, which
480 we are not holding. However, while a block is in the flush
481 list, it is dirty and cannot be discarded, not from the
482 page_hash or from the LRU list. At most, the uncompressed
483 page frame of a compressed block may be discarded or created
484 (copying the block->page to or from a buf_page_t that is
485 dynamically allocated from buf_buddy_alloc()). Because those
486 transitions hold block->mutex and the flush list mutex (via
487 buf_flush_relocate_on_flush_list()), there is no possibility
488 of a race condition in the assertions below. */
489 ut_ad(block->page.in_LRU_list);
490 ut_ad(block->page.in_page_hash);
491 /* buf_buddy_block_register() will take a block in the
492 BUF_BLOCK_MEMORY state, not a file page. */
493 ut_ad(!block->page.in_zip_hash);
494
495 ut_ad(!block->page.in_flush_list);
496 ut_d(block->page.in_flush_list = TRUE);
497 block->page.oldest_modification = lsn;
498
499 MEM_CHECK_DEFINED(block->page.size.is_compressed()
500 ? block->page.zip.data : block->frame,
501 block->page.size.physical());
502
503 prev_b = NULL;
504
505 /* For the most part when this function is called the flush_rbt
506 should not be NULL. In a very rare boundary case it is possible
507 that the flush_rbt has already been freed by the recovery thread
508 before the last page was hooked up in the flush_list by the
509 io-handler thread. In that case we'll just do a simple
510 linear search in the else block. */
511 if (buf_pool->flush_rbt != NULL) {
512
513 prev_b = buf_flush_insert_in_flush_rbt(&block->page);
514
515 } else {
516
517 b = UT_LIST_GET_FIRST(buf_pool->flush_list);
518
519 while (b != NULL && b->oldest_modification
520 > block->page.oldest_modification) {
521
522 ut_ad(b->in_flush_list);
523 prev_b = b;
524 b = UT_LIST_GET_NEXT(list, b);
525 }
526 }
527
528 if (prev_b == NULL) {
529 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
530 } else {
531 UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page);
532 }
533
534 incr_flush_list_size_in_bytes(block, buf_pool);
535
536 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
537 ut_a(buf_flush_validate_low(buf_pool));
538 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
539
540 buf_flush_list_mutex_exit(buf_pool);
541 }
542
543 /********************************************************************//**
544 Returns TRUE if the file page block is immediately suitable for replacement,
545 i.e., the transition FILE_PAGE => NOT_USED allowed.
546 @return TRUE if can replace immediately */
547 ibool
buf_flush_ready_for_replace(buf_page_t * bpage)548 buf_flush_ready_for_replace(
549 /*========================*/
550 buf_page_t* bpage) /*!< in: buffer control block, must be
551 buf_page_in_file(bpage) and in the LRU list */
552 {
553 #ifdef UNIV_DEBUG
554 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
555 ut_ad(buf_pool_mutex_own(buf_pool));
556 #endif /* UNIV_DEBUG */
557 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
558 ut_ad(bpage->in_LRU_list);
559 ut_a(buf_page_in_file(bpage));
560
561 return bpage->oldest_modification == 0
562 && bpage->buf_fix_count == 0
563 && buf_page_get_io_fix(bpage) == BUF_IO_NONE;
564 }
565
566 /********************************************************************//**
567 Returns true if the block is modified and ready for flushing.
568 @return true if can flush immediately */
569 bool
buf_flush_ready_for_flush(buf_page_t * bpage,buf_flush_t flush_type)570 buf_flush_ready_for_flush(
571 /*======================*/
572 buf_page_t* bpage, /*!< in: buffer control block, must be
573 buf_page_in_file(bpage) */
574 buf_flush_t flush_type)/*!< in: type of flush */
575 {
576 #ifdef UNIV_DEBUG
577 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
578 ut_ad(buf_pool_mutex_own(buf_pool));
579 #endif /* UNIV_DEBUG */
580
581 ut_a(buf_page_in_file(bpage));
582 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
583 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
584
585 if (bpage->oldest_modification == 0
586 || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
587 return(false);
588 }
589
590 ut_ad(bpage->in_flush_list);
591
592 switch (flush_type) {
593 case BUF_FLUSH_LIST:
594 case BUF_FLUSH_LRU:
595 case BUF_FLUSH_SINGLE_PAGE:
596 return(true);
597
598 case BUF_FLUSH_N_TYPES:
599 break;
600 }
601
602 ut_error;
603 return(false);
604 }
605
606 /********************************************************************//**
607 Remove a block from the flush list of modified blocks. */
608 void
buf_flush_remove(buf_page_t * bpage)609 buf_flush_remove(
610 /*=============*/
611 buf_page_t* bpage) /*!< in: pointer to the block in question */
612 {
613 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
614
615 #if 0 // FIXME: Rate-limit the output. Move this to the page cleaner?
616 if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)) {
617 service_manager_extend_timeout(
618 INNODB_EXTEND_TIMEOUT_INTERVAL,
619 "Flush and remove page with tablespace id %u"
620 ", Poolid " ULINTPF ", flush list length " ULINTPF,
621 bpage->space, buf_pool->instance_no,
622 UT_LIST_GET_LEN(buf_pool->flush_list));
623 }
624 #endif
625
626 ut_ad(buf_pool_mutex_own(buf_pool));
627 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
628 ut_ad(bpage->in_flush_list);
629
630 buf_flush_list_mutex_enter(buf_pool);
631
632 /* Important that we adjust the hazard pointer before removing
633 the bpage from flush list. */
634 buf_pool->flush_hp.adjust(bpage);
635
636 switch (buf_page_get_state(bpage)) {
637 case BUF_BLOCK_POOL_WATCH:
638 case BUF_BLOCK_ZIP_PAGE:
639 /* Clean compressed pages should not be on the flush list */
640 case BUF_BLOCK_NOT_USED:
641 case BUF_BLOCK_READY_FOR_USE:
642 case BUF_BLOCK_MEMORY:
643 case BUF_BLOCK_REMOVE_HASH:
644 ut_error;
645 return;
646 case BUF_BLOCK_ZIP_DIRTY:
647 buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
648 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
649 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
650 buf_LRU_insert_zip_clean(bpage);
651 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
652 break;
653 case BUF_BLOCK_FILE_PAGE:
654 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
655 break;
656 }
657
658 /* If the flush_rbt is active then delete from there as well. */
659 if (buf_pool->flush_rbt != NULL) {
660 buf_flush_delete_from_flush_rbt(bpage);
661 }
662
663 /* Must be done after we have removed it from the flush_rbt
664 because we assert on in_flush_list in comparison function. */
665 ut_d(bpage->in_flush_list = FALSE);
666
667 buf_pool->stat.flush_list_bytes -= bpage->size.physical();
668
669 bpage->oldest_modification = 0;
670
671 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
672 ut_a(buf_flush_validate_skip(buf_pool));
673 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
674
675 /* If there is an observer that want to know if the asynchronous
676 flushing was done then notify it. */
677 if (bpage->flush_observer != NULL) {
678 bpage->flush_observer->notify_remove(buf_pool, bpage);
679
680 bpage->flush_observer = NULL;
681 }
682
683 buf_flush_list_mutex_exit(buf_pool);
684 }
685
686 /*******************************************************************//**
687 Relocates a buffer control block on the flush_list.
688 Note that it is assumed that the contents of bpage have already been
689 copied to dpage.
690 IMPORTANT: When this function is called bpage and dpage are not
691 exact copies of each other. For example, they both will have different
692 ::state. Also the ::list pointers in dpage may be stale. We need to
693 use the current list node (bpage) to do the list manipulation because
694 the list pointers could have changed between the time that we copied
695 the contents of bpage to the dpage and the flush list manipulation
696 below. */
697 void
buf_flush_relocate_on_flush_list(buf_page_t * bpage,buf_page_t * dpage)698 buf_flush_relocate_on_flush_list(
699 /*=============================*/
700 buf_page_t* bpage, /*!< in/out: control block being moved */
701 buf_page_t* dpage) /*!< in/out: destination block */
702 {
703 buf_page_t* prev;
704 buf_page_t* prev_b = NULL;
705 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
706
707 ut_ad(buf_pool_mutex_own(buf_pool));
708 /* Must reside in the same buffer pool. */
709 ut_ad(buf_pool == buf_pool_from_bpage(dpage));
710
711 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
712
713 buf_flush_list_mutex_enter(buf_pool);
714
715 /* FIXME: At this point we have both buf_pool and flush_list
716 mutexes. Theoretically removal of a block from flush list is
717 only covered by flush_list mutex but currently we do
718 have buf_pool mutex in buf_flush_remove() therefore this block
719 is guaranteed to be in the flush list. We need to check if
720 this will work without the assumption of block removing code
721 having the buf_pool mutex. */
722 ut_ad(bpage->in_flush_list);
723 ut_ad(dpage->in_flush_list);
724
725 /* If recovery is active we must swap the control blocks in
726 the flush_rbt as well. */
727 if (buf_pool->flush_rbt != NULL) {
728 buf_flush_delete_from_flush_rbt(bpage);
729 prev_b = buf_flush_insert_in_flush_rbt(dpage);
730 }
731
732 /* Important that we adjust the hazard pointer before removing
733 the bpage from the flush list. */
734 buf_pool->flush_hp.adjust(bpage);
735
736 /* Must be done after we have removed it from the flush_rbt
737 because we assert on in_flush_list in comparison function. */
738 ut_d(bpage->in_flush_list = FALSE);
739
740 prev = UT_LIST_GET_PREV(list, bpage);
741 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
742
743 if (prev) {
744 ut_ad(prev->in_flush_list);
745 UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage);
746 } else {
747 UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage);
748 }
749
750 /* Just an extra check. Previous in flush_list
751 should be the same control block as in flush_rbt. */
752 ut_a(buf_pool->flush_rbt == NULL || prev_b == prev);
753
754 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
755 ut_a(buf_flush_validate_low(buf_pool));
756 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
757
758 buf_flush_list_mutex_exit(buf_pool);
759 }
760
761 /** Update the flush system data structures when a write is completed.
762 @param[in,out] bpage flushed page
763 @param[in] dblwr whether the doublewrite buffer was used */
buf_flush_write_complete(buf_page_t * bpage,bool dblwr)764 void buf_flush_write_complete(buf_page_t* bpage, bool dblwr)
765 {
766 buf_flush_t flush_type;
767 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
768
769 ut_ad(bpage);
770
771 buf_flush_remove(bpage);
772
773 flush_type = buf_page_get_flush_type(bpage);
774 buf_pool->n_flush[flush_type]--;
775 ut_ad(buf_pool->n_flush[flush_type] != ULINT_MAX);
776
777 ut_ad(buf_pool_mutex_own(buf_pool));
778
779 if (buf_pool->n_flush[flush_type] == 0
780 && buf_pool->init_flush[flush_type] == FALSE) {
781
782 /* The running flush batch has ended */
783
784 os_event_set(buf_pool->no_flush[flush_type]);
785 }
786
787 if (dblwr) {
788 buf_dblwr_update(bpage, flush_type);
789 }
790 }
791
792 /** Calculate the checksum of a page from compressed table and update
793 the page.
794 @param[in,out] page page to update
795 @param[in] size compressed page size
796 @param[in] lsn LSN to stamp on the page */
797 void
buf_flush_update_zip_checksum(buf_frame_t * page,ulint size,lsn_t lsn)798 buf_flush_update_zip_checksum(
799 buf_frame_t* page,
800 ulint size,
801 lsn_t lsn)
802 {
803 ut_a(size > 0);
804
805 const uint32_t checksum = page_zip_calc_checksum(
806 page, size,
807 static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
808
809 mach_write_to_8(page + FIL_PAGE_LSN, lsn);
810 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
811 }
812
813 /** Initialize a page for writing to the tablespace.
814 @param[in] block buffer block; NULL if bypassing the buffer pool
815 @param[in,out] page page frame
816 @param[in,out] page_zip_ compressed page, or NULL if uncompressed
817 @param[in] newest_lsn newest modification LSN to the page */
818 void
buf_flush_init_for_writing(const buf_block_t * block,byte * page,void * page_zip_,lsn_t newest_lsn)819 buf_flush_init_for_writing(
820 const buf_block_t* block,
821 byte* page,
822 void* page_zip_,
823 lsn_t newest_lsn)
824 {
825 ut_ad(block == NULL || block->frame == page);
826 ut_ad(block == NULL || page_zip_ == NULL
827 || &block->page.zip == page_zip_);
828 ut_ad(!block || newest_lsn);
829 ut_ad(page);
830 #if 0 /* MDEV-15528 TODO: reinstate this check */
831 /* innodb_immediate_scrub_data_uncompressed=ON would cause
832 fsp_init_file_page() to be called on freed pages, and thus
833 cause them to be written as almost-all-zeroed.
834 In MDEV-15528 we should change that implement an option to
835 make freed pages appear all-zero, bypassing this code. */
836 ut_ad(!newest_lsn || fil_page_get_type(page));
837 #endif
838
839 if (page_zip_) {
840 page_zip_des_t* page_zip;
841 ulint size;
842
843 page_zip = static_cast<page_zip_des_t*>(page_zip_);
844 size = page_zip_get_size(page_zip);
845
846 ut_ad(size);
847 ut_ad(ut_is_2pow(size));
848 ut_ad(size <= UNIV_ZIP_SIZE_MAX);
849
850 switch (fil_page_get_type(page)) {
851 case FIL_PAGE_TYPE_ALLOCATED:
852 case FIL_PAGE_INODE:
853 case FIL_PAGE_IBUF_BITMAP:
854 case FIL_PAGE_TYPE_FSP_HDR:
855 case FIL_PAGE_TYPE_XDES:
856 /* These are essentially uncompressed pages. */
857 memcpy(page_zip->data, page, size);
858 /* fall through */
859 case FIL_PAGE_TYPE_ZBLOB:
860 case FIL_PAGE_TYPE_ZBLOB2:
861 case FIL_PAGE_INDEX:
862 case FIL_PAGE_RTREE:
863
864 buf_flush_update_zip_checksum(
865 page_zip->data, size, newest_lsn);
866
867 return;
868 }
869
870 ib::error() << "The compressed page to be written"
871 " seems corrupt:";
872 ut_print_buf(stderr, page, size);
873 fputs("\nInnoDB: Possibly older version of the page:", stderr);
874 ut_print_buf(stderr, page_zip->data, size);
875 putc('\n', stderr);
876 ut_error;
877 }
878
879 /* Write the newest modification lsn to the page header and trailer */
880 mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
881
882 mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
883 newest_lsn);
884
885 if (block && srv_page_size == 16384) {
886 /* The page type could be garbage in old files
887 created before MySQL 5.5. Such files always
888 had a page size of 16 kilobytes. */
889 ulint page_type = fil_page_get_type(page);
890 ulint reset_type = page_type;
891
892 switch (block->page.id.page_no() % 16384) {
893 case 0:
894 reset_type = block->page.id.page_no() == 0
895 ? FIL_PAGE_TYPE_FSP_HDR
896 : FIL_PAGE_TYPE_XDES;
897 break;
898 case 1:
899 reset_type = FIL_PAGE_IBUF_BITMAP;
900 break;
901 case FSP_TRX_SYS_PAGE_NO:
902 if (block->page.id.page_no()
903 == TRX_SYS_PAGE_NO
904 && block->page.id.space()
905 == TRX_SYS_SPACE) {
906 reset_type = FIL_PAGE_TYPE_TRX_SYS;
907 break;
908 }
909 /* fall through */
910 default:
911 switch (page_type) {
912 case FIL_PAGE_INDEX:
913 case FIL_PAGE_TYPE_INSTANT:
914 case FIL_PAGE_RTREE:
915 case FIL_PAGE_UNDO_LOG:
916 case FIL_PAGE_INODE:
917 case FIL_PAGE_IBUF_FREE_LIST:
918 case FIL_PAGE_TYPE_ALLOCATED:
919 case FIL_PAGE_TYPE_SYS:
920 case FIL_PAGE_TYPE_TRX_SYS:
921 case FIL_PAGE_TYPE_BLOB:
922 case FIL_PAGE_TYPE_ZBLOB:
923 case FIL_PAGE_TYPE_ZBLOB2:
924 break;
925 case FIL_PAGE_TYPE_FSP_HDR:
926 case FIL_PAGE_TYPE_XDES:
927 case FIL_PAGE_IBUF_BITMAP:
928 /* These pages should have
929 predetermined page numbers
930 (see above). */
931 default:
932 reset_type = FIL_PAGE_TYPE_UNKNOWN;
933 break;
934 }
935 }
936
937 if (UNIV_UNLIKELY(page_type != reset_type)) {
938 ib::info()
939 << "Resetting invalid page "
940 << block->page.id << " type "
941 << page_type << " to "
942 << reset_type << " when flushing.";
943 fil_page_set_type(page, reset_type);
944 }
945 }
946
947 uint32_t checksum = BUF_NO_CHECKSUM_MAGIC;
948
949 switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
950 case SRV_CHECKSUM_ALGORITHM_INNODB:
951 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
952 checksum = buf_calc_page_new_checksum(page);
953 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
954 checksum);
955 /* With the InnoDB checksum, we overwrite the first 4 bytes of
956 the end lsn field to store the old formula checksum. Since it
957 depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
958 be calculated after storing the new formula checksum. */
959 checksum = buf_calc_page_old_checksum(page);
960 break;
961 case SRV_CHECKSUM_ALGORITHM_CRC32:
962 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
963 /* In other cases we write the same checksum to both fields. */
964 checksum = buf_calc_page_crc32(page);
965 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
966 checksum);
967 break;
968 case SRV_CHECKSUM_ALGORITHM_NONE:
969 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
970 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
971 checksum);
972 break;
973 /* no default so the compiler will emit a warning if
974 new enum is added and not handled here */
975 }
976
977 mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
978 checksum);
979 }
980
981 /********************************************************************//**
982 Does an asynchronous write of a buffer page. NOTE: in simulated aio and
983 also when the doublewrite buffer is used, we must call
984 buf_dblwr_flush_buffered_writes after we have posted a batch of
985 writes! */
986 static
987 void
buf_flush_write_block_low(buf_page_t * bpage,buf_flush_t flush_type,bool sync)988 buf_flush_write_block_low(
989 /*======================*/
990 buf_page_t* bpage, /*!< in: buffer block to write */
991 buf_flush_t flush_type, /*!< in: type of flush */
992 bool sync) /*!< in: true if sync IO request */
993 {
994 fil_space_t* space = fil_space_acquire_for_io(bpage->id.space());
995 if (!space) {
996 return;
997 }
998 ut_ad(space->purpose == FIL_TYPE_TEMPORARY
999 || space->purpose == FIL_TYPE_IMPORT
1000 || space->purpose == FIL_TYPE_TABLESPACE);
1001 ut_ad((space->purpose == FIL_TYPE_TEMPORARY)
1002 == (space == fil_system.temp_space));
1003 page_t* frame = NULL;
1004 #ifdef UNIV_DEBUG
1005 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1006 ut_ad(!buf_pool_mutex_own(buf_pool));
1007 #endif /* UNIV_DEBUG */
1008
1009 DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u",
1010 sync ? "sync" : "async", (unsigned) flush_type,
1011 bpage->id.space(), bpage->id.page_no()));
1012
1013 ut_ad(buf_page_in_file(bpage));
1014
1015 /* We are not holding buf_pool->mutex or block_mutex here.
1016 Nevertheless, it is safe to access bpage, because it is
1017 io_fixed and oldest_modification != 0. Thus, it cannot be
1018 relocated in the buffer pool or removed from flush_list or
1019 LRU_list. */
1020 ut_ad(!buf_pool_mutex_own(buf_pool));
1021 ut_ad(!buf_flush_list_mutex_own(buf_pool));
1022 ut_ad(!buf_page_get_mutex(bpage)->is_owned());
1023 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1024 ut_ad(bpage->oldest_modification != 0);
1025 ut_ad(bpage->newest_modification != 0);
1026
1027 /* Force the log to the disk before writing the modified block */
1028 if (!srv_read_only_mode) {
1029 log_write_up_to(bpage->newest_modification, true);
1030 }
1031
1032 switch (buf_page_get_state(bpage)) {
1033 case BUF_BLOCK_POOL_WATCH:
1034 case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1035 case BUF_BLOCK_NOT_USED:
1036 case BUF_BLOCK_READY_FOR_USE:
1037 case BUF_BLOCK_MEMORY:
1038 case BUF_BLOCK_REMOVE_HASH:
1039 ut_error;
1040 break;
1041 case BUF_BLOCK_ZIP_DIRTY:
1042 frame = bpage->zip.data;
1043
1044 buf_flush_update_zip_checksum(frame, bpage->size.physical(),
1045 bpage->newest_modification);
1046 break;
1047 case BUF_BLOCK_FILE_PAGE:
1048 frame = bpage->zip.data;
1049 if (!frame) {
1050 frame = ((buf_block_t*) bpage)->frame;
1051 }
1052
1053 buf_flush_init_for_writing(
1054 reinterpret_cast<const buf_block_t*>(bpage),
1055 reinterpret_cast<const buf_block_t*>(bpage)->frame,
1056 bpage->zip.data ? &bpage->zip : NULL,
1057 bpage->newest_modification);
1058 break;
1059 }
1060
1061 frame = buf_page_encrypt_before_write(space, bpage, frame);
1062
1063 ut_ad(space->purpose == FIL_TYPE_TABLESPACE
1064 || space->atomic_write_supported);
1065 if (!space->use_doublewrite()) {
1066 ulint type = IORequest::WRITE | IORequest::DO_NOT_WAKE;
1067
1068 IORequest request(type, bpage);
1069
1070 /* TODO: pass the tablespace to fil_io() */
1071 fil_io(request,
1072 sync, bpage->id, bpage->size, 0, bpage->size.physical(),
1073 frame, bpage);
1074 } else {
1075 ut_ad(!srv_read_only_mode);
1076
1077 if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
1078 buf_dblwr_write_single_page(bpage, sync);
1079 } else {
1080 ut_ad(!sync);
1081 buf_dblwr_add_to_batch(bpage);
1082 }
1083 }
1084
1085 /* When doing single page flushing the IO is done synchronously
1086 and we flush the changes to disk only for the tablespace we
1087 are working on. */
1088 if (sync) {
1089 ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
1090 if (space->purpose != FIL_TYPE_TEMPORARY) {
1091 fil_flush(space);
1092 }
1093
1094 /* The tablespace could already have been dropped,
1095 because fil_io(request, sync) would already have
1096 decremented the node->n_pending. However,
1097 buf_page_io_complete() only needs to look up the
1098 tablespace during read requests, not during writes. */
1099 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1100 #ifdef UNIV_DEBUG
1101 dberr_t err =
1102 #endif
1103 /* true means we want to evict this page from the
1104 LRU list as well. */
1105 buf_page_io_complete(bpage, space->use_doublewrite(), true);
1106
1107 ut_ad(err == DB_SUCCESS);
1108 }
1109
1110 space->release_for_io();
1111
1112 /* Increment the counter of I/O operations used
1113 for selecting LRU policy. */
1114 buf_LRU_stat_inc_io();
1115 }
1116
1117 /********************************************************************//**
1118 Writes a flushable page asynchronously from the buffer pool to a file.
1119 NOTE: in simulated aio we must call
1120 os_aio_simulated_wake_handler_threads after we have posted a batch of
1121 writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
1122 held upon entering this function, and they will be released by this
1123 function if it returns true.
1124 @return TRUE if the page was flushed */
1125 ibool
buf_flush_page(buf_pool_t * buf_pool,buf_page_t * bpage,buf_flush_t flush_type,bool sync)1126 buf_flush_page(
1127 /*===========*/
1128 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1129 buf_page_t* bpage, /*!< in: buffer control block */
1130 buf_flush_t flush_type, /*!< in: type of flush */
1131 bool sync) /*!< in: true if sync IO request */
1132 {
1133 BPageMutex* block_mutex;
1134
1135 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1136 ut_ad(buf_pool_mutex_own(buf_pool));
1137 ut_ad(buf_page_in_file(bpage));
1138 ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1139
1140 block_mutex = buf_page_get_mutex(bpage);
1141 ut_ad(mutex_own(block_mutex));
1142
1143 ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1144
1145 bool is_uncompressed;
1146
1147 is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1148 ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1149
1150 ibool flush;
1151 rw_lock_t* rw_lock;
1152 bool no_fix_count = bpage->buf_fix_count == 0;
1153
1154 if (!is_uncompressed) {
1155 flush = TRUE;
1156 rw_lock = NULL;
1157 } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)
1158 || (!no_fix_count
1159 && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP
1160 && fsp_is_system_temporary(bpage->id.space()))) {
1161 /* This is a heuristic, to avoid expensive SX attempts. */
1162 /* For table residing in temporary tablespace sync is done
1163 using IO_FIX and so before scheduling for flush ensure that
1164 page is not fixed. */
1165 flush = FALSE;
1166 } else {
1167 rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1168 if (flush_type != BUF_FLUSH_LIST) {
1169 flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
1170 } else {
1171 /* Will SX lock later */
1172 flush = TRUE;
1173 }
1174 }
1175
1176 if (flush) {
1177
1178 /* We are committed to flushing by the time we get here */
1179
1180 buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1181
1182 buf_page_set_flush_type(bpage, flush_type);
1183
1184 if (buf_pool->n_flush[flush_type] == 0) {
1185 os_event_reset(buf_pool->no_flush[flush_type]);
1186 }
1187
1188 ++buf_pool->n_flush[flush_type];
1189 ut_ad(buf_pool->n_flush[flush_type] != 0);
1190
1191 mutex_exit(block_mutex);
1192
1193 buf_pool_mutex_exit(buf_pool);
1194
1195 if (flush_type == BUF_FLUSH_LIST
1196 && is_uncompressed
1197 && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
1198
1199 if (!fsp_is_system_temporary(bpage->id.space())) {
1200 /* avoiding deadlock possibility involves
1201 doublewrite buffer, should flush it, because
1202 it might hold the another block->lock. */
1203 buf_dblwr_flush_buffered_writes();
1204 } else {
1205 buf_dblwr_sync_datafiles();
1206 }
1207
1208 rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
1209 }
1210
1211 /* If there is an observer that want to know if the asynchronous
1212 flushing was sent then notify it.
1213 Note: we set flush observer to a page with x-latch, so we can
1214 guarantee that notify_flush and notify_remove are called in pair
1215 with s-latch on a uncompressed page. */
1216 if (bpage->flush_observer != NULL) {
1217 buf_pool_mutex_enter(buf_pool);
1218
1219 bpage->flush_observer->notify_flush(buf_pool, bpage);
1220
1221 buf_pool_mutex_exit(buf_pool);
1222 }
1223
1224 /* Even though bpage is not protected by any mutex at this
1225 point, it is safe to access bpage, because it is io_fixed and
1226 oldest_modification != 0. Thus, it cannot be relocated in the
1227 buffer pool or removed from flush_list or LRU_list. */
1228
1229 buf_flush_write_block_low(bpage, flush_type, sync);
1230 }
1231
1232 return(flush);
1233 }
1234
1235 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1236 /********************************************************************//**
1237 Writes a flushable page asynchronously from the buffer pool to a file.
1238 NOTE: buf_pool->mutex and block->mutex must be held upon entering this
1239 function, and they will be released by this function after flushing.
1240 This is loosely based on buf_flush_batch() and buf_flush_page().
1241 @return TRUE if the page was flushed and the mutexes released */
1242 ibool
buf_flush_page_try(buf_pool_t * buf_pool,buf_block_t * block)1243 buf_flush_page_try(
1244 /*===============*/
1245 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
1246 buf_block_t* block) /*!< in/out: buffer control block */
1247 {
1248 ut_ad(buf_pool_mutex_own(buf_pool));
1249 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1250 ut_ad(buf_page_mutex_own(block));
1251
1252 if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1253 return(FALSE);
1254 }
1255
1256 /* The following call will release the buffer pool and
1257 block mutex. */
1258 return(buf_flush_page(
1259 buf_pool, &block->page,
1260 BUF_FLUSH_SINGLE_PAGE, true));
1261 }
1262 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1263
1264 /** Check the page is in buffer pool and can be flushed.
1265 @param[in] page_id page id
1266 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1267 @return true if the page can be flushed. */
1268 static
1269 bool
buf_flush_check_neighbor(const page_id_t page_id,buf_flush_t flush_type)1270 buf_flush_check_neighbor(
1271 const page_id_t page_id,
1272 buf_flush_t flush_type)
1273 {
1274 buf_page_t* bpage;
1275 buf_pool_t* buf_pool = buf_pool_get(page_id);
1276 bool ret;
1277
1278 ut_ad(flush_type == BUF_FLUSH_LRU
1279 || flush_type == BUF_FLUSH_LIST);
1280
1281 buf_pool_mutex_enter(buf_pool);
1282
1283 /* We only want to flush pages from this buffer pool. */
1284 bpage = buf_page_hash_get(buf_pool, page_id);
1285
1286 if (!bpage) {
1287
1288 buf_pool_mutex_exit(buf_pool);
1289 return(false);
1290 }
1291
1292 ut_a(buf_page_in_file(bpage));
1293
1294 /* We avoid flushing 'non-old' blocks in an LRU flush,
1295 because the flushed blocks are soon freed */
1296
1297 ret = false;
1298 if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1299 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1300
1301 mutex_enter(block_mutex);
1302 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1303 ret = true;
1304 }
1305 mutex_exit(block_mutex);
1306 }
1307 buf_pool_mutex_exit(buf_pool);
1308
1309 return(ret);
1310 }
1311
1312 /** Flushes to disk all flushable pages within the flush area.
1313 @param[in] page_id page id
1314 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1315 @param[in] n_flushed number of pages flushed so far in this batch
1316 @param[in] n_to_flush maximum number of pages we are allowed to flush
1317 @return number of pages flushed */
1318 static
1319 ulint
buf_flush_try_neighbors(const page_id_t page_id,buf_flush_t flush_type,ulint n_flushed,ulint n_to_flush)1320 buf_flush_try_neighbors(
1321 const page_id_t page_id,
1322 buf_flush_t flush_type,
1323 ulint n_flushed,
1324 ulint n_to_flush)
1325 {
1326 ulint i;
1327 ulint low;
1328 ulint high;
1329 ulint count = 0;
1330 buf_pool_t* buf_pool = buf_pool_get(page_id);
1331
1332 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1333
1334 if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1335 || srv_flush_neighbors == 0) {
1336 /* If there is little space or neighbor flushing is
1337 not enabled then just flush the victim. */
1338 low = page_id.page_no();
1339 high = page_id.page_no() + 1;
1340 } else {
1341 /* When flushed, dirty blocks are searched in
1342 neighborhoods of this size, and flushed along with the
1343 original page. */
1344
1345 ulint buf_flush_area;
1346
1347 buf_flush_area = ut_min(
1348 BUF_READ_AHEAD_AREA(buf_pool),
1349 buf_pool->curr_size / 16);
1350
1351 low = (page_id.page_no() / buf_flush_area) * buf_flush_area;
1352 high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area;
1353
1354 if (srv_flush_neighbors == 1) {
1355 /* adjust 'low' and 'high' to limit
1356 for contiguous dirty area */
1357 if (page_id.page_no() > low) {
1358 for (i = page_id.page_no() - 1; i >= low; i--) {
1359 if (!buf_flush_check_neighbor(
1360 page_id_t(page_id.space(), i),
1361 flush_type)) {
1362
1363 break;
1364 }
1365
1366 if (i == low) {
1367 /* Avoid overwrap when low == 0
1368 and calling
1369 buf_flush_check_neighbor() with
1370 i == (ulint) -1 */
1371 i--;
1372 break;
1373 }
1374 }
1375 low = i + 1;
1376 }
1377
1378 for (i = page_id.page_no() + 1;
1379 i < high
1380 && buf_flush_check_neighbor(
1381 page_id_t(page_id.space(), i),
1382 flush_type);
1383 i++) {
1384 /* do nothing */
1385 }
1386 high = i;
1387 }
1388 }
1389
1390 if (fil_space_t *s = fil_space_acquire_for_io(page_id.space())) {
1391 high = s->max_page_number_for_io(high);
1392 s->release_for_io();
1393 } else {
1394 return 0;
1395 }
1396
1397 DBUG_PRINT("ib_buf", ("flush %u:%u..%u",
1398 page_id.space(),
1399 (unsigned) low, (unsigned) high));
1400
1401 for (ulint i = low; i < high; i++) {
1402 buf_page_t* bpage;
1403
1404 if ((count + n_flushed) >= n_to_flush) {
1405
1406 /* We have already flushed enough pages and
1407 should call it a day. There is, however, one
1408 exception. If the page whose neighbors we
1409 are flushing has not been flushed yet then
1410 we'll try to flush the victim that we
1411 selected originally. */
1412 if (i <= page_id.page_no()) {
1413 i = page_id.page_no();
1414 } else {
1415 break;
1416 }
1417 }
1418
1419 const page_id_t cur_page_id(page_id.space(), i);
1420
1421 buf_pool = buf_pool_get(cur_page_id);
1422
1423 buf_pool_mutex_enter(buf_pool);
1424
1425 /* We only want to flush pages from this buffer pool. */
1426 bpage = buf_page_hash_get(buf_pool, cur_page_id);
1427
1428 if (bpage == NULL) {
1429
1430 buf_pool_mutex_exit(buf_pool);
1431 continue;
1432 }
1433
1434 ut_a(buf_page_in_file(bpage));
1435
1436 /* We avoid flushing 'non-old' blocks in an LRU flush,
1437 because the flushed blocks are soon freed */
1438
1439 if (flush_type != BUF_FLUSH_LRU
1440 || i == page_id.page_no()
1441 || buf_page_is_old(bpage)) {
1442
1443 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1444
1445 mutex_enter(block_mutex);
1446
1447 if (buf_flush_ready_for_flush(bpage, flush_type)
1448 && (i == page_id.page_no()
1449 || bpage->buf_fix_count == 0)) {
1450
1451 /* We also try to flush those
1452 neighbors != offset */
1453
1454 if (buf_flush_page(
1455 buf_pool, bpage, flush_type, false)) {
1456
1457 ++count;
1458 } else {
1459 mutex_exit(block_mutex);
1460 buf_pool_mutex_exit(buf_pool);
1461 }
1462
1463 continue;
1464 } else {
1465 mutex_exit(block_mutex);
1466 }
1467 }
1468 buf_pool_mutex_exit(buf_pool);
1469 }
1470
1471 if (count > 1) {
1472 MONITOR_INC_VALUE_CUMULATIVE(
1473 MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1474 MONITOR_FLUSH_NEIGHBOR_COUNT,
1475 MONITOR_FLUSH_NEIGHBOR_PAGES,
1476 (count - 1));
1477 }
1478
1479 return(count);
1480 }
1481
1482 /** Check if the block is modified and ready for flushing.
1483 If the the block is ready to flush then flush the page and try o flush
1484 its neighbors.
1485 @param[in] bpage buffer control block,
1486 must be buf_page_in_file(bpage)
1487 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1488 @param[in] n_to_flush number of pages to flush
1489 @param[in,out] count number of pages flushed
1490 @return TRUE if buf_pool mutex was released during this function.
1491 This does not guarantee that some pages were written as well.
1492 Number of pages written are incremented to the count. */
1493 static
1494 bool
buf_flush_page_and_try_neighbors(buf_page_t * bpage,buf_flush_t flush_type,ulint n_to_flush,ulint * count)1495 buf_flush_page_and_try_neighbors(
1496 buf_page_t* bpage,
1497 buf_flush_t flush_type,
1498 ulint n_to_flush,
1499 ulint* count)
1500 {
1501 #ifdef UNIV_DEBUG
1502 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1503
1504 ut_ad(buf_pool_mutex_own(buf_pool));
1505 #endif /* UNIV_DEBUG */
1506
1507 bool flushed;
1508 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1509
1510 mutex_enter(block_mutex);
1511
1512 ut_a(buf_page_in_file(bpage));
1513
1514 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1515 buf_pool_t* buf_pool;
1516
1517 buf_pool = buf_pool_from_bpage(bpage);
1518
1519 const page_id_t page_id = bpage->id;
1520
1521 mutex_exit(block_mutex);
1522
1523 buf_pool_mutex_exit(buf_pool);
1524
1525 /* Try to flush also all the neighbors */
1526 *count += buf_flush_try_neighbors(
1527 page_id, flush_type, *count, n_to_flush);
1528
1529 buf_pool_mutex_enter(buf_pool);
1530 flushed = TRUE;
1531 } else {
1532 mutex_exit(block_mutex);
1533
1534 flushed = false;
1535 }
1536
1537 ut_ad(buf_pool_mutex_own(buf_pool));
1538
1539 return(flushed);
1540 }
1541
1542 /*******************************************************************//**
1543 This utility moves the uncompressed frames of pages to the free list.
1544 Note that this function does not actually flush any data to disk. It
1545 just detaches the uncompressed frames from the compressed pages at the
1546 tail of the unzip_LRU and puts those freed frames in the free list.
1547 Note that it is a best effort attempt and it is not guaranteed that
1548 after a call to this function there will be 'max' blocks in the free
1549 list.
1550 @return number of blocks moved to the free list. */
1551 static
1552 ulint
buf_free_from_unzip_LRU_list_batch(buf_pool_t * buf_pool,ulint max)1553 buf_free_from_unzip_LRU_list_batch(
1554 /*===============================*/
1555 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1556 ulint max) /*!< in: desired number of
1557 blocks in the free_list */
1558 {
1559 ulint scanned = 0;
1560 ulint count = 0;
1561 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1562 ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1563
1564 ut_ad(buf_pool_mutex_own(buf_pool));
1565
1566 buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1567
1568 while (block != NULL
1569 && count < max
1570 && free_len < srv_LRU_scan_depth
1571 && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1572
1573 ++scanned;
1574 if (buf_LRU_free_page(&block->page, false)) {
1575 /* Block was freed. buf_pool->mutex potentially
1576 released and reacquired */
1577 ++count;
1578 block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1579
1580 } else {
1581
1582 block = UT_LIST_GET_PREV(unzip_LRU, block);
1583 }
1584
1585 free_len = UT_LIST_GET_LEN(buf_pool->free);
1586 lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1587 }
1588
1589 ut_ad(buf_pool_mutex_own(buf_pool));
1590
1591 if (scanned) {
1592 MONITOR_INC_VALUE_CUMULATIVE(
1593 MONITOR_LRU_BATCH_SCANNED,
1594 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1595 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1596 scanned);
1597 }
1598
1599 return(count);
1600 }
1601
1602 /*******************************************************************//**
1603 This utility flushes dirty blocks from the end of the LRU list.
1604 The calling thread is not allowed to own any latches on pages!
1605 It attempts to make 'max' blocks available in the free list. Note that
1606 it is a best effort attempt and it is not guaranteed that after a call
1607 to this function there will be 'max' blocks in the free list.*/
1608
1609 void
buf_flush_LRU_list_batch(buf_pool_t * buf_pool,ulint max,flush_counters_t * n)1610 buf_flush_LRU_list_batch(
1611 /*=====================*/
1612 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1613 ulint max, /*!< in: desired number of
1614 blocks in the free_list */
1615 flush_counters_t* n) /*!< out: flushed/evicted page
1616 counts */
1617 {
1618 buf_page_t* bpage;
1619 ulint scanned = 0;
1620 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1621 ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1622 ulint withdraw_depth = 0;
1623
1624 n->flushed = 0;
1625 n->evicted = 0;
1626 n->unzip_LRU_evicted = 0;
1627 ut_ad(buf_pool_mutex_own(buf_pool));
1628 if (buf_pool->curr_size < buf_pool->old_size
1629 && buf_pool->withdraw_target > 0) {
1630 withdraw_depth = buf_pool->withdraw_target
1631 - UT_LIST_GET_LEN(buf_pool->withdraw);
1632 }
1633
1634 for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1635 bpage != NULL && n->flushed + n->evicted < max
1636 && free_len < srv_LRU_scan_depth + withdraw_depth
1637 && lru_len > BUF_LRU_MIN_LEN;
1638 ++scanned,
1639 bpage = buf_pool->lru_hp.get()) {
1640
1641 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
1642 buf_pool->lru_hp.set(prev);
1643
1644 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1645
1646 mutex_enter(block_mutex);
1647
1648 if (buf_flush_ready_for_replace(bpage)) {
1649 /* block is ready for eviction i.e., it is
1650 clean and is not IO-fixed or buffer fixed. */
1651 mutex_exit(block_mutex);
1652 if (buf_LRU_free_page(bpage, true)) {
1653 ++n->evicted;
1654 }
1655 } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) {
1656 /* Block is ready for flush. Dispatch an IO
1657 request. The IO helper thread will put it on
1658 free list in IO completion routine. */
1659 mutex_exit(block_mutex);
1660 buf_flush_page_and_try_neighbors(
1661 bpage, BUF_FLUSH_LRU, max, &n->flushed);
1662 } else {
1663 /* Can't evict or dispatch this block. Go to
1664 previous. */
1665 ut_ad(buf_pool->lru_hp.is_hp(prev));
1666 mutex_exit(block_mutex);
1667 }
1668
1669 ut_ad(!mutex_own(block_mutex));
1670 ut_ad(buf_pool_mutex_own(buf_pool));
1671
1672 free_len = UT_LIST_GET_LEN(buf_pool->free);
1673 lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1674 }
1675
1676 buf_pool->lru_hp.set(NULL);
1677
1678 /* We keep track of all flushes happening as part of LRU
1679 flush. When estimating the desired rate at which flush_list
1680 should be flushed, we factor in this value. */
1681 buf_lru_flush_page_count += n->flushed;
1682
1683 ut_ad(buf_pool_mutex_own(buf_pool));
1684
1685 if (n->evicted) {
1686 MONITOR_INC_VALUE_CUMULATIVE(
1687 MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
1688 MONITOR_LRU_BATCH_EVICT_COUNT,
1689 MONITOR_LRU_BATCH_EVICT_PAGES,
1690 n->evicted);
1691 }
1692
1693 if (scanned) {
1694 MONITOR_INC_VALUE_CUMULATIVE(
1695 MONITOR_LRU_BATCH_SCANNED,
1696 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1697 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1698 scanned);
1699 }
1700 }
1701
1702 /*******************************************************************//**
1703 Flush and move pages from LRU or unzip_LRU list to the free list.
1704 Whether LRU or unzip_LRU is used depends on the state of the system.*/
1705
1706 static
1707 void
buf_do_LRU_batch(buf_pool_t * buf_pool,ulint max,flush_counters_t * n)1708 buf_do_LRU_batch(
1709 /*=============*/
1710 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1711 ulint max, /*!< in: desired number of
1712 blocks in the free_list */
1713 flush_counters_t* n) /*!< out: flushed/evicted page
1714 counts */
1715 {
1716 if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1717 n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1718 } else {
1719 n->unzip_LRU_evicted = 0;
1720 }
1721
1722 if (max > n->unzip_LRU_evicted) {
1723 buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, n);
1724 } else {
1725 n->evicted = 0;
1726 n->flushed = 0;
1727 }
1728
1729 /* Add evicted pages from unzip_LRU to the evicted pages from
1730 the simple LRU. */
1731 n->evicted += n->unzip_LRU_evicted;
1732 }
1733
1734 /** This utility flushes dirty blocks from the end of the flush_list.
1735 The calling thread is not allowed to own any latches on pages!
1736 @param[in] buf_pool buffer pool instance
1737 @param[in] min_n wished minimum mumber of blocks flushed (it is
1738 not guaranteed that the actual number is that big, though)
1739 @param[in] lsn_limit all blocks whose oldest_modification is smaller
1740 than this should be flushed (if their number does not exceed min_n)
1741 @return number of blocks for which the write request was queued;
1742 ULINT_UNDEFINED if there was a flush of the same type already
1743 running */
1744 static
1745 ulint
buf_do_flush_list_batch(buf_pool_t * buf_pool,ulint min_n,lsn_t lsn_limit)1746 buf_do_flush_list_batch(
1747 buf_pool_t* buf_pool,
1748 ulint min_n,
1749 lsn_t lsn_limit)
1750 {
1751 ulint count = 0;
1752 ulint scanned = 0;
1753
1754 ut_ad(buf_pool_mutex_own(buf_pool));
1755
1756 /* Start from the end of the list looking for a suitable
1757 block to be flushed. */
1758 buf_flush_list_mutex_enter(buf_pool);
1759 ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1760
1761 /* In order not to degenerate this scan to O(n*n) we attempt
1762 to preserve pointer of previous block in the flush list. To do
1763 so we declare it a hazard pointer. Any thread working on the
1764 flush list must check the hazard pointer and if it is removing
1765 the same block then it must reset it. */
1766 for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1767 count < min_n && bpage != NULL && len > 0
1768 && bpage->oldest_modification < lsn_limit;
1769 bpage = buf_pool->flush_hp.get(),
1770 ++scanned) {
1771
1772 buf_page_t* prev;
1773
1774 ut_a(bpage->oldest_modification > 0);
1775 ut_ad(bpage->in_flush_list);
1776
1777 prev = UT_LIST_GET_PREV(list, bpage);
1778 buf_pool->flush_hp.set(prev);
1779 buf_flush_list_mutex_exit(buf_pool);
1780
1781 #ifdef UNIV_DEBUG
1782 bool flushed =
1783 #endif /* UNIV_DEBUG */
1784 buf_flush_page_and_try_neighbors(
1785 bpage, BUF_FLUSH_LIST, min_n, &count);
1786
1787 buf_flush_list_mutex_enter(buf_pool);
1788
1789 ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
1790
1791 --len;
1792 }
1793
1794 buf_pool->flush_hp.set(NULL);
1795 buf_flush_list_mutex_exit(buf_pool);
1796
1797 if (scanned) {
1798 MONITOR_INC_VALUE_CUMULATIVE(
1799 MONITOR_FLUSH_BATCH_SCANNED,
1800 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1801 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1802 scanned);
1803 }
1804
1805 if (count) {
1806 MONITOR_INC_VALUE_CUMULATIVE(
1807 MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1808 MONITOR_FLUSH_BATCH_COUNT,
1809 MONITOR_FLUSH_BATCH_PAGES,
1810 count);
1811 }
1812
1813 ut_ad(buf_pool_mutex_own(buf_pool));
1814
1815 return(count);
1816 }
1817
1818 /** This utility flushes dirty blocks from the end of the LRU list or
1819 flush_list.
1820 NOTE 1: in the case of an LRU flush the calling thread may own latches to
1821 pages: to avoid deadlocks, this function must be written so that it cannot
1822 end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1823 the calling thread is not allowed to own any latches on pages!
1824 @param[in] buf_pool buffer pool instance
1825 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
1826 BUF_FLUSH_LIST, then the caller must not own any latches on pages
1827 @param[in] min_n wished minimum mumber of blocks flushed (it is
1828 not guaranteed that the actual number is that big, though)
1829 @param[in] lsn_limit in the case of BUF_FLUSH_LIST all blocks whose
1830 oldest_modification is smaller than this should be flushed (if their number
1831 does not exceed min_n), otherwise ignored */
1832 static
1833 void
buf_flush_batch(buf_pool_t * buf_pool,buf_flush_t flush_type,ulint min_n,lsn_t lsn_limit,flush_counters_t * n)1834 buf_flush_batch(
1835 buf_pool_t* buf_pool,
1836 buf_flush_t flush_type,
1837 ulint min_n,
1838 lsn_t lsn_limit,
1839 flush_counters_t* n) /*!< out: flushed/evicted page
1840 counts */
1841 {
1842 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1843 ut_ad(flush_type == BUF_FLUSH_LRU
1844 || !sync_check_iterate(dict_sync_check()));
1845
1846 buf_pool_mutex_enter(buf_pool);
1847
1848 /* Note: The buffer pool mutex is released and reacquired within
1849 the flush functions. */
1850 switch (flush_type) {
1851 case BUF_FLUSH_LRU:
1852 buf_do_LRU_batch(buf_pool, min_n, n);
1853 break;
1854 case BUF_FLUSH_LIST:
1855 n->flushed = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
1856 n->evicted = 0;
1857 break;
1858 default:
1859 ut_error;
1860 }
1861
1862 buf_pool_mutex_exit(buf_pool);
1863
1864 DBUG_LOG("ib_buf", "flush " << flush_type << " completed");
1865 }
1866
1867 /******************************************************************//**
1868 Gather the aggregated stats for both flush list and LRU list flushing.
1869 @param page_count_flush number of pages flushed from the end of the flush_list
1870 @param page_count_LRU number of pages flushed from the end of the LRU list
1871 */
1872 static
1873 void
buf_flush_stats(ulint page_count_flush,ulint page_count_LRU)1874 buf_flush_stats(
1875 /*============*/
1876 ulint page_count_flush,
1877 ulint page_count_LRU)
1878 {
1879 DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
1880 "from LRU_list %u pages",
1881 unsigned(page_count_flush),
1882 unsigned(page_count_LRU)));
1883
1884 srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
1885 }
1886
1887 /******************************************************************//**
1888 Start a buffer flush batch for LRU or flush list */
1889 static
1890 ibool
buf_flush_start(buf_pool_t * buf_pool,buf_flush_t flush_type)1891 buf_flush_start(
1892 /*============*/
1893 buf_pool_t* buf_pool, /*!< buffer pool instance */
1894 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1895 or BUF_FLUSH_LIST */
1896 {
1897 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1898
1899 buf_pool_mutex_enter(buf_pool);
1900
1901 if (buf_pool->n_flush[flush_type] > 0
1902 || buf_pool->init_flush[flush_type] == TRUE) {
1903
1904 /* There is already a flush batch of the same type running */
1905
1906 buf_pool_mutex_exit(buf_pool);
1907
1908 return(FALSE);
1909 }
1910
1911 buf_pool->init_flush[flush_type] = TRUE;
1912
1913 os_event_reset(buf_pool->no_flush[flush_type]);
1914
1915 buf_pool_mutex_exit(buf_pool);
1916
1917 return(TRUE);
1918 }
1919
1920 /******************************************************************//**
1921 End a buffer flush batch for LRU or flush list */
1922 static
1923 void
buf_flush_end(buf_pool_t * buf_pool,buf_flush_t flush_type)1924 buf_flush_end(
1925 /*==========*/
1926 buf_pool_t* buf_pool, /*!< buffer pool instance */
1927 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1928 or BUF_FLUSH_LIST */
1929 {
1930 buf_pool_mutex_enter(buf_pool);
1931
1932 buf_pool->init_flush[flush_type] = FALSE;
1933
1934 buf_pool->try_LRU_scan = TRUE;
1935
1936 if (buf_pool->n_flush[flush_type] == 0) {
1937
1938 /* The running flush batch has ended */
1939
1940 os_event_set(buf_pool->no_flush[flush_type]);
1941 }
1942
1943 buf_pool_mutex_exit(buf_pool);
1944
1945 if (!srv_read_only_mode) {
1946 buf_dblwr_flush_buffered_writes();
1947 } else {
1948 os_aio_simulated_wake_handler_threads();
1949 }
1950 }
1951
1952 /******************************************************************//**
1953 Waits until a flush batch of the given type ends */
1954 void
buf_flush_wait_batch_end(buf_pool_t * buf_pool,buf_flush_t type)1955 buf_flush_wait_batch_end(
1956 /*=====================*/
1957 buf_pool_t* buf_pool, /*!< buffer pool instance */
1958 buf_flush_t type) /*!< in: BUF_FLUSH_LRU
1959 or BUF_FLUSH_LIST */
1960 {
1961 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1962
1963 if (buf_pool == NULL) {
1964 ulint i;
1965
1966 for (i = 0; i < srv_buf_pool_instances; ++i) {
1967 buf_pool_t* buf_pool;
1968
1969 buf_pool = buf_pool_from_array(i);
1970
1971 thd_wait_begin(NULL, THD_WAIT_DISKIO);
1972 os_event_wait(buf_pool->no_flush[type]);
1973 thd_wait_end(NULL);
1974 }
1975 } else {
1976 thd_wait_begin(NULL, THD_WAIT_DISKIO);
1977 os_event_wait(buf_pool->no_flush[type]);
1978 thd_wait_end(NULL);
1979 }
1980 }
1981
1982 /** Do flushing batch of a given type.
1983 NOTE: The calling thread is not allowed to own any latches on pages!
1984 @param[in,out] buf_pool buffer pool instance
1985 @param[in] type flush type
1986 @param[in] min_n wished minimum mumber of blocks flushed
1987 (it is not guaranteed that the actual number is that big, though)
1988 @param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
1989 oldest_modification is smaller than this should be flushed (if their number
1990 does not exceed min_n), otherwise ignored
1991 @param[out] n_processed the number of pages which were processed is
1992 passed back to caller. Ignored if NULL
1993 @retval true if a batch was queued successfully.
1994 @retval false if another batch of same type was already running. */
1995 bool
buf_flush_do_batch(buf_pool_t * buf_pool,buf_flush_t type,ulint min_n,lsn_t lsn_limit,flush_counters_t * n)1996 buf_flush_do_batch(
1997 buf_pool_t* buf_pool,
1998 buf_flush_t type,
1999 ulint min_n,
2000 lsn_t lsn_limit,
2001 flush_counters_t* n)
2002 {
2003 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
2004
2005 if (n != NULL) {
2006 n->flushed = 0;
2007 }
2008
2009 if (!buf_flush_start(buf_pool, type)) {
2010 return(false);
2011 }
2012
2013 buf_flush_batch(buf_pool, type, min_n, lsn_limit, n);
2014
2015 buf_flush_end(buf_pool, type);
2016
2017 return(true);
2018 }
2019 /**
2020 Waits until a flush batch of the given lsn ends
2021 @param[in] new_oldest target oldest_modified_lsn to wait for */
2022
2023 void
buf_flush_wait_flushed(lsn_t new_oldest)2024 buf_flush_wait_flushed(
2025 lsn_t new_oldest)
2026 {
2027 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2028 buf_pool_t* buf_pool;
2029 lsn_t oldest;
2030
2031 buf_pool = buf_pool_from_array(i);
2032
2033 for (;;) {
2034 /* We don't need to wait for fsync of the flushed
2035 blocks, because anyway we need fsync to make chekpoint.
2036 So, we don't need to wait for the batch end here. */
2037
2038 buf_flush_list_mutex_enter(buf_pool);
2039
2040 buf_page_t* bpage;
2041
2042 /* We don't need to wait for system temporary pages */
2043 for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
2044 bpage != NULL
2045 && fsp_is_system_temporary(bpage->id.space());
2046 bpage = UT_LIST_GET_PREV(list, bpage)) {
2047 /* Do nothing. */
2048 }
2049
2050 if (bpage != NULL) {
2051 ut_ad(bpage->in_flush_list);
2052 oldest = bpage->oldest_modification;
2053 } else {
2054 oldest = 0;
2055 }
2056
2057 buf_flush_list_mutex_exit(buf_pool);
2058
2059 if (oldest == 0 || oldest >= new_oldest) {
2060 break;
2061 }
2062
2063 /* sleep and retry */
2064 os_thread_sleep(buf_flush_wait_flushed_sleep_time);
2065
2066 MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
2067 }
2068 }
2069 }
2070
2071 /** This utility flushes dirty blocks from the end of the flush list of all
2072 buffer pool instances.
2073 NOTE: The calling thread is not allowed to own any latches on pages!
2074 @param[in] min_n wished minimum mumber of blocks flushed (it is
2075 not guaranteed that the actual number is that big, though)
2076 @param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2077 oldest_modification is smaller than this should be flushed (if their number
2078 does not exceed min_n), otherwise ignored
2079 @param[out] n_processed the number of pages which were processed is
2080 passed back to caller. Ignored if NULL.
2081 @return true if a batch was queued successfully for each buffer pool
2082 instance. false if another batch of same type was already running in
2083 at least one of the buffer pool instance */
2084 bool
buf_flush_lists(ulint min_n,lsn_t lsn_limit,ulint * n_processed)2085 buf_flush_lists(
2086 ulint min_n,
2087 lsn_t lsn_limit,
2088 ulint* n_processed)
2089 {
2090 ulint i;
2091 ulint n_flushed = 0;
2092 bool success = true;
2093
2094 if (n_processed) {
2095 *n_processed = 0;
2096 }
2097
2098 if (min_n != ULINT_MAX) {
2099 /* Ensure that flushing is spread evenly amongst the
2100 buffer pool instances. When min_n is ULINT_MAX
2101 we need to flush everything up to the lsn limit
2102 so no limit here. */
2103 min_n = (min_n + srv_buf_pool_instances - 1)
2104 / srv_buf_pool_instances;
2105 }
2106
2107 /* Flush to lsn_limit in all buffer pool instances */
2108 for (i = 0; i < srv_buf_pool_instances; i++) {
2109 buf_pool_t* buf_pool;
2110 flush_counters_t n;
2111
2112 memset(&n, 0, sizeof(flush_counters_t));
2113 buf_pool = buf_pool_from_array(i);
2114
2115 if (!buf_flush_do_batch(buf_pool,
2116 BUF_FLUSH_LIST,
2117 min_n,
2118 lsn_limit,
2119 &n)) {
2120 /* We have two choices here. If lsn_limit was
2121 specified then skipping an instance of buffer
2122 pool means we cannot guarantee that all pages
2123 up to lsn_limit has been flushed. We can
2124 return right now with failure or we can try
2125 to flush remaining buffer pools up to the
2126 lsn_limit. We attempt to flush other buffer
2127 pools based on the assumption that it will
2128 help in the retry which will follow the
2129 failure. */
2130 success = false;
2131
2132 }
2133
2134 n_flushed += n.flushed;
2135 }
2136
2137 if (n_flushed) {
2138 buf_flush_stats(n_flushed, 0);
2139 if (n_processed) {
2140 *n_processed = n_flushed;
2141 }
2142 }
2143
2144 return(success);
2145 }
2146
2147 /******************************************************************//**
2148 This function picks up a single page from the tail of the LRU
2149 list, flushes it (if it is dirty), removes it from page_hash and LRU
2150 list and puts it on the free list. It is called from user threads when
2151 they are unable to find a replaceable page at the tail of the LRU
2152 list i.e.: when the background LRU flushing in the page_cleaner thread
2153 is not fast enough to keep pace with the workload.
2154 @return true if success. */
2155 bool
buf_flush_single_page_from_LRU(buf_pool_t * buf_pool)2156 buf_flush_single_page_from_LRU(
2157 /*===========================*/
2158 buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
2159 {
2160 ulint scanned;
2161 buf_page_t* bpage;
2162 ibool freed;
2163
2164 buf_pool_mutex_enter(buf_pool);
2165
2166 for (bpage = buf_pool->single_scan_itr.start(), scanned = 0,
2167 freed = false;
2168 bpage != NULL;
2169 ++scanned, bpage = buf_pool->single_scan_itr.get()) {
2170
2171 ut_ad(buf_pool_mutex_own(buf_pool));
2172
2173 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
2174 buf_pool->single_scan_itr.set(prev);
2175 BPageMutex* block_mutex;
2176
2177 block_mutex = buf_page_get_mutex(bpage);
2178
2179 mutex_enter(block_mutex);
2180
2181 if (buf_flush_ready_for_replace(bpage)) {
2182 /* block is ready for eviction i.e., it is
2183 clean and is not IO-fixed or buffer fixed. */
2184 mutex_exit(block_mutex);
2185
2186 if (buf_LRU_free_page(bpage, true)) {
2187 buf_pool_mutex_exit(buf_pool);
2188 freed = true;
2189 break;
2190 }
2191
2192 } else if (buf_flush_ready_for_flush(
2193 bpage, BUF_FLUSH_SINGLE_PAGE)) {
2194
2195 /* Block is ready for flush. Try and dispatch an IO
2196 request. We'll put it on free list in IO completion
2197 routine if it is not buffer fixed. The following call
2198 will release the buffer pool and block mutex.
2199
2200 Note: There is no guarantee that this page has actually
2201 been freed, only that it has been flushed to disk */
2202
2203 freed = buf_flush_page(
2204 buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
2205
2206 if (freed) {
2207 break;
2208 }
2209
2210 mutex_exit(block_mutex);
2211 } else {
2212 mutex_exit(block_mutex);
2213 }
2214 ut_ad(!mutex_own(block_mutex));
2215 }
2216 if (!freed) {
2217 /* Can't find a single flushable page. */
2218 ut_ad(!bpage);
2219 buf_pool_mutex_exit(buf_pool);
2220 }
2221
2222 if (scanned) {
2223 MONITOR_INC_VALUE_CUMULATIVE(
2224 MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2225 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2226 MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2227 scanned);
2228 }
2229
2230 ut_ad(!buf_pool_mutex_own(buf_pool));
2231 return(freed);
2232 }
2233
2234 /**
2235 Clears up tail of the LRU list of a given buffer pool instance:
2236 * Put replaceable pages at the tail of LRU to the free list
2237 * Flush dirty pages at the tail of LRU to the disk
2238 The depth to which we scan each buffer pool is controlled by dynamic
2239 config parameter innodb_LRU_scan_depth.
2240 @param buf_pool buffer pool instance
2241 @return total pages flushed */
2242 static
2243 ulint
buf_flush_LRU_list(buf_pool_t * buf_pool)2244 buf_flush_LRU_list(
2245 buf_pool_t* buf_pool)
2246 {
2247 ulint scan_depth, withdraw_depth;
2248 flush_counters_t n;
2249
2250 memset(&n, 0, sizeof(flush_counters_t));
2251
2252 ut_ad(buf_pool);
2253 /* srv_LRU_scan_depth can be arbitrarily large value.
2254 We cap it with current LRU size. */
2255 buf_pool_mutex_enter(buf_pool);
2256 scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2257 if (buf_pool->curr_size < buf_pool->old_size
2258 && buf_pool->withdraw_target > 0) {
2259 withdraw_depth = buf_pool->withdraw_target
2260 - UT_LIST_GET_LEN(buf_pool->withdraw);
2261 } else {
2262 withdraw_depth = 0;
2263 }
2264 buf_pool_mutex_exit(buf_pool);
2265 if (withdraw_depth > srv_LRU_scan_depth) {
2266 scan_depth = ut_min(withdraw_depth, scan_depth);
2267 } else {
2268 scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
2269 scan_depth);
2270 }
2271 /* Currently one of page_cleaners is the only thread
2272 that can trigger an LRU flush at the same time.
2273 So, it is not possible that a batch triggered during
2274 last iteration is still running, */
2275 buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth,
2276 0, &n);
2277
2278 return(n.flushed);
2279 }
2280
2281 /*********************************************************************//**
2282 Wait for any possible LRU flushes that are in progress to end. */
2283 void
buf_flush_wait_LRU_batch_end(void)2284 buf_flush_wait_LRU_batch_end(void)
2285 /*==============================*/
2286 {
2287 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2288 buf_pool_t* buf_pool;
2289
2290 buf_pool = buf_pool_from_array(i);
2291
2292 buf_pool_mutex_enter(buf_pool);
2293
2294 if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2295 || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2296
2297 buf_pool_mutex_exit(buf_pool);
2298 buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2299 } else {
2300 buf_pool_mutex_exit(buf_pool);
2301 }
2302 }
2303 }
2304
2305 /*********************************************************************//**
2306 Calculates if flushing is required based on number of dirty pages in
2307 the buffer pool.
2308 @return percent of io_capacity to flush to manage dirty page ratio */
2309 static
2310 ulint
af_get_pct_for_dirty()2311 af_get_pct_for_dirty()
2312 /*==================*/
2313 {
2314 double dirty_pct = buf_get_modified_ratio_pct();
2315
2316 if (dirty_pct == 0.0) {
2317 /* No pages modified */
2318 return(0);
2319 }
2320
2321 ut_a(srv_max_dirty_pages_pct_lwm
2322 <= srv_max_buf_pool_modified_pct);
2323
2324 if (srv_max_dirty_pages_pct_lwm == 0) {
2325 /* The user has not set the option to preflush dirty
2326 pages as we approach the high water mark. */
2327 if (dirty_pct >= srv_max_buf_pool_modified_pct) {
2328 /* We have crossed the high water mark of dirty
2329 pages In this case we start flushing at 100% of
2330 innodb_io_capacity. */
2331 return(100);
2332 }
2333 } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
2334 /* We should start flushing pages gradually. */
2335 return(static_cast<ulint>((dirty_pct * 100)
2336 / (srv_max_buf_pool_modified_pct + 1)));
2337 }
2338
2339 return(0);
2340 }
2341
2342 /*********************************************************************//**
2343 Calculates if flushing is required based on redo generation rate.
2344 @return percent of io_capacity to flush to manage redo space */
2345 static
2346 ulint
af_get_pct_for_lsn(lsn_t age)2347 af_get_pct_for_lsn(
2348 /*===============*/
2349 lsn_t age) /*!< in: current age of LSN. */
2350 {
2351 lsn_t max_async_age;
2352 lsn_t lsn_age_factor;
2353 lsn_t af_lwm = (lsn_t) ((srv_adaptive_flushing_lwm
2354 * log_get_capacity()) / 100);
2355
2356 if (age < af_lwm) {
2357 /* No adaptive flushing. */
2358 return(0);
2359 }
2360
2361 max_async_age = log_get_max_modified_age_async();
2362
2363 if (age < max_async_age && !srv_adaptive_flushing) {
2364 /* We have still not reached the max_async point and
2365 the user has disabled adaptive flushing. */
2366 return(0);
2367 }
2368
2369 /* If we are here then we know that either:
2370 1) User has enabled adaptive flushing
2371 2) User may have disabled adaptive flushing but we have reached
2372 max_async_age. */
2373 lsn_age_factor = (age * 100) / max_async_age;
2374
2375 ut_ad(srv_max_io_capacity >= srv_io_capacity);
2376 return(static_cast<ulint>(
2377 ((srv_max_io_capacity / srv_io_capacity)
2378 * (lsn_age_factor * sqrt((double)lsn_age_factor)))
2379 / 7.5));
2380 }
2381
2382 /*********************************************************************//**
2383 This function is called approximately once every second by the
2384 page_cleaner thread. Based on various factors it decides if there is a
2385 need to do flushing.
2386 @return number of pages recommended to be flushed
2387 @param lsn_limit pointer to return LSN up to which flushing must happen
2388 @param last_pages_in the number of pages flushed by the last flush_list
2389 flushing. */
2390 static
2391 ulint
page_cleaner_flush_pages_recommendation(lsn_t * lsn_limit,ulint last_pages_in)2392 page_cleaner_flush_pages_recommendation(
2393 /*====================================*/
2394 lsn_t* lsn_limit,
2395 ulint last_pages_in)
2396 {
2397 static lsn_t prev_lsn = 0;
2398 static ulint sum_pages = 0;
2399 static ulint avg_page_rate = 0;
2400 static ulint n_iterations = 0;
2401 static time_t prev_time;
2402 lsn_t oldest_lsn;
2403 lsn_t cur_lsn;
2404 lsn_t age;
2405 lsn_t lsn_rate;
2406 ulint n_pages = 0;
2407 ulint pct_for_dirty = 0;
2408 ulint pct_for_lsn = 0;
2409 ulint pct_total = 0;
2410
2411 cur_lsn = log_get_lsn_nowait();
2412
2413 /* log_get_lsn_nowait tries to get log_sys.mutex with
2414 mutex_enter_nowait, if this does not succeed function
2415 returns 0, do not use that value to update stats. */
2416 if (cur_lsn == 0) {
2417 return(0);
2418 }
2419
2420 if (prev_lsn == 0) {
2421 /* First time around. */
2422 prev_lsn = cur_lsn;
2423 prev_time = time(NULL);
2424 return(0);
2425 }
2426
2427 if (prev_lsn == cur_lsn) {
2428 return(0);
2429 }
2430
2431 sum_pages += last_pages_in;
2432
2433 time_t curr_time = time(NULL);
2434 double time_elapsed = difftime(curr_time, prev_time);
2435
2436 /* We update our variables every srv_flushing_avg_loops
2437 iterations to smooth out transition in workload. */
2438 if (++n_iterations >= srv_flushing_avg_loops
2439 || time_elapsed >= srv_flushing_avg_loops) {
2440
2441 if (time_elapsed < 1) {
2442 time_elapsed = 1;
2443 }
2444
2445 avg_page_rate = static_cast<ulint>(
2446 ((static_cast<double>(sum_pages)
2447 / time_elapsed)
2448 + avg_page_rate) / 2);
2449
2450 /* How much LSN we have generated since last call. */
2451 lsn_rate = static_cast<lsn_t>(
2452 static_cast<double>(cur_lsn - prev_lsn)
2453 / time_elapsed);
2454
2455 lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2456
2457 /* aggregate stats of all slots */
2458 mutex_enter(&page_cleaner.mutex);
2459
2460 ulint flush_tm = page_cleaner.flush_time;
2461 ulint flush_pass = page_cleaner.flush_pass;
2462
2463 page_cleaner.flush_time = 0;
2464 page_cleaner.flush_pass = 0;
2465
2466 ulint lru_tm = 0;
2467 ulint list_tm = 0;
2468 ulint lru_pass = 0;
2469 ulint list_pass = 0;
2470
2471 for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2472 page_cleaner_slot_t* slot;
2473
2474 slot = &page_cleaner.slots[i];
2475
2476 lru_tm += slot->flush_lru_time;
2477 lru_pass += slot->flush_lru_pass;
2478 list_tm += slot->flush_list_time;
2479 list_pass += slot->flush_list_pass;
2480
2481 slot->flush_lru_time = 0;
2482 slot->flush_lru_pass = 0;
2483 slot->flush_list_time = 0;
2484 slot->flush_list_pass = 0;
2485 }
2486
2487 mutex_exit(&page_cleaner.mutex);
2488
2489 /* minimum values are 1, to avoid dividing by zero. */
2490 if (lru_tm < 1) {
2491 lru_tm = 1;
2492 }
2493 if (list_tm < 1) {
2494 list_tm = 1;
2495 }
2496 if (flush_tm < 1) {
2497 flush_tm = 1;
2498 }
2499
2500 if (lru_pass < 1) {
2501 lru_pass = 1;
2502 }
2503 if (list_pass < 1) {
2504 list_pass = 1;
2505 }
2506 if (flush_pass < 1) {
2507 flush_pass = 1;
2508 }
2509
2510 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
2511 list_tm / list_pass);
2512 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
2513 lru_tm / lru_pass);
2514
2515 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
2516 list_tm / (srv_n_page_cleaners * flush_pass));
2517 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
2518 lru_tm / (srv_n_page_cleaners * flush_pass));
2519 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
2520 flush_tm * list_tm / flush_pass
2521 / (list_tm + lru_tm));
2522 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
2523 flush_tm * lru_tm / flush_pass
2524 / (list_tm + lru_tm));
2525 MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
2526
2527 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
2528 list_pass / page_cleaner.n_slots);
2529 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
2530 lru_pass / page_cleaner.n_slots);
2531 MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
2532
2533 prev_lsn = cur_lsn;
2534 prev_time = curr_time;
2535
2536 n_iterations = 0;
2537
2538 sum_pages = 0;
2539 }
2540
2541 oldest_lsn = buf_pool_get_oldest_modification();
2542
2543 ut_ad(oldest_lsn <= log_get_lsn());
2544
2545 age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2546
2547 pct_for_dirty = af_get_pct_for_dirty();
2548 pct_for_lsn = af_get_pct_for_lsn(age);
2549
2550 pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2551
2552 /* Estimate pages to be flushed for the lsn progress */
2553 ulint sum_pages_for_lsn = 0;
2554 lsn_t target_lsn = oldest_lsn
2555 + lsn_avg_rate * buf_flush_lsn_scan_factor;
2556
2557 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2558 buf_pool_t* buf_pool = buf_pool_from_array(i);
2559 ulint pages_for_lsn = 0;
2560
2561 buf_flush_list_mutex_enter(buf_pool);
2562 for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list);
2563 b != NULL;
2564 b = UT_LIST_GET_PREV(list, b)) {
2565 if (b->oldest_modification > target_lsn) {
2566 break;
2567 }
2568 ++pages_for_lsn;
2569 }
2570 buf_flush_list_mutex_exit(buf_pool);
2571
2572 sum_pages_for_lsn += pages_for_lsn;
2573
2574 mutex_enter(&page_cleaner.mutex);
2575 ut_ad(page_cleaner.slots[i].state
2576 == PAGE_CLEANER_STATE_NONE);
2577 page_cleaner.slots[i].n_pages_requested
2578 = pages_for_lsn / buf_flush_lsn_scan_factor + 1;
2579 mutex_exit(&page_cleaner.mutex);
2580 }
2581
2582 sum_pages_for_lsn /= buf_flush_lsn_scan_factor;
2583 if(sum_pages_for_lsn < 1) {
2584 sum_pages_for_lsn = 1;
2585 }
2586
2587 /* Cap the maximum IO capacity that we are going to use by
2588 max_io_capacity. Limit the value to avoid too quick increase */
2589 ulint pages_for_lsn =
2590 std::min<ulint>(sum_pages_for_lsn, srv_max_io_capacity * 2);
2591
2592 n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3;
2593
2594 if (n_pages > srv_max_io_capacity) {
2595 n_pages = srv_max_io_capacity;
2596 }
2597
2598 /* Normalize request for each instance */
2599 mutex_enter(&page_cleaner.mutex);
2600 ut_ad(page_cleaner.n_slots_requested == 0);
2601 ut_ad(page_cleaner.n_slots_flushing == 0);
2602 ut_ad(page_cleaner.n_slots_finished == 0);
2603
2604 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2605 /* if REDO has enough of free space,
2606 don't care about age distribution of pages */
2607 page_cleaner.slots[i].n_pages_requested = pct_for_lsn > 30 ?
2608 page_cleaner.slots[i].n_pages_requested
2609 * n_pages / sum_pages_for_lsn + 1
2610 : n_pages / srv_buf_pool_instances;
2611 }
2612 mutex_exit(&page_cleaner.mutex);
2613
2614 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2615
2616 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn);
2617
2618 MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2619 MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2620 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2621 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2622
2623 *lsn_limit = LSN_MAX;
2624
2625 return(n_pages);
2626 }
2627
2628 /*********************************************************************//**
2629 Puts the page_cleaner thread to sleep if it has finished work in less
2630 than a second
2631 @retval 0 wake up by event set,
2632 @retval OS_SYNC_TIME_EXCEEDED if timeout was exceeded
2633 @param next_loop_time time when next loop iteration should start
2634 @param sig_count zero or the value returned by previous call of
2635 os_event_reset()
2636 @param cur_time current time as in ut_time_ms() */
2637 static
2638 ulint
pc_sleep_if_needed(ulint next_loop_time,int64_t sig_count,ulint cur_time)2639 pc_sleep_if_needed(
2640 /*===============*/
2641 ulint next_loop_time,
2642 int64_t sig_count,
2643 ulint cur_time)
2644 {
2645 /* No sleep if we are cleaning the buffer pool during the shutdown
2646 with everything else finished */
2647 if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)
2648 return OS_SYNC_TIME_EXCEEDED;
2649
2650 if (next_loop_time > cur_time) {
2651 /* Get sleep interval in micro seconds. We use
2652 ut_min() to avoid long sleep in case of wrap around. */
2653 ulint sleep_us;
2654
2655 sleep_us = ut_min(static_cast<ulint>(1000000),
2656 (next_loop_time - cur_time) * 1000);
2657
2658 return(os_event_wait_time_low(buf_flush_event,
2659 sleep_us, sig_count));
2660 }
2661
2662 return(OS_SYNC_TIME_EXCEEDED);
2663 }
2664
2665 /******************************************************************//**
2666 Initialize page_cleaner. */
2667 void
buf_flush_page_cleaner_init(void)2668 buf_flush_page_cleaner_init(void)
2669 /*=============================*/
2670 {
2671 ut_ad(!page_cleaner.is_running);
2672
2673 mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner.mutex);
2674
2675 page_cleaner.is_requested = os_event_create("pc_is_requested");
2676 page_cleaner.is_finished = os_event_create("pc_is_finished");
2677 page_cleaner.is_started = os_event_create("pc_is_started");
2678 page_cleaner.n_slots = static_cast<ulint>(srv_buf_pool_instances);
2679
2680 ut_d(page_cleaner.n_disabled_debug = 0);
2681
2682 page_cleaner.is_running = true;
2683 }
2684
2685 /**
2686 Requests for all slots to flush all buffer pool instances.
2687 @param min_n wished minimum mumber of blocks flushed
2688 (it is not guaranteed that the actual number is that big)
2689 @param lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2690 oldest_modification is smaller than this should be flushed
2691 (if their number does not exceed min_n), otherwise ignored
2692 */
2693 static
2694 void
pc_request(ulint min_n,lsn_t lsn_limit)2695 pc_request(
2696 ulint min_n,
2697 lsn_t lsn_limit)
2698 {
2699 if (min_n != ULINT_MAX) {
2700 /* Ensure that flushing is spread evenly amongst the
2701 buffer pool instances. When min_n is ULINT_MAX
2702 we need to flush everything up to the lsn limit
2703 so no limit here. */
2704 min_n = (min_n + srv_buf_pool_instances - 1)
2705 / srv_buf_pool_instances;
2706 }
2707
2708 mutex_enter(&page_cleaner.mutex);
2709
2710 ut_ad(page_cleaner.n_slots_requested == 0);
2711 ut_ad(page_cleaner.n_slots_flushing == 0);
2712 ut_ad(page_cleaner.n_slots_finished == 0);
2713
2714 page_cleaner.requested = (min_n > 0);
2715 page_cleaner.lsn_limit = lsn_limit;
2716
2717 for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2718 page_cleaner_slot_t* slot = &page_cleaner.slots[i];
2719
2720 ut_ad(slot->state == PAGE_CLEANER_STATE_NONE);
2721
2722 if (min_n == ULINT_MAX) {
2723 slot->n_pages_requested = ULINT_MAX;
2724 } else if (min_n == 0) {
2725 slot->n_pages_requested = 0;
2726 }
2727
2728 /* slot->n_pages_requested was already set by
2729 page_cleaner_flush_pages_recommendation() */
2730
2731 slot->state = PAGE_CLEANER_STATE_REQUESTED;
2732 }
2733
2734 page_cleaner.n_slots_requested = page_cleaner.n_slots;
2735 page_cleaner.n_slots_flushing = 0;
2736 page_cleaner.n_slots_finished = 0;
2737
2738 os_event_set(page_cleaner.is_requested);
2739
2740 mutex_exit(&page_cleaner.mutex);
2741 }
2742
2743 /**
2744 Do flush for one slot.
2745 @return the number of the slots which has not been treated yet. */
2746 static
2747 ulint
pc_flush_slot(void)2748 pc_flush_slot(void)
2749 {
2750 ulint lru_tm = 0;
2751 ulint list_tm = 0;
2752 ulint lru_pass = 0;
2753 ulint list_pass = 0;
2754
2755 mutex_enter(&page_cleaner.mutex);
2756
2757 if (!page_cleaner.n_slots_requested) {
2758 os_event_reset(page_cleaner.is_requested);
2759 } else {
2760 page_cleaner_slot_t* slot = NULL;
2761 ulint i;
2762
2763 for (i = 0; i < page_cleaner.n_slots; i++) {
2764 slot = &page_cleaner.slots[i];
2765
2766 if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
2767 break;
2768 }
2769 }
2770
2771 /* slot should be found because
2772 page_cleaner.n_slots_requested > 0 */
2773 ut_a(i < page_cleaner.n_slots);
2774
2775 buf_pool_t* buf_pool = buf_pool_from_array(i);
2776
2777 page_cleaner.n_slots_requested--;
2778 page_cleaner.n_slots_flushing++;
2779 slot->state = PAGE_CLEANER_STATE_FLUSHING;
2780
2781 if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
2782 slot->n_flushed_lru = 0;
2783 slot->n_flushed_list = 0;
2784 goto finish_mutex;
2785 }
2786
2787 if (page_cleaner.n_slots_requested == 0) {
2788 os_event_reset(page_cleaner.is_requested);
2789 }
2790
2791 mutex_exit(&page_cleaner.mutex);
2792
2793 lru_tm = ut_time_ms();
2794
2795 /* Flush pages from end of LRU if required */
2796 slot->n_flushed_lru = buf_flush_LRU_list(buf_pool);
2797
2798 lru_tm = ut_time_ms() - lru_tm;
2799 lru_pass++;
2800
2801 if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
2802 slot->n_flushed_list = 0;
2803 goto finish;
2804 }
2805
2806 /* Flush pages from flush_list if required */
2807 if (page_cleaner.requested) {
2808 flush_counters_t n;
2809 memset(&n, 0, sizeof(flush_counters_t));
2810 list_tm = ut_time_ms();
2811
2812 slot->succeeded_list = buf_flush_do_batch(
2813 buf_pool, BUF_FLUSH_LIST,
2814 slot->n_pages_requested,
2815 page_cleaner.lsn_limit,
2816 &n);
2817
2818 slot->n_flushed_list = n.flushed;
2819
2820 list_tm = ut_time_ms() - list_tm;
2821 list_pass++;
2822 } else {
2823 slot->n_flushed_list = 0;
2824 slot->succeeded_list = true;
2825 }
2826 finish:
2827 mutex_enter(&page_cleaner.mutex);
2828 finish_mutex:
2829 page_cleaner.n_slots_flushing--;
2830 page_cleaner.n_slots_finished++;
2831 slot->state = PAGE_CLEANER_STATE_FINISHED;
2832
2833 slot->flush_lru_time += lru_tm;
2834 slot->flush_list_time += list_tm;
2835 slot->flush_lru_pass += lru_pass;
2836 slot->flush_list_pass += list_pass;
2837
2838 if (page_cleaner.n_slots_requested == 0
2839 && page_cleaner.n_slots_flushing == 0) {
2840 os_event_set(page_cleaner.is_finished);
2841 }
2842 }
2843
2844 ulint ret = page_cleaner.n_slots_requested;
2845
2846 mutex_exit(&page_cleaner.mutex);
2847
2848 return(ret);
2849 }
2850
2851 /**
2852 Wait until all flush requests are finished.
2853 @param n_flushed_lru number of pages flushed from the end of the LRU list.
2854 @param n_flushed_list number of pages flushed from the end of the
2855 flush_list.
2856 @return true if all flush_list flushing batch were success. */
2857 static
2858 bool
pc_wait_finished(ulint * n_flushed_lru,ulint * n_flushed_list)2859 pc_wait_finished(
2860 ulint* n_flushed_lru,
2861 ulint* n_flushed_list)
2862 {
2863 bool all_succeeded = true;
2864
2865 *n_flushed_lru = 0;
2866 *n_flushed_list = 0;
2867
2868 os_event_wait(page_cleaner.is_finished);
2869
2870 mutex_enter(&page_cleaner.mutex);
2871
2872 ut_ad(page_cleaner.n_slots_requested == 0);
2873 ut_ad(page_cleaner.n_slots_flushing == 0);
2874 ut_ad(page_cleaner.n_slots_finished == page_cleaner.n_slots);
2875
2876 for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2877 page_cleaner_slot_t* slot = &page_cleaner.slots[i];
2878
2879 ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED);
2880
2881 *n_flushed_lru += slot->n_flushed_lru;
2882 *n_flushed_list += slot->n_flushed_list;
2883 all_succeeded &= slot->succeeded_list;
2884
2885 slot->state = PAGE_CLEANER_STATE_NONE;
2886
2887 slot->n_pages_requested = 0;
2888 }
2889
2890 page_cleaner.n_slots_finished = 0;
2891
2892 os_event_reset(page_cleaner.is_finished);
2893
2894 mutex_exit(&page_cleaner.mutex);
2895
2896 return(all_succeeded);
2897 }
2898
2899 #ifdef UNIV_LINUX
2900 /**
2901 Set priority for page_cleaner threads.
2902 @param[in] priority priority intended to set
2903 @return true if set as intended */
2904 static
2905 bool
buf_flush_page_cleaner_set_priority(int priority)2906 buf_flush_page_cleaner_set_priority(
2907 int priority)
2908 {
2909 setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
2910 priority);
2911 return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
2912 == priority);
2913 }
2914 #endif /* UNIV_LINUX */
2915
2916 #ifdef UNIV_DEBUG
2917 /** Loop used to disable page cleaner threads. */
2918 static
2919 void
buf_flush_page_cleaner_disabled_loop(void)2920 buf_flush_page_cleaner_disabled_loop(void)
2921 {
2922 if (!innodb_page_cleaner_disabled_debug) {
2923 /* We return to avoid entering and exiting mutex. */
2924 return;
2925 }
2926
2927 mutex_enter(&page_cleaner.mutex);
2928 page_cleaner.n_disabled_debug++;
2929 mutex_exit(&page_cleaner.mutex);
2930
2931 while (innodb_page_cleaner_disabled_debug
2932 && srv_shutdown_state == SRV_SHUTDOWN_NONE
2933 && page_cleaner.is_running) {
2934
2935 os_thread_sleep(100000); /* [A] */
2936 }
2937
2938 /* We need to wait for threads exiting here, otherwise we would
2939 encounter problem when we quickly perform following steps:
2940 1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
2941 2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0;
2942 3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
2943 That's because after step 1 this thread could still be sleeping
2944 inside the loop above at [A] and steps 2, 3 could happen before
2945 this thread wakes up from [A]. In such case this thread would
2946 not re-increment n_disabled_debug and we would be waiting for
2947 him forever in buf_flush_page_cleaner_disabled_debug_update(...).
2948
2949 Therefore we are waiting in step 2 for this thread exiting here. */
2950
2951 mutex_enter(&page_cleaner.mutex);
2952 page_cleaner.n_disabled_debug--;
2953 mutex_exit(&page_cleaner.mutex);
2954 }
2955
2956 /** Disables page cleaner threads (coordinator and workers).
2957 @param[in] save immediate result from check function */
buf_flush_page_cleaner_disabled_debug_update(THD *,st_mysql_sys_var *,void *,const void * save)2958 void buf_flush_page_cleaner_disabled_debug_update(THD*,
2959 st_mysql_sys_var*, void*,
2960 const void* save)
2961 {
2962 if (!page_cleaner.is_running) {
2963 return;
2964 }
2965
2966 if (!*static_cast<const my_bool*>(save)) {
2967 if (!innodb_page_cleaner_disabled_debug) {
2968 return;
2969 }
2970
2971 innodb_page_cleaner_disabled_debug = false;
2972
2973 /* Enable page cleaner threads. */
2974 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
2975 mutex_enter(&page_cleaner.mutex);
2976 const ulint n = page_cleaner.n_disabled_debug;
2977 mutex_exit(&page_cleaner.mutex);
2978 /* Check if all threads have been enabled, to avoid
2979 problem when we decide to re-disable them soon. */
2980 if (n == 0) {
2981 break;
2982 }
2983 }
2984 return;
2985 }
2986
2987 if (innodb_page_cleaner_disabled_debug) {
2988 return;
2989 }
2990
2991 innodb_page_cleaner_disabled_debug = true;
2992
2993 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
2994 /* Workers are possibly sleeping on is_requested.
2995
2996 We have to wake them, otherwise they could possibly
2997 have never noticed, that they should be disabled,
2998 and we would wait for them here forever.
2999
3000 That's why we have sleep-loop instead of simply
3001 waiting on some disabled_debug_event. */
3002 os_event_set(page_cleaner.is_requested);
3003
3004 mutex_enter(&page_cleaner.mutex);
3005
3006 ut_ad(page_cleaner.n_disabled_debug
3007 <= srv_n_page_cleaners);
3008
3009 if (page_cleaner.n_disabled_debug
3010 == srv_n_page_cleaners) {
3011
3012 mutex_exit(&page_cleaner.mutex);
3013 break;
3014 }
3015
3016 mutex_exit(&page_cleaner.mutex);
3017
3018 os_thread_sleep(100000);
3019 }
3020 }
3021 #endif /* UNIV_DEBUG */
3022
3023 /******************************************************************//**
3024 page_cleaner thread tasked with flushing dirty pages from the buffer
3025 pools. As of now we'll have only one coordinator.
3026 @return a dummy parameter */
3027 extern "C"
3028 os_thread_ret_t
DECLARE_THREAD(buf_flush_page_cleaner_coordinator)3029 DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*)
3030 {
3031 my_thread_init();
3032 #ifdef UNIV_PFS_THREAD
3033 pfs_register_thread(page_cleaner_thread_key);
3034 #endif /* UNIV_PFS_THREAD */
3035 ut_ad(!srv_read_only_mode);
3036
3037 #ifdef UNIV_DEBUG_THREAD_CREATION
3038 ib::info() << "page_cleaner thread running, id "
3039 << os_thread_pf(os_thread_get_curr_id());
3040 #endif /* UNIV_DEBUG_THREAD_CREATION */
3041 #ifdef UNIV_LINUX
3042 /* linux might be able to set different setting for each thread.
3043 worth to try to set high priority for page cleaner threads */
3044 if (buf_flush_page_cleaner_set_priority(
3045 buf_flush_page_cleaner_priority)) {
3046
3047 ib::info() << "page_cleaner coordinator priority: "
3048 << buf_flush_page_cleaner_priority;
3049 } else {
3050 ib::info() << "If the mysqld execution user is authorized,"
3051 " page cleaner thread priority can be changed."
3052 " See the man page of setpriority().";
3053 }
3054 /* Signal that setpriority() has been attempted. */
3055 os_event_set(recv_sys->flush_end);
3056 #endif /* UNIV_LINUX */
3057
3058 do {
3059 /* treat flushing requests during recovery. */
3060 ulint n_flushed_lru = 0;
3061 ulint n_flushed_list = 0;
3062
3063 os_event_wait(recv_sys->flush_start);
3064
3065 if (!recv_writer_thread_active) {
3066 break;
3067 }
3068
3069 switch (recv_sys->flush_type) {
3070 case BUF_FLUSH_LRU:
3071 /* Flush pages from end of LRU if required */
3072 pc_request(0, LSN_MAX);
3073 while (pc_flush_slot() > 0) {}
3074 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3075 break;
3076
3077 case BUF_FLUSH_LIST:
3078 /* Flush all pages */
3079 do {
3080 pc_request(ULINT_MAX, LSN_MAX);
3081 while (pc_flush_slot() > 0) {}
3082 } while (!pc_wait_finished(&n_flushed_lru,
3083 &n_flushed_list));
3084 break;
3085
3086 default:
3087 ut_ad(0);
3088 }
3089
3090 os_event_reset(recv_sys->flush_start);
3091 os_event_set(recv_sys->flush_end);
3092 } while (recv_writer_thread_active);
3093
3094 os_event_wait(buf_flush_event);
3095
3096 ulint ret_sleep = 0;
3097 ulint n_evicted = 0;
3098 ulint n_flushed_last = 0;
3099 ulint warn_interval = 1;
3100 ulint warn_count = 0;
3101 int64_t sig_count = os_event_reset(buf_flush_event);
3102 ulint next_loop_time = ut_time_ms() + 1000;
3103 ulint n_flushed = 0;
3104 ulint last_activity = srv_get_activity_count();
3105 ulint last_pages = 0;
3106
3107 while (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
3108 ulint curr_time = ut_time_ms();
3109
3110 /* The page_cleaner skips sleep if the server is
3111 idle and there are no pending IOs in the buffer pool
3112 and there is work to do. */
3113 if (srv_check_activity(last_activity)
3114 || buf_get_n_pending_read_ios()
3115 || n_flushed == 0) {
3116
3117 ret_sleep = pc_sleep_if_needed(
3118 next_loop_time, sig_count, curr_time);
3119 } else if (curr_time > next_loop_time) {
3120 ret_sleep = OS_SYNC_TIME_EXCEEDED;
3121 } else {
3122 ret_sleep = 0;
3123 }
3124
3125 if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
3126 break;
3127 }
3128
3129 sig_count = os_event_reset(buf_flush_event);
3130
3131 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3132 if (global_system_variables.log_warnings > 2
3133 && curr_time > next_loop_time + 3000
3134 && !(test_flags & TEST_SIGINT)) {
3135 if (warn_count == 0) {
3136 ib::info() << "page_cleaner: 1000ms"
3137 " intended loop took "
3138 << 1000 + curr_time
3139 - next_loop_time
3140 << "ms. The settings might not"
3141 " be optimal. (flushed="
3142 << n_flushed_last
3143 << " and evicted="
3144 << n_evicted
3145 << ", during the time.)";
3146 if (warn_interval > 300) {
3147 warn_interval = 600;
3148 } else {
3149 warn_interval *= 2;
3150 }
3151
3152 warn_count = warn_interval;
3153 } else {
3154 --warn_count;
3155 }
3156 } else {
3157 /* reset counter */
3158 warn_interval = 1;
3159 warn_count = 0;
3160 }
3161
3162 next_loop_time = curr_time + 1000;
3163 n_flushed_last = n_evicted = 0;
3164 }
3165
3166 if (ret_sleep != OS_SYNC_TIME_EXCEEDED
3167 && srv_flush_sync
3168 && buf_flush_sync_lsn > 0) {
3169 /* woke up for flush_sync */
3170 mutex_enter(&page_cleaner.mutex);
3171 lsn_t lsn_limit = buf_flush_sync_lsn;
3172 buf_flush_sync_lsn = 0;
3173 mutex_exit(&page_cleaner.mutex);
3174
3175 /* Request flushing for threads */
3176 pc_request(ULINT_MAX, lsn_limit);
3177
3178 ulint tm = ut_time_ms();
3179
3180 /* Coordinator also treats requests */
3181 while (pc_flush_slot() > 0) {}
3182
3183 /* only coordinator is using these counters,
3184 so no need to protect by lock. */
3185 page_cleaner.flush_time += ut_time_ms() - tm;
3186 page_cleaner.flush_pass++;
3187
3188 /* Wait for all slots to be finished */
3189 ulint n_flushed_lru = 0;
3190 ulint n_flushed_list = 0;
3191 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3192
3193 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3194 buf_flush_stats(n_flushed_list, n_flushed_lru);
3195
3196 MONITOR_INC_VALUE_CUMULATIVE(
3197 MONITOR_FLUSH_SYNC_TOTAL_PAGE,
3198 MONITOR_FLUSH_SYNC_COUNT,
3199 MONITOR_FLUSH_SYNC_PAGES,
3200 n_flushed_lru + n_flushed_list);
3201 }
3202
3203 n_flushed = n_flushed_lru + n_flushed_list;
3204
3205 } else if (srv_check_activity(last_activity)) {
3206 ulint n_to_flush;
3207 lsn_t lsn_limit = 0;
3208
3209 /* Estimate pages from flush_list to be flushed */
3210 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3211 last_activity = srv_get_activity_count();
3212 n_to_flush =
3213 page_cleaner_flush_pages_recommendation(
3214 &lsn_limit, last_pages);
3215 } else {
3216 n_to_flush = 0;
3217 }
3218
3219 /* Request flushing for threads */
3220 pc_request(n_to_flush, lsn_limit);
3221
3222 ulint tm = ut_time_ms();
3223
3224 /* Coordinator also treats requests */
3225 while (pc_flush_slot() > 0) {
3226 /* No op */
3227 }
3228
3229 /* only coordinator is using these counters,
3230 so no need to protect by lock. */
3231 page_cleaner.flush_time += ut_time_ms() - tm;
3232 page_cleaner.flush_pass++ ;
3233
3234 /* Wait for all slots to be finished */
3235 ulint n_flushed_lru = 0;
3236 ulint n_flushed_list = 0;
3237
3238 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3239
3240 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3241 buf_flush_stats(n_flushed_list, n_flushed_lru);
3242 }
3243
3244 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3245 last_pages = n_flushed_list;
3246 }
3247
3248 n_evicted += n_flushed_lru;
3249 n_flushed_last += n_flushed_list;
3250
3251 n_flushed = n_flushed_lru + n_flushed_list;
3252
3253 if (n_flushed_lru) {
3254 MONITOR_INC_VALUE_CUMULATIVE(
3255 MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
3256 MONITOR_LRU_BATCH_FLUSH_COUNT,
3257 MONITOR_LRU_BATCH_FLUSH_PAGES,
3258 n_flushed_lru);
3259 }
3260
3261 if (n_flushed_list) {
3262 MONITOR_INC_VALUE_CUMULATIVE(
3263 MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
3264 MONITOR_FLUSH_ADAPTIVE_COUNT,
3265 MONITOR_FLUSH_ADAPTIVE_PAGES,
3266 n_flushed_list);
3267 }
3268
3269 } else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3270 /* no activity, slept enough */
3271 buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed);
3272
3273 n_flushed_last += n_flushed;
3274
3275 if (n_flushed) {
3276 MONITOR_INC_VALUE_CUMULATIVE(
3277 MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
3278 MONITOR_FLUSH_BACKGROUND_COUNT,
3279 MONITOR_FLUSH_BACKGROUND_PAGES,
3280 n_flushed);
3281
3282 }
3283
3284 } else {
3285 /* no activity, but woken up by event */
3286 n_flushed = 0;
3287 }
3288
3289 ut_d(buf_flush_page_cleaner_disabled_loop());
3290 }
3291
3292 ut_ad(srv_shutdown_state > SRV_SHUTDOWN_INITIATED);
3293 if (srv_fast_shutdown == 2
3294 || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
3295 /* In very fast shutdown or when innodb failed to start, we
3296 simulate a crash of the buffer pool. We are not required to do
3297 any flushing. */
3298 goto thread_exit;
3299 }
3300
3301 /* In case of normal and slow shutdown the page_cleaner thread
3302 must wait for all other activity in the server to die down.
3303 Note that we can start flushing the buffer pool as soon as the
3304 server enters shutdown phase but we must stay alive long enough
3305 to ensure that any work done by the master or purge threads is
3306 also flushed.
3307 During shutdown we pass through two stages. In the first stage,
3308 when SRV_SHUTDOWN_CLEANUP is set other threads like the master
3309 and the purge threads may be working as well. We start flushing
3310 the buffer pool but can't be sure that no new pages are being
3311 dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
3312
3313 do {
3314 pc_request(ULINT_MAX, LSN_MAX);
3315
3316 while (pc_flush_slot() > 0) {}
3317
3318 ulint n_flushed_lru = 0;
3319 ulint n_flushed_list = 0;
3320 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3321
3322 n_flushed = n_flushed_lru + n_flushed_list;
3323
3324 /* We sleep only if there are no pages to flush */
3325 if (n_flushed == 0) {
3326 os_thread_sleep(100000);
3327 }
3328 } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
3329
3330 /* At this point all threads including the master and the purge
3331 thread must have been suspended. */
3332 ut_a(srv_get_active_thread_type() == SRV_NONE);
3333 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3334
3335 /* We can now make a final sweep on flushing the buffer pool
3336 and exit after we have cleaned the whole buffer pool.
3337 It is important that we wait for any running batch that has
3338 been triggered by us to finish. Otherwise we can end up
3339 considering end of that batch as a finish of our final
3340 sweep and we'll come out of the loop leaving behind dirty pages
3341 in the flush_list */
3342 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3343 buf_flush_wait_LRU_batch_end();
3344
3345 bool success;
3346
3347 do {
3348 pc_request(ULINT_MAX, LSN_MAX);
3349
3350 while (pc_flush_slot() > 0) {}
3351
3352 ulint n_flushed_lru = 0;
3353 ulint n_flushed_list = 0;
3354 success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3355
3356 n_flushed = n_flushed_lru + n_flushed_list;
3357
3358 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3359 buf_flush_wait_LRU_batch_end();
3360
3361 } while (!success || n_flushed > 0);
3362
3363 /* Some sanity checks */
3364 ut_a(srv_get_active_thread_type() == SRV_NONE);
3365 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3366
3367 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3368 buf_pool_t* buf_pool = buf_pool_from_array(i);
3369 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
3370 }
3371
3372 /* We have lived our life. Time to die. */
3373
3374 thread_exit:
3375 /* All worker threads are waiting for the event here,
3376 and no more access to page_cleaner structure by them.
3377 Wakes worker threads up just to make them exit. */
3378 page_cleaner.is_running = false;
3379
3380 /* waiting for all worker threads exit */
3381 while (page_cleaner.n_workers) {
3382 os_event_set(page_cleaner.is_requested);
3383 os_thread_sleep(10000);
3384 }
3385
3386 mutex_destroy(&page_cleaner.mutex);
3387
3388 os_event_destroy(page_cleaner.is_finished);
3389 os_event_destroy(page_cleaner.is_requested);
3390 os_event_destroy(page_cleaner.is_started);
3391
3392 buf_page_cleaner_is_active = false;
3393
3394 my_thread_end();
3395 /* We count the number of threads in os_thread_exit(). A created
3396 thread should always use that to exit and not use return() to exit. */
3397 os_thread_exit();
3398
3399 OS_THREAD_DUMMY_RETURN;
3400 }
3401
3402 /** Adjust thread count for page cleaner workers.
3403 @param[in] new_cnt Number of threads to be used */
3404 void
buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt)3405 buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt)
3406 {
3407 mutex_enter(&page_cleaner.mutex);
3408
3409 srv_n_page_cleaners = new_cnt;
3410 if (new_cnt > page_cleaner.n_workers) {
3411 /* User has increased the number of page
3412 cleaner threads. */
3413 ulint add = new_cnt - page_cleaner.n_workers;
3414 for (ulint i = 0; i < add; i++) {
3415 os_thread_id_t cleaner_thread_id;
3416 os_thread_create(buf_flush_page_cleaner_worker, NULL, &cleaner_thread_id);
3417 }
3418 }
3419
3420 mutex_exit(&page_cleaner.mutex);
3421
3422 /* Wait until defined number of workers has started. */
3423 while (page_cleaner.is_running &&
3424 page_cleaner.n_workers != (srv_n_page_cleaners - 1)) {
3425 os_event_set(page_cleaner.is_requested);
3426 os_event_reset(page_cleaner.is_started);
3427 os_event_wait_time(page_cleaner.is_started, 1000000);
3428 }
3429 }
3430
3431 /******************************************************************//**
3432 Worker thread of page_cleaner.
3433 @return a dummy parameter */
3434 extern "C"
3435 os_thread_ret_t
DECLARE_THREAD(buf_flush_page_cleaner_worker)3436 DECLARE_THREAD(buf_flush_page_cleaner_worker)(
3437 /*==========================================*/
3438 void* arg MY_ATTRIBUTE((unused)))
3439 /*!< in: a dummy parameter required by
3440 os_thread_create */
3441 {
3442 my_thread_init();
3443 #ifndef DBUG_OFF
3444 os_thread_id_t cleaner_thread_id = os_thread_get_curr_id();
3445 #endif
3446
3447 mutex_enter(&page_cleaner.mutex);
3448 ulint thread_no = page_cleaner.n_workers++;
3449
3450 DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
3451 << " started; n_workers=" << page_cleaner.n_workers);
3452
3453 /* Signal that we have started */
3454 os_event_set(page_cleaner.is_started);
3455 mutex_exit(&page_cleaner.mutex);
3456
3457 #ifdef UNIV_LINUX
3458 /* linux might be able to set different setting for each thread
3459 worth to try to set high priority for page cleaner threads */
3460 if (buf_flush_page_cleaner_set_priority(
3461 buf_flush_page_cleaner_priority)) {
3462
3463 ib::info() << "page_cleaner worker priority: "
3464 << buf_flush_page_cleaner_priority;
3465 }
3466 #endif /* UNIV_LINUX */
3467
3468 while (true) {
3469 os_event_wait(page_cleaner.is_requested);
3470
3471 ut_d(buf_flush_page_cleaner_disabled_loop());
3472
3473 if (!page_cleaner.is_running) {
3474 break;
3475 }
3476
3477 ut_ad(srv_n_page_cleaners >= 1);
3478
3479 /* If number of page cleaner threads is decreased
3480 exit those that are not anymore needed. */
3481 if (srv_shutdown_state == SRV_SHUTDOWN_NONE &&
3482 thread_no >= (srv_n_page_cleaners - 1)) {
3483 DBUG_LOG("ib_buf", "Exiting "
3484 << thread_no
3485 << " page cleaner worker thread_id "
3486 << os_thread_pf(cleaner_thread_id)
3487 << " total threads " << srv_n_page_cleaners << ".");
3488 break;
3489 }
3490
3491 pc_flush_slot();
3492 }
3493
3494 mutex_enter(&page_cleaner.mutex);
3495 page_cleaner.n_workers--;
3496
3497 DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
3498 << " exiting; n_workers=" << page_cleaner.n_workers);
3499
3500 /* Signal that we have stopped */
3501 os_event_set(page_cleaner.is_started);
3502 mutex_exit(&page_cleaner.mutex);
3503
3504 my_thread_end();
3505
3506 os_thread_exit();
3507
3508 OS_THREAD_DUMMY_RETURN;
3509 }
3510
3511 /*******************************************************************//**
3512 Synchronously flush dirty blocks from the end of the flush list of all buffer
3513 pool instances.
3514 NOTE: The calling thread is not allowed to own any latches on pages! */
3515 void
buf_flush_sync_all_buf_pools(void)3516 buf_flush_sync_all_buf_pools(void)
3517 /*==============================*/
3518 {
3519 bool success;
3520 do {
3521 success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
3522 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3523 } while (!success);
3524
3525 ut_a(success);
3526 }
3527
3528 /** Request IO burst and wake page_cleaner up.
3529 @param[in] lsn_limit upper limit of LSN to be flushed */
3530 void
buf_flush_request_force(lsn_t lsn_limit)3531 buf_flush_request_force(
3532 lsn_t lsn_limit)
3533 {
3534 /* adjust based on lsn_avg_rate not to get old */
3535 lsn_t lsn_target = lsn_limit + lsn_avg_rate * 3;
3536
3537 mutex_enter(&page_cleaner.mutex);
3538 if (lsn_target > buf_flush_sync_lsn) {
3539 buf_flush_sync_lsn = lsn_target;
3540 }
3541 mutex_exit(&page_cleaner.mutex);
3542
3543 os_event_set(buf_flush_event);
3544 }
3545 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3546
3547 /** Functor to validate the flush list. */
3548 struct Check {
operator ()Check3549 void operator()(const buf_page_t* elem) const
3550 {
3551 ut_a(elem->in_flush_list);
3552 }
3553 };
3554
3555 /******************************************************************//**
3556 Validates the flush list.
3557 @return TRUE if ok */
3558 static
3559 ibool
buf_flush_validate_low(buf_pool_t * buf_pool)3560 buf_flush_validate_low(
3561 /*===================*/
3562 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
3563 {
3564 buf_page_t* bpage;
3565 const ib_rbt_node_t* rnode = NULL;
3566
3567 ut_ad(buf_flush_list_mutex_own(buf_pool));
3568
3569 ut_list_validate(buf_pool->flush_list, Check());
3570
3571 bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3572
3573 /* If we are in recovery mode i.e.: flush_rbt != NULL
3574 then each block in the flush_list must also be present
3575 in the flush_rbt. */
3576 if (buf_pool->flush_rbt != NULL) {
3577 rnode = rbt_first(buf_pool->flush_rbt);
3578 }
3579
3580 while (bpage != NULL) {
3581 const lsn_t om = bpage->oldest_modification;
3582
3583 ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
3584
3585 ut_ad(bpage->in_flush_list);
3586
3587 /* A page in buf_pool->flush_list can be in
3588 BUF_BLOCK_REMOVE_HASH state. This happens when a page
3589 is in the middle of being relocated. In that case the
3590 original descriptor can have this state and still be
3591 in the flush list waiting to acquire the
3592 buf_pool->flush_list_mutex to complete the relocation. */
3593 ut_a(buf_page_in_file(bpage)
3594 || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
3595 ut_a(om > 0);
3596
3597 if (buf_pool->flush_rbt != NULL) {
3598 buf_page_t** prpage;
3599
3600 ut_a(rnode != NULL);
3601 prpage = rbt_value(buf_page_t*, rnode);
3602
3603 ut_a(*prpage != NULL);
3604 ut_a(*prpage == bpage);
3605 rnode = rbt_next(buf_pool->flush_rbt, rnode);
3606 }
3607
3608 bpage = UT_LIST_GET_NEXT(list, bpage);
3609
3610 ut_a(bpage == NULL || om >= bpage->oldest_modification);
3611 }
3612
3613 /* By this time we must have exhausted the traversal of
3614 flush_rbt (if active) as well. */
3615 ut_a(rnode == NULL);
3616
3617 return(TRUE);
3618 }
3619
3620 /******************************************************************//**
3621 Validates the flush list.
3622 @return TRUE if ok */
3623 ibool
buf_flush_validate(buf_pool_t * buf_pool)3624 buf_flush_validate(
3625 /*===============*/
3626 buf_pool_t* buf_pool) /*!< buffer pool instance */
3627 {
3628 ibool ret;
3629
3630 buf_flush_list_mutex_enter(buf_pool);
3631
3632 ret = buf_flush_validate_low(buf_pool);
3633
3634 buf_flush_list_mutex_exit(buf_pool);
3635
3636 return(ret);
3637 }
3638
3639 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3640
3641 /******************************************************************//**
3642 Check if there are any dirty pages that belong to a space id in the flush
3643 list in a particular buffer pool.
3644 @return number of dirty pages present in a single buffer pool */
3645 ulint
buf_pool_get_dirty_pages_count(buf_pool_t * buf_pool,ulint id,FlushObserver * observer)3646 buf_pool_get_dirty_pages_count(
3647 /*===========================*/
3648 buf_pool_t* buf_pool, /*!< in: buffer pool */
3649 ulint id, /*!< in: space id to check */
3650 FlushObserver* observer) /*!< in: flush observer to check */
3651
3652 {
3653 ulint count = 0;
3654
3655 buf_pool_mutex_enter(buf_pool);
3656 buf_flush_list_mutex_enter(buf_pool);
3657
3658 buf_page_t* bpage;
3659
3660 for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3661 bpage != 0;
3662 bpage = UT_LIST_GET_NEXT(list, bpage)) {
3663
3664 ut_ad(buf_page_in_file(bpage));
3665 ut_ad(bpage->in_flush_list);
3666 ut_ad(bpage->oldest_modification > 0);
3667
3668 if ((observer != NULL
3669 && observer == bpage->flush_observer)
3670 || (observer == NULL
3671 && id == bpage->id.space())) {
3672 ++count;
3673 }
3674 }
3675
3676 buf_flush_list_mutex_exit(buf_pool);
3677 buf_pool_mutex_exit(buf_pool);
3678
3679 return(count);
3680 }
3681
3682 /******************************************************************//**
3683 Check if there are any dirty pages that belong to a space id in the flush list.
3684 @return number of dirty pages present in all the buffer pools */
3685 static
3686 ulint
buf_flush_get_dirty_pages_count(ulint id,FlushObserver * observer)3687 buf_flush_get_dirty_pages_count(
3688 /*============================*/
3689 ulint id, /*!< in: space id to check */
3690 FlushObserver* observer) /*!< in: flush observer to check */
3691 {
3692 ulint count = 0;
3693
3694 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3695 buf_pool_t* buf_pool;
3696
3697 buf_pool = buf_pool_from_array(i);
3698
3699 count += buf_pool_get_dirty_pages_count(buf_pool, id, observer);
3700 }
3701
3702 return(count);
3703 }
3704
3705 /** FlushObserver constructor
3706 @param[in] space tablespace
3707 @param[in] trx trx instance
3708 @param[in] stage performance schema accounting object,
3709 used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
3710 for accounting. */
FlushObserver(fil_space_t * space,trx_t * trx,ut_stage_alter_t * stage)3711 FlushObserver::FlushObserver(
3712 fil_space_t* space,
3713 trx_t* trx,
3714 ut_stage_alter_t* stage)
3715 :
3716 m_space(space),
3717 m_trx(trx),
3718 m_stage(stage),
3719 m_interrupted(false)
3720 {
3721 m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
3722 m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
3723
3724 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3725 m_flushed->at(i) = 0;
3726 m_removed->at(i) = 0;
3727 }
3728
3729 DBUG_LOG("flush", "FlushObserver(): trx->id=" << m_trx->id);
3730 }
3731
3732 /** FlushObserver deconstructor */
~FlushObserver()3733 FlushObserver::~FlushObserver()
3734 {
3735 ut_ad(buf_flush_get_dirty_pages_count(m_space->id, this) == 0);
3736
3737 UT_DELETE(m_flushed);
3738 UT_DELETE(m_removed);
3739
3740 DBUG_LOG("flush", "~FlushObserver(): trx->id=" << m_trx->id);
3741 }
3742
3743 /** Check whether the operation has been interrupted */
check_interrupted()3744 void FlushObserver::check_interrupted()
3745 {
3746 if (trx_is_interrupted(m_trx)) {
3747 interrupted();
3748 }
3749 }
3750
3751 /** Notify observer of a flush
3752 @param[in] buf_pool buffer pool instance
3753 @param[in] bpage buffer page to flush */
3754 void
notify_flush(buf_pool_t * buf_pool,buf_page_t * bpage)3755 FlushObserver::notify_flush(
3756 buf_pool_t* buf_pool,
3757 buf_page_t* bpage)
3758 {
3759 ut_ad(buf_pool_mutex_own(buf_pool));
3760
3761 m_flushed->at(buf_pool->instance_no)++;
3762
3763 if (m_stage != NULL) {
3764 m_stage->inc();
3765 }
3766
3767 DBUG_LOG("flush", "Flush " << bpage->id);
3768 }
3769
3770 /** Notify observer of a remove
3771 @param[in] buf_pool buffer pool instance
3772 @param[in] bpage buffer page flushed */
3773 void
notify_remove(buf_pool_t * buf_pool,buf_page_t * bpage)3774 FlushObserver::notify_remove(
3775 buf_pool_t* buf_pool,
3776 buf_page_t* bpage)
3777 {
3778 ut_ad(buf_pool_mutex_own(buf_pool));
3779
3780 m_removed->at(buf_pool->instance_no)++;
3781
3782 DBUG_LOG("flush", "Remove " << bpage->id);
3783 }
3784
3785 /** Flush dirty pages and wait. */
3786 void
flush()3787 FlushObserver::flush()
3788 {
3789 ut_ad(m_trx);
3790
3791 if (!m_interrupted && m_stage) {
3792 m_stage->begin_phase_flush(buf_flush_get_dirty_pages_count(
3793 m_space->id, this));
3794 }
3795
3796 buf_LRU_flush_or_remove_pages(m_space->id, this);
3797
3798 /* Wait for all dirty pages were flushed. */
3799 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3800 while (!is_complete(i)) {
3801
3802 os_thread_sleep(2000);
3803 }
3804 }
3805 }
3806