1 /*****************************************************************************
2
3 Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file buf/buf0flu.cc
29 The database buffer buf_pool flush algorithm
30
31 Created 11/11/1995 Heikki Tuuri
32 *******************************************************/
33
34 #include "ha_prototypes.h"
35 #include <mysql/service_thd_wait.h>
36 #include <my_dbug.h>
37
38 #include "buf0flu.h"
39
40 #ifdef UNIV_NONINL
41 #include "buf0flu.ic"
42 #endif
43
44 #include "buf0buf.h"
45 #include "buf0checksum.h"
46 #include "srv0start.h"
47 #include "srv0srv.h"
48 #include "page0zip.h"
49 #ifndef UNIV_HOTBACKUP
50 #include "ut0byte.h"
51 #include "page0page.h"
52 #include "fil0fil.h"
53 #include "buf0lru.h"
54 #include "buf0rea.h"
55 #include "ibuf0ibuf.h"
56 #include "log0log.h"
57 #include "os0file.h"
58 #include "trx0sys.h"
59 #include "srv0mon.h"
60 #include "fsp0sysspace.h"
61 #include "ut0stage.h"
62
63 #ifdef UNIV_LINUX
64 /* include defs for CPU time priority settings */
65 #include <unistd.h>
66 #include <sys/syscall.h>
67 #include <sys/time.h>
68 #include <sys/resource.h>
69 static const int buf_flush_page_cleaner_priority = -20;
70 #endif /* UNIV_LINUX */
71
72 /** Sleep time in microseconds for loop waiting for the oldest
73 modification lsn */
74 static const ulint buf_flush_wait_flushed_sleep_time = 10000;
75
76 /** Number of pages flushed through non flush_list flushes. */
77 static ulint buf_lru_flush_page_count = 0;
78
79 /** Flag indicating if the page_cleaner is in active state. This flag
80 is set to true by the page_cleaner thread when it is spawned and is set
81 back to false at shutdown by the page_cleaner as well. Therefore no
82 need to protect it by a mutex. It is only ever read by the thread
83 doing the shutdown */
84 bool buf_page_cleaner_is_active = false;
85
86 /** Factor for scan length to determine n_pages for intended oldest LSN
87 progress */
88 static ulint buf_flush_lsn_scan_factor = 3;
89
90 /** Average redo generation rate */
91 static lsn_t lsn_avg_rate = 0;
92
93 /** Target oldest LSN for the requested flush_sync */
94 static lsn_t buf_flush_sync_lsn = 0;
95
96 #ifdef UNIV_PFS_THREAD
97 mysql_pfs_key_t page_cleaner_thread_key;
98 #endif /* UNIV_PFS_THREAD */
99
100 /** Event to synchronise with the flushing. */
101 os_event_t buf_flush_event;
102
103 /** State for page cleaner array slot */
104 enum page_cleaner_state_t {
105 	/** No flush requested yet.
106 Moved from FINISHED by the coordinator. */
107 PAGE_CLEANER_STATE_NONE = 0,
108 /** Requested but not started flushing.
109 Moved from NONE by the coordinator. */
110 PAGE_CLEANER_STATE_REQUESTED,
111 	/** Flushing is ongoing.
112 Moved from REQUESTED by the worker. */
113 PAGE_CLEANER_STATE_FLUSHING,
114 /** Flushing was finished.
115 Moved from FLUSHING by the worker. */
116 PAGE_CLEANER_STATE_FINISHED
117 };
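/* Taken together, the transitions documented above form the cycle of a
slot's state: the coordinator moves a slot from NONE to REQUESTED, a worker
moves it to FLUSHING and then to FINISHED, and the coordinator finally moves
it back to NONE. */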
118
119 /** Page cleaner request state for each buffer pool instance */
120 struct page_cleaner_slot_t {
121 page_cleaner_state_t state; /*!< state of the request.
122 protected by page_cleaner_t::mutex
123 if the worker thread got the slot and
124 set to PAGE_CLEANER_STATE_FLUSHING,
125 n_flushed_lru and n_flushed_list can be
126 updated only by the worker thread */
127 /* This value is set during state==PAGE_CLEANER_STATE_NONE */
128 ulint n_pages_requested;
129 /*!< number of requested pages
130 for the slot */
131 /* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
132 	and committed with state==PAGE_CLEANER_STATE_FINISHED.
133 The consistency is protected by the 'state' */
134 ulint n_flushed_lru;
135 /*!< number of flushed pages
136 by LRU scan flushing */
137 ulint n_flushed_list;
138 /*!< number of flushed pages
139 by flush_list flushing */
140 bool succeeded_list;
141 /*!< true if flush_list flushing
142 succeeded. */
143 uint64_t flush_lru_time;
144 /*!< elapsed time for LRU flushing */
145 uint64_t flush_list_time;
146 /*!< elapsed time for flush_list
147 flushing */
148 ulint flush_lru_pass;
149 /*!< count to attempt LRU flushing */
150 ulint flush_list_pass;
151 /*!< count to attempt flush_list
152 flushing */
153 };
154
155 /** Page cleaner structure common for all threads */
156 struct page_cleaner_t {
157 ib_mutex_t mutex; /*!< mutex to protect whole of
158 page_cleaner_t struct and
159 page_cleaner_slot_t slots. */
160 os_event_t is_requested; /*!< event to activate worker
161 threads. */
162 os_event_t is_finished; /*!< event to signal that all
163 slots were finished. */
164 volatile ulint n_workers; /*!< number of worker threads
165 in existence */
166 bool requested; /*!< true if requested pages
167 to flush */
168 lsn_t lsn_limit; /*!< upper limit of LSN to be
169 flushed */
170 ulint n_slots; /*!< total number of slots */
171 ulint n_slots_requested;
172 /*!< number of slots
173 in the state
174 PAGE_CLEANER_STATE_REQUESTED */
175 ulint n_slots_flushing;
176 /*!< number of slots
177 in the state
178 PAGE_CLEANER_STATE_FLUSHING */
179 ulint n_slots_finished;
180 /*!< number of slots
181 in the state
182 PAGE_CLEANER_STATE_FINISHED */
183 uint64_t flush_time; /*!< elapsed time to flush
184 requests for all slots */
185 ulint flush_pass; /*!< count to finish to flush
186 requests for all slots */
187 page_cleaner_slot_t* slots; /*!< pointer to the slots */
188 bool is_running; /*!< false if attempt
189 to shutdown */
190
191 #ifdef UNIV_DEBUG
192 ulint n_disabled_debug;
193 					/*!< how many page cleaner threads
194 have been disabled */
195 #endif /* UNIV_DEBUG */
196 };
197
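/* The single page_cleaner_t instance shared by the coordinator and the
worker threads. */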
198 static page_cleaner_t* page_cleaner = NULL;
199
200 #ifdef UNIV_DEBUG
201 my_bool innodb_page_cleaner_disabled_debug;
202 #endif /* UNIV_DEBUG */
203
204 /** If LRU list of a buf_pool is less than this size then LRU eviction
205 should not happen. This is because when we do LRU flushing we also put
206 the blocks on free list. If LRU list is very small then we can end up
207 in thrashing. */
208 #define BUF_LRU_MIN_LEN 256
209
210 /* @} */
211
212 /******************************************************************//**
213 Increases flush_list size in bytes by the physical page size (inline function) */
214 static inline
215 void
216 incr_flush_list_size_in_bytes(
217 /*==========================*/
218 buf_block_t* block, /*!< in: control block */
219 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
220 {
221 ut_ad(buf_flush_list_mutex_own(buf_pool));
222
223 buf_pool->stat.flush_list_bytes += block->page.size.physical();
224
225 ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
226 }
227
228 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
229 /******************************************************************//**
230 Validates the flush list.
231 @return TRUE if ok */
232 static
233 ibool
234 buf_flush_validate_low(
235 /*===================*/
236 buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
237
238 /******************************************************************//**
239 Validates the flush list some of the time.
240 @return TRUE if ok or the check was skipped */
241 static
242 ibool
243 buf_flush_validate_skip(
244 /*====================*/
245 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
246 {
247 /** Try buf_flush_validate_low() every this many times */
248 # define BUF_FLUSH_VALIDATE_SKIP 23
249
250 /** The buf_flush_validate_low() call skip counter.
251 Use a signed type because of the race condition below. */
252 static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
253
254 /* There is a race condition below, but it does not matter,
255 because this call is only for heuristic purposes. We want to
256 reduce the call frequency of the costly buf_flush_validate_low()
257 check in debug builds. */
258 if (--buf_flush_validate_count > 0) {
259 return(TRUE);
260 }
261
262 buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
263 return(buf_flush_validate_low(buf_pool));
264 }
265 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
266
267 /******************************************************************//**
268 Inserts a block into the flush_rbt and returns a pointer to its
269 predecessor, or NULL if there is no predecessor. The ordering is maintained
270 on the basis of the <oldest_modification, space, offset> key.
271 @return pointer to the predecessor or NULL if no predecessor. */
272 static
273 buf_page_t*
274 buf_flush_insert_in_flush_rbt(
275 /*==========================*/
276 buf_page_t* bpage) /*!< in: bpage to be inserted. */
277 {
278 const ib_rbt_node_t* c_node;
279 const ib_rbt_node_t* p_node;
280 buf_page_t* prev = NULL;
281 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
282
283 ut_ad(buf_flush_list_mutex_own(buf_pool));
284
285 /* Insert this buffer into the rbt. */
286 c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
287 ut_a(c_node != NULL);
288
289 /* Get the predecessor. */
290 p_node = rbt_prev(buf_pool->flush_rbt, c_node);
291
292 if (p_node != NULL) {
293 buf_page_t** value;
294 value = rbt_value(buf_page_t*, p_node);
295 prev = *value;
296 ut_a(prev != NULL);
297 }
298
299 return(prev);
300 }
301
302 /*********************************************************//**
303 Delete a bpage from the flush_rbt. */
304 static
305 void
306 buf_flush_delete_from_flush_rbt(
307 /*============================*/
308 buf_page_t* bpage) /*!< in: bpage to be removed. */
309 {
310 #ifdef UNIV_DEBUG
311 ibool ret = FALSE;
312 #endif /* UNIV_DEBUG */
313 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
314
315 ut_ad(buf_flush_list_mutex_own(buf_pool));
316
317 #ifdef UNIV_DEBUG
318 ret =
319 #endif /* UNIV_DEBUG */
320 rbt_delete(buf_pool->flush_rbt, &bpage);
321
322 ut_ad(ret);
323 }
324
325 /*****************************************************************//**
326 Compare two modified blocks in the buffer pool. The key for comparison
327 is:
328 key = <oldest_modification, space, offset>
329 This comparison is used to maintain the ordering of blocks in the
330 buf_pool->flush_rbt.
331 Note that for the purpose of flush_rbt, we only need to order blocks
332 on the oldest_modification. The other two fields are used to uniquely
333 identify the blocks.
334 @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
335 static
336 int
337 buf_flush_block_cmp(
338 /*================*/
339 const void* p1, /*!< in: block1 */
340 const void* p2) /*!< in: block2 */
341 {
342 int ret;
343 const buf_page_t* b1 = *(const buf_page_t**) p1;
344 const buf_page_t* b2 = *(const buf_page_t**) p2;
345
346 ut_ad(b1 != NULL);
347 ut_ad(b2 != NULL);
348
349 #ifdef UNIV_DEBUG
350 buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
351 #endif /* UNIV_DEBUG */
352
353 ut_ad(buf_flush_list_mutex_own(buf_pool));
354
355 ut_ad(b1->in_flush_list);
356 ut_ad(b2->in_flush_list);
357
358 if (b2->oldest_modification > b1->oldest_modification) {
359 return(1);
360 } else if (b2->oldest_modification < b1->oldest_modification) {
361 return(-1);
362 }
363
364 /* If oldest_modification is same then decide on the space. */
365 ret = (int)(b2->id.space() - b1->id.space());
366
367 /* Or else decide ordering on the page number. */
368 return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no()));
369 }
370
371 /********************************************************************//**
372 Initialize the red-black tree to speed up insertions into the flush_list
373 during recovery process. Should be called at the start of recovery
374 process before any page has been read/written. */
375 void
376 buf_flush_init_flush_rbt(void)
377 /*==========================*/
378 {
379 ulint i;
380
381 for (i = 0; i < srv_buf_pool_instances; i++) {
382 buf_pool_t* buf_pool;
383
384 buf_pool = buf_pool_from_array(i);
385
386 buf_flush_list_mutex_enter(buf_pool);
387
388 ut_ad(buf_pool->flush_rbt == NULL);
389
390 /* Create red black tree for speedy insertions in flush list. */
391 buf_pool->flush_rbt = rbt_create(
392 sizeof(buf_page_t*), buf_flush_block_cmp);
393
394 buf_flush_list_mutex_exit(buf_pool);
395 }
396 }
397
398 /********************************************************************//**
399 Frees up the red-black tree. */
400 void
401 buf_flush_free_flush_rbt(void)
402 /*==========================*/
403 {
404 ulint i;
405
406 for (i = 0; i < srv_buf_pool_instances; i++) {
407 buf_pool_t* buf_pool;
408
409 buf_pool = buf_pool_from_array(i);
410
411 buf_flush_list_mutex_enter(buf_pool);
412
413 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
414 ut_a(buf_flush_validate_low(buf_pool));
415 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
416
417 rbt_free(buf_pool->flush_rbt);
418 buf_pool->flush_rbt = NULL;
419
420 buf_flush_list_mutex_exit(buf_pool);
421 }
422 }
423
424 /********************************************************************//**
425 Inserts a modified block into the flush list. */
426 void
427 buf_flush_insert_into_flush_list(
428 /*=============================*/
429 buf_pool_t* buf_pool, /*!< buffer pool instance */
430 buf_block_t* block, /*!< in/out: block which is modified */
431 lsn_t lsn) /*!< in: oldest modification */
432 {
433 ut_ad(!buf_pool_mutex_own(buf_pool));
434 ut_ad(log_flush_order_mutex_own());
435 ut_ad(buf_page_mutex_own(block));
436
437 buf_flush_list_mutex_enter(buf_pool);
438
439 ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
440 || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
441 <= lsn));
442
443 	/* If we are in recovery then we need to update the flush
444 red-black tree as well. */
445 if (buf_pool->flush_rbt != NULL) {
446 buf_flush_list_mutex_exit(buf_pool);
447 buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
448 return;
449 }
450
451 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
452 ut_ad(!block->page.in_flush_list);
453
454 ut_d(block->page.in_flush_list = TRUE);
455 block->page.oldest_modification = lsn;
456
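	/* The flush_list is kept ordered by decreasing oldest_modification
	from front to back. The assertion above guarantees that lsn is not
	smaller than the oldest_modification of the current head, so adding
	the page at the front preserves the ordering. */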
457 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
458
459 incr_flush_list_size_in_bytes(block, buf_pool);
460
461 #ifdef UNIV_DEBUG_VALGRIND
462 void* p;
463
464 if (block->page.size.is_compressed()) {
465 p = block->page.zip.data;
466 } else {
467 p = block->frame;
468 }
469
470 UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
471 #endif /* UNIV_DEBUG_VALGRIND */
472
473 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
474 ut_a(buf_flush_validate_skip(buf_pool));
475 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
476
477 buf_flush_list_mutex_exit(buf_pool);
478 }
479
480 /********************************************************************//**
481 Inserts a modified block into the flush list in the right sorted position.
482 This function is used by recovery, because there the modifications do not
483 necessarily come in the order of lsn's. */
484 void
485 buf_flush_insert_sorted_into_flush_list(
486 /*====================================*/
487 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
488 buf_block_t* block, /*!< in/out: block which is modified */
489 lsn_t lsn) /*!< in: oldest modification */
490 {
491 buf_page_t* prev_b;
492 buf_page_t* b;
493
494 ut_ad(!buf_pool_mutex_own(buf_pool));
495 ut_ad(log_flush_order_mutex_own());
496 ut_ad(buf_page_mutex_own(block));
497 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
498
499 buf_flush_list_mutex_enter(buf_pool);
500
501 /* The field in_LRU_list is protected by buf_pool->mutex, which
502 we are not holding. However, while a block is in the flush
503 list, it is dirty and cannot be discarded from either the
504 page_hash or the LRU list. At most, the uncompressed
505 page frame of a compressed block may be discarded or created
506 (copying the block->page to or from a buf_page_t that is
507 dynamically allocated from buf_buddy_alloc()). Because those
508 transitions hold block->mutex and the flush list mutex (via
509 buf_flush_relocate_on_flush_list()), there is no possibility
510 of a race condition in the assertions below. */
511 ut_ad(block->page.in_LRU_list);
512 ut_ad(block->page.in_page_hash);
513 /* buf_buddy_block_register() will take a block in the
514 BUF_BLOCK_MEMORY state, not a file page. */
515 ut_ad(!block->page.in_zip_hash);
516
517 ut_ad(!block->page.in_flush_list);
518 ut_d(block->page.in_flush_list = TRUE);
519 block->page.oldest_modification = lsn;
520
521 #ifdef UNIV_DEBUG_VALGRIND
522 void* p;
523
524 if (block->page.size.is_compressed()) {
525 p = block->page.zip.data;
526 } else {
527 p = block->frame;
528 }
529
530 UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
531 #endif /* UNIV_DEBUG_VALGRIND */
532
533 prev_b = NULL;
534
535 /* For the most part when this function is called the flush_rbt
536 should not be NULL. In a very rare boundary case it is possible
537 that the flush_rbt has already been freed by the recovery thread
538 before the last page was hooked up in the flush_list by the
539 io-handler thread. In that case we'll just do a simple
540 linear search in the else block. */
541 if (buf_pool->flush_rbt != NULL) {
542
543 prev_b = buf_flush_insert_in_flush_rbt(&block->page);
544
545 } else {
546
547 b = UT_LIST_GET_FIRST(buf_pool->flush_list);
548
549 while (b != NULL && b->oldest_modification
550 > block->page.oldest_modification) {
551
552 ut_ad(b->in_flush_list);
553 prev_b = b;
554 b = UT_LIST_GET_NEXT(list, b);
555 }
556 }
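	/* At this point prev_b is the block after which the new page must
	be inserted to keep the list ordered by decreasing
	oldest_modification, or NULL if the new page belongs at the head. */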
557
558 if (prev_b == NULL) {
559 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
560 } else {
561 UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page);
562 }
563
564 incr_flush_list_size_in_bytes(block, buf_pool);
565
566 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
567 ut_a(buf_flush_validate_low(buf_pool));
568 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
569
570 buf_flush_list_mutex_exit(buf_pool);
571 }
572
573 /********************************************************************//**
574 Returns TRUE if the file page block is immediately suitable for replacement,
575 i.e., the transition FILE_PAGE => NOT_USED is allowed.
576 @return TRUE if can replace immediately */
577 ibool
578 buf_flush_ready_for_replace(
579 /*========================*/
580 buf_page_t* bpage) /*!< in: buffer control block, must be
581 buf_page_in_file(bpage) and in the LRU list */
582 {
583 #ifdef UNIV_DEBUG
584 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
585 ut_ad(buf_pool_mutex_own(buf_pool));
586 #endif /* UNIV_DEBUG */
587 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
588 ut_ad(bpage->in_LRU_list);
589
590 if (buf_page_in_file(bpage)) {
591
592 return(bpage->oldest_modification == 0
593 && bpage->buf_fix_count == 0
594 && buf_page_get_io_fix(bpage) == BUF_IO_NONE);
595 }
596
597 ib::fatal() << "Buffer block " << bpage << " state " << bpage->state
598 << " in the LRU list!";
599
600 return(FALSE);
601 }
602
603 /********************************************************************//**
604 Returns true if the block is modified and ready for flushing.
605 @return true if can flush immediately */
606 bool
607 buf_flush_ready_for_flush(
608 /*======================*/
609 buf_page_t* bpage, /*!< in: buffer control block, must be
610 buf_page_in_file(bpage) */
611 buf_flush_t flush_type)/*!< in: type of flush */
612 {
613 #ifdef UNIV_DEBUG
614 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
615 ut_ad(buf_pool_mutex_own(buf_pool));
616 #endif /* UNIV_DEBUG */
617
618 ut_a(buf_page_in_file(bpage));
619 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
620 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
621
622 if (bpage->oldest_modification == 0
623 || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
624 return(false);
625 }
626
627 ut_ad(bpage->in_flush_list);
628
629 switch (flush_type) {
630 case BUF_FLUSH_LIST:
631 case BUF_FLUSH_LRU:
632 case BUF_FLUSH_SINGLE_PAGE:
633 return(true);
634
635 case BUF_FLUSH_N_TYPES:
636 break;
637 }
638
639 ut_error;
640 return(false);
641 }
642
643 /********************************************************************//**
644 Remove a block from the flush list of modified blocks. */
645 void
646 buf_flush_remove(
647 /*=============*/
648 buf_page_t* bpage) /*!< in: pointer to the block in question */
649 {
650 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
651
652 ut_ad(buf_pool_mutex_own(buf_pool));
653 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
654 ut_ad(bpage->in_flush_list);
655
656 buf_flush_list_mutex_enter(buf_pool);
657
658 /* Important that we adjust the hazard pointer before removing
659 the bpage from flush list. */
660 buf_pool->flush_hp.adjust(bpage);
661
662 switch (buf_page_get_state(bpage)) {
663 case BUF_BLOCK_POOL_WATCH:
664 case BUF_BLOCK_ZIP_PAGE:
665 /* Clean compressed pages should not be on the flush list */
666 case BUF_BLOCK_NOT_USED:
667 case BUF_BLOCK_READY_FOR_USE:
668 case BUF_BLOCK_MEMORY:
669 case BUF_BLOCK_REMOVE_HASH:
670 ut_error;
671 return;
672 case BUF_BLOCK_ZIP_DIRTY:
673 buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
674 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
675 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
676 buf_LRU_insert_zip_clean(bpage);
677 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
678 break;
679 case BUF_BLOCK_FILE_PAGE:
680 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
681 break;
682 }
683
684 /* If the flush_rbt is active then delete from there as well. */
685 if (buf_pool->flush_rbt != NULL) {
686 buf_flush_delete_from_flush_rbt(bpage);
687 }
688
689 /* Must be done after we have removed it from the flush_rbt
690 	because we assert on in_flush_list in the comparison function. */
691 ut_d(bpage->in_flush_list = FALSE);
692
693 buf_pool->stat.flush_list_bytes -= bpage->size.physical();
694
695 bpage->oldest_modification = 0;
696
697 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
698 ut_a(buf_flush_validate_skip(buf_pool));
699 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
700
701 	/* If there is an observer that wants to know when the asynchronous
702 	flushing is done, then notify it. */
703 if (bpage->flush_observer != NULL) {
704 bpage->flush_observer->notify_remove(buf_pool, bpage);
705
706 bpage->flush_observer = NULL;
707 }
708
709 buf_flush_list_mutex_exit(buf_pool);
710 }
711
712 /*******************************************************************//**
713 Relocates a buffer control block on the flush_list.
714 Note that it is assumed that the contents of bpage have already been
715 copied to dpage.
716 IMPORTANT: When this function is called bpage and dpage are not
717 exact copies of each other. For example, they both will have different
718 ::state. Also the ::list pointers in dpage may be stale. We need to
719 use the current list node (bpage) to do the list manipulation because
720 the list pointers could have changed between the time that we copied
721 the contents of bpage to the dpage and the flush list manipulation
722 below. */
723 void
724 buf_flush_relocate_on_flush_list(
725 /*=============================*/
726 buf_page_t* bpage, /*!< in/out: control block being moved */
727 buf_page_t* dpage) /*!< in/out: destination block */
728 {
729 buf_page_t* prev;
730 buf_page_t* prev_b = NULL;
731 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
732
733 ut_ad(buf_pool_mutex_own(buf_pool));
734 /* Must reside in the same buffer pool. */
735 ut_ad(buf_pool == buf_pool_from_bpage(dpage));
736
737 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
738
739 buf_flush_list_mutex_enter(buf_pool);
740
741 /* FIXME: At this point we have both buf_pool and flush_list
742 mutexes. Theoretically removal of a block from flush list is
743 only covered by flush_list mutex but currently we do
744 have buf_pool mutex in buf_flush_remove() therefore this block
745 is guaranteed to be in the flush list. We need to check if
746 this will work without the assumption of block removing code
747 having the buf_pool mutex. */
748 ut_ad(bpage->in_flush_list);
749 ut_ad(dpage->in_flush_list);
750
751 /* If recovery is active we must swap the control blocks in
752 the flush_rbt as well. */
753 if (buf_pool->flush_rbt != NULL) {
754 buf_flush_delete_from_flush_rbt(bpage);
755 prev_b = buf_flush_insert_in_flush_rbt(dpage);
756 }
757
758 /* Important that we adjust the hazard pointer before removing
759 the bpage from the flush list. */
760 buf_pool->flush_hp.adjust(bpage);
761
762 /* Must be done after we have removed it from the flush_rbt
763 	because we assert on in_flush_list in the comparison function. */
764 ut_d(bpage->in_flush_list = FALSE);
765
766 prev = UT_LIST_GET_PREV(list, bpage);
767 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
768
769 if (prev) {
770 ut_ad(prev->in_flush_list);
771 UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage);
772 } else {
773 UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage);
774 }
775
776 /* Just an extra check. Previous in flush_list
777 should be the same control block as in flush_rbt. */
778 ut_a(buf_pool->flush_rbt == NULL || prev_b == prev);
779
780 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
781 ut_a(buf_flush_validate_low(buf_pool));
782 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
783
784 buf_flush_list_mutex_exit(buf_pool);
785 }
786
787 /********************************************************************//**
788 Updates the flush system data structures when a write is completed. */
789 void
790 buf_flush_write_complete(
791 /*=====================*/
792 buf_page_t* bpage) /*!< in: pointer to the block in question */
793 {
794 buf_flush_t flush_type;
795 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
796
797 ut_ad(bpage);
798
799 buf_flush_remove(bpage);
800
801 flush_type = buf_page_get_flush_type(bpage);
802 buf_pool->n_flush[flush_type]--;
803
804 if (buf_pool->n_flush[flush_type] == 0
805 && buf_pool->init_flush[flush_type] == FALSE) {
806
807 /* The running flush batch has ended */
808
809 os_event_set(buf_pool->no_flush[flush_type]);
810 }
811
812 buf_dblwr_update(bpage, flush_type);
813 }
814 #endif /* !UNIV_HOTBACKUP */
815
816 /** Calculate the checksum of a page from a compressed table and update
817 the page.
818 @param[in,out] page page to update
819 @param[in] size compressed page size
820 @param[in] lsn LSN to stamp on the page */
821 void
822 buf_flush_update_zip_checksum(
823 buf_frame_t* page,
824 ulint size,
825 lsn_t lsn)
826 {
827 ut_a(size > 0);
828
829 const uint32_t checksum = page_zip_calc_checksum(
830 page, size,
831 static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
832
833 mach_write_to_8(page + FIL_PAGE_LSN, lsn);
834 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
835 }
836
837 /** Initialize a page for writing to the tablespace.
838 @param[in] block buffer block; NULL if bypassing the buffer pool
839 @param[in,out] page page frame
840 @param[in,out] page_zip_ compressed page, or NULL if uncompressed
841 @param[in] newest_lsn newest modification LSN to the page
842 @param[in] skip_checksum whether to disable the page checksum */
843 void
844 buf_flush_init_for_writing(
845 const buf_block_t* block,
846 byte* page,
847 void* page_zip_,
848 lsn_t newest_lsn,
849 bool skip_checksum)
850 {
851 ib_uint32_t checksum = BUF_NO_CHECKSUM_MAGIC;
852
853 ut_ad(block == NULL || block->frame == page);
854 ut_ad(block == NULL || page_zip_ == NULL
855 || &block->page.zip == page_zip_);
856 ut_ad(page);
857
858 if (page_zip_) {
859 page_zip_des_t* page_zip;
860 ulint size;
861
862 page_zip = static_cast<page_zip_des_t*>(page_zip_);
863 size = page_zip_get_size(page_zip);
864
865 ut_ad(size);
866 ut_ad(ut_is_2pow(size));
867 ut_ad(size <= UNIV_ZIP_SIZE_MAX);
868
869 switch (fil_page_get_type(page)) {
870 case FIL_PAGE_TYPE_ALLOCATED:
871 case FIL_PAGE_INODE:
872 case FIL_PAGE_IBUF_BITMAP:
873 case FIL_PAGE_TYPE_FSP_HDR:
874 case FIL_PAGE_TYPE_XDES:
875 /* These are essentially uncompressed pages. */
876 memcpy(page_zip->data, page, size);
877 /* fall through */
878 case FIL_PAGE_TYPE_ZBLOB:
879 case FIL_PAGE_TYPE_ZBLOB2:
880 case FIL_PAGE_INDEX:
881 case FIL_PAGE_RTREE:
882
883 buf_flush_update_zip_checksum(
884 page_zip->data, size, newest_lsn);
885
886 return;
887 }
888
889 ib::error() << "The compressed page to be written"
890 " seems corrupt:";
891 ut_print_buf(stderr, page, size);
892 fputs("\nInnoDB: Possibly older version of the page:", stderr);
893 ut_print_buf(stderr, page_zip->data, size);
894 putc('\n', stderr);
895 ut_error;
896 }
897
898 /* Write the newest modification lsn to the page header and trailer */
899 mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
900
901 mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
902 newest_lsn);
903
904 if (skip_checksum) {
905 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
906 } else {
907 if (block != NULL && UNIV_PAGE_SIZE == 16384) {
908 /* The page type could be garbage in old files
909 created before MySQL 5.5. Such files always
910 had a page size of 16 kilobytes. */
911 ulint page_type = fil_page_get_type(page);
912 ulint reset_type = page_type;
913
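			/* Pages at fixed offsets within each group of
			16384 pages have predetermined types (FSP header
			or extent descriptor, ibuf bitmap, and a few
			system tablespace pages); for all other offsets
			we keep recognised types and reset unknown ones
			below. */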
914 switch (block->page.id.page_no() % 16384) {
915 case 0:
916 reset_type = block->page.id.page_no() == 0
917 ? FIL_PAGE_TYPE_FSP_HDR
918 : FIL_PAGE_TYPE_XDES;
919 break;
920 case 1:
921 reset_type = FIL_PAGE_IBUF_BITMAP;
922 break;
923 case 5:
924 if (block->page.id.page_no() == 5 &&
925 block->page.id.space() == TRX_SYS_SPACE) {
926 reset_type = FIL_PAGE_TYPE_TRX_SYS;
927 }
928 break;
929 case 3:
930 case 6:
931 case 7:
932 if (block->page.id.page_no() < 16384 &&
933 block->page.id.space() == TRX_SYS_SPACE) {
934 reset_type = FIL_PAGE_TYPE_SYS;
935 }
936 break;
937 case 4:
938 if (block->page.id.page_no() == 4 &&
939 block->page.id.space() == TRX_SYS_SPACE) {
940 reset_type = FIL_PAGE_INDEX;
941 }
942 break;
943 default:
944 switch (page_type) {
945 case FIL_PAGE_INDEX:
946 case FIL_PAGE_RTREE:
947 case FIL_PAGE_UNDO_LOG:
948 case FIL_PAGE_INODE:
949 case FIL_PAGE_IBUF_FREE_LIST:
950 case FIL_PAGE_TYPE_ALLOCATED:
951 case FIL_PAGE_TYPE_SYS:
952 case FIL_PAGE_TYPE_TRX_SYS:
953 case FIL_PAGE_TYPE_BLOB:
954 case FIL_PAGE_TYPE_ZBLOB:
955 case FIL_PAGE_TYPE_ZBLOB2:
956 break;
957 case FIL_PAGE_TYPE_FSP_HDR:
958 case FIL_PAGE_TYPE_XDES:
959 case FIL_PAGE_IBUF_BITMAP:
960 /* These pages should have
961 predetermined page numbers
962 (see above). */
963 default:
964 reset_type = FIL_PAGE_TYPE_UNKNOWN;
965 break;
966 }
967 }
968
969 if (UNIV_UNLIKELY(page_type != reset_type)) {
970 ib::info()
971 << "Resetting invalid page "
972 << block->page.id << " type "
973 << page_type << " to "
974 << reset_type << " when flushing.";
975 fil_page_set_type(page, reset_type);
976 }
977 }
978
979 switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
980 case SRV_CHECKSUM_ALGORITHM_CRC32:
981 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
982 checksum = buf_calc_page_crc32(page);
983 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
984 checksum);
985 break;
986 case SRV_CHECKSUM_ALGORITHM_INNODB:
987 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
988 checksum = (srv_fast_checksum) ?
989 (ib_uint32_t) buf_calc_page_new_checksum_32(page) :
990 (ib_uint32_t) buf_calc_page_new_checksum(page);
991 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
992 checksum);
993 checksum = (ib_uint32_t) buf_calc_page_old_checksum(
994 page);
995 break;
996 case SRV_CHECKSUM_ALGORITHM_NONE:
997 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
998 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
999 checksum);
1000 break;
1001 /* no default so the compiler will emit a warning if
1002 new enum is added and not handled here */
1003 }
1004 }
1005
1006 /* With the InnoDB checksum, we overwrite the first 4 bytes of
1007 the end lsn field to store the old formula checksum. Since it
1008 depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
1009 be calculated after storing the new formula checksum.
1010
1011 In other cases we write the same value to both fields.
1012 If CRC32 is used then it is faster to use that checksum
1013 (calculated above) instead of calculating another one.
1014 We can afford to store something other than
1015 buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
1016 this field because the file will not be readable by old
1017 versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
1018
1019 mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
1020 checksum);
1021 }
1022
1023 #ifndef UNIV_HOTBACKUP
1024 /********************************************************************//**
1025 Does an asynchronous write of a buffer page. NOTE: in simulated aio and
1026 also when the doublewrite buffer is used, we must call
1027 buf_dblwr_flush_buffered_writes after we have posted a batch of
1028 writes! */
1029 static
1030 void
1031 buf_flush_write_block_low(
1032 /*======================*/
1033 buf_page_t* bpage, /*!< in: buffer block to write */
1034 buf_flush_t flush_type, /*!< in: type of flush */
1035 bool sync) /*!< in: true if sync IO request */
1036 {
1037 page_t* frame = NULL;
1038
1039 #ifdef UNIV_DEBUG
1040 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1041 ut_ad(!buf_pool_mutex_own(buf_pool));
1042 #endif /* UNIV_DEBUG */
1043
1044 DBUG_PRINT("ib_buf", ("flush %s %u page " UINT32PF ":" UINT32PF,
1045 sync ? "sync" : "async", (unsigned) flush_type,
1046 bpage->id.space(), bpage->id.page_no()));
1047
1048 ut_ad(buf_page_in_file(bpage));
1049
1050 /* We are not holding buf_pool->mutex or block_mutex here.
1051 Nevertheless, it is safe to access bpage, because it is
1052 io_fixed and oldest_modification != 0. Thus, it cannot be
1053 relocated in the buffer pool or removed from flush_list or
1054 LRU_list. */
1055 ut_ad(!buf_pool_mutex_own(buf_pool));
1056 ut_ad(!buf_flush_list_mutex_own(buf_pool));
1057 ut_ad(!buf_page_get_mutex(bpage)->is_owned());
1058 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1059 ut_ad(bpage->oldest_modification != 0);
1060
1061 #ifdef UNIV_IBUF_COUNT_DEBUG
1062 ut_a(ibuf_count_get(bpage->id) == 0);
1063 #endif /* UNIV_IBUF_COUNT_DEBUG */
1064
1065 ut_ad(bpage->newest_modification != 0);
1066
1067 /* Force the log to the disk before writing the modified block */
1068 if (!srv_read_only_mode) {
1069 log_write_up_to(bpage->newest_modification, true);
1070 }
1071
1072 switch (buf_page_get_state(bpage)) {
1073 case BUF_BLOCK_POOL_WATCH:
1074 case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1075 case BUF_BLOCK_NOT_USED:
1076 case BUF_BLOCK_READY_FOR_USE:
1077 case BUF_BLOCK_MEMORY:
1078 case BUF_BLOCK_REMOVE_HASH:
1079 ut_error;
1080 break;
1081 case BUF_BLOCK_ZIP_DIRTY:
1082 frame = bpage->zip.data;
1083
1084 mach_write_to_8(frame + FIL_PAGE_LSN,
1085 bpage->newest_modification);
1086
1087 ut_a(page_zip_verify_checksum(frame, bpage->size.physical()));
1088 break;
1089 case BUF_BLOCK_FILE_PAGE:
1090 frame = bpage->zip.data;
1091 if (!frame) {
1092 frame = ((buf_block_t*) bpage)->frame;
1093 }
1094
1095 buf_flush_init_for_writing(
1096 reinterpret_cast<const buf_block_t*>(bpage),
1097 reinterpret_cast<const buf_block_t*>(bpage)->frame,
1098 bpage->zip.data ? &bpage->zip : NULL,
1099 bpage->newest_modification,
1100 fsp_is_checksum_disabled(bpage->id.space()));
1101 break;
1102 }
1103
1104 /* Disable use of double-write buffer for temporary tablespace.
1105 	Given the nature and load of the temporary tablespace, the doublewrite
1106 	buffer adds overhead during flushing. */
1107
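	/* Three write paths follow: a direct fil_io() write when the
	doublewrite buffer is not usable for this page, a synchronous single
	page write through the doublewrite buffer, or queueing the page into
	the doublewrite batch. */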
1108 if (!srv_use_doublewrite_buf
1109 || buf_dblwr == NULL
1110 || srv_read_only_mode
1111 || fsp_is_system_temporary(bpage->id.space())) {
1112
1113 ut_ad(!srv_read_only_mode
1114 || fsp_is_system_temporary(bpage->id.space()));
1115
1116 ulint type = IORequest::WRITE | IORequest::DO_NOT_WAKE;
1117
1118 IORequest request(type);
1119
1120 fil_io(request,
1121 sync, bpage->id, bpage->size, 0, bpage->size.physical(),
1122 frame, bpage);
1123
1124 } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
1125 buf_dblwr_write_single_page(bpage, sync);
1126 } else {
1127 ut_ad(!sync);
1128 buf_dblwr_add_to_batch(bpage);
1129 }
1130
1131 /* When doing single page flushing the IO is done synchronously
1132 and we flush the changes to disk only for the tablespace we
1133 are working on. */
1134 if (sync) {
1135 ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
1136 fil_flush(bpage->id.space());
1137
1138 /* true means we want to evict this page from the
1139 LRU list as well. */
1140 buf_page_io_complete(bpage, true);
1141 }
1142
1143 /* Increment the counter of I/O operations used
1144 for selecting LRU policy. */
1145 buf_LRU_stat_inc_io();
1146 }
1147
1148 /********************************************************************//**
1149 Writes a flushable page asynchronously from the buffer pool to a file.
1150 NOTE: in simulated aio we must call
1151 os_aio_simulated_wake_handler_threads after we have posted a batch of
1152 writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
1153 held upon entering this function, and they will be released by this
1154 function if it returns true.
1155 @return TRUE if the page was flushed */
1156 ibool
1157 buf_flush_page(
1158 /*===========*/
1159 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1160 buf_page_t* bpage, /*!< in: buffer control block */
1161 buf_flush_t flush_type, /*!< in: type of flush */
1162 bool sync) /*!< in: true if sync IO request */
1163 {
1164 BPageMutex* block_mutex;
1165
1166 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1167 ut_ad(buf_pool_mutex_own(buf_pool));
1168 ut_ad(buf_page_in_file(bpage));
1169 ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1170
1171 block_mutex = buf_page_get_mutex(bpage);
1172 ut_ad(mutex_own(block_mutex));
1173
1174 ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1175
1176 bool is_uncompressed;
1177
1178 is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1179 ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1180
1181 ibool flush;
1182 rw_lock_t* rw_lock;
1183 bool no_fix_count = bpage->buf_fix_count == 0;
1184
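	/* Decide whether this page can be flushed now: a compressed-only
	page can always proceed, while an uncompressed page is skipped when
	it is buffer-fixed (unless this is a flush_list flush) or, for a
	temporary tablespace page before shutdown cleanup is over, whenever
	it is buffer-fixed at all. */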
1185 if (!is_uncompressed) {
1186 flush = TRUE;
1187 rw_lock = NULL;
1188 } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)
1189 || (!no_fix_count
1190 && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP
1191 && fsp_is_system_temporary(bpage->id.space()))) {
1192 /* This is a heuristic, to avoid expensive SX attempts. */
1193 		/* For a table residing in the temporary tablespace, sync is
1194 		done using IO_FIX, so before scheduling the flush ensure that
1195 		the page is not fixed. */
1196 flush = FALSE;
1197 } else {
1198 rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1199 if (flush_type != BUF_FLUSH_LIST) {
1200 flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
1201 } else {
1202 /* Will SX lock later */
1203 flush = TRUE;
1204 }
1205 }
1206
1207 if (flush) {
1208
1209 /* We are committed to flushing by the time we get here */
1210
1211 buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1212
1213 buf_page_set_flush_type(bpage, flush_type);
1214
1215 if (buf_pool->n_flush[flush_type] == 0) {
1216 os_event_reset(buf_pool->no_flush[flush_type]);
1217 }
1218
1219 ++buf_pool->n_flush[flush_type];
1220
1221 mutex_exit(block_mutex);
1222
1223 buf_pool_mutex_exit(buf_pool);
1224
1225 if (flush_type == BUF_FLUSH_LIST
1226 && is_uncompressed
1227 && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
1228
1229 if (!fsp_is_system_temporary(bpage->id.space())) {
1230 				/* To avoid a possible deadlock involving the
1231 				doublewrite buffer, flush it first, because
1232 				it might hold another block->lock. */
1233 buf_dblwr_flush_buffered_writes();
1234 } else {
1235 buf_dblwr_sync_datafiles();
1236 }
1237
1238 rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
1239 }
1240
1241 		/* If there is an observer that wants to know when the
1242 		asynchronous flushing is started, then notify it.
1243 		Note: we set the flush observer to a page with x-latch, so we can
1244 		guarantee that notify_flush and notify_remove are called in pairs
1245 		with s-latch on an uncompressed page. */
1246 if (bpage->flush_observer != NULL) {
1247 buf_pool_mutex_enter(buf_pool);
1248
1249 bpage->flush_observer->notify_flush(buf_pool, bpage);
1250
1251 buf_pool_mutex_exit(buf_pool);
1252 }
1253
1254 /* Even though bpage is not protected by any mutex at this
1255 point, it is safe to access bpage, because it is io_fixed and
1256 oldest_modification != 0. Thus, it cannot be relocated in the
1257 buffer pool or removed from flush_list or LRU_list. */
1258
1259 buf_flush_write_block_low(bpage, flush_type, sync);
1260 }
1261
1262 return(flush);
1263 }
1264
1265 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1266 /********************************************************************//**
1267 Writes a flushable page asynchronously from the buffer pool to a file.
1268 NOTE: buf_pool->mutex and block->mutex must be held upon entering this
1269 function, and they will be released by this function after flushing.
1270 This is loosely based on buf_flush_batch() and buf_flush_page().
1271 @return TRUE if the page was flushed and the mutexes released */
1272 ibool
1273 buf_flush_page_try(
1274 /*===============*/
1275 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
1276 buf_block_t* block) /*!< in/out: buffer control block */
1277 {
1278 ut_ad(buf_pool_mutex_own(buf_pool));
1279 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1280 ut_ad(buf_page_mutex_own(block));
1281
1282 if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1283 return(FALSE);
1284 }
1285
1286 /* The following call will release the buffer pool and
1287 block mutex. */
1288 return(buf_flush_page(
1289 buf_pool, &block->page,
1290 BUF_FLUSH_SINGLE_PAGE, true));
1291 }
1292 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1293
1294 /** Check whether the page is in the buffer pool and can be flushed.
1295 @param[in] page_id page id
1296 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1297 @return true if the page can be flushed. */
1298 static
1299 bool
1300 buf_flush_check_neighbor(
1301 const page_id_t& page_id,
1302 buf_flush_t flush_type)
1303 {
1304 buf_page_t* bpage;
1305 buf_pool_t* buf_pool = buf_pool_get(page_id);
1306 bool ret;
1307
1308 ut_ad(flush_type == BUF_FLUSH_LRU
1309 || flush_type == BUF_FLUSH_LIST);
1310
1311 buf_pool_mutex_enter(buf_pool);
1312
1313 /* We only want to flush pages from this buffer pool. */
1314 bpage = buf_page_hash_get(buf_pool, page_id);
1315
1316 if (!bpage) {
1317
1318 buf_pool_mutex_exit(buf_pool);
1319 return(false);
1320 }
1321
1322 ut_a(buf_page_in_file(bpage));
1323
1324 /* We avoid flushing 'non-old' blocks in an LRU flush,
1325 because the flushed blocks are soon freed */
1326
1327 ret = false;
1328 if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1329 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1330
1331 mutex_enter(block_mutex);
1332 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1333 ret = true;
1334 }
1335 mutex_exit(block_mutex);
1336 }
1337 buf_pool_mutex_exit(buf_pool);
1338
1339 return(ret);
1340 }
1341
1342 /** Flushes to disk all flushable pages within the flush area.
1343 @param[in] page_id page id
1344 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1345 @param[in] n_flushed number of pages flushed so far in this batch
1346 @param[in] n_to_flush maximum number of pages we are allowed to flush
1347 @return number of pages flushed */
1348 static
1349 ulint
1350 buf_flush_try_neighbors(
1351 const page_id_t& page_id,
1352 buf_flush_t flush_type,
1353 ulint n_flushed,
1354 ulint n_to_flush)
1355 {
1356 ulint i;
1357 ulint low;
1358 ulint high;
1359 ulint count = 0;
1360 buf_pool_t* buf_pool = buf_pool_get(page_id);
1361
1362 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1363
1364 if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1365 || srv_flush_neighbors == 0) {
1366 /* If there is little space or neighbor flushing is
1367 not enabled then just flush the victim. */
1368 low = page_id.page_no();
1369 high = page_id.page_no() + 1;
1370 } else {
1371 /* When flushed, dirty blocks are searched in
1372 neighborhoods of this size, and flushed along with the
1373 original page. */
1374
1375 ulint buf_flush_area;
1376
1377 buf_flush_area = ut_min(
1378 BUF_READ_AHEAD_AREA(buf_pool),
1379 buf_pool->curr_size / 16);
1380
1381 low = (page_id.page_no() / buf_flush_area) * buf_flush_area;
1382 high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area;
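		/* For example, with buf_flush_area == 64 and
		page_no == 130, low == 128 and high == 192. */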
1383
1384 if (srv_flush_neighbors == 1) {
1385 /* adjust 'low' and 'high' to limit
1386 for contiguous dirty area */
1387 if (page_id.page_no() > low) {
1388 for (i = page_id.page_no() - 1; i >= low; i--) {
1389 if (!buf_flush_check_neighbor(
1390 page_id_t(page_id.space(), i),
1391 flush_type)) {
1392
1393 break;
1394 }
1395
1396 if (i == low) {
1397 					/* Avoid wrap-around when low == 0,
1398 					which would make us call
1399 					buf_flush_check_neighbor() with
1400 					i == (ulint) -1 */
1401 i--;
1402 break;
1403 }
1404 }
1405 low = i + 1;
1406 }
1407
1408 for (i = page_id.page_no() + 1;
1409 i < high
1410 && buf_flush_check_neighbor(
1411 page_id_t(page_id.space(), i),
1412 flush_type);
1413 i++) {
1414 /* do nothing */
1415 }
1416 high = i;
1417 }
1418 }
1419
1420 const ulint space_size = fil_space_get_size(page_id.space());
1421 if (high > space_size) {
1422 high = space_size;
1423 }
1424
1425 DBUG_PRINT("ib_buf", ("flush " UINT32PF ":%u..%u",
1426 page_id.space(),
1427 (unsigned) low, (unsigned) high));
1428
1429 for (ulint i = low; i < high; i++) {
1430 buf_page_t* bpage;
1431
1432 if ((count + n_flushed) >= n_to_flush) {
1433
1434 /* We have already flushed enough pages and
1435 should call it a day. There is, however, one
1436 exception. If the page whose neighbors we
1437 are flushing has not been flushed yet then
1438 we'll try to flush the victim that we
1439 selected originally. */
1440 if (i <= page_id.page_no()) {
1441 i = page_id.page_no();
1442 } else {
1443 break;
1444 }
1445 }
1446
1447 const page_id_t cur_page_id(page_id.space(), i);
1448
1449 buf_pool = buf_pool_get(cur_page_id);
1450
1451 buf_pool_mutex_enter(buf_pool);
1452
1453 /* We only want to flush pages from this buffer pool. */
1454 bpage = buf_page_hash_get(buf_pool, cur_page_id);
1455
1456 if (bpage == NULL) {
1457
1458 buf_pool_mutex_exit(buf_pool);
1459 continue;
1460 }
1461
1462 ut_a(buf_page_in_file(bpage));
1463
1464 /* We avoid flushing 'non-old' blocks in an LRU flush,
1465 because the flushed blocks are soon freed */
1466
1467 if (flush_type != BUF_FLUSH_LRU
1468 || i == page_id.page_no()
1469 || buf_page_is_old(bpage)) {
1470
1471 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1472
1473 mutex_enter(block_mutex);
1474
1475 if (buf_flush_ready_for_flush(bpage, flush_type)
1476 && (i == page_id.page_no()
1477 || bpage->buf_fix_count == 0)) {
1478
1479 /* We also try to flush those
1480 neighbors != offset */
1481
1482 if (buf_flush_page(
1483 buf_pool, bpage, flush_type, false)) {
1484
1485 ++count;
1486 } else {
1487 mutex_exit(block_mutex);
1488 buf_pool_mutex_exit(buf_pool);
1489 }
1490
1491 continue;
1492 } else {
1493 mutex_exit(block_mutex);
1494 }
1495 }
1496 buf_pool_mutex_exit(buf_pool);
1497 }
1498
1499 if (count > 1) {
1500 MONITOR_INC_VALUE_CUMULATIVE(
1501 MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1502 MONITOR_FLUSH_NEIGHBOR_COUNT,
1503 MONITOR_FLUSH_NEIGHBOR_PAGES,
1504 (count - 1));
1505 }
1506
1507 return(count);
1508 }
1509
1510 /** Check if the block is modified and ready for flushing.
1511 If the block is ready to flush, then flush the page and try to flush
1512 its neighbors.
1513 @param[in] bpage buffer control block,
1514 must be buf_page_in_file(bpage)
1515 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1516 @param[in] n_to_flush number of pages to flush
1517 @param[in,out] count number of pages flushed
1518 @return TRUE if buf_pool mutex was released during this function.
1519 This does not guarantee that any pages were actually written.
1520 The number of pages written is added to the count. */
1521 static
1522 bool
1523 buf_flush_page_and_try_neighbors(
1524 buf_page_t* bpage,
1525 buf_flush_t flush_type,
1526 ulint n_to_flush,
1527 ulint* count)
1528 {
1529 #ifdef UNIV_DEBUG
1530 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1531
1532 ut_ad(buf_pool_mutex_own(buf_pool));
1533 #endif /* UNIV_DEBUG */
1534
1535 bool flushed;
1536 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1537
1538 mutex_enter(block_mutex);
1539
1540 ut_a(buf_page_in_file(bpage));
1541
1542 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1543 buf_pool_t* buf_pool;
1544
1545 buf_pool = buf_pool_from_bpage(bpage);
1546
1547 const page_id_t page_id = bpage->id;
1548
1549 mutex_exit(block_mutex);
1550
1551 buf_pool_mutex_exit(buf_pool);
1552
1553 /* Try to flush also all the neighbors */
1554 *count += buf_flush_try_neighbors(
1555 page_id, flush_type, *count, n_to_flush);
1556
1557 buf_pool_mutex_enter(buf_pool);
1558 		flushed = true;
1559 } else {
1560 mutex_exit(block_mutex);
1561
1562 flushed = false;
1563 }
1564
1565 ut_ad(buf_pool_mutex_own(buf_pool));
1566
1567 return(flushed);
1568 }
1569
1570 /*******************************************************************//**
1571 This utility moves the uncompressed frames of pages to the free list.
1572 Note that this function does not actually flush any data to disk. It
1573 just detaches the uncompressed frames from the compressed pages at the
1574 tail of the unzip_LRU and puts those freed frames in the free list.
1575 Note that it is a best effort attempt and it is not guaranteed that
1576 after a call to this function there will be 'max' blocks in the free
1577 list.
1578 @return number of blocks moved to the free list. */
1579 static
1580 ulint
1581 buf_free_from_unzip_LRU_list_batch(
1582 /*===============================*/
1583 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1584 ulint max) /*!< in: desired number of
1585 blocks in the free_list */
1586 {
1587 ulint scanned = 0;
1588 ulint count = 0;
1589 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1590 ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1591
1592 ut_ad(buf_pool_mutex_own(buf_pool));
1593
1594 buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1595
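	/* Keep detaching frames from the tail of the unzip_LRU while we
	still want more free blocks, the free list is shorter than
	srv_LRU_scan_depth, and the unzip_LRU is longer than a tenth of the
	LRU list. */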
1596 while (block != NULL
1597 && count < max
1598 && free_len < srv_LRU_scan_depth
1599 && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1600
1601 ++scanned;
1602 if (buf_LRU_free_page(&block->page, false)) {
1603 /* Block was freed. buf_pool->mutex potentially
1604 released and reacquired */
1605 ++count;
1606 block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1607
1608 } else {
1609
1610 block = UT_LIST_GET_PREV(unzip_LRU, block);
1611 }
1612
1613 free_len = UT_LIST_GET_LEN(buf_pool->free);
1614 lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1615 }
1616
1617 ut_ad(buf_pool_mutex_own(buf_pool));
1618
1619 if (scanned) {
1620 MONITOR_INC_VALUE_CUMULATIVE(
1621 MONITOR_LRU_BATCH_SCANNED,
1622 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1623 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1624 scanned);
1625 }
1626
1627 return(count);
1628 }
1629
1630 /*******************************************************************//**
1631 This utility flushes dirty blocks from the end of the LRU list.
1632 The calling thread is not allowed to own any latches on pages!
1633 It attempts to make 'max' blocks available in the free list. Note that
1634 it is a best effort attempt and it is not guaranteed that after a call
1635 to this function there will be 'max' blocks in the free list.
1636 @return number of blocks for which the write request was queued. */
1637 static
1638 ulint
1639 buf_flush_LRU_list_batch(
1640 /*=====================*/
1641 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1642 ulint max) /*!< in: desired number of
1643 blocks in the free_list */
1644 {
1645 buf_page_t* bpage;
1646 ulint scanned = 0;
1647 ulint evict_count = 0;
1648 ulint count = 0;
1649 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1650 ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1651 ulint withdraw_depth = 0;
1652
1653 ut_ad(buf_pool_mutex_own(buf_pool));
1654
1655 if (buf_pool->curr_size < buf_pool->old_size
1656 && buf_pool->withdraw_target > 0) {
1657 withdraw_depth = buf_pool->withdraw_target
1658 - UT_LIST_GET_LEN(buf_pool->withdraw);
1659 }
1660
1661 for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1662 bpage != NULL && count + evict_count < max
1663 && free_len < srv_LRU_scan_depth + withdraw_depth
1664 && lru_len > BUF_LRU_MIN_LEN;
1665 ++scanned,
1666 bpage = buf_pool->lru_hp.get()) {
1667
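		/* Save the predecessor as a hazard pointer before we
		possibly release buf_pool->mutex below; any thread removing
		that block from the LRU list is expected to adjust the
		hazard pointer, so resuming from lru_hp.get() stays safe. */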
1668 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
1669 buf_pool->lru_hp.set(prev);
1670
1671 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1672
1673 mutex_enter(block_mutex);
1674
1675 if (buf_flush_ready_for_replace(bpage)) {
1676 /* block is ready for eviction i.e., it is
1677 clean and is not IO-fixed or buffer fixed. */
1678 mutex_exit(block_mutex);
1679 if (buf_LRU_free_page(bpage, true)) {
1680 ++evict_count;
1681 }
1682 } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) {
1683 /* Block is ready for flush. Dispatch an IO
1684 request. The IO helper thread will put it on
1685 free list in IO completion routine. */
1686 mutex_exit(block_mutex);
1687 buf_flush_page_and_try_neighbors(
1688 bpage, BUF_FLUSH_LRU, max, &count);
1689 } else {
1690 /* Can't evict or dispatch this block. Go to
1691 previous. */
1692 ut_ad(buf_pool->lru_hp.is_hp(prev));
1693 mutex_exit(block_mutex);
1694 }
1695
1696 ut_ad(!mutex_own(block_mutex));
1697 ut_ad(buf_pool_mutex_own(buf_pool));
1698
1699 free_len = UT_LIST_GET_LEN(buf_pool->free);
1700 lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1701 }
1702
1703 buf_pool->lru_hp.set(NULL);
1704
1705 /* We keep track of all flushes happening as part of LRU
1706 flush. When estimating the desired rate at which flush_list
1707 should be flushed, we factor in this value. */
1708 buf_lru_flush_page_count += count;
1709
1710 ut_ad(buf_pool_mutex_own(buf_pool));
1711
1712 if (evict_count) {
1713 MONITOR_INC_VALUE_CUMULATIVE(
1714 MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
1715 MONITOR_LRU_BATCH_EVICT_COUNT,
1716 MONITOR_LRU_BATCH_EVICT_PAGES,
1717 evict_count);
1718 }
1719
1720 if (scanned) {
1721 MONITOR_INC_VALUE_CUMULATIVE(
1722 MONITOR_LRU_BATCH_SCANNED,
1723 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1724 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1725 scanned);
1726 }
1727
1728 return(count);
1729 }
1730
1731 /*******************************************************************//**
1732 Flush and move pages from LRU or unzip_LRU list to the free list.
1733 Whether LRU or unzip_LRU is used depends on the state of the system.
1734 @return number of blocks for which either the write request was queued
1735 or in case of unzip_LRU the number of blocks actually moved to the
1736 free list */
1737 static
1738 ulint
1739 buf_do_LRU_batch(
1740 /*=============*/
1741 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1742 ulint max) /*!< in: desired number of
1743 blocks in the free_list */
1744 {
1745 ulint count = 0;
1746
1747 if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1748 count += buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1749 }
1750
1751 if (max > count) {
1752 count += buf_flush_LRU_list_batch(buf_pool, max - count);
1753 }
1754
1755 return(count);
1756 }
1757
1758 /** This utility flushes dirty blocks from the end of the flush_list.
1759 The calling thread is not allowed to own any latches on pages!
1760 @param[in] buf_pool buffer pool instance
1761 @param[in]	min_n		wished minimum number of blocks flushed (it is
1762 not guaranteed that the actual number is that big, though)
1763 @param[in] lsn_limit all blocks whose oldest_modification is smaller
1764 than this should be flushed (if their number does not exceed min_n)
1765 @return number of blocks for which the write request was queued;
1766 ULINT_UNDEFINED if there was a flush of the same type already
1767 running */
1768 static
1769 ulint
1770 buf_do_flush_list_batch(
1771 buf_pool_t* buf_pool,
1772 ulint min_n,
1773 lsn_t lsn_limit)
1774 {
1775 ulint count = 0;
1776 ulint scanned = 0;
1777
1778 ut_ad(buf_pool_mutex_own(buf_pool));
1779
1780 /* Start from the end of the list looking for a suitable
1781 block to be flushed. */
1782 buf_flush_list_mutex_enter(buf_pool);
1783 ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1784
1785 	/* In order not to degenerate this scan to O(n*n) we attempt
1786 	to preserve a pointer to the previous block in the flush list. To do
1787 so we declare it a hazard pointer. Any thread working on the
1788 flush list must check the hazard pointer and if it is removing
1789 the same block then it must reset it. */
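	/* Illustration of the hazard pointer (hypothetical list contents):
	if the flush_list tail is ... -> C -> B -> A and we are about to
	flush A, we store B in flush_hp before releasing the flush list
	mutex. If another thread removes B from the list in the meantime,
	it sees that B is the hazard pointer and adjusts flush_hp (e.g. to
	C), so flush_hp.get() below always yields a valid block to resume
	the scan from, or NULL once the list is exhausted. */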
1790 for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1791 count < min_n && bpage != NULL && len > 0
1792 && bpage->oldest_modification < lsn_limit;
1793 bpage = buf_pool->flush_hp.get(),
1794 ++scanned) {
1795
1796 buf_page_t* prev;
1797
1798 ut_a(bpage->oldest_modification > 0);
1799 ut_ad(bpage->in_flush_list);
1800
1801 prev = UT_LIST_GET_PREV(list, bpage);
1802 buf_pool->flush_hp.set(prev);
1803 buf_flush_list_mutex_exit(buf_pool);
1804
1805 #ifdef UNIV_DEBUG
1806 bool flushed =
1807 #endif /* UNIV_DEBUG */
1808 buf_flush_page_and_try_neighbors(
1809 bpage, BUF_FLUSH_LIST, min_n, &count);
1810
1811 buf_flush_list_mutex_enter(buf_pool);
1812
1813 ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
1814
1815 --len;
1816 }
1817
1818 buf_pool->flush_hp.set(NULL);
1819 buf_flush_list_mutex_exit(buf_pool);
1820
1821 if (scanned) {
1822 MONITOR_INC_VALUE_CUMULATIVE(
1823 MONITOR_FLUSH_BATCH_SCANNED,
1824 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1825 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1826 scanned);
1827 }
1828
1829 if (count) {
1830 MONITOR_INC_VALUE_CUMULATIVE(
1831 MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1832 MONITOR_FLUSH_BATCH_COUNT,
1833 MONITOR_FLUSH_BATCH_PAGES,
1834 count);
1835 }
1836
1837 ut_ad(buf_pool_mutex_own(buf_pool));
1838
1839 return(count);
1840 }
1841
1842 /** This utility flushes dirty blocks from the end of the LRU list or
1843 flush_list.
1844 NOTE 1: in the case of an LRU flush the calling thread may own latches to
1845 pages: to avoid deadlocks, this function must be written so that it cannot
1846 end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1847 the calling thread is not allowed to own any latches on pages!
1848 @param[in] buf_pool buffer pool instance
1849 @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
1850 BUF_FLUSH_LIST, then the caller must not own any latches on pages
1851 @param[in]	min_n		wished minimum number of blocks flushed (it is
1852 not guaranteed that the actual number is that big, though)
1853 @param[in] lsn_limit in the case of BUF_FLUSH_LIST all blocks whose
1854 oldest_modification is smaller than this should be flushed (if their number
1855 does not exceed min_n), otherwise ignored
1856 @return number of blocks for which the write request was queued */
1857 static
1858 ulint
1859 buf_flush_batch(
1860 buf_pool_t* buf_pool,
1861 buf_flush_t flush_type,
1862 ulint min_n,
1863 lsn_t lsn_limit)
1864 {
1865 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1866
1867 #ifdef UNIV_DEBUG
1868 {
1869 dict_sync_check check(true);
1870
1871 ut_ad(flush_type != BUF_FLUSH_LIST
1872 || !sync_check_iterate(check));
1873 }
1874 #endif /* UNIV_DEBUG */
1875
1876 buf_pool_mutex_enter(buf_pool);
1877
1878 ulint count = 0;
1879
1880 /* Note: The buffer pool mutex is released and reacquired within
1881 the flush functions. */
1882 switch (flush_type) {
1883 case BUF_FLUSH_LRU:
1884 count = buf_do_LRU_batch(buf_pool, min_n);
1885 break;
1886 case BUF_FLUSH_LIST:
1887 count = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
1888 break;
1889 default:
1890 ut_error;
1891 }
1892
1893 buf_pool_mutex_exit(buf_pool);
1894
1895 DBUG_PRINT("ib_buf", ("flush %u completed, %u pages",
1896 unsigned(flush_type), unsigned(count)));
1897
1898 return(count);
1899 }
1900
1901 /******************************************************************//**
1902 Gather the aggregated stats for both flush list and LRU list flushing.
1903 @param page_count_flush number of pages flushed from the end of the flush_list
1904 @param page_count_LRU number of pages flushed from the end of the LRU list
1905 */
1906 static
1907 void
1908 buf_flush_stats(
1909 /*============*/
1910 ulint page_count_flush,
1911 ulint page_count_LRU)
1912 {
1913 DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
1914 "from LRU_list %u pages",
1915 unsigned(page_count_flush),
1916 unsigned(page_count_LRU)));
1917
1918 srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
1919 }
1920
1921 /******************************************************************//**
1922 Start a buffer flush batch for LRU or flush list */
1923 static
1924 ibool
1925 buf_flush_start(
1926 /*============*/
1927 buf_pool_t* buf_pool, /*!< buffer pool instance */
1928 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1929 or BUF_FLUSH_LIST */
1930 {
1931 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1932
1933 buf_pool_mutex_enter(buf_pool);
1934
1935 if (buf_pool->n_flush[flush_type] > 0
1936 || buf_pool->init_flush[flush_type] == TRUE) {
1937
1938 /* There is already a flush batch of the same type running */
1939
1940 buf_pool_mutex_exit(buf_pool);
1941
1942 return(FALSE);
1943 }
1944
1945 buf_pool->init_flush[flush_type] = TRUE;
1946
1947 os_event_reset(buf_pool->no_flush[flush_type]);
1948
1949 buf_pool_mutex_exit(buf_pool);
1950
1951 return(TRUE);
1952 }
1953
1954 /******************************************************************//**
1955 End a buffer flush batch for LRU or flush list */
1956 static
1957 void
1958 buf_flush_end(
1959 /*==========*/
1960 buf_pool_t* buf_pool, /*!< buffer pool instance */
1961 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1962 or BUF_FLUSH_LIST */
1963 {
1964 buf_pool_mutex_enter(buf_pool);
1965
1966 buf_pool->init_flush[flush_type] = FALSE;
1967
1968 buf_pool->try_LRU_scan = TRUE;
1969
1970 if (buf_pool->n_flush[flush_type] == 0) {
1971
1972 /* The running flush batch has ended */
1973
1974 os_event_set(buf_pool->no_flush[flush_type]);
1975 }
1976
1977 buf_pool_mutex_exit(buf_pool);
1978
1979 if (!srv_read_only_mode) {
1980 buf_dblwr_flush_buffered_writes();
1981 } else {
1982 os_aio_simulated_wake_handler_threads();
1983 }
1984 }
1985
1986 /******************************************************************//**
1987 Waits until a flush batch of the given type ends */
1988 void
1989 buf_flush_wait_batch_end(
1990 /*=====================*/
1991 buf_pool_t* buf_pool, /*!< buffer pool instance */
1992 buf_flush_t type) /*!< in: BUF_FLUSH_LRU
1993 or BUF_FLUSH_LIST */
1994 {
1995 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1996
1997 if (buf_pool == NULL) {
1998 ulint i;
1999
2000 for (i = 0; i < srv_buf_pool_instances; ++i) {
2001 buf_pool_t* buf_pool;
2002
2003 buf_pool = buf_pool_from_array(i);
2004
2005 thd_wait_begin(NULL, THD_WAIT_DISKIO);
2006 os_event_wait(buf_pool->no_flush[type]);
2007 thd_wait_end(NULL);
2008 }
2009 } else {
2010 thd_wait_begin(NULL, THD_WAIT_DISKIO);
2011 os_event_wait(buf_pool->no_flush[type]);
2012 thd_wait_end(NULL);
2013 }
2014 }
2015
2016 /** Do flushing batch of a given type.
2017 NOTE: The calling thread is not allowed to own any latches on pages!
2018 @param[in,out] buf_pool buffer pool instance
2019 @param[in] type flush type
2020 @param[in]	min_n		wished minimum number of blocks flushed
2021 (it is not guaranteed that the actual number is that big, though)
2022 @param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2023 oldest_modification is smaller than this should be flushed (if their number
2024 does not exceed min_n), otherwise ignored
2025 @param[out] n_processed the number of pages which were processed is
2026 passed back to caller. Ignored if NULL
2027 @retval true if a batch was queued successfully.
2028 @retval false if another batch of same type was already running. */
2029 bool
2030 buf_flush_do_batch(
2031 buf_pool_t* buf_pool,
2032 buf_flush_t type,
2033 ulint min_n,
2034 lsn_t lsn_limit,
2035 ulint* n_processed)
2036 {
2037 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
2038
2039 if (n_processed != NULL) {
2040 *n_processed = 0;
2041 }
2042
2043 if (!buf_flush_start(buf_pool, type)) {
2044 return(false);
2045 }
2046
2047 ulint page_count = buf_flush_batch(buf_pool, type, min_n, lsn_limit);
2048
2049 buf_flush_end(buf_pool, type);
2050
2051 if (n_processed != NULL) {
2052 *n_processed = page_count;
2053 }
2054
2055 return(true);
2056 }
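/* Illustrative caller sketch (hypothetical, not code from this file):

	ulint	n_flushed = 0;

	if (buf_flush_do_batch(buf_pool, BUF_FLUSH_LIST, 100,
			       LSN_MAX, &n_flushed)) {
		// A flush_list batch of up to 100 pages was queued on this
		// buffer pool instance; n_flushed holds the number of pages
		// for which write requests were issued.
	} else {
		// Another flush_list batch was already running on this
		// instance; nothing was queued.
	}
*/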
2057
2058 /**
2059 Waits until a flush batch of the given lsn ends
2060 @param[in] new_oldest target oldest_modified_lsn to wait for */
2061
2062 void
2063 buf_flush_wait_flushed(
2064 lsn_t new_oldest)
2065 {
2066 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2067 buf_pool_t* buf_pool;
2068 lsn_t oldest;
2069
2070 buf_pool = buf_pool_from_array(i);
2071
2072 for (;;) {
2073 			/* We don't need to wait for fsync of the flushed
2074 			blocks, because we need an fsync to make a checkpoint
2075 			anyway. So, we don't need to wait for the batch end here. */
2076
2077 buf_flush_list_mutex_enter(buf_pool);
2078
2079 buf_page_t* bpage;
2080
2081 /* We don't need to wait for system temporary pages */
2082 for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
2083 bpage != NULL
2084 && fsp_is_system_temporary(bpage->id.space());
2085 bpage = UT_LIST_GET_PREV(list, bpage)) {
2086 /* Do nothing. */
2087 }
2088
2089 if (bpage != NULL) {
2090 ut_ad(bpage->in_flush_list);
2091 oldest = bpage->oldest_modification;
2092 } else {
2093 oldest = 0;
2094 }
2095
2096 buf_flush_list_mutex_exit(buf_pool);
2097
2098 if (oldest == 0 || oldest >= new_oldest) {
2099 break;
2100 }
2101
2102 /* sleep and retry */
2103 os_thread_sleep(buf_flush_wait_flushed_sleep_time);
2104
2105 MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
2106 }
2107 }
2108 }
2109
2110 /** This utility flushes dirty blocks from the end of the flush list of all
2111 buffer pool instances.
2112 NOTE: The calling thread is not allowed to own any latches on pages!
2113 @param[in]	min_n		wished minimum number of blocks flushed (it is
2114 not guaranteed that the actual number is that big, though)
2115 @param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2116 oldest_modification is smaller than this should be flushed (if their number
2117 does not exceed min_n), otherwise ignored
2118 @param[out] n_processed the number of pages which were processed is
2119 passed back to caller. Ignored if NULL.
2120 @return true if a batch was queued successfully for each buffer pool
2121 instance; false if another batch of the same type was already running in
2122 at least one of the buffer pool instances */
2123 bool
2124 buf_flush_lists(
2125 ulint min_n,
2126 lsn_t lsn_limit,
2127 ulint* n_processed)
2128 {
2129 ulint i;
2130 ulint n_flushed = 0;
2131 bool success = true;
2132
2133 if (n_processed) {
2134 *n_processed = 0;
2135 }
2136
2137 if (min_n != ULINT_MAX) {
2138 /* Ensure that flushing is spread evenly amongst the
2139 buffer pool instances. When min_n is ULINT_MAX
2140 we need to flush everything up to the lsn limit
2141 so no limit here. */
2142 min_n = (min_n + srv_buf_pool_instances - 1)
2143 / srv_buf_pool_instances;
2144 }
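	/* For example (hypothetical values): min_n = 1000 with
	srv_buf_pool_instances = 8 gives (1000 + 8 - 1) / 8 = 125 pages
	requested per instance, i.e. a rounded-up even share. */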
2145
2146 /* Flush to lsn_limit in all buffer pool instances */
2147 for (i = 0; i < srv_buf_pool_instances; i++) {
2148 buf_pool_t* buf_pool;
2149 ulint page_count = 0;
2150
2151 buf_pool = buf_pool_from_array(i);
2152
2153 if (!buf_flush_do_batch(buf_pool,
2154 BUF_FLUSH_LIST,
2155 min_n,
2156 lsn_limit,
2157 &page_count)) {
2158 			/* We have two choices here. If lsn_limit was
2159 			specified then skipping a buffer pool instance
2160 			means we cannot guarantee that all pages
2161 			up to lsn_limit have been flushed. We can
2162 return right now with failure or we can try
2163 to flush remaining buffer pools up to the
2164 lsn_limit. We attempt to flush other buffer
2165 pools based on the assumption that it will
2166 help in the retry which will follow the
2167 failure. */
2168 success = false;
2169
2170 continue;
2171 }
2172
2173 n_flushed += page_count;
2174 }
2175
2176 if (n_flushed) {
2177 buf_flush_stats(n_flushed, 0);
2178 }
2179
2180 if (n_processed) {
2181 *n_processed = n_flushed;
2182 }
2183
2184 return(success);
2185 }
2186
2187 /******************************************************************//**
2188 This function picks up a single page from the tail of the LRU
2189 list, flushes it (if it is dirty), removes it from page_hash and LRU
2190 list and puts it on the free list. It is called from user threads when
2191 they are unable to find a replaceable page at the tail of the LRU
2192 list, i.e. when the background LRU flushing in the page_cleaner thread
2193 is not fast enough to keep pace with the workload.
2194 @return true if success. */
2195 bool
2196 buf_flush_single_page_from_LRU(
2197 /*===========================*/
2198 buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
2199 {
2200 ulint scanned;
2201 buf_page_t* bpage;
2202 ibool freed;
2203
2204 buf_pool_mutex_enter(buf_pool);
2205
2206 for (bpage = buf_pool->single_scan_itr.start(), scanned = 0,
2207 freed = false;
2208 bpage != NULL;
2209 ++scanned, bpage = buf_pool->single_scan_itr.get()) {
2210
2211 ut_ad(buf_pool_mutex_own(buf_pool));
2212
2213 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
2214
2215 buf_pool->single_scan_itr.set(prev);
2216
2217 BPageMutex* block_mutex;
2218
2219 block_mutex = buf_page_get_mutex(bpage);
2220
2221 mutex_enter(block_mutex);
2222
2223 if (buf_flush_ready_for_replace(bpage)) {
2224 /* block is ready for eviction i.e., it is
2225 clean and is not IO-fixed or buffer fixed. */
2226 mutex_exit(block_mutex);
2227
2228 if (buf_LRU_free_page(bpage, true)) {
2229 buf_pool_mutex_exit(buf_pool);
2230 freed = true;
2231 break;
2232 }
2233
2234 } else if (buf_flush_ready_for_flush(
2235 bpage, BUF_FLUSH_SINGLE_PAGE)) {
2236
2237 /* Block is ready for flush. Try and dispatch an IO
2238 request. We'll put it on free list in IO completion
2239 routine if it is not buffer fixed. The following call
2240 will release the buffer pool and block mutex.
2241
2242 Note: There is no guarantee that this page has actually
2243 been freed, only that it has been flushed to disk */
2244
2245 freed = buf_flush_page(
2246 buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
2247
2248 if (freed) {
2249 break;
2250 }
2251
2252 mutex_exit(block_mutex);
2253 } else {
2254 mutex_exit(block_mutex);
2255 }
2256
2257 ut_ad(!mutex_own(block_mutex));
2258 }
2259
2260 if (!freed) {
2261 /* Can't find a single flushable page. */
2262 ut_ad(!bpage);
2263 buf_pool_mutex_exit(buf_pool);
2264 }
2265
2266 if (scanned) {
2267 MONITOR_INC_VALUE_CUMULATIVE(
2268 MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2269 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2270 MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2271 scanned);
2272 }
2273
2274 ut_ad(!buf_pool_mutex_own(buf_pool));
2275
2276 return(freed);
2277 }
2278
2279 /**
2280 Clears up tail of the LRU list of a given buffer pool instance:
2281 * Put replaceable pages at the tail of LRU to the free list
2282 * Flush dirty pages at the tail of LRU to the disk
2283 The depth to which we scan each buffer pool is controlled by dynamic
2284 config parameter innodb_LRU_scan_depth.
2285 @param buf_pool buffer pool instance
2286 @return total pages flushed */
2287 static
2288 ulint
2289 buf_flush_LRU_list(
2290 buf_pool_t* buf_pool)
2291 {
2292 ulint scan_depth, withdraw_depth;
2293 ulint n_flushed = 0;
2294
2295 ut_ad(buf_pool);
2296
2297 	/* srv_LRU_scan_depth can be an arbitrarily large value.
2298 	We cap it at the current LRU size. */
2299 buf_pool_mutex_enter(buf_pool);
2300 scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2301 if (buf_pool->curr_size < buf_pool->old_size
2302 && buf_pool->withdraw_target > 0) {
2303 withdraw_depth = buf_pool->withdraw_target
2304 - UT_LIST_GET_LEN(buf_pool->withdraw);
2305 } else {
2306 withdraw_depth = 0;
2307 }
2308 buf_pool_mutex_exit(buf_pool);
2309
2310 if (withdraw_depth > srv_LRU_scan_depth) {
2311 scan_depth = ut_min(withdraw_depth, scan_depth);
2312 } else {
2313 scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
2314 scan_depth);
2315 }
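	/* For example (hypothetical values): with srv_LRU_scan_depth = 1024,
	an LRU list of 300000 pages and withdraw_depth = 5000 (the buffer
	pool is being shrunk), scan_depth becomes ut_min(5000, 300000) =
	5000, i.e. the batch scans deeper than usual so that withdrawable
	pages are freed faster. */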
2316
2317 	/* Currently only one page_cleaner thread at a time
2318 	can trigger an LRU flush for this buffer pool instance.
2319 	So, it is not possible that a batch triggered during
2320 	the last iteration is still running. */
2321 buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth,
2322 0, &n_flushed);
2323
2324 return(n_flushed);
2325 }
2326
2327 /*********************************************************************//**
2328 Clears up tail of the LRU lists:
2329 * Put replaceable pages at the tail of LRU to the free list
2330 * Flush dirty pages at the tail of LRU to the disk
2331 The depth to which we scan each buffer pool is controlled by dynamic
2332 config parameter innodb_LRU_scan_depth.
2333 @return total pages flushed */
2334 ulint
2335 buf_flush_LRU_lists(void)
2336 /*=====================*/
2337 {
2338 ulint n_flushed = 0;
2339
2340 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2341
2342 n_flushed += buf_flush_LRU_list(buf_pool_from_array(i));
2343 }
2344
2345 if (n_flushed) {
2346 buf_flush_stats(0, n_flushed);
2347 }
2348
2349 return(n_flushed);
2350 }
2351
2352 /*********************************************************************//**
2353 Wait for any possible LRU flushes that are in progress to end. */
2354 void
2355 buf_flush_wait_LRU_batch_end(void)
2356 /*==============================*/
2357 {
2358 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2359 buf_pool_t* buf_pool;
2360
2361 buf_pool = buf_pool_from_array(i);
2362
2363 buf_pool_mutex_enter(buf_pool);
2364
2365 if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2366 || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2367
2368 buf_pool_mutex_exit(buf_pool);
2369 buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2370 } else {
2371 buf_pool_mutex_exit(buf_pool);
2372 }
2373 }
2374 }
2375
2376 /*********************************************************************//**
2377 Calculates if flushing is required based on number of dirty pages in
2378 the buffer pool.
2379 @return percent of io_capacity to flush to manage dirty page ratio */
2380 static
2381 ulint
2382 af_get_pct_for_dirty()
2383 /*==================*/
2384 {
2385 double dirty_pct = buf_get_modified_ratio_pct();
2386
2387 if (dirty_pct == 0.0) {
2388 /* No pages modified */
2389 return(0);
2390 }
2391
2392 ut_a(srv_max_dirty_pages_pct_lwm
2393 <= srv_max_buf_pool_modified_pct);
2394
2395 if (srv_max_dirty_pages_pct_lwm == 0) {
2396 /* The user has not set the option to preflush dirty
2397 pages as we approach the high water mark. */
2398 if (dirty_pct >= srv_max_buf_pool_modified_pct) {
2399 /* We have crossed the high water mark of dirty
2400 			pages. In this case we start flushing at 100% of
2401 innodb_io_capacity. */
2402 return(100);
2403 }
2404 } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
2405 /* We should start flushing pages gradually. */
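		/* Illustrative example (hypothetical values): with
		srv_max_buf_pool_modified_pct = 75 and dirty_pct = 38,
		this returns 38 * 100 / (75 + 1) = 50, i.e. flushing is
		requested at 50% of innodb_io_capacity. */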
2406 return(static_cast<ulint>((dirty_pct * 100)
2407 / (srv_max_buf_pool_modified_pct + 1)));
2408 }
2409
2410 return(0);
2411 }
2412
2413 /*********************************************************************//**
2414 Calculates if flushing is required based on redo generation rate.
2415 @return percent of io_capacity to flush to manage redo space */
2416 static
2417 ulint
2418 af_get_pct_for_lsn(
2419 /*===============*/
2420 lsn_t age) /*!< in: current age of LSN. */
2421 {
2422 lsn_t max_async_age;
2423 lsn_t lsn_age_factor;
2424 lsn_t af_lwm = (srv_adaptive_flushing_lwm
2425 * log_get_capacity()) / 100;
2426
2427 if (age < af_lwm) {
2428 /* No adaptive flushing. */
2429 return(0);
2430 }
2431
2432 max_async_age = log_get_max_modified_age_async();
2433
2434 if (age < max_async_age && !srv_adaptive_flushing) {
2435 /* We have still not reached the max_async point and
2436 the user has disabled adaptive flushing. */
2437 return(0);
2438 }
2439
2440 /* If we are here then we know that either:
2441 1) User has enabled adaptive flushing
2442 2) User may have disabled adaptive flushing but we have reached
2443 max_async_age. */
2444 lsn_age_factor = (age * 100) / max_async_age;
2445
2446 ut_ad(srv_max_io_capacity >= srv_io_capacity);
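	/* Illustrative example (hypothetical values): with
	srv_max_io_capacity twice srv_io_capacity and lsn_age_factor = 90
	(the age is at 90% of max_async_age), the formula below gives
	roughly 2 * 90 * sqrt(90) / 7.5 ~= 227, i.e. well above 100% of
	innodb_io_capacity as the redo log approaches max_async_age. */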
2447 return(static_cast<ulint>(
2448 ((srv_max_io_capacity / srv_io_capacity)
2449 * (lsn_age_factor * sqrt((double)lsn_age_factor)))
2450 / 7.5));
2451 }
2452
2453 /*********************************************************************//**
2454 This function is called approximately once every second by the
2455 page_cleaner thread. Based on various factors it decides if there is a
2456 need to do flushing.
2457 @return number of pages recommended to be flushed
2458 @param lsn_limit pointer to return LSN up to which flushing must happen
2459 @param last_pages_in the number of pages flushed by the last flush_list
2460 flushing. */
2461 static
2462 ulint
2463 page_cleaner_flush_pages_recommendation(
2464 /*====================================*/
2465 lsn_t* lsn_limit,
2466 ulint last_pages_in)
2467 {
2468 static lsn_t prev_lsn = 0;
2469 static ulint sum_pages = 0;
2470 static ulint avg_page_rate = 0;
2471 static ulint n_iterations = 0;
2472 static ib_time_monotonic_t prev_time;
2473 lsn_t oldest_lsn;
2474 lsn_t cur_lsn;
2475 lsn_t age;
2476 lsn_t lsn_rate;
2477 ulint n_pages = 0;
2478 ulint pct_for_dirty = 0;
2479 ulint pct_for_lsn = 0;
2480 ulint pct_total = 0;
2481
2482 cur_lsn = log_get_lsn();
2483
2484 if (prev_lsn == 0) {
2485 /* First time around. */
2486 prev_lsn = cur_lsn;
2487 prev_time = ut_time_monotonic();
2488 return(0);
2489 }
2490
2491 if (prev_lsn == cur_lsn) {
2492 return(0);
2493 }
2494
2495 sum_pages += last_pages_in;
2496
2497 ib_time_monotonic_t curr_time = ut_time_monotonic();
2498 uint64_t time_elapsed = curr_time - prev_time;
2499 const ulong avg_loop = srv_flushing_avg_loops;
2500
2501 /* We update our variables every srv_flushing_avg_loops
2502 	iterations to smooth out transitions in the workload. */
2503 if (++n_iterations >= avg_loop
2504 || time_elapsed >= (uint64_t)avg_loop) {
2505
2506 if (time_elapsed < 1) {
2507 time_elapsed = 1;
2508 }
2509
2510 avg_page_rate = static_cast<ulint>(
2511 ((static_cast<double>(sum_pages)
2512 / time_elapsed)
2513 + avg_page_rate) / 2);
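		/* For example (hypothetical values): 3000 pages flushed
		over the last 10 seconds with a previous avg_page_rate of
		200 gives (3000 / 10 + 200) / 2 = 250 pages/s. */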
2514
2515 /* How much LSN we have generated since last call. */
2516 lsn_rate = static_cast<lsn_t>(
2517 static_cast<double>(cur_lsn - prev_lsn)
2518 / time_elapsed);
2519
2520 lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2521
2522
2523 /* aggregate stats of all slots */
2524 mutex_enter(&page_cleaner->mutex);
2525
2526 uint64_t flush_tm = page_cleaner->flush_time;
2527 ulint flush_pass = page_cleaner->flush_pass;
2528
2529 page_cleaner->flush_time = 0;
2530 page_cleaner->flush_pass = 0;
2531
2532 uint64_t lru_tm = 0;
2533 uint64_t list_tm = 0;
2534 ulint lru_pass = 0;
2535 ulint list_pass = 0;
2536
2537 for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2538 page_cleaner_slot_t* slot;
2539
2540 slot = &page_cleaner->slots[i];
2541
2542 lru_tm += slot->flush_lru_time;
2543 lru_pass += slot->flush_lru_pass;
2544 list_tm += slot->flush_list_time;
2545 list_pass += slot->flush_list_pass;
2546
2547 slot->flush_lru_time = 0;
2548 slot->flush_lru_pass = 0;
2549 slot->flush_list_time = 0;
2550 slot->flush_list_pass = 0;
2551 }
2552
2553 mutex_exit(&page_cleaner->mutex);
2554
2555 /* minimum values are 1, to avoid dividing by zero. */
2556 if (lru_tm < 1) {
2557 lru_tm = 1;
2558 }
2559 if (list_tm < 1) {
2560 list_tm = 1;
2561 }
2562 if (flush_tm < 1) {
2563 flush_tm = 1;
2564 }
2565
2566 if (lru_pass < 1) {
2567 lru_pass = 1;
2568 }
2569 if (list_pass < 1) {
2570 list_pass = 1;
2571 }
2572 if (flush_pass < 1) {
2573 flush_pass = 1;
2574 }
2575
2576 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
2577 list_tm / list_pass);
2578 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
2579 lru_tm / lru_pass);
2580
2581 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
2582 list_tm / (srv_n_page_cleaners * flush_pass));
2583 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
2584 lru_tm / (srv_n_page_cleaners * flush_pass));
2585 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
2586 flush_tm * list_tm / flush_pass
2587 / (list_tm + lru_tm));
2588 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
2589 flush_tm * lru_tm / flush_pass
2590 / (list_tm + lru_tm));
2591 MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
2592
2593 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
2594 list_pass / page_cleaner->n_slots);
2595 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
2596 lru_pass / page_cleaner->n_slots);
2597 MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
2598
2599 prev_lsn = cur_lsn;
2600 prev_time = curr_time;
2601
2602 n_iterations = 0;
2603
2604 sum_pages = 0;
2605 }
2606
2607 oldest_lsn = buf_pool_get_oldest_modification();
2608
2609 ut_ad(oldest_lsn <= log_get_lsn());
2610
2611 age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2612
2613 pct_for_dirty = af_get_pct_for_dirty();
2614 pct_for_lsn = af_get_pct_for_lsn(age);
2615
2616 pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2617
2618 /* Estimate pages to be flushed for the lsn progress */
2619 ulint sum_pages_for_lsn = 0;
2620 lsn_t target_lsn = oldest_lsn
2621 + lsn_avg_rate * buf_flush_lsn_scan_factor;
2622
2623 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2624 buf_pool_t* buf_pool = buf_pool_from_array(i);
2625 ulint pages_for_lsn = 0;
2626
2627 buf_flush_list_mutex_enter(buf_pool);
2628 for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list);
2629 b != NULL;
2630 b = UT_LIST_GET_PREV(list, b)) {
2631 if (b->oldest_modification > target_lsn) {
2632 break;
2633 }
2634 ++pages_for_lsn;
2635 }
2636 buf_flush_list_mutex_exit(buf_pool);
2637
2638 sum_pages_for_lsn += pages_for_lsn;
2639
2640 mutex_enter(&page_cleaner->mutex);
2641 ut_ad(page_cleaner->slots[i].state
2642 == PAGE_CLEANER_STATE_NONE);
2643 page_cleaner->slots[i].n_pages_requested
2644 = pages_for_lsn / buf_flush_lsn_scan_factor + 1;
2645 mutex_exit(&page_cleaner->mutex);
2646 }
2647
2648 sum_pages_for_lsn /= buf_flush_lsn_scan_factor;
2649 	if (sum_pages_for_lsn < 1) {
2650 sum_pages_for_lsn = 1;
2651 }
2652
2653 /* Cap the maximum IO capacity that we are going to use by
2654 	max_io_capacity. Limit the value to avoid a too quick increase. */
2655 ulint pages_for_lsn =
2656 std::min<ulint>(sum_pages_for_lsn, srv_max_io_capacity * 2);
2657
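	/* The request below is the average of three signals: the percentage
	of innodb_io_capacity suggested by the dirty page ratio and the redo
	age (PCT_IO(pct_total)), the recently observed flush rate
	(avg_page_rate), and the pages needed for the intended LSN progress
	(pages_for_lsn). For example (hypothetical values):
	PCT_IO(pct_total) = 200, avg_page_rate = 400 and pages_for_lsn = 600
	give n_pages = (200 + 400 + 600) / 3 = 400. */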
2658 n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3;
2659
2660 if (n_pages > srv_max_io_capacity) {
2661 n_pages = srv_max_io_capacity;
2662 }
2663
2664 /* Normalize request for each instance */
2665 mutex_enter(&page_cleaner->mutex);
2666 ut_ad(page_cleaner->n_slots_requested == 0);
2667 ut_ad(page_cleaner->n_slots_flushing == 0);
2668 ut_ad(page_cleaner->n_slots_finished == 0);
2669
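	/* For example (hypothetical values): with n_pages = 400,
	sum_pages_for_lsn = 800 and a slot whose estimated share of the
	LSN-progress pages is 200, the slot is asked for
	200 * 400 / 800 + 1 = 101 pages when pct_for_lsn > 30; otherwise
	every slot simply gets n_pages / srv_buf_pool_instances pages. */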
2670 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2671 		/* If the redo log has enough free space,
2672 		don't care about the age distribution of pages. */
2673 page_cleaner->slots[i].n_pages_requested = pct_for_lsn > 30 ?
2674 page_cleaner->slots[i].n_pages_requested
2675 * n_pages / sum_pages_for_lsn + 1
2676 : n_pages / srv_buf_pool_instances;
2677 }
2678 mutex_exit(&page_cleaner->mutex);
2679
2680 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2681
2682 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn);
2683
2684 MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2685 MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2686 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2687 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2688
2689 *lsn_limit = LSN_MAX;
2690
2691 return(n_pages);
2692 }
2693
2694 /*********************************************************************//**
2695 Puts the page_cleaner thread to sleep if it has finished work in less
2696 than a second
2697 @retval 0 wake up by event set,
2698 @retval OS_SYNC_TIME_EXCEEDED if timeout was exceeded
2699 @param next_loop_time time when next loop iteration should start
2700 @param sig_count zero or the value returned by previous call of
2701 os_event_reset() */
2702 static
2703 ulint
2704 pc_sleep_if_needed(
2705 /*===============*/
2706 ib_time_monotonic_ms_t next_loop_time,
2707 int64_t sig_count)
2708 {
2709 ib_time_monotonic_ms_t cur_time = ut_time_monotonic_ms();
2710
2711 if (next_loop_time > cur_time) {
2712 		/* Get the sleep interval in microseconds. We use
2713 		ut_min() to avoid a long sleep in case of wrap around. */
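		/* For example (hypothetical values): if next_loop_time is
		600 ms in the future, we wait for
		ut_min(1000000, 600 * 1000) = 600000 microseconds on
		buf_flush_event, i.e. at most one second. */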
2714 int64_t sleep_us;
2715
2716 sleep_us = ut_min(int64_t(1000000),
2717 (next_loop_time - cur_time) * int64_t(1000));
2718 ut_a(sleep_us > 0);
2719
2720 return(os_event_wait_time_low(buf_flush_event,
2721 sleep_us, sig_count));
2722 }
2723
2724 return(OS_SYNC_TIME_EXCEEDED);
2725 }
2726
2727 /******************************************************************//**
2728 Initialize page_cleaner. */
2729 void
2730 buf_flush_page_cleaner_init(void)
2731 /*=============================*/
2732 {
2733 ut_ad(page_cleaner == NULL);
2734
2735 page_cleaner = static_cast<page_cleaner_t*>(
2736 ut_zalloc_nokey(sizeof(*page_cleaner)));
2737
2738 mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner->mutex);
2739
2740 page_cleaner->is_requested = os_event_create("pc_is_requested");
2741 page_cleaner->is_finished = os_event_create("pc_is_finished");
2742
2743 page_cleaner->n_slots = static_cast<ulint>(srv_buf_pool_instances);
2744
2745 page_cleaner->slots = static_cast<page_cleaner_slot_t*>(
2746 ut_zalloc_nokey(page_cleaner->n_slots
2747 * sizeof(*page_cleaner->slots)));
2748
2749 ut_d(page_cleaner->n_disabled_debug = 0);
2750
2751 page_cleaner->is_running = true;
2752 }
2753
2754 /**
2755 Close page_cleaner. */
2756 static
2757 void
2758 buf_flush_page_cleaner_close(void)
2759 {
2760 	/* Wait for all worker threads to exit. */
2761 while (page_cleaner->n_workers > 0) {
2762 os_thread_sleep(10000);
2763 }
2764
2765 mutex_destroy(&page_cleaner->mutex);
2766
2767 ut_free(page_cleaner->slots);
2768
2769 os_event_destroy(page_cleaner->is_finished);
2770 os_event_destroy(page_cleaner->is_requested);
2771
2772 ut_free(page_cleaner);
2773
2774 page_cleaner = NULL;
2775 }
2776
2777 /**
2778 Requests for all slots to flush all buffer pool instances.
2779 @param min_n	wished minimum number of blocks flushed
2780 (it is not guaranteed that the actual number is that big)
2781 @param lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2782 oldest_modification is smaller than this should be flushed
2783 (if their number does not exceed min_n), otherwise ignored
2784 */
2785 static
2786 void
2787 pc_request(
2788 ulint min_n,
2789 lsn_t lsn_limit)
2790 {
2791 if (min_n != ULINT_MAX) {
2792 /* Ensure that flushing is spread evenly amongst the
2793 buffer pool instances. When min_n is ULINT_MAX
2794 we need to flush everything up to the lsn limit
2795 so no limit here. */
2796 min_n = (min_n + srv_buf_pool_instances - 1)
2797 / srv_buf_pool_instances;
2798 }
2799
2800 mutex_enter(&page_cleaner->mutex);
2801
2802 ut_ad(page_cleaner->n_slots_requested == 0);
2803 ut_ad(page_cleaner->n_slots_flushing == 0);
2804 ut_ad(page_cleaner->n_slots_finished == 0);
2805
2806 page_cleaner->requested = (min_n > 0);
2807 page_cleaner->lsn_limit = lsn_limit;
2808
2809 for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2810 page_cleaner_slot_t* slot = &page_cleaner->slots[i];
2811
2812 ut_ad(slot->state == PAGE_CLEANER_STATE_NONE);
2813
2814 if (min_n == ULINT_MAX) {
2815 slot->n_pages_requested = ULINT_MAX;
2816 } else if (min_n == 0) {
2817 slot->n_pages_requested = 0;
2818 }
2819
2820 /* slot->n_pages_requested was already set by
2821 page_cleaner_flush_pages_recommendation() */
2822
2823 slot->state = PAGE_CLEANER_STATE_REQUESTED;
2824 }
2825
2826 page_cleaner->n_slots_requested = page_cleaner->n_slots;
2827 page_cleaner->n_slots_flushing = 0;
2828 page_cleaner->n_slots_finished = 0;
2829
2830 os_event_set(page_cleaner->is_requested);
2831
2832 mutex_exit(&page_cleaner->mutex);
2833 }
2834
2835 /**
2836 Do flush for one slot.
2837 @return the number of slots which have not been treated yet. */
2838 static
2839 ulint
2840 pc_flush_slot(void)
2841 {
2842 ib_time_monotonic_ms_t lru_tm = 0;
2843 ib_time_monotonic_ms_t list_tm = 0;
2844 int lru_pass = 0;
2845 int list_pass = 0;
2846
2847 mutex_enter(&page_cleaner->mutex);
2848
2849 if (page_cleaner->n_slots_requested > 0) {
2850 page_cleaner_slot_t* slot = NULL;
2851 ulint i;
2852
2853 for (i = 0; i < page_cleaner->n_slots; i++) {
2854 slot = &page_cleaner->slots[i];
2855
2856 if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
2857 break;
2858 }
2859 }
2860
2861 /* slot should be found because
2862 page_cleaner->n_slots_requested > 0 */
2863 ut_a(i < page_cleaner->n_slots);
2864
2865 buf_pool_t* buf_pool = buf_pool_from_array(i);
2866
2867 page_cleaner->n_slots_requested--;
2868 page_cleaner->n_slots_flushing++;
2869 slot->state = PAGE_CLEANER_STATE_FLUSHING;
2870
2871 if (page_cleaner->n_slots_requested == 0) {
2872 os_event_reset(page_cleaner->is_requested);
2873 }
2874
2875 if (!page_cleaner->is_running) {
2876 slot->n_flushed_lru = 0;
2877 slot->n_flushed_list = 0;
2878 goto finish_mutex;
2879 }
2880
2881 mutex_exit(&page_cleaner->mutex);
2882
2883 lru_tm = ut_time_monotonic_ms();
2884
2885 /* Flush pages from end of LRU if required */
2886 slot->n_flushed_lru = buf_flush_LRU_list(buf_pool);
2887
2888 lru_tm = ut_time_monotonic_ms() - lru_tm;
2889 lru_pass++;
2890
2891 if (!page_cleaner->is_running) {
2892 slot->n_flushed_list = 0;
2893 goto finish;
2894 }
2895
2896 /* Flush pages from flush_list if required */
2897 if (page_cleaner->requested) {
2898
2899 list_tm = ut_time_monotonic_ms();
2900
2901 slot->succeeded_list = buf_flush_do_batch(
2902 buf_pool, BUF_FLUSH_LIST,
2903 slot->n_pages_requested,
2904 page_cleaner->lsn_limit,
2905 &slot->n_flushed_list);
2906
2907 list_tm = ut_time_monotonic_ms() - list_tm;
2908 list_pass++;
2909 } else {
2910 slot->n_flushed_list = 0;
2911 slot->succeeded_list = true;
2912 }
2913 finish:
2914 mutex_enter(&page_cleaner->mutex);
2915 finish_mutex:
2916 page_cleaner->n_slots_flushing--;
2917 page_cleaner->n_slots_finished++;
2918 slot->state = PAGE_CLEANER_STATE_FINISHED;
2919
2920 slot->flush_lru_time += lru_tm;
2921 slot->flush_list_time += list_tm;
2922 slot->flush_lru_pass += lru_pass;
2923 slot->flush_list_pass += list_pass;
2924
2925 if (page_cleaner->n_slots_requested == 0
2926 && page_cleaner->n_slots_flushing == 0) {
2927 os_event_set(page_cleaner->is_finished);
2928 }
2929 }
2930
2931 ulint ret = page_cleaner->n_slots_requested;
2932
2933 mutex_exit(&page_cleaner->mutex);
2934
2935 return(ret);
2936 }
2937
2938 /**
2939 Wait until all flush requests are finished.
2940 @param n_flushed_lru number of pages flushed from the end of the LRU list.
2941 @param n_flushed_list number of pages flushed from the end of the
2942 flush_list.
2943 @return true if all flush_list flushing batches were successful. */
2944 static
2945 bool
2946 pc_wait_finished(
2947 ulint* n_flushed_lru,
2948 ulint* n_flushed_list)
2949 {
2950 bool all_succeeded = true;
2951
2952 *n_flushed_lru = 0;
2953 *n_flushed_list = 0;
2954
2955 os_event_wait(page_cleaner->is_finished);
2956
2957 mutex_enter(&page_cleaner->mutex);
2958
2959 ut_ad(page_cleaner->n_slots_requested == 0);
2960 ut_ad(page_cleaner->n_slots_flushing == 0);
2961 ut_ad(page_cleaner->n_slots_finished == page_cleaner->n_slots);
2962
2963 for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2964 page_cleaner_slot_t* slot = &page_cleaner->slots[i];
2965
2966 ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED);
2967
2968 *n_flushed_lru += slot->n_flushed_lru;
2969 *n_flushed_list += slot->n_flushed_list;
2970 all_succeeded &= slot->succeeded_list;
2971
2972 slot->state = PAGE_CLEANER_STATE_NONE;
2973
2974 slot->n_pages_requested = 0;
2975 }
2976
2977 page_cleaner->n_slots_finished = 0;
2978
2979 os_event_reset(page_cleaner->is_finished);
2980
2981 mutex_exit(&page_cleaner->mutex);
2982
2983 return(all_succeeded);
2984 }
2985
2986 #ifdef UNIV_LINUX
2987 /**
2988 Set priority for page_cleaner threads.
2989 @param[in]	priority	priority intended to be set
2990 @return true if set as intended */
2991 static
2992 bool
2993 buf_flush_page_cleaner_set_priority(
2994 int priority)
2995 {
2996 setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
2997 priority);
2998 return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
2999 == priority);
3000 }
3001 #endif /* UNIV_LINUX */
3002
3003 #ifdef UNIV_DEBUG
3004 /** Loop used to disable page cleaner threads. */
3005 static
3006 void
3007 buf_flush_page_cleaner_disabled_loop(void)
3008 {
3009 ut_ad(page_cleaner != NULL);
3010
3011 if (!innodb_page_cleaner_disabled_debug) {
3012 /* We return to avoid entering and exiting mutex. */
3013 return;
3014 }
3015
3016 mutex_enter(&page_cleaner->mutex);
3017 page_cleaner->n_disabled_debug++;
3018 mutex_exit(&page_cleaner->mutex);
3019
3020 while (innodb_page_cleaner_disabled_debug
3021 && srv_shutdown_state == SRV_SHUTDOWN_NONE
3022 && page_cleaner->is_running) {
3023
3024 os_thread_sleep(100000); /* [A] */
3025 }
3026
3027 /* We need to wait for threads exiting here, otherwise we would
3028 encounter problem when we quickly perform following steps:
3029 1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
3030 2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0;
3031 3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
3032 That's because after step 1 this thread could still be sleeping
3033 inside the loop above at [A] and steps 2, 3 could happen before
3034 this thread wakes up from [A]. In such a case this thread would
3035 not re-increment n_disabled_debug and we would be waiting for
3036 it forever in buf_flush_page_cleaner_disabled_debug_update(...).
3037
3038 Therefore we are waiting in step 2 for this thread exiting here. */
3039
3040 mutex_enter(&page_cleaner->mutex);
3041 page_cleaner->n_disabled_debug--;
3042 mutex_exit(&page_cleaner->mutex);
3043 }
3044
3045 /** Disables page cleaner threads (coordinator and workers).
3046 It's used by: SET GLOBAL innodb_page_cleaner_disabled_debug = 1 (0).
3047 @param[in] thd thread handle
3048 @param[in] var pointer to system variable
3049 @param[out] var_ptr where the formal string goes
3050 @param[in] save immediate result from check function */
3051 void
3052 buf_flush_page_cleaner_disabled_debug_update(
3053 THD* thd,
3054 struct st_mysql_sys_var* var,
3055 void* var_ptr,
3056 const void* save)
3057 {
3058 if (page_cleaner == NULL) {
3059 return;
3060 }
3061
3062 if (!*static_cast<const my_bool*>(save)) {
3063 if (!innodb_page_cleaner_disabled_debug) {
3064 return;
3065 }
3066
3067 innodb_page_cleaner_disabled_debug = false;
3068
3069 /* Enable page cleaner threads. */
3070 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3071 mutex_enter(&page_cleaner->mutex);
3072 const ulint n = page_cleaner->n_disabled_debug;
3073 mutex_exit(&page_cleaner->mutex);
3074 /* Check if all threads have been enabled, to avoid
3075 problem when we decide to re-disable them soon. */
3076 if (n == 0) {
3077 break;
3078 }
3079 }
3080 return;
3081 }
3082
3083 if (innodb_page_cleaner_disabled_debug) {
3084 return;
3085 }
3086
3087 innodb_page_cleaner_disabled_debug = true;
3088
3089 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3090 /* Workers are possibly sleeping on is_requested.
3091
3092 		We have to wake them, otherwise they might
3093 		never notice that they should be disabled,
3094 and we would wait for them here forever.
3095
3096 		That's why we have a sleep-loop instead of simply
3097 waiting on some disabled_debug_event. */
3098 os_event_set(page_cleaner->is_requested);
3099
3100 mutex_enter(&page_cleaner->mutex);
3101
3102 ut_ad(page_cleaner->n_disabled_debug
3103 <= srv_n_page_cleaners);
3104
3105 if (page_cleaner->n_disabled_debug
3106 == srv_n_page_cleaners) {
3107
3108 mutex_exit(&page_cleaner->mutex);
3109 break;
3110 }
3111
3112 mutex_exit(&page_cleaner->mutex);
3113
3114 os_thread_sleep(100000);
3115 }
3116 }
3117 #endif /* UNIV_DEBUG */
3118
3119 /******************************************************************//**
3120 page_cleaner thread tasked with flushing dirty pages from the buffer
3121 pools. As of now we'll have only one coordinator.
3122 @return a dummy parameter */
3123 extern "C"
3124 os_thread_ret_t
3125 DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(
3126 /*===============================================*/
3127 void* arg MY_ATTRIBUTE((unused)))
3128 /*!< in: a dummy parameter required by
3129 os_thread_create */
3130 {
3131 ib_time_monotonic_t next_loop_time = ut_time_monotonic_ms() + 1000;
3132 ulint n_flushed = 0;
3133 ulint last_activity = srv_get_activity_count();
3134 ulint last_pages = 0;
3135
3136 my_thread_init();
3137
3138 #ifdef UNIV_PFS_THREAD
3139 pfs_register_thread(page_cleaner_thread_key);
3140 #endif /* UNIV_PFS_THREAD */
3141
3142 #ifdef UNIV_DEBUG_THREAD_CREATION
3143 ib::info() << "page_cleaner thread running, id "
3144 << os_thread_pf(os_thread_get_curr_id());
3145 #endif /* UNIV_DEBUG_THREAD_CREATION */
3146
3147 #ifdef UNIV_LINUX
3148 	/* Linux may allow a different priority setting for each thread.
3149 	It is worth trying to set a high priority for the page_cleaner threads. */
3150 if (buf_flush_page_cleaner_set_priority(
3151 buf_flush_page_cleaner_priority)) {
3152
3153 ib::info() << "page_cleaner coordinator priority: "
3154 << buf_flush_page_cleaner_priority;
3155 } else {
3156 ib::info() << "If the mysqld execution user is authorized,"
3157 " page cleaner thread priority can be changed."
3158 " See the man page of setpriority().";
3159 }
3160 #endif /* UNIV_LINUX */
3161
3162 buf_page_cleaner_is_active = true;
3163
3164 while (!srv_read_only_mode
3165 && srv_shutdown_state == SRV_SHUTDOWN_NONE
3166 && recv_sys->heap != NULL) {
3167 /* treat flushing requests during recovery. */
3168 ulint n_flushed_lru = 0;
3169 ulint n_flushed_list = 0;
3170
3171 os_event_wait(recv_sys->flush_start);
3172
3173 if (srv_shutdown_state != SRV_SHUTDOWN_NONE
3174 || recv_sys->heap == NULL) {
3175 break;
3176 }
3177
3178 switch (recv_sys->flush_type) {
3179 case BUF_FLUSH_LRU:
3180 /* Flush pages from end of LRU if required */
3181 pc_request(0, LSN_MAX);
3182 while (pc_flush_slot() > 0) {}
3183 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3184 break;
3185
3186 case BUF_FLUSH_LIST:
3187 /* Flush all pages */
3188 do {
3189 pc_request(ULINT_MAX, LSN_MAX);
3190 while (pc_flush_slot() > 0) {}
3191 } while (!pc_wait_finished(&n_flushed_lru,
3192 &n_flushed_list));
3193 break;
3194
3195 default:
3196 ut_ad(0);
3197 }
3198
3199 os_event_reset(recv_sys->flush_start);
3200 os_event_set(recv_sys->flush_end);
3201 }
3202
3203 os_event_wait(buf_flush_event);
3204
3205 ulint ret_sleep = 0;
3206 ulint n_evicted = 0;
3207 ulint n_flushed_last = 0;
3208 ulint warn_interval = 1;
3209 ulint warn_count = 0;
3210 int64_t sig_count = os_event_reset(buf_flush_event);
3211
3212 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3213
3214 /* The page_cleaner skips sleep if the server is
3215 idle and there are no pending IOs in the buffer pool
3216 and there is work to do. */
3217 if (srv_check_activity(last_activity)
3218 || buf_get_n_pending_read_ios()
3219 || n_flushed == 0) {
3220
3221 ret_sleep = pc_sleep_if_needed(
3222 next_loop_time, sig_count);
3223
3224 if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
3225 break;
3226 }
3227 } else if (ut_time_monotonic_ms() > next_loop_time) {
3228 ret_sleep = OS_SYNC_TIME_EXCEEDED;
3229 } else {
3230 ret_sleep = 0;
3231 }
3232
3233 sig_count = os_event_reset(buf_flush_event);
3234
3235 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3236 ib_time_monotonic_ms_t curr_time =
3237 ut_time_monotonic_ms();
3238
3239 if (curr_time > next_loop_time + 3000) {
3240 if (warn_count == 0) {
3241 ib::info() << "page_cleaner: 1000ms"
3242 " intended loop took "
3243 << 1000 + curr_time
3244 - next_loop_time
3245 << "ms. The settings might not"
3246 " be optimal. (flushed="
3247 << n_flushed_last
3248 << " and evicted="
3249 << n_evicted
3250 << ", during the time.)";
3251 if (warn_interval > 300) {
3252 warn_interval = 600;
3253 } else {
3254 warn_interval *= 2;
3255 }
3256
3257 warn_count = warn_interval;
3258 } else {
3259 --warn_count;
3260 }
3261 } else {
3262 /* reset counter */
3263 warn_interval = 1;
3264 warn_count = 0;
3265 }
3266
3267 next_loop_time = curr_time + 1000;
3268 n_flushed_last = n_evicted = 0;
3269 }
3270
3271 if (ret_sleep != OS_SYNC_TIME_EXCEEDED
3272 && srv_flush_sync
3273 && buf_flush_sync_lsn > 0) {
3274 /* woke up for flush_sync */
3275 mutex_enter(&page_cleaner->mutex);
3276 lsn_t lsn_limit = buf_flush_sync_lsn;
3277 buf_flush_sync_lsn = 0;
3278 mutex_exit(&page_cleaner->mutex);
3279
3280 /* Request flushing for threads */
3281 pc_request(ULINT_MAX, lsn_limit);
3282
3283 ib_time_monotonic_ms_t tm = ut_time_monotonic_ms();
3284
3285 /* Coordinator also treats requests */
3286 while (pc_flush_slot() > 0) {}
3287
3288 /* only coordinator is using these counters,
3289 so no need to protect by lock. */
3290 page_cleaner->flush_time += ut_time_monotonic_ms() - tm;
3291 page_cleaner->flush_pass++;
3292
3293 /* Wait for all slots to be finished */
3294 ulint n_flushed_lru = 0;
3295 ulint n_flushed_list = 0;
3296 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3297
3298 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3299 buf_flush_stats(n_flushed_list, n_flushed_lru);
3300
3301 MONITOR_INC_VALUE_CUMULATIVE(
3302 MONITOR_FLUSH_SYNC_TOTAL_PAGE,
3303 MONITOR_FLUSH_SYNC_COUNT,
3304 MONITOR_FLUSH_SYNC_PAGES,
3305 n_flushed_lru + n_flushed_list);
3306 }
3307
3308 n_flushed = n_flushed_lru + n_flushed_list;
3309
3310 } else if (srv_check_activity(last_activity)) {
3311 ulint n_to_flush;
3312 lsn_t lsn_limit = 0;
3313
3314 /* Estimate pages from flush_list to be flushed */
3315 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3316 last_activity = srv_get_activity_count();
3317 n_to_flush =
3318 page_cleaner_flush_pages_recommendation(
3319 &lsn_limit, last_pages);
3320 } else {
3321 n_to_flush = 0;
3322 }
3323
3324 /* Request flushing for threads */
3325 pc_request(n_to_flush, lsn_limit);
3326
3327 ib_time_monotonic_ms_t tm = ut_time_monotonic_ms();
3328
3329 /* Coordinator also treats requests */
3330 while (pc_flush_slot() > 0) {
3331 /* No op */
3332 }
3333
3334 /* only coordinator is using these counters,
3335 so no need to protect by lock. */
3336 page_cleaner->flush_time += ut_time_monotonic_ms() - tm;
3337 			page_cleaner->flush_pass++;
3338
3339 /* Wait for all slots to be finished */
3340 ulint n_flushed_lru = 0;
3341 ulint n_flushed_list = 0;
3342
3343 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3344
3345 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3346 buf_flush_stats(n_flushed_list, n_flushed_lru);
3347 }
3348
3349 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3350 last_pages = n_flushed_list;
3351 }
3352
3353 n_evicted += n_flushed_lru;
3354 n_flushed_last += n_flushed_list;
3355
3356 n_flushed = n_flushed_lru + n_flushed_list;
3357
3358 if (n_flushed_lru) {
3359 MONITOR_INC_VALUE_CUMULATIVE(
3360 MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
3361 MONITOR_LRU_BATCH_FLUSH_COUNT,
3362 MONITOR_LRU_BATCH_FLUSH_PAGES,
3363 n_flushed_lru);
3364 }
3365
3366 if (n_flushed_list) {
3367 MONITOR_INC_VALUE_CUMULATIVE(
3368 MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
3369 MONITOR_FLUSH_ADAPTIVE_COUNT,
3370 MONITOR_FLUSH_ADAPTIVE_PAGES,
3371 n_flushed_list);
3372 }
3373
3374 } else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3375 /* no activity, slept enough */
3376 buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed);
3377
3378 n_flushed_last += n_flushed;
3379
3380 if (n_flushed) {
3381 MONITOR_INC_VALUE_CUMULATIVE(
3382 MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
3383 MONITOR_FLUSH_BACKGROUND_COUNT,
3384 MONITOR_FLUSH_BACKGROUND_PAGES,
3385 n_flushed);
3386
3387 }
3388
3389 } else {
3390 /* no activity, but woken up by event */
3391 n_flushed = 0;
3392 }
3393
3394 ut_d(buf_flush_page_cleaner_disabled_loop());
3395 }
3396
3397 ut_ad(srv_shutdown_state > 0);
3398 if (srv_fast_shutdown == 2
3399 || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
3400 /* In very fast shutdown or when innodb failed to start, we
3401 simulate a crash of the buffer pool. We are not required to do
3402 any flushing. */
3403 goto thread_exit;
3404 }
3405
3406 /* In case of normal and slow shutdown the page_cleaner thread
3407 must wait for all other activity in the server to die down.
3408 Note that we can start flushing the buffer pool as soon as the
3409 server enters shutdown phase but we must stay alive long enough
3410 to ensure that any work done by the master or purge threads is
3411 also flushed.
3412 During shutdown we pass through two stages. In the first stage,
3413 when SRV_SHUTDOWN_CLEANUP is set other threads like the master
3414 and the purge threads may be working as well. We start flushing
3415 the buffer pool but can't be sure that no new pages are being
3416 dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
3417
3418 do {
3419 pc_request(ULINT_MAX, LSN_MAX);
3420
3421 while (pc_flush_slot() > 0) {}
3422
3423 ulint n_flushed_lru = 0;
3424 ulint n_flushed_list = 0;
3425 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3426
3427 n_flushed = n_flushed_lru + n_flushed_list;
3428
3429 /* We sleep only if there are no pages to flush */
3430 if (n_flushed == 0) {
3431 os_thread_sleep(100000);
3432 }
3433 } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
3434
3435 /* At this point all threads including the master and the purge
3436 thread must have been suspended. */
3437 ut_a(srv_get_active_thread_type() == SRV_NONE);
3438 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3439
3440 /* We can now make a final sweep on flushing the buffer pool
3441 and exit after we have cleaned the whole buffer pool.
3442 It is important that we wait for any running batch that has
3443 been triggered by us to finish. Otherwise we can end up
3444 considering end of that batch as a finish of our final
3445 sweep and we'll come out of the loop leaving behind dirty pages
3446 in the flush_list */
3447 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3448 buf_flush_wait_LRU_batch_end();
3449
3450 bool success;
3451
3452 do {
3453 pc_request(ULINT_MAX, LSN_MAX);
3454
3455 while (pc_flush_slot() > 0) {}
3456
3457 ulint n_flushed_lru = 0;
3458 ulint n_flushed_list = 0;
3459 success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3460
3461 n_flushed = n_flushed_lru + n_flushed_list;
3462
3463 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3464 buf_flush_wait_LRU_batch_end();
3465
3466 } while (!success || n_flushed > 0 || buf_get_n_pending_read_ios() > 0);
3467
3468 /* Some sanity checks */
3469 ut_a(srv_get_active_thread_type() == SRV_NONE);
3470 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3471
3472 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3473 buf_pool_t* buf_pool = buf_pool_from_array(i);
3474 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
3475 }
3476
3477 /* We have lived our life. Time to die. */
3478
3479 thread_exit:
3480 /* All worker threads are waiting for the event here,
3481 	and will no longer access the page_cleaner structure.
3482 	Wake the worker threads up just to make them exit. */
3483 page_cleaner->is_running = false;
3484 os_event_set(page_cleaner->is_requested);
3485
3486 buf_flush_page_cleaner_close();
3487
3488 buf_page_cleaner_is_active = false;
3489
3490 my_thread_end();
3491
3492 /* We count the number of threads in os_thread_exit(). A created
3493 thread should always use that to exit and not use return() to exit. */
3494 os_thread_exit();
3495
3496 OS_THREAD_DUMMY_RETURN;
3497 }
3498
3499 /******************************************************************//**
3500 Worker thread of page_cleaner.
3501 @return a dummy parameter */
3502 extern "C"
3503 os_thread_ret_t
3504 DECLARE_THREAD(buf_flush_page_cleaner_worker)(
3505 /*==========================================*/
3506 void* arg MY_ATTRIBUTE((unused)))
3507 /*!< in: a dummy parameter required by
3508 os_thread_create */
3509 {
3510 my_thread_init();
3511
3512 mutex_enter(&page_cleaner->mutex);
3513 page_cleaner->n_workers++;
3514 mutex_exit(&page_cleaner->mutex);
3515
3516 #ifdef UNIV_LINUX
3517 	/* Linux may allow a different priority setting for each thread.
3518 	It is worth trying to set a high priority for the page_cleaner threads. */
3519 if (buf_flush_page_cleaner_set_priority(
3520 buf_flush_page_cleaner_priority)) {
3521
3522 ib::info() << "page_cleaner worker priority: "
3523 << buf_flush_page_cleaner_priority;
3524 }
3525 #endif /* UNIV_LINUX */
3526
3527 while (true) {
3528 os_event_wait(page_cleaner->is_requested);
3529
3530 ut_d(buf_flush_page_cleaner_disabled_loop());
3531
3532 if (!page_cleaner->is_running) {
3533 break;
3534 }
3535
3536 pc_flush_slot();
3537 }
3538
3539 mutex_enter(&page_cleaner->mutex);
3540 page_cleaner->n_workers--;
3541 mutex_exit(&page_cleaner->mutex);
3542
3543 my_thread_end();
3544
3545 os_thread_exit();
3546
3547 OS_THREAD_DUMMY_RETURN;
3548 }
3549
3550 /*******************************************************************//**
3551 Synchronously flush dirty blocks from the end of the flush list of all buffer
3552 pool instances.
3553 NOTE: The calling thread is not allowed to own any latches on pages! */
3554 void
3555 buf_flush_sync_all_buf_pools(void)
3556 /*==============================*/
3557 {
3558 bool success;
3559 do {
3560 success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
3561 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3562 } while (!success);
3563
3564 ut_a(success);
3565 }
3566
3567 /** Request IO burst and wake page_cleaner up.
3568 @param[in] lsn_limit upper limit of LSN to be flushed */
3569 void
3570 buf_flush_request_force(
3571 lsn_t lsn_limit)
3572 {
3573 	/* Adjust based on lsn_avg_rate so that the target does not become stale. */
3574 lsn_t lsn_target = lsn_limit + lsn_avg_rate * 3;
3575
3576 mutex_enter(&page_cleaner->mutex);
3577 if (lsn_target > buf_flush_sync_lsn) {
3578 buf_flush_sync_lsn = lsn_target;
3579 }
3580 mutex_exit(&page_cleaner->mutex);
3581
3582 os_event_set(buf_flush_event);
3583 }
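
/* Worked example (added commentary, not part of the original source; the
numbers are invented): if a caller requests lsn_limit = 1000000 while redo
has recently been generated at lsn_avg_rate = 50000 (roughly bytes of redo
per second, per the page cleaner's adaptive flushing statistics), then

	lsn_target = 1000000 + 3 * 50000 = 1150000

so the request still covers some freshly generated redo by the time the
page cleaner reacts to buf_flush_event. Only the "+ 3 * lsn_avg_rate"
margin comes from the code above. */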
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG

/** Functor to validate the flush list. */
struct	Check {
	void	operator()(const buf_page_t* elem)
	{
		ut_a(elem->in_flush_list);
	}
};

/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(
/*===================*/
	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
{
	buf_page_t*		bpage;
	const ib_rbt_node_t*	rnode = NULL;
	Check			check;

	ut_ad(buf_flush_list_mutex_own(buf_pool));

	ut_list_validate(buf_pool->flush_list, check);

	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

	/* If we are in recovery mode, i.e. flush_rbt != NULL, then each
	block in the flush_list must also be present in the flush_rbt. */
	if (buf_pool->flush_rbt != NULL) {
		rnode = rbt_first(buf_pool->flush_rbt);
	}

	while (bpage != NULL) {
		const lsn_t	om = bpage->oldest_modification;

		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);

		ut_ad(bpage->in_flush_list);

		/* A page in buf_pool->flush_list can be in
		BUF_BLOCK_REMOVE_HASH state. This happens when a page
		is in the middle of being relocated. In that case the
		original descriptor can have this state and still be
		in the flush list waiting to acquire the
		buf_pool->flush_list_mutex to complete the relocation. */
		ut_a(buf_page_in_file(bpage)
		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
		ut_a(om > 0);

		if (buf_pool->flush_rbt != NULL) {
			buf_page_t**	prpage;

			ut_a(rnode != NULL);
			prpage = rbt_value(buf_page_t*, rnode);

			ut_a(*prpage != NULL);
			ut_a(*prpage == bpage);
			rnode = rbt_next(buf_pool->flush_rbt, rnode);
		}

		bpage = UT_LIST_GET_NEXT(list, bpage);

		ut_a(bpage == NULL || om >= bpage->oldest_modification);
	}

	/* By this time we must have exhausted the traversal of
	flush_rbt (if active) as well. */
	ut_a(rnode == NULL);

	return(TRUE);
}
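
/* Added commentary (not part of the original source): besides the membership
checks, the loop above also verifies, through

	ut_a(bpage == NULL || om >= bpage->oldest_modification);

that oldest_modification never increases from the head towards the tail of
the flush list. The page with the oldest modification LSN is therefore at
the tail, which is why flushing "from the end of the flush list" is what
advances the overall oldest modification LSN of a buffer pool. */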

/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
ibool
buf_flush_validate(
/*===============*/
	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
{
	ibool	ret;

	buf_flush_list_mutex_enter(buf_pool);

	ret = buf_flush_validate_low(buf_pool);

	buf_flush_list_mutex_exit(buf_pool);

	return(ret);
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#endif /* !UNIV_HOTBACKUP */

/******************************************************************//**
Check if there are any dirty pages that belong to a space id in the flush
list in a particular buffer pool.
@return number of dirty pages present in a single buffer pool */
ulint
buf_pool_get_dirty_pages_count(
/*===========================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
	ulint		id,		/*!< in: space id to check */
	FlushObserver*	observer)	/*!< in: flush observer to check */
{
	ulint		count = 0;

	buf_pool_mutex_enter(buf_pool);
	buf_flush_list_mutex_enter(buf_pool);

	buf_page_t*	bpage;

	for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
	     bpage != 0;
	     bpage = UT_LIST_GET_NEXT(list, bpage)) {

		ut_ad(buf_page_in_file(bpage));
		ut_ad(bpage->in_flush_list);
		ut_ad(bpage->oldest_modification > 0);

		if ((observer != NULL
		     && observer == bpage->flush_observer)
		    || (observer == NULL
			&& id == bpage->id.space())) {
			++count;
		}
	}

	buf_flush_list_mutex_exit(buf_pool);
	buf_pool_mutex_exit(buf_pool);

	return(count);
}
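
/* Added commentary (not part of the original source): the predicate above
implements two modes. With a non-NULL observer the space id is ignored and
only pages tagged with exactly that FlushObserver are counted; with
observer == NULL every dirty page of the given space id is counted. For
example, a hypothetical caller could count the dirty pages of space 10 in
buffer pool instance 0 with:

	ulint	n = buf_pool_get_dirty_pages_count(
		buf_pool_from_array(0), 10, NULL);
*/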

/******************************************************************//**
Check if there are any dirty pages that belong to a space id in the flush list.
@return number of dirty pages present in all the buffer pools */
ulint
buf_flush_get_dirty_pages_count(
/*============================*/
	ulint		id,		/*!< in: space id to check */
	FlushObserver*	observer)	/*!< in: flush observer to check */
{
	ulint	count = 0;

	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		count += buf_pool_get_dirty_pages_count(buf_pool, id, observer);
	}

	return(count);
}

/** FlushObserver constructor
@param[in]	space_id	table space id
@param[in]	trx		trx instance
@param[in]	stage		performance schema accounting object,
used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
for accounting. */
FlushObserver::FlushObserver(
	ulint			space_id,
	trx_t*			trx,
	ut_stage_alter_t*	stage)
	:
	m_space_id(space_id),
	m_trx(trx),
	m_stage(stage),
	m_interrupted(false)
{
	m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
	m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));

	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		m_flushed->at(i) = 0;
		m_removed->at(i) = 0;
	}

#ifdef FLUSH_LIST_OBSERVER_DEBUG
	ib::info() << "FlushObserver constructor: " << m_trx->id;
#endif /* FLUSH_LIST_OBSERVER_DEBUG */
}

/** FlushObserver destructor */
FlushObserver::~FlushObserver()
{
	ut_ad(buf_flush_get_dirty_pages_count(m_space_id, this) == 0);

	UT_DELETE(m_flushed);
	UT_DELETE(m_removed);

#ifdef FLUSH_LIST_OBSERVER_DEBUG
	ib::info() << "FlushObserver destructor: " << m_trx->id;
#endif /* FLUSH_LIST_OBSERVER_DEBUG */
}

/** Check whether trx is interrupted
@return true if trx is interrupted */
bool
FlushObserver::check_interrupted()
{
	if (trx_is_interrupted(m_trx)) {
		interrupted();

		return(true);
	}

	return(false);
}

/** Notify observer of a flush
@param[in]	buf_pool	buffer pool instance
@param[in]	bpage		buffer page to flush */
void
FlushObserver::notify_flush(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	ut_ad(buf_pool_mutex_own(buf_pool));

	m_flushed->at(buf_pool->instance_no)++;

	if (m_stage != NULL) {
		m_stage->inc();
	}

#ifdef FLUSH_LIST_OBSERVER_DEBUG
	ib::info() << "Flush <" << bpage->id.space()
		<< ", " << bpage->id.page_no() << ">";
#endif /* FLUSH_LIST_OBSERVER_DEBUG */
}

/** Notify observer of a remove
@param[in]	buf_pool	buffer pool instance
@param[in]	bpage		buffer page flushed */
void
FlushObserver::notify_remove(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	ut_ad(buf_pool_mutex_own(buf_pool));

	m_removed->at(buf_pool->instance_no)++;

#ifdef FLUSH_LIST_OBSERVER_DEBUG
	ib::info() << "Remove <" << bpage->id.space()
		<< ", " << bpage->id.page_no() << ">";
#endif /* FLUSH_LIST_OBSERVER_DEBUG */
}

/** Flush dirty pages and wait. */
void
FlushObserver::flush()
{
	buf_remove_t	buf_remove;

	if (m_interrupted) {
		buf_remove = BUF_REMOVE_FLUSH_NO_WRITE;
	} else {
		buf_remove = BUF_REMOVE_FLUSH_WRITE;

		if (m_stage != NULL) {
			ulint	pages_to_flush =
				buf_flush_get_dirty_pages_count(
					m_space_id, this);

			m_stage->begin_phase_flush(pages_to_flush);
		}
	}

	/* Flush or remove dirty pages. */
	buf_LRU_flush_or_remove_pages(m_space_id, buf_remove, m_trx);

	/* Wait until all dirty pages have been flushed or removed. */
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		while (!is_complete(i)) {

			os_thread_sleep(2000);
		}
	}
}

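/* Usage sketch (added commentary, not part of the original source): a
FlushObserver is used by online ALTER TABLE style operations to wait until
every page they dirtied in one tablespace has been written out or removed.
A rough outline, assuming the observer is attached to the transaction with
trx_set_flush_observer() as done elsewhere in the server:

	FlushObserver*	observer = UT_NEW_NOKEY(
		FlushObserver(space_id, trx, stage));

	trx_set_flush_observer(trx, observer);

	(dirty some pages; each one is tagged via bpage->flush_observer,
	so notify_flush() and notify_remove() above account for it)

	observer->flush();

	UT_DELETE(observer);

flush() waits until is_complete() holds for every buffer pool instance, and
the destructor asserts that no page tagged with this observer is still
dirty, so flush() must be called before the observer is freed. */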