1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file buf/buf0flu.cc
29 The database buffer buf_pool flush algorithm
30 
31 Created 11/11/1995 Heikki Tuuri
32 *******************************************************/
33 
34 #include "ha_prototypes.h"
35 #include <mysql/service_thd_wait.h>
36 #include <my_dbug.h>
37 
38 #include "buf0flu.h"
39 
40 #ifdef UNIV_NONINL
41 #include "buf0flu.ic"
42 #endif
43 
44 #include "buf0buf.h"
45 #include "buf0checksum.h"
46 #include "srv0start.h"
47 #include "srv0srv.h"
48 #include "page0zip.h"
49 #ifndef UNIV_HOTBACKUP
50 #include "ut0byte.h"
51 #include "page0page.h"
52 #include "fil0fil.h"
53 #include "buf0lru.h"
54 #include "buf0rea.h"
55 #include "ibuf0ibuf.h"
56 #include "log0log.h"
57 #include "os0file.h"
58 #include "trx0sys.h"
59 #include "srv0mon.h"
60 #include "fsp0sysspace.h"
61 #include "ut0stage.h"
62 
63 #ifdef UNIV_LINUX
64 /* include defs for CPU time priority settings */
65 #include <unistd.h>
66 #include <sys/syscall.h>
67 #include <sys/time.h>
68 #include <sys/resource.h>
69 static const int buf_flush_page_cleaner_priority = -20;
70 #endif /* UNIV_LINUX */
71 
72 /** Sleep time in microseconds for loop waiting for the oldest
73 modification lsn */
74 static const ulint buf_flush_wait_flushed_sleep_time = 10000;
75 
76 /** Number of pages flushed through non-flush_list flushes. */
77 static ulint buf_lru_flush_page_count = 0;
78 
79 /** Flag indicating if the page_cleaner is in active state. This flag
80 is set to TRUE by the page_cleaner thread when it is spawned and is set
81 back to FALSE at shutdown by the page_cleaner as well. Therefore no
82 need to protect it by a mutex. It is only ever read by the thread
83 doing the shutdown */
84 bool buf_page_cleaner_is_active = false;
85 
86 /** Factor for scan length to determine n_pages for intended oldest LSN
87 progress */
88 static ulint buf_flush_lsn_scan_factor = 3;
89 
90 /** Average redo generation rate */
91 static lsn_t lsn_avg_rate = 0;
92 
93 /** Target oldest LSN for the requested flush_sync */
94 static lsn_t buf_flush_sync_lsn = 0;
95 
96 #ifdef UNIV_PFS_THREAD
97 mysql_pfs_key_t page_cleaner_thread_key;
98 #endif /* UNIV_PFS_THREAD */
99 
100 /** Event to synchronise with the flushing. */
101 os_event_t	buf_flush_event;
102 
103 /** State for page cleaner array slot */
104 enum page_cleaner_state_t {
105 	/** Nothing requested yet.
106 	Moved from FINISHED by the coordinator. */
107 	PAGE_CLEANER_STATE_NONE = 0,
108 	/** Requested but not started flushing.
109 	Moved from NONE by the coordinator. */
110 	PAGE_CLEANER_STATE_REQUESTED,
111 	/** Flushing is ongoing.
112 	Moved from REQUESTED by the worker. */
113 	PAGE_CLEANER_STATE_FLUSHING,
114 	/** Flushing was finished.
115 	Moved from FLUSHING by the worker. */
116 	PAGE_CLEANER_STATE_FINISHED
117 };
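/* Summary of the slot state machine described above (derived from the
comments on each state; the transitions themselves are implemented by the
page cleaner coordinator and worker code later in this file):
NONE -> REQUESTED (coordinator), REQUESTED -> FLUSHING (worker),
FLUSHING -> FINISHED (worker), FINISHED -> NONE (coordinator). */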
118 
119 /** Page cleaner request state for each buffer pool instance */
120 struct page_cleaner_slot_t {
121 	page_cleaner_state_t	state;	/*!< state of the request,
122 					protected by page_cleaner_t::mutex.
123 					Once the worker thread got the slot and
124 					set it to PAGE_CLEANER_STATE_FLUSHING,
125 					n_flushed_lru and n_flushed_list can be
126 					updated only by that worker thread */
127 	/* This value is set during state==PAGE_CLEANER_STATE_NONE */
128 	ulint			n_pages_requested;
129 					/*!< number of requested pages
130 					for the slot */
131 	/* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
132 	and committed with state==PAGE_CLEANER_STATE_FINISHED.
133 	Their consistency is protected by the 'state' */
134 	ulint			n_flushed_lru;
135 					/*!< number of flushed pages
136 					by LRU scan flushing */
137 	ulint			n_flushed_list;
138 					/*!< number of flushed pages
139 					by flush_list flushing */
140 	bool			succeeded_list;
141 					/*!< true if flush_list flushing
142 					succeeded. */
143 	uint64_t		flush_lru_time;
144 					/*!< elapsed time for LRU flushing */
145 	uint64_t		flush_list_time;
146 					/*!< elapsed time for flush_list
147 					flushing */
148 	ulint			flush_lru_pass;
149 					/*!< count to attempt LRU flushing */
150 	ulint			flush_list_pass;
151 					/*!< count to attempt flush_list
152 					flushing */
153 };
154 
155 /** Page cleaner structure common for all threads */
156 struct page_cleaner_t {
157 	ib_mutex_t		mutex;		/*!< mutex to protect whole of
158 						page_cleaner_t struct and
159 						page_cleaner_slot_t slots. */
160 	os_event_t		is_requested;	/*!< event to activate worker
161 						threads. */
162 	os_event_t		is_finished;	/*!< event to signal that all
163 						slots were finished. */
164 	volatile ulint		n_workers;	/*!< number of worker threads
165 						in existence */
166 	bool			requested;	/*!< true if requested pages
167 						to flush */
168 	lsn_t			lsn_limit;	/*!< upper limit of LSN to be
169 						flushed */
170 	ulint			n_slots;	/*!< total number of slots */
171 	ulint			n_slots_requested;
172 						/*!< number of slots
173 						in the state
174 						PAGE_CLEANER_STATE_REQUESTED */
175 	ulint			n_slots_flushing;
176 						/*!< number of slots
177 						in the state
178 						PAGE_CLEANER_STATE_FLUSHING */
179 	ulint			n_slots_finished;
180 						/*!< number of slots
181 						in the state
182 						PAGE_CLEANER_STATE_FINISHED */
183 	uint64_t		flush_time;	/*!< elapsed time to flush
184 						requests for all slots */
185 	ulint			flush_pass;	/*!< count to finish to flush
186 						requests for all slots */
187 	page_cleaner_slot_t*	slots;		/*!< pointer to the slots */
188 	bool			is_running;	/*!< false if attempt
189 						to shutdown */
190 
191 #ifdef UNIV_DEBUG
192 	ulint			n_disabled_debug;
193 						/*!< how many page cleaner
194 						threads have been disabled */
195 #endif /* UNIV_DEBUG */
196 };
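/* A rough sketch of how the members above are used for the coordinator/
worker handshake (an assumption based on the member comments; the actual
protocol lives in the page cleaner functions later in this file): the
coordinator sets lsn_limit and the per-slot n_pages_requested, marks the
slots PAGE_CLEANER_STATE_REQUESTED and sets the is_requested event; each
worker picks a REQUESTED slot and moves it through FLUSHING to FINISHED;
the thread that finishes the last slot sets is_finished so that the
coordinator can collect the per-slot statistics. */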
197 
198 static page_cleaner_t*	page_cleaner = NULL;
199 
200 #ifdef UNIV_DEBUG
201 my_bool innodb_page_cleaner_disabled_debug;
202 #endif /* UNIV_DEBUG */
203 
204 /** If the LRU list of a buf_pool is shorter than this then LRU eviction
205 should not happen. This is because when we do LRU flushing we also put
206 the blocks on the free list. If the LRU list is very small then we can
207 end up thrashing. */
208 #define BUF_LRU_MIN_LEN		256
209 
210 /* @} */
211 
212 /******************************************************************//**
213 Increments the flush_list size in bytes by the physical page size. */
214 static inline
215 void
216 incr_flush_list_size_in_bytes(
217 /*==========================*/
218 	buf_block_t*	block,		/*!< in: control block */
219 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
220 {
221 	ut_ad(buf_flush_list_mutex_own(buf_pool));
222 
223 	buf_pool->stat.flush_list_bytes += block->page.size.physical();
224 
225 	ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
226 }
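/* The matching decrement happens when a page leaves the flush list: see
buf_flush_remove() below, which subtracts bpage->size.physical() from
buf_pool->stat.flush_list_bytes while holding the same flush list mutex. */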
227 
228 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
229 /******************************************************************//**
230 Validates the flush list.
231 @return TRUE if ok */
232 static
233 ibool
234 buf_flush_validate_low(
235 /*===================*/
236 	buf_pool_t*	buf_pool);	/*!< in: Buffer pool instance */
237 
238 /******************************************************************//**
239 Validates the flush list some of the time.
240 @return TRUE if ok or the check was skipped */
241 static
242 ibool
243 buf_flush_validate_skip(
244 /*====================*/
245 	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
246 {
247 /** Try buf_flush_validate_low() every this many times */
248 # define BUF_FLUSH_VALIDATE_SKIP	23
249 
250 	/** The buf_flush_validate_low() call skip counter.
251 	Use a signed type because of the race condition below. */
252 	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
253 
254 	/* There is a race condition below, but it does not matter,
255 	because this call is only for heuristic purposes. We want to
256 	reduce the call frequency of the costly buf_flush_validate_low()
257 	check in debug builds. */
258 	if (--buf_flush_validate_count > 0) {
259 		return(TRUE);
260 	}
261 
262 	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
263 	return(buf_flush_validate_low(buf_pool));
264 }
265 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
266 
267 /******************************************************************//**
268 Inserts a block into the flush_rbt and returns a pointer to its
269 predecessor or NULL if no predecessor. The ordering is maintained
270 on the basis of the <oldest_modification, space, offset> key.
271 @return pointer to the predecessor or NULL if no predecessor. */
272 static
273 buf_page_t*
274 buf_flush_insert_in_flush_rbt(
275 /*==========================*/
276 	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
277 {
278 	const ib_rbt_node_t*	c_node;
279 	const ib_rbt_node_t*	p_node;
280 	buf_page_t*		prev = NULL;
281 	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
282 
283 	ut_ad(buf_flush_list_mutex_own(buf_pool));
284 
285 	/* Insert this buffer into the rbt. */
286 	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
287 	ut_a(c_node != NULL);
288 
289 	/* Get the predecessor. */
290 	p_node = rbt_prev(buf_pool->flush_rbt, c_node);
291 
292 	if (p_node != NULL) {
293 		buf_page_t**	value;
294 		value = rbt_value(buf_page_t*, p_node);
295 		prev = *value;
296 		ut_a(prev != NULL);
297 	}
298 
299 	return(prev);
300 }
301 
302 /*********************************************************//**
303 Delete a bpage from the flush_rbt. */
304 static
305 void
306 buf_flush_delete_from_flush_rbt(
307 /*============================*/
308 	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
309 {
310 #ifdef UNIV_DEBUG
311 	ibool		ret = FALSE;
312 #endif /* UNIV_DEBUG */
313 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
314 
315 	ut_ad(buf_flush_list_mutex_own(buf_pool));
316 
317 #ifdef UNIV_DEBUG
318 	ret =
319 #endif /* UNIV_DEBUG */
320 	rbt_delete(buf_pool->flush_rbt, &bpage);
321 
322 	ut_ad(ret);
323 }
324 
325 /*****************************************************************//**
326 Compare two modified blocks in the buffer pool. The key for comparison
327 is:
328 key = <oldest_modification, space, offset>
329 This comparison is used to maintain ordering of blocks in the
330 buf_pool->flush_rbt.
331 Note that for the purpose of flush_rbt, we only need to order blocks
332 on the oldest_modification. The other two fields are used to uniquely
333 identify the blocks.
334 @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
335 static
336 int
337 buf_flush_block_cmp(
338 /*================*/
339 	const void*	p1,		/*!< in: block1 */
340 	const void*	p2)		/*!< in: block2 */
341 {
342 	int			ret;
343 	const buf_page_t*	b1 = *(const buf_page_t**) p1;
344 	const buf_page_t*	b2 = *(const buf_page_t**) p2;
345 
346 	ut_ad(b1 != NULL);
347 	ut_ad(b2 != NULL);
348 
349 #ifdef UNIV_DEBUG
350 	buf_pool_t*	buf_pool = buf_pool_from_bpage(b1);
351 #endif /* UNIV_DEBUG */
352 
353 	ut_ad(buf_flush_list_mutex_own(buf_pool));
354 
355 	ut_ad(b1->in_flush_list);
356 	ut_ad(b2->in_flush_list);
357 
358 	if (b2->oldest_modification > b1->oldest_modification) {
359 		return(1);
360 	} else if (b2->oldest_modification < b1->oldest_modification) {
361 		return(-1);
362 	}
363 
364 	/* If oldest_modification is same then decide on the space. */
365 	ret = (int)(b2->id.space() - b1->id.space());
366 
367 	/* Or else decide ordering on the page number. */
368 	return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no()));
369 }
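/* Worked example of the comparator above: for two flush list nodes with
b1->oldest_modification == 100 and b2->oldest_modification == 200 the
function returns 1 (positive); when the oldest_modification values are
equal, the tie is broken first by space id and then by page number, so two
distinct dirty pages never compare as equal. */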
370 
371 /********************************************************************//**
372 Initialize the red-black tree to speed up insertions into the flush_list
373 during recovery process. Should be called at the start of recovery
374 process before any page has been read/written. */
375 void
376 buf_flush_init_flush_rbt(void)
377 /*==========================*/
378 {
379 	ulint	i;
380 
381 	for (i = 0; i < srv_buf_pool_instances; i++) {
382 		buf_pool_t*	buf_pool;
383 
384 		buf_pool = buf_pool_from_array(i);
385 
386 		buf_flush_list_mutex_enter(buf_pool);
387 
388 		ut_ad(buf_pool->flush_rbt == NULL);
389 
390 		/* Create red black tree for speedy insertions in flush list. */
391 		buf_pool->flush_rbt = rbt_create(
392 			sizeof(buf_page_t*), buf_flush_block_cmp);
393 
394 		buf_flush_list_mutex_exit(buf_pool);
395 	}
396 }
397 
398 /********************************************************************//**
399 Frees up the red-black tree. */
400 void
401 buf_flush_free_flush_rbt(void)
402 /*==========================*/
403 {
404 	ulint	i;
405 
406 	for (i = 0; i < srv_buf_pool_instances; i++) {
407 		buf_pool_t*	buf_pool;
408 
409 		buf_pool = buf_pool_from_array(i);
410 
411 		buf_flush_list_mutex_enter(buf_pool);
412 
413 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
414 		ut_a(buf_flush_validate_low(buf_pool));
415 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
416 
417 		rbt_free(buf_pool->flush_rbt);
418 		buf_pool->flush_rbt = NULL;
419 
420 		buf_flush_list_mutex_exit(buf_pool);
421 	}
422 }
423 
424 /********************************************************************//**
425 Inserts a modified block into the flush list. */
426 void
427 buf_flush_insert_into_flush_list(
428 /*=============================*/
429 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
430 	buf_block_t*	block,		/*!< in/out: block which is modified */
431 	lsn_t		lsn)		/*!< in: oldest modification */
432 {
433 	ut_ad(!buf_pool_mutex_own(buf_pool));
434 	ut_ad(log_flush_order_mutex_own());
435 	ut_ad(buf_page_mutex_own(block));
436 
437 	buf_flush_list_mutex_enter(buf_pool);
438 
439 	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
440 	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
441 		  <= lsn));
442 
443 	/* If we are in the recovery then we need to update the flush
444 	red-black tree as well. */
445 	if (buf_pool->flush_rbt != NULL) {
446 		buf_flush_list_mutex_exit(buf_pool);
447 		buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
448 		return;
449 	}
450 
451 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
452 	ut_ad(!block->page.in_flush_list);
453 
454 	ut_d(block->page.in_flush_list = TRUE);
455 	block->page.oldest_modification = lsn;
456 
457 	UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
458 
459 	incr_flush_list_size_in_bytes(block, buf_pool);
460 
461 #ifdef UNIV_DEBUG_VALGRIND
462 	void*	p;
463 
464 	if (block->page.size.is_compressed()) {
465 		p = block->page.zip.data;
466 	} else {
467 		p = block->frame;
468 	}
469 
470 	UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
471 #endif /* UNIV_DEBUG_VALGRIND */
472 
473 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
474 	ut_a(buf_flush_validate_skip(buf_pool));
475 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
476 
477 	buf_flush_list_mutex_exit(buf_pool);
478 }
479 
480 /********************************************************************//**
481 Inserts a modified block into the flush list in the right sorted position.
482 This function is used by recovery, because there the modifications do not
483 necessarily come in the order of lsn's. */
484 void
485 buf_flush_insert_sorted_into_flush_list(
486 /*====================================*/
487 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
488 	buf_block_t*	block,		/*!< in/out: block which is modified */
489 	lsn_t		lsn)		/*!< in: oldest modification */
490 {
491 	buf_page_t*	prev_b;
492 	buf_page_t*	b;
493 
494 	ut_ad(!buf_pool_mutex_own(buf_pool));
495 	ut_ad(log_flush_order_mutex_own());
496 	ut_ad(buf_page_mutex_own(block));
497 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
498 
499 	buf_flush_list_mutex_enter(buf_pool);
500 
501 	/* The field in_LRU_list is protected by buf_pool->mutex, which
502 	we are not holding.  However, while a block is in the flush
503 	list, it is dirty and cannot be discarded, neither from the
504 	page_hash nor from the LRU list.  At most, the uncompressed
505 	page frame of a compressed block may be discarded or created
506 	(copying the block->page to or from a buf_page_t that is
507 	dynamically allocated from buf_buddy_alloc()).  Because those
508 	transitions hold block->mutex and the flush list mutex (via
509 	buf_flush_relocate_on_flush_list()), there is no possibility
510 	of a race condition in the assertions below. */
511 	ut_ad(block->page.in_LRU_list);
512 	ut_ad(block->page.in_page_hash);
513 	/* buf_buddy_block_register() will take a block in the
514 	BUF_BLOCK_MEMORY state, not a file page. */
515 	ut_ad(!block->page.in_zip_hash);
516 
517 	ut_ad(!block->page.in_flush_list);
518 	ut_d(block->page.in_flush_list = TRUE);
519 	block->page.oldest_modification = lsn;
520 
521 #ifdef UNIV_DEBUG_VALGRIND
522 	void*	p;
523 
524 	if (block->page.size.is_compressed()) {
525 		p = block->page.zip.data;
526 	} else {
527 		p = block->frame;
528 	}
529 
530 	UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
531 #endif /* UNIV_DEBUG_VALGRIND */
532 
533 	prev_b = NULL;
534 
535 	/* For the most part when this function is called the flush_rbt
536 	should not be NULL. In a very rare boundary case it is possible
537 	that the flush_rbt has already been freed by the recovery thread
538 	before the last page was hooked up in the flush_list by the
539 	io-handler thread. In that case we'll just do a simple
540 	linear search in the else block. */
541 	if (buf_pool->flush_rbt != NULL) {
542 
543 		prev_b = buf_flush_insert_in_flush_rbt(&block->page);
544 
545 	} else {
546 
547 		b = UT_LIST_GET_FIRST(buf_pool->flush_list);
548 
549 		while (b != NULL && b->oldest_modification
550 		       > block->page.oldest_modification) {
551 
552 			ut_ad(b->in_flush_list);
553 			prev_b = b;
554 			b = UT_LIST_GET_NEXT(list, b);
555 		}
556 	}
557 
558 	if (prev_b == NULL) {
559 		UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
560 	} else {
561 		UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page);
562 	}
563 
564 	if (buf_pool->oldest_hp.get() != NULL) {
565 		/* clear oldest_hp */
566 		buf_pool->oldest_hp.set(NULL);
567 	}
568 
569 	incr_flush_list_size_in_bytes(block, buf_pool);
570 
571 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
572 	ut_a(buf_flush_validate_low(buf_pool));
573 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
574 
575 	buf_flush_list_mutex_exit(buf_pool);
576 }
577 
578 /********************************************************************//**
579 Returns TRUE if the file page block is immediately suitable for replacement,
580 i.e., the transition FILE_PAGE => NOT_USED is allowed.
581 @return TRUE if can replace immediately */
582 ibool
583 buf_flush_ready_for_replace(
584 /*========================*/
585 	buf_page_t*	bpage)	/*!< in: buffer control block, must be
586 				buf_page_in_file(bpage) and in the LRU list */
587 {
588 #ifdef UNIV_DEBUG
589 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
590 	ut_ad(buf_pool_mutex_own(buf_pool));
591 #endif /* UNIV_DEBUG */
592 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
593 	ut_ad(bpage->in_LRU_list);
594 
595 	if (buf_page_in_file(bpage)) {
596 
597 		return(bpage->oldest_modification == 0
598 		       && bpage->buf_fix_count == 0
599 		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE);
600 	}
601 
602 	ib::fatal() << "Buffer block " << bpage << " state " << bpage->state
603 		<< " in the LRU list!";
604 
605 	return(FALSE);
606 }
607 
608 /********************************************************************//**
609 Returns true if the block is modified and ready for flushing.
610 @return true if can flush immediately */
611 bool
612 buf_flush_ready_for_flush(
613 /*======================*/
614 	buf_page_t*	bpage,	/*!< in: buffer control block, must be
615 				buf_page_in_file(bpage) */
616 	buf_flush_t	flush_type)/*!< in: type of flush */
617 {
618 #ifdef UNIV_DEBUG
619 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
620 	ut_ad(buf_pool_mutex_own(buf_pool));
621 #endif /* UNIV_DEBUG */
622 
623 	ut_a(buf_page_in_file(bpage));
624 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
625 	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
626 
627 	if (bpage->oldest_modification == 0
628 	    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
629 		return(false);
630 	}
631 
632 	ut_ad(bpage->in_flush_list);
633 
634 	switch (flush_type) {
635 	case BUF_FLUSH_LIST:
636 	case BUF_FLUSH_LRU:
637 	case BUF_FLUSH_SINGLE_PAGE:
638 		return(true);
639 
640 	case BUF_FLUSH_N_TYPES:
641 		break;
642 	}
643 
644 	ut_error;
645 	return(false);
646 }
647 
648 /********************************************************************//**
649 Remove a block from the flush list of modified blocks. */
650 void
651 buf_flush_remove(
652 /*=============*/
653 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
654 {
655 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
656 
657 	ut_ad(buf_pool_mutex_own(buf_pool));
658 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
659 	ut_ad(bpage->in_flush_list);
660 
661 	buf_flush_list_mutex_enter(buf_pool);
662 
663 	/* Important that we adjust the hazard pointer before removing
664 	the bpage from the flush list. */
665 	buf_pool->flush_hp.adjust(bpage);
666 	buf_pool->oldest_hp.adjust(bpage);
667 
668 	switch (buf_page_get_state(bpage)) {
669 	case BUF_BLOCK_POOL_WATCH:
670 	case BUF_BLOCK_ZIP_PAGE:
671 		/* Clean compressed pages should not be on the flush list */
672 	case BUF_BLOCK_NOT_USED:
673 	case BUF_BLOCK_READY_FOR_USE:
674 	case BUF_BLOCK_MEMORY:
675 	case BUF_BLOCK_REMOVE_HASH:
676 		ut_error;
677 		return;
678 	case BUF_BLOCK_ZIP_DIRTY:
679 		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
680 		UT_LIST_REMOVE(buf_pool->flush_list, bpage);
681 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
682 		buf_LRU_insert_zip_clean(bpage);
683 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
684 		break;
685 	case BUF_BLOCK_FILE_PAGE:
686 		UT_LIST_REMOVE(buf_pool->flush_list, bpage);
687 		break;
688 	}
689 
690 	/* If the flush_rbt is active then delete from there as well. */
691 	if (buf_pool->flush_rbt != NULL) {
692 		buf_flush_delete_from_flush_rbt(bpage);
693 	}
694 
695 	/* Must be done after we have removed it from the flush_rbt
696 	because we assert on in_flush_list in comparison function. */
697 	ut_d(bpage->in_flush_list = FALSE);
698 
699 	buf_pool->stat.flush_list_bytes -= bpage->size.physical();
700 
701 	bpage->oldest_modification = 0;
702 
703 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
704 	ut_a(buf_flush_validate_skip(buf_pool));
705 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
706 
707 	/* If there is an observer that wants to know if the asynchronous
708 	flushing was done then notify it. */
709 	if (bpage->flush_observer != NULL) {
710 		bpage->flush_observer->notify_remove(buf_pool, bpage);
711 
712 		bpage->flush_observer = NULL;
713 	}
714 
715 	buf_flush_list_mutex_exit(buf_pool);
716 }
717 
718 /*******************************************************************//**
719 Relocates a buffer control block on the flush_list.
720 Note that it is assumed that the contents of bpage have already been
721 copied to dpage.
722 IMPORTANT: When this function is called bpage and dpage are not
723 exact copies of each other. For example, they both will have different
724 ::state. Also the ::list pointers in dpage may be stale. We need to
725 use the current list node (bpage) to do the list manipulation because
726 the list pointers could have changed between the time that we copied
727 the contents of bpage to the dpage and the flush list manipulation
728 below. */
729 void
730 buf_flush_relocate_on_flush_list(
731 /*=============================*/
732 	buf_page_t*	bpage,	/*!< in/out: control block being moved */
733 	buf_page_t*	dpage)	/*!< in/out: destination block */
734 {
735 	buf_page_t*	prev;
736 	buf_page_t*	prev_b = NULL;
737 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
738 
739 	ut_ad(buf_pool_mutex_own(buf_pool));
740 	/* Must reside in the same buffer pool. */
741 	ut_ad(buf_pool == buf_pool_from_bpage(dpage));
742 
743 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
744 
745 	buf_flush_list_mutex_enter(buf_pool);
746 
747 	/* FIXME: At this point we have both buf_pool and flush_list
748 	mutexes. Theoretically removal of a block from flush list is
749 	only covered by flush_list mutex but currently we do
750 	have buf_pool mutex in buf_flush_remove() therefore this block
751 	is guaranteed to be in the flush list. We need to check if
752 	this will work without the assumption of block removing code
753 	having the buf_pool mutex. */
754 	ut_ad(bpage->in_flush_list);
755 	ut_ad(dpage->in_flush_list);
756 
757 	/* If recovery is active we must swap the control blocks in
758 	the flush_rbt as well. */
759 	if (buf_pool->flush_rbt != NULL) {
760 		buf_flush_delete_from_flush_rbt(bpage);
761 		prev_b = buf_flush_insert_in_flush_rbt(dpage);
762 	}
763 
764 	/* Important that we adjust the hazard pointer before removing
765 	the bpage from the flush list. */
766 	buf_pool->flush_hp.move(bpage, dpage);
767 	buf_pool->oldest_hp.move(bpage, dpage);
768 
769 	/* Must be done after we have removed it from the flush_rbt
770 	because we assert on in_flush_list in comparison function. */
771 	ut_d(bpage->in_flush_list = FALSE);
772 
773 	prev = UT_LIST_GET_PREV(list, bpage);
774 	UT_LIST_REMOVE(buf_pool->flush_list, bpage);
775 
776 	if (prev) {
777 		ut_ad(prev->in_flush_list);
778 		UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage);
779 	} else {
780 		UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage);
781 	}
782 
783 	/* Just an extra check. Previous in flush_list
784 	should be the same control block as in flush_rbt. */
785 	ut_a(buf_pool->flush_rbt == NULL || prev_b == prev);
786 
787 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
788 	ut_a(buf_flush_validate_low(buf_pool));
789 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
790 
791 	buf_flush_list_mutex_exit(buf_pool);
792 }
793 
794 /********************************************************************//**
795 Updates the flush system data structures when a write is completed. */
796 void
797 buf_flush_write_complete(
798 /*=====================*/
799 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
800 {
801 	buf_flush_t	flush_type;
802 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
803 
804 	ut_ad(bpage);
805 
806 	buf_flush_remove(bpage);
807 
808 	flush_type = buf_page_get_flush_type(bpage);
809 	buf_pool->n_flush[flush_type]--;
810 
811 	if (buf_pool->n_flush[flush_type] == 0
812 	    && buf_pool->init_flush[flush_type] == FALSE) {
813 
814 		/* The running flush batch has ended */
815 
816 		os_event_set(buf_pool->no_flush[flush_type]);
817 	}
818 
819 	buf_dblwr_update(bpage, flush_type);
820 }
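/* Note: the n_flush[] counter decremented above was incremented, and the
corresponding no_flush[] event reset, in buf_flush_page() when the write
was scheduled, so a batch is considered complete only once every scheduled
write has completed and reached this function. */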
821 #endif /* !UNIV_HOTBACKUP */
822 
823 /** Calculate the checksum of a page from a compressed table and update
824 the page.
825 @param[in,out]	page	page to update
826 @param[in]	size	compressed page size
827 @param[in]	lsn	LSN to stamp on the page */
828 void
829 buf_flush_update_zip_checksum(
830 	buf_frame_t*	page,
831 	ulint		size,
832 	lsn_t		lsn)
833 {
834 	ut_a(size > 0);
835 
836 	const uint32_t	checksum = page_zip_calc_checksum(
837 		page, size,
838 		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
839 
840 	mach_write_to_8(page + FIL_PAGE_LSN, lsn);
841 	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
842 }
843 
844 /** Initialize a page for writing to the tablespace.
845 @param[in]	block		buffer block; NULL if bypassing the buffer pool
846 @param[in,out]	page		page frame
847 @param[in,out]	page_zip_	compressed page, or NULL if uncompressed
848 @param[in]	newest_lsn	newest modification LSN to the page
849 @param[in]	skip_checksum	whether to disable the page checksum */
850 void
851 buf_flush_init_for_writing(
852 	const buf_block_t*	block,
853 	byte*			page,
854 	void*			page_zip_,
855 	lsn_t			newest_lsn,
856 	bool			skip_checksum)
857 {
858 	ib_uint32_t	checksum = BUF_NO_CHECKSUM_MAGIC;
859 
860 	ut_ad(block == NULL || block->frame == page);
861 	ut_ad(block == NULL || page_zip_ == NULL
862 	      || &block->page.zip == page_zip_);
863 	ut_ad(page);
864 
865 	if (page_zip_) {
866 		page_zip_des_t*	page_zip;
867 		ulint		size;
868 
869 		page_zip = static_cast<page_zip_des_t*>(page_zip_);
870 		size = page_zip_get_size(page_zip);
871 
872 		ut_ad(size);
873 		ut_ad(ut_is_2pow(size));
874 		ut_ad(size <= UNIV_ZIP_SIZE_MAX);
875 
876 		switch (fil_page_get_type(page)) {
877 		case FIL_PAGE_TYPE_ALLOCATED:
878 		case FIL_PAGE_INODE:
879 		case FIL_PAGE_IBUF_BITMAP:
880 		case FIL_PAGE_TYPE_FSP_HDR:
881 		case FIL_PAGE_TYPE_XDES:
882 			/* These are essentially uncompressed pages. */
883 			memcpy(page_zip->data, page, size);
884 			/* fall through */
885 		case FIL_PAGE_TYPE_ZBLOB:
886 		case FIL_PAGE_TYPE_ZBLOB2:
887 		case FIL_PAGE_INDEX:
888 		case FIL_PAGE_RTREE:
889 
890 			buf_flush_update_zip_checksum(
891 				page_zip->data, size, newest_lsn);
892 
893 			return;
894 		}
895 
896 		ib::error() << "The compressed page to be written"
897 			" seems corrupt:";
898 		ut_print_buf(stderr, page, size);
899 		fputs("\nInnoDB: Possibly older version of the page:", stderr);
900 		ut_print_buf(stderr, page_zip->data, size);
901 		putc('\n', stderr);
902 		ut_error;
903 	}
904 
905 	/* Write the newest modification lsn to the page header and trailer */
906 	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
907 
908 	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
909 			newest_lsn);
910 
911 	if (skip_checksum) {
912 		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
913 	} else {
914 		if (block != NULL && UNIV_PAGE_SIZE == 16384) {
915 			/* The page type could be garbage in old files
916 			created before MySQL 5.5. Such files always
917 			had a page size of 16 kilobytes. */
918 			ulint	page_type = fil_page_get_type(page);
919 			ulint	reset_type = page_type;
920 
921 			switch (block->page.id.page_no() % 16384) {
922 			case 0:
923 				reset_type = block->page.id.page_no() == 0
924 					? FIL_PAGE_TYPE_FSP_HDR
925 					: FIL_PAGE_TYPE_XDES;
926 				break;
927 			case 1:
928 				reset_type = FIL_PAGE_IBUF_BITMAP;
929 				break;
930 			default:
931 				switch (page_type) {
932 				case FIL_PAGE_INDEX:
933 				case FIL_PAGE_RTREE:
934 				case FIL_PAGE_UNDO_LOG:
935 				case FIL_PAGE_INODE:
936 				case FIL_PAGE_IBUF_FREE_LIST:
937 				case FIL_PAGE_TYPE_ALLOCATED:
938 				case FIL_PAGE_TYPE_SYS:
939 				case FIL_PAGE_TYPE_TRX_SYS:
940 				case FIL_PAGE_TYPE_BLOB:
941 				case FIL_PAGE_TYPE_ZBLOB:
942 				case FIL_PAGE_TYPE_ZBLOB2:
943 					break;
944 				case FIL_PAGE_TYPE_FSP_HDR:
945 				case FIL_PAGE_TYPE_XDES:
946 				case FIL_PAGE_IBUF_BITMAP:
947 					/* These pages should have
948 					predetermined page numbers
949 					(see above). */
950 				default:
951 					reset_type = FIL_PAGE_TYPE_UNKNOWN;
952 					break;
953 				}
954 			}
955 
956 			if (UNIV_UNLIKELY(page_type != reset_type)) {
957 				ib::info()
958 					<< "Resetting invalid page "
959 					<< block->page.id << " type "
960 					<< page_type << " to "
961 					<< reset_type << " when flushing.";
962 				fil_page_set_type(page, reset_type);
963 			}
964 		}
965 
966 		switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
967 		case SRV_CHECKSUM_ALGORITHM_CRC32:
968 		case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
969 			checksum = buf_calc_page_crc32(page);
970 			mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
971 					checksum);
972 			break;
973 		case SRV_CHECKSUM_ALGORITHM_INNODB:
974 		case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
975 			checksum = (ib_uint32_t) buf_calc_page_new_checksum(
976 				page);
977 			mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
978 					checksum);
979 			checksum = (ib_uint32_t) buf_calc_page_old_checksum(
980 				page);
981 			break;
982 		case SRV_CHECKSUM_ALGORITHM_NONE:
983 		case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
984 			mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
985 					checksum);
986 			break;
987 			/* no default so the compiler will emit a warning if
988 			new enum value is added and not handled here */
989 		}
990 	}
991 
992 	/* With the InnoDB checksum, we overwrite the first 4 bytes of
993 	the end lsn field to store the old formula checksum. Since it
994 	depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
995 	be calculated after storing the new formula checksum.
996 
997 	In other cases we write the same value to both fields.
998 	If CRC32 is used then it is faster to use that checksum
999 	(calculated above) instead of calculating another one.
1000 	We can afford to store something other than
1001 	buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
1002 	this field because the file will not be readable by old
1003 	versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
1004 
1005 	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
1006 			checksum);
1007 }
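/* For reference, the fields stamped on an uncompressed page by the code
above: FIL_PAGE_LSN in the header holds the full newest_lsn;
FIL_PAGE_SPACE_OR_CHKSUM in the header holds the selected checksum (or
BUF_NO_CHECKSUM_MAGIC); and the 8-byte trailer first receives newest_lsn,
after which mach_write_to_4() overwrites only its first 4 bytes, leaving
the trailer as [checksum value left in 'checksum' after the switch above:
old-formula for the InnoDB algorithms, the CRC32 otherwise, or the magic
when checksums are disabled][low 32 bits of newest_lsn]. */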
1008 
1009 #ifndef UNIV_HOTBACKUP
1010 /********************************************************************//**
1011 Does an asynchronous write of a buffer page. NOTE: in simulated aio and
1012 also when the doublewrite buffer is used, we must call
1013 buf_dblwr_flush_buffered_writes after we have posted a batch of
1014 writes! */
1015 static
1016 void
1017 buf_flush_write_block_low(
1018 /*======================*/
1019 	buf_page_t*	bpage,		/*!< in: buffer block to write */
1020 	buf_flush_t	flush_type,	/*!< in: type of flush */
1021 	bool		sync)		/*!< in: true if sync IO request */
1022 {
1023 	page_t*	frame = NULL;
1024 
1025 #ifdef UNIV_DEBUG
1026 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1027 	ut_ad(!buf_pool_mutex_own(buf_pool));
1028 #endif /* UNIV_DEBUG */
1029 
1030 	DBUG_PRINT("ib_buf", ("flush %s %u page " UINT32PF ":" UINT32PF,
1031 			      sync ? "sync" : "async", (unsigned) flush_type,
1032 			      bpage->id.space(), bpage->id.page_no()));
1033 
1034 	ut_ad(buf_page_in_file(bpage));
1035 
1036 	/* We are not holding buf_pool->mutex or block_mutex here.
1037 	Nevertheless, it is safe to access bpage, because it is
1038 	io_fixed and oldest_modification != 0.  Thus, it cannot be
1039 	relocated in the buffer pool or removed from flush_list or
1040 	LRU_list. */
1041 	ut_ad(!buf_pool_mutex_own(buf_pool));
1042 	ut_ad(!buf_flush_list_mutex_own(buf_pool));
1043 	ut_ad(!buf_page_get_mutex(bpage)->is_owned());
1044 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1045 	ut_ad(bpage->oldest_modification != 0);
1046 
1047 #ifdef UNIV_IBUF_COUNT_DEBUG
1048 	ut_a(ibuf_count_get(bpage->id) == 0);
1049 #endif /* UNIV_IBUF_COUNT_DEBUG */
1050 
1051 	ut_ad(bpage->newest_modification != 0);
1052 
1053 	/* Force the log to the disk before writing the modified block */
1054 	if (!srv_read_only_mode) {
1055 		log_write_up_to(bpage->newest_modification, true);
1056 	}
1057 
1058 	switch (buf_page_get_state(bpage)) {
1059 	case BUF_BLOCK_POOL_WATCH:
1060 	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1061 	case BUF_BLOCK_NOT_USED:
1062 	case BUF_BLOCK_READY_FOR_USE:
1063 	case BUF_BLOCK_MEMORY:
1064 	case BUF_BLOCK_REMOVE_HASH:
1065 		ut_error;
1066 		break;
1067 	case BUF_BLOCK_ZIP_DIRTY:
1068 		frame = bpage->zip.data;
1069 
1070 		mach_write_to_8(frame + FIL_PAGE_LSN,
1071 				bpage->newest_modification);
1072 
1073 		ut_a(page_zip_verify_checksum(frame, bpage->size.physical()));
1074 		break;
1075 	case BUF_BLOCK_FILE_PAGE:
1076 		frame = bpage->zip.data;
1077 		if (!frame) {
1078 			frame = ((buf_block_t*) bpage)->frame;
1079 		}
1080 
1081 		buf_flush_init_for_writing(
1082 			reinterpret_cast<const buf_block_t*>(bpage),
1083 			reinterpret_cast<const buf_block_t*>(bpage)->frame,
1084 			bpage->zip.data ? &bpage->zip : NULL,
1085 			bpage->newest_modification,
1086 			fsp_is_checksum_disabled(bpage->id.space()));
1087 		break;
1088 	}
1089 
1090 	/* Disable use of double-write buffer for temporary tablespace.
1091 	Given the nature and load of the temporary tablespace, the doublewrite
1092 	buffer only adds overhead during flushing. */
1093 
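	/* Three dispatch paths follow: write directly through fil_io() when
	the doublewrite buffer is not usable (disabled, not yet created,
	read-only mode, or a temporary tablespace page); write through the
	single-page slot of the doublewrite buffer for BUF_FLUSH_SINGLE_PAGE;
	or add the page to the current doublewrite batch otherwise. */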
1094 	if (!srv_use_doublewrite_buf
1095 	    || buf_dblwr == NULL
1096 	    || srv_read_only_mode
1097 	    || fsp_is_system_temporary(bpage->id.space())) {
1098 
1099 		ut_ad(!srv_read_only_mode
1100 		      || fsp_is_system_temporary(bpage->id.space()));
1101 
1102 		ulint	type = IORequest::WRITE | IORequest::DO_NOT_WAKE;
1103 
1104 		IORequest	request(type);
1105 
1106 		fil_io(request,
1107 		       sync, bpage->id, bpage->size, 0, bpage->size.physical(),
1108 		       frame, bpage);
1109 
1110 	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
1111 		buf_dblwr_write_single_page(bpage, sync);
1112 	} else {
1113 		ut_ad(!sync);
1114 		buf_dblwr_add_to_batch(bpage);
1115 	}
1116 
1117 	/* When doing single page flushing the IO is done synchronously
1118 	and we flush the changes to disk only for the tablespace we
1119 	are working on. */
1120 	if (sync) {
1121 		ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
1122 		fil_flush(bpage->id.space());
1123 
1124 		/* true means we want to evict this page from the
1125 		LRU list as well. */
1126 		buf_page_io_complete(bpage, true);
1127 	}
1128 
1129 	/* Increment the counter of I/O operations used
1130 	for selecting LRU policy. */
1131 	buf_LRU_stat_inc_io();
1132 }
1133 
1134 /********************************************************************//**
1135 Writes a flushable page asynchronously from the buffer pool to a file.
1136 NOTE: in simulated aio we must call
1137 os_aio_simulated_wake_handler_threads after we have posted a batch of
1138 writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
1139 held upon entering this function, and they will be released by this
1140 function if it returns true.
1141 @return TRUE if the page was flushed */
1142 ibool
1143 buf_flush_page(
1144 /*===========*/
1145 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1146 	buf_page_t*	bpage,		/*!< in: buffer control block */
1147 	buf_flush_t	flush_type,	/*!< in: type of flush */
1148 	bool		sync)		/*!< in: true if sync IO request */
1149 {
1150 	BPageMutex*	block_mutex;
1151 
1152 	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1153 	ut_ad(buf_pool_mutex_own(buf_pool));
1154 	ut_ad(buf_page_in_file(bpage));
1155 	ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1156 
1157 	block_mutex = buf_page_get_mutex(bpage);
1158 	ut_ad(mutex_own(block_mutex));
1159 
1160 	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1161 
1162 	bool	is_uncompressed;
1163 
1164 	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1165 	ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1166 
1167 	ibool		flush;
1168 	rw_lock_t*	rw_lock;
1169 	bool		no_fix_count = bpage->buf_fix_count == 0;
1170 
1171 	if (!is_uncompressed) {
1172 		flush = TRUE;
1173 		rw_lock = NULL;
1174 	} else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)
1175 		   || (!no_fix_count
1176 		       && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP
1177 		       && fsp_is_system_temporary(bpage->id.space()))) {
1178 		/* This is a heuristic, to avoid expensive SX attempts. */
1179 		/* For a table residing in the temporary tablespace, sync is
1180 		done using IO_FIX, so before scheduling the flush ensure that
1181 		the page is not fixed. */
1182 		flush = FALSE;
1183 	} else {
1184 		rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1185 		if (flush_type != BUF_FLUSH_LIST) {
1186 			flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
1187 		} else {
1188 			/* Will SX lock later */
1189 			flush = TRUE;
1190 		}
1191 	}
1192 
1193 	if (flush) {
1194 
1195 		/* We are committed to flushing by the time we get here */
1196 
1197 		buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1198 
1199 		buf_page_set_flush_type(bpage, flush_type);
1200 
1201 		if (buf_pool->n_flush[flush_type] == 0) {
1202 			os_event_reset(buf_pool->no_flush[flush_type]);
1203 		}
1204 
1205 		++buf_pool->n_flush[flush_type];
1206 
1207 		mutex_exit(block_mutex);
1208 
1209 		buf_pool_mutex_exit(buf_pool);
1210 
1211 		if (flush_type == BUF_FLUSH_LIST
1212 		    && is_uncompressed
1213 		    && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
1214 
1215 			if (!fsp_is_system_temporary(bpage->id.space())) {
1216 				/* To avoid a possible deadlock involving the
1217 				doublewrite buffer, flush it first, because
1218 				it might be holding another block->lock. */
1219 				buf_dblwr_flush_buffered_writes();
1220 			} else {
1221 				buf_dblwr_sync_datafiles();
1222 			}
1223 
1224 			rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
1225 		}
1226 
1227 		/* If there is an observer that wants to know if the asynchronous
1228 		flushing was dispatched then notify it.
1229 		Note: we set the flush observer on a page with its x-latch held,
1230 		so we can guarantee that notify_flush and notify_remove are
1231 		called in pairs with the s-latch held on an uncompressed page. */
1232 		if (bpage->flush_observer != NULL) {
1233 			buf_pool_mutex_enter(buf_pool);
1234 
1235 			bpage->flush_observer->notify_flush(buf_pool, bpage);
1236 
1237 			buf_pool_mutex_exit(buf_pool);
1238 		}
1239 
1240 		/* Even though bpage is not protected by any mutex at this
1241 		point, it is safe to access bpage, because it is io_fixed and
1242 		oldest_modification != 0.  Thus, it cannot be relocated in the
1243 		buffer pool or removed from flush_list or LRU_list. */
1244 
1245 		buf_flush_write_block_low(bpage, flush_type, sync);
1246 	}
1247 
1248 	return(flush);
1249 }
1250 
1251 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1252 /********************************************************************//**
1253 Writes a flushable page asynchronously from the buffer pool to a file.
1254 NOTE: buf_pool->mutex and block->mutex must be held upon entering this
1255 function, and they will be released by this function after flushing.
1256 This is loosely based on buf_flush_batch() and buf_flush_page().
1257 @return TRUE if the page was flushed and the mutexes released */
1258 ibool
1259 buf_flush_page_try(
1260 /*===============*/
1261 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
1262 	buf_block_t*	block)		/*!< in/out: buffer control block */
1263 {
1264 	ut_ad(buf_pool_mutex_own(buf_pool));
1265 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1266 	ut_ad(buf_page_mutex_own(block));
1267 
1268 	if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1269 		return(FALSE);
1270 	}
1271 
1272 	/* The following call will release the buffer pool and
1273 	block mutex. */
1274 	return(buf_flush_page(
1275 			buf_pool, &block->page,
1276 			BUF_FLUSH_SINGLE_PAGE, true));
1277 }
1278 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1279 
1280 /** Check whether the page is in the buffer pool and can be flushed.
1281 @param[in]	page_id		page id
1282 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
1283 @return true if the page can be flushed. */
1284 static
1285 bool
1286 buf_flush_check_neighbor(
1287 	const page_id_t&	page_id,
1288 	buf_flush_t		flush_type)
1289 {
1290 	buf_page_t*	bpage;
1291 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
1292 	bool		ret;
1293 
1294 	ut_ad(flush_type == BUF_FLUSH_LRU
1295 	      || flush_type == BUF_FLUSH_LIST);
1296 
1297 	buf_pool_mutex_enter(buf_pool);
1298 
1299 	/* We only want to flush pages from this buffer pool. */
1300 	bpage = buf_page_hash_get(buf_pool, page_id);
1301 
1302 	if (!bpage) {
1303 
1304 		buf_pool_mutex_exit(buf_pool);
1305 		return(false);
1306 	}
1307 
1308 	ut_a(buf_page_in_file(bpage));
1309 
1310 	/* We avoid flushing 'non-old' blocks in an LRU flush,
1311 	because the flushed blocks are soon freed */
1312 
1313 	ret = false;
1314 	if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1315 		BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1316 
1317 		mutex_enter(block_mutex);
1318 		if (buf_flush_ready_for_flush(bpage, flush_type)) {
1319 			ret = true;
1320 		}
1321 		mutex_exit(block_mutex);
1322 	}
1323 	buf_pool_mutex_exit(buf_pool);
1324 
1325 	return(ret);
1326 }
1327 
1328 /** Flushes to disk all flushable pages within the flush area.
1329 @param[in]	page_id		page id
1330 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
1331 @param[in]	n_flushed	number of pages flushed so far in this batch
1332 @param[in]	n_to_flush	maximum number of pages we are allowed to flush
1333 @return number of pages flushed */
1334 static
1335 ulint
1336 buf_flush_try_neighbors(
1337 	const page_id_t&	page_id,
1338 	buf_flush_t		flush_type,
1339 	ulint			n_flushed,
1340 	ulint			n_to_flush)
1341 {
1342 	ulint		i;
1343 	ulint		low;
1344 	ulint		high;
1345 	ulint		count = 0;
1346 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
1347 
1348 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1349 
1350 	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1351 	    || srv_flush_neighbors == 0) {
1352 		/* If there is little space or neighbor flushing is
1353 		not enabled then just flush the victim. */
1354 		low = page_id.page_no();
1355 		high = page_id.page_no() + 1;
1356 	} else {
1357 		/* When flushed, dirty blocks are searched in
1358 		neighborhoods of this size, and flushed along with the
1359 		original page. */
1360 
1361 		ulint	buf_flush_area;
1362 
1363 		buf_flush_area	= ut_min(
1364 			BUF_READ_AHEAD_AREA(buf_pool),
1365 			buf_pool->curr_size / 16);
1366 
1367 		low = (page_id.page_no() / buf_flush_area) * buf_flush_area;
1368 		high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area;
1369 
1370 		if (srv_flush_neighbors == 1) {
1371 			/* adjust 'low' and 'high' to the limits
1372 			   of the contiguous dirty area */
1373 			if (page_id.page_no() > low) {
1374 				for (i = page_id.page_no() - 1; i >= low; i--) {
1375 					if (!buf_flush_check_neighbor(
1376 						page_id_t(page_id.space(), i),
1377 						flush_type)) {
1378 
1379 						break;
1380 					}
1381 
1382 					if (i == low) {
1383 						/* Avoid wrap-around when
1384 						low == 0 and calling
1385 						buf_flush_check_neighbor() with
1386 						i == (ulint) -1 */
1387 						i--;
1388 						break;
1389 					}
1390 				}
1391 				low = i + 1;
1392 			}
1393 
1394 			for (i = page_id.page_no() + 1;
1395 			     i < high
1396 			     && buf_flush_check_neighbor(
1397 				     page_id_t(page_id.space(), i),
1398 				     flush_type);
1399 			     i++) {
1400 				/* do nothing */
1401 			}
1402 			high = i;
1403 		}
1404 	}
1405 
1406 	const ulint	space_size = fil_space_get_size(page_id.space());
1407 	if (high > space_size) {
1408 		high = space_size;
1409 	}
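	/* Worked example of the window computed above, when the
	neighbor-flushing branch is taken: with srv_flush_neighbors == 2,
	buf_flush_area == 64 and page_id.page_no() == 107,
	low == (107 / 64) * 64 == 64 and high == ((107 / 64) + 1) * 64 == 128,
	later clamped to the tablespace size. With srv_flush_neighbors == 1
	the window is further shrunk to the contiguous run of flushable
	neighbors around the page. */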
1410 
1411 	DBUG_PRINT("ib_buf", ("flush " UINT32PF ":%u..%u",
1412 			      page_id.space(),
1413 			      (unsigned) low, (unsigned) high));
1414 
1415 	for (ulint i = low; i < high; i++) {
1416 		buf_page_t*	bpage;
1417 
1418 		if ((count + n_flushed) >= n_to_flush) {
1419 
1420 			/* We have already flushed enough pages and
1421 			should call it a day. There is, however, one
1422 			exception. If the page whose neighbors we
1423 			are flushing has not been flushed yet then
1424 			we'll try to flush the victim that we
1425 			selected originally. */
1426 			if (i <= page_id.page_no()) {
1427 				i = page_id.page_no();
1428 			} else {
1429 				break;
1430 			}
1431 		}
1432 
1433 		const page_id_t	cur_page_id(page_id.space(), i);
1434 
1435 		buf_pool = buf_pool_get(cur_page_id);
1436 
1437 		buf_pool_mutex_enter(buf_pool);
1438 
1439 		/* We only want to flush pages from this buffer pool. */
1440 		bpage = buf_page_hash_get(buf_pool, cur_page_id);
1441 
1442 		if (bpage == NULL) {
1443 
1444 			buf_pool_mutex_exit(buf_pool);
1445 			continue;
1446 		}
1447 
1448 		ut_a(buf_page_in_file(bpage));
1449 
1450 		/* We avoid flushing 'non-old' blocks in an LRU flush,
1451 		because the flushed blocks are soon freed */
1452 
1453 		if (flush_type != BUF_FLUSH_LRU
1454 		    || i == page_id.page_no()
1455 		    || buf_page_is_old(bpage)) {
1456 
1457 			BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1458 
1459 			mutex_enter(block_mutex);
1460 
1461 			if (buf_flush_ready_for_flush(bpage, flush_type)
1462 			    && (i == page_id.page_no()
1463 				|| bpage->buf_fix_count == 0)) {
1464 
1465 				/* We also try to flush those
1466 				neighbors != offset */
1467 
1468 				if (buf_flush_page(
1469 					buf_pool, bpage, flush_type, false)) {
1470 
1471 					++count;
1472 				} else {
1473 					mutex_exit(block_mutex);
1474 					buf_pool_mutex_exit(buf_pool);
1475 				}
1476 
1477 				continue;
1478 			} else {
1479 				mutex_exit(block_mutex);
1480 			}
1481 		}
1482 		buf_pool_mutex_exit(buf_pool);
1483 	}
1484 
1485 	if (count > 1) {
1486 		MONITOR_INC_VALUE_CUMULATIVE(
1487 			MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1488 			MONITOR_FLUSH_NEIGHBOR_COUNT,
1489 			MONITOR_FLUSH_NEIGHBOR_PAGES,
1490 			(count - 1));
1491 	}
1492 
1493 	return(count);
1494 }
1495 
1496 /** Check if the block is modified and ready for flushing.
1497 If the block is ready to flush then flush the page and try to flush
1498 its neighbors.
1499 @param[in]	bpage		buffer control block,
1500 must be buf_page_in_file(bpage)
1501 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
1502 @param[in]	n_to_flush	number of pages to flush
1503 @param[in,out]	count		number of pages flushed
1504 @return TRUE if the buf_pool mutex was released during this function.
1505 This does not guarantee that any pages were actually written.
1506 The number of pages written is added to *count. */
1507 static
1508 bool
1509 buf_flush_page_and_try_neighbors(
1510 	buf_page_t*		bpage,
1511 	buf_flush_t		flush_type,
1512 	ulint			n_to_flush,
1513 	ulint*			count)
1514 {
1515 #ifdef UNIV_DEBUG
1516 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1517 
1518 	ut_ad(buf_pool_mutex_own(buf_pool));
1519 #endif /* UNIV_DEBUG */
1520 
1521 	bool		flushed;
1522 	BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
1523 
1524 	mutex_enter(block_mutex);
1525 
1526 	ut_a(buf_page_in_file(bpage));
1527 
1528 	if (buf_flush_ready_for_flush(bpage, flush_type)) {
1529 		buf_pool_t*	buf_pool;
1530 
1531 		buf_pool = buf_pool_from_bpage(bpage);
1532 
1533 		const page_id_t	page_id = bpage->id;
1534 
1535 		mutex_exit(block_mutex);
1536 
1537 		buf_pool_mutex_exit(buf_pool);
1538 
1539 		/* Try to flush also all the neighbors */
1540 		*count += buf_flush_try_neighbors(
1541 			page_id, flush_type, *count, n_to_flush);
1542 
1543 		buf_pool_mutex_enter(buf_pool);
1544 		flushed = true;
1545 	} else {
1546 		mutex_exit(block_mutex);
1547 
1548 		flushed = false;
1549 	}
1550 
1551 	ut_ad(buf_pool_mutex_own(buf_pool));
1552 
1553 	return(flushed);
1554 }
1555 
1556 /*******************************************************************//**
1557 This utility moves the uncompressed frames of pages to the free list.
1558 Note that this function does not actually flush any data to disk. It
1559 just detaches the uncompressed frames from the compressed pages at the
1560 tail of the unzip_LRU and puts those freed frames in the free list.
1561 Note that it is a best effort attempt and it is not guaranteed that
1562 after a call to this function there will be 'max' blocks in the free
1563 list.
1564 @return number of blocks moved to the free list. */
1565 static
1566 ulint
1567 buf_free_from_unzip_LRU_list_batch(
1568 /*===============================*/
1569 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1570 	ulint		max)		/*!< in: desired number of
1571 					blocks in the free_list */
1572 {
1573 	ulint		scanned = 0;
1574 	ulint		count = 0;
1575 	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
1576 	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1577 
1578 	ut_ad(buf_pool_mutex_own(buf_pool));
1579 
1580 	buf_block_t*	block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1581 
1582 	while (block != NULL
1583 	       && count < max
1584 	       && free_len < srv_LRU_scan_depth
1585 	       && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1586 
1587 		++scanned;
1588 		if (buf_LRU_free_page(&block->page, false)) {
1589 			/* Block was freed. buf_pool->mutex potentially
1590 			released and reacquired */
1591 			++count;
1592 			block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1593 
1594 		} else {
1595 
1596 			block = UT_LIST_GET_PREV(unzip_LRU, block);
1597 		}
1598 
1599 		free_len = UT_LIST_GET_LEN(buf_pool->free);
1600 		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1601 	}
1602 
1603 	ut_ad(buf_pool_mutex_own(buf_pool));
1604 
1605 	if (scanned) {
1606 		MONITOR_INC_VALUE_CUMULATIVE(
1607 			MONITOR_LRU_BATCH_SCANNED,
1608 			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1609 			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1610 			scanned);
1611 	}
1612 
1613 	return(count);
1614 }
1615 
1616 /*******************************************************************//**
1617 This utility flushes dirty blocks from the end of the LRU list.
1618 The calling thread is not allowed to own any latches on pages!
1619 It attempts to make 'max' blocks available in the free list. Note that
1620 it is a best effort attempt and it is not guaranteed that after a call
1621 to this function there will be 'max' blocks in the free list.
1622 @return number of blocks for which the write request was queued. */
1623 static
1624 ulint
1625 buf_flush_LRU_list_batch(
1626 /*=====================*/
1627 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1628 	ulint		max)		/*!< in: desired number of
1629 					blocks in the free_list */
1630 {
1631 	buf_page_t*	bpage;
1632 	ulint		scanned = 0;
1633 	ulint		evict_count = 0;
1634 	ulint		count = 0;
1635 	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
1636 	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1637 	ulint		withdraw_depth = 0;
1638 
1639 	ut_ad(buf_pool_mutex_own(buf_pool));
1640 
1641 	if (buf_pool->curr_size < buf_pool->old_size
1642 	    && buf_pool->withdraw_target > 0) {
1643 		withdraw_depth = buf_pool->withdraw_target
1644 				 - UT_LIST_GET_LEN(buf_pool->withdraw);
1645 	}
1646 
1647 	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1648 	     bpage != NULL && count + evict_count < max
1649 	     && free_len < srv_LRU_scan_depth + withdraw_depth
1650 	     && lru_len > BUF_LRU_MIN_LEN;
1651 	     ++scanned,
1652 	     bpage = buf_pool->lru_hp.get()) {
1653 
1654 		buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
1655 		buf_pool->lru_hp.set(prev);
1656 
1657 		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
1658 
1659 		mutex_enter(block_mutex);
1660 
1661 		if (buf_flush_ready_for_replace(bpage)) {
1662 			/* block is ready for eviction i.e., it is
1663 			clean and is not IO-fixed or buffer fixed. */
1664 			mutex_exit(block_mutex);
1665 			if (buf_LRU_free_page(bpage, true)) {
1666 				++evict_count;
1667 			}
1668 		} else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) {
1669 			/* Block is ready for flush. Dispatch an IO
1670 			request. The IO helper thread will put it on
1671 			free list in IO completion routine. */
1672 			mutex_exit(block_mutex);
1673 			buf_flush_page_and_try_neighbors(
1674 				bpage, BUF_FLUSH_LRU, max, &count);
1675 		} else {
1676 			/* Can't evict or dispatch this block. Go to
1677 			previous. */
1678 			ut_ad(buf_pool->lru_hp.is_hp(prev));
1679 			mutex_exit(block_mutex);
1680 		}
1681 
1682 		ut_ad(!mutex_own(block_mutex));
1683 		ut_ad(buf_pool_mutex_own(buf_pool));
1684 
1685 		free_len = UT_LIST_GET_LEN(buf_pool->free);
1686 		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1687 	}
1688 
1689 	buf_pool->lru_hp.set(NULL);
1690 
1691 	/* We keep track of all flushes happening as part of LRU
1692 	flush. When estimating the desired rate at which flush_list
1693 	should be flushed, we factor in this value. */
1694 	buf_lru_flush_page_count += count;
1695 
1696 	ut_ad(buf_pool_mutex_own(buf_pool));
1697 
1698 	if (evict_count) {
1699 		MONITOR_INC_VALUE_CUMULATIVE(
1700 			MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
1701 			MONITOR_LRU_BATCH_EVICT_COUNT,
1702 			MONITOR_LRU_BATCH_EVICT_PAGES,
1703 			evict_count);
1704 	}
1705 
1706 	if (scanned) {
1707 		MONITOR_INC_VALUE_CUMULATIVE(
1708 			MONITOR_LRU_BATCH_SCANNED,
1709 			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1710 			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1711 			scanned);
1712 	}
1713 
1714 	return(count);
1715 }
1716 
1717 /*******************************************************************//**
1718 Flush and move pages from LRU or unzip_LRU list to the free list.
1719 Whether LRU or unzip_LRU is used depends on the state of the system.
1720 @return number of blocks for which either the write request was queued
1721 or in case of unzip_LRU the number of blocks actually moved to the
1722 free list */
1723 static
1724 ulint
1725 buf_do_LRU_batch(
1726 /*=============*/
1727 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1728 	ulint		max)		/*!< in: desired number of
1729 					blocks in the free_list */
1730 {
1731 	ulint	count = 0;
1732 
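	/* First try to free uncompressed frames from the unzip_LRU, which
	needs no IO; only then flush or evict pages from the LRU tail for
	the remainder of the quota. */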
1733 	if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1734 		count += buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1735 	}
1736 
1737 	if (max > count) {
1738 		count += buf_flush_LRU_list_batch(buf_pool, max - count);
1739 	}
1740 
1741 	return(count);
1742 }
1743 
1744 /** This utility flushes dirty blocks from the end of the flush_list.
1745 The calling thread is not allowed to own any latches on pages!
1746 @param[in]	buf_pool	buffer pool instance
1747 @param[in]	min_n		wished minimum number of blocks flushed (it is
1748 not guaranteed that the actual number is that big, though)
1749 @param[in]	lsn_limit	all blocks whose oldest_modification is smaller
1750 than this should be flushed (if their number does not exceed min_n)
1751 @return number of blocks for which the write request was queued;
1752 ULINT_UNDEFINED if there was a flush of the same type already
1753 running */
1754 static
1755 ulint
1756 buf_do_flush_list_batch(
1757 	buf_pool_t*		buf_pool,
1758 	ulint			min_n,
1759 	lsn_t			lsn_limit)
1760 {
1761 	ulint		count = 0;
1762 	ulint		scanned = 0;
1763 
1764 	ut_ad(buf_pool_mutex_own(buf_pool));
1765 
1766 	/* Start from the end of the list looking for a suitable
1767 	block to be flushed. */
1768 	buf_flush_list_mutex_enter(buf_pool);
1769 	ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1770 
1771 	/* In order not to degenerate this scan into O(n*n) we attempt
1772 	to preserve the pointer to the previous block in the flush list.
1773 	To do so we declare it a hazard pointer. Any thread working on
1774 	the flush list must check the hazard pointer and, if it is
1775 	removing that same block, must reset it. */
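	/* The flush_list is ordered by oldest_modification, with the
	oldest pages at the tail, so scanning from the tail flushes the
	oldest modifications first and lets the checkpoint LSN advance. */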
1776 	for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1777 	     count < min_n && bpage != NULL && len > 0
1778 	     && bpage->oldest_modification < lsn_limit;
1779 	     bpage = buf_pool->flush_hp.get(),
1780 	     ++scanned) {
1781 
1782 		buf_page_t*	prev;
1783 
1784 		ut_a(bpage->oldest_modification > 0);
1785 		ut_ad(bpage->in_flush_list);
1786 
1787 		prev = UT_LIST_GET_PREV(list, bpage);
1788 		buf_pool->flush_hp.set(prev);
1789 		buf_flush_list_mutex_exit(buf_pool);
1790 
1791 #ifdef UNIV_DEBUG
1792 		bool flushed =
1793 #endif /* UNIV_DEBUG */
1794 		buf_flush_page_and_try_neighbors(
1795 			bpage, BUF_FLUSH_LIST, min_n, &count);
1796 
1797 		buf_flush_list_mutex_enter(buf_pool);
1798 
1799 		ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
1800 
1801 		--len;
1802 	}
1803 
1804 	buf_pool->flush_hp.set(NULL);
1805 	buf_flush_list_mutex_exit(buf_pool);
1806 
1807 	if (scanned) {
1808 		MONITOR_INC_VALUE_CUMULATIVE(
1809 			MONITOR_FLUSH_BATCH_SCANNED,
1810 			MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1811 			MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1812 			scanned);
1813 	}
1814 
1815 	if (count) {
1816 		MONITOR_INC_VALUE_CUMULATIVE(
1817 			MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1818 			MONITOR_FLUSH_BATCH_COUNT,
1819 			MONITOR_FLUSH_BATCH_PAGES,
1820 			count);
1821 	}
1822 
1823 	ut_ad(buf_pool_mutex_own(buf_pool));
1824 
1825 	return(count);
1826 }
1827 
1828 /** This utility flushes dirty blocks from the end of the LRU list or
1829 flush_list.
1830 NOTE 1: in the case of an LRU flush the calling thread may own latches to
1831 pages: to avoid deadlocks, this function must be written so that it cannot
1832 end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1833 the calling thread is not allowed to own any latches on pages!
1834 @param[in]	buf_pool	buffer pool instance
1835 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
1836 BUF_FLUSH_LIST, then the caller must not own any latches on pages
1837 @param[in]	min_n		wished minimum number of blocks flushed (it is
1838 not guaranteed that the actual number is that big, though)
1839 @param[in]	lsn_limit	in the case of BUF_FLUSH_LIST all blocks whose
1840 oldest_modification is smaller than this should be flushed (if their number
1841 does not exceed min_n), otherwise ignored
1842 @return number of blocks for which the write request was queued */
1843 static
1844 ulint
1845 buf_flush_batch(
1846 	buf_pool_t*		buf_pool,
1847 	buf_flush_t		flush_type,
1848 	ulint			min_n,
1849 	lsn_t			lsn_limit)
1850 {
1851 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1852 
1853 #ifdef UNIV_DEBUG
1854 	{
1855 		dict_sync_check	check(true);
1856 
1857 		ut_ad(flush_type != BUF_FLUSH_LIST
1858 		      || !sync_check_iterate(check));
1859 	}
1860 #endif /* UNIV_DEBUG */
1861 
1862 	buf_pool_mutex_enter(buf_pool);
1863 
1864 	ulint	count = 0;
1865 
1866 	/* Note: The buffer pool mutex is released and reacquired within
1867 	the flush functions. */
1868 	switch (flush_type) {
1869 	case BUF_FLUSH_LRU:
1870 		count = buf_do_LRU_batch(buf_pool, min_n);
1871 		break;
1872 	case BUF_FLUSH_LIST:
1873 		count = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
1874 		break;
1875 	default:
1876 		ut_error;
1877 	}
1878 
1879 	buf_pool_mutex_exit(buf_pool);
1880 
1881 	DBUG_PRINT("ib_buf", ("flush %u completed, %u pages",
1882 			      unsigned(flush_type), unsigned(count)));
1883 
1884 	return(count);
1885 }
1886 
1887 /******************************************************************//**
1888 Gather the aggregated stats for both flush list and LRU list flushing.
1889 @param page_count_flush	number of pages flushed from the end of the flush_list
1890 @param page_count_LRU	number of pages flushed from the end of the LRU list
1891 */
1892 static
1893 void
1894 buf_flush_stats(
1895 /*============*/
1896 	ulint		page_count_flush,
1897 	ulint		page_count_LRU)
1898 {
1899 	DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
1900 			      "from LRU_list %u pages",
1901 			      unsigned(page_count_flush),
1902 			      unsigned(page_count_LRU)));
1903 
1904 	srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
1905 }
1906 
1907 /******************************************************************//**
1908 Start a buffer flush batch for LRU or flush list */
1909 static
1910 ibool
1911 buf_flush_start(
1912 /*============*/
1913 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1914 	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
1915 					or BUF_FLUSH_LIST */
1916 {
1917 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1918 
1919 	buf_pool_mutex_enter(buf_pool);
1920 
1921 	if (buf_pool->n_flush[flush_type] > 0
1922 	   || buf_pool->init_flush[flush_type] == TRUE) {
1923 
1924 		/* There is already a flush batch of the same type running */
1925 
1926 		buf_pool_mutex_exit(buf_pool);
1927 
1928 		return(FALSE);
1929 	}
1930 
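	/* Mark that a batch of this type is being started and reset the
	'no flush in progress' event; the event is set again once the
	batch ends and n_flush[flush_type] has dropped back to zero. */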
1931 	buf_pool->init_flush[flush_type] = TRUE;
1932 
1933 	os_event_reset(buf_pool->no_flush[flush_type]);
1934 
1935 	buf_pool_mutex_exit(buf_pool);
1936 
1937 	return(TRUE);
1938 }
1939 
1940 /******************************************************************//**
1941 End a buffer flush batch for LRU or flush list */
1942 static
1943 void
1944 buf_flush_end(
1945 /*==========*/
1946 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1947 	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
1948 					or BUF_FLUSH_LIST */
1949 {
1950 	buf_pool_mutex_enter(buf_pool);
1951 
1952 	buf_pool->init_flush[flush_type] = FALSE;
1953 
1954 	buf_pool->try_LRU_scan = TRUE;
1955 
1956 	if (buf_pool->n_flush[flush_type] == 0) {
1957 
1958 		/* The running flush batch has ended */
1959 
1960 		os_event_set(buf_pool->no_flush[flush_type]);
1961 	}
1962 
1963 	buf_pool_mutex_exit(buf_pool);
1964 
1965 	if (!srv_read_only_mode) {
1966 		buf_dblwr_flush_buffered_writes();
1967 	} else {
1968 		os_aio_simulated_wake_handler_threads();
1969 	}
1970 }
1971 
1972 /******************************************************************//**
1973 Waits until a flush batch of the given type ends */
1974 void
1975 buf_flush_wait_batch_end(
1976 /*=====================*/
1977 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1978 	buf_flush_t	type)		/*!< in: BUF_FLUSH_LRU
1979 					or BUF_FLUSH_LIST */
1980 {
1981 	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1982 
1983 	if (buf_pool == NULL) {
1984 		ulint	i;
1985 
1986 		for (i = 0; i < srv_buf_pool_instances; ++i) {
1987 			buf_pool_t*	buf_pool;
1988 
1989 			buf_pool = buf_pool_from_array(i);
1990 
1991 			thd_wait_begin(NULL, THD_WAIT_DISKIO);
1992 			os_event_wait(buf_pool->no_flush[type]);
1993 			thd_wait_end(NULL);
1994 		}
1995 	} else {
1996 		thd_wait_begin(NULL, THD_WAIT_DISKIO);
1997 		os_event_wait(buf_pool->no_flush[type]);
1998 		thd_wait_end(NULL);
1999 	}
2000 }
2001 
2002 /** Do flushing batch of a given type.
2003 NOTE: The calling thread is not allowed to own any latches on pages!
2004 @param[in,out]	buf_pool	buffer pool instance
2005 @param[in]	type		flush type
2006 @param[in]	min_n		wished minimum number of blocks flushed
2007 (it is not guaranteed that the actual number is that big, though)
2008 @param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
2009 oldest_modification is smaller than this should be flushed (if their number
2010 does not exceed min_n), otherwise ignored
2011 @param[out]	n_processed	the number of pages which were processed is
2012 passed back to caller. Ignored if NULL
2013 @retval true	if a batch was queued successfully.
2014 @retval false	if another batch of same type was already running. */
2015 bool
2016 buf_flush_do_batch(
2017 	buf_pool_t*		buf_pool,
2018 	buf_flush_t		type,
2019 	ulint			min_n,
2020 	lsn_t			lsn_limit,
2021 	ulint*			n_processed)
2022 {
2023 	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
2024 
2025 	if (n_processed != NULL) {
2026 		*n_processed = 0;
2027 	}
2028 
2029 	if (!buf_flush_start(buf_pool, type)) {
2030 		return(false);
2031 	}
2032 
2033 	ulint	page_count = buf_flush_batch(buf_pool, type, min_n, lsn_limit);
2034 
2035 	buf_flush_end(buf_pool, type);
2036 
2037 	if (n_processed != NULL) {
2038 		*n_processed = page_count;
2039 	}
2040 
2041 	return(true);
2042 }
2043 
2044 /**
2045 Waits until all blocks whose oldest_modification is smaller than the given lsn have been flushed
2046 @param[in]	new_oldest	target oldest_modified_lsn to wait for */
2047 
2048 void
2049 buf_flush_wait_flushed(
2050 	lsn_t		new_oldest)
2051 {
2052 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2053 		buf_pool_t*	buf_pool;
2054 		lsn_t		oldest;
2055 
2056 		buf_pool = buf_pool_from_array(i);
2057 
2058 		for (;;) {
2059 			/* We don't need to wait for the fsync of the flushed
2060 			blocks, because we need an fsync to make a checkpoint
2061 			anyway. So, we don't need to wait for the batch end here. */
2062 
2063 			buf_flush_list_mutex_enter(buf_pool);
2064 
2065 			buf_page_t*	bpage;
2066 
2067 			/* We don't need to wait for system temporary pages */
2068 			for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
2069 			     bpage != NULL
2070 				&& fsp_is_system_temporary(bpage->id.space());
2071 			     bpage = UT_LIST_GET_PREV(list, bpage)) {
2072 				/* Do nothing. */
2073 			}
2074 
2075 			if (bpage != NULL) {
2076 				ut_ad(bpage->in_flush_list);
2077 				oldest = bpage->oldest_modification;
2078 			} else {
2079 				oldest = 0;
2080 			}
2081 
2082 			buf_flush_list_mutex_exit(buf_pool);
2083 
2084 			if (oldest == 0 || oldest >= new_oldest) {
2085 				break;
2086 			}
2087 
2088 			/* sleep and retry */
2089 			os_thread_sleep(buf_flush_wait_flushed_sleep_time);
2090 
2091 			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
2092 		}
2093 	}
2094 }
2095 
2096 /** This utility flushes dirty blocks from the end of the flush list of all
2097 buffer pool instances.
2098 NOTE: The calling thread is not allowed to own any latches on pages!
2099 @param[in]	min_n		wished minimum number of blocks flushed (it is
2100 not guaranteed that the actual number is that big, though)
2101 @param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
2102 oldest_modification is smaller than this should be flushed (if their number
2103 does not exceed min_n), otherwise ignored
2104 @param[out]	n_processed	the number of pages which were processed is
2105 passed back to caller. Ignored if NULL.
2106 @return true if a batch was queued successfully for each buffer pool
2107 instance; false if another batch of the same type was already running
2108 in at least one of the buffer pool instances */
2109 bool
2110 buf_flush_lists(
2111 	ulint			min_n,
2112 	lsn_t			lsn_limit,
2113 	ulint*			n_processed)
2114 {
2115 	ulint		i;
2116 	ulint		n_flushed = 0;
2117 	bool		success = true;
2118 
2119 	if (n_processed) {
2120 		*n_processed = 0;
2121 	}
2122 
2123 	if (min_n != ULINT_MAX) {
2124 		/* Ensure that flushing is spread evenly amongst the
2125 		buffer pool instances. When min_n is ULINT_MAX
2126 		we need to flush everything up to the lsn limit
2127 		so no limit here. */
2128 		min_n = (min_n + srv_buf_pool_instances - 1)
2129 			 / srv_buf_pool_instances;
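		/* For example, with min_n == 1000 and 8 buffer pool
		instances, each instance is asked to flush
		(1000 + 7) / 8 == 125 pages. */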
2130 	}
2131 
2132 	/* Flush to lsn_limit in all buffer pool instances */
2133 	for (i = 0; i < srv_buf_pool_instances; i++) {
2134 		buf_pool_t*	buf_pool;
2135 		ulint		page_count = 0;
2136 
2137 		buf_pool = buf_pool_from_array(i);
2138 
2139 		if (!buf_flush_do_batch(buf_pool,
2140 					BUF_FLUSH_LIST,
2141 					min_n,
2142 					lsn_limit,
2143 					&page_count)) {
2144 			/* We have two choices here. If lsn_limit was
2145 			specified then skipping an instance of buffer
2146 			pool means we cannot guarantee that all pages
2147 			up to lsn_limit have been flushed. We can
2148 			return right now with failure or we can try
2149 			to flush remaining buffer pools up to the
2150 			lsn_limit. We attempt to flush other buffer
2151 			pools based on the assumption that it will
2152 			help in the retry which will follow the
2153 			failure. */
2154 			success = false;
2155 
2156 			continue;
2157 		}
2158 
2159 		n_flushed += page_count;
2160 	}
2161 
2162 	if (n_flushed) {
2163 		buf_flush_stats(n_flushed, 0);
2164 	}
2165 
2166 	if (n_processed) {
2167 		*n_processed = n_flushed;
2168 	}
2169 
2170 	return(success);
2171 }
2172 
2173 /******************************************************************//**
2174 This function picks up a single page from the tail of the LRU
2175 list, flushes it (if it is dirty), removes it from page_hash and LRU
2176 list and puts it on the free list. It is called from user threads when
2177 they are unable to find a replaceable page at the tail of the LRU
2178 list i.e.: when the background LRU flushing in the page_cleaner thread
2179 is not fast enough to keep pace with the workload.
2180 @return true if success. */
2181 bool
2182 buf_flush_single_page_from_LRU(
2183 /*===========================*/
2184 	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
2185 {
2186 	ulint		scanned;
2187 	buf_page_t*	bpage;
2188 	ibool		freed;
2189 
2190 	buf_pool_mutex_enter(buf_pool);
2191 
2192 	for (bpage = buf_pool->single_scan_itr.start(), scanned = 0,
2193 	     freed = false;
2194 	     bpage != NULL;
2195 	     ++scanned, bpage = buf_pool->single_scan_itr.get()) {
2196 
2197 		ut_ad(buf_pool_mutex_own(buf_pool));
2198 
2199 		buf_page_t*	prev = UT_LIST_GET_PREV(LRU, bpage);
2200 
2201 		buf_pool->single_scan_itr.set(prev);
2202 
2203 		BPageMutex*	block_mutex;
2204 
2205 		block_mutex = buf_page_get_mutex(bpage);
2206 
2207 		mutex_enter(block_mutex);
2208 
2209 		if (buf_flush_ready_for_replace(bpage)) {
2210 			/* block is ready for eviction i.e., it is
2211 			clean and is not IO-fixed or buffer fixed. */
2212 			mutex_exit(block_mutex);
2213 
2214 			if (buf_LRU_free_page(bpage, true)) {
2215 				buf_pool_mutex_exit(buf_pool);
2216 				freed = true;
2217 				break;
2218 			}
2219 
2220 		} else if (buf_flush_ready_for_flush(
2221 				   bpage, BUF_FLUSH_SINGLE_PAGE)) {
2222 
2223 			/* Block is ready for flush. Try and dispatch an IO
2224 			request. We'll put it on free list in IO completion
2225 			routine if it is not buffer fixed. The following call
2226 			will release the buffer pool and block mutex.
2227 
2228 			Note: There is no guarantee that this page has actually
2229 			been freed, only that it has been flushed to disk */
2230 
2231 			freed = buf_flush_page(
2232 				buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
2233 
2234 			if (freed) {
2235 				break;
2236 			}
2237 
2238 			mutex_exit(block_mutex);
2239 		} else {
2240 			mutex_exit(block_mutex);
2241 		}
2242 
2243 		ut_ad(!mutex_own(block_mutex));
2244 	}
2245 
2246 	if (!freed) {
2247 		/* Can't find a single flushable page. */
2248 		ut_ad(!bpage);
2249 		buf_pool_mutex_exit(buf_pool);
2250 	}
2251 
2252 	if (scanned) {
2253 		MONITOR_INC_VALUE_CUMULATIVE(
2254 			MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2255 			MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2256 			MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2257 			scanned);
2258 	}
2259 
2260 	ut_ad(!buf_pool_mutex_own(buf_pool));
2261 
2262 	return(freed);
2263 }
2264 
2265 /**
2266 Clears up tail of the LRU list of a given buffer pool instance:
2267 * Put replaceable pages at the tail of LRU to the free list
2268 * Flush dirty pages at the tail of LRU to the disk
2269 The depth to which we scan each buffer pool is controlled by dynamic
2270 config parameter innodb_LRU_scan_depth.
2271 @param buf_pool buffer pool instance
2272 @return total pages flushed */
2273 static
2274 ulint
2275 buf_flush_LRU_list(
2276 	buf_pool_t*	buf_pool)
2277 {
2278 	ulint	scan_depth, withdraw_depth;
2279 	ulint	n_flushed = 0;
2280 
2281 	ut_ad(buf_pool);
2282 
2283 	/* srv_LRU_scan_depth can be an arbitrarily large value.
2284 	We cap it with the current LRU size. */
2285 	buf_pool_mutex_enter(buf_pool);
2286 	scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2287 	if (buf_pool->curr_size < buf_pool->old_size
2288 	    && buf_pool->withdraw_target > 0) {
2289 		withdraw_depth = buf_pool->withdraw_target
2290 				 - UT_LIST_GET_LEN(buf_pool->withdraw);
2291 	} else {
2292 		withdraw_depth = 0;
2293 	}
2294 	buf_pool_mutex_exit(buf_pool);
2295 
2296 	if (withdraw_depth > srv_LRU_scan_depth) {
2297 		scan_depth = ut_min(withdraw_depth, scan_depth);
2298 	} else {
2299 		scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
2300 				    scan_depth);
2301 	}
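	/* In other words: scan to the configured innodb_LRU_scan_depth,
	extended up to the withdraw target during a buffer pool resize,
	but never deeper than the current LRU length. */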
2302 
2303 	/* Currently only one of the page_cleaner threads can trigger
2304 	an LRU flush for a given buffer pool instance at a time.
2305 	So, it is not possible that a batch triggered during the
2306 	last iteration is still running. */
2307 	buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth,
2308 			   0, &n_flushed);
2309 
2310 	return(n_flushed);
2311 }
2312 
2313 /*********************************************************************//**
2314 Clears up tail of the LRU lists:
2315 * Put replaceable pages at the tail of LRU to the free list
2316 * Flush dirty pages at the tail of LRU to the disk
2317 The depth to which we scan each buffer pool is controlled by dynamic
2318 config parameter innodb_LRU_scan_depth.
2319 @return total pages flushed */
2320 ulint
2321 buf_flush_LRU_lists(void)
2322 /*=====================*/
2323 {
2324 	ulint	n_flushed = 0;
2325 
2326 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2327 
2328 		n_flushed += buf_flush_LRU_list(buf_pool_from_array(i));
2329 	}
2330 
2331 	if (n_flushed) {
2332 		buf_flush_stats(0, n_flushed);
2333 	}
2334 
2335 	return(n_flushed);
2336 }
2337 
2338 /*********************************************************************//**
2339 Wait for any possible LRU flushes that are in progress to end. */
2340 void
2341 buf_flush_wait_LRU_batch_end(void)
2342 /*==============================*/
2343 {
2344 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2345 		buf_pool_t*	buf_pool;
2346 
2347 		buf_pool = buf_pool_from_array(i);
2348 
2349 		buf_pool_mutex_enter(buf_pool);
2350 
2351 		if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2352 		   || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2353 
2354 			buf_pool_mutex_exit(buf_pool);
2355 			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2356 		} else {
2357 			buf_pool_mutex_exit(buf_pool);
2358 		}
2359 	}
2360 }
2361 
2362 /*********************************************************************//**
2363 Calculates if flushing is required based on number of dirty pages in
2364 the buffer pool.
2365 @return percent of io_capacity to flush to manage dirty page ratio */
2366 static
2367 ulint
2368 af_get_pct_for_dirty()
2369 /*==================*/
2370 {
2371 	double	dirty_pct = buf_get_modified_ratio_pct();
2372 
2373 	if (dirty_pct == 0.0) {
2374 		/* No pages modified */
2375 		return(0);
2376 	}
2377 
2378 	ut_a(srv_max_dirty_pages_pct_lwm
2379 	     <= srv_max_buf_pool_modified_pct);
2380 
2381 	if (srv_max_dirty_pages_pct_lwm == 0) {
2382 		/* The user has not set the option to preflush dirty
2383 		pages as we approach the high water mark. */
2384 		if (dirty_pct >= srv_max_buf_pool_modified_pct) {
2385 			/* We have crossed the high water mark of dirty
2386 			pages. In this case we start flushing at 100% of
2387 			innodb_io_capacity. */
2388 			return(100);
2389 		}
2390 	} else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
2391 		/* We should start flushing pages gradually. */
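		/* For example, with innodb_max_dirty_pages_pct == 75 and a
		dirty ratio of 40%, this returns 40 * 100 / 76, i.e. about
		52% of innodb_io_capacity. */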
2392 		return(static_cast<ulint>((dirty_pct * 100)
2393 		       / (srv_max_buf_pool_modified_pct + 1)));
2394 	}
2395 
2396 	return(0);
2397 }
2398 
2399 /*********************************************************************//**
2400 Calculates if flushing is required based on redo generation rate.
2401 @return percent of io_capacity to flush to manage redo space */
2402 static
2403 ulint
2404 af_get_pct_for_lsn(
2405 /*===============*/
2406 	lsn_t	age)	/*!< in: current age of LSN. */
2407 {
2408 	lsn_t	max_async_age;
2409 	lsn_t	lsn_age_factor;
2410 	lsn_t	af_lwm = (srv_adaptive_flushing_lwm
2411 			  * log_get_capacity()) / 100;
2412 
2413 	if (age < af_lwm) {
2414 		/* No adaptive flushing. */
2415 		return(0);
2416 	}
2417 
2418 	max_async_age = log_get_max_modified_age_async();
2419 
2420 	if (age < max_async_age && !srv_adaptive_flushing) {
2421 		/* We have still not reached the max_async point and
2422 		the user has disabled adaptive flushing. */
2423 		return(0);
2424 	}
2425 
2426 	/* If we are here then we know that either:
2427 	1) User has enabled adaptive flushing
2428 	2) User may have disabled adaptive flushing but we have reached
2429 	max_async_age. */
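	/* For example, if age is half of max_async_age then
	lsn_age_factor == 50, and with innodb_io_capacity == 200 and
	innodb_max_io_capacity == 2000 the formula below yields
	(2000 / 200) * (50 * sqrt(50)) / 7.5, i.e. about 471% of
	innodb_io_capacity. */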
2430 	lsn_age_factor = (age * 100) / max_async_age;
2431 
2432 	ut_ad(srv_max_io_capacity >= srv_io_capacity);
2433 	return(static_cast<ulint>(
2434 		((srv_max_io_capacity / srv_io_capacity)
2435 		* (lsn_age_factor * sqrt((double)lsn_age_factor)))
2436 		/ 7.5));
2437 }
2438 
2439 /*********************************************************************//**
2440 This function is called approximately once every second by the
2441 page_cleaner thread. Based on various factors it decides if there is a
2442 need to do flushing.
2443 @return number of pages recommended to be flushed
2444 @param lsn_limit	pointer to return LSN up to which flushing must happen
2445 @param last_pages_in	the number of pages flushed by the last flush_list
2446 			flushing. */
2447 static
2448 ulint
2449 page_cleaner_flush_pages_recommendation(
2450 /*====================================*/
2451 	lsn_t*	lsn_limit,
2452 	ulint	last_pages_in)
2453 {
2454 	static	lsn_t		prev_lsn = 0;
2455 	static	ulint		sum_pages = 0;
2456 	static	ulint		avg_page_rate = 0;
2457 	static	ulint		n_iterations = 0;
2458 	static	ib_time_monotonic_t		prev_time;
2459 	lsn_t			oldest_lsn;
2460 	lsn_t			cur_lsn;
2461 	lsn_t			age;
2462 	lsn_t			lsn_rate;
2463 	ulint			n_pages = 0;
2464 	ulint			pct_for_dirty = 0;
2465 	ulint			pct_for_lsn = 0;
2466 	ulint			pct_total = 0;
2467 
2468 	cur_lsn = log_get_lsn();
2469 
2470 	if (prev_lsn == 0) {
2471 		/* First time around. */
2472 		prev_lsn = cur_lsn;
2473 		prev_time = ut_time_monotonic();
2474 		return(0);
2475 	}
2476 
2477 	if (prev_lsn == cur_lsn) {
2478 		return(0);
2479 	}
2480 
2481 	sum_pages += last_pages_in;
2482 
2483 	ib_time_monotonic_t	curr_time    = ut_time_monotonic();
2484 	uint64_t	        time_elapsed = curr_time - prev_time;
2485 	const ulong             avg_loop     = srv_flushing_avg_loops;
2486 
2487 	/* We update our variables every srv_flushing_avg_loops
2488 	iterations to smooth out transitions in the workload. */
2489 	if (++n_iterations >= avg_loop
2490 	    || time_elapsed >= (uint64_t)avg_loop) {
2491 
2492 		if (time_elapsed < 1) {
2493 			time_elapsed = 1;
2494 		}
2495 
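		/* Both avg_page_rate and lsn_avg_rate below are smoothed
		with a simple running average: the new value is the mean of
		the previous average and the rate observed in this interval. */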
2496 		avg_page_rate = static_cast<ulint>(
2497 			((static_cast<double>(sum_pages)
2498 			  / time_elapsed)
2499 			 + avg_page_rate) / 2);
2500 
2501 		/* How much LSN we have generated since last call. */
2502 		lsn_rate = static_cast<lsn_t>(
2503 			static_cast<double>(cur_lsn - prev_lsn)
2504 			/ time_elapsed);
2505 
2506 		lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2507 
2508 
2509 		/* aggregate stats of all slots */
2510 		mutex_enter(&page_cleaner->mutex);
2511 
2512 		uint64_t  flush_tm = page_cleaner->flush_time;
2513 		ulint	flush_pass = page_cleaner->flush_pass;
2514 
2515 		page_cleaner->flush_time = 0;
2516 		page_cleaner->flush_pass = 0;
2517 
2518 		uint64_t lru_tm = 0;
2519 		uint64_t list_tm = 0;
2520 		ulint	lru_pass = 0;
2521 		ulint	list_pass = 0;
2522 
2523 		for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2524 			page_cleaner_slot_t*	slot;
2525 
2526 			slot = &page_cleaner->slots[i];
2527 
2528 			lru_tm    += slot->flush_lru_time;
2529 			lru_pass  += slot->flush_lru_pass;
2530 			list_tm   += slot->flush_list_time;
2531 			list_pass += slot->flush_list_pass;
2532 
2533 			slot->flush_lru_time  = 0;
2534 			slot->flush_lru_pass  = 0;
2535 			slot->flush_list_time = 0;
2536 			slot->flush_list_pass = 0;
2537 		}
2538 
2539 		mutex_exit(&page_cleaner->mutex);
2540 
2541 		/* minimum values are 1, to avoid dividing by zero. */
2542 		if (lru_tm < 1) {
2543 			lru_tm = 1;
2544 		}
2545 		if (list_tm < 1) {
2546 			list_tm = 1;
2547 		}
2548 		if (flush_tm < 1) {
2549 			flush_tm = 1;
2550 		}
2551 
2552 		if (lru_pass < 1) {
2553 			lru_pass = 1;
2554 		}
2555 		if (list_pass < 1) {
2556 			list_pass = 1;
2557 		}
2558 		if (flush_pass < 1) {
2559 			flush_pass = 1;
2560 		}
2561 
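		/* The *_AVG_TIME_EST counters below apportion the
		coordinator's measured flush time between flush_list and
		LRU work in proportion to the time the slots reported
		spending on each, per flush pass. */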
2562 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
2563 			    list_tm / list_pass);
2564 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
2565 			    lru_tm  / lru_pass);
2566 
2567 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
2568 			    list_tm / (srv_n_page_cleaners * flush_pass));
2569 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
2570 			    lru_tm / (srv_n_page_cleaners * flush_pass));
2571 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
2572 			    flush_tm * list_tm / flush_pass
2573 			    / (list_tm + lru_tm));
2574 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
2575 			    flush_tm * lru_tm / flush_pass
2576 			    / (list_tm + lru_tm));
2577 		MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
2578 
2579 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
2580 			    list_pass / page_cleaner->n_slots);
2581 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
2582 			    lru_pass / page_cleaner->n_slots);
2583 		MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
2584 
2585 		prev_lsn = cur_lsn;
2586 		prev_time = curr_time;
2587 
2588 		n_iterations = 0;
2589 
2590 		sum_pages = 0;
2591 	}
2592 
2593 	oldest_lsn = buf_pool_get_oldest_modification();
2594 
2595 	ut_ad(oldest_lsn <= log_get_lsn());
2596 
2597 	age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2598 
2599 	pct_for_dirty = af_get_pct_for_dirty();
2600 	pct_for_lsn = af_get_pct_for_lsn(age);
2601 
2602 	pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2603 
2604 	/* Estimate pages to be flushed for the lsn progress */
2605 	ulint	sum_pages_for_lsn = 0;
2606 	lsn_t	target_lsn = oldest_lsn
2607 			     + lsn_avg_rate * buf_flush_lsn_scan_factor;
2608 
2609 	/* Cap the maximum IO capacity that we are going to use by
2610 	max_io_capacity. Limit the value to avoid increasing it too quickly. */
2611 	const ulint	sum_pages_max = srv_max_io_capacity * 2;
2612 
2613 	/* Limit individual BP scan based on overall capacity. */
2614 	const ulint	pages_for_lsn_max =
2615 		(sum_pages_max / srv_buf_pool_instances) *
2616 		buf_flush_lsn_scan_factor * 2;
2617 
2618 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2619 		buf_pool_t*	buf_pool = buf_pool_from_array(i);
2620 		ulint		pages_for_lsn = 0;
2621 
2622 		buf_flush_list_mutex_enter(buf_pool);
2623 		for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list);
2624 		     b != NULL;
2625 		     b = UT_LIST_GET_PREV(list, b)) {
2626 			if (b->oldest_modification > target_lsn) {
2627 				break;
2628 			}
2629 			++pages_for_lsn;
2630 			if (pages_for_lsn >= pages_for_lsn_max) {
2631 				break;
2632 			}
2633 		}
2634 		buf_flush_list_mutex_exit(buf_pool);
2635 
2636 		sum_pages_for_lsn += pages_for_lsn;
2637 
2638 		mutex_enter(&page_cleaner->mutex);
2639 		ut_ad(page_cleaner->slots[i].state
2640 		      == PAGE_CLEANER_STATE_NONE);
2641 		page_cleaner->slots[i].n_pages_requested
2642 			= pages_for_lsn / buf_flush_lsn_scan_factor + 1;
2643 		mutex_exit(&page_cleaner->mutex);
2644 	}
2645 
2646 	sum_pages_for_lsn /= buf_flush_lsn_scan_factor;
2647 	if(sum_pages_for_lsn < 1) {
2648 		sum_pages_for_lsn = 1;
2649 	}
2650 
2651 	/* Cap the maximum IO capacity that we are going to use by
2652 	max_io_capacity. Limit the value to avoid increasing it too quickly. */
2653 	ulint	pages_for_lsn =
2654 		std::min<ulint>(sum_pages_for_lsn, sum_pages_max);
2655 
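	/* The recommendation averages three signals: the io_capacity
	percentage derived from the dirty-page and redo heuristics, the
	recent average page flush rate, and the pages needed to make the
	intended LSN progress. */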
2656 	n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3;
2657 
2658 	if (n_pages > srv_max_io_capacity) {
2659 		n_pages = srv_max_io_capacity;
2660 	}
2661 
2662 	/* Normalize request for each instance */
2663 	mutex_enter(&page_cleaner->mutex);
2664 	ut_ad(page_cleaner->n_slots_requested == 0);
2665 	ut_ad(page_cleaner->n_slots_flushing == 0);
2666 	ut_ad(page_cleaner->n_slots_finished == 0);
2667 
2668 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2669 		/* if the redo log has enough free space,
2670 		don't care about the age distribution of pages */
2671 		page_cleaner->slots[i].n_pages_requested = pct_for_lsn > 30 ?
2672 			page_cleaner->slots[i].n_pages_requested
2673 			* n_pages / sum_pages_for_lsn + 1
2674 			: n_pages / srv_buf_pool_instances;
2675 	}
2676 	mutex_exit(&page_cleaner->mutex);
2677 
2678 	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2679 
2680 	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn);
2681 
2682 	MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2683 	MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2684 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2685 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2686 
2687 	*lsn_limit = LSN_MAX;
2688 
2689 	return(n_pages);
2690 }
2691 
2692 /*********************************************************************//**
2693 Puts the page_cleaner thread to sleep if it has finished work in less
2694 than a second
2695 @retval 0 if woken up by the event being set,
2696 @retval OS_SYNC_TIME_EXCEEDED if timeout was exceeded
2697 @param next_loop_time	time when next loop iteration should start
2698 @param sig_count	zero or the value returned by previous call of
2699 			os_event_reset() */
2700 static
2701 ulint
2702 pc_sleep_if_needed(
2703 /*===============*/
2704 	ib_time_monotonic_ms_t		next_loop_time,
2705 	int64_t		sig_count)
2706 {
2707 	ib_time_monotonic_ms_t	cur_time = ut_time_monotonic_ms();
2708 
2709 	if (next_loop_time > cur_time) {
2710 		/* Get sleep interval in micro seconds. We use
2711 		ut_min() to avoid long sleep in case of wrap around. */
2712 		int64_t sleep_us;
2713 
2714 		sleep_us = ut_min(int64_t(1000000),
2715 			         (next_loop_time - cur_time) * int64_t(1000));
2716 		ut_a(sleep_us > 0);
2717 
2718 		return(os_event_wait_time_low(buf_flush_event,
2719 					      sleep_us, sig_count));
2720 	}
2721 
2722 	return(OS_SYNC_TIME_EXCEEDED);
2723 }
2724 
2725 /******************************************************************//**
2726 Initialize page_cleaner. */
2727 void
2728 buf_flush_page_cleaner_init(void)
2729 /*=============================*/
2730 {
2731 	ut_ad(page_cleaner == NULL);
2732 
2733 	page_cleaner = static_cast<page_cleaner_t*>(
2734 		ut_zalloc_nokey(sizeof(*page_cleaner)));
2735 
2736 	mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner->mutex);
2737 
2738 	page_cleaner->is_requested = os_event_create("pc_is_requested");
2739 	page_cleaner->is_finished = os_event_create("pc_is_finished");
2740 
2741 	page_cleaner->n_slots = static_cast<ulint>(srv_buf_pool_instances);
2742 
2743 	page_cleaner->slots = static_cast<page_cleaner_slot_t*>(
2744 		ut_zalloc_nokey(page_cleaner->n_slots
2745 				* sizeof(*page_cleaner->slots)));
2746 
2747 	ut_d(page_cleaner->n_disabled_debug = 0);
2748 
2749 	page_cleaner->is_running = true;
2750 }
2751 
2752 /**
2753 Close page_cleaner. */
2754 static
2755 void
2756 buf_flush_page_cleaner_close(void)
2757 {
2758 	/* wait for all worker threads to exit */
2759 	while (page_cleaner->n_workers > 0) {
2760 		os_thread_sleep(10000);
2761 	}
2762 
2763 	mutex_destroy(&page_cleaner->mutex);
2764 
2765 	ut_free(page_cleaner->slots);
2766 
2767 	os_event_destroy(page_cleaner->is_finished);
2768 	os_event_destroy(page_cleaner->is_requested);
2769 
2770 	ut_free(page_cleaner);
2771 
2772 	page_cleaner = NULL;
2773 }
2774 
2775 /**
2776 Requests all slots to flush all buffer pool instances.
2777 @param min_n	wished minimum number of blocks flushed
2778 		(it is not guaranteed that the actual number is that big)
2779 @param lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2780 		oldest_modification is smaller than this should be flushed
2781 		(if their number does not exceed min_n), otherwise ignored
2782 */
2783 static
2784 void
2785 pc_request(
2786 	ulint		min_n,
2787 	lsn_t		lsn_limit)
2788 {
2789 	if (min_n != ULINT_MAX) {
2790 		/* Ensure that flushing is spread evenly amongst the
2791 		buffer pool instances. When min_n is ULINT_MAX
2792 		we need to flush everything up to the lsn limit
2793 		so no limit here. */
2794 		min_n = (min_n + srv_buf_pool_instances - 1)
2795 			/ srv_buf_pool_instances;
2796 	}
2797 
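	/* Slots cycle through PAGE_CLEANER_STATE_NONE -> REQUESTED ->
	FLUSHING -> FINISHED and back to NONE: pc_request() marks every
	slot REQUESTED, pc_flush_slot() moves one slot through FLUSHING
	to FINISHED, and pc_wait_finished() resets all slots to NONE. */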
2798 	mutex_enter(&page_cleaner->mutex);
2799 
2800 	ut_ad(page_cleaner->n_slots_requested == 0);
2801 	ut_ad(page_cleaner->n_slots_flushing == 0);
2802 	ut_ad(page_cleaner->n_slots_finished == 0);
2803 
2804 	page_cleaner->requested = (min_n > 0);
2805 	page_cleaner->lsn_limit = lsn_limit;
2806 
2807 	for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2808 		page_cleaner_slot_t* slot = &page_cleaner->slots[i];
2809 
2810 		ut_ad(slot->state == PAGE_CLEANER_STATE_NONE);
2811 
2812 		if (min_n == ULINT_MAX) {
2813 			slot->n_pages_requested = ULINT_MAX;
2814 		} else if (min_n == 0) {
2815 			slot->n_pages_requested = 0;
2816 		}
2817 
2818 		/* slot->n_pages_requested was already set by
2819 		page_cleaner_flush_pages_recommendation() */
2820 
2821 		slot->state = PAGE_CLEANER_STATE_REQUESTED;
2822 	}
2823 
2824 	page_cleaner->n_slots_requested = page_cleaner->n_slots;
2825 	page_cleaner->n_slots_flushing = 0;
2826 	page_cleaner->n_slots_finished = 0;
2827 
2828 	os_event_set(page_cleaner->is_requested);
2829 
2830 	mutex_exit(&page_cleaner->mutex);
2831 }
2832 
2833 /**
2834 Do flush for one slot.
2835 @return	the number of slots which have not been treated yet. */
2836 static
2837 ulint
2838 pc_flush_slot(void)
2839 {
2840 	ib_time_monotonic_ms_t	lru_tm = 0;
2841 	ib_time_monotonic_ms_t	list_tm = 0;
2842 	int	lru_pass = 0;
2843 	int	list_pass = 0;
2844 
2845 	mutex_enter(&page_cleaner->mutex);
2846 
2847 	if (page_cleaner->n_slots_requested > 0) {
2848 		page_cleaner_slot_t*	slot = NULL;
2849 		ulint			i;
2850 
2851 		for (i = 0; i < page_cleaner->n_slots; i++) {
2852 			slot = &page_cleaner->slots[i];
2853 
2854 			if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
2855 				break;
2856 			}
2857 		}
2858 
2859 		/* slot should be found because
2860 		page_cleaner->n_slots_requested > 0 */
2861 		ut_a(i < page_cleaner->n_slots);
2862 
2863 		buf_pool_t* buf_pool = buf_pool_from_array(i);
2864 
2865 		page_cleaner->n_slots_requested--;
2866 		page_cleaner->n_slots_flushing++;
2867 		slot->state = PAGE_CLEANER_STATE_FLUSHING;
2868 
2869 		if (page_cleaner->n_slots_requested == 0) {
2870 			os_event_reset(page_cleaner->is_requested);
2871 		}
2872 
2873 		if (!page_cleaner->is_running) {
2874 			slot->n_flushed_lru = 0;
2875 			slot->n_flushed_list = 0;
2876 			goto finish_mutex;
2877 		}
2878 
2879 		mutex_exit(&page_cleaner->mutex);
2880 
2881 		lru_tm = ut_time_monotonic_ms();
2882 
2883 		/* Flush pages from end of LRU if required */
2884 		slot->n_flushed_lru = buf_flush_LRU_list(buf_pool);
2885 
2886 		lru_tm = ut_time_monotonic_ms() - lru_tm;
2887 		lru_pass++;
2888 
2889 		if (!page_cleaner->is_running) {
2890 			slot->n_flushed_list = 0;
2891 			goto finish;
2892 		}
2893 
2894 		/* Flush pages from flush_list if required */
2895 		if (page_cleaner->requested) {
2896 
2897 			list_tm = ut_time_monotonic_ms();
2898 
2899 			slot->succeeded_list = buf_flush_do_batch(
2900 				buf_pool, BUF_FLUSH_LIST,
2901 				slot->n_pages_requested,
2902 				page_cleaner->lsn_limit,
2903 				&slot->n_flushed_list);
2904 
2905 			list_tm = ut_time_monotonic_ms() - list_tm;
2906 			list_pass++;
2907 		} else {
2908 			slot->n_flushed_list = 0;
2909 			slot->succeeded_list = true;
2910 		}
2911 finish:
2912 		mutex_enter(&page_cleaner->mutex);
2913 finish_mutex:
2914 		page_cleaner->n_slots_flushing--;
2915 		page_cleaner->n_slots_finished++;
2916 		slot->state = PAGE_CLEANER_STATE_FINISHED;
2917 
2918 		slot->flush_lru_time += lru_tm;
2919 		slot->flush_list_time += list_tm;
2920 		slot->flush_lru_pass += lru_pass;
2921 		slot->flush_list_pass += list_pass;
2922 
2923 		if (page_cleaner->n_slots_requested == 0
2924 		    && page_cleaner->n_slots_flushing == 0) {
2925 			os_event_set(page_cleaner->is_finished);
2926 		}
2927 	}
2928 
2929 	ulint	ret = page_cleaner->n_slots_requested;
2930 
2931 	mutex_exit(&page_cleaner->mutex);
2932 
2933 	return(ret);
2934 }
2935 
2936 /**
2937 Wait until all flush requests are finished.
2938 @param n_flushed_lru	number of pages flushed from the end of the LRU list.
2939 @param n_flushed_list	number of pages flushed from the end of the
2940 			flush_list.
2941 @return			true if all flush_list flushing batches were successful. */
2942 static
2943 bool
2944 pc_wait_finished(
2945 	ulint*	n_flushed_lru,
2946 	ulint*	n_flushed_list)
2947 {
2948 	bool	all_succeeded = true;
2949 
2950 	*n_flushed_lru = 0;
2951 	*n_flushed_list = 0;
2952 
2953 	os_event_wait(page_cleaner->is_finished);
2954 
2955 	mutex_enter(&page_cleaner->mutex);
2956 
2957 	ut_ad(page_cleaner->n_slots_requested == 0);
2958 	ut_ad(page_cleaner->n_slots_flushing == 0);
2959 	ut_ad(page_cleaner->n_slots_finished == page_cleaner->n_slots);
2960 
2961 	for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2962 		page_cleaner_slot_t* slot = &page_cleaner->slots[i];
2963 
2964 		ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED);
2965 
2966 		*n_flushed_lru += slot->n_flushed_lru;
2967 		*n_flushed_list += slot->n_flushed_list;
2968 		all_succeeded &= slot->succeeded_list;
2969 
2970 		slot->state = PAGE_CLEANER_STATE_NONE;
2971 
2972 		slot->n_pages_requested = 0;
2973 	}
2974 
2975 	page_cleaner->n_slots_finished = 0;
2976 
2977 	os_event_reset(page_cleaner->is_finished);
2978 
2979 	mutex_exit(&page_cleaner->mutex);
2980 
2981 	return(all_succeeded);
2982 }
2983 
2984 #ifdef UNIV_LINUX
2985 /**
2986 Set priority for page_cleaner threads.
2987 @param[in]	priority	the priority to be set
2988 @return	true if set as intended */
2989 static
2990 bool
2991 buf_flush_page_cleaner_set_priority(
2992 	int	priority)
2993 {
2994 	setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
2995 		    priority);
2996 	return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
2997 	       == priority);
2998 }
2999 #endif /* UNIV_LINUX */
3000 
3001 #ifdef UNIV_DEBUG
3002 /** Loop used to disable page cleaner threads. */
3003 static
3004 void
3005 buf_flush_page_cleaner_disabled_loop(void)
3006 {
3007 	ut_ad(page_cleaner != NULL);
3008 
3009 	if (!innodb_page_cleaner_disabled_debug) {
3010 		/* We return to avoid entering and exiting mutex. */
3011 		return;
3012 	}
3013 
3014 	mutex_enter(&page_cleaner->mutex);
3015 	page_cleaner->n_disabled_debug++;
3016 	mutex_exit(&page_cleaner->mutex);
3017 
3018 	while (innodb_page_cleaner_disabled_debug
3019 	       && srv_shutdown_state == SRV_SHUTDOWN_NONE
3020 	       && page_cleaner->is_running) {
3021 
3022 		os_thread_sleep(100000); /* [A] */
3023 	}
3024 
3025 	/* We need to wait for the threads to exit here, otherwise we would
3026 	encounter a problem when we quickly perform the following steps:
3027 		1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
3028 		2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0;
3029 		3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
3030 	That's because after step 1 this thread could still be sleeping
3031 	inside the loop above at [A] and steps 2, 3 could happen before
3032 	this thread wakes up from [A]. In such a case this thread would
3033 	not re-increment n_disabled_debug and we would be waiting for
3034 	it forever in buf_flush_page_cleaner_disabled_debug_update(...).
3035 
3036 	Therefore we are waiting in step 2 for this thread exiting here. */
3037 
3038 	mutex_enter(&page_cleaner->mutex);
3039 	page_cleaner->n_disabled_debug--;
3040 	mutex_exit(&page_cleaner->mutex);
3041 }
3042 
3043 /** Disables page cleaner threads (coordinator and workers).
3044 It's used by: SET GLOBAL innodb_page_cleaner_disabled_debug = 1 (0).
3045 @param[in]	thd		thread handle
3046 @param[in]	var		pointer to system variable
3047 @param[out]	var_ptr		where the formal string goes
3048 @param[in]	save		immediate result from check function */
3049 void
3050 buf_flush_page_cleaner_disabled_debug_update(
3051 	THD*				thd,
3052 	struct st_mysql_sys_var*	var,
3053 	void*				var_ptr,
3054 	const void*			save)
3055 {
3056 	if (page_cleaner == NULL) {
3057 		return;
3058 	}
3059 
3060 	if (!*static_cast<const my_bool*>(save)) {
3061 		if (!innodb_page_cleaner_disabled_debug) {
3062 			return;
3063 		}
3064 
3065 		innodb_page_cleaner_disabled_debug = false;
3066 
3067 		/* Enable page cleaner threads. */
3068 		while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3069 			mutex_enter(&page_cleaner->mutex);
3070 			const ulint n = page_cleaner->n_disabled_debug;
3071 			mutex_exit(&page_cleaner->mutex);
3072 			/* Check if all threads have been enabled, to avoid
3073 			a problem when we decide to re-disable them soon. */
3074 			if (n == 0) {
3075 				break;
3076 			}
3077 		}
3078 		return;
3079 	}
3080 
3081 	if (innodb_page_cleaner_disabled_debug) {
3082 		return;
3083 	}
3084 
3085 	innodb_page_cleaner_disabled_debug = true;
3086 
3087 	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3088 		/* Workers are possibly sleeping on is_requested.
3089 
3090 		We have to wake them, otherwise they might never notice
3091 		that they should be disabled,
3092 		and we would wait for them here forever.
3093 
3094 		That's why we have a sleep-loop instead of simply
3095 		waiting on some disabled_debug_event. */
3096 		os_event_set(page_cleaner->is_requested);
3097 
3098 		mutex_enter(&page_cleaner->mutex);
3099 
3100 		ut_ad(page_cleaner->n_disabled_debug
3101 		      <= srv_n_page_cleaners);
3102 
3103 		if (page_cleaner->n_disabled_debug
3104 		    == srv_n_page_cleaners) {
3105 
3106 			mutex_exit(&page_cleaner->mutex);
3107 			break;
3108 		}
3109 
3110 		mutex_exit(&page_cleaner->mutex);
3111 
3112 		os_thread_sleep(100000);
3113 	}
3114 }
3115 #endif /* UNIV_DEBUG */
3116 
3117 /******************************************************************//**
3118 page_cleaner thread tasked with flushing dirty pages from the buffer
3119 pools. As of now we'll have only one coordinator.
3120 @return a dummy parameter */
3121 extern "C"
3122 os_thread_ret_t
3123 DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(
3124 /*===============================================*/
3125 	void*	arg MY_ATTRIBUTE((unused)))
3126 			/*!< in: a dummy parameter required by
3127 			os_thread_create */
3128 {
3129 	ib_time_monotonic_t	next_loop_time = ut_time_monotonic_ms() + 1000;
3130 	ulint	n_flushed = 0;
3131 	ulint	last_activity = srv_get_activity_count();
3132 	ulint	last_pages = 0;
3133 
3134 	my_thread_init();
3135 
3136 #ifdef UNIV_PFS_THREAD
3137 	pfs_register_thread(page_cleaner_thread_key);
3138 #endif /* UNIV_PFS_THREAD */
3139 
3140 #ifdef UNIV_DEBUG_THREAD_CREATION
3141 	ib::info() << "page_cleaner thread running, id "
3142 		<< os_thread_pf(os_thread_get_curr_id());
3143 #endif /* UNIV_DEBUG_THREAD_CREATION */
3144 
3145 #ifdef UNIV_LINUX
3146 	/* Linux might be able to use a different setting for each thread;
3147 	it is worth trying to set a high priority for the page cleaner threads. */
3148 	if (buf_flush_page_cleaner_set_priority(
3149 		buf_flush_page_cleaner_priority)) {
3150 
3151 		ib::info() << "page_cleaner coordinator priority: "
3152 			<< buf_flush_page_cleaner_priority;
3153 	} else {
3154 		ib::info() << "If the mysqld execution user is authorized,"
3155 		" page cleaner thread priority can be changed."
3156 		" See the man page of setpriority().";
3157 	}
3158 #endif /* UNIV_LINUX */
3159 
3160 	buf_page_cleaner_is_active = true;
3161 
3162 	while (!srv_read_only_mode
3163 	       && srv_shutdown_state == SRV_SHUTDOWN_NONE
3164 	       && recv_sys->heap != NULL) {
3165 		/* treat flushing requests during recovery. */
3166 		ulint	n_flushed_lru = 0;
3167 		ulint	n_flushed_list = 0;
3168 
3169 		os_event_wait(recv_sys->flush_start);
3170 
3171 		if (srv_shutdown_state != SRV_SHUTDOWN_NONE
3172 		    || recv_sys->heap == NULL) {
3173 			break;
3174 		}
3175 
3176 		switch (recv_sys->flush_type) {
3177 		case BUF_FLUSH_LRU:
3178 			/* Flush pages from end of LRU if required */
3179 			pc_request(0, LSN_MAX);
3180 			while (pc_flush_slot() > 0) {}
3181 			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3182 			break;
3183 
3184 		case BUF_FLUSH_LIST:
3185 			/* Flush all pages */
3186 			do {
3187 				pc_request(ULINT_MAX, LSN_MAX);
3188 				while (pc_flush_slot() > 0) {}
3189 			} while (!pc_wait_finished(&n_flushed_lru,
3190 						   &n_flushed_list));
3191 			break;
3192 
3193 		default:
3194 			ut_ad(0);
3195 		}
3196 
3197 		os_event_reset(recv_sys->flush_start);
3198 		os_event_set(recv_sys->flush_end);
3199 	}
3200 
3201 	os_event_wait(buf_flush_event);
3202 
3203 	ulint		ret_sleep = 0;
3204 	ulint		n_evicted = 0;
3205 	ulint		n_flushed_last = 0;
3206 	ulint		warn_interval = 1;
3207 	ulint		warn_count = 0;
3208 	int64_t		sig_count = os_event_reset(buf_flush_event);
3209 
3210 	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3211 
3212 		/* The page_cleaner skips sleep if the server is
3213 		idle and there are no pending IOs in the buffer pool
3214 		and there is work to do. */
3215 		if (srv_check_activity(last_activity)
3216 		    || buf_get_n_pending_read_ios()
3217 		    || n_flushed == 0) {
3218 
3219 			ret_sleep = pc_sleep_if_needed(
3220 				next_loop_time, sig_count);
3221 
3222 			if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
3223 				break;
3224 			}
3225 		} else if (ut_time_monotonic_ms() > next_loop_time) {
3226 			ret_sleep = OS_SYNC_TIME_EXCEEDED;
3227 		} else {
3228 			ret_sleep = 0;
3229 		}
3230 
3231 		sig_count = os_event_reset(buf_flush_event);
3232 
3233 		if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3234 			ib_time_monotonic_ms_t curr_time =
3235 						ut_time_monotonic_ms();
3236 
3237 			if (curr_time > next_loop_time + 3000) {
3238 				if (warn_count == 0) {
3239 					ib::info() << "page_cleaner: 1000ms"
3240 						" intended loop took "
3241 						<< 1000 + curr_time
3242 						   - next_loop_time
3243 						<< "ms. The settings might not"
3244 						" be optimal. (flushed="
3245 						<< n_flushed_last
3246 						<< " and evicted="
3247 						<< n_evicted
3248 						<< ", during the time.)";
3249 					if (warn_interval > 300) {
3250 						warn_interval = 600;
3251 					} else {
3252 						warn_interval *= 2;
3253 					}
3254 
3255 					warn_count = warn_interval;
3256 				} else {
3257 					--warn_count;
3258 				}
3259 			} else {
3260 				/* reset counter */
3261 				warn_interval = 1;
3262 				warn_count = 0;
3263 			}
3264 
3265 			next_loop_time = curr_time + 1000;
3266 			n_flushed_last = n_evicted = 0;
3267 		}
3268 
3269 		if (ret_sleep != OS_SYNC_TIME_EXCEEDED
3270 		    && srv_flush_sync
3271 		    && buf_flush_sync_lsn > 0) {
3272 			/* woke up for flush_sync */
3273 			mutex_enter(&page_cleaner->mutex);
3274 			lsn_t	lsn_limit = buf_flush_sync_lsn;
3275 			buf_flush_sync_lsn = 0;
3276 			mutex_exit(&page_cleaner->mutex);
3277 
3278 			/* Request flushing for threads */
3279 			pc_request(ULINT_MAX, lsn_limit);
3280 
3281 			ib_time_monotonic_ms_t tm = ut_time_monotonic_ms();
3282 
3283 			/* Coordinator also treats requests */
3284 			while (pc_flush_slot() > 0) {}
3285 
3286 			/* only coordinator is using these counters,
3287 			so no need to protect by lock. */
3288 			page_cleaner->flush_time += ut_time_monotonic_ms() - tm;
3289 			page_cleaner->flush_pass++;
3290 
3291 			/* Wait for all slots to be finished */
3292 			ulint	n_flushed_lru = 0;
3293 			ulint	n_flushed_list = 0;
3294 			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3295 
3296 			if (n_flushed_list > 0 || n_flushed_lru > 0) {
3297 				buf_flush_stats(n_flushed_list, n_flushed_lru);
3298 
3299 				MONITOR_INC_VALUE_CUMULATIVE(
3300 					MONITOR_FLUSH_SYNC_TOTAL_PAGE,
3301 					MONITOR_FLUSH_SYNC_COUNT,
3302 					MONITOR_FLUSH_SYNC_PAGES,
3303 					n_flushed_lru + n_flushed_list);
3304 			}
3305 
3306 			n_flushed = n_flushed_lru + n_flushed_list;
3307 
3308 		} else if (srv_check_activity(last_activity)) {
3309 			ulint	n_to_flush;
3310 			lsn_t	lsn_limit = 0;
3311 
3312 			/* Estimate pages from flush_list to be flushed */
3313 			if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3314 				last_activity = srv_get_activity_count();
3315 				n_to_flush =
3316 					page_cleaner_flush_pages_recommendation(
3317 						&lsn_limit, last_pages);
3318 			} else {
3319 				n_to_flush = 0;
3320 			}
3321 
3322 			/* Request flushing for threads */
3323 			pc_request(n_to_flush, lsn_limit);
3324 
3325 			ib_time_monotonic_ms_t tm = ut_time_monotonic_ms();
3326 
3327 			/* Coordinator also treats requests */
3328 			while (pc_flush_slot() > 0) {
3329 				/* No op */
3330 			}
3331 
3332 			/* only coordinator is using these counters,
3333 			so no need to protect by lock. */
3334 			page_cleaner->flush_time += ut_time_monotonic_ms() - tm;
3335 			page_cleaner->flush_pass++ ;
3336 
3337 			/* Wait for all slots to be finished */
3338 			ulint	n_flushed_lru = 0;
3339 			ulint	n_flushed_list = 0;
3340 
3341 			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3342 
3343 			if (n_flushed_list > 0 || n_flushed_lru > 0) {
3344 				buf_flush_stats(n_flushed_list, n_flushed_lru);
3345 			}
3346 
3347 			if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3348 				last_pages = n_flushed_list;
3349 			}
3350 
3351 			n_evicted += n_flushed_lru;
3352 			n_flushed_last += n_flushed_list;
3353 
3354 			n_flushed = n_flushed_lru + n_flushed_list;
3355 
3356 			if (n_flushed_lru) {
3357 				MONITOR_INC_VALUE_CUMULATIVE(
3358 					MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
3359 					MONITOR_LRU_BATCH_FLUSH_COUNT,
3360 					MONITOR_LRU_BATCH_FLUSH_PAGES,
3361 					n_flushed_lru);
3362 			}
3363 
3364 			if (n_flushed_list) {
3365 				MONITOR_INC_VALUE_CUMULATIVE(
3366 					MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
3367 					MONITOR_FLUSH_ADAPTIVE_COUNT,
3368 					MONITOR_FLUSH_ADAPTIVE_PAGES,
3369 					n_flushed_list);
3370 			}
3371 
3372 		} else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3373 			/* no activity, slept enough */
3374 			buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed);
3375 
3376 			n_flushed_last += n_flushed;
3377 
3378 			if (n_flushed) {
3379 				MONITOR_INC_VALUE_CUMULATIVE(
3380 					MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
3381 					MONITOR_FLUSH_BACKGROUND_COUNT,
3382 					MONITOR_FLUSH_BACKGROUND_PAGES,
3383 					n_flushed);
3384 
3385 			}
3386 
3387 		} else {
3388 			/* no activity, but woken up by event */
3389 			n_flushed = 0;
3390 		}
3391 
3392 		ut_d(buf_flush_page_cleaner_disabled_loop());
3393 	}
3394 
3395 	ut_ad(srv_shutdown_state > 0);
3396 	if (srv_fast_shutdown == 2
3397 	    || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
3398 		/* In very fast shutdown or when innodb failed to start, we
3399 		simulate a crash of the buffer pool. We are not required to do
3400 		any flushing. */
3401 		goto thread_exit;
3402 	}
3403 
3404 	/* In the case of a normal or slow shutdown the page_cleaner thread
3405 	must wait for all other activity in the server to die down.
3406 	Note that we can start flushing the buffer pool as soon as the
3407 	server enters the shutdown phase, but we must stay alive long enough
3408 	to ensure that any work done by the master or purge threads is
3409 	also flushed.
3410 	During shutdown we pass through two stages. In the first stage,
3411 	when SRV_SHUTDOWN_CLEANUP is set, other threads such as the master
3412 	and purge threads may still be working. We start flushing the
3413 	buffer pool but cannot be sure that no new pages are being dirtied
3414 	until we enter the SRV_SHUTDOWN_FLUSH_PHASE stage. */
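	/* A sketch of how the two stages map onto the code below: the first
	do-while loop keeps flushing while srv_shutdown_state is still
	SRV_SHUTDOWN_CLEANUP, because the master and purge threads may still
	be dirtying pages; once the state has advanced to
	SRV_SHUTDOWN_FLUSH_PHASE no new dirty pages can appear, and the second
	loop performs the final sweep that drains the flush_list completely. */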
3415 
3416 	do {
3417 		pc_request(ULINT_MAX, LSN_MAX);
3418 
3419 		while (pc_flush_slot() > 0) {}
3420 
3421 		ulint	n_flushed_lru = 0;
3422 		ulint	n_flushed_list = 0;
3423 		pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3424 
3425 		n_flushed = n_flushed_lru + n_flushed_list;
3426 
3427 		/* We sleep only if there are no pages to flush */
3428 		if (n_flushed == 0) {
3429 			os_thread_sleep(100000);
3430 		}
3431 	} while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
3432 
3433 	/* At this point all threads including the master and the purge
3434 	thread must have been suspended. */
3435 	ut_a(srv_get_active_thread_type() == SRV_NONE);
3436 	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3437 
3438 	/* We can now make a final sweep of the buffer pool and exit after
3439 	we have cleaned the whole buffer pool.
3440 	It is important that we wait for any batch that we ourselves have
3441 	triggered and that is still running to finish. Otherwise we could
3442 	mistake the end of that batch for the end of our final sweep and
3443 	come out of the loop leaving dirty pages behind in the
3444 	flush_list. */
3445 	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3446 	buf_flush_wait_LRU_batch_end();
3447 
3448 	bool	success;
3449 
3450 	do {
3451 		pc_request(ULINT_MAX, LSN_MAX);
3452 
3453 		while (pc_flush_slot() > 0) {}
3454 
3455 		ulint	n_flushed_lru = 0;
3456 		ulint	n_flushed_list = 0;
3457 		success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3458 
3459 		n_flushed = n_flushed_lru + n_flushed_list;
3460 
3461 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3462 		buf_flush_wait_LRU_batch_end();
3463 
3464 	} while (!success || n_flushed > 0);
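	/* The sweep above terminates only when a complete pass both reported
	success from pc_wait_finished() and flushed zero pages, which should
	mean that every flush_list is empty; the per-instance assertion below
	re-checks this. */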
3465 
3466 	/* Some sanity checks */
3467 	ut_a(srv_get_active_thread_type() == SRV_NONE);
3468 	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3469 
3470 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3471 		buf_pool_t* buf_pool = buf_pool_from_array(i);
3472 		ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
3473 	}
3474 
3475 	/* We have lived our life. Time to die. */
3476 
3477 thread_exit:
3478 	/* All worker threads are waiting on the event here and will no
3479 	longer access the page_cleaner structure.
3480 	Wake the worker threads up just so that they exit. */
3481 	page_cleaner->is_running = false;
3482 	os_event_set(page_cleaner->is_requested);
3483 
3484 	buf_flush_page_cleaner_close();
3485 
3486 	buf_page_cleaner_is_active = false;
3487 
3488 	my_thread_end();
3489 
3490 	/* We count the number of threads in os_thread_exit(). A created
3491 	thread should always use that to exit and not use return() to exit. */
3492 	os_thread_exit();
3493 
3494 	OS_THREAD_DUMMY_RETURN;
3495 }
3496 
3497 /******************************************************************//**
3498 Worker thread of page_cleaner.
3499 @return a dummy parameter */
3500 extern "C"
3501 os_thread_ret_t
3502 DECLARE_THREAD(buf_flush_page_cleaner_worker)(
3503 /*==========================================*/
3504 	void*	arg MY_ATTRIBUTE((unused)))
3505 			/*!< in: a dummy parameter required by
3506 			os_thread_create */
3507 {
3508 	my_thread_init();
3509 
3510 	mutex_enter(&page_cleaner->mutex);
3511 	page_cleaner->n_workers++;
3512 	mutex_exit(&page_cleaner->mutex);
3513 
3514 #ifdef UNIV_LINUX
3515 	/* Linux may allow a different scheduling priority for each thread;
3516 	it is worth setting a high priority for the page_cleaner threads. */
3517 	if (buf_flush_page_cleaner_set_priority(
3518 		buf_flush_page_cleaner_priority)) {
3519 
3520 		ib::info() << "page_cleaner worker priority: "
3521 			<< buf_flush_page_cleaner_priority;
3522 	}
3523 #endif /* UNIV_LINUX */
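	/* The worker's whole job is the loop below: block on is_requested,
	and on each wakeup either exit (when the coordinator has cleared
	is_running during shutdown) or claim and flush one slot via
	pc_flush_slot(). */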
3524 
3525 	while (true) {
3526 		os_event_wait(page_cleaner->is_requested);
3527 
3528 		ut_d(buf_flush_page_cleaner_disabled_loop());
3529 
3530 		if (!page_cleaner->is_running) {
3531 			break;
3532 		}
3533 
3534 		pc_flush_slot();
3535 	}
3536 
3537 	mutex_enter(&page_cleaner->mutex);
3538 	page_cleaner->n_workers--;
3539 	mutex_exit(&page_cleaner->mutex);
3540 
3541 	my_thread_end();
3542 
3543 	os_thread_exit();
3544 
3545 	OS_THREAD_DUMMY_RETURN;
3546 }
3547 
3548 /*******************************************************************//**
3549 Synchronously flush dirty blocks from the end of the flush list of all buffer
3550 pool instances.
3551 NOTE: The calling thread is not allowed to own any latches on pages! */
3552 void
3553 buf_flush_sync_all_buf_pools(void)
3554 /*==============================*/
3555 {
3556 	bool success;
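	/* buf_flush_lists() returns false when the batch could not be started
	on every buffer pool instance (typically because another flush batch
	was already running there), so we wait for the running batch to end
	and retry until a full pass succeeds. */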
3557 	do {
3558 		success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
3559 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3560 	} while (!success);
3561 
3562 	ut_a(success);
3563 }
3564 
3565 /** Request IO burst and wake page_cleaner up.
3566 @param[in]	lsn_limit	upper limit of LSN to be flushed */
3567 void
3568 buf_flush_request_force(
3569 	lsn_t	lsn_limit)
3570 {
3571 	/* Push the target ahead by lsn_avg_rate so that it does not go stale */
3572 	lsn_t	lsn_target = lsn_limit + lsn_avg_rate * 3;
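	/* Illustrative arithmetic only (the numbers are assumed, not taken
	from the source): if lsn_limit is 1,000,000 and lsn_avg_rate is
	roughly 50,000 LSN units of redo generated per second, then
	lsn_target = 1,000,000 + 3 * 50,000 = 1,150,000, so the sync flush
	aims far enough ahead that the request is not already stale by the
	time the page_cleaner acts on it. */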
3573 
3574 	mutex_enter(&page_cleaner->mutex);
3575 	if (lsn_target > buf_flush_sync_lsn) {
3576 		buf_flush_sync_lsn = lsn_target;
3577 	}
3578 	mutex_exit(&page_cleaner->mutex);
3579 
3580 	os_event_set(buf_flush_event);
3581 }
3582 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3583 
3584 /** Functor to validate the flush list. */
3585 struct	Check {
3586 	void	operator()(const buf_page_t* elem)
3587 	{
3588 		ut_a(elem->in_flush_list);
3589 	}
3590 };
3591 
3592 /******************************************************************//**
3593 Validates the flush list.
3594 @return TRUE if ok */
3595 static
3596 ibool
3597 buf_flush_validate_low(
3598 /*===================*/
3599 	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
3600 {
3601 	buf_page_t*		bpage;
3602 	const ib_rbt_node_t*	rnode = NULL;
3603 	Check			check;
3604 
3605 	ut_ad(buf_flush_list_mutex_own(buf_pool));
3606 
3607 	ut_list_validate(buf_pool->flush_list, check);
3608 
3609 	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3610 
3611 	/* If we are in recovery mode, i.e. flush_rbt != NULL,
3612 	then each block in the flush_list must also be present
3613 	in the flush_rbt. */
3614 	if (buf_pool->flush_rbt != NULL) {
3615 		rnode = rbt_first(buf_pool->flush_rbt);
3616 	}
3617 
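	/* The loop below verifies the flush_list invariant: entries are kept
	in non-increasing order of oldest_modification from head to tail,
	which is what the assertion om >= bpage->oldest_modification on the
	successor checks, and during recovery the flush_rbt must mirror the
	list entry by entry. */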
3618 	while (bpage != NULL) {
3619 		const lsn_t	om = bpage->oldest_modification;
3620 
3621 		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
3622 
3623 		ut_ad(bpage->in_flush_list);
3624 
3625 		/* A page in buf_pool->flush_list can be in
3626 		BUF_BLOCK_REMOVE_HASH state. This happens when a page
3627 		is in the middle of being relocated. In that case the
3628 		original descriptor can have this state and still be
3629 		in the flush list waiting to acquire the
3630 		buf_pool->flush_list_mutex to complete the relocation. */
3631 		ut_a(buf_page_in_file(bpage)
3632 		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
3633 		ut_a(om > 0);
3634 
3635 		if (buf_pool->flush_rbt != NULL) {
3636 			buf_page_t**	prpage;
3637 
3638 			ut_a(rnode != NULL);
3639 			prpage = rbt_value(buf_page_t*, rnode);
3640 
3641 			ut_a(*prpage != NULL);
3642 			ut_a(*prpage == bpage);
3643 			rnode = rbt_next(buf_pool->flush_rbt, rnode);
3644 		}
3645 
3646 		bpage = UT_LIST_GET_NEXT(list, bpage);
3647 
3648 		ut_a(bpage == NULL || om >= bpage->oldest_modification);
3649 	}
3650 
3651 	/* By this time we must have exhausted the traversal of
3652 	flush_rbt (if active) as well. */
3653 	ut_a(rnode == NULL);
3654 
3655 	return(TRUE);
3656 }
3657 
3658 /******************************************************************//**
3659 Validates the flush list.
3660 @return TRUE if ok */
3661 ibool
3662 buf_flush_validate(
3663 /*===============*/
3664 	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
3665 {
3666 	ibool	ret;
3667 
3668 	buf_flush_list_mutex_enter(buf_pool);
3669 
3670 	ret = buf_flush_validate_low(buf_pool);
3671 
3672 	buf_flush_list_mutex_exit(buf_pool);
3673 
3674 	return(ret);
3675 }
3676 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3677 #endif /* !UNIV_HOTBACKUP */
3678 
3679 /******************************************************************//**
3680 Check if there are any dirty pages that belong to a space id in the flush
3681 list in a particular buffer pool.
3682 @return number of dirty pages present in a single buffer pool */
3683 ulint
3684 buf_pool_get_dirty_pages_count(
3685 /*===========================*/
3686 	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
3687 	ulint		id,		/*!< in: space id to check */
3688 	FlushObserver*	observer)	/*!< in: flush observer to check */
3689 
3690 {
3691 	ulint		count = 0;
3692 
3693 	buf_pool_mutex_enter(buf_pool);
3694 	buf_flush_list_mutex_enter(buf_pool);
3695 
3696 	buf_page_t*	bpage;
3697 
3698 	for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3699 	     bpage != 0;
3700 	     bpage = UT_LIST_GET_NEXT(list, bpage)) {
3701 
3702 		ut_ad(buf_page_in_file(bpage));
3703 		ut_ad(bpage->in_flush_list);
3704 		ut_ad(bpage->oldest_modification > 0);
3705 
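		/* Matching rule: when an observer is supplied, count only the
		pages tagged with that observer; otherwise fall back to
		matching on the tablespace id alone. */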
3706 		if ((observer != NULL
3707 		     && observer == bpage->flush_observer)
3708 		    || (observer == NULL
3709 			&& id == bpage->id.space())) {
3710 			++count;
3711 		}
3712 	}
3713 
3714 	buf_flush_list_mutex_exit(buf_pool);
3715 	buf_pool_mutex_exit(buf_pool);
3716 
3717 	return(count);
3718 }
3719 
3720 /******************************************************************//**
3721 Check if there are any dirty pages that belong to a space id in the flush list.
3722 @return number of dirty pages present in all the buffer pools */
3723 ulint
3724 buf_flush_get_dirty_pages_count(
3725 /*============================*/
3726 	ulint		id,		/*!< in: space id to check */
3727 	FlushObserver*	observer)	/*!< in: flush observer to check */
3728 {
3729 	ulint		count = 0;
3730 
3731 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3732 		buf_pool_t*	buf_pool;
3733 
3734 		buf_pool = buf_pool_from_array(i);
3735 
3736 		count += buf_pool_get_dirty_pages_count(buf_pool, id, observer);
3737 	}
3738 
3739 	return(count);
3740 }
3741 
3742 /** FlushObserver constructor
3743 @param[in]	space_id	table space id
3744 @param[in]	trx		trx instance
3745 @param[in]	stage		performance schema accounting object,
3746 used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
3747 for accounting. */
3748 FlushObserver::FlushObserver(
3749 	ulint			space_id,
3750 	trx_t*			trx,
3751 	ut_stage_alter_t*	stage)
3752 	:
3753 	m_space_id(space_id),
3754 	m_trx(trx),
3755 	m_stage(stage),
3756 	m_interrupted(false)
3757 {
3758 	m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
3759 	m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
3760 
3761 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3762 		m_flushed->at(i) = 0;
3763 		m_removed->at(i) = 0;
3764 	}
3765 
3766 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3767 		ib::info() << "FlushObserver constructor: " << m_trx->id;
3768 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3769 }
3770 
3771 /** FlushObserver destructor */
3772 FlushObserver::~FlushObserver()
3773 {
3774 	ut_ad(buf_flush_get_dirty_pages_count(m_space_id, this) == 0);
3775 
3776 	UT_DELETE(m_flushed);
3777 	UT_DELETE(m_removed);
3778 
3779 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3780 		ib::info() << "FlushObserver destructor: " << m_trx->id;
3781 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3782 }
3783 
3784 /** Check whether trx is interrupted
3785 @return true if trx is interrupted */
3786 bool
3787 FlushObserver::check_interrupted()
3788 {
3789 	if (trx_is_interrupted(m_trx)) {
3790 		interrupted();
3791 
3792 		return(true);
3793 	}
3794 
3795 	return(false);
3796 }
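/* Note: interrupted() presumably just records the interruption in
m_interrupted (its body is defined elsewhere); flush() below then discards the
remaining dirty pages with BUF_REMOVE_FLUSH_NO_WRITE instead of writing them
out. */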
3797 
3798 /** Notify observer of a flush
3799 @param[in]	buf_pool	buffer pool instance
3800 @param[in]	bpage		buffer page to flush */
3801 void
3802 FlushObserver::notify_flush(
3803 	buf_pool_t*	buf_pool,
3804 	buf_page_t*	bpage)
3805 {
3806 	ut_ad(buf_pool_mutex_own(buf_pool));
3807 
3808 	m_flushed->at(buf_pool->instance_no)++;
3809 
3810 	if (m_stage != NULL) {
3811 		m_stage->inc();
3812 	}
3813 
3814 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3815 	ib::info() << "Flush <" << bpage->id.space()
3816 		   << ", " << bpage->id.page_no() << ">";
3817 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3818 }
3819 
3820 /** Notify observer of a remove
3821 @param[in]	buf_pool	buffer pool instance
3822 @param[in]	bpage		buffer page removed */
3823 void
3824 FlushObserver::notify_remove(
3825 	buf_pool_t*	buf_pool,
3826 	buf_page_t*	bpage)
3827 {
3828 	ut_ad(buf_pool_mutex_own(buf_pool));
3829 
3830 	m_removed->at(buf_pool->instance_no)++;
3831 
3832 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3833 	ib::info() << "Remove <" << bpage->id.space()
3834 		   << ", " << bpage->id.page_no() << ">";
3835 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3836 }
3837 
3838 /** Flush dirty pages and wait. */
3839 void
3840 FlushObserver::flush()
3841 {
3842 	buf_remove_t	buf_remove;
3843 
3844 	if (m_interrupted) {
3845 		buf_remove = BUF_REMOVE_FLUSH_NO_WRITE;
3846 	} else {
3847 		buf_remove = BUF_REMOVE_FLUSH_WRITE;
3848 
3849 		if (m_stage != NULL) {
3850 			ulint	pages_to_flush =
3851 				buf_flush_get_dirty_pages_count(
3852 					m_space_id, this);
3853 
3854 			m_stage->begin_phase_flush(pages_to_flush);
3855 		}
3856 	}
3857 
3858 	/* Flush or remove dirty pages. */
3859 	buf_LRU_flush_or_remove_pages(m_space_id, buf_remove, m_trx);
3860 
3861 	/* Wait until all dirty pages have been flushed. */
3862 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3863 		while (!is_complete(i)) {
3864 
3865 			os_thread_sleep(2000);
3866 		}
3867 	}
3868 }
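/* Rough usage sketch, assuming the online ALTER TABLE path (the call sites
live outside this file, and the helper names below should be treated as
illustrative rather than authoritative):

	FlushObserver*	observer = UT_NEW_NOKEY(
		FlushObserver(table_space_id, trx, stage));
	trx_set_flush_observer(trx, observer);	// tag pages dirtied by trx

	... bulk load / index build dirties pages ...

	observer->flush();			// write out or discard them
	UT_DELETE(observer);			// asserts nothing is left dirty
*/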
3869