1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file buf/buf0flu.cc
29 The database buffer buf_pool flush algorithm
30 
31 Created 11/11/1995 Heikki Tuuri
32 *******************************************************/
33 
34 #include "ha_prototypes.h"
35 #include <mysql/service_thd_wait.h>
36 #include <my_dbug.h>
37 
38 #include "buf0flu.h"
39 
40 #ifdef UNIV_NONINL
41 #include "buf0flu.ic"
42 #endif
43 
44 #include "buf0buf.h"
45 #include "buf0checksum.h"
46 #include "srv0start.h"
47 #include "srv0srv.h"
48 #include "page0zip.h"
49 #ifndef UNIV_HOTBACKUP
50 #include "ut0byte.h"
51 #include "page0page.h"
52 #include "fil0fil.h"
53 #include "buf0lru.h"
54 #include "buf0rea.h"
55 #include "ibuf0ibuf.h"
56 #include "log0log.h"
57 #include "os0file.h"
58 #include "trx0sys.h"
59 #include "srv0mon.h"
60 #include "fsp0sysspace.h"
61 #include "ut0stage.h"
62 
63 #ifdef UNIV_LINUX
64 /* include defs for CPU time priority settings */
65 #include <unistd.h>
66 #include <sys/syscall.h>
67 #include <sys/time.h>
68 #include <sys/resource.h>
69 static const int buf_flush_page_cleaner_priority = -20;
70 #endif /* UNIV_LINUX */
71 
72 /** Sleep time in microseconds for loop waiting for the oldest
73 modification lsn */
74 static const ulint buf_flush_wait_flushed_sleep_time = 10000;
75 
76 /** Number of pages flushed through non flush_list flushes. */
77 static ulint buf_lru_flush_page_count = 0;
78 
79 /** Flag indicating if the page_cleaner is in active state. This flag
80 is set to TRUE by the page_cleaner thread when it is spawned and is set
81 back to FALSE at shutdown by the page_cleaner as well. Therefore no
82 need to protect it by a mutex. It is only ever read by the thread
83 doing the shutdown */
84 bool buf_page_cleaner_is_active = false;
85 
86 /** Factor for scan length to determine n_pages for intended oldest LSN
87 progress */
88 static ulint buf_flush_lsn_scan_factor = 3;
89 
90 /** Average redo generation rate */
91 static lsn_t lsn_avg_rate = 0;
92 
93 /** Target oldest LSN for the requested flush_sync */
94 static lsn_t buf_flush_sync_lsn = 0;
95 
96 #ifdef UNIV_PFS_THREAD
97 mysql_pfs_key_t page_cleaner_thread_key;
98 #endif /* UNIV_PFS_THREAD */
99 
100 /** Event to synchronise with the flushing. */
101 os_event_t	buf_flush_event;
102 
103 /** State for page cleaner array slot */
104 enum page_cleaner_state_t {
105 	/** Nothing requested yet.
106 	Moved from FINISHED by the coordinator. */
107 	PAGE_CLEANER_STATE_NONE = 0,
108 	/** Requested but not started flushing.
109 	Moved from NONE by the coordinator. */
110 	PAGE_CLEANER_STATE_REQUESTED,
111 	/** Flushing is ongoing.
112 	Moved from REQUESTED by the worker. */
113 	PAGE_CLEANER_STATE_FLUSHING,
114 	/** Flushing was finished.
115 	Moved from FLUSHING by the worker. */
116 	PAGE_CLEANER_STATE_FINISHED
117 };
118 
119 /** Page cleaner request state for each buffer pool instance */
120 struct page_cleaner_slot_t {
121 	page_cleaner_state_t	state;	/*!< state of the request.
122 					protected by page_cleaner_t::mutex
123 					if the worker thread got the slot and
124 					set to PAGE_CLEANER_STATE_FLUSHING,
125 					n_flushed_lru and n_flushed_list can be
126 					updated only by the worker thread */
127 	/* This value is set during state==PAGE_CLEANER_STATE_NONE */
128 	ulint			n_pages_requested;
129 					/*!< number of requested pages
130 					for the slot */
131 	/* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
132 	and committed with state==PAGE_CLEANER_STATE_FINISHED.
133 	Their consistency is protected by the 'state'. */
134 	ulint			n_flushed_lru;
135 					/*!< number of flushed pages
136 					by LRU scan flushing */
137 	ulint			n_flushed_list;
138 					/*!< number of flushed pages
139 					by flush_list flushing */
140 	bool			succeeded_list;
141 					/*!< true if flush_list flushing
142 					succeeded. */
143 	uint64_t		flush_lru_time;
144 					/*!< elapsed time for LRU flushing */
145 	uint64_t		flush_list_time;
146 					/*!< elapsed time for flush_list
147 					flushing */
148 	ulint			flush_lru_pass;
149 					/*!< count to attempt LRU flushing */
150 	ulint			flush_list_pass;
151 					/*!< count to attempt flush_list
152 					flushing */
153 };
154 
155 /** Page cleaner structure common for all threads */
156 struct page_cleaner_t {
157 	ib_mutex_t		mutex;		/*!< mutex to protect whole of
158 						page_cleaner_t struct and
159 						page_cleaner_slot_t slots. */
160 	os_event_t		is_requested;	/*!< event to activate worker
161 						threads. */
162 	os_event_t		is_finished;	/*!< event to signal that all
163 						slots were finished. */
164 	volatile ulint		n_workers;	/*!< number of worker threads
165 						in existence */
166 	bool			requested;	/*!< true if a flush of pages
167 						has been requested */
168 	lsn_t			lsn_limit;	/*!< upper limit of LSN to be
169 						flushed */
170 	ulint			n_slots;	/*!< total number of slots */
171 	ulint			n_slots_requested;
172 						/*!< number of slots
173 						in the state
174 						PAGE_CLEANER_STATE_REQUESTED */
175 	ulint			n_slots_flushing;
176 						/*!< number of slots
177 						in the state
178 						PAGE_CLEANER_STATE_FLUSHING */
179 	ulint			n_slots_finished;
180 						/*!< number of slots
181 						in the state
182 						PAGE_CLEANER_STATE_FINISHED */
183 	uint64_t		flush_time;	/*!< elapsed time to flush
184 						requests for all slots */
185 	ulint			flush_pass;	/*!< count to finish to flush
186 						requests for all slots */
187 	page_cleaner_slot_t*	slots;		/*!< pointer to the slots */
188 	bool			is_running;	/*!< false if attempt
189 						to shutdown */
190 
191 #ifdef UNIV_DEBUG
192 	ulint			n_disabled_debug;
193 						/*!< how many page cleaner
194 						threads have been disabled */
195 #endif /* UNIV_DEBUG */
196 };
197 
198 static page_cleaner_t*	page_cleaner = NULL;
199 
200 #ifdef UNIV_DEBUG
201 my_bool innodb_page_cleaner_disabled_debug;
202 #endif /* UNIV_DEBUG */
203 
204 /** If the LRU list of a buf_pool instance is shorter than this, LRU
205 eviction should not happen. This is because when we do LRU flushing we
206 also put the freed blocks on the free list. If the LRU list is very
207 small, we can end up thrashing. */
208 #define BUF_LRU_MIN_LEN		256
209 
210 /* @} */
211 
212 /******************************************************************//**
213 Increments the flush_list size in bytes by the physical page size. */
214 static inline
215 void
216 incr_flush_list_size_in_bytes(
217 /*==========================*/
218 	buf_block_t*	block,		/*!< in: control block */
219 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
220 {
221 	ut_ad(buf_flush_list_mutex_own(buf_pool));
222 
223 	buf_pool->stat.flush_list_bytes += block->page.size.physical();
224 
225 	ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
226 }
227 
228 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
229 /******************************************************************//**
230 Validates the flush list.
231 @return TRUE if ok */
232 static
233 ibool
234 buf_flush_validate_low(
235 /*===================*/
236 	buf_pool_t*	buf_pool);	/*!< in: Buffer pool instance */
237 
238 /******************************************************************//**
239 Validates the flush list some of the time.
240 @return TRUE if ok or the check was skipped */
241 static
242 ibool
243 buf_flush_validate_skip(
244 /*====================*/
245 	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
246 {
247 /** Try buf_flush_validate_low() every this many times */
248 # define BUF_FLUSH_VALIDATE_SKIP	23
249 
250 	/** The buf_flush_validate_low() call skip counter.
251 	Use a signed type because of the race condition below. */
252 	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
253 
254 	/* There is a race condition below, but it does not matter,
255 	because this call is only for heuristic purposes. We want to
256 	reduce the call frequency of the costly buf_flush_validate_low()
257 	check in debug builds. */
258 	if (--buf_flush_validate_count > 0) {
259 		return(TRUE);
260 	}
261 
262 	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
263 	return(buf_flush_validate_low(buf_pool));
264 }
265 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
266 
267 /******************************************************************//**
268 Inserts a block into the flush_rbt and returns a pointer to its
269 predecessor or NULL if no predecessor. The ordering is maintained
270 on the basis of the <oldest_modification, space, offset> key.
271 @return pointer to the predecessor or NULL if no predecessor. */
272 static
273 buf_page_t*
274 buf_flush_insert_in_flush_rbt(
275 /*==========================*/
276 	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
277 {
278 	const ib_rbt_node_t*	c_node;
279 	const ib_rbt_node_t*	p_node;
280 	buf_page_t*		prev = NULL;
281 	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
282 
283 	ut_ad(buf_flush_list_mutex_own(buf_pool));
284 
285 	/* Insert this buffer into the rbt. */
286 	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
287 	ut_a(c_node != NULL);
288 
289 	/* Get the predecessor. */
290 	p_node = rbt_prev(buf_pool->flush_rbt, c_node);
291 
292 	if (p_node != NULL) {
293 		buf_page_t**	value;
294 		value = rbt_value(buf_page_t*, p_node);
295 		prev = *value;
296 		ut_a(prev != NULL);
297 	}
298 
299 	return(prev);
300 }
301 
302 /*********************************************************//**
303 Delete a bpage from the flush_rbt. */
304 static
305 void
306 buf_flush_delete_from_flush_rbt(
307 /*============================*/
308 	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
309 {
310 #ifdef UNIV_DEBUG
311 	ibool		ret = FALSE;
312 #endif /* UNIV_DEBUG */
313 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
314 
315 	ut_ad(buf_flush_list_mutex_own(buf_pool));
316 
317 #ifdef UNIV_DEBUG
318 	ret =
319 #endif /* UNIV_DEBUG */
320 	rbt_delete(buf_pool->flush_rbt, &bpage);
321 
322 	ut_ad(ret);
323 }
324 
325 /*****************************************************************//**
326 Compare two modified blocks in the buffer pool. The key for comparison
327 is:
328 key = <oldest_modification, space, offset>
329 This comparison is used to maintain ordering of blocks in the
330 buf_pool->flush_rbt.
331 Note that for the purpose of flush_rbt, we only need to order blocks
332 on the oldest_modification. The other two fields are used to uniquely
333 identify the blocks.
334 @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
335 static
336 int
337 buf_flush_block_cmp(
338 /*================*/
339 	const void*	p1,		/*!< in: block1 */
340 	const void*	p2)		/*!< in: block2 */
341 {
342 	int			ret;
343 	const buf_page_t*	b1 = *(const buf_page_t**) p1;
344 	const buf_page_t*	b2 = *(const buf_page_t**) p2;
345 
346 	ut_ad(b1 != NULL);
347 	ut_ad(b2 != NULL);
348 
349 #ifdef UNIV_DEBUG
350 	buf_pool_t*	buf_pool = buf_pool_from_bpage(b1);
351 #endif /* UNIV_DEBUG */
352 
353 	ut_ad(buf_flush_list_mutex_own(buf_pool));
354 
355 	ut_ad(b1->in_flush_list);
356 	ut_ad(b2->in_flush_list);
357 
358 	if (b2->oldest_modification > b1->oldest_modification) {
359 		return(1);
360 	} else if (b2->oldest_modification < b1->oldest_modification) {
361 		return(-1);
362 	}
363 
364 	/* If oldest_modification is same then decide on the space. */
365 	ret = (int)(b2->id.space() - b1->id.space());
366 
367 	/* Or else decide ordering on the page number. */
368 	return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no()));
369 }
370 
371 /********************************************************************//**
372 Initialize the red-black tree to speed up insertions into the flush_list
373 during recovery process. Should be called at the start of recovery
374 process before any page has been read/written. */
375 void
376 buf_flush_init_flush_rbt(void)
377 /*==========================*/
378 {
379 	ulint	i;
380 
381 	for (i = 0; i < srv_buf_pool_instances; i++) {
382 		buf_pool_t*	buf_pool;
383 
384 		buf_pool = buf_pool_from_array(i);
385 
386 		buf_flush_list_mutex_enter(buf_pool);
387 
388 		ut_ad(buf_pool->flush_rbt == NULL);
389 
390 		/* Create red black tree for speedy insertions in flush list. */
391 		buf_pool->flush_rbt = rbt_create(
392 			sizeof(buf_page_t*), buf_flush_block_cmp);
393 
394 		buf_flush_list_mutex_exit(buf_pool);
395 	}
396 }
397 
398 /********************************************************************//**
399 Frees up the red-black tree. */
400 void
401 buf_flush_free_flush_rbt(void)
402 /*==========================*/
403 {
404 	ulint	i;
405 
406 	for (i = 0; i < srv_buf_pool_instances; i++) {
407 		buf_pool_t*	buf_pool;
408 
409 		buf_pool = buf_pool_from_array(i);
410 
411 		buf_flush_list_mutex_enter(buf_pool);
412 
413 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
414 		ut_a(buf_flush_validate_low(buf_pool));
415 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
416 
417 		rbt_free(buf_pool->flush_rbt);
418 		buf_pool->flush_rbt = NULL;
419 
420 		buf_flush_list_mutex_exit(buf_pool);
421 	}
422 }
423 
424 /********************************************************************//**
425 Inserts a modified block into the flush list. */
426 void
427 buf_flush_insert_into_flush_list(
428 /*=============================*/
429 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
430 	buf_block_t*	block,		/*!< in/out: block which is modified */
431 	lsn_t		lsn)		/*!< in: oldest modification */
432 {
433 	ut_ad(!buf_pool_mutex_own(buf_pool));
434 	ut_ad(log_flush_order_mutex_own());
435 	ut_ad(buf_page_mutex_own(block));
436 
437 	buf_flush_list_mutex_enter(buf_pool);
438 
439 	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
440 	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
441 		  <= lsn));
442 
443 	/* If we are in recovery, then we need to update the flush
444 	red-black tree as well. */
445 	if (buf_pool->flush_rbt != NULL) {
446 		buf_flush_list_mutex_exit(buf_pool);
447 		buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
448 		return;
449 	}
450 
451 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
452 	ut_ad(!block->page.in_flush_list);
453 
454 	ut_d(block->page.in_flush_list = TRUE);
455 	block->page.oldest_modification = lsn;
456 
457 	UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
458 
459 	incr_flush_list_size_in_bytes(block, buf_pool);
460 
461 #ifdef UNIV_DEBUG_VALGRIND
462 	void*	p;
463 
464 	if (block->page.size.is_compressed()) {
465 		p = block->page.zip.data;
466 	} else {
467 		p = block->frame;
468 	}
469 
470 	UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
471 #endif /* UNIV_DEBUG_VALGRIND */
472 
473 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
474 	ut_a(buf_flush_validate_skip(buf_pool));
475 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
476 
477 	buf_flush_list_mutex_exit(buf_pool);
478 }
479 
480 /********************************************************************//**
481 Inserts a modified block into the flush list in the right sorted position.
482 This function is used by recovery, because there the modifications do not
483 necessarily arrive in LSN order. */
484 void
485 buf_flush_insert_sorted_into_flush_list(
486 /*====================================*/
487 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
488 	buf_block_t*	block,		/*!< in/out: block which is modified */
489 	lsn_t		lsn)		/*!< in: oldest modification */
490 {
491 	buf_page_t*	prev_b;
492 	buf_page_t*	b;
493 
494 	ut_ad(!buf_pool_mutex_own(buf_pool));
495 	ut_ad(log_flush_order_mutex_own());
496 	ut_ad(buf_page_mutex_own(block));
497 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
498 
499 	buf_flush_list_mutex_enter(buf_pool);
500 
501 	/* The field in_LRU_list is protected by buf_pool->mutex, which
502 	we are not holding.  However, while a block is in the flush
503 	list, it is dirty and cannot be discarded, not from the
504 	page_hash or from the LRU list.  At most, the uncompressed
505 	page frame of a compressed block may be discarded or created
506 	(copying the block->page to or from a buf_page_t that is
507 	dynamically allocated from buf_buddy_alloc()).  Because those
508 	transitions hold block->mutex and the flush list mutex (via
509 	buf_flush_relocate_on_flush_list()), there is no possibility
510 	of a race condition in the assertions below. */
511 	ut_ad(block->page.in_LRU_list);
512 	ut_ad(block->page.in_page_hash);
513 	/* buf_buddy_block_register() will take a block in the
514 	BUF_BLOCK_MEMORY state, not a file page. */
515 	ut_ad(!block->page.in_zip_hash);
516 
517 	ut_ad(!block->page.in_flush_list);
518 	ut_d(block->page.in_flush_list = TRUE);
519 	block->page.oldest_modification = lsn;
520 
521 #ifdef UNIV_DEBUG_VALGRIND
522 	void*	p;
523 
524 	if (block->page.size.is_compressed()) {
525 		p = block->page.zip.data;
526 	} else {
527 		p = block->frame;
528 	}
529 
530 	UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
531 #endif /* UNIV_DEBUG_VALGRIND */
532 
533 	prev_b = NULL;
534 
535 	/* For the most part when this function is called the flush_rbt
536 	should not be NULL. In a very rare boundary case it is possible
537 	that the flush_rbt has already been freed by the recovery thread
538 	before the last page was hooked up in the flush_list by the
539 	io-handler thread. In that case we'll just do a simple
540 	linear search in the else block. */
541 	if (buf_pool->flush_rbt != NULL) {
542 
543 		prev_b = buf_flush_insert_in_flush_rbt(&block->page);
544 
545 	} else {
546 
547 		b = UT_LIST_GET_FIRST(buf_pool->flush_list);
548 
549 		while (b != NULL && b->oldest_modification
550 		       > block->page.oldest_modification) {
551 
552 			ut_ad(b->in_flush_list);
553 			prev_b = b;
554 			b = UT_LIST_GET_NEXT(list, b);
555 		}
556 	}
557 
558 	if (prev_b == NULL) {
559 		UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
560 	} else {
561 		UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page);
562 	}
563 
564 	incr_flush_list_size_in_bytes(block, buf_pool);
565 
566 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
567 	ut_a(buf_flush_validate_low(buf_pool));
568 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
569 
570 	buf_flush_list_mutex_exit(buf_pool);
571 }
572 
573 /********************************************************************//**
574 Returns TRUE if the file page block is immediately suitable for replacement,
575 i.e., the transition FILE_PAGE => NOT_USED is allowed.
576 @return TRUE if can replace immediately */
577 ibool
578 buf_flush_ready_for_replace(
579 /*========================*/
580 	buf_page_t*	bpage)	/*!< in: buffer control block, must be
581 				buf_page_in_file(bpage) and in the LRU list */
582 {
583 #ifdef UNIV_DEBUG
584 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
585 	ut_ad(buf_pool_mutex_own(buf_pool));
586 #endif /* UNIV_DEBUG */
587 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
588 	ut_ad(bpage->in_LRU_list);
589 
590 	if (buf_page_in_file(bpage)) {
591 
592 		return(bpage->oldest_modification == 0
593 		       && bpage->buf_fix_count == 0
594 		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE);
595 	}
596 
597 	ib::fatal() << "Buffer block " << bpage << " state " <<  bpage->state
598 		<< " in the LRU list!";
599 
600 	return(FALSE);
601 }
602 
603 /********************************************************************//**
604 Returns true if the block is modified and ready for flushing.
605 @return true if can flush immediately */
606 bool
607 buf_flush_ready_for_flush(
608 /*======================*/
609 	buf_page_t*	bpage,	/*!< in: buffer control block, must be
610 				buf_page_in_file(bpage) */
611 	buf_flush_t	flush_type)/*!< in: type of flush */
612 {
613 #ifdef UNIV_DEBUG
614 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
615 	ut_ad(buf_pool_mutex_own(buf_pool));
616 #endif /* UNIV_DEBUG */
617 
618 	ut_a(buf_page_in_file(bpage));
619 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
620 	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
621 
622 	if (bpage->oldest_modification == 0
623 	    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
624 		return(false);
625 	}
626 
627 	ut_ad(bpage->in_flush_list);
628 
629 	switch (flush_type) {
630 	case BUF_FLUSH_LIST:
631 	case BUF_FLUSH_LRU:
632 	case BUF_FLUSH_SINGLE_PAGE:
633 		return(true);
634 
635 	case BUF_FLUSH_N_TYPES:
636 		break;
637 	}
638 
639 	ut_error;
640 	return(false);
641 }
642 
643 /********************************************************************//**
644 Remove a block from the flush list of modified blocks. */
645 void
646 buf_flush_remove(
647 /*=============*/
648 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
649 {
650 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
651 
652 	ut_ad(buf_pool_mutex_own(buf_pool));
653 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
654 	ut_ad(bpage->in_flush_list);
655 
656 	buf_flush_list_mutex_enter(buf_pool);
657 
658 	/* Important that we adjust the hazard pointer before removing
659 	the bpage from flush list. */
660 	buf_pool->flush_hp.adjust(bpage);
661 
662 	switch (buf_page_get_state(bpage)) {
663 	case BUF_BLOCK_POOL_WATCH:
664 	case BUF_BLOCK_ZIP_PAGE:
665 		/* Clean compressed pages should not be on the flush list */
666 	case BUF_BLOCK_NOT_USED:
667 	case BUF_BLOCK_READY_FOR_USE:
668 	case BUF_BLOCK_MEMORY:
669 	case BUF_BLOCK_REMOVE_HASH:
670 		ut_error;
671 		return;
672 	case BUF_BLOCK_ZIP_DIRTY:
673 		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
674 		UT_LIST_REMOVE(buf_pool->flush_list, bpage);
675 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
676 		buf_LRU_insert_zip_clean(bpage);
677 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
678 		break;
679 	case BUF_BLOCK_FILE_PAGE:
680 		UT_LIST_REMOVE(buf_pool->flush_list, bpage);
681 		break;
682 	}
683 
684 	/* If the flush_rbt is active then delete from there as well. */
685 	if (buf_pool->flush_rbt != NULL) {
686 		buf_flush_delete_from_flush_rbt(bpage);
687 	}
688 
689 	/* Must be done after we have removed it from the flush_rbt
690 	because we assert on in_flush_list in comparison function. */
691 	ut_d(bpage->in_flush_list = FALSE);
692 
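	/* Undo the accounting done by incr_flush_list_size_in_bytes()
	when the page was added to the flush list. */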
693 	buf_pool->stat.flush_list_bytes -= bpage->size.physical();
694 
695 	bpage->oldest_modification = 0;
696 
697 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
698 	ut_a(buf_flush_validate_skip(buf_pool));
699 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
700 
701 	/* If there is an observer that wants to know when the asynchronous
702 	flushing is done, notify it. */
703 	if (bpage->flush_observer != NULL) {
704 		bpage->flush_observer->notify_remove(buf_pool, bpage);
705 
706 		bpage->flush_observer = NULL;
707 	}
708 
709 	buf_flush_list_mutex_exit(buf_pool);
710 }
711 
712 /*******************************************************************//**
713 Relocates a buffer control block on the flush_list.
714 Note that it is assumed that the contents of bpage have already been
715 copied to dpage.
716 IMPORTANT: When this function is called bpage and dpage are not
717 exact copies of each other. For example, they will have different
718 ::state. Also the ::list pointers in dpage may be stale. We need to
719 use the current list node (bpage) to do the list manipulation because
720 the list pointers could have changed between the time that we copied
721 the contents of bpage to the dpage and the flush list manipulation
722 below. */
723 void
724 buf_flush_relocate_on_flush_list(
725 /*=============================*/
726 	buf_page_t*	bpage,	/*!< in/out: control block being moved */
727 	buf_page_t*	dpage)	/*!< in/out: destination block */
728 {
729 	buf_page_t*	prev;
730 	buf_page_t*	prev_b = NULL;
731 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
732 
733 	ut_ad(buf_pool_mutex_own(buf_pool));
734 	/* Must reside in the same buffer pool. */
735 	ut_ad(buf_pool == buf_pool_from_bpage(dpage));
736 
737 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
738 
739 	buf_flush_list_mutex_enter(buf_pool);
740 
741 	/* FIXME: At this point we have both buf_pool and flush_list
742 	mutexes. Theoretically removal of a block from flush list is
743 	only covered by flush_list mutex but currently we do
744 	have buf_pool mutex in buf_flush_remove() therefore this block
745 	is guaranteed to be in the flush list. We need to check if
746 	this will work without the assumption of block removing code
747 	having the buf_pool mutex. */
748 	ut_ad(bpage->in_flush_list);
749 	ut_ad(dpage->in_flush_list);
750 
751 	/* If recovery is active we must swap the control blocks in
752 	the flush_rbt as well. */
753 	if (buf_pool->flush_rbt != NULL) {
754 		buf_flush_delete_from_flush_rbt(bpage);
755 		prev_b = buf_flush_insert_in_flush_rbt(dpage);
756 	}
757 
758 	/* Important that we adjust the hazard pointer before removing
759 	the bpage from the flush list. */
760 	buf_pool->flush_hp.adjust(bpage);
761 
762 	/* Must be done after we have removed it from the flush_rbt
763 	because we assert on in_flush_list in comparison function. */
764 	ut_d(bpage->in_flush_list = FALSE);
765 
766 	prev = UT_LIST_GET_PREV(list, bpage);
767 	UT_LIST_REMOVE(buf_pool->flush_list, bpage);
768 
769 	if (prev) {
770 		ut_ad(prev->in_flush_list);
771 		UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage);
772 	} else {
773 		UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage);
774 	}
775 
776 	/* Just an extra check. Previous in flush_list
777 	should be the same control block as in flush_rbt. */
778 	ut_a(buf_pool->flush_rbt == NULL || prev_b == prev);
779 
780 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
781 	ut_a(buf_flush_validate_low(buf_pool));
782 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
783 
784 	buf_flush_list_mutex_exit(buf_pool);
785 }
786 
787 /********************************************************************//**
788 Updates the flush system data structures when a write is completed. */
789 void
790 buf_flush_write_complete(
791 /*=====================*/
792 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
793 {
794 	buf_flush_t	flush_type;
795 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
796 
797 	ut_ad(bpage);
798 
799 	buf_flush_remove(bpage);
800 
801 	flush_type = buf_page_get_flush_type(bpage);
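	/* One write of this flush type has completed; this balances the
	increment done when the write was dispatched in buf_flush_page(). */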
802 	buf_pool->n_flush[flush_type]--;
803 
804 	if (buf_pool->n_flush[flush_type] == 0
805 	    && buf_pool->init_flush[flush_type] == FALSE) {
806 
807 		/* The running flush batch has ended */
808 
809 		os_event_set(buf_pool->no_flush[flush_type]);
810 	}
811 
812 	buf_dblwr_update(bpage, flush_type);
813 }
814 #endif /* !UNIV_HOTBACKUP */
815 
816 /** Calculate the checksum of a page from a compressed table and update
817 the page.
818 @param[in,out]	page	page to update
819 @param[in]	size	compressed page size
820 @param[in]	lsn	LSN to stamp on the page */
821 void
822 buf_flush_update_zip_checksum(
823 	buf_frame_t*	page,
824 	ulint		size,
825 	lsn_t		lsn)
826 {
827 	ut_a(size > 0);
828 
829 	const uint32_t	checksum = page_zip_calc_checksum(
830 		page, size,
831 		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
832 
833 	mach_write_to_8(page + FIL_PAGE_LSN, lsn);
834 	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
835 }
836 
837 /** Initialize a page for writing to the tablespace.
838 @param[in]	block		buffer block; NULL if bypassing the buffer pool
839 @param[in,out]	page		page frame
840 @param[in,out]	page_zip_	compressed page, or NULL if uncompressed
841 @param[in]	newest_lsn	newest modification LSN to the page
842 @param[in]	skip_checksum	whether to disable the page checksum */
843 void
844 buf_flush_init_for_writing(
845 	const buf_block_t*	block,
846 	byte*			page,
847 	void*			page_zip_,
848 	lsn_t			newest_lsn,
849 	bool			skip_checksum)
850 {
851 	ib_uint32_t	checksum = BUF_NO_CHECKSUM_MAGIC;
852 
853 	ut_ad(block == NULL || block->frame == page);
854 	ut_ad(block == NULL || page_zip_ == NULL
855 	      || &block->page.zip == page_zip_);
856 	ut_ad(page);
857 
858 	if (page_zip_) {
859 		page_zip_des_t*	page_zip;
860 		ulint		size;
861 
862 		page_zip = static_cast<page_zip_des_t*>(page_zip_);
863 		size = page_zip_get_size(page_zip);
864 
865 		ut_ad(size);
866 		ut_ad(ut_is_2pow(size));
867 		ut_ad(size <= UNIV_ZIP_SIZE_MAX);
868 
869 		switch (fil_page_get_type(page)) {
870 		case FIL_PAGE_TYPE_ALLOCATED:
871 		case FIL_PAGE_INODE:
872 		case FIL_PAGE_IBUF_BITMAP:
873 		case FIL_PAGE_TYPE_FSP_HDR:
874 		case FIL_PAGE_TYPE_XDES:
875 			/* These are essentially uncompressed pages. */
876 			memcpy(page_zip->data, page, size);
877 			/* fall through */
878 		case FIL_PAGE_TYPE_ZBLOB:
879 		case FIL_PAGE_TYPE_ZBLOB2:
880 		case FIL_PAGE_INDEX:
881 		case FIL_PAGE_RTREE:
882 
883 			buf_flush_update_zip_checksum(
884 				page_zip->data, size, newest_lsn);
885 
886 			return;
887 		}
888 
889 		ib::error() << "The compressed page to be written"
890 			" seems corrupt:";
891 		ut_print_buf(stderr, page, size);
892 		fputs("\nInnoDB: Possibly older version of the page:", stderr);
893 		ut_print_buf(stderr, page_zip->data, size);
894 		putc('\n', stderr);
895 		ut_error;
896 	}
897 
898 	/* Write the newest modification lsn to the page header and trailer */
899 	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
900 
901 	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
902 			newest_lsn);
903 
904 	if (skip_checksum) {
905 		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
906 	} else {
907 		if (block != NULL && UNIV_PAGE_SIZE == 16384) {
908 			/* The page type could be garbage in old files
909 			created before MySQL 5.5. Such files always
910 			had a page size of 16 kilobytes. */
911 			ulint	page_type = fil_page_get_type(page);
912 			ulint	reset_type = page_type;
913 
914 			switch (block->page.id.page_no() % 16384) {
915 			case 0:
916 				reset_type = block->page.id.page_no() == 0
917 					? FIL_PAGE_TYPE_FSP_HDR
918 					: FIL_PAGE_TYPE_XDES;
919 				break;
920 			case 1:
921 				reset_type = FIL_PAGE_IBUF_BITMAP;
922 				break;
923 			case 5:
924 				if (block->page.id.page_no() == 5 &&
925 				    block->page.id.space() == TRX_SYS_SPACE) {
926 					reset_type = FIL_PAGE_TYPE_TRX_SYS;
927 				}
928 				break;
929 			case 3:
930 			case 6:
931 			case 7:
932 				if (block->page.id.page_no() < 16384 &&
933 				    block->page.id.space() == TRX_SYS_SPACE) {
934 					reset_type = FIL_PAGE_TYPE_SYS;
935 				}
936 				break;
937 			case 4:
938 				if (block->page.id.page_no() == 4 &&
939 				    block->page.id.space() == TRX_SYS_SPACE) {
940 					reset_type = FIL_PAGE_INDEX;
941 				}
942 				break;
943 			default:
944 				switch (page_type) {
945 				case FIL_PAGE_INDEX:
946 				case FIL_PAGE_RTREE:
947 				case FIL_PAGE_UNDO_LOG:
948 				case FIL_PAGE_INODE:
949 				case FIL_PAGE_IBUF_FREE_LIST:
950 				case FIL_PAGE_TYPE_ALLOCATED:
951 				case FIL_PAGE_TYPE_SYS:
952 				case FIL_PAGE_TYPE_TRX_SYS:
953 				case FIL_PAGE_TYPE_BLOB:
954 				case FIL_PAGE_TYPE_ZBLOB:
955 				case FIL_PAGE_TYPE_ZBLOB2:
956 					break;
957 				case FIL_PAGE_TYPE_FSP_HDR:
958 				case FIL_PAGE_TYPE_XDES:
959 				case FIL_PAGE_IBUF_BITMAP:
960 					/* These pages should have
961 					predetermined page numbers
962 					(see above). */
963 				default:
964 					reset_type = FIL_PAGE_TYPE_UNKNOWN;
965 					break;
966 				}
967 			}
968 
969 			if (UNIV_UNLIKELY(page_type != reset_type)) {
970 				ib::info()
971 					<< "Resetting invalid page "
972 					<< block->page.id << " type "
973 					<< page_type << " to "
974 					<< reset_type << " when flushing.";
975 				fil_page_set_type(page, reset_type);
976 			}
977 		}
978 
979 		switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
980 		case SRV_CHECKSUM_ALGORITHM_CRC32:
981 		case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
982 			checksum = buf_calc_page_crc32(page);
983 			mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
984 					checksum);
985 			break;
986 		case SRV_CHECKSUM_ALGORITHM_INNODB:
987 		case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
988 			checksum = (srv_fast_checksum) ?
989 				(ib_uint32_t) buf_calc_page_new_checksum_32(page) :
990 			(ib_uint32_t) buf_calc_page_new_checksum(page);
991 			mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
992 					checksum);
993 			checksum = (ib_uint32_t) buf_calc_page_old_checksum(
994 				page);
995 			break;
996 		case SRV_CHECKSUM_ALGORITHM_NONE:
997 		case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
998 			mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
999 					checksum);
1000 			break;
1001 			/* no default so the compiler will emit a warning if
1002 			new enum is added and not handled here */
1003 		}
1004 	}
1005 
1006 	/* With the InnoDB checksum, we overwrite the first 4 bytes of
1007 	the end lsn field to store the old formula checksum. Since it
1008 	depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
1009 	be calculated after storing the new formula checksum.
1010 
1011 	In other cases we write the same value to both fields.
1012 	If CRC32 is used then it is faster to use that checksum
1013 	(calculated above) instead of calculating another one.
1014 	We can afford to store something other than
1015 	buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
1016 	this field because the file will not be readable by old
1017 	versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
1018 
1019 	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
1020 			checksum);
1021 }
1022 
1023 #ifndef UNIV_HOTBACKUP
1024 /********************************************************************//**
1025 Does an asynchronous write of a buffer page. NOTE: in simulated aio and
1026 also when the doublewrite buffer is used, we must call
1027 buf_dblwr_flush_buffered_writes after we have posted a batch of
1028 writes! */
1029 static
1030 void
1031 buf_flush_write_block_low(
1032 /*======================*/
1033 	buf_page_t*	bpage,		/*!< in: buffer block to write */
1034 	buf_flush_t	flush_type,	/*!< in: type of flush */
1035 	bool		sync)		/*!< in: true if sync IO request */
1036 {
1037 	page_t*	frame = NULL;
1038 
1039 #ifdef UNIV_DEBUG
1040 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1041 	ut_ad(!buf_pool_mutex_own(buf_pool));
1042 #endif /* UNIV_DEBUG */
1043 
1044 	DBUG_PRINT("ib_buf", ("flush %s %u page " UINT32PF ":" UINT32PF,
1045 			      sync ? "sync" : "async", (unsigned) flush_type,
1046 			      bpage->id.space(), bpage->id.page_no()));
1047 
1048 	ut_ad(buf_page_in_file(bpage));
1049 
1050 	/* We are not holding buf_pool->mutex or block_mutex here.
1051 	Nevertheless, it is safe to access bpage, because it is
1052 	io_fixed and oldest_modification != 0.  Thus, it cannot be
1053 	relocated in the buffer pool or removed from flush_list or
1054 	LRU_list. */
1055 	ut_ad(!buf_pool_mutex_own(buf_pool));
1056 	ut_ad(!buf_flush_list_mutex_own(buf_pool));
1057 	ut_ad(!buf_page_get_mutex(bpage)->is_owned());
1058 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1059 	ut_ad(bpage->oldest_modification != 0);
1060 
1061 #ifdef UNIV_IBUF_COUNT_DEBUG
1062 	ut_a(ibuf_count_get(bpage->id) == 0);
1063 #endif /* UNIV_IBUF_COUNT_DEBUG */
1064 
1065 	ut_ad(bpage->newest_modification != 0);
1066 
1067 	/* Force the log to the disk before writing the modified block */
1068 	if (!srv_read_only_mode) {
1069 		log_write_up_to(bpage->newest_modification, true);
1070 	}
1071 
1072 	switch (buf_page_get_state(bpage)) {
1073 	case BUF_BLOCK_POOL_WATCH:
1074 	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1075 	case BUF_BLOCK_NOT_USED:
1076 	case BUF_BLOCK_READY_FOR_USE:
1077 	case BUF_BLOCK_MEMORY:
1078 	case BUF_BLOCK_REMOVE_HASH:
1079 		ut_error;
1080 		break;
1081 	case BUF_BLOCK_ZIP_DIRTY:
1082 		frame = bpage->zip.data;
1083 
1084 		mach_write_to_8(frame + FIL_PAGE_LSN,
1085 				bpage->newest_modification);
1086 
1087 		ut_a(page_zip_verify_checksum(frame, bpage->size.physical()));
1088 		break;
1089 	case BUF_BLOCK_FILE_PAGE:
1090 		frame = bpage->zip.data;
1091 		if (!frame) {
1092 			frame = ((buf_block_t*) bpage)->frame;
1093 		}
1094 
1095 		buf_flush_init_for_writing(
1096 			reinterpret_cast<const buf_block_t*>(bpage),
1097 			reinterpret_cast<const buf_block_t*>(bpage)->frame,
1098 			bpage->zip.data ? &bpage->zip : NULL,
1099 			bpage->newest_modification,
1100 			fsp_is_checksum_disabled(bpage->id.space()));
1101 		break;
1102 	}
1103 
1104 	/* Disable use of double-write buffer for temporary tablespace.
1105 	Given the nature and load of the temporary tablespace, the doublewrite
1106 	buffer adds overhead during flushing. */
1107 
1108 	if (!srv_use_doublewrite_buf
1109 	    || buf_dblwr == NULL
1110 	    || srv_read_only_mode
1111 	    || fsp_is_system_temporary(bpage->id.space())) {
1112 
1113 		ut_ad(!srv_read_only_mode
1114 		      || fsp_is_system_temporary(bpage->id.space()));
1115 
1116 		ulint	type = IORequest::WRITE | IORequest::DO_NOT_WAKE;
1117 
1118 		IORequest	request(type);
1119 
1120 		fil_io(request,
1121 		       sync, bpage->id, bpage->size, 0, bpage->size.physical(),
1122 		       frame, bpage);
1123 
1124 	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
1125 		buf_dblwr_write_single_page(bpage, sync);
1126 	} else {
1127 		ut_ad(!sync);
1128 		buf_dblwr_add_to_batch(bpage);
1129 	}
1130 
1131 	/* When doing single page flushing the IO is done synchronously
1132 	and we flush the changes to disk only for the tablespace we
1133 	are working on. */
1134 	if (sync) {
1135 		ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
1136 		fil_flush(bpage->id.space());
1137 
1138 		/* true means we want to evict this page from the
1139 		LRU list as well. */
1140 		buf_page_io_complete(bpage, true);
1141 	}
1142 
1143 	/* Increment the counter of I/O operations used
1144 	for selecting LRU policy. */
1145 	buf_LRU_stat_inc_io();
1146 }
1147 
1148 /********************************************************************//**
1149 Writes a flushable page asynchronously from the buffer pool to a file.
1150 NOTE: in simulated aio we must call
1151 os_aio_simulated_wake_handler_threads after we have posted a batch of
1152 writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
1153 held upon entering this function, and they will be released by this
1154 function if it returns true.
1155 @return TRUE if the page was flushed */
1156 ibool
1157 buf_flush_page(
1158 /*===========*/
1159 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1160 	buf_page_t*	bpage,		/*!< in: buffer control block */
1161 	buf_flush_t	flush_type,	/*!< in: type of flush */
1162 	bool		sync)		/*!< in: true if sync IO request */
1163 {
1164 	BPageMutex*	block_mutex;
1165 
1166 	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1167 	ut_ad(buf_pool_mutex_own(buf_pool));
1168 	ut_ad(buf_page_in_file(bpage));
1169 	ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1170 
1171 	block_mutex = buf_page_get_mutex(bpage);
1172 	ut_ad(mutex_own(block_mutex));
1173 
1174 	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1175 
1176 	bool	is_uncompressed;
1177 
1178 	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1179 	ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1180 
1181 	ibool		flush;
1182 	rw_lock_t*	rw_lock;
1183 	bool		no_fix_count = bpage->buf_fix_count == 0;
1184 
1185 	if (!is_uncompressed) {
1186 		flush = TRUE;
1187 		rw_lock = NULL;
1188 	} else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)
1189 		   || (!no_fix_count
1190 		       && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP
1191 		       && fsp_is_system_temporary(bpage->id.space()))) {
1192 		/* This is a heuristic, to avoid expensive SX attempts. */
1193 		/* For a table residing in the temporary tablespace, sync is
1194 		done using IO_FIX, so before scheduling the flush ensure that
1195 		the page is not fixed. */
1196 		flush = FALSE;
1197 	} else {
1198 		rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1199 		if (flush_type != BUF_FLUSH_LIST) {
1200 			flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
1201 		} else {
1202 			/* Will SX lock later */
1203 			flush = TRUE;
1204 		}
1205 	}
1206 
1207 	if (flush) {
1208 
1209 		/* We are committed to flushing by the time we get here */
1210 
1211 		buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1212 
1213 		buf_page_set_flush_type(bpage, flush_type);
1214 
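		/* If this is the first in-flight write of this flush type,
		reset the 'no flush in progress' event; it is set again in
		buf_flush_write_complete() once all writes of this type
		have completed. */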
1215 		if (buf_pool->n_flush[flush_type] == 0) {
1216 			os_event_reset(buf_pool->no_flush[flush_type]);
1217 		}
1218 
1219 		++buf_pool->n_flush[flush_type];
1220 
1221 		mutex_exit(block_mutex);
1222 
1223 		buf_pool_mutex_exit(buf_pool);
1224 
1225 		if (flush_type == BUF_FLUSH_LIST
1226 		    && is_uncompressed
1227 		    && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
1228 
1229 			if (!fsp_is_system_temporary(bpage->id.space())) {
1230 				/* To avoid a possible deadlock involving the
1231 				doublewrite buffer, flush it first, because it
1232 				might be holding another block->lock. */
1233 				buf_dblwr_flush_buffered_writes();
1234 			} else {
1235 				buf_dblwr_sync_datafiles();
1236 			}
1237 
1238 			rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
1239 		}
1240 
1241 		/* If there is an observer that wants to know when the
1242 		asynchronous flushing has been dispatched, notify it.
1243 		Note: we set the flush observer on a page with the x-latch
1244 		held, so we can guarantee that notify_flush and notify_remove
1245 		are called in pairs with the s-latch on an uncompressed page. */
1246 		if (bpage->flush_observer != NULL) {
1247 			buf_pool_mutex_enter(buf_pool);
1248 
1249 			bpage->flush_observer->notify_flush(buf_pool, bpage);
1250 
1251 			buf_pool_mutex_exit(buf_pool);
1252 		}
1253 
1254 		/* Even though bpage is not protected by any mutex at this
1255 		point, it is safe to access bpage, because it is io_fixed and
1256 		oldest_modification != 0.  Thus, it cannot be relocated in the
1257 		buffer pool or removed from flush_list or LRU_list. */
1258 
1259 		buf_flush_write_block_low(bpage, flush_type, sync);
1260 	}
1261 
1262 	return(flush);
1263 }
1264 
1265 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1266 /********************************************************************//**
1267 Writes a flushable page asynchronously from the buffer pool to a file.
1268 NOTE: buf_pool->mutex and block->mutex must be held upon entering this
1269 function, and they will be released by this function after flushing.
1270 This is loosely based on buf_flush_batch() and buf_flush_page().
1271 @return TRUE if the page was flushed and the mutexes released */
1272 ibool
1273 buf_flush_page_try(
1274 /*===============*/
1275 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
1276 	buf_block_t*	block)		/*!< in/out: buffer control block */
1277 {
1278 	ut_ad(buf_pool_mutex_own(buf_pool));
1279 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1280 	ut_ad(buf_page_mutex_own(block));
1281 
1282 	if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1283 		return(FALSE);
1284 	}
1285 
1286 	/* The following call will release the buffer pool and
1287 	block mutex. */
1288 	return(buf_flush_page(
1289 			buf_pool, &block->page,
1290 			BUF_FLUSH_SINGLE_PAGE, true));
1291 }
1292 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1293 
1294 /** Check whether the page is in the buffer pool and can be flushed.
1295 @param[in]	page_id		page id
1296 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
1297 @return true if the page can be flushed. */
1298 static
1299 bool
1300 buf_flush_check_neighbor(
1301 	const page_id_t&	page_id,
1302 	buf_flush_t		flush_type)
1303 {
1304 	buf_page_t*	bpage;
1305 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
1306 	bool		ret;
1307 
1308 	ut_ad(flush_type == BUF_FLUSH_LRU
1309 	      || flush_type == BUF_FLUSH_LIST);
1310 
1311 	buf_pool_mutex_enter(buf_pool);
1312 
1313 	/* We only want to flush pages from this buffer pool. */
1314 	bpage = buf_page_hash_get(buf_pool, page_id);
1315 
1316 	if (!bpage) {
1317 
1318 		buf_pool_mutex_exit(buf_pool);
1319 		return(false);
1320 	}
1321 
1322 	ut_a(buf_page_in_file(bpage));
1323 
1324 	/* We avoid flushing 'non-old' blocks in an LRU flush,
1325 	because the flushed blocks are soon freed */
1326 
1327 	ret = false;
1328 	if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1329 		BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1330 
1331 		mutex_enter(block_mutex);
1332 		if (buf_flush_ready_for_flush(bpage, flush_type)) {
1333 			ret = true;
1334 		}
1335 		mutex_exit(block_mutex);
1336 	}
1337 	buf_pool_mutex_exit(buf_pool);
1338 
1339 	return(ret);
1340 }
1341 
1342 /** Flushes to disk all flushable pages within the flush area.
1343 @param[in]	page_id		page id
1344 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
1345 @param[in]	n_flushed	number of pages flushed so far in this batch
1346 @param[in]	n_to_flush	maximum number of pages we are allowed to flush
1347 @return number of pages flushed */
1348 static
1349 ulint
1350 buf_flush_try_neighbors(
1351 	const page_id_t&	page_id,
1352 	buf_flush_t		flush_type,
1353 	ulint			n_flushed,
1354 	ulint			n_to_flush)
1355 {
1356 	ulint		i;
1357 	ulint		low;
1358 	ulint		high;
1359 	ulint		count = 0;
1360 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
1361 
1362 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1363 
1364 	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1365 	    || srv_flush_neighbors == 0) {
1366 		/* If there is little space or neighbor flushing is
1367 		not enabled then just flush the victim. */
1368 		low = page_id.page_no();
1369 		high = page_id.page_no() + 1;
1370 	} else {
1371 		/* When flushed, dirty blocks are searched in
1372 		neighborhoods of this size, and flushed along with the
1373 		original page. */
1374 
1375 		ulint	buf_flush_area;
1376 
1377 		buf_flush_area	= ut_min(
1378 			BUF_READ_AHEAD_AREA(buf_pool),
1379 			buf_pool->curr_size / 16);
1380 
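		/* 'low' and 'high' bound the buf_flush_area-aligned window
		that contains page_id; when srv_flush_neighbors == 1 this
		window is narrowed below to the contiguous dirty area. */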
1381 		low = (page_id.page_no() / buf_flush_area) * buf_flush_area;
1382 		high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area;
1383 
1384 		if (srv_flush_neighbors == 1) {
1385 			/* adjust 'low' and 'high' to limit
1386 			   for contiguous dirty area */
1387 			if (page_id.page_no() > low) {
1388 				for (i = page_id.page_no() - 1; i >= low; i--) {
1389 					if (!buf_flush_check_neighbor(
1390 						page_id_t(page_id.space(), i),
1391 						flush_type)) {
1392 
1393 						break;
1394 					}
1395 
1396 					if (i == low) {
1397 						/* Avoid wrap-around when
1398 						low == 0, which would call
1399 						buf_flush_check_neighbor() with
1400 						i == (ulint) -1 */
1401 						i--;
1402 						break;
1403 					}
1404 				}
1405 				low = i + 1;
1406 			}
1407 
1408 			for (i = page_id.page_no() + 1;
1409 			     i < high
1410 			     && buf_flush_check_neighbor(
1411 				     page_id_t(page_id.space(), i),
1412 				     flush_type);
1413 			     i++) {
1414 				/* do nothing */
1415 			}
1416 			high = i;
1417 		}
1418 	}
1419 
1420 	const ulint	space_size = fil_space_get_size(page_id.space());
1421 	if (high > space_size) {
1422 		high = space_size;
1423 	}
1424 
1425 	DBUG_PRINT("ib_buf", ("flush " UINT32PF ":%u..%u",
1426 			      page_id.space(),
1427 			      (unsigned) low, (unsigned) high));
1428 
1429 	for (ulint i = low; i < high; i++) {
1430 		buf_page_t*	bpage;
1431 
1432 		if ((count + n_flushed) >= n_to_flush) {
1433 
1434 			/* We have already flushed enough pages and
1435 			should call it a day. There is, however, one
1436 			exception. If the page whose neighbors we
1437 			are flushing has not been flushed yet then
1438 			we'll try to flush the victim that we
1439 			selected originally. */
1440 			if (i <= page_id.page_no()) {
1441 				i = page_id.page_no();
1442 			} else {
1443 				break;
1444 			}
1445 		}
1446 
1447 		const page_id_t	cur_page_id(page_id.space(), i);
1448 
1449 		buf_pool = buf_pool_get(cur_page_id);
1450 
1451 		buf_pool_mutex_enter(buf_pool);
1452 
1453 		/* We only want to flush pages from this buffer pool. */
1454 		bpage = buf_page_hash_get(buf_pool, cur_page_id);
1455 
1456 		if (bpage == NULL) {
1457 
1458 			buf_pool_mutex_exit(buf_pool);
1459 			continue;
1460 		}
1461 
1462 		ut_a(buf_page_in_file(bpage));
1463 
1464 		/* We avoid flushing 'non-old' blocks in an LRU flush,
1465 		because the flushed blocks are soon freed */
1466 
1467 		if (flush_type != BUF_FLUSH_LRU
1468 		    || i == page_id.page_no()
1469 		    || buf_page_is_old(bpage)) {
1470 
1471 			BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1472 
1473 			mutex_enter(block_mutex);
1474 
1475 			if (buf_flush_ready_for_flush(bpage, flush_type)
1476 			    && (i == page_id.page_no()
1477 				|| bpage->buf_fix_count == 0)) {
1478 
1479 				/* We also try to flush those
1480 				neighbors != offset */
1481 
1482 				if (buf_flush_page(
1483 					buf_pool, bpage, flush_type, false)) {
1484 
1485 					++count;
1486 				} else {
1487 					mutex_exit(block_mutex);
1488 					buf_pool_mutex_exit(buf_pool);
1489 				}
1490 
1491 				continue;
1492 			} else {
1493 				mutex_exit(block_mutex);
1494 			}
1495 		}
1496 		buf_pool_mutex_exit(buf_pool);
1497 	}
1498 
1499 	if (count > 1) {
1500 		MONITOR_INC_VALUE_CUMULATIVE(
1501 			MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1502 			MONITOR_FLUSH_NEIGHBOR_COUNT,
1503 			MONITOR_FLUSH_NEIGHBOR_PAGES,
1504 			(count - 1));
1505 	}
1506 
1507 	return(count);
1508 }
1509 
1510 /** Check if the block is modified and ready for flushing.
1511 If the block is ready to flush then flush the page and try to flush
1512 its neighbors.
1513 @param[in]	bpage		buffer control block,
1514 must be buf_page_in_file(bpage)
1515 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
1516 @param[in]	n_to_flush	number of pages to flush
1517 @param[in,out]	count		number of pages flushed
1518 @return TRUE if the buf_pool mutex was released during this function.
1519 This does not guarantee that any pages were written.
1520 The number of pages written is added to *count. */
1521 static
1522 bool
1523 buf_flush_page_and_try_neighbors(
1524 	buf_page_t*		bpage,
1525 	buf_flush_t		flush_type,
1526 	ulint			n_to_flush,
1527 	ulint*			count)
1528 {
1529 #ifdef UNIV_DEBUG
1530 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1531 
1532 	ut_ad(buf_pool_mutex_own(buf_pool));
1533 #endif /* UNIV_DEBUG */
1534 
1535 	bool		flushed;
1536 	BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
1537 
1538 	mutex_enter(block_mutex);
1539 
1540 	ut_a(buf_page_in_file(bpage));
1541 
1542 	if (buf_flush_ready_for_flush(bpage, flush_type)) {
1543 		buf_pool_t*	buf_pool;
1544 
1545 		buf_pool = buf_pool_from_bpage(bpage);
1546 
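		/* Remember the page id while still holding the block mutex;
		once the mutexes are released below, the neighbor flush looks
		the page up again by id instead of touching bpage. */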
1547 		const page_id_t	page_id = bpage->id;
1548 
1549 		mutex_exit(block_mutex);
1550 
1551 		buf_pool_mutex_exit(buf_pool);
1552 
1553 		/* Try to flush also all the neighbors */
1554 		*count += buf_flush_try_neighbors(
1555 			page_id, flush_type, *count, n_to_flush);
1556 
1557 		buf_pool_mutex_enter(buf_pool);
1558 		flushed = TRUE;
1559 	} else {
1560 		mutex_exit(block_mutex);
1561 
1562 		flushed = false;
1563 	}
1564 
1565 	ut_ad(buf_pool_mutex_own(buf_pool));
1566 
1567 	return(flushed);
1568 }
1569 
1570 /*******************************************************************//**
1571 This utility moves the uncompressed frames of pages to the free list.
1572 Note that this function does not actually flush any data to disk. It
1573 just detaches the uncompressed frames from the compressed pages at the
1574 tail of the unzip_LRU and puts those freed frames in the free list.
1575 Note that it is a best effort attempt and it is not guaranteed that
1576 after a call to this function there will be 'max' blocks in the free
1577 list.
1578 @return number of blocks moved to the free list. */
1579 static
1580 ulint
1581 buf_free_from_unzip_LRU_list_batch(
1582 /*===============================*/
1583 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1584 	ulint		max)		/*!< in: desired number of
1585 					blocks in the free_list */
1586 {
1587 	ulint		scanned = 0;
1588 	ulint		count = 0;
1589 	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
1590 	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1591 
1592 	ut_ad(buf_pool_mutex_own(buf_pool));
1593 
1594 	buf_block_t*	block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1595 
1596 	while (block != NULL
1597 	       && count < max
1598 	       && free_len < srv_LRU_scan_depth
1599 	       && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1600 
1601 		++scanned;
1602 		if (buf_LRU_free_page(&block->page, false)) {
1603 			/* Block was freed. buf_pool->mutex potentially
1604 			released and reacquired */
1605 			++count;
1606 			block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1607 
1608 		} else {
1609 
1610 			block = UT_LIST_GET_PREV(unzip_LRU, block);
1611 		}
1612 
1613 		free_len = UT_LIST_GET_LEN(buf_pool->free);
1614 		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1615 	}
1616 
1617 	ut_ad(buf_pool_mutex_own(buf_pool));
1618 
1619 	if (scanned) {
1620 		MONITOR_INC_VALUE_CUMULATIVE(
1621 			MONITOR_LRU_BATCH_SCANNED,
1622 			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1623 			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1624 			scanned);
1625 	}
1626 
1627 	return(count);
1628 }
1629 
1630 /*******************************************************************//**
1631 This utility flushes dirty blocks from the end of the LRU list.
1632 The calling thread is not allowed to own any latches on pages!
1633 It attempts to make 'max' blocks available in the free list. Note that
1634 it is a best effort attempt and it is not guaranteed that after a call
1635 to this function there will be 'max' blocks in the free list.
1636 @return number of blocks for which the write request was queued. */
1637 static
1638 ulint
1639 buf_flush_LRU_list_batch(
1640 /*=====================*/
1641 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1642 	ulint		max)		/*!< in: desired number of
1643 					blocks in the free_list */
1644 {
1645 	buf_page_t*	bpage;
1646 	ulint		scanned = 0;
1647 	ulint		evict_count = 0;
1648 	ulint		count = 0;
1649 	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
1650 	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1651 	ulint		withdraw_depth = 0;
1652 
1653 	ut_ad(buf_pool_mutex_own(buf_pool));
1654 
1655 	if (buf_pool->curr_size < buf_pool->old_size
1656 	    && buf_pool->withdraw_target > 0) {
1657 		withdraw_depth = buf_pool->withdraw_target
1658 				 - UT_LIST_GET_LEN(buf_pool->withdraw);
1659 	}
1660 
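	/* Note: as in the flush_list scan further below, this loop keeps a
	hazard pointer (lru_hp) to the previous block, because
	buf_LRU_free_page() and buf_flush_page_and_try_neighbors() may
	release and reacquire the buffer pool mutex, during which another
	thread could remove the saved block from the LRU list. */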
1661 	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1662 	     bpage != NULL && count + evict_count < max
1663 	     && free_len < srv_LRU_scan_depth + withdraw_depth
1664 	     && lru_len > BUF_LRU_MIN_LEN;
1665 	     ++scanned,
1666 	     bpage = buf_pool->lru_hp.get()) {
1667 
1668 		buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
1669 		buf_pool->lru_hp.set(prev);
1670 
1671 		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
1672 
1673 		mutex_enter(block_mutex);
1674 
1675 		if (buf_flush_ready_for_replace(bpage)) {
1676 			/* block is ready for eviction i.e., it is
1677 			clean and is not IO-fixed or buffer fixed. */
1678 			mutex_exit(block_mutex);
1679 			if (buf_LRU_free_page(bpage, true)) {
1680 				++evict_count;
1681 			}
1682 		} else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) {
1683 			/* Block is ready for flush. Dispatch an IO
1684 			request. The IO helper thread will put it on
1685 			free list in IO completion routine. */
1686 			mutex_exit(block_mutex);
1687 			buf_flush_page_and_try_neighbors(
1688 				bpage, BUF_FLUSH_LRU, max, &count);
1689 		} else {
1690 			/* Can't evict or dispatch this block. Go to
1691 			previous. */
1692 			ut_ad(buf_pool->lru_hp.is_hp(prev));
1693 			mutex_exit(block_mutex);
1694 		}
1695 
1696 		ut_ad(!mutex_own(block_mutex));
1697 		ut_ad(buf_pool_mutex_own(buf_pool));
1698 
1699 		free_len = UT_LIST_GET_LEN(buf_pool->free);
1700 		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1701 	}
1702 
1703 	buf_pool->lru_hp.set(NULL);
1704 
1705 	/* We keep track of all flushes happening as part of LRU
1706 	flush. When estimating the desired rate at which flush_list
1707 	should be flushed, we factor in this value. */
1708 	buf_lru_flush_page_count += count;
1709 
1710 	ut_ad(buf_pool_mutex_own(buf_pool));
1711 
1712 	if (evict_count) {
1713 		MONITOR_INC_VALUE_CUMULATIVE(
1714 			MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
1715 			MONITOR_LRU_BATCH_EVICT_COUNT,
1716 			MONITOR_LRU_BATCH_EVICT_PAGES,
1717 			evict_count);
1718 	}
1719 
1720 	if (scanned) {
1721 		MONITOR_INC_VALUE_CUMULATIVE(
1722 			MONITOR_LRU_BATCH_SCANNED,
1723 			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1724 			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1725 			scanned);
1726 	}
1727 
1728 	return(count);
1729 }
1730 
1731 /*******************************************************************//**
1732 Flush and move pages from LRU or unzip_LRU list to the free list.
1733 Whether LRU or unzip_LRU is used depends on the state of the system.
1734 @return number of blocks for which either the write request was queued
1735 or in case of unzip_LRU the number of blocks actually moved to the
1736 free list */
1737 static
1738 ulint
1739 buf_do_LRU_batch(
1740 /*=============*/
1741 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1742 	ulint		max)		/*!< in: desired number of
1743 					blocks in the free_list */
1744 {
1745 	ulint	count = 0;
1746 
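	/* First try to detach uncompressed frames from the tail of the
	unzip_LRU; this only moves frames to the free list and requires no
	I/O. Any remaining quota is then used for flushing or evicting from
	the regular LRU list. */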
1747 	if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1748 		count += buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1749 	}
1750 
1751 	if (max > count) {
1752 		count += buf_flush_LRU_list_batch(buf_pool, max - count);
1753 	}
1754 
1755 	return(count);
1756 }
1757 
1758 /** This utility flushes dirty blocks from the end of the flush_list.
1759 The calling thread is not allowed to own any latches on pages!
1760 @param[in]	buf_pool	buffer pool instance
1761 @param[in]	min_n		wished minimum number of blocks flushed (it is
1762 not guaranteed that the actual number is that big, though)
1763 @param[in]	lsn_limit	all blocks whose oldest_modification is smaller
1764 than this should be flushed (if their number does not exceed min_n)
1765 @return number of blocks for which the write request was queued;
1766 ULINT_UNDEFINED if there was a flush of the same type already
1767 running */
1768 static
1769 ulint
1770 buf_do_flush_list_batch(
1771 	buf_pool_t*		buf_pool,
1772 	ulint			min_n,
1773 	lsn_t			lsn_limit)
1774 {
1775 	ulint		count = 0;
1776 	ulint		scanned = 0;
1777 
1778 	ut_ad(buf_pool_mutex_own(buf_pool));
1779 
1780 	/* Start from the end of the list looking for a suitable
1781 	block to be flushed. */
1782 	buf_flush_list_mutex_enter(buf_pool);
1783 	ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1784 
1785 	/* In order not to degenerate this scan into O(n*n) we attempt
1786 	to preserve the pointer to the previous block in the flush list.
1787 	To do so we declare it a hazard pointer. Any thread working on the
1788 	flush list must check the hazard pointer and, if it is removing
1789 	the same block, must reset it. */
1790 	for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1791 	     count < min_n && bpage != NULL && len > 0
1792 	     && bpage->oldest_modification < lsn_limit;
1793 	     bpage = buf_pool->flush_hp.get(),
1794 	     ++scanned) {
1795 
1796 		buf_page_t*	prev;
1797 
1798 		ut_a(bpage->oldest_modification > 0);
1799 		ut_ad(bpage->in_flush_list);
1800 
1801 		prev = UT_LIST_GET_PREV(list, bpage);
1802 		buf_pool->flush_hp.set(prev);
1803 		buf_flush_list_mutex_exit(buf_pool);
1804 
1805 #ifdef UNIV_DEBUG
1806 		bool flushed =
1807 #endif /* UNIV_DEBUG */
1808 		buf_flush_page_and_try_neighbors(
1809 			bpage, BUF_FLUSH_LIST, min_n, &count);
1810 
1811 		buf_flush_list_mutex_enter(buf_pool);
1812 
1813 		ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
1814 
1815 		--len;
1816 	}
1817 
1818 	buf_pool->flush_hp.set(NULL);
1819 	buf_flush_list_mutex_exit(buf_pool);
1820 
1821 	if (scanned) {
1822 		MONITOR_INC_VALUE_CUMULATIVE(
1823 			MONITOR_FLUSH_BATCH_SCANNED,
1824 			MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1825 			MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1826 			scanned);
1827 	}
1828 
1829 	if (count) {
1830 		MONITOR_INC_VALUE_CUMULATIVE(
1831 			MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1832 			MONITOR_FLUSH_BATCH_COUNT,
1833 			MONITOR_FLUSH_BATCH_PAGES,
1834 			count);
1835 	}
1836 
1837 	ut_ad(buf_pool_mutex_own(buf_pool));
1838 
1839 	return(count);
1840 }
1841 
1842 /** This utility flushes dirty blocks from the end of the LRU list or
1843 flush_list.
1844 NOTE 1: in the case of an LRU flush the calling thread may own latches to
1845 pages: to avoid deadlocks, this function must be written so that it cannot
1846 end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1847 the calling thread is not allowed to own any latches on pages!
1848 @param[in]	buf_pool	buffer pool instance
1849 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
1850 BUF_FLUSH_LIST, then the caller must not own any latches on pages
1851 @param[in]	min_n		wished minimum number of blocks flushed (it is
1852 not guaranteed that the actual number is that big, though)
1853 @param[in]	lsn_limit	in the case of BUF_FLUSH_LIST all blocks whose
1854 oldest_modification is smaller than this should be flushed (if their number
1855 does not exceed min_n), otherwise ignored
1856 @return number of blocks for which the write request was queued */
1857 static
1858 ulint
1859 buf_flush_batch(
1860 	buf_pool_t*		buf_pool,
1861 	buf_flush_t		flush_type,
1862 	ulint			min_n,
1863 	lsn_t			lsn_limit)
1864 {
1865 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1866 
1867 #ifdef UNIV_DEBUG
1868 	{
1869 		dict_sync_check	check(true);
1870 
1871 		ut_ad(flush_type != BUF_FLUSH_LIST
1872 		      || !sync_check_iterate(check));
1873 	}
1874 #endif /* UNIV_DEBUG */
1875 
1876 	buf_pool_mutex_enter(buf_pool);
1877 
1878 	ulint	count = 0;
1879 
1880 	/* Note: The buffer pool mutex is released and reacquired within
1881 	the flush functions. */
1882 	switch (flush_type) {
1883 	case BUF_FLUSH_LRU:
1884 		count = buf_do_LRU_batch(buf_pool, min_n);
1885 		break;
1886 	case BUF_FLUSH_LIST:
1887 		count = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
1888 		break;
1889 	default:
1890 		ut_error;
1891 	}
1892 
1893 	buf_pool_mutex_exit(buf_pool);
1894 
1895 	DBUG_PRINT("ib_buf", ("flush %u completed, %u pages",
1896 			      unsigned(flush_type), unsigned(count)));
1897 
1898 	return(count);
1899 }
1900 
1901 /******************************************************************//**
1902 Gather the aggregated stats for both flush list and LRU list flushing.
1903 @param page_count_flush	number of pages flushed from the end of the flush_list
1904 @param page_count_LRU	number of pages flushed from the end of the LRU list
1905 */
1906 static
1907 void
1908 buf_flush_stats(
1909 /*============*/
1910 	ulint		page_count_flush,
1911 	ulint		page_count_LRU)
1912 {
1913 	DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
1914 			      "from LRU_list %u pages",
1915 			      unsigned(page_count_flush),
1916 			      unsigned(page_count_LRU)));
1917 
1918 	srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
1919 }
1920 
1921 /******************************************************************//**
1922 Start a buffer flush batch for LRU or flush list */
1923 static
1924 ibool
1925 buf_flush_start(
1926 /*============*/
1927 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1928 	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
1929 					or BUF_FLUSH_LIST */
1930 {
1931 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1932 
1933 	buf_pool_mutex_enter(buf_pool);
1934 
1935 	if (buf_pool->n_flush[flush_type] > 0
1936 	   || buf_pool->init_flush[flush_type] == TRUE) {
1937 
1938 		/* There is already a flush batch of the same type running */
1939 
1940 		buf_pool_mutex_exit(buf_pool);
1941 
1942 		return(FALSE);
1943 	}
1944 
1945 	buf_pool->init_flush[flush_type] = TRUE;
1946 
1947 	os_event_reset(buf_pool->no_flush[flush_type]);
1948 
1949 	buf_pool_mutex_exit(buf_pool);
1950 
1951 	return(TRUE);
1952 }
1953 
1954 /******************************************************************//**
1955 End a buffer flush batch for LRU or flush list */
1956 static
1957 void
1958 buf_flush_end(
1959 /*==========*/
1960 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1961 	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
1962 					or BUF_FLUSH_LIST */
1963 {
1964 	buf_pool_mutex_enter(buf_pool);
1965 
1966 	buf_pool->init_flush[flush_type] = FALSE;
1967 
1968 	buf_pool->try_LRU_scan = TRUE;
1969 
1970 	if (buf_pool->n_flush[flush_type] == 0) {
1971 
1972 		/* The running flush batch has ended */
1973 
1974 		os_event_set(buf_pool->no_flush[flush_type]);
1975 	}
1976 
1977 	buf_pool_mutex_exit(buf_pool);
1978 
1979 	if (!srv_read_only_mode) {
1980 		buf_dblwr_flush_buffered_writes();
1981 	} else {
1982 		os_aio_simulated_wake_handler_threads();
1983 	}
1984 }
1985 
1986 /******************************************************************//**
1987 Waits until a flush batch of the given type ends */
1988 void
1989 buf_flush_wait_batch_end(
1990 /*=====================*/
1991 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1992 	buf_flush_t	type)		/*!< in: BUF_FLUSH_LRU
1993 					or BUF_FLUSH_LIST */
1994 {
1995 	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1996 
1997 	if (buf_pool == NULL) {
1998 		ulint	i;
1999 
2000 		for (i = 0; i < srv_buf_pool_instances; ++i) {
2001 			buf_pool_t*	buf_pool;
2002 
2003 			buf_pool = buf_pool_from_array(i);
2004 
2005 			thd_wait_begin(NULL, THD_WAIT_DISKIO);
2006 			os_event_wait(buf_pool->no_flush[type]);
2007 			thd_wait_end(NULL);
2008 		}
2009 	} else {
2010 		thd_wait_begin(NULL, THD_WAIT_DISKIO);
2011 		os_event_wait(buf_pool->no_flush[type]);
2012 		thd_wait_end(NULL);
2013 	}
2014 }
2015 
2016 /** Do flushing batch of a given type.
2017 NOTE: The calling thread is not allowed to own any latches on pages!
2018 @param[in,out]	buf_pool	buffer pool instance
2019 @param[in]	type		flush type
2020 @param[in]	min_n		wished minimum number of blocks flushed
2021 (it is not guaranteed that the actual number is that big, though)
2022 @param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
2023 oldest_modification is smaller than this should be flushed (if their number
2024 does not exceed min_n), otherwise ignored
2025 @param[out]	n_processed	the number of pages which were processed is
2026 passed back to caller. Ignored if NULL
2027 @retval true	if a batch was queued successfully.
2028 @retval false	if another batch of the same type was already running. */
2029 bool
2030 buf_flush_do_batch(
2031 	buf_pool_t*		buf_pool,
2032 	buf_flush_t		type,
2033 	ulint			min_n,
2034 	lsn_t			lsn_limit,
2035 	ulint*			n_processed)
2036 {
2037 	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
2038 
2039 	if (n_processed != NULL) {
2040 		*n_processed = 0;
2041 	}
2042 
2043 	if (!buf_flush_start(buf_pool, type)) {
2044 		return(false);
2045 	}
2046 
2047 	ulint	page_count = buf_flush_batch(buf_pool, type, min_n, lsn_limit);
2048 
2049 	buf_flush_end(buf_pool, type);
2050 
2051 	if (n_processed != NULL) {
2052 		*n_processed = page_count;
2053 	}
2054 
2055 	return(true);
2056 }
2057 
2058 /**
2059 Waits until dirty pages whose oldest_modification is below the given lsn have been flushed
2060 @param[in]	new_oldest	target oldest_modified_lsn to wait for */
2061 
2062 void
2063 buf_flush_wait_flushed(
2064 	lsn_t		new_oldest)
2065 {
2066 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2067 		buf_pool_t*	buf_pool;
2068 		lsn_t		oldest;
2069 
2070 		buf_pool = buf_pool_from_array(i);
2071 
2072 		for (;;) {
2073 			/* We don't need to wait for fsync of the flushed
2074 			blocks, because we need an fsync to make a checkpoint
2075 			anyway. So, we don't need to wait for the batch end here. */
2076 
2077 			buf_flush_list_mutex_enter(buf_pool);
2078 
2079 			buf_page_t*	bpage;
2080 
2081 			/* We don't need to wait for system temporary pages */
2082 			for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
2083 			     bpage != NULL
2084 				&& fsp_is_system_temporary(bpage->id.space());
2085 			     bpage = UT_LIST_GET_PREV(list, bpage)) {
2086 				/* Do nothing. */
2087 			}
2088 
2089 			if (bpage != NULL) {
2090 				ut_ad(bpage->in_flush_list);
2091 				oldest = bpage->oldest_modification;
2092 			} else {
2093 				oldest = 0;
2094 			}
2095 
2096 			buf_flush_list_mutex_exit(buf_pool);
2097 
2098 			if (oldest == 0 || oldest >= new_oldest) {
2099 				break;
2100 			}
2101 
2102 			/* sleep and retry */
2103 			os_thread_sleep(buf_flush_wait_flushed_sleep_time);
2104 
2105 			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
2106 		}
2107 	}
2108 }
2109 
2110 /** This utility flushes dirty blocks from the end of the flush list of all
2111 buffer pool instances.
2112 NOTE: The calling thread is not allowed to own any latches on pages!
2113 @param[in]	min_n		wished minimum number of blocks flushed (it is
2114 not guaranteed that the actual number is that big, though)
2115 @param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
2116 oldest_modification is smaller than this should be flushed (if their number
2117 does not exceed min_n), otherwise ignored
2118 @param[out]	n_processed	the number of pages which were processed is
2119 passed back to caller. Ignored if NULL.
2120 @return true if a batch was queued successfully for each buffer pool
2121 instance. false if another batch of the same type was already running in
2122 at least one of the buffer pool instances */
2123 bool
2124 buf_flush_lists(
2125 	ulint			min_n,
2126 	lsn_t			lsn_limit,
2127 	ulint*			n_processed)
2128 {
2129 	ulint		i;
2130 	ulint		n_flushed = 0;
2131 	bool		success = true;
2132 
2133 	if (n_processed) {
2134 		*n_processed = 0;
2135 	}
2136 
2137 	if (min_n != ULINT_MAX) {
2138 		/* Ensure that flushing is spread evenly amongst the
2139 		buffer pool instances. When min_n is ULINT_MAX
2140 		we need to flush everything up to the lsn limit
2141 		so no limit here. */
2142 		min_n = (min_n + srv_buf_pool_instances - 1)
2143 			 / srv_buf_pool_instances;
2144 	}
2145 
2146 	/* Flush to lsn_limit in all buffer pool instances */
2147 	for (i = 0; i < srv_buf_pool_instances; i++) {
2148 		buf_pool_t*	buf_pool;
2149 		ulint		page_count = 0;
2150 
2151 		buf_pool = buf_pool_from_array(i);
2152 
2153 		if (!buf_flush_do_batch(buf_pool,
2154 					BUF_FLUSH_LIST,
2155 					min_n,
2156 					lsn_limit,
2157 					&page_count)) {
2158 			/* We have two choices here. If lsn_limit was
2159 			specified then skipping an instance of buffer
2160 			pool means we cannot guarantee that all pages
2161 			up to lsn_limit have been flushed. We can
2162 			return right now with failure or we can try
2163 			to flush remaining buffer pools up to the
2164 			lsn_limit. We attempt to flush other buffer
2165 			pools based on the assumption that it will
2166 			help in the retry which will follow the
2167 			failure. */
2168 			success = false;
2169 
2170 			continue;
2171 		}
2172 
2173 		n_flushed += page_count;
2174 	}
2175 
2176 	if (n_flushed) {
2177 		buf_flush_stats(n_flushed, 0);
2178 	}
2179 
2180 	if (n_processed) {
2181 		*n_processed = n_flushed;
2182 	}
2183 
2184 	return(success);
2185 }
2186 
2187 /******************************************************************//**
2188 This function picks up a single page from the tail of the LRU
2189 list, flushes it (if it is dirty), removes it from page_hash and LRU
2190 list and puts it on the free list. It is called from user threads when
2191 they are unable to find a replaceable page at the tail of the LRU
2192 list, i.e. when the background LRU flushing in the page_cleaner thread
2193 is not fast enough to keep pace with the workload.
2194 @return true if success. */
2195 bool
2196 buf_flush_single_page_from_LRU(
2197 /*===========================*/
2198 	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
2199 {
2200 	ulint		scanned;
2201 	buf_page_t*	bpage;
2202 	ibool		freed;
2203 
2204 	buf_pool_mutex_enter(buf_pool);
2205 
2206 	for (bpage = buf_pool->single_scan_itr.start(), scanned = 0,
2207 	     freed = false;
2208 	     bpage != NULL;
2209 	     ++scanned, bpage = buf_pool->single_scan_itr.get()) {
2210 
2211 		ut_ad(buf_pool_mutex_own(buf_pool));
2212 
2213 		buf_page_t*	prev = UT_LIST_GET_PREV(LRU, bpage);
2214 
2215 		buf_pool->single_scan_itr.set(prev);
2216 
2217 		BPageMutex*	block_mutex;
2218 
2219 		block_mutex = buf_page_get_mutex(bpage);
2220 
2221 		mutex_enter(block_mutex);
2222 
2223 		if (buf_flush_ready_for_replace(bpage)) {
2224 			/* block is ready for eviction i.e., it is
2225 			clean and is not IO-fixed or buffer fixed. */
2226 			mutex_exit(block_mutex);
2227 
2228 			if (buf_LRU_free_page(bpage, true)) {
2229 				buf_pool_mutex_exit(buf_pool);
2230 				freed = true;
2231 				break;
2232 			}
2233 
2234 		} else if (buf_flush_ready_for_flush(
2235 				   bpage, BUF_FLUSH_SINGLE_PAGE)) {
2236 
2237 			/* Block is ready for flush. Try and dispatch an IO
2238 			request. We'll put it on free list in IO completion
2239 			routine if it is not buffer fixed. The following call
2240 			will release the buffer pool and block mutex.
2241 
2242 			Note: There is no guarantee that this page has actually
2243 			been freed, only that it has been flushed to disk */
2244 
2245 			freed = buf_flush_page(
2246 				buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
2247 
2248 			if (freed) {
2249 				break;
2250 			}
2251 
2252 			mutex_exit(block_mutex);
2253 		} else {
2254 			mutex_exit(block_mutex);
2255 		}
2256 
2257 		ut_ad(!mutex_own(block_mutex));
2258 	}
2259 
2260 	if (!freed) {
2261 		/* Can't find a single flushable page. */
2262 		ut_ad(!bpage);
2263 		buf_pool_mutex_exit(buf_pool);
2264 	}
2265 
2266 	if (scanned) {
2267 		MONITOR_INC_VALUE_CUMULATIVE(
2268 			MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2269 			MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2270 			MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2271 			scanned);
2272 	}
2273 
2274 	ut_ad(!buf_pool_mutex_own(buf_pool));
2275 
2276 	return(freed);
2277 }
2278 
2279 /**
2280 Clears up tail of the LRU list of a given buffer pool instance:
2281 * Put replaceable pages at the tail of LRU to the free list
2282 * Flush dirty pages at the tail of LRU to the disk
2283 The depth to which we scan each buffer pool is controlled by dynamic
2284 config parameter innodb_LRU_scan_depth.
2285 @param buf_pool buffer pool instance
2286 @return total pages flushed */
2287 static
2288 ulint
2289 buf_flush_LRU_list(
2290 	buf_pool_t*	buf_pool)
2291 {
2292 	ulint	scan_depth, withdraw_depth;
2293 	ulint	n_flushed = 0;
2294 
2295 	ut_ad(buf_pool);
2296 
2297 	/* srv_LRU_scan_depth can be an arbitrarily large value.
2298 	We cap it with the current LRU size. */
2299 	buf_pool_mutex_enter(buf_pool);
2300 	scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2301 	if (buf_pool->curr_size < buf_pool->old_size
2302 	    && buf_pool->withdraw_target > 0) {
2303 		withdraw_depth = buf_pool->withdraw_target
2304 				 - UT_LIST_GET_LEN(buf_pool->withdraw);
2305 	} else {
2306 		withdraw_depth = 0;
2307 	}
2308 	buf_pool_mutex_exit(buf_pool);
2309 
2310 	if (withdraw_depth > srv_LRU_scan_depth) {
2311 		scan_depth = ut_min(withdraw_depth, scan_depth);
2312 	} else {
2313 		scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
2314 				    scan_depth);
2315 	}
2316 
2317 	/* Currently only one of the page_cleaner threads can
2318 	trigger an LRU flush for this instance at a time.
2319 	So, it is not possible that a batch triggered during
2320 	the last iteration is still running. */
2321 	buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth,
2322 			   0, &n_flushed);
2323 
2324 	return(n_flushed);
2325 }
2326 
2327 /*********************************************************************//**
2328 Clears up tail of the LRU lists:
2329 * Put replaceable pages at the tail of LRU to the free list
2330 * Flush dirty pages at the tail of LRU to the disk
2331 The depth to which we scan each buffer pool is controlled by dynamic
2332 config parameter innodb_LRU_scan_depth.
2333 @return total pages flushed */
2334 ulint
2335 buf_flush_LRU_lists(void)
2336 /*=====================*/
2337 {
2338 	ulint	n_flushed = 0;
2339 
2340 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2341 
2342 		n_flushed += buf_flush_LRU_list(buf_pool_from_array(i));
2343 	}
2344 
2345 	if (n_flushed) {
2346 		buf_flush_stats(0, n_flushed);
2347 	}
2348 
2349 	return(n_flushed);
2350 }
2351 
2352 /*********************************************************************//**
2353 Wait for any possible LRU flushes that are in progress to end. */
2354 void
2355 buf_flush_wait_LRU_batch_end(void)
2356 /*==============================*/
2357 {
2358 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2359 		buf_pool_t*	buf_pool;
2360 
2361 		buf_pool = buf_pool_from_array(i);
2362 
2363 		buf_pool_mutex_enter(buf_pool);
2364 
2365 		if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2366 		   || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2367 
2368 			buf_pool_mutex_exit(buf_pool);
2369 			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2370 		} else {
2371 			buf_pool_mutex_exit(buf_pool);
2372 		}
2373 	}
2374 }
2375 
2376 /*********************************************************************//**
2377 Calculates if flushing is required based on number of dirty pages in
2378 the buffer pool.
2379 @return percent of io_capacity to flush to manage dirty page ratio */
2380 static
2381 ulint
2382 af_get_pct_for_dirty()
2383 /*==================*/
2384 {
2385 	double	dirty_pct = buf_get_modified_ratio_pct();
2386 
2387 	if (dirty_pct == 0.0) {
2388 		/* No pages modified */
2389 		return(0);
2390 	}
2391 
2392 	ut_a(srv_max_dirty_pages_pct_lwm
2393 	     <= srv_max_buf_pool_modified_pct);
2394 
2395 	if (srv_max_dirty_pages_pct_lwm == 0) {
2396 		/* The user has not set the option to preflush dirty
2397 		pages as we approach the high water mark. */
2398 		if (dirty_pct >= srv_max_buf_pool_modified_pct) {
2399 			/* We have crossed the high water mark of dirty
2400 			pages. In this case we start flushing at 100% of
2401 			innodb_io_capacity. */
2402 			return(100);
2403 		}
2404 	} else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
2405 		/* We should start flushing pages gradually. */
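		/* The result scales linearly with how close dirty_pct is
		to srv_max_buf_pool_modified_pct; the "+ 1" in the divisor
		keeps the expression well defined even if the maximum is
		configured as 0. */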
2406 		return(static_cast<ulint>((dirty_pct * 100)
2407 		       / (srv_max_buf_pool_modified_pct + 1)));
2408 	}
2409 
2410 	return(0);
2411 }
2412 
2413 /*********************************************************************//**
2414 Calculates if flushing is required based on redo generation rate.
2415 @return percent of io_capacity to flush to manage redo space */
2416 static
2417 ulint
2418 af_get_pct_for_lsn(
2419 /*===============*/
2420 	lsn_t	age)	/*!< in: current age of LSN. */
2421 {
2422 	lsn_t	max_async_age;
2423 	lsn_t	lsn_age_factor;
2424 	lsn_t	af_lwm = (srv_adaptive_flushing_lwm
2425 			  * log_get_capacity()) / 100;
2426 
2427 	if (age < af_lwm) {
2428 		/* No adaptive flushing. */
2429 		return(0);
2430 	}
2431 
2432 	max_async_age = log_get_max_modified_age_async();
2433 
2434 	if (age < max_async_age && !srv_adaptive_flushing) {
2435 		/* We have still not reached the max_async point and
2436 		the user has disabled adaptive flushing. */
2437 		return(0);
2438 	}
2439 
2440 	/* If we are here then we know that either:
2441 	1) User has enabled adaptive flushing
2442 	2) User may have disabled adaptive flushing but we have reached
2443 	max_async_age. */
2444 	lsn_age_factor = (age * 100) / max_async_age;
2445 
2446 	ut_ad(srv_max_io_capacity >= srv_io_capacity);
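	/* The returned percentage grows roughly as lsn_age_factor^1.5,
	scaled by the head-room that srv_max_io_capacity provides over
	srv_io_capacity; the divisor 7.5 appears to be an empirically
	chosen damping constant. */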
2447 	return(static_cast<ulint>(
2448 		((srv_max_io_capacity / srv_io_capacity)
2449 		* (lsn_age_factor * sqrt((double)lsn_age_factor)))
2450 		/ 7.5));
2451 }
2452 
2453 /*********************************************************************//**
2454 This function is called approximately once every second by the
2455 page_cleaner thread. Based on various factors it decides if there is a
2456 need to do flushing.
2457 @return number of pages recommended to be flushed
2458 @param lsn_limit	pointer to return LSN up to which flushing must happen
2459 @param last_pages_in	the number of pages flushed by the last flush_list
2460 			flushing. */
2461 static
2462 ulint
2463 page_cleaner_flush_pages_recommendation(
2464 /*====================================*/
2465 	lsn_t*	lsn_limit,
2466 	ulint	last_pages_in)
2467 {
2468 	static	lsn_t		prev_lsn = 0;
2469 	static	ulint		sum_pages = 0;
2470 	static	ulint		avg_page_rate = 0;
2471 	static	ulint		n_iterations = 0;
2472 	static	ib_time_monotonic_t		prev_time;
2473 	lsn_t			oldest_lsn;
2474 	lsn_t			cur_lsn;
2475 	lsn_t			age;
2476 	lsn_t			lsn_rate;
2477 	ulint			n_pages = 0;
2478 	ulint			pct_for_dirty = 0;
2479 	ulint			pct_for_lsn = 0;
2480 	ulint			pct_total = 0;
2481 
2482 	cur_lsn = log_get_lsn();
2483 
2484 	if (prev_lsn == 0) {
2485 		/* First time around. */
2486 		prev_lsn = cur_lsn;
2487 		prev_time = ut_time_monotonic();
2488 		return(0);
2489 	}
2490 
2491 	if (prev_lsn == cur_lsn) {
2492 		return(0);
2493 	}
2494 
2495 	sum_pages += last_pages_in;
2496 
2497 	ib_time_monotonic_t	curr_time    = ut_time_monotonic();
2498 	uint64_t	        time_elapsed = curr_time - prev_time;
2499 	const ulong             avg_loop     = srv_flushing_avg_loops;
2500 
2501 	/* We update our variables every srv_flushing_avg_loops
2502 	iterations to smooth out transition in workload. */
2503 	if (++n_iterations >= avg_loop
2504 	    || time_elapsed >= (uint64_t)avg_loop) {
2505 
2506 		if (time_elapsed < 1) {
2507 			time_elapsed = 1;
2508 		}
2509 
2510 		avg_page_rate = static_cast<ulint>(
2511 			((static_cast<double>(sum_pages)
2512 			  / time_elapsed)
2513 			 + avg_page_rate) / 2);
2514 
2515 		/* How much LSN we have generated since last call. */
2516 		lsn_rate = static_cast<lsn_t>(
2517 			static_cast<double>(cur_lsn - prev_lsn)
2518 			/ time_elapsed);
2519 
2520 		lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2521 
2522 
2523 		/* aggregate stats of all slots */
2524 		mutex_enter(&page_cleaner->mutex);
2525 
2526 		uint64_t  flush_tm = page_cleaner->flush_time;
2527 		ulint	flush_pass = page_cleaner->flush_pass;
2528 
2529 		page_cleaner->flush_time = 0;
2530 		page_cleaner->flush_pass = 0;
2531 
2532 		uint64_t lru_tm = 0;
2533 		uint64_t list_tm = 0;
2534 		ulint	lru_pass = 0;
2535 		ulint	list_pass = 0;
2536 
2537 		for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2538 			page_cleaner_slot_t*	slot;
2539 
2540 			slot = &page_cleaner->slots[i];
2541 
2542 			lru_tm    += slot->flush_lru_time;
2543 			lru_pass  += slot->flush_lru_pass;
2544 			list_tm   += slot->flush_list_time;
2545 			list_pass += slot->flush_list_pass;
2546 
2547 			slot->flush_lru_time  = 0;
2548 			slot->flush_lru_pass  = 0;
2549 			slot->flush_list_time = 0;
2550 			slot->flush_list_pass = 0;
2551 		}
2552 
2553 		mutex_exit(&page_cleaner->mutex);
2554 
2555 		/* minimum values are 1, to avoid dividing by zero. */
2556 		if (lru_tm < 1) {
2557 			lru_tm = 1;
2558 		}
2559 		if (list_tm < 1) {
2560 			list_tm = 1;
2561 		}
2562 		if (flush_tm < 1) {
2563 			flush_tm = 1;
2564 		}
2565 
2566 		if (lru_pass < 1) {
2567 			lru_pass = 1;
2568 		}
2569 		if (list_pass < 1) {
2570 			list_pass = 1;
2571 		}
2572 		if (flush_pass < 1) {
2573 			flush_pass = 1;
2574 		}
2575 
2576 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
2577 			    list_tm / list_pass);
2578 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
2579 			    lru_tm  / lru_pass);
2580 
2581 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
2582 			    list_tm / (srv_n_page_cleaners * flush_pass));
2583 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
2584 			    lru_tm / (srv_n_page_cleaners * flush_pass));
2585 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
2586 			    flush_tm * list_tm / flush_pass
2587 			    / (list_tm + lru_tm));
2588 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
2589 			    flush_tm * lru_tm / flush_pass
2590 			    / (list_tm + lru_tm));
2591 		MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
2592 
2593 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
2594 			    list_pass / page_cleaner->n_slots);
2595 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
2596 			    lru_pass / page_cleaner->n_slots);
2597 		MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
2598 
2599 		prev_lsn = cur_lsn;
2600 		prev_time = curr_time;
2601 
2602 		n_iterations = 0;
2603 
2604 		sum_pages = 0;
2605 	}
2606 
2607 	oldest_lsn = buf_pool_get_oldest_modification();
2608 
2609 	ut_ad(oldest_lsn <= log_get_lsn());
2610 
2611 	age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2612 
2613 	pct_for_dirty = af_get_pct_for_dirty();
2614 	pct_for_lsn = af_get_pct_for_lsn(age);
2615 
2616 	pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2617 
2618 	/* Estimate pages to be flushed for the lsn progress */
2619 	ulint	sum_pages_for_lsn = 0;
2620 	lsn_t	target_lsn = oldest_lsn
2621 			     + lsn_avg_rate * buf_flush_lsn_scan_factor;
2622 
2623 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2624 		buf_pool_t*	buf_pool = buf_pool_from_array(i);
2625 		ulint		pages_for_lsn = 0;
2626 
2627 		buf_flush_list_mutex_enter(buf_pool);
2628 		for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list);
2629 		     b != NULL;
2630 		     b = UT_LIST_GET_PREV(list, b)) {
2631 			if (b->oldest_modification > target_lsn) {
2632 				break;
2633 			}
2634 			++pages_for_lsn;
2635 		}
2636 		buf_flush_list_mutex_exit(buf_pool);
2637 
2638 		sum_pages_for_lsn += pages_for_lsn;
2639 
2640 		mutex_enter(&page_cleaner->mutex);
2641 		ut_ad(page_cleaner->slots[i].state
2642 		      == PAGE_CLEANER_STATE_NONE);
2643 		page_cleaner->slots[i].n_pages_requested
2644 			= pages_for_lsn / buf_flush_lsn_scan_factor + 1;
2645 		mutex_exit(&page_cleaner->mutex);
2646 	}
2647 
2648 	sum_pages_for_lsn /= buf_flush_lsn_scan_factor;
2649 	if (sum_pages_for_lsn < 1) {
2650 		sum_pages_for_lsn = 1;
2651 	}
2652 
2653 	/* Cap the maximum IO capacity that we are going to use by
2654 	max_io_capacity. Limit the value to avoid increasing it too quickly. */
2655 	ulint	pages_for_lsn =
2656 		std::min<ulint>(sum_pages_for_lsn, srv_max_io_capacity * 2);
2657 
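	/* The final recommendation is the average of three signals: the
	percentage-of-io_capacity target derived from the dirty page ratio
	and redo age, the observed average page flush rate, and the number
	of pages needed to keep up with the redo (LSN) progress. */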
2658 	n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3;
2659 
2660 	if (n_pages > srv_max_io_capacity) {
2661 		n_pages = srv_max_io_capacity;
2662 	}
2663 
2664 	/* Normalize request for each instance */
2665 	mutex_enter(&page_cleaner->mutex);
2666 	ut_ad(page_cleaner->n_slots_requested == 0);
2667 	ut_ad(page_cleaner->n_slots_flushing == 0);
2668 	ut_ad(page_cleaner->n_slots_finished == 0);
2669 
2670 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2671 		/* if the redo log has enough free space,
2672 		don't care about the age distribution of pages */
2673 		page_cleaner->slots[i].n_pages_requested = pct_for_lsn > 30 ?
2674 			page_cleaner->slots[i].n_pages_requested
2675 			* n_pages / sum_pages_for_lsn + 1
2676 			: n_pages / srv_buf_pool_instances;
2677 	}
2678 	mutex_exit(&page_cleaner->mutex);
2679 
2680 	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2681 
2682 	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn);
2683 
2684 	MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2685 	MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2686 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2687 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2688 
2689 	*lsn_limit = LSN_MAX;
2690 
2691 	return(n_pages);
2692 }
2693 
2694 /*********************************************************************//**
2695 Puts the page_cleaner thread to sleep if it has finished work in less
2696 than a second
2697 @retval 0 woken up by the event being set,
2698 @retval OS_SYNC_TIME_EXCEEDED if the timeout was exceeded
2699 @param next_loop_time	time when next loop iteration should start
2700 @param sig_count	zero or the value returned by previous call of
2701 			os_event_reset() */
2702 static
2703 ulint
2704 pc_sleep_if_needed(
2705 /*===============*/
2706 	ib_time_monotonic_ms_t		next_loop_time,
2707 	int64_t		sig_count)
2708 {
2709 	ib_time_monotonic_ms_t	cur_time = ut_time_monotonic_ms();
2710 
2711 	if (next_loop_time > cur_time) {
2712 		/* Get the sleep interval in microseconds. We use
2713 		ut_min() to avoid long sleep in case of wrap around. */
2714 		int64_t sleep_us;
2715 
2716 		sleep_us = ut_min(int64_t(1000000),
2717 			         (next_loop_time - cur_time) * int64_t(1000));
2718 		ut_a(sleep_us > 0);
2719 
2720 		return(os_event_wait_time_low(buf_flush_event,
2721 					      sleep_us, sig_count));
2722 	}
2723 
2724 	return(OS_SYNC_TIME_EXCEEDED);
2725 }
2726 
2727 /******************************************************************//**
2728 Initialize page_cleaner. */
2729 void
2730 buf_flush_page_cleaner_init(void)
2731 /*=============================*/
2732 {
2733 	ut_ad(page_cleaner == NULL);
2734 
2735 	page_cleaner = static_cast<page_cleaner_t*>(
2736 		ut_zalloc_nokey(sizeof(*page_cleaner)));
2737 
2738 	mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner->mutex);
2739 
2740 	page_cleaner->is_requested = os_event_create("pc_is_requested");
2741 	page_cleaner->is_finished = os_event_create("pc_is_finished");
2742 
2743 	page_cleaner->n_slots = static_cast<ulint>(srv_buf_pool_instances);
2744 
2745 	page_cleaner->slots = static_cast<page_cleaner_slot_t*>(
2746 		ut_zalloc_nokey(page_cleaner->n_slots
2747 				* sizeof(*page_cleaner->slots)));
2748 
2749 	ut_d(page_cleaner->n_disabled_debug = 0);
2750 
2751 	page_cleaner->is_running = true;
2752 }
2753 
2754 /**
2755 Close page_cleaner. */
2756 static
2757 void
2758 buf_flush_page_cleaner_close(void)
2759 {
2760 	/* wait for all worker threads to exit */
2761 	while (page_cleaner->n_workers > 0) {
2762 		os_thread_sleep(10000);
2763 	}
2764 
2765 	mutex_destroy(&page_cleaner->mutex);
2766 
2767 	ut_free(page_cleaner->slots);
2768 
2769 	os_event_destroy(page_cleaner->is_finished);
2770 	os_event_destroy(page_cleaner->is_requested);
2771 
2772 	ut_free(page_cleaner);
2773 
2774 	page_cleaner = NULL;
2775 }
2776 
2777 /**
2778 Requests flushing for all slots, i.e. for all buffer pool instances.
2779 @param min_n	wished minimum number of blocks flushed
2780 		(it is not guaranteed that the actual number is that big)
2781 @param lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2782 		oldest_modification is smaller than this should be flushed
2783 		(if their number does not exceed min_n), otherwise ignored
2784 */
2785 static
2786 void
2787 pc_request(
2788 	ulint		min_n,
2789 	lsn_t		lsn_limit)
2790 {
2791 	if (min_n != ULINT_MAX) {
2792 		/* Ensure that flushing is spread evenly amongst the
2793 		buffer pool instances. When min_n is ULINT_MAX
2794 		we need to flush everything up to the lsn limit
2795 		so no limit here. */
2796 		min_n = (min_n + srv_buf_pool_instances - 1)
2797 			/ srv_buf_pool_instances;
2798 	}
2799 
2800 	mutex_enter(&page_cleaner->mutex);
2801 
2802 	ut_ad(page_cleaner->n_slots_requested == 0);
2803 	ut_ad(page_cleaner->n_slots_flushing == 0);
2804 	ut_ad(page_cleaner->n_slots_finished == 0);
2805 
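	/* min_n == 0 requests an LRU-only pass (the workers skip the
	flush_list batch), while min_n == ULINT_MAX requests flushing of
	everything up to lsn_limit. */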
2806 	page_cleaner->requested = (min_n > 0);
2807 	page_cleaner->lsn_limit = lsn_limit;
2808 
2809 	for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2810 		page_cleaner_slot_t* slot = &page_cleaner->slots[i];
2811 
2812 		ut_ad(slot->state == PAGE_CLEANER_STATE_NONE);
2813 
2814 		if (min_n == ULINT_MAX) {
2815 			slot->n_pages_requested = ULINT_MAX;
2816 		} else if (min_n == 0) {
2817 			slot->n_pages_requested = 0;
2818 		}
2819 
2820 		/* slot->n_pages_requested was already set by
2821 		page_cleaner_flush_pages_recommendation() */
2822 
2823 		slot->state = PAGE_CLEANER_STATE_REQUESTED;
2824 	}
2825 
2826 	page_cleaner->n_slots_requested = page_cleaner->n_slots;
2827 	page_cleaner->n_slots_flushing = 0;
2828 	page_cleaner->n_slots_finished = 0;
2829 
2830 	os_event_set(page_cleaner->is_requested);
2831 
2832 	mutex_exit(&page_cleaner->mutex);
2833 }
2834 
2835 /**
2836 Do flush for one slot.
2837 @return	the number of slots which have not been treated yet. */
2838 static
2839 ulint
2840 pc_flush_slot(void)
2841 {
2842 	ib_time_monotonic_ms_t	lru_tm = 0;
2843 	ib_time_monotonic_ms_t	list_tm = 0;
2844 	int	lru_pass = 0;
2845 	int	list_pass = 0;
2846 
2847 	mutex_enter(&page_cleaner->mutex);
2848 
2849 	if (page_cleaner->n_slots_requested > 0) {
2850 		page_cleaner_slot_t*	slot = NULL;
2851 		ulint			i;
2852 
2853 		for (i = 0; i < page_cleaner->n_slots; i++) {
2854 			slot = &page_cleaner->slots[i];
2855 
2856 			if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
2857 				break;
2858 			}
2859 		}
2860 
2861 		/* slot should be found because
2862 		page_cleaner->n_slots_requested > 0 */
2863 		ut_a(i < page_cleaner->n_slots);
2864 
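		/* Slots map one-to-one to buffer pool instances, so the
		index of the chosen slot also selects the buffer pool that
		this thread will flush. */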
2865 		buf_pool_t* buf_pool = buf_pool_from_array(i);
2866 
2867 		page_cleaner->n_slots_requested--;
2868 		page_cleaner->n_slots_flushing++;
2869 		slot->state = PAGE_CLEANER_STATE_FLUSHING;
2870 
2871 		if (page_cleaner->n_slots_requested == 0) {
2872 			os_event_reset(page_cleaner->is_requested);
2873 		}
2874 
2875 		if (!page_cleaner->is_running) {
2876 			slot->n_flushed_lru = 0;
2877 			slot->n_flushed_list = 0;
2878 			goto finish_mutex;
2879 		}
2880 
2881 		mutex_exit(&page_cleaner->mutex);
2882 
2883 		lru_tm = ut_time_monotonic_ms();
2884 
2885 		/* Flush pages from end of LRU if required */
2886 		slot->n_flushed_lru = buf_flush_LRU_list(buf_pool);
2887 
2888 		lru_tm = ut_time_monotonic_ms() - lru_tm;
2889 		lru_pass++;
2890 
2891 		if (!page_cleaner->is_running) {
2892 			slot->n_flushed_list = 0;
2893 			goto finish;
2894 		}
2895 
2896 		/* Flush pages from flush_list if required */
2897 		if (page_cleaner->requested) {
2898 
2899 			list_tm = ut_time_monotonic_ms();
2900 
2901 			slot->succeeded_list = buf_flush_do_batch(
2902 				buf_pool, BUF_FLUSH_LIST,
2903 				slot->n_pages_requested,
2904 				page_cleaner->lsn_limit,
2905 				&slot->n_flushed_list);
2906 
2907 			list_tm = ut_time_monotonic_ms() - list_tm;
2908 			list_pass++;
2909 		} else {
2910 			slot->n_flushed_list = 0;
2911 			slot->succeeded_list = true;
2912 		}
2913 finish:
2914 		mutex_enter(&page_cleaner->mutex);
2915 finish_mutex:
2916 		page_cleaner->n_slots_flushing--;
2917 		page_cleaner->n_slots_finished++;
2918 		slot->state = PAGE_CLEANER_STATE_FINISHED;
2919 
2920 		slot->flush_lru_time += lru_tm;
2921 		slot->flush_list_time += list_tm;
2922 		slot->flush_lru_pass += lru_pass;
2923 		slot->flush_list_pass += list_pass;
2924 
2925 		if (page_cleaner->n_slots_requested == 0
2926 		    && page_cleaner->n_slots_flushing == 0) {
2927 			os_event_set(page_cleaner->is_finished);
2928 		}
2929 	}
2930 
2931 	ulint	ret = page_cleaner->n_slots_requested;
2932 
2933 	mutex_exit(&page_cleaner->mutex);
2934 
2935 	return(ret);
2936 }
2937 
2938 /**
2939 Wait until all flush requests are finished.
2940 @param n_flushed_lru	number of pages flushed from the end of the LRU list.
2941 @param n_flushed_list	number of pages flushed from the end of the
2942 			flush_list.
2943 @return			true if all flush_list flushing batches were successful. */
2944 static
2945 bool
2946 pc_wait_finished(
2947 	ulint*	n_flushed_lru,
2948 	ulint*	n_flushed_list)
2949 {
2950 	bool	all_succeeded = true;
2951 
2952 	*n_flushed_lru = 0;
2953 	*n_flushed_list = 0;
2954 
2955 	os_event_wait(page_cleaner->is_finished);
2956 
2957 	mutex_enter(&page_cleaner->mutex);
2958 
2959 	ut_ad(page_cleaner->n_slots_requested == 0);
2960 	ut_ad(page_cleaner->n_slots_flushing == 0);
2961 	ut_ad(page_cleaner->n_slots_finished == page_cleaner->n_slots);
2962 
2963 	for (ulint i = 0; i < page_cleaner->n_slots; i++) {
2964 		page_cleaner_slot_t* slot = &page_cleaner->slots[i];
2965 
2966 		ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED);
2967 
2968 		*n_flushed_lru += slot->n_flushed_lru;
2969 		*n_flushed_list += slot->n_flushed_list;
2970 		all_succeeded &= slot->succeeded_list;
2971 
2972 		slot->state = PAGE_CLEANER_STATE_NONE;
2973 
2974 		slot->n_pages_requested = 0;
2975 	}
2976 
2977 	page_cleaner->n_slots_finished = 0;
2978 
2979 	os_event_reset(page_cleaner->is_finished);
2980 
2981 	mutex_exit(&page_cleaner->mutex);
2982 
2983 	return(all_succeeded);
2984 }
2985 
2986 #ifdef UNIV_LINUX
2987 /**
2988 Set priority for page_cleaner threads.
2989 @param[in]	priority	priority intended to set
2990 @return	true if set as intended */
2991 static
2992 bool
2993 buf_flush_page_cleaner_set_priority(
2994 	int	priority)
2995 {
2996 	setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
2997 		    priority);
2998 	return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
2999 	       == priority);
3000 }
3001 #endif /* UNIV_LINUX */
3002 
3003 #ifdef UNIV_DEBUG
3004 /** Loop used to disable page cleaner threads. */
3005 static
3006 void
3007 buf_flush_page_cleaner_disabled_loop(void)
3008 {
3009 	ut_ad(page_cleaner != NULL);
3010 
3011 	if (!innodb_page_cleaner_disabled_debug) {
3012 		/* We return to avoid entering and exiting mutex. */
3013 		return;
3014 	}
3015 
3016 	mutex_enter(&page_cleaner->mutex);
3017 	page_cleaner->n_disabled_debug++;
3018 	mutex_exit(&page_cleaner->mutex);
3019 
3020 	while (innodb_page_cleaner_disabled_debug
3021 	       && srv_shutdown_state == SRV_SHUTDOWN_NONE
3022 	       && page_cleaner->is_running) {
3023 
3024 		os_thread_sleep(100000); /* [A] */
3025 	}
3026 
3027 	/* We need to wait for threads exiting here, otherwise we would
3028 	encounter problem when we quickly perform following steps:
3029 		1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
3030 		2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0;
3031 		3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
3032 	That's because after step 1 this thread could still be sleeping
3033 	inside the loop above at [A] and steps 2, 3 could happen before
3034 	this thread wakes up from [A]. In such a case this thread would
3035 	not re-increment n_disabled_debug and we would be waiting for
3036 	it forever in buf_flush_page_cleaner_disabled_debug_update(...).
3037 
3038 	Therefore we are waiting in step 2 for this thread exiting here. */
3039 
3040 	mutex_enter(&page_cleaner->mutex);
3041 	page_cleaner->n_disabled_debug--;
3042 	mutex_exit(&page_cleaner->mutex);
3043 }
3044 
3045 /** Disables page cleaner threads (coordinator and workers).
3046 It's used by: SET GLOBAL innodb_page_cleaner_disabled_debug = 1 (0).
3047 @param[in]	thd		thread handle
3048 @param[in]	var		pointer to system variable
3049 @param[out]	var_ptr		where the formal string goes
3050 @param[in]	save		immediate result from check function */
3051 void
3052 buf_flush_page_cleaner_disabled_debug_update(
3053 	THD*				thd,
3054 	struct st_mysql_sys_var*	var,
3055 	void*				var_ptr,
3056 	const void*			save)
3057 {
3058 	if (page_cleaner == NULL) {
3059 		return;
3060 	}
3061 
3062 	if (!*static_cast<const my_bool*>(save)) {
3063 		if (!innodb_page_cleaner_disabled_debug) {
3064 			return;
3065 		}
3066 
3067 		innodb_page_cleaner_disabled_debug = false;
3068 
3069 		/* Enable page cleaner threads. */
3070 		while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3071 			mutex_enter(&page_cleaner->mutex);
3072 			const ulint n = page_cleaner->n_disabled_debug;
3073 			mutex_exit(&page_cleaner->mutex);
3074 			/* Check if all threads have been enabled, to avoid
3075 			problem when we decide to re-disable them soon. */
3076 			if (n == 0) {
3077 				break;
3078 			}
3079 		}
3080 		return;
3081 	}
3082 
3083 	if (innodb_page_cleaner_disabled_debug) {
3084 		return;
3085 	}
3086 
3087 	innodb_page_cleaner_disabled_debug = true;
3088 
3089 	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3090 		/* Workers are possibly sleeping on is_requested.
3091 
3092 		We have to wake them, otherwise they might never
3093 		notice that they should be disabled, and we would
3094 		wait for them here forever.
3095 
3096 		That's why we have a sleep-loop instead of simply
3097 		waiting on some disabled_debug_event. */
3098 		os_event_set(page_cleaner->is_requested);
3099 
3100 		mutex_enter(&page_cleaner->mutex);
3101 
3102 		ut_ad(page_cleaner->n_disabled_debug
3103 		      <= srv_n_page_cleaners);
3104 
3105 		if (page_cleaner->n_disabled_debug
3106 		    == srv_n_page_cleaners) {
3107 
3108 			mutex_exit(&page_cleaner->mutex);
3109 			break;
3110 		}
3111 
3112 		mutex_exit(&page_cleaner->mutex);
3113 
3114 		os_thread_sleep(100000);
3115 	}
3116 }
3117 #endif /* UNIV_DEBUG */
3118 
3119 /******************************************************************//**
3120 page_cleaner thread tasked with flushing dirty pages from the buffer
3121 pools. As of now we'll have only one coordinator.
3122 @return a dummy parameter */
3123 extern "C"
3124 os_thread_ret_t
3125 DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(
3126 /*===============================================*/
3127 	void*	arg MY_ATTRIBUTE((unused)))
3128 			/*!< in: a dummy parameter required by
3129 			os_thread_create */
3130 {
3131 	ib_time_monotonic_t	next_loop_time = ut_time_monotonic_ms() + 1000;
3132 	ulint	n_flushed = 0;
3133 	ulint	last_activity = srv_get_activity_count();
3134 	ulint	last_pages = 0;
3135 
3136 	my_thread_init();
3137 
3138 #ifdef UNIV_PFS_THREAD
3139 	pfs_register_thread(page_cleaner_thread_key);
3140 #endif /* UNIV_PFS_THREAD */
3141 
3142 #ifdef UNIV_DEBUG_THREAD_CREATION
3143 	ib::info() << "page_cleaner thread running, id "
3144 		<< os_thread_pf(os_thread_get_curr_id());
3145 #endif /* UNIV_DEBUG_THREAD_CREATION */
3146 
3147 #ifdef UNIV_LINUX
3148 	/* Linux might be able to use a different setting for each thread.
3149 	It is worth trying to set a high priority for the page_cleaner threads. */
3150 	if (buf_flush_page_cleaner_set_priority(
3151 		buf_flush_page_cleaner_priority)) {
3152 
3153 		ib::info() << "page_cleaner coordinator priority: "
3154 			<< buf_flush_page_cleaner_priority;
3155 	} else {
3156 		ib::info() << "If the mysqld execution user is authorized,"
3157 		" page cleaner thread priority can be changed."
3158 		" See the man page of setpriority().";
3159 	}
3160 #endif /* UNIV_LINUX */
3161 
3162 	buf_page_cleaner_is_active = true;
3163 
3164 	while (!srv_read_only_mode
3165 	       && srv_shutdown_state == SRV_SHUTDOWN_NONE
3166 	       && recv_sys->heap != NULL) {
3167 		/* treat flushing requests during recovery. */
3168 		ulint	n_flushed_lru = 0;
3169 		ulint	n_flushed_list = 0;
3170 
3171 		os_event_wait(recv_sys->flush_start);
3172 
3173 		if (srv_shutdown_state != SRV_SHUTDOWN_NONE
3174 		    || recv_sys->heap == NULL) {
3175 			break;
3176 		}
3177 
3178 		switch (recv_sys->flush_type) {
3179 		case BUF_FLUSH_LRU:
3180 			/* Flush pages from end of LRU if required */
3181 			pc_request(0, LSN_MAX);
3182 			while (pc_flush_slot() > 0) {}
3183 			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3184 			break;
3185 
3186 		case BUF_FLUSH_LIST:
3187 			/* Flush all pages */
3188 			do {
3189 				pc_request(ULINT_MAX, LSN_MAX);
3190 				while (pc_flush_slot() > 0) {}
3191 			} while (!pc_wait_finished(&n_flushed_lru,
3192 						   &n_flushed_list));
3193 			break;
3194 
3195 		default:
3196 			ut_ad(0);
3197 		}
3198 
3199 		os_event_reset(recv_sys->flush_start);
3200 		os_event_set(recv_sys->flush_end);
3201 	}
3202 
3203 	os_event_wait(buf_flush_event);
3204 
3205 	ulint		ret_sleep = 0;
3206 	ulint		n_evicted = 0;
3207 	ulint		n_flushed_last = 0;
3208 	ulint		warn_interval = 1;
3209 	ulint		warn_count = 0;
3210 	int64_t		sig_count = os_event_reset(buf_flush_event);
3211 
3212 	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3213 
3214 		/* The page_cleaner skips sleep if the server is
3215 		idle and there are no pending IOs in the buffer pool
3216 		and there is work to do. */
3217 		if (srv_check_activity(last_activity)
3218 		    || buf_get_n_pending_read_ios()
3219 		    || n_flushed == 0) {
3220 
3221 			ret_sleep = pc_sleep_if_needed(
3222 				next_loop_time, sig_count);
3223 
3224 			if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
3225 				break;
3226 			}
3227 		} else if (ut_time_monotonic_ms() > next_loop_time) {
3228 			ret_sleep = OS_SYNC_TIME_EXCEEDED;
3229 		} else {
3230 			ret_sleep = 0;
3231 		}
3232 
3233 		sig_count = os_event_reset(buf_flush_event);
3234 
3235 		if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3236 			ib_time_monotonic_ms_t curr_time =
3237 						ut_time_monotonic_ms();
3238 
3239 			if (curr_time > next_loop_time + 3000) {
3240 				if (warn_count == 0) {
3241 					ib::info() << "page_cleaner: 1000ms"
3242 						" intended loop took "
3243 						<< 1000 + curr_time
3244 						   - next_loop_time
3245 						<< "ms. The settings might not"
3246 						" be optimal. (flushed="
3247 						<< n_flushed_last
3248 						<< " and evicted="
3249 						<< n_evicted
3250 						<< ", during the time.)";
3251 					if (warn_interval > 300) {
3252 						warn_interval = 600;
3253 					} else {
3254 						warn_interval *= 2;
3255 					}
3256 
3257 					warn_count = warn_interval;
3258 				} else {
3259 					--warn_count;
3260 				}
3261 			} else {
3262 				/* reset counter */
3263 				warn_interval = 1;
3264 				warn_count = 0;
3265 			}
3266 
3267 			next_loop_time = curr_time + 1000;
3268 			n_flushed_last = n_evicted = 0;
3269 		}
3270 
3271 		if (ret_sleep != OS_SYNC_TIME_EXCEEDED
3272 		    && srv_flush_sync
3273 		    && buf_flush_sync_lsn > 0) {
3274 			/* woke up for flush_sync */
3275 			mutex_enter(&page_cleaner->mutex);
3276 			lsn_t	lsn_limit = buf_flush_sync_lsn;
3277 			buf_flush_sync_lsn = 0;
3278 			mutex_exit(&page_cleaner->mutex);
3279 
3280 			/* Request flushing for threads */
3281 			pc_request(ULINT_MAX, lsn_limit);
3282 
3283 			ib_time_monotonic_ms_t tm = ut_time_monotonic_ms();
3284 
3285 			/* Coordinator also treats requests */
3286 			while (pc_flush_slot() > 0) {}
3287 
3288 			/* only coordinator is using these counters,
3289 			so no need to protect by lock. */
3290 			page_cleaner->flush_time += ut_time_monotonic_ms() - tm;
3291 			page_cleaner->flush_pass++;
3292 
3293 			/* Wait for all slots to be finished */
3294 			ulint	n_flushed_lru = 0;
3295 			ulint	n_flushed_list = 0;
3296 			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3297 
3298 			if (n_flushed_list > 0 || n_flushed_lru > 0) {
3299 				buf_flush_stats(n_flushed_list, n_flushed_lru);
3300 
3301 				MONITOR_INC_VALUE_CUMULATIVE(
3302 					MONITOR_FLUSH_SYNC_TOTAL_PAGE,
3303 					MONITOR_FLUSH_SYNC_COUNT,
3304 					MONITOR_FLUSH_SYNC_PAGES,
3305 					n_flushed_lru + n_flushed_list);
3306 			}
3307 
3308 			n_flushed = n_flushed_lru + n_flushed_list;
3309 
3310 		} else if (srv_check_activity(last_activity)) {
3311 			ulint	n_to_flush;
3312 			lsn_t	lsn_limit = 0;
3313 
3314 			/* Estimate pages from flush_list to be flushed */
3315 			if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3316 				last_activity = srv_get_activity_count();
3317 				n_to_flush =
3318 					page_cleaner_flush_pages_recommendation(
3319 						&lsn_limit, last_pages);
3320 			} else {
3321 				n_to_flush = 0;
3322 			}
3323 
3324 			/* Request flushing for threads */
3325 			pc_request(n_to_flush, lsn_limit);
3326 
3327 			ib_time_monotonic_ms_t tm = ut_time_monotonic_ms();
3328 
3329 			/* Coordinator also treats requests */
3330 			while (pc_flush_slot() > 0) {
3331 				/* No op */
3332 			}
3333 
3334 			/* only coordinator is using these counters,
3335 			so no need to protect by lock. */
3336 			page_cleaner->flush_time += ut_time_monotonic_ms() - tm;
3337 			page_cleaner->flush_pass++;
3338 
3339 			/* Wait for all slots to be finished */
3340 			ulint	n_flushed_lru = 0;
3341 			ulint	n_flushed_list = 0;
3342 
3343 			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3344 
3345 			if (n_flushed_list > 0 || n_flushed_lru > 0) {
3346 				buf_flush_stats(n_flushed_list, n_flushed_lru);
3347 			}
3348 
3349 			if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3350 				last_pages = n_flushed_list;
3351 			}
3352 
3353 			n_evicted += n_flushed_lru;
3354 			n_flushed_last += n_flushed_list;
3355 
3356 			n_flushed = n_flushed_lru + n_flushed_list;
3357 
3358 			if (n_flushed_lru) {
3359 				MONITOR_INC_VALUE_CUMULATIVE(
3360 					MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
3361 					MONITOR_LRU_BATCH_FLUSH_COUNT,
3362 					MONITOR_LRU_BATCH_FLUSH_PAGES,
3363 					n_flushed_lru);
3364 			}
3365 
3366 			if (n_flushed_list) {
3367 				MONITOR_INC_VALUE_CUMULATIVE(
3368 					MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
3369 					MONITOR_FLUSH_ADAPTIVE_COUNT,
3370 					MONITOR_FLUSH_ADAPTIVE_PAGES,
3371 					n_flushed_list);
3372 			}
3373 
3374 		} else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3375 			/* no activity, slept enough */
3376 			buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed);
3377 
3378 			n_flushed_last += n_flushed;
3379 
3380 			if (n_flushed) {
3381 				MONITOR_INC_VALUE_CUMULATIVE(
3382 					MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
3383 					MONITOR_FLUSH_BACKGROUND_COUNT,
3384 					MONITOR_FLUSH_BACKGROUND_PAGES,
3385 					n_flushed);
3386 
3387 			}
3388 
3389 		} else {
3390 			/* no activity, but woken up by event */
3391 			n_flushed = 0;
3392 		}
3393 
3394 		ut_d(buf_flush_page_cleaner_disabled_loop());
3395 	}
3396 
3397 	ut_ad(srv_shutdown_state > 0);
3398 	if (srv_fast_shutdown == 2
3399 	    || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
3400 		/* In very fast shutdown or when innodb failed to start, we
3401 		simulate a crash of the buffer pool. We are not required to do
3402 		any flushing. */
3403 		goto thread_exit;
3404 	}
3405 
3406 	/* In case of a normal or slow shutdown the page_cleaner thread
3407 	must wait for all other activity in the server to die down.
3408 	Note that we can start flushing the buffer pool as soon as the
3409 	server enters the shutdown phase, but we must stay alive long
3410 	enough to ensure that any work done by the master or purge
3411 	threads is also flushed.
3412 	During shutdown we pass through two stages. In the first stage,
3413 	when SRV_SHUTDOWN_CLEANUP is set, other threads such as the
3414 	master and purge threads may still be working. We start flushing
3415 	the buffer pool, but cannot be sure that no new pages are being
3416 	dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE. */
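	/* First stage: keep flushing through the page cleaner slots while
	other threads may still be dirtying pages, backing off for 100 ms
	whenever a pass finds nothing to flush. */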
3417 
3418 	do {
3419 		pc_request(ULINT_MAX, LSN_MAX);
3420 
3421 		while (pc_flush_slot() > 0) {}
3422 
3423 		ulint	n_flushed_lru = 0;
3424 		ulint	n_flushed_list = 0;
3425 		pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3426 
3427 		n_flushed = n_flushed_lru + n_flushed_list;
3428 
3429 		/* We sleep only if there are no pages to flush */
3430 		if (n_flushed == 0) {
3431 			os_thread_sleep(100000);
3432 		}
3433 	} while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
3434 
3435 	/* At this point all threads including the master and the purge
3436 	thread must have been suspended. */
3437 	ut_a(srv_get_active_thread_type() == SRV_NONE);
3438 	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3439 
3440 	/* We can now make a final sweep over the buffer pool and
3441 	exit once the whole buffer pool is clean.
3442 	It is important that we first wait for any running batch
3443 	that we have triggered to finish. Otherwise we could mistake
3444 	the end of that batch for the end of our final sweep and
3445 	come out of the loop leaving dirty pages behind in the
3446 	flush_list. */
3447 	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3448 	buf_flush_wait_LRU_batch_end();
3449 
3450 	bool	success;
3451 
3452 	do {
3453 		pc_request(ULINT_MAX, LSN_MAX);
3454 
3455 		while (pc_flush_slot() > 0) {}
3456 
3457 		ulint	n_flushed_lru = 0;
3458 		ulint	n_flushed_list = 0;
3459 		success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3460 
3461 		n_flushed = n_flushed_lru + n_flushed_list;
3462 
3463 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3464 		buf_flush_wait_LRU_batch_end();
3465 
3466 	} while (!success || n_flushed > 0 || buf_get_n_pending_read_ios() > 0);
3467 
3468 	/* Some sanity checks */
3469 	ut_a(srv_get_active_thread_type() == SRV_NONE);
3470 	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3471 
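	/* Verify that every buffer pool instance ends up with an empty
	flush list before the coordinator exits. */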
3472 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3473 		buf_pool_t* buf_pool = buf_pool_from_array(i);
3474 		ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
3475 	}
3476 
3477 	/* We have lived our life. Time to die. */
3478 
3479 thread_exit:
3480 	/* All worker threads are waiting on the event here and no
3481 	longer access the page_cleaner structure. Wake the worker
3482 	threads up just to make them exit. */
3483 	page_cleaner->is_running = false;
3484 	os_event_set(page_cleaner->is_requested);
3485 
3486 	buf_flush_page_cleaner_close();
3487 
3488 	buf_page_cleaner_is_active = false;
3489 
3490 	my_thread_end();
3491 
3492 	/* We count the number of threads in os_thread_exit(). A created
3493 	thread should always use that to exit instead of returning. */
3494 	os_thread_exit();
3495 
3496 	OS_THREAD_DUMMY_RETURN;
3497 }
3498 
3499 /******************************************************************//**
3500 Worker thread of page_cleaner.
3501 @return a dummy parameter */
3502 extern "C"
3503 os_thread_ret_t
3504 DECLARE_THREAD(buf_flush_page_cleaner_worker)(
3505 /*==========================================*/
3506 	void*	arg MY_ATTRIBUTE((unused)))
3507 			/*!< in: a dummy parameter required by
3508 			os_thread_create */
3509 {
3510 	my_thread_init();
3511 
3512 	mutex_enter(&page_cleaner->mutex);
3513 	page_cleaner->n_workers++;
3514 	mutex_exit(&page_cleaner->mutex);
3515 
3516 #ifdef UNIV_LINUX
3517 	/* Linux may allow a different priority setting for each thread;
3518 	it is worth trying a high priority for page cleaner threads. */
3519 	if (buf_flush_page_cleaner_set_priority(
3520 		buf_flush_page_cleaner_priority)) {
3521 
3522 		ib::info() << "page_cleaner worker priority: "
3523 			<< buf_flush_page_cleaner_priority;
3524 	}
3525 #endif /* UNIV_LINUX */
3526 
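	/* Worker loop: wait until the coordinator signals a flush request,
	then flush one slot at a time until the coordinator clears
	page_cleaner->is_running at shutdown. */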
3527 	while (true) {
3528 		os_event_wait(page_cleaner->is_requested);
3529 
3530 		ut_d(buf_flush_page_cleaner_disabled_loop());
3531 
3532 		if (!page_cleaner->is_running) {
3533 			break;
3534 		}
3535 
3536 		pc_flush_slot();
3537 	}
3538 
3539 	mutex_enter(&page_cleaner->mutex);
3540 	page_cleaner->n_workers--;
3541 	mutex_exit(&page_cleaner->mutex);
3542 
3543 	my_thread_end();
3544 
3545 	os_thread_exit();
3546 
3547 	OS_THREAD_DUMMY_RETURN;
3548 }
3549 
3550 /*******************************************************************//**
3551 Synchronously flush dirty blocks from the end of the flush list of all buffer
3552 pool instances.
3553 NOTE: The calling thread is not allowed to own any latches on pages! */
3554 void
3555 buf_flush_sync_all_buf_pools(void)
3556 /*==============================*/
3557 {
3558 	bool success;
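	/* Retry until a flush-list batch is accepted by every buffer pool
	instance; buf_flush_lists() is expected to return false when a
	batch of the same type is already running on some instance. */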
3559 	do {
3560 		success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
3561 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3562 	} while (!success);
3563 
3564 	ut_a(success);
3565 }
3566 
3567 /** Request IO burst and wake page_cleaner up.
3568 @param[in]	lsn_limit	upper limit of LSN to be flushed */
3569 void
3570 buf_flush_request_force(
3571 	lsn_t	lsn_limit)
3572 {
3573 	/* Adjust upward based on lsn_avg_rate so the target does not become stale */
3574 	lsn_t	lsn_target = lsn_limit + lsn_avg_rate * 3;
3575 
3576 	mutex_enter(&page_cleaner->mutex);
3577 	if (lsn_target > buf_flush_sync_lsn) {
3578 		buf_flush_sync_lsn = lsn_target;
3579 	}
3580 	mutex_exit(&page_cleaner->mutex);
3581 
3582 	os_event_set(buf_flush_event);
3583 }
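/* The raised buf_flush_sync_lsn is picked up by the page cleaner
coordinator on its next iteration, which performs a synchronous flush
up to that target (the flush-sync branch of the coordinator loop
above). */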
3584 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3585 
3586 /** Functor to validate the flush list. */
3587 struct	Check {
3588 	void	operator()(const buf_page_t* elem)
3589 	{
3590 		ut_a(elem->in_flush_list);
3591 	}
3592 };
3593 
3594 /******************************************************************//**
3595 Validates the flush list.
3596 @return TRUE if ok */
3597 static
3598 ibool
3599 buf_flush_validate_low(
3600 /*===================*/
3601 	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
3602 {
3603 	buf_page_t*		bpage;
3604 	const ib_rbt_node_t*	rnode = NULL;
3605 	Check			check;
3606 
3607 	ut_ad(buf_flush_list_mutex_own(buf_pool));
3608 
3609 	ut_list_validate(buf_pool->flush_list, check);
3610 
3611 	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3612 
3613 	/* If we are in recovery mode, i.e. flush_rbt != NULL, then
3614 	each block in the flush_list must also be present in the
3615 	flush_rbt. */
3616 	if (buf_pool->flush_rbt != NULL) {
3617 		rnode = rbt_first(buf_pool->flush_rbt);
3618 	}
3619 
3620 	while (bpage != NULL) {
3621 		const lsn_t	om = bpage->oldest_modification;
3622 
3623 		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
3624 
3625 		ut_ad(bpage->in_flush_list);
3626 
3627 		/* A page in buf_pool->flush_list can be in
3628 		BUF_BLOCK_REMOVE_HASH state. This happens when a page
3629 		is in the middle of being relocated. In that case the
3630 		original descriptor can have this state and still be
3631 		in the flush list waiting to acquire the
3632 		buf_pool->flush_list_mutex to complete the relocation. */
3633 		ut_a(buf_page_in_file(bpage)
3634 		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
3635 		ut_a(om > 0);
3636 
3637 		if (buf_pool->flush_rbt != NULL) {
3638 			buf_page_t**	prpage;
3639 
3640 			ut_a(rnode != NULL);
3641 			prpage = rbt_value(buf_page_t*, rnode);
3642 
3643 			ut_a(*prpage != NULL);
3644 			ut_a(*prpage == bpage);
3645 			rnode = rbt_next(buf_pool->flush_rbt, rnode);
3646 		}
3647 
3648 		bpage = UT_LIST_GET_NEXT(list, bpage);
3649 
3650 		ut_a(bpage == NULL || om >= bpage->oldest_modification);
3651 	}
3652 
3653 	/* By this time we must have exhausted the traversal of
3654 	flush_rbt (if active) as well. */
3655 	ut_a(rnode == NULL);
3656 
3657 	return(TRUE);
3658 }
3659 
3660 /******************************************************************//**
3661 Validates the flush list.
3662 @return TRUE if ok */
3663 ibool
3664 buf_flush_validate(
3665 /*===============*/
3666 	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
3667 {
3668 	ibool	ret;
3669 
3670 	buf_flush_list_mutex_enter(buf_pool);
3671 
3672 	ret = buf_flush_validate_low(buf_pool);
3673 
3674 	buf_flush_list_mutex_exit(buf_pool);
3675 
3676 	return(ret);
3677 }
3678 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3679 #endif /* !UNIV_HOTBACKUP */
3680 
3681 /******************************************************************//**
3682 Check if there are any dirty pages that belong to a space id in the flush
3683 list in a particular buffer pool.
3684 @return number of dirty pages present in a single buffer pool */
3685 ulint
3686 buf_pool_get_dirty_pages_count(
3687 /*===========================*/
3688 	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
3689 	ulint		id,		/*!< in: space id to check */
3690 	FlushObserver*	observer)	/*!< in: flush observer to check */
3691 
3692 {
3693 	ulint		count = 0;
3694 
3695 	buf_pool_mutex_enter(buf_pool);
3696 	buf_flush_list_mutex_enter(buf_pool);
3697 
3698 	buf_page_t*	bpage;
3699 
3700 	for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3701 	     bpage != 0;
3702 	     bpage = UT_LIST_GET_NEXT(list, bpage)) {
3703 
3704 		ut_ad(buf_page_in_file(bpage));
3705 		ut_ad(bpage->in_flush_list);
3706 		ut_ad(bpage->oldest_modification > 0);
3707 
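		/* When a flush observer is given, count pages tracked by
		that observer; otherwise count pages that belong to the
		given space id. */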
3708 		if ((observer != NULL
3709 		     && observer == bpage->flush_observer)
3710 		    || (observer == NULL
3711 			&& id == bpage->id.space())) {
3712 			++count;
3713 		}
3714 	}
3715 
3716 	buf_flush_list_mutex_exit(buf_pool);
3717 	buf_pool_mutex_exit(buf_pool);
3718 
3719 	return(count);
3720 }
3721 
3722 /******************************************************************//**
3723 Check if there are any dirty pages that belong to a space id in the flush list.
3724 @return number of dirty pages present in all the buffer pools */
3725 ulint
3726 buf_flush_get_dirty_pages_count(
3727 /*============================*/
3728 	ulint		id,		/*!< in: space id to check */
3729 	FlushObserver*	observer)	/*!< in: flush observer to check */
3730 {
3731 	ulint		count = 0;
3732 
3733 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3734 		buf_pool_t*	buf_pool;
3735 
3736 		buf_pool = buf_pool_from_array(i);
3737 
3738 		count += buf_pool_get_dirty_pages_count(buf_pool, id, observer);
3739 	}
3740 
3741 	return(count);
3742 }
3743 
3744 /** FlushObserver constructor
3745 @param[in]	space_id	table space id
3746 @param[in]	trx		trx instance
3747 @param[in]	stage		performance schema accounting object,
3748 used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
3749 for accounting. */
3750 FlushObserver::FlushObserver(
3751 	ulint			space_id,
3752 	trx_t*			trx,
3753 	ut_stage_alter_t*	stage)
3754 	:
3755 	m_space_id(space_id),
3756 	m_trx(trx),
3757 	m_stage(stage),
3758 	m_interrupted(false)
3759 {
3760 	m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
3761 	m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
3762 
3763 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3764 		m_flushed->at(i) = 0;
3765 		m_removed->at(i) = 0;
3766 	}
3767 
3768 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3769 		ib::info() << "FlushObserver constructor: " << m_trx->id;
3770 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3771 }
3772 
3773 /** FlushObserver destructor */
3774 FlushObserver::~FlushObserver()
3775 {
3776 	ut_ad(buf_flush_get_dirty_pages_count(m_space_id, this) == 0);
3777 
3778 	UT_DELETE(m_flushed);
3779 	UT_DELETE(m_removed);
3780 
3781 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3782 		ib::info() << "FlushObserver deconstructor: " << m_trx->id;
3783 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3784 }
3785 
3786 /** Check whether trx is interrupted
3787 @return true if trx is interrupted */
3788 bool
3789 FlushObserver::check_interrupted()
3790 {
3791 	if (trx_is_interrupted(m_trx)) {
3792 		interrupted();
3793 
3794 		return(true);
3795 	}
3796 
3797 	return(false);
3798 }
3799 
3800 /** Notify observer of a flush
3801 @param[in]	buf_pool	buffer pool instance
3802 @param[in]	bpage		buffer page to flush */
3803 void
3804 FlushObserver::notify_flush(
3805 	buf_pool_t*	buf_pool,
3806 	buf_page_t*	bpage)
3807 {
3808 	ut_ad(buf_pool_mutex_own(buf_pool));
3809 
3810 	m_flushed->at(buf_pool->instance_no)++;
3811 
3812 	if (m_stage != NULL) {
3813 		m_stage->inc();
3814 	}
3815 
3816 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3817 	ib::info() << "Flush <" << bpage->id.space()
3818 		   << ", " << bpage->id.page_no() << ">";
3819 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3820 }
3821 
3822 /** Notify observer of a remove
3823 @param[in]	buf_pool	buffer pool instance
3824 @param[in]	bpage		buffer page being removed */
3825 void
3826 FlushObserver::notify_remove(
3827 	buf_pool_t*	buf_pool,
3828 	buf_page_t*	bpage)
3829 {
3830 	ut_ad(buf_pool_mutex_own(buf_pool));
3831 
3832 	m_removed->at(buf_pool->instance_no)++;
3833 
3834 #ifdef FLUSH_LIST_OBSERVER_DEBUG
3835 	ib::info() << "Remove <" << bpage->id.space()
3836 		   << ", " << bpage->id.page_no() << ">";
3837 #endif /* FLUSH_LIST_OBSERVER_DEBUG */
3838 }
3839 
3840 /** Flush dirty pages and wait. */
3841 void
3842 FlushObserver::flush()
3843 {
3844 	buf_remove_t	buf_remove;
3845 
3846 	if (m_interrupted) {
3847 		buf_remove = BUF_REMOVE_FLUSH_NO_WRITE;
3848 	} else {
3849 		buf_remove = BUF_REMOVE_FLUSH_WRITE;
3850 
3851 		if (m_stage != NULL) {
3852 			ulint	pages_to_flush =
3853 				buf_flush_get_dirty_pages_count(
3854 					m_space_id, this);
3855 
3856 			m_stage->begin_phase_flush(pages_to_flush);
3857 		}
3858 	}
3859 
3860 	/* Flush or remove dirty pages. */
3861 	buf_LRU_flush_or_remove_pages(m_space_id, buf_remove, m_trx);
3862 
3863 	/* Wait until all dirty pages have been flushed. */
3864 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3865 		while (!is_complete(i)) {
3866 
3867 			os_thread_sleep(2000);
3868 		}
3869 	}
3870 }
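/* Typical lifecycle, as suggested by the callbacks above: an observer is
created for the tablespace being written, notify_flush() and
notify_remove() are invoked as its dirty pages are flushed or evicted,
flush() is called once to write out (or, when interrupted, discard) the
remaining dirty pages, and the destructor asserts that no dirty pages
tracked by the observer remain. */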
3871