1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2013, 2020, MariaDB Corporation.
5 Copyright (c) 2013, 2014, Fusion-io
6 
7 This program is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free Software
9 Foundation; version 2 of the License.
10 
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License along with
16 this program; if not, write to the Free Software Foundation, Inc.,
17 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
18 
19 *****************************************************************************/
20 
21 /**************************************************//**
22 @file buf/buf0flu.cc
23 The database buffer buf_pool flush algorithm
24 
25 Created 11/11/1995 Heikki Tuuri
26 *******************************************************/
27 
28 #include "univ.i"
29 #include <mysql/service_thd_wait.h>
30 #include <sql_class.h>
31 
32 #include "buf0flu.h"
33 #include "buf0buf.h"
34 #include "buf0checksum.h"
35 #include "srv0start.h"
36 #include "srv0srv.h"
37 #include "page0zip.h"
38 #include "ut0byte.h"
39 #include "page0page.h"
40 #include "fil0fil.h"
41 #include "buf0lru.h"
42 #include "buf0rea.h"
43 #include "ibuf0ibuf.h"
44 #include "log0log.h"
45 #include "os0file.h"
46 #include "trx0sys.h"
47 #include "srv0mon.h"
48 #include "ut0stage.h"
49 #include "fil0pagecompress.h"
50 #ifdef UNIV_LINUX
51 /* include defs for CPU time priority settings */
52 #include <unistd.h>
53 #include <sys/syscall.h>
54 #include <sys/time.h>
55 #include <sys/resource.h>
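/* -20 is the lowest niceness value on Linux, i.e. the highest
CPU scheduling priority a thread can request. */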
56 static const int buf_flush_page_cleaner_priority = -20;
57 #endif /* UNIV_LINUX */
58 
59 /** Sleep time in microseconds for loop waiting for the oldest
60 modification lsn */
61 static const ulint buf_flush_wait_flushed_sleep_time = 10000;
62 
63 #include <my_service_manager.h>
64 
65 /** Number of pages flushed through non-flush_list flushes. */
66 static ulint buf_lru_flush_page_count = 0;
67 
68 /** Flag indicating if the page_cleaner is in active state. This flag
69 is set to TRUE by the page_cleaner thread when it is spawned and is set
70 back to FALSE at shutdown by the page_cleaner as well. Therefore no
71 need to protect it by a mutex. It is only ever read by the thread
72 doing the shutdown */
73 bool buf_page_cleaner_is_active;
74 
75 /** Factor for scan length to determine n_pages for intended oldest LSN
76 progress */
77 static ulint buf_flush_lsn_scan_factor = 3;
78 
79 /** Average redo generation rate */
80 static lsn_t lsn_avg_rate = 0;
81 
82 /** Target oldest LSN for the requested flush_sync */
83 static lsn_t buf_flush_sync_lsn = 0;
84 
85 #ifdef UNIV_PFS_THREAD
86 mysql_pfs_key_t page_cleaner_thread_key;
87 #endif /* UNIV_PFS_THREAD */
88 
89 /** Event to synchronise with the flushing. */
90 os_event_t	buf_flush_event;
91 
92 /** State for page cleaner array slot */
93 enum page_cleaner_state_t {
94 	/** Nothing requested yet.
95 	Moved from FINISHED by the coordinator. */
96 	PAGE_CLEANER_STATE_NONE = 0,
97 	/** Requested but not started flushing.
98 	Moved from NONE by the coordinator. */
99 	PAGE_CLEANER_STATE_REQUESTED,
100 	/** Flushing is ongoing.
101 	Moved from REQUESTED by the worker. */
102 	PAGE_CLEANER_STATE_FLUSHING,
103 	/** Flushing was finished.
104 	Moved from FLUSHING by the worker. */
105 	PAGE_CLEANER_STATE_FINISHED
106 };
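/* A slot cycles through these states: the coordinator moves it
NONE -> REQUESTED, a worker moves it REQUESTED -> FLUSHING -> FINISHED,
and the coordinator moves it back FINISHED -> NONE. */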
107 
108 /** Page cleaner request state for each buffer pool instance */
109 struct page_cleaner_slot_t {
110 	page_cleaner_state_t	state;	/*!< state of the request;
111 					protected by page_cleaner_t::mutex.
112 					Once a worker has taken the slot and
113 					set it to PAGE_CLEANER_STATE_FLUSHING,
114 					n_flushed_lru and n_flushed_list can be
115 					updated only by that worker thread */
116 	/* This value is set during state==PAGE_CLEANER_STATE_NONE */
117 	ulint			n_pages_requested;
118 					/*!< number of requested pages
119 					for the slot */
120 	/* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
121 	and committed with state==PAGE_CLEANER_STATE_FINISHED.
122 	Their consistency is protected by 'state'. */
123 	ulint			n_flushed_lru;
124 					/*!< number of flushed pages
125 					by LRU scan flushing */
126 	ulint			n_flushed_list;
127 					/*!< number of flushed pages
128 					by flush_list flushing */
129 	bool			succeeded_list;
130 					/*!< true if flush_list flushing
131 					succeeded. */
132 	ulint			flush_lru_time;
133 					/*!< elapsed time for LRU flushing */
134 	ulint			flush_list_time;
135 					/*!< elapsed time for flush_list
136 					flushing */
137 	ulint			flush_lru_pass;
138 					/*!< number of LRU flushing passes */
139 	ulint			flush_list_pass;
140 					/*!< number of flush_list flushing
141 					passes */
142 };
143 
144 /** Page cleaner structure common for all threads */
145 struct page_cleaner_t {
146 	ib_mutex_t		mutex;		/*!< mutex to protect whole of
147 						page_cleaner_t struct and
148 						page_cleaner_slot_t slots. */
149 	os_event_t		is_requested;	/*!< event to activate worker
150 						threads. */
151 	os_event_t		is_finished;	/*!< event to signal that all
152 						slots were finished. */
153 	os_event_t		is_started;	/*!< event to signal that
154 						thread is started/exiting */
155 	volatile ulint		n_workers;	/*!< number of worker threads
156 						in existence */
157 	bool			requested;	/*!< true if requested pages
158 						to flush */
159 	lsn_t			lsn_limit;	/*!< upper limit of LSN to be
160 						flushed */
161 	ulint			n_slots;	/*!< total number of slots */
162 	ulint			n_slots_requested;
163 						/*!< number of slots
164 						in the state
165 						PAGE_CLEANER_STATE_REQUESTED */
166 	ulint			n_slots_flushing;
167 						/*!< number of slots
168 						in the state
169 						PAGE_CLEANER_STATE_FLUSHING */
170 	ulint			n_slots_finished;
171 						/*!< number of slots
172 						in the state
173 						PAGE_CLEANER_STATE_FINISHED */
174 	ulint			flush_time;	/*!< elapsed time to flush
175 						requests for all slots */
176 	ulint			flush_pass;	/*!< number of passes to finish
177 						flush requests for all slots */
178 	page_cleaner_slot_t	slots[MAX_BUFFER_POOLS];
179 	bool			is_running;	/*!< false if shutdown has
180 						been requested */
181 
182 #ifdef UNIV_DEBUG
183 	ulint			n_disabled_debug;
184 						/*!< number of page cleaner
185 						threads that have been disabled */
186 #endif /* UNIV_DEBUG */
187 };
188 
189 static page_cleaner_t	page_cleaner;
190 
191 #ifdef UNIV_DEBUG
192 my_bool innodb_page_cleaner_disabled_debug;
193 #endif /* UNIV_DEBUG */
194 
195 /* @} */
196 
197 /******************************************************************//**
198 Increases the flush_list size in bytes by the physical page size. */
199 static inline
200 void
201 incr_flush_list_size_in_bytes(
202 /*==========================*/
203 	buf_block_t*	block,		/*!< in: control block */
204 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
205 {
206 	ut_ad(buf_flush_list_mutex_own(buf_pool));
207 
208 	buf_pool->stat.flush_list_bytes += block->page.size.physical();
209 
210 	ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
211 }
212 
213 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
214 /******************************************************************//**
215 Validates the flush list.
216 @return TRUE if ok */
217 static
218 ibool
219 buf_flush_validate_low(
220 /*===================*/
221 	buf_pool_t*	buf_pool);	/*!< in: Buffer pool instance */
222 
223 /******************************************************************//**
224 Validates the flush list some of the time.
225 @return TRUE if ok or the check was skipped */
226 static
227 ibool
228 buf_flush_validate_skip(
229 /*====================*/
230 	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
231 {
232 /** Try buf_flush_validate_low() every this many times */
233 # define BUF_FLUSH_VALIDATE_SKIP	23
234 
235 	/** The buf_flush_validate_low() call skip counter.
236 	Use a signed type because of the race condition below. */
237 	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
238 
239 	/* There is a race condition below, but it does not matter,
240 	because this call is only for heuristic purposes. We want to
241 	reduce the call frequency of the costly buf_flush_validate_low()
242 	check in debug builds. */
243 	if (--buf_flush_validate_count > 0) {
244 		return(TRUE);
245 	}
246 
247 	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
248 	return(buf_flush_validate_low(buf_pool));
249 }
250 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
251 
252 /******************************************************************//**
253 Inserts a block into the flush_rbt and returns a pointer to its
254 predecessor, or NULL if there is no predecessor. The ordering is maintained
255 on the basis of the <oldest_modification, space, offset> key.
256 @return pointer to the predecessor or NULL if no predecessor. */
257 static
258 buf_page_t*
259 buf_flush_insert_in_flush_rbt(
260 /*==========================*/
261 	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
262 {
263 	const ib_rbt_node_t*	c_node;
264 	const ib_rbt_node_t*	p_node;
265 	buf_page_t*		prev = NULL;
266 	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
267 
268 	ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
269 	ut_ad(buf_flush_list_mutex_own(buf_pool));
270 
271 	/* Insert this buffer into the rbt. */
272 	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
273 	ut_a(c_node != NULL);
274 
275 	/* Get the predecessor. */
276 	p_node = rbt_prev(buf_pool->flush_rbt, c_node);
277 
278 	if (p_node != NULL) {
279 		buf_page_t**	value;
280 		value = rbt_value(buf_page_t*, p_node);
281 		prev = *value;
282 		ut_a(prev != NULL);
283 	}
284 
285 	return(prev);
286 }
287 
288 /*********************************************************//**
289 Delete a bpage from the flush_rbt. */
290 static
291 void
292 buf_flush_delete_from_flush_rbt(
293 /*============================*/
294 	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
295 {
296 #ifdef UNIV_DEBUG
297 	ibool		ret = FALSE;
298 #endif /* UNIV_DEBUG */
299 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
300 
301 	ut_ad(buf_flush_list_mutex_own(buf_pool));
302 
303 #ifdef UNIV_DEBUG
304 	ret =
305 #endif /* UNIV_DEBUG */
306 	rbt_delete(buf_pool->flush_rbt, &bpage);
307 
308 	ut_ad(ret);
309 }
310 
311 /*****************************************************************//**
312 Compare two modified blocks in the buffer pool. The key for comparison
313 is:
314 key = <oldest_modification, space, offset>
315 This comparison is used to maintain the ordering of blocks in the
316 buf_pool->flush_rbt.
317 Note that for the purpose of flush_rbt, we only need to order blocks
318 on the oldest_modification. The other two fields are used to uniquely
319 identify the blocks.
320 @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
321 static
322 int
323 buf_flush_block_cmp(
324 /*================*/
325 	const void*	p1,		/*!< in: block1 */
326 	const void*	p2)		/*!< in: block2 */
327 {
328 	int			ret;
329 	const buf_page_t*	b1 = *(const buf_page_t**) p1;
330 	const buf_page_t*	b2 = *(const buf_page_t**) p2;
331 
332 	ut_ad(b1 != NULL);
333 	ut_ad(b2 != NULL);
334 
335 #ifdef UNIV_DEBUG
336 	buf_pool_t*	buf_pool = buf_pool_from_bpage(b1);
337 #endif /* UNIV_DEBUG */
338 
339 	ut_ad(buf_flush_list_mutex_own(buf_pool));
340 
341 	ut_ad(b1->in_flush_list);
342 	ut_ad(b2->in_flush_list);
343 
344 	if (b2->oldest_modification > b1->oldest_modification) {
345 		return(1);
346 	} else if (b2->oldest_modification < b1->oldest_modification) {
347 		return(-1);
348 	}
349 
350 	/* If oldest_modification is same then decide on the space. */
351 	ret = (int)(b2->id.space() - b1->id.space());
352 
353 	/* Or else decide ordering on the page number. */
354 	return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no()));
355 }
356 
357 /********************************************************************//**
358 Initialize the red-black tree to speed up insertions into the flush_list
359 during the recovery process. Should be called at the start of recovery,
360 before any page has been read or written. */
361 void
362 buf_flush_init_flush_rbt(void)
363 /*==========================*/
364 {
365 	ulint	i;
366 
367 	for (i = 0; i < srv_buf_pool_instances; i++) {
368 		buf_pool_t*	buf_pool;
369 
370 		buf_pool = buf_pool_from_array(i);
371 
372 		buf_flush_list_mutex_enter(buf_pool);
373 
374 		ut_ad(buf_pool->flush_rbt == NULL);
375 
376 		/* Create red black tree for speedy insertions in flush list. */
377 		buf_pool->flush_rbt = rbt_create(
378 			sizeof(buf_page_t*), buf_flush_block_cmp);
379 
380 		buf_flush_list_mutex_exit(buf_pool);
381 	}
382 }
383 
384 /********************************************************************//**
385 Frees up the red-black tree. */
386 void
387 buf_flush_free_flush_rbt(void)
388 /*==========================*/
389 {
390 	ulint	i;
391 
392 	for (i = 0; i < srv_buf_pool_instances; i++) {
393 		buf_pool_t*	buf_pool;
394 
395 		buf_pool = buf_pool_from_array(i);
396 
397 		buf_flush_list_mutex_enter(buf_pool);
398 
399 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
400 		ut_a(buf_flush_validate_low(buf_pool));
401 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
402 
403 		rbt_free(buf_pool->flush_rbt);
404 		buf_pool->flush_rbt = NULL;
405 
406 		buf_flush_list_mutex_exit(buf_pool);
407 	}
408 }
409 
410 /********************************************************************//**
411 Inserts a modified block into the flush list. */
412 void
413 buf_flush_insert_into_flush_list(
414 /*=============================*/
415 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
416 	buf_block_t*	block,		/*!< in/out: block which is modified */
417 	lsn_t		lsn)		/*!< in: oldest modification */
418 {
419 	ut_ad(!buf_pool_mutex_own(buf_pool));
420 	ut_ad(log_flush_order_mutex_own());
421 	ut_ad(buf_page_mutex_own(block));
422 
423 	buf_flush_list_mutex_enter(buf_pool);
424 
425 	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
426 	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
427 		  <= lsn));
428 
429 	/* If we are in recovery, then we need to update the flush
430 	red-black tree as well. */
431 	if (buf_pool->flush_rbt != NULL) {
432 		buf_flush_list_mutex_exit(buf_pool);
433 		buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
434 		return;
435 	}
436 
437 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
438 	ut_ad(!block->page.in_flush_list);
439 
440 	ut_d(block->page.in_flush_list = TRUE);
441 	block->page.oldest_modification = lsn;
442 
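	/* Newly modified pages have the largest oldest_modification
	(see the assertion above), so adding at the head keeps the
	flush_list ordered by decreasing oldest_modification. */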
443 	UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
444 
445 	incr_flush_list_size_in_bytes(block, buf_pool);
446 
447 	MEM_CHECK_DEFINED(block->page.size.is_compressed()
448 			  ? block->page.zip.data : block->frame,
449 			  block->page.size.physical());
450 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
451 	ut_a(buf_flush_validate_skip(buf_pool));
452 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
453 
454 	buf_flush_list_mutex_exit(buf_pool);
455 }
456 
457 /********************************************************************//**
458 Inserts a modified block into the flush list in the right sorted position.
459 This function is used by recovery, because during recovery the modifications
460 do not necessarily arrive in the order of their LSNs. */
461 void
462 buf_flush_insert_sorted_into_flush_list(
463 /*====================================*/
464 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
465 	buf_block_t*	block,		/*!< in/out: block which is modified */
466 	lsn_t		lsn)		/*!< in: oldest modification */
467 {
468 	buf_page_t*	prev_b;
469 	buf_page_t*	b;
470 
471 	ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
472 	ut_ad(!buf_pool_mutex_own(buf_pool));
473 	ut_ad(log_flush_order_mutex_own());
474 	ut_ad(buf_page_mutex_own(block));
475 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
476 
477 	buf_flush_list_mutex_enter(buf_pool);
478 
479 	/* The field in_LRU_list is protected by buf_pool->mutex, which
480 	we are not holding.  However, while a block is in the flush
481 	list, it is dirty and cannot be discarded, neither from the
482 	page_hash nor from the LRU list.  At most, the uncompressed
483 	page frame of a compressed block may be discarded or created
484 	(copying the block->page to or from a buf_page_t that is
485 	dynamically allocated from buf_buddy_alloc()).  Because those
486 	transitions hold block->mutex and the flush list mutex (via
487 	buf_flush_relocate_on_flush_list()), there is no possibility
488 	of a race condition in the assertions below. */
489 	ut_ad(block->page.in_LRU_list);
490 	ut_ad(block->page.in_page_hash);
491 	/* buf_buddy_block_register() will take a block in the
492 	BUF_BLOCK_MEMORY state, not a file page. */
493 	ut_ad(!block->page.in_zip_hash);
494 
495 	ut_ad(!block->page.in_flush_list);
496 	ut_d(block->page.in_flush_list = TRUE);
497 	block->page.oldest_modification = lsn;
498 
499 	MEM_CHECK_DEFINED(block->page.size.is_compressed()
500 			  ? block->page.zip.data : block->frame,
501 			  block->page.size.physical());
502 
503 	prev_b = NULL;
504 
505 	/* For the most part when this function is called the flush_rbt
506 	should not be NULL. In a very rare boundary case it is possible
507 	that the flush_rbt has already been freed by the recovery thread
508 	before the last page was hooked up in the flush_list by the
509 	io-handler thread. In that case we'll just do a simple
510 	linear search in the else block. */
511 	if (buf_pool->flush_rbt != NULL) {
512 
513 		prev_b = buf_flush_insert_in_flush_rbt(&block->page);
514 
515 	} else {
516 
517 		b = UT_LIST_GET_FIRST(buf_pool->flush_list);
518 
519 		while (b != NULL && b->oldest_modification
520 		       > block->page.oldest_modification) {
521 
522 			ut_ad(b->in_flush_list);
523 			prev_b = b;
524 			b = UT_LIST_GET_NEXT(list, b);
525 		}
526 	}
527 
528 	if (prev_b == NULL) {
529 		UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
530 	} else {
531 		UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page);
532 	}
533 
534 	incr_flush_list_size_in_bytes(block, buf_pool);
535 
536 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
537 	ut_a(buf_flush_validate_low(buf_pool));
538 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
539 
540 	buf_flush_list_mutex_exit(buf_pool);
541 }
542 
543 /********************************************************************//**
544 Returns TRUE if the file page block is immediately suitable for replacement,
545 i.e., the transition FILE_PAGE => NOT_USED is allowed.
546 @return TRUE if can replace immediately */
547 ibool
548 buf_flush_ready_for_replace(
549 /*========================*/
550 	buf_page_t*	bpage)	/*!< in: buffer control block, must be
551 				buf_page_in_file(bpage) and in the LRU list */
552 {
553 #ifdef UNIV_DEBUG
554 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
555 	ut_ad(buf_pool_mutex_own(buf_pool));
556 #endif /* UNIV_DEBUG */
557 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
558 	ut_ad(bpage->in_LRU_list);
559 	ut_a(buf_page_in_file(bpage));
560 
561 	return bpage->oldest_modification == 0
562 		&& bpage->buf_fix_count == 0
563 		&& buf_page_get_io_fix(bpage) == BUF_IO_NONE;
564 }
565 
566 /********************************************************************//**
567 Returns true if the block is modified and ready for flushing.
568 @return true if can flush immediately */
569 bool
570 buf_flush_ready_for_flush(
571 /*======================*/
572 	buf_page_t*	bpage,	/*!< in: buffer control block, must be
573 				buf_page_in_file(bpage) */
574 	buf_flush_t	flush_type)/*!< in: type of flush */
575 {
576 #ifdef UNIV_DEBUG
577 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
578 	ut_ad(buf_pool_mutex_own(buf_pool));
579 #endif /* UNIV_DEBUG */
580 
581 	ut_a(buf_page_in_file(bpage));
582 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
583 	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
584 
585 	if (bpage->oldest_modification == 0
586 	    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
587 		return(false);
588 	}
589 
590 	ut_ad(bpage->in_flush_list);
591 
592 	switch (flush_type) {
593 	case BUF_FLUSH_LIST:
594 	case BUF_FLUSH_LRU:
595 	case BUF_FLUSH_SINGLE_PAGE:
596 		return(true);
597 
598 	case BUF_FLUSH_N_TYPES:
599 		break;
600 	}
601 
602 	ut_error;
603 	return(false);
604 }
605 
606 /********************************************************************//**
607 Remove a block from the flush list of modified blocks. */
608 void
609 buf_flush_remove(
610 /*=============*/
611 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
612 {
613 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
614 
615 #if 0 // FIXME: Rate-limit the output. Move this to the page cleaner?
616 	if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)) {
617 		service_manager_extend_timeout(
618 			INNODB_EXTEND_TIMEOUT_INTERVAL,
619 			"Flush and remove page with tablespace id %u"
620 			", Poolid " ULINTPF ", flush list length " ULINTPF,
621 			bpage->space, buf_pool->instance_no,
622 			UT_LIST_GET_LEN(buf_pool->flush_list));
623 	}
624 #endif
625 
626 	ut_ad(buf_pool_mutex_own(buf_pool));
627 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
628 	ut_ad(bpage->in_flush_list);
629 
630 	buf_flush_list_mutex_enter(buf_pool);
631 
632 	/* Important that we adjust the hazard pointer before removing
633 	the bpage from the flush list. */
634 	buf_pool->flush_hp.adjust(bpage);
635 
636 	switch (buf_page_get_state(bpage)) {
637 	case BUF_BLOCK_POOL_WATCH:
638 	case BUF_BLOCK_ZIP_PAGE:
639 		/* Clean compressed pages should not be on the flush list */
640 	case BUF_BLOCK_NOT_USED:
641 	case BUF_BLOCK_READY_FOR_USE:
642 	case BUF_BLOCK_MEMORY:
643 	case BUF_BLOCK_REMOVE_HASH:
644 		ut_error;
645 		return;
646 	case BUF_BLOCK_ZIP_DIRTY:
647 		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
648 		UT_LIST_REMOVE(buf_pool->flush_list, bpage);
649 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
650 		buf_LRU_insert_zip_clean(bpage);
651 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
652 		break;
653 	case BUF_BLOCK_FILE_PAGE:
654 		UT_LIST_REMOVE(buf_pool->flush_list, bpage);
655 		break;
656 	}
657 
658 	/* If the flush_rbt is active then delete from there as well. */
659 	if (buf_pool->flush_rbt != NULL) {
660 		buf_flush_delete_from_flush_rbt(bpage);
661 	}
662 
663 	/* Must be done after we have removed it from the flush_rbt
664 	because we assert on in_flush_list in the comparison function. */
665 	ut_d(bpage->in_flush_list = FALSE);
666 
667 	buf_pool->stat.flush_list_bytes -= bpage->size.physical();
668 
669 	bpage->oldest_modification = 0;
670 
671 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
672 	ut_a(buf_flush_validate_skip(buf_pool));
673 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
674 
675 	/* If there is an observer that wants to know whether the asynchronous
676 	flushing is done, notify it. */
677 	if (bpage->flush_observer != NULL) {
678 		bpage->flush_observer->notify_remove(buf_pool, bpage);
679 
680 		bpage->flush_observer = NULL;
681 	}
682 
683 	buf_flush_list_mutex_exit(buf_pool);
684 }
685 
686 /*******************************************************************//**
687 Relocates a buffer control block on the flush_list.
688 Note that it is assumed that the contents of bpage have already been
689 copied to dpage.
690 IMPORTANT: When this function is called bpage and dpage are not
691 exact copies of each other. For example, they both will have different
692 ::state. Also the ::list pointers in dpage may be stale. We need to
693 use the current list node (bpage) to do the list manipulation because
694 the list pointers could have changed between the time that we copied
695 the contents of bpage to the dpage and the flush list manipulation
696 below. */
697 void
698 buf_flush_relocate_on_flush_list(
699 /*=============================*/
700 	buf_page_t*	bpage,	/*!< in/out: control block being moved */
701 	buf_page_t*	dpage)	/*!< in/out: destination block */
702 {
703 	buf_page_t*	prev;
704 	buf_page_t*	prev_b = NULL;
705 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
706 
707 	ut_ad(buf_pool_mutex_own(buf_pool));
708 	/* Must reside in the same buffer pool. */
709 	ut_ad(buf_pool == buf_pool_from_bpage(dpage));
710 
711 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
712 
713 	buf_flush_list_mutex_enter(buf_pool);
714 
715 	/* FIXME: At this point we have both buf_pool and flush_list
716 	mutexes. Theoretically removal of a block from flush list is
717 	only covered by flush_list mutex but currently we do
718 	have buf_pool mutex in buf_flush_remove() therefore this block
719 	is guaranteed to be in the flush list. We need to check if
720 	this will work without the assumption of block removing code
721 	having the buf_pool mutex. */
722 	ut_ad(bpage->in_flush_list);
723 	ut_ad(dpage->in_flush_list);
724 
725 	/* If recovery is active we must swap the control blocks in
726 	the flush_rbt as well. */
727 	if (buf_pool->flush_rbt != NULL) {
728 		buf_flush_delete_from_flush_rbt(bpage);
729 		prev_b = buf_flush_insert_in_flush_rbt(dpage);
730 	}
731 
732 	/* Important that we adjust the hazard pointer before removing
733 	the bpage from the flush list. */
734 	buf_pool->flush_hp.adjust(bpage);
735 
736 	/* Must be done after we have removed it from the flush_rbt
737 	because we assert on in_flush_list in the comparison function. */
738 	ut_d(bpage->in_flush_list = FALSE);
739 
740 	prev = UT_LIST_GET_PREV(list, bpage);
741 	UT_LIST_REMOVE(buf_pool->flush_list, bpage);
742 
743 	if (prev) {
744 		ut_ad(prev->in_flush_list);
745 		UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage);
746 	} else {
747 		UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage);
748 	}
749 
750 	/* Just an extra check. The previous block in the flush_list
751 	should be the same control block as in the flush_rbt. */
752 	ut_a(buf_pool->flush_rbt == NULL || prev_b == prev);
753 
754 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
755 	ut_a(buf_flush_validate_low(buf_pool));
756 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
757 
758 	buf_flush_list_mutex_exit(buf_pool);
759 }
760 
761 /** Update the flush system data structures when a write is completed.
762 @param[in,out]	bpage	flushed page
763 @param[in]	dblwr	whether the doublewrite buffer was used */
764 void buf_flush_write_complete(buf_page_t* bpage, bool dblwr)
765 {
766 	buf_flush_t	flush_type;
767 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
768 
769 	ut_ad(bpage);
770 
771 	buf_flush_remove(bpage);
772 
773 	flush_type = buf_page_get_flush_type(bpage);
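	/* One fewer write of this flush type is now pending; the
	assertion below catches an underflow, which would wrap the
	counter around to ULINT_MAX. */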
774 	buf_pool->n_flush[flush_type]--;
775 	ut_ad(buf_pool->n_flush[flush_type] != ULINT_MAX);
776 
777 	ut_ad(buf_pool_mutex_own(buf_pool));
778 
779 	if (buf_pool->n_flush[flush_type] == 0
780 	    && buf_pool->init_flush[flush_type] == FALSE) {
781 
782 		/* The running flush batch has ended */
783 
784 		os_event_set(buf_pool->no_flush[flush_type]);
785 	}
786 
787 	if (dblwr) {
788 		buf_dblwr_update(bpage, flush_type);
789 	}
790 }
791 
792 /** Calculate the checksum of a page from a compressed table and update
793 the page.
794 @param[in,out]	page	page to update
795 @param[in]	size	compressed page size
796 @param[in]	lsn	LSN to stamp on the page */
797 void
798 buf_flush_update_zip_checksum(
799 	buf_frame_t*	page,
800 	ulint		size,
801 	lsn_t		lsn)
802 {
803 	ut_a(size > 0);
804 
805 	const uint32_t	checksum = page_zip_calc_checksum(
806 		page, size,
807 		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
808 
809 	mach_write_to_8(page + FIL_PAGE_LSN, lsn);
810 	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
811 }
812 
813 /** Initialize a page for writing to the tablespace.
814 @param[in]	block		buffer block; NULL if bypassing the buffer pool
815 @param[in,out]	page		page frame
816 @param[in,out]	page_zip_	compressed page, or NULL if uncompressed
817 @param[in]	newest_lsn	newest modification LSN to the page */
818 void
819 buf_flush_init_for_writing(
820 	const buf_block_t*	block,
821 	byte*			page,
822 	void*			page_zip_,
823 	lsn_t			newest_lsn)
824 {
825 	ut_ad(block == NULL || block->frame == page);
826 	ut_ad(block == NULL || page_zip_ == NULL
827 	      || &block->page.zip == page_zip_);
828 	ut_ad(!block || newest_lsn);
829 	ut_ad(page);
830 #if 0 /* MDEV-15528 TODO: reinstate this check */
831 	/* innodb_immediate_scrub_data_uncompressed=ON would cause
832 	fsp_init_file_page() to be called on freed pages, and thus
833 	cause them to be written as almost-all-zeroed.
834 	In MDEV-15528 we should change that and implement an option to
835 	make freed pages appear all-zero, bypassing this code. */
836 	ut_ad(!newest_lsn || fil_page_get_type(page));
837 #endif
838 
839 	if (page_zip_) {
840 		page_zip_des_t*	page_zip;
841 		ulint		size;
842 
843 		page_zip = static_cast<page_zip_des_t*>(page_zip_);
844 		size = page_zip_get_size(page_zip);
845 
846 		ut_ad(size);
847 		ut_ad(ut_is_2pow(size));
848 		ut_ad(size <= UNIV_ZIP_SIZE_MAX);
849 
850 		switch (fil_page_get_type(page)) {
851 		case FIL_PAGE_TYPE_ALLOCATED:
852 		case FIL_PAGE_INODE:
853 		case FIL_PAGE_IBUF_BITMAP:
854 		case FIL_PAGE_TYPE_FSP_HDR:
855 		case FIL_PAGE_TYPE_XDES:
856 			/* These are essentially uncompressed pages. */
857 			memcpy(page_zip->data, page, size);
858 			/* fall through */
859 		case FIL_PAGE_TYPE_ZBLOB:
860 		case FIL_PAGE_TYPE_ZBLOB2:
861 		case FIL_PAGE_INDEX:
862 		case FIL_PAGE_RTREE:
863 
864 			buf_flush_update_zip_checksum(
865 				page_zip->data, size, newest_lsn);
866 
867 			return;
868 		}
869 
870 		ib::error() << "The compressed page to be written"
871 			" seems corrupt:";
872 		ut_print_buf(stderr, page, size);
873 		fputs("\nInnoDB: Possibly older version of the page:", stderr);
874 		ut_print_buf(stderr, page_zip->data, size);
875 		putc('\n', stderr);
876 		ut_error;
877 	}
878 
879 	/* Write the newest modification lsn to the page header and trailer */
880 	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
881 
882 	mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
883 			newest_lsn);
884 
885 	if (block && srv_page_size == 16384) {
886 		/* The page type could be garbage in old files
887 		created before MySQL 5.5. Such files always
888 		had a page size of 16 kilobytes. */
889 		ulint	page_type = fil_page_get_type(page);
890 		ulint	reset_type = page_type;
891 
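		/* With 16KiB pages, extent descriptor pages (FSP_HDR or
		XDES) recur every 16384 pages and are each followed by a
		change buffer bitmap page, hence the modulo below. */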
892 		switch (block->page.id.page_no() % 16384) {
893 		case 0:
894 			reset_type = block->page.id.page_no() == 0
895 				? FIL_PAGE_TYPE_FSP_HDR
896 				: FIL_PAGE_TYPE_XDES;
897 			break;
898 		case 1:
899 			reset_type = FIL_PAGE_IBUF_BITMAP;
900 			break;
901 		case FSP_TRX_SYS_PAGE_NO:
902 			if (block->page.id.page_no()
903 			    == TRX_SYS_PAGE_NO
904 			    && block->page.id.space()
905 			    == TRX_SYS_SPACE) {
906 				reset_type = FIL_PAGE_TYPE_TRX_SYS;
907 				break;
908 			}
909 			/* fall through */
910 		default:
911 			switch (page_type) {
912 			case FIL_PAGE_INDEX:
913 			case FIL_PAGE_TYPE_INSTANT:
914 			case FIL_PAGE_RTREE:
915 			case FIL_PAGE_UNDO_LOG:
916 			case FIL_PAGE_INODE:
917 			case FIL_PAGE_IBUF_FREE_LIST:
918 			case FIL_PAGE_TYPE_ALLOCATED:
919 			case FIL_PAGE_TYPE_SYS:
920 			case FIL_PAGE_TYPE_TRX_SYS:
921 			case FIL_PAGE_TYPE_BLOB:
922 			case FIL_PAGE_TYPE_ZBLOB:
923 			case FIL_PAGE_TYPE_ZBLOB2:
924 				break;
925 			case FIL_PAGE_TYPE_FSP_HDR:
926 			case FIL_PAGE_TYPE_XDES:
927 			case FIL_PAGE_IBUF_BITMAP:
928 				/* These pages should have
929 				predetermined page numbers
930 				(see above). */
931 			default:
932 				reset_type = FIL_PAGE_TYPE_UNKNOWN;
933 				break;
934 			}
935 		}
936 
937 		if (UNIV_UNLIKELY(page_type != reset_type)) {
938 			ib::info()
939 				<< "Resetting invalid page "
940 				<< block->page.id << " type "
941 				<< page_type << " to "
942 				<< reset_type << " when flushing.";
943 			fil_page_set_type(page, reset_type);
944 		}
945 	}
946 
947 	uint32_t checksum = BUF_NO_CHECKSUM_MAGIC;
948 
949 	switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
950 	case SRV_CHECKSUM_ALGORITHM_INNODB:
951 	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
952 		checksum = buf_calc_page_new_checksum(page);
953 		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
954 				checksum);
955 		/* With the InnoDB checksum, we overwrite the first 4 bytes of
956 		the end lsn field to store the old formula checksum. Since it
957 		depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
958 		be calculated after storing the new formula checksum. */
959 		checksum = buf_calc_page_old_checksum(page);
960 		break;
961 	case SRV_CHECKSUM_ALGORITHM_CRC32:
962 	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
963 		/* In other cases we write the same checksum to both fields. */
964 		checksum = buf_calc_page_crc32(page);
965 		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
966 				checksum);
967 		break;
968 	case SRV_CHECKSUM_ALGORITHM_NONE:
969 	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
970 		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
971 				checksum);
972 		break;
973 		/* no default so the compiler will emit a warning if
974 		new enum is added and not handled here */
975 	}
976 
977 	mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
978 			checksum);
979 }
980 
981 /********************************************************************//**
982 Does an asynchronous write of a buffer page. NOTE: in simulated aio and
983 also when the doublewrite buffer is used, we must call
984 buf_dblwr_flush_buffered_writes after we have posted a batch of
985 writes! */
986 static
987 void
988 buf_flush_write_block_low(
989 /*======================*/
990 	buf_page_t*	bpage,		/*!< in: buffer block to write */
991 	buf_flush_t	flush_type,	/*!< in: type of flush */
992 	bool		sync)		/*!< in: true if sync IO request */
993 {
994 	fil_space_t* space = fil_space_acquire_for_io(bpage->id.space());
995 	if (!space) {
996 		return;
997 	}
998 	ut_ad(space->purpose == FIL_TYPE_TEMPORARY
999 	      || space->purpose == FIL_TYPE_IMPORT
1000 	      || space->purpose == FIL_TYPE_TABLESPACE);
1001 	ut_ad((space->purpose == FIL_TYPE_TEMPORARY)
1002 	      == (space == fil_system.temp_space));
1003 	page_t*	frame = NULL;
1004 #ifdef UNIV_DEBUG
1005 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1006 	ut_ad(!buf_pool_mutex_own(buf_pool));
1007 #endif /* UNIV_DEBUG */
1008 
1009 	DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u",
1010 			      sync ? "sync" : "async", (unsigned) flush_type,
1011 			      bpage->id.space(), bpage->id.page_no()));
1012 
1013 	ut_ad(buf_page_in_file(bpage));
1014 
1015 	/* We are not holding buf_pool->mutex or block_mutex here.
1016 	Nevertheless, it is safe to access bpage, because it is
1017 	io_fixed and oldest_modification != 0.  Thus, it cannot be
1018 	relocated in the buffer pool or removed from flush_list or
1019 	LRU_list. */
1020 	ut_ad(!buf_pool_mutex_own(buf_pool));
1021 	ut_ad(!buf_flush_list_mutex_own(buf_pool));
1022 	ut_ad(!buf_page_get_mutex(bpage)->is_owned());
1023 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1024 	ut_ad(bpage->oldest_modification != 0);
1025 	ut_ad(bpage->newest_modification != 0);
1026 
1027 	/* Force the log to the disk before writing the modified block */
1028 	if (!srv_read_only_mode) {
1029 		log_write_up_to(bpage->newest_modification, true);
1030 	}
1031 
1032 	switch (buf_page_get_state(bpage)) {
1033 	case BUF_BLOCK_POOL_WATCH:
1034 	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1035 	case BUF_BLOCK_NOT_USED:
1036 	case BUF_BLOCK_READY_FOR_USE:
1037 	case BUF_BLOCK_MEMORY:
1038 	case BUF_BLOCK_REMOVE_HASH:
1039 		ut_error;
1040 		break;
1041 	case BUF_BLOCK_ZIP_DIRTY:
1042 		frame = bpage->zip.data;
1043 
1044 		buf_flush_update_zip_checksum(frame, bpage->size.physical(),
1045 					      bpage->newest_modification);
1046 		break;
1047 	case BUF_BLOCK_FILE_PAGE:
1048 		frame = bpage->zip.data;
1049 		if (!frame) {
1050 			frame = ((buf_block_t*) bpage)->frame;
1051 		}
1052 
1053 		buf_flush_init_for_writing(
1054 			reinterpret_cast<const buf_block_t*>(bpage),
1055 			reinterpret_cast<const buf_block_t*>(bpage)->frame,
1056 			bpage->zip.data ? &bpage->zip : NULL,
1057 			bpage->newest_modification);
1058 		break;
1059 	}
1060 
1061 	frame = buf_page_encrypt_before_write(space, bpage, frame);
1062 
1063 	ut_ad(space->purpose == FIL_TYPE_TABLESPACE
1064 	      || space->atomic_write_supported);
1065 	if (!space->use_doublewrite()) {
1066 		ulint	type = IORequest::WRITE | IORequest::DO_NOT_WAKE;
1067 
1068 		IORequest	request(type, bpage);
1069 
1070 		/* TODO: pass the tablespace to fil_io() */
1071 		fil_io(request,
1072 		       sync, bpage->id, bpage->size, 0, bpage->size.physical(),
1073 		       frame, bpage);
1074 	} else {
1075 		ut_ad(!srv_read_only_mode);
1076 
1077 		if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
1078 			buf_dblwr_write_single_page(bpage, sync);
1079 		} else {
1080 			ut_ad(!sync);
1081 			buf_dblwr_add_to_batch(bpage);
1082 		}
1083 	}
1084 
1085 	/* When doing single page flushing the IO is done synchronously
1086 	and we flush the changes to disk only for the tablespace we
1087 	are working on. */
1088 	if (sync) {
1089 		ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
1090 		if (space->purpose != FIL_TYPE_TEMPORARY) {
1091 			fil_flush(space);
1092 		}
1093 
1094 		/* The tablespace could already have been dropped,
1095 		because fil_io(request, sync) would already have
1096 		decremented the node->n_pending. However,
1097 		buf_page_io_complete() only needs to look up the
1098 		tablespace during read requests, not during writes. */
1099 		ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1100 #ifdef UNIV_DEBUG
1101 		dberr_t err =
1102 #endif
1103 		/* true means we want to evict this page from the
1104 		LRU list as well. */
1105 		buf_page_io_complete(bpage, space->use_doublewrite(), true);
1106 
1107 		ut_ad(err == DB_SUCCESS);
1108 	}
1109 
1110 	space->release_for_io();
1111 
1112 	/* Increment the counter of I/O operations used
1113 	for selecting LRU policy. */
1114 	buf_LRU_stat_inc_io();
1115 }
1116 
1117 /********************************************************************//**
1118 Writes a flushable page asynchronously from the buffer pool to a file.
1119 NOTE: in simulated aio we must call
1120 os_aio_simulated_wake_handler_threads after we have posted a batch of
1121 writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
1122 held upon entering this function, and they will be released by this
1123 function if it returns true.
1124 @return TRUE if the page was flushed */
1125 ibool
1126 buf_flush_page(
1127 /*===========*/
1128 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1129 	buf_page_t*	bpage,		/*!< in: buffer control block */
1130 	buf_flush_t	flush_type,	/*!< in: type of flush */
1131 	bool		sync)		/*!< in: true if sync IO request */
1132 {
1133 	BPageMutex*	block_mutex;
1134 
1135 	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1136 	ut_ad(buf_pool_mutex_own(buf_pool));
1137 	ut_ad(buf_page_in_file(bpage));
1138 	ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1139 
1140 	block_mutex = buf_page_get_mutex(bpage);
1141 	ut_ad(mutex_own(block_mutex));
1142 
1143 	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1144 
1145 	bool	is_uncompressed;
1146 
1147 	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1148 	ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1149 
1150 	ibool		flush;
1151 	rw_lock_t*	rw_lock;
1152 	bool		no_fix_count = bpage->buf_fix_count == 0;
1153 
1154 	if (!is_uncompressed) {
1155 		flush = TRUE;
1156 		rw_lock = NULL;
1157 	} else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)
1158 		   || (!no_fix_count
1159 		       && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP
1160 		       && fsp_is_system_temporary(bpage->id.space()))) {
1161 		/* This is a heuristic, to avoid expensive SX attempts. */
1162 		/* For a table residing in the temporary tablespace, sync is
1163 		done using IO_FIX, so before scheduling the flush ensure that
1164 		the page is not fixed. */
1165 		flush = FALSE;
1166 	} else {
1167 		rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1168 		if (flush_type != BUF_FLUSH_LIST) {
1169 			flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
1170 		} else {
1171 			/* Will SX lock later */
1172 			flush = TRUE;
1173 		}
1174 	}
1175 
1176 	if (flush) {
1177 
1178 		/* We are committed to flushing by the time we get here */
1179 
1180 		buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1181 
1182 		buf_page_set_flush_type(bpage, flush_type);
1183 
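		/* If this is the first page of a new batch of this
		flush type, reset the "no flush in progress" event so
		that waiters block until buf_flush_write_complete()
		sets it again when the batch ends. */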
1184 		if (buf_pool->n_flush[flush_type] == 0) {
1185 			os_event_reset(buf_pool->no_flush[flush_type]);
1186 		}
1187 
1188 		++buf_pool->n_flush[flush_type];
1189 		ut_ad(buf_pool->n_flush[flush_type] != 0);
1190 
1191 		mutex_exit(block_mutex);
1192 
1193 		buf_pool_mutex_exit(buf_pool);
1194 
1195 		if (flush_type == BUF_FLUSH_LIST
1196 		    && is_uncompressed
1197 		    && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
1198 
1199 			if (!fsp_is_system_temporary(bpage->id.space())) {
1200 				/* To avoid a possible deadlock involving the
1201 				doublewrite buffer, flush it, because it
1202 				might be holding another block->lock. */
1203 				buf_dblwr_flush_buffered_writes();
1204 			} else {
1205 				buf_dblwr_sync_datafiles();
1206 			}
1207 
1208 			rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
1209 		}
1210 
1211 		/* If there is an observer that wants to know whether the
1212 		asynchronous flushing was dispatched, notify it.
1213 		Note: we set the flush observer on a page with the x-latch held,
1214 		so we can guarantee that notify_flush and notify_remove are
1215 		called as a pair with the s-latch held on an uncompressed page. */
1216 		if (bpage->flush_observer != NULL) {
1217 			buf_pool_mutex_enter(buf_pool);
1218 
1219 			bpage->flush_observer->notify_flush(buf_pool, bpage);
1220 
1221 			buf_pool_mutex_exit(buf_pool);
1222 		}
1223 
1224 		/* Even though bpage is not protected by any mutex at this
1225 		point, it is safe to access bpage, because it is io_fixed and
1226 		oldest_modification != 0.  Thus, it cannot be relocated in the
1227 		buffer pool or removed from flush_list or LRU_list. */
1228 
1229 		buf_flush_write_block_low(bpage, flush_type, sync);
1230 	}
1231 
1232 	return(flush);
1233 }
1234 
1235 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1236 /********************************************************************//**
1237 Writes a flushable page asynchronously from the buffer pool to a file.
1238 NOTE: buf_pool->mutex and block->mutex must be held upon entering this
1239 function, and they will be released by this function after flushing.
1240 This is loosely based on buf_flush_batch() and buf_flush_page().
1241 @return TRUE if the page was flushed and the mutexes released */
1242 ibool
1243 buf_flush_page_try(
1244 /*===============*/
1245 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
1246 	buf_block_t*	block)		/*!< in/out: buffer control block */
1247 {
1248 	ut_ad(buf_pool_mutex_own(buf_pool));
1249 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1250 	ut_ad(buf_page_mutex_own(block));
1251 
1252 	if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1253 		return(FALSE);
1254 	}
1255 
1256 	/* The following call will release the buffer pool and
1257 	block mutex. */
1258 	return(buf_flush_page(
1259 			buf_pool, &block->page,
1260 			BUF_FLUSH_SINGLE_PAGE, true));
1261 }
1262 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1263 
1264 /** Check whether the page is in the buffer pool and can be flushed.
1265 @param[in]	page_id		page id
1266 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
1267 @return true if the page can be flushed. */
1268 static
1269 bool
1270 buf_flush_check_neighbor(
1271 	const page_id_t		page_id,
1272 	buf_flush_t		flush_type)
1273 {
1274 	buf_page_t*	bpage;
1275 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
1276 	bool		ret;
1277 
1278 	ut_ad(flush_type == BUF_FLUSH_LRU
1279 	      || flush_type == BUF_FLUSH_LIST);
1280 
1281 	buf_pool_mutex_enter(buf_pool);
1282 
1283 	/* We only want to flush pages from this buffer pool. */
1284 	bpage = buf_page_hash_get(buf_pool, page_id);
1285 
1286 	if (!bpage) {
1287 
1288 		buf_pool_mutex_exit(buf_pool);
1289 		return(false);
1290 	}
1291 
1292 	ut_a(buf_page_in_file(bpage));
1293 
1294 	/* We avoid flushing 'non-old' blocks in an LRU flush,
1295 	because the flushed blocks are soon freed */
1296 
1297 	ret = false;
1298 	if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1299 		BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1300 
1301 		mutex_enter(block_mutex);
1302 		if (buf_flush_ready_for_flush(bpage, flush_type)) {
1303 			ret = true;
1304 		}
1305 		mutex_exit(block_mutex);
1306 	}
1307 	buf_pool_mutex_exit(buf_pool);
1308 
1309 	return(ret);
1310 }
1311 
1312 /** Flushes to disk all flushable pages within the flush area.
1313 @param[in]	page_id		page id
1314 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
1315 @param[in]	n_flushed	number of pages flushed so far in this batch
1316 @param[in]	n_to_flush	maximum number of pages we are allowed to flush
1317 @return number of pages flushed */
1318 static
1319 ulint
1320 buf_flush_try_neighbors(
1321 	const page_id_t		page_id,
1322 	buf_flush_t		flush_type,
1323 	ulint			n_flushed,
1324 	ulint			n_to_flush)
1325 {
1326 	ulint		i;
1327 	ulint		low;
1328 	ulint		high;
1329 	ulint		count = 0;
1330 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
1331 
1332 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1333 
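	/* srv_flush_neighbors: 0 flushes only the requested page,
	1 flushes the contiguous run of dirty neighbours, and any
	other value flushes all flushable dirty pages in the
	neighbourhood computed below. */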
1334 	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1335 	    || srv_flush_neighbors == 0) {
1336 		/* If there is little space or neighbor flushing is
1337 		not enabled then just flush the victim. */
1338 		low = page_id.page_no();
1339 		high = page_id.page_no() + 1;
1340 	} else {
1341 		/* When flushed, dirty blocks are searched in
1342 		neighborhoods of this size, and flushed along with the
1343 		original page. */
1344 
1345 		ulint	buf_flush_area;
1346 
1347 		buf_flush_area	= ut_min(
1348 			BUF_READ_AHEAD_AREA(buf_pool),
1349 			buf_pool->curr_size / 16);
1350 
1351 		low = (page_id.page_no() / buf_flush_area) * buf_flush_area;
1352 		high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area;
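		/* For example, with buf_flush_area == 64, page 1000
		maps to the neighbourhood [960, 1024). */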
1353 
1354 		if (srv_flush_neighbors == 1) {
1355 			/* adjust 'low' and 'high' to the limits
1356 			   of the contiguous dirty area */
1357 			if (page_id.page_no() > low) {
1358 				for (i = page_id.page_no() - 1; i >= low; i--) {
1359 					if (!buf_flush_check_neighbor(
1360 						page_id_t(page_id.space(), i),
1361 						flush_type)) {
1362 
1363 						break;
1364 					}
1365 
1366 					if (i == low) {
1367 						/* Avoid wrap-around when
1368 						low == 0, which would call
1369 						buf_flush_check_neighbor() with
1370 						i == (ulint) -1 */
1371 						i--;
1372 						break;
1373 					}
1374 				}
1375 				low = i + 1;
1376 			}
1377 
1378 			for (i = page_id.page_no() + 1;
1379 			     i < high
1380 			     && buf_flush_check_neighbor(
1381 				     page_id_t(page_id.space(), i),
1382 				     flush_type);
1383 			     i++) {
1384 				/* do nothing */
1385 			}
1386 			high = i;
1387 		}
1388 	}
1389 
1390 	if (fil_space_t *s = fil_space_acquire_for_io(page_id.space())) {
1391 		high = s->max_page_number_for_io(high);
1392 		s->release_for_io();
1393 	} else {
1394 		return 0;
1395 	}
1396 
1397 	DBUG_PRINT("ib_buf", ("flush %u:%u..%u",
1398 			      page_id.space(),
1399 			      (unsigned) low, (unsigned) high));
1400 
1401 	for (ulint i = low; i < high; i++) {
1402 		buf_page_t*	bpage;
1403 
1404 		if ((count + n_flushed) >= n_to_flush) {
1405 
1406 			/* We have already flushed enough pages and
1407 			should call it a day. There is, however, one
1408 			exception. If the page whose neighbors we
1409 			are flushing has not been flushed yet then
1410 			we'll try to flush the victim that we
1411 			selected originally. */
1412 			if (i <= page_id.page_no()) {
1413 				i = page_id.page_no();
1414 			} else {
1415 				break;
1416 			}
1417 		}
1418 
1419 		const page_id_t	cur_page_id(page_id.space(), i);
1420 
1421 		buf_pool = buf_pool_get(cur_page_id);
1422 
1423 		buf_pool_mutex_enter(buf_pool);
1424 
1425 		/* We only want to flush pages from this buffer pool. */
1426 		bpage = buf_page_hash_get(buf_pool, cur_page_id);
1427 
1428 		if (bpage == NULL) {
1429 
1430 			buf_pool_mutex_exit(buf_pool);
1431 			continue;
1432 		}
1433 
1434 		ut_a(buf_page_in_file(bpage));
1435 
1436 		/* We avoid flushing 'non-old' blocks in an LRU flush,
1437 		because the flushed blocks are soon freed */
1438 
1439 		if (flush_type != BUF_FLUSH_LRU
1440 		    || i == page_id.page_no()
1441 		    || buf_page_is_old(bpage)) {
1442 
1443 			BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1444 
1445 			mutex_enter(block_mutex);
1446 
1447 			if (buf_flush_ready_for_flush(bpage, flush_type)
1448 			    && (i == page_id.page_no()
1449 				|| bpage->buf_fix_count == 0)) {
1450 
1451 				/* We also try to flush those
1452 				neighbors != offset */
1453 
1454 				if (buf_flush_page(
1455 					buf_pool, bpage, flush_type, false)) {
1456 
1457 					++count;
1458 				} else {
1459 					mutex_exit(block_mutex);
1460 					buf_pool_mutex_exit(buf_pool);
1461 				}
1462 
1463 				continue;
1464 			} else {
1465 				mutex_exit(block_mutex);
1466 			}
1467 		}
1468 		buf_pool_mutex_exit(buf_pool);
1469 	}
1470 
1471 	if (count > 1) {
1472 		MONITOR_INC_VALUE_CUMULATIVE(
1473 			MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1474 			MONITOR_FLUSH_NEIGHBOR_COUNT,
1475 			MONITOR_FLUSH_NEIGHBOR_PAGES,
1476 			(count - 1));
1477 	}
1478 
1479 	return(count);
1480 }
1481 
1482 /** Check if the block is modified and ready for flushing.
1483 If the block is ready to flush then flush the page and try to flush
1484 its neighbors.
1485 @param[in]	bpage		buffer control block,
1486 must be buf_page_in_file(bpage)
1487 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST
1488 @param[in]	n_to_flush	number of pages to flush
1489 @param[in,out]	count		number of pages flushed
1490 @return TRUE if the buf_pool mutex was released during this function.
1491 This does not guarantee that any pages were written.
1492 The number of pages written is added to *count. */
1493 static
1494 bool
1495 buf_flush_page_and_try_neighbors(
1496 	buf_page_t*		bpage,
1497 	buf_flush_t		flush_type,
1498 	ulint			n_to_flush,
1499 	ulint*			count)
1500 {
1501 #ifdef UNIV_DEBUG
1502 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1503 
1504 	ut_ad(buf_pool_mutex_own(buf_pool));
1505 #endif /* UNIV_DEBUG */
1506 
1507 	bool		flushed;
1508 	BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
1509 
1510 	mutex_enter(block_mutex);
1511 
1512 	ut_a(buf_page_in_file(bpage));
1513 
1514 	if (buf_flush_ready_for_flush(bpage, flush_type)) {
1515 		buf_pool_t*	buf_pool;
1516 
1517 		buf_pool = buf_pool_from_bpage(bpage);
1518 
1519 		const page_id_t	page_id = bpage->id;
1520 
1521 		mutex_exit(block_mutex);
1522 
1523 		buf_pool_mutex_exit(buf_pool);
1524 
1525 		/* Try to flush also all the neighbors */
1526 		*count += buf_flush_try_neighbors(
1527 			page_id, flush_type, *count, n_to_flush);
1528 
1529 		buf_pool_mutex_enter(buf_pool);
1530 		flushed = true;
1531 	} else {
1532 		mutex_exit(block_mutex);
1533 
1534 		flushed = false;
1535 	}
1536 
1537 	ut_ad(buf_pool_mutex_own(buf_pool));
1538 
1539 	return(flushed);
1540 }
1541 
1542 /*******************************************************************//**
1543 This utility moves the uncompressed frames of pages to the free list.
1544 Note that this function does not actually flush any data to disk. It
1545 just detaches the uncompressed frames from the compressed pages at the
1546 tail of the unzip_LRU and puts those freed frames in the free list.
1547 Note that it is a best effort attempt and it is not guaranteed that
1548 after a call to this function there will be 'max' blocks in the free
1549 list.
1550 @return number of blocks moved to the free list. */
1551 static
1552 ulint
1553 buf_free_from_unzip_LRU_list_batch(
1554 /*===============================*/
1555 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1556 	ulint		max)		/*!< in: desired number of
1557 					blocks in the free_list */
1558 {
1559 	ulint		scanned = 0;
1560 	ulint		count = 0;
1561 	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
1562 	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1563 
1564 	ut_ad(buf_pool_mutex_own(buf_pool));
1565 
1566 	buf_block_t*	block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1567 
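	/* Walk the unzip_LRU from the tail, detaching uncompressed
	frames while more free pages are still wanted (free_len below
	srv_LRU_scan_depth) and the unzip_LRU remains longer than one
	tenth of the LRU list. */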
1568 	while (block != NULL
1569 	       && count < max
1570 	       && free_len < srv_LRU_scan_depth
1571 	       && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1572 
1573 		++scanned;
1574 		if (buf_LRU_free_page(&block->page, false)) {
1575 			/* Block was freed. buf_pool->mutex potentially
1576 			released and reacquired */
1577 			++count;
1578 			block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1579 
1580 		} else {
1581 
1582 			block = UT_LIST_GET_PREV(unzip_LRU, block);
1583 		}
1584 
1585 		free_len = UT_LIST_GET_LEN(buf_pool->free);
1586 		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1587 	}
1588 
1589 	ut_ad(buf_pool_mutex_own(buf_pool));
1590 
1591 	if (scanned) {
1592 		MONITOR_INC_VALUE_CUMULATIVE(
1593 			MONITOR_LRU_BATCH_SCANNED,
1594 			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1595 			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1596 			scanned);
1597 	}
1598 
1599 	return(count);
1600 }
1601 
1602 /*******************************************************************//**
1603 This utility flushes dirty blocks from the end of the LRU list.
1604 The calling thread is not allowed to own any latches on pages!
1605 It attempts to make 'max' blocks available in the free list. Note that
1606 it is a best effort attempt and it is not guaranteed that after a call
1607 to this function there will be 'max' blocks in the free list.*/
1608 
1609 void
1610 buf_flush_LRU_list_batch(
1611 /*=====================*/
1612 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1613 	ulint		max,		/*!< in: desired number of
1614 					blocks in the free_list */
1615 	flush_counters_t*	n)	/*!< out: flushed/evicted page
1616 					counts */
1617 {
1618 	buf_page_t*	bpage;
1619 	ulint		scanned = 0;
1620 	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
1621 	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1622 	ulint		withdraw_depth = 0;
1623 
1624 	n->flushed = 0;
1625 	n->evicted = 0;
1626 	n->unzip_LRU_evicted = 0;
1627 	ut_ad(buf_pool_mutex_own(buf_pool));
1628 	if (buf_pool->curr_size < buf_pool->old_size
1629 	    && buf_pool->withdraw_target > 0) {
1630 		withdraw_depth = buf_pool->withdraw_target
1631 				 - UT_LIST_GET_LEN(buf_pool->withdraw);
1632 	}
1633 
1634 	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1635 	     bpage != NULL && n->flushed + n->evicted < max
1636 	     && free_len < srv_LRU_scan_depth + withdraw_depth
1637 	     && lru_len > BUF_LRU_MIN_LEN;
1638 	     ++scanned,
1639 	     bpage = buf_pool->lru_hp.get()) {
1640 
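		/* Remember the previous block through the LRU hazard
		pointer: buf_pool->mutex may be released while this page
		is being freed or flushed, and the hazard pointer lets
		the scan resume safely even if that block is removed
		from the LRU list in the meantime. */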
1641 		buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
1642 		buf_pool->lru_hp.set(prev);
1643 
1644 		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
1645 
1646 		mutex_enter(block_mutex);
1647 
1648 		if (buf_flush_ready_for_replace(bpage)) {
1649 			/* block is ready for eviction i.e., it is
1650 			clean and is not IO-fixed or buffer fixed. */
1651 			mutex_exit(block_mutex);
1652 			if (buf_LRU_free_page(bpage, true)) {
1653 				++n->evicted;
1654 			}
1655 		} else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) {
1656 			/* Block is ready for flush. Dispatch an IO
1657 			request. The IO helper thread will put it on
1658 			free list in IO completion routine. */
1659 			mutex_exit(block_mutex);
1660 			buf_flush_page_and_try_neighbors(
1661 				bpage, BUF_FLUSH_LRU, max, &n->flushed);
1662 		} else {
1663 			/* Can't evict or dispatch this block. Go to
1664 			previous. */
1665 			ut_ad(buf_pool->lru_hp.is_hp(prev));
1666 			mutex_exit(block_mutex);
1667 		}
1668 
1669 		ut_ad(!mutex_own(block_mutex));
1670 		ut_ad(buf_pool_mutex_own(buf_pool));
1671 
1672 		free_len = UT_LIST_GET_LEN(buf_pool->free);
1673 		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1674 	}
1675 
1676 	buf_pool->lru_hp.set(NULL);
1677 
1678 	/* We keep track of all flushes happening as part of LRU
1679 	flush. When estimating the desired rate at which flush_list
1680 	should be flushed, we factor in this value. */
1681 	buf_lru_flush_page_count += n->flushed;
1682 
1683 	ut_ad(buf_pool_mutex_own(buf_pool));
1684 
1685 	if (n->evicted) {
1686 		MONITOR_INC_VALUE_CUMULATIVE(
1687 			MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
1688 			MONITOR_LRU_BATCH_EVICT_COUNT,
1689 			MONITOR_LRU_BATCH_EVICT_PAGES,
1690 			n->evicted);
1691 	}
1692 
1693 	if (scanned) {
1694 		MONITOR_INC_VALUE_CUMULATIVE(
1695 			MONITOR_LRU_BATCH_SCANNED,
1696 			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1697 			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1698 			scanned);
1699 	}
1700 }
1701 
1702 /*******************************************************************//**
1703 Flush and move pages from LRU or unzip_LRU list to the free list.
1704 Whether LRU or unzip_LRU is used depends on the state of the system.*/
1705 
1706 static
1707 void
1708 buf_do_LRU_batch(
1709 /*=============*/
1710 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1711 	ulint		max,		/*!< in: desired number of
1712 					blocks in the free_list */
1713 	flush_counters_t*	n)	/*!< out: flushed/evicted page
1714 					counts */
1715 {
1716 	if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1717 		n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1718 	} else {
1719 		n->unzip_LRU_evicted = 0;
1720 	}
1721 
1722 	if (max > n->unzip_LRU_evicted) {
1723 		buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, n);
1724 	} else {
1725 		n->evicted = 0;
1726 		n->flushed = 0;
1727 	}
1728 
1729 	/* Add evicted pages from unzip_LRU to the evicted pages from
1730 	the simple LRU. */
1731 	n->evicted += n->unzip_LRU_evicted;
1732 }
1733 
1734 /** This utility flushes dirty blocks from the end of the flush_list.
1735 The calling thread is not allowed to own any latches on pages!
1736 @param[in]	buf_pool	buffer pool instance
1737 @param[in]	min_n		wished minimum number of blocks flushed (it is
1738 not guaranteed that the actual number is that big, though)
1739 @param[in]	lsn_limit	all blocks whose oldest_modification is smaller
1740 than this should be flushed (if their number does not exceed min_n)
1741 @return number of blocks for which the write request was queued;
1742 ULINT_UNDEFINED if there was a flush of the same type already
1743 running */
1744 static
1745 ulint
1746 buf_do_flush_list_batch(
1747 	buf_pool_t*		buf_pool,
1748 	ulint			min_n,
1749 	lsn_t			lsn_limit)
1750 {
1751 	ulint		count = 0;
1752 	ulint		scanned = 0;
1753 
1754 	ut_ad(buf_pool_mutex_own(buf_pool));
1755 
1756 	/* Start from the end of the list looking for a suitable
1757 	block to be flushed. */
1758 	buf_flush_list_mutex_enter(buf_pool);
1759 	ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1760 
1761 	/* In order not to degenerate this scan into O(n*n) we attempt
1762 	to preserve the pointer to the previous block in the flush list.
1763 	To do so we declare it a hazard pointer. Any thread working on
1764 	the flush list must check the hazard pointer and, if it is
1765 	removing the same block, must reset it. */
1766 	for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1767 	     count < min_n && bpage != NULL && len > 0
1768 	     && bpage->oldest_modification < lsn_limit;
1769 	     bpage = buf_pool->flush_hp.get(),
1770 	     ++scanned) {
1771 
1772 		buf_page_t*	prev;
1773 
1774 		ut_a(bpage->oldest_modification > 0);
1775 		ut_ad(bpage->in_flush_list);
1776 
1777 		prev = UT_LIST_GET_PREV(list, bpage);
1778 		buf_pool->flush_hp.set(prev);
1779 		buf_flush_list_mutex_exit(buf_pool);
1780 
1781 #ifdef UNIV_DEBUG
1782 		bool flushed =
1783 #endif /* UNIV_DEBUG */
1784 		buf_flush_page_and_try_neighbors(
1785 			bpage, BUF_FLUSH_LIST, min_n, &count);
1786 
1787 		buf_flush_list_mutex_enter(buf_pool);
1788 
1789 		ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
1790 
1791 		--len;
1792 	}
1793 
1794 	buf_pool->flush_hp.set(NULL);
1795 	buf_flush_list_mutex_exit(buf_pool);
1796 
1797 	if (scanned) {
1798 		MONITOR_INC_VALUE_CUMULATIVE(
1799 			MONITOR_FLUSH_BATCH_SCANNED,
1800 			MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1801 			MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1802 			scanned);
1803 	}
1804 
1805 	if (count) {
1806 		MONITOR_INC_VALUE_CUMULATIVE(
1807 			MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1808 			MONITOR_FLUSH_BATCH_COUNT,
1809 			MONITOR_FLUSH_BATCH_PAGES,
1810 			count);
1811 	}
1812 
1813 	ut_ad(buf_pool_mutex_own(buf_pool));
1814 
1815 	return(count);
1816 }
1817 
1818 /** This utility flushes dirty blocks from the end of the LRU list or
1819 flush_list.
1820 NOTE 1: in the case of an LRU flush the calling thread may own latches to
1821 pages: to avoid deadlocks, this function must be written so that it cannot
1822 end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1823 the calling thread is not allowed to own any latches on pages!
1824 @param[in]	buf_pool	buffer pool instance
1825 @param[in]	flush_type	BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
1826 BUF_FLUSH_LIST, then the caller must not own any latches on pages
1827 @param[in]	min_n		wished minimum number of blocks flushed (it is
1828 not guaranteed that the actual number is that big, though)
1829 @param[in]	lsn_limit	in the case of BUF_FLUSH_LIST all blocks whose
1830 oldest_modification is smaller than this should be flushed (if their number
1831 does not exceed min_n), otherwise ignored */
1832 static
1833 void
1834 buf_flush_batch(
1835 	buf_pool_t*		buf_pool,
1836 	buf_flush_t		flush_type,
1837 	ulint			min_n,
1838 	lsn_t			lsn_limit,
1839 	flush_counters_t*	n)	/*!< out: flushed/evicted page
1840 					counts  */
1841 {
1842 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1843 	ut_ad(flush_type == BUF_FLUSH_LRU
1844 	      || !sync_check_iterate(dict_sync_check()));
1845 
1846 	buf_pool_mutex_enter(buf_pool);
1847 
1848 	/* Note: The buffer pool mutex is released and reacquired within
1849 	the flush functions. */
1850 	switch (flush_type) {
1851 	case BUF_FLUSH_LRU:
1852 		buf_do_LRU_batch(buf_pool, min_n, n);
1853 		break;
1854 	case BUF_FLUSH_LIST:
1855 		n->flushed = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
1856 		n->evicted = 0;
1857 		break;
1858 	default:
1859 		ut_error;
1860 	}
1861 
1862 	buf_pool_mutex_exit(buf_pool);
1863 
1864 	DBUG_LOG("ib_buf", "flush " << flush_type << " completed");
1865 }
1866 
1867 /******************************************************************//**
1868 Gather the aggregated stats for both flush list and LRU list flushing.
1869 @param page_count_flush	number of pages flushed from the end of the flush_list
1870 @param page_count_LRU	number of pages flushed from the end of the LRU list
1871 */
1872 static
1873 void
1874 buf_flush_stats(
1875 /*============*/
1876 	ulint		page_count_flush,
1877 	ulint		page_count_LRU)
1878 {
1879 	DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
1880 			      "from LRU_list %u pages",
1881 			      unsigned(page_count_flush),
1882 			      unsigned(page_count_LRU)));
1883 
1884 	srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
1885 }
1886 
1887 /******************************************************************//**
1888 Start a buffer flush batch for LRU or flush list */
1889 static
1890 ibool
1891 buf_flush_start(
1892 /*============*/
1893 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1894 	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
1895 					or BUF_FLUSH_LIST */
1896 {
1897 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1898 
1899 	buf_pool_mutex_enter(buf_pool);
1900 
1901 	if (buf_pool->n_flush[flush_type] > 0
1902 	   || buf_pool->init_flush[flush_type] == TRUE) {
1903 
1904 		/* There is already a flush batch of the same type running */
1905 
1906 		buf_pool_mutex_exit(buf_pool);
1907 
1908 		return(FALSE);
1909 	}
1910 
1911 	buf_pool->init_flush[flush_type] = TRUE;
1912 
1913 	os_event_reset(buf_pool->no_flush[flush_type]);
1914 
1915 	buf_pool_mutex_exit(buf_pool);
1916 
1917 	return(TRUE);
1918 }
1919 
1920 /******************************************************************//**
1921 End a buffer flush batch for LRU or flush list */
1922 static
1923 void
1924 buf_flush_end(
1925 /*==========*/
1926 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1927 	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
1928 					or BUF_FLUSH_LIST */
1929 {
1930 	buf_pool_mutex_enter(buf_pool);
1931 
1932 	buf_pool->init_flush[flush_type] = FALSE;
1933 
1934 	buf_pool->try_LRU_scan = TRUE;
1935 
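	/* n_flush[] counts the page writes still in progress for this
	flush type. If it is nonzero here, the no_flush[] event is
	presumably set later, when the remaining writes complete. */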
1936 	if (buf_pool->n_flush[flush_type] == 0) {
1937 
1938 		/* The running flush batch has ended */
1939 
1940 		os_event_set(buf_pool->no_flush[flush_type]);
1941 	}
1942 
1943 	buf_pool_mutex_exit(buf_pool);
1944 
1945 	if (!srv_read_only_mode) {
1946 		buf_dblwr_flush_buffered_writes();
1947 	} else {
1948 		os_aio_simulated_wake_handler_threads();
1949 	}
1950 }
1951 
1952 /******************************************************************//**
1953 Waits until a flush batch of the given type ends */
1954 void
1955 buf_flush_wait_batch_end(
1956 /*=====================*/
1957 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1958 	buf_flush_t	type)		/*!< in: BUF_FLUSH_LRU
1959 					or BUF_FLUSH_LIST */
1960 {
1961 	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1962 
1963 	if (buf_pool == NULL) {
1964 		ulint	i;
1965 
1966 		for (i = 0; i < srv_buf_pool_instances; ++i) {
1967 			buf_pool_t*	buf_pool;
1968 
1969 			buf_pool = buf_pool_from_array(i);
1970 
1971 			thd_wait_begin(NULL, THD_WAIT_DISKIO);
1972 			os_event_wait(buf_pool->no_flush[type]);
1973 			thd_wait_end(NULL);
1974 		}
1975 	} else {
1976 		thd_wait_begin(NULL, THD_WAIT_DISKIO);
1977 		os_event_wait(buf_pool->no_flush[type]);
1978 		thd_wait_end(NULL);
1979 	}
1980 }
1981 
1982 /** Do flushing batch of a given type.
1983 NOTE: The calling thread is not allowed to own any latches on pages!
1984 @param[in,out]	buf_pool	buffer pool instance
1985 @param[in]	type		flush type
1986 @param[in]	min_n		wished minimum number of blocks flushed
1987 (it is not guaranteed that the actual number is that big, though)
1988 @param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
1989 oldest_modification is smaller than this should be flushed (if their number
1990 does not exceed min_n), otherwise ignored
1991 @param[out]	n_processed	the number of pages which were processed is
1992 passed back to caller. Ignored if NULL
1993 @retval true	if a batch was queued successfully.
1994 @retval false	if another batch of the same type was already running. */
1995 bool
1996 buf_flush_do_batch(
1997 	buf_pool_t*		buf_pool,
1998 	buf_flush_t		type,
1999 	ulint			min_n,
2000 	lsn_t			lsn_limit,
2001 	flush_counters_t*	n)
2002 {
2003 	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
2004 
2005 	if (n != NULL) {
2006 		n->flushed = 0;
2007 	}
2008 
2009 	if (!buf_flush_start(buf_pool, type)) {
2010 		return(false);
2011 	}
2012 
2013 	buf_flush_batch(buf_pool, type, min_n, lsn_limit, n);
2014 
2015 	buf_flush_end(buf_pool, type);
2016 
2017 	return(true);
2018 }
2019 /**
2020 Waits until a flush batch of the given lsn ends
2021 @param[in]	new_oldest	target oldest_modified_lsn to wait for */
2022 
2023 void
2024 buf_flush_wait_flushed(
2025 	lsn_t		new_oldest)
2026 {
2027 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2028 		buf_pool_t*	buf_pool;
2029 		lsn_t		oldest;
2030 
2031 		buf_pool = buf_pool_from_array(i);
2032 
2033 		for (;;) {
2034 			/* We don't need to wait for fsync of the flushed
2035 blocks, because we need an fsync to make the checkpoint anyway.
2036 			So, we don't need to wait for the batch end here. */
2037 
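			/* Peek at the oldest modification in this
			instance's flush list (skipping temporary
			tablespace pages) and keep polling until it has
			advanced past new_oldest or the list is empty. */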
2038 			buf_flush_list_mutex_enter(buf_pool);
2039 
2040 			buf_page_t*	bpage;
2041 
2042 			/* We don't need to wait for system temporary pages */
2043 			for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
2044 			     bpage != NULL
2045 				&& fsp_is_system_temporary(bpage->id.space());
2046 			     bpage = UT_LIST_GET_PREV(list, bpage)) {
2047 				/* Do nothing. */
2048 			}
2049 
2050 			if (bpage != NULL) {
2051 				ut_ad(bpage->in_flush_list);
2052 				oldest = bpage->oldest_modification;
2053 			} else {
2054 				oldest = 0;
2055 			}
2056 
2057 			buf_flush_list_mutex_exit(buf_pool);
2058 
2059 			if (oldest == 0 || oldest >= new_oldest) {
2060 				break;
2061 			}
2062 
2063 			/* sleep and retry */
2064 			os_thread_sleep(buf_flush_wait_flushed_sleep_time);
2065 
2066 			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
2067 		}
2068 	}
2069 }
2070 
2071 /** This utility flushes dirty blocks from the end of the flush list of all
2072 buffer pool instances.
2073 NOTE: The calling thread is not allowed to own any latches on pages!
2074 @param[in]	min_n		wished minimum number of blocks flushed (it is
2075 not guaranteed that the actual number is that big, though)
2076 @param[in]	lsn_limit	in the case BUF_FLUSH_LIST all blocks whose
2077 oldest_modification is smaller than this should be flushed (if their number
2078 does not exceed min_n), otherwise ignored
2079 @param[out]	n_processed	the number of pages which were processed is
2080 passed back to caller. Ignored if NULL.
2081 @return true if a batch was queued successfully for each buffer pool
2082 instance. false if another batch of the same type was already running in
2083 at least one of the buffer pool instances. */
2084 bool
2085 buf_flush_lists(
2086 	ulint			min_n,
2087 	lsn_t			lsn_limit,
2088 	ulint*			n_processed)
2089 {
2090 	ulint		i;
2091 	ulint		n_flushed = 0;
2092 	bool		success = true;
2093 
2094 	if (n_processed) {
2095 		*n_processed = 0;
2096 	}
2097 
2098 	if (min_n != ULINT_MAX) {
2099 		/* Ensure that flushing is spread evenly amongst the
2100 		buffer pool instances. When min_n is ULINT_MAX
2101 		we need to flush everything up to the lsn limit
2102 		so no limit here. */
2103 		min_n = (min_n + srv_buf_pool_instances - 1)
2104 			 / srv_buf_pool_instances;
2105 	}
2106 
2107 	/* Flush to lsn_limit in all buffer pool instances */
2108 	for (i = 0; i < srv_buf_pool_instances; i++) {
2109 		buf_pool_t*		buf_pool;
2110 		flush_counters_t	n;
2111 
2112 		memset(&n, 0, sizeof(flush_counters_t));
2113 		buf_pool = buf_pool_from_array(i);
2114 
2115 		if (!buf_flush_do_batch(buf_pool,
2116 					BUF_FLUSH_LIST,
2117 					min_n,
2118 					lsn_limit,
2119 					&n)) {
2120 			/* We have two choices here. If lsn_limit was
2121 			specified then skipping an instance of buffer
2122 			pool means we cannot guarantee that all pages
2123 up to lsn_limit have been flushed. We can
2124 			return right now with failure or we can try
2125 			to flush remaining buffer pools up to the
2126 			lsn_limit. We attempt to flush other buffer
2127 			pools based on the assumption that it will
2128 			help in the retry which will follow the
2129 			failure. */
2130 			success = false;
2131 
2132 		}
2133 
2134 		n_flushed += n.flushed;
2135 	}
2136 
2137 	if (n_flushed) {
2138 		buf_flush_stats(n_flushed, 0);
2139 		if (n_processed) {
2140 			*n_processed = n_flushed;
2141 		}
2142 	}
2143 
2144 	return(success);
2145 }
2146 
2147 /******************************************************************//**
2148 This function picks up a single page from the tail of the LRU
2149 list, flushes it (if it is dirty), removes it from page_hash and LRU
2150 list and puts it on the free list. It is called from user threads when
2151 they are unable to find a replaceable page at the tail of the LRU
2152 list i.e.: when the background LRU flushing in the page_cleaner thread
2153 is not fast enough to keep pace with the workload.
2154 @return true if success. */
2155 bool
2156 buf_flush_single_page_from_LRU(
2157 /*===========================*/
2158 	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
2159 {
2160 	ulint		scanned;
2161 	buf_page_t*	bpage;
2162 	ibool		freed;
2163 
2164 	buf_pool_mutex_enter(buf_pool);
2165 
2166 	for (bpage = buf_pool->single_scan_itr.start(), scanned = 0,
2167 	     freed = false;
2168 	     bpage != NULL;
2169 	     ++scanned, bpage = buf_pool->single_scan_itr.get()) {
2170 
2171 		ut_ad(buf_pool_mutex_own(buf_pool));
2172 
2173 		buf_page_t*	prev = UT_LIST_GET_PREV(LRU, bpage);
2174 		buf_pool->single_scan_itr.set(prev);
2175 		BPageMutex*	block_mutex;
2176 
2177 		block_mutex = buf_page_get_mutex(bpage);
2178 
2179 		mutex_enter(block_mutex);
2180 
2181 		if (buf_flush_ready_for_replace(bpage)) {
2182 			/* block is ready for eviction i.e., it is
2183 			clean and is not IO-fixed or buffer fixed. */
2184 			mutex_exit(block_mutex);
2185 
2186 			if (buf_LRU_free_page(bpage, true)) {
2187 				buf_pool_mutex_exit(buf_pool);
2188 				freed = true;
2189 				break;
2190 			}
2191 
2192 		} else if (buf_flush_ready_for_flush(
2193 				   bpage, BUF_FLUSH_SINGLE_PAGE)) {
2194 
2195 			/* Block is ready for flush. Try and dispatch an IO
2196 			request. We'll put it on free list in IO completion
2197 			routine if it is not buffer fixed. The following call
2198 			will release the buffer pool and block mutex.
2199 
2200 			Note: There is no guarantee that this page has actually
2201 			been freed, only that it has been flushed to disk */
2202 
2203 			freed = buf_flush_page(
2204 				buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
2205 
2206 			if (freed) {
2207 				break;
2208 			}
2209 
2210 			mutex_exit(block_mutex);
2211 		} else {
2212 			mutex_exit(block_mutex);
2213 		}
2214 		ut_ad(!mutex_own(block_mutex));
2215 	}
2216 	if (!freed) {
2217 		/* Can't find a single flushable page. */
2218 		ut_ad(!bpage);
2219 		buf_pool_mutex_exit(buf_pool);
2220 	}
2221 
2222 	if (scanned) {
2223 		MONITOR_INC_VALUE_CUMULATIVE(
2224 			MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2225 			MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2226 			MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2227 			scanned);
2228 	}
2229 
2230 	ut_ad(!buf_pool_mutex_own(buf_pool));
2231 	return(freed);
2232 }
2233 
2234 /**
2235 Clears up tail of the LRU list of a given buffer pool instance:
2236 * Put replaceable pages at the tail of LRU to the free list
2237 * Flush dirty pages at the tail of LRU to the disk
2238 The depth to which we scan each buffer pool is controlled by dynamic
2239 config parameter innodb_LRU_scan_depth.
2240 @param buf_pool buffer pool instance
2241 @return total pages flushed */
2242 static
2243 ulint
2244 buf_flush_LRU_list(
2245 	buf_pool_t*	buf_pool)
2246 {
2247 	ulint	scan_depth, withdraw_depth;
2248 	flush_counters_t	n;
2249 
2250 	memset(&n, 0, sizeof(flush_counters_t));
2251 
2252 	ut_ad(buf_pool);
2253 	/* srv_LRU_scan_depth can be an arbitrarily large value.
2254 	We cap it with current LRU size. */
2255 	buf_pool_mutex_enter(buf_pool);
2256 	scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2257 	if (buf_pool->curr_size < buf_pool->old_size
2258 	    && buf_pool->withdraw_target > 0) {
2259 		withdraw_depth = buf_pool->withdraw_target
2260 				 - UT_LIST_GET_LEN(buf_pool->withdraw);
2261 	} else {
2262 		withdraw_depth = 0;
2263 	}
2264 	buf_pool_mutex_exit(buf_pool);
2265 	if (withdraw_depth > srv_LRU_scan_depth) {
2266 		scan_depth = ut_min(withdraw_depth, scan_depth);
2267 	} else {
2268 		scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
2269 				    scan_depth);
2270 	}
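	/* Scan at most srv_LRU_scan_depth blocks (or withdraw_depth
	blocks when the pool is being shrunk and that is larger), but
	never more than the current LRU list length. */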
2271 	/* Currently only one of the page_cleaner threads can trigger
2272 	an LRU flush at any given time, so it is not possible that a
2273 	batch triggered during the last iteration is still running. */
2275 	buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth,
2276 			   0, &n);
2277 
2278 	return(n.flushed);
2279 }
2280 
2281 /*********************************************************************//**
2282 Wait for any possible LRU flushes that are in progress to end. */
2283 void
2284 buf_flush_wait_LRU_batch_end(void)
2285 /*==============================*/
2286 {
2287 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2288 		buf_pool_t*	buf_pool;
2289 
2290 		buf_pool = buf_pool_from_array(i);
2291 
2292 		buf_pool_mutex_enter(buf_pool);
2293 
2294 		if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2295 		   || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2296 
2297 			buf_pool_mutex_exit(buf_pool);
2298 			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2299 		} else {
2300 			buf_pool_mutex_exit(buf_pool);
2301 		}
2302 	}
2303 }
2304 
2305 /*********************************************************************//**
2306 Calculates if flushing is required based on number of dirty pages in
2307 the buffer pool.
2308 @return percent of io_capacity to flush to manage dirty page ratio */
2309 static
2310 ulint
2311 af_get_pct_for_dirty()
2312 /*==================*/
2313 {
2314 	double	dirty_pct = buf_get_modified_ratio_pct();
2315 
2316 	if (dirty_pct == 0.0) {
2317 		/* No pages modified */
2318 		return(0);
2319 	}
2320 
2321 	ut_a(srv_max_dirty_pages_pct_lwm
2322 	     <= srv_max_buf_pool_modified_pct);
2323 
2324 	if (srv_max_dirty_pages_pct_lwm == 0) {
2325 		/* The user has not set the option to preflush dirty
2326 		pages as we approach the high water mark. */
2327 		if (dirty_pct >= srv_max_buf_pool_modified_pct) {
2328 			/* We have crossed the high water mark of dirty
2329 pages. In this case we start flushing at 100% of
2330 			innodb_io_capacity. */
2331 			return(100);
2332 		}
2333 	} else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
2334 		/* We should start flushing pages gradually. */
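		/* Illustrative example (values chosen for the sketch,
		not the configured defaults): with
		srv_max_buf_pool_modified_pct = 75 and dirty_pct = 40,
		the expression below yields 40 * 100 / 76 = 52, i.e.
		flushing at roughly 52% of innodb_io_capacity. */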
2335 		return(static_cast<ulint>((dirty_pct * 100)
2336 		       / (srv_max_buf_pool_modified_pct + 1)));
2337 	}
2338 
2339 	return(0);
2340 }
2341 
2342 /*********************************************************************//**
2343 Calculates if flushing is required based on redo generation rate.
2344 @return percent of io_capacity to flush to manage redo space */
2345 static
2346 ulint
2347 af_get_pct_for_lsn(
2348 /*===============*/
2349 	lsn_t	age)	/*!< in: current age of LSN. */
2350 {
2351 	lsn_t	max_async_age;
2352 	lsn_t	lsn_age_factor;
2353 	lsn_t	af_lwm = (lsn_t) ((srv_adaptive_flushing_lwm
2354 			* log_get_capacity()) / 100);
2355 
2356 	if (age < af_lwm) {
2357 		/* No adaptive flushing. */
2358 		return(0);
2359 	}
2360 
2361 	max_async_age = log_get_max_modified_age_async();
2362 
2363 	if (age < max_async_age && !srv_adaptive_flushing) {
2364 		/* We have still not reached the max_async point and
2365 		the user has disabled adaptive flushing. */
2366 		return(0);
2367 	}
2368 
2369 	/* If we are here then we know that either:
2370 	1) User has enabled adaptive flushing
2371 	2) User may have disabled adaptive flushing but we have reached
2372 	max_async_age. */
2373 	lsn_age_factor = (age * 100) / max_async_age;
2374 
2375 	ut_ad(srv_max_io_capacity >= srv_io_capacity);
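	/* The heuristic below scales roughly as
	(srv_max_io_capacity / srv_io_capacity)
	* lsn_age_factor * sqrt(lsn_age_factor) / 7.5.
	With illustrative numbers only: a capacity ratio of 10 and
	lsn_age_factor = 25 give 10 * 25 * 5 / 7.5 = 166, i.e. about
	1.7 times innodb_io_capacity. */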
2376 	return(static_cast<ulint>(
2377 		((srv_max_io_capacity / srv_io_capacity)
2378 		* (lsn_age_factor * sqrt((double)lsn_age_factor)))
2379 		/ 7.5));
2380 }
2381 
2382 /*********************************************************************//**
2383 This function is called approximately once every second by the
2384 page_cleaner thread. Based on various factors it decides if there is a
2385 need to do flushing.
2386 @return number of pages recommended to be flushed
2387 @param lsn_limit	pointer to return LSN up to which flushing must happen
2388 @param last_pages_in	the number of pages flushed by the last flush_list
2389 			flushing. */
2390 static
2391 ulint
2392 page_cleaner_flush_pages_recommendation(
2393 /*====================================*/
2394 	lsn_t*	lsn_limit,
2395 	ulint	last_pages_in)
2396 {
2397 	static	lsn_t		prev_lsn = 0;
2398 	static	ulint		sum_pages = 0;
2399 	static	ulint		avg_page_rate = 0;
2400 	static	ulint		n_iterations = 0;
2401 	static	time_t		prev_time;
2402 	lsn_t			oldest_lsn;
2403 	lsn_t			cur_lsn;
2404 	lsn_t			age;
2405 	lsn_t			lsn_rate;
2406 	ulint			n_pages = 0;
2407 	ulint			pct_for_dirty = 0;
2408 	ulint			pct_for_lsn = 0;
2409 	ulint			pct_total = 0;
2410 
2411 	cur_lsn = log_get_lsn_nowait();
2412 
2413 	/* log_get_lsn_nowait tries to get log_sys.mutex with
2414 	mutex_enter_nowait, if this does not succeed function
2415 	returns 0, do not use that value to update stats. */
2416 	if (cur_lsn == 0) {
2417 		return(0);
2418 	}
2419 
2420 	if (prev_lsn == 0) {
2421 		/* First time around. */
2422 		prev_lsn = cur_lsn;
2423 		prev_time = time(NULL);
2424 		return(0);
2425 	}
2426 
2427 	if (prev_lsn == cur_lsn) {
2428 		return(0);
2429 	}
2430 
2431 	sum_pages += last_pages_in;
2432 
2433 	time_t	curr_time = time(NULL);
2434 	double	time_elapsed = difftime(curr_time, prev_time);
2435 
2436 	/* We update our variables every srv_flushing_avg_loops
2437 	iterations to smooth out transitions in the workload. */
2438 	if (++n_iterations >= srv_flushing_avg_loops
2439 	    || time_elapsed >= srv_flushing_avg_loops) {
2440 
2441 		if (time_elapsed < 1) {
2442 			time_elapsed = 1;
2443 		}
2444 
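		/* Blend the page rate observed in this interval 50/50
		with the previous average, so that a sudden change in
		the workload only gradually shifts the flushing target. */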
2445 		avg_page_rate = static_cast<ulint>(
2446 			((static_cast<double>(sum_pages)
2447 			  / time_elapsed)
2448 			 + avg_page_rate) / 2);
2449 
2450 		/* How much LSN we have generated since last call. */
2451 		lsn_rate = static_cast<lsn_t>(
2452 			static_cast<double>(cur_lsn - prev_lsn)
2453 			/ time_elapsed);
2454 
2455 		lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2456 
2457 		/* aggregate stats of all slots */
2458 		mutex_enter(&page_cleaner.mutex);
2459 
2460 		ulint	flush_tm = page_cleaner.flush_time;
2461 		ulint	flush_pass = page_cleaner.flush_pass;
2462 
2463 		page_cleaner.flush_time = 0;
2464 		page_cleaner.flush_pass = 0;
2465 
2466 		ulint	lru_tm = 0;
2467 		ulint	list_tm = 0;
2468 		ulint	lru_pass = 0;
2469 		ulint	list_pass = 0;
2470 
2471 		for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2472 			page_cleaner_slot_t*	slot;
2473 
2474 			slot = &page_cleaner.slots[i];
2475 
2476 			lru_tm    += slot->flush_lru_time;
2477 			lru_pass  += slot->flush_lru_pass;
2478 			list_tm   += slot->flush_list_time;
2479 			list_pass += slot->flush_list_pass;
2480 
2481 			slot->flush_lru_time  = 0;
2482 			slot->flush_lru_pass  = 0;
2483 			slot->flush_list_time = 0;
2484 			slot->flush_list_pass = 0;
2485 		}
2486 
2487 		mutex_exit(&page_cleaner.mutex);
2488 
2489 		/* minimum values are 1, to avoid dividing by zero. */
2490 		if (lru_tm < 1) {
2491 			lru_tm = 1;
2492 		}
2493 		if (list_tm < 1) {
2494 			list_tm = 1;
2495 		}
2496 		if (flush_tm < 1) {
2497 			flush_tm = 1;
2498 		}
2499 
2500 		if (lru_pass < 1) {
2501 			lru_pass = 1;
2502 		}
2503 		if (list_pass < 1) {
2504 			list_pass = 1;
2505 		}
2506 		if (flush_pass < 1) {
2507 			flush_pass = 1;
2508 		}
2509 
2510 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
2511 			    list_tm / list_pass);
2512 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
2513 			    lru_tm  / lru_pass);
2514 
2515 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
2516 			    list_tm / (srv_n_page_cleaners * flush_pass));
2517 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
2518 			    lru_tm / (srv_n_page_cleaners * flush_pass));
2519 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
2520 			    flush_tm * list_tm / flush_pass
2521 			    / (list_tm + lru_tm));
2522 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
2523 			    flush_tm * lru_tm / flush_pass
2524 			    / (list_tm + lru_tm));
2525 		MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
2526 
2527 		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
2528 			    list_pass / page_cleaner.n_slots);
2529 		MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
2530 			    lru_pass / page_cleaner.n_slots);
2531 		MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
2532 
2533 		prev_lsn = cur_lsn;
2534 		prev_time = curr_time;
2535 
2536 		n_iterations = 0;
2537 
2538 		sum_pages = 0;
2539 	}
2540 
2541 	oldest_lsn = buf_pool_get_oldest_modification();
2542 
2543 	ut_ad(oldest_lsn <= log_get_lsn());
2544 
2545 	age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2546 
2547 	pct_for_dirty = af_get_pct_for_dirty();
2548 	pct_for_lsn = af_get_pct_for_lsn(age);
2549 
2550 	pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2551 
2552 	/* Estimate pages to be flushed for the lsn progress */
2553 	ulint	sum_pages_for_lsn = 0;
2554 	lsn_t	target_lsn = oldest_lsn
2555 			     + lsn_avg_rate * buf_flush_lsn_scan_factor;
2556 
2557 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2558 		buf_pool_t*	buf_pool = buf_pool_from_array(i);
2559 		ulint		pages_for_lsn = 0;
2560 
2561 		buf_flush_list_mutex_enter(buf_pool);
2562 		for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list);
2563 		     b != NULL;
2564 		     b = UT_LIST_GET_PREV(list, b)) {
2565 			if (b->oldest_modification > target_lsn) {
2566 				break;
2567 			}
2568 			++pages_for_lsn;
2569 		}
2570 		buf_flush_list_mutex_exit(buf_pool);
2571 
2572 		sum_pages_for_lsn += pages_for_lsn;
2573 
2574 		mutex_enter(&page_cleaner.mutex);
2575 		ut_ad(page_cleaner.slots[i].state
2576 		      == PAGE_CLEANER_STATE_NONE);
2577 		page_cleaner.slots[i].n_pages_requested
2578 			= pages_for_lsn / buf_flush_lsn_scan_factor + 1;
2579 		mutex_exit(&page_cleaner.mutex);
2580 	}
2581 
2582 	sum_pages_for_lsn /= buf_flush_lsn_scan_factor;
2583 	if (sum_pages_for_lsn < 1) {
2584 		sum_pages_for_lsn = 1;
2585 	}
2586 
2587 	/* Cap the maximum IO capacity that we are going to use by
2588 	max_io_capacity. Limit the value to avoid too quick an increase. */
2589 	ulint	pages_for_lsn =
2590 		std::min<ulint>(sum_pages_for_lsn, srv_max_io_capacity * 2);
2591 
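	/* The target below is the average of three signals: the
	io_capacity percentage derived from the dirty-page and redo-age
	heuristics, the recent average page flush rate, and the
	LSN-progress estimate computed above. */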
2592 	n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3;
2593 
2594 	if (n_pages > srv_max_io_capacity) {
2595 		n_pages = srv_max_io_capacity;
2596 	}
2597 
2598 	/* Normalize request for each instance */
2599 	mutex_enter(&page_cleaner.mutex);
2600 	ut_ad(page_cleaner.n_slots_requested == 0);
2601 	ut_ad(page_cleaner.n_slots_flushing == 0);
2602 	ut_ad(page_cleaner.n_slots_finished == 0);
2603 
2604 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2605 		/* If the redo log has enough free space, do not care
2606 		about the age distribution of the pages. */
2607 		page_cleaner.slots[i].n_pages_requested = pct_for_lsn > 30 ?
2608 			page_cleaner.slots[i].n_pages_requested
2609 			* n_pages / sum_pages_for_lsn + 1
2610 			: n_pages / srv_buf_pool_instances;
2611 	}
2612 	mutex_exit(&page_cleaner.mutex);
2613 
2614 	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2615 
2616 	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn);
2617 
2618 	MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2619 	MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2620 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2621 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2622 
2623 	*lsn_limit = LSN_MAX;
2624 
2625 	return(n_pages);
2626 }
2627 
2628 /*********************************************************************//**
2629 Puts the page_cleaner thread to sleep if it has finished work in less
2630 than a second
2631 @retval 0 if woken up by the event being set,
2632 @retval OS_SYNC_TIME_EXCEEDED if timeout was exceeded
2633 @param next_loop_time	time when next loop iteration should start
2634 @param sig_count	zero or the value returned by previous call of
2635 			os_event_reset()
2636 @param cur_time		current time as in ut_time_ms() */
2637 static
2638 ulint
2639 pc_sleep_if_needed(
2640 /*===============*/
2641 	ulint		next_loop_time,
2642 	int64_t		sig_count,
2643 	ulint		cur_time)
2644 {
2645 	/* No sleep if we are cleaning the buffer pool during the shutdown
2646 	with everything else finished */
2647 	if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)
2648 		return OS_SYNC_TIME_EXCEEDED;
2649 
2650 	if (next_loop_time > cur_time) {
2651 		/* Get the sleep interval in microseconds. We use
2652 		ut_min() to avoid long sleep in case of wrap around. */
2653 		ulint	sleep_us;
2654 
2655 		sleep_us = ut_min(static_cast<ulint>(1000000),
2656 				  (next_loop_time - cur_time) * 1000);
2657 
2658 		return(os_event_wait_time_low(buf_flush_event,
2659 					      sleep_us, sig_count));
2660 	}
2661 
2662 	return(OS_SYNC_TIME_EXCEEDED);
2663 }
2664 
2665 /******************************************************************//**
2666 Initialize page_cleaner. */
2667 void
2668 buf_flush_page_cleaner_init(void)
2669 /*=============================*/
2670 {
2671 	ut_ad(!page_cleaner.is_running);
2672 
2673 	mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner.mutex);
2674 
2675 	page_cleaner.is_requested = os_event_create("pc_is_requested");
2676 	page_cleaner.is_finished = os_event_create("pc_is_finished");
2677 	page_cleaner.is_started = os_event_create("pc_is_started");
2678 	page_cleaner.n_slots = static_cast<ulint>(srv_buf_pool_instances);
2679 
2680 	ut_d(page_cleaner.n_disabled_debug = 0);
2681 
2682 	page_cleaner.is_running = true;
2683 }
2684 
2685 /**
2686 Requests for all slots to flush all buffer pool instances.
2687 @param min_n	wished minimum number of blocks flushed
2688 		(it is not guaranteed that the actual number is that big)
2689 @param lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2690 		oldest_modification is smaller than this should be flushed
2691 		(if their number does not exceed min_n), otherwise ignored
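A typical request cycle, as issued by the coordinator below, looks
roughly like this:
	pc_request(n_to_flush, lsn_limit);
	while (pc_flush_slot() > 0) {}
	pc_wait_finished(&n_flushed_lru, &n_flushed_list);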
2692 */
2693 static
2694 void
2695 pc_request(
2696 	ulint		min_n,
2697 	lsn_t		lsn_limit)
2698 {
2699 	if (min_n != ULINT_MAX) {
2700 		/* Ensure that flushing is spread evenly amongst the
2701 		buffer pool instances. When min_n is ULINT_MAX
2702 		we need to flush everything up to the lsn limit
2703 		so no limit here. */
2704 		min_n = (min_n + srv_buf_pool_instances - 1)
2705 			/ srv_buf_pool_instances;
2706 	}
2707 
2708 	mutex_enter(&page_cleaner.mutex);
2709 
2710 	ut_ad(page_cleaner.n_slots_requested == 0);
2711 	ut_ad(page_cleaner.n_slots_flushing == 0);
2712 	ut_ad(page_cleaner.n_slots_finished == 0);
2713 
2714 	page_cleaner.requested = (min_n > 0);
2715 	page_cleaner.lsn_limit = lsn_limit;
2716 
2717 	for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2718 		page_cleaner_slot_t* slot = &page_cleaner.slots[i];
2719 
2720 		ut_ad(slot->state == PAGE_CLEANER_STATE_NONE);
2721 
2722 		if (min_n == ULINT_MAX) {
2723 			slot->n_pages_requested = ULINT_MAX;
2724 		} else if (min_n == 0) {
2725 			slot->n_pages_requested = 0;
2726 		}
2727 
2728 		/* slot->n_pages_requested was already set by
2729 		page_cleaner_flush_pages_recommendation() */
2730 
2731 		slot->state = PAGE_CLEANER_STATE_REQUESTED;
2732 	}
2733 
2734 	page_cleaner.n_slots_requested = page_cleaner.n_slots;
2735 	page_cleaner.n_slots_flushing = 0;
2736 	page_cleaner.n_slots_finished = 0;
2737 
2738 	os_event_set(page_cleaner.is_requested);
2739 
2740 	mutex_exit(&page_cleaner.mutex);
2741 }
2742 
2743 /**
2744 Do flush for one slot.
2745 @return	the number of slots which have not been treated yet. */
2746 static
2747 ulint
2748 pc_flush_slot(void)
2749 {
2750 	ulint	lru_tm = 0;
2751 	ulint	list_tm = 0;
2752 	ulint	lru_pass = 0;
2753 	ulint	list_pass = 0;
2754 
2755 	mutex_enter(&page_cleaner.mutex);
2756 
2757 	if (!page_cleaner.n_slots_requested) {
2758 		os_event_reset(page_cleaner.is_requested);
2759 	} else {
2760 		page_cleaner_slot_t*	slot = NULL;
2761 		ulint			i;
2762 
2763 		for (i = 0; i < page_cleaner.n_slots; i++) {
2764 			slot = &page_cleaner.slots[i];
2765 
2766 			if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
2767 				break;
2768 			}
2769 		}
2770 
2771 		/* slot should be found because
2772 		page_cleaner.n_slots_requested > 0 */
2773 		ut_a(i < page_cleaner.n_slots);
2774 
2775 		buf_pool_t* buf_pool = buf_pool_from_array(i);
2776 
2777 		page_cleaner.n_slots_requested--;
2778 		page_cleaner.n_slots_flushing++;
2779 		slot->state = PAGE_CLEANER_STATE_FLUSHING;
2780 
2781 		if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
2782 			slot->n_flushed_lru = 0;
2783 			slot->n_flushed_list = 0;
2784 			goto finish_mutex;
2785 		}
2786 
2787 		if (page_cleaner.n_slots_requested == 0) {
2788 			os_event_reset(page_cleaner.is_requested);
2789 		}
2790 
2791 		mutex_exit(&page_cleaner.mutex);
2792 
2793 		lru_tm = ut_time_ms();
2794 
2795 		/* Flush pages from end of LRU if required */
2796 		slot->n_flushed_lru = buf_flush_LRU_list(buf_pool);
2797 
2798 		lru_tm = ut_time_ms() - lru_tm;
2799 		lru_pass++;
2800 
2801 		if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
2802 			slot->n_flushed_list = 0;
2803 			goto finish;
2804 		}
2805 
2806 		/* Flush pages from flush_list if required */
2807 		if (page_cleaner.requested) {
2808 			flush_counters_t n;
2809 			memset(&n, 0, sizeof(flush_counters_t));
2810 			list_tm = ut_time_ms();
2811 
2812 			slot->succeeded_list = buf_flush_do_batch(
2813 				buf_pool, BUF_FLUSH_LIST,
2814 				slot->n_pages_requested,
2815 				page_cleaner.lsn_limit,
2816 				&n);
2817 
2818 			slot->n_flushed_list = n.flushed;
2819 
2820 			list_tm = ut_time_ms() - list_tm;
2821 			list_pass++;
2822 		} else {
2823 			slot->n_flushed_list = 0;
2824 			slot->succeeded_list = true;
2825 		}
2826 finish:
2827 		mutex_enter(&page_cleaner.mutex);
2828 finish_mutex:
2829 		page_cleaner.n_slots_flushing--;
2830 		page_cleaner.n_slots_finished++;
2831 		slot->state = PAGE_CLEANER_STATE_FINISHED;
2832 
2833 		slot->flush_lru_time += lru_tm;
2834 		slot->flush_list_time += list_tm;
2835 		slot->flush_lru_pass += lru_pass;
2836 		slot->flush_list_pass += list_pass;
2837 
2838 		if (page_cleaner.n_slots_requested == 0
2839 		    && page_cleaner.n_slots_flushing == 0) {
2840 			os_event_set(page_cleaner.is_finished);
2841 		}
2842 	}
2843 
2844 	ulint	ret = page_cleaner.n_slots_requested;
2845 
2846 	mutex_exit(&page_cleaner.mutex);
2847 
2848 	return(ret);
2849 }
2850 
2851 /**
2852 Wait until all flush requests are finished.
2853 @param n_flushed_lru	number of pages flushed from the end of the LRU list.
2854 @param n_flushed_list	number of pages flushed from the end of the
2855 			flush_list.
2856 @return			true if all flush_list flushing batches were successful. */
2857 static
2858 bool
2859 pc_wait_finished(
2860 	ulint*	n_flushed_lru,
2861 	ulint*	n_flushed_list)
2862 {
2863 	bool	all_succeeded = true;
2864 
2865 	*n_flushed_lru = 0;
2866 	*n_flushed_list = 0;
2867 
2868 	os_event_wait(page_cleaner.is_finished);
2869 
2870 	mutex_enter(&page_cleaner.mutex);
2871 
2872 	ut_ad(page_cleaner.n_slots_requested == 0);
2873 	ut_ad(page_cleaner.n_slots_flushing == 0);
2874 	ut_ad(page_cleaner.n_slots_finished == page_cleaner.n_slots);
2875 
2876 	for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2877 		page_cleaner_slot_t* slot = &page_cleaner.slots[i];
2878 
2879 		ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED);
2880 
2881 		*n_flushed_lru += slot->n_flushed_lru;
2882 		*n_flushed_list += slot->n_flushed_list;
2883 		all_succeeded &= slot->succeeded_list;
2884 
2885 		slot->state = PAGE_CLEANER_STATE_NONE;
2886 
2887 		slot->n_pages_requested = 0;
2888 	}
2889 
2890 	page_cleaner.n_slots_finished = 0;
2891 
2892 	os_event_reset(page_cleaner.is_finished);
2893 
2894 	mutex_exit(&page_cleaner.mutex);
2895 
2896 	return(all_succeeded);
2897 }
2898 
2899 #ifdef UNIV_LINUX
2900 /**
2901 Set priority for page_cleaner threads.
2902 @param[in]	priority	priority to be set
2903 @return	true if set as intended */
2904 static
2905 bool
2906 buf_flush_page_cleaner_set_priority(
2907 	int	priority)
2908 {
2909 	setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
2910 		    priority);
2911 	return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
2912 	       == priority);
2913 }
2914 #endif /* UNIV_LINUX */
2915 
2916 #ifdef UNIV_DEBUG
2917 /** Loop used to disable page cleaner threads. */
2918 static
2919 void
2920 buf_flush_page_cleaner_disabled_loop(void)
2921 {
2922 	if (!innodb_page_cleaner_disabled_debug) {
2923 		/* We return to avoid entering and exiting mutex. */
2924 		return;
2925 	}
2926 
2927 	mutex_enter(&page_cleaner.mutex);
2928 	page_cleaner.n_disabled_debug++;
2929 	mutex_exit(&page_cleaner.mutex);
2930 
2931 	while (innodb_page_cleaner_disabled_debug
2932 	       && srv_shutdown_state == SRV_SHUTDOWN_NONE
2933 	       && page_cleaner.is_running) {
2934 
2935 		os_thread_sleep(100000); /* [A] */
2936 	}
2937 
2938 	/* We need to wait for threads exiting here, otherwise we would
2939 	encounter problem when we quickly perform following steps:
2940 		1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
2941 		2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0;
2942 		3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
2943 	That's because after step 1 this thread could still be sleeping
2944 	inside the loop above at [A] and steps 2, 3 could happen before
2945 	this thread wakes up from [A]. In such case this thread would
2946 	not re-increment n_disabled_debug and we would wait for it
2947 	forever in buf_flush_page_cleaner_disabled_debug_update(...).
2948 
2949 	Therefore, in step 2, we wait here for this thread to exit. */
2950 
2951 	mutex_enter(&page_cleaner.mutex);
2952 	page_cleaner.n_disabled_debug--;
2953 	mutex_exit(&page_cleaner.mutex);
2954 }
2955 
2956 /** Disables page cleaner threads (coordinator and workers).
2957 @param[in]	save		immediate result from check function */
2958 void buf_flush_page_cleaner_disabled_debug_update(THD*,
2959 						  st_mysql_sys_var*, void*,
2960 						  const void* save)
2961 {
2962 	if (!page_cleaner.is_running) {
2963 		return;
2964 	}
2965 
2966 	if (!*static_cast<const my_bool*>(save)) {
2967 		if (!innodb_page_cleaner_disabled_debug) {
2968 			return;
2969 		}
2970 
2971 		innodb_page_cleaner_disabled_debug = false;
2972 
2973 		/* Enable page cleaner threads. */
2974 		while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
2975 			mutex_enter(&page_cleaner.mutex);
2976 			const ulint n = page_cleaner.n_disabled_debug;
2977 			mutex_exit(&page_cleaner.mutex);
2978 			/* Check if all threads have been enabled, to avoid
2979 			problem when we decide to re-disable them soon. */
2980 			if (n == 0) {
2981 				break;
2982 			}
2983 		}
2984 		return;
2985 	}
2986 
2987 	if (innodb_page_cleaner_disabled_debug) {
2988 		return;
2989 	}
2990 
2991 	innodb_page_cleaner_disabled_debug = true;
2992 
2993 	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
2994 		/* Workers are possibly sleeping on is_requested.
2995 
2996 		We have to wake them; otherwise they might never
2997 		notice that they should be disabled, and we would
2998 		wait for them here forever.
2999 
3000 		That is why we use a sleep loop instead of simply
3001 		waiting on some disabled_debug_event. */
3002 		os_event_set(page_cleaner.is_requested);
3003 
3004 		mutex_enter(&page_cleaner.mutex);
3005 
3006 		ut_ad(page_cleaner.n_disabled_debug
3007 		      <= srv_n_page_cleaners);
3008 
3009 		if (page_cleaner.n_disabled_debug
3010 		    == srv_n_page_cleaners) {
3011 
3012 			mutex_exit(&page_cleaner.mutex);
3013 			break;
3014 		}
3015 
3016 		mutex_exit(&page_cleaner.mutex);
3017 
3018 		os_thread_sleep(100000);
3019 	}
3020 }
3021 #endif /* UNIV_DEBUG */
3022 
3023 /******************************************************************//**
3024 page_cleaner thread tasked with flushing dirty pages from the buffer
3025 pools. As of now we'll have only one coordinator.
3026 @return a dummy parameter */
3027 extern "C"
3028 os_thread_ret_t
3029 DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*)
3030 {
3031 	my_thread_init();
3032 #ifdef UNIV_PFS_THREAD
3033 	pfs_register_thread(page_cleaner_thread_key);
3034 #endif /* UNIV_PFS_THREAD */
3035 	ut_ad(!srv_read_only_mode);
3036 
3037 #ifdef UNIV_DEBUG_THREAD_CREATION
3038 	ib::info() << "page_cleaner thread running, id "
3039 		<< os_thread_pf(os_thread_get_curr_id());
3040 #endif /* UNIV_DEBUG_THREAD_CREATION */
3041 #ifdef UNIV_LINUX
3042 	/* Linux may allow a different scheduling priority for each
3043 	thread; it is worth trying to raise the page cleaner priority. */
3044 	if (buf_flush_page_cleaner_set_priority(
3045 		buf_flush_page_cleaner_priority)) {
3046 
3047 		ib::info() << "page_cleaner coordinator priority: "
3048 			<< buf_flush_page_cleaner_priority;
3049 	} else {
3050 		ib::info() << "If the mysqld execution user is authorized,"
3051 		" page cleaner thread priority can be changed."
3052 		" See the man page of setpriority().";
3053 	}
3054 	/* Signal that setpriority() has been attempted. */
3055 	os_event_set(recv_sys->flush_end);
3056 #endif /* UNIV_LINUX */
3057 
3058 	do {
3059 		/* Handle flushing requests issued during recovery. */
3060 		ulint	n_flushed_lru = 0;
3061 		ulint	n_flushed_list = 0;
3062 
3063 		os_event_wait(recv_sys->flush_start);
3064 
3065 		if (!recv_writer_thread_active) {
3066 			break;
3067 		}
3068 
3069 		switch (recv_sys->flush_type) {
3070 		case BUF_FLUSH_LRU:
3071 			/* Flush pages from end of LRU if required */
3072 			pc_request(0, LSN_MAX);
3073 			while (pc_flush_slot() > 0) {}
3074 			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3075 			break;
3076 
3077 		case BUF_FLUSH_LIST:
3078 			/* Flush all pages */
3079 			do {
3080 				pc_request(ULINT_MAX, LSN_MAX);
3081 				while (pc_flush_slot() > 0) {}
3082 			} while (!pc_wait_finished(&n_flushed_lru,
3083 						   &n_flushed_list));
3084 			break;
3085 
3086 		default:
3087 			ut_ad(0);
3088 		}
3089 
3090 		os_event_reset(recv_sys->flush_start);
3091 		os_event_set(recv_sys->flush_end);
3092 	} while (recv_writer_thread_active);
3093 
3094 	os_event_wait(buf_flush_event);
3095 
3096 	ulint	ret_sleep = 0;
3097 	ulint	n_evicted = 0;
3098 	ulint	n_flushed_last = 0;
3099 	ulint	warn_interval = 1;
3100 	ulint	warn_count = 0;
3101 	int64_t	sig_count = os_event_reset(buf_flush_event);
3102 	ulint	next_loop_time = ut_time_ms() + 1000;
3103 	ulint	n_flushed = 0;
3104 	ulint	last_activity = srv_get_activity_count();
3105 	ulint	last_pages = 0;
3106 
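	/* Main loop: roughly once a second, or earlier when woken by
	buf_flush_event (e.g. for a flush_sync request), decide how much
	to flush, dispatch the per-instance slots, help with the
	flushing, and collect the results. */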
3107 	while (srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
3108 		ulint	curr_time = ut_time_ms();
3109 
3110 		/* The page_cleaner skips sleep if the server is
3111 		idle and there are no pending IOs in the buffer pool
3112 		and there is work to do. */
3113 		if (srv_check_activity(last_activity)
3114 		    || buf_get_n_pending_read_ios()
3115 		    || n_flushed == 0) {
3116 
3117 			ret_sleep = pc_sleep_if_needed(
3118 				next_loop_time, sig_count, curr_time);
3119 		} else if (curr_time > next_loop_time) {
3120 			ret_sleep = OS_SYNC_TIME_EXCEEDED;
3121 		} else {
3122 			ret_sleep = 0;
3123 		}
3124 
3125 		if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
3126 			break;
3127 		}
3128 
3129 		sig_count = os_event_reset(buf_flush_event);
3130 
3131 		if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3132 			if (global_system_variables.log_warnings > 2
3133 			    && curr_time > next_loop_time + 3000
3134 			    && !(test_flags & TEST_SIGINT)) {
3135 				if (warn_count == 0) {
3136 					ib::info() << "page_cleaner: 1000ms"
3137 						" intended loop took "
3138 						<< 1000 + curr_time
3139 						   - next_loop_time
3140 						<< "ms. The settings might not"
3141 						" be optimal. (flushed="
3142 						<< n_flushed_last
3143 						<< " and evicted="
3144 						<< n_evicted
3145 						<< ", during the time.)";
3146 					if (warn_interval > 300) {
3147 						warn_interval = 600;
3148 					} else {
3149 						warn_interval *= 2;
3150 					}
3151 
3152 					warn_count = warn_interval;
3153 				} else {
3154 					--warn_count;
3155 				}
3156 			} else {
3157 				/* reset counter */
3158 				warn_interval = 1;
3159 				warn_count = 0;
3160 			}
3161 
3162 			next_loop_time = curr_time + 1000;
3163 			n_flushed_last = n_evicted = 0;
3164 		}
3165 
3166 		if (ret_sleep != OS_SYNC_TIME_EXCEEDED
3167 		    && srv_flush_sync
3168 		    && buf_flush_sync_lsn > 0) {
3169 			/* woke up for flush_sync */
3170 			mutex_enter(&page_cleaner.mutex);
3171 			lsn_t	lsn_limit = buf_flush_sync_lsn;
3172 			buf_flush_sync_lsn = 0;
3173 			mutex_exit(&page_cleaner.mutex);
3174 
3175 			/* Request flushing for threads */
3176 			pc_request(ULINT_MAX, lsn_limit);
3177 
3178 			ulint tm = ut_time_ms();
3179 
3180 			/* The coordinator also processes requests. */
3181 			while (pc_flush_slot() > 0) {}
3182 
3183 			/* Only the coordinator uses these counters,
3184 			so there is no need to protect them with a lock. */
3185 			page_cleaner.flush_time += ut_time_ms() - tm;
3186 			page_cleaner.flush_pass++;
3187 
3188 			/* Wait for all slots to be finished */
3189 			ulint	n_flushed_lru = 0;
3190 			ulint	n_flushed_list = 0;
3191 			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3192 
3193 			if (n_flushed_list > 0 || n_flushed_lru > 0) {
3194 				buf_flush_stats(n_flushed_list, n_flushed_lru);
3195 
3196 				MONITOR_INC_VALUE_CUMULATIVE(
3197 					MONITOR_FLUSH_SYNC_TOTAL_PAGE,
3198 					MONITOR_FLUSH_SYNC_COUNT,
3199 					MONITOR_FLUSH_SYNC_PAGES,
3200 					n_flushed_lru + n_flushed_list);
3201 			}
3202 
3203 			n_flushed = n_flushed_lru + n_flushed_list;
3204 
3205 		} else if (srv_check_activity(last_activity)) {
3206 			ulint	n_to_flush;
3207 			lsn_t	lsn_limit = 0;
3208 
3209 			/* Estimate pages from flush_list to be flushed */
3210 			if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3211 				last_activity = srv_get_activity_count();
3212 				n_to_flush =
3213 					page_cleaner_flush_pages_recommendation(
3214 						&lsn_limit, last_pages);
3215 			} else {
3216 				n_to_flush = 0;
3217 			}
3218 
3219 			/* Request flushing for threads */
3220 			pc_request(n_to_flush, lsn_limit);
3221 
3222 			ulint tm = ut_time_ms();
3223 
3224 			/* The coordinator also processes requests. */
3225 			while (pc_flush_slot() > 0) {
3226 				/* No op */
3227 			}
3228 
3229 			/* Only the coordinator uses these counters,
3230 			so there is no need to protect them with a lock. */
3231 			page_cleaner.flush_time += ut_time_ms() - tm;
3232 			page_cleaner.flush_pass++;
3233 
3234 			/* Wait for all slots to be finished */
3235 			ulint	n_flushed_lru = 0;
3236 			ulint	n_flushed_list = 0;
3237 
3238 			pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3239 
3240 			if (n_flushed_list > 0 || n_flushed_lru > 0) {
3241 				buf_flush_stats(n_flushed_list, n_flushed_lru);
3242 			}
3243 
3244 			if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3245 				last_pages = n_flushed_list;
3246 			}
3247 
3248 			n_evicted += n_flushed_lru;
3249 			n_flushed_last += n_flushed_list;
3250 
3251 			n_flushed = n_flushed_lru + n_flushed_list;
3252 
3253 			if (n_flushed_lru) {
3254 				MONITOR_INC_VALUE_CUMULATIVE(
3255 					MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
3256 					MONITOR_LRU_BATCH_FLUSH_COUNT,
3257 					MONITOR_LRU_BATCH_FLUSH_PAGES,
3258 					n_flushed_lru);
3259 			}
3260 
3261 			if (n_flushed_list) {
3262 				MONITOR_INC_VALUE_CUMULATIVE(
3263 					MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
3264 					MONITOR_FLUSH_ADAPTIVE_COUNT,
3265 					MONITOR_FLUSH_ADAPTIVE_PAGES,
3266 					n_flushed_list);
3267 			}
3268 
3269 		} else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3270 			/* no activity, slept enough */
3271 			buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed);
3272 
3273 			n_flushed_last += n_flushed;
3274 
3275 			if (n_flushed) {
3276 				MONITOR_INC_VALUE_CUMULATIVE(
3277 					MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
3278 					MONITOR_FLUSH_BACKGROUND_COUNT,
3279 					MONITOR_FLUSH_BACKGROUND_PAGES,
3280 					n_flushed);
3281 
3282 			}
3283 
3284 		} else {
3285 			/* no activity, but woken up by event */
3286 			n_flushed = 0;
3287 		}
3288 
3289 		ut_d(buf_flush_page_cleaner_disabled_loop());
3290 	}
3291 
3292 	ut_ad(srv_shutdown_state > SRV_SHUTDOWN_INITIATED);
3293 	if (srv_fast_shutdown == 2
3294 	    || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
3295 		/* In very fast shutdown or when innodb failed to start, we
3296 		simulate a crash of the buffer pool. We are not required to do
3297 		any flushing. */
3298 		goto thread_exit;
3299 	}
3300 
3301 	/* In case of normal and slow shutdown the page_cleaner thread
3302 	must wait for all other activity in the server to die down.
3303 	Note that we can start flushing the buffer pool as soon as the
3304 	server enters shutdown phase but we must stay alive long enough
3305 	to ensure that any work done by the master or purge threads is
3306 	also flushed.
3307 	During shutdown we pass through two stages. In the first stage,
3308 	when SRV_SHUTDOWN_CLEANUP is set other threads like the master
3309 	and the purge threads may be working as well. We start flushing
3310 	the buffer pool but can't be sure that no new pages are being
3311 	dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
3312 
3313 	do {
3314 		pc_request(ULINT_MAX, LSN_MAX);
3315 
3316 		while (pc_flush_slot() > 0) {}
3317 
3318 		ulint	n_flushed_lru = 0;
3319 		ulint	n_flushed_list = 0;
3320 		pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3321 
3322 		n_flushed = n_flushed_lru + n_flushed_list;
3323 
3324 		/* We sleep only if there are no pages to flush */
3325 		if (n_flushed == 0) {
3326 			os_thread_sleep(100000);
3327 		}
3328 	} while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
3329 
3330 	/* At this point all threads including the master and the purge
3331 	thread must have been suspended. */
3332 	ut_a(srv_get_active_thread_type() == SRV_NONE);
3333 	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3334 
3335 	/* We can now make a final sweep on flushing the buffer pool
3336 	and exit after we have cleaned the whole buffer pool.
3337 	It is important that we wait for any running batch that has
3338 	been triggered by us to finish. Otherwise we can end up
3339 	considering the end of that batch as the finish of our final
3340 	sweep, and we would come out of the loop leaving behind dirty
3341 	pages in the flush_list. */
3342 	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3343 	buf_flush_wait_LRU_batch_end();
3344 
3345 	bool	success;
3346 
	do {
		pc_request(ULINT_MAX, LSN_MAX);

		while (pc_flush_slot() > 0) {}

		ulint	n_flushed_lru = 0;
		ulint	n_flushed_list = 0;
		success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);

		n_flushed = n_flushed_lru + n_flushed_list;

		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
		buf_flush_wait_LRU_batch_end();

	} while (!success || n_flushed > 0);

	/* Some sanity checks */
	ut_a(srv_get_active_thread_type() == SRV_NONE);
	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);

	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t* buf_pool = buf_pool_from_array(i);
		ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
	}

	/* We have lived our life. Time to die. */

thread_exit:
	/* All worker threads are waiting on the event here and no
	longer access the page_cleaner structure. Wake them up just so
	that they exit. */
	page_cleaner.is_running = false;

	/* Wait for all worker threads to exit. */
	while (page_cleaner.n_workers) {
		os_event_set(page_cleaner.is_requested);
		os_thread_sleep(10000);
	}

	mutex_destroy(&page_cleaner.mutex);

	os_event_destroy(page_cleaner.is_finished);
	os_event_destroy(page_cleaner.is_requested);
	os_event_destroy(page_cleaner.is_started);

	buf_page_cleaner_is_active = false;

	my_thread_end();
	/* We count the number of threads in os_thread_exit(). A created
	thread should always exit via that function and never by simply
	returning. */
	os_thread_exit();

	OS_THREAD_DUMMY_RETURN;
}

/** Adjust thread count for page cleaner workers.
@param[in]	new_cnt		Number of threads to be used */
void
buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt)
{
	mutex_enter(&page_cleaner.mutex);

	srv_n_page_cleaners = new_cnt;
	if (new_cnt > page_cleaner.n_workers) {
		/* User has increased the number of page
		cleaner threads. */
		ulint add = new_cnt - page_cleaner.n_workers;
		for (ulint i = 0; i < add; i++) {
			os_thread_id_t cleaner_thread_id;
			os_thread_create(buf_flush_page_cleaner_worker, NULL, &cleaner_thread_id);
		}
	}

	mutex_exit(&page_cleaner.mutex);

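	/* page_cleaner.n_workers counts only the worker threads; the
	coordinator thread accounts for the remaining one of
	srv_n_page_cleaners, hence the "- 1" below. */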
	/* Wait until the expected number of workers has started. */
	while (page_cleaner.is_running &&
	       page_cleaner.n_workers != (srv_n_page_cleaners - 1)) {
		os_event_set(page_cleaner.is_requested);
		os_event_reset(page_cleaner.is_started);
		os_event_wait_time(page_cleaner.is_started, 1000000);
	}
}

/******************************************************************//**
Worker thread of page_cleaner.
@return a dummy parameter */
extern "C"
os_thread_ret_t
DECLARE_THREAD(buf_flush_page_cleaner_worker)(
/*==========================================*/
	void*	arg MY_ATTRIBUTE((unused)))
			/*!< in: a dummy parameter required by
			os_thread_create */
{
	my_thread_init();
#ifndef DBUG_OFF
	os_thread_id_t cleaner_thread_id = os_thread_get_curr_id();
#endif

	mutex_enter(&page_cleaner.mutex);
	ulint thread_no = page_cleaner.n_workers++;

	DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
		 << " started; n_workers=" << page_cleaner.n_workers);

	/* Signal that we have started */
	os_event_set(page_cleaner.is_started);
	mutex_exit(&page_cleaner.mutex);

#ifdef UNIV_LINUX
	/* Linux can set the scheduling priority per thread, so it is
	worth trying to give the page cleaner threads a high priority. */
	if (buf_flush_page_cleaner_set_priority(
		buf_flush_page_cleaner_priority)) {

		ib::info() << "page_cleaner worker priority: "
			<< buf_flush_page_cleaner_priority;
	}
#endif /* UNIV_LINUX */

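	/* Wait for the coordinator to publish a flush request in the
	page_cleaner slots, service it, and repeat until told to stop. */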
	while (true) {
		os_event_wait(page_cleaner.is_requested);

		ut_d(buf_flush_page_cleaner_disabled_loop());

		if (!page_cleaner.is_running) {
			break;
		}

		ut_ad(srv_n_page_cleaners >= 1);

		/* If the number of page cleaner threads has been
		decreased, exit the workers that are no longer needed. */
		if (srv_shutdown_state == SRV_SHUTDOWN_NONE &&
		    thread_no >= (srv_n_page_cleaners - 1)) {
			DBUG_LOG("ib_buf", "Exiting "
				<< thread_no
				<< " page cleaner worker thread_id "
				<< os_thread_pf(cleaner_thread_id)
				<< " total threads " << srv_n_page_cleaners << ".");
			break;
		}

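		/* Flush one requested slot (one buffer pool instance)
		on behalf of the coordinator. */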
		pc_flush_slot();
	}

	mutex_enter(&page_cleaner.mutex);
	page_cleaner.n_workers--;

	DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
		 << " exiting; n_workers=" << page_cleaner.n_workers);

	/* Signal that we have stopped */
	os_event_set(page_cleaner.is_started);
	mutex_exit(&page_cleaner.mutex);

	my_thread_end();

	os_thread_exit();

	OS_THREAD_DUMMY_RETURN;
}

/*******************************************************************//**
Synchronously flush dirty blocks from the end of the flush list of all buffer
pool instances.
NOTE: The calling thread is not allowed to own any latches on pages! */
void
buf_flush_sync_all_buf_pools(void)
/*==============================*/
{
	bool success;
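	/* buf_flush_lists() returns false if a flush batch could not be
	started on some instance because another batch was already
	running there; keep retrying until every instance has accepted
	and completed the flush. */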
	do {
		success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
	} while (!success);

	ut_a(success);
}

/** Request an I/O burst and wake the page_cleaner up.
@param[in]	lsn_limit	upper limit of LSN to be flushed */
void
buf_flush_request_force(
	lsn_t	lsn_limit)
{
	/* Add a margin based on lsn_avg_rate so that the target does
	not become stale before it is served. */
	lsn_t	lsn_target = lsn_limit + lsn_avg_rate * 3;

	mutex_enter(&page_cleaner.mutex);
	if (lsn_target > buf_flush_sync_lsn) {
		buf_flush_sync_lsn = lsn_target;
	}
	mutex_exit(&page_cleaner.mutex);

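	/* Wake the page_cleaner coordinator, which watches
	buf_flush_sync_lsn and flushes until that target is reached. */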
	os_event_set(buf_flush_event);
}
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG

/** Functor to validate the flush list. */
struct	Check {
	void operator()(const buf_page_t* elem) const
	{
		ut_a(elem->in_flush_list);
	}
};

/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(
/*===================*/
	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
{
	buf_page_t*		bpage;
	const ib_rbt_node_t*	rnode = NULL;

	ut_ad(buf_flush_list_mutex_own(buf_pool));

	ut_list_validate(buf_pool->flush_list, Check());

	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

	/* If we are in recovery mode, i.e. flush_rbt != NULL,
	then each block in the flush_list must also be present
	in the flush_rbt. */
	if (buf_pool->flush_rbt != NULL) {
		rnode = rbt_first(buf_pool->flush_rbt);
	}

	while (bpage != NULL) {
		const lsn_t	om = bpage->oldest_modification;

		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);

		ut_ad(bpage->in_flush_list);

		/* A page in buf_pool->flush_list can be in
		BUF_BLOCK_REMOVE_HASH state. This happens when a page
		is in the middle of being relocated. In that case the
		original descriptor can have this state and still be
		in the flush list waiting to acquire the
		buf_pool->flush_list_mutex to complete the relocation. */
		ut_a(buf_page_in_file(bpage)
		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
		ut_a(om > 0);

		if (buf_pool->flush_rbt != NULL) {
			buf_page_t**	prpage;

			ut_a(rnode != NULL);
			prpage = rbt_value(buf_page_t*, rnode);

			ut_a(*prpage != NULL);
			ut_a(*prpage == bpage);
			rnode = rbt_next(buf_pool->flush_rbt, rnode);
		}

		bpage = UT_LIST_GET_NEXT(list, bpage);

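		/* The flush_list is kept in descending order of
		oldest_modification: each entry's oldest_modification
		must be at least that of the entry that follows it. */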
		ut_a(bpage == NULL || om >= bpage->oldest_modification);
	}

	/* By this time we must have exhausted the traversal of
	flush_rbt (if active) as well. */
	ut_a(rnode == NULL);

	return(TRUE);
}

/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
ibool
buf_flush_validate(
/*===============*/
	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
{
	ibool	ret;

	buf_flush_list_mutex_enter(buf_pool);

	ret = buf_flush_validate_low(buf_pool);

	buf_flush_list_mutex_exit(buf_pool);

	return(ret);
}

#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/******************************************************************//**
Check if there are any dirty pages that belong to a space id in the flush
list in a particular buffer pool.
@return number of dirty pages present in a single buffer pool */
ulint
buf_pool_get_dirty_pages_count(
/*===========================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
	ulint		id,		/*!< in: space id to check */
	FlushObserver*	observer)	/*!< in: flush observer to check */
{
	ulint		count = 0;

	buf_pool_mutex_enter(buf_pool);
	buf_flush_list_mutex_enter(buf_pool);

	buf_page_t*	bpage;

	for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
	     bpage != 0;
	     bpage = UT_LIST_GET_NEXT(list, bpage)) {

		ut_ad(buf_page_in_file(bpage));
		ut_ad(bpage->in_flush_list);
		ut_ad(bpage->oldest_modification > 0);

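		/* Count the page if it matches the given flush
		observer, or, when no observer is given, if it belongs
		to the given tablespace. */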
		if ((observer != NULL
		     && observer == bpage->flush_observer)
		    || (observer == NULL
			&& id == bpage->id.space())) {
			++count;
		}
	}

	buf_flush_list_mutex_exit(buf_pool);
	buf_pool_mutex_exit(buf_pool);

	return(count);
}

/******************************************************************//**
Check if there are any dirty pages that belong to a space id in the flush list.
@return number of dirty pages present in all the buffer pools */
static
ulint
buf_flush_get_dirty_pages_count(
/*============================*/
	ulint		id,		/*!< in: space id to check */
	FlushObserver*	observer)	/*!< in: flush observer to check */
{
	ulint		count = 0;

	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		count += buf_pool_get_dirty_pages_count(buf_pool, id, observer);
	}

	return(count);
}

/** FlushObserver constructor
@param[in]	space		tablespace
@param[in]	trx		trx instance
@param[in]	stage		performance schema accounting object,
used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
for accounting. */
FlushObserver::FlushObserver(
	fil_space_t*		space,
	trx_t*			trx,
	ut_stage_alter_t*	stage)
	:
	m_space(space),
	m_trx(trx),
	m_stage(stage),
	m_interrupted(false)
{
	m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
	m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));

	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		m_flushed->at(i) = 0;
		m_removed->at(i) = 0;
	}

	DBUG_LOG("flush", "FlushObserver(): trx->id=" << m_trx->id);
}

/** FlushObserver destructor */
FlushObserver::~FlushObserver()
{
	ut_ad(buf_flush_get_dirty_pages_count(m_space->id, this) == 0);

	UT_DELETE(m_flushed);
	UT_DELETE(m_removed);

	DBUG_LOG("flush", "~FlushObserver(): trx->id=" << m_trx->id);
}

/** Check whether the operation has been interrupted */
void FlushObserver::check_interrupted()
{
	if (trx_is_interrupted(m_trx)) {
		interrupted();
	}
}

/** Notify observer of a flush
@param[in]	buf_pool	buffer pool instance
@param[in]	bpage		buffer page to flush */
void
FlushObserver::notify_flush(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	ut_ad(buf_pool_mutex_own(buf_pool));

	m_flushed->at(buf_pool->instance_no)++;

	if (m_stage != NULL) {
		m_stage->inc();
	}

	DBUG_LOG("flush", "Flush " << bpage->id);
}

/** Notify observer of a remove
@param[in]	buf_pool	buffer pool instance
@param[in]	bpage		buffer page removed */
void
FlushObserver::notify_remove(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	ut_ad(buf_pool_mutex_own(buf_pool));

	m_removed->at(buf_pool->instance_no)++;

	DBUG_LOG("flush", "Remove " << bpage->id);
}

/** Flush dirty pages and wait. */
void
FlushObserver::flush()
{
	ut_ad(m_trx);

	if (!m_interrupted && m_stage) {
		m_stage->begin_phase_flush(buf_flush_get_dirty_pages_count(
						   m_space->id, this));
	}

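	/* Flush or remove the dirty pages that belong to this
	tablespace; each page is reported back to this observer via
	notify_flush() or notify_remove(). */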
	buf_LRU_flush_or_remove_pages(m_space->id, this);

	/* Wait until all dirty pages have been flushed. */
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		while (!is_complete(i)) {
			os_thread_sleep(2000);
		}
	}
}