1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file buf/buf0flu.cc
29 The database buffer buf_pool flush algorithm
30 
31 Created 11/11/1995 Heikki Tuuri
32 *******************************************************/
33 
34 #include "buf0flu.h"
35 
36 #ifdef UNIV_NONINL
37 #include "buf0flu.ic"
38 #endif
39 
40 #include "buf0buf.h"
41 #include "buf0checksum.h"
42 #include "srv0start.h"
43 #include "srv0srv.h"
44 #include "page0zip.h"
45 #ifndef UNIV_HOTBACKUP
46 #include "ut0byte.h"
47 #include "ut0lst.h"
48 #include "page0page.h"
49 #include "fil0fil.h"
50 #include "buf0lru.h"
51 #include "buf0rea.h"
52 #include "ibuf0ibuf.h"
53 #include "log0log.h"
54 #include "os0file.h"
55 #include "trx0sys.h"
56 #include "srv0mon.h"
57 #include "mysql/plugin.h"
58 #include "mysql/service_thd_wait.h"
59 
60 /** Number of pages flushed through non flush_list flushes. */
61 // static ulint buf_lru_flush_page_count = 0;
62 
63 /** Flag indicating if the page_cleaner is in active state. This flag
64 is set to TRUE by the page_cleaner thread when it is spawned and is set
65 back to FALSE at shutdown by the page_cleaner as well. Therefore no
66 need to protect it by a mutex. It is only ever read by the thread
67 doing the shutdown. */
68 UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE;
69 
70 /** Flag indicating if the lru_manager is in active state. */
71 UNIV_INTERN bool buf_lru_manager_is_active = false;
72 
73 #ifdef UNIV_PFS_THREAD
74 UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key;
75 UNIV_INTERN mysql_pfs_key_t buf_lru_manager_thread_key;
76 #endif /* UNIV_PFS_THREAD */
77 
78 /* @} */
79 
80 /** Handled page counters for a single flush */
81 struct flush_counters_t {
82 	ulint	flushed;	/*!< number of dirty pages flushed */
83 	ulint	evicted;	/*!< number of clean pages evicted, including
84 			        evicted uncompressed page images */
85 	ulint	unzip_LRU_evicted;/*!< number of uncompressed page images
86 				evicted */
87 };
88 
89 /******************************************************************//**
90 Increases the flush_list size in bytes by zip_size for a compressed page,
91 or by UNIV_PAGE_SIZE for an uncompressed page. */
92 static inline
93 void
94 incr_flush_list_size_in_bytes(
95 /*==========================*/
96 	buf_block_t*	block,		/*!< in: control block */
97 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
98 {
99 	ut_ad(buf_flush_list_mutex_own(buf_pool));
100 	ulint zip_size = page_zip_get_size(&block->page.zip);
101 	buf_pool->stat.flush_list_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
102 	ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
103 }
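
/* Worked example (illustration only, assuming the default 16KB
UNIV_PAGE_SIZE): inserting a dirty 8KB compressed page adds 8192 bytes to
buf_pool->stat.flush_list_bytes, while inserting an uncompressed page adds
UNIV_PAGE_SIZE (16384) bytes, because page_zip_get_size() returns 0 for a
block that has no compressed page image. */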
104 
105 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
106 /******************************************************************//**
107 Validates the flush list.
108 @return	TRUE if ok */
109 static
110 ibool
111 buf_flush_validate_low(
112 /*===================*/
113 	buf_pool_t*	buf_pool);	/*!< in: Buffer pool instance */
114 
115 /******************************************************************//**
116 Validates the flush list some of the time.
117 @return	TRUE if ok or the check was skipped */
118 static
119 ibool
120 buf_flush_validate_skip(
121 /*====================*/
122 	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
123 {
124 /** Try buf_flush_validate_low() every this many times */
125 # define BUF_FLUSH_VALIDATE_SKIP	23
126 
127 	/** The buf_flush_validate_low() call skip counter.
128 	Use a signed type because of the race condition below. */
129 	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
130 
131 	/* There is a race condition below, but it does not matter,
132 	because this call is only for heuristic purposes. We want to
133 	reduce the call frequency of the costly buf_flush_validate_low()
134 	check in debug builds. */
135 	if (--buf_flush_validate_count > 0) {
136 		return(TRUE);
137 	}
138 
139 	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
140 	return(buf_flush_validate_low(buf_pool));
141 }
142 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
143 
144 /*******************************************************************//**
145 Sets hazard pointer during flush_list iteration. */
146 UNIV_INLINE
147 void
148 buf_flush_set_hp(
149 /*=============*/
150 	buf_pool_t*		buf_pool,/*!< in/out: buffer pool instance */
151 	const buf_page_t*	bpage)	/*!< in: buffer control block */
152 {
153 	ut_ad(buf_flush_list_mutex_own(buf_pool));
154 	ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL);
155 	ut_ad(!bpage || buf_page_in_file(bpage)
156 	      || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
157 	ut_ad(!bpage || bpage->in_flush_list);
158 	ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool);
159 
160 	buf_pool->flush_list_hp = bpage;
161 }
162 
163 /*******************************************************************//**
164 Checks if the given block is a hazard pointer
165 @return true if bpage is hazard pointer */
166 UNIV_INLINE
167 bool
168 buf_flush_is_hp(
169 /*============*/
170 	buf_pool_t*		buf_pool,/*!< in: buffer pool instance */
171 	const buf_page_t*	bpage)	/*!< in: buffer control block */
172 {
173 	ut_ad(buf_flush_list_mutex_own(buf_pool));
174 
175 	return(buf_pool->flush_list_hp == bpage);
176 }
177 
178 /*******************************************************************//**
179 Whenever we move a block in flush_list (either to remove it or to
180 relocate it) we check the hazard pointer set by some other thread
181 doing the flush list scan. If the hazard pointer is the same as the
182 one we are about to move, then we set it to NULL to force a rescan
183 in the thread doing the batch. */
184 UNIV_INLINE
185 void
186 buf_flush_update_hp(
187 /*================*/
188 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
189 	buf_page_t*	bpage)		/*!< in: buffer control block */
190 {
191 	ut_ad(buf_flush_list_mutex_own(buf_pool));
192 
193 	if (buf_flush_is_hp(buf_pool, bpage)) {
194 		buf_flush_set_hp(buf_pool, NULL);
195 		MONITOR_INC(MONITOR_FLUSH_HP_RESCAN);
196 	}
197 }
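
/*******************************************************************//**
Illustrative sketch (an assumption, simplified from
buf_do_flush_list_batch() later in this file) of how a flush_list scan is
expected to cooperate with the hazard pointer primitives above:

	buf_flush_list_mutex_enter(buf_pool);
	buf_page_t*	bpage = UT_LIST_GET_LAST(buf_pool->flush_list);

	while (bpage != NULL) {
		buf_page_t*	prev = UT_LIST_GET_PREV(list, bpage);

		buf_flush_set_hp(buf_pool, prev);
		buf_flush_list_mutex_exit(buf_pool);

		(flush bpage here without holding the flush list mutex)

		buf_flush_list_mutex_enter(buf_pool);

		if (!buf_flush_is_hp(buf_pool, prev)) {
			(prev was moved or removed: restart from the tail)
			bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
		} else {
			bpage = prev;
		}

		buf_flush_set_hp(buf_pool, NULL);
	}

	buf_flush_list_mutex_exit(buf_pool); */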
198 
199 /******************************************************************//**
200 Inserts a block into the flush_rbt and returns a pointer to its
201 predecessor or NULL if no predecessor. The ordering is maintained
202 on the basis of the <oldest_modification, space, offset> key.
203 @return	pointer to the predecessor or NULL if no predecessor. */
204 static
205 buf_page_t*
206 buf_flush_insert_in_flush_rbt(
207 /*==========================*/
208 	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
209 {
210 	const ib_rbt_node_t*	c_node;
211 	const ib_rbt_node_t*	p_node;
212 	buf_page_t*		prev = NULL;
213 	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);
214 
215 	ut_ad(buf_flush_list_mutex_own(buf_pool));
216 
217 	/* Insert this buffer into the rbt. */
218 	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
219 	ut_a(c_node != NULL);
220 
221 	/* Get the predecessor. */
222 	p_node = rbt_prev(buf_pool->flush_rbt, c_node);
223 
224 	if (p_node != NULL) {
225 		buf_page_t**	value;
226 		value = rbt_value(buf_page_t*, p_node);
227 		prev = *value;
228 		ut_a(prev != NULL);
229 	}
230 
231 	return(prev);
232 }
233 
234 /*********************************************************//**
235 Delete a bpage from the flush_rbt. */
236 static
237 void
238 buf_flush_delete_from_flush_rbt(
239 /*============================*/
240 	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
241 {
242 #ifdef UNIV_DEBUG
243 	ibool		ret = FALSE;
244 #endif /* UNIV_DEBUG */
245 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
246 
247 	ut_ad(buf_flush_list_mutex_own(buf_pool));
248 
249 #ifdef UNIV_DEBUG
250 	ret =
251 #endif /* UNIV_DEBUG */
252 	rbt_delete(buf_pool->flush_rbt, &bpage);
253 
254 	ut_ad(ret);
255 }
256 
257 /*****************************************************************//**
258 Compare two modified blocks in the buffer pool. The key for comparison
259 is:
260 key = <oldest_modification, space, offset>
261 This comparison is used to maintain the ordering of blocks in the
262 buf_pool->flush_rbt.
263 Note that for the purpose of flush_rbt, we only need to order blocks
264 on the oldest_modification. The other two fields are used to uniquely
265 identify the blocks.
266 @return	 < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
267 static
268 int
269 buf_flush_block_cmp(
270 /*================*/
271 	const void*	p1,		/*!< in: block1 */
272 	const void*	p2)		/*!< in: block2 */
273 {
274 	int			ret;
275 	const buf_page_t*	b1 = *(const buf_page_t**) p1;
276 	const buf_page_t*	b2 = *(const buf_page_t**) p2;
277 #ifdef UNIV_DEBUG
278 	buf_pool_t*		buf_pool = buf_pool_from_bpage(b1);
279 #endif /* UNIV_DEBUG */
280 
281 	ut_ad(b1 != NULL);
282 	ut_ad(b2 != NULL);
283 
284 	ut_ad(buf_flush_list_mutex_own(buf_pool));
285 
286 	ut_ad(b1->in_flush_list);
287 	ut_ad(b2->in_flush_list);
288 
289 	if (b2->oldest_modification > b1->oldest_modification) {
290 		return(1);
291 	} else if (b2->oldest_modification < b1->oldest_modification) {
292 		return(-1);
293 	}
294 
295 	/* If oldest_modification is same then decide on the space. */
296 	ret = (int)(b2->space - b1->space);
297 
298 	/* Or else decide ordering on the offset field. */
299 	return(ret ? ret : (int)(b2->offset - b1->offset));
300 }
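
/*****************************************************************//**
Worked example (illustration only): let p1 point to block A with key
<oldest_modification, space, offset> = <100, 5, 7> and p2 point to block B
with key <100, 5, 3>.  The LSNs and space ids tie, so the function returns
(int) (3 - 7) < 0, i.e. "b2 < b1" per the contract above: B orders before
A, and <space, offset> only ever breaks ties between blocks with equal
oldest_modification. */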
301 
302 /********************************************************************//**
303 Initialize the red-black tree to speed up insertions into the flush_list
304 during recovery process. Should be called at the start of recovery
305 process before any page has been read/written. */
306 UNIV_INTERN
307 void
308 buf_flush_init_flush_rbt(void)
309 /*==========================*/
310 {
311 	ulint	i;
312 
313 	for (i = 0; i < srv_buf_pool_instances; i++) {
314 		buf_pool_t*	buf_pool;
315 
316 		buf_pool = buf_pool_from_array(i);
317 
318 		buf_flush_list_mutex_enter(buf_pool);
319 
320 		ut_ad(buf_pool->flush_rbt == NULL);
321 
322 		/* Create red black tree for speedy insertions in flush list. */
323 		buf_pool->flush_rbt = rbt_create(
324 			sizeof(buf_page_t*), buf_flush_block_cmp);
325 
326 		buf_flush_list_mutex_exit(buf_pool);
327 	}
328 }
329 
330 /********************************************************************//**
331 Frees up the red-black tree. */
332 UNIV_INTERN
333 void
334 buf_flush_free_flush_rbt(void)
335 /*==========================*/
336 {
337 	ulint	i;
338 
339 	for (i = 0; i < srv_buf_pool_instances; i++) {
340 		buf_pool_t*	buf_pool;
341 
342 		buf_pool = buf_pool_from_array(i);
343 
344 		buf_flush_list_mutex_enter(buf_pool);
345 
346 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
347 		ut_a(buf_flush_validate_low(buf_pool));
348 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
349 
350 		rbt_free(buf_pool->flush_rbt);
351 		buf_pool->flush_rbt = NULL;
352 
353 		buf_flush_list_mutex_exit(buf_pool);
354 	}
355 }
356 
357 /********************************************************************//**
358 Inserts a modified block into the flush list. */
359 UNIV_INTERN
360 void
361 buf_flush_insert_into_flush_list(
362 /*=============================*/
363 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
364 	buf_block_t*	block,		/*!< in/out: block which is modified */
365 	lsn_t		lsn)		/*!< in: oldest modification */
366 {
367 	ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
368 	ut_ad(log_flush_order_mutex_own());
369 	ut_ad(mutex_own(&block->mutex));
370 
371 	buf_flush_list_mutex_enter(buf_pool);
372 
373 	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
374 	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
375 		  <= lsn));
376 
377 	/* If we are in the recovery then we need to update the flush
378 	red-black tree as well. */
379 	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
380 		buf_flush_list_mutex_exit(buf_pool);
381 		buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
382 		return;
383 	}
384 
385 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
386 	ut_ad(!block->page.in_flush_list);
387 
388 	ut_d(block->page.in_flush_list = TRUE);
389 	block->page.oldest_modification = lsn;
390 	UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
391 	incr_flush_list_size_in_bytes(block, buf_pool);
392 
393 #ifdef UNIV_DEBUG_VALGRIND
394 	{
395 		ulint	zip_size = buf_block_get_zip_size(block);
396 
397 		if (zip_size) {
398 			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
399 		} else {
400 			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
401 		}
402 	}
403 #endif /* UNIV_DEBUG_VALGRIND */
404 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
405 	ut_a(buf_flush_validate_skip(buf_pool));
406 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
407 
408 	buf_flush_list_mutex_exit(buf_pool);
409 }
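
/* Note (illustrative): because the caller holds the log flush order mutex,
blocks arrive here in non-decreasing order of their oldest_modification
LSN, which is what the assertion on the list head above checks.  Adding at
the head therefore keeps the list sorted, e.g. head lsn 900 -> 640 -> 310
-> tail lsn 120, so flush batches can simply scan from the tail to find
the globally oldest dirty pages. */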
410 
411 /********************************************************************//**
412 Inserts a modified block into the flush list in the right sorted position.
413 This function is used by recovery, because there the modifications do not
414 necessarily come in the order of lsn's. */
415 UNIV_INTERN
416 void
417 buf_flush_insert_sorted_into_flush_list(
418 /*====================================*/
419 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
420 	buf_block_t*	block,		/*!< in/out: block which is modified */
421 	lsn_t		lsn)		/*!< in: oldest modification */
422 {
423 	buf_page_t*	prev_b;
424 	buf_page_t*	b;
425 
426 	ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
427 	ut_ad(log_flush_order_mutex_own());
428 	ut_ad(mutex_own(&block->mutex));
429 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
430 
431 	buf_flush_list_mutex_enter(buf_pool);
432 
433 	/* The field in_LRU_list is protected by buf_pool->LRU_list_mutex,
434 	which we are not holding.  However, while a block is in the flush
435 	list, it is dirty and cannot be discarded, neither from the
436 	page_hash nor from the LRU list.  At most, the uncompressed
437 	page frame of a compressed block may be discarded or created
438 	(copying the block->page to or from a buf_page_t that is
439 	dynamically allocated from buf_buddy_alloc()).  Because those
440 	transitions hold block->mutex and the flush list mutex (via
441 	buf_flush_relocate_on_flush_list()), there is no possibility
442 	of a race condition in the assertions below. */
443 	ut_ad(block->page.in_LRU_list);
444 	ut_ad(block->page.in_page_hash);
445 	/* buf_buddy_block_register() will take a block in the
446 	BUF_BLOCK_MEMORY state, not a file page. */
447 	ut_ad(!block->page.in_zip_hash);
448 
449 	ut_ad(!block->page.in_flush_list);
450 	ut_d(block->page.in_flush_list = TRUE);
451 	block->page.oldest_modification = lsn;
452 
453 #ifdef UNIV_DEBUG_VALGRIND
454 	{
455 		ulint	zip_size = buf_block_get_zip_size(block);
456 
457 		if (zip_size) {
458 			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
459 		} else {
460 			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
461 		}
462 	}
463 #endif /* UNIV_DEBUG_VALGRIND */
464 
465 	prev_b = NULL;
466 
467 	/* For the most part when this function is called the flush_rbt
468 	should not be NULL. In a very rare boundary case it is possible
469 	that the flush_rbt has already been freed by the recovery thread
470 	before the last page was hooked up in the flush_list by the
471 	io-handler thread. In that case we'll  just do a simple
472 	linear search in the else block. */
473 	if (buf_pool->flush_rbt) {
474 
475 		prev_b = buf_flush_insert_in_flush_rbt(&block->page);
476 
477 	} else {
478 
479 		b = UT_LIST_GET_FIRST(buf_pool->flush_list);
480 
481 		while (b && b->oldest_modification
482 		       > block->page.oldest_modification) {
483 			ut_ad(b->in_flush_list);
484 			prev_b = b;
485 			b = UT_LIST_GET_NEXT(list, b);
486 		}
487 	}
488 
489 	if (prev_b == NULL) {
490 		UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
491 	} else {
492 		UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
493 				     prev_b, &block->page);
494 	}
495 
496 	incr_flush_list_size_in_bytes(block, buf_pool);
497 
498 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
499 	ut_a(buf_flush_validate_low(buf_pool));
500 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
501 
502 	buf_flush_list_mutex_exit(buf_pool);
503 }
504 
505 /********************************************************************//**
506 Returns TRUE if the file page block is immediately suitable for replacement,
507 i.e., the transition FILE_PAGE => NOT_USED is allowed.
508 @return	TRUE if can replace immediately */
509 UNIV_INTERN
510 ibool
511 buf_flush_ready_for_replace(
512 /*========================*/
513 	buf_page_t*	bpage)	/*!< in: buffer control block, must be
514 				buf_page_in_file(bpage) and in the LRU list */
515 {
516 #ifdef UNIV_DEBUG
517 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
518 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
519 #endif /* UNIV_DEBUG */
520 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
521 	ut_ad(bpage->in_LRU_list);
522 
523 	if (buf_page_in_file(bpage)) {
524 
525 		return(bpage->oldest_modification == 0
526 		       && bpage->buf_fix_count == 0
527 		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE);
528 	}
529 
530 	ut_print_timestamp(stderr);
531 	fprintf(stderr,
532 		"  InnoDB: Error: buffer block state %lu"
533 		" in the LRU list!\n",
534 		(ulong) buf_page_get_state(bpage));
535 	ut_print_buf(stderr, bpage, sizeof(buf_page_t));
536 	putc('\n', stderr);
537 
538 	return(FALSE);
539 }
540 
541 /********************************************************************//**
542 Returns true if the block is modified and ready for flushing.
543 @return	true if can flush immediately */
544 UNIV_INTERN
545 bool
546 buf_flush_ready_for_flush(
547 /*======================*/
548 	buf_page_t*	bpage,	/*!< in: buffer control block, must be
549 				buf_page_in_file(bpage) */
550 	buf_flush_t	flush_type)/*!< in: type of flush */
551 {
552 	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
553 	ut_ad(mutex_own(buf_page_get_mutex(bpage))
554 	      || flush_type == BUF_FLUSH_LIST);
555 	ut_a(buf_page_in_file(bpage)
556 	     || (buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH
557 #ifdef UNIV_DEBUG
558 		 && !mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex)
559 #endif
560 		     ));
561 
562 	if (bpage->oldest_modification == 0
563 	    || buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) {
564 		return(false);
565 	}
566 
567 	ut_ad(bpage->in_flush_list);
568 
569 	switch (flush_type) {
570 	case BUF_FLUSH_LIST:
571 		return(buf_page_get_state(bpage) != BUF_BLOCK_REMOVE_HASH);
572 	case BUF_FLUSH_LRU:
573 	case BUF_FLUSH_SINGLE_PAGE:
574 		return(true);
575 
576 	case BUF_FLUSH_N_TYPES:
577 		break;
578 	}
579 
580 	ut_error;
581 	return(false);
582 }
583 
584 /********************************************************************//**
585 Remove a block from the flush list of modified blocks. */
586 UNIV_INTERN
587 void
588 buf_flush_remove(
589 /*=============*/
590 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
591 {
592 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
593 	ulint		zip_size;
594 
595 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
596 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
597 	ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_DIRTY
598 	      || mutex_own(&buf_pool->LRU_list_mutex));
599 #endif
600 	ut_ad(bpage->in_flush_list);
601 
602 	buf_flush_list_mutex_enter(buf_pool);
603 
604 	switch (buf_page_get_state(bpage)) {
605 	case BUF_BLOCK_POOL_WATCH:
606 	case BUF_BLOCK_ZIP_PAGE:
607 		/* Clean compressed pages should not be on the flush list */
608 	case BUF_BLOCK_NOT_USED:
609 	case BUF_BLOCK_READY_FOR_USE:
610 	case BUF_BLOCK_MEMORY:
611 	case BUF_BLOCK_REMOVE_HASH:
612 		ut_error;
613 		return;
614 	case BUF_BLOCK_ZIP_DIRTY:
615 		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
616 		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
617 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
618 		buf_LRU_insert_zip_clean(bpage);
619 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
620 		break;
621 	case BUF_BLOCK_FILE_PAGE:
622 		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
623 		break;
624 	}
625 
626 	/* If the flush_rbt is active then delete from there as well. */
627 	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
628 		buf_flush_delete_from_flush_rbt(bpage);
629 	}
630 
631 	/* Must be done after we have removed it from the flush_rbt
632 	because we assert on in_flush_list in comparison function. */
633 	ut_d(bpage->in_flush_list = FALSE);
634 
635 	zip_size = page_zip_get_size(&bpage->zip);
636 	buf_pool->stat.flush_list_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
637 
638 	bpage->oldest_modification = 0;
639 
640 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
641 	ut_a(buf_flush_validate_skip(buf_pool));
642 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
643 
644 	buf_flush_update_hp(buf_pool, bpage);
645 	buf_flush_list_mutex_exit(buf_pool);
646 }
647 
648 /*******************************************************************//**
649 Relocates a buffer control block on the flush_list.
650 Note that it is assumed that the contents of bpage have already been
651 copied to dpage.
652 IMPORTANT: When this function is called bpage and dpage are not
653 exact copies of each other. For example, they both will have different
654 ::state. Also the ::list pointers in dpage may be stale. We need to
655 use the current list node (bpage) to do the list manipulation because
656 the list pointers could have changed between the time that we copied
657 the contents of bpage to the dpage and the flush list manipulation
658 below. */
659 UNIV_INTERN
660 void
661 buf_flush_relocate_on_flush_list(
662 /*=============================*/
663 	buf_page_t*	bpage,	/*!< in/out: control block being moved */
664 	buf_page_t*	dpage)	/*!< in/out: destination block */
665 {
666 	buf_page_t*	prev;
667 	buf_page_t*	prev_b = NULL;
668 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
669 
670 	/* Must reside in the same buffer pool. */
671 	ut_ad(buf_pool == buf_pool_from_bpage(dpage));
672 
673 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
674 
675 	buf_flush_list_mutex_enter(buf_pool);
676 
677 	ut_ad(bpage->in_flush_list);
678 	ut_ad(dpage->in_flush_list);
679 
680 	/* If recovery is active we must swap the control blocks in
681 	the flush_rbt as well. */
682 	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
683 		buf_flush_delete_from_flush_rbt(bpage);
684 		prev_b = buf_flush_insert_in_flush_rbt(dpage);
685 	}
686 
687 	/* Must be done after we have removed it from the flush_rbt
688 	because we assert on in_flush_list in comparison function. */
689 	ut_d(bpage->in_flush_list = FALSE);
690 
691 	prev = UT_LIST_GET_PREV(list, bpage);
692 	UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
693 
694 	if (prev) {
695 		ut_ad(prev->in_flush_list);
696 		UT_LIST_INSERT_AFTER(
697 			list,
698 			buf_pool->flush_list,
699 			prev, dpage);
700 	} else {
701 		UT_LIST_ADD_FIRST(
702 			list,
703 			buf_pool->flush_list,
704 			dpage);
705 	}
706 
707 	/* Just an extra check. Previous in flush_list
708 	should be the same control block as in flush_rbt. */
709 	ut_a(!buf_pool->flush_rbt || prev_b == prev);
710 
711 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
712 	ut_a(buf_flush_validate_low(buf_pool));
713 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
714 
715 	buf_flush_update_hp(buf_pool, bpage);
716 	buf_flush_list_mutex_exit(buf_pool);
717 }
718 
719 /********************************************************************//**
720 Updates the flush system data structures when a write is completed. */
721 UNIV_INTERN
722 void
723 buf_flush_write_complete(
724 /*=====================*/
725 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
726 {
727 	buf_flush_t	flush_type = buf_page_get_flush_type(bpage);
728 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
729 
730 	mutex_enter(&buf_pool->flush_state_mutex);
731 
732 	buf_flush_remove(bpage);
733 
734 	buf_page_set_io_fix(bpage, BUF_IO_NONE);
735 
736 	buf_pool->n_flush[flush_type]--;
737 	ut_ad(buf_pool->n_flush[flush_type] != ULINT_MAX);
738 
739 	/* fprintf(stderr, "n pending flush %lu\n",
740 	buf_pool->n_flush[flush_type]); */
741 
742 	if (buf_pool->n_flush[flush_type] == 0
743 	    && buf_pool->init_flush[flush_type] == FALSE) {
744 
745 		/* The running flush batch has ended */
746 
747 		os_event_set(buf_pool->no_flush[flush_type]);
748 	}
749 
750 	buf_dblwr_update(bpage, flush_type);
751 
752 	mutex_exit(&buf_pool->flush_state_mutex);
753 }
754 #endif /* !UNIV_HOTBACKUP */
755 
756 /********************************************************************//**
757 Calculates the checksum of a page of a compressed table and updates the page. */
758 UNIV_INTERN
759 void
760 buf_flush_update_zip_checksum(
761 /*==========================*/
762 	buf_frame_t*	page,		/*!< in/out: Page to update */
763 	ulint		zip_size,	/*!< in: Compressed page size */
764 	lsn_t		lsn)		/*!< in: Lsn to stamp on the page */
765 {
766 	ut_a(zip_size > 0);
767 
768 	ib_uint32_t	checksum = static_cast<ib_uint32_t>(
769 		page_zip_calc_checksum(
770 			page, zip_size,
771 			static_cast<srv_checksum_algorithm_t>(
772 				srv_checksum_algorithm)));
773 
774 	mach_write_to_8(page + FIL_PAGE_LSN, lsn);
775 	memset(page + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
776 	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
777 }
778 
779 /********************************************************************//**
780 Initializes a page for writing to the tablespace. */
781 UNIV_INTERN
782 void
783 buf_flush_init_for_writing(
784 /*=======================*/
785 	byte*	page,		/*!< in/out: page */
786 	void*	page_zip_,	/*!< in/out: compressed page, or NULL */
787 	lsn_t	newest_lsn)	/*!< in: newest modification lsn
788 				to the page */
789 {
790 	ib_uint32_t	checksum = 0 /* silence bogus gcc warning */;
791 
792 	ut_ad(page);
793 
794 	if (page_zip_) {
795 		page_zip_des_t*	page_zip;
796 		ulint		zip_size;
797 
798 		page_zip = static_cast<page_zip_des_t*>(page_zip_);
799 		zip_size = page_zip_get_size(page_zip);
800 
801 		ut_ad(zip_size);
802 		ut_ad(ut_is_2pow(zip_size));
803 		ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
804 
805 		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
806 		case FIL_PAGE_TYPE_ALLOCATED:
807 		case FIL_PAGE_INODE:
808 		case FIL_PAGE_IBUF_BITMAP:
809 		case FIL_PAGE_TYPE_FSP_HDR:
810 		case FIL_PAGE_TYPE_XDES:
811 			/* These are essentially uncompressed pages. */
812 			memcpy(page_zip->data, page, zip_size);
813 			/* fall through */
814 		case FIL_PAGE_TYPE_ZBLOB:
815 		case FIL_PAGE_TYPE_ZBLOB2:
816 		case FIL_PAGE_INDEX:
817 
818 			buf_flush_update_zip_checksum(
819 				page_zip->data, zip_size, newest_lsn);
820 
821 			return;
822 		}
823 
824 		ut_print_timestamp(stderr);
825 		fputs("  InnoDB: ERROR: The compressed page to be written"
826 		      " seems corrupt:", stderr);
827 		ut_print_buf(stderr, page, zip_size);
828 		fputs("\nInnoDB: Possibly older version of the page:", stderr);
829 		ut_print_buf(stderr, page_zip->data, zip_size);
830 		putc('\n', stderr);
831 		ut_error;
832 	}
833 
834 	/* Write the newest modification lsn to the page header and trailer */
835 	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
836 
837 	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
838 			newest_lsn);
839 
840 	/* Store the new formula checksum */
841 
842 	switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
843 	case SRV_CHECKSUM_ALGORITHM_CRC32:
844 	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
845 		checksum = buf_calc_page_crc32(page);
846 		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
847 		break;
848 	case SRV_CHECKSUM_ALGORITHM_INNODB:
849 	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
850 		checksum = (ib_uint32_t) buf_calc_page_new_checksum(page);
851 		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
852 		checksum = (ib_uint32_t) buf_calc_page_old_checksum(page);
853 		break;
854 	case SRV_CHECKSUM_ALGORITHM_NONE:
855 	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
856 		checksum = BUF_NO_CHECKSUM_MAGIC;
857 		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
858 		break;
859 	/* no default so the compiler will emit a warning if new enum
860 	is added and not handled here */
861 	}
862 
863 	/* With the InnoDB checksum, we overwrite the first 4 bytes of
864 	the end lsn field to store the old formula checksum. Since it
865 	depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
866 	be calculated after storing the new formula checksum.
867 
868 	In other cases we write the same value to both fields.
869 	If CRC32 is used then it is faster to use that checksum
870 	(calculated above) instead of calculating another one.
871 	We can afford to store something other than
872 	buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
873 	this field because the file will not be readable by old
874 	versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
875 
876 	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
877 			checksum);
878 }
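
/* Layout sketch (illustration; the fields are the symbolic offsets from
fil0fil.h and the exact byte positions are an assumption here):

	page + FIL_PAGE_SPACE_OR_CHKSUM     4 bytes: CRC32, "new formula"
	                                    checksum, or BUF_NO_CHECKSUM_MAGIC
	page + FIL_PAGE_LSN                 8 bytes: newest_lsn
	page + UNIV_PAGE_SIZE
	     - FIL_PAGE_END_LSN_OLD_CHKSUM  4 bytes: "old formula" checksum for
	                                    the InnoDB algorithm, or the same
	                                    value as the header checksum for
	                                    the other algorithms, followed by
	                                    4 bytes holding the low 32 bits of
	                                    newest_lsn. */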
879 
880 #ifndef UNIV_HOTBACKUP
881 /********************************************************************//**
882 Does an asynchronous write of a buffer page. NOTE: in simulated aio and
883 also when the doublewrite buffer is used, we must call
884 buf_dblwr_flush_buffered_writes after we have posted a batch of
885 writes! */
886 static
887 void
888 buf_flush_write_block_low(
889 /*======================*/
890 	buf_page_t*	bpage,		/*!< in: buffer block to write */
891 	buf_flush_t	flush_type,	/*!< in: type of flush */
892 	bool		sync)		/*!< in: true if sync IO request */
893 {
894 	ulint	zip_size	= buf_page_get_zip_size(bpage);
895 	page_t*	frame		= NULL;
896 
897 #ifdef UNIV_DEBUG
898 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
899 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
900 #endif
901 
902 #ifdef UNIV_LOG_DEBUG
903 	static ibool	univ_log_debug_warned;
904 #endif /* UNIV_LOG_DEBUG */
905 
906 	ut_ad(buf_page_in_file(bpage));
907 
908 	/* We are not holding block_mutex here.
909 	Nevertheless, it is safe to access bpage, because it is
910 	io_fixed and oldest_modification != 0.  Thus, it cannot be
911 	relocated in the buffer pool or removed from flush_list or
912 	LRU_list. */
913 	ut_ad(!buf_flush_list_mutex_own(buf_pool));
914 	ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
915 	ut_ad(buf_page_get_io_fix_unlocked(bpage) == BUF_IO_WRITE);
916 	ut_ad(bpage->oldest_modification != 0);
917 
918 #ifdef UNIV_IBUF_COUNT_DEBUG
919 	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
920 #endif
921 	ut_ad(bpage->newest_modification != 0);
922 
923 #ifdef UNIV_LOG_DEBUG
924 	if (!univ_log_debug_warned) {
925 		univ_log_debug_warned = TRUE;
926 		fputs("Warning: cannot force log to disk if"
927 		      " UNIV_LOG_DEBUG is defined!\n"
928 		      "Crash recovery will not work!\n",
929 		      stderr);
930 	}
931 #else
932 	/* Force the log to the disk before writing the modified block */
933 	log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
934 #endif
935 	switch (buf_page_get_state(bpage)) {
936 	case BUF_BLOCK_POOL_WATCH:
937 	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
938 	case BUF_BLOCK_NOT_USED:
939 	case BUF_BLOCK_READY_FOR_USE:
940 	case BUF_BLOCK_MEMORY:
941 	case BUF_BLOCK_REMOVE_HASH:
942 		ut_error;
943 		break;
944 	case BUF_BLOCK_ZIP_DIRTY:
945 		frame = bpage->zip.data;
946 		mach_write_to_8(frame + FIL_PAGE_LSN,
947 				bpage->newest_modification);
948 
949 		ut_a(page_zip_verify_checksum(frame, zip_size));
950 
951 		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
952 		break;
953 	case BUF_BLOCK_FILE_PAGE:
954 		frame = bpage->zip.data;
955 		if (!frame) {
956 			frame = ((buf_block_t*) bpage)->frame;
957 		}
958 
959 		buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
960 					   bpage->zip.data
961 					   ? &bpage->zip : NULL,
962 					   bpage->newest_modification);
963 		break;
964 	}
965 
966 	if (!srv_use_doublewrite_buf || !buf_dblwr) {
967 		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
968 		       sync, buf_page_get_space(bpage), zip_size,
969 		       buf_page_get_page_no(bpage), 0,
970 		       zip_size ? zip_size : UNIV_PAGE_SIZE,
971 		       frame, bpage);
972 	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
973 		buf_dblwr_write_single_page(bpage, sync);
974 	} else {
975 		ut_ad(!sync);
976 		buf_dblwr_add_to_batch(bpage);
977 	}
978 
979 	/* When doing single page flushing the IO is done synchronously
980 	and we flush the changes to disk only for the tablespace we
981 	are working on. */
982 	if (sync) {
983 		ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
984 		fil_flush(buf_page_get_space(bpage));
985 		buf_page_io_complete(bpage);
986 	}
987 
988 	/* Increment the counter of I/O operations used
989 	for selecting LRU policy. */
990 	buf_LRU_stat_inc_io();
991 }
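
/* Note (illustration of the write-ahead logging rule enforced above): if
bpage->newest_modification == 5000, log_write_up_to() makes the redo log
durable at least up to LSN 5000 before the data page may overwrite its
on-disk copy; otherwise a crash between the page write and the log write
could leave changes on disk that recovery knows nothing about. */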
992 
993 /********************************************************************//**
994 Writes a flushable page asynchronously from the buffer pool to a file.
995 NOTE: in simulated aio we must call
996 os_aio_simulated_wake_handler_threads after we have posted a batch of
997 writes! NOTE: buf_page_get_mutex(bpage) must be held upon entering this
998 function, and it will be released by this function if it returns true.
999 LRU_list_mutex must be held iff performing a single page flush and will be
1000 released by the function if it returns true.
1001 @return TRUE if the page was flushed */
1002 UNIV_INTERN
1003 bool
1004 buf_flush_page(
1005 /*===========*/
1006 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1007 	buf_page_t*	bpage,		/*!< in: buffer control block */
1008 	buf_flush_t	flush_type,	/*!< in: type of flush */
1009 	bool		sync)		/*!< in: true if sync IO request */
1010 {
1011 	ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1012 	/* Hold the LRU list mutex iff called for a single page LRU
1013 	flush. A single page LRU flush is already non-performant, and holding
1014 	the LRU list mutex allows us to avoid having to store the previous LRU
1015 	list page or to restart the LRU scan in
1016 	buf_flush_single_page_from_LRU(). */
1017 	ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE ||
1018 	      !mutex_own(&buf_pool->LRU_list_mutex));
1019 	ut_ad(flush_type != BUF_FLUSH_SINGLE_PAGE ||
1020 	      mutex_own(&buf_pool->LRU_list_mutex));
1021 	ut_ad(buf_page_in_file(bpage));
1022 	ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1023 
1024 	ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
1025 
1026 	ut_ad(mutex_own(block_mutex));
1027 
1028 	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1029 
1030         bool            is_uncompressed;
1031 
1032         is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1033         ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1034 
1035         ibool           flush;
1036         rw_lock_t*	rw_lock;
1037         bool            no_fix_count = bpage->buf_fix_count == 0;
1038 
1039         if (!is_uncompressed) {
1040                 flush = TRUE;
1041 		rw_lock = NULL;
1042 
1043 	} else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)) {
1044 		/* This is a heuristic, to avoid expensive S attempts. */
1045 		flush = FALSE;
1046 	} else {
1047 
1048 		rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1049 
1050 		if (flush_type != BUF_FLUSH_LIST) {
1051 			flush = rw_lock_s_lock_gen_nowait(
1052 				rw_lock, BUF_IO_WRITE);
1053 		} else {
1054 			/* Will S lock later */
1055 			flush = TRUE;
1056 		}
1057 	}
1058 
1059         if (flush) {
1060 
1061 		/* We are committed to flushing by the time we get here */
1062 
1063 		mutex_enter(&buf_pool->flush_state_mutex);
1064 
1065 		buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1066 
1067 		buf_page_set_flush_type(bpage, flush_type);
1068 
1069 		if (buf_pool->n_flush[flush_type] == 0) {
1070 
1071 			os_event_reset(buf_pool->no_flush[flush_type]);
1072 		}
1073 
1074 		++buf_pool->n_flush[flush_type];
1075 		ut_ad(buf_pool->n_flush[flush_type] != 0);
1076 
1077 		mutex_exit(&buf_pool->flush_state_mutex);
1078 
1079 		mutex_exit(block_mutex);
1080 
1081 		if (flush_type == BUF_FLUSH_SINGLE_PAGE)
1082 			mutex_exit(&buf_pool->LRU_list_mutex);
1083 
1084 		if (flush_type == BUF_FLUSH_LIST
1085 		    && is_uncompressed
1086 		    && !rw_lock_s_lock_gen_nowait(rw_lock, BUF_IO_WRITE)) {
1087 			/* To avoid a possible deadlock we flush the
1088 			doublewrite buffer first, because it might be
1089 			holding another block->lock. */
1090 			buf_dblwr_flush_buffered_writes();
1091 
1092 			rw_lock_s_lock_gen(rw_lock, BUF_IO_WRITE);
1093                 }
1094 
1095                 /* Even though bpage is not protected by any mutex at this
1096                 point, it is safe to access bpage, because it is io_fixed and
1097                 oldest_modification != 0.  Thus, it cannot be relocated in the
1098                 buffer pool or removed from flush_list or LRU_list. */
1099 
1100                 buf_flush_write_block_low(bpage, flush_type, sync);
1101         }
1102 
1103 	return(flush);
1104 }
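
/* Illustrative caller sketch (an assumption, modelled on
buf_flush_page_try() below): for a single page flush both the LRU list
mutex and the block mutex are held on entry and are released by
buf_flush_page() only when it returns true.

	mutex_enter(&buf_pool->LRU_list_mutex);
	mutex_enter(&block->mutex);

	if (buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)
	    && buf_flush_page(buf_pool, &block->page,
			      BUF_FLUSH_SINGLE_PAGE, true)) {
		(both mutexes have already been released for us)
	} else {
		mutex_exit(&block->mutex);
		mutex_exit(&buf_pool->LRU_list_mutex);
	} */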
1105 
1106 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1107 /********************************************************************//**
1108 Writes a flushable page asynchronously from the buffer pool to a file.
1109 NOTE: block and LRU list mutexes must be held upon entering this function, and
1110 they will be released by this function after flushing. This is loosely based on
1111 buf_flush_batch() and buf_flush_page().
1112 @return TRUE if the page was flushed and the mutexes released */
1113 UNIV_INTERN
1114 ibool
1115 buf_flush_page_try(
1116 /*===============*/
1117 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
1118 	buf_block_t*	block)		/*!< in/out: buffer control block */
1119 {
1120 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1121 	ut_ad(mutex_own(&block->mutex));
1122 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1123 
1124 	if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1125 		return(FALSE);
1126 	}
1127 
1128 	/* The following call will release the LRU list and
1129 	block mutex if successful. */
1130 	return(buf_flush_page(
1131 			buf_pool, &block->page, BUF_FLUSH_SINGLE_PAGE, true));
1132 }
1133 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1134 /***********************************************************//**
1135 Checks whether the page is in the buffer pool and can be flushed.
1136 @return	true if the page can be flushed. */
1137 static
1138 bool
1139 buf_flush_check_neighbor(
1140 /*=====================*/
1141 	ulint		space,		/*!< in: space id */
1142 	ulint		offset,		/*!< in: page offset */
1143 	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU or
1144 					BUF_FLUSH_LIST */
1145 {
1146 	buf_page_t*	bpage;
1147 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
1148 	bool		ret;
1149 	prio_rw_lock_t*	hash_lock;
1150 	ib_mutex_t*	block_mutex;
1151 
1152 	ut_ad(flush_type == BUF_FLUSH_LRU
1153 	      || flush_type == BUF_FLUSH_LIST);
1154 
1155 	/* We only want to flush pages from this buffer pool. */
1156 	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
1157 					   &hash_lock);
1158 
1159 	if (!bpage) {
1160 
1161 		return(false);
1162 	}
1163 
1164 	block_mutex = buf_page_get_mutex(bpage);
1165 
1166 	mutex_enter(block_mutex);
1167 
1168 	rw_lock_s_unlock(hash_lock);
1169 
1170 	ut_a(buf_page_in_file(bpage));
1171 
1172 	/* We avoid flushing 'non-old' blocks in an LRU flush,
1173 	because the flushed blocks are soon freed */
1174 
1175 	ret = false;
1176 	if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1177 
1178 		if (buf_flush_ready_for_flush(bpage, flush_type)) {
1179 			ret = true;
1180 		}
1181 	}
1182 
1183 	mutex_exit(block_mutex);
1184 
1185 	return(ret);
1186 }
1187 
1188 /***********************************************************//**
1189 Flushes to disk all flushable pages within the flush area.
1190 @return	number of pages flushed */
1191 static
1192 ulint
1193 buf_flush_try_neighbors(
1194 /*====================*/
1195 	ulint		space,		/*!< in: space id */
1196 	ulint		offset,		/*!< in: page offset */
1197 	buf_flush_t	flush_type,	/*!< in: BUF_FLUSH_LRU or
1198 					BUF_FLUSH_LIST */
1199 	ulint		n_flushed,	/*!< in: number of pages
1200 					flushed so far in this batch */
1201 	ulint		n_to_flush)	/*!< in: maximum number of pages
1202 					we are allowed to flush */
1203 {
1204 	ulint		i;
1205 	ulint		low;
1206 	ulint		high;
1207 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
1208 
1209 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1210 	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
1211 	ut_ad(!buf_flush_list_mutex_own(buf_pool));
1212 
1213 	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1214 	    || srv_flush_neighbors == 0) {
1215 		/* If there is little space or neighbor flushing is
1216 		not enabled then just flush the victim. */
1217 		low = offset;
1218 		high = offset + 1;
1219 	} else {
1220 		/* When flushed, dirty blocks are searched in
1221 		neighborhoods of this size, and flushed along with the
1222 		original page. */
1223 
1224 		ulint	buf_flush_area;
1225 
1226 		buf_flush_area	= ut_min(
1227 			BUF_READ_AHEAD_AREA(buf_pool),
1228 			buf_pool->curr_size / 16);
1229 
1230 		low = (offset / buf_flush_area) * buf_flush_area;
1231 		high = (offset / buf_flush_area + 1) * buf_flush_area;
1232 
1233 		if (srv_flush_neighbors == 1) {
1234 			/* adjust 'low' and 'high' to limit
1235 			   for contiguous dirty area */
1236 			if (offset > low) {
1237 				for (i = offset - 1;
1238 				     i >= low
1239 				     && buf_flush_check_neighbor(
1240 						space, i, flush_type);
1241 				     i--) {
1242 					/* do nothing */
1243 				}
1244 				low = i + 1;
1245 			}
1246 
1247 			for (i = offset + 1;
1248 			     i < high
1249 			     && buf_flush_check_neighbor(
1250 						space, i, flush_type);
1251 			     i++) {
1252 				/* do nothing */
1253 			}
1254 			high = i;
1255 		}
1256 	}
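
	/* Worked example (illustration only): with srv_flush_neighbors = 2,
	buf_flush_area = 64 and offset = 200 the window is aligned to the
	flush area: low = (200 / 64) * 64 = 192 and high = 192 + 64 = 256,
	so pages 192..255 of the space are candidates.  With
	srv_flush_neighbors = 1 the loops above then shrink [low, high) to
	the contiguous run of flushable neighbors around page 200. */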
1257 
1258 	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
1259 
1260 	if (high > fil_space_get_size(space)) {
1261 		high = fil_space_get_size(space);
1262 	}
1263 
1264 	ulint	count = 0;
1265 
1266 	for (i = low; i < high; i++) {
1267 
1268 		prio_rw_lock_t*	hash_lock;
1269 		ib_mutex_t*	block_mutex;
1270 
1271 		if ((count + n_flushed) >= n_to_flush) {
1272 
1273 			/* We have already flushed enough pages and
1274 			should call it a day. There is, however, one
1275 			exception. If the page whose neighbors we
1276 			are flushing has not been flushed yet then
1277 			we'll try to flush the victim that we
1278 			selected originally. */
1279 			if (i <= offset) {
1280 				i = offset;
1281 			} else {
1282 				break;
1283 			}
1284 		}
1285 
1286 		buf_pool = buf_pool_get(space, i);
1287 
1288 		/* We only want to flush pages from this buffer pool. */
1289 		buf_page_t*	bpage = buf_page_hash_get_s_locked(buf_pool,
1290 						   space, i, &hash_lock);
1291 
1292 		if (bpage == NULL) {
1293 
1294 			continue;
1295 		}
1296 
1297 		block_mutex = buf_page_get_mutex(bpage);
1298 
1299 		mutex_enter(block_mutex);
1300 
1301 		rw_lock_s_unlock(hash_lock);
1302 
1303 		ut_a(buf_page_in_file(bpage));
1304 
1305 		/* We avoid flushing 'non-old' blocks in an LRU flush,
1306 		because the flushed blocks are soon freed */
1307 
1308 		if (flush_type != BUF_FLUSH_LRU
1309 		    || i == offset
1310 		    || buf_page_is_old(bpage)) {
1311 
1312 			if (buf_flush_ready_for_flush(bpage, flush_type)
1313 			    && (i == offset || bpage->buf_fix_count == 0)
1314 			    && buf_flush_page(
1315 					buf_pool, bpage, flush_type, false)) {
1316 
1317 				++count;
1318 
1319 				continue;
1320 			}
1321 		}
1322 
1323 		mutex_exit(block_mutex);
1324 	}
1325 
1326 	if (count > 0) {
1327 		MONITOR_INC_VALUE_CUMULATIVE(
1328 					MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1329 					MONITOR_FLUSH_NEIGHBOR_COUNT,
1330 					MONITOR_FLUSH_NEIGHBOR_PAGES,
1331 					(count - 1));
1332 	}
1333 
1334 	return(count);
1335 }
1336 
1337 /********************************************************************//**
1338 Check if the block is modified and ready for flushing. If the block
1339 is ready to flush then flush the page and try to flush its neighbors.
1340 
1341 @return	TRUE if, depending on the flush type, either LRU or flush list
1342 mutex was released during this function.  This does not guarantee that any
1343 pages were actually written.
1344 The number of pages written is added to *count. */
1345 static
1346 ibool
1347 buf_flush_page_and_try_neighbors(
1348 /*=============================*/
1349 	buf_page_t*	bpage,		/*!< in: buffer control block,
1350 					must be
1351 					buf_page_in_file(bpage) */
1352 	buf_flush_t	flush_type,	/*!< in: BUF_FLUSH_LRU
1353 					or BUF_FLUSH_LIST */
1354 	ulint		n_to_flush,	/*!< in: number of pages to
1355 					flush */
1356 	ulint*		count)		/*!< in/out: number of pages
1357 					flushed */
1358 {
1359 	ibool		flushed;
1360 	ib_mutex_t*	block_mutex = NULL;
1361 #ifdef UNIV_DEBUG
1362 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1363 #endif /* UNIV_DEBUG */
1364 
1365 	ut_ad((flush_type == BUF_FLUSH_LRU
1366 	       && mutex_own(&buf_pool->LRU_list_mutex))
1367 	      || (flush_type == BUF_FLUSH_LIST
1368 		  && buf_flush_list_mutex_own(buf_pool)));
1369 
1370 	if (flush_type == BUF_FLUSH_LRU) {
1371 		block_mutex = buf_page_get_mutex(bpage);
1372 		mutex_enter(block_mutex);
1373 	}
1374 
1375 	ut_a(buf_page_in_file(bpage)
1376 	     || (buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH
1377 #ifdef UNIV_DEBUG
1378 		 && !mutex_own(&buf_pool->LRU_list_mutex)
1379 #endif
1380 		 ));
1381 
1382 	if (buf_flush_ready_for_flush(bpage, flush_type)) {
1383 		buf_pool_t*	buf_pool;
1384 
1385 		buf_pool = buf_pool_from_bpage(bpage);
1386 
1387 		if (flush_type == BUF_FLUSH_LRU) {
1388 			mutex_exit(&buf_pool->LRU_list_mutex);
1389 		}
1390 
1391 		/* These fields are protected by the buf_page_get_mutex()
1392 		mutex. */
1393 		/* Read the fields directly in order to avoid asserting on
1394 		BUF_BLOCK_REMOVE_HASH pages. */
1395 		ulint	space = bpage->space;
1396 		ulint	offset = bpage->offset;
1397 
1398 		if (flush_type == BUF_FLUSH_LRU) {
1399 			mutex_exit(block_mutex);
1400 		} else {
1401 			buf_flush_list_mutex_exit(buf_pool);
1402 		}
1403 
1404 		/* Try to flush also all the neighbors */
1405 		*count += buf_flush_try_neighbors(
1406 			space, offset, flush_type, *count, n_to_flush);
1407 
1408 		if (flush_type == BUF_FLUSH_LRU) {
1409 			mutex_enter(&buf_pool->LRU_list_mutex);
1410 		} else {
1411 			buf_flush_list_mutex_enter(buf_pool);
1412 		}
1413 		flushed = TRUE;
1414 
1415 	} else if (flush_type == BUF_FLUSH_LRU) {
1416 		mutex_exit(block_mutex);
1417 		flushed = FALSE;
1418 	} else {
1419 		flushed = FALSE;
1420 	}
1421 
1422 	ut_ad((flush_type == BUF_FLUSH_LRU
1423 	       && mutex_own(&buf_pool->LRU_list_mutex))
1424 	      || (flush_type == BUF_FLUSH_LIST
1425 		  && buf_flush_list_mutex_own(buf_pool)));
1426 
1427 	return(flushed);
1428 }
1429 
1430 /*******************************************************************//**
1431 This utility moves the uncompressed frames of pages to the free list.
1432 Note that this function does not actually flush any data to disk. It
1433 just detaches the uncompressed frames from the compressed pages at the
1434 tail of the unzip_LRU and puts those freed frames in the free list.
1435 Note that it is a best effort attempt and it is not guaranteed that
1436 after a call to this function there will be 'max' blocks in the free
1437 list.
1438 @return number of blocks moved to the free list. */
1439 static
1440 ulint
1441 buf_free_from_unzip_LRU_list_batch(
1442 /*===============================*/
1443 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1444 	ulint		max)		/*!< in: desired number of
1445 					blocks in the free_list */
1446 {
1447 	buf_block_t*	block;
1448 	ulint		scanned = 0;
1449 	ulint		count = 0;
1450 	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
1451 	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1452 
1453 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1454 
1455 	block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1456 	while (block != NULL && count < max
1457 	       && free_len < srv_LRU_scan_depth
1458 	       && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1459 
1460 		ib_mutex_t*	block_mutex = buf_page_get_mutex(&block->page);
1461 
1462 		++scanned;
1463 
1464 		mutex_enter(block_mutex);
1465 
1466 		if (buf_LRU_free_page(&block->page, false)) {
1467 
1468 			mutex_exit(block_mutex);
1469 			/* Block was freed. LRU list mutex potentially
1470 			released and reacquired */
1471 			++count;
1472 			mutex_enter(&buf_pool->LRU_list_mutex);
1473 			block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1474 
1475 		} else {
1476 
1477 			mutex_exit(block_mutex);
1478 			block = UT_LIST_GET_PREV(unzip_LRU, block);
1479 		}
1480 
1481 		free_len = UT_LIST_GET_LEN(buf_pool->free);
1482 		lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1483 	}
1484 
1485 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1486 
1487 	if (scanned) {
1488 		MONITOR_INC_VALUE_CUMULATIVE(
1489 			MONITOR_LRU_BATCH_SCANNED,
1490 			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1491 			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1492 			scanned);
1493 	}
1494 
1495 	return(count);
1496 }
1497 
1498 /*******************************************************************//**
1499 This utility flushes dirty blocks from the end of the LRU list.
1500 The calling thread is not allowed to own any latches on pages!
1501 It attempts to make 'max' blocks available in the free list. Note that
1502 it is a best effort attempt and it is not guaranteed that after a call
1503 to this function there will be 'max' blocks in the free list.
1504 The flushed and evicted page counts are returned in *n. */
1505 MY_ATTRIBUTE((nonnull))
1506 static
1507 void
1508 buf_flush_LRU_list_batch(
1509 /*=====================*/
1510 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1511 	ulint		max,		/*!< in: desired number of
1512 					blocks in the free_list */
1513 	bool		limited_scan,	/*!< in: if true, allow to scan only up
1514 					to srv_LRU_scan_depth pages in total */
1515 	flush_counters_t*	n)	/*!< out: flushed/evicted page
1516 					counts */
1517 {
1518 	buf_page_t*	bpage;
1519 	ulint		scanned = 0;
1520 	ulint		lru_position = 0;
1521 	ulint		max_lru_position;
1522 	ulint		max_scanned_pages;
1523 	ulint		free_len = UT_LIST_GET_LEN(buf_pool->free);
1524 	ulint		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1525 
1526 	n->flushed = 0;
1527 	n->evicted = 0;
1528 	n->unzip_LRU_evicted = 0;
1529 
1530 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1531 
1532 	max_scanned_pages = limited_scan ? srv_LRU_scan_depth : lru_len * max;
1533 	max_lru_position = ut_min(srv_LRU_scan_depth, lru_len);
1534 
1535 	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1536 	while (bpage != NULL
1537 	       && (srv_cleaner_eviction_factor ? n->evicted : n->flushed) < max
1538 	       && free_len < srv_LRU_scan_depth
1539 	       && lru_len > BUF_LRU_MIN_LEN
1540 	       && lru_position < max_lru_position
1541 	       && scanned < max_scanned_pages) {
1542 
1543 		ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
1544 		ibool	 evict;
1545 		ulint	failed_acquire;
1546 
1547 		++scanned;
1548 		++lru_position;
1549 
1550 		failed_acquire = mutex_enter_nowait(block_mutex);
1551 
1552 		evict = UNIV_LIKELY(!failed_acquire)
1553 			&& buf_flush_ready_for_replace(bpage);
1554 
1555 		if (UNIV_LIKELY(!failed_acquire) && !evict) {
1556 
1557 			mutex_exit(block_mutex);
1558 		}
1559 
1560 		/* If the block is ready to be replaced we try to
1561 		free it i.e.: put it on the free list.
1562 		Otherwise we try to flush the block and its
1563 		neighbors. In this case we'll put it on the
1564 		free list in the next pass. We do this extra work
1565 		of putting blocks to the free list instead of
1566 		just flushing them because after every flush
1567 		we have to restart the scan from the tail of
1568 		the LRU list and if we don't clear the tail
1569 		of the flushed pages then the scan becomes
1570 		O(n*n). */
1571 		if (evict) {
1572 
1573 			if (buf_LRU_free_page(bpage, true)) {
1574 
1575 				mutex_exit(block_mutex);
1576 				n->evicted++;
1577 				lru_position = 0;
1578 				mutex_enter(&buf_pool->LRU_list_mutex);
1579 				bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1580 			} else {
1581 
1582 				bpage = UT_LIST_GET_PREV(LRU, bpage);
1583 				mutex_exit(block_mutex);
1584 			}
1585 		} else if (UNIV_LIKELY(!failed_acquire)) {
1586 
1587 			ulint		space;
1588 			ulint		offset;
1589 			buf_page_t*	prev_bpage;
1590 
1591 			prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
1592 
1593 			/* Save the previous bpage */
1594 
1595 			if (prev_bpage != NULL) {
1596 				space = prev_bpage->space;
1597 				offset = prev_bpage->offset;
1598 			} else {
1599 				space = ULINT_UNDEFINED;
1600 				offset = ULINT_UNDEFINED;
1601 			}
1602 
1603 			if (buf_flush_page_and_try_neighbors(
1604 				bpage,
1605 				BUF_FLUSH_LRU, max, &n->flushed)) {
1606 
1607 				/* LRU list mutex was released.
1608 				reposition the iterator. Note: the
1609 				prev block could have been repositioned
1610 				too but that should be rare. */
1611 
1612 				if (prev_bpage != NULL) {
1613 
1614 					ut_ad(space != ULINT_UNDEFINED);
1615 					ut_ad(offset != ULINT_UNDEFINED);
1616 
1617 					prev_bpage = buf_page_hash_get(
1618 						buf_pool, space, offset);
1619 				}
1620 			}
1621 
1622 			bpage = prev_bpage;
1623 		}
1624 
1625 		free_len = UT_LIST_GET_LEN(buf_pool->free);
1626 		lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1627 	}
1628 
1629 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1630 
1631 	/* We keep track of all flushes happening as part of LRU
1632 	flush. When estimating the desired rate at which flush_list
1633 	should be flushed, we factor in this value. */
1634 	buf_pool->stat.buf_lru_flush_page_count += n->flushed;
1635 
1636 	if (scanned) {
1637 		MONITOR_INC_VALUE_CUMULATIVE(
1638 			MONITOR_LRU_BATCH_SCANNED,
1639 			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1640 			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1641 			scanned);
1642 	}
1643 }
1644 
1645 /*******************************************************************//**
1646 Flush and move pages from LRU or unzip_LRU list to the free list.
1647 Whether LRU or unzip_LRU is used depends on the state of the system.
1648 The flushed and evicted page counts, including the number of uncompressed
1649 page images moved to the free list in the case of unzip_LRU, are
1650 returned in *n. */
1651 MY_ATTRIBUTE((nonnull))
1652 static
1653 void
1654 buf_do_LRU_batch(
1655 /*=============*/
1656 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1657 	ulint		max,		/*!< in: desired number of
1658 					blocks in the free_list */
1659 	bool		limited_scan,	/*!< in: if true, allow to scan only up
1660 					to srv_LRU_scan_depth pages in total */
1661 	flush_counters_t*	n)	/*!< out: flushed/evicted page
1662 					counts */
1663 {
1664 	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
1665 
1666 	if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1667 		n->unzip_LRU_evicted
1668 			= buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1669 	} else {
1670 		n->unzip_LRU_evicted = 0;
1671 	}
1672 
1673 	if (max > n->unzip_LRU_evicted) {
1674 		buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted,
1675 					 limited_scan, n);
1676 	} else {
1677 		n->evicted = 0;
1678 		n->flushed = 0;
1679 	}
1680 
1681 	n->evicted += n->unzip_LRU_evicted;
1682 }
1683 
1684 /*******************************************************************//**
1685 This utility flushes dirty blocks from the end of the flush_list.
1686 The calling thread is not allowed to own any latches on pages!
1687 @return number of blocks for which the write request was queued */
1690 static
1691 ulint
1692 buf_do_flush_list_batch(
1693 /*====================*/
1694 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1695 	ulint		min_n,		/*!< in: wished minimum number
1696 					of blocks flushed (it is not
1697 					guaranteed that the actual
1698 					number is that big, though) */
1699 	lsn_t		lsn_limit)	/*!< all blocks whose
1700 					oldest_modification is smaller
1701 					than this should be flushed (if
1702 					their number does not exceed
1703 					min_n) */
1704 {
1705 	ulint		count = 0;
1706 	ulint		scanned = 0;
1707 
1708 	/* Start from the end of the list looking for a suitable
1709 	block to be flushed. */
1710 	buf_flush_list_mutex_enter(buf_pool);
1711 	ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1712 
1713 	/* In order not to degenerate this scan to O(n*n) we attempt
1714 	to preserve pointer of previous block in the flush list. To do
1715 	so we declare it a hazard pointer. Any thread working on the
1716 	flush list must check the hazard pointer and if it is removing
1717 	the same block then it must reset it. */
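	/* For illustration: if another thread removes the block saved in
	prev from the flush list while bpage is being flushed, that thread
	sees the hazard pointer set to prev and resets it to NULL; the
	buf_flush_is_hp() check below then fails and the scan restarts from
	the list tail instead of following a stale pointer. */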
1718 	for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1719 	     count < min_n && bpage != NULL && len > 0
1720 	     && bpage->oldest_modification < lsn_limit;
1721 	     ++scanned) {
1722 
1723 		buf_page_t*	prev;
1724 
1725 		ut_a(bpage->oldest_modification > 0);
1726 		ut_ad(bpage->in_flush_list);
1727 
1728 		prev = UT_LIST_GET_PREV(list, bpage);
1729 		buf_flush_set_hp(buf_pool, prev);
1730 
1731 #ifdef UNIV_DEBUG
1732 		bool flushed =
1733 #endif /* UNIV_DEBUG */
1734 		buf_flush_page_and_try_neighbors(
1735 			bpage, BUF_FLUSH_LIST, min_n, &count);
1736 
1737 		ut_ad(flushed || buf_flush_is_hp(buf_pool, prev));
1738 
1739 		if (!buf_flush_is_hp(buf_pool, prev)) {
1740 			/* The hazard pointer was reset by some other
1741 			thread. Restart the scan. */
1742 			ut_ad(buf_flush_is_hp(buf_pool, NULL));
1743 			bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1744 			len = UT_LIST_GET_LEN(buf_pool->flush_list);
1745 		} else {
1746 			bpage = prev;
1747 			--len;
1748 			buf_flush_set_hp(buf_pool, NULL);
1749 		}
1750 
1751 		ut_ad(!bpage || bpage->in_flush_list);
1752 	}
1753 
1754 	buf_flush_list_mutex_exit(buf_pool);
1755 
1756 	MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
1757 				     MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1758 				     MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1759 				     scanned);
1760 
1761 	return(count);
1762 }
1763 
1764 /*******************************************************************//**
1765 This utility flushes dirty blocks from the end of the LRU list or flush_list.
1766 NOTE 1: in the case of an LRU flush the calling thread may own latches to
1767 pages: to avoid deadlocks, this function must be written so that it cannot
1768 end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1769 the calling thread is not allowed to own any latches on pages!
1770 The flushed and evicted page counts are returned in *n. */
1771 MY_ATTRIBUTE((nonnull))
1772 static
1773 void
1774 buf_flush_batch(
1775 /*============*/
1776 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1777 	buf_flush_t	flush_type,	/*!< in: BUF_FLUSH_LRU or
1778 					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
1779 					then the caller must not own any
1780 					latches on pages */
1781 	ulint		min_n,		/*!< in: wished minimum number of blocks
1782 					flushed (it is not guaranteed that the
1783 					actual number is that big, though) */
1784 	lsn_t		lsn_limit,	/*!< in: in the case of BUF_FLUSH_LIST
1785 					all blocks whose oldest_modification is
1786 					smaller than this should be flushed
1787 					(if their number does not exceed
1788 					min_n), otherwise ignored */
1789 	bool		limited_lru_scan,/*!< in: for LRU flushes, if true,
1790 					allow to scan only up to
1791 					srv_LRU_scan_depth pages in total */
1792 	flush_counters_t*	n)	/*!< out: flushed/evicted page
1793 					counts  */
1794 {
1795 	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1796 #ifdef UNIV_SYNC_DEBUG
1797 	ut_ad((flush_type != BUF_FLUSH_LIST)
1798 	      || sync_thread_levels_empty_except_dict());
1799 #endif /* UNIV_SYNC_DEBUG */
1800 
1801 	/* Note: The buffer pool mutexes are released and reacquired within
1802 	the flush functions. */
1803 	switch (flush_type) {
1804 	case BUF_FLUSH_LRU:
1805 		mutex_enter(&buf_pool->LRU_list_mutex);
1806 		buf_do_LRU_batch(buf_pool, min_n, limited_lru_scan, n);
1807 		mutex_exit(&buf_pool->LRU_list_mutex);
1808 		break;
1809 	case BUF_FLUSH_LIST:
1810 		ut_ad(!limited_lru_scan);
1811 		n->flushed = buf_do_flush_list_batch(buf_pool, min_n,
1812 						     lsn_limit);
1813 		n->evicted = 0;
1814 		break;
1815 	default:
1816 		ut_error;
1817 	}
1818 
1819 #ifdef UNIV_DEBUG
1820 	if (buf_debug_prints && n->flushed > 0) {
1821 		fprintf(stderr, flush_type == BUF_FLUSH_LRU
1822 			? "Flushed %lu pages in LRU flush\n"
1823 			: "Flushed %lu pages in flush list flush\n",
1824 			(ulong) n->flushed);
1825 	}
1826 #endif /* UNIV_DEBUG */
1827 }
1828 
1829 /******************************************************************//**
1830 Gather the aggregated stats for both flush list and LRU list flushing */
1831 static
1832 void
1833 buf_flush_common(
1834 /*=============*/
1835 	buf_flush_t	flush_type,	/*!< in: type of flush */
1836 	ulint		page_count)	/*!< in: number of pages flushed */
1837 {
1838 	if (page_count) {
1839 		buf_dblwr_flush_buffered_writes();
1840 	}
1841 
1842 	ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1843 
1844 #ifdef UNIV_DEBUG
1845 	if (buf_debug_prints && page_count > 0) {
1846 		fprintf(stderr, flush_type == BUF_FLUSH_LRU
1847 			? "Flushed %lu pages in LRU flush\n"
1848 			: "Flushed %lu pages in flush list flush\n",
1849 			(ulong) page_count);
1850 	}
1851 #endif /* UNIV_DEBUG */
1852 
1853 	srv_stats.buf_pool_flushed.add(page_count);
1854 }
1855 
1856 /******************************************************************//**
1857 Start a buffer flush batch for LRU or flush list */
1858 static
1859 ibool
1860 buf_flush_start(
1861 /*============*/
1862 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1863 	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
1864 					or BUF_FLUSH_LIST */
1865 {
1866 	mutex_enter(&buf_pool->flush_state_mutex);
1867 
1868 	if (buf_pool->n_flush[flush_type] > 0
1869 	    || buf_pool->init_flush[flush_type] == TRUE) {
1870 
1871 		/* There is already a flush batch of the same type running */
1872 
1873 		mutex_exit(&buf_pool->flush_state_mutex);
1874 
1875 		return(FALSE);
1876 	}
1877 
1878 	buf_pool->init_flush[flush_type] = TRUE;
1879 
1880 	mutex_exit(&buf_pool->flush_state_mutex);
1881 
1882 	return(TRUE);
1883 }
1884 
1885 /******************************************************************//**
1886 End a buffer flush batch for LRU or flush list */
1887 static
1888 void
1889 buf_flush_end(
1890 /*==========*/
1891 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1892 	buf_flush_t	flush_type)	/*!< in: BUF_FLUSH_LRU
1893 					or BUF_FLUSH_LIST */
1894 {
1895 	mutex_enter(&buf_pool->flush_state_mutex);
1896 
1897 	buf_pool->init_flush[flush_type] = FALSE;
1898 
1899 	buf_pool->try_LRU_scan = TRUE;
1900 
1901 	if (buf_pool->n_flush[flush_type] == 0) {
1902 
1903 		/* The running flush batch has ended */
1904 
1905 		os_event_set(buf_pool->no_flush[flush_type]);
1906 	}
1907 
1908 	mutex_exit(&buf_pool->flush_state_mutex);
1909 }
1910 
1911 /******************************************************************//**
1912 Waits until a flush batch of the given type ends */
1913 UNIV_INTERN
1914 void
1915 buf_flush_wait_batch_end(
1916 /*=====================*/
1917 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1918 	buf_flush_t	type)		/*!< in: BUF_FLUSH_LRU
1919 					or BUF_FLUSH_LIST */
1920 {
1921 	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1922 
1923 	if (buf_pool == NULL) {
1924 		ulint	i;
1925 
1926 		for (i = 0; i < srv_buf_pool_instances; ++i) {
1927 			buf_pool_t*	buf_pool;
1928 
1929 			buf_pool = buf_pool_from_array(i);
1930 
1931 			thd_wait_begin(NULL, THD_WAIT_DISKIO);
1932 			os_event_wait(buf_pool->no_flush[type]);
1933 			thd_wait_end(NULL);
1934 		}
1935 	} else {
1936 		thd_wait_begin(NULL, THD_WAIT_DISKIO);
1937 		os_event_wait(buf_pool->no_flush[type]);
1938 		thd_wait_end(NULL);
1939 	}
1940 }
1941 
1942 /*******************************************************************//**
1943 This utility flushes dirty blocks from the end of the LRU list and also
1944 puts replaceable clean pages from the end of the LRU list to the free
1945 list.
1946 NOTE: The calling thread is not allowed to own any latches on pages!
1947 @return true if a batch was queued successfully. false if another batch
1948 of same type was already running. */
1949 MY_ATTRIBUTE((nonnull))
1950 static
1951 bool
1952 buf_flush_LRU(
1953 /*==========*/
1954 	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
1955 	ulint		min_n,		/*!< in: wished minimum number of blocks
1956 					flushed (it is not guaranteed that the
1957 					actual number is that big, though) */
1958 	bool			limited_scan,	/*!< in: if true, allow to scan
1959 						only up to srv_LRU_scan_depth
1960 						pages in total */
1961 	flush_counters_t	*n)	/*!< out: flushed/evicted page
1962 					counts */
1963 {
1964 	if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
1965 		n->flushed = 0;
1966 		n->evicted = 0;
1967 		n->unzip_LRU_evicted = 0;
1968 		return(false);
1969 	}
1970 
1971 	buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, limited_scan, n);
1972 
1973 	buf_flush_end(buf_pool, BUF_FLUSH_LRU);
1974 
1975 	buf_flush_common(BUF_FLUSH_LRU, n->flushed);
1976 
1977 	return(true);
1978 }
1979 
1980 /*******************************************************************//**
1981 This utility flushes dirty blocks from the end of the flush list of
1982 all buffer pool instances.
1983 NOTE: The calling thread is not allowed to own any latches on pages!
1984 @return true if a batch was queued successfully for each buffer pool
1985 instance. false if another batch of same type was already running in
1986 at least one of the buffer pool instance */
1987 UNIV_INTERN
1988 bool
1989 buf_flush_list(
1990 /*===========*/
1991 	ulint		min_n,		/*!< in: wished minimum number of blocks
1992 					flushed (it is not guaranteed that the
1993 					actual number is that big, though) */
1994 	lsn_t		lsn_limit,	/*!< in: in the case of BUF_FLUSH_LIST all
1995 					blocks whose oldest_modification is
1996 					smaller than this should be flushed
1997 					(if their number does not exceed
1998 					min_n), otherwise ignored */
1999 	ulint*		n_processed)	/*!< out: the number of pages
2000 					which were processed is passed
2001 					back to caller. Ignored if NULL */
2002 
2003 {
2004 	ulint		i;
2005 
2006 	ulint		requested_pages[MAX_BUFFER_POOLS];
2007 	bool		active_instance[MAX_BUFFER_POOLS];
2008 	ulint		remaining_instances = srv_buf_pool_instances;
2009 	bool		timeout = false;
2010 	ulint		flush_start_time = 0;
2011 
2012 	for (i = 0; i < srv_buf_pool_instances; i++) {
2013 		requested_pages[i] = 0;
2014 		active_instance[i] = true;
2015 	}
2016 
2017 	if (n_processed) {
2018 		*n_processed = 0;
2019 	}
2020 
2021 	if (min_n != ULINT_MAX) {
2022 		/* Ensure that flushing is spread evenly amongst the
2023 		buffer pool instances. When min_n is ULINT_MAX
2024 		we need to flush everything up to the lsn limit
2025 		so no limit here. */
2026 		min_n = (min_n + srv_buf_pool_instances - 1)
2027 			 / srv_buf_pool_instances;
2028 		if (lsn_limit != LSN_MAX) {
2029 			flush_start_time = ut_time_ms();
2030 		}
2031 	}
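	/* For illustration (hypothetical numbers): a request of
	min_n = 1000 pages with 8 buffer pool instances becomes a
	per-instance target of (1000 + 8 - 1) / 8 = 125 pages. */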
2032 
2033 	/* Flush to lsn_limit in all buffer pool instances */
2034 	while (remaining_instances && !timeout) {
2035 
2036 		ulint flush_common_batch = 0;
2037 
2038 		for (i = 0; i < srv_buf_pool_instances; i++) {
2039 
2040 			if (flush_start_time
2041 			    && (ut_time_ms() - flush_start_time
2042 				>= srv_cleaner_max_flush_time)) {
2043 
2044 				timeout = true;
2045 				break;
2046 			}
2047 
2048 			if (active_instance[i]) {
2049 
2050 				buf_pool_t*	buf_pool;
2051 				ulint		chunk_size;
2052 				flush_counters_t n;
2053 
2054 				chunk_size = ut_min(
2055 					srv_cleaner_flush_chunk_size,
2056 					min_n - requested_pages[i]);
2057 
2058 				buf_pool = buf_pool_from_array(i);
2059 
2060 				if (!buf_flush_start(buf_pool,
2061 						     BUF_FLUSH_LIST)) {
2062 
2063 					continue;
2064 				}
2065 
2066 				buf_flush_batch(buf_pool, BUF_FLUSH_LIST,
2067 						chunk_size, lsn_limit, false,
2068 						&n);
2069 
2070 				buf_flush_end(buf_pool, BUF_FLUSH_LIST);
2071 
2072 				flush_common_batch += n.flushed;
2073 
2074 				if (n_processed) {
2075 					*n_processed += n.flushed;
2076 				}
2077 
2078 				requested_pages[i] += chunk_size;
2079 
2080 				if (requested_pages[i] >= min_n
2081 				    || !n.flushed) {
2082 
2083 					active_instance[i] = false;
2084 					remaining_instances--;
2085 				}
2086 
2087 				if (n.flushed) {
2088 					MONITOR_INC_VALUE_CUMULATIVE(
2089 						MONITOR_FLUSH_BATCH_TOTAL_PAGE,
2090 						MONITOR_FLUSH_BATCH_COUNT,
2091 						MONITOR_FLUSH_BATCH_PAGES,
2092 						n.flushed);
2093 				}
2094 			}
2095 		}
2096 
2097 		buf_flush_common(BUF_FLUSH_LIST, flush_common_batch);
2098 	}
2099 
2100 	/* If we haven't flushed all the instances due to timeout or a repeat
2101 	failure to start a flush, return failure */
2102 	for (i = 0; i < srv_buf_pool_instances; i++) {
2103 		if (active_instance[i]) {
2104 			return(false);
2105 		}
2106 	}
2107 
2108 	return(true);
2109 }
2110 
2111 /******************************************************************//**
2112 This function picks up a single dirty page from the tail of the LRU
2113 list, flushes it, removes it from page_hash and LRU list and puts
2114 it on the free list. It is called from user threads when they are
2115 unable to find a replaceable page at the tail of the LRU list i.e.:
2116 when the background LRU flushing in the page_cleaner thread is not
2117 fast enough to keep pace with the workload.
2118 @return TRUE if success. */
2119 UNIV_INTERN
2120 ibool
2121 buf_flush_single_page_from_LRU(
2122 /*===========================*/
2123 	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance */
2124 {
2125 	ulint		scanned;
2126 	buf_page_t*	bpage;
2127 	ibool		flushed = FALSE;
2128 
2129 	mutex_enter(&buf_pool->LRU_list_mutex);
2130 
2131 	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
2132 	     bpage != NULL;
2133 	     bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
2134 
2135 		ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
2136 
2137 		mutex_enter(block_mutex);
2138 
2139 		if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) {
2140 
2141 			/* The following call will release the LRU list
2142 			and block mutex. */
2143 
2144 			flushed = buf_flush_page(buf_pool, bpage,
2145 						 BUF_FLUSH_SINGLE_PAGE, true);
2146 
2147 			if (flushed) {
2148 				/* buf_flush_page() will release the
2149 				block mutex */
2150 				break;
2151 			}
2152 		}
2153 
2154 		mutex_exit(block_mutex);
2155 	}
2156 
2157 	if (!flushed)
2158 		mutex_exit(&buf_pool->LRU_list_mutex);
2159 
2160 	MONITOR_INC_VALUE_CUMULATIVE(
2161 		MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2162 		MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2163 		MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2164 		scanned);
2165 
2166 	if (bpage == NULL) {
2167 		/* Can't find a single flushable page. */
2168 		return(FALSE);
2169 	}
2170 
2171 
2172 	ibool	freed = FALSE;
2173 
2174 	/* At this point the page has been written to the disk.
2175 	Since we are not holding the LRU list or buf_page_get_mutex() mutex,
2176 	we cannot use the bpage safely. It may have been plucked out
2177 	of the LRU list by some other thread or it may even have been
2178 	relocated in the case of a compressed page. We need to start
2179 	the scan of LRU list again to remove the block from the LRU
2180 	list and put it on the free list. */
2181 	mutex_enter(&buf_pool->LRU_list_mutex);
2182 
2183 	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
2184 	     bpage != NULL;
2185 	     bpage = UT_LIST_GET_PREV(LRU, bpage)) {
2186 
2187 		ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
2188 
2189 		mutex_enter(block_mutex);
2190 
2191 		ibool	ready = buf_flush_ready_for_replace(bpage);
2192 
2193 		if (ready) {
2194 			bool	evict_zip;
2195 
2196 			evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);
2197 
2198 			freed = buf_LRU_free_page(bpage, evict_zip);
2199 
2200 			mutex_exit(block_mutex);
2201 
2202 			break;
2203 		}
2204 
2205 		mutex_exit(block_mutex);
2206 
2207 	}
2208 
2209 	if (!freed)
2210 		mutex_exit(&buf_pool->LRU_list_mutex);
2211 
2212 	return(freed);
2213 }
2214 
2215 /*********************************************************************//**
2216 Clears up the tail of the LRU lists:
2217 * Put replaceable pages at the tail of LRU to the free list
2218 * Flush dirty pages at the tail of LRU to the disk
2219 The depth to which we scan each buffer pool is controlled by dynamic
2220 config parameter innodb_LRU_scan_depth.
2221 @return number of flushed and evicted pages */
2222 UNIV_INTERN
2223 ulint
2224 buf_flush_LRU_tail(void)
2225 /*====================*/
2226 {
2227 	ulint	total_flushed = 0;
2228 	ulint	total_evicted = 0;
2229 	ulint	start_time = ut_time_ms();
2230 	ulint	scan_depth[MAX_BUFFER_POOLS];
2231 	ulint	requested_pages[MAX_BUFFER_POOLS];
2232 	bool	active_instance[MAX_BUFFER_POOLS];
2233 	bool	limited_scan[MAX_BUFFER_POOLS];
2234 	ulint	previous_evicted[MAX_BUFFER_POOLS];
2235 	ulint	remaining_instances = srv_buf_pool_instances;
2236 	ulint	lru_chunk_size = srv_cleaner_lru_chunk_size;
2237 	ulint	free_list_lwm = srv_LRU_scan_depth / 100
2238 		* srv_cleaner_free_list_lwm;
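	/* For illustration (hypothetical settings): with
	srv_LRU_scan_depth = 1024 and srv_cleaner_free_list_lwm = 10
	(percent), the per-instance free list low-water mark is
	1024 / 100 * 10 = 100 pages (integer arithmetic). */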
2239 
2240 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2241 
2242 		const buf_pool_t* buf_pool = buf_pool_from_array(i);
2243 
2244 		scan_depth[i] = ut_min(srv_LRU_scan_depth,
2245 				       UT_LIST_GET_LEN(buf_pool->LRU));
2246 		requested_pages[i] = 0;
2247 		active_instance[i] = true;
2248 		limited_scan[i] = true;
2249 		previous_evicted[i] = 0;
2250 	}
2251 
2252 	while (remaining_instances) {
2253 
2254 		if (ut_time_ms() - start_time >= srv_cleaner_max_lru_time) {
2255 
2256 			break;
2257 		}
2258 
2259 		for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2260 
2261 			if (!active_instance[i]) {
2262 				continue;
2263 			}
2264 
2265 			ulint free_len = free_list_lwm;
2266 			buf_pool_t* buf_pool = buf_pool_from_array(i);
2267 
2268 			do {
2269 				flush_counters_t	n;
2270 
2271 				ut_ad(requested_pages[i] <= scan_depth[i]);
2272 
2273 				/* Currently the LRU manager is the only
2274 				thread that can trigger an LRU flush. It is
2275 				possible that a batch triggered during the
2276 				last iteration is still running. */
2277 				if (buf_flush_LRU(buf_pool, lru_chunk_size,
2278 						  limited_scan[i], &n)) {
2279 
2280 					/* Allowed only one batch per
2281 					buffer pool instance. */
2282 					buf_flush_wait_batch_end(
2283 						buf_pool, BUF_FLUSH_LRU);
2284 				}
2285 
2286 				total_flushed += n.flushed;
2287 
2288 				/* When we evict fewer pages than we did on a
2289 				previous try, we relax the LRU scan limit in
2290 				order to attempt to evict more. */
2291 				limited_scan[i]
2292 					= (previous_evicted[i] > n.evicted);
2293 				previous_evicted[i] = n.evicted;
2294 				total_evicted += n.evicted;
2295 
2296 				requested_pages[i] += lru_chunk_size;
2297 
2298 				/* If we failed to flush or evict this
2299 				instance, do not bother anymore. But take into
2300 			        account that we might have zero flushed pages
2301 				because the flushing request was fully
2302 				satisfied by unzip_LRU evictions. */
2303 				if (requested_pages[i] >= scan_depth[i]
2304 				    || !(srv_cleaner_eviction_factor
2305 					? n.evicted
2306 					: (n.flushed + n.unzip_LRU_evicted))) {
2307 
2308 					active_instance[i] = false;
2309 					remaining_instances--;
2310 				} else {
2311 
2312 					free_len = UT_LIST_GET_LEN(
2313 						buf_pool->free);
2314 				}
2315 			} while (active_instance[i]
2316 				 && free_len <= free_list_lwm);
2317 		}
2318 	}
2319 
2320 	if (total_flushed) {
2321 		MONITOR_INC_VALUE_CUMULATIVE(
2322 			MONITOR_LRU_BATCH_TOTAL_PAGE,
2323 			MONITOR_LRU_BATCH_COUNT,
2324 			MONITOR_LRU_BATCH_PAGES,
2325 			total_flushed);
2326 	}
2327 	return(total_flushed + total_evicted);
2328 }
2329 
2330 /*********************************************************************//**
2331 Wait for any possible LRU flushes that are in progress to end. */
2332 UNIV_INTERN
2333 void
2334 buf_flush_wait_LRU_batch_end(void)
2335 /*==============================*/
2336 {
2337 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2338 		buf_pool_t*	buf_pool;
2339 
2340 		buf_pool = buf_pool_from_array(i);
2341 
2342 		mutex_enter(&buf_pool->flush_state_mutex);
2343 
2344 		if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2345 		   || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2346 
2347 			mutex_exit(&buf_pool->flush_state_mutex);
2348 			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2349 		} else {
2350 			mutex_exit(&buf_pool->flush_state_mutex);
2351 		}
2352 	}
2353 }
2354 
2355 /*********************************************************************//**
2356 Flush a batch of dirty pages from the flush list
2357 @return number of pages flushed, 0 if no page is flushed or if another
2358 flush_list type batch is running */
2359 static
2360 ulint
2361 page_cleaner_do_flush_batch(
2362 /*========================*/
2363 	ulint		n_to_flush,	/*!< in: number of pages that
2364 					we should attempt to flush. */
2365 	lsn_t		lsn_limit)	/*!< in: LSN up to which flushing
2366 					must happen */
2367 {
2368 	ulint n_flushed;
2369 
2370 	buf_flush_list(n_to_flush, lsn_limit, &n_flushed);
2371 
2372 	return(n_flushed);
2373 }
2374 
2375 /*********************************************************************//**
2376 Calculates if flushing is required based on number of dirty pages in
2377 the buffer pool.
2378 @return percent of io_capacity to flush to manage dirty page ratio */
2379 static
2380 ulint
2381 af_get_pct_for_dirty()
2382 /*==================*/
2383 {
2384 	ulint dirty_pct = buf_get_modified_ratio_pct();
2385 
2386 	if (dirty_pct > 0 && srv_max_buf_pool_modified_pct == 0) {
2387 		return(100);
2388 	}
2389 
2390 	ut_a(srv_max_dirty_pages_pct_lwm
2391 	     <= srv_max_buf_pool_modified_pct);
2392 
2393 	if (srv_max_dirty_pages_pct_lwm == 0) {
2394 		/* The user has not set the option to preflush dirty
2395 		pages as we approach the high water mark. */
2396 		if (dirty_pct > srv_max_buf_pool_modified_pct) {
2397 			/* We have crossed the high water mark of dirty
2398 			pages. In this case we start flushing at 100% of
2399 			innodb_io_capacity. */
2400 			return(100);
2401 		}
2402 	} else if (dirty_pct > srv_max_dirty_pages_pct_lwm) {
2403 		/* We should start flushing pages gradually. */
2404 		return((dirty_pct * 100)
2405 		       / (srv_max_buf_pool_modified_pct + 1));
2406 	}
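	/* For illustration (hypothetical settings): with dirty_pct = 40,
	srv_max_dirty_pages_pct_lwm = 10 and
	srv_max_buf_pool_modified_pct = 75, the branch above returns
	(40 * 100) / (75 + 1) = 52, i.e. flushing at roughly half of
	innodb_io_capacity. */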
2407 
2408 	return(0);
2409 }
2410 
2411 /*********************************************************************//**
2412 Calculates if flushing is required based on redo generation rate.
2413 @return percent of io_capacity to flush to manage redo space */
2414 static
2415 ulint
2416 af_get_pct_for_lsn(
2417 /*===============*/
2418 	lsn_t	age)	/*!< in: current age of LSN. */
2419 {
2420 	lsn_t	max_async_age;
2421 	lsn_t	lsn_age_factor;
2422 	lsn_t	af_lwm = (srv_adaptive_flushing_lwm
2423 			  * log_get_capacity()) / 100;
2424 
2425 	if (age < af_lwm) {
2426 		/* No adaptive flushing. */
2427 		return(0);
2428 	}
2429 
2430 	max_async_age = log_get_max_modified_age_async();
2431 
2432 	if (age < max_async_age && !srv_adaptive_flushing) {
2433 		/* We have still not reached the max_async point and
2434 		the user has disabled adaptive flushing. */
2435 		return(0);
2436 	}
2437 
2438 	/* If we are here then we know that either:
2439 	1) User has enabled adaptive flushing
2440 	2) User may have disabled adaptive flushing but we have reached
2441 	max_async_age. */
2442 	lsn_age_factor = (age * 100) / max_async_age;
2443 
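	/* For illustration (hypothetical values): with lsn_age_factor = 50
	and srv_max_io_capacity / srv_io_capacity = 2, the legacy formula
	below yields 2 * (50 * sqrt(50)) / 7.5 ~= 94, while the
	high-checkpoint variant yields 2 * (50 * 50 * sqrt(50)) / 700.5
	~= 50 (percent of innodb_io_capacity). */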
2444 	ut_ad(srv_max_io_capacity >= srv_io_capacity);
2445 	switch ((srv_cleaner_lsn_age_factor_t)srv_cleaner_lsn_age_factor) {
2446 	case SRV_CLEANER_LSN_AGE_FACTOR_LEGACY:
2447 		return(static_cast<ulint>(
2448 			       ((srv_max_io_capacity / srv_io_capacity)
2449 				* (lsn_age_factor
2450 				   * sqrt((double)lsn_age_factor)))
2451 			       / 7.5));
2452 	case SRV_CLEANER_LSN_AGE_FACTOR_HIGH_CHECKPOINT:
2453 		return(static_cast<ulint>(
2454 			       ((srv_max_io_capacity / srv_io_capacity)
2455 				* (lsn_age_factor * lsn_age_factor
2456 				   * sqrt((double)lsn_age_factor)))
2457 			       / 700.5));
2458 	default:
2459 		ut_error;
2460 	}
2461 }
2462 
2463 /*********************************************************************//**
2464 This function is called approximately once every second by the
2465 page_cleaner thread. Based on various factors it decides if there is a
2466 need to do flushing. If flushing is needed it is performed and the
2467 number of pages flushed is returned.
2468 @return number of pages flushed */
2469 static
2470 ulint
2471 page_cleaner_flush_pages_if_needed(void)
2472 /*====================================*/
2473 {
2474 	static	lsn_t		lsn_avg_rate = 0;
2475 	static	lsn_t		prev_lsn = 0;
2476 	static	lsn_t		last_lsn = 0;
2477 	static	ulint		sum_pages = 0;
2478 	static	ulint		last_pages = 0;
2479 	static	ulint		prev_pages = 0;
2480 	static	ulint		avg_page_rate = 0;
2481 	static	ulint		n_iterations = 0;
2482 	lsn_t			oldest_lsn;
2483 	lsn_t			cur_lsn;
2484 	lsn_t			age;
2485 	lsn_t			lsn_rate;
2486 	ulint			n_pages = 0;
2487 	ulint			pct_for_dirty = 0;
2488 	ulint			pct_for_lsn = 0;
2489 	ulint			pct_total = 0;
2490 	int			age_factor = 0;
2491 
2492 	cur_lsn = log_get_lsn();
2493 
2494 	if (prev_lsn == 0) {
2495 		/* First time around. */
2496 		prev_lsn = cur_lsn;
2497 		return(0);
2498 	}
2499 
2500 	if (prev_lsn == cur_lsn) {
2501 		return(0);
2502 	}
2503 
2504 	/* We update our variables every srv_flushing_avg_loops
2505 	iterations to smooth out transition in workload. */
2506 	if (++n_iterations >= srv_flushing_avg_loops) {
2507 
2508 		avg_page_rate = ((sum_pages / srv_flushing_avg_loops)
2509 				 + avg_page_rate) / 2;
2510 
2511 		/* How much LSN we have generated since last call. */
2512 		lsn_rate = (cur_lsn - prev_lsn) / srv_flushing_avg_loops;
2513 
2514 		lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2515 
2516 		prev_lsn = cur_lsn;
2517 
2518 		n_iterations = 0;
2519 
2520 		sum_pages = 0;
2521 	}
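	/* Both avg_page_rate and lsn_avg_rate are smoothed by averaging the
	latest per-interval rate with the previous estimate, so older
	intervals contribute with geometrically decreasing weight
	(1/2, 1/4, 1/8, ...). */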
2522 
2523 	oldest_lsn = buf_pool_get_oldest_modification();
2524 
2525 	ut_ad(oldest_lsn <= log_get_lsn());
2526 
2527 	age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2528 
2529 	pct_for_dirty = af_get_pct_for_dirty();
2530 	pct_for_lsn = af_get_pct_for_lsn(age);
2531 
2532 	pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2533 
2534 	/* Cap the maximum IO capacity that we are going to use by
2535 	max_io_capacity. */
2536 	n_pages = PCT_IO(pct_total);
2537 	if (age < log_get_max_modified_age_async())
2538 		n_pages = (n_pages + avg_page_rate) / 2;
2539 
2540 	if (n_pages > srv_max_io_capacity) {
2541 		n_pages = srv_max_io_capacity;
2542 	}
2543 
2544 	if (last_pages && cur_lsn - last_lsn > lsn_avg_rate / 2) {
2545 		age_factor = static_cast<int>(prev_pages / last_pages);
2546 	}
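	/* A rough reading of the heuristic above: age_factor grows when the
	previous request (prev_pages) was much larger than what was actually
	flushed (last_pages), and the target LSN passed to the batch below
	is then pushed further ahead of the oldest modification by extra
	multiples of lsn_avg_rate. */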
2547 
2548 	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2549 
2550 	prev_pages = n_pages;
2551 	n_pages = page_cleaner_do_flush_batch(
2552 		n_pages, oldest_lsn + lsn_avg_rate * (age_factor + 1));
2553 
2554 	last_lsn= cur_lsn;
2555 	last_pages= n_pages + 1;
2556 
2557 	MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2558 	MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2559 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2560 	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2561 
2562 	if (n_pages) {
2563 		MONITOR_INC_VALUE_CUMULATIVE(
2564 			MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
2565 			MONITOR_FLUSH_ADAPTIVE_COUNT,
2566 			MONITOR_FLUSH_ADAPTIVE_PAGES,
2567 			n_pages);
2568 
2569 		sum_pages += n_pages;
2570 	}
2571 
2572 	return(n_pages);
2573 }
2574 
2575 /*********************************************************************//**
2576 Puts the page_cleaner thread to sleep if it has finished work in less
2577 than a second */
2578 static void
2579 page_cleaner_sleep_if_needed(ut_monotonic_time next_loop_time) {
2580 	/* No sleep if we are cleaning the buffer pool during the shutdown
2581 	with everything else finished */
2582 	if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)
2583 		return;
2584 
2585 	const ut_monotonic_time cur_time = ut_monotonic_time_ms();
2586 
2587 	if (next_loop_time.ms > cur_time.ms) {
2588 		/* Get sleep interval in micro seconds. We use
2589 		ut_min() to avoid long sleep in case of
2590 		wrap around. */
2591 		os_thread_sleep(ut_min(
2592 		    1000000, (next_loop_time.ms - cur_time.ms) * 1000));
2593 	}
2594 }
2595 
2596 /*********************************************************************//**
2597 Returns the aggregate free list length over all buffer pool instances.
2598 @return total free list length. */
2599 MY_ATTRIBUTE((warn_unused_result))
2600 static
2601 ulint
2602 buf_get_total_free_list_length(void)
2603 /*================================*/
2604 {
2605 	ulint result = 0;
2606 
2607 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2608 
2609 		result += UT_LIST_GET_LEN(buf_pool_from_array(i)->free);
2610 	}
2611 
2612 	return result;
2613 }
2614 
2615 /** Returns the aggregate LRU list length over all buffer pool instances.
2616 @return total LRU list length. */
2617 MY_ATTRIBUTE((warn_unused_result))
2618 static
2619 ulint
2620 buf_get_total_LRU_list_length(void)
2621 {
2622 	ulint result = 0;
2623 
2624 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2625 
2626 		result += UT_LIST_GET_LEN(buf_pool_from_array(i)->LRU);
2627 	}
2628 
2629 	return result;
2630 }
2631 
2632 /*********************************************************************//**
2633 Adjust the desired page cleaner thread sleep time for LRU flushes.  */
2634 MY_ATTRIBUTE((nonnull))
2635 static void
2636 page_cleaner_adapt_lru_sleep_time(
2637 	ut_monotonic_time*	lru_sleep_time,	/*!< in/out: desired page cleaner
2638 					thread sleep time for LRU flushes */
2639 	ulint	lru_n_flushed)	/*!< in: number of pages flushed in previous batch */
2640 {
2641 	ulint free_len = buf_get_total_free_list_length();
2642 	ulint max_free_len = ut_min(buf_get_total_LRU_list_length(),
2643 				    srv_LRU_scan_depth * srv_buf_pool_instances);
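	/* max_free_len treats srv_LRU_scan_depth pages per buffer pool
	instance as a "full" free list; the thresholds below are fractions
	of that cap (1%, 5% and 20%). */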
2644 
2645 	if (free_len < max_free_len / 100 && lru_n_flushed) {
2646 
2647 		/* Free lists filled less than 1%
2648 		and iteration was able to flush, no sleep */
2649 		lru_sleep_time->ms = 0;
2650 	} else if (free_len > max_free_len / 5
2651 		   || (free_len < max_free_len / 100 && lru_n_flushed == 0)) {
2652 
2653 		/* Free lists filled more than 20%
2654 		or no pages flushed in previous batch, sleep a bit more */
2655 		lru_sleep_time->ms += 1;
2656 		if (lru_sleep_time->ms > srv_cleaner_max_lru_time)
2657 			lru_sleep_time->ms = srv_cleaner_max_lru_time;
2658 	} else if (free_len < max_free_len / 20 && lru_sleep_time->ms >= 50) {
2659 		/* Free lists filled less than 5%, sleep a bit less */
2660 		lru_sleep_time->ms -= 50;
2661 	} else {
2662 
2663 		/* Free lists filled between 5% and 20%, no change */
2664 	}
2665 }
2666 
2667 /*********************************************************************//**
2668 Get the desired page cleaner thread sleep time for flush list flushes.
2669 @return desired sleep time */
2670 MY_ATTRIBUTE((warn_unused_result))
2671 static
2672 ulint
2673 page_cleaner_adapt_flush_sleep_time(void)
2674 /*=====================================*/
2675 {
2676 	lsn_t	age = log_get_lsn() - log_sys->last_checkpoint_lsn;
2677 
2678 	if (age > log_sys->max_modified_age_sync) {
2679 
2680 		/* No sleep if in sync preflush zone */
2681 		return(0);
2682 	}
2683 
2684 	/* In all other cases flush list factors do not influence the page
2685 	cleaner sleep time */
2686 	return(srv_cleaner_max_flush_time);
2687 }
2688 
2689 /******************************************************************//**
2690 page_cleaner thread tasked with flushing dirty pages from the buffer
2691 pool flush lists. As of now we'll have only one instance of this thread.
2692 @return a dummy parameter */
2693 extern "C" UNIV_INTERN
2694 os_thread_ret_t
2695 DECLARE_THREAD(buf_flush_page_cleaner_thread)(
2696 /*==========================================*/
2697 	void*	arg MY_ATTRIBUTE((unused)))
2698 			/*!< in: a dummy parameter required by
2699 			os_thread_create */
2700 {
2701 	my_thread_init();
2702 	ut_monotonic_time next_loop_time = ut_monotonic_time_ms();
2703 	next_loop_time.ms += 1000;
2704 	ulint	n_flushed = 0;
2705 	ulint	last_activity = srv_get_activity_count();
2706 	ut_monotonic_time last_activity_time = ut_monotonic_time_ms();
2707 
2708 	ut_ad(!srv_read_only_mode);
2709 
2710 #ifdef UNIV_PFS_THREAD
2711 	pfs_register_thread(buf_page_cleaner_thread_key);
2712 #endif /* UNIV_PFS_THREAD */
2713 
2714 	srv_cleaner_tid = os_thread_get_tid();
2715 
2716 	os_thread_set_priority(srv_cleaner_tid, srv_sched_priority_cleaner);
2717 
2718 #ifdef UNIV_DEBUG_THREAD_CREATION
2719 	fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n",
2720 		os_thread_pf(os_thread_get_curr_id()));
2721 #endif /* UNIV_DEBUG_THREAD_CREATION */
2722 
2723 	buf_page_cleaner_is_active = TRUE;
2724 
2725 	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
2726 		ulint page_cleaner_sleep_time;
2727 		ibool	server_active;
2728 
2729 		srv_current_thread_priority = srv_cleaner_thread_priority;
2730 
2731 		/* The page_cleaner skips sleep if the server is
2732 		idle and there are no pending IOs in the buffer pool
2733 		and there is work to do. */
2734 		if (srv_check_activity(last_activity)
2735 		    || buf_get_n_pending_read_ios()
2736 		    || n_flushed == 0) {
2737 			page_cleaner_sleep_if_needed(next_loop_time);
2738 		}
2739 
2740 		page_cleaner_sleep_time =
2741 		    page_cleaner_adapt_flush_sleep_time();
2742 
2743 		next_loop_time.ms =
2744 		    ut_monotonic_time_ms().ms + page_cleaner_sleep_time;
2745 
2746 		server_active = srv_check_activity(last_activity);
2747 		if (server_active ||
2748 		    ut_monotonic_time_ms().ms - last_activity_time.ms <
2749 			1000) {
2750 			if (server_active) {
2751 
2752 				last_activity = srv_get_activity_count();
2753 				last_activity_time = ut_monotonic_time_ms();
2754 			}
2755 
2756 			/* Flush pages from flush_list if required */
2757 			n_flushed = page_cleaner_flush_pages_if_needed();
2758 
2759 		} else {
2760 			n_flushed = page_cleaner_do_flush_batch(
2761 							PCT_IO(100),
2762 							LSN_MAX);
2763 
2764 			if (n_flushed) {
2765 				MONITOR_INC_VALUE_CUMULATIVE(
2766 					MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
2767 					MONITOR_FLUSH_BACKGROUND_COUNT,
2768 					MONITOR_FLUSH_BACKGROUND_PAGES,
2769 					n_flushed);
2770 			}
2771 		}
2772 	}
2773 
2774 	ut_ad(srv_shutdown_state > 0);
2775 	if (srv_fast_shutdown == 2) {
2776 		/* In very fast shutdown we simulate a crash of
2777 		buffer pool. We are not required to do any flushing */
2778 		goto thread_exit;
2779 	}
2780 
2781 	/* In case of normal and slow shutdown the page_cleaner thread
2782 	must wait for all other activity in the server to die down.
2783 	Note that we can start flushing the buffer pool as soon as the
2784 	server enters shutdown phase but we must stay alive long enough
2785 	to ensure that any work done by the master or purge threads is
2786 	also flushed.
2787 	During shutdown we pass through two stages. In the first stage,
2788 	when SRV_SHUTDOWN_CLEANUP is set other threads like the master
2789 	and the purge threads may be working as well. We start flushing
2790 	the buffer pool but can't be sure that no new pages are being
2791 	dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. Because
2792 	the LRU manager thread is also flushing at SRV_SHUTDOWN_CLEANUP
2793 	but not SRV_SHUTDOWN_FLUSH_PHASE, we only leave the
2794 	SRV_SHUTDOWN_CLEANUP loop when the LRU manager quits. */
2795 
2796 	do {
2797 		n_flushed = page_cleaner_do_flush_batch(PCT_IO(100), LSN_MAX);
2798 
2799 		/* We sleep only if there are no pages to flush */
2800 		if (n_flushed == 0) {
2801 			os_thread_sleep(100000);
2802 		}
2803 
2804 		os_rmb;
2805 	} while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP
2806 		 || buf_lru_manager_is_active);
2807 
2808 	/* At this point all threads including the master and the purge
2809 	thread must have been suspended. */
2810 	ut_a(srv_get_active_thread_type() == SRV_NONE);
2811 	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
2812 
2813 	/* We can now make a final sweep on flushing the buffer pool
2814 	and exit after we have cleaned the whole buffer pool.
2815 	It is important that we wait for any running batch that has
2816 	been triggered by us to finish. Otherwise we can end up
2817 	considering end of that batch as a finish of our final
2818 	sweep and we'll come out of the loop leaving behind dirty pages
2819 	in the flush_list */
2820 	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
2821 	buf_flush_wait_LRU_batch_end();
2822 
2823 #ifdef UNIV_DEBUG
2824 	os_rmb;
2825 	ut_ad(!buf_lru_manager_is_active);
2826 #endif
2827 
2828 	bool	success;
2829 
2830 	do {
2831 
2832 		success = buf_flush_list(PCT_IO(100), LSN_MAX, &n_flushed);
2833 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
2834 
2835 	} while (!success || n_flushed > 0);
2836 
2837 	/* Some sanity checks */
2838 	ut_a(srv_get_active_thread_type() == SRV_NONE);
2839 	ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
2840 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2841 		buf_pool_t* buf_pool = buf_pool_from_array(i);
2842 		ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
2843 	}
2844 
2845 	/* We have lived our life. Time to die. */
2846 
2847 thread_exit:
2848 	buf_page_cleaner_is_active = FALSE;
2849 
2850 	my_thread_end();
2851 	/* We count the number of threads in os_thread_exit(). A created
2852 	thread should always use that to exit and not use return() to exit. */
2853 	os_thread_exit(NULL);
2854 
2855 	OS_THREAD_DUMMY_RETURN;
2856 }
2857 
2858 /******************************************************************//**
2859 lru_manager thread tasked with performing LRU flushes and evictions to refill
2860 the buffer pool free lists.  As of now we'll have only one instance of this
2861 thread.
2862 @return a dummy parameter */
2863 extern "C" UNIV_INTERN
2864 os_thread_ret_t
2865 DECLARE_THREAD(buf_flush_lru_manager_thread)(
2866 /*==========================================*/
2867 	void*	arg MY_ATTRIBUTE((unused)))
2868 			/*!< in: a dummy parameter required by
2869 			os_thread_create */
2870 {
2871 	ut_monotonic_time next_loop_time = ut_monotonic_time_ms();
2872 	next_loop_time.ms += 1000;
2873 	ut_monotonic_time lru_sleep_time;
2874 	lru_sleep_time.ms = srv_cleaner_max_lru_time;
2875 	ulint	lru_n_flushed = 1;
2876 
2877 #ifdef UNIV_PFS_THREAD
2878 	pfs_register_thread(buf_lru_manager_thread_key);
2879 #endif /* UNIV_PFS_THREAD */
2880 
2881 	srv_lru_manager_tid = os_thread_get_tid();
2882 
2883 	os_thread_set_priority(srv_lru_manager_tid,
2884 			       srv_sched_priority_cleaner);
2885 
2886 #ifdef UNIV_DEBUG_THREAD_CREATION
2887 	fprintf(stderr, "InnoDB: lru_manager thread running, id %lu\n",
2888 		os_thread_pf(os_thread_get_curr_id()));
2889 #endif /* UNIV_DEBUG_THREAD_CREATION */
2890 
2891 	buf_lru_manager_is_active = true;
2892 	os_wmb;
2893 
2894 	/* On server shutdown, the LRU manager thread runs through cleanup
2895 	phase to provide free pages for the master and purge threads.  */
2896 	while (srv_shutdown_state == SRV_SHUTDOWN_NONE
2897 	       || srv_shutdown_state == SRV_SHUTDOWN_CLEANUP) {
2898 
2899 		srv_current_thread_priority = srv_cleaner_thread_priority;
2900 
2901 		page_cleaner_sleep_if_needed(next_loop_time);
2902 
2903 		page_cleaner_adapt_lru_sleep_time(&lru_sleep_time, lru_n_flushed);
2904 
2905 		next_loop_time.ms =
2906 		    ut_monotonic_time_ms().ms + lru_sleep_time.ms;
2907 
2908 		lru_n_flushed = buf_flush_LRU_tail();
2909 	}
2910 
2911 	buf_lru_manager_is_active = false;
2912 	os_wmb;
2913 
2914 	/* We count the number of threads in os_thread_exit(). A created
2915 	thread should always use that to exit and not use return() to exit. */
2916 	os_thread_exit(NULL);
2917 
2918 	OS_THREAD_DUMMY_RETURN;
2919 }
2920 
2921 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2922 
2923 /** Functor to validate the flush list. */
2924 struct	Check {
2925 	void	operator()(const buf_page_t* elem)
2926 	{
2927 		ut_a(elem->in_flush_list);
2928 	}
2929 };
2930 
2931 /******************************************************************//**
2932 Validates the flush list.
2933 @return	TRUE if ok */
2934 static
2935 ibool
2936 buf_flush_validate_low(
2937 /*===================*/
2938 	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
2939 {
2940 	buf_page_t*		bpage;
2941 	const ib_rbt_node_t*	rnode = NULL;
2942 
2943 	ut_ad(buf_flush_list_mutex_own(buf_pool));
2944 
2945 	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, Check());
2946 
2947 	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
2948 
2949 	/* If we are in recovery mode i.e.: flush_rbt != NULL
2950 	then each block in the flush_list must also be present
2951 	in the flush_rbt. */
2952 	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
2953 		rnode = rbt_first(buf_pool->flush_rbt);
2954 	}
2955 
2956 	while (bpage != NULL) {
2957 		const lsn_t	om = bpage->oldest_modification;
2958 
2959 		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
2960 
2961 		ut_ad(bpage->in_flush_list);
2962 
2963 		/* A page in buf_pool->flush_list can be in
2964 		BUF_BLOCK_REMOVE_HASH state. This happens when a page
2965 		is in the middle of being relocated. In that case the
2966 		original descriptor can have this state and still be
2967 		in the flush list waiting to acquire the
2968 		buf_pool->flush_list_mutex to complete the relocation. */
2969 		ut_a(buf_page_in_file(bpage)
2970 		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
2971 		ut_a(om > 0);
2972 
2973 		if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
2974 			buf_page_t** prpage;
2975 
2976 			ut_a(rnode);
2977 			prpage = rbt_value(buf_page_t*, rnode);
2978 
2979 			ut_a(*prpage);
2980 			ut_a(*prpage == bpage);
2981 			rnode = rbt_next(buf_pool->flush_rbt, rnode);
2982 		}
2983 
2984 		bpage = UT_LIST_GET_NEXT(list, bpage);
2985 
2986 		ut_a(!bpage || om >= bpage->oldest_modification);
2987 	}
2988 
2989 	/* By this time we must have exhausted the traversal of
2990 	flush_rbt (if active) as well. */
2991 	ut_a(rnode == NULL);
2992 
2993 	return(TRUE);
2994 }
2995 
2996 /******************************************************************//**
2997 Validates the flush list.
2998 @return	TRUE if ok */
2999 UNIV_INTERN
3000 ibool
3001 buf_flush_validate(
3002 /*===============*/
3003 	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
3004 {
3005 	ibool	ret;
3006 
3007 	buf_flush_list_mutex_enter(buf_pool);
3008 
3009 	ret = buf_flush_validate_low(buf_pool);
3010 
3011 	buf_flush_list_mutex_exit(buf_pool);
3012 
3013 	return(ret);
3014 }
3015 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3016 #endif /* !UNIV_HOTBACKUP */
3017 
3018 #ifdef UNIV_DEBUG
3019 /******************************************************************//**
3020 Check if there are any dirty pages that belong to a space id in the flush
3021 list in a particular buffer pool.
3022 @return	number of dirty pages present in a single buffer pool */
3023 UNIV_INTERN
3024 ulint
3025 buf_pool_get_dirty_pages_count(
3026 /*===========================*/
3027 	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
3028 	ulint		id)		/*!< in: space id to check */
3029 
3030 {
3031 	ulint		count = 0;
3032 
3033 	buf_flush_list_mutex_enter(buf_pool);
3034 
3035 	buf_page_t*	bpage;
3036 
3037 	for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3038 	     bpage != 0;
3039 	     bpage = UT_LIST_GET_NEXT(list, bpage)) {
3040 
3041 		ut_ad(buf_page_in_file(bpage)
3042 		      || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
3043 		ut_ad(bpage->in_flush_list);
3044 		ut_ad(bpage->oldest_modification > 0);
3045 
3046 		if (bpage->space == id) {
3047 			++count;
3048 		}
3049 	}
3050 
3051 	buf_flush_list_mutex_exit(buf_pool);
3052 
3053 	return(count);
3054 }
3055 
3056 /******************************************************************//**
3057 Check if there are any dirty pages that belong to a space id in the flush list.
3058 @return	number of dirty pages present in all the buffer pools */
3059 UNIV_INTERN
3060 ulint
3061 buf_flush_get_dirty_pages_count(
3062 /*============================*/
3063 	ulint		id)		/*!< in: space id to check */
3064 
3065 {
3066 	ulint		count = 0;
3067 
3068 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3069 		buf_pool_t*	buf_pool;
3070 
3071 		buf_pool = buf_pool_from_array(i);
3072 
3073 		count += buf_pool_get_dirty_pages_count(buf_pool, id);
3074 	}
3075 
3076 	return(count);
3077 }
3078 #endif /* UNIV_DEBUG */
3079