/*****************************************************************************

Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2013, 2022, MariaDB Corporation.
Copyright (c) 2013, 2014, Fusion-io

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0flu.cc
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "univ.i"
#include <my_service_manager.h>
#include <mysql/service_thd_wait.h>
#include <sql_class.h>

#include "buf0flu.h"
#include "buf0buf.h"
#include "buf0checksum.h"
#include "buf0dblwr.h"
#include "srv0start.h"
#include "page0zip.h"
#include "fil0fil.h"
#include "log0crypt.h"
#include "srv0mon.h"
#include "fil0pagecompress.h"
#ifdef HAVE_LZO
# include "lzo/lzo1x.h"
#elif defined HAVE_SNAPPY
# include "snappy-c.h"
#endif

/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
Also included in buf_flush_page_count. */
ulint buf_lru_flush_page_count;

/** Number of pages flushed. Protected by buf_pool.mutex. */
ulint buf_flush_page_count;

/** Flag indicating if the page_cleaner is in active state. */
bool buf_page_cleaner_is_active;

/** Factor for scan length to determine n_pages for intended oldest LSN
progress */
static constexpr ulint buf_flush_lsn_scan_factor = 3;

/** Average redo generation rate */
static lsn_t lsn_avg_rate = 0;

/** Target oldest_modification for the page cleaner background flushing;
writes are protected by buf_pool.flush_list_mutex */
static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
/** Target oldest_modification for the page cleaner furious flushing;
writes are protected by buf_pool.flush_list_mutex */
static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;

#ifdef UNIV_PFS_THREAD
mysql_pfs_key_t page_cleaner_thread_key;
#endif /* UNIV_PFS_THREAD */

/** Page cleaner structure */
static struct
{
  /** total elapsed time in adaptive flushing, in seconds */
  ulint flush_time;
  /** number of adaptive flushing passes */
  ulint flush_pass;
} page_cleaner;

#ifdef UNIV_DEBUG
my_bool innodb_page_cleaner_disabled_debug;
#endif /* UNIV_DEBUG */

/* @} */

#ifdef UNIV_DEBUG
/** Validate the flush list. */
static void buf_flush_validate_low();

/** Validates the flush list some of the time. */
static void buf_flush_validate_skip()
{
/** Try buf_flush_validate_low() every this many times */
# define BUF_FLUSH_VALIDATE_SKIP	23

	/** The buf_flush_validate_low() call skip counter.
	Use a signed type because of the race condition below. */
	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;

	/* There is a race condition below, but it does not matter,
	because this call is only for heuristic purposes. We want to
	reduce the call frequency of the costly buf_flush_validate_low()
	check in debug builds. */
	if (--buf_flush_validate_count > 0) {
		return;
	}

	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
	buf_flush_validate_low();
}
#endif /* UNIV_DEBUG */

/** Wake up the page cleaner if needed */
void buf_pool_t::page_cleaner_wakeup()
{
  if (!page_cleaner_idle())
    return;
  double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
    double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
  double pct_lwm= srv_max_dirty_pages_pct_lwm;

  /* If pct_lwm != 0.0, adaptive flushing is enabled.
  Signal the page cleaner thread:
  - if pct_lwm <= dirty_pct, it will invoke the adaptive flushing flow;
  - if pct_lwm > dirty_pct, it will invoke the idle flushing flow.

  idle_flushing:
  dirty_pct < innodb_max_dirty_pages_pct_lwm, so this could be an
  idle flushing use case.

  Why is last_activity_count not always updated?
  - Let's first understand when the server activity count is updated:
  - it is updated on commit of a transaction in trx_t::commit(), and not
    when a page is added to the flush list;
  - page_cleaner_wakeup() is called when a page is added to the flush list.

  - Now, say the first user thread updates the count from X to Y but
    has yet to commit its transaction (so the activity count is still Y).
    Follow-up user threads will see the updated count (Y) matching
    the universal server activity count (Y), giving a false impression
    that the server is idle.

  How to avoid this?
  - By allowing last_activity_count to be updated when the page cleaner
    is made active and has work to do. This ensures that the
    last_activity signal is consumed by the page cleaner before the next
    one is generated. */
  if ((pct_lwm != 0.0 && pct_lwm <= dirty_pct) ||
      (pct_lwm != 0.0 && last_activity_count == srv_get_activity_count()) ||
      srv_max_buf_pool_modified_pct <= dirty_pct)
  {
    page_cleaner_is_idle= false;
    pthread_cond_signal(&do_flush_list);
  }
}
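
/* Illustrative example (not part of the original source): with 1500
pages in buf_pool.flush_list and 10000 pages in LRU plus free,
dirty_pct = 1500 * 100.0 / 10000 = 15.0. If
innodb_max_dirty_pages_pct_lwm = 10.0, the condition
pct_lwm <= dirty_pct holds and the page cleaner is signalled for
adaptive flushing. With pct_lwm = 0.0 (adaptive flushing disabled),
the cleaner is signalled only once dirty_pct reaches
srv_max_buf_pool_modified_pct (innodb_max_dirty_pages_pct). */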

inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage)
{
  ut_ad(!fsp_is_system_temporary(bpage->id().space()));
  mysql_mutex_assert_owner(&flush_list_mutex);
  flush_hp.adjust(bpage);
  UT_LIST_REMOVE(flush_list, bpage);
}

/** Insert a modified block into the flush list.
@param block    modified block
@param lsn      start LSN of the mini-transaction that modified the block */
void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn)
{
  mysql_mutex_assert_not_owner(&mutex);
  mysql_mutex_assert_owner(&log_sys.flush_order_mutex);
  ut_ad(lsn > 2);
  ut_ad(!fsp_is_system_temporary(block->page.id().space()));

  mysql_mutex_lock(&flush_list_mutex);
  if (ut_d(const lsn_t old=) block->page.oldest_modification())
  {
    ut_ad(old == 1);
    delete_from_flush_list_low(&block->page);
  }
  else
    stat.flush_list_bytes+= block->physical_size();
  ut_ad(stat.flush_list_bytes <= curr_pool_size);

  block->page.set_oldest_modification(lsn);
  MEM_CHECK_DEFINED(block->page.zip.data
                    ? block->page.zip.data : block->frame,
                    block->physical_size());
  UT_LIST_ADD_FIRST(flush_list, &block->page);
  ut_d(buf_flush_validate_skip());
  page_cleaner_wakeup();
  mysql_mutex_unlock(&flush_list_mutex);
}

/** Remove a block from flush_list.
@param bpage   buffer pool page
@param clear   whether to invoke buf_page_t::clear_oldest_modification() */
void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear)
{
  delete_from_flush_list_low(bpage);
  stat.flush_list_bytes-= bpage->physical_size();
  if (clear)
    bpage->clear_oldest_modification();
#ifdef UNIV_DEBUG
  buf_flush_validate_skip();
#endif /* UNIV_DEBUG */
}
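
/* Note (added commentary, not in the original source): throughout this
file, buf_page_t::oldest_modification() encodes more than a plain LSN.
As can be inferred from the assertions in this file: 0 means the page
is clean; 1 means the page was written out but has not yet been removed
from buf_pool.flush_list; 2 marks a dirty page of the temporary
tablespace (never on flush_list, writes no redo log); and any value > 2
is the real start LSN of the mini-transaction that first dirtied the
page. Hence ut_ad(lsn > 2) in insert_into_flush_list() and
ut_ad(old == 1) before the removal above. */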

/** Remove all dirty pages belonging to a given tablespace when we are
deleting the data file of that tablespace.
The pages still remain a part of LRU and are evicted from
the list as they age towards the tail of the LRU.
@param id    tablespace identifier */
void buf_flush_remove_pages(ulint id)
{
  const page_id_t first(id, 0), end(id + 1, 0);
  ut_ad(id);
  mysql_mutex_lock(&buf_pool.mutex);

  for (;;)
  {
    bool deferred= false;

    mysql_mutex_lock(&buf_pool.flush_list_mutex);

    for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
    {
      ut_d(const auto s= bpage->state());
      ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
            s == BUF_BLOCK_REMOVE_HASH);
      buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);

      const page_id_t bpage_id(bpage->id());

      if (bpage_id < first || bpage_id >= end);
      else if (bpage->io_fix() != BUF_IO_NONE)
        deferred= true;
      else
        buf_pool.delete_from_flush_list(bpage);

      bpage= prev;
    }

    mysql_mutex_unlock(&buf_pool.flush_list_mutex);

    if (!deferred)
      break;

    mysql_mutex_unlock(&buf_pool.mutex);
    os_thread_yield();
    mysql_mutex_lock(&buf_pool.mutex);
    buf_flush_wait_batch_end(false);
  }

  mysql_mutex_unlock(&buf_pool.mutex);
}
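
/* Illustrative usage sketch (not part of the original source): a caller
that is about to delete the data file of a tablespace would first
discard its dirty pages, e.g.:

  buf_flush_remove_pages(space->id);

The outer loop above keeps retrying while any page of the tablespace is
io-fixed (deferred == true): it releases buf_pool.mutex, yields, and
waits for the current flush_list batch to end, so that in-flight writes
complete before the data file is removed. */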

/*******************************************************************//**
Relocates a buffer control block on the flush_list.
Note that it is assumed that the contents of bpage have already been
copied to dpage.
IMPORTANT: When this function is called bpage and dpage are not
exact copies of each other. For example, they both will have different
::state. Also the ::list pointers in dpage may be stale. We need to
use the current list node (bpage) to do the list manipulation because
the list pointers could have changed between the time that we copied
the contents of bpage to the dpage and the flush list manipulation
below. */
ATTRIBUTE_COLD
void
buf_flush_relocate_on_flush_list(
/*=============================*/
	buf_page_t*	bpage,	/*!< in/out: control block being moved */
	buf_page_t*	dpage)	/*!< in/out: destination block */
{
	buf_page_t*	prev;

	mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
	ut_ad(!fsp_is_system_temporary(bpage->id().space()));

	const lsn_t lsn = bpage->oldest_modification();

	if (!lsn) {
		return;
	}

	ut_ad(lsn == 1 || lsn > 2);
	ut_ad(dpage->oldest_modification() == lsn);

	/* Important that we adjust the hazard pointer before removing
	the bpage from the flush list. */
	buf_pool.flush_hp.adjust(bpage);

	prev = UT_LIST_GET_PREV(list, bpage);
	UT_LIST_REMOVE(buf_pool.flush_list, bpage);

	bpage->clear_oldest_modification();

	if (lsn == 1) {
		buf_pool.stat.flush_list_bytes -= dpage->physical_size();
		dpage->list.prev = nullptr;
		dpage->list.next = nullptr;
		dpage->clear_oldest_modification();
	} else if (prev) {
		ut_ad(prev->oldest_modification());
		UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage);
	} else {
		UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage);
	}

	ut_d(buf_flush_validate_low());
}

/** Complete write of a file page from buf_pool.
@param request write request */
void buf_page_write_complete(const IORequest &request)
{
  ut_ad(request.is_write());
  ut_ad(!srv_read_only_mode/* ||
        request.node->space->purpose == FIL_TYPE_TEMPORARY*/);
  buf_page_t *bpage= request.bpage;
  ut_ad(bpage);
  ut_ad(bpage->in_file());
  /* bpage->io_fix() can only be changed by buf_page_write_complete()
  and buf_page_read_complete() from BUF_IO_READ or BUF_IO_WRITE */
  ut_ad(bpage->io_fix() == BUF_IO_WRITE);
  ut_ad(!buf_dblwr.is_inside(bpage->id()));
  ut_ad(request.node->space->id == bpage->id().space());

  if (bpage->status == buf_page_t::INIT_ON_FLUSH)
    bpage->status= buf_page_t::NORMAL;
  else
  {
    ut_ad(bpage->status == buf_page_t::NORMAL);
    if (request.node->space->use_doublewrite())
    {
      ut_ad(request.node->space != fil_system.temp_space);
      buf_dblwr.write_completed();
    }
  }

  if (bpage->slot)
  {
    bpage->slot->release();
    bpage->slot= nullptr;
  }

  if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
    buf_page_monitor(bpage, BUF_IO_WRITE);
  DBUG_PRINT("ib_buf", ("write page %u:%u",
                        bpage->id().space(), bpage->id().page_no()));
  const bool temp= fsp_is_system_temporary(bpage->id().space());

  mysql_mutex_lock(&buf_pool.mutex);
  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
  buf_pool.stat.n_pages_written++;
  /* While we do not need any mutex for clearing oldest_modification
  here, we hope that it will be in the same cache line with io_fix,
  whose changes must be protected by buf_pool.mutex. */
  ut_ad(temp || bpage->oldest_modification() > 2);
  bpage->clear_oldest_modification(temp);
  ut_ad(bpage->io_fix() == BUF_IO_WRITE);
  bpage->set_io_fix(BUF_IO_NONE);

  /* Because this thread which does the unlocking might not be the same that
  did the locking, we use a pass value != 0 in unlock, which simply
  removes the newest lock debug record, without checking the thread id. */
  if (bpage->state() == BUF_BLOCK_FILE_PAGE)
    rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE);

  if (request.is_LRU())
  {
    buf_LRU_free_page(bpage, true);

    ut_ad(buf_pool.n_flush_LRU_);
    if (!--buf_pool.n_flush_LRU_)
    {
      pthread_cond_broadcast(&buf_pool.done_flush_LRU);
      pthread_cond_signal(&buf_pool.done_free);
    }
  }
  else
  {
    ut_ad(!temp);
    ut_ad(buf_pool.n_flush_list_);
    if (!--buf_pool.n_flush_list_)
      pthread_cond_broadcast(&buf_pool.done_flush_list);
  }

  mysql_mutex_unlock(&buf_pool.mutex);
}

/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
@param[in,out]	page		page to update
@param[in]	size		compressed page size */
void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size)
{
  ut_ad(size > 0);
  mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
                  page_zip_calc_checksum(page, size,
                                         static_cast<srv_checksum_algorithm_t>
                                         (srv_checksum_algorithm)));
}

/** Assign the full crc32 checksum for non-compressed page.
@param[in,out]	page	page to be updated */
void buf_flush_assign_full_crc32_checksum(byte* page)
{
	ut_d(bool compressed = false);
	ut_d(bool corrupted = false);
	ut_d(const uint size = buf_page_full_crc32_size(page, &compressed,
							&corrupted));
	ut_ad(!compressed);
	ut_ad(!corrupted);
	ut_ad(size == uint(srv_page_size));
	const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
	mach_write_to_4(page + payload, ut_crc32(page, payload));
}
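
/* Illustrative example (not part of the original source): with
srv_page_size == 16384 and FIL_PAGE_FCRC32_CHECKSUM == 4, the function
above effectively does

  payload = 16384 - 4;                       // 16380
  mach_write_to_4(page + 16380, ut_crc32(page, 16380));

that is, a CRC-32 over everything except the trailing 4 bytes, stored
in those trailing 4 bytes, so a reader can verify the page with a
single ut_crc32() pass. */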

/** Initialize a page for writing to the tablespace.
@param[in]	block			buffer block; NULL if bypassing
					the buffer pool
@param[in,out]	page			page frame
@param[in,out]	page_zip_		compressed page, or NULL if
					uncompressed
@param[in]	use_full_checksum	whether tablespace uses full checksum */
void
buf_flush_init_for_writing(
	const buf_block_t*	block,
	byte*			page,
	void*			page_zip_,
	bool			use_full_checksum)
{
	if (block != NULL && block->frame != page) {
		/* If page is encrypted in full crc32 format then
		checksum stored already as a part of fil_encrypt_buf() */
		ut_ad(use_full_checksum);
		return;
	}

	ut_ad(block == NULL || block->frame == page);
	ut_ad(block == NULL || page_zip_ == NULL
	      || &block->page.zip == page_zip_);
	ut_ad(page);

	if (page_zip_) {
		page_zip_des_t*	page_zip;
		ulint		size;

		page_zip = static_cast<page_zip_des_t*>(page_zip_);
		size = page_zip_get_size(page_zip);

		ut_ad(size);
		ut_ad(ut_is_2pow(size));
		ut_ad(size <= UNIV_ZIP_SIZE_MAX);

		switch (fil_page_get_type(page)) {
		case FIL_PAGE_TYPE_ALLOCATED:
		case FIL_PAGE_INODE:
		case FIL_PAGE_IBUF_BITMAP:
		case FIL_PAGE_TYPE_FSP_HDR:
		case FIL_PAGE_TYPE_XDES:
			/* These are essentially uncompressed pages. */
			memcpy(page_zip->data, page, size);
			/* fall through */
		case FIL_PAGE_TYPE_ZBLOB:
		case FIL_PAGE_TYPE_ZBLOB2:
		case FIL_PAGE_INDEX:
		case FIL_PAGE_RTREE:
			buf_flush_update_zip_checksum(page_zip->data, size);
			return;
		}

		ib::error() << "The compressed page to be written"
			" seems corrupt:";
		ut_print_buf(stderr, page, size);
		fputs("\nInnoDB: Possibly older version of the page:", stderr);
		ut_print_buf(stderr, page_zip->data, size);
		putc('\n', stderr);
		ut_error;
	}

	if (use_full_checksum) {
		static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned");
		static_assert(FIL_PAGE_LSN % 4 == 0, "aligned");
		memcpy_aligned<4>(page + srv_page_size
				  - FIL_PAGE_FCRC32_END_LSN,
				  FIL_PAGE_LSN + 4 + page, 4);
		return buf_flush_assign_full_crc32_checksum(page);
	}

	static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned");
	static_assert(FIL_PAGE_LSN % 8 == 0, "aligned");
	memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
			  FIL_PAGE_LSN + page, 8);

	if (block && srv_page_size == 16384) {
		/* The page type could be garbage in old files
		created before MySQL 5.5. Such files always
		had a page size of 16 kilobytes. */
		ulint	page_type = fil_page_get_type(page);
		ulint	reset_type = page_type;

		switch (block->page.id().page_no() % 16384) {
		case 0:
			reset_type = block->page.id().page_no() == 0
				? FIL_PAGE_TYPE_FSP_HDR
				: FIL_PAGE_TYPE_XDES;
			break;
		case 1:
			reset_type = FIL_PAGE_IBUF_BITMAP;
			break;
		case FSP_TRX_SYS_PAGE_NO:
			if (block->page.id()
			    == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) {
				reset_type = FIL_PAGE_TYPE_TRX_SYS;
				break;
			}
			/* fall through */
		default:
			switch (page_type) {
			case FIL_PAGE_INDEX:
			case FIL_PAGE_TYPE_INSTANT:
			case FIL_PAGE_RTREE:
			case FIL_PAGE_UNDO_LOG:
			case FIL_PAGE_INODE:
			case FIL_PAGE_IBUF_FREE_LIST:
			case FIL_PAGE_TYPE_ALLOCATED:
			case FIL_PAGE_TYPE_SYS:
			case FIL_PAGE_TYPE_TRX_SYS:
			case FIL_PAGE_TYPE_BLOB:
			case FIL_PAGE_TYPE_ZBLOB:
			case FIL_PAGE_TYPE_ZBLOB2:
				break;
			case FIL_PAGE_TYPE_FSP_HDR:
			case FIL_PAGE_TYPE_XDES:
			case FIL_PAGE_IBUF_BITMAP:
				/* These pages should have
				predetermined page numbers
				(see above). */
			default:
				reset_type = FIL_PAGE_TYPE_UNKNOWN;
				break;
			}
		}

		if (UNIV_UNLIKELY(page_type != reset_type)) {
			ib::info()
				<< "Resetting invalid page "
				<< block->page.id() << " type "
				<< page_type << " to "
				<< reset_type << " when flushing.";
			fil_page_set_type(page, reset_type);
		}
	}

	uint32_t checksum = BUF_NO_CHECKSUM_MAGIC;

	switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
	case SRV_CHECKSUM_ALGORITHM_INNODB:
	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
		checksum = buf_calc_page_new_checksum(page);
		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
				checksum);
		/* With the InnoDB checksum, we overwrite the first 4 bytes of
		the end lsn field to store the old formula checksum. Since it
		depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
		be calculated after storing the new formula checksum. */
		checksum = buf_calc_page_old_checksum(page);
		break;
	case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
	case SRV_CHECKSUM_ALGORITHM_CRC32:
	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
		/* In other cases we write the same checksum to both fields. */
		checksum = buf_calc_page_crc32(page);
		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
				checksum);
		break;
	case SRV_CHECKSUM_ALGORITHM_NONE:
	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
				checksum);
		break;
		/* no default so the compiler will emit a warning if
		new enum is added and not handled here */
	}

	mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
			checksum);
}
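
/* Illustrative layout note (not part of the original source): for the
legacy (non-full_crc32) formats the function above maintains two
checksum fields. With srv_checksum_algorithm=innodb on a 16KiB page:

  - the "new formula" checksum goes into the 4-byte header field
    FIL_PAGE_SPACE_OR_CHKSUM (offset 0);
  - the "old formula" checksum goes into the first 4 bytes of the
    8-byte trailer FIL_PAGE_END_LSN_OLD_CHKSUM (offset 16384 - 8);
  - the memcpy_aligned<8> above copies FIL_PAGE_LSN into the trailer,
    leaving the low 4 bytes of the LSN in the last 4 bytes of the page,
    which lets readers detect torn writes by comparing the two LSN
    copies.

With crc32 the same CRC value is written to both fields, and with none
the magic BUF_NO_CHECKSUM_MAGIC is written instead. */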

/** Reserve a buffer for compression.
@param[in,out]  slot    reserved slot */
static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
{
  if (slot->comp_buf)
    return;
  /* Both Snappy and LZO compression methods require that the output
  buffer be bigger than input buffer. Adjust the allocated size. */
  ulint size= srv_page_size;
#ifdef HAVE_LZO
  size+= LZO1X_1_15_MEM_COMPRESS;
#elif defined HAVE_SNAPPY
  size= snappy_max_compressed_length(size);
#endif
  slot->comp_buf= static_cast<byte*>(aligned_malloc(size, srv_page_size));
}
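
/* Illustrative sizing example (not part of the original source): for
srv_page_size == 16384, snappy_max_compressed_length(16384) evaluates
to 32 + 16384 + 16384/6 = 19146 bytes, i.e. the allocation covers the
worst case rather than assuming compression always shrinks the page.
For LZO, LZO1X_1_15_MEM_COMPRESS bytes of working memory are reserved
after the page-sized output area. */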

/** Encrypt a buffer of temporary tablespace
@param[in]      offset  Page offset
@param[in]      s       Page to encrypt
@param[in,out]  d       Output buffer
@return encrypted buffer or NULL */
static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d)
{
  /* Calculate the start offset in a page */
  uint srclen= static_cast<uint>(srv_page_size) -
    (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION +
     FIL_PAGE_FCRC32_CHECKSUM);
  const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
  byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;

  memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);

  if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true))
    return NULL;

  const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
  mach_write_to_4(d + payload, ut_crc32(d, payload));

  srv_stats.pages_encrypted.inc();
  srv_stats.n_temp_blocks_encrypted.inc();
  return d;
}
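
/* Illustrative layout note (not part of the original source): with
FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION == 26 and a 16KiB page, the
function above leaves the first 26 header bytes in the clear, encrypts
bytes 26 .. 16379 (srclen = 16384 - 26 - 4), and then stores a CRC of
the whole resulting page in the last 4 bytes, so the page remains
verifiable without decrypting it first. */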

/** Encryption and page_compression hook that is called just before
a page is written to disk.
@param[in,out]  space   tablespace
@param[in,out]  bpage   buffer page
@param[in]      s       physical page frame that is being encrypted
@param[in,out]  size    payload size in bytes
@return page frame to be written to file
(may be src_frame or an encrypted/compressed copy of it) */
static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s,
                              size_t *size)
{
  ut_ad(bpage->status != buf_page_t::FREED);
  ut_ad(space->id == bpage->id().space());

  ut_d(fil_page_type_validate(space, s));
  const uint32_t page_no= bpage->id().page_no();

  switch (page_no) {
  case TRX_SYS_PAGE_NO:
    if (bpage->id().space() != TRX_SYS_SPACE)
      break;
    /* The TRX_SYS page is neither encrypted nor compressed, because
    it contains the address of the doublewrite buffer. */
    /* fall through */
  case 0:
    /* Page 0 of a tablespace is not encrypted/compressed */
    return s;
  }

  fil_space_crypt_t *crypt_data= space->crypt_data;
  bool encrypted, page_compressed;
  if (space->purpose == FIL_TYPE_TEMPORARY)
  {
    ut_ad(!crypt_data);
    encrypted= innodb_encrypt_temporary_tables;
    page_compressed= false;
  }
  else
  {
    encrypted= crypt_data && !crypt_data->not_encrypted() &&
      crypt_data->type != CRYPT_SCHEME_UNENCRYPTED &&
      (!crypt_data->is_default_encryption() || srv_encrypt_tables);
    page_compressed= space->is_compressed();
  }

  const bool full_crc32= space->full_crc32();

  if (!encrypted && !page_compressed)
  {
    /* No need to encrypt or compress. Clear key-version & crypt-checksum. */
    static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment");
    static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2,
                  "not perfect alignment");
    if (full_crc32)
      memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4);
    else
      memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
    return s;
  }

  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
  static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
  if (full_crc32)
    memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN,
                      FIL_PAGE_LSN + 4 + s, 4);

  ut_ad(!bpage->zip_size() || !page_compressed);
  /* Find free slot from temporary memory array */
  buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve();
  ut_a(slot);
  slot->allocate();
  slot->out_buf= NULL;
  bpage->slot= slot;

  byte *d= slot->crypt_buf;

  if (!page_compressed)
  {
not_compressed:
    byte *tmp= space->purpose == FIL_TYPE_TEMPORARY
      ? buf_tmp_page_encrypt(page_no, s, d)
      : fil_space_encrypt(space, page_no, s, d);

    slot->out_buf= d= tmp;

    ut_d(fil_page_type_validate(space, tmp));
  }
  else
  {
    ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
    /* First we compress the page content */
    buf_tmp_reserve_compression_buf(slot);
    byte *tmp= slot->comp_buf;
    ulint len= fil_page_compress(s, tmp, space->flags,
                                 fil_space_get_block_size(space, page_no),
                                 encrypted);

    if (!len)
      goto not_compressed;

    *size= len;

    if (full_crc32)
    {
      ut_d(bool compressed = false);
      len= buf_page_full_crc32_size(tmp,
#ifdef UNIV_DEBUG
                                    &compressed,
#else
                                    NULL,
#endif
                                    NULL);
      ut_ad(compressed);
    }

    /* Workaround for MDEV-15527. */
    memset(tmp + len, 0 , srv_page_size - len);
    ut_d(fil_page_type_validate(space, tmp));

    if (encrypted)
      tmp = fil_space_encrypt(space, page_no, tmp, d);

    if (full_crc32)
    {
      static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
      mach_write_to_4(tmp + len - 4, ut_crc32(tmp, len - 4));
      ut_ad(!buf_page_is_corrupted(true, tmp, space->flags));
    }

    slot->out_buf= d= tmp;
  }

  ut_d(fil_page_type_validate(space, d));
  return d;
}
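
/* Summary of the paths above (added commentary, not in the original
source):

  plain      -> key version and crypt checksum cleared; frame returned
                unchanged
  encrypted  -> fil_space_encrypt() (or buf_tmp_page_encrypt() for the
                temporary tablespace) into slot->crypt_buf
  compressed -> fil_page_compress() into slot->comp_buf, padding zeroed
                (MDEV-15527), then optionally encrypted as well

Page 0 of every tablespace and the TRX_SYS page of the system
tablespace always take the plain path, because they must stay readable
before any encryption keys or the doublewrite buffer location are
known. */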

/** Free a page whose underlying file page has been freed. */
inline void buf_pool_t::release_freed_page(buf_page_t *bpage)
{
  ut_ad(bpage->in_file());
  const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE;
  mysql_mutex_lock(&mutex);
  bpage->set_io_fix(BUF_IO_NONE);
  bpage->status= buf_page_t::NORMAL;
  mysql_mutex_lock(&flush_list_mutex);
  ut_d(const lsn_t oldest_modification= bpage->oldest_modification();)
  if (fsp_is_system_temporary(bpage->id().space()))
  {
    ut_ad(uncompressed);
    ut_ad(oldest_modification == 2);
  }
  else
  {
    ut_ad(oldest_modification > 2);
    delete_from_flush_list(bpage, false);
  }
  bpage->clear_oldest_modification();
  mysql_mutex_unlock(&flush_list_mutex);

  if (uncompressed)
    rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
                          BUF_IO_WRITE);

  buf_LRU_free_page(bpage, true);
  mysql_mutex_unlock(&mutex);
}

/** Write a flushable page from buf_pool to a file.
buf_pool.mutex must be held.
@param bpage       buffer control block
@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
@param space       tablespace
@return whether the page was flushed and buf_pool.mutex was released */
static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
{
  ut_ad(bpage->in_file());
  ut_ad(bpage->ready_for_flush());
  ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
        (space == fil_system.temp_space));
  ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
        space->atomic_write_supported);
  ut_ad(space->referenced());
  ut_ad(lru || space != fil_system.temp_space);

  rw_lock_t *rw_lock;

  if (bpage->state() != BUF_BLOCK_FILE_PAGE)
    rw_lock= nullptr;
  else
  {
    rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
    if (!rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE))
      return false;
  }

  bpage->set_io_fix(BUF_IO_WRITE);
  /* Because bpage->status can only be changed while buf_block_t
  exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages
  without first allocating the uncompressed page frame. Such
  allocation cannot be completed due to our io_fix. So, bpage->status
  is protected even if !rw_lock. */
  const auto status= bpage->status;

  if (status != buf_page_t::FREED)
  {
    if (lru)
      buf_pool.n_flush_LRU_++;
    else
      buf_pool.n_flush_list_++;
    buf_flush_page_count++;
  }

  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);

  /* We are holding rw_lock = buf_block_t::lock in SX mode except if
  this is a ROW_FORMAT=COMPRESSED page whose uncompressed page frame
  has been evicted from the buffer pool.

  Apart from possible rw_lock protection, bpage is also protected by
  io_fix and oldest_modification()!=0. Thus, it cannot be relocated in
  the buffer pool or removed from flush_list or LRU_list. */

  DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
                        lru ? "LRU" : "flush_list",
                        bpage->id().space(), bpage->id().page_no()));
  ut_ad(bpage->io_fix() == BUF_IO_WRITE);
  ut_d(const lsn_t oldest_modification= bpage->oldest_modification());
  ut_ad(space == fil_system.temp_space
        ? oldest_modification == 2
        : oldest_modification > 2);
  ut_ad(bpage->state() ==
        (rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE));
  ut_ad(ULINT_UNDEFINED >
        (lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_));
  mysql_mutex_unlock(&buf_pool.mutex);

  buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
  page_t *frame= bpage->zip.data;

  if (status == buf_page_t::FREED)
    buf_pool.release_freed_page(&block->page);
  else
  {
    space->reacquire();
    ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH);
    size_t size;
#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
    size_t orig_size;
#endif
    IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;

    if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
    {
      ut_ad(!space->full_crc32());
      ut_ad(!space->is_compressed()); /* not page_compressed */
      size= bpage->zip_size();
#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
      orig_size= size;
#endif
      buf_flush_update_zip_checksum(frame, size);
      frame= buf_page_encrypt(space, bpage, frame, &size);
      ut_ad(size == bpage->zip_size());
    }
    else
    {
      byte *page= block->frame;
      size= block->physical_size();
#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
      orig_size= size;
#endif

      if (space->full_crc32())
      {
        /* innodb_checksum_algorithm=full_crc32 is not implemented for
        ROW_FORMAT=COMPRESSED pages. */
        ut_ad(!frame);
        page= buf_page_encrypt(space, bpage, page, &size);
        buf_flush_init_for_writing(block, page, nullptr, true);
      }
      else
      {
        buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr,
                                   false);
        page= buf_page_encrypt(space, bpage, frame ? frame : page, &size);
      }

#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
      if (size != orig_size && space->punch_hole)
        type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
#endif
      frame=page;
    }

    ut_ad(status == bpage->status);
    ut_ad(oldest_modification == bpage->oldest_modification());

    if (status != buf_page_t::NORMAL || !space->use_doublewrite())
    {
      if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
      {
        const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
                                          (FIL_PAGE_LSN + (frame ? frame
                                                           : block->frame)));
        ut_ad(lsn >= oldest_modification);
        if (lsn > log_sys.get_flushed_lsn())
          log_write_up_to(lsn, true);
      }
      space->io(IORequest(type, bpage),
                bpage->physical_offset(), size, frame, bpage);
    }
    else
      buf_dblwr.add_to_batch(IORequest(bpage, space->chain.start, type), size);
  }

  /* Increment the I/O operation count used for selecting LRU policy. */
  buf_LRU_stat_inc_io();
  return true;
}
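
/* Note (added commentary, not in the original source): the
log_write_up_to() call above enforces the write-ahead-logging protocol
for writes that bypass the doublewrite buffer: the redo log must be
durable up to the page's FIL_PAGE_LSN before the new page image may
overwrite the old one on disk. For writes that go through
buf_dblwr.add_to_batch(), the doublewrite code is expected to perform
the equivalent durability check before issuing the batch. */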

/** Check whether a page can be flushed from the buf_pool.
@param id          page identifier
@param fold        id.fold()
@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
@return whether the page can be flushed */
static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  ut_ad(fold == id.fold());

  buf_page_t *bpage= buf_pool.page_hash_get_low(id, fold);

  if (!bpage || buf_pool.watch_is_sentinel(*bpage))
    return false;

  /* We avoid flushing 'non-old' blocks in an LRU flush, because the
  flushed blocks are soon freed */
  if (lru && !bpage->is_old())
    return false;

  return bpage->oldest_modification() > 1 && bpage->ready_for_flush();
}

/** Check which neighbors of a page can be flushed from the buf_pool.
@param space       tablespace
@param id          page identifier of a dirty page
@param contiguous  whether to consider contiguous areas of pages
@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
@return last page number that can be flushed */
static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
                                           page_id_t &id, bool contiguous,
                                           bool lru)
{
  ut_ad(id.page_no() < space.size +
        (space.physical_size() == 2048 ? 1
         : space.physical_size() == 1024 ? 3 : 0));
  /* When flushed, dirty blocks are searched in neighborhoods of this
  size, and flushed along with the original page. */
  const ulint s= buf_pool.curr_size / 16;
  const uint32_t read_ahead= buf_pool.read_ahead_area;
  const uint32_t buf_flush_area= read_ahead > s
    ? static_cast<uint32_t>(s) : read_ahead;
  page_id_t low= id - (id.page_no() % buf_flush_area);
  page_id_t high= low + buf_flush_area;
  high.set_page_no(std::min(high.page_no(), space.last_page_number()));

  if (!contiguous)
  {
    high= std::max(id + 1, high);
    id= low;
    return high;
  }

  /* Determine the contiguous dirty area around id. */
  const ulint id_fold= id.fold();

  mysql_mutex_lock(&buf_pool.mutex);

  if (id > low)
  {
    ulint fold= id_fold;
    for (page_id_t i= id - 1;; --i)
    {
      fold--;
      if (!buf_flush_check_neighbor(i, fold, lru))
      {
        low= i + 1;
        break;
      }
      if (i == low)
        break;
    }
  }

  page_id_t i= id;
  id= low;
  ulint fold= id_fold;
  while (++i < high)
  {
    ++fold;
    if (!buf_flush_check_neighbor(i, fold, lru))
      break;
  }

  mysql_mutex_unlock(&buf_pool.mutex);
  return i;
}
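
/* Worked example (not part of the original source): assume
buf_pool.curr_size == 8192 pages and buf_pool.read_ahead_area == 64.
Then s = 8192 / 16 = 512 and buf_flush_area = min(64, 512) = 64 pages.
For id = (space, 100), low = (space, 100 - 100 % 64) = (space, 64) and
high = (space, 128), capped at the last page of the tablespace. With
contiguous == true, the [low, high) window is then shrunk to the
maximal run of flushable pages around id; otherwise the whole window,
at least [low, id + 1), is returned. */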

MY_ATTRIBUTE((nonnull))
/** Write zeroes or punch a hole for the freed ranges of pages when
innodb_immediate_scrub_data_uncompressed is enabled or the file system
supports hole punching.
@param space   tablespace which may contain ranges of freed pages */
static void buf_flush_freed_pages(fil_space_t *space)
{
  const bool punch_hole= space->punch_hole;
  if (!srv_immediate_scrub_data_uncompressed && !punch_hole)
    return;
  lsn_t flush_to_disk_lsn= log_sys.get_flushed_lsn();

  std::unique_lock<std::mutex> freed_lock(space->freed_range_mutex);
  if (space->freed_ranges.empty()
      || flush_to_disk_lsn < space->get_last_freed_lsn())
  {
    freed_lock.unlock();
    return;
  }

  range_set freed_ranges= std::move(space->freed_ranges);
  freed_lock.unlock();

  for (const auto &range : freed_ranges)
  {
    const ulint physical_size= space->physical_size();

    if (punch_hole)
    {
      space->reacquire();
      space->io(IORequest(IORequest::PUNCH_RANGE),
                          os_offset_t{range.first} * physical_size,
                          (range.last - range.first + 1) * physical_size,
                          nullptr);
    }
    else if (srv_immediate_scrub_data_uncompressed)
    {
      for (os_offset_t i= range.first; i <= range.last; i++)
      {
        space->reacquire();
        space->io(IORequest(IORequest::WRITE_ASYNC),
                  i * physical_size, physical_size,
                  const_cast<byte*>(field_ref_zero));
      }
    }
    buf_pool.stat.n_pages_written+= (range.last - range.first + 1);
  }
}
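
/* Worked example (not part of the original source): with
physical_size == 16384 and a freed range {first = 100, last = 103}, the
punch_hole branch issues a single 4 * 16384 = 64KiB PUNCH_RANGE request
at file offset 100 * 16384 = 1638400, while the scrubbing branch
instead writes four separate 16KiB pages of field_ref_zero. Either way
the pages are counted in buf_pool.stat.n_pages_written. */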

/** Flush to disk all flushable pages within the flush area,
and also write zeroes or punch a hole for the freed ranges of pages.
@param space       tablespace
@param page_id     page identifier
@param contiguous  whether to consider contiguous areas of pages
@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
@param n_flushed   number of pages flushed so far in this batch
@param n_to_flush  maximum number of pages we are allowed to flush
@return number of pages flushed */
static ulint buf_flush_try_neighbors(fil_space_t *space,
                                     const page_id_t page_id,
                                     bool contiguous, bool lru,
                                     ulint n_flushed, ulint n_to_flush)
{
  ut_ad(space->id == page_id.space());

  ulint count= 0;
  page_id_t id= page_id;
  page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru);

  ut_ad(page_id >= id);
  ut_ad(page_id < high);

  for (ulint id_fold= id.fold(); id < high && !space->is_stopping();
       ++id, ++id_fold)
  {
    if (count + n_flushed >= n_to_flush)
    {
      if (id > page_id)
        break;
      /* If the page whose neighbors we are flushing has not been
      flushed yet, we must flush the page that we selected originally. */
      id= page_id;
      id_fold= id.fold();
    }

    mysql_mutex_lock(&buf_pool.mutex);

    if (buf_page_t *bpage= buf_pool.page_hash_get_low(id, id_fold))
    {
      ut_ad(bpage->in_file());
      /* We avoid flushing 'non-old' blocks in an LRU flush,
      because the flushed blocks are soon freed */
      if (!lru || id == page_id || bpage->is_old())
      {
        if (!buf_pool.watch_is_sentinel(*bpage) &&
            bpage->oldest_modification() > 1 &&
            bpage->ready_for_flush() && buf_flush_page(bpage, lru, space))
        {
          ++count;
          continue;
        }
      }
    }

    mysql_mutex_unlock(&buf_pool.mutex);
  }

  if (auto n= count - 1)
  {
    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
                                 MONITOR_FLUSH_NEIGHBOR_COUNT,
                                 MONITOR_FLUSH_NEIGHBOR_PAGES, n);
  }

  return count;
}

/*******************************************************************//**
This utility moves the uncompressed frames of pages to the free list.
Note that this function does not actually flush any data to disk. It
just detaches the uncompressed frames from the compressed pages at the
tail of the unzip_LRU and puts those freed frames in the free list.
Note that it is a best effort attempt and it is not guaranteed that
after a call to this function there will be 'max' blocks in the free
list.
@param[in]	max		desired number of blocks in the free_list
@return number of blocks moved to the free list. */
static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
{
	ulint		scanned = 0;
	ulint		count = 0;

	mysql_mutex_assert_owner(&buf_pool.mutex);

	buf_block_t*	block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);

	while (block
	       && count < max
	       && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth
	       && UT_LIST_GET_LEN(buf_pool.unzip_LRU)
	       > UT_LIST_GET_LEN(buf_pool.LRU) / 10) {

		++scanned;
		if (buf_LRU_free_page(&block->page, false)) {
			/* Block was freed. buf_pool.mutex potentially
			released and reacquired */
			++count;
			block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
		} else {
			block = UT_LIST_GET_PREV(unzip_LRU, block);
		}
	}

	mysql_mutex_assert_owner(&buf_pool.mutex);

	if (scanned) {
		MONITOR_INC_VALUE_CUMULATIVE(
			MONITOR_LRU_BATCH_SCANNED,
			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
			scanned);
	}

	return(count);
}

/** Start writing out pages for a tablespace.
@param id   tablespace identifier
@return tablespace
@retval nullptr if the pages for this tablespace should be discarded */
static fil_space_t *buf_flush_space(const uint32_t id)
{
  fil_space_t *space= fil_space_t::get(id);
  if (space)
    buf_flush_freed_pages(space);
  return space;
}

struct flush_counters_t
{
  /** number of dirty pages flushed */
  ulint flushed;
  /** number of clean pages evicted */
  ulint evicted;
};

/** Try to discard a dirty page.
@param bpage      dirty page whose tablespace is not accessible */
static void buf_flush_discard_page(buf_page_t *bpage)
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
  ut_ad(bpage->in_file());
  ut_ad(bpage->oldest_modification());

  rw_lock_t *rw_lock;

  if (bpage->state() != BUF_BLOCK_FILE_PAGE)
    rw_lock= nullptr;
  else
  {
    rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
    if (!rw_lock_sx_lock_nowait(rw_lock, 0))
      return;
  }

  bpage->status= buf_page_t::NORMAL;
  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  buf_pool.delete_from_flush_list(bpage);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  if (rw_lock)
    rw_lock_sx_unlock(rw_lock);

  buf_LRU_free_page(bpage, true);
}

/** Flush dirty blocks from the end of the LRU list.
@param max   maximum number of blocks to make available in buf_pool.free
@param n     counts of flushed and evicted pages */
static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
{
  ulint scanned= 0;
  ulint free_limit= srv_LRU_scan_depth;

  mysql_mutex_assert_owner(&buf_pool.mutex);
  if (buf_pool.withdraw_target && buf_pool.curr_size < buf_pool.old_size)
    free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw);

  const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
    ? 0 : srv_flush_neighbors;
  fil_space_t *space= nullptr;
  uint32_t last_space_id= FIL_NULL;
  static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
  static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");

  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
       bpage &&
       ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
         UT_LIST_GET_LEN(buf_pool.free) < free_limit &&
         n->flushed + n->evicted < max) ||
        recv_recovery_is_on()); ++scanned)
  {
    buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
    const lsn_t oldest_modification= bpage->oldest_modification();
    buf_pool.lru_hp.set(prev);

    if (oldest_modification <= 1 && bpage->can_relocate())
    {
      /* block is ready for eviction i.e., it is clean and is not
      IO-fixed or buffer fixed. */
      if (buf_LRU_free_page(bpage, true))
        ++n->evicted;
    }
    else if (oldest_modification > 1 && bpage->ready_for_flush())
    {
      /* Block is ready for flush. Dispatch an IO request. The IO
      helper thread will put it on free list in IO completion routine. */
      const page_id_t page_id(bpage->id());
      const uint32_t space_id= page_id.space();
      if (!space || space->id != space_id)
      {
        if (last_space_id != space_id)
        {
          if (space)
            space->release();
          space= buf_flush_space(space_id);
          last_space_id= space_id;
        }
        else
          ut_ad(!space);
      }
      else if (space->is_stopping())
      {
        space->release();
        space= nullptr;
      }

      if (!space)
        buf_flush_discard_page(bpage);
      else if (neighbors && space->is_rotational())
      {
        mysql_mutex_unlock(&buf_pool.mutex);
        n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
                                             true, n->flushed, max);
reacquire_mutex:
        mysql_mutex_lock(&buf_pool.mutex);
      }
      else if (buf_flush_page(bpage, true, space))
      {
        ++n->flushed;
        goto reacquire_mutex;
      }
    }
    else
      /* Can't evict or dispatch this block. Go to previous. */
      ut_ad(buf_pool.lru_hp.is_hp(prev));
    bpage= buf_pool.lru_hp.get();
  }

  buf_pool.lru_hp.set(nullptr);

  if (space)
    space->release();

  /* We keep track of all flushes happening as part of LRU flush. When
  estimating the desired rate at which flush_list should be flushed,
  we factor in this value. */
  buf_lru_flush_page_count+= n->flushed;

  mysql_mutex_assert_owner(&buf_pool.mutex);

  if (scanned)
    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
                                 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
                                 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
                                 scanned);
}

/** Flush and move pages from LRU or unzip_LRU list to the free list.
Whether LRU or unzip_LRU is used depends on the state of the system.
@param max   maximum number of blocks to make available in buf_pool.free
@return number of flushed pages */
static ulint buf_do_LRU_batch(ulint max)
{
  const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU()
    ? buf_free_from_unzip_LRU_list_batch(max)
    : 0;
  flush_counters_t n;
  n.flushed= 0;
  n.evicted= n_unzip_LRU_evicted;
  buf_flush_LRU_list_batch(max, &n);

  if (const ulint evicted= n.evicted - n_unzip_LRU_evicted)
  {
    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
                                 MONITOR_LRU_BATCH_EVICT_COUNT,
                                 MONITOR_LRU_BATCH_EVICT_PAGES,
                                 evicted);
  }

  return n.flushed;
}

/** This utility flushes dirty blocks from the end of the flush_list.
The calling thread is not allowed to own any latches on pages!
@param max_n    maximum number of blocks to flush
@param lsn      once an oldest_modification >= lsn is found, terminate the batch
@return number of blocks for which the write request was queued */
static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
{
  ulint count= 0;
  ulint scanned= 0;

  mysql_mutex_assert_owner(&buf_pool.mutex);

  const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
    ? 0 : srv_flush_neighbors;
  fil_space_t *space= nullptr;
  uint32_t last_space_id= FIL_NULL;
  static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
  static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");

  /* Start from the end of the list looking for a suitable block to be
  flushed. */
  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);

  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
       bpage && len && count < max_n; ++scanned, len--)
  {
    const lsn_t oldest_modification= bpage->oldest_modification();
    if (oldest_modification >= lsn)
      break;
    ut_ad(bpage->in_file());

    buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);

    if (oldest_modification == 1)
    {
      buf_pool.delete_from_flush_list(bpage);
    skip:
      bpage= prev;
      continue;
    }

    ut_ad(oldest_modification > 2);
    ut_ad(bpage->in_file());

    if (!bpage->ready_for_flush())
      goto skip;

    /* In order not to degenerate this scan to O(n*n) we attempt to
    preserve the pointer position. Any thread that would remove 'prev'
    from buf_pool.flush_list must adjust the hazard pointer.

    Note: A concurrent execution of buf_flush_list_space() may
    terminate this scan prematurely. The buf_pool.n_flush_list()
    should prevent multiple threads from executing
    buf_do_flush_list_batch() concurrently,
    but buf_flush_list_space() is ignoring that. */
    buf_pool.flush_hp.set(prev);
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);

    const page_id_t page_id(bpage->id());
    const uint32_t space_id= page_id.space();
    if (!space || space->id != space_id)
    {
      if (last_space_id != space_id)
      {
        if (space)
          space->release();
        space= buf_flush_space(space_id);
        last_space_id= space_id;
      }
      else
        ut_ad(!space);
    }
    else if (space->is_stopping())
    {
      space->release();
      space= nullptr;
    }

    if (!space)
      buf_flush_discard_page(bpage);
    else if (neighbors && space->is_rotational())
    {
      mysql_mutex_unlock(&buf_pool.mutex);
      count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
                                      false, count, max_n);
    reacquire_mutex:
      mysql_mutex_lock(&buf_pool.mutex);
    }
    else if (buf_flush_page(bpage, false, space))
    {
      ++count;
      goto reacquire_mutex;
    }

    mysql_mutex_lock(&buf_pool.flush_list_mutex);
    bpage= buf_pool.flush_hp.get();
  }

  buf_pool.flush_hp.set(nullptr);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  if (space)
    space->release();

  if (scanned)
    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
                                 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
                                 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
                                 scanned);
  if (count)
    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
                                 MONITOR_FLUSH_BATCH_COUNT,
                                 MONITOR_FLUSH_BATCH_PAGES,
                                 count);
  mysql_mutex_assert_owner(&buf_pool.mutex);
  return count;
}

/** Wait until a flush batch ends.
@param lru    true=buf_pool.LRU; false=buf_pool.flush_list */
void buf_flush_wait_batch_end(bool lru)
{
  const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_;

  if (n_flush)
  {
    auto cond= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list;
    tpool::tpool_wait_begin();
    thd_wait_begin(nullptr, THD_WAIT_DISKIO);
    do
      my_cond_wait(cond, &buf_pool.mutex.m_mutex);
    while (n_flush);
    tpool::tpool_wait_end();
    thd_wait_end(nullptr);
    pthread_cond_broadcast(cond);
  }
}
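
/* Note (added commentary, not in the original source): the caller must
hold buf_pool.mutex, which my_cond_wait() above atomically releases
while sleeping and reacquires before re-checking n_flush. The final
pthread_cond_broadcast() forwards the wakeup to any other threads that
may still be blocked on the same condition variable. */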

/** Write out dirty blocks from buf_pool.flush_list.
@param max_n    wished maximum number of blocks flushed
@param lsn      buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */
static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX)
1538 {
1539   ut_ad(lsn);
1540 
1541   if (buf_pool.n_flush_list())
1542     return 0;
1543 
1544   mysql_mutex_lock(&buf_pool.mutex);
1545   const bool running= buf_pool.n_flush_list_ != 0;
1546   /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
1547   while not holding buf_pool.flush_list_mutex */
1548   if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
1549   {
1550     if (!running)
1551       pthread_cond_broadcast(&buf_pool.done_flush_list);
1552     mysql_mutex_unlock(&buf_pool.mutex);
1553     return 0;
1554   }
1555 
1556   buf_pool.n_flush_list_++;
1557   const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
1558   const ulint n_flushing= --buf_pool.n_flush_list_;
1559 
1560   buf_pool.try_LRU_scan= true;
1561 
1562   mysql_mutex_unlock(&buf_pool.mutex);
1563 
1564   if (!n_flushing)
1565     pthread_cond_broadcast(&buf_pool.done_flush_list);
1566 
1567   buf_dblwr.flush_buffered_writes();
1568 
1569   DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
1570   return n_flushed;
1571 }
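
/* Usage sketch (not part of the build): because buf_flush_list()
returns 0 whenever another flush_list batch is in progress, a caller
that must guarantee progress up to a target LSN has to retry after
waiting out the concurrent batch, as the FIXME branch of
buf_flush_wait_flushed() below does:

  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  while (buf_pool.get_oldest_modification(target) < target)
  {
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
    buf_flush_list(srv_max_io_capacity, target);
    buf_flush_wait_batch_end_acquiring_mutex(false);
    mysql_mutex_lock(&buf_pool.flush_list_mutex);
  }
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
*/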

/** Try to flush all the dirty pages that belong to a given tablespace.
@param space       tablespace
@param n_flushed   number of pages written
@return whether the flush for some pages might not have been initiated */
bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
{
  const auto space_id= space->id;
  ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND);

  bool may_have_skipped= false;
  ulint max_n_flush= srv_io_capacity;

  mysql_mutex_lock(&buf_pool.mutex);
  mysql_mutex_lock(&buf_pool.flush_list_mutex);

  bool acquired= space->acquire();
  buf_flush_freed_pages(space);

  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
  {
    ut_d(const auto s= bpage->state());
    ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
          s == BUF_BLOCK_REMOVE_HASH);
    ut_ad(bpage->oldest_modification());
    ut_ad(bpage->in_file());

    buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
    if (bpage->id().space() != space_id);
    else if (bpage->oldest_modification() == 1)
      buf_pool.delete_from_flush_list(bpage);
    else if (!bpage->ready_for_flush())
      may_have_skipped= true;
    else
    {
      /* In order not to degenerate this scan to O(n*n) we attempt to
      preserve the pointer position. Any thread that would remove 'prev'
      from buf_pool.flush_list must adjust the hazard pointer.

      Note: Multiple executions of buf_flush_list_space() may be
      interleaved, and also buf_do_flush_list_batch() may be running
      concurrently. This may terminate our iteration prematurely,
      leading us to return may_have_skipped=true. */
      buf_pool.flush_hp.set(prev);
      mysql_mutex_unlock(&buf_pool.flush_list_mutex);

      if (!acquired)
      {
      was_freed:
        buf_flush_discard_page(bpage);
      }
      else
      {
        if (space->is_stopping())
        {
          space->release();
          acquired= false;
          goto was_freed;
        }
        if (!buf_flush_page(bpage, false, space))
        {
          may_have_skipped= true;
          mysql_mutex_lock(&buf_pool.flush_list_mutex);
          goto next_after_skip;
        }
        if (n_flushed)
          ++*n_flushed;
        if (!--max_n_flush)
        {
          mysql_mutex_lock(&buf_pool.mutex);
          mysql_mutex_lock(&buf_pool.flush_list_mutex);
          may_have_skipped= true;
          break;
        }
        mysql_mutex_lock(&buf_pool.mutex);
      }

      mysql_mutex_lock(&buf_pool.flush_list_mutex);
      if (!buf_pool.flush_hp.is_hp(prev))
        may_have_skipped= true;
    next_after_skip:
      bpage= buf_pool.flush_hp.get();
      continue;
    }

    bpage= prev;
  }

  /* Note: this loop may have been executed concurrently with
  buf_do_flush_list_batch() as well as other threads executing
  buf_flush_list_space(). If that happened, we must return true;
  in buf_do_flush_list_batch() the interleaving merely means that
  we perform less work. */

  buf_pool.flush_hp.set(nullptr);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  buf_pool.try_LRU_scan= true;

  mysql_mutex_unlock(&buf_pool.mutex);

  if (acquired)
    space->release();

  if (space->purpose == FIL_TYPE_IMPORT)
    os_aio_wait_until_no_pending_writes();
  else
    buf_dblwr.flush_buffered_writes();

  return may_have_skipped;
}
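
/* Usage sketch (not part of the build): a true return value only
means "some pages may have been skipped"; a caller that needs the
tablespace fully clean keeps rescanning until nothing was skipped:

  ulint n_flushed= 0;
  while (buf_flush_list_space(space, &n_flushed))
    // Another thread interleaved with our scan, or a page was not
    // ready_for_flush(); rescan until no page may have been skipped.
    os_thread_sleep(1000);
*/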

/** Write out dirty blocks from buf_pool.LRU.
@param max_n    wished maximum number of blocks flushed
@return the number of processed pages
@retval 0 if a buf_pool.LRU batch is already running */
ulint buf_flush_LRU(ulint max_n)
{
  if (buf_pool.n_flush_LRU())
    return 0;

  log_buffer_flush_to_disk(true);

  mysql_mutex_lock(&buf_pool.mutex);
  if (buf_pool.n_flush_LRU_)
  {
    mysql_mutex_unlock(&buf_pool.mutex);
    return 0;
  }
  buf_pool.n_flush_LRU_++;

  ulint n_flushed= buf_do_LRU_batch(max_n);

  const ulint n_flushing= --buf_pool.n_flush_LRU_;

  buf_pool.try_LRU_scan= true;

  mysql_mutex_unlock(&buf_pool.mutex);

  if (!n_flushing)
  {
    pthread_cond_broadcast(&buf_pool.done_flush_LRU);
    pthread_cond_signal(&buf_pool.done_free);
  }

  buf_dblwr.flush_buffered_writes();

  DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed));
  return n_flushed;
}
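
/* Usage sketch (not part of the build): a thread that failed to find
a free block might initiate an LRU batch and then wait; the batch size
of 100 below is an arbitrary example value, and a return value of 0
may simply mean that another LRU batch was already in progress:

  if (!buf_flush_LRU(100))
    buf_flush_wait_batch_end_acquiring_mutex(true);
*/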

/** Initiate a log checkpoint, discarding the start of the log.
@param oldest_lsn   the checkpoint LSN
@param end_lsn      log_sys.get_lsn()
@return true if success, false if a checkpoint write was already running */
static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
{
  ut_ad(!srv_read_only_mode);
  mysql_mutex_assert_owner(&log_sys.mutex);
  ut_ad(oldest_lsn <= end_lsn);
  ut_ad(end_lsn == log_sys.get_lsn());
  ut_ad(!recv_no_log_write);

  ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);

  if (oldest_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
    /* Some log has been written since the previous checkpoint. */;
  else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
    /* MariaDB startup expects the redo log file to be logically empty
    (not even containing a FILE_CHECKPOINT record) after a clean shutdown.
    Perform an extra checkpoint at shutdown. */;
  else
  {
    /* Do nothing, because nothing was logged (other than a
    FILE_CHECKPOINT record) since the previous checkpoint. */
    mysql_mutex_unlock(&log_sys.mutex);
    return true;
  }

  /* Repeat the FILE_MODIFY records after the checkpoint, in case some
  log records between the checkpoint and log_sys.lsn need them.
  Finally, write a FILE_CHECKPOINT record. Redo log apply expects to
  see a FILE_CHECKPOINT after the checkpoint, except on clean
  shutdown, where the log will be empty after the checkpoint.

  It is important that we write out the redo log before any further
  dirty pages are flushed to the tablespace files.  At this point,
  because we hold log_sys.mutex, mtr_t::commit() in other threads will
  be blocked, and no pages can be added to the flush lists. */
  lsn_t flush_lsn= oldest_lsn;

  if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn ||
                      srv_shutdown_state <= SRV_SHUTDOWN_INITIATED))
  {
    flush_lsn= log_sys.get_lsn();
    ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
    mysql_mutex_unlock(&log_sys.mutex);
    log_write_up_to(flush_lsn, true, true);
    mysql_mutex_lock(&log_sys.mutex);
    if (log_sys.last_checkpoint_lsn >= oldest_lsn)
    {
      mysql_mutex_unlock(&log_sys.mutex);
      return true;
    }
  }
  else
    ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);

  ut_ad(log_sys.get_flushed_lsn() >= flush_lsn);

  if (log_sys.n_pending_checkpoint_writes)
  {
    /* A checkpoint write is running */
    mysql_mutex_unlock(&log_sys.mutex);
    return false;
  }

  log_sys.next_checkpoint_lsn= oldest_lsn;
  log_write_checkpoint_info(end_lsn);
  mysql_mutex_assert_not_owner(&log_sys.mutex);

  return true;
}

/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only determines the LSN of the oldest
modification in the pool, and writes that information to the log file.
Use log_make_checkpoint() to also flush the buffer pool.
@retval true if the checkpoint was or had been made
@retval false if a checkpoint write was already running */
static bool log_checkpoint()
{
  if (recv_recovery_is_on())
    recv_sys.apply(true);

  switch (srv_file_flush_method) {
  case SRV_NOSYNC:
  case SRV_O_DIRECT_NO_FSYNC:
    break;
  default:
    fil_flush_file_spaces();
  }

  mysql_mutex_lock(&log_sys.mutex);
  const lsn_t end_lsn= log_sys.get_lsn();
  mysql_mutex_lock(&log_sys.flush_order_mutex);
  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
  mysql_mutex_unlock(&log_sys.flush_order_mutex);
  return log_checkpoint_low(oldest_lsn, end_lsn);
}

/** Make a checkpoint. */
ATTRIBUTE_COLD void log_make_checkpoint()
{
  buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire));
  while (!log_checkpoint());
}

/** Wait for all dirty pages up to an LSN to be written out.
NOTE: The calling thread is not allowed to hold any buffer page latches!
The caller must hold buf_pool.flush_list_mutex. */
static void buf_flush_wait(lsn_t lsn)
{
  ut_ad(lsn <= log_sys.get_lsn());

  while (buf_pool.get_oldest_modification(lsn) < lsn)
  {
    if (buf_flush_sync_lsn < lsn)
    {
      buf_flush_sync_lsn= lsn;
      buf_pool.page_cleaner_set_idle(false);
      pthread_cond_signal(&buf_pool.do_flush_list);
    }
    my_cond_wait(&buf_pool.done_flush_list,
                 &buf_pool.flush_list_mutex.m_mutex);
  }
}

/** Wait until all persistent pages are flushed up to a limit.
@param sync_lsn   buf_pool.get_oldest_modification(LSN_MAX) to wait for */
ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
{
  ut_ad(sync_lsn);
  ut_ad(sync_lsn < LSN_MAX);
  mysql_mutex_assert_not_owner(&log_sys.mutex);
  ut_ad(!srv_read_only_mode);

  if (recv_recovery_is_on())
    recv_sys.apply(true);

  mysql_mutex_lock(&buf_pool.flush_list_mutex);

  if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
  {
    MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
    if (UNIV_UNLIKELY(!buf_page_cleaner_is_active))
    {
      do
      {
        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
        ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
        buf_flush_wait_batch_end_acquiring_mutex(false);
        if (n_pages)
        {
          MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
                                       MONITOR_FLUSH_SYNC_COUNT,
                                       MONITOR_FLUSH_SYNC_PAGES, n_pages);
        }
        mysql_mutex_lock(&buf_pool.flush_list_mutex);
      }
      while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
    }
    else
#endif
    {
      thd_wait_begin(nullptr, THD_WAIT_DISKIO);
      tpool::tpool_wait_begin();
      buf_flush_wait(sync_lsn);
      tpool::tpool_wait_end();
      thd_wait_end(nullptr);
    }
  }

  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn))
  {
    /* If the buffer pool was clean, no log write was guaranteed
    to happen until now. There could be an outstanding FILE_CHECKPOINT
    record from a previous fil_names_clear() call, which we must
    write out before we can advance the checkpoint. */
    if (sync_lsn > log_sys.get_flushed_lsn())
      log_write_up_to(sync_lsn, true);
    log_checkpoint();
  }
}
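
/* Usage sketch (not part of the build): the typical pattern is to
make all changes up to some LSN durable in the data files before
discarding the corresponding redo log, essentially as
log_make_checkpoint() above does:

  const lsn_t lsn= log_sys.get_lsn();
  buf_flush_wait_flushed(lsn);  // every page older than lsn is written
  while (!log_checkpoint());    // now the checkpoint can advance to lsn
*/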

/** Initiate more eager page flushing if the log checkpoint age is too old.
@param lsn      buf_pool.get_oldest_modification(LSN_MAX) target
@param furious  true=furious flushing, false=limit to innodb_io_capacity */
ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
{
  mysql_mutex_assert_not_owner(&log_sys.mutex);
  ut_ad(!srv_read_only_mode);

  if (recv_recovery_is_on())
    recv_sys.apply(true);

  Atomic_relaxed<lsn_t> &limit= furious
    ? buf_flush_sync_lsn : buf_flush_async_lsn;

  if (limit < lsn)
  {
    mysql_mutex_lock(&buf_pool.flush_list_mutex);
    if (limit < lsn)
    {
      limit= lsn;
      buf_pool.page_cleaner_set_idle(false);
      pthread_cond_signal(&buf_pool.do_flush_list);
    }
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
  }
}
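
/* Usage sketch (not part of the build): a caller would typically pick
the flavour by checkpoint age; the members and thresholds below are
indicative only:

  const lsn_t age= log_sys.get_lsn() - log_sys.last_checkpoint_lsn;
  if (age > log_sys.max_checkpoint_age)
    buf_flush_ahead(target, true);   // furious, up to srv_max_io_capacity
  else if (age > log_sys.max_modified_age_async)
    buf_flush_ahead(target, false);  // throttled to innodb_io_capacity
*/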

/** Wait for pending flushes to complete. */
void buf_flush_wait_batch_end_acquiring_mutex(bool lru)
{
  if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list())
  {
    mysql_mutex_lock(&buf_pool.mutex);
    buf_flush_wait_batch_end(lru);
    mysql_mutex_unlock(&buf_pool.mutex);
  }
}

/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
and try to initiate checkpoints until the target is met.
@param lsn   minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
{
  ut_ad(!srv_read_only_mode);

  for (;;)
  {
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);

    if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
    {
      MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
                                   MONITOR_FLUSH_SYNC_COUNT,
                                   MONITOR_FLUSH_SYNC_PAGES, n_flushed);
    }

    switch (srv_file_flush_method) {
    case SRV_NOSYNC:
    case SRV_O_DIRECT_NO_FSYNC:
      break;
    default:
      fil_flush_file_spaces();
    }

    mysql_mutex_lock(&log_sys.mutex);
    const lsn_t newest_lsn= log_sys.get_lsn();
    mysql_mutex_lock(&log_sys.flush_order_mutex);
    mysql_mutex_lock(&buf_pool.flush_list_mutex);
    lsn_t measure= buf_pool.get_oldest_modification(0);
    mysql_mutex_unlock(&log_sys.flush_order_mutex);
    const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;

    if (!recv_recovery_is_on() &&
        checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
    {
      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
      log_checkpoint_low(checkpoint_lsn, newest_lsn);
      mysql_mutex_lock(&buf_pool.flush_list_mutex);
      measure= buf_pool.get_oldest_modification(LSN_MAX);
    }
    else
    {
      mysql_mutex_unlock(&log_sys.mutex);
      if (!measure)
        measure= LSN_MAX;
    }

    mysql_mutex_assert_not_owner(&log_sys.mutex);

    /* After attempting log checkpoint, check if we have reached our target. */
    const lsn_t target= buf_flush_sync_lsn;

    if (measure >= target)
      buf_flush_sync_lsn= 0;
    else if (measure >= buf_flush_async_lsn)
      buf_flush_async_lsn= 0;

    /* wake up buf_flush_wait() */
    pthread_cond_broadcast(&buf_pool.done_flush_list);

    lsn= std::max(lsn, target);

    if (measure >= lsn)
      return;
  }
}

/** Check whether adaptive flushing is recommended, based on how much
of the redo log capacity has been filled.
@param oldest_lsn     buf_pool.get_oldest_modification()
@return true if adaptive flushing is recommended. */
static bool af_needed_for_redo(lsn_t oldest_lsn)
{
  lsn_t age= (log_sys.get_lsn() - oldest_lsn);
  lsn_t af_lwm= static_cast<lsn_t>(srv_adaptive_flushing_lwm *
    static_cast<double>(log_sys.log_capacity) / 100);

  /* if age > af_lwm, adaptive flushing is recommended */
  return (age > af_lwm);
}
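
/* Worked example (illustrative numbers): with
innodb_adaptive_flushing_lwm=10.0 and log_sys.log_capacity at 1 GiB,
af_lwm = 10.0 * (1 << 30) / 100, about 107 MB; adaptive flushing is
recommended once more than that much un-checkpointed redo has
accumulated. */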

/*********************************************************************//**
Calculates if flushing is required based on redo generation rate.
@return percent of io_capacity to flush to manage redo space */
static
ulint
af_get_pct_for_lsn(
/*===============*/
	lsn_t	age)	/*!< in: current age of LSN. */
{
	lsn_t	af_lwm = static_cast<lsn_t>(
		srv_adaptive_flushing_lwm
		* static_cast<double>(log_sys.log_capacity) / 100);

	if (age < af_lwm) {
		/* No adaptive flushing. */
		return(0);
	}

	lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;

	ut_ad(srv_max_io_capacity >= srv_io_capacity);
	return static_cast<ulint>(
		(static_cast<double>(srv_max_io_capacity / srv_io_capacity
				     * lsn_age_factor)
		 * sqrt(static_cast<double>(lsn_age_factor))
		 / 7.5));
}
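
/* Worked example (illustrative numbers): with innodb_io_capacity=200,
innodb_max_io_capacity=2000 and age at 50% of
log_sys.max_modified_age_async, lsn_age_factor = 50 and the function
returns about (2000 / 200 * 50) * sqrt(50) / 7.5, that is 471 percent,
a recommendation of roughly 4.7 times innodb_io_capacity. Note that
the srv_max_io_capacity / srv_io_capacity quotient is evaluated in
integer arithmetic before the conversion to double. */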

/** This function is called approximately once every second by the
page_cleaner thread if innodb_adaptive_flushing=ON.
Based on various factors it decides if there is a need to do flushing.
@return number of pages recommended to be flushed
@param last_pages_in  number of pages flushed in previous batch
@param oldest_lsn     buf_pool.get_oldest_modification(0)
@param dirty_blocks   UT_LIST_GET_LEN(buf_pool.flush_list)
@param dirty_pct      100*flush_list.count / (LRU.count + free.count) */
static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
                                                     lsn_t oldest_lsn,
                                                     ulint dirty_blocks,
                                                     double dirty_pct)
{
	static	lsn_t		prev_lsn = 0;
	static	ulint		sum_pages = 0;
	static	ulint		avg_page_rate = 0;
	static	ulint		n_iterations = 0;
	static	time_t		prev_time;
	lsn_t			lsn_rate;
	ulint			n_pages = 0;

	const lsn_t cur_lsn = log_sys.get_lsn();
	ut_ad(oldest_lsn <= cur_lsn);
	ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
	time_t curr_time = time(nullptr);
	const double max_pct = srv_max_buf_pool_modified_pct;

	if (!prev_lsn || !pct_for_lsn) {
		prev_time = curr_time;
		prev_lsn = cur_lsn;
		if (max_pct > 0.0) {
			dirty_pct /= max_pct;
		}

		n_pages = ulint(dirty_pct * double(srv_io_capacity));
		if (n_pages < dirty_blocks) {
			n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
		}

		return n_pages;
	}

	sum_pages += last_pages_in;

	double	time_elapsed = difftime(curr_time, prev_time);

	/* We update our variables every srv_flushing_avg_loops
	iterations to smooth out transition in workload. */
	if (++n_iterations >= srv_flushing_avg_loops
	    || time_elapsed >= static_cast<double>(srv_flushing_avg_loops)) {

		if (time_elapsed < 1) {
			time_elapsed = 1;
		}

		avg_page_rate = static_cast<ulint>(
			((static_cast<double>(sum_pages)
			  / time_elapsed)
			 + static_cast<double>(avg_page_rate)) / 2);

		/* How much LSN we have generated since last call. */
		lsn_rate = static_cast<lsn_t>(
			static_cast<double>(cur_lsn - prev_lsn)
			/ time_elapsed);

		lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;

		ulint	flush_tm = page_cleaner.flush_time;
		ulint	flush_pass = page_cleaner.flush_pass;

		page_cleaner.flush_time = 0;
		page_cleaner.flush_pass = 0;

		if (flush_pass) {
			flush_tm /= flush_pass;
		}

		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm);
		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass);

		prev_lsn = cur_lsn;
		prev_time = curr_time;

		n_iterations = 0;

		sum_pages = 0;
	}

	const ulint pct_for_dirty = srv_max_dirty_pages_pct_lwm == 0
		? (dirty_pct >= max_pct ? 100 : 0)
		: static_cast<ulint>
		(max_pct > 0.0 ? dirty_pct / max_pct : dirty_pct);
	ulint pct_total = std::max(pct_for_dirty, pct_for_lsn);

	/* Estimate pages to be flushed for the lsn progress */
	lsn_t	target_lsn = oldest_lsn
		+ lsn_avg_rate * buf_flush_lsn_scan_factor;
	ulint	pages_for_lsn = 0;

	mysql_mutex_lock(&buf_pool.flush_list_mutex);

	for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list);
	     b != NULL;
	     b = UT_LIST_GET_PREV(list, b)) {
		if (b->oldest_modification() > target_lsn) {
			break;
		}
		if (++pages_for_lsn >= srv_max_io_capacity) {
			break;
		}
	}
	mysql_mutex_unlock(&buf_pool.flush_list_mutex);

	pages_for_lsn /= buf_flush_lsn_scan_factor;
	if (pages_for_lsn < 1) {
		pages_for_lsn = 1;
	}

	n_pages = (ulint(double(srv_io_capacity) * double(pct_total) / 100.0)
		   + avg_page_rate + pages_for_lsn) / 3;

	if (n_pages > srv_max_io_capacity) {
		n_pages = srv_max_io_capacity;
	}

	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);

	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn);

	MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
	MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);

	return(n_pages);
}
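
/* Worked example (illustrative numbers): with innodb_io_capacity=200,
pct_total=90, avg_page_rate=150 and pages_for_lsn=120, the
recommendation is (200 * 90 / 100 + 150 + 120) / 3 = 150 pages for
this second, comfortably below the srv_max_io_capacity cap. */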

/******************************************************************//**
page_cleaner thread tasked with flushing dirty pages from the buffer
pools. As of now we'll have only one coordinator.
@return a dummy parameter */
static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
{
  my_thread_init();
#ifdef UNIV_PFS_THREAD
  pfs_register_thread(page_cleaner_thread_key);
#endif /* UNIV_PFS_THREAD */
  ut_ad(!srv_read_only_mode);
  ut_ad(buf_page_cleaner_is_active);

  ulint last_pages= 0;
  timespec abstime;
  set_timespec(abstime, 1);

  mysql_mutex_lock(&buf_pool.flush_list_mutex);

  lsn_t lsn_limit;
  ulint last_activity_count= srv_get_activity_count();

  for (;;)
  {
    lsn_limit= buf_flush_sync_lsn;

    if (UNIV_UNLIKELY(lsn_limit != 0))
    {
furious_flush:
      if (UNIV_LIKELY(srv_flush_sync))
      {
        buf_flush_sync_for_checkpoint(lsn_limit);
        last_pages= 0;
        set_timespec(abstime, 1);
        continue;
      }
    }
    else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
      break;

    /* If the page cleaner is idle and there is no work (either all
    dirty pages have been flushed, or adaptive flushing is not
    enabled), then opt for a non-timed wait. */
    if (buf_pool.page_cleaner_idle() &&
        (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
         srv_max_dirty_pages_pct_lwm == 0.0))
      my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex);
    else
      my_cond_timedwait(&buf_pool.do_flush_list,
                        &buf_pool.flush_list_mutex.m_mutex, &abstime);

    set_timespec(abstime, 1);

    lsn_t soft_lsn_limit= buf_flush_async_lsn;
    lsn_limit= buf_flush_sync_lsn;

    if (UNIV_UNLIKELY(lsn_limit != 0))
    {
      if (UNIV_LIKELY(srv_flush_sync))
        goto furious_flush;
    }
    else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
      break;

    const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);

    if (!oldest_lsn)
    {
      if (UNIV_UNLIKELY(lsn_limit != 0))
      {
        buf_flush_sync_lsn= 0;
        /* wake up buf_flush_wait() */
        pthread_cond_broadcast(&buf_pool.done_flush_list);
      }
unemployed:
      buf_flush_async_lsn= 0;
      buf_pool.page_cleaner_set_idle(true);

      DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;);

      mysql_mutex_unlock(&buf_pool.flush_list_mutex);

      if (!recv_recovery_is_on() &&
          !srv_startup_is_before_trx_rollback_phase &&
          srv_operation == SRV_OPERATION_NORMAL)
        log_checkpoint();

      mysql_mutex_lock(&buf_pool.flush_list_mutex);
      continue;
    }

    const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
    ut_ad(dirty_blocks);
    /* We perform dirty reads of the LRU+free list lengths here.
    Division by zero is not possible, because buf_pool.flush_list is
    guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */
    const double dirty_pct= double(dirty_blocks) * 100.0 /
      double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));

    bool idle_flush= false;

    if (lsn_limit || soft_lsn_limit);
    else if (af_needed_for_redo(oldest_lsn));
    else if (srv_max_dirty_pages_pct_lwm != 0.0)
    {
      const ulint activity_count= srv_get_activity_count();
      if (activity_count != last_activity_count)
        last_activity_count= activity_count;
      else if (buf_pool.page_cleaner_idle() && buf_pool.n_pend_reads == 0)
      {
        /* Reaching here means three things:
        - last_activity_count == activity_count: the server is idle
          (no trx_t::commit() activity)
        - the page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm)
        - there are no pending reads, but there are dirty pages to flush */
        idle_flush= true;
        buf_pool.update_last_activity_count(activity_count);
      }

      if (!idle_flush && dirty_pct < srv_max_dirty_pages_pct_lwm)
        goto unemployed;
    }
    else if (dirty_pct < srv_max_buf_pool_modified_pct)
      goto unemployed;

    if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit)
      lsn_limit= buf_flush_sync_lsn= 0;
    if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit)
      soft_lsn_limit= buf_flush_async_lsn= 0;

    buf_pool.page_cleaner_set_idle(false);
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);

    if (!lsn_limit)
      lsn_limit= soft_lsn_limit;

    ulint n_flushed;

    if (UNIV_UNLIKELY(lsn_limit != 0))
    {
      n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit);
      /* wake up buf_flush_wait() */
      pthread_cond_broadcast(&buf_pool.done_flush_list);
      goto try_checkpoint;
    }
    else if (idle_flush || !srv_adaptive_flushing)
    {
      n_flushed= buf_flush_list(srv_io_capacity);
try_checkpoint:
      if (n_flushed)
      {
        MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
                                     MONITOR_FLUSH_BACKGROUND_COUNT,
                                     MONITOR_FLUSH_BACKGROUND_PAGES,
                                     n_flushed);
do_checkpoint:
        /* The periodic log_checkpoint() call here makes it harder to
        reproduce bugs in crash recovery or mariabackup --prepare, or
        in code that writes the redo log records. Omitting the call
        here should not affect correctness, because log_free_check()
        should still be invoking checkpoints when needed. */
        DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;);

        if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL)
          log_checkpoint();
      }
    }
    else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages,
                                                              oldest_lsn,
                                                              dirty_blocks,
                                                              dirty_pct))
    {
      page_cleaner.flush_pass++;
      const ulint tm= ut_time_ms();
      last_pages= n_flushed= buf_flush_list(n);
      page_cleaner.flush_time+= ut_time_ms() - tm;

      if (n_flushed)
      {
        MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
                                     MONITOR_FLUSH_ADAPTIVE_COUNT,
                                     MONITOR_FLUSH_ADAPTIVE_PAGES,
                                     n_flushed);
        goto do_checkpoint;
      }
    }
    else if (buf_flush_async_lsn <= oldest_lsn)
    {
      mysql_mutex_lock(&buf_pool.flush_list_mutex);
      goto unemployed;
    }

#ifdef UNIV_DEBUG
    while (innodb_page_cleaner_disabled_debug && !buf_flush_sync_lsn &&
           srv_shutdown_state == SRV_SHUTDOWN_NONE)
      os_thread_sleep(100000);
#endif /* UNIV_DEBUG */

#ifndef DBUG_OFF
next:
#endif /* !DBUG_OFF */
    mysql_mutex_lock(&buf_pool.flush_list_mutex);

    /* When idle flushing kicks in, the page cleaner is marked active.
    Reset it back to idle, since it was only made active as part of the
    idle flushing stage. */
    if (idle_flush)
      buf_pool.page_cleaner_set_idle(true);
  }

  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  if (srv_fast_shutdown != 2)
  {
    buf_flush_wait_batch_end_acquiring_mutex(true);
    buf_flush_wait_batch_end_acquiring_mutex(false);
  }

  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  lsn_limit= buf_flush_sync_lsn;
  if (UNIV_UNLIKELY(lsn_limit != 0))
    goto furious_flush;
  buf_page_cleaner_is_active= false;
  pthread_cond_broadcast(&buf_pool.done_flush_list);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  my_thread_end();
  /* We count the number of threads in os_thread_exit(). A created
  thread should always use that to exit and not use return() to exit. */
  os_thread_exit();

  OS_THREAD_DUMMY_RETURN;
}

/** Initialize page_cleaner. */
ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
{
  ut_ad(!buf_page_cleaner_is_active);
  ut_ad(srv_operation == SRV_OPERATION_NORMAL ||
        srv_operation == SRV_OPERATION_RESTORE ||
        srv_operation == SRV_OPERATION_RESTORE_EXPORT);
  buf_flush_async_lsn= 0;
  buf_flush_sync_lsn= 0;
  buf_page_cleaner_is_active= true;
  os_thread_create(buf_flush_page_cleaner);
}

#if defined(HAVE_SYSTEMD) && !defined(EMBEDDED_LIBRARY)
/** @return the number of dirty pages in the buffer pool */
static ulint buf_flush_list_length()
{
  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
  return len;
}
#endif

/** Flush the buffer pool on shutdown. */
ATTRIBUTE_COLD void buf_flush_buffer_pool()
{
  ut_ad(!buf_page_cleaner_is_active);
  ut_ad(!buf_flush_sync_lsn);

  service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
                                 "Waiting to flush the buffer pool");

  mysql_mutex_lock(&buf_pool.flush_list_mutex);

  while (buf_pool.get_oldest_modification(0))
  {
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
    buf_flush_list(srv_max_io_capacity);
    if (buf_pool.n_flush_list())
    {
      timespec abstime;
      service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
                                     "Waiting to flush " ULINTPF " pages",
                                     buf_flush_list_length());
      set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2);
      mysql_mutex_lock(&buf_pool.mutex);
      while (buf_pool.n_flush_list_)
        my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
                          &abstime);
      mysql_mutex_unlock(&buf_pool.mutex);
    }
    mysql_mutex_lock(&buf_pool.flush_list_mutex);
  }

  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
  ut_ad(!buf_pool.any_io_pending());
}

/** Synchronously flush dirty blocks during recv_sys_t::apply().
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync_batch(lsn_t lsn)
{
  thd_wait_begin(nullptr, THD_WAIT_DISKIO);
  tpool::tpool_wait_begin();
  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  buf_flush_wait(lsn);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
  tpool::tpool_wait_end();
  thd_wait_end(nullptr);
}

/** Synchronously flush dirty blocks.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync()
{
  ut_ad(!sync_check_iterate(dict_sync_check()));

  if (recv_recovery_is_on())
    recv_sys.apply(true);

  thd_wait_begin(nullptr, THD_WAIT_DISKIO);
  tpool::tpool_wait_begin();
  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  for (;;)
  {
    const lsn_t lsn= log_sys.get_lsn();
    buf_flush_wait(lsn);
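    /* buf_flush_wait() releases buf_pool.flush_list_mutex while
    waiting, so more redo log may have been generated meanwhile;
    repeat until the LSN was stable across one full wait. */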
    if (lsn == log_sys.get_lsn())
      break;
  }
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
  tpool::tpool_wait_end();
  thd_wait_end(nullptr);
}

#ifdef UNIV_DEBUG
/** Functor to validate the flush list. */
struct	Check {
	void operator()(const buf_page_t* elem) const
	{
		ut_ad(elem->oldest_modification());
		ut_ad(!fsp_is_system_temporary(elem->id().space()));
	}
};

/** Validate the flush list. */
static void buf_flush_validate_low()
{
	buf_page_t*		bpage;

	mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);

	ut_list_validate(buf_pool.flush_list, Check());

	bpage = UT_LIST_GET_FIRST(buf_pool.flush_list);

	while (bpage != NULL) {
		const lsn_t	om = bpage->oldest_modification();
		/* A page in buf_pool.flush_list can be in
		BUF_BLOCK_REMOVE_HASH state. This happens when a page
		is in the middle of being relocated. In that case the
		original descriptor can have this state and still be
		in the flush list waiting to acquire the
		buf_pool.flush_list_mutex to complete the relocation. */
		ut_d(const auto s= bpage->state());
		ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE
		      || s == BUF_BLOCK_REMOVE_HASH);
		ut_ad(om == 1 || om > 2);

		bpage = UT_LIST_GET_NEXT(list, bpage);
		ut_ad(om == 1 || !bpage || recv_recovery_is_on()
		      || om >= bpage->oldest_modification());
	}
}

/** Validate the flush list. */
void buf_flush_validate()
{
  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  buf_flush_validate_low();
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
#endif /* UNIV_DEBUG */