1 /*****************************************************************************
2
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2013, 2022, MariaDB Corporation.
5 Copyright (c) 2013, 2014, Fusion-io
6
7 This program is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free Software
9 Foundation; version 2 of the License.
10
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License along with
16 this program; if not, write to the Free Software Foundation, Inc.,
17 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
18
19 *****************************************************************************/
20
21 /**************************************************//**
22 @file buf/buf0flu.cc
23 The database buffer buf_pool flush algorithm
24
25 Created 11/11/1995 Heikki Tuuri
26 *******************************************************/
27
28 #include "univ.i"
29 #include <my_service_manager.h>
30 #include <mysql/service_thd_wait.h>
31 #include <sql_class.h>
32
33 #include "buf0flu.h"
34 #include "buf0buf.h"
35 #include "buf0checksum.h"
36 #include "buf0dblwr.h"
37 #include "srv0start.h"
38 #include "page0zip.h"
39 #include "fil0fil.h"
40 #include "log0crypt.h"
41 #include "srv0mon.h"
42 #include "fil0pagecompress.h"
43 #ifdef HAVE_LZO
44 # include "lzo/lzo1x.h"
45 #elif defined HAVE_SNAPPY
46 # include "snappy-c.h"
47 #endif
48
49 /** Number of pages flushed via LRU. Protected by buf_pool.mutex.
50 Also included in buf_flush_page_count. */
51 ulint buf_lru_flush_page_count;
52
53 /** Number of pages flushed. Protected by buf_pool.mutex. */
54 ulint buf_flush_page_count;
55
56 /** Flag indicating if the page_cleaner is in active state. */
57 bool buf_page_cleaner_is_active;
58
59 /** Factor for scan length to determine n_pages for intended oldest LSN
60 progress */
61 static constexpr ulint buf_flush_lsn_scan_factor = 3;
62
63 /** Average redo generation rate */
64 static lsn_t lsn_avg_rate = 0;
65
66 /** Target oldest_modification for the page cleaner background flushing;
67 writes are protected by buf_pool.flush_list_mutex */
68 static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
69 /** Target oldest_modification for the page cleaner furious flushing;
70 writes are protected by buf_pool.flush_list_mutex */
71 static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;
72
73 #ifdef UNIV_PFS_THREAD
74 mysql_pfs_key_t page_cleaner_thread_key;
75 #endif /* UNIV_PFS_THREAD */
76
77 /** Page cleaner structure */
78 static struct
79 {
80 /** total elapsed time in adaptive flushing, in seconds */
81 ulint flush_time;
82 /** number of adaptive flushing passes */
83 ulint flush_pass;
84 } page_cleaner;
85
86 #ifdef UNIV_DEBUG
87 my_bool innodb_page_cleaner_disabled_debug;
88 #endif /* UNIV_DEBUG */
89
90 /* @} */
91
92 #ifdef UNIV_DEBUG
93 /** Validate the flush list. */
94 static void buf_flush_validate_low();
95
96 /** Validates the flush list some of the time. */
97 static void buf_flush_validate_skip()
98 {
99 /** Try buf_flush_validate_low() every this many times */
100 # define BUF_FLUSH_VALIDATE_SKIP 23
101
102 /** The buf_flush_validate_low() call skip counter.
103 Use a signed type because of the race condition below. */
104 static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
105
106 /* There is a race condition below, but it does not matter,
107 because this call is only for heuristic purposes. We want to
108 reduce the call frequency of the costly buf_flush_validate_low()
109 check in debug builds. */
110 if (--buf_flush_validate_count > 0) {
111 return;
112 }
113
114 buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
115 buf_flush_validate_low();
116 }
117 #endif /* UNIV_DEBUG */
118
119 /** Wake up the page cleaner if needed */
120 void buf_pool_t::page_cleaner_wakeup()
121 {
122 if (!page_cleaner_idle())
123 return;
124 double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
125 double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
126 double pct_lwm= srv_max_dirty_pages_pct_lwm;
127
128 /* If pct_lwm != 0.0, adaptive flushing is enabled.
129 Signal the buf page cleaner thread:
130 - if pct_lwm <= dirty_pct then it will invoke the adaptive flushing flow;
131 - if pct_lwm > dirty_pct then it will invoke the idle flushing flow.
132
133 idle_flushing:
134 dirty_pct < innodb_max_dirty_pages_pct_lwm so it could be an
135 idle flushing use-case.
136
137 Why is last_activity_count not always updated?
138 - let's first understand when the server activity count is updated.
139 - it is updated on commit of a transaction in trx_t::commit(), not
140 when a page is added to the flush list.
141 - page_cleaner_wakeup() is called when a page is added to the flush list.
142
143 - now let's say the first user thread updates the count from X to Y but
144 has yet to commit the transaction (so the activity count is still Y).
145 Follow-up user threads will see the updated count (Y) matching the
146 global server activity count (Y), giving a false impression that
147 the server is idle.
148
149 How to avoid this?
150 - by allowing last_activity_count to be updated when the page cleaner is
151 made active and has work to do. This ensures that the last_activity signal
152 is consumed by the page cleaner before the next one is generated. */
153 if ((pct_lwm != 0.0 && pct_lwm <= dirty_pct) ||
154 (pct_lwm != 0.0 && last_activity_count == srv_get_activity_count()) ||
155 srv_max_buf_pool_modified_pct <= dirty_pct)
156 {
157 page_cleaner_is_idle= false;
158 pthread_cond_signal(&do_flush_list);
159 }
160 }
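
/* A rough illustration of the heuristic above (the numbers are hypothetical):
with UT_LIST_GET_LEN(LRU) + UT_LIST_GET_LEN(free) == 90000 and
UT_LIST_GET_LEN(flush_list) == 9000, dirty_pct evaluates to
9000 * 100.0 / 90000 == 10.0.  With innodb_max_dirty_pages_pct_lwm=5 the first
condition (pct_lwm <= dirty_pct) signals the adaptive flushing flow; with a
low-water mark of 20 the cleaner would only be signalled for idle flushing
(last_activity_count unchanged) or once dirty_pct reaches
srv_max_buf_pool_modified_pct. */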
161
162 inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage)
163 {
164 ut_ad(!fsp_is_system_temporary(bpage->id().space()));
165 mysql_mutex_assert_owner(&flush_list_mutex);
166 flush_hp.adjust(bpage);
167 UT_LIST_REMOVE(flush_list, bpage);
168 }
169
170 /** Insert a modified block into the flush list.
171 @param block modified block
172 @param lsn start LSN of the mini-transaction that modified the block */
173 void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn)
174 {
175 mysql_mutex_assert_not_owner(&mutex);
176 mysql_mutex_assert_owner(&log_sys.flush_order_mutex);
177 ut_ad(lsn > 2);
178 ut_ad(!fsp_is_system_temporary(block->page.id().space()));
179
180 mysql_mutex_lock(&flush_list_mutex);
181 if (ut_d(const lsn_t old=) block->page.oldest_modification())
182 {
183 ut_ad(old == 1);
184 delete_from_flush_list_low(&block->page);
185 }
186 else
187 stat.flush_list_bytes+= block->physical_size();
188 ut_ad(stat.flush_list_bytes <= curr_pool_size);
189
190 block->page.set_oldest_modification(lsn);
191 MEM_CHECK_DEFINED(block->page.zip.data
192 ? block->page.zip.data : block->frame,
193 block->physical_size());
194 UT_LIST_ADD_FIRST(flush_list, &block->page);
195 ut_d(buf_flush_validate_skip());
196 page_cleaner_wakeup();
197 mysql_mutex_unlock(&flush_list_mutex);
198 }
199
200 /** Remove a block from flush_list.
201 @param bpage buffer pool page
202 @param clear whether to invoke buf_page_t::clear_oldest_modification() */
203 void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear)
204 {
205 delete_from_flush_list_low(bpage);
206 stat.flush_list_bytes-= bpage->physical_size();
207 if (clear)
208 bpage->clear_oldest_modification();
209 #ifdef UNIV_DEBUG
210 buf_flush_validate_skip();
211 #endif /* UNIV_DEBUG */
212 }
213
214 /** Remove all dirty pages belonging to a given tablespace when we are
215 deleting the data file of that tablespace.
216 The pages still remain a part of LRU and are evicted from
217 the list as they age towards the tail of the LRU.
218 @param id tablespace identifier */
219 void buf_flush_remove_pages(ulint id)
220 {
221 const page_id_t first(id, 0), end(id + 1, 0);
222 ut_ad(id);
223 mysql_mutex_lock(&buf_pool.mutex);
224
225 for (;;)
226 {
227 bool deferred= false;
228
229 mysql_mutex_lock(&buf_pool.flush_list_mutex);
230
231 for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
232 {
233 ut_d(const auto s= bpage->state());
234 ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
235 s == BUF_BLOCK_REMOVE_HASH);
236 buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
237
238 const page_id_t bpage_id(bpage->id());
239
240 if (bpage_id < first || bpage_id >= end);
241 else if (bpage->io_fix() != BUF_IO_NONE)
242 deferred= true;
243 else
244 buf_pool.delete_from_flush_list(bpage);
245
246 bpage= prev;
247 }
248
249 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
250
251 if (!deferred)
252 break;
253
254 mysql_mutex_unlock(&buf_pool.mutex);
255 os_thread_yield();
256 mysql_mutex_lock(&buf_pool.mutex);
257 buf_flush_wait_batch_end(false);
258 }
259
260 mysql_mutex_unlock(&buf_pool.mutex);
261 }
262
263 /*******************************************************************//**
264 Relocates a buffer control block on the flush_list.
265 Note that it is assumed that the contents of bpage have already been
266 copied to dpage.
267 IMPORTANT: When this function is called bpage and dpage are not
268 exact copies of each other. For example, they both will have different
269 ::state. Also the ::list pointers in dpage may be stale. We need to
270 use the current list node (bpage) to do the list manipulation because
271 the list pointers could have changed between the time that we copied
272 the contents of bpage to the dpage and the flush list manipulation
273 below. */
274 ATTRIBUTE_COLD
275 void
276 buf_flush_relocate_on_flush_list(
277 /*=============================*/
278 buf_page_t* bpage, /*!< in/out: control block being moved */
279 buf_page_t* dpage) /*!< in/out: destination block */
280 {
281 buf_page_t* prev;
282
283 mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
284 ut_ad(!fsp_is_system_temporary(bpage->id().space()));
285
286 const lsn_t lsn = bpage->oldest_modification();
287
288 if (!lsn) {
289 return;
290 }
291
292 ut_ad(lsn == 1 || lsn > 2);
293 ut_ad(dpage->oldest_modification() == lsn);
294
295 /* Important that we adjust the hazard pointer before removing
296 the bpage from the flush list. */
297 buf_pool.flush_hp.adjust(bpage);
298
299 prev = UT_LIST_GET_PREV(list, bpage);
300 UT_LIST_REMOVE(buf_pool.flush_list, bpage);
301
302 bpage->clear_oldest_modification();
303
304 if (lsn == 1) {
305 buf_pool.stat.flush_list_bytes -= dpage->physical_size();
306 dpage->list.prev = nullptr;
307 dpage->list.next = nullptr;
308 dpage->clear_oldest_modification();
309 } else if (prev) {
310 ut_ad(prev->oldest_modification());
311 UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage);
312 } else {
313 UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage);
314 }
315
316 ut_d(buf_flush_validate_low());
317 }
318
319 /** Complete write of a file page from buf_pool.
320 @param request write request */
321 void buf_page_write_complete(const IORequest &request)
322 {
323 ut_ad(request.is_write());
324 ut_ad(!srv_read_only_mode/* ||
325 request.node->space->purpose == FIL_TYPE_TEMPORARY*/);
326 buf_page_t *bpage= request.bpage;
327 ut_ad(bpage);
328 ut_ad(bpage->in_file());
329 /* bpage->io_fix() can only be changed by buf_page_write_complete()
330 and buf_page_read_complete() from BUF_IO_READ or BUF_IO_WRITE */
331 ut_ad(bpage->io_fix() == BUF_IO_WRITE);
332 ut_ad(!buf_dblwr.is_inside(bpage->id()));
333 ut_ad(request.node->space->id == bpage->id().space());
334
335 if (bpage->status == buf_page_t::INIT_ON_FLUSH)
336 bpage->status= buf_page_t::NORMAL;
337 else
338 {
339 ut_ad(bpage->status == buf_page_t::NORMAL);
340 if (request.node->space->use_doublewrite())
341 {
342 ut_ad(request.node->space != fil_system.temp_space);
343 buf_dblwr.write_completed();
344 }
345 }
346
347 if (bpage->slot)
348 {
349 bpage->slot->release();
350 bpage->slot= nullptr;
351 }
352
353 if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
354 buf_page_monitor(bpage, BUF_IO_WRITE);
355 DBUG_PRINT("ib_buf", ("write page %u:%u",
356 bpage->id().space(), bpage->id().page_no()));
357 const bool temp= fsp_is_system_temporary(bpage->id().space());
358
359 mysql_mutex_lock(&buf_pool.mutex);
360 mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
361 buf_pool.stat.n_pages_written++;
362 /* While we do not need any mutex for clearing oldest_modification
363 here, we hope that it will be in the same cache line with io_fix,
364 whose changes must be protected by buf_pool.mutex. */
365 ut_ad(temp || bpage->oldest_modification() > 2);
366 bpage->clear_oldest_modification(temp);
367 ut_ad(bpage->io_fix() == BUF_IO_WRITE);
368 bpage->set_io_fix(BUF_IO_NONE);
369
370 /* Because this thread which does the unlocking might not be the same that
371 did the locking, we use a pass value != 0 in unlock, which simply
372 removes the newest lock debug record, without checking the thread id. */
373 if (bpage->state() == BUF_BLOCK_FILE_PAGE)
374 rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE);
375
376 if (request.is_LRU())
377 {
378 buf_LRU_free_page(bpage, true);
379
380 ut_ad(buf_pool.n_flush_LRU_);
381 if (!--buf_pool.n_flush_LRU_)
382 {
383 pthread_cond_broadcast(&buf_pool.done_flush_LRU);
384 pthread_cond_signal(&buf_pool.done_free);
385 }
386 }
387 else
388 {
389 ut_ad(!temp);
390 ut_ad(buf_pool.n_flush_list_);
391 if (!--buf_pool.n_flush_list_)
392 pthread_cond_broadcast(&buf_pool.done_flush_list);
393 }
394
395 mysql_mutex_unlock(&buf_pool.mutex);
396 }
397
398 /** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
399 @param[in,out] page page to update
400 @param[in] size compressed page size */
401 void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size)
402 {
403 ut_ad(size > 0);
404 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
405 page_zip_calc_checksum(page, size,
406 static_cast<srv_checksum_algorithm_t>
407 (srv_checksum_algorithm)));
408 }
409
410 /** Assign the full crc32 checksum for non-compressed page.
411 @param[in,out] page page to be updated */
412 void buf_flush_assign_full_crc32_checksum(byte* page)
413 {
414 ut_d(bool compressed = false);
415 ut_d(bool corrupted = false);
416 ut_d(const uint size = buf_page_full_crc32_size(page, &compressed,
417 &corrupted));
418 ut_ad(!compressed);
419 ut_ad(!corrupted);
420 ut_ad(size == uint(srv_page_size));
421 const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
422 mach_write_to_4(page + payload, ut_crc32(page, payload));
423 }
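
/* A minimal sketch of how the full_crc32 trailer written above could be
verified; the helper name is hypothetical and not part of this source:

  static bool page_full_crc32_is_valid(const byte *page)
  {
    // the CRC of everything except the last 4 bytes is stored in those bytes
    const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
    return mach_read_from_4(page + payload) == ut_crc32(page, payload);
  }
*/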
424
425 /** Initialize a page for writing to the tablespace.
426 @param[in] block buffer block; NULL if bypassing
427 the buffer pool
428 @param[in,out] page page frame
429 @param[in,out] page_zip_ compressed page, or NULL if
430 uncompressed
431 @param[in] use_full_checksum whether tablespace uses full checksum */
432 void
433 buf_flush_init_for_writing(
434 const buf_block_t* block,
435 byte* page,
436 void* page_zip_,
437 bool use_full_checksum)
438 {
439 if (block != NULL && block->frame != page) {
440 /* If the page is encrypted in full_crc32 format, then the
441 checksum was already stored as part of fil_encrypt_buf() */
442 ut_ad(use_full_checksum);
443 return;
444 }
445
446 ut_ad(block == NULL || block->frame == page);
447 ut_ad(block == NULL || page_zip_ == NULL
448 || &block->page.zip == page_zip_);
449 ut_ad(page);
450
451 if (page_zip_) {
452 page_zip_des_t* page_zip;
453 ulint size;
454
455 page_zip = static_cast<page_zip_des_t*>(page_zip_);
456 size = page_zip_get_size(page_zip);
457
458 ut_ad(size);
459 ut_ad(ut_is_2pow(size));
460 ut_ad(size <= UNIV_ZIP_SIZE_MAX);
461
462 switch (fil_page_get_type(page)) {
463 case FIL_PAGE_TYPE_ALLOCATED:
464 case FIL_PAGE_INODE:
465 case FIL_PAGE_IBUF_BITMAP:
466 case FIL_PAGE_TYPE_FSP_HDR:
467 case FIL_PAGE_TYPE_XDES:
468 /* These are essentially uncompressed pages. */
469 memcpy(page_zip->data, page, size);
470 /* fall through */
471 case FIL_PAGE_TYPE_ZBLOB:
472 case FIL_PAGE_TYPE_ZBLOB2:
473 case FIL_PAGE_INDEX:
474 case FIL_PAGE_RTREE:
475 buf_flush_update_zip_checksum(page_zip->data, size);
476 return;
477 }
478
479 ib::error() << "The compressed page to be written"
480 " seems corrupt:";
481 ut_print_buf(stderr, page, size);
482 fputs("\nInnoDB: Possibly older version of the page:", stderr);
483 ut_print_buf(stderr, page_zip->data, size);
484 putc('\n', stderr);
485 ut_error;
486 }
487
488 if (use_full_checksum) {
489 static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned");
490 static_assert(FIL_PAGE_LSN % 4 == 0, "aligned");
491 memcpy_aligned<4>(page + srv_page_size
492 - FIL_PAGE_FCRC32_END_LSN,
493 FIL_PAGE_LSN + 4 + page, 4);
494 return buf_flush_assign_full_crc32_checksum(page);
495 }
496
497 static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned");
498 static_assert(FIL_PAGE_LSN % 8 == 0, "aligned");
499 memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
500 FIL_PAGE_LSN + page, 8);
501
502 if (block && srv_page_size == 16384) {
503 /* The page type could be garbage in old files
504 created before MySQL 5.5. Such files always
505 had a page size of 16 kilobytes. */
506 ulint page_type = fil_page_get_type(page);
507 ulint reset_type = page_type;
508
509 switch (block->page.id().page_no() % 16384) {
510 case 0:
511 reset_type = block->page.id().page_no() == 0
512 ? FIL_PAGE_TYPE_FSP_HDR
513 : FIL_PAGE_TYPE_XDES;
514 break;
515 case 1:
516 reset_type = FIL_PAGE_IBUF_BITMAP;
517 break;
518 case FSP_TRX_SYS_PAGE_NO:
519 if (block->page.id()
520 == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) {
521 reset_type = FIL_PAGE_TYPE_TRX_SYS;
522 break;
523 }
524 /* fall through */
525 default:
526 switch (page_type) {
527 case FIL_PAGE_INDEX:
528 case FIL_PAGE_TYPE_INSTANT:
529 case FIL_PAGE_RTREE:
530 case FIL_PAGE_UNDO_LOG:
531 case FIL_PAGE_INODE:
532 case FIL_PAGE_IBUF_FREE_LIST:
533 case FIL_PAGE_TYPE_ALLOCATED:
534 case FIL_PAGE_TYPE_SYS:
535 case FIL_PAGE_TYPE_TRX_SYS:
536 case FIL_PAGE_TYPE_BLOB:
537 case FIL_PAGE_TYPE_ZBLOB:
538 case FIL_PAGE_TYPE_ZBLOB2:
539 break;
540 case FIL_PAGE_TYPE_FSP_HDR:
541 case FIL_PAGE_TYPE_XDES:
542 case FIL_PAGE_IBUF_BITMAP:
543 /* These pages should have
544 predetermined page numbers
545 (see above). */
546 default:
547 reset_type = FIL_PAGE_TYPE_UNKNOWN;
548 break;
549 }
550 }
551
552 if (UNIV_UNLIKELY(page_type != reset_type)) {
553 ib::info()
554 << "Resetting invalid page "
555 << block->page.id() << " type "
556 << page_type << " to "
557 << reset_type << " when flushing.";
558 fil_page_set_type(page, reset_type);
559 }
560 }
561
562 uint32_t checksum = BUF_NO_CHECKSUM_MAGIC;
563
564 switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
565 case SRV_CHECKSUM_ALGORITHM_INNODB:
566 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
567 checksum = buf_calc_page_new_checksum(page);
568 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
569 checksum);
570 /* With the InnoDB checksum, we overwrite the first 4 bytes of
571 the end lsn field to store the old formula checksum. Since it
572 depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
573 be calculated after storing the new formula checksum. */
574 checksum = buf_calc_page_old_checksum(page);
575 break;
576 case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
577 case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
578 case SRV_CHECKSUM_ALGORITHM_CRC32:
579 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
580 /* In other cases we write the same checksum to both fields. */
581 checksum = buf_calc_page_crc32(page);
582 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
583 checksum);
584 break;
585 case SRV_CHECKSUM_ALGORITHM_NONE:
586 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
587 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
588 checksum);
589 break;
590 /* no default so the compiler will emit a warning if
591 new enum is added and not handled here */
592 }
593
594 mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
595 checksum);
596 }
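
/* Sketch of the legacy (non-full_crc32) layout produced above, restating the
fields as written by this function: the "new" formula checksum occupies the
4 bytes at FIL_PAGE_SPACE_OR_CHKSUM in the header, while the 8-byte trailer
at srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM holds the old-formula checksum
(or crc32 / BUF_NO_CHECKSUM_MAGIC) in its first 4 bytes and the low 4 bytes of
FIL_PAGE_LSN in its last 4 bytes, e.g.:

  // the trailer LSN bytes must match the low word of the header LSN
  ut_ad(mach_read_from_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)
        == mach_read_from_4(page + FIL_PAGE_LSN + 4));
*/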
597
598 /** Reserve a buffer for compression.
599 @param[in,out] slot reserved slot */
600 static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
601 {
602 if (slot->comp_buf)
603 return;
604 /* Both Snappy and LZO compression methods require that the output
605 buffer be bigger than the input buffer. Adjust the allocated size. */
606 ulint size= srv_page_size;
607 #ifdef HAVE_LZO
608 size+= LZO1X_1_15_MEM_COMPRESS;
609 #elif defined HAVE_SNAPPY
610 size= snappy_max_compressed_length(size);
611 #endif
612 slot->comp_buf= static_cast<byte*>(aligned_malloc(size, srv_page_size));
613 }
614
615 /** Encrypt a buffer of temporary tablespace
616 @param[in] offset Page offset
617 @param[in] s Page to encrypt
618 @param[in,out] d Output buffer
619 @return encrypted buffer or NULL */
620 static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d)
621 {
622 /* Calculate the start offset in a page */
623 uint srclen= static_cast<uint>(srv_page_size) -
624 (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION +
625 FIL_PAGE_FCRC32_CHECKSUM);
626 const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
627 byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
628
629 memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
630
631 if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true))
632 return NULL;
633
634 const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
635 mach_write_to_4(d + payload, ut_crc32(d, payload));
636
637 srv_stats.pages_encrypted.inc();
638 srv_stats.n_temp_blocks_encrypted.inc();
639 return d;
640 }
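
/* Layout of the buffer produced above, restated as a hedged summary of the
constants used in the function:

  [0, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)          header copied as-is, so
                                                        the page number and LSN
                                                        stay readable
  [FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
   srv_page_size - FIL_PAGE_FCRC32_CHECKSUM)           payload encrypted by
                                                        log_tmp_block_encrypt()
  last FIL_PAGE_FCRC32_CHECKSUM (4) bytes               ut_crc32() of everything
                                                        before them, i.e. computed
                                                        over the ciphertext
*/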
641
642 /** Encryption and page_compression hook that is called just before
643 a page is written to disk.
644 @param[in,out] space tablespace
645 @param[in,out] bpage buffer page
646 @param[in] s physical page frame that is being encrypted
647 @param[in,out] size payload size in bytes
648 @return page frame to be written to file
649 (may be src_frame or an encrypted/compressed copy of it) */
650 static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s,
651 size_t *size)
652 {
653 ut_ad(bpage->status != buf_page_t::FREED);
654 ut_ad(space->id == bpage->id().space());
655
656 ut_d(fil_page_type_validate(space, s));
657 const uint32_t page_no= bpage->id().page_no();
658
659 switch (page_no) {
660 case TRX_SYS_PAGE_NO:
661 if (bpage->id().space() != TRX_SYS_SPACE)
662 break;
663 /* The TRX_SYS page is neither encrypted nor compressed, because
664 it contains the address of the doublewrite buffer. */
665 /* fall through */
666 case 0:
667 /* Page 0 of a tablespace is not encrypted/compressed */
668 return s;
669 }
670
671 fil_space_crypt_t *crypt_data= space->crypt_data;
672 bool encrypted, page_compressed;
673 if (space->purpose == FIL_TYPE_TEMPORARY)
674 {
675 ut_ad(!crypt_data);
676 encrypted= innodb_encrypt_temporary_tables;
677 page_compressed= false;
678 }
679 else
680 {
681 encrypted= crypt_data && !crypt_data->not_encrypted() &&
682 crypt_data->type != CRYPT_SCHEME_UNENCRYPTED &&
683 (!crypt_data->is_default_encryption() || srv_encrypt_tables);
684 page_compressed= space->is_compressed();
685 }
686
687 const bool full_crc32= space->full_crc32();
688
689 if (!encrypted && !page_compressed)
690 {
691 /* No need to encrypt or compress. Clear key-version & crypt-checksum. */
692 static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment");
693 static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2,
694 "not perfect alignment");
695 if (full_crc32)
696 memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4);
697 else
698 memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
699 return s;
700 }
701
702 static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
703 static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
704 if (full_crc32)
705 memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN,
706 FIL_PAGE_LSN + 4 + s, 4);
707
708 ut_ad(!bpage->zip_size() || !page_compressed);
709 /* Find free slot from temporary memory array */
710 buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve();
711 ut_a(slot);
712 slot->allocate();
713 slot->out_buf= NULL;
714 bpage->slot= slot;
715
716 byte *d= slot->crypt_buf;
717
718 if (!page_compressed)
719 {
720 not_compressed:
721 byte *tmp= space->purpose == FIL_TYPE_TEMPORARY
722 ? buf_tmp_page_encrypt(page_no, s, d)
723 : fil_space_encrypt(space, page_no, s, d);
724
725 slot->out_buf= d= tmp;
726
727 ut_d(fil_page_type_validate(space, tmp));
728 }
729 else
730 {
731 ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
732 /* First we compress the page content */
733 buf_tmp_reserve_compression_buf(slot);
734 byte *tmp= slot->comp_buf;
735 ulint len= fil_page_compress(s, tmp, space->flags,
736 fil_space_get_block_size(space, page_no),
737 encrypted);
738
739 if (!len)
740 goto not_compressed;
741
742 *size= len;
743
744 if (full_crc32)
745 {
746 ut_d(bool compressed = false);
747 len= buf_page_full_crc32_size(tmp,
748 #ifdef UNIV_DEBUG
749 &compressed,
750 #else
751 NULL,
752 #endif
753 NULL);
754 ut_ad(compressed);
755 }
756
757 /* Workaround for MDEV-15527. */
758 memset(tmp + len, 0 , srv_page_size - len);
759 ut_d(fil_page_type_validate(space, tmp));
760
761 if (encrypted)
762 tmp = fil_space_encrypt(space, page_no, tmp, d);
763
764 if (full_crc32)
765 {
766 static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
767 mach_write_to_4(tmp + len - 4, ut_crc32(tmp, len - 4));
768 ut_ad(!buf_page_is_corrupted(true, tmp, space->flags));
769 }
770
771 slot->out_buf= d= tmp;
772 }
773
774 ut_d(fil_page_type_validate(space, d));
775 return d;
776 }
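
/* A condensed restatement of the paths taken above (a summary of the code,
not an independent specification):

  page 0, or the TRX_SYS page of the system tablespace -> returned unmodified
  neither encrypted nor page_compressed                 -> key-version bytes
                                                           cleared, frame reused
  temporary tablespace with innodb_encrypt_temporary_tables
                                                        -> buf_tmp_page_encrypt()
  page_compressed (optionally + encryption)             -> fil_page_compress(),
                                                           then fil_space_encrypt()
  encrypted only                                        -> fil_space_encrypt()
*/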
777
778 /** Free a page whose underlying file page has been freed. */
779 inline void buf_pool_t::release_freed_page(buf_page_t *bpage)
780 {
781 ut_ad(bpage->in_file());
782 const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE;
783 mysql_mutex_lock(&mutex);
784 bpage->set_io_fix(BUF_IO_NONE);
785 bpage->status= buf_page_t::NORMAL;
786 mysql_mutex_lock(&flush_list_mutex);
787 ut_d(const lsn_t oldest_modification= bpage->oldest_modification();)
788 if (fsp_is_system_temporary(bpage->id().space()))
789 {
790 ut_ad(uncompressed);
791 ut_ad(oldest_modification == 2);
792 }
793 else
794 {
795 ut_ad(oldest_modification > 2);
796 delete_from_flush_list(bpage, false);
797 }
798 bpage->clear_oldest_modification();
799 mysql_mutex_unlock(&flush_list_mutex);
800
801 if (uncompressed)
802 rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
803 BUF_IO_WRITE);
804
805 buf_LRU_free_page(bpage, true);
806 mysql_mutex_unlock(&mutex);
807 }
808
809 /** Write a flushable page from buf_pool to a file.
810 buf_pool.mutex must be held.
811 @param bpage buffer control block
812 @param lru true=buf_pool.LRU; false=buf_pool.flush_list
813 @param space tablespace
814 @return whether the page was flushed and buf_pool.mutex was released */
815 static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
816 {
817 ut_ad(bpage->in_file());
818 ut_ad(bpage->ready_for_flush());
819 ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
820 (space == fil_system.temp_space));
821 ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
822 space->atomic_write_supported);
823 ut_ad(space->referenced());
824 ut_ad(lru || space != fil_system.temp_space);
825
826 rw_lock_t *rw_lock;
827
828 if (bpage->state() != BUF_BLOCK_FILE_PAGE)
829 rw_lock= nullptr;
830 else
831 {
832 rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
833 if (!rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE))
834 return false;
835 }
836
837 bpage->set_io_fix(BUF_IO_WRITE);
838 /* Because bpage->status can only be changed while buf_block_t
839 exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages
840 without first allocating the uncompressed page frame. Such
841 allocation cannot be completed due to our io_fix. So, bpage->status
842 is protected even if !rw_lock. */
843 const auto status= bpage->status;
844
845 if (status != buf_page_t::FREED)
846 {
847 if (lru)
848 buf_pool.n_flush_LRU_++;
849 else
850 buf_pool.n_flush_list_++;
851 buf_flush_page_count++;
852 }
853
854 mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
855
856 /* We are holding rw_lock = buf_block_t::lock in SX mode except if
857 this is a ROW_FORMAT=COMPRESSED page whose uncompressed page frame
858 has been evicted from the buffer pool.
859
860 Apart from possible rw_lock protection, bpage is also protected by
861 io_fix and oldest_modification()!=0. Thus, it cannot be relocated in
862 the buffer pool or removed from flush_list or LRU_list. */
863
864 DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
865 lru ? "LRU" : "flush_list",
866 bpage->id().space(), bpage->id().page_no()));
867 ut_ad(bpage->io_fix() == BUF_IO_WRITE);
868 ut_d(const lsn_t oldest_modification= bpage->oldest_modification());
869 ut_ad(space == fil_system.temp_space
870 ? oldest_modification == 2
871 : oldest_modification > 2);
872 ut_ad(bpage->state() ==
873 (rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE));
874 ut_ad(ULINT_UNDEFINED >
875 (lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_));
876 mysql_mutex_unlock(&buf_pool.mutex);
877
878 buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
879 page_t *frame= bpage->zip.data;
880
881 if (status == buf_page_t::FREED)
882 buf_pool.release_freed_page(&block->page);
883 else
884 {
885 space->reacquire();
886 ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH);
887 size_t size;
888 #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
889 size_t orig_size;
890 #endif
891 IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;
892
893 if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
894 {
895 ut_ad(!space->full_crc32());
896 ut_ad(!space->is_compressed()); /* not page_compressed */
897 size= bpage->zip_size();
898 #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
899 orig_size= size;
900 #endif
901 buf_flush_update_zip_checksum(frame, size);
902 frame= buf_page_encrypt(space, bpage, frame, &size);
903 ut_ad(size == bpage->zip_size());
904 }
905 else
906 {
907 byte *page= block->frame;
908 size= block->physical_size();
909 #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
910 orig_size= size;
911 #endif
912
913 if (space->full_crc32())
914 {
915 /* innodb_checksum_algorithm=full_crc32 is not implemented for
916 ROW_FORMAT=COMPRESSED pages. */
917 ut_ad(!frame);
918 page= buf_page_encrypt(space, bpage, page, &size);
919 buf_flush_init_for_writing(block, page, nullptr, true);
920 }
921 else
922 {
923 buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr,
924 false);
925 page= buf_page_encrypt(space, bpage, frame ? frame : page, &size);
926 }
927
928 #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
929 if (size != orig_size && space->punch_hole)
930 type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
931 #endif
932 frame=page;
933 }
934
935 ut_ad(status == bpage->status);
936 ut_ad(oldest_modification == bpage->oldest_modification());
937
938 if (status != buf_page_t::NORMAL || !space->use_doublewrite())
939 {
940 if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
941 {
942 const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
943 (FIL_PAGE_LSN + (frame ? frame
944 : block->frame)));
945 ut_ad(lsn >= oldest_modification);
946 if (lsn > log_sys.get_flushed_lsn())
947 log_write_up_to(lsn, true);
948 }
949 space->io(IORequest(type, bpage),
950 bpage->physical_offset(), size, frame, bpage);
951 }
952 else
953 buf_dblwr.add_to_batch(IORequest(bpage, space->chain.start, type), size);
954 }
955
956 /* Increment the I/O operation count used for selecting LRU policy. */
957 buf_LRU_stat_inc_io();
958 return true;
959 }
960
961 /** Check whether a page can be flushed from the buf_pool.
962 @param id page identifier
963 @param fold id.fold()
964 @param lru true=buf_pool.LRU; false=buf_pool.flush_list
965 @return whether the page can be flushed */
966 static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
967 {
968 mysql_mutex_assert_owner(&buf_pool.mutex);
969 ut_ad(fold == id.fold());
970
971 buf_page_t *bpage= buf_pool.page_hash_get_low(id, fold);
972
973 if (!bpage || buf_pool.watch_is_sentinel(*bpage))
974 return false;
975
976 /* We avoid flushing 'non-old' blocks in an LRU flush, because the
977 flushed blocks are soon freed */
978 if (lru && !bpage->is_old())
979 return false;
980
981 return bpage->oldest_modification() > 1 && bpage->ready_for_flush();
982 }
983
984 /** Check which neighbors of a page can be flushed from the buf_pool.
985 @param space tablespace
986 @param id page identifier of a dirty page
987 @param contiguous whether to consider contiguous areas of pages
988 @param lru true=buf_pool.LRU; false=buf_pool.flush_list
989 @return last page number that can be flushed */
990 static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
991 page_id_t &id, bool contiguous,
992 bool lru)
993 {
994 ut_ad(id.page_no() < space.size +
995 (space.physical_size() == 2048 ? 1
996 : space.physical_size() == 1024 ? 3 : 0));
997 /* When flushed, dirty blocks are searched in neighborhoods of this
998 size, and flushed along with the original page. */
999 const ulint s= buf_pool.curr_size / 16;
1000 const uint32_t read_ahead= buf_pool.read_ahead_area;
1001 const uint32_t buf_flush_area= read_ahead > s
1002 ? static_cast<uint32_t>(s) : read_ahead;
1003 page_id_t low= id - (id.page_no() % buf_flush_area);
1004 page_id_t high= low + buf_flush_area;
1005 high.set_page_no(std::min(high.page_no(), space.last_page_number()));
1006
1007 if (!contiguous)
1008 {
1009 high= std::max(id + 1, high);
1010 id= low;
1011 return high;
1012 }
1013
1014 /* Determine the contiguous dirty area around id. */
1015 const ulint id_fold= id.fold();
1016
1017 mysql_mutex_lock(&buf_pool.mutex);
1018
1019 if (id > low)
1020 {
1021 ulint fold= id_fold;
1022 for (page_id_t i= id - 1;; --i)
1023 {
1024 fold--;
1025 if (!buf_flush_check_neighbor(i, fold, lru))
1026 {
1027 low= i + 1;
1028 break;
1029 }
1030 if (i == low)
1031 break;
1032 }
1033 }
1034
1035 page_id_t i= id;
1036 id= low;
1037 ulint fold= id_fold;
1038 while (++i < high)
1039 {
1040 ++fold;
1041 if (!buf_flush_check_neighbor(i, fold, lru))
1042 break;
1043 }
1044
1045 mysql_mutex_unlock(&buf_pool.mutex);
1046 return i;
1047 }
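
/* Worked example of the flush-area arithmetic above (the numbers are
hypothetical): with buf_pool.curr_size == 8192 pages and read_ahead == 64,
buf_flush_area == min(64, 8192 / 16) == 64, so for id == 1000 we get
low == 1000 - (1000 % 64) == 960 and high == 1024 (clamped to the last page
of the tablespace).  When contiguous, the [low, high) range is then shrunk
to the run of flushable neighbors that still contains id. */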
1048
1049 MY_ATTRIBUTE((nonnull))
1050 /** Punch holes for, or write zeroes over, the freed page ranges when
1051 punch hole is supported or innodb_immediate_scrub_data_uncompressed is set.
1052 @param space tablespace which may contain ranges of freed pages */
1053 static void buf_flush_freed_pages(fil_space_t *space)
1054 {
1055 const bool punch_hole= space->punch_hole;
1056 if (!srv_immediate_scrub_data_uncompressed && !punch_hole)
1057 return;
1058 lsn_t flush_to_disk_lsn= log_sys.get_flushed_lsn();
1059
1060 std::unique_lock<std::mutex> freed_lock(space->freed_range_mutex);
1061 if (space->freed_ranges.empty()
1062 || flush_to_disk_lsn < space->get_last_freed_lsn())
1063 {
1064 freed_lock.unlock();
1065 return;
1066 }
1067
1068 range_set freed_ranges= std::move(space->freed_ranges);
1069 freed_lock.unlock();
1070
1071 for (const auto &range : freed_ranges)
1072 {
1073 const ulint physical_size= space->physical_size();
1074
1075 if (punch_hole)
1076 {
1077 space->reacquire();
1078 space->io(IORequest(IORequest::PUNCH_RANGE),
1079 os_offset_t{range.first} * physical_size,
1080 (range.last - range.first + 1) * physical_size,
1081 nullptr);
1082 }
1083 else if (srv_immediate_scrub_data_uncompressed)
1084 {
1085 for (os_offset_t i= range.first; i <= range.last; i++)
1086 {
1087 space->reacquire();
1088 space->io(IORequest(IORequest::WRITE_ASYNC),
1089 i * physical_size, physical_size,
1090 const_cast<byte*>(field_ref_zero));
1091 }
1092 }
1093 buf_pool.stat.n_pages_written+= (range.last - range.first + 1);
1094 }
1095 }
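
/* Illustrative arithmetic for the I/O issued above (hypothetical numbers):
for a freed range {first=100, last=103} and a 16KiB physical page size, the
punch-hole branch issues one PUNCH_RANGE request at byte offset 100 * 16384
covering 4 * 16384 bytes, while the scrubbing branch writes four separate
16KiB buffers of field_ref_zero; either way n_pages_written is incremented
by range.last - range.first + 1 == 4. */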
1096
1097 /** Flushes to disk all flushable pages within the flush area
1098 and also writes zeroes or punches holes for the freed ranges of pages.
1099 @param space tablespace
1100 @param page_id page identifier
1101 @param contiguous whether to consider contiguous areas of pages
1102 @param lru true=buf_pool.LRU; false=buf_pool.flush_list
1103 @param n_flushed number of pages flushed so far in this batch
1104 @param n_to_flush maximum number of pages we are allowed to flush
1105 @return number of pages flushed */
1106 static ulint buf_flush_try_neighbors(fil_space_t *space,
1107 const page_id_t page_id,
1108 bool contiguous, bool lru,
1109 ulint n_flushed, ulint n_to_flush)
1110 {
1111 ut_ad(space->id == page_id.space());
1112
1113 ulint count= 0;
1114 page_id_t id= page_id;
1115 page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru);
1116
1117 ut_ad(page_id >= id);
1118 ut_ad(page_id < high);
1119
1120 for (ulint id_fold= id.fold(); id < high && !space->is_stopping();
1121 ++id, ++id_fold)
1122 {
1123 if (count + n_flushed >= n_to_flush)
1124 {
1125 if (id > page_id)
1126 break;
1127 /* If the page whose neighbors we are flushing has not been
1128 flushed yet, we must flush the page that we selected originally. */
1129 id= page_id;
1130 id_fold= id.fold();
1131 }
1132
1133 mysql_mutex_lock(&buf_pool.mutex);
1134
1135 if (buf_page_t *bpage= buf_pool.page_hash_get_low(id, id_fold))
1136 {
1137 ut_ad(bpage->in_file());
1138 /* We avoid flushing 'non-old' blocks in an LRU flush,
1139 because the flushed blocks are soon freed */
1140 if (!lru || id == page_id || bpage->is_old())
1141 {
1142 if (!buf_pool.watch_is_sentinel(*bpage) &&
1143 bpage->oldest_modification() > 1 &&
1144 bpage->ready_for_flush() && buf_flush_page(bpage, lru, space))
1145 {
1146 ++count;
1147 continue;
1148 }
1149 }
1150 }
1151
1152 mysql_mutex_unlock(&buf_pool.mutex);
1153 }
1154
1155 if (auto n= count - 1)
1156 {
1157 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1158 MONITOR_FLUSH_NEIGHBOR_COUNT,
1159 MONITOR_FLUSH_NEIGHBOR_PAGES, n);
1160 }
1161
1162 return count;
1163 }
1164
1165 /*******************************************************************//**
1166 This utility moves the uncompressed frames of pages to the free list.
1167 Note that this function does not actually flush any data to disk. It
1168 just detaches the uncompressed frames from the compressed pages at the
1169 tail of the unzip_LRU and puts those freed frames in the free list.
1170 Note that it is a best effort attempt and it is not guaranteed that
1171 after a call to this function there will be 'max' blocks in the free
1172 list.
1173 @param[in] max desired number of blocks in the free_list
1174 @return number of blocks moved to the free list. */
1175 static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
1176 {
1177 ulint scanned = 0;
1178 ulint count = 0;
1179
1180 mysql_mutex_assert_owner(&buf_pool.mutex);
1181
1182 buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
1183
1184 while (block
1185 && count < max
1186 && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth
1187 && UT_LIST_GET_LEN(buf_pool.unzip_LRU)
1188 > UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
1189
1190 ++scanned;
1191 if (buf_LRU_free_page(&block->page, false)) {
1192 /* Block was freed. buf_pool.mutex potentially
1193 released and reacquired */
1194 ++count;
1195 block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
1196 } else {
1197 block = UT_LIST_GET_PREV(unzip_LRU, block);
1198 }
1199 }
1200
1201 mysql_mutex_assert_owner(&buf_pool.mutex);
1202
1203 if (scanned) {
1204 MONITOR_INC_VALUE_CUMULATIVE(
1205 MONITOR_LRU_BATCH_SCANNED,
1206 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1207 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1208 scanned);
1209 }
1210
1211 return(count);
1212 }
1213
1214 /** Start writing out pages for a tablespace.
1215 @param id tablespace identifier
1216 @return tablespace
1217 @retval nullptr if the pages for this tablespace should be discarded */
1218 static fil_space_t *buf_flush_space(const uint32_t id)
1219 {
1220 fil_space_t *space= fil_space_t::get(id);
1221 if (space)
1222 buf_flush_freed_pages(space);
1223 return space;
1224 }
1225
1226 struct flush_counters_t
1227 {
1228 /** number of dirty pages flushed */
1229 ulint flushed;
1230 /** number of clean pages evicted */
1231 ulint evicted;
1232 };
1233
1234 /** Try to discard a dirty page.
1235 @param bpage dirty page whose tablespace is not accessible */
1236 static void buf_flush_discard_page(buf_page_t *bpage)
1237 {
1238 mysql_mutex_assert_owner(&buf_pool.mutex);
1239 mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
1240 ut_ad(bpage->in_file());
1241 ut_ad(bpage->oldest_modification());
1242
1243 rw_lock_t *rw_lock;
1244
1245 if (bpage->state() != BUF_BLOCK_FILE_PAGE)
1246 rw_lock= nullptr;
1247 else
1248 {
1249 rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
1250 if (!rw_lock_sx_lock_nowait(rw_lock, 0))
1251 return;
1252 }
1253
1254 bpage->status= buf_page_t::NORMAL;
1255 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1256 buf_pool.delete_from_flush_list(bpage);
1257 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1258
1259 if (rw_lock)
1260 rw_lock_sx_unlock(rw_lock);
1261
1262 buf_LRU_free_page(bpage, true);
1263 }
1264
1265 /** Flush dirty blocks from the end of the LRU list.
1266 @param max maximum number of blocks to make available in buf_pool.free
1267 @param n counts of flushed and evicted pages */
1268 static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
1269 {
1270 ulint scanned= 0;
1271 ulint free_limit= srv_LRU_scan_depth;
1272
1273 mysql_mutex_assert_owner(&buf_pool.mutex);
1274 if (buf_pool.withdraw_target && buf_pool.curr_size < buf_pool.old_size)
1275 free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw);
1276
1277 const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
1278 ? 0 : srv_flush_neighbors;
1279 fil_space_t *space= nullptr;
1280 uint32_t last_space_id= FIL_NULL;
1281 static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
1282 static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
1283
1284 for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
1285 bpage &&
1286 ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
1287 UT_LIST_GET_LEN(buf_pool.free) < free_limit &&
1288 n->flushed + n->evicted < max) ||
1289 recv_recovery_is_on()); ++scanned)
1290 {
1291 buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
1292 const lsn_t oldest_modification= bpage->oldest_modification();
1293 buf_pool.lru_hp.set(prev);
1294
1295 if (oldest_modification <= 1 && bpage->can_relocate())
1296 {
1297 /* block is ready for eviction i.e., it is clean and is not
1298 IO-fixed or buffer fixed. */
1299 if (buf_LRU_free_page(bpage, true))
1300 ++n->evicted;
1301 }
1302 else if (oldest_modification > 1 && bpage->ready_for_flush())
1303 {
1304 /* Block is ready for flush. Dispatch an IO request. The IO
1305 helper thread will put it on free list in IO completion routine. */
1306 const page_id_t page_id(bpage->id());
1307 const uint32_t space_id= page_id.space();
1308 if (!space || space->id != space_id)
1309 {
1310 if (last_space_id != space_id)
1311 {
1312 if (space)
1313 space->release();
1314 space= buf_flush_space(space_id);
1315 last_space_id= space_id;
1316 }
1317 else
1318 ut_ad(!space);
1319 }
1320 else if (space->is_stopping())
1321 {
1322 space->release();
1323 space= nullptr;
1324 }
1325
1326 if (!space)
1327 buf_flush_discard_page(bpage);
1328 else if (neighbors && space->is_rotational())
1329 {
1330 mysql_mutex_unlock(&buf_pool.mutex);
1331 n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
1332 true, n->flushed, max);
1333 reacquire_mutex:
1334 mysql_mutex_lock(&buf_pool.mutex);
1335 }
1336 else if (buf_flush_page(bpage, true, space))
1337 {
1338 ++n->flushed;
1339 goto reacquire_mutex;
1340 }
1341 }
1342 else
1343 /* Can't evict or dispatch this block. Go to previous. */
1344 ut_ad(buf_pool.lru_hp.is_hp(prev));
1345 bpage= buf_pool.lru_hp.get();
1346 }
1347
1348 buf_pool.lru_hp.set(nullptr);
1349
1350 if (space)
1351 space->release();
1352
1353 /* We keep track of all flushes happening as part of LRU flush. When
1354 estimating the desired rate at which flush_list should be flushed,
1355 we factor in this value. */
1356 buf_lru_flush_page_count+= n->flushed;
1357
1358 mysql_mutex_assert_owner(&buf_pool.mutex);
1359
1360 if (scanned)
1361 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
1362 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1363 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1364 scanned);
1365 }
1366
1367 /** Flush and move pages from LRU or unzip_LRU list to the free list.
1368 Whether LRU or unzip_LRU is used depends on the state of the system.
1369 @param max maximum number of blocks to make available in buf_pool.free
1370 @return number of flushed pages */
1371 static ulint buf_do_LRU_batch(ulint max)
1372 {
1373 const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU()
1374 ? buf_free_from_unzip_LRU_list_batch(max)
1375 : 0;
1376 flush_counters_t n;
1377 n.flushed= 0;
1378 n.evicted= n_unzip_LRU_evicted;
1379 buf_flush_LRU_list_batch(max, &n);
1380
1381 if (const ulint evicted= n.evicted - n_unzip_LRU_evicted)
1382 {
1383 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
1384 MONITOR_LRU_BATCH_EVICT_COUNT,
1385 MONITOR_LRU_BATCH_EVICT_PAGES,
1386 evicted);
1387 }
1388
1389 return n.flushed;
1390 }
1391
1392 /** This utility flushes dirty blocks from the end of the flush_list.
1393 The calling thread is not allowed to own any latches on pages!
1394 @param max_n maximum number of blocks to flush
1395 @param lsn once an oldest_modification>=lsn is found, terminate the batch
1396 @return number of blocks for which the write request was queued */
1397 static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
1398 {
1399 ulint count= 0;
1400 ulint scanned= 0;
1401
1402 mysql_mutex_assert_owner(&buf_pool.mutex);
1403
1404 const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
1405 ? 0 : srv_flush_neighbors;
1406 fil_space_t *space= nullptr;
1407 uint32_t last_space_id= FIL_NULL;
1408 static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
1409 static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
1410
1411 /* Start from the end of the list looking for a suitable block to be
1412 flushed. */
1413 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1414 ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
1415
1416 for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
1417 bpage && len && count < max_n; ++scanned, len--)
1418 {
1419 const lsn_t oldest_modification= bpage->oldest_modification();
1420 if (oldest_modification >= lsn)
1421 break;
1422 ut_ad(bpage->in_file());
1423
1424 buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
1425
1426 if (oldest_modification == 1)
1427 {
1428 buf_pool.delete_from_flush_list(bpage);
1429 skip:
1430 bpage= prev;
1431 continue;
1432 }
1433
1434 ut_ad(oldest_modification > 2);
1435 ut_ad(bpage->in_file());
1436
1437 if (!bpage->ready_for_flush())
1438 goto skip;
1439
1440 /* In order not to degenerate this scan to O(n*n) we attempt to
1441 preserve the pointer position. Any thread that would remove 'prev'
1442 from buf_pool.flush_list must adjust the hazard pointer.
1443
1444 Note: A concurrent execution of buf_flush_list_space() may
1445 terminate this scan prematurely. The buf_pool.n_flush_list()
1446 should prevent multiple threads from executing
1447 buf_do_flush_list_batch() concurrently,
1448 but buf_flush_list_space() is ignoring that. */
1449 buf_pool.flush_hp.set(prev);
1450 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1451
1452 const page_id_t page_id(bpage->id());
1453 const uint32_t space_id= page_id.space();
1454 if (!space || space->id != space_id)
1455 {
1456 if (last_space_id != space_id)
1457 {
1458 if (space)
1459 space->release();
1460 space= buf_flush_space(space_id);
1461 last_space_id= space_id;
1462 }
1463 else
1464 ut_ad(!space);
1465 }
1466 else if (space->is_stopping())
1467 {
1468 space->release();
1469 space= nullptr;
1470 }
1471
1472 if (!space)
1473 buf_flush_discard_page(bpage);
1474 else if (neighbors && space->is_rotational())
1475 {
1476 mysql_mutex_unlock(&buf_pool.mutex);
1477 count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
1478 false, count, max_n);
1479 reacquire_mutex:
1480 mysql_mutex_lock(&buf_pool.mutex);
1481 }
1482 else if (buf_flush_page(bpage, false, space))
1483 {
1484 ++count;
1485 goto reacquire_mutex;
1486 }
1487
1488 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1489 bpage= buf_pool.flush_hp.get();
1490 }
1491
1492 buf_pool.flush_hp.set(nullptr);
1493 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1494
1495 if (space)
1496 space->release();
1497
1498 if (scanned)
1499 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
1500 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1501 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1502 scanned);
1503 if (count)
1504 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1505 MONITOR_FLUSH_BATCH_COUNT,
1506 MONITOR_FLUSH_BATCH_PAGES,
1507 count);
1508 mysql_mutex_assert_owner(&buf_pool.mutex);
1509 return count;
1510 }
1511
1512 /** Wait until a flush batch ends.
1513 @param lru true=buf_pool.LRU; false=buf_pool.flush_list */
1514 void buf_flush_wait_batch_end(bool lru)
1515 {
1516 const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_;
1517
1518 if (n_flush)
1519 {
1520 auto cond= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list;
1521 tpool::tpool_wait_begin();
1522 thd_wait_begin(nullptr, THD_WAIT_DISKIO);
1523 do
1524 my_cond_wait(cond, &buf_pool.mutex.m_mutex);
1525 while (n_flush);
1526 tpool::tpool_wait_end();
1527 thd_wait_end(nullptr);
1528 pthread_cond_broadcast(cond);
1529 }
1530 }
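
/* Usage note, inferred from the code above rather than a documented contract:
the caller must hold buf_pool.mutex; my_cond_wait() atomically releases it
while sleeping on done_flush_LRU or done_flush_list and re-acquires it before
n_flush is re-checked, so the counter is always read under the mutex. */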
1531
1532 /** Write out dirty blocks from buf_pool.flush_list.
1533 @param max_n desired maximum number of blocks flushed
1534 @param lsn buf_pool.get_oldest_modification(LSN_MAX) target
1535 @return the number of processed pages
1536 @retval 0 if a buf_pool.flush_list batch is already running */
1537 static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX)
1538 {
1539 ut_ad(lsn);
1540
1541 if (buf_pool.n_flush_list())
1542 return 0;
1543
1544 mysql_mutex_lock(&buf_pool.mutex);
1545 const bool running= buf_pool.n_flush_list_ != 0;
1546 /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
1547 while not holding buf_pool.flush_list_mutex */
1548 if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
1549 {
1550 if (!running)
1551 pthread_cond_broadcast(&buf_pool.done_flush_list);
1552 mysql_mutex_unlock(&buf_pool.mutex);
1553 return 0;
1554 }
1555
1556 buf_pool.n_flush_list_++;
1557 const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
1558 const ulint n_flushing= --buf_pool.n_flush_list_;
1559
1560 buf_pool.try_LRU_scan= true;
1561
1562 mysql_mutex_unlock(&buf_pool.mutex);
1563
1564 if (!n_flushing)
1565 pthread_cond_broadcast(&buf_pool.done_flush_list);
1566
1567 buf_dblwr.flush_buffered_writes();
1568
1569 DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
1570 return n_flushed;
1571 }
1572
1573 /** Try to flush all the dirty pages that belong to a given tablespace.
1574 @param space tablespace
1575 @param n_flushed number of pages written
1576 @return whether the flush for some pages might not have been initiated */
1577 bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
1578 {
1579 const auto space_id= space->id;
1580 ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND);
1581
1582 bool may_have_skipped= false;
1583 ulint max_n_flush= srv_io_capacity;
1584
1585 mysql_mutex_lock(&buf_pool.mutex);
1586 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1587
1588 bool acquired= space->acquire();
1589 buf_flush_freed_pages(space);
1590
1591 for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
1592 {
1593 ut_d(const auto s= bpage->state());
1594 ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
1595 s == BUF_BLOCK_REMOVE_HASH);
1596 ut_ad(bpage->oldest_modification());
1597 ut_ad(bpage->in_file());
1598
1599 buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
1600 if (bpage->id().space() != space_id);
1601 else if (bpage->oldest_modification() == 1)
1602 buf_pool.delete_from_flush_list(bpage);
1603 else if (!bpage->ready_for_flush())
1604 may_have_skipped= true;
1605 else
1606 {
1607 /* In order not to degenerate this scan to O(n*n) we attempt to
1608 preserve the pointer position. Any thread that would remove 'prev'
1609 from buf_pool.flush_list must adjust the hazard pointer.
1610
1611 Note: Multiple executions of buf_flush_list_space() may be
1612 interleaved, and also buf_do_flush_list_batch() may be running
1613 concurrently. This may terminate our iteration prematurely,
1614 leading us to return may_have_skipped=true. */
1615 buf_pool.flush_hp.set(prev);
1616 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1617
1618 if (!acquired)
1619 {
1620 was_freed:
1621 buf_flush_discard_page(bpage);
1622 }
1623 else
1624 {
1625 if (space->is_stopping())
1626 {
1627 space->release();
1628 acquired= false;
1629 goto was_freed;
1630 }
1631 if (!buf_flush_page(bpage, false, space))
1632 {
1633 may_have_skipped= true;
1634 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1635 goto next_after_skip;
1636 }
1637 if (n_flushed)
1638 ++*n_flushed;
1639 if (!--max_n_flush)
1640 {
1641 mysql_mutex_lock(&buf_pool.mutex);
1642 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1643 may_have_skipped= true;
1644 break;
1645 }
1646 mysql_mutex_lock(&buf_pool.mutex);
1647 }
1648
1649 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1650 if (!buf_pool.flush_hp.is_hp(prev))
1651 may_have_skipped= true;
1652 next_after_skip:
1653 bpage= buf_pool.flush_hp.get();
1654 continue;
1655 }
1656
1657 bpage= prev;
1658 }
1659
1660 /* Note: this loop may have been executed concurrently with
1661 buf_do_flush_list_batch() as well as other threads executing
1662 buf_flush_list_space(). We should always return true from
1663 buf_flush_list_space() if that should be the case; in
1664 buf_do_flush_list_batch() we will simply perform less work. */
1665
1666 buf_pool.flush_hp.set(nullptr);
1667 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1668
1669 buf_pool.try_LRU_scan= true;
1670
1671 mysql_mutex_unlock(&buf_pool.mutex);
1672
1673 if (acquired)
1674 space->release();
1675
1676 if (space->purpose == FIL_TYPE_IMPORT)
1677 os_aio_wait_until_no_pending_writes();
1678 else
1679 buf_dblwr.flush_buffered_writes();
1680
1681 return may_have_skipped;
1682 }
1683
1684 /** Write out dirty blocks from buf_pool.LRU.
1685 @param max_n desired maximum number of blocks flushed
1686 @return the number of processed pages
1687 @retval 0 if a buf_pool.LRU batch is already running */
1688 ulint buf_flush_LRU(ulint max_n)
1689 {
1690 if (buf_pool.n_flush_LRU())
1691 return 0;
1692
1693 log_buffer_flush_to_disk(true);
1694
1695 mysql_mutex_lock(&buf_pool.mutex);
1696 if (buf_pool.n_flush_LRU_)
1697 {
1698 mysql_mutex_unlock(&buf_pool.mutex);
1699 return 0;
1700 }
1701 buf_pool.n_flush_LRU_++;
1702
1703 ulint n_flushed= buf_do_LRU_batch(max_n);
1704
1705 const ulint n_flushing= --buf_pool.n_flush_LRU_;
1706
1707 buf_pool.try_LRU_scan= true;
1708
1709 mysql_mutex_unlock(&buf_pool.mutex);
1710
1711 if (!n_flushing)
1712 {
1713 pthread_cond_broadcast(&buf_pool.done_flush_LRU);
1714 pthread_cond_signal(&buf_pool.done_free);
1715 }
1716
1717 buf_dblwr.flush_buffered_writes();
1718
1719 DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed));
1720 return n_flushed;
1721 }
1722
1723 /** Initiate a log checkpoint, discarding the start of the log.
1724 @param oldest_lsn the checkpoint LSN
1725 @param end_lsn log_sys.get_lsn()
1726 @return true if success, false if a checkpoint write was already running */
1727 static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
1728 {
1729 ut_ad(!srv_read_only_mode);
1730 mysql_mutex_assert_owner(&log_sys.mutex);
1731 ut_ad(oldest_lsn <= end_lsn);
1732 ut_ad(end_lsn == log_sys.get_lsn());
1733 ut_ad(!recv_no_log_write);
1734
1735 ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
1736
1737 if (oldest_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
1738 /* Some log has been written since the previous checkpoint. */;
1739 else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
1740 /* MariaDB startup expects the redo log file to be logically empty
1741 (not even containing a FILE_CHECKPOINT record) after a clean shutdown.
1742 Perform an extra checkpoint at shutdown. */;
1743 else
1744 {
1745 /* Do nothing, because nothing was logged (other than a
1746 FILE_CHECKPOINT record) since the previous checkpoint. */
1747 mysql_mutex_unlock(&log_sys.mutex);
1748 return true;
1749 }
1750
1751 /* Repeat the FILE_MODIFY records after the checkpoint, in case some
1752 log records between the checkpoint and log_sys.lsn need them.
1753 Finally, write a FILE_CHECKPOINT record. Redo log apply expects to
1754 see a FILE_CHECKPOINT after the checkpoint, except on clean
1755 shutdown, where the log will be empty after the checkpoint.
1756
1757 It is important that we write out the redo log before any further
1758 dirty pages are flushed to the tablespace files. At this point,
1759 because we hold log_sys.mutex, mtr_t::commit() in other threads will
1760 be blocked, and no pages can be added to the flush lists. */
1761 lsn_t flush_lsn= oldest_lsn;
1762
1763 if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn ||
1764 srv_shutdown_state <= SRV_SHUTDOWN_INITIATED))
1765 {
1766 flush_lsn= log_sys.get_lsn();
1767 ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
1768 mysql_mutex_unlock(&log_sys.mutex);
1769 log_write_up_to(flush_lsn, true, true);
1770 mysql_mutex_lock(&log_sys.mutex);
1771 if (log_sys.last_checkpoint_lsn >= oldest_lsn)
1772 {
1773 mysql_mutex_unlock(&log_sys.mutex);
1774 return true;
1775 }
1776 }
1777 else
1778 ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
1779
1780 ut_ad(log_sys.get_flushed_lsn() >= flush_lsn);
1781
1782 if (log_sys.n_pending_checkpoint_writes)
1783 {
1784 /* A checkpoint write is running */
1785 mysql_mutex_unlock(&log_sys.mutex);
1786 return false;
1787 }
1788
1789 log_sys.next_checkpoint_lsn= oldest_lsn;
1790 log_write_checkpoint_info(end_lsn);
1791 mysql_mutex_assert_not_owner(&log_sys.mutex);
1792
1793 return true;
1794 }
1795
1796 /** Make a checkpoint. Note that this function does not flush dirty
1797 blocks from the buffer pool: it only determines the LSN of the oldest
1798 modification in the pool and writes that LSN to the log file.
1799 Use log_make_checkpoint() to also flush the pool.
1800 @retval true if the checkpoint was or had been made
1801 @retval false if a checkpoint write was already running */
1802 static bool log_checkpoint()
1803 {
1804 if (recv_recovery_is_on())
1805 recv_sys.apply(true);
1806
1807 switch (srv_file_flush_method) {
1808 case SRV_NOSYNC:
1809 case SRV_O_DIRECT_NO_FSYNC:
1810 break;
1811 default:
1812 fil_flush_file_spaces();
1813 }
1814
1815 mysql_mutex_lock(&log_sys.mutex);
1816 const lsn_t end_lsn= log_sys.get_lsn();
1817 mysql_mutex_lock(&log_sys.flush_order_mutex);
1818 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1819 const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn);
1820 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1821 mysql_mutex_unlock(&log_sys.flush_order_mutex);
1822 return log_checkpoint_low(oldest_lsn, end_lsn);
1823 }
1824
1825 /** Make a checkpoint. */
1826 ATTRIBUTE_COLD void log_make_checkpoint()
1827 {
1828 buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire));
1829 while (!log_checkpoint());
1830 }
1831
1832 /** Wait for all dirty pages up to an LSN to be written out.
1833 NOTE: The calling thread is not allowed to hold any buffer page latches! */
1834 static void buf_flush_wait(lsn_t lsn)
1835 {
1836 ut_ad(lsn <= log_sys.get_lsn());
1837
1838 while (buf_pool.get_oldest_modification(lsn) < lsn)
1839 {
1840 if (buf_flush_sync_lsn < lsn)
1841 {
1842 buf_flush_sync_lsn= lsn;
1843 buf_pool.page_cleaner_set_idle(false);
1844 pthread_cond_signal(&buf_pool.do_flush_list);
1845 }
1846 my_cond_wait(&buf_pool.done_flush_list,
1847 &buf_pool.flush_list_mutex.m_mutex);
1848 }
1849 }
1850
1851 /** Wait until all persistent pages are flushed up to a limit.
1852 @param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
1853 ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
1854 {
1855 ut_ad(sync_lsn);
1856 ut_ad(sync_lsn < LSN_MAX);
1857 mysql_mutex_assert_not_owner(&log_sys.mutex);
1858 ut_ad(!srv_read_only_mode);
1859
1860 if (recv_recovery_is_on())
1861 recv_sys.apply(true);
1862
1863 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1864
1865 if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
1866 {
1867 MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
1868 #if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
1869 if (UNIV_UNLIKELY(!buf_page_cleaner_is_active))
1870 {
1871 do
1872 {
1873 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1874 ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
1875 buf_flush_wait_batch_end_acquiring_mutex(false);
1876 if (n_pages)
1877 {
1878 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
1879 MONITOR_FLUSH_SYNC_COUNT,
1880 MONITOR_FLUSH_SYNC_PAGES, n_pages);
1881 }
1882 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1883 }
1884 while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
1885 }
1886 else
1887 #endif
1888 {
1889 thd_wait_begin(nullptr, THD_WAIT_DISKIO);
1890 tpool::tpool_wait_begin();
1891 buf_flush_wait(sync_lsn);
1892 tpool::tpool_wait_end();
1893 thd_wait_end(nullptr);
1894 }
1895 }
1896
1897 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1898
1899 if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn))
1900 {
1901 /* If the buffer pool was clean, no log write was guaranteed
1902 to happen until now. There could be an outstanding FILE_CHECKPOINT
1903 record from a previous fil_names_clear() call, which we must
1904 write out before we can advance the checkpoint. */
1905 if (sync_lsn > log_sys.get_flushed_lsn())
1906 log_write_up_to(sync_lsn, true);
1907 log_checkpoint();
1908 }
1909 }
1910
1911 /** Initiate more eager page flushing if the log checkpoint age is too old.
1912 @param lsn buf_pool.get_oldest_modification(LSN_MAX) target
1913 @param furious true=furious flushing, false=limit to innodb_io_capacity */
1914 ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
1915 {
1916 mysql_mutex_assert_not_owner(&log_sys.mutex);
1917 ut_ad(!srv_read_only_mode);
1918
1919 if (recv_recovery_is_on())
1920 recv_sys.apply(true);
1921
1922 Atomic_relaxed<lsn_t> &limit= furious
1923 ? buf_flush_sync_lsn : buf_flush_async_lsn;
1924
1925 if (limit < lsn)
1926 {
1927 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1928 if (limit < lsn)
1929 {
1930 limit= lsn;
1931 buf_pool.page_cleaner_set_idle(false);
1932 pthread_cond_signal(&buf_pool.do_flush_list);
1933 }
1934 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1935 }
1936 }
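/* A minimal, hypothetical caller sketch (not code from this file; the
identifiers soft_age_limit and hard_age_limit are illustrative placeholders,
not symbols defined here): a redo-log writing path could request flushing
ahead once the checkpoint age grows too large, for example:

  const lsn_t lsn= log_sys.get_lsn();
  const lsn_t age= lsn - log_sys.last_checkpoint_lsn;
  if (age > hard_age_limit)
    buf_flush_ahead(lsn - hard_age_limit, true);   // furious flushing
  else if (age > soft_age_limit)
    buf_flush_ahead(lsn - soft_age_limit, false);  // innodb_io_capacity pace
*/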
1937
1938 /** Wait for pending flushes to complete. */
1939 void buf_flush_wait_batch_end_acquiring_mutex(bool lru)
1940 {
1941 if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list())
1942 {
1943 mysql_mutex_lock(&buf_pool.mutex);
1944 buf_flush_wait_batch_end(lru);
1945 mysql_mutex_unlock(&buf_pool.mutex);
1946 }
1947 }
1948
1949 /** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
1950 and try to initiate checkpoints until the target is met.
1951 @param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
1952 ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
1953 {
1954 ut_ad(!srv_read_only_mode);
1955
1956 for (;;)
1957 {
1958 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1959
1960 if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
1961 {
1962 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
1963 MONITOR_FLUSH_SYNC_COUNT,
1964 MONITOR_FLUSH_SYNC_PAGES, n_flushed);
1965 }
1966
1967 switch (srv_file_flush_method) {
1968 case SRV_NOSYNC:
1969 case SRV_O_DIRECT_NO_FSYNC:
1970 break;
1971 default:
1972 fil_flush_file_spaces();
1973 }
1974
1975 mysql_mutex_lock(&log_sys.mutex);
1976 const lsn_t newest_lsn= log_sys.get_lsn();
1977 mysql_mutex_lock(&log_sys.flush_order_mutex);
1978 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1979 lsn_t measure= buf_pool.get_oldest_modification(0);
1980 mysql_mutex_unlock(&log_sys.flush_order_mutex);
1981 const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
1982
1983 if (!recv_recovery_is_on() &&
1984 checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
1985 {
1986 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
1987 log_checkpoint_low(checkpoint_lsn, newest_lsn);
1988 mysql_mutex_lock(&buf_pool.flush_list_mutex);
1989 measure= buf_pool.get_oldest_modification(LSN_MAX);
1990 }
1991 else
1992 {
1993 mysql_mutex_unlock(&log_sys.mutex);
1994 if (!measure)
1995 measure= LSN_MAX;
1996 }
1997
1998 mysql_mutex_assert_not_owner(&log_sys.mutex);
1999
2000 /* After attempting log checkpoint, check if we have reached our target. */
2001 const lsn_t target= buf_flush_sync_lsn;
2002
2003 if (measure >= target)
2004 buf_flush_sync_lsn= 0;
2005 else if (measure >= buf_flush_async_lsn)
2006 buf_flush_async_lsn= 0;
2007
2008 /* wake up buf_flush_wait() */
2009 pthread_cond_broadcast(&buf_pool.done_flush_list);
2010
2011 lsn= std::max(lsn, target);
2012
2013 if (measure >= lsn)
2014 return;
2015 }
2016 }
2017
2018 /** Check whether adaptive flushing is recommended, based on how much
2019 of the redo log capacity has been filled.
2020 @param oldest_lsn buf_pool.get_oldest_modification()
2021 @return true if adaptive flushing is recommended. */
2022 static bool af_needed_for_redo(lsn_t oldest_lsn)
2023 {
2024 lsn_t age= (log_sys.get_lsn() - oldest_lsn);
2025 lsn_t af_lwm= static_cast<lsn_t>(srv_adaptive_flushing_lwm *
2026 static_cast<double>(log_sys.log_capacity) / 100);
2027
2028 /* if age > af_lwm adaptive flushing is recommended */
2029 return (age > af_lwm);
2030 }
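/* Worked example (illustrative numbers only): with
srv_adaptive_flushing_lwm= 10 and log_sys.log_capacity= 1 GiB,
af_lwm= 10 * (1 << 30) / 100, about 107 MB of redo (10% of the capacity).
Adaptive flushing is recommended as soon as the oldest dirty page trails the
current LSN by more than that amount. */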
2031
2032 /*********************************************************************//**
2033 Calculates if flushing is required based on redo generation rate.
2034 @return percent of io_capacity to flush to manage redo space */
2035 static
2036 ulint
2037 af_get_pct_for_lsn(
2038 /*===============*/
2039 lsn_t age) /*!< in: current age of LSN. */
2040 {
2041 lsn_t af_lwm = static_cast<lsn_t>(
2042 srv_adaptive_flushing_lwm
2043 * static_cast<double>(log_sys.log_capacity) / 100);
2044
2045 if (age < af_lwm) {
2046 /* No adaptive flushing. */
2047 return(0);
2048 }
2049
2050 lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;
2051
2052 ut_ad(srv_max_io_capacity >= srv_io_capacity);
2053 return static_cast<ulint>(
2054 (static_cast<double>(srv_max_io_capacity / srv_io_capacity
2055 * lsn_age_factor)
2056 * sqrt(static_cast<double>(lsn_age_factor))
2057 / 7.5));
2058 }
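/* Worked example (illustrative values only): with srv_io_capacity= 200,
srv_max_io_capacity= 2000 and an age equal to half of
log_sys.max_modified_age_async, lsn_age_factor= 50 and

  (2000 / 200 * 50) * sqrt(50) / 7.5 = 500 * 7.07 / 7.5 ~ 471,

so roughly 471% of srv_io_capacity is requested; the caller combines this
with other signals and finally caps the result at srv_max_io_capacity. */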
2059
2060 /** This function is called approximately once every second by the
2061 page_cleaner thread if innodb_adaptive_flushing=ON.
2062 Based on various factors it decides if there is a need to do flushing.
2063 @return number of pages recommended to be flushed
2064 @param last_pages_in number of pages flushed in previous batch
2065 @param oldest_lsn buf_pool.get_oldest_modification(0)
2066 @param dirty_blocks UT_LIST_GET_LEN(buf_pool.flush_list)
2067 @param dirty_pct 100*flush_list.count / (LRU.count + free.count) */
2068 static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
2069 lsn_t oldest_lsn,
2070 ulint dirty_blocks,
2071 double dirty_pct)
2072 {
2073 static lsn_t prev_lsn = 0;
2074 static ulint sum_pages = 0;
2075 static ulint avg_page_rate = 0;
2076 static ulint n_iterations = 0;
2077 static time_t prev_time;
2078 lsn_t lsn_rate;
2079 ulint n_pages = 0;
2080
2081 const lsn_t cur_lsn = log_sys.get_lsn();
2082 ut_ad(oldest_lsn <= cur_lsn);
2083 ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
2084 time_t curr_time = time(nullptr);
2085 const double max_pct = srv_max_buf_pool_modified_pct;
2086
2087 if (!prev_lsn || !pct_for_lsn) {
2088 prev_time = curr_time;
2089 prev_lsn = cur_lsn;
2090 if (max_pct > 0.0) {
2091 dirty_pct /= max_pct;
2092 }
2093
2094 n_pages = ulint(dirty_pct * double(srv_io_capacity));
2095 if (n_pages < dirty_blocks) {
2096 n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
2097 }
2098
2099 return n_pages;
2100 }
2101
2102 sum_pages += last_pages_in;
2103
2104 double time_elapsed = difftime(curr_time, prev_time);
2105
2106 /* We update our variables every srv_flushing_avg_loops
2107 iterations to smooth out transitions in the workload. */
2108 if (++n_iterations >= srv_flushing_avg_loops
2109 || time_elapsed >= static_cast<double>(srv_flushing_avg_loops)) {
2110
2111 if (time_elapsed < 1) {
2112 time_elapsed = 1;
2113 }
2114
2115 avg_page_rate = static_cast<ulint>(
2116 ((static_cast<double>(sum_pages)
2117 / time_elapsed)
2118 + static_cast<double>(avg_page_rate)) / 2);
2119
2120 /* How much LSN we have generated since the last call. */
2121 lsn_rate = static_cast<lsn_t>(
2122 static_cast<double>(cur_lsn - prev_lsn)
2123 / time_elapsed);
2124
2125 lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2126
2127 ulint flush_tm = page_cleaner.flush_time;
2128 ulint flush_pass = page_cleaner.flush_pass;
2129
2130 page_cleaner.flush_time = 0;
2131 page_cleaner.flush_pass = 0;
2132
2133 if (flush_pass) {
2134 flush_tm /= flush_pass;
2135 }
2136
2137 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm);
2138 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass);
2139
2140 prev_lsn = cur_lsn;
2141 prev_time = curr_time;
2142
2143 n_iterations = 0;
2144
2145 sum_pages = 0;
2146 }
2147
2148 const ulint pct_for_dirty = srv_max_dirty_pages_pct_lwm == 0
2149 ? (dirty_pct >= max_pct ? 100 : 0)
2150 : static_cast<ulint>
2151 (max_pct > 0.0 ? dirty_pct / max_pct : dirty_pct);
2152 ulint pct_total = std::max(pct_for_dirty, pct_for_lsn);
2153
2154 /* Estimate pages to be flushed for the lsn progress */
2155 lsn_t target_lsn = oldest_lsn
2156 + lsn_avg_rate * buf_flush_lsn_scan_factor;
2157 ulint pages_for_lsn = 0;
2158
2159 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2160
2161 for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list);
2162 b != NULL;
2163 b = UT_LIST_GET_PREV(list, b)) {
2164 if (b->oldest_modification() > target_lsn) {
2165 break;
2166 }
2167 if (++pages_for_lsn >= srv_max_io_capacity) {
2168 break;
2169 }
2170 }
2171 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2172
2173 pages_for_lsn /= buf_flush_lsn_scan_factor;
2174 if (pages_for_lsn < 1) {
2175 pages_for_lsn = 1;
2176 }
2177
2178 n_pages = (ulint(double(srv_io_capacity) * double(pct_total) / 100.0)
2179 + avg_page_rate + pages_for_lsn) / 3;
2180
2181 if (n_pages > srv_max_io_capacity) {
2182 n_pages = srv_max_io_capacity;
2183 }
2184
2185 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2186
2187 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn);
2188
2189 MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2190 MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2191 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2192 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2193
2194 return(n_pages);
2195 }
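/* Worked example (illustrative numbers only): with srv_io_capacity= 200,
pct_total= 90, avg_page_rate= 150 and pages_for_lsn= 600, the recommendation is

  n_pages= (200 * 90 / 100 + 150 + 600) / 3 = (180 + 150 + 600) / 3 = 310

pages for this batch, subject to the srv_max_io_capacity cap applied above. */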
2196
2197 /******************************************************************//**
2198 page_cleaner thread tasked with flushing dirty pages from the buffer
2199 pools. As of now we'll have only one coordinator.
2200 @return a dummy parameter */
2201 static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
2202 {
2203 my_thread_init();
2204 #ifdef UNIV_PFS_THREAD
2205 pfs_register_thread(page_cleaner_thread_key);
2206 #endif /* UNIV_PFS_THREAD */
2207 ut_ad(!srv_read_only_mode);
2208 ut_ad(buf_page_cleaner_is_active);
2209
2210 ulint last_pages= 0;
2211 timespec abstime;
2212 set_timespec(abstime, 1);
2213
2214 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2215
2216 lsn_t lsn_limit;
2217 ulint last_activity_count= srv_get_activity_count();
2218
2219 for (;;)
2220 {
2221 lsn_limit= buf_flush_sync_lsn;
2222
2223 if (UNIV_UNLIKELY(lsn_limit != 0))
2224 {
2225 furious_flush:
2226 if (UNIV_LIKELY(srv_flush_sync))
2227 {
2228 buf_flush_sync_for_checkpoint(lsn_limit);
2229 last_pages= 0;
2230 set_timespec(abstime, 1);
2231 continue;
2232 }
2233 }
2234 else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
2235 break;
2236
2237 /* If the page cleaner is idle and there is no work
2238 (either all dirty pages have been flushed or adaptive flushing
2239 is not enabled), then opt for an untimed wait. */
2240 if (buf_pool.page_cleaner_idle() &&
2241 (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
2242 srv_max_dirty_pages_pct_lwm == 0.0))
2243 my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex);
2244 else
2245 my_cond_timedwait(&buf_pool.do_flush_list,
2246 &buf_pool.flush_list_mutex.m_mutex, &abstime);
2247
2248 set_timespec(abstime, 1);
2249
2250 lsn_t soft_lsn_limit= buf_flush_async_lsn;
2251 lsn_limit= buf_flush_sync_lsn;
2252
2253 if (UNIV_UNLIKELY(lsn_limit != 0))
2254 {
2255 if (UNIV_LIKELY(srv_flush_sync))
2256 goto furious_flush;
2257 }
2258 else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
2259 break;
2260
2261 const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);
2262
2263 if (!oldest_lsn)
2264 {
2265 if (UNIV_UNLIKELY(lsn_limit != 0))
2266 {
2267 buf_flush_sync_lsn= 0;
2268 /* wake up buf_flush_wait() */
2269 pthread_cond_broadcast(&buf_pool.done_flush_list);
2270 }
2271 unemployed:
2272 buf_flush_async_lsn= 0;
2273 buf_pool.page_cleaner_set_idle(true);
2274
2275 DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;);
2276
2277 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2278
2279 if (!recv_recovery_is_on() &&
2280 !srv_startup_is_before_trx_rollback_phase &&
2281 srv_operation == SRV_OPERATION_NORMAL)
2282 log_checkpoint();
2283
2284 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2285 continue;
2286 }
2287
2288 const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
2289 ut_ad(dirty_blocks);
2290 /* We perform dirty reads of the LRU+free list lengths here.
2291 Division by zero is not possible, because buf_pool.flush_list is
2292 guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */
2293 const double dirty_pct= double(dirty_blocks) * 100.0 /
2294 double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
2295
2296 bool idle_flush= false;
2297
2298 if (lsn_limit || soft_lsn_limit);
2299 else if (af_needed_for_redo(oldest_lsn));
2300 else if (srv_max_dirty_pages_pct_lwm != 0.0)
2301 {
2302 const ulint activity_count= srv_get_activity_count();
2303 if (activity_count != last_activity_count)
2304 last_activity_count= activity_count;
2305 else if (buf_pool.page_cleaner_idle() && buf_pool.n_pend_reads == 0)
2306 {
2307 /* reaching here means 3 things:
2308 - last_activity_count == activity_count: suggesting server is idle
2309 (no trx_t::commit activity)
2310 - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm)
2311 - there are no pending reads but there are dirty pages to flush */
2312 idle_flush= true;
2313 buf_pool.update_last_activity_count(activity_count);
2314 }
2315
2316 if (!idle_flush && dirty_pct < srv_max_dirty_pages_pct_lwm)
2317 goto unemployed;
2318 }
2319 else if (dirty_pct < srv_max_buf_pool_modified_pct)
2320 goto unemployed;
2321
2322 if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit)
2323 lsn_limit= buf_flush_sync_lsn= 0;
2324 if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit)
2325 soft_lsn_limit= buf_flush_async_lsn= 0;
2326
2327 buf_pool.page_cleaner_set_idle(false);
2328 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2329
2330 if (!lsn_limit)
2331 lsn_limit= soft_lsn_limit;
2332
2333 ulint n_flushed;
2334
2335 if (UNIV_UNLIKELY(lsn_limit != 0))
2336 {
2337 n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit);
2338 /* wake up buf_flush_wait() */
2339 pthread_cond_broadcast(&buf_pool.done_flush_list);
2340 goto try_checkpoint;
2341 }
2342 else if (idle_flush || !srv_adaptive_flushing)
2343 {
2344 n_flushed= buf_flush_list(srv_io_capacity);
2345 try_checkpoint:
2346 if (n_flushed)
2347 {
2348 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
2349 MONITOR_FLUSH_BACKGROUND_COUNT,
2350 MONITOR_FLUSH_BACKGROUND_PAGES,
2351 n_flushed);
2352 do_checkpoint:
2353 /* The periodic log_checkpoint() call here makes it harder to
2354 reproduce bugs in crash recovery or mariabackup --prepare, or
2355 in code that writes the redo log records. Omitting the call
2356 here should not affect correctness, because log_free_check()
2357 should still be invoking checkpoints when needed. */
2358 DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;);
2359
2360 if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL)
2361 log_checkpoint();
2362 }
2363 }
2364 else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages,
2365 oldest_lsn,
2366 dirty_blocks,
2367 dirty_pct))
2368 {
2369 page_cleaner.flush_pass++;
2370 const ulint tm= ut_time_ms();
2371 last_pages= n_flushed= buf_flush_list(n);
2372 page_cleaner.flush_time+= ut_time_ms() - tm;
2373
2374 if (n_flushed)
2375 {
2376 MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
2377 MONITOR_FLUSH_ADAPTIVE_COUNT,
2378 MONITOR_FLUSH_ADAPTIVE_PAGES,
2379 n_flushed);
2380 goto do_checkpoint;
2381 }
2382 }
2383 else if (buf_flush_async_lsn <= oldest_lsn)
2384 {
2385 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2386 goto unemployed;
2387 }
2388
2389 #ifdef UNIV_DEBUG
2390 while (innodb_page_cleaner_disabled_debug && !buf_flush_sync_lsn &&
2391 srv_shutdown_state == SRV_SHUTDOWN_NONE)
2392 os_thread_sleep(100000);
2393 #endif /* UNIV_DEBUG */
2394
2395 #ifndef DBUG_OFF
2396 next:
2397 #endif /* !DBUG_OFF */
2398 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2399
2400 /* When idle flushing kicks in, the page_cleaner is marked active.
2401 Reset it back to idle, since it was made active only as part of the
2402 idle flushing stage. */
2403 if (idle_flush)
2404 buf_pool.page_cleaner_set_idle(true);
2405 }
2406
2407 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2408
2409 if (srv_fast_shutdown != 2)
2410 {
2411 buf_flush_wait_batch_end_acquiring_mutex(true);
2412 buf_flush_wait_batch_end_acquiring_mutex(false);
2413 }
2414
2415 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2416 lsn_limit= buf_flush_sync_lsn;
2417 if (UNIV_UNLIKELY(lsn_limit != 0))
2418 goto furious_flush;
2419 buf_page_cleaner_is_active= false;
2420 pthread_cond_broadcast(&buf_pool.done_flush_list);
2421 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2422
2423 my_thread_end();
2424 /* We count the number of threads in os_thread_exit(). A created
2425 thread should always use that to exit instead of returning. */
2426 os_thread_exit();
2427
2428 OS_THREAD_DUMMY_RETURN;
2429 }
2430
2431 /** Initialize page_cleaner. */
2432 ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
2433 {
2434 ut_ad(!buf_page_cleaner_is_active);
2435 ut_ad(srv_operation == SRV_OPERATION_NORMAL ||
2436 srv_operation == SRV_OPERATION_RESTORE ||
2437 srv_operation == SRV_OPERATION_RESTORE_EXPORT);
2438 buf_flush_async_lsn= 0;
2439 buf_flush_sync_lsn= 0;
2440 buf_page_cleaner_is_active= true;
2441 os_thread_create(buf_flush_page_cleaner);
2442 }
2443
2444 #if defined(HAVE_SYSTEMD) && !defined(EMBEDDED_LIBRARY)
2445 /** @return the number of dirty pages in the buffer pool */
2446 static ulint buf_flush_list_length()
2447 {
2448 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2449 const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
2450 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2451 return len;
2452 }
2453 #endif
2454
2455 /** Flush the buffer pool on shutdown. */
2456 ATTRIBUTE_COLD void buf_flush_buffer_pool()
2457 {
2458 ut_ad(!buf_page_cleaner_is_active);
2459 ut_ad(!buf_flush_sync_lsn);
2460
2461 service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
2462 "Waiting to flush the buffer pool");
2463
2464 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2465
2466 while (buf_pool.get_oldest_modification(0))
2467 {
2468 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2469 buf_flush_list(srv_max_io_capacity);
2470 if (buf_pool.n_flush_list())
2471 {
2472 timespec abstime;
2473 service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
2474 "Waiting to flush " ULINTPF " pages",
2475 buf_flush_list_length());
2476 set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2);
2477 mysql_mutex_lock(&buf_pool.mutex);
2478 while (buf_pool.n_flush_list_)
2479 my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
2480 &abstime);
2481 mysql_mutex_unlock(&buf_pool.mutex);
2482 }
2483 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2484 }
2485
2486 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2487 ut_ad(!buf_pool.any_io_pending());
2488 }
2489
2490 /** Synchronously flush dirty blocks during recv_sys_t::apply().
2491 NOTE: The calling thread is not allowed to hold any buffer page latches! */
2492 void buf_flush_sync_batch(lsn_t lsn)
2493 {
2494 thd_wait_begin(nullptr, THD_WAIT_DISKIO);
2495 tpool::tpool_wait_begin();
2496 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2497 buf_flush_wait(lsn);
2498 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2499 tpool::tpool_wait_end();
2500 thd_wait_end(nullptr);
2501 }
2502
2503 /** Synchronously flush dirty blocks.
2504 NOTE: The calling thread is not allowed to hold any buffer page latches! */
2505 void buf_flush_sync()
2506 {
2507 ut_ad(!sync_check_iterate(dict_sync_check()));
2508
2509 if (recv_recovery_is_on())
2510 recv_sys.apply(true);
2511
2512 thd_wait_begin(nullptr, THD_WAIT_DISKIO);
2513 tpool::tpool_wait_begin();
2514 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2515 for (;;)
2516 {
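/* Repeat until the LSN stops moving: while we were waiting for the flush,
other threads may have generated more redo and dirtied further pages. */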
2517 const lsn_t lsn= log_sys.get_lsn();
2518 buf_flush_wait(lsn);
2519 if (lsn == log_sys.get_lsn())
2520 break;
2521 }
2522 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2523 tpool::tpool_wait_end();
2524 thd_wait_end(nullptr);
2525 }
2526
2527 #ifdef UNIV_DEBUG
2528 /** Functor to validate the flush list. */
2529 struct Check {
2530 void operator()(const buf_page_t* elem) const
2531 {
2532 ut_ad(elem->oldest_modification());
2533 ut_ad(!fsp_is_system_temporary(elem->id().space()));
2534 }
2535 };
2536
2537 /** Validate the flush list. */
2538 static void buf_flush_validate_low()
2539 {
2540 buf_page_t* bpage;
2541
2542 mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
2543
2544 ut_list_validate(buf_pool.flush_list, Check());
2545
2546 bpage = UT_LIST_GET_FIRST(buf_pool.flush_list);
2547
2548 while (bpage != NULL) {
2549 const lsn_t om = bpage->oldest_modification();
2550 /* A page in buf_pool.flush_list can be in
2551 BUF_BLOCK_REMOVE_HASH state. This happens when a page
2552 is in the middle of being relocated. In that case the
2553 original descriptor can have this state and still be
2554 in the flush list waiting to acquire the
2555 buf_pool.flush_list_mutex to complete the relocation. */
2556 ut_d(const auto s= bpage->state());
2557 ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE
2558 || s == BUF_BLOCK_REMOVE_HASH);
2559 ut_ad(om == 1 || om > 2);
2560
2561 bpage = UT_LIST_GET_NEXT(list, bpage);
2562 ut_ad(om == 1 || !bpage || recv_recovery_is_on()
2563 || om >= bpage->oldest_modification());
2564 }
2565 }
2566
2567 /** Validate the flush list. */
2568 void buf_flush_validate()
2569 {
2570 mysql_mutex_lock(&buf_pool.flush_list_mutex);
2571 buf_flush_validate_low();
2572 mysql_mutex_unlock(&buf_pool.flush_list_mutex);
2573 }
2574 #endif /* UNIV_DEBUG */
2575