1 /*****************************************************************************
2
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Google Inc.
5 Copyright (c) 2014, 2021, MariaDB Corporation.
6
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; version 2 of the License.
16
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file log/log0log.cc
29 Database log
30
31 Created 12/9/1995 Heikki Tuuri
32 *******************************************************/
33
34 #include "univ.i"
35 #include <debug_sync.h>
36 #include <my_service_manager.h>
37
38 #include "log0log.h"
39 #include "log0crypt.h"
40 #include "mem0mem.h"
41 #include "buf0buf.h"
42 #include "buf0flu.h"
43 #include "lock0lock.h"
44 #include "log0recv.h"
45 #include "fil0fil.h"
46 #include "dict0boot.h"
47 #include "dict0stats_bg.h"
48 #include "btr0defragment.h"
49 #include "srv0srv.h"
50 #include "srv0start.h"
51 #include "trx0sys.h"
52 #include "trx0trx.h"
53 #include "trx0roll.h"
54 #include "srv0mon.h"
55 #include "sync0sync.h"
56
57 /*
58 General philosophy of InnoDB redo-logs:
59
60 1) Every change to a contents of a data page must be done
61 through mtr, which in mtr_commit() writes log records
62 to the InnoDB redo log.
63
64 2) Normally these changes are performed using a mlog_write_ulint()
65 or similar function.
66
67 3) In some page level operations only a code number of a
68 c-function and its parameters are written to the log to
69 reduce the size of the log.
70
71 3a) You should not add parameters to these kind of functions
72 (e.g. trx_undo_header_create())
73
74 3b) You should not add such functionality which either change
75 working when compared with the old or are dependent on data
76 outside of the page. These kind of functions should implement
77 self-contained page transformation and it should be unchanged
78 if you don't have very essential reasons to change log
79 semantics or format.
80
81 */
82
83 /** Redo log system */
84 log_t log_sys;
85
86 /** Whether to require checksums on the redo log pages */
87 my_bool innodb_log_checksums;
88
89 /* Next log block number to do dummy record filling if no log records written
90 for a while */
91 static ulint next_lbn_to_pad = 0;
92
93 /* These control how often we print warnings if the last checkpoint is too
94 old */
95 static bool log_has_printed_chkp_warning = false;
96 static time_t log_last_warning_time;
97
98 static bool log_has_printed_chkp_margine_warning = false;
99 static time_t log_last_margine_warning_time;
100
101 /* A margin for free space in the log buffer before a log entry is catenated */
102 #define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE)
103
104 /* Margins for free space in the log buffer after a log entry is catenated */
105 #define LOG_BUF_FLUSH_RATIO 2
106 #define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN \
107 + (4U << srv_page_size_shift))
108
109 /* This parameter controls asynchronous making of a new checkpoint; the value
110 should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
111
112 #define LOG_POOL_CHECKPOINT_RATIO_ASYNC 32
113
114 /* This parameter controls synchronous preflushing of modified buffer pages */
115 #define LOG_POOL_PREFLUSH_RATIO_SYNC 16
116
117 /* The same ratio for asynchronous preflushing; this value should be less than
118 the previous */
119 #define LOG_POOL_PREFLUSH_RATIO_ASYNC 8
120
121 /* Codes used in unlocking flush latches */
122 #define LOG_UNLOCK_NONE_FLUSHED_LOCK 1
123 #define LOG_UNLOCK_FLUSH_LOCK 2
124
125 /** Event to wake up log_scrub_thread */
126 os_event_t log_scrub_event;
127 /** Whether log_scrub_thread is active */
128 bool log_scrub_thread_active;
129
130 extern "C" UNIV_INTERN
131 os_thread_ret_t
132 DECLARE_THREAD(log_scrub_thread)(void*);
133
134 /****************************************************************//**
135 Returns the oldest modified block lsn in the pool, or log_sys.lsn if none
136 exists.
137 @return LSN of oldest modification */
138 static
139 lsn_t
log_buf_pool_get_oldest_modification(void)140 log_buf_pool_get_oldest_modification(void)
141 /*======================================*/
142 {
143 lsn_t lsn;
144
145 ut_ad(log_mutex_own());
146
147 lsn = buf_pool_get_oldest_modification();
148
149 if (!lsn) {
150
151 lsn = log_sys.lsn;
152 }
153
154 return(lsn);
155 }
156
157 /** Extends the log buffer.
158 @param[in] len requested minimum size in bytes */
log_buffer_extend(ulong len)159 void log_buffer_extend(ulong len)
160 {
161 const ulong new_buf_size = ut_calc_align(len, srv_page_size);
162 byte* new_buf = static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
163 TRASH_ALLOC(new_buf, new_buf_size);
164 byte* new_flush_buf =
165 static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
166 TRASH_ALLOC(new_flush_buf, new_buf_size);
167
168 log_mutex_enter();
169
170 if (len <= srv_log_buffer_size) {
171 /* Already extended enough by the others */
172 log_mutex_exit();
173 ut_free_dodump(new_buf, new_buf_size);
174 ut_free_dodump(new_flush_buf, new_buf_size);
175 return;
176 }
177
178 ib::warn() << "The redo log transaction size " << len <<
179 " exceeds innodb_log_buffer_size="
180 << srv_log_buffer_size << " / 2). Trying to extend it.";
181
182 byte* old_buf = log_sys.buf;
183 byte* old_flush_buf = log_sys.flush_buf;
184 const ulong old_buf_size = srv_log_buffer_size;
185
186 srv_log_buffer_size = new_buf_size;
187 log_sys.buf = new_buf;
188 log_sys.flush_buf = new_flush_buf;
189 memcpy(new_buf, old_buf, log_sys.buf_free);
190
191 log_sys.max_buf_free = new_buf_size / LOG_BUF_FLUSH_RATIO
192 - LOG_BUF_FLUSH_MARGIN;
193
194 log_mutex_exit();
195
196 ut_free_dodump(old_buf, old_buf_size);
197 ut_free_dodump(old_flush_buf, old_buf_size);
198
199 ib::info() << "innodb_log_buffer_size was extended to "
200 << new_buf_size << ".";
201 }
202
203 /** Calculate actual length in redo buffer and file including
204 block header and trailer.
205 @param[in] len length to write
206 @return actual length to write including header and trailer. */
207 static inline
208 ulint
log_calculate_actual_len(ulint len)209 log_calculate_actual_len(
210 ulint len)
211 {
212 ut_ad(log_mutex_own());
213
214 const ulint framing_size = log_sys.framing_size();
215 /* actual length stored per block */
216 const ulint len_per_blk = OS_FILE_LOG_BLOCK_SIZE - framing_size;
217
218 /* actual data length in last block already written */
219 ulint extra_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE);
220
221 ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
222 extra_len -= LOG_BLOCK_HDR_SIZE;
223
224 /* total extra length for block header and trailer */
225 extra_len = ((len + extra_len) / len_per_blk) * framing_size;
226
227 return(len + extra_len);
228 }
229
/** Check margin not to overwrite transaction log from the last checkpoint.
If would estimate the log write to exceed the log_group_capacity,
waits for the checkpoint is done enough.
@param[in]	len	length of the data to be written */

void
log_margin_checkpoint_age(
	ulint	len)
{
	/* Total space the write will occupy, including block
	headers and trailers. */
	ulint	margin = log_calculate_actual_len(len);

	ut_ad(log_mutex_own());

	if (margin > log_sys.log_group_capacity) {
		/* return with warning output to avoid deadlock */
		if (!log_has_printed_chkp_margine_warning
		    || difftime(time(NULL),
				log_last_margine_warning_time) > 15) {
			log_has_printed_chkp_margine_warning = true;
			log_last_margine_warning_time = time(NULL);

			ib::error() << "The transaction log files are too"
				" small for the single transaction log (size="
				<< len << "). So, the last checkpoint age"
				" might exceed the log group capacity "
				<< log_sys.log_group_capacity << ".";
		}

		return;
	}

	/* Our margin check should ensure that we never reach this condition.
	Try to do checkpoint once. We cannot keep waiting here as it might
	result in hang in case the current mtr has latch on oldest lsn */
	if (log_sys.lsn - log_sys.last_checkpoint_lsn + margin
	    > log_sys.log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Makes checkpoint. */

		bool	flushed_enough = false;

		/* If enough pages are already flushed so that a
		checkpoint can advance far enough, skip the sleep
		below. */
		if (log_sys.lsn - log_buf_pool_get_oldest_modification()
		    + margin
		    <= log_sys.log_group_capacity) {
			flushed_enough = true;
		}

		log_sys.check_flush_or_checkpoint = true;
		/* log_checkpoint() below must not be invoked while
		holding the log mutex. */
		log_mutex_exit();

		DEBUG_SYNC_C("margin_checkpoint_age_rescue");

		if (!flushed_enough) {
			/* Give page flushing some time to make
			progress before forcing the checkpoint. */
			os_thread_sleep(100000);
		}
		log_checkpoint(true);

		log_mutex_enter();
	}

	return;
}
292
293 /** Open the log for log_write_low. The log must be closed with log_close.
294 @param[in] len length of the data to be written
295 @return start lsn of the log record */
296 lsn_t
log_reserve_and_open(ulint len)297 log_reserve_and_open(
298 ulint len)
299 {
300 ulint len_upper_limit;
301 #ifdef UNIV_DEBUG
302 ulint count = 0;
303 #endif /* UNIV_DEBUG */
304
305 loop:
306 ut_ad(log_mutex_own());
307
308 /* Calculate an upper limit for the space the string may take in the
309 log buffer */
310
311 len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
312 + (5 * len) / 4;
313
314 if (log_sys.buf_free + len_upper_limit > srv_log_buffer_size) {
315 log_mutex_exit();
316
317 DEBUG_SYNC_C("log_buf_size_exceeded");
318
319 /* Not enough free space, do a write of the log buffer */
320 log_buffer_sync_in_background(false);
321
322 srv_stats.log_waits.inc();
323
324 ut_ad(++count < 50);
325
326 log_mutex_enter();
327 goto loop;
328 }
329
330 return(log_sys.lsn);
331 }
332
/************************************************************//**
Writes to the log the string given. It is assumed that the caller holds the
log mutex. */
void
log_write_low(
/*==========*/
	const byte*	str,		/*!< in: string */
	ulint		str_len)	/*!< in: string length */
{
	ulint	len;

	ut_ad(log_mutex_own());
	/* Offset within a block at which the trailer starts; the
	payload must never extend past it. */
	const ulint trailer_offset = log_sys.trailer_offset();
part_loop:
	/* Calculate a part length */

	/* Data length the current block would reach if the whole
	remaining string were appended to it. */
	ulint data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;

	if (data_len <= trailer_offset) {

		/* The string fits within the current log block */

		len = str_len;
	} else {
		data_len = trailer_offset;

		/* Copy only as much as fills the current block. */
		len = trailer_offset
			- log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
	}

	memcpy(log_sys.buf + log_sys.buf_free, str, len);

	str_len -= len;
	str = str + len;

	/* Start of the log block that buf_free points into. */
	byte* log_block = static_cast<byte*>(
		ut_align_down(log_sys.buf + log_sys.buf_free,
			      OS_FILE_LOG_BLOCK_SIZE));

	log_block_set_data_len(log_block, data_len);

	if (data_len == trailer_offset) {
		/* This block became full */
		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
		log_block_set_checkpoint_no(log_block,
					    log_sys.next_checkpoint_no);
		/* Advance the lsn over the header and trailer of the
		completed block as well. */
		len += log_sys.framing_size();

		log_sys.lsn += len;

		/* Initialize the next block header */
		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE,
			       log_sys.lsn);
	} else {
		log_sys.lsn += len;
	}

	log_sys.buf_free += ulong(len);

	ut_ad(log_sys.buf_free <= srv_log_buffer_size);

	if (str_len > 0) {
		/* Write the remainder into the following block(s). */
		goto part_loop;
	}

	srv_stats.log_write_requests.inc();
}
400
/************************************************************//**
Closes the log.
@return lsn */
lsn_t
log_close(void)
/*===========*/
{
	byte*	log_block;
	ulint	first_rec_group;
	lsn_t	oldest_lsn;
	lsn_t	lsn;
	lsn_t	checkpoint_age;

	ut_ad(log_mutex_own());

	lsn = log_sys.lsn;

	/* The block into which the current mtr log record group
	was written. */
	log_block = static_cast<byte*>(
		ut_align_down(log_sys.buf + log_sys.buf_free,
			      OS_FILE_LOG_BLOCK_SIZE));

	first_rec_group = log_block_get_first_rec_group(log_block);

	if (first_rec_group == 0) {
		/* We initialized a new log block which was not written
		full by the current mtr: the next mtr log record group
		will start within this block at the offset data_len */

		log_block_set_first_rec_group(
			log_block, log_block_get_data_len(log_block));
	}

	if (log_sys.buf_free > log_sys.max_buf_free) {
		/* The log buffer is filling up; request a flush. */
		log_sys.check_flush_or_checkpoint = true;
	}

	checkpoint_age = lsn - log_sys.last_checkpoint_lsn;

	if (checkpoint_age >= log_sys.log_group_capacity) {
		DBUG_EXECUTE_IF(
			"print_all_chkp_warnings",
			log_has_printed_chkp_warning = false;);

		/* Warn at most once per 15 seconds to avoid flooding
		the error log. */
		if (!log_has_printed_chkp_warning
		    || difftime(time(NULL), log_last_warning_time) > 15) {

			log_has_printed_chkp_warning = true;
			log_last_warning_time = time(NULL);

			ib::error() << "The age of the last checkpoint is "
				<< checkpoint_age << ", which exceeds the log"
				" group capacity "
				<< log_sys.log_group_capacity
				<< ".";
		}
	}

	if (checkpoint_age <= log_sys.max_modified_age_sync) {
		goto function_exit;
	}

	oldest_lsn = buf_pool_get_oldest_modification();

	/* Request flushing/checkpointing if the oldest dirty page or
	the last checkpoint lags too far behind the current lsn. */
	if (!oldest_lsn
	    || lsn - oldest_lsn > log_sys.max_modified_age_sync
	    || checkpoint_age > log_sys.max_checkpoint_age_async) {
		log_sys.check_flush_or_checkpoint = true;
	}
function_exit:

	return(lsn);
}
473
474 /** Calculate the recommended highest values for lsn - last_checkpoint_lsn
475 and lsn - buf_get_oldest_modification().
476 @param[in] file_size requested innodb_log_file_size
477 @retval true on success
478 @retval false if the smallest log group is too small to
479 accommodate the number of OS threads in the database server */
480 bool
log_set_capacity(ulonglong file_size)481 log_set_capacity(ulonglong file_size)
482 {
483 lsn_t margin;
484 ulint free;
485
486 lsn_t smallest_capacity = (file_size - LOG_FILE_HDR_SIZE)
487 * srv_n_log_files;
488 /* Add extra safety */
489 smallest_capacity -= smallest_capacity / 10;
490
491 /* For each OS thread we must reserve so much free space in the
492 smallest log group that it can accommodate the log entries produced
493 by single query steps: running out of free log space is a serious
494 system error which requires rebooting the database. */
495
496 free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
497 + LOG_CHECKPOINT_EXTRA_FREE;
498 if (free >= smallest_capacity / 2) {
499 ib::error() << "Cannot continue operation. ib_logfiles are too"
500 " small for innodb_thread_concurrency="
501 << srv_thread_concurrency << ". The combined size of"
502 " ib_logfiles should be bigger than"
503 " 200 kB * innodb_thread_concurrency. "
504 << INNODB_PARAMETERS_MSG;
505 return(false);
506 }
507
508 margin = smallest_capacity - free;
509 margin = margin - margin / 10; /* Add still some extra safety */
510
511 log_mutex_enter();
512
513 log_sys.log_group_capacity = smallest_capacity;
514
515 log_sys.max_modified_age_async = margin
516 - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
517 log_sys.max_modified_age_sync = margin
518 - margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
519
520 log_sys.max_checkpoint_age_async = margin - margin
521 / LOG_POOL_CHECKPOINT_RATIO_ASYNC;
522 log_sys.max_checkpoint_age = margin;
523
524 log_mutex_exit();
525
526 return(true);
527 }
528
/** Initialize the redo log subsystem. */
void log_t::create()
{
  ut_ad(this == &log_sys);
  ut_ad(!is_initialised());
  m_initialised= true;

  mutex_create(LATCH_ID_LOG_SYS, &mutex);
  mutex_create(LATCH_ID_LOG_WRITE, &write_mutex);
  mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_flush_order_mutex);

  /* Start the lsn from one log block from zero: this way every
  log record has a non-zero start lsn, a fact which we will use */

  lsn= LOG_START_LSN;

  ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE);
  ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift);

  /* Two buffers: one filled by mtr commits, one being written out
  to the file; log_buffer_switch() swaps them. */
  buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
  TRASH_ALLOC(buf, srv_log_buffer_size);
  flush_buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
  TRASH_ALLOC(flush_buf, srv_log_buffer_size);

  max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO -
    LOG_BUF_FLUSH_MARGIN;
  check_flush_or_checkpoint= true;

  n_log_ios_old= n_log_ios;
  last_printout_time= time(NULL);

  buf_next_to_write= 0;
  write_lsn= lsn;
  flushed_to_disk_lsn= 0;
  n_pending_flushes= 0;
  /* flush_event is kept set whenever no log flush is in progress. */
  flush_event = os_event_create("log_flush_event");
  os_event_set(flush_event);
  n_log_ios= 0;
  n_log_ios_old= 0;
  log_group_capacity= 0;
  max_modified_age_async= 0;
  max_modified_age_sync= 0;
  max_checkpoint_age_async= 0;
  max_checkpoint_age= 0;
  next_checkpoint_no= 0;
  next_checkpoint_lsn= 0;
  append_on_checkpoint= NULL;
  n_pending_checkpoint_writes= 0;

  last_checkpoint_lsn= lsn;
  rw_lock_create(checkpoint_lock_key, &checkpoint_lock, SYNC_NO_ORDER_CHECK);

  /* Format the very first log block in the buffer. */
  log_block_init(buf, lsn);
  log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE);

  buf_free= LOG_BLOCK_HDR_SIZE;
  lsn= LOG_START_LSN + LOG_BLOCK_HDR_SIZE;

  MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, lsn - last_checkpoint_lsn);

  /* Start log_scrub_thread when log scrubbing is enabled and the
  server is not read-only. */
  log_scrub_thread_active= !srv_read_only_mode && srv_scrub_log;
  if (log_scrub_thread_active) {
    log_scrub_event= os_event_create("log_scrub_event");
    os_thread_create(log_scrub_thread, NULL, NULL);
  }
}
595
596 /** Initialize the redo log.
597 @param[in] n_files number of files */
create(ulint n_files)598 void log_t::files::create(ulint n_files)
599 {
600 ut_ad(n_files <= SRV_N_LOG_FILES_MAX);
601 ut_ad(this == &log_sys.log);
602 ut_ad(log_sys.is_initialised());
603
604 this->n_files= n_files;
605 format= srv_encrypt_log ? log_t::FORMAT_ENC_10_4 : log_t::FORMAT_10_4;
606 subformat= 2;
607 file_size= srv_log_file_size;
608 lsn= LOG_START_LSN;
609 lsn_offset= LOG_FILE_HDR_SIZE;
610 }
611
/******************************************************//**
Writes a log file header to a log file space. */
static
void
log_file_header_flush(
	ulint	nth_file,	/*!< in: header to the nth file in the
				log file space */
	lsn_t	start_lsn)	/*!< in: log file data starts at this
				lsn */
{
	lsn_t	dest_offset;

	ut_ad(log_write_mutex_own());
	ut_ad(!recv_no_log_write);
	ut_a(nth_file < log_sys.log.n_files);
	ut_ad(log_sys.log.format == log_t::FORMAT_10_4
	      || log_sys.log.format == log_t::FORMAT_ENC_10_4);

	// man 2 open suggests this buffer to be aligned by 512 for O_DIRECT
	MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE)
	byte buf[OS_FILE_LOG_BLOCK_SIZE] = {0};

	/* Compose the header block: format, subformat, start lsn and
	creator string, followed by a CRC-32 checksum of the block. */
	mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format);
	mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat);
	mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
	strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
	       LOG_HEADER_CREATOR_CURRENT);
	ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
	      >= sizeof LOG_HEADER_CREATOR_CURRENT);
	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));

	/* Byte offset of the header of the nth file within the
	log file space. */
	dest_offset = nth_file * log_sys.log.file_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF
			      " file " ULINTPF " header",
			      start_lsn, nth_file));

	log_sys.n_log_ios++;

	srv_stats.os_log_pending_writes.inc();

	const ulint	page_no = ulint(dest_offset >> srv_page_size_shift);

	fil_io(IORequestLogWrite, true,
	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
	       0,
	       ulint(dest_offset & (srv_page_size - 1)),
	       OS_FILE_LOG_BLOCK_SIZE, buf, NULL);

	srv_stats.os_log_pending_writes.dec();
}
663
664 /******************************************************//**
665 Stores a 4-byte checksum to the trailer checksum field of a log block
666 before writing it to a log file. This checksum is used in recovery to
667 check the consistency of a log block. */
668 static
669 void
log_block_store_checksum(byte * block)670 log_block_store_checksum(
671 /*=====================*/
672 byte* block) /*!< in/out: pointer to a log block */
673 {
674 log_block_set_checksum(block, log_block_calc_checksum_crc32(block));
675 }
676
/******************************************************//**
Writes a buffer to a log file. */
static
void
log_write_buf(
	byte*		buf,		/*!< in: buffer */
	ulint		len,		/*!< in: buffer len; must be divisible
					by OS_FILE_LOG_BLOCK_SIZE */
#ifdef UNIV_DEBUG
	ulint		pad_len,	/*!< in: pad len in the buffer len */
#endif /* UNIV_DEBUG */
	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
					be divisible by
					OS_FILE_LOG_BLOCK_SIZE */
	ulint		new_data_offset)/*!< in: start offset of new data in
					buf: this parameter is used to decide
					if we have to write a new log file
					header */
{
	ulint		write_len;
	bool		write_header	= new_data_offset == 0;
	lsn_t		next_offset;
	ulint		i;

	ut_ad(log_write_mutex_own());
	ut_ad(!recv_no_log_write);
	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);

loop:
	if (len == 0) {

		return;
	}

	/* Physical offset in the log file space for start_lsn. */
	next_offset = log_sys.log.calc_lsn_offset(start_lsn);

	if (write_header
	    && next_offset % log_sys.log.file_size == LOG_FILE_HDR_SIZE) {
		/* We start to write a new log file instance in the group */

		ut_a(next_offset / log_sys.log.file_size <= ULINT_MAX);

		log_file_header_flush(
			ulint(next_offset / log_sys.log.file_size), start_lsn);
		srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);

		srv_stats.log_writes.inc();
	}

	/* Limit this write to the end of the current log file; any
	remainder is written on the next loop iteration. */
	if ((next_offset % log_sys.log.file_size) + len
	    > log_sys.log.file_size) {
		/* if the above condition holds, then the below expression
		is < len which is ulint, so the typecast is ok */
		write_len = ulint(log_sys.log.file_size
				  - (next_offset % log_sys.log.file_size));
	} else {
		write_len = len;
	}

	DBUG_PRINT("ib_log",
		   ("write " LSN_PF " to " LSN_PF
		    ": len " ULINTPF
		    " blocks " ULINTPF ".." ULINTPF,
		    start_lsn, next_offset,
		    write_len,
		    log_block_get_hdr_no(buf),
		    log_block_get_hdr_no(
			    buf + write_len
			    - OS_FILE_LOG_BLOCK_SIZE)));

	ut_ad(pad_len >= len
	      || log_block_get_hdr_no(buf)
	      == log_block_convert_lsn_to_no(start_lsn));

	/* Calculate the checksums for each log block and write them to
	the trailer fields of the log blocks */

	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
#ifdef UNIV_DEBUG
		ulint hdr_no_2 = log_block_get_hdr_no(buf) + i;
		DBUG_EXECUTE_IF("innodb_small_log_block_no_limit",
				hdr_no_2 = ((hdr_no_2 - 1) & 0xFUL) + 1;);
#endif
		ut_ad(pad_len >= len
			|| i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
			|| log_block_get_hdr_no(buf + i * OS_FILE_LOG_BLOCK_SIZE) == hdr_no_2);
		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
	}

	log_sys.n_log_ios++;

	srv_stats.os_log_pending_writes.inc();

	ut_a((next_offset >> srv_page_size_shift) <= ULINT_MAX);

	const ulint	page_no = ulint(next_offset >> srv_page_size_shift);

	fil_io(IORequestLogWrite, true,
	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
	       0,
	       ulint(next_offset & (srv_page_size - 1)), write_len, buf, NULL);

	srv_stats.os_log_pending_writes.dec();

	srv_stats.os_log_written.add(write_len);
	srv_stats.log_writes.inc();

	if (write_len < len) {
		/* The write wrapped at a log file boundary: advance
		past the written part and write the next file's header
		before continuing. */
		start_lsn += write_len;
		len -= write_len;
		buf += write_len;

		write_header = true;

		goto loop;
	}
}
795
796 /** Flush the recently written changes to the log file.
797 and invoke log_mutex_enter(). */
log_write_flush_to_disk_low()798 static void log_write_flush_to_disk_low()
799 {
800 ut_a(log_sys.n_pending_flushes);
801
802 bool do_flush = srv_file_flush_method != SRV_O_DSYNC;
803
804 if (do_flush) {
805 fil_flush(SRV_LOG_SPACE_FIRST_ID);
806 }
807
808 log_mutex_enter();
809 if (do_flush) {
810 log_sys.flushed_to_disk_lsn = log_sys.current_flush_lsn;
811 }
812
813 log_sys.n_pending_flushes--;
814
815 os_event_set(log_sys.flush_event);
816 }
817
818 /** Swap log buffers, and copy the content of last block
819 from old buf to the head of the new buf. Thus, buf_free and
820 buf_next_to_write would be changed accordingly */
821 static inline
822 void
log_buffer_switch()823 log_buffer_switch()
824 {
825 ut_ad(log_mutex_own());
826 ut_ad(log_write_mutex_own());
827
828 ulong area_end = ut_calc_align(
829 log_sys.buf_free, ulong(OS_FILE_LOG_BLOCK_SIZE));
830
831 /* Copy the last block to new buf */
832 ut_memcpy(log_sys.flush_buf,
833 log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
834 OS_FILE_LOG_BLOCK_SIZE);
835
836 std::swap(log_sys.buf, log_sys.flush_buf);
837
838 log_sys.buf_free %= OS_FILE_LOG_BLOCK_SIZE;
839 log_sys.buf_next_to_write = log_sys.buf_free;
840 }
841
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@param[in]	lsn		log sequence number that should be
included in the redo log file write
@param[in]	flush_to_disk	whether the written log should also
be flushed to the file system
@param[in]	rotate_key	whether to rotate the encryption key */
void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
{
#ifdef UNIV_DEBUG
	ulint		loop_count	= 0;
#endif /* UNIV_DEBUG */
	byte*           write_buf;
	lsn_t           write_lsn;

	ut_ad(!srv_read_only_mode);
	ut_ad(!rotate_key || flush_to_disk);

	if (recv_no_ibuf_operations) {
		/* Recovery is running and no operations on the log files are
		allowed yet (the variable name .._no_ibuf_.. is misleading) */

		return;
	}

loop:
	ut_ad(++loop_count < 128);

#if UNIV_WORD_SIZE > 7
	/* We can do a dirty read of LSN. */
	/* NOTE: Currently doesn't do dirty read for
	(flush_to_disk == true) case, because the log_mutex
	contention also works as the arbitrator for write-IO
	(fsync) bandwidth between log files and data files. */
	if (!flush_to_disk && log_sys.write_lsn >= lsn) {
		return;
	}
#endif

	log_write_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* The lsn up to which writing (or flushing, respectively)
	has already completed. */
	lsn_t	limit_lsn = flush_to_disk
		? log_sys.flushed_to_disk_lsn
		: log_sys.write_lsn;

	if (limit_lsn >= lsn) {
		log_write_mutex_exit();
		return;
	}

	/* If it is a write call we should just go ahead and do it
	as we checked that write_lsn is not where we'd like it to
	be. If we have to flush as well then we check if there is a
	pending flush and based on that we wait for it to finish
	before proceeding further. */
	if (flush_to_disk
	    && (log_sys.n_pending_flushes > 0
		|| !os_event_is_set(log_sys.flush_event))) {
		/* Figure out if the current flush will do the job
		for us. */
		bool work_done = log_sys.current_flush_lsn >= lsn;

		log_write_mutex_exit();

		os_event_wait(log_sys.flush_event);

		if (work_done) {
			return;
		} else {
			goto loop;
		}
	}

	log_mutex_enter();
	if (!flush_to_disk
	    && log_sys.buf_free == log_sys.buf_next_to_write) {
		/* Nothing to write and no flush to disk requested */
		log_mutex_exit_all();
		return;
	}

	ulint		start_offset;
	ulint		end_offset;
	ulint		area_start;
	ulint		area_end;
	ulong		write_ahead_size = srv_log_write_ahead_size;
	ulint		pad_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
			      log_sys.write_lsn,
			      log_sys.lsn));
	if (flush_to_disk) {
		/* Mark a flush as in progress; flush_event is set
		again by log_write_flush_to_disk_low(). */
		log_sys.n_pending_flushes++;
		log_sys.current_flush_lsn = log_sys.lsn;
		os_event_reset(log_sys.flush_event);

		if (log_sys.buf_free == log_sys.buf_next_to_write) {
			/* Nothing to write, flush only */
			log_mutex_exit_all();
			log_write_flush_to_disk_low();
			log_mutex_exit();
			return;
		}
	}

	start_offset = log_sys.buf_next_to_write;
	end_offset = log_sys.buf_free;

	/* Round the write area outward to complete log blocks. */
	area_start = ut_2pow_round(start_offset,
				   ulint(OS_FILE_LOG_BLOCK_SIZE));
	area_end = ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));

	ut_ad(area_end - area_start > 0);

	log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
	log_block_set_checkpoint_no(
		log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		log_sys.next_checkpoint_no);

	write_lsn = log_sys.lsn;
	write_buf = log_sys.buf;

	/* Swap the buffers so that mtr commits can continue into the
	other buffer while this one is being written out. */
	log_buffer_switch();

	log_sys.log.set_fields(log_sys.write_lsn);

	log_mutex_exit();
	/* Erase the end of the last log block. */
	memset(write_buf + end_offset, 0,
	       ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));

	/* Calculate pad_size if needed. */
	pad_size = 0;
	if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
		ulint	end_offset_in_unit;
		lsn_t	end_offset = log_sys.log.calc_lsn_offset(
			ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE));
		end_offset_in_unit = (ulint) (end_offset % write_ahead_size);

		if (end_offset_in_unit > 0
		    && (area_end - area_start) > end_offset_in_unit) {
			/* The first block in the unit was initialized
			after the last writing.
			Needs to be written padded data once. */
			pad_size = std::min<ulint>(
				ulint(write_ahead_size) - end_offset_in_unit,
				srv_log_buffer_size - area_end);
			::memset(write_buf + area_end, 0, pad_size);
		}
	}

	if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) {
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
					       "InnoDB log write: "
					       LSN_PF "," LSN_PF,
					       log_sys.write_lsn, lsn);
	}

	if (log_sys.is_encrypted()) {
		/* Encrypt the blocks in place before writing. */
		log_crypt(write_buf + area_start, log_sys.write_lsn,
			  area_end - area_start,
			  rotate_key ? LOG_ENCRYPT_ROTATE_KEY : LOG_ENCRYPT);
	}

	/* Do the write to the log files */
	log_write_buf(
		write_buf + area_start, area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
		pad_size,
#endif /* UNIV_DEBUG */
		ut_uint64_align_down(log_sys.write_lsn,
				     OS_FILE_LOG_BLOCK_SIZE),
		start_offset - area_start);
	srv_stats.log_padded.add(pad_size);
	log_sys.write_lsn = write_lsn;


	if (srv_file_flush_method == SRV_O_DSYNC) {
		/* O_SYNC means the OS did not buffer the log file at all:
		so we have also flushed to disk what we have written */
		log_sys.flushed_to_disk_lsn = log_sys.write_lsn;
	}

	log_write_mutex_exit();

	if (flush_to_disk) {
		log_write_flush_to_disk_low();
		ib_uint64_t flush_lsn = log_sys.flushed_to_disk_lsn;
		log_mutex_exit();

		/* Notify the server layer of the new flushed-to-disk
		lsn (see innobase_mysql_log_notify()). */
		innobase_mysql_log_notify(flush_lsn);
	}
}
1037
/** Write to the log file up to the last log entry.
@param[in]	sync	whether we want the written log
also to be flushed to disk. */
void log_buffer_flush_to_disk(bool sync)
{
	ut_ad(!srv_read_only_mode);
	/* Initiate a write (and optionally a flush) of everything that
	has been generated so far; log_get_lsn() reads log_sys.lsn
	under the log mutex. */
	log_write_up_to(log_get_lsn(), sync);
}
1046
1047
/** Durably write the log and release log_sys.mutex.
Called with both log_sys.mutex and the write mutex held; combines the
write and flush steps of log_write_up_to() into a single pass. */
ATTRIBUTE_COLD void log_write_and_flush()
{
  ut_ad(!srv_read_only_mode);
  ut_ad(!recv_no_log_write);
  ut_ad(!recv_recovery_is_on());

  /* The following code is adapted from log_write_up_to(). */
  DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
                        log_sys.write_lsn, log_sys.lsn));
  /* Announce the pending flush before doing any work, so that other
  threads waiting on flush_event see a flush in progress up to
  current_flush_lsn. */
  log_sys.n_pending_flushes++;
  log_sys.current_flush_lsn= log_sys.lsn;
  os_event_reset(log_sys.flush_event);
  ut_ad(log_sys.buf_free != log_sys.buf_next_to_write);
  ulint start_offset= log_sys.buf_next_to_write;
  ulint end_offset= log_sys.buf_free;
  /* Round the write range outward to whole log blocks. */
  ulint area_start= ut_2pow_round(start_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
  ulint area_end= ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
  ulong write_ahead_size= srv_log_write_ahead_size;

  log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
  log_block_set_checkpoint_no(log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
                              log_sys.next_checkpoint_no);
  lsn_t write_lsn= log_sys.lsn;
  byte *write_buf= log_sys.buf;

  ut_ad(area_end - area_start > 0);

  /* Swap the active log buffer so that mtr_commit() in other threads
  can continue appending while we write the old buffer out. */
  log_buffer_switch();

  log_sys.log.set_fields(log_sys.write_lsn);

  /* Erase the end of the last log block. */
  memset(write_buf + end_offset, 0,
         ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));
  /* Calculate pad_size if needed: pad up to the write-ahead unit
  boundary to avoid a read-modify-write on the next physical write. */
  ulint pad_size= 0;
  if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE)
  {
    lsn_t end_offset=
      log_sys.log.calc_lsn_offset(ut_uint64_align_up(write_lsn,
                                                     OS_FILE_LOG_BLOCK_SIZE));
    ulint end_offset_in_unit= (ulint) (end_offset % write_ahead_size);

    if (end_offset_in_unit && (area_end - area_start) > end_offset_in_unit)
    {
      /* The first block in the unit was initialized after the last
      writing. Needs to be written padded data once. */
      pad_size= std::min<ulint>(ulint(write_ahead_size) - end_offset_in_unit,
                                srv_log_buffer_size - area_end);
      memset(write_buf + area_end, 0, pad_size);
    }
  }

  if (log_sys.is_encrypted())
    log_crypt(write_buf + area_start, log_sys.write_lsn,
              area_end - area_start);

  /* Do the write to the log files */
  log_write_buf(write_buf + area_start, area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
                pad_size,
#endif /* UNIV_DEBUG */
                ut_uint64_align_down(log_sys.write_lsn,
                                     OS_FILE_LOG_BLOCK_SIZE),
                start_offset - area_start);
  srv_stats.log_padded.add(pad_size);
  log_sys.write_lsn= write_lsn;

  log_write_mutex_exit();

  /* Code adapted from log_write_flush_to_disk_low() */

  ut_a(log_sys.n_pending_flushes);

  /* With SRV_O_DSYNC the write itself was already durable;
  otherwise, flush the redo log tablespace explicitly. */
  if (srv_file_flush_method != SRV_O_DSYNC)
    fil_flush(SRV_LOG_SPACE_FIRST_ID);

  log_sys.flushed_to_disk_lsn= log_sys.current_flush_lsn;

  log_sys.n_pending_flushes--;

  /* Wake up any threads waiting for this flush to complete. */
  os_event_set(log_sys.flush_event);

  const lsn_t flush_lsn= log_sys.flushed_to_disk_lsn;
  log_mutex_exit();

  innobase_mysql_log_notify(flush_lsn);
}
1137
/****************************************************************//**
This functions writes the log buffer to the log file and if 'flush'
is set it forces a flush of the log file as well. This is meant to be
called from background master thread only as it does not wait for
the write (+ possible flush) to finish. */
void
log_buffer_sync_in_background(
/*==========================*/
	bool	flush)	/*!< in: flush the logs to disk */
{
	lsn_t	lsn;

	log_mutex_enter();

	lsn = log_sys.lsn;

	/* If a flush is already in progress and will cover our LSN,
	there is nothing for us to do. */
	if (flush
	    && log_sys.n_pending_flushes > 0
	    && log_sys.current_flush_lsn >= lsn) {
		/* The write + flush will write enough */
		log_mutex_exit();
		return;
	}

	log_mutex_exit();

	log_write_up_to(lsn, flush);
}
1166
1167 /********************************************************************
1168
1169 Tries to establish a big enough margin of free space in the log buffer, such
1170 that a new log entry can be catenated without an immediate need for a flush. */
1171 static
1172 void
log_flush_margin(void)1173 log_flush_margin(void)
1174 /*==================*/
1175 {
1176 lsn_t lsn = 0;
1177
1178 log_mutex_enter();
1179
1180 if (log_sys.buf_free > log_sys.max_buf_free) {
1181 /* We can write during flush */
1182 lsn = log_sys.lsn;
1183 }
1184
1185 log_mutex_exit();
1186
1187 if (lsn) {
1188 log_write_up_to(lsn, false);
1189 }
1190 }
1191
/** Advances the smallest lsn for which there are unflushed dirty blocks in the
buffer pool.
NOTE: this function may only be called if the calling thread owns no
synchronization objects!
@param[in]	new_oldest	try to advance oldest_modified_lsn at least to
this lsn
@return false if there was a flush batch of the same type running,
which means that we could not start this flush batch */
static bool log_preflush_pool_modified_pages(lsn_t new_oldest)
{
	bool	success;

	if (recv_recovery_is_on()) {
		/* If the recovery is running, we must first apply all
		log records to their respective file pages to get the
		right modify lsn values to these pages: otherwise, there
		might be pages on disk which are not yet recovered to the
		current lsn, and even after calling this function, we could
		not know how up-to-date the disk version of the database is,
		and we could not make a new checkpoint on the basis of the
		info on the buffer pool only. */
		recv_apply_hashed_log_recs(true);
	}

	if (new_oldest == LSN_MAX
	    || !buf_page_cleaner_is_active
	    || srv_is_being_started) {

		ulint	n_pages;

		/* Flush synchronously in this thread: either everything
		was requested (LSN_MAX), or there is no page cleaner
		thread available to do it for us. */
		success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages);

		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);

		if (!success) {
			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
		}

		MONITOR_INC_VALUE_CUMULATIVE(
			MONITOR_FLUSH_SYNC_TOTAL_PAGE,
			MONITOR_FLUSH_SYNC_COUNT,
			MONITOR_FLUSH_SYNC_PAGES,
			n_pages);
	} else {
		/* better to wait for flushed by page cleaner */

		if (srv_flush_sync) {
			/* wake page cleaner for IO burst */
			buf_flush_request_force(new_oldest);
		}

		buf_flush_wait_flushed(new_oldest);

		success = true;
	}

	return(success);
}
1250
/******************************************************//**
Completes a checkpoint: bumps the checkpoint number, publishes the new
last_checkpoint_lsn and releases the checkpoint_lock that
log_group_checkpoint() acquired for the write. */
static
void
log_complete_checkpoint(void)
/*=========================*/
{
	ut_ad(log_mutex_own());
	ut_ad(log_sys.n_pending_checkpoint_writes == 0);

	log_sys.next_checkpoint_no++;

	log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn;
	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
		    log_sys.lsn - log_sys.last_checkpoint_lsn);

	DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
			      ", flushed to " LSN_PF,
			      log_sys.last_checkpoint_lsn,
			      log_sys.flushed_to_disk_lsn));

	/* Release the X-latch taken in log_group_checkpoint(); threads
	waiting in log_write_checkpoint_info() are now unblocked. */
	rw_lock_x_unlock_gen(&(log_sys.checkpoint_lock), LOG_CHECKPOINT);
}
1274
1275 /** Complete an asynchronous checkpoint write. */
complete_checkpoint()1276 void log_t::complete_checkpoint()
1277 {
1278 ut_ad(this == &log_sys);
1279 MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
1280
1281 log_mutex_enter();
1282
1283 ut_ad(n_pending_checkpoint_writes > 0);
1284
1285 if (!--n_pending_checkpoint_writes) {
1286 log_complete_checkpoint();
1287 }
1288
1289 log_mutex_exit();
1290 }
1291
/** Write checkpoint info to the log header.
@param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
static
void
log_group_checkpoint(lsn_t end_lsn)
{
	lsn_t	lsn_offset;

	ut_ad(!srv_read_only_mode);
	ut_ad(log_mutex_own());
	ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn);
	ut_ad(end_lsn <= log_sys.lsn);
	ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys.lsn
	      || srv_shutdown_state > SRV_SHUTDOWN_INITIATED);

	DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
			      " written",
			      log_sys.next_checkpoint_no,
			      log_sys.next_checkpoint_lsn));

	/* Serialize the checkpoint header fields into a zero-filled
	512-byte block. */
	byte* buf = log_sys.checkpoint_buf;
	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);

	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no);
	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn);

	if (log_sys.is_encrypted()) {
		log_crypt_write_checkpoint_buf(buf);
	}

	lsn_offset = log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn);
	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE,
			srv_log_buffer_size);
	mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn);

	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));

	MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);

	log_sys.n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	ut_ad(LOG_CHECKPOINT_1 < srv_page_size);
	ut_ad(LOG_CHECKPOINT_2 < srv_page_size);

	/* The first pending write takes the checkpoint X-latch; it is
	released by log_complete_checkpoint() when the last write
	completes. */
	if (log_sys.n_pending_checkpoint_writes++ == 0) {
		rw_lock_x_lock_gen(&log_sys.checkpoint_lock,
				   LOG_CHECKPOINT);
	}

	/* Note: We alternate the physical place of the checkpoint info.
	See the (next_checkpoint_no & 1) below. */

	fil_io(IORequestLogWrite, false,
	       page_id_t(SRV_LOG_SPACE_FIRST_ID, 0),
	       0,
	       (log_sys.next_checkpoint_no & 1)
	       ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
	       OS_FILE_LOG_BLOCK_SIZE,
	       buf, reinterpret_cast<void*>(1) /* checkpoint write */);
}
1355
/** Read a log group header page to log_sys.checkpoint_buf.
@param[in]	header	0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
void log_header_read(ulint header)
{
	ut_ad(log_mutex_own());

	log_sys.n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	/* The header byte offset is split into a page number and an
	offset within the page. */
	fil_io(IORequestLogRead, true,
	       page_id_t(SRV_LOG_SPACE_FIRST_ID,
			 header >> srv_page_size_shift),
	       0, header & (srv_page_size - 1),
	       OS_FILE_LOG_BLOCK_SIZE, log_sys.checkpoint_buf, NULL);
}
1372
/** Write checkpoint info to the log header and invoke log_mutex_exit().
@param[in]	sync	whether to wait for the write to complete
@param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
void
log_write_checkpoint_info(bool sync, lsn_t end_lsn)
{
	ut_ad(log_mutex_own());
	ut_ad(!srv_read_only_mode);

	log_group_checkpoint(end_lsn);

	log_mutex_exit();

	MONITOR_INC(MONITOR_NUM_CHECKPOINT);

	if (sync) {
		/* Wait for the checkpoint write to complete: the S-latch
		can only be acquired once log_complete_checkpoint() has
		released the X-latch taken by log_group_checkpoint(). */
		rw_lock_s_lock(&log_sys.checkpoint_lock);
		rw_lock_s_unlock(&log_sys.checkpoint_lock);

		DBUG_EXECUTE_IF(
			"crash_after_checkpoint",
			DBUG_SUICIDE(););
	}
}
1398
1399 /** Set extra data to be written to the redo log during checkpoint.
1400 @param[in] buf data to be appended on checkpoint, or NULL
1401 @return pointer to previous data to be appended on checkpoint */
1402 mtr_buf_t*
log_append_on_checkpoint(mtr_buf_t * buf)1403 log_append_on_checkpoint(
1404 mtr_buf_t* buf)
1405 {
1406 log_mutex_enter();
1407 mtr_buf_t* old = log_sys.append_on_checkpoint;
1408 log_sys.append_on_checkpoint = buf;
1409 log_mutex_exit();
1410 return(old);
1411 }
1412
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log files. Use log_make_checkpoint() to flush also the pool.
@param[in]	sync	whether to wait for the write to complete
@return true if success, false if a checkpoint write was already running */
bool log_checkpoint(bool sync)
{
	lsn_t	oldest_lsn;

	ut_ad(!srv_read_only_mode);

	DBUG_EXECUTE_IF("no_checkpoint",
			/* We sleep for a long enough time, forcing
			the checkpoint doesn't happen any more. */
			os_thread_sleep(360000000););

	if (recv_recovery_is_on()) {
		recv_apply_hashed_log_recs(true);
	}

	/* Unless writes are durable by themselves (SRV_NOSYNC), flush
	the data files before checkpointing so that the checkpoint LSN
	is really covered by what is on disk. */
	switch (srv_file_flush_method) {
	case SRV_NOSYNC:
		break;
	case SRV_O_DSYNC:
	case SRV_FSYNC:
	case SRV_LITTLESYNC:
	case SRV_O_DIRECT:
	case SRV_O_DIRECT_NO_FSYNC:
#ifdef _WIN32
	case SRV_ALL_O_DIRECT_FSYNC:
#endif
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
	}

	log_mutex_enter();

	ut_ad(!recv_no_log_write);
	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Because log also contains headers and dummy log records,
	log_buf_pool_get_oldest_modification() will return log_sys.lsn
	if the buffer pool contains no dirty buffers.
	We must make sure that the log is flushed up to that lsn.
	If there are dirty buffers in the buffer pool, then our
	write-ahead-logging algorithm ensures that the log has been
	flushed up to oldest_lsn. */

	ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
	if (oldest_lsn
	    > log_sys.last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
		/* Some log has been written since the previous checkpoint. */
	} else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
		/* MariaDB 10.3 startup expects the redo log file to be
		logically empty (not even containing a MLOG_CHECKPOINT record)
		after a clean shutdown. Perform an extra checkpoint at
		shutdown. */
	} else {
		/* Do nothing, because nothing was logged (other than
		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
		log_mutex_exit();
		return(true);
	}
	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
	case some log records between the checkpoint and log_sys.lsn
	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
	except on clean shutdown, where the log will be empty after
	the checkpoint.
	It is important that we write out the redo log before any
	further dirty pages are flushed to the tablespace files.  At
	this point, because log_mutex_own(), mtr_commit() in other
	threads will be blocked, and no pages can be added to the
	flush lists. */
	lsn_t		flush_lsn	= oldest_lsn;
	const lsn_t	end_lsn		= log_sys.lsn;
	const bool	do_write
		= srv_shutdown_state <= SRV_SHUTDOWN_INITIATED
		|| flush_lsn != end_lsn;

	if (fil_names_clear(flush_lsn, do_write)) {
		ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT);
		flush_lsn = log_sys.lsn;
	}

	log_mutex_exit();

	log_write_up_to(flush_lsn, true, true);

	log_mutex_enter();

	ut_ad(log_sys.flushed_to_disk_lsn >= flush_lsn);
	ut_ad(flush_lsn >= oldest_lsn);

	/* While the mutex was released, another thread may have
	advanced the checkpoint past oldest_lsn already. */
	if (log_sys.last_checkpoint_lsn >= oldest_lsn) {
		log_mutex_exit();
		return(true);
	}

	if (log_sys.n_pending_checkpoint_writes > 0) {
		/* A checkpoint write is running */
		log_mutex_exit();

		if (sync) {
			/* Wait for the checkpoint write to complete */
			rw_lock_s_lock(&log_sys.checkpoint_lock);
			rw_lock_s_unlock(&log_sys.checkpoint_lock);
		}

		return(false);
	}

	log_sys.next_checkpoint_lsn = oldest_lsn;
	/* This releases the log mutex. */
	log_write_checkpoint_info(sync, end_lsn);
	ut_ad(!log_mutex_own());

	return(true);
}
1531
/** Make a checkpoint: synchronously flush all modified pages from the
buffer pool and then write checkpoint information to the log header. */
void log_make_checkpoint()
{
	/* Preflush pages synchronously */

	while (!log_preflush_pool_modified_pages(LSN_MAX)) {
		/* Flush as much as we can; retry if another flush batch
		of the same type was already running. */
	}

	while (!log_checkpoint(true)) {
		/* Force a checkpoint; retry if a concurrent checkpoint
		write was in progress. */
	}
}
1545
/****************************************************************//**
Tries to establish a big enough margin of free space in the log groups, such
that a new log entry can be catenated without an immediate need for a
checkpoint. NOTE: this function may only be called if the calling thread
owns no synchronization objects! */
static
void
log_checkpoint_margin(void)
/*=======================*/
{
	lsn_t		age;		/* LSN distance to the oldest dirty page */
	lsn_t		checkpoint_age;	/* LSN distance to the last checkpoint */
	ib_uint64_t	advance;	/* how far the preflush must advance */
	lsn_t		oldest_lsn;
	bool		success;
loop:
	advance = 0;

	log_mutex_enter();
	ut_ad(!recv_no_log_write);

	if (!log_sys.check_flush_or_checkpoint) {
		log_mutex_exit();
		return;
	}

	oldest_lsn = log_buf_pool_get_oldest_modification();

	age = log_sys.lsn - oldest_lsn;

	if (age > log_sys.max_modified_age_sync) {

		/* A flush is urgent: we have to do a synchronous preflush */
		advance = age - log_sys.max_modified_age_sync;
	}

	checkpoint_age = log_sys.lsn - log_sys.last_checkpoint_lsn;

	bool	checkpoint_sync;
	bool	do_checkpoint;

	if (checkpoint_age > log_sys.max_checkpoint_age) {
		/* A checkpoint is urgent: we do it synchronously */
		checkpoint_sync = true;
		do_checkpoint = true;
	} else if (checkpoint_age > log_sys.max_checkpoint_age_async) {
		/* A checkpoint is not urgent: do it asynchronously */
		do_checkpoint = true;
		checkpoint_sync = false;
		log_sys.check_flush_or_checkpoint = false;
	} else {
		do_checkpoint = false;
		checkpoint_sync = false;
		log_sys.check_flush_or_checkpoint = false;
	}

	log_mutex_exit();

	if (advance) {
		lsn_t	new_oldest = oldest_lsn + advance;

		success = log_preflush_pool_modified_pages(new_oldest);

		/* If the flush succeeded, this thread has done its part
		and can proceed. If it did not succeed, there was another
		thread doing a flush at the same time. */
		if (!success) {
			/* Re-arm the check flag and retry from scratch. */
			log_mutex_enter();
			log_sys.check_flush_or_checkpoint = true;
			log_mutex_exit();
			goto loop;
		}
	}

	if (do_checkpoint) {
		log_checkpoint(checkpoint_sync);

		if (checkpoint_sync) {
			/* After a synchronous checkpoint, re-evaluate the
			margins; conditions may have changed meanwhile. */
			goto loop;
		}
	}
}
1629
1630 /**
1631 Checks that there is enough free space in the log to start a new query step.
1632 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
1633 function may only be called if the calling thread owns no synchronization
1634 objects! */
1635 void
log_check_margins(void)1636 log_check_margins(void)
1637 {
1638 bool check;
1639
1640 do {
1641 log_flush_margin();
1642 log_checkpoint_margin();
1643 log_mutex_enter();
1644 ut_ad(!recv_no_log_write);
1645 check = log_sys.check_flush_or_checkpoint;
1646 log_mutex_exit();
1647 } while (check);
1648 }
1649
/****************************************************************//**
Makes a checkpoint at the latest lsn and writes it to first page of each
data file in the database, so that we know that the file spaces contain
all modifications up to that lsn. This can only be called at database
shutdown. This function also writes all log in log files to the log archive. */
void
logs_empty_and_mark_files_at_shutdown(void)
/*=======================================*/
{
	lsn_t	lsn;
	ulint	count = 0;

	ib::info() << "Starting shutdown...";

	/* Wait until the master thread and all other operations are idle: our
	algorithm only works if the server is idle at shutdown */

	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
loop:
	ut_ad(lock_sys.is_initialised() || !srv_was_started);
	ut_ad(log_sys.is_initialised() || !srv_was_started);
	ut_ad(fil_system.is_initialised() || !srv_was_started);

	/* Wake up the background threads so that they can observe the
	shutdown state and terminate. */
	os_event_set(srv_buf_resize_event);

	if (!srv_read_only_mode) {
		os_event_set(srv_error_event);
		os_event_set(srv_monitor_event);
		os_event_set(srv_buf_dump_event);
		if (lock_sys.timeout_thread_active) {
			os_event_set(lock_sys.timeout_event);
		}
		if (dict_stats_event) {
			os_event_set(dict_stats_event);
		} else {
			ut_ad(!srv_dict_stats_thread_active);
		}
		if (recv_sys.flush_start) {
			/* This is in case recv_writer_thread was never
			started, or buf_flush_page_cleaner_coordinator
			failed to notice its termination. */
			os_event_set(recv_sys.flush_start);
		}
	}
#define COUNT_INTERVAL 600U
#define CHECK_INTERVAL 100000U
	os_thread_sleep(CHECK_INTERVAL);

	count++;

	/* Check that there are no longer transactions, except for
	PREPARED ones. We need this wait even for the 'very fast'
	shutdown, because the InnoDB layer may have committed or
	prepared transactions and we don't want to lose them. */

	if (ulint total_trx = srv_was_started && !srv_read_only_mode
	    && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
	    ? trx_sys.any_active_transactions() : 0) {

		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
			service_manager_extend_timeout(
				COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
				"Waiting for %lu active transactions to finish",
				(ulong) total_trx);
			ib::info() << "Waiting for " << total_trx << " active"
				<< " transactions to finish";

			count = 0;
		}

		goto loop;
	}

	/* We need these threads to stop early in shutdown. */
	const char* thread_name;

	if (srv_error_monitor_active) {
		thread_name = "srv_error_monitor_thread";
	} else if (srv_monitor_active) {
		thread_name = "srv_monitor_thread";
	} else if (srv_buf_resize_thread_active) {
		thread_name = "buf_resize_thread";
		goto wait_suspend_loop;
	} else if (srv_dict_stats_thread_active) {
		thread_name = "dict_stats_thread";
	} else if (lock_sys.timeout_thread_active) {
		thread_name = "lock_wait_timeout_thread";
	} else if (srv_buf_dump_thread_active) {
		thread_name = "buf_dump_thread";
		goto wait_suspend_loop;
	} else if (btr_defragment_thread_active) {
		thread_name = "btr_defragment_thread";
	} else if (srv_fast_shutdown != 2 && trx_rollback_is_active) {
		thread_name = "rollback of recovered transactions";
	} else {
		thread_name = NULL;
	}

	if (thread_name) {
		ut_ad(!srv_read_only_mode);
wait_suspend_loop:
		service_manager_extend_timeout(
			COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
			"Waiting for %s to exit", thread_name);
		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
			ib::info() << "Waiting for " << thread_name
				   << " to exit";
			count = 0;
		}
		goto loop;
	}

	/* Check that the background threads are suspended */

	switch (srv_get_active_thread_type()) {
	case SRV_NONE:
		if (!srv_n_fil_crypt_threads_started) {
			srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
			break;
		}
		os_event_set(fil_crypt_threads_event);
		thread_name = "fil_crypt_thread";
		goto wait_suspend_loop;
	case SRV_PURGE:
	case SRV_WORKER:
		ut_ad(!"purge was not shut down");
		srv_purge_wakeup();
		thread_name = "purge thread";
		goto wait_suspend_loop;
	case SRV_MASTER:
		thread_name = "master thread";
		goto wait_suspend_loop;
	}

	/* At this point only page_cleaner should be active. We wait
	here to let it complete the flushing of the buffer pools
	before proceeding further. */

	count = 0;
	service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
		"Waiting for page cleaner");
	while (buf_page_cleaner_is_active) {
		++count;
		os_thread_sleep(CHECK_INTERVAL);
		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
			service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
				"Waiting for page cleaner");
			ib::info() << "Waiting for page_cleaner to "
				"finish flushing of buffer pool";
			/* This is a workaround to avoid the InnoDB hang
			when OS datetime changed backwards */
			os_event_set(buf_flush_event);
			count = 0;
		}
	}

	if (log_scrub_thread_active) {
		ut_ad(!srv_read_only_mode);
		os_event_set(log_scrub_event);
	}

	/* Wait for any pending checkpoint or log-flush I/O to finish
	before tearing down the log subsystem. */
	if (log_sys.is_initialised()) {
		log_mutex_enter();
		const ulint	n_write	= log_sys.n_pending_checkpoint_writes;
		const ulint	n_flush	= log_sys.n_pending_flushes;
		log_mutex_exit();

		if (log_scrub_thread_active || n_write || n_flush) {
			if (srv_print_verbose_log && count > 600) {
				ib::info() << "Pending checkpoint_writes: "
					<< n_write
					<< ". Pending log flush writes: "
					<< n_flush;
				count = 0;
			}
			goto loop;
		}
	}

	ut_ad(!log_scrub_thread_active);

	if (!buf_pool_ptr) {
		ut_ad(!srv_was_started);
	} else if (ulint pending_io = buf_pool_check_no_pending_io()) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << pending_io << " buffer"
				" page I/Os to complete";
			count = 0;
		}

		goto loop;
	}

	if (srv_fast_shutdown == 2 || !srv_was_started) {
		if (!srv_read_only_mode && srv_was_started) {
			ib::info() << "MySQL has requested a very fast"
				" shutdown without flushing the InnoDB buffer"
				" pool to data files. At the next mysqld"
				" startup InnoDB will do a crash recovery!";

			/* In this fastest shutdown we do not flush the
			buffer pool:

			it is essentially a 'crash' of the InnoDB server.
			Make sure that the log is all flushed to disk, so
			that we can recover all committed transactions in
			a crash recovery. We must not write the lsn stamps
			to the data files, since at a startup InnoDB deduces
			from the stamps if the previous shutdown was clean. */

			log_buffer_flush_to_disk();
		}

		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

		if (fil_system.is_initialised()) {
			fil_close_all_files();
		}
		return;
	}

	if (!srv_read_only_mode) {
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
			"ensuring dirty buffer pool are written to log");
		log_make_checkpoint();

		log_mutex_enter();

		lsn = log_sys.lsn;

		/* If anything was logged since the checkpoint we just
		made, the log is not empty yet: start over. */
		const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn;
		ut_ad(lsn >= log_sys.last_checkpoint_lsn);

		log_mutex_exit();

		if (lsn_changed) {
			goto loop;
		}

		/* Ensure that all buffered changes are written to the
		redo log before fil_close_all_files(). */
		fil_flush_file_spaces(FIL_TYPE_LOG);
	} else {
		lsn = srv_start_lsn;
	}

	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

	/* Make some checks that the server really is quiet */
	ut_a(srv_get_active_thread_type() == SRV_NONE);

	service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
				       "Free innodb buffer pool");
	buf_all_freed();

	ut_a(lsn == log_sys.lsn
	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);

	if (lsn < srv_start_lsn) {
		ib::error() << "Shutdown LSN=" << lsn
			<< " is less than start LSN=" << srv_start_lsn;
	}

	srv_shutdown_lsn = lsn;

	if (!srv_read_only_mode) {
		/* Stamp the shutdown LSN to the first page of each data
		file so that the next startup can detect a clean shutdown. */
		dberr_t err = fil_write_flushed_lsn(lsn);

		if (err != DB_SUCCESS) {
			ib::error() << "Writing flushed lsn " << lsn
				<< " failed; error=" << err;
		}
	}

	fil_close_all_files();

	/* Make some checks that the server really is quiet */
	ut_a(srv_get_active_thread_type() == SRV_NONE);

	ut_a(lsn == log_sys.lsn
	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
}
1931
1932 /******************************************************//**
1933 Peeks the current lsn.
1934 @return TRUE if success, FALSE if could not get the log system mutex */
1935 ibool
log_peek_lsn(lsn_t * lsn)1936 log_peek_lsn(
1937 /*=========*/
1938 lsn_t* lsn) /*!< out: if returns TRUE, current lsn is here */
1939 {
1940 if (0 == mutex_enter_nowait(&(log_sys.mutex))) {
1941 *lsn = log_sys.lsn;
1942
1943 log_mutex_exit();
1944
1945 return(TRUE);
1946 }
1947
1948 return(FALSE);
1949 }
1950
/******************************************************//**
Prints info of the log: current/flushed/checkpoint LSNs and
per-second log I/O statistics since the previous printout. */
void
log_print(
/*======*/
	FILE*	file)	/*!< in: file where to print */
{
	double	time_elapsed;
	time_t	current_time;

	log_mutex_enter();

	fprintf(file,
		"Log sequence number " LSN_PF "\n"
		"Log flushed up to   " LSN_PF "\n"
		"Pages flushed up to " LSN_PF "\n"
		"Last checkpoint at  " LSN_PF "\n",
		log_sys.lsn,
		log_sys.flushed_to_disk_lsn,
		log_buf_pool_get_oldest_modification(),
		log_sys.last_checkpoint_lsn);

	current_time = time(NULL);

	time_elapsed = difftime(current_time,
				log_sys.last_printout_time);

	/* Guard against a zero or negative interval (e.g. clock
	adjusted backwards) before dividing by it. */
	if (time_elapsed <= 0) {
		time_elapsed = 1;
	}

	fprintf(file,
		ULINTPF " pending log flushes, "
		ULINTPF " pending chkp writes\n"
		ULINTPF " log i/o's done, %.2f log i/o's/second\n",
		log_sys.n_pending_flushes,
		log_sys.n_pending_checkpoint_writes,
		log_sys.n_log_ios,
		static_cast<double>(
			log_sys.n_log_ios - log_sys.n_log_ios_old)
		/ time_elapsed);

	/* Reset the baseline for the next per-second average. */
	log_sys.n_log_ios_old = log_sys.n_log_ios;
	log_sys.last_printout_time = current_time;

	log_mutex_exit();
}
1998
/**********************************************************************//**
Refreshes the statistics used to print per-second averages.
NOTE(review): updates log_sys counters without taking the log mutex —
presumably acceptable for statistics-only fields; confirm with callers. */
void
log_refresh_stats(void)
/*===================*/
{
	log_sys.n_log_ios_old = log_sys.n_log_ios;
	log_sys.last_printout_time = time(NULL);
}
2008
/** Shut down the redo log subsystem: free the log buffers and destroy
the log mutexes, events and locks. Safe to call more than once
(subsequent calls are no-ops via the is_initialised() check). */
void log_t::close()
{
  ut_ad(this == &log_sys);
  if (!is_initialised()) return;
  /* Mark uninitialised first so a repeated call returns early. */
  m_initialised = false;
  log.close();

  ut_free_dodump(buf, srv_log_buffer_size);
  buf = NULL;
  ut_free_dodump(flush_buf, srv_log_buffer_size);
  flush_buf = NULL;

  os_event_destroy(flush_event);
  rw_lock_free(&checkpoint_lock);
  mutex_free(&mutex);
  mutex_free(&write_mutex);
  mutex_free(&log_flush_order_mutex);

  /* The scrub event exists only when scrubbing was enabled at startup. */
  if (!srv_read_only_mode && srv_scrub_log)
    os_event_destroy(log_scrub_event);

  recv_sys.close();
}
2033
/******************************************************//**
Pads the current log block full with dummy log records. Used in producing
consistent archived log files and scrubbing redo log. */
static
void
log_pad_current_log_block(void)
/*===========================*/
{
	byte	b		= MLOG_DUMMY_RECORD;
	ulint	pad_length;
	ulint	i;
	lsn_t	lsn;

	ut_ad(!recv_no_log_write);
	/* We retrieve lsn only because otherwise gcc crashed on HP-UX */
	lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);

	/* Number of payload bytes left before the block trailer. */
	pad_length = log_sys.trailer_offset()
		- log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
	if (pad_length == log_sys.payload_size()) {
		/* The block is already empty; nothing to pad. */
		pad_length = 0;
	}

	if (pad_length) {
		srv_stats.n_log_scrubs.inc();
	}

	/* Append dummy records one byte at a time until the block
	payload is full. */
	for (i = 0; i < pad_length; i++) {
		log_write_low(&b, 1);
	}

	lsn = log_sys.lsn;

	log_close();

	/* After padding, the LSN must sit at the start of a fresh block. */
	ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
}
2071
/*****************************************************************//*
If no log record has been written for a while, fill current log
block with dummy records. */
static
void
log_scrub()
/*=========*/
{
	log_mutex_enter();
	ulint cur_lbn = log_block_convert_lsn_to_no(log_sys.lsn);

	/* Pad only if the current block is the same one we saw on the
	previous call, i.e. no log has been written in between. */
	if (next_lbn_to_pad == cur_lbn)
	{
		log_pad_current_log_block();
	}

	next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys.lsn);
	log_mutex_exit();
}
2091
/* Log scrubbing speed, in bytes/sec; controls the sleep interval of
log_scrub_thread. Set from the innodb_scrub_log_speed system variable. */
UNIV_INTERN ulonglong innodb_scrub_log_speed;
2094
/*****************************************************************//**
This is the main thread for log scrub. It waits for an event and
when waked up fills current log block with dummy records and
sleeps again.
@return this function does not return, it calls os_thread_exit() */
extern "C" UNIV_INTERN
os_thread_ret_t
DECLARE_THREAD(log_scrub_thread)(void*)
{
	ut_ad(!srv_read_only_mode);

	while (srv_shutdown_state < SRV_SHUTDOWN_FLUSH_PHASE) {
		/* log scrubbing interval in µs.
		NOTE(review): divides by innodb_scrub_log_speed — assumes
		the sysvar has a non-zero minimum; confirm its definition. */
		ulonglong interval = 1000*1000*512/innodb_scrub_log_speed;

		os_event_wait_time(log_scrub_event, static_cast<ulint>(interval));

		log_scrub();

		os_event_reset(log_scrub_event);
	}

	log_scrub_thread_active = false;

	/* We count the number of threads in os_thread_exit(). A created
	thread should always use that to exit and not use return() to exit. */
	os_thread_exit();

	OS_THREAD_DUMMY_RETURN;
}
2125