1 /*****************************************************************************
2
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Google Inc.
5 Copyright (c) 2014, 2021, MariaDB Corporation.
6
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; version 2 of the License.
16
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file log/log0log.cc
29 Database log
30
31 Created 12/9/1995 Heikki Tuuri
32 *******************************************************/
33
34 #include "univ.i"
35 #include <debug_sync.h>
36 #include <my_service_manager.h>
37
38 #include "log0log.h"
39 #include "log0crypt.h"
40 #include "mem0mem.h"
41 #include "buf0buf.h"
42 #include "buf0flu.h"
43 #include "lock0lock.h"
44 #include "log0recv.h"
45 #include "fil0fil.h"
46 #include "dict0boot.h"
47 #include "dict0stats_bg.h"
48 #include "btr0defragment.h"
49 #include "srv0srv.h"
50 #include "srv0start.h"
51 #include "trx0sys.h"
52 #include "trx0trx.h"
53 #include "trx0roll.h"
54 #include "srv0mon.h"
55 #include "sync0sync.h"
56
57 /*
58 General philosophy of InnoDB redo-logs:
59
60 1) Every change to a contents of a data page must be done
61 through mtr, which in mtr_commit() writes log records
62 to the InnoDB redo log.
63
64 2) Normally these changes are performed using a mlog_write_ulint()
65 or similar function.
66
67 3) In some page level operations only a code number of a
68 c-function and its parameters are written to the log to
69 reduce the size of the log.
70
71 3a) You should not add parameters to these kind of functions
72 (e.g. trx_undo_header_create())
73
74 3b) You should not add such functionality which either change
75 working when compared with the old or are dependent on data
76 outside of the page. These kind of functions should implement
77 self-contained page transformation and it should be unchanged
78 if you don't have very essential reasons to change log
79 semantics or format.
80
81 */
82
83 /** Redo log system */
84 log_t log_sys;
85
86 /** Whether to require checksums on the redo log pages */
87 my_bool innodb_log_checksums;
88
89 /* Next log block number to do dummy record filling if no log records written
90 for a while */
91 static ulint next_lbn_to_pad = 0;
92
93 /* These control how often we print warnings if the last checkpoint is too
94 old */
95 static bool log_has_printed_chkp_warning = false;
96 static time_t log_last_warning_time;
97
98 static bool log_has_printed_chkp_margine_warning = false;
99 static time_t log_last_margine_warning_time;
100
101 /* A margin for free space in the log buffer before a log entry is catenated */
102 #define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE)
103
104 /* Margins for free space in the log buffer after a log entry is catenated */
105 #define LOG_BUF_FLUSH_RATIO 2
106 #define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN \
107 + (4U << srv_page_size_shift))
108
109 /* This parameter controls asynchronous making of a new checkpoint; the value
110 should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
111
112 #define LOG_POOL_CHECKPOINT_RATIO_ASYNC 32
113
114 /* This parameter controls synchronous preflushing of modified buffer pages */
115 #define LOG_POOL_PREFLUSH_RATIO_SYNC 16
116
117 /* The same ratio for asynchronous preflushing; this value should be less than
118 the previous */
119 #define LOG_POOL_PREFLUSH_RATIO_ASYNC 8
120
121 /* Codes used in unlocking flush latches */
122 #define LOG_UNLOCK_NONE_FLUSHED_LOCK 1
123 #define LOG_UNLOCK_FLUSH_LOCK 2
124
125 /** Event to wake up log_scrub_thread */
126 os_event_t log_scrub_event;
127 /** Whether log_scrub_thread is active */
128 bool log_scrub_thread_active;
129
130 extern "C" UNIV_INTERN
131 os_thread_ret_t
132 DECLARE_THREAD(log_scrub_thread)(void*);
133
134 /****************************************************************//**
135 Returns the oldest modified block lsn in the pool, or log_sys.lsn if none
136 exists.
137 @return LSN of oldest modification */
138 static
139 lsn_t
log_buf_pool_get_oldest_modification(void)140 log_buf_pool_get_oldest_modification(void)
141 /*======================================*/
142 {
143 lsn_t lsn;
144
145 ut_ad(log_mutex_own());
146
147 lsn = buf_pool_get_oldest_modification();
148
149 if (!lsn) {
150
151 lsn = log_sys.lsn;
152 }
153
154 return(lsn);
155 }
156
157 /** Extends the log buffer.
158 @param[in] len requested minimum size in bytes */
log_buffer_extend(ulong len)159 void log_buffer_extend(ulong len)
160 {
161 const ulong new_buf_size = ut_calc_align(len, srv_page_size);
162 byte* new_buf = static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
163 TRASH_ALLOC(new_buf, new_buf_size);
164 byte* new_flush_buf =
165 static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
166 TRASH_ALLOC(new_flush_buf, new_buf_size);
167
168 log_mutex_enter();
169
170 if (len <= srv_log_buffer_size) {
171 /* Already extended enough by the others */
172 log_mutex_exit();
173 ut_free_dodump(new_buf, new_buf_size);
174 ut_free_dodump(new_flush_buf, new_buf_size);
175 return;
176 }
177
178 ib::warn() << "The redo log transaction size " << len <<
179 " exceeds innodb_log_buffer_size="
180 << srv_log_buffer_size << " / 2). Trying to extend it.";
181
182 byte* old_buf = log_sys.buf;
183 byte* old_flush_buf = log_sys.flush_buf;
184 const ulong old_buf_size = srv_log_buffer_size;
185
186 srv_log_buffer_size = new_buf_size;
187 log_sys.buf = new_buf;
188 log_sys.flush_buf = new_flush_buf;
189 memcpy(new_buf, old_buf, log_sys.buf_free);
190
191 log_sys.max_buf_free = new_buf_size / LOG_BUF_FLUSH_RATIO
192 - LOG_BUF_FLUSH_MARGIN;
193
194 log_mutex_exit();
195
196 ut_free_dodump(old_buf, old_buf_size);
197 ut_free_dodump(old_flush_buf, old_buf_size);
198
199 ib::info() << "innodb_log_buffer_size was extended to "
200 << new_buf_size << ".";
201 }
202
203 /** Calculate actual length in redo buffer and file including
204 block header and trailer.
205 @param[in] len length to write
206 @return actual length to write including header and trailer. */
207 static inline
208 ulint
log_calculate_actual_len(ulint len)209 log_calculate_actual_len(
210 ulint len)
211 {
212 ut_ad(log_mutex_own());
213
214 /* actual length stored per block */
215 const ulint len_per_blk = OS_FILE_LOG_BLOCK_SIZE
216 - (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
217
218 /* actual data length in last block already written */
219 ulint extra_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE);
220
221 ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
222 extra_len -= LOG_BLOCK_HDR_SIZE;
223
224 /* total extra length for block header and trailer */
225 extra_len = ((len + extra_len) / len_per_blk)
226 * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
227
228 return(len + extra_len);
229 }
230
/** Check margin not to overwrite transaction log from the last checkpoint.
If would estimate the log write to exceed the log_group_capacity,
waits for the checkpoint is done enough.
@param[in]	len	length of the data to be written */

void
log_margin_checkpoint_age(
	ulint	len)
{
	/* The write of 'len' bytes also costs block headers/trailers. */
	ulint	margin = log_calculate_actual_len(len);

	ut_ad(log_mutex_own());

	if (margin > log_sys.log_group_capacity) {
		/* The single mtr is larger than the whole log group
		capacity; waiting for a checkpoint could never help.
		return with warning output to avoid deadlock */
		if (!log_has_printed_chkp_margine_warning
		    || difftime(time(NULL),
				log_last_margine_warning_time) > 15) {
			/* Rate-limit the warning to once per 15 seconds. */
			log_has_printed_chkp_margine_warning = true;
			log_last_margine_warning_time = time(NULL);

			ib::error() << "The transaction log files are too"
				" small for the single transaction log (size="
				<< len << "). So, the last checkpoint age"
				" might exceed the log group capacity "
				<< log_sys.log_group_capacity << ".";
		}

		return;
	}

	/* Our margin check should ensure that we never reach this condition.
	Try to do checkpoint once. We cannot keep waiting here as it might
	result in hang in case the current mtr has latch on oldest lsn */
	if (log_sys.lsn - log_sys.last_checkpoint_lsn + margin
	    > log_sys.log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Makes checkpoint. */

		bool	flushed_enough = false;

		/* If the dirtiest page is already new enough, the
		checkpoint can advance without further page flushing. */
		if (log_sys.lsn - log_buf_pool_get_oldest_modification()
		    + margin
		    <= log_sys.log_group_capacity) {
			flushed_enough = true;
		}

		log_sys.check_flush_or_checkpoint = true;
		/* NOTE: the log mutex is released here and reacquired
		below; log_checkpoint() must not be called while holding
		it. */
		log_mutex_exit();

		DEBUG_SYNC_C("margin_checkpoint_age_rescue");

		if (!flushed_enough) {
			/* Give the page cleaner some time to flush. */
			os_thread_sleep(100000);
		}
		log_checkpoint(true);

		log_mutex_enter();
	}

	return;
}
293
/** Open the log for log_write_low. The log must be closed with log_close.
@param[in]	len	length of the data to be written
@return start lsn of the log record */
lsn_t
log_reserve_and_open(
	ulint	len)
{
	ulint	len_upper_limit;
#ifdef UNIV_DEBUG
	ulint	count			= 0;
#endif /* UNIV_DEBUG */

loop:
	ut_ad(log_mutex_own());

	/* Calculate an upper limit for the space the string may take in the
	log buffer */

	len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
			  + (5 * len) / 4;

	if (log_sys.buf_free + len_upper_limit > srv_log_buffer_size) {
		/* The record might not fit; release the mutex, push the
		buffer contents towards the files, and retry. */
		log_mutex_exit();

		DEBUG_SYNC_C("log_buf_size_exceeded");

		/* Not enough free space, do a write of the log buffer */
		log_buffer_sync_in_background(false);

		srv_stats.log_waits.inc();

		/* Debug builds assert if we loop excessively; this
		would indicate that the buffer never drains. */
		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	/* Enough space reserved; the record group starts here. */
	return(log_sys.lsn);
}
333
334 /************************************************************//**
335 Writes to the log the string given. It is assumed that the caller holds the
336 log mutex. */
337 void
log_write_low(const byte * str,ulint str_len)338 log_write_low(
339 /*==========*/
340 const byte* str, /*!< in: string */
341 ulint str_len) /*!< in: string length */
342 {
343 ulint len;
344 ulint data_len;
345 byte* log_block;
346
347 ut_ad(log_mutex_own());
348 part_loop:
349 /* Calculate a part length */
350
351 data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
352
353 if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
354
355 /* The string fits within the current log block */
356
357 len = str_len;
358 } else {
359 data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
360
361 len = OS_FILE_LOG_BLOCK_SIZE
362 - (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE)
363 - LOG_BLOCK_TRL_SIZE;
364 }
365
366 memcpy(log_sys.buf + log_sys.buf_free, str, len);
367
368 str_len -= len;
369 str = str + len;
370
371 log_block = static_cast<byte*>(
372 ut_align_down(log_sys.buf + log_sys.buf_free,
373 OS_FILE_LOG_BLOCK_SIZE));
374
375 log_block_set_data_len(log_block, data_len);
376
377 if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
378 /* This block became full */
379 log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
380 log_block_set_checkpoint_no(log_block,
381 log_sys.next_checkpoint_no);
382 len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
383
384 log_sys.lsn += len;
385
386 /* Initialize the next block header */
387 log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE,
388 log_sys.lsn);
389 } else {
390 log_sys.lsn += len;
391 }
392
393 log_sys.buf_free += ulong(len);
394
395 ut_ad(log_sys.buf_free <= srv_log_buffer_size);
396
397 if (str_len > 0) {
398 goto part_loop;
399 }
400
401 srv_stats.log_write_requests.inc();
402 }
403
404 /************************************************************//**
405 Closes the log.
406 @return lsn */
407 lsn_t
log_close(void)408 log_close(void)
409 /*===========*/
410 {
411 byte* log_block;
412 ulint first_rec_group;
413 lsn_t oldest_lsn;
414 lsn_t lsn;
415 lsn_t checkpoint_age;
416
417 ut_ad(log_mutex_own());
418
419 lsn = log_sys.lsn;
420
421 log_block = static_cast<byte*>(
422 ut_align_down(log_sys.buf + log_sys.buf_free,
423 OS_FILE_LOG_BLOCK_SIZE));
424
425 first_rec_group = log_block_get_first_rec_group(log_block);
426
427 if (first_rec_group == 0) {
428 /* We initialized a new log block which was not written
429 full by the current mtr: the next mtr log record group
430 will start within this block at the offset data_len */
431
432 log_block_set_first_rec_group(
433 log_block, log_block_get_data_len(log_block));
434 }
435
436 if (log_sys.buf_free > log_sys.max_buf_free) {
437 log_sys.check_flush_or_checkpoint = true;
438 }
439
440 checkpoint_age = lsn - log_sys.last_checkpoint_lsn;
441
442 if (checkpoint_age >= log_sys.log_group_capacity) {
443 DBUG_EXECUTE_IF(
444 "print_all_chkp_warnings",
445 log_has_printed_chkp_warning = false;);
446
447 if (!log_has_printed_chkp_warning
448 || difftime(time(NULL), log_last_warning_time) > 15) {
449
450 log_has_printed_chkp_warning = true;
451 log_last_warning_time = time(NULL);
452
453 ib::error() << "The age of the last checkpoint is "
454 << checkpoint_age << ", which exceeds the log"
455 " group capacity "
456 << log_sys.log_group_capacity
457 << ".";
458 }
459 }
460
461 if (checkpoint_age <= log_sys.max_modified_age_sync) {
462 goto function_exit;
463 }
464
465 oldest_lsn = buf_pool_get_oldest_modification();
466
467 if (!oldest_lsn
468 || lsn - oldest_lsn > log_sys.max_modified_age_sync
469 || checkpoint_age > log_sys.max_checkpoint_age_async) {
470 log_sys.check_flush_or_checkpoint = true;
471 }
472 function_exit:
473
474 return(lsn);
475 }
476
/** Calculate the recommended highest values for lsn - last_checkpoint_lsn
and lsn - buf_get_oldest_modification().
@param[in]	file_size	requested innodb_log_file_size
@retval true on success
@retval false if the smallest log group is too small to
accommodate the number of OS threads in the database server */
bool
log_set_capacity(ulonglong file_size)
{
	lsn_t		margin;
	ulint		free;

	/* Usable capacity: all files minus one header block each. */
	lsn_t smallest_capacity = (file_size - LOG_FILE_HDR_SIZE)
		* srv_n_log_files;
	/* Add extra safety */
	smallest_capacity -= smallest_capacity / 10;

	/* For each OS thread we must reserve so much free space in the
	smallest log group that it can accommodate the log entries produced
	by single query steps: running out of free log space is a serious
	system error which requires rebooting the database. */

	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
		+ LOG_CHECKPOINT_EXTRA_FREE;
	if (free >= smallest_capacity / 2) {
		/* The configured files cannot hold even the per-thread
		reserve; refuse to start. */
		ib::error() << "Cannot continue operation. ib_logfiles are too"
			" small for innodb_thread_concurrency="
			<< srv_thread_concurrency << ". The combined size of"
			" ib_logfiles should be bigger than"
			" 200 kB * innodb_thread_concurrency. "
			<< INNODB_PARAMETERS_MSG;
		return(false);
	}

	margin = smallest_capacity - free;
	margin = margin - margin / 10;	/* Add still some extra safety */

	log_mutex_enter();

	log_sys.log_group_capacity = smallest_capacity;

	/* Async thresholds fire earlier than sync ones; see the
	LOG_POOL_*_RATIO_* constants above. */
	log_sys.max_modified_age_async = margin
		- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
	log_sys.max_modified_age_sync = margin
		- margin / LOG_POOL_PREFLUSH_RATIO_SYNC;

	log_sys.max_checkpoint_age_async = margin - margin
		/ LOG_POOL_CHECKPOINT_RATIO_ASYNC;
	log_sys.max_checkpoint_age = margin;

	log_mutex_exit();

	return(true);
}
531
/** Initialize the redo log subsystem: latches, the two log buffers,
thresholds, counters, and the first log block at LOG_START_LSN.
Optionally starts the log scrubbing thread. */
void log_t::create()
{
  ut_ad(this == &log_sys);
  ut_ad(!is_initialised());
  m_initialised= true;

  mutex_create(LATCH_ID_LOG_SYS, &mutex);
  mutex_create(LATCH_ID_LOG_WRITE, &write_mutex);
  mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_flush_order_mutex);

  /* Start the lsn from one log block from zero: this way every
  log record has a non-zero start lsn, a fact which we will use */

  lsn= LOG_START_LSN;

  ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE);
  ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift);

  /* Two buffers: one being filled while the other is written out
  (see log_buffer_switch()). */
  buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
  TRASH_ALLOC(buf, srv_log_buffer_size);
  flush_buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
  TRASH_ALLOC(flush_buf, srv_log_buffer_size);

  max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO -
    LOG_BUF_FLUSH_MARGIN;
  check_flush_or_checkpoint= true;

  n_log_ios_old= n_log_ios;
  last_printout_time= time(NULL);

  buf_next_to_write= 0;
  write_lsn= lsn;
  flushed_to_disk_lsn= 0;
  n_pending_flushes= 0;
  /* The flush event starts in the "set" state: no flush pending. */
  flush_event = os_event_create("log_flush_event");
  os_event_set(flush_event);
  n_log_ios= 0;
  n_log_ios_old= 0;
  log_group_capacity= 0;
  max_modified_age_async= 0;
  max_modified_age_sync= 0;
  max_checkpoint_age_async= 0;
  max_checkpoint_age= 0;
  next_checkpoint_no= 0;
  next_checkpoint_lsn= 0;
  append_on_checkpoint= NULL;
  n_pending_checkpoint_writes= 0;

  last_checkpoint_lsn= lsn;
  rw_lock_create(checkpoint_lock_key, &checkpoint_lock, SYNC_NO_ORDER_CHECK);

  /* Format the very first log block by hand. */
  log_block_init(buf, lsn);
  log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE);

  buf_free= LOG_BLOCK_HDR_SIZE;
  lsn= LOG_START_LSN + LOG_BLOCK_HDR_SIZE;

  MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, lsn - last_checkpoint_lsn);

  log_scrub_thread_active= !srv_read_only_mode && srv_scrub_log;
  if (log_scrub_thread_active) {
    log_scrub_event= os_event_create("log_scrub_event");
    os_thread_create(log_scrub_thread, NULL, NULL);
  }
}
598
599 /** Initialize the redo log.
600 @param[in] n_files number of files */
create(ulint n_files)601 void log_t::files::create(ulint n_files)
602 {
603 ut_ad(n_files <= SRV_N_LOG_FILES_MAX);
604 ut_ad(this == &log_sys.log);
605 ut_ad(log_sys.is_initialised());
606
607 this->n_files= n_files;
608 format= srv_encrypt_log
609 ? LOG_HEADER_FORMAT_CURRENT | LOG_HEADER_FORMAT_ENCRYPTED
610 : LOG_HEADER_FORMAT_CURRENT;
611 subformat= 2;
612 file_size= srv_log_file_size;
613 lsn= LOG_START_LSN;
614 lsn_offset= LOG_FILE_HDR_SIZE;
615 }
616
617 /******************************************************//**
618 Writes a log file header to a log file space. */
619 static
620 void
log_file_header_flush(ulint nth_file,lsn_t start_lsn)621 log_file_header_flush(
622 ulint nth_file, /*!< in: header to the nth file in the
623 log file space */
624 lsn_t start_lsn) /*!< in: log file data starts at this
625 lsn */
626 {
627 lsn_t dest_offset;
628
629 ut_ad(log_write_mutex_own());
630 ut_ad(!recv_no_log_write);
631 ut_a(nth_file < log_sys.log.n_files);
632 ut_ad((log_sys.log.format & ~LOG_HEADER_FORMAT_ENCRYPTED)
633 == LOG_HEADER_FORMAT_CURRENT);
634
635 // man 2 open suggests this buffer to be aligned by 512 for O_DIRECT
636 MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE)
637 byte buf[OS_FILE_LOG_BLOCK_SIZE] = {0};
638
639 mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format);
640 mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat);
641 mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
642 strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
643 LOG_HEADER_CREATOR_CURRENT);
644 ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
645 >= sizeof LOG_HEADER_CREATOR_CURRENT);
646 log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
647
648 dest_offset = nth_file * log_sys.log.file_size;
649
650 DBUG_PRINT("ib_log", ("write " LSN_PF
651 " file " ULINTPF " header",
652 start_lsn, nth_file));
653
654 log_sys.n_log_ios++;
655
656 srv_stats.os_log_pending_writes.inc();
657
658 const ulint page_no = ulint(dest_offset >> srv_page_size_shift);
659
660 fil_io(IORequestLogWrite, true,
661 page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
662 univ_page_size,
663 ulint(dest_offset & (srv_page_size - 1)),
664 OS_FILE_LOG_BLOCK_SIZE, buf, NULL);
665
666 srv_stats.os_log_pending_writes.dec();
667 }
668
669 /******************************************************//**
670 Stores a 4-byte checksum to the trailer checksum field of a log block
671 before writing it to a log file. This checksum is used in recovery to
672 check the consistency of a log block. */
673 static
674 void
log_block_store_checksum(byte * block)675 log_block_store_checksum(
676 /*=====================*/
677 byte* block) /*!< in/out: pointer to a log block */
678 {
679 log_block_set_checksum(block, log_block_calc_checksum_crc32(block));
680 }
681
682 /******************************************************//**
683 Writes a buffer to a log file. */
684 static
685 void
log_write_buf(byte * buf,ulint len,ulint pad_len,lsn_t start_lsn,ulint new_data_offset)686 log_write_buf(
687 byte* buf, /*!< in: buffer */
688 ulint len, /*!< in: buffer len; must be divisible
689 by OS_FILE_LOG_BLOCK_SIZE */
690 #ifdef UNIV_DEBUG
691 ulint pad_len, /*!< in: pad len in the buffer len */
692 #endif /* UNIV_DEBUG */
693 lsn_t start_lsn, /*!< in: start lsn of the buffer; must
694 be divisible by
695 OS_FILE_LOG_BLOCK_SIZE */
696 ulint new_data_offset)/*!< in: start offset of new data in
697 buf: this parameter is used to decide
698 if we have to write a new log file
699 header */
700 {
701 ulint write_len;
702 bool write_header = new_data_offset == 0;
703 lsn_t next_offset;
704 ulint i;
705
706 ut_ad(log_write_mutex_own());
707 ut_ad(!recv_no_log_write);
708 ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
709 ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
710
711 loop:
712 if (len == 0) {
713
714 return;
715 }
716
717 next_offset = log_sys.log.calc_lsn_offset(start_lsn);
718
719 if (write_header
720 && next_offset % log_sys.log.file_size == LOG_FILE_HDR_SIZE) {
721 /* We start to write a new log file instance in the group */
722
723 ut_a(next_offset / log_sys.log.file_size <= ULINT_MAX);
724
725 log_file_header_flush(
726 ulint(next_offset / log_sys.log.file_size), start_lsn);
727 srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);
728
729 srv_stats.log_writes.inc();
730 }
731
732 if ((next_offset % log_sys.log.file_size) + len
733 > log_sys.log.file_size) {
734 /* if the above condition holds, then the below expression
735 is < len which is ulint, so the typecast is ok */
736 write_len = ulint(log_sys.log.file_size
737 - (next_offset % log_sys.log.file_size));
738 } else {
739 write_len = len;
740 }
741
742 DBUG_PRINT("ib_log",
743 ("write " LSN_PF " to " LSN_PF
744 ": len " ULINTPF
745 " blocks " ULINTPF ".." ULINTPF,
746 start_lsn, next_offset,
747 write_len,
748 log_block_get_hdr_no(buf),
749 log_block_get_hdr_no(
750 buf + write_len
751 - OS_FILE_LOG_BLOCK_SIZE)));
752
753 ut_ad(pad_len >= len
754 || log_block_get_hdr_no(buf)
755 == log_block_convert_lsn_to_no(start_lsn));
756
757 /* Calculate the checksums for each log block and write them to
758 the trailer fields of the log blocks */
759
760 for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
761 #ifdef UNIV_DEBUG
762 ulint hdr_no_2 = log_block_get_hdr_no(buf) + i;
763 DBUG_EXECUTE_IF("innodb_small_log_block_no_limit",
764 hdr_no_2 = ((hdr_no_2 - 1) & 0xFUL) + 1;);
765 #endif
766 ut_ad(pad_len >= len
767 || i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
768 || log_block_get_hdr_no(buf + i * OS_FILE_LOG_BLOCK_SIZE) == hdr_no_2);
769 log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
770 }
771
772 log_sys.n_log_ios++;
773
774 srv_stats.os_log_pending_writes.inc();
775
776 ut_a((next_offset >> srv_page_size_shift) <= ULINT_MAX);
777
778 const ulint page_no = ulint(next_offset >> srv_page_size_shift);
779
780 fil_io(IORequestLogWrite, true,
781 page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
782 univ_page_size,
783 ulint(next_offset & (srv_page_size - 1)), write_len, buf, NULL);
784
785 srv_stats.os_log_pending_writes.dec();
786
787 srv_stats.os_log_written.add(write_len);
788 srv_stats.log_writes.inc();
789
790 if (write_len < len) {
791 start_lsn += write_len;
792 len -= write_len;
793 buf += write_len;
794
795 write_header = true;
796
797 goto loop;
798 }
799 }
800
/** Flush the recently written changes to the log file,
then acquire the log mutex via log_mutex_enter().
NOTE: returns with the log mutex held; the caller must release it. */
static void log_write_flush_to_disk_low()
{
	ut_a(log_sys.n_pending_flushes);

	/* With SRV_O_DSYNC the writes were already durable, so no
	explicit flush of the log space is needed. */
	bool	do_flush = srv_file_flush_method != SRV_O_DSYNC;

	if (do_flush) {
		fil_flush(SRV_LOG_SPACE_FIRST_ID);
	}

	log_mutex_enter();
	if (do_flush) {
		log_sys.flushed_to_disk_lsn = log_sys.current_flush_lsn;
	}

	log_sys.n_pending_flushes--;

	/* Wake up any thread waiting in log_write_up_to(). */
	os_event_set(log_sys.flush_event);
}
822
/** Swap log buffers, and copy the content of last block
from old buf to the head of the new buf. Thus, buf_free and
buf_next_to_write would be changed accordingly */
static inline
void
log_buffer_switch()
{
	ut_ad(log_mutex_own());
	ut_ad(log_write_mutex_own());

	/* End of the partially filled last block, rounded up to a
	block boundary. */
	ulong		area_end = ut_calc_align(
		log_sys.buf_free, ulong(OS_FILE_LOG_BLOCK_SIZE));

	/* Copy the last block to new buf */
	/* NOTE: must happen before the swap below, so that the new
	active buffer starts with the still-open block. */
	ut_memcpy(log_sys.flush_buf,
		  log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		  OS_FILE_LOG_BLOCK_SIZE);

	std::swap(log_sys.buf, log_sys.flush_buf);

	/* The open block is now at the head of the new buffer; only
	the in-block offset is kept. */
	log_sys.buf_free %= OS_FILE_LOG_BLOCK_SIZE;
	log_sys.buf_next_to_write = log_sys.buf_free;
}
846
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@param[in]	lsn		log sequence number that should be
included in the redo log file write
@param[in]	flush_to_disk	whether the written log should also
be flushed to the file system */
void
log_write_up_to(
	lsn_t	lsn,
	bool	flush_to_disk)
{
#ifdef UNIV_DEBUG
	ulint		loop_count	= 0;
#endif /* UNIV_DEBUG */
	byte*           write_buf;
	lsn_t           write_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_no_ibuf_operations) {
		/* Recovery is running and no operations on the log files are
		allowed yet (the variable name .._no_ibuf_.. is misleading) */

		return;
	}

loop:
	ut_ad(++loop_count < 128);

#if UNIV_WORD_SIZE > 7
	/* We can do a dirty read of LSN. */
	/* NOTE: Currently doesn't do dirty read for
	(flush_to_disk == true) case, because the log_mutex
	contention also works as the arbitrator for write-IO
	(fsync) bandwidth between log files and data files. */
	if (!flush_to_disk && log_sys.write_lsn >= lsn) {
		return;
	}
#endif

	log_write_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* The lsn already durably (or at least file-) covered. */
	lsn_t	limit_lsn = flush_to_disk
		? log_sys.flushed_to_disk_lsn
		: log_sys.write_lsn;

	if (limit_lsn >= lsn) {
		log_write_mutex_exit();
		return;
	}

	/* If it is a write call we should just go ahead and do it
	as we checked that write_lsn is not where we'd like it to
	be. If we have to flush as well then we check if there is a
	pending flush and based on that we wait for it to finish
	before proceeding further. */
	if (flush_to_disk
	    && (log_sys.n_pending_flushes > 0
		|| !os_event_is_set(log_sys.flush_event))) {
		/* Figure out if the current flush will do the job
		for us. */
		bool work_done = log_sys.current_flush_lsn >= lsn;

		log_write_mutex_exit();

		os_event_wait(log_sys.flush_event);

		if (work_done) {
			return;
		} else {
			/* The running flush did not cover our lsn;
			retry from the top. */
			goto loop;
		}
	}

	log_mutex_enter();
	if (!flush_to_disk
	    && log_sys.buf_free == log_sys.buf_next_to_write) {
		/* Nothing to write and no flush to disk requested */
		log_mutex_exit_all();
		return;
	}

	ulint		start_offset;
	ulint		end_offset;
	ulint		area_start;
	ulint		area_end;
	ulong		write_ahead_size = srv_log_write_ahead_size;
	ulint		pad_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
			      log_sys.write_lsn,
			      log_sys.lsn));
	if (flush_to_disk) {
		/* Claim the flush; waiters will block on flush_event
		until log_write_flush_to_disk_low() sets it. */
		log_sys.n_pending_flushes++;
		log_sys.current_flush_lsn = log_sys.lsn;
		os_event_reset(log_sys.flush_event);

		if (log_sys.buf_free == log_sys.buf_next_to_write) {
			/* Nothing to write, flush only */
			log_mutex_exit_all();
			log_write_flush_to_disk_low();
			log_mutex_exit();
			return;
		}
	}

	start_offset = log_sys.buf_free;
	end_offset = log_sys.buf_free;

	start_offset = log_sys.buf_next_to_write;
	end_offset = log_sys.buf_free;

	/* Round the write region to whole log blocks. */
	area_start = ut_2pow_round(start_offset,
				   ulint(OS_FILE_LOG_BLOCK_SIZE));
	area_end = ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));

	ut_ad(area_end - area_start > 0);

	log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
	log_block_set_checkpoint_no(
		log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		log_sys.next_checkpoint_no);

	write_lsn = log_sys.lsn;
	write_buf = log_sys.buf;

	log_buffer_switch();

	log_sys.log.set_fields(log_sys.write_lsn);

	/* The log mutex can be released now; the write itself is
	protected by the write mutex, which we still hold. */
	log_mutex_exit();
	/* Erase the end of the last log block. */
	memset(write_buf + end_offset, 0,
	       ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));

	/* Calculate pad_size if needed. */
	pad_size = 0;
	if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
		ulint	end_offset_in_unit;
		lsn_t	end_offset = log_sys.log.calc_lsn_offset(
			ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE));
		end_offset_in_unit = (ulint) (end_offset % write_ahead_size);

		if (end_offset_in_unit > 0
		    && (area_end - area_start) > end_offset_in_unit) {
			/* The first block in the unit was initialized
			after the last writing.
			Needs to be written padded data once. */
			pad_size = std::min<ulint>(
				ulint(write_ahead_size) - end_offset_in_unit,
				srv_log_buffer_size - area_end);
			::memset(write_buf + area_end, 0, pad_size);
		}
	}

	if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) {
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
					       "InnoDB log write: "
					       LSN_PF "," LSN_PF,
					       log_sys.write_lsn, lsn);
	}

	if (log_sys.is_encrypted()) {
		log_crypt(write_buf + area_start, log_sys.write_lsn,
			  area_end - area_start);
	}

	/* Do the write to the log files */
	log_write_buf(
		write_buf + area_start, area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
		pad_size,
#endif /* UNIV_DEBUG */
		ut_uint64_align_down(log_sys.write_lsn,
				     OS_FILE_LOG_BLOCK_SIZE),
		start_offset - area_start);
	srv_stats.log_padded.add(pad_size);
	log_sys.write_lsn = write_lsn;


	if (srv_file_flush_method == SRV_O_DSYNC) {
		/* O_SYNC means the OS did not buffer the log file at all:
		so we have also flushed to disk what we have written */
		log_sys.flushed_to_disk_lsn = log_sys.write_lsn;
	}

	log_write_mutex_exit();

	if (flush_to_disk) {
		/* This reacquires the log mutex, which we then release
		after reading the flushed lsn. */
		log_write_flush_to_disk_low();
		ib_uint64_t flush_lsn = log_sys.flushed_to_disk_lsn;
		log_mutex_exit();

		innobase_mysql_log_notify(flush_lsn);
	}
}
1042
1043 /** write to the log file up to the last log entry.
1044 @param[in] sync whether we want the written log
1045 also to be flushed to disk. */
log_buffer_flush_to_disk(bool sync)1046 void log_buffer_flush_to_disk(bool sync)
1047 {
1048 ut_ad(!srv_read_only_mode);
1049 log_write_up_to(log_get_lsn(), sync);
1050 }
1051
1052
/** Durably write the log and release log_sys.mutex.
Writes the log buffer contents up to log_sys.lsn to the redo log files,
flushes them to disk, and finally notifies the server layer via
innobase_mysql_log_notify() with the flushed LSN.
The caller must hold both the log write mutex and log_sys.mutex;
both are released before returning. */
ATTRIBUTE_COLD void log_write_and_flush()
{
  ut_ad(!srv_read_only_mode);
  ut_ad(!recv_no_log_write);
  ut_ad(!recv_recovery_is_on());

  /* The following code is adapted from log_write_up_to(). */
  DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
                        log_sys.write_lsn, log_sys.lsn));
  /* Announce the flush that this function will perform, so that
  waiters can block on flush_event until it completes. */
  log_sys.n_pending_flushes++;
  log_sys.current_flush_lsn= log_sys.lsn;
  os_event_reset(log_sys.flush_event);
  ut_ad(log_sys.buf_free != log_sys.buf_next_to_write);
  /* Round the write window outwards to whole log blocks. */
  ulint start_offset= log_sys.buf_next_to_write;
  ulint end_offset= log_sys.buf_free;
  ulint area_start= ut_2pow_round(start_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
  ulint area_end= ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
  ulong write_ahead_size= srv_log_write_ahead_size;

  log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
  log_block_set_checkpoint_no(log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
                              log_sys.next_checkpoint_no);
  lsn_t write_lsn= log_sys.lsn;
  byte *write_buf= log_sys.buf;

  ut_ad(area_end - area_start > 0);

  /* Swap the active and flush buffers, so that mtr_commit() in other
  threads can proceed into the other buffer while we write this one. */
  log_buffer_switch();

  log_sys.log.set_fields(log_sys.write_lsn);

  /* Erase the end of the last log block. */
  memset(write_buf + end_offset, 0,
         ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));
  /* Calculate pad_size if needed. */
  ulint pad_size= 0;
  if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE)
  {
    lsn_t end_offset=
      log_sys.log.calc_lsn_offset(ut_uint64_align_up(write_lsn,
                                                     OS_FILE_LOG_BLOCK_SIZE));
    ulint end_offset_in_unit= (ulint) (end_offset % write_ahead_size);

    if (end_offset_in_unit && (area_end - area_start) > end_offset_in_unit)
    {
      /* The first block in the unit was initialized after the last
      writing. Needs to be written padded data once. */
      pad_size= std::min<ulint>(ulint(write_ahead_size) - end_offset_in_unit,
                                srv_log_buffer_size - area_end);
      memset(write_buf + area_end, 0, pad_size);
    }
  }

  if (log_sys.is_encrypted())
    log_crypt(write_buf + area_start, log_sys.write_lsn,
              area_end - area_start);

  /* Do the write to the log files */
  log_write_buf(write_buf + area_start, area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
                pad_size,
#endif /* UNIV_DEBUG */
                ut_uint64_align_down(log_sys.write_lsn,
                                     OS_FILE_LOG_BLOCK_SIZE),
                start_offset - area_start);
  srv_stats.log_padded.add(pad_size);
  log_sys.write_lsn= write_lsn;

  log_write_mutex_exit();

  /* Code adapted from log_write_flush_to_disk_low() */

  ut_a(log_sys.n_pending_flushes);

  /* With O_DSYNC the write above was already durable;
  otherwise, explicitly flush the redo log files. */
  if (srv_file_flush_method != SRV_O_DSYNC)
    fil_flush(SRV_LOG_SPACE_FIRST_ID);

  log_sys.flushed_to_disk_lsn= log_sys.current_flush_lsn;

  log_sys.n_pending_flushes--;

  /* Wake up any threads waiting for this flush to complete. */
  os_event_set(log_sys.flush_event);

  const lsn_t flush_lsn= log_sys.flushed_to_disk_lsn;
  log_mutex_exit();

  innobase_mysql_log_notify(flush_lsn);
}
1142
1143 /****************************************************************//**
1144 This functions writes the log buffer to the log file and if 'flush'
1145 is set it forces a flush of the log file as well. This is meant to be
1146 called from background master thread only as it does not wait for
1147 the write (+ possible flush) to finish. */
1148 void
log_buffer_sync_in_background(bool flush)1149 log_buffer_sync_in_background(
1150 /*==========================*/
1151 bool flush) /*!< in: flush the logs to disk */
1152 {
1153 lsn_t lsn;
1154
1155 log_mutex_enter();
1156
1157 lsn = log_sys.lsn;
1158
1159 if (flush
1160 && log_sys.n_pending_flushes > 0
1161 && log_sys.current_flush_lsn >= lsn) {
1162 /* The write + flush will write enough */
1163 log_mutex_exit();
1164 return;
1165 }
1166
1167 log_mutex_exit();
1168
1169 log_write_up_to(lsn, flush);
1170 }
1171
1172 /********************************************************************
1173
1174 Tries to establish a big enough margin of free space in the log buffer, such
1175 that a new log entry can be catenated without an immediate need for a flush. */
1176 static
1177 void
log_flush_margin(void)1178 log_flush_margin(void)
1179 /*==================*/
1180 {
1181 lsn_t lsn = 0;
1182
1183 log_mutex_enter();
1184
1185 if (log_sys.buf_free > log_sys.max_buf_free) {
1186 /* We can write during flush */
1187 lsn = log_sys.lsn;
1188 }
1189
1190 log_mutex_exit();
1191
1192 if (lsn) {
1193 log_write_up_to(lsn, false);
1194 }
1195 }
1196
1197 /** Advances the smallest lsn for which there are unflushed dirty blocks in the
1198 buffer pool.
1199 NOTE: this function may only be called if the calling thread owns no
1200 synchronization objects!
1201 @param[in] new_oldest try to advance oldest_modified_lsn at least to
1202 this lsn
1203 @return false if there was a flush batch of the same type running,
1204 which means that we could not start this flush batch */
log_preflush_pool_modified_pages(lsn_t new_oldest)1205 static bool log_preflush_pool_modified_pages(lsn_t new_oldest)
1206 {
1207 bool success;
1208
1209 if (recv_recovery_is_on()) {
1210 /* If the recovery is running, we must first apply all
1211 log records to their respective file pages to get the
1212 right modify lsn values to these pages: otherwise, there
1213 might be pages on disk which are not yet recovered to the
1214 current lsn, and even after calling this function, we could
1215 not know how up-to-date the disk version of the database is,
1216 and we could not make a new checkpoint on the basis of the
1217 info on the buffer pool only. */
1218 recv_apply_hashed_log_recs(true);
1219 }
1220
1221 if (new_oldest == LSN_MAX
1222 || !buf_page_cleaner_is_active
1223 || srv_is_being_started) {
1224
1225 ulint n_pages;
1226
1227 success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages);
1228
1229 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
1230
1231 if (!success) {
1232 MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
1233 }
1234
1235 MONITOR_INC_VALUE_CUMULATIVE(
1236 MONITOR_FLUSH_SYNC_TOTAL_PAGE,
1237 MONITOR_FLUSH_SYNC_COUNT,
1238 MONITOR_FLUSH_SYNC_PAGES,
1239 n_pages);
1240 } else {
1241 /* better to wait for flushed by page cleaner */
1242
1243 if (srv_flush_sync) {
1244 /* wake page cleaner for IO burst */
1245 buf_flush_request_force(new_oldest);
1246 }
1247
1248 buf_flush_wait_flushed(new_oldest);
1249
1250 success = true;
1251 }
1252
1253 return(success);
1254 }
1255
1256 /******************************************************//**
1257 Completes a checkpoint. */
1258 static
1259 void
log_complete_checkpoint(void)1260 log_complete_checkpoint(void)
1261 /*=========================*/
1262 {
1263 ut_ad(log_mutex_own());
1264 ut_ad(log_sys.n_pending_checkpoint_writes == 0);
1265
1266 log_sys.next_checkpoint_no++;
1267
1268 log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn;
1269 MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
1270 log_sys.lsn - log_sys.last_checkpoint_lsn);
1271
1272 DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
1273 ", flushed to " LSN_PF,
1274 log_sys.last_checkpoint_lsn,
1275 log_sys.flushed_to_disk_lsn));
1276
1277 rw_lock_x_unlock_gen(&(log_sys.checkpoint_lock), LOG_CHECKPOINT);
1278 }
1279
1280 /** Complete an asynchronous checkpoint write. */
complete_checkpoint()1281 void log_t::complete_checkpoint()
1282 {
1283 ut_ad(this == &log_sys);
1284 MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
1285
1286 log_mutex_enter();
1287
1288 ut_ad(n_pending_checkpoint_writes > 0);
1289
1290 if (!--n_pending_checkpoint_writes) {
1291 log_complete_checkpoint();
1292 }
1293
1294 log_mutex_exit();
1295 }
1296
/** Write checkpoint info to the log header.
Builds a checkpoint page in log_sys.checkpoint_buf and submits an
asynchronous write of it to one of the two alternating checkpoint
slots (LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2) in the log file header.
The caller must hold log_sys.mutex.
@param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
static
void
log_group_checkpoint(lsn_t end_lsn)
{
	lsn_t		lsn_offset;

	ut_ad(!srv_read_only_mode);
	ut_ad(log_mutex_own());
	ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn);
	ut_ad(end_lsn <= log_sys.lsn);
	ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys.lsn
	      || srv_shutdown_state > SRV_SHUTDOWN_INITIATED);

	DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
			      " written",
			      log_sys.next_checkpoint_no,
			      log_sys.next_checkpoint_lsn));

	/* Build the checkpoint block in the dedicated buffer. */
	byte* buf = log_sys.checkpoint_buf;
	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);

	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no);
	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn);

	/* Append encryption information when redo log encryption
	is enabled. */
	if (log_sys.is_encrypted()) {
		log_crypt_write_checkpoint_buf(buf);
	}

	/* Record the byte offset of the checkpoint LSN within the
	log file, so that recovery can seek to it directly. */
	lsn_offset = log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn);
	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE,
			srv_log_buffer_size);
	mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn);

	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));

	MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);

	log_sys.n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	/* Both checkpoint slots must fit in the first page of the
	log file. */
	ut_ad(LOG_CHECKPOINT_1 < srv_page_size);
	ut_ad(LOG_CHECKPOINT_2 < srv_page_size);

	/* The first pending write acquires the checkpoint latch;
	it is released in log_complete_checkpoint(). */
	if (log_sys.n_pending_checkpoint_writes++ == 0) {
		rw_lock_x_lock_gen(&log_sys.checkpoint_lock,
				   LOG_CHECKPOINT);
	}

	/* Note: We alternate the physical place of the checkpoint info.
	See the (next_checkpoint_no & 1) below. */

	fil_io(IORequestLogWrite, false,
	       page_id_t(SRV_LOG_SPACE_FIRST_ID, 0),
	       univ_page_size,
	       (log_sys.next_checkpoint_no & 1)
	       ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
	       OS_FILE_LOG_BLOCK_SIZE,
	       buf, reinterpret_cast<void*>(1) /* checkpoint write */);
}
1360
1361 /** Read a log group header page to log_sys.checkpoint_buf.
1362 @param[in] header 0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT2 */
log_header_read(ulint header)1363 void log_header_read(ulint header)
1364 {
1365 ut_ad(log_mutex_own());
1366
1367 log_sys.n_log_ios++;
1368
1369 MONITOR_INC(MONITOR_LOG_IO);
1370
1371 fil_io(IORequestLogRead, true,
1372 page_id_t(SRV_LOG_SPACE_FIRST_ID,
1373 header >> srv_page_size_shift),
1374 univ_page_size, header & (srv_page_size - 1),
1375 OS_FILE_LOG_BLOCK_SIZE, log_sys.checkpoint_buf, NULL);
1376 }
1377
1378 /** Write checkpoint info to the log header and invoke log_mutex_exit().
1379 @param[in] sync whether to wait for the write to complete
1380 @param[in] end_lsn start LSN of the MLOG_CHECKPOINT mini-transaction */
1381 void
log_write_checkpoint_info(bool sync,lsn_t end_lsn)1382 log_write_checkpoint_info(bool sync, lsn_t end_lsn)
1383 {
1384 ut_ad(log_mutex_own());
1385 ut_ad(!srv_read_only_mode);
1386
1387 log_group_checkpoint(end_lsn);
1388
1389 log_mutex_exit();
1390
1391 MONITOR_INC(MONITOR_NUM_CHECKPOINT);
1392
1393 if (sync) {
1394 /* Wait for the checkpoint write to complete */
1395 rw_lock_s_lock(&log_sys.checkpoint_lock);
1396 rw_lock_s_unlock(&log_sys.checkpoint_lock);
1397
1398 DBUG_EXECUTE_IF(
1399 "crash_after_checkpoint",
1400 DBUG_SUICIDE(););
1401 }
1402 }
1403
1404 /** Set extra data to be written to the redo log during checkpoint.
1405 @param[in] buf data to be appended on checkpoint, or NULL
1406 @return pointer to previous data to be appended on checkpoint */
1407 mtr_buf_t*
log_append_on_checkpoint(mtr_buf_t * buf)1408 log_append_on_checkpoint(
1409 mtr_buf_t* buf)
1410 {
1411 log_mutex_enter();
1412 mtr_buf_t* old = log_sys.append_on_checkpoint;
1413 log_sys.append_on_checkpoint = buf;
1414 log_mutex_exit();
1415 return(old);
1416 }
1417
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log files. Use log_make_checkpoint() to flush also the pool.
@param[in]	sync	whether to wait for the write to complete
@return true if success, false if a checkpoint write was already running */
bool log_checkpoint(bool sync)
{
	lsn_t	oldest_lsn;

	ut_ad(!srv_read_only_mode);

	DBUG_EXECUTE_IF("no_checkpoint",
			/* We sleep for a long enough time, forcing
			the checkpoint doesn't happen any more. */
			os_thread_sleep(360000000););

	/* Apply any buffered redo log records first, so that the
	buffer pool modification LSNs are accurate. */
	if (recv_recovery_is_on()) {
		recv_apply_hashed_log_recs(true);
	}

	/* Flush all modified tablespace files, unless syncing has
	been disabled with SRV_NOSYNC. */
	switch (srv_file_flush_method) {
	case SRV_NOSYNC:
		break;
	case SRV_O_DSYNC:
	case SRV_FSYNC:
	case SRV_LITTLESYNC:
	case SRV_O_DIRECT:
	case SRV_O_DIRECT_NO_FSYNC:
#ifdef _WIN32
	case SRV_ALL_O_DIRECT_FSYNC:
#endif
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
	}

	log_mutex_enter();

	ut_ad(!recv_no_log_write);
	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Because log also contains headers and dummy log records,
	log_buf_pool_get_oldest_modification() will return log_sys.lsn
	if the buffer pool contains no dirty buffers.
	We must make sure that the log is flushed up to that lsn.
	If there are dirty buffers in the buffer pool, then our
	write-ahead-logging algorithm ensures that the log has been
	flushed up to oldest_lsn. */

	ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
	if (oldest_lsn
	    > log_sys.last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
		/* Some log has been written since the previous checkpoint. */
	} else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
		/* MariaDB 10.3 startup expects the redo log file to be
		logically empty (not even containing a MLOG_CHECKPOINT record)
		after a clean shutdown. Perform an extra checkpoint at
		shutdown. */
	} else {
		/* Do nothing, because nothing was logged (other than
		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
		log_mutex_exit();
		return(true);
	}
	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
	case some log records between the checkpoint and log_sys.lsn
	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
	except on clean shutdown, where the log will be empty after
	the checkpoint.
	It is important that we write out the redo log before any
	further dirty pages are flushed to the tablespace files. At
	this point, because log_mutex_own(), mtr_commit() in other
	threads will be blocked, and no pages can be added to the
	flush lists. */
	lsn_t		flush_lsn	= oldest_lsn;
	const lsn_t	end_lsn		= log_sys.lsn;
	const bool	do_write
		= srv_shutdown_state <= SRV_SHUTDOWN_INITIATED
		|| flush_lsn != end_lsn;

	if (fil_names_clear(flush_lsn, do_write)) {
		/* A MLOG_CHECKPOINT marker was appended;
		flush everything up to and including it. */
		ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT);
		flush_lsn = log_sys.lsn;
	}

	log_mutex_exit();

	/* Durably write the log up to flush_lsn before writing the
	checkpoint information. */
	log_write_up_to(flush_lsn, true);

	log_mutex_enter();

	ut_ad(log_sys.flushed_to_disk_lsn >= flush_lsn);
	ut_ad(flush_lsn >= oldest_lsn);

	/* Another thread may have advanced the checkpoint while we
	were not holding the mutex. */
	if (log_sys.last_checkpoint_lsn >= oldest_lsn) {
		log_mutex_exit();
		return(true);
	}

	if (log_sys.n_pending_checkpoint_writes > 0) {
		/* A checkpoint write is running */
		log_mutex_exit();

		if (sync) {
			/* Wait for the checkpoint write to complete */
			rw_lock_s_lock(&log_sys.checkpoint_lock);
			rw_lock_s_unlock(&log_sys.checkpoint_lock);
		}

		return(false);
	}

	log_sys.next_checkpoint_lsn = oldest_lsn;
	/* This also releases log_sys.mutex. */
	log_write_checkpoint_info(sync, end_lsn);
	ut_ad(!log_mutex_own());

	return(true);
}
1536
1537 /** Make a checkpoint */
log_make_checkpoint()1538 void log_make_checkpoint()
1539 {
1540 /* Preflush pages synchronously */
1541
1542 while (!log_preflush_pool_modified_pages(LSN_MAX)) {
1543 /* Flush as much as we can */
1544 }
1545
1546 while (!log_checkpoint(true)) {
1547 /* Force a checkpoint */
1548 }
1549 }
1550
1551 /****************************************************************//**
1552 Tries to establish a big enough margin of free space in the log groups, such
1553 that a new log entry can be catenated without an immediate need for a
1554 checkpoint. NOTE: this function may only be called if the calling thread
1555 owns no synchronization objects! */
1556 static
1557 void
log_checkpoint_margin(void)1558 log_checkpoint_margin(void)
1559 /*=======================*/
1560 {
1561 lsn_t age;
1562 lsn_t checkpoint_age;
1563 ib_uint64_t advance;
1564 lsn_t oldest_lsn;
1565 bool success;
1566 loop:
1567 advance = 0;
1568
1569 log_mutex_enter();
1570 ut_ad(!recv_no_log_write);
1571
1572 if (!log_sys.check_flush_or_checkpoint) {
1573 log_mutex_exit();
1574 return;
1575 }
1576
1577 oldest_lsn = log_buf_pool_get_oldest_modification();
1578
1579 age = log_sys.lsn - oldest_lsn;
1580
1581 if (age > log_sys.max_modified_age_sync) {
1582
1583 /* A flush is urgent: we have to do a synchronous preflush */
1584 advance = age - log_sys.max_modified_age_sync;
1585 }
1586
1587 checkpoint_age = log_sys.lsn - log_sys.last_checkpoint_lsn;
1588
1589 bool checkpoint_sync;
1590 bool do_checkpoint;
1591
1592 if (checkpoint_age > log_sys.max_checkpoint_age) {
1593 /* A checkpoint is urgent: we do it synchronously */
1594 checkpoint_sync = true;
1595 do_checkpoint = true;
1596 } else if (checkpoint_age > log_sys.max_checkpoint_age_async) {
1597 /* A checkpoint is not urgent: do it asynchronously */
1598 do_checkpoint = true;
1599 checkpoint_sync = false;
1600 log_sys.check_flush_or_checkpoint = false;
1601 } else {
1602 do_checkpoint = false;
1603 checkpoint_sync = false;
1604 log_sys.check_flush_or_checkpoint = false;
1605 }
1606
1607 log_mutex_exit();
1608
1609 if (advance) {
1610 lsn_t new_oldest = oldest_lsn + advance;
1611
1612 success = log_preflush_pool_modified_pages(new_oldest);
1613
1614 /* If the flush succeeded, this thread has done its part
1615 and can proceed. If it did not succeed, there was another
1616 thread doing a flush at the same time. */
1617 if (!success) {
1618 log_mutex_enter();
1619 log_sys.check_flush_or_checkpoint = true;
1620 log_mutex_exit();
1621 goto loop;
1622 }
1623 }
1624
1625 if (do_checkpoint) {
1626 log_checkpoint(checkpoint_sync);
1627
1628 if (checkpoint_sync) {
1629
1630 goto loop;
1631 }
1632 }
1633 }
1634
1635 /**
1636 Checks that there is enough free space in the log to start a new query step.
1637 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
1638 function may only be called if the calling thread owns no synchronization
1639 objects! */
1640 void
log_check_margins(void)1641 log_check_margins(void)
1642 {
1643 bool check;
1644
1645 do {
1646 log_flush_margin();
1647 log_checkpoint_margin();
1648 log_mutex_enter();
1649 ut_ad(!recv_no_log_write);
1650 check = log_sys.check_flush_or_checkpoint;
1651 log_mutex_exit();
1652 } while (check);
1653 }
1654
1655 /****************************************************************//**
1656 Makes a checkpoint at the latest lsn and writes it to first page of each
1657 data file in the database, so that we know that the file spaces contain
1658 all modifications up to that lsn. This can only be called at database
1659 shutdown. This function also writes all log in log files to the log archive. */
1660 void
logs_empty_and_mark_files_at_shutdown(void)1661 logs_empty_and_mark_files_at_shutdown(void)
1662 /*=======================================*/
1663 {
1664 lsn_t lsn;
1665 ulint count = 0;
1666
1667 ib::info() << "Starting shutdown...";
1668
1669 /* Wait until the master thread and all other operations are idle: our
1670 algorithm only works if the server is idle at shutdown */
1671
1672 srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
1673 loop:
1674 ut_ad(lock_sys.is_initialised() || !srv_was_started);
1675 ut_ad(log_sys.is_initialised() || !srv_was_started);
1676 ut_ad(fil_system.is_initialised() || !srv_was_started);
1677 os_event_set(srv_buf_resize_event);
1678
1679 if (!srv_read_only_mode) {
1680 os_event_set(srv_error_event);
1681 os_event_set(srv_monitor_event);
1682 os_event_set(srv_buf_dump_event);
1683 if (lock_sys.timeout_thread_active) {
1684 os_event_set(lock_sys.timeout_event);
1685 }
1686 if (dict_stats_event) {
1687 os_event_set(dict_stats_event);
1688 } else {
1689 ut_ad(!srv_dict_stats_thread_active);
1690 }
1691 if (recv_sys && recv_sys->flush_start) {
1692 /* This is in case recv_writer_thread was never
1693 started, or buf_flush_page_cleaner_coordinator
1694 failed to notice its termination. */
1695 os_event_set(recv_sys->flush_start);
1696 }
1697 }
1698 #define COUNT_INTERVAL 600U
1699 #define CHECK_INTERVAL 100000U
1700 os_thread_sleep(CHECK_INTERVAL);
1701
1702 count++;
1703
1704 /* Check that there are no longer transactions, except for
1705 PREPARED ones. We need this wait even for the 'very fast'
1706 shutdown, because the InnoDB layer may have committed or
1707 prepared transactions and we don't want to lose them. */
1708
1709 if (ulint total_trx = srv_was_started && !srv_read_only_mode
1710 && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
1711 ? trx_sys.any_active_transactions() : 0) {
1712
1713 if (srv_print_verbose_log && count > COUNT_INTERVAL) {
1714 service_manager_extend_timeout(
1715 COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
1716 "Waiting for %lu active transactions to finish",
1717 (ulong) total_trx);
1718 ib::info() << "Waiting for " << total_trx << " active"
1719 << " transactions to finish";
1720
1721 count = 0;
1722 }
1723
1724 goto loop;
1725 }
1726
1727 /* We need these threads to stop early in shutdown. */
1728 const char* thread_name;
1729
1730 if (srv_error_monitor_active) {
1731 thread_name = "srv_error_monitor_thread";
1732 } else if (srv_monitor_active) {
1733 thread_name = "srv_monitor_thread";
1734 } else if (srv_buf_resize_thread_active) {
1735 thread_name = "buf_resize_thread";
1736 goto wait_suspend_loop;
1737 } else if (srv_dict_stats_thread_active) {
1738 thread_name = "dict_stats_thread";
1739 } else if (lock_sys.timeout_thread_active) {
1740 thread_name = "lock_wait_timeout_thread";
1741 } else if (srv_buf_dump_thread_active) {
1742 thread_name = "buf_dump_thread";
1743 goto wait_suspend_loop;
1744 } else if (btr_defragment_thread_active) {
1745 thread_name = "btr_defragment_thread";
1746 } else if (srv_fast_shutdown != 2 && trx_rollback_is_active) {
1747 thread_name = "rollback of recovered transactions";
1748 } else {
1749 thread_name = NULL;
1750 }
1751
1752 if (thread_name) {
1753 ut_ad(!srv_read_only_mode);
1754 wait_suspend_loop:
1755 service_manager_extend_timeout(
1756 COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
1757 "Waiting for %s to exit", thread_name);
1758 if (srv_print_verbose_log && count > COUNT_INTERVAL) {
1759 ib::info() << "Waiting for " << thread_name
1760 << " to exit";
1761 count = 0;
1762 }
1763 goto loop;
1764 }
1765
1766 /* Check that the background threads are suspended */
1767
1768 switch (srv_get_active_thread_type()) {
1769 case SRV_NONE:
1770 if (!srv_n_fil_crypt_threads_started) {
1771 srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
1772 break;
1773 }
1774 os_event_set(fil_crypt_threads_event);
1775 thread_name = "fil_crypt_thread";
1776 goto wait_suspend_loop;
1777 case SRV_PURGE:
1778 case SRV_WORKER:
1779 ut_ad(!"purge was not shut down");
1780 srv_purge_wakeup();
1781 thread_name = "purge thread";
1782 goto wait_suspend_loop;
1783 case SRV_MASTER:
1784 thread_name = "master thread";
1785 goto wait_suspend_loop;
1786 }
1787
1788 /* At this point only page_cleaner should be active. We wait
1789 here to let it complete the flushing of the buffer pools
1790 before proceeding further. */
1791
1792 count = 0;
1793 service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
1794 "Waiting for page cleaner");
1795 while (buf_page_cleaner_is_active) {
1796 ++count;
1797 os_thread_sleep(CHECK_INTERVAL);
1798 if (srv_print_verbose_log && count > COUNT_INTERVAL) {
1799 service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
1800 "Waiting for page cleaner");
1801 ib::info() << "Waiting for page_cleaner to "
1802 "finish flushing of buffer pool";
1803 /* This is a workaround to avoid the InnoDB hang
1804 when OS datetime changed backwards */
1805 os_event_set(buf_flush_event);
1806 count = 0;
1807 }
1808 }
1809
1810 if (log_scrub_thread_active) {
1811 ut_ad(!srv_read_only_mode);
1812 os_event_set(log_scrub_event);
1813 }
1814
1815 if (log_sys.is_initialised()) {
1816 log_mutex_enter();
1817 const ulint n_write = log_sys.n_pending_checkpoint_writes;
1818 const ulint n_flush = log_sys.n_pending_flushes;
1819 log_mutex_exit();
1820
1821 if (log_scrub_thread_active || n_write || n_flush) {
1822 if (srv_print_verbose_log && count > 600) {
1823 ib::info() << "Pending checkpoint_writes: "
1824 << n_write
1825 << ". Pending log flush writes: "
1826 << n_flush;
1827 count = 0;
1828 }
1829 goto loop;
1830 }
1831 }
1832
1833 ut_ad(!log_scrub_thread_active);
1834
1835 if (!buf_pool_ptr) {
1836 ut_ad(!srv_was_started);
1837 } else if (ulint pending_io = buf_pool_check_no_pending_io()) {
1838 if (srv_print_verbose_log && count > 600) {
1839 ib::info() << "Waiting for " << pending_io << " buffer"
1840 " page I/Os to complete";
1841 count = 0;
1842 }
1843
1844 goto loop;
1845 }
1846
1847 if (srv_fast_shutdown == 2 || !srv_was_started) {
1848 if (!srv_read_only_mode && srv_was_started) {
1849 ib::info() << "MySQL has requested a very fast"
1850 " shutdown without flushing the InnoDB buffer"
1851 " pool to data files. At the next mysqld"
1852 " startup InnoDB will do a crash recovery!";
1853
1854 /* In this fastest shutdown we do not flush the
1855 buffer pool:
1856
1857 it is essentially a 'crash' of the InnoDB server.
1858 Make sure that the log is all flushed to disk, so
1859 that we can recover all committed transactions in
1860 a crash recovery. We must not write the lsn stamps
1861 to the data files, since at a startup InnoDB deduces
1862 from the stamps if the previous shutdown was clean. */
1863
1864 log_buffer_flush_to_disk();
1865 }
1866
1867 srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
1868
1869 if (fil_system.is_initialised()) {
1870 fil_close_all_files();
1871 }
1872 return;
1873 }
1874
1875 if (!srv_read_only_mode) {
1876 service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
1877 "ensuring dirty buffer pool are written to log");
1878 log_make_checkpoint();
1879
1880 log_mutex_enter();
1881
1882 lsn = log_sys.lsn;
1883
1884 const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn;
1885 ut_ad(lsn >= log_sys.last_checkpoint_lsn);
1886
1887 log_mutex_exit();
1888
1889 if (lsn_changed) {
1890 goto loop;
1891 }
1892
1893 /* Ensure that all buffered changes are written to the
1894 redo log before fil_close_all_files(). */
1895 fil_flush_file_spaces(FIL_TYPE_LOG);
1896 } else {
1897 lsn = srv_start_lsn;
1898 }
1899
1900 srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
1901
1902 /* Make some checks that the server really is quiet */
1903 ut_a(srv_get_active_thread_type() == SRV_NONE);
1904
1905 service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
1906 "Free innodb buffer pool");
1907 buf_all_freed();
1908
1909 ut_a(lsn == log_sys.lsn
1910 || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
1911
1912 if (lsn < srv_start_lsn) {
1913 ib::error() << "Shutdown LSN=" << lsn
1914 << " is less than start LSN=" << srv_start_lsn;
1915 }
1916
1917 srv_shutdown_lsn = lsn;
1918
1919 if (!srv_read_only_mode) {
1920 dberr_t err = fil_write_flushed_lsn(lsn);
1921
1922 if (err != DB_SUCCESS) {
1923 ib::error() << "Writing flushed lsn " << lsn
1924 << " failed; error=" << err;
1925 }
1926 }
1927
1928 fil_close_all_files();
1929
1930 /* Make some checks that the server really is quiet */
1931 ut_a(srv_get_active_thread_type() == SRV_NONE);
1932
1933 ut_a(lsn == log_sys.lsn
1934 || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
1935 }
1936
1937 /******************************************************//**
1938 Peeks the current lsn.
1939 @return TRUE if success, FALSE if could not get the log system mutex */
1940 ibool
log_peek_lsn(lsn_t * lsn)1941 log_peek_lsn(
1942 /*=========*/
1943 lsn_t* lsn) /*!< out: if returns TRUE, current lsn is here */
1944 {
1945 if (0 == mutex_enter_nowait(&(log_sys.mutex))) {
1946 *lsn = log_sys.lsn;
1947
1948 log_mutex_exit();
1949
1950 return(TRUE);
1951 }
1952
1953 return(FALSE);
1954 }
1955
1956 /******************************************************//**
1957 Prints info of the log. */
1958 void
log_print(FILE * file)1959 log_print(
1960 /*======*/
1961 FILE* file) /*!< in: file where to print */
1962 {
1963 double time_elapsed;
1964 time_t current_time;
1965
1966 log_mutex_enter();
1967
1968 fprintf(file,
1969 "Log sequence number " LSN_PF "\n"
1970 "Log flushed up to " LSN_PF "\n"
1971 "Pages flushed up to " LSN_PF "\n"
1972 "Last checkpoint at " LSN_PF "\n",
1973 log_sys.lsn,
1974 log_sys.flushed_to_disk_lsn,
1975 log_buf_pool_get_oldest_modification(),
1976 log_sys.last_checkpoint_lsn);
1977
1978 current_time = time(NULL);
1979
1980 time_elapsed = difftime(current_time,
1981 log_sys.last_printout_time);
1982
1983 if (time_elapsed <= 0) {
1984 time_elapsed = 1;
1985 }
1986
1987 fprintf(file,
1988 ULINTPF " pending log flushes, "
1989 ULINTPF " pending chkp writes\n"
1990 ULINTPF " log i/o's done, %.2f log i/o's/second\n",
1991 log_sys.n_pending_flushes,
1992 log_sys.n_pending_checkpoint_writes,
1993 log_sys.n_log_ios,
1994 static_cast<double>(
1995 log_sys.n_log_ios - log_sys.n_log_ios_old)
1996 / time_elapsed);
1997
1998 log_sys.n_log_ios_old = log_sys.n_log_ios;
1999 log_sys.last_printout_time = current_time;
2000
2001 log_mutex_exit();
2002 }
2003
2004 /**********************************************************************//**
2005 Refreshes the statistics used to print per-second averages. */
2006 void
log_refresh_stats(void)2007 log_refresh_stats(void)
2008 /*===================*/
2009 {
2010 log_sys.n_log_ios_old = log_sys.n_log_ios;
2011 log_sys.last_printout_time = time(NULL);
2012 }
2013
/** Shut down the redo log subsystem: free the log buffers,
destroy the events, latches and mutexes, and close the recovery
subsystem. Safe to call more than once; repeated calls are no-ops. */
void log_t::close()
{
  ut_ad(this == &log_sys);
  if (!is_initialised()) return;
  m_initialised = false;
  log.close();

  ut_free_dodump(buf, srv_log_buffer_size);
  buf = NULL;
  ut_free_dodump(flush_buf, srv_log_buffer_size);
  flush_buf = NULL;

  os_event_destroy(flush_event);
  rw_lock_free(&checkpoint_lock);
  mutex_free(&mutex);
  mutex_free(&write_mutex);
  mutex_free(&log_flush_order_mutex);

  /* The scrub event only exists on a read-write instance
  with log scrubbing enabled. */
  if (!srv_read_only_mode && srv_scrub_log)
    os_event_destroy(log_scrub_event);

  recv_sys_close();
}
2038
2039 /******************************************************//**
2040 Pads the current log block full with dummy log records. Used in producing
2041 consistent archived log files and scrubbing redo log. */
2042 static
2043 void
log_pad_current_log_block(void)2044 log_pad_current_log_block(void)
2045 /*===========================*/
2046 {
2047 byte b = MLOG_DUMMY_RECORD;
2048 ulint pad_length;
2049 ulint i;
2050 lsn_t lsn;
2051
2052 ut_ad(!recv_no_log_write);
2053 /* We retrieve lsn only because otherwise gcc crashed on HP-UX */
2054 lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);
2055
2056 pad_length = OS_FILE_LOG_BLOCK_SIZE
2057 - (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE)
2058 - LOG_BLOCK_TRL_SIZE;
2059 if (pad_length
2060 == (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
2061 - LOG_BLOCK_TRL_SIZE)) {
2062
2063 pad_length = 0;
2064 }
2065
2066 if (pad_length) {
2067 srv_stats.n_log_scrubs.inc();
2068 }
2069
2070 for (i = 0; i < pad_length; i++) {
2071 log_write_low(&b, 1);
2072 }
2073
2074 lsn = log_sys.lsn;
2075
2076 log_close();
2077
2078 ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
2079 }
2080
2081 /*****************************************************************//*
2082 If no log record has been written for a while, fill current log
2083 block with dummy records. */
2084 static
2085 void
log_scrub()2086 log_scrub()
2087 /*=========*/
2088 {
2089 log_mutex_enter();
2090 ulint cur_lbn = log_block_convert_lsn_to_no(log_sys.lsn);
2091
2092 if (next_lbn_to_pad == cur_lbn)
2093 {
2094 log_pad_current_log_block();
2095 }
2096
2097 next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys.lsn);
2098 log_mutex_exit();
2099 }
2100
/* Log scrubbing speed, in bytes/sec. Read by log_scrub_thread() to
compute its wakeup interval; presumably bound to the
innodb_scrub_log_speed system variable — confirm at the sysvar
definition. */
UNIV_INTERN ulonglong innodb_scrub_log_speed;
2103
2104 /*****************************************************************//**
2105 This is the main thread for log scrub. It waits for an event and
2106 when waked up fills current log block with dummy records and
2107 sleeps again.
2108 @return this function does not return, it calls os_thread_exit() */
2109 extern "C" UNIV_INTERN
2110 os_thread_ret_t
DECLARE_THREAD(log_scrub_thread)2111 DECLARE_THREAD(log_scrub_thread)(void*)
2112 {
2113 ut_ad(!srv_read_only_mode);
2114
2115 while (srv_shutdown_state < SRV_SHUTDOWN_FLUSH_PHASE) {
2116 /* log scrubbing interval in µs. */
2117 ulonglong interval = 1000*1000*512/innodb_scrub_log_speed;
2118
2119 os_event_wait_time(log_scrub_event, static_cast<ulint>(interval));
2120
2121 log_scrub();
2122
2123 os_event_reset(log_scrub_event);
2124 }
2125
2126 log_scrub_thread_active = false;
2127
2128 /* We count the number of threads in os_thread_exit(). A created
2129 thread should always use that to exit and not use return() to exit. */
2130 os_thread_exit();
2131
2132 OS_THREAD_DUMMY_RETURN;
2133 }
2134