1 /*****************************************************************************
2
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, Google Inc.
5 Copyright (c) 2016, Percona Inc. All Rights Reserved.
6
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation. The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License, version 2.0, for more details.
28
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32
33 *****************************************************************************/
34
35 /**************************************************//**
36 @file log/log0log.cc
37 Database log
38
39 Created 12/9/1995 Heikki Tuuri
40 *******************************************************/
41
42 #include "ha_prototypes.h"
43 #include <debug_sync.h>
44
45 #include "log0log.h"
46
47 #ifdef UNIV_NONINL
48 #include "log0log.ic"
49 #endif
50
51 #include "mem0mem.h"
52 #include "buf0buf.h"
53 #ifndef UNIV_HOTBACKUP
54 #include "buf0flu.h"
55 #include "srv0srv.h"
56 #include "log0recv.h"
57 #include "lock0lock.h"
58 #include "fil0fil.h"
59 #include "fil0crypt.h"
60 #include "dict0boot.h"
61 #include "dict0stats_bg.h"
62 #include "srv0srv.h"
63 #include "srv0start.h"
64 #include "trx0sys.h"
65 #include "trx0trx.h"
66 #include "trx0roll.h"
67 #include "srv0mon.h"
68 #include "sync0sync.h"
69 #endif /* !UNIV_HOTBACKUP */
70
71 #include "system_key.h"
72
/* Redo log encryption mode currently in effect; starts as OFF.
NOTE(review): presumably set to the mode found in the existing redo log
during startup — confirm against the code that assigns it. */
redo_log_encrypt_enum existing_redo_encryption_mode = REDO_LOG_ENCRYPT_OFF;
74
75 /*
76 General philosophy of InnoDB redo-logs:
77
78 1) Every change to a contents of a data page must be done
79 through mtr, which in mtr_commit() writes log records
80 to the InnoDB redo log.
81
82 2) Normally these changes are performed using a mlog_write_ulint()
83 or similar function.
84
85 3) In some page level operations only a code number of a
86 c-function and its parameters are written to the log to
87 reduce the size of the log.
88
89 3a) You should not add parameters to these kind of functions
90 (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse())
91
3b) You should not add functionality that either changes the observable
behavior compared with the old implementation or depends on data
outside of the page. Such functions should implement a self-contained
page transformation, and they should remain unchanged unless you have
very essential reasons to change the log semantics or format.
98
99 */
100
/** Redo log system; allocated and initialized in log_init(). */
log_t*	log_sys = NULL;

/** Whether to generate and require checksums on the redo log pages */
my_bool	innodb_log_checksums;

/** Pointer to the log checksum calculation function */
log_checksum_func_t log_checksum_algorithm_ptr;

/* Next log block number to do dummy record filling if no log records written
for a while */
static ulint	next_lbn_to_pad = 0;

/* These control how often we print warnings if the last checkpoint is too
old (rate-limited to one message per 15 seconds; see log_close()) */
bool	log_has_printed_chkp_warning = false;
time_t	log_last_warning_time;

/* Rate-limit state for the checkpoint-margin warning in
log_margin_checkpoint_age(). (The "margine" spelling is historical and
kept because these names may be referenced elsewhere.) */
bool	log_has_printed_chkp_margine_warning = false;
time_t	log_last_margine_warning_time;

/* A margin for free space in the log buffer before a log entry is catenated */
#define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)

/* Margins for free space in the log buffer after a log entry is catenated */
#define LOG_BUF_FLUSH_RATIO	2
#define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE)

/* This parameter controls asynchronous making of a new checkpoint; the value
should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */

#define LOG_POOL_CHECKPOINT_RATIO_ASYNC	32

/* This parameter controls synchronous preflushing of modified buffer pages */
#define LOG_POOL_PREFLUSH_RATIO_SYNC	16

/* The same ratio for asynchronous preflushing; this value should be less than
the previous */
#define LOG_POOL_PREFLUSH_RATIO_ASYNC	8

/* Codes used in unlocking flush latches */
#define LOG_UNLOCK_NONE_FLUSHED_LOCK	1
#define LOG_UNLOCK_FLUSH_LOCK		2

/** Event to wake up log_scrub_thread */
os_event_t	log_scrub_event;
/** Whether log_scrub_thread is active */
bool	log_scrub_thread_active;

/* Redo log scrubbing thread body; presumably defined later in this
file — started from log_init() when scrubbing is enabled. */
extern "C"
os_thread_ret_t
DECLARE_THREAD(log_scrub_thread)(void*);


/******************************************************//**
Completes a checkpoint write i/o to a log file. */
static
void
log_io_complete_checkpoint(void);
/*============================*/
161
162 #ifndef UNIV_HOTBACKUP
163 /****************************************************************//**
164 Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
165 exists.
166 @return LSN of oldest modification */
167 static
168 lsn_t
log_buf_pool_get_oldest_modification(void)169 log_buf_pool_get_oldest_modification(void)
170 /*======================================*/
171 {
172 lsn_t lsn;
173
174 ut_ad(log_mutex_own());
175
176 lsn = buf_pool_get_oldest_modification();
177
178 if (!lsn) {
179
180 lsn = log_sys->lsn;
181 }
182
183 return(lsn);
184 }
185 #endif /* !UNIV_HOTBACKUP */
186
187 /****************************************************************//**
188 Checks if the log groups have a big enough margin of free space in
189 so that a new log entry can be written without overwriting log data
190 that is not read by the changed page bitmap thread.
191 @return true if there is not enough free space. */
192 static
193 bool
log_check_tracking_margin(ulint lsn_advance)194 log_check_tracking_margin(
195 ulint lsn_advance) /*!< in: an upper limit on how much log data we
196 plan to write. If zero, the margin will be
197 checked for the already-written log. */
198 {
199 lsn_t tracked_lsn;
200 lsn_t tracked_lsn_age;
201
202 if (!srv_track_changed_pages) {
203 return false;
204 }
205
206 ut_ad(mutex_own(&(log_sys->mutex)));
207
208 tracked_lsn = log_get_tracked_lsn();
209 tracked_lsn_age = log_sys->lsn - tracked_lsn;
210
211 /* The overwrite would happen when log_sys->log_group_capacity is
212 exceeded, but we use max_checkpoint_age for an extra safety margin. */
213 return tracked_lsn_age + lsn_advance > log_sys->max_checkpoint_age;
214 }
215
216 /** Extends the log buffer.
217 @param[in] len requested minimum size in bytes */
218 void
log_buffer_extend(ulint len)219 log_buffer_extend(
220 ulint len)
221 {
222 ulint move_start;
223 ulint move_end;
224 byte* tmp_buf[OS_FILE_LOG_BLOCK_SIZE];
225
226 log_mutex_enter_all();
227
228 while (log_sys->is_extending) {
229 /* Another thread is trying to extend already.
230 Needs to wait for. */
231 log_mutex_exit_all();
232
233 log_buffer_flush_to_disk();
234
235 log_mutex_enter_all();
236
237 if (srv_log_buffer_size > len / UNIV_PAGE_SIZE) {
238 /* Already extended enough by the others */
239 log_mutex_exit_all();
240 return;
241 }
242 }
243
244 if (len >= log_sys->buf_size / 2) {
245 DBUG_EXECUTE_IF("ib_log_buffer_is_short_crash",
246 DBUG_SUICIDE(););
247
248 /* log_buffer is too small. try to extend instead of crash. */
249 ib::warn() << "The transaction log size is too large"
250 " for innodb_log_buffer_size (" << len << " >= "
251 << LOG_BUFFER_SIZE << " / 2). Trying to extend it.";
252 }
253
254 log_sys->is_extending = true;
255
256 while (ut_calc_align_down(log_sys->buf_free,
257 OS_FILE_LOG_BLOCK_SIZE)
258 != ut_calc_align_down(log_sys->buf_next_to_write,
259 OS_FILE_LOG_BLOCK_SIZE)) {
260 /* Buffer might have >1 blocks to write still. */
261 log_mutex_exit_all();
262
263 log_buffer_flush_to_disk();
264
265 log_mutex_enter_all();
266 }
267
268 move_start = ut_calc_align_down(
269 log_sys->buf_free,
270 OS_FILE_LOG_BLOCK_SIZE);
271 move_end = log_sys->buf_free;
272
273 /* store the last log block in buffer */
274 ut_memcpy(tmp_buf, log_sys->buf + move_start,
275 move_end - move_start);
276
277 log_sys->buf_free -= move_start;
278 log_sys->buf_next_to_write -= move_start;
279
280 /* reallocate log buffer */
281 srv_log_buffer_size = len / UNIV_PAGE_SIZE + 1;
282 ut_free(log_sys->buf_ptr);
283
284 log_sys->buf_size = LOG_BUFFER_SIZE;
285
286 log_sys->buf_ptr = static_cast<byte*>(
287 ut_zalloc_nokey(log_sys->buf_size * 2
288 + MAX_SRV_LOG_WRITE_AHEAD_SIZE));
289 log_sys->buf = static_cast<byte*>(
290 ut_align(log_sys->buf_ptr, MAX_SRV_LOG_WRITE_AHEAD_SIZE));
291
292 log_sys->first_in_use = true;
293
294 log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
295 - LOG_BUF_FLUSH_MARGIN;
296
297 /* restore the last log block */
298 ut_memcpy(log_sys->buf, tmp_buf, move_end - move_start);
299
300 ut_ad(log_sys->is_extending);
301 log_sys->is_extending = false;
302
303 log_mutex_exit_all();
304
305 ib::info() << "innodb_log_buffer_size was extended to "
306 << LOG_BUFFER_SIZE << ".";
307 }
308
309 #ifndef UNIV_HOTBACKUP
310 /** Calculate actual length in redo buffer and file including
311 block header and trailer.
312 @param[in] len length to write
313 @return actual length to write including header and trailer. */
314 static inline
315 ulint
log_calculate_actual_len(ulint len)316 log_calculate_actual_len(
317 ulint len)
318 {
319 ut_ad(log_mutex_own());
320
321 /* actual length stored per block */
322 const ulint len_per_blk = OS_FILE_LOG_BLOCK_SIZE
323 - (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
324
325 /* actual data length in last block already written */
326 ulint extra_len = (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE);
327
328 ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
329 extra_len -= LOG_BLOCK_HDR_SIZE;
330
331 /* total extra length for block header and trailer */
332 extra_len = ((len + extra_len) / len_per_blk)
333 * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
334
335 return(len + extra_len);
336 }
337
/** Check margin not to overwrite transaction log from the last checkpoint.
If would estimate the log write to exceed the log_group_capacity,
waits for the checkpoint is done enough.
@param[in]	len	length of the data to be written */

void
log_margin_checkpoint_age(
	ulint	len)
{
	/* Bytes the write will really occupy, counting block
	headers and trailers. */
	ulint	margin = log_calculate_actual_len(len);

	ut_ad(log_mutex_own());

	if (margin > log_sys->log_group_capacity) {
		/* A single mtr larger than the whole group capacity:
		no amount of checkpointing can help, so warn (at most
		once per 15 seconds) and return with warning output to
		avoid deadlock */
		if (!log_has_printed_chkp_margine_warning
		    || difftime(time(NULL),
				log_last_margine_warning_time) > 15) {
			log_has_printed_chkp_margine_warning = true;
			log_last_margine_warning_time = time(NULL);

			ib::error() << "The transaction log files are too"
				" small for the single transaction log (size="
				<< len << "). So, the last checkpoint age"
				" might exceed the log group capacity "
				<< log_sys->log_group_capacity << ".";
		}

		return;
	}

	/* Our margin check should ensure that we never reach this condition.
	Try to do checkpoint once. We cannot keep waiting here as it might
	result in hang in case the current mtr has latch on oldest lsn */
	if (log_sys->lsn - log_sys->last_checkpoint_lsn + margin
	    > log_sys->log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Makes checkpoint. */

		bool	flushed_enough = false;

		/* If flushing up to the oldest dirty page already leaves
		room for this write, we can checkpoint without sleeping. */
		if (log_sys->lsn - log_buf_pool_get_oldest_modification()
		    + margin
		    <= log_sys->log_group_capacity) {
			flushed_enough = true;
		}

		log_sys->check_flush_or_checkpoint = true;
		/* The log mutex must be released while checkpointing;
		the caller's loop re-validates state after we return. */
		log_mutex_exit();

		DEBUG_SYNC_C("margin_checkpoint_age_rescue");

		if (!flushed_enough) {
			/* Give the page cleaner a moment to flush more. */
			os_thread_sleep(100000);
		}
		log_checkpoint(true, false);

		log_mutex_enter();
	}

	return;
}
#endif /* !UNIV_HOTBACKUP */
/** Open the log for log_write_low. The log must be closed with log_close.
Retries (with the mutex released) while the buffer is being extended,
while there is not enough free buffer space, or while the changed-page
tracking margin would be violated.
@param[in]	len	length of the data to be written
@return start lsn of the log record */
lsn_t
log_reserve_and_open(
	ulint	len)
{
	ulint	len_upper_limit;
	ulint	count	= 0;	/* retries waiting for extension/space */
	ulint	tcount	= 0;	/* retries waiting for tracking margin */

loop:
	ut_ad(log_mutex_own());
	ut_ad(!recv_no_log_write);

	if (log_sys->is_extending) {
		log_mutex_exit();

		/* Log buffer size is extending. Writing up to the next block
		should wait for the extending finished. */

		os_thread_sleep(100000);

		/* Debug builds assert we do not spin here indefinitely. */
		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	/* Calculate an upper limit for the space the string may take in the
	log buffer */

	len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
		+ (5 * len) / 4;

	if (log_sys->buf_free + len_upper_limit > log_sys->buf_size) {
		log_mutex_exit();

		DEBUG_SYNC_C("log_buf_size_exceeded");

		/* Not enough free space, do a write of the log buffer */

		log_buffer_sync_in_background(false);

		srv_stats.log_waits.inc();

		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	if (log_check_tracking_margin(len_upper_limit) &&
	    (++tcount + count < 50)) {

		/* This log write would violate the untracked LSN free space
		margin. Limit this to 50 retries as there might be situations
		where we have no choice but to proceed anyway, i.e. if the log
		is about to be overflown, log tracking or not. */
		log_mutex_exit();

		os_thread_sleep(10000);

		log_mutex_enter();
		goto loop;
	}

	/* Reservation succeeded: the caller may now append at this lsn. */
	return(log_sys->lsn);
}
470
/************************************************************//**
Writes to the log the string given, splitting it across log blocks as
needed and initializing each new block header. It is assumed that the
caller holds the log mutex. */
void
log_write_low(
/*==========*/
	const byte*	str,		/*!< in: string */
	ulint		str_len)	/*!< in: string length */
{
	log_t*	log	= log_sys;
	ulint	len;		/* payload bytes copied this iteration */
	ulint	data_len;	/* new data-length field of current block */
	byte*	log_block;

	ut_ad(log_mutex_own());
part_loop:
	ut_ad(!recv_no_log_write);
	/* Calculate a part length: how much of str fits into the block
	that buf_free currently points into */

	data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;

	if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {

		/* The string fits within the current log block */

		len = str_len;
	} else {
		/* Fill the current block up to its trailer; the
		remainder is handled on the next loop iteration. */
		data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;

		len = OS_FILE_LOG_BLOCK_SIZE
			- (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
			- LOG_BLOCK_TRL_SIZE;
	}

	ut_memcpy(log->buf + log->buf_free, str, len);

	str_len -= len;
	str = str + len;

	/* Header address of the block the copy landed in. */
	log_block = static_cast<byte*>(
		ut_align_down(
			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));

	log_block_set_data_len(log_block, data_len);

	if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
		/* This block became full */
		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
		log_block_set_checkpoint_no(log_block,
					    log_sys->next_checkpoint_no);
		/* Skip lsn and buf_free over this block's trailer and
		the next block's header. */
		len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;

		log->lsn += len;

		/* Initialize the next block header */
		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
	} else {
		log->lsn += len;
	}

	log->buf_free += len;

	ut_ad(log->buf_free <= log->buf_size);

	if (str_len > 0) {
		goto part_loop;	/* more of the string remains to write */
	}

	srv_stats.log_write_requests.inc();
}
541
/************************************************************//**
Closes the log: finalizes the current log block's first-record-group
field, updates changed-page-tracking and checkpoint-age bookkeeping,
and flags a flush/checkpoint if margins are exceeded. Caller must hold
the log mutex (asserted below).
@return lsn */
lsn_t
log_close(void)
/*===========*/
{
	byte*	log_block;
	ulint	first_rec_group;
	lsn_t	oldest_lsn;
	lsn_t	lsn;
	log_t*	log	= log_sys;
	lsn_t	checkpoint_age;

	ut_ad(log_mutex_own());
	ut_ad(!recv_no_log_write);

	lsn = log->lsn;

	/* Block in which the mtr's last log record ended. */
	log_block = static_cast<byte*>(
		ut_align_down(
			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));

	first_rec_group = log_block_get_first_rec_group(log_block);

	if (first_rec_group == 0) {
		/* We initialized a new log block which was not written
		full by the current mtr: the next mtr log record group
		will start within this block at the offset data_len */

		log_block_set_first_rec_group(
			log_block, log_block_get_data_len(log_block));
	}

	if (log->buf_free > log->max_buf_free) {
		/* Buffer is past its comfort threshold: request a
		flush/checkpoint at the next opportunity. */
		log->check_flush_or_checkpoint = true;
	}

	if (srv_track_changed_pages) {

		lsn_t	tracked_lsn = log_get_tracked_lsn();
		ut_ad(tracked_lsn > 0);
		lsn_t	tracked_lsn_age = lsn - tracked_lsn;

		if (tracked_lsn_age >= log->log_group_capacity) {
			/* The tracker has fallen behind far enough that
			its unread log would be overwritten: disable
			tracking rather than let it read garbage. */
			ib::error() << "The age of the oldest untracked "
				"record exceeds the log group capacity!";
			ib::error() << "Stopping the log tracking thread at "
				"LSN " << tracked_lsn;
			srv_track_changed_pages = FALSE;
		}
	}

	checkpoint_age = lsn - log->last_checkpoint_lsn;

	if (checkpoint_age >= log->log_group_capacity) {
		DBUG_EXECUTE_IF(
			"print_all_chkp_warnings",
			log_has_printed_chkp_warning = false;);

		/* Rate-limit the warning to one per 15 seconds. */
		if (!log_has_printed_chkp_warning
		    || difftime(time(NULL), log_last_warning_time) > 15) {

			log_has_printed_chkp_warning = true;
			log_last_warning_time = time(NULL);

			ib::error() << "The age of the last checkpoint is "
				<< checkpoint_age << ", which exceeds the log"
				" group capacity " << log->log_group_capacity
				<< ".";
		}
	}

	if (checkpoint_age <= log->max_modified_age_sync) {
		/* Checkpoint is young enough: nothing more to do. */
		goto function_exit;
	}

	oldest_lsn = buf_pool_get_oldest_modification();

	if (!oldest_lsn
	    || lsn - oldest_lsn > log->max_modified_age_sync
	    || checkpoint_age > log->max_checkpoint_age_async) {
		/* Dirty pages are too old, or the checkpoint lags too
		far behind: signal the flush/checkpoint path. */
		log->check_flush_or_checkpoint = true;
	}
function_exit:

	return(lsn);
}
634
635 /******************************************************//**
636 Calculates the data capacity of a log group, when the log file headers are not
637 included.
638 @return capacity in bytes */
639 lsn_t
log_group_get_capacity(const log_group_t * group)640 log_group_get_capacity(
641 /*===================*/
642 const log_group_t* group) /*!< in: log group */
643 {
644 /* The lsn parameters are updated while holding both the mutexes
645 and it is ok to have either of them while reading */
646 ut_ad(log_mutex_own() || log_write_mutex_own());
647
648 return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files);
649 }
650
651 /******************************************************//**
652 Calculates the offset within a log group, when the log file headers are not
653 included.
654 @return size offset (<= offset) */
655 UNIV_INLINE
656 lsn_t
log_group_calc_size_offset(lsn_t offset,const log_group_t * group)657 log_group_calc_size_offset(
658 /*=======================*/
659 lsn_t offset, /*!< in: real offset within the
660 log group */
661 const log_group_t* group) /*!< in: log group */
662 {
663 /* The lsn parameters are updated while holding both the mutexes
664 and it is ok to have either of them while reading */
665 ut_ad(log_mutex_own() || log_write_mutex_own());
666
667 return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size));
668 }
669
670 /******************************************************//**
671 Calculates the offset within a log group, when the log file headers are
672 included.
673 @return real offset (>= offset) */
674 UNIV_INLINE
675 lsn_t
log_group_calc_real_offset(lsn_t offset,const log_group_t * group)676 log_group_calc_real_offset(
677 /*=======================*/
678 lsn_t offset, /*!< in: size offset within the
679 log group */
680 const log_group_t* group) /*!< in: log group */
681 {
682 /* The lsn parameters are updated while holding both the mutexes
683 and it is ok to have either of them while reading */
684 ut_ad(log_mutex_own() || log_write_mutex_own());
685
686 return(offset + LOG_FILE_HDR_SIZE
687 * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE)));
688 }
689
690 /** Calculate the offset of an lsn within a log group.
691 @param[in] lsn log sequence number
692 @param[in] group log group
693 @return offset within the log group */
694 lsn_t
log_group_calc_lsn_offset(lsn_t lsn,const log_group_t * group)695 log_group_calc_lsn_offset(
696 lsn_t lsn,
697 const log_group_t* group)
698 {
699 lsn_t gr_lsn;
700 lsn_t gr_lsn_size_offset;
701 lsn_t difference;
702 lsn_t group_size;
703 lsn_t offset;
704
705 /* The lsn parameters are updated while holding both the mutexes
706 and it is ok to have either of them while reading */
707 ut_ad(log_mutex_own() || log_write_mutex_own());
708
709 gr_lsn = group->lsn;
710
711 gr_lsn_size_offset = log_group_calc_size_offset(
712 group->lsn_offset, group);
713
714 group_size = log_group_get_capacity(group);
715
716 if (lsn >= gr_lsn) {
717
718 difference = lsn - gr_lsn;
719 } else {
720 difference = gr_lsn - lsn;
721
722 difference = difference % group_size;
723
724 difference = group_size - difference;
725 }
726
727 offset = (gr_lsn_size_offset + difference) % group_size;
728
729 /* fprintf(stderr,
730 "Offset is " LSN_PF " gr_lsn_offset is " LSN_PF
731 " difference is " LSN_PF "\n",
732 offset, gr_lsn_size_offset, difference);
733 */
734
735 return(log_group_calc_real_offset(offset, group));
736 }
737
738 /*******************************************************************//**
739 Calculates where in log files we find a specified lsn.
740 @return log file number */
741 ulint
log_calc_where_lsn_is(int64_t * log_file_offset,ib_uint64_t first_header_lsn,ib_uint64_t lsn,ulint n_log_files,int64_t log_file_size)742 log_calc_where_lsn_is(
743 /*==================*/
744 int64_t* log_file_offset, /*!< out: offset in that file
745 (including the header) */
746 ib_uint64_t first_header_lsn, /*!< in: first log file start
747 lsn */
748 ib_uint64_t lsn, /*!< in: lsn whose position to
749 determine */
750 ulint n_log_files, /*!< in: total number of log
751 files */
752 int64_t log_file_size) /*!< in: log file size
753 (including the header) */
754 {
755 int64_t capacity = log_file_size - LOG_FILE_HDR_SIZE;
756 ulint file_no;
757 int64_t add_this_many;
758
759 if (lsn < first_header_lsn) {
760 add_this_many = 1 + (first_header_lsn - lsn)
761 / (capacity * static_cast<int64_t>(n_log_files));
762 lsn += add_this_many
763 * capacity * static_cast<int64_t>(n_log_files);
764 }
765
766 ut_a(lsn >= first_header_lsn);
767
768 file_no = ((ulint)((lsn - first_header_lsn) / capacity))
769 % n_log_files;
770 *log_file_offset = (lsn - first_header_lsn) % capacity;
771
772 *log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE;
773
774 return(file_no);
775 }
776
777
778 /********************************************************//**
779 Sets the field values in group to correspond to a given lsn. For this function
780 to work, the values must already be correctly initialized to correspond to
781 some lsn, for instance, a checkpoint lsn. */
782 void
log_group_set_fields(log_group_t * group,lsn_t lsn)783 log_group_set_fields(
784 /*=================*/
785 log_group_t* group, /*!< in/out: group */
786 lsn_t lsn) /*!< in: lsn for which the values should be
787 set */
788 {
789 group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
790 group->lsn = lsn;
791 }
#ifndef UNIV_HOTBACKUP
/*****************************************************************//**
Calculates the recommended highest values for lsn - last_checkpoint_lsn
and lsn - buf_get_oldest_modification().
@retval true on success
@retval false if the smallest log group is too small to
accommodate the number of OS threads in the database server */
static MY_ATTRIBUTE((warn_unused_result))
bool
log_calc_max_ages(void)
/*===================*/
{
	log_group_t*	group;
	lsn_t		margin;
	ulint		free;
	bool		success	= true;
	lsn_t		smallest_capacity;

	log_mutex_enter();

	group = UT_LIST_GET_FIRST(log_sys->log_groups);

	ut_ad(group);

	smallest_capacity = LSN_MAX;

	/* All age limits must fit in the capacity of the smallest group. */
	while (group) {
		if (log_group_get_capacity(group) < smallest_capacity) {

			smallest_capacity = log_group_get_capacity(group);
		}

		group = UT_LIST_GET_NEXT(log_groups, group);
	}

	/* Add extra safety */
	smallest_capacity = smallest_capacity - smallest_capacity / 10;

	/* For each OS thread we must reserve so much free space in the
	smallest log group that it can accommodate the log entries produced
	by single query steps: running out of free log space is a serious
	system error which requires rebooting the database. */

	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
		+ LOG_CHECKPOINT_EXTRA_FREE;
	if (free >= smallest_capacity / 2) {
		success = false;

		goto failure;
	} else {
		margin = smallest_capacity - free;
	}

	margin = margin - margin / 10;	/* Add still some extra safety */

	log_sys->log_group_capacity = smallest_capacity;

	/* Derived limits, from least to most urgent: the async values
	are smaller than the corresponding sync/hard values because a
	larger ratio divisor subtracts less (ASYNC=8 < SYNC=16 for
	preflush; CHECKPOINT ASYNC=32 vs. the full margin). */
	log_sys->max_modified_age_async = margin
		- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
	log_sys->max_modified_age_sync = margin
		- margin / LOG_POOL_PREFLUSH_RATIO_SYNC;

	log_sys->max_checkpoint_age_async = margin - margin
		/ LOG_POOL_CHECKPOINT_RATIO_ASYNC;
	log_sys->max_checkpoint_age = margin;

failure:
	log_mutex_exit();

	if (!success) {
		ib::error() << "Cannot continue operation. ib_logfiles are too"
			" small for innodb_thread_concurrency "
			<< srv_thread_concurrency << ". The combined size of"
			" ib_logfiles should be bigger than"
			" 200 kB * innodb_thread_concurrency. To get mysqld"
			" to start up, set innodb_thread_concurrency in"
			" my.cnf to a lower value, for example, to 8. After"
			" an ERROR-FREE shutdown of mysqld you can adjust"
			" the size of ib_logfiles. " << INNODB_PARAMETERS_MSG;
	}

	return(success);
}
875
/******************************************************//**
Initializes the log: allocates log_sys, creates its latches and buffers,
seeds the lsn, and optionally starts the log scrub thread. */
void
log_init(void)
/*==========*/
{
	log_sys = static_cast<log_t*>(ut_zalloc_nokey(sizeof(log_t)));

	mutex_create(LATCH_ID_LOG_SYS, &log_sys->mutex);
	mutex_create(LATCH_ID_LOG_WRITE, &log_sys->write_mutex);

	mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_sys->log_flush_order_mutex);

	/* Start the lsn from one log block from zero: this way every
	log record has a start lsn != zero, a fact which we will use */

	log_sys->lsn = LOG_START_LSN;

	ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
	ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);

	log_sys->buf_size = LOG_BUFFER_SIZE;

	/* Allocate twice the buffer size plus alignment slack.
	NOTE(review): the doubling together with first_in_use suggests
	two buffer halves are alternated during writes — confirm against
	the code that toggles first_in_use. */
	log_sys->buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(log_sys->buf_size * 2
				+ MAX_SRV_LOG_WRITE_AHEAD_SIZE));
	log_sys->buf = static_cast<byte*>(
		ut_align(log_sys->buf_ptr, MAX_SRV_LOG_WRITE_AHEAD_SIZE));

	log_sys->first_in_use = true;

	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
		- LOG_BUF_FLUSH_MARGIN;
	log_sys->check_flush_or_checkpoint = true;
	UT_LIST_INIT(log_sys->log_groups, &log_group_t::log_groups);

	log_sys->n_log_ios_old = log_sys->n_log_ios;
	log_sys->last_printout_time = time(NULL);
	/*----------------------------*/

	log_sys->write_lsn = log_sys->lsn;

	log_sys->flush_event = os_event_create(0);

	/* Initially signalled: no flush is in progress. */
	os_event_set(log_sys->flush_event);

	/*----------------------------*/

	log_sys->last_checkpoint_lsn = log_sys->lsn;
	log_sys->next_checkpoint_lsn = log_sys->lsn;

	rw_lock_create(
		checkpoint_lock_key, &log_sys->checkpoint_lock,
		SYNC_NO_ORDER_CHECK);

	log_sys->checkpoint_buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(OS_FILE_LOG_BLOCK_SIZE
				+ MAX_SRV_LOG_WRITE_AHEAD_SIZE));

	log_sys->checkpoint_buf = static_cast<byte*>(
		ut_align(log_sys->checkpoint_buf_ptr,
			 MAX_SRV_LOG_WRITE_AHEAD_SIZE));

	/*----------------------------*/

	/* Write the first block's header and advance lsn/buf_free
	past it. */
	log_block_init(log_sys->buf, log_sys->lsn);
	log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);

	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
	log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;

	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
		    log_sys->lsn - log_sys->last_checkpoint_lsn);

	/* Start the redo log scrub thread unless the server is
	read-only or scrubbing is disabled. */
	log_scrub_thread_active = !srv_read_only_mode && srv_scrub_log;
	if (log_scrub_thread_active) {
		log_scrub_event = os_event_create("log_scrub_event");
		os_thread_create(log_scrub_thread, NULL, NULL);
	}
}
956
957 /******************************************************************//**
958 Inits a log group to the log system.
959 @return true if success, false if not */
960 MY_ATTRIBUTE((warn_unused_result))
961 bool
log_group_init(ulint id,ulint n_files,lsn_t file_size,ulint space_id)962 log_group_init(
963 /*===========*/
964 ulint id, /*!< in: group id */
965 ulint n_files, /*!< in: number of log files */
966 lsn_t file_size, /*!< in: log file size in bytes */
967 ulint space_id) /*!< in: space id of the file space
968 which contains the log files of this
969 group */
970 {
971 ulint i;
972 log_group_t* group;
973
974 group = static_cast<log_group_t*>(ut_malloc_nokey(sizeof(log_group_t)));
975
976 group->id = id;
977 group->n_files = n_files;
978 group->format = LOG_HEADER_FORMAT_CURRENT;
979 group->file_size = file_size;
980 group->space_id = space_id;
981 group->state = LOG_GROUP_OK;
982 group->lsn = LOG_START_LSN;
983 group->lsn_offset = LOG_FILE_HDR_SIZE;
984
985 group->file_header_bufs_ptr = static_cast<byte**>(
986 ut_zalloc_nokey(sizeof(byte*) * n_files));
987
988 group->file_header_bufs = static_cast<byte**>(
989 ut_zalloc_nokey(sizeof(byte**) * n_files));
990
991 for (i = 0; i < n_files; i++) {
992 group->file_header_bufs_ptr[i] = static_cast<byte*>(
993 ut_zalloc_nokey(LOG_FILE_HDR_SIZE
994 + MAX_SRV_LOG_WRITE_AHEAD_SIZE));
995
996 group->file_header_bufs[i] = static_cast<byte*>(
997 ut_align(group->file_header_bufs_ptr[i],
998 MAX_SRV_LOG_WRITE_AHEAD_SIZE));
999 }
1000
1001 group->checkpoint_buf_ptr = static_cast<byte*>(
1002 ut_zalloc_nokey(OS_FILE_LOG_BLOCK_SIZE +
1003 MAX_SRV_LOG_WRITE_AHEAD_SIZE));
1004
1005 group->checkpoint_buf = static_cast<byte*>(
1006 ut_align(group->checkpoint_buf_ptr,
1007 MAX_SRV_LOG_WRITE_AHEAD_SIZE));
1008
1009 UT_LIST_ADD_LAST(log_sys->log_groups, group);
1010
1011 return(log_calc_max_ages());
1012 }
1013 #endif /* !UNIV_HOTBACKUP */
/******************************************************//**
Completes an i/o to a log file. */
void
log_io_complete(
/*============*/
	log_group_t*	group)	/*!< in: log group or a dummy pointer */
{
	if ((ulint) group & 0x1UL) {
		/* It was a checkpoint write: the caller tagged the
		group pointer by setting its lowest bit; untag it. */
		group = (log_group_t*)((ulint) group - 1);

#ifdef _WIN32
		fil_flush(group->space_id);
#else
		/* For O_DSYNC/NOSYNC/ALL_O_DIRECT no explicit flush is
		issued; for the remaining methods, flush unless
		flush_log_at_trx_commit == 2. */
		switch (srv_unix_file_flush_method) {
		case SRV_UNIX_O_DSYNC:
		case SRV_UNIX_NOSYNC:
		case SRV_UNIX_ALL_O_DIRECT:
			break;
		case SRV_UNIX_FSYNC:
		case SRV_UNIX_LITTLESYNC:
		case SRV_UNIX_O_DIRECT:
		case SRV_UNIX_O_DIRECT_NO_FSYNC:
			if (thd_flush_log_at_trx_commit(NULL) != 2)
				fil_flush(group->space_id);
		}
#endif /* _WIN32 */

		DBUG_PRINT("ib_log", ("checkpoint info written to group %u",
				      unsigned(group->id)));
		log_io_complete_checkpoint();

		return;
	}

	ut_error;	/*!< We currently use synchronous writing of the
			logs and cannot end up here! */
}
1052
1053 /******************************************************//**
1054 Writes a log file header to a log file space. */
1055 static
1056 void
log_group_file_header_flush(log_group_t * group,ulint nth_file,lsn_t start_lsn)1057 log_group_file_header_flush(
1058 /*========================*/
1059 log_group_t* group, /*!< in: log group */
1060 ulint nth_file, /*!< in: header to the nth file in the
1061 log file space */
1062 lsn_t start_lsn) /*!< in: log file data starts at this
1063 lsn */
1064 {
1065 byte* buf;
1066 lsn_t dest_offset;
1067
1068 ut_ad(log_write_mutex_own());
1069 ut_ad(!recv_no_log_write);
1070 ut_ad(group->id == 0);
1071 ut_a(nth_file < group->n_files);
1072
1073 buf = *(group->file_header_bufs + nth_file);
1074
1075 memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
1076 mach_write_to_4(buf + LOG_HEADER_FORMAT, LOG_HEADER_FORMAT_CURRENT);
1077 mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
1078 strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
1079 LOG_HEADER_CREATOR_CURRENT);
1080 ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
1081 >= sizeof LOG_HEADER_CREATOR_CURRENT);
1082 log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
1083
1084 dest_offset = nth_file * group->file_size;
1085
1086 DBUG_PRINT("ib_log", ("write " LSN_PF
1087 " group " ULINTPF
1088 " file " ULINTPF " header",
1089 start_lsn, group->id, nth_file));
1090
1091 log_sys->n_log_ios++;
1092
1093 MONITOR_INC(MONITOR_LOG_IO);
1094
1095 srv_stats.os_log_pending_writes.inc();
1096
1097 const ulint page_no
1098 = (ulint) (dest_offset / univ_page_size.physical());
1099
1100 fil_io(IORequestLogWrite, true,
1101 page_id_t(group->space_id, page_no),
1102 univ_page_size,
1103 (ulint) (dest_offset % univ_page_size.physical()),
1104 OS_FILE_LOG_BLOCK_SIZE, buf, group);
1105
1106 srv_stats.os_log_pending_writes.dec();
1107 }
1108
log_encrypt_name(redo_log_encrypt_enum val)1109 const char* log_encrypt_name(redo_log_encrypt_enum val) {
1110 switch(val) {
1111 case REDO_LOG_ENCRYPT_OFF:
1112 return "off";
1113 case REDO_LOG_ENCRYPT_MK:
1114 return "master_key";
1115 case REDO_LOG_ENCRYPT_RK:
1116 return "keyring_key";
1117 }
1118 return "unknown";
1119 }
1120
/* Read the first log file header to get the encryption. It's in the
3rd block.
@return true if success */
bool
log_read_encryption() {
	byte key[ENCRYPTION_KEY_LEN];
	byte iv[ENCRYPTION_KEY_LEN];

	/* Allocate twice the block size so the read buffer can be aligned
	to OS_FILE_LOG_BLOCK_SIZE below. */
	byte* log_block_buf_ptr =
	    static_cast<byte *>(ut_malloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));
	memset(log_block_buf_ptr, 0, 2 * OS_FILE_LOG_BLOCK_SIZE);
	byte* log_block_buf = static_cast<byte *>(
	    ut_align(log_block_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	/* Read the block after LOG_CHECKPOINT_1 in the first log file,
	which holds the encryption information. */
	ulint log_space_id = SRV_LOG_SPACE_FIRST_ID;
	const page_id_t page_id(log_space_id, 0);
	fil_io(IORequestLogRead, true, page_id, univ_page_size,
	       LOG_CHECKPOINT_1 + OS_FILE_LOG_BLOCK_SIZE,
	       OS_FILE_LOG_BLOCK_SIZE, log_block_buf, NULL);

	bool encryption_magic = false;		/* a known magic was found */
	bool encrypted_log = false;		/* key material was recovered */
	redo_log_key* mkey = NULL;
	Encryption::Type encryption_type = Encryption::NONE;

	/* Case 1: keyring-key ("RK") encryption magic. */
	if (memcmp(log_block_buf + LOG_HEADER_CREATOR_END,
		   ENCRYPTION_KEY_MAGIC_RK, ENCRYPTION_MAGIC_SIZE) == 0) {
		encryption_magic = true;
		existing_redo_encryption_mode = REDO_LOG_ENCRYPT_RK;

		/* Make sure the keyring is loaded. */
		if (!Encryption::check_keyring()) {
			ut_free(log_block_buf_ptr);
			ib::error() << "Redo log was encrypted,"
				<< " but keyring plugin is not loaded.";
			return(false);
		}

		/* Layout after the magic: 4-byte key version, then
		(presumably) the server uuid, then the iv — the iv is read
		at offset ENCRYPTION_SERVER_UUID_LEN + 4 below.
		NOTE(review): layout assumed from these offsets; confirm
		against the writer side. */
		unsigned char* info_ptr = log_block_buf +
			LOG_HEADER_CREATOR_END +
			ENCRYPTION_MAGIC_SIZE;
		uint version = mach_read_from_4(info_ptr);

		memcpy(iv, info_ptr + ENCRYPTION_SERVER_UUID_LEN + 4,
		       ENCRYPTION_KEY_LEN);

#ifdef UNIV_ENCRYPT_DEBUG
		fprintf(stderr, "Using redo log encryption key version: %u\n",
			version);
#endif

		mkey = redo_log_key_mgr.load_key_version(NULL, version);
		if (mkey != NULL) {
			encrypted_log = true;
			memcpy(key, mkey->key, ENCRYPTION_KEY_LEN);
			encryption_type = Encryption::KEYRING;
			srv_redo_log_key_version = mkey->version;
		}
	}

	/* Case 2: master-key ("MK") encryption magic (V2 or V3). */
	if (memcmp(log_block_buf + LOG_HEADER_CREATOR_END,
		   ENCRYPTION_KEY_MAGIC_V2, ENCRYPTION_MAGIC_SIZE) == 0
	    ||
	    memcmp(log_block_buf + LOG_HEADER_CREATOR_END,
		   ENCRYPTION_KEY_MAGIC_V3, ENCRYPTION_MAGIC_SIZE) == 0
	    ) {
		encryption_magic = true;
		existing_redo_encryption_mode = REDO_LOG_ENCRYPT_MK;

		/* Make sure the keyring is loaded. Unlike the RK branch,
		failure here does not return immediately: it falls through
		to the encryption_magic error path at the bottom. */
		if (!Encryption::check_keyring()) {
			ib::error() << "Redo log was encrypted,"
				<< " but keyring plugin is not loaded.";
		} else if (Encryption::decode_encryption_info(
				key, iv,
				log_block_buf + LOG_HEADER_CREATOR_END)) {
			encrypted_log = true;
			encryption_type = Encryption::AES;
		}
	}
	if (encrypted_log) {
		/* The on-disk mode must match the configured mode unless
		encryption is being turned off. */
		if (existing_redo_encryption_mode != srv_redo_log_encrypt &&
		    srv_redo_log_encrypt != REDO_LOG_ENCRYPT_OFF) {
			ib::error() <<
			    " Redo log encryption mode"
			    " can't be switched without stopping the server and"
			    " recreating the redo logs. Current mode is "
			    << log_encrypt_name(existing_redo_encryption_mode)
			    << ", requested "
			    << log_encrypt_name(
				static_cast<redo_log_encrypt_enum>(srv_redo_log_encrypt))
			    << ".";

			/* NOTE(review): log_block_buf_ptr is not freed on
			this path, unlike every other return — possible
			leak; confirm before changing. */
			return(false);
		}


		/* If redo log encryption is enabled, set the
		space flag. Otherwise, we just fill the encryption
		information to space object for decrypting old
		redo log blocks. */
		fil_space_t* space = fil_space_get(log_space_id);
		space->encryption_redo_key = mkey;
		space->flags |= FSP_FLAGS_MASK_ENCRYPTION;
		dberr_t err =
		    fil_set_encryption(space->id, encryption_type, key, iv);

		if (err == DB_SUCCESS) {
			ut_free(log_block_buf_ptr);
			ib::info() << "Read redo log encryption"
				<< " metadata successful.";
			return(true);
		} else {
			ut_free(log_block_buf_ptr);
			ib::fatal() << "Can't set redo log tablespace"
				<< " encryption metadata.";
			return(false);
		}
	} else if (encryption_magic) {
		/* A magic was present but no usable key material could be
		recovered. */
		ut_free(log_block_buf_ptr);
		ib::error() << "Cannot read the encryption"
			" information in log file header, please"
			" check if keyring plugin loaded and"
			" the key file exists.";
		return(false);
	}

	/* No encryption magic: the redo log is not encrypted. */
	ut_free(log_block_buf_ptr);
	return(true);
}
1251
1252
1253 /** Writes encryption information to log header.
1254 @param[in,out] buf log file header
1255 @param[in] key encryption key
1256 @param[in] iv encryption iv */
1257 static bool
log_file_header_fill_encryption(byte * buf,byte * key,byte * iv)1258 log_file_header_fill_encryption(byte* buf, byte* key, byte* iv) {
1259 byte encryption_info[ENCRYPTION_INFO_SIZE_V2];
1260
1261 if (!fsp_header_fill_encryption_info(key, iv, encryption_info)) {
1262 return(false);
1263 }
1264
1265 ut_ad(LOG_HEADER_CREATOR_END + ENCRYPTION_INFO_SIZE_V2 <
1266 OS_FILE_LOG_BLOCK_SIZE);
1267
1268 memcpy(buf + LOG_HEADER_CREATOR_END, encryption_info,
1269 ENCRYPTION_INFO_SIZE_V2);
1270
1271 return(true);
1272 }
1273
1274 static bool
log_file_header_fill_encryption(byte * buf,ulint key_version,byte * iv)1275 log_file_header_fill_encryption(byte* buf, ulint key_version, byte* iv) {
1276 byte encryption_info[ENCRYPTION_INFO_SIZE_V2] = {};
1277
1278 if (!fsp_header_fill_encryption_info(key_version, iv,
1279 encryption_info)) {
1280 return(false);
1281 }
1282
1283 ut_ad(LOG_HEADER_CREATOR_END + ENCRYPTION_INFO_SIZE_V2 <
1284 OS_FILE_LOG_BLOCK_SIZE);
1285
1286 memcpy(buf + LOG_HEADER_CREATOR_END, encryption_info,
1287 ENCRYPTION_INFO_SIZE_V2);
1288
1289 return(true);
1290 }
1291
/** Write the encryption info into the log file header(the 3rd block).
It just need to flush the file header block with current master key.
@param[in]	key			encryption key; if NULL together
					with iv, the current key/iv/version
					are taken from the log tablespace
@param[in]	iv			encryption iv
@param[in]	redo_log_encrypt	encryption mode
@return true if success. */
bool
log_write_encryption(byte* key, byte* iv,
		     redo_log_encrypt_enum redo_log_encrypt) {
	const page_id_t page_id(SRV_LOG_SPACE_FIRST_ID, 0);
	byte *log_block_buf_ptr;
	byte *log_block_buf;
	ulint version = 1;

	/* Allocate twice the block size so the write buffer can be
	aligned to OS_FILE_LOG_BLOCK_SIZE. */
	log_block_buf_ptr =
	    static_cast<byte *>(ut_malloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));
	memset(log_block_buf_ptr, 0, 2 * OS_FILE_LOG_BLOCK_SIZE);
	log_block_buf = static_cast<byte *>(
	    ut_align(log_block_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	/* No explicit key material given: rotate using whatever the log
	tablespace currently holds. */
	if (key == NULL && iv == NULL) {
		fil_space_t* space = fil_space_get(SRV_LOG_SPACE_FIRST_ID);

		key = space->encryption_key;
		iv = space->encryption_iv;
		version = space->encryption_key_version;
	}

	log_write_mutex_enter();
	/* Fill the header block according to the (new or already
	established) encryption mode; MK and RK are mutually exclusive. */
	if (redo_log_encrypt == REDO_LOG_ENCRYPT_MK ||
	    existing_redo_encryption_mode == REDO_LOG_ENCRYPT_MK) {
		ut_ad(existing_redo_encryption_mode != REDO_LOG_ENCRYPT_RK);
		ut_ad(redo_log_encrypt != REDO_LOG_ENCRYPT_RK);
		if (!log_file_header_fill_encryption(log_block_buf, key,
						     iv)) {
			ut_free(log_block_buf_ptr);
			log_write_mutex_exit();
			return(false);
		}
		existing_redo_encryption_mode = REDO_LOG_ENCRYPT_MK;
	} else if (redo_log_encrypt == REDO_LOG_ENCRYPT_RK ||
		   existing_redo_encryption_mode == REDO_LOG_ENCRYPT_RK) {
		ut_ad(existing_redo_encryption_mode != REDO_LOG_ENCRYPT_MK);
		ut_ad(redo_log_encrypt != REDO_LOG_ENCRYPT_MK);
		if (!log_file_header_fill_encryption(log_block_buf, version,
						     iv)) {
			ut_free(log_block_buf_ptr);
			log_write_mutex_exit();
			return(false);
		}
		existing_redo_encryption_mode = REDO_LOG_ENCRYPT_RK;
	} else {
		/* Neither mode applies; in release builds the zeroed
		block is still written below. NOTE(review): confirm this
		is intentional. */
		ut_ad(0);
	}

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	srv_stats.os_log_pending_writes.inc();

	/* Overwrite the encryption info block (the block after
	LOG_CHECKPOINT_1) in the first log file. */
	fil_io(IORequestLogWrite, true, page_id, univ_page_size,
	       LOG_CHECKPOINT_1 + OS_FILE_LOG_BLOCK_SIZE,
	       OS_FILE_LOG_BLOCK_SIZE, log_block_buf, NULL);

	srv_stats.os_log_pending_writes.dec();
	log_write_mutex_exit();

	ut_free(log_block_buf_ptr);
	return(true);
}
1363
1364 /** Rotate the redo log encryption
1365 It will re-encrypt the redo log encryption metadata and write it to
1366 redo log file header.
1367 @return true if success. */
1368 bool
log_rotate_encryption()1369 log_rotate_encryption() {
1370 fil_space_t *space = fil_space_get(SRV_LOG_SPACE_FIRST_ID);
1371 if (!FSP_FLAGS_GET_ENCRYPTION(space->flags)) {
1372 return(true);
1373 }
1374 /* Rotate log tablespace */
1375 return (log_write_encryption(
1376 NULL, NULL,
1377 static_cast<redo_log_encrypt_enum>(srv_redo_log_encrypt)));
1378 }
1379
1380 void
log_check_new_key_version()1381 log_check_new_key_version() {
1382 if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
1383 return;
1384 }
1385 fil_space_t* space = fil_space_get(SRV_LOG_SPACE_FIRST_ID);
1386 if (!FSP_FLAGS_GET_ENCRYPTION(space->flags)) {
1387 return;
1388 }
1389 if (srv_redo_log_encrypt == REDO_LOG_ENCRYPT_RK) {
1390 /* re-fetch latest key */
1391 redo_log_key* mkey = redo_log_key_mgr.load_latest_key(NULL, false);
1392 if (mkey != NULL) {
1393 space->encryption_redo_key = mkey;
1394 srv_redo_log_key_version = mkey->version;
1395 }
1396 }
1397 }
1398
1399 /** Check the redo log encryption is enabled or not.
1400 It will try to enable the redo log encryption and write the metadata to
1401 redo log file header. */
1402 void
log_rotate_default_key()1403 log_rotate_default_key() {
1404 fil_space_t* space = fil_space_get(SRV_LOG_SPACE_FIRST_ID);
1405
1406 if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
1407 return;
1408 }
1409
1410 /* If the redo log space is using default key, rotate it.
1411 We also need the server_uuid initialized. */
1412 if (space->encryption_type != Encryption::NONE &&
1413 Encryption::master_key_id == ENCRYPTION_DEFAULT_MASTER_KEY_ID &&
1414 !srv_read_only_mode &&
1415 srv_redo_log_encrypt == REDO_LOG_ENCRYPT_MK) {
1416 ut_ad(strlen(server_uuid) > 0);
1417 ut_ad(FSP_FLAGS_GET_ENCRYPTION(space->flags));
1418
1419 log_write_encryption(NULL, NULL, REDO_LOG_ENCRYPT_MK);
1420 }
1421
1422 if (space->encryption_type != Encryption::NONE &&
1423 space->encryption_key_version == REDO_LOG_ENCRYPT_NO_VERSION &&
1424 !srv_read_only_mode &&
1425 srv_redo_log_encrypt == REDO_LOG_ENCRYPT_RK) {
1426 /* This only happens when the server uuid was just generated, so we can
1427 save the key to the keyring */
1428 ut_ad(strlen(server_uuid) > 0);
1429 if (!redo_log_key_mgr.store_used_keys()) {
1430 srv_redo_log_encrypt = REDO_LOG_ENCRYPT_OFF;
1431 ib::error() << "Can't store redo log encryption key.";
1432 }
1433 redo_log_key* key = redo_log_key_mgr.load_latest_key(NULL, true);
1434 space->encryption_key_version = key->version;
1435 space->encryption_redo_key = key;
1436 srv_redo_log_key_version = key->version;
1437 }
1438 }
1439
1440 /******************************************************//**
1441 Stores a 4-byte checksum to the trailer checksum field of a log block
1442 before writing it to a log file. This checksum is used in recovery to
1443 check the consistency of a log block. */
1444 static
1445 void
log_block_store_checksum(byte * block)1446 log_block_store_checksum(
1447 /*=====================*/
1448 byte* block) /*!< in/out: pointer to a log block */
1449 {
1450 log_block_set_checksum(block, log_block_calc_checksum(block));
1451 }
1452
/******************************************************//**
Writes a buffer to a log file group. Wraps around the circular set of
log files, writing a fresh file header whenever a write starts at the
beginning of a file instance. */
static
void
log_group_write_buf(
/*================*/
	log_group_t*	group,		/*!< in: log group */
	byte*		buf,		/*!< in: buffer */
	ulint		len,		/*!< in: buffer len; must be divisible
					by OS_FILE_LOG_BLOCK_SIZE */
#ifdef UNIV_DEBUG
	ulint		pad_len,	/*!< in: pad len in the buffer len */
#endif /* UNIV_DEBUG */
	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
					be divisible by
					OS_FILE_LOG_BLOCK_SIZE */
	ulint		new_data_offset)/*!< in: start offset of new data in
					buf: this parameter is used to decide
					if we have to write a new log file
					header */
{
	ulint		write_len;
	bool		write_header	= new_data_offset == 0;
	lsn_t		next_offset;
	ulint		i;

	ut_ad(log_write_mutex_own());
	ut_ad(!recv_no_log_write);
	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);

loop:
	if (len == 0) {

		return;
	}

	/* Map the LSN to a byte offset within the log file space. */
	next_offset = log_group_calc_lsn_offset(start_lsn, group);

	if (write_header
	    && next_offset % group->file_size == LOG_FILE_HDR_SIZE) {
		/* We start to write a new log file instance in the group */

		ut_a(next_offset / group->file_size <= ULINT_MAX);

		log_group_file_header_flush(group, (ulint)
					    (next_offset / group->file_size),
					    start_lsn);
		srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);

		srv_stats.log_writes.inc();
	}

	/* Clip the write so it does not cross a file boundary; the
	remainder is handled by looping back to "loop". */
	if ((next_offset % group->file_size) + len > group->file_size) {

		/* if the above condition holds, then the below expression
		is < len which is ulint, so the typecast is ok */
		write_len = (ulint)
			(group->file_size - (next_offset % group->file_size));
	} else {
		write_len = len;
	}

	DBUG_PRINT("ib_log",
		   ("write " LSN_PF " to " LSN_PF
		    ": group " ULINTPF " len " ULINTPF
		    " blocks " ULINTPF ".." ULINTPF,
		    start_lsn, next_offset,
		    group->id, write_len,
		    log_block_get_hdr_no(buf),
		    log_block_get_hdr_no(
			    buf + write_len
			    - OS_FILE_LOG_BLOCK_SIZE)));

	/* Sanity check: the first block's header number must match the
	start LSN (unless the whole buffer is padding). */
	ut_ad(pad_len >= len
	      || log_block_get_hdr_no(buf)
		 == log_block_convert_lsn_to_no(start_lsn));

	/* Calculate the checksums for each log block and write them to
	the trailer fields of the log blocks */

	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
		/* Block numbers must be consecutive within the
		non-padded portion of the buffer. */
		ut_ad(pad_len >= len
		      || i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
		      || log_block_get_hdr_no(
			      buf + i * OS_FILE_LOG_BLOCK_SIZE)
			 == log_block_get_hdr_no(buf) + i);
		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
	}

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	srv_stats.os_log_pending_writes.inc();

	ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX);

	const ulint	page_no
		= (ulint) (next_offset / univ_page_size.physical());

	fil_io(IORequestLogWrite, true,
	       page_id_t(group->space_id, page_no),
	       univ_page_size,
	       (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
	       group);

	srv_stats.os_log_pending_writes.dec();

	srv_stats.os_log_written.add(write_len);
	srv_stats.log_writes.inc();

	if (write_len < len) {
		/* The write was clipped at a file boundary: continue with
		the remainder in the next file, which needs a header. */
		start_lsn += write_len;
		len -= write_len;
		buf += write_len;

		write_header = true;

		goto loop;
	}
}
1575
1576 /** Flush the log has been written to the log file. */
1577 static
1578 void
log_write_flush_to_disk_low()1579 log_write_flush_to_disk_low()
1580 {
1581 ut_a(log_sys->n_pending_flushes == 1); /* No other threads here */
1582
1583 #ifndef _WIN32
1584 bool do_flush = srv_unix_file_flush_method != SRV_UNIX_O_DSYNC;
1585 #else
1586 bool do_flush = true;
1587 #endif
1588 if (do_flush) {
1589 log_group_t* group = UT_LIST_GET_FIRST(log_sys->log_groups);
1590 fil_flush(group->space_id);
1591 log_sys->flushed_to_disk_lsn = log_sys->current_flush_lsn;
1592 }
1593
1594 log_sys->n_pending_flushes--;
1595 MONITOR_DEC(MONITOR_PENDING_LOG_FLUSH);
1596
1597 os_event_set(log_sys->flush_event);
1598 }
1599
/** Switch the log buffer in use, and copy the content of last block
from old log buffer to the head of the to be used one. Thus, buf_free and
buf_next_to_write would be changed accordingly */
static inline
void
log_buffer_switch()
{
	ut_ad(log_mutex_own());
	ut_ad(log_write_mutex_own());

	/* Capture the current buffer BEFORE log_sys->buf is repointed:
	the tail block is copied from it below. */
	const byte*	old_buf = log_sys->buf;
	ulint		area_end = ut_calc_align(log_sys->buf_free,
						 OS_FILE_LOG_BLOCK_SIZE);

	/* log_sys->buf alternates between the two halves of one
	allocation of size 2 * buf_size; first_in_use says which half is
	active. */
	if (log_sys->first_in_use) {
		ut_ad((reinterpret_cast<uintptr_t>(log_sys->buf)
		       % srv_log_write_ahead_size) == 0);
		log_sys->buf += log_sys->buf_size;
	} else {
		log_sys->buf -= log_sys->buf_size;
		ut_ad((reinterpret_cast<uintptr_t>(log_sys->buf)
		       % srv_log_write_ahead_size) == 0);
	}

	log_sys->first_in_use = !log_sys->first_in_use;

	/* Copy the last block to new buf */
	ut_memcpy(log_sys->buf,
		  old_buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		  OS_FILE_LOG_BLOCK_SIZE);

	/* Only the partial tail block carries over; writing resumes at
	its current fill position within that block. */
	log_sys->buf_free %= OS_FILE_LOG_BLOCK_SIZE;
	log_sys->buf_next_to_write = log_sys->buf_free;
}
1634
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@param[in]	lsn		log sequence number that should be
included in the redo log file write
@param[in]	flush_to_disk	whether the written log should also
be flushed to the file system */
void
log_write_up_to(
	lsn_t	lsn,
	bool	flush_to_disk)
{
#ifdef UNIV_DEBUG
	ulint		loop_count	= 0;
#endif /* UNIV_DEBUG */
	byte*           write_buf;
	lsn_t           write_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_no_ibuf_operations) {
		/* Recovery is running and no operations on the log files are
		allowed yet (the variable name .._no_ibuf_.. is misleading) */

		return;
	}

loop:
	ut_ad(++loop_count < 128);

#if UNIV_WORD_SIZE > 7
	/* We can do a dirty read of LSN. */
	/* NOTE: Currently doesn't do dirty read for
	(flush_to_disk == true) case, because the log_mutex
	contention also works as the arbitrator for write-IO
	(fsync) bandwidth between log files and data files. */
	os_rmb;
	if (!flush_to_disk && log_sys->write_lsn >= lsn) {
		return;
	}
#endif

	log_write_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* Re-check under the mutex whether the request is already
	satisfied. */
	lsn_t	limit_lsn = flush_to_disk
		? log_sys->flushed_to_disk_lsn
		: log_sys->write_lsn;

	if (limit_lsn >= lsn) {
		log_write_mutex_exit();
		return;
	}

#ifdef _WIN32
# ifndef UNIV_HOTBACKUP
	/* write requests during fil_flush() might not be good for Windows */
	if (log_sys->n_pending_flushes > 0
	    || !os_event_is_set(log_sys->flush_event)) {
		log_write_mutex_exit();
		os_event_wait(log_sys->flush_event);
		goto loop;
	}
# else
	if (log_sys->n_pending_flushes > 0) {
		goto loop;
	}
# endif /* !UNIV_HOTBACKUP */
#endif /* _WIN32 */

	/* If it is a write call we should just go ahead and do it
	as we checked that write_lsn is not where we'd like it to
	be. If we have to flush as well then we check if there is a
	pending flush and based on that we wait for it to finish
	before proceeding further. */
	if (flush_to_disk
	    && (log_sys->n_pending_flushes > 0
		|| !os_event_is_set(log_sys->flush_event))) {

		/* Figure out if the current flush will do the job
		for us. */
		bool work_done = log_sys->current_flush_lsn >= lsn;

		log_write_mutex_exit();

		os_event_wait(log_sys->flush_event);

		if (work_done) {
			return;
		} else {
			goto loop;
		}
	}

	/* From here on both mutexes are held until log_mutex_exit()
	below; the write mutex is kept until after the file write. */
	log_mutex_enter();
	if (!flush_to_disk
	    && log_sys->buf_free == log_sys->buf_next_to_write) {
		/* Nothing to write and no flush to disk requested */
		log_mutex_exit_all();
		return;
	}

	log_group_t*	group;
	ulint		start_offset;
	ulint		end_offset;
	ulint		area_start;
	ulint		area_end;
	ulong		write_ahead_size = srv_log_write_ahead_size;
	ulint		pad_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
			      log_sys->write_lsn,
			      log_sys->lsn));

	if (flush_to_disk) {
		/* Claim the flush slot: waiters block on flush_event
		until log_write_flush_to_disk_low() sets it. */
		log_sys->n_pending_flushes++;
		log_sys->current_flush_lsn = log_sys->lsn;
		MONITOR_INC(MONITOR_PENDING_LOG_FLUSH);
		os_event_reset(log_sys->flush_event);

		if (log_sys->buf_free == log_sys->buf_next_to_write) {
			/* Nothing to write, flush only */
			log_mutex_exit_all();
			log_write_flush_to_disk_low();
			return;
		}
	}

	start_offset = log_sys->buf_next_to_write;
	end_offset = log_sys->buf_free;

	/* Round the write region out to whole log blocks. */
	area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE);
	area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE);

	ut_ad(area_end - area_start > 0);

	log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
	log_block_set_checkpoint_no(
		log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		log_sys->next_checkpoint_no);

	/* Snapshot the LSN and buffer pointer before switching buffers,
	since log_buffer_switch() repoints log_sys->buf. */
	write_lsn = log_sys->lsn;
	write_buf = log_sys->buf;

	log_buffer_switch();

	group = UT_LIST_GET_FIRST(log_sys->log_groups);

	log_group_set_fields(group, log_sys->write_lsn);

	/* Release the general log mutex; the write mutex is still held
	for the duration of the file write. */
	log_mutex_exit();

	/* Calculate pad_size if needed. */
	pad_size = 0;
	if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
		lsn_t	end_offset;
		ulint	end_offset_in_unit;

		end_offset = log_group_calc_lsn_offset(
			ut_uint64_align_up(write_lsn,
					   OS_FILE_LOG_BLOCK_SIZE),
			group);
		end_offset_in_unit = (ulint) (end_offset % write_ahead_size);

		if (end_offset_in_unit > 0
		    && (area_end - area_start) > end_offset_in_unit) {
			/* The first block in the unit was initialized
			after the last writing.
			Needs to be written padded data once. */
			pad_size = write_ahead_size - end_offset_in_unit;

			if (area_end + pad_size > log_sys->buf_size) {
				pad_size = log_sys->buf_size - area_end;
			}

			::memset(write_buf + area_end, 0, pad_size);
		}
	}

	/* Do the write to the log files */
	log_group_write_buf(
		group, write_buf + area_start,
		area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
		pad_size,
#endif /* UNIV_DEBUG */
		ut_uint64_align_down(log_sys->write_lsn,
				     OS_FILE_LOG_BLOCK_SIZE),
		start_offset - area_start);

	srv_stats.log_padded.add(pad_size);

	log_sys->write_lsn = write_lsn;

#ifndef _WIN32
	if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC
	    || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
		/* O_SYNC and ALL_O_DIRECT mean the OS did not buffer the log
		file at all: so we have also flushed to disk what we have
		written */
		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
	}
#endif /* !_WIN32 */

	log_write_mutex_exit();

	if (flush_to_disk) {
		log_write_flush_to_disk_low();
	}
}
1845
1846 /** write to the log file up to the last log entry.
1847 @param[in] sync whether we want the written log
1848 also to be flushed to disk. */
1849 void
log_buffer_flush_to_disk(bool sync)1850 log_buffer_flush_to_disk(
1851 bool sync)
1852 {
1853 ut_ad(!srv_read_only_mode);
1854 log_write_up_to(log_get_lsn(), sync);
1855 }
1856
1857 /****************************************************************//**
1858 This functions writes the log buffer to the log file and if 'flush'
1859 is set it forces a flush of the log file as well. This is meant to be
1860 called from background master thread only as it does not wait for
1861 the write (+ possible flush) to finish. */
1862 void
log_buffer_sync_in_background(bool flush)1863 log_buffer_sync_in_background(
1864 /*==========================*/
1865 bool flush) /*!< in: flush the logs to disk */
1866 {
1867 lsn_t lsn;
1868
1869 log_mutex_enter();
1870
1871 lsn = log_sys->lsn;
1872
1873 if (flush
1874 && log_sys->n_pending_flushes > 0
1875 && log_sys->current_flush_lsn >= lsn) {
1876 /* The write + flush will write enough */
1877 log_mutex_exit();
1878 return;
1879 }
1880
1881 log_mutex_exit();
1882
1883 log_write_up_to(lsn, flush);
1884 }
1885
1886 /********************************************************************
1887
1888 Tries to establish a big enough margin of free space in the log buffer, such
1889 that a new log entry can be catenated without an immediate need for a flush. */
1890 static
1891 void
log_flush_margin(void)1892 log_flush_margin(void)
1893 /*==================*/
1894 {
1895 log_t* log = log_sys;
1896 lsn_t lsn = 0;
1897
1898 log_mutex_enter();
1899
1900 if (log->buf_free > log->max_buf_free) {
1901 /* We can write during flush */
1902 lsn = log->lsn;
1903 }
1904
1905 log_mutex_exit();
1906
1907 if (lsn) {
1908 log_write_up_to(lsn, false);
1909 }
1910 }
1911 #ifndef UNIV_HOTBACKUP
1912 /** Advances the smallest lsn for which there are unflushed dirty blocks in the
1913 buffer pool.
1914 NOTE: this function may only be called if the calling thread owns no
1915 synchronization objects!
1916 @param[in] new_oldest try to advance oldest_modified_lsn at least to
1917 this lsn
1918 @return false if there was a flush batch of the same type running,
1919 which means that we could not start this flush batch */
1920 static
1921 bool
log_preflush_pool_modified_pages(lsn_t new_oldest)1922 log_preflush_pool_modified_pages(
1923 lsn_t new_oldest)
1924 {
1925 if (recv_recovery_on) {
1926 /* If the recovery is running, we must first apply all
1927 log records to their respective file pages to get the
1928 right modify lsn values to these pages: otherwise, there
1929 might be pages on disk which are not yet recovered to the
1930 current lsn, and even after calling this function, we could
1931 not know how up-to-date the disk version of the database is,
1932 and we could not make a new checkpoint on the basis of the
1933 info on the buffer pool only. */
1934
1935 recv_apply_hashed_log_recs(TRUE);
1936 }
1937
1938 /* better to wait for flushed by page cleaner */
1939 ut_ad(buf_page_cleaner_is_active);
1940
1941 if (srv_flush_sync) {
1942 /* wake page cleaner for IO burst */
1943 buf_flush_request_force(new_oldest);
1944 }
1945
1946 buf_flush_wait_flushed(new_oldest);
1947
1948 return(true);
1949 }
1950 #endif /* !UNIV_HOTBACKUP */
1951 /******************************************************//**
1952 Completes a checkpoint. */
1953 static
1954 void
log_complete_checkpoint(void)1955 log_complete_checkpoint(void)
1956 /*=========================*/
1957 {
1958 ut_ad(log_mutex_own());
1959 ut_ad(log_sys->n_pending_checkpoint_writes == 0);
1960
1961 log_sys->next_checkpoint_no++;
1962
1963 ut_ad(log_sys->next_checkpoint_lsn >= log_sys->last_checkpoint_lsn);
1964 log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
1965 MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
1966 log_sys->lsn - log_sys->last_checkpoint_lsn);
1967
1968 DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
1969 ", flushed to " LSN_PF,
1970 log_sys->last_checkpoint_lsn,
1971 log_sys->flushed_to_disk_lsn));
1972
1973 rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
1974 }
1975
1976 /******************************************************//**
1977 Completes an asynchronous checkpoint info write i/o to a log file. */
1978 static
1979 void
log_io_complete_checkpoint(void)1980 log_io_complete_checkpoint(void)
1981 /*============================*/
1982 {
1983 MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
1984
1985 log_mutex_enter();
1986
1987 ut_ad(log_sys->n_pending_checkpoint_writes > 0);
1988
1989 if (--log_sys->n_pending_checkpoint_writes == 0) {
1990 log_complete_checkpoint();
1991 }
1992
1993 log_mutex_exit();
1994
1995 /* Wake the redo log watching thread to parse the log up to this
1996 checkpoint. */
1997 if (srv_track_changed_pages) {
1998 os_event_reset(srv_redo_log_tracked_event);
1999 os_event_set(srv_checkpoint_completed_event);
2000 }
2001 }
2002
2003 /******************************************************//**
2004 Writes the checkpoint info to a log group header. */
2005 static
2006 void
log_group_checkpoint(log_group_t * group)2007 log_group_checkpoint(
2008 /*=================*/
2009 log_group_t* group) /*!< in: log group */
2010 {
2011 lsn_t lsn_offset;
2012 byte* buf;
2013
2014 ut_ad(!srv_read_only_mode);
2015 ut_ad(log_mutex_own());
2016 ut_ad(srv_shutdown_state != SRV_SHUTDOWN_LAST_PHASE);
2017 #if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE
2018 # error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE"
2019 #endif
2020
2021 DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
2022 " written to group " ULINTPF,
2023 log_sys->next_checkpoint_no,
2024 log_sys->next_checkpoint_lsn,
2025 group->id));
2026
2027 buf = group->checkpoint_buf;
2028 memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
2029
2030 #ifdef UNIV_DEBUG
2031 lsn_t old_next_checkpoint_lsn
2032 = mach_read_from_8(buf + LOG_CHECKPOINT_LSN);
2033 ut_ad(old_next_checkpoint_lsn <= log_sys->next_checkpoint_lsn);
2034 #endif /* UNIV_DEBUG */
2035 mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
2036 mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
2037
2038 lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
2039 group);
2040 mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
2041 mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
2042
2043 log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
2044
2045 MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);
2046
2047 log_sys->n_log_ios++;
2048
2049 MONITOR_INC(MONITOR_LOG_IO);
2050
2051 ut_ad(LOG_CHECKPOINT_1 < univ_page_size.physical());
2052 ut_ad(LOG_CHECKPOINT_2 < univ_page_size.physical());
2053
2054 if (log_sys->n_pending_checkpoint_writes++ == 0) {
2055 rw_lock_x_lock_gen(&log_sys->checkpoint_lock,
2056 LOG_CHECKPOINT);
2057 }
2058
2059 /* Note: We alternate the physical place of the checkpoint info.
2060 See the (next_checkpoint_no & 1) below. */
2061
2062 /* We send as the last parameter the group machine address
2063 added with 1, as we want to distinguish between a normal log
2064 file write and a checkpoint field write */
2065
2066 fil_io(IORequestLogWrite, false,
2067 page_id_t(group->space_id, 0),
2068 univ_page_size,
2069 (log_sys->next_checkpoint_no & 1)
2070 ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
2071 OS_FILE_LOG_BLOCK_SIZE,
2072 buf, (byte*) group + 1);
2073
2074 ut_ad(((ulint) group & 0x1UL) == 0);
2075 }
2076
2077 #ifdef UNIV_HOTBACKUP
/******************************************************//**
Writes info to a buffer of a log group when log files are created in
backup restoration. */
void
log_reset_first_header_and_checkpoint(
/*==================================*/
	byte*		hdr_buf,/*!< in: buffer which will be written to the
				start of the first log file */
	ib_uint64_t	start)	/*!< in: lsn of the start of the first log file;
				we pretend that there is a checkpoint at
				start + LOG_BLOCK_HDR_SIZE */
{
	byte*		buf;
	ib_uint64_t	lsn;

	/* Write the log file header: format version and start LSN. */
	mach_write_to_4(hdr_buf + LOG_HEADER_FORMAT,
			LOG_HEADER_FORMAT_CURRENT);
	mach_write_to_8(hdr_buf + LOG_HEADER_START_LSN, start);

	lsn = start + LOG_BLOCK_HDR_SIZE;

	/* Write the label of mysqlbackup --restore */
	strcpy((char*)hdr_buf + LOG_HEADER_CREATOR, LOG_HEADER_CREATOR_CURRENT);
	ut_sprintf_timestamp((char*) hdr_buf
			     + (LOG_HEADER_CREATOR
				+ (sizeof LOG_HEADER_CREATOR_CURRENT) - 1));
	/* Build the pretended checkpoint record in the first
	checkpoint slot of the header buffer. */
	buf = hdr_buf + LOG_CHECKPOINT_1;
	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);

	/*mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0);*/
	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);

	/* The pretended checkpoint sits right after the first file's
	header block. */
	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET,
			LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
	/* NOTE(review): 2 MB appears to be a placeholder log buffer
	size stored for recovery to read — confirm against the
	recovery code that consumes LOG_CHECKPOINT_LOG_BUF_SIZE. */
	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);

	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
}
2116 #endif /* UNIV_HOTBACKUP */
2117
2118 #ifndef UNIV_HOTBACKUP
/** Read a log group header page to log_sys->checkpoint_buf.
Caller must hold the log mutex.
@param[in]	group	log group
@param[in]	header	byte offset of the field in the header:
			0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
void
log_group_header_read(
	const log_group_t*	group,
	ulint			header)
{
	ut_ad(log_mutex_own());

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	/* Synchronous read of the single log block containing the
	requested header field; the byte offset is split into a page
	number and an in-page offset. */
	fil_io(IORequestLogRead, true,
	       page_id_t(group->space_id, header / univ_page_size.physical()),
	       univ_page_size, header % univ_page_size.physical(),
	       OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
}
2138
/** Write checkpoint info to the log header and invoke log_mutex_exit().
Caller must hold the log mutex on entry; it is always released here.
@param[in]	sync	whether to wait for the write to complete */
void
log_write_checkpoint_info(
	bool	sync)
{
	log_group_t*	group;

	ut_ad(log_mutex_own());

	if (!srv_read_only_mode) {
		/* Issue an asynchronous checkpoint header write for
		every log group. */
		for (group = UT_LIST_GET_FIRST(log_sys->log_groups);
		     group;
		     group = UT_LIST_GET_NEXT(log_groups, group)) {

			log_group_checkpoint(group);
		}
	}

	log_mutex_exit();

	MONITOR_INC(MONITOR_NUM_CHECKPOINT);

	if (sync) {
		/* Wait for the checkpoint write to complete: the
		checkpoint_lock stays x-latched until the last pending
		write finishes, so a momentary s-latch acts as a
		completion barrier. */
		rw_lock_s_lock(&log_sys->checkpoint_lock);
		rw_lock_s_unlock(&log_sys->checkpoint_lock);

		DEBUG_SYNC_C("checkpoint_completed");

		DBUG_EXECUTE_IF(
			"crash_after_checkpoint",
			DBUG_SUICIDE(););
	}
}
2174
2175 /** Set extra data to be written to the redo log during checkpoint.
2176 @param[in] buf data to be appended on checkpoint, or NULL
2177 @return pointer to previous data to be appended on checkpoint */
2178 mtr_buf_t*
log_append_on_checkpoint(mtr_buf_t * buf)2179 log_append_on_checkpoint(
2180 mtr_buf_t* buf)
2181 {
2182 log_mutex_enter();
2183 mtr_buf_t* old = log_sys->append_on_checkpoint;
2184 log_sys->append_on_checkpoint = buf;
2185 log_mutex_exit();
2186 return(old);
2187 }
2188
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log files. Use log_make_checkpoint_at() to flush also the pool.
@param[in]	sync		whether to wait for the write to complete
@param[in]	write_always	force a write even if no log
has been generated since the latest checkpoint
@return true if success, false if a checkpoint write was already running */
bool
log_checkpoint(
	bool	sync,
	bool	write_always)
{
	lsn_t	oldest_lsn;

	ut_ad(!srv_read_only_mode);

	/* If crash recovery is in progress, apply all buffered redo
	records first so the checkpoint LSN reflects applied changes. */
	if (recv_recovery_is_on()) {
		recv_apply_hashed_log_recs(TRUE);
	}

#ifndef _WIN32
	/* NOSYNC and ALL_O_DIRECT skip the data file flush; every
	other flush method must flush the tablespace files before the
	checkpoint may advance past their modifications. */
	switch (srv_unix_file_flush_method) {
	case SRV_UNIX_NOSYNC:
	case SRV_UNIX_ALL_O_DIRECT:
		break;
	case SRV_UNIX_O_DSYNC:
	case SRV_UNIX_FSYNC:
	case SRV_UNIX_LITTLESYNC:
	case SRV_UNIX_O_DIRECT:
	case SRV_UNIX_O_DIRECT_NO_FSYNC:
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
	}
#endif /* !_WIN32 */

	log_mutex_enter();

	ut_ad(!recv_no_log_write);
	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Because log also contains headers and dummy log records,
	log_buf_pool_get_oldest_modification() will return log_sys->lsn
	if the buffer pool contains no dirty buffers.
	We must make sure that the log is flushed up to that lsn.
	If there are dirty buffers in the buffer pool, then our
	write-ahead-logging algorithm ensures that the log has been
	flushed up to oldest_lsn. */

	ut_ad(oldest_lsn >= log_sys->last_checkpoint_lsn);
	if (!write_always
	    && oldest_lsn
	    <= log_sys->last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
		/* Do nothing, because nothing was logged (other than
		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
		log_mutex_exit();
		return(true);
	}

	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
	case some log records between the checkpoint and log_sys->lsn
	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
	except on clean shutdown, where the log will be empty after
	the checkpoint.

	It is important that we write out the redo log before any
	further dirty pages are flushed to the tablespace files. At
	this point, because log_mutex_own(), mtr_commit() in other
	threads will be blocked, and no pages can be added to the
	flush lists. */
	lsn_t		flush_lsn	= oldest_lsn;
	const bool	do_write
		= srv_shutdown_state == SRV_SHUTDOWN_NONE
		|| flush_lsn != log_sys->lsn;

	if (fil_names_clear(flush_lsn, do_write)) {
		ut_ad(log_sys->lsn >= flush_lsn + SIZE_OF_MLOG_CHECKPOINT);
		flush_lsn = log_sys->lsn;
	}

	log_mutex_exit();

	/* Flush the redo log up to the checkpoint LSN before the
	checkpoint header is written. */
	log_write_up_to(flush_lsn, true);

	DBUG_EXECUTE_IF(
		"using_wa_checkpoint_middle",
		if (write_always) {
			DEBUG_SYNC_C("wa_checkpoint_middle");

			const my_bool b = TRUE;
			buf_flush_page_cleaner_disabled_debug_update(
				NULL, NULL, NULL, &b);
			dict_stats_disabled_debug_update(
				NULL, NULL, NULL, &b);
			srv_master_thread_disabled_debug_update(
				NULL, NULL, NULL, &b);
		});

	log_mutex_enter();

	ut_ad(log_sys->flushed_to_disk_lsn >= flush_lsn);
	ut_ad(flush_lsn >= oldest_lsn);

	/* Another thread may have advanced the checkpoint while the
	log mutex was released above; then there is nothing to do. */
	if (log_sys->last_checkpoint_lsn >= oldest_lsn) {
		log_mutex_exit();
		return(true);
	}

	if (log_sys->n_pending_checkpoint_writes > 0) {
		/* A checkpoint write is running */
		log_mutex_exit();

		if (sync) {
			/* Wait for the checkpoint write to complete */
			rw_lock_s_lock(&log_sys->checkpoint_lock);
			rw_lock_s_unlock(&log_sys->checkpoint_lock);
		}

		return(false);
	}

	ut_ad(oldest_lsn >= log_sys->next_checkpoint_lsn);
	log_sys->next_checkpoint_lsn = oldest_lsn;
	/* This call releases the log mutex. */
	log_write_checkpoint_info(sync);
	ut_ad(!log_mutex_own());

	return(true);
}
2317
2318 /** Make a checkpoint at or after a specified LSN.
2319 @param[in] lsn the log sequence number, or LSN_MAX
2320 for the latest LSN
2321 @param[in] write_always force a write even if no log
2322 has been generated since the latest checkpoint */
2323 void
log_make_checkpoint_at(lsn_t lsn,bool write_always)2324 log_make_checkpoint_at(
2325 lsn_t lsn,
2326 bool write_always)
2327 {
2328 /* Preflush pages synchronously */
2329
2330 if (srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE) {
2331 while (!log_preflush_pool_modified_pages(lsn)) {
2332 /* Flush as much as we can */
2333 }
2334 }
2335
2336 while (!log_checkpoint(true, write_always)) {
2337 /* Force a checkpoint */
2338 }
2339 }
2340
2341 /****************************************************************//**
2342 Tries to establish a big enough margin of free space in the log groups, such
2343 that a new log entry can be catenated without an immediate need for a
2344 checkpoint. NOTE: this function may only be called if the calling thread
2345 owns no synchronization objects! */
2346 static
2347 void
log_checkpoint_margin(void)2348 log_checkpoint_margin(void)
2349 /*=======================*/
2350 {
2351 log_t* log = log_sys;
2352 lsn_t age;
2353 lsn_t checkpoint_age;
2354 ib_uint64_t advance;
2355 lsn_t oldest_lsn;
2356 bool success;
2357 loop:
2358 advance = 0;
2359
2360 log_mutex_enter();
2361 ut_ad(!recv_no_log_write);
2362
2363 if (!log->check_flush_or_checkpoint) {
2364 log_mutex_exit();
2365 return;
2366 }
2367
2368 oldest_lsn = log_buf_pool_get_oldest_modification();
2369
2370 age = log->lsn - oldest_lsn;
2371
2372 if (age > log->max_modified_age_sync) {
2373
2374 /* A flush is urgent: we have to do a synchronous preflush */
2375 advance = age - log->max_modified_age_sync;
2376 }
2377
2378 checkpoint_age = log->lsn - log->last_checkpoint_lsn;
2379
2380 bool checkpoint_sync;
2381 bool do_checkpoint;
2382
2383 if (checkpoint_age > log->max_checkpoint_age) {
2384 /* A checkpoint is urgent: we do it synchronously */
2385 checkpoint_sync = true;
2386 do_checkpoint = true;
2387 } else if (checkpoint_age > log->max_checkpoint_age_async) {
2388 /* A checkpoint is not urgent: do it asynchronously */
2389 do_checkpoint = true;
2390 checkpoint_sync = false;
2391 log->check_flush_or_checkpoint = false;
2392 } else {
2393 do_checkpoint = false;
2394 checkpoint_sync = false;
2395 log->check_flush_or_checkpoint = false;
2396 }
2397
2398 log_mutex_exit();
2399
2400 if (advance) {
2401 lsn_t new_oldest = oldest_lsn + advance;
2402
2403 success = log_preflush_pool_modified_pages(new_oldest);
2404
2405 /* If the flush succeeded, this thread has done its part
2406 and can proceed. If it did not succeed, there was another
2407 thread doing a flush at the same time. */
2408 if (!success) {
2409 log_mutex_enter();
2410
2411 log->check_flush_or_checkpoint = true;
2412
2413 log_mutex_exit();
2414 goto loop;
2415 }
2416 }
2417
2418 if (do_checkpoint) {
2419 log_checkpoint(checkpoint_sync, FALSE);
2420
2421 if (checkpoint_sync) {
2422
2423 goto loop;
2424 }
2425 }
2426 }
2427
/******************************************************//**
Reads a specified log segment to a buffer. Optionally releases the log mutex
before the I/O. Caller must hold the log mutex on entry; on return the
mutex is held iff release_mutex is false. */
void
log_group_read_log_seg(
/*===================*/
	byte*		buf,		/*!< in: buffer where to read */
	log_group_t*	group,		/*!< in: log group */
	lsn_t		start_lsn,	/*!< in: read area start */
	lsn_t		end_lsn,	/*!< in: read area end */
	bool		release_mutex)	/*!< in: whether the log_sys->mutex
					should be released before the read */
{
	ulint	len;
	lsn_t	source_offset;

	ut_ad(log_mutex_own());

loop:
	source_offset = log_group_calc_lsn_offset(start_lsn, group);

	ut_a(end_lsn - start_lsn <= ULINT_MAX);
	len = (ulint) (end_lsn - start_lsn);

	ut_ad(len != 0);

	/* A single read may not cross a log file boundary: clamp the
	length to the end of the current file and loop for the rest. */
	if ((source_offset % group->file_size) + len > group->file_size) {

		/* If the above condition is true then len (which is ulint)
		is > the expression below, so the typecast is ok */
		len = (ulint) (group->file_size -
			(source_offset % group->file_size));
	}

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX);

	if (release_mutex) {
		log_mutex_exit();
	}

	const ulint	page_no
		= (ulint) (source_offset / univ_page_size.physical());

	/* Synchronous read of this piece of the segment. */
	fil_io(IORequestLogRead, true,
	       page_id_t(group->space_id, page_no),
	       univ_page_size,
	       (ulint) (source_offset % univ_page_size.physical()),
	       len, buf, NULL);

	start_lsn += len;
	buf += len;

	if (start_lsn != end_lsn) {

		/* Re-acquire the mutex for the next iteration's offset
		calculation if it was released for the read above. */
		if (release_mutex) {
			log_mutex_enter();
		}
		goto loop;
	}
}
2492
/**
Checks that there is enough free space in the log to start a new query step.
Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
function may only be called if the calling thread owns no synchronization
objects! */
void
log_check_margins(void)
{
	bool	check = true;

	do {
		log_flush_margin();
		log_checkpoint_margin();
		log_mutex_enter();
		/* If the redo log tracker is too far behind, back off
		and let it catch up before generating more log. */
		if (log_check_tracking_margin(0)) {
			log_mutex_exit();
			os_thread_sleep(10000);
			continue;
		}
		ut_ad(!recv_no_log_write);
		/* Repeat until no flush or checkpoint is pending. */
		check = log_sys->check_flush_or_checkpoint;
		log_mutex_exit();
	} while (check);
}
2517
/****************************************************************//**
Makes a checkpoint at the latest lsn and writes it to first page of each
data file in the database, so that we know that the file spaces contain
all modifications up to that lsn. This can only be called at database
shutdown. This function also writes all log in log files to the log archive. */
void
logs_empty_and_mark_files_at_shutdown(void)
/*=======================================*/
{
	lsn_t			lsn;
	lsn_t			tracked_lsn;
	ulint			count = 0;
	ulint			total_trx;
	ulint			pending_io;
	enum srv_thread_type	active_thd;
	const char*		thread_name;

	ib::info() << "Starting shutdown...";

	/* For a slow shutdown, first wait for the rollback of
	transactions recovered at startup to complete. */
	while (srv_fast_shutdown == 0 && trx_rollback_or_clean_is_active) {
		/* we should wait until rollback after recovery end
		for slow shutdown */
		os_thread_sleep(100000);
	}

	/* Wait until the master thread and all other operations are idle: our
	algorithm only works if the server is idle at shutdown */

	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
loop:
	/* Poll every 100 ms; "count" throttles progress messages to
	roughly one per minute (600 polls). */
	os_thread_sleep(100000);

	count++;

	/* We need the monitor threads to stop before we proceed with
	a shutdown. */

	thread_name = srv_any_background_threads_are_active();

	if (thread_name != NULL) {
		/* Print a message every 60 seconds if we are waiting
		for the monitor thread to exit. Master and worker
		threads check will be done later. */

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << thread_name
				<< " to exit";
			count = 0;
		}

		goto loop;
	}

	/* Check that there are no more transactions, except for
	PREPARED ones. We need this wait even for the 'very fast'
	shutdown, because the InnoDB layer may have committed or
	prepared transactions and we don't want to lose them. */

	total_trx = trx_sys_any_active_transactions();

	if (total_trx > 0) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << total_trx << " active"
				<< " transactions to finish";

			count = 0;
		}

		goto loop;
	}

	/* Check that the background threads are suspended */

	active_thd = srv_get_active_thread_type();

	if (active_thd != SRV_NONE || srv_n_fil_crypt_threads_started) {

		/* Nudge the purge coordinator so it notices the
		shutdown state and suspends itself. */
		if (active_thd == SRV_PURGE) {
			srv_purge_wakeup();
		}

		/* The srv_lock_timeout_thread, srv_error_monitor_thread
		and srv_monitor_thread should already exit by now. The
		only threads to be suspended are the master threads
		and worker threads (purge threads). Print the thread
		type if any of such threads not in suspended mode */
		if (srv_print_verbose_log && count > 600) {
			const char*	thread_type = "<null>";

			switch (active_thd) {
			case SRV_NONE:
				/* Only reachable when an encryption
				thread is still running. */
				ut_ad(srv_n_fil_crypt_threads_started);
				thread_type = "encryption thread";
				break;
			case SRV_WORKER:
				thread_type = "worker threads";
				break;
			case SRV_MASTER:
				thread_type = "master thread";
				break;
			case SRV_PURGE:
				thread_type = "purge thread";
				break;
			}

			ib::info() << "Waiting for " << thread_type
				<< " to be suspended";

			count = 0;
		}

		goto loop;
	}

	/* At this point only page_cleaner should be active. We wait
	here to let it complete the flushing of the buffer pools
	before proceeding further. */
	os_rmb;
	ut_ad(buf_lru_manager_running_threads == srv_buf_pool_instances
	      || buf_lru_manager_running_threads == 0);
	srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
	count = 0;
	while (buf_page_cleaner_is_active
	       || buf_lru_manager_running_threads > 0) {

		if (srv_print_verbose_log && count == 0) {
			ib::info() << "Waiting for page_cleaner to"
				" finish flushing of buffer pool";
		}
		++count;
		os_thread_sleep(100000);
		if (count > 600) {
			count = 0;
		}

		os_rmb;
	}

	/* Snapshot the pending log i/o counters under the log mutex. */
	log_mutex_enter();
	const ulint	n_write	= log_sys->n_pending_checkpoint_writes;
	const ulint	n_flush	= log_sys->n_pending_flushes;
	log_mutex_exit();

	if (log_scrub_thread_active || n_write != 0 || n_flush != 0) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Pending checkpoint_writes: " << n_write
				<< ". Pending log flush writes: " << n_flush;
			count = 0;
		}
		goto loop;
	}

	pending_io = buf_pool_check_no_pending_io();

	/* Wake the scrub thread so it can observe the shutdown state
	and exit. */
	if (log_scrub_thread_active) {
		ut_ad(!srv_read_only_mode);
		os_event_set(log_scrub_event);
	}

	ut_ad(!log_scrub_thread_active);

	if (pending_io) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << pending_io << " buffer"
				" page I/Os to complete";
			count = 0;
		}

		goto loop;
	}

	if (srv_fast_shutdown == 2) {
		if (!srv_read_only_mode) {
			ib::info() << "MySQL has requested a very fast"
				" shutdown without flushing the InnoDB buffer"
				" pool to data files. At the next mysqld"
				" startup InnoDB will do a crash recovery!";

			/* In this fastest shutdown we do not flush the
			buffer pool:

			it is essentially a 'crash' of the InnoDB server.
			Make sure that the log is all flushed to disk, so
			that we can recover all committed transactions in
			a crash recovery. We must not write the lsn stamps
			to the data files, since at a startup InnoDB deduces
			from the stamps if the previous shutdown was clean. */

			log_buffer_flush_to_disk();

			/* Check that the background threads stay suspended */
			thread_name = srv_any_background_threads_are_active();

			if (thread_name != NULL) {
				ib::warn() << "Background thread "
					<< thread_name << " woke up during"
					" shutdown";
				goto loop;
			}
		}

		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

		/* Wake the log tracking thread which will then immediately
		quit because of srv_shutdown_state value */
		if (srv_redo_log_thread_started) {
			os_event_reset(srv_redo_log_tracked_event);
			os_event_set(srv_checkpoint_completed_event);
		}

		fil_close_all_files();

		thread_name = srv_any_background_threads_are_active();

		ut_a(!thread_name);

		return;
	}

	/* Normal or slow shutdown: take a final checkpoint at the
	latest LSN, forcing a write even if nothing was logged. */
	if (!srv_read_only_mode) {
		log_make_checkpoint_at(LSN_MAX, TRUE);
	}

	log_mutex_enter();

	tracked_lsn = log_get_tracked_lsn();

	lsn = log_sys->lsn;

	/** If innodb_force_recovery is set to 6 then log_sys doesn't
	have recent checkpoint information. So last checkpoint lsn
	will never be equal to current lsn. */
	const bool	is_last =
		((srv_force_recovery == SRV_FORCE_NO_LOG_REDO
		  && lsn == log_sys->last_checkpoint_lsn
		  + LOG_BLOCK_HDR_SIZE)
		 || lsn == log_sys->last_checkpoint_lsn)
		&& (!srv_track_changed_pages
		    || tracked_lsn == log_sys->last_checkpoint_lsn);

	ut_ad(lsn >= log_sys->last_checkpoint_lsn);

	log_mutex_exit();

	/* If new log was generated after the checkpoint (or the
	tracker has not caught up), go around again. */
	if (!is_last) {
		goto loop;
	}

	/* Check that the background threads stay suspended */
	thread_name = srv_any_background_threads_are_active();
	if (thread_name != NULL) {
		ib::warn() << "Background thread " << thread_name << " woke up"
			" during shutdown";

		goto loop;
	}

	if (!srv_read_only_mode) {
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
		fil_flush_file_spaces(FIL_TYPE_LOG);
	}

	/* The call fil_write_flushed_lsn() will bypass the buffer
	pool: therefore it is essential that the buffer pool has been
	completely flushed to disk! (We do not call fil_write... if the
	'very fast' shutdown is enabled.) */

	if (!buf_all_freed()) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for dirty buffer pages to be"
				" flushed";
			count = 0;
		}

		goto loop;
	}

	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

	/* Signal the log following thread to quit */
	if (srv_redo_log_thread_started) {
		os_event_reset(srv_redo_log_tracked_event);
		os_event_set(srv_checkpoint_completed_event);
	}

	/* Make some checks that the server really is quiet */
	srv_thread_type	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	bool	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);
	ut_ad(srv_force_recovery >= SRV_FORCE_NO_LOG_REDO
	      || lsn == log_sys->last_checkpoint_lsn);

	if (lsn < srv_start_lsn) {
		ib::error() << "Log sequence number at shutdown " << lsn
			<< " is lower than at startup " << srv_start_lsn
			<< "!";
	}

	srv_shutdown_lsn = lsn;

	if (!srv_read_only_mode) {
		/* Stamp the shutdown LSN into the data files so that
		the next startup can deduce the shutdown was clean. */
		fil_write_flushed_lsn(lsn);
	}

	fil_close_all_files();

	/* Make some checks that the server really is quiet */
	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);
}
2839
2840 /******************************************************//**
2841 Peeks the current lsn.
2842 @return TRUE if success, FALSE if could not get the log system mutex */
2843 ibool
log_peek_lsn(lsn_t * lsn)2844 log_peek_lsn(
2845 /*=========*/
2846 lsn_t* lsn) /*!< out: if returns TRUE, current lsn is here */
2847 {
2848 if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
2849 *lsn = log_sys->lsn;
2850
2851 log_mutex_exit();
2852
2853 return(TRUE);
2854 }
2855
2856 return(FALSE);
2857 }
2858
/******************************************************//**
Prints info of the log. Used by SHOW ENGINE INNODB STATUS. */
void
log_print(
/*======*/
	FILE*	file)	/*!< in: file where to print */
{
	double	time_elapsed;
	time_t	current_time;

	log_mutex_enter();

	fprintf(file,
		"Log sequence number " LSN_PF "\n"
		"Log flushed up to " LSN_PF "\n"
		"Pages flushed up to " LSN_PF "\n"
		"Last checkpoint at " LSN_PF "\n",
		log_sys->lsn,
		log_sys->flushed_to_disk_lsn,
		log_buf_pool_get_oldest_modification(),
		log_sys->last_checkpoint_lsn);

	fprintf(file,
		"Max checkpoint age " LSN_PF "\n"
		"Checkpoint age target " LSN_PF "\n"
		"Modified age " LSN_PF "\n"
		"Checkpoint age " LSN_PF "\n",
		log_sys->max_checkpoint_age,
		log_sys->max_checkpoint_age_async,
		log_sys->lsn -log_buf_pool_get_oldest_modification(),
		log_sys->lsn - log_sys->last_checkpoint_lsn);

	current_time = time(NULL);

	time_elapsed = difftime(current_time,
				log_sys->last_printout_time);

	/* Guard against division by zero (or a clock step backwards)
	in the i/o rate computation below. */
	if (time_elapsed <= 0) {
		time_elapsed = 1;
	}

	fprintf(file,
		ULINTPF " pending log flushes, "
		ULINTPF " pending chkp writes\n"
		ULINTPF " log i/o's done, %.2f log i/o's/second\n",
		log_sys->n_pending_flushes,
		log_sys->n_pending_checkpoint_writes,
		log_sys->n_log_ios,
		static_cast<double>(
			log_sys->n_log_ios - log_sys->n_log_ios_old)
		/ time_elapsed);

	if (srv_track_changed_pages) {

		/* The maximum tracked LSN age is equal to the maximum
		checkpoint age */
		fprintf(file,
			"Log tracking enabled\n"
			"Log tracked up to   " LSN_PF "\n"
			"Max tracked LSN age " LSN_PF "\n",
			log_get_tracked_lsn(),
			log_sys->max_checkpoint_age);
	}

	/* Reset the baselines for the next per-second average. */
	log_sys->n_log_ios_old = log_sys->n_log_ios;
	log_sys->last_printout_time = current_time;

	log_mutex_exit();
}
2928
2929 /**********************************************************************//**
2930 Refreshes the statistics used to print per-second averages. */
2931 void
log_refresh_stats(void)2932 log_refresh_stats(void)
2933 /*===================*/
2934 {
2935 log_sys->n_log_ios_old = log_sys->n_log_ios;
2936 log_sys->last_printout_time = time(NULL);
2937 }
2938
2939 /********************************************************//**
2940 Closes a log group. */
2941 static
2942 void
log_group_close(log_group_t * group)2943 log_group_close(
2944 /*===========*/
2945 log_group_t* group) /* in,own: log group to close */
2946 {
2947 ulint i;
2948
2949 for (i = 0; i < group->n_files; i++) {
2950 ut_free(group->file_header_bufs_ptr[i]);
2951 }
2952
2953 ut_free(group->file_header_bufs_ptr);
2954 ut_free(group->file_header_bufs);
2955 ut_free(group->checkpoint_buf_ptr);
2956 ut_free(group);
2957 }
2958
2959 /********************************************************//**
2960 Closes all log groups. */
2961 void
log_group_close_all(void)2962 log_group_close_all(void)
2963 /*=====================*/
2964 {
2965 log_group_t* group;
2966
2967 group = UT_LIST_GET_FIRST(log_sys->log_groups);
2968
2969 while (UT_LIST_GET_LEN(log_sys->log_groups) > 0) {
2970 log_group_t* prev_group = group;
2971
2972 group = UT_LIST_GET_NEXT(log_groups, group);
2973
2974 UT_LIST_REMOVE(log_sys->log_groups, prev_group);
2975
2976 log_group_close(prev_group);
2977 }
2978 }
2979
/********************************************************//**
Shutdown the log system but do not release all the memory. */
void
log_shutdown(void)
/*==============*/
{
	log_group_close_all();

	/* Free the log buffer and the checkpoint buffer; NULL the
	pointers so that a later log_mem_free() cannot double-free. */
	ut_free(log_sys->buf_ptr);
	log_sys->buf_ptr = NULL;
	log_sys->buf = NULL;
	ut_free(log_sys->checkpoint_buf_ptr);
	log_sys->checkpoint_buf_ptr = NULL;
	log_sys->checkpoint_buf = NULL;

	os_event_destroy(log_sys->flush_event);

	rw_lock_free(&log_sys->checkpoint_lock);

	mutex_free(&log_sys->mutex);
	mutex_free(&log_sys->write_mutex);
	mutex_free(&log_sys->log_flush_order_mutex);

	/* The scrub event exists only when scrubbing was enabled on a
	read-write server; mirror that condition when destroying it. */
	if (!srv_read_only_mode && srv_scrub_log) {
		os_event_destroy(log_scrub_event);
	}

	recv_sys_close();
}
3009
3010 /********************************************************//**
3011 Free the log system data structures. */
3012 void
log_mem_free(void)3013 log_mem_free(void)
3014 /*==============*/
3015 {
3016 if (log_sys != NULL) {
3017 recv_sys_mem_free();
3018 ut_free(log_sys);
3019
3020 log_sys = NULL;
3021 }
3022 }
3023
/* Fill the rest of the current log block with MLOG_DUMMY_RECORD bytes
so that it can be written out (log scrubbing). Caller must hold the
log mutex (taken by log_scrub()). */
static void
log_pad_current_log_block(void)
{
	ut_ad(!recv_no_log_write);
	/* Reserve space for up to one full block; the returned LSN is
	not used here — the variable is reassigned after padding. */
	lsn_t lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);

	/* Number of dummy bytes needed to fill the current block up
	to (but not including) its trailer. */
	ulint pad_length = OS_FILE_LOG_BLOCK_SIZE -
		log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE -
		LOG_BLOCK_TRL_SIZE;
	if (pad_length == (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE -
			   LOG_BLOCK_TRL_SIZE)) {
		/* The block contains only its header: it is already
		"fresh", nothing to pad. */
		pad_length = 0;
	}

	if (pad_length) {
		srv_stats.n_log_scrubs.inc();
	}

	/* Write the dummy records one byte at a time. */
	for (ulint i = 0; i < pad_length; i++) {
		byte b = MLOG_DUMMY_RECORD;
		log_write_low(&b, 1);
	}

	lsn = log_sys->lsn;

	log_close();

	/* After padding, the LSN must sit at the start of a fresh
	block (just past its header). */
	ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
}
3053
/** If no log record has been written for a while, fill current log
block with dummy records. */
static void
log_scrub()
{
	log_mutex_enter();
	ulint cur_lbn = log_block_convert_lsn_to_no(log_sys->lsn);

	/* next_lbn_to_pad is file-level state (defined earlier in this
	file): if the current block number has not advanced since the
	previous scrub pass, no real log was written in between, so pad
	the block out with dummy records. */
	if (next_lbn_to_pad == cur_lbn) {
		log_pad_current_log_block();
	}

	next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys->lsn);
	log_mutex_exit();
}
3069
3070 /* log scrubbing speed, in bytes/sec */
3071 ulonglong innodb_scrub_log_speed;
3072
/** This is the main thread for log scrub. It waits for an event and
when waked up fills current log block with dummy records and sleeps again.
@return this function does not return, it calls os_thread_exit() */
extern "C" os_thread_ret_t
DECLARE_THREAD(log_scrub_thread)(void *) {
	ut_ad(!srv_read_only_mode);

	while (srv_shutdown_state < SRV_SHUTDOWN_FLUSH_PHASE) {
		/* Log scrubbing interval in microseconds: the time it
		takes to write one 512-byte block at the configured
		speed. NOTE(review): assumes innodb_scrub_log_speed is
		nonzero — presumably enforced by the sysvar's minimum
		value; confirm, else this divides by zero. */
		ulonglong interval =
			1000 * 1000 * 512 / innodb_scrub_log_speed;

		/* Sleep until woken or the interval elapses, then pad
		the current block if it was idle. */
		os_event_wait_time(log_scrub_event,
				   static_cast<ulint>(interval));

		log_scrub();

		os_event_reset(log_scrub_event);
	}

	log_scrub_thread_active = false;

	/* We count the number of threads in os_thread_exit(). A created
	thread should always use that to exit and not use return() to exit. */
	os_thread_exit();

	OS_THREAD_DUMMY_RETURN;
}
3101
3102 uint srv_redo_log_key_version = 0;
3103 #endif /* !UNIV_HOTBACKUP */
3104