1 /*****************************************************************************
2
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Google Inc.
5
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License, version 2.0,
14 as published by the Free Software Foundation.
15
16 This program is also distributed with certain software (including
17 but not limited to OpenSSL) that is licensed under separate terms,
18 as designated in a particular file or component or in included license
19 documentation. The authors of MySQL hereby grant you an additional
20 permission to link the program and your derivative works with the
21 separately licensed software that they have included with MySQL.
22
23 This program is distributed in the hope that it will be useful,
24 but WITHOUT ANY WARRANTY; without even the implied warranty of
25 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 GNU General Public License, version 2.0, for more details.
27
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
31
32 *****************************************************************************/
33
34 /**************************************************//**
35 @file log/log0log.cc
36 Database log
37
38 Created 12/9/1995 Heikki Tuuri
39 *******************************************************/
40
41 #include "ha_prototypes.h"
42 #include <debug_sync.h>
43
44 #include "log0log.h"
45
46 #ifdef UNIV_NONINL
47 #include "log0log.ic"
48 #endif
49
50 #include "mem0mem.h"
51 #include "buf0buf.h"
52 #ifndef UNIV_HOTBACKUP
53 #include "buf0flu.h"
54 #include "srv0srv.h"
55 #include "log0recv.h"
56 #include "fil0fil.h"
57 #include "dict0boot.h"
58 #include "dict0stats_bg.h"
59 #include "srv0srv.h"
60 #include "srv0start.h"
61 #include "trx0sys.h"
62 #include "trx0trx.h"
63 #include "trx0roll.h"
64 #include "srv0mon.h"
65 #include "sync0sync.h"
66 #endif /* !UNIV_HOTBACKUP */
67 #include "xb0xb.h"
68
69 /*
70 General philosophy of InnoDB redo-logs:
71
1) Every change to the contents of a data page must be made
through an mtr (mini-transaction), which in mtr_commit() writes
its log records to the InnoDB redo log.
75
76 2) Normally these changes are performed using a mlog_write_ulint()
77 or similar function.
78
79 3) In some page level operations only a code number of a
80 c-function and its parameters are written to the log to
81 reduce the size of the log.
82
83 3a) You should not add parameters to these kind of functions
84 (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse())
85
3b) You should not add functionality which either changes the
resulting page contents compared with the old behavior or depends
on data outside of the page. These kinds of functions should
implement a self-contained page transformation, and they should be
left unchanged unless you have very essential reasons to change the
log semantics or format.
92
93 */
94
95 /** Redo log system */
96 log_t* log_sys = NULL;
97
98 /** Whether to generate and require checksums on the redo log pages */
99 my_bool innodb_log_checksums;
100
101 /** Pointer to the log checksum calculation function */
102 log_checksum_func_t log_checksum_algorithm_ptr;
103
104 /* These control how often we print warnings if the last checkpoint is too
105 old */
106 bool log_has_printed_chkp_warning = false;
107 time_t log_last_warning_time;
108
109 bool log_has_printed_chkp_margine_warning = false;
110 time_t log_last_margine_warning_time;
111
112 /* A margin for free space in the log buffer before a log entry is catenated */
113 #define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE)
114
115 /* Margins for free space in the log buffer after a log entry is catenated */
116 #define LOG_BUF_FLUSH_RATIO 2
117 #define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE)
118
119 /* This parameter controls asynchronous making of a new checkpoint; the value
120 should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
121
122 #define LOG_POOL_CHECKPOINT_RATIO_ASYNC 32
123
124 /* This parameter controls synchronous preflushing of modified buffer pages */
125 #define LOG_POOL_PREFLUSH_RATIO_SYNC 16
126
127 /* The same ratio for asynchronous preflushing; this value should be less than
128 the previous */
129 #define LOG_POOL_PREFLUSH_RATIO_ASYNC 8
130
131 /* Codes used in unlocking flush latches */
132 #define LOG_UNLOCK_NONE_FLUSHED_LOCK 1
133 #define LOG_UNLOCK_FLUSH_LOCK 2
134
135 /******************************************************//**
136 Completes a checkpoint write i/o to a log file. */
137 static
138 void
139 log_io_complete_checkpoint(void);
140 /*============================*/
141
142 #ifndef UNIV_HOTBACKUP
143 /****************************************************************//**
144 Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
145 exists.
146 @return LSN of oldest modification */
147 static
148 lsn_t
log_buf_pool_get_oldest_modification(void)149 log_buf_pool_get_oldest_modification(void)
150 /*======================================*/
151 {
152 lsn_t lsn;
153
154 ut_ad(log_mutex_own());
155
156 lsn = buf_pool_get_oldest_modification();
157
158 if (!lsn) {
159
160 lsn = log_sys->lsn;
161 }
162
163 return(lsn);
164 }
165 #endif /* !UNIV_HOTBACKUP */
166
/** Extends the log buffer.
@param[in]	len	requested minimum size in bytes */
void
log_buffer_extend(
	ulint	len)
{
	ulint	move_start;
	ulint	move_end;
	/* Scratch space for the (at most one block of) unwritten log data
	that must survive the buffer reallocation below. */
	byte	*tmp_buf = static_cast<byte *>(alloca(OS_FILE_LOG_BLOCK_SIZE));

	log_mutex_enter_all();

	while (log_sys->is_extending) {
		/* Another thread is trying to extend already.
		Needs to wait for. */
		log_mutex_exit_all();

		log_buffer_flush_to_disk();

		log_mutex_enter_all();

		if (srv_log_buffer_size > len / UNIV_PAGE_SIZE) {
			/* Already extended enough by the others */
			log_mutex_exit_all();
			return;
		}
	}

	if (len >= log_sys->buf_size / 2) {
		DBUG_EXECUTE_IF("ib_log_buffer_is_short_crash",
				DBUG_SUICIDE(););

		/* log_buffer is too small. try to extend instead of crash. */
		ib::warn() << "The transaction log size is too large"
			" for innodb_log_buffer_size (" << len << " >= "
			<< LOG_BUFFER_SIZE << " / 2). Trying to extend it.";
	}

	log_sys->is_extending = true;

	/* Flush until at most one partially filled block (the one being
	appended to) remains unwritten in the buffer. */
	while (ut_calc_align_down(log_sys->buf_free,
				  OS_FILE_LOG_BLOCK_SIZE)
	       != ut_calc_align_down(log_sys->buf_next_to_write,
				     OS_FILE_LOG_BLOCK_SIZE)) {
		/* Buffer might have >1 blocks to write still. */
		log_mutex_exit_all();

		log_buffer_flush_to_disk();

		log_mutex_enter_all();
	}

	move_start = ut_calc_align_down(
		log_sys->buf_free,
		OS_FILE_LOG_BLOCK_SIZE);
	move_end = log_sys->buf_free;

	/* store the last log block in buffer */
	ut_memcpy(tmp_buf, log_sys->buf + move_start,
		  move_end - move_start);

	log_sys->buf_free -= move_start;
	log_sys->buf_next_to_write -= move_start;

	/* reallocate log buffer */
	srv_log_buffer_size = len / UNIV_PAGE_SIZE + 1;
	ut_free(log_sys->buf_ptr);

	log_sys->buf_size = LOG_BUFFER_SIZE;

	/* NOTE(review): twice the buffer size is allocated, presumably so
	that log_sys->buf can later be switched between the two halves
	(see first_in_use); the extra block is for alignment — confirm. */
	log_sys->buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE));
	log_sys->buf = static_cast<byte*>(
		ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	log_sys->first_in_use = true;

	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
		- LOG_BUF_FLUSH_MARGIN;

	/* restore the last log block */
	ut_memcpy(log_sys->buf, tmp_buf, move_end - move_start);

	ut_ad(log_sys->is_extending);
	log_sys->is_extending = false;

	log_mutex_exit_all();

	ib::info() << "innodb_log_buffer_size was extended to "
		<< LOG_BUFFER_SIZE << ".";
}
258
259 #ifndef UNIV_HOTBACKUP
260 /** Calculate actual length in redo buffer and file including
261 block header and trailer.
262 @param[in] len length to write
263 @return actual length to write including header and trailer. */
264 static inline
265 ulint
log_calculate_actual_len(ulint len)266 log_calculate_actual_len(
267 ulint len)
268 {
269 ut_ad(log_mutex_own());
270
271 /* actual length stored per block */
272 const ulint len_per_blk = OS_FILE_LOG_BLOCK_SIZE
273 - (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
274
275 /* actual data length in last block already written */
276 ulint extra_len = (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE);
277
278 ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
279 extra_len -= LOG_BLOCK_HDR_SIZE;
280
281 /* total extra length for block header and trailer */
282 extra_len = ((len + extra_len) / len_per_blk)
283 * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
284
285 return(len + extra_len);
286 }
287
/** Check margin not to overwrite transaction log from the last checkpoint.
If would estimate the log write to exceed the log_group_capacity,
waits for the checkpoint is done enough.
@param[in]	len	length of the data to be written */

void
log_margin_checkpoint_age(
	ulint	len)
{
	/* Bytes the write will actually occupy, including log block
	headers and trailers. */
	ulint	margin = log_calculate_actual_len(len);

	ut_ad(log_mutex_own());

	if (margin > log_sys->log_group_capacity) {
		/* return with warning output to avoid deadlock */
		if (!log_has_printed_chkp_margine_warning
		    || difftime(time(NULL),
				log_last_margine_warning_time) > 15) {
			log_has_printed_chkp_margine_warning = true;
			log_last_margine_warning_time = time(NULL);

			ib::error() << "The transaction log files are too"
				" small for the single transaction log (size="
				<< len << "). So, the last checkpoint age"
				" might exceed the log group capacity "
				<< log_sys->log_group_capacity << ".";
		}

		return;
	}

	/* Our margin check should ensure that we never reach this condition.
	Try to do checkpoint once. We cannot keep waiting here as it might
	result in hang in case the current mtr has latch on oldest lsn */
	if (log_sys->lsn - log_sys->last_checkpoint_lsn + margin
	    > log_sys->log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Makes checkpoint. */

		bool	flushed_enough = false;

		/* If the oldest dirty page is already recent enough, the
		checkpoint can advance without waiting for a preflush. */
		if (log_sys->lsn - log_buf_pool_get_oldest_modification()
		    + margin
		    <= log_sys->log_group_capacity) {
			flushed_enough = true;
		}

		log_sys->check_flush_or_checkpoint = true;
		/* Release the log mutex while checkpointing; it is
		re-acquired before returning to the caller. */
		log_mutex_exit();

		DEBUG_SYNC_C("margin_checkpoint_age_rescue");

		if (!flushed_enough) {
			os_thread_sleep(100000);
		}
		log_checkpoint(true, false);

		log_mutex_enter();
	}

	return;
}
350 #endif /* !UNIV_HOTBACKUP */
/** Open the log for log_write_low. The log must be closed with log_close.
@param[in]	len	length of the data to be written
@return start lsn of the log record */
lsn_t
log_reserve_and_open(
	ulint	len)
{
	ulint	len_upper_limit;
#ifdef UNIV_DEBUG
	/* Guards (in debug builds) against looping here forever. */
	ulint	count			= 0;
#endif /* UNIV_DEBUG */

loop:
	ut_ad(log_mutex_own());
	ut_ad(!recv_no_log_write);

	if (log_sys->is_extending) {
		/* The log buffer is being reallocated by
		log_buffer_extend(); drop the mutex and retry. */
		log_mutex_exit();

		/* Log buffer size is extending. Writing up to the next block
		should wait for the extending finished. */

		os_thread_sleep(100000);

		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	/* Calculate an upper limit for the space the string may take in the
	log buffer */

	len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
			  + (5 * len) / 4;

	if (log_sys->buf_free + len_upper_limit > log_sys->buf_size) {
		log_mutex_exit();

		DEBUG_SYNC_C("log_buf_size_exceeded");

		/* Not enough free space, do a write of the log buffer */

		log_buffer_sync_in_background(false);

		srv_stats.log_waits.inc();

		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	/* The caller may now append up to 'len' bytes starting at this
	lsn via log_write_low(), still holding the log mutex. */
	return(log_sys->lsn);
}
406
/************************************************************//**
Writes to the log the string given. It is assumed that the caller holds the
log mutex. */
void
log_write_low(
/*==========*/
	const byte*	str,		/*!< in: string */
	ulint		str_len)	/*!< in: string length */
{
	log_t*	log	= log_sys;
	ulint	len;		/* bytes of str copied in this iteration */
	ulint	data_len;	/* data length of the current block after
				the copy */
	byte*	log_block;

	ut_ad(log_mutex_own());
part_loop:
	ut_ad(!recv_no_log_write);
	/* Calculate a part length */

	data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;

	if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {

		/* The string fits within the current log block */

		len = str_len;
	} else {
		/* Fill the current block up to its trailer and loop again
		for the remainder of the string. */
		data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;

		len = OS_FILE_LOG_BLOCK_SIZE
			- (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
			- LOG_BLOCK_TRL_SIZE;
	}

	ut_memcpy(log->buf + log->buf_free, str, len);

	str_len -= len;
	str = str + len;

	/* Locate the start of the block that was just appended to. */
	log_block = static_cast<byte*>(
		ut_align_down(
			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));

	log_block_set_data_len(log_block, data_len);

	if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
		/* This block became full */
		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
		log_block_set_checkpoint_no(log_block,
					    log_sys->next_checkpoint_no);
		/* Account also for the trailer of this block and the
		header of the next one, which the lsn must skip over. */
		len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;

		log->lsn += len;

		/* Initialize the next block header */
		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
	} else {
		log->lsn += len;
	}

	log->buf_free += len;

	ut_ad(log->buf_free <= log->buf_size);

	if (str_len > 0) {
		goto part_loop;
	}

	srv_stats.log_write_requests.inc();
}
477
/************************************************************//**
Closes the log.
@return lsn */
lsn_t
log_close(void)
/*===========*/
{
	byte*		log_block;
	ulint		first_rec_group;
	lsn_t		oldest_lsn;
	lsn_t		lsn;
	log_t*		log	= log_sys;
	lsn_t		checkpoint_age;

	ut_ad(log_mutex_own());
	ut_ad(!recv_no_log_write);

	lsn = log->lsn;

	/* Locate the (possibly partially filled) block that the mtr's
	last bytes landed in. */
	log_block = static_cast<byte*>(
		ut_align_down(
			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));

	first_rec_group = log_block_get_first_rec_group(log_block);

	if (first_rec_group == 0) {
		/* We initialized a new log block which was not written
		full by the current mtr: the next mtr log record group
		will start within this block at the offset data_len */

		log_block_set_first_rec_group(
			log_block, log_block_get_data_len(log_block));
	}

	if (log->buf_free > log->max_buf_free) {

		/* The buffer is getting full: ask background activity
		to flush or checkpoint. */
		log->check_flush_or_checkpoint = true;
	}

	checkpoint_age = lsn - log->last_checkpoint_lsn;

	if (checkpoint_age >= log->log_group_capacity) {
		DBUG_EXECUTE_IF(
			"print_all_chkp_warnings",
			log_has_printed_chkp_warning = false;);

		/* Rate-limit the warning to roughly once per 15 seconds. */
		if (!log_has_printed_chkp_warning
		    || difftime(time(NULL), log_last_warning_time) > 15) {

			log_has_printed_chkp_warning = true;
			log_last_warning_time = time(NULL);

			ib::error() << "The age of the last checkpoint is "
				<< checkpoint_age << ", which exceeds the log"
				" group capacity " << log->log_group_capacity
				<< ".";
		}
	}

	if (checkpoint_age <= log->max_modified_age_sync) {

		goto function_exit;
	}

	oldest_lsn = buf_pool_get_oldest_modification();

	/* Request a flush/checkpoint if the oldest dirty page or the
	last checkpoint lags too far behind the current lsn. */
	if (!oldest_lsn
	    || lsn - oldest_lsn > log->max_modified_age_sync
	    || checkpoint_age > log->max_checkpoint_age_async) {

		log->check_flush_or_checkpoint = true;
	}
function_exit:

	return(lsn);
}
554
555 /******************************************************//**
556 Calculates the data capacity of a log group, when the log file headers are not
557 included.
558 @return capacity in bytes */
559 lsn_t
log_group_get_capacity(const log_group_t * group)560 log_group_get_capacity(
561 /*===================*/
562 const log_group_t* group) /*!< in: log group */
563 {
564 /* The lsn parameters are updated while holding both the mutexes
565 and it is ok to have either of them while reading */
566 ut_ad(log_mutex_own() || log_write_mutex_own());
567
568 return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files);
569 }
570
571 /******************************************************//**
572 Calculates the offset within a log group, when the log file headers are not
573 included.
574 @return size offset (<= offset) */
575 UNIV_INLINE
576 lsn_t
log_group_calc_size_offset(lsn_t offset,const log_group_t * group)577 log_group_calc_size_offset(
578 /*=======================*/
579 lsn_t offset, /*!< in: real offset within the
580 log group */
581 const log_group_t* group) /*!< in: log group */
582 {
583 /* The lsn parameters are updated while holding both the mutexes
584 and it is ok to have either of them while reading */
585 ut_ad(log_mutex_own() || log_write_mutex_own());
586
587 return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size));
588 }
589
590 /******************************************************//**
591 Calculates the offset within a log group, when the log file headers are
592 included.
593 @return real offset (>= offset) */
594 UNIV_INLINE
595 lsn_t
log_group_calc_real_offset(lsn_t offset,const log_group_t * group)596 log_group_calc_real_offset(
597 /*=======================*/
598 lsn_t offset, /*!< in: size offset within the
599 log group */
600 const log_group_t* group) /*!< in: log group */
601 {
602 /* The lsn parameters are updated while holding both the mutexes
603 and it is ok to have either of them while reading */
604 ut_ad(log_mutex_own() || log_write_mutex_own());
605
606 return(offset + LOG_FILE_HDR_SIZE
607 * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE)));
608 }
609
610 /** Calculate the offset of an lsn within a log group.
611 @param[in] lsn log sequence number
612 @param[in] group log group
613 @return offset within the log group */
614 lsn_t
log_group_calc_lsn_offset(lsn_t lsn,const log_group_t * group)615 log_group_calc_lsn_offset(
616 lsn_t lsn,
617 const log_group_t* group)
618 {
619 lsn_t gr_lsn;
620 lsn_t gr_lsn_size_offset;
621 lsn_t difference;
622 lsn_t group_size;
623 lsn_t offset;
624
625 /* The lsn parameters are updated while holding both the mutexes
626 and it is ok to have either of them while reading */
627 ut_ad(log_mutex_own() || log_write_mutex_own());
628
629 gr_lsn = group->lsn;
630
631 gr_lsn_size_offset = log_group_calc_size_offset(
632 group->lsn_offset, group);
633
634 group_size = log_group_get_capacity(group);
635
636 if (lsn >= gr_lsn) {
637
638 difference = lsn - gr_lsn;
639 } else {
640 difference = gr_lsn - lsn;
641
642 difference = difference % group_size;
643
644 difference = group_size - difference;
645 }
646
647 offset = (gr_lsn_size_offset + difference) % group_size;
648
649 /* fprintf(stderr,
650 "Offset is " LSN_PF " gr_lsn_offset is " LSN_PF
651 " difference is " LSN_PF "\n",
652 offset, gr_lsn_size_offset, difference);
653 */
654
655 return(log_group_calc_real_offset(offset, group));
656 }
657
658 /*******************************************************************//**
659 Calculates where in log files we find a specified lsn.
660 @return log file number */
661 ulint
log_calc_where_lsn_is(int64_t * log_file_offset,ib_uint64_t first_header_lsn,ib_uint64_t lsn,ulint n_log_files,int64_t log_file_size)662 log_calc_where_lsn_is(
663 /*==================*/
664 int64_t* log_file_offset, /*!< out: offset in that file
665 (including the header) */
666 ib_uint64_t first_header_lsn, /*!< in: first log file start
667 lsn */
668 ib_uint64_t lsn, /*!< in: lsn whose position to
669 determine */
670 ulint n_log_files, /*!< in: total number of log
671 files */
672 int64_t log_file_size) /*!< in: log file size
673 (including the header) */
674 {
675 int64_t capacity = log_file_size - LOG_FILE_HDR_SIZE;
676 ulint file_no;
677 int64_t add_this_many;
678
679 if (lsn < first_header_lsn) {
680 add_this_many = 1 + (first_header_lsn - lsn)
681 / (capacity * static_cast<int64_t>(n_log_files));
682 lsn += add_this_many
683 * capacity * static_cast<int64_t>(n_log_files);
684 }
685
686 ut_a(lsn >= first_header_lsn);
687
688 file_no = ((ulint)((lsn - first_header_lsn) / capacity))
689 % n_log_files;
690 *log_file_offset = (lsn - first_header_lsn) % capacity;
691
692 *log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE;
693
694 return(file_no);
695 }
696
697
/********************************************************//**
Sets the field values in group to correspond to a given lsn. For this function
to work, the values must already be correctly initialized to correspond to
some lsn, for instance, a checkpoint lsn. */
void
log_group_set_fields(
/*=================*/
	log_group_t*	group,	/*!< in/out: group */
	lsn_t		lsn)	/*!< in: lsn for which the values should be
				set */
{
	/* NOTE: the order matters. log_group_calc_lsn_offset() reads the
	group's current lsn and lsn_offset as its reference point, so the
	new offset must be computed before group->lsn is overwritten. */
	group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
	group->lsn = lsn;
}
712 #ifndef UNIV_HOTBACKUP
713 /*****************************************************************//**
714 Calculates the recommended highest values for lsn - last_checkpoint_lsn
715 and lsn - buf_get_oldest_modification().
716 @retval true on success
717 @retval false if the smallest log group is too small to
718 accommodate the number of OS threads in the database server */
719 static MY_ATTRIBUTE((warn_unused_result))
720 bool
log_calc_max_ages(void)721 log_calc_max_ages(void)
722 /*===================*/
723 {
724 log_group_t* group;
725 lsn_t margin;
726 ulint free;
727 bool success = true;
728 lsn_t smallest_capacity;
729
730 log_mutex_enter();
731
732 group = UT_LIST_GET_FIRST(log_sys->log_groups);
733
734 ut_ad(group);
735
736 smallest_capacity = LSN_MAX;
737
738 while (group) {
739 if (log_group_get_capacity(group) < smallest_capacity) {
740
741 smallest_capacity = log_group_get_capacity(group);
742 }
743
744 group = UT_LIST_GET_NEXT(log_groups, group);
745 }
746
747 /* Add extra safety */
748 smallest_capacity = smallest_capacity - smallest_capacity / 10;
749
750 /* For each OS thread we must reserve so much free space in the
751 smallest log group that it can accommodate the log entries produced
752 by single query steps: running out of free log space is a serious
753 system error which requires rebooting the database. */
754
755 free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
756 + LOG_CHECKPOINT_EXTRA_FREE;
757 if (free >= smallest_capacity / 2) {
758 success = false;
759
760 goto failure;
761 } else {
762 margin = smallest_capacity - free;
763 }
764
765 margin = margin - margin / 10; /* Add still some extra safety */
766
767 log_sys->log_group_capacity = smallest_capacity;
768
769 log_sys->max_modified_age_async = margin
770 - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
771 log_sys->max_modified_age_sync = margin
772 - margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
773
774 log_sys->max_checkpoint_age_async = margin - margin
775 / LOG_POOL_CHECKPOINT_RATIO_ASYNC;
776 log_sys->max_checkpoint_age = margin;
777
778 failure:
779 log_mutex_exit();
780
781 if (!success) {
782 ib::error() << "Cannot continue operation. ib_logfiles are too"
783 " small for innodb_thread_concurrency "
784 << srv_thread_concurrency << ". The combined size of"
785 " ib_logfiles should be bigger than"
786 " 200 kB * innodb_thread_concurrency. To get mysqld"
787 " to start up, set innodb_thread_concurrency in"
788 " my.cnf to a lower value, for example, to 8. After"
789 " an ERROR-FREE shutdown of mysqld you can adjust"
790 " the size of ib_logfiles. " << INNODB_PARAMETERS_MSG;
791 }
792
793 return(success);
794 }
795
/******************************************************//**
Initializes the log. */
void
log_init(void)
/*==========*/
{
	log_sys = static_cast<log_t*>(ut_zalloc_nokey(sizeof(log_t)));

	mutex_create(LATCH_ID_LOG_SYS, &log_sys->mutex);
	mutex_create(LATCH_ID_LOG_WRITE, &log_sys->write_mutex);

	mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_sys->log_flush_order_mutex);

	/* Start the lsn from one log block from zero: this way every
	log record has a start lsn != zero, a fact which we will use */

	log_sys->lsn = LOG_START_LSN;

	ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
	ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);

	log_sys->buf_size = LOG_BUFFER_SIZE;

	/* NOTE(review): twice the buffer size plus one block is allocated,
	presumably so that buf can point at a block-aligned position in
	either half (see first_in_use) — confirm. */
	log_sys->buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE));
	log_sys->buf = static_cast<byte*>(
		ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	log_sys->first_in_use = true;

	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
		- LOG_BUF_FLUSH_MARGIN;
	log_sys->check_flush_or_checkpoint = true;
	UT_LIST_INIT(log_sys->log_groups, &log_group_t::log_groups);

	log_sys->n_log_ios_old = log_sys->n_log_ios;
	log_sys->last_printout_time = time(NULL);
	/*----------------------------*/

	log_sys->write_lsn = log_sys->lsn;

	/* The flush event starts in the set state. */
	log_sys->flush_event = os_event_create(0);

	os_event_set(log_sys->flush_event);

	/*----------------------------*/

	log_sys->last_checkpoint_lsn = log_sys->lsn;

	rw_lock_create(
		checkpoint_lock_key, &log_sys->checkpoint_lock,
		SYNC_NO_ORDER_CHECK);

	/* Buffer for checkpoint pages, aligned to the log block size. */
	log_sys->checkpoint_buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));

	log_sys->checkpoint_buf = static_cast<byte*>(
		ut_align(log_sys->checkpoint_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	/*----------------------------*/

	/* Initialize the first log block in the buffer and advance the
	lsn past its header. */
	log_block_init(log_sys->buf, log_sys->lsn);
	log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);

	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
	log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;

	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
		    log_sys->lsn - log_sys->last_checkpoint_lsn);
}
866
867 /******************************************************************//**
868 Inits a log group to the log system.
869 @return true if success, false if not */
870 MY_ATTRIBUTE((warn_unused_result))
871 bool
log_group_init(ulint id,ulint n_files,lsn_t file_size,ulint space_id)872 log_group_init(
873 /*===========*/
874 ulint id, /*!< in: group id */
875 ulint n_files, /*!< in: number of log files */
876 lsn_t file_size, /*!< in: log file size in bytes */
877 ulint space_id) /*!< in: space id of the file space
878 which contains the log files of this
879 group */
880 {
881 ulint i;
882 log_group_t* group;
883
884 group = static_cast<log_group_t*>(ut_malloc_nokey(sizeof(log_group_t)));
885
886 group->id = id;
887 group->n_files = n_files;
888 group->format = LOG_HEADER_FORMAT_CURRENT;
889 group->file_size = file_size;
890 group->space_id = space_id;
891 group->state = LOG_GROUP_OK;
892 group->lsn = LOG_START_LSN;
893 group->lsn_offset = LOG_FILE_HDR_SIZE;
894 group->lsn_offset_ps55 = LOG_FILE_HDR_SIZE;
895
896 group->file_header_bufs_ptr = static_cast<byte**>(
897 ut_zalloc_nokey(sizeof(byte*) * n_files));
898
899 group->file_header_bufs = static_cast<byte**>(
900 ut_zalloc_nokey(sizeof(byte**) * n_files));
901
902 for (i = 0; i < n_files; i++) {
903 group->file_header_bufs_ptr[i] = static_cast<byte*>(
904 ut_zalloc_nokey(LOG_FILE_HDR_SIZE
905 + OS_FILE_LOG_BLOCK_SIZE));
906
907 group->file_header_bufs[i] = static_cast<byte*>(
908 ut_align(group->file_header_bufs_ptr[i],
909 OS_FILE_LOG_BLOCK_SIZE));
910 }
911
912 group->checkpoint_buf_ptr = static_cast<byte*>(
913 ut_zalloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));
914
915 group->checkpoint_buf = static_cast<byte*>(
916 ut_align(group->checkpoint_buf_ptr,OS_FILE_LOG_BLOCK_SIZE));
917
918 UT_LIST_ADD_LAST(log_sys->log_groups, group);
919
920 return(log_calc_max_ages());
921 }
922 #endif /* !UNIV_HOTBACKUP */
/******************************************************//**
Completes an i/o to a log file. */
void
log_io_complete(
/*============*/
	log_group_t*	group)	/*!< in: log group or a dummy pointer */
{
	if ((ulint) group & 0x1UL) {
		/* It was a checkpoint write: the submitter tagged the
		pointer by setting its lowest bit; strip the tag to recover
		the real group pointer. */
		group = (log_group_t*)((ulint) group - 1);

#ifdef _WIN32
		fil_flush(group->space_id);
#else
		/* Flush only for flush methods that do not already make
		the write durable. */
		switch (srv_unix_file_flush_method) {
		case SRV_UNIX_O_DSYNC:
		case SRV_UNIX_NOSYNC:
		case SRV_UNIX_ALL_O_DIRECT:
			break;
		case SRV_UNIX_FSYNC:
		case SRV_UNIX_LITTLESYNC:
		case SRV_UNIX_O_DIRECT:
		case SRV_UNIX_O_DIRECT_NO_FSYNC:
			fil_flush(group->space_id);
		}
#endif /* _WIN32 */

		DBUG_PRINT("ib_log", ("checkpoint info written to group %u",
				      unsigned(group->id)));
		log_io_complete_checkpoint();

		return;
	}

	ut_error;	/*!< We currently use synchronous writing of the
			logs and cannot end up here! */
}
960
/******************************************************//**
Writes a log file header to a log file space.
NOTE(review): this variant appears to write the legacy (pre-5.7, v0)
redo header layout — confirm against the format constants in log0log.h. */
static
void
log_group_file_header_flush_0(
/*========================*/
	log_group_t*	group,		/*!< in: log group */
	ulint		nth_file,	/*!< in: header to the nth file in the
					log file space */
	lsn_t		start_lsn)	/*!< in: log file data starts at this
					lsn */
{
	byte*	buf;
	lsn_t	dest_offset;

	/* log group number */
	static const uint GROUP_ID = 16;
	/* lsn of the start of data in this log file */
	static const uint FILE_START_LSN = 4;

	ut_ad(log_write_mutex_own());
	ut_ad(!recv_no_log_write);
	ut_a(nth_file < group->n_files);

	buf = *(group->file_header_bufs + nth_file);

	mach_write_to_4(buf + GROUP_ID, group->id);
	mach_write_to_8(buf + FILE_START_LSN, start_lsn);

	/* Wipe over possible label of mysqlbackup --restore */
	memcpy(buf + LOG_HEADER_CREATOR, "    ", 4);

	/* Store the log block size when it differs from the 512-byte
	default. */
	if (srv_log_block_size > 512) {
		mach_write_to_4(buf + LOG_FILE_OS_FILE_LOG_BLOCK_SIZE,
				srv_log_block_size);
	}

	dest_offset = nth_file * group->file_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF
			      " group " ULINTPF
			      " file " ULINTPF " header",
			      start_lsn, group->id, nth_file));

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	srv_stats.os_log_pending_writes.inc();

	/* Convert the byte offset within the log space into a page number
	plus an offset within that page. */
	const ulint	page_no
		= (ulint) (dest_offset / univ_page_size.physical());

	fil_io(IORequestLogWrite, true,
	       page_id_t(group->space_id, page_no),
	       univ_page_size,
	       (ulint) (dest_offset % univ_page_size.physical()),
	       OS_FILE_LOG_BLOCK_SIZE, buf, group);

	srv_stats.os_log_pending_writes.dec();
}
1022
/******************************************************//**
Writes a log file header to a log file space. */
static
void
log_group_file_header_flush(
/*========================*/
	log_group_t*	group,		/*!< in: log group */
	ulint		nth_file,	/*!< in: header to the nth file in the
					log file space */
	lsn_t		start_lsn)	/*!< in: log file data starts at this
					lsn */
{
	byte*	buf;
	lsn_t	dest_offset;

	ut_ad(log_write_mutex_own());
	ut_ad(!recv_no_log_write);
	ut_ad(group->id == 0);
	ut_a(nth_file < group->n_files);

	/* Delegate to the legacy writer when the old (v0) redo log format
	is in use. */
	if (redo_log_version == REDO_LOG_V0) {
		log_group_file_header_flush_0(group, nth_file, start_lsn);
		return;
	}

	buf = *(group->file_header_bufs + nth_file);

	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
	mach_write_to_4(buf + LOG_HEADER_FORMAT, LOG_HEADER_FORMAT_CURRENT);
	mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
	strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
	       LOG_HEADER_CREATOR_CURRENT);
	ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
	      >= sizeof LOG_HEADER_CREATOR_CURRENT);
	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));

	/* Store the log block size when it differs from the 512-byte
	default.  NOTE(review): this field is written after the checksum
	above was computed, so the checksum does not cover it — confirm
	the reader expects that. */
	if (srv_log_block_size > 512) {
		mach_write_to_4(buf + LOG_FILE_OS_FILE_LOG_BLOCK_SIZE,
				srv_log_block_size);
	}

	dest_offset = nth_file * group->file_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF
			      " group " ULINTPF
			      " file " ULINTPF " header",
			      start_lsn, group->id, nth_file));

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	srv_stats.os_log_pending_writes.inc();

	/* Convert the byte offset within the log space into a page number
	plus an offset within that page. */
	const ulint	page_no
		= (ulint) (dest_offset / univ_page_size.physical());

	fil_io(IORequestLogWrite, true,
	       page_id_t(group->space_id, page_no),
	       univ_page_size,
	       (ulint) (dest_offset % univ_page_size.physical()),
	       OS_FILE_LOG_BLOCK_SIZE, buf, group);

	srv_stats.os_log_pending_writes.dec();
}
1088
1089 /******************************************************//**
1090 Stores a 4-byte checksum to the trailer checksum field of a log block
1091 before writing it to a log file. This checksum is used in recovery to
1092 check the consistency of a log block. */
1093 static
1094 void
log_block_store_checksum(byte * block)1095 log_block_store_checksum(
1096 /*=====================*/
1097 byte* block) /*!< in/out: pointer to a log block */
1098 {
1099 log_block_set_checksum(block, log_block_calc_checksum(block));
1100 }
1101
1102 /******************************************************//**
1103 Writes a buffer to a log file group. */
1104 static
1105 void
log_group_write_buf(log_group_t * group,byte * buf,ulint len,ulint pad_len,lsn_t start_lsn,ulint new_data_offset)1106 log_group_write_buf(
1107 /*================*/
1108 log_group_t* group, /*!< in: log group */
1109 byte* buf, /*!< in: buffer */
1110 ulint len, /*!< in: buffer len; must be divisible
1111 by OS_FILE_LOG_BLOCK_SIZE */
1112 #ifdef UNIV_DEBUG
1113 ulint pad_len, /*!< in: pad len in the buffer len */
1114 #endif /* UNIV_DEBUG */
1115 lsn_t start_lsn, /*!< in: start lsn of the buffer; must
1116 be divisible by
1117 OS_FILE_LOG_BLOCK_SIZE */
1118 ulint new_data_offset)/*!< in: start offset of new data in
1119 buf: this parameter is used to decide
1120 if we have to write a new log file
1121 header */
1122 {
1123 ulint write_len;
1124 bool write_header = new_data_offset == 0;
1125 lsn_t next_offset;
1126 ulint i;
1127
1128 ut_ad(log_write_mutex_own());
1129 ut_ad(!recv_no_log_write);
1130 ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
1131 ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
1132
1133 loop:
1134 if (len == 0) {
1135
1136 return;
1137 }
1138
1139 next_offset = log_group_calc_lsn_offset(start_lsn, group);
1140
1141 if (write_header
1142 && next_offset % group->file_size == LOG_FILE_HDR_SIZE) {
1143 /* We start to write a new log file instance in the group */
1144
1145 ut_a(next_offset / group->file_size <= ULINT_MAX);
1146
1147 log_group_file_header_flush(group, (ulint)
1148 (next_offset / group->file_size),
1149 start_lsn);
1150 srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);
1151
1152 srv_stats.log_writes.inc();
1153 }
1154
1155 if ((next_offset % group->file_size) + len > group->file_size) {
1156
1157 /* if the above condition holds, then the below expression
1158 is < len which is ulint, so the typecast is ok */
1159 write_len = (ulint)
1160 (group->file_size - (next_offset % group->file_size));
1161 } else {
1162 write_len = len;
1163 }
1164
1165 DBUG_PRINT("ib_log",
1166 ("write " LSN_PF " to " LSN_PF
1167 ": group " ULINTPF " len " ULINTPF
1168 " blocks " ULINTPF ".." ULINTPF,
1169 start_lsn, next_offset,
1170 group->id, write_len,
1171 log_block_get_hdr_no(buf),
1172 log_block_get_hdr_no(
1173 buf + write_len
1174 - OS_FILE_LOG_BLOCK_SIZE)));
1175
1176 ut_ad(pad_len >= len
1177 || log_block_get_hdr_no(buf)
1178 == log_block_convert_lsn_to_no(start_lsn));
1179
1180 /* Calculate the checksums for each log block and write them to
1181 the trailer fields of the log blocks */
1182
1183 for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
1184 ut_ad(pad_len >= len
1185 || i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
1186 || log_block_get_hdr_no(
1187 buf + i * OS_FILE_LOG_BLOCK_SIZE)
1188 == log_block_get_hdr_no(buf) + i);
1189 log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
1190 }
1191
1192 log_sys->n_log_ios++;
1193
1194 MONITOR_INC(MONITOR_LOG_IO);
1195
1196 srv_stats.os_log_pending_writes.inc();
1197
1198 ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
1199
1200 const ulint page_no
1201 = (ulint) (next_offset / univ_page_size.physical());
1202
1203 fil_io(IORequestLogWrite, true,
1204 page_id_t(group->space_id, page_no),
1205 univ_page_size,
1206 (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
1207 group);
1208
1209 srv_stats.os_log_pending_writes.dec();
1210
1211 srv_stats.os_log_written.add(write_len);
1212 srv_stats.log_writes.inc();
1213
1214 if (write_len < len) {
1215 start_lsn += write_len;
1216 len -= write_len;
1217 buf += write_len;
1218
1219 write_header = true;
1220
1221 goto loop;
1222 }
1223 }
1224
1225 /** Flush the log has been written to the log file. */
1226 static
1227 void
log_write_flush_to_disk_low()1228 log_write_flush_to_disk_low()
1229 {
1230 ut_a(log_sys->n_pending_flushes == 1); /* No other threads here */
1231
1232 #ifndef _WIN32
1233 bool do_flush = srv_unix_file_flush_method != SRV_UNIX_O_DSYNC;
1234 #else
1235 bool do_flush = true;
1236 #endif
1237 if (do_flush) {
1238 log_group_t* group = UT_LIST_GET_FIRST(log_sys->log_groups);
1239 fil_flush(group->space_id);
1240 log_sys->flushed_to_disk_lsn = log_sys->current_flush_lsn;
1241 }
1242
1243 log_sys->n_pending_flushes--;
1244 MONITOR_DEC(MONITOR_PENDING_LOG_FLUSH);
1245
1246 os_event_set(log_sys->flush_event);
1247 }
1248
1249 /** Switch the log buffer in use, and copy the content of last block
1250 from old log buffer to the head of the to be used one. Thus, buf_free and
1251 buf_next_to_write would be changed accordingly */
1252 static inline
1253 void
log_buffer_switch()1254 log_buffer_switch()
1255 {
1256 ut_ad(log_mutex_own());
1257 ut_ad(log_write_mutex_own());
1258
1259 const byte* old_buf = log_sys->buf;
1260 ulint area_end = ut_calc_align(log_sys->buf_free,
1261 OS_FILE_LOG_BLOCK_SIZE);
1262
1263 if (log_sys->first_in_use) {
1264 ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr,
1265 OS_FILE_LOG_BLOCK_SIZE));
1266 log_sys->buf += log_sys->buf_size;
1267 } else {
1268 log_sys->buf -= log_sys->buf_size;
1269 ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr,
1270 OS_FILE_LOG_BLOCK_SIZE));
1271 }
1272
1273 log_sys->first_in_use = !log_sys->first_in_use;
1274
1275 /* Copy the last block to new buf */
1276 ut_memcpy(log_sys->buf,
1277 old_buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
1278 OS_FILE_LOG_BLOCK_SIZE);
1279
1280 log_sys->buf_free %= OS_FILE_LOG_BLOCK_SIZE;
1281 log_sys->buf_next_to_write = log_sys->buf_free;
1282 }
1283
1284 /** Ensure that the log has been written to the log file up to a given
1285 log entry (such as that of a transaction commit). Start a new write, or
1286 wait and check if an already running write is covering the request.
1287 @param[in] lsn log sequence number that should be
1288 included in the redo log file write
1289 @param[in] flush_to_disk whether the written log should also
1290 be flushed to the file system */
1291 void
log_write_up_to(lsn_t lsn,bool flush_to_disk)1292 log_write_up_to(
1293 lsn_t lsn,
1294 bool flush_to_disk)
1295 {
1296 #ifdef UNIV_DEBUG
1297 ulint loop_count = 0;
1298 #endif /* UNIV_DEBUG */
1299 byte* write_buf;
1300 lsn_t write_lsn;
1301
1302 ut_ad(!srv_read_only_mode);
1303
1304 if (recv_no_ibuf_operations) {
1305 /* Recovery is running and no operations on the log files are
1306 allowed yet (the variable name .._no_ibuf_.. is misleading) */
1307
1308 return;
1309 }
1310
1311 loop:
1312 ut_ad(++loop_count < 128);
1313
1314 #if UNIV_WORD_SIZE > 7
1315 /* We can do a dirty read of LSN. */
1316 /* NOTE: Currently doesn't do dirty read for
1317 (flush_to_disk == true) case, because the log_mutex
1318 contention also works as the arbitrator for write-IO
1319 (fsync) bandwidth between log files and data files. */
1320 os_rmb;
1321 if (!flush_to_disk && log_sys->write_lsn >= lsn) {
1322 return;
1323 }
1324 #endif
1325
1326 log_write_mutex_enter();
1327 ut_ad(!recv_no_log_write);
1328
1329 lsn_t limit_lsn = flush_to_disk
1330 ? log_sys->flushed_to_disk_lsn
1331 : log_sys->write_lsn;
1332
1333 if (limit_lsn >= lsn) {
1334 log_write_mutex_exit();
1335 return;
1336 }
1337
1338 #ifdef _WIN32
1339 # ifndef UNIV_HOTBACKUP
1340 /* write requests during fil_flush() might not be good for Windows */
1341 if (log_sys->n_pending_flushes > 0
1342 || !os_event_is_set(log_sys->flush_event)) {
1343 log_write_mutex_exit();
1344 os_event_wait(log_sys->flush_event);
1345 goto loop;
1346 }
1347 # else
1348 if (log_sys->n_pending_flushes > 0) {
1349 goto loop;
1350 }
1351 # endif /* !UNIV_HOTBACKUP */
1352 #endif /* _WIN32 */
1353
1354 /* If it is a write call we should just go ahead and do it
1355 as we checked that write_lsn is not where we'd like it to
1356 be. If we have to flush as well then we check if there is a
1357 pending flush and based on that we wait for it to finish
1358 before proceeding further. */
1359 if (flush_to_disk
1360 && (log_sys->n_pending_flushes > 0
1361 || !os_event_is_set(log_sys->flush_event))) {
1362
1363 /* Figure out if the current flush will do the job
1364 for us. */
1365 bool work_done = log_sys->current_flush_lsn >= lsn;
1366
1367 log_write_mutex_exit();
1368
1369 os_event_wait(log_sys->flush_event);
1370
1371 if (work_done) {
1372 return;
1373 } else {
1374 goto loop;
1375 }
1376 }
1377
1378 log_mutex_enter();
1379 if (!flush_to_disk
1380 && log_sys->buf_free == log_sys->buf_next_to_write) {
1381 /* Nothing to write and no flush to disk requested */
1382 log_mutex_exit_all();
1383 return;
1384 }
1385
1386 log_group_t* group;
1387 ulint start_offset;
1388 ulint end_offset;
1389 ulint area_start;
1390 ulint area_end;
1391 ulong write_ahead_size = srv_log_write_ahead_size;
1392 ulint pad_size;
1393
1394 DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
1395 log_sys->write_lsn,
1396 log_sys->lsn));
1397
1398 if (flush_to_disk) {
1399 log_sys->n_pending_flushes++;
1400 log_sys->current_flush_lsn = log_sys->lsn;
1401 MONITOR_INC(MONITOR_PENDING_LOG_FLUSH);
1402 os_event_reset(log_sys->flush_event);
1403
1404 if (log_sys->buf_free == log_sys->buf_next_to_write) {
1405 /* Nothing to write, flush only */
1406 log_mutex_exit_all();
1407 log_write_flush_to_disk_low();
1408 return;
1409 }
1410 }
1411
1412 start_offset = log_sys->buf_next_to_write;
1413 end_offset = log_sys->buf_free;
1414
1415 area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE);
1416 area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE);
1417
1418 ut_ad(area_end - area_start > 0);
1419
1420 log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
1421 log_block_set_checkpoint_no(
1422 log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
1423 log_sys->next_checkpoint_no);
1424
1425 write_lsn = log_sys->lsn;
1426 write_buf = log_sys->buf;
1427
1428 log_buffer_switch();
1429
1430 group = UT_LIST_GET_FIRST(log_sys->log_groups);
1431
1432 log_group_set_fields(group, log_sys->write_lsn);
1433
1434 log_mutex_exit();
1435
1436 /* Calculate pad_size if needed. */
1437 pad_size = 0;
1438 if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
1439 lsn_t end_offset;
1440 ulint end_offset_in_unit;
1441
1442 end_offset = log_group_calc_lsn_offset(
1443 ut_uint64_align_up(write_lsn,
1444 OS_FILE_LOG_BLOCK_SIZE),
1445 group);
1446 end_offset_in_unit = (ulint) (end_offset % write_ahead_size);
1447
1448 if (end_offset_in_unit > 0
1449 && (area_end - area_start) > end_offset_in_unit) {
1450 /* The first block in the unit was initialized
1451 after the last writing.
1452 Needs to be written padded data once. */
1453 pad_size = write_ahead_size - end_offset_in_unit;
1454
1455 if (area_end + pad_size > log_sys->buf_size) {
1456 pad_size = log_sys->buf_size - area_end;
1457 }
1458
1459 ::memset(write_buf + area_end, 0, pad_size);
1460 }
1461 }
1462
1463 /* Do the write to the log files */
1464 log_group_write_buf(
1465 group, write_buf + area_start,
1466 area_end - area_start + pad_size,
1467 #ifdef UNIV_DEBUG
1468 pad_size,
1469 #endif /* UNIV_DEBUG */
1470 ut_uint64_align_down(log_sys->write_lsn,
1471 OS_FILE_LOG_BLOCK_SIZE),
1472 start_offset - area_start);
1473
1474 srv_stats.log_padded.add(pad_size);
1475
1476 log_sys->write_lsn = write_lsn;
1477
1478 #ifndef _WIN32
1479 if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC ||
1480 srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT) {
1481 /* O_DSYNC or SRV_UNIX_ALL_O_DIRECT means the OS did not buffer
1482 the log file at all: so we have also flushed to disk what we
1483 have written */
1484 log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
1485 }
1486 #endif /* !_WIN32 */
1487
1488 log_write_mutex_exit();
1489
1490 if (flush_to_disk) {
1491 log_write_flush_to_disk_low();
1492 }
1493 }
1494
1495 /** write to the log file up to the last log entry.
1496 @param[in] sync whether we want the written log
1497 also to be flushed to disk. */
1498 void
log_buffer_flush_to_disk(bool sync)1499 log_buffer_flush_to_disk(
1500 bool sync)
1501 {
1502 ut_ad(!srv_read_only_mode);
1503 log_write_up_to(log_get_lsn(), sync);
1504 }
1505
1506 /****************************************************************//**
1507 This functions writes the log buffer to the log file and if 'flush'
1508 is set it forces a flush of the log file as well. This is meant to be
1509 called from background master thread only as it does not wait for
1510 the write (+ possible flush) to finish. */
1511 void
log_buffer_sync_in_background(bool flush)1512 log_buffer_sync_in_background(
1513 /*==========================*/
1514 bool flush) /*!< in: flush the logs to disk */
1515 {
1516 lsn_t lsn;
1517
1518 log_mutex_enter();
1519
1520 lsn = log_sys->lsn;
1521
1522 if (flush
1523 && log_sys->n_pending_flushes > 0
1524 && log_sys->current_flush_lsn >= lsn) {
1525 /* The write + flush will write enough */
1526 log_mutex_exit();
1527 return;
1528 }
1529
1530 log_mutex_exit();
1531
1532 log_write_up_to(lsn, flush);
1533 }
1534
1535 /********************************************************************
1536
1537 Tries to establish a big enough margin of free space in the log buffer, such
1538 that a new log entry can be catenated without an immediate need for a flush. */
1539 static
1540 void
log_flush_margin(void)1541 log_flush_margin(void)
1542 /*==================*/
1543 {
1544 log_t* log = log_sys;
1545 lsn_t lsn = 0;
1546
1547 log_mutex_enter();
1548
1549 if (log->buf_free > log->max_buf_free) {
1550 /* We can write during flush */
1551 lsn = log->lsn;
1552 }
1553
1554 log_mutex_exit();
1555
1556 if (lsn) {
1557 log_write_up_to(lsn, false);
1558 }
1559 }
1560 #ifndef UNIV_HOTBACKUP
1561 /** Advances the smallest lsn for which there are unflushed dirty blocks in the
1562 buffer pool.
1563 NOTE: this function may only be called if the calling thread owns no
1564 synchronization objects!
1565 @param[in] new_oldest try to advance oldest_modified_lsn at least to
1566 this lsn
1567 @return false if there was a flush batch of the same type running,
1568 which means that we could not start this flush batch */
1569 static
1570 bool
log_preflush_pool_modified_pages(lsn_t new_oldest)1571 log_preflush_pool_modified_pages(
1572 lsn_t new_oldest)
1573 {
1574 bool success;
1575
1576 if (recv_recovery_on) {
1577 /* If the recovery is running, we must first apply all
1578 log records to their respective file pages to get the
1579 right modify lsn values to these pages: otherwise, there
1580 might be pages on disk which are not yet recovered to the
1581 current lsn, and even after calling this function, we could
1582 not know how up-to-date the disk version of the database is,
1583 and we could not make a new checkpoint on the basis of the
1584 info on the buffer pool only. */
1585
1586 recv_apply_hashed_log_recs(TRUE);
1587 }
1588
1589 if (new_oldest == LSN_MAX
1590 || !buf_page_cleaner_is_active
1591 || srv_is_being_started) {
1592
1593 ulint n_pages;
1594
1595 success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages);
1596
1597 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
1598
1599 if (!success) {
1600 MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
1601 }
1602
1603 MONITOR_INC_VALUE_CUMULATIVE(
1604 MONITOR_FLUSH_SYNC_TOTAL_PAGE,
1605 MONITOR_FLUSH_SYNC_COUNT,
1606 MONITOR_FLUSH_SYNC_PAGES,
1607 n_pages);
1608 } else {
1609 /* better to wait for flushed by page cleaner */
1610
1611 if (srv_flush_sync) {
1612 /* wake page cleaner for IO burst */
1613 buf_flush_request_force(new_oldest);
1614 }
1615
1616 buf_flush_wait_flushed(new_oldest);
1617
1618 success = true;
1619 }
1620
1621 return(success);
1622 }
1623 #endif /* !UNIV_HOTBACKUP */
1624 /******************************************************//**
1625 Completes a checkpoint. */
1626 static
1627 void
log_complete_checkpoint(void)1628 log_complete_checkpoint(void)
1629 /*=========================*/
1630 {
1631 ut_ad(log_mutex_own());
1632 ut_ad(log_sys->n_pending_checkpoint_writes == 0);
1633
1634 log_sys->next_checkpoint_no++;
1635
1636 log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
1637 MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
1638 log_sys->lsn - log_sys->last_checkpoint_lsn);
1639
1640 DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
1641 ", flushed to " LSN_PF,
1642 log_sys->last_checkpoint_lsn,
1643 log_sys->flushed_to_disk_lsn));
1644
1645 rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
1646 }
1647
1648 /******************************************************//**
1649 Completes an asynchronous checkpoint info write i/o to a log file. */
1650 static
1651 void
log_io_complete_checkpoint(void)1652 log_io_complete_checkpoint(void)
1653 /*============================*/
1654 {
1655 MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
1656
1657 log_mutex_enter();
1658
1659 ut_ad(log_sys->n_pending_checkpoint_writes > 0);
1660
1661 if (--log_sys->n_pending_checkpoint_writes == 0) {
1662 log_complete_checkpoint();
1663 }
1664
1665 log_mutex_exit();
1666 }
1667
/******************************************************//**
Writes the checkpoint info to a log group header in the legacy
(pre-MySQL-5.7.9) redo log format. */
static
void
log_group_checkpoint_0(
/*===================*/
	log_group_t*	group)	/*!< in: log group */
{
	ulint	fold;
	byte*	buf;
	lsn_t	dest_offset;

	/** Offset of the first checkpoint checksum */
	static const uint	CHECKSUM_1 = 288;
	/** Offset of the second checkpoint checksum */
	static const uint	CHECKSUM_2 = CHECKSUM_1 + 4;
	/** Most significant bits of the checkpoint offset */
	static const uint	OFFSET_HIGH32 = CHECKSUM_2 + 12;
	/** Least significant bits of the checkpoint offset */
	static const uint	OFFSET_LOW32 = 16;
	/** Checkpoint offset read by PS 5.5 */
	static const uint	ARCHIVED_LSN = 24;

	buf = group->checkpoint_buf;

	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);

	/* The legacy format stores the checkpoint offset split in two
	32-bit halves */
	lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
					       group);
	mach_write_to_4(buf + OFFSET_LOW32, lsn_offset & 0xFFFFFFFFUL);
	mach_write_to_4(buf + OFFSET_HIGH32, lsn_offset >> 32);

	mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);

	/* Store the offset also in the field read by PS 5.5 */
	mach_write_to_8(buf + ARCHIVED_LSN, lsn_offset);

	/* The legacy checksums are simple folds (ut_fold_binary), not
	the CRC-32 used by the current format */
	fold = ut_fold_binary(buf, CHECKSUM_1);
	mach_write_to_4(buf + CHECKSUM_1, fold);

	fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
			      CHECKSUM_2 - LOG_CHECKPOINT_LSN);
	mach_write_to_4(buf + CHECKSUM_2, fold);

	MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	if (log_sys->n_pending_checkpoint_writes++ == 0) {
		/* First pending write: take the X-latch that
		log_complete_checkpoint() releases when all checkpoint
		i/o has finished */
		rw_lock_x_lock_gen(&log_sys->checkpoint_lock,
				   LOG_CHECKPOINT);
	}

	/* Note: We alternate the physical place of the checkpoint info.
	See the (next_checkpoint_no & 1) below. */

	/* We send as the last parameter the group machine address
	added with 1, as we want to distinguish between a normal log
	file write and a checkpoint field write */

	fil_io(IORequestLogWrite, false,
	       page_id_t(group->space_id, 0),
	       univ_page_size,
	       (log_sys->next_checkpoint_no & 1)
	       ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
	       OS_FILE_LOG_BLOCK_SIZE,
	       buf, (byte*) group + 1);

	/* The low bit of the group pointer must be free for the +1 tag */
	ut_ad(((ulint) group & 0x1UL) == 0);
}
1738
1739 /******************************************************//**
1740 Writes the checkpoint info to a log group header. */
1741 static
1742 void
log_group_checkpoint(log_group_t * group)1743 log_group_checkpoint(
1744 /*=================*/
1745 log_group_t* group) /*!< in: log group */
1746 {
1747 lsn_t lsn_offset;
1748 byte* buf;
1749
1750 ut_ad(!srv_read_only_mode);
1751 ut_ad(LOG_CHECKPOINT_SIZE <= OS_FILE_LOG_BLOCK_SIZE);
1752 ut_ad(log_mutex_own());
1753
1754 DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
1755 " written to group " ULINTPF,
1756 log_sys->next_checkpoint_no,
1757 log_sys->next_checkpoint_lsn,
1758 group->id));
1759
1760 if (redo_log_version == REDO_LOG_V0) {
1761 log_group_checkpoint_0(group);
1762 return;
1763 }
1764
1765 buf = group->checkpoint_buf;
1766 memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
1767
1768 mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
1769 mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
1770
1771 lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
1772 group);
1773 mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
1774 mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
1775
1776 log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
1777
1778 MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);
1779
1780 log_sys->n_log_ios++;
1781
1782 MONITOR_INC(MONITOR_LOG_IO);
1783
1784 ut_ad(LOG_CHECKPOINT_1 < univ_page_size.physical());
1785 ut_ad(LOG_CHECKPOINT_2 < univ_page_size.physical());
1786
1787 if (log_sys->n_pending_checkpoint_writes++ == 0) {
1788 rw_lock_x_lock_gen(&log_sys->checkpoint_lock,
1789 LOG_CHECKPOINT);
1790 }
1791
1792 /* Note: We alternate the physical place of the checkpoint info.
1793 See the (next_checkpoint_no & 1) below. */
1794
1795 /* We send as the last parameter the group machine address
1796 added with 1, as we want to distinguish between a normal log
1797 file write and a checkpoint field write */
1798
1799 fil_io(IORequestLogWrite, false,
1800 page_id_t(group->space_id, 0),
1801 univ_page_size,
1802 (log_sys->next_checkpoint_no & 1)
1803 ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
1804 OS_FILE_LOG_BLOCK_SIZE,
1805 buf, (byte*) group + 1);
1806
1807 ut_ad(((ulint) group & 0x1UL) == 0);
1808 }
1809
1810 #ifdef UNIV_HOTBACKUP
1811 /******************************************************//**
1812 Writes info to a buffer of a log group when log files are created in
1813 backup restoration. */
1814 void
log_reset_first_header_and_checkpoint(byte * hdr_buf,ib_uint64_t start)1815 log_reset_first_header_and_checkpoint(
1816 /*==================================*/
1817 byte* hdr_buf,/*!< in: buffer which will be written to the
1818 start of the first log file */
1819 ib_uint64_t start) /*!< in: lsn of the start of the first log file;
1820 we pretend that there is a checkpoint at
1821 start + LOG_BLOCK_HDR_SIZE */
1822 {
1823 byte* buf;
1824 ib_uint64_t lsn;
1825
1826 mach_write_to_4(hdr_buf + LOG_HEADER_FORMAT,
1827 LOG_HEADER_FORMAT_CURRENT);
1828 mach_write_to_8(hdr_buf + LOG_HEADER_START_LSN, start);
1829
1830 lsn = start + LOG_BLOCK_HDR_SIZE;
1831
1832 /* Write the label of mysqlbackup --restore */
1833 strcpy((char*)hdr_buf + LOG_HEADER_CREATOR, LOG_HEADER_CREATOR_CURRENT);
1834 ut_sprintf_timestamp((char*) hdr_buf
1835 + (LOG_HEADER_CREATOR
1836 + (sizeof LOG_HEADER_CREATOR_CURRENT) - 1));
1837 buf = hdr_buf + LOG_CHECKPOINT_1;
1838 memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
1839
1840 /*mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0);*/
1841 mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);
1842
1843 mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET,
1844 LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
1845 mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);
1846
1847 log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
1848 }
1849 #endif /* UNIV_HOTBACKUP */
1850
1851 #ifndef UNIV_HOTBACKUP
1852 /** Read a log group header page to log_sys->checkpoint_buf.
1853 @param[in] group log group
1854 @param[in] header 0 or LOG_CHEKCPOINT_1 or LOG_CHECKPOINT2 */
1855 void
log_group_header_read(const log_group_t * group,ulint header)1856 log_group_header_read(
1857 const log_group_t* group,
1858 ulint header)
1859 {
1860 ut_ad(log_mutex_own());
1861
1862 log_sys->n_log_ios++;
1863
1864 MONITOR_INC(MONITOR_LOG_IO);
1865
1866 fil_io(IORequestLogRead, true,
1867 page_id_t(group->space_id, header / univ_page_size.physical()),
1868 univ_page_size, header % univ_page_size.physical(),
1869 OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
1870 }
1871
1872 /** Write checkpoint info to the log header and invoke log_mutex_exit().
1873 @param[in] sync whether to wait for the write to complete */
1874 void
log_write_checkpoint_info(bool sync)1875 log_write_checkpoint_info(
1876 bool sync)
1877 {
1878 log_group_t* group;
1879
1880 ut_ad(log_mutex_own());
1881
1882 if (!srv_read_only_mode) {
1883 for (group = UT_LIST_GET_FIRST(log_sys->log_groups);
1884 group;
1885 group = UT_LIST_GET_NEXT(log_groups, group)) {
1886
1887 log_group_checkpoint(group);
1888 }
1889 }
1890
1891 log_mutex_exit();
1892
1893 MONITOR_INC(MONITOR_NUM_CHECKPOINT);
1894
1895 if (sync) {
1896 /* Wait for the checkpoint write to complete */
1897 rw_lock_s_lock(&log_sys->checkpoint_lock);
1898 rw_lock_s_unlock(&log_sys->checkpoint_lock);
1899
1900 DEBUG_SYNC_C("checkpoint_completed");
1901
1902 DBUG_EXECUTE_IF(
1903 "crash_after_checkpoint",
1904 DBUG_SUICIDE(););
1905 }
1906 }
1907
1908 /** Set extra data to be written to the redo log during checkpoint.
1909 @param[in] buf data to be appended on checkpoint, or NULL
1910 @return pointer to previous data to be appended on checkpoint */
1911 mtr_buf_t*
log_append_on_checkpoint(mtr_buf_t * buf)1912 log_append_on_checkpoint(
1913 mtr_buf_t* buf)
1914 {
1915 log_mutex_enter();
1916 mtr_buf_t* old = log_sys->append_on_checkpoint;
1917 log_sys->append_on_checkpoint = buf;
1918 log_mutex_exit();
1919 return(old);
1920 }
1921
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log files. Use log_make_checkpoint_at() to flush also the pool.
@param[in]	sync		whether to wait for the write to complete
@param[in]	write_always	force a write even if no log
has been generated since the latest checkpoint
@return true if success, false if a checkpoint write was already running */
bool
log_checkpoint(
	bool	sync,
	bool	write_always)
{
	lsn_t	oldest_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_recovery_is_on()) {
		/* Apply all pending redo first so that page modify lsns
		are up to date before we compute the checkpoint lsn */
		recv_apply_hashed_log_recs(TRUE);
	}

#ifndef _WIN32
	switch (srv_unix_file_flush_method) {
	case SRV_UNIX_NOSYNC:
	case SRV_UNIX_ALL_O_DIRECT:
		/* No explicit flush of the data files here */
		break;
	case SRV_UNIX_O_DSYNC:
	case SRV_UNIX_FSYNC:
	case SRV_UNIX_LITTLESYNC:
	case SRV_UNIX_O_DIRECT:
	case SRV_UNIX_O_DIRECT_NO_FSYNC:
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
	}
#endif /* !_WIN32 */

	log_mutex_enter();

	ut_ad(!recv_no_log_write);
	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Because log also contains headers and dummy log records,
	log_buf_pool_get_oldest_modification() will return log_sys->lsn
	if the buffer pool contains no dirty buffers.
	We must make sure that the log is flushed up to that lsn.
	If there are dirty buffers in the buffer pool, then our
	write-ahead-logging algorithm ensures that the log has been
	flushed up to oldest_lsn. */

	ut_ad(oldest_lsn >= log_sys->last_checkpoint_lsn);
	if (!write_always
	    && oldest_lsn
	       <= log_sys->last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
		/* Do nothing, because nothing was logged (other than
		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
		log_mutex_exit();
		return(true);
	}

	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
	case some log records between the checkpoint and log_sys->lsn
	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
	except on clean shutdown, where the log will be empty after
	the checkpoint.

	It is important that we write out the redo log before any
	further dirty pages are flushed to the tablespace files. At
	this point, because log_mutex_own(), mtr_commit() in other
	threads will be blocked, and no pages can be added to the
	flush lists. */
	lsn_t		flush_lsn	= oldest_lsn;
	const bool	do_write
		= srv_shutdown_state == SRV_SHUTDOWN_NONE
		|| flush_lsn != log_sys->lsn;

	if (fil_names_clear(flush_lsn, do_write)) {
		/* MLOG_FILE_NAME records and a MLOG_CHECKPOINT marker
		were appended; flush everything up to the new lsn */
		ut_ad(log_sys->lsn >= flush_lsn + SIZE_OF_MLOG_CHECKPOINT);
		flush_lsn = log_sys->lsn;
	}

	log_mutex_exit();

	log_write_up_to(flush_lsn, true);

	DBUG_EXECUTE_IF(
		"using_wa_checkpoint_middle",
		if (write_always) {
			DEBUG_SYNC_C("wa_checkpoint_middle");

			const my_bool b = TRUE;
			buf_flush_page_cleaner_disabled_debug_update(
				NULL, NULL, NULL, &b);
			dict_stats_disabled_debug_update(
				NULL, NULL, NULL, &b);
			srv_master_thread_disabled_debug_update(
				NULL, NULL, NULL, &b);
		});

	log_mutex_enter();

	ut_ad(log_sys->flushed_to_disk_lsn >= flush_lsn);
	ut_ad(flush_lsn >= oldest_lsn);

	if (log_sys->last_checkpoint_lsn >= oldest_lsn) {
		/* Another thread checkpointed at least this far while
		we were not holding the mutex */
		log_mutex_exit();
		return(true);
	}

	if (log_sys->n_pending_checkpoint_writes > 0) {
		/* A checkpoint write is running */
		log_mutex_exit();

		if (sync) {
			/* Wait for the checkpoint write to complete */
			rw_lock_s_lock(&log_sys->checkpoint_lock);
			rw_lock_s_unlock(&log_sys->checkpoint_lock);
		}

		return(false);
	}

	log_sys->next_checkpoint_lsn = oldest_lsn;
	/* log_write_checkpoint_info() releases the log mutex */
	log_write_checkpoint_info(sync);
	ut_ad(!log_mutex_own());

	return(true);
}
2049
2050 /** Make a checkpoint at or after a specified LSN.
2051 @param[in] lsn the log sequence number, or LSN_MAX
2052 for the latest LSN
2053 @param[in] write_always force a write even if no log
2054 has been generated since the latest checkpoint */
2055 void
log_make_checkpoint_at(lsn_t lsn,bool write_always)2056 log_make_checkpoint_at(
2057 lsn_t lsn,
2058 bool write_always)
2059 {
2060 /* Preflush pages synchronously */
2061
2062 while (!log_preflush_pool_modified_pages(lsn)) {
2063 /* Flush as much as we can */
2064 }
2065
2066 while (!log_checkpoint(true, write_always)) {
2067 /* Force a checkpoint */
2068 }
2069 }
2070
2071 /****************************************************************//**
2072 Tries to establish a big enough margin of free space in the log groups, such
2073 that a new log entry can be catenated without an immediate need for a
2074 checkpoint. NOTE: this function may only be called if the calling thread
2075 owns no synchronization objects! */
2076 static
2077 void
log_checkpoint_margin(void)2078 log_checkpoint_margin(void)
2079 /*=======================*/
2080 {
2081 log_t* log = log_sys;
2082 lsn_t age;
2083 lsn_t checkpoint_age;
2084 ib_uint64_t advance;
2085 lsn_t oldest_lsn;
2086 bool success;
2087 loop:
2088 advance = 0;
2089
2090 log_mutex_enter();
2091 ut_ad(!recv_no_log_write);
2092
2093 if (!log->check_flush_or_checkpoint) {
2094 log_mutex_exit();
2095 return;
2096 }
2097
2098 oldest_lsn = log_buf_pool_get_oldest_modification();
2099
2100 age = log->lsn - oldest_lsn;
2101
2102 if (age > log->max_modified_age_sync) {
2103
2104 /* A flush is urgent: we have to do a synchronous preflush */
2105 advance = age - log->max_modified_age_sync;
2106 }
2107
2108 checkpoint_age = log->lsn - log->last_checkpoint_lsn;
2109
2110 bool checkpoint_sync;
2111 bool do_checkpoint;
2112
2113 if (checkpoint_age > log->max_checkpoint_age) {
2114 /* A checkpoint is urgent: we do it synchronously */
2115 checkpoint_sync = true;
2116 do_checkpoint = true;
2117 } else if (checkpoint_age > log->max_checkpoint_age_async) {
2118 /* A checkpoint is not urgent: do it asynchronously */
2119 do_checkpoint = true;
2120 checkpoint_sync = false;
2121 log->check_flush_or_checkpoint = false;
2122 } else {
2123 do_checkpoint = false;
2124 checkpoint_sync = false;
2125 log->check_flush_or_checkpoint = false;
2126 }
2127
2128 log_mutex_exit();
2129
2130 if (advance) {
2131 lsn_t new_oldest = oldest_lsn + advance;
2132
2133 success = log_preflush_pool_modified_pages(new_oldest);
2134
2135 /* If the flush succeeded, this thread has done its part
2136 and can proceed. If it did not succeed, there was another
2137 thread doing a flush at the same time. */
2138 if (!success) {
2139 log_mutex_enter();
2140
2141 log->check_flush_or_checkpoint = true;
2142
2143 log_mutex_exit();
2144 goto loop;
2145 }
2146 }
2147
2148 if (do_checkpoint) {
2149 log_checkpoint(checkpoint_sync, FALSE);
2150
2151 if (checkpoint_sync) {
2152
2153 goto loop;
2154 }
2155 }
2156 }
2157
2158 /******************************************************//**
2159 Reads a specified log segment to a buffer. */
2160 void
log_group_read_log_seg(byte * buf,log_group_t * group,lsn_t start_lsn,lsn_t end_lsn)2161 log_group_read_log_seg(
2162 /*===================*/
2163 byte* buf, /*!< in: buffer where to read */
2164 log_group_t* group, /*!< in: log group */
2165 lsn_t start_lsn, /*!< in: read area start */
2166 lsn_t end_lsn) /*!< in: read area end */
2167 {
2168 ulint len;
2169 lsn_t source_offset;
2170
2171 ut_ad(log_mutex_own());
2172
2173 loop:
2174 source_offset = log_group_calc_lsn_offset(start_lsn, group);
2175
2176 ut_a(end_lsn - start_lsn <= ULINT_MAX);
2177 len = (ulint) (end_lsn - start_lsn);
2178
2179 ut_ad(len != 0);
2180
2181 if ((source_offset % group->file_size) + len > group->file_size) {
2182
2183 /* If the above condition is true then len (which is ulint)
2184 is > the expression below, so the typecast is ok */
2185 len = (ulint) (group->file_size -
2186 (source_offset % group->file_size));
2187 }
2188
2189 log_sys->n_log_ios++;
2190
2191 MONITOR_INC(MONITOR_LOG_IO);
2192
2193 ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
2194
2195 const ulint page_no
2196 = (ulint) (source_offset / univ_page_size.physical());
2197
2198 fil_io(IORequestLogRead, true,
2199 page_id_t(group->space_id, page_no),
2200 univ_page_size,
2201 (ulint) (source_offset % univ_page_size.physical()),
2202 len, buf, NULL);
2203
2204 start_lsn += len;
2205 buf += len;
2206
2207 if (start_lsn != end_lsn) {
2208
2209 goto loop;
2210 }
2211 }
2212
2213 /**
2214 Checks that there is enough free space in the log to start a new query step.
2215 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
2216 function may only be called if the calling thread owns no synchronization
2217 objects! */
2218 void
log_check_margins(void)2219 log_check_margins(void)
2220 {
2221 bool check;
2222
2223 do {
2224 log_flush_margin();
2225 log_checkpoint_margin();
2226 log_mutex_enter();
2227 ut_ad(!recv_no_log_write);
2228 check = log_sys->check_flush_or_checkpoint;
2229 log_mutex_exit();
2230 } while (check);
2231 }
2232
/****************************************************************//**
Makes a checkpoint at the latest lsn and writes it to first page of each
data file in the database, so that we know that the file spaces contain
all modifications up to that lsn. This can only be called at database
shutdown. This function also writes all log in log files to the log archive.
The function repeatedly polls (goto loop) until every background activity
has quiesced; each retry sleeps 100 ms, and progress messages are printed
roughly once per 60 seconds (count > 600 iterations of 100 ms). */
void
logs_empty_and_mark_files_at_shutdown(void)
/*=======================================*/
{
	lsn_t			lsn;
	ulint			count = 0;	/* 100 ms poll iterations since last message */
	ulint			total_trx;
	ulint			pending_io;
	enum srv_thread_type	active_thd;
	const char*		thread_name;

	ib::info() << "Starting shutdown...";

	while (srv_fast_shutdown == 0 && trx_rollback_or_clean_is_active) {
		/* we should wait until rollback after recovery end
		for slow shutdown */
		os_thread_sleep(100000);
	}

	/* Wait until the master thread and all other operations are idle: our
	algorithm only works if the server is idle at shutdown */

	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
loop:
	os_thread_sleep(100000);

	count++;

	/* We need the monitor threads to stop before we proceed with
	a shutdown. */

	thread_name = srv_any_background_threads_are_active();

	if (thread_name != NULL) {
		/* Print a message every 60 seconds if we are waiting
		for the monitor thread to exit. Master and worker
		threads check will be done later. */

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << thread_name
				<< " to exit";
			count = 0;
		}

		goto loop;
	}

	/* Check that there are no longer transactions, except for
	PREPARED ones. We need this wait even for the 'very fast'
	shutdown, because the InnoDB layer may have committed or
	prepared transactions and we don't want to lose them. */

	total_trx = trx_sys_any_active_transactions();

	if (total_trx > 0) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << total_trx << " active"
				<< " transactions to finish";

			count = 0;
		}

		goto loop;
	}

	/* Check that the background threads are suspended */

	active_thd = srv_get_active_thread_type();

	if (active_thd != SRV_NONE
	    || (srv_fast_shutdown != 2
		&& trx_rollback_or_clean_is_active)) {

		if (active_thd == SRV_PURGE) {
			/* Wake the purge coordinator so it can notice the
			shutdown state and suspend itself. */
			srv_purge_wakeup();
		}

		/* The srv_lock_timeout_thread, srv_error_monitor_thread
		and srv_monitor_thread should already exit by now. The
		only threads to be suspended are the master threads
		and worker threads (purge threads). Print the thread
		type if any of such threads not in suspended mode */
		if (srv_print_verbose_log && count > 600) {
			const char*	thread_type = "<null>";

			switch (active_thd) {
			case SRV_NONE:
				/* active_thd is SRV_NONE here only when the
				rollback-or-clean thread is still running
				(the second half of the condition above). */
				thread_type = "rollback";
				break;
			case SRV_WORKER:
				thread_type = "worker threads";
				break;
			case SRV_MASTER:
				thread_type = "master thread";
				break;
			case SRV_PURGE:
				thread_type = "purge thread";
				break;
			}

			ib::info() << "Waiting for " << thread_type
				<< " to be suspended";

			count = 0;
		}

		goto loop;
	}

	/* At this point only page_cleaner should be active. We wait
	here to let it complete the flushing of the buffer pools
	before proceeding further. */
	srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
	count = 0;
	while (buf_page_cleaner_is_active) {
		++count;
		os_thread_sleep(100000);
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for page_cleaner to"
				" finish flushing of buffer pool";
			count = 0;
		}
	}

	/* Snapshot the pending I/O counters under the log mutex. */
	log_mutex_enter();
	const ulint	n_write	= log_sys->n_pending_checkpoint_writes;
	const ulint	n_flush	= log_sys->n_pending_flushes;
	log_mutex_exit();

	if (n_write != 0 || n_flush != 0) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Pending checkpoint_writes: " << n_write
				<< ". Pending log flush writes: " << n_flush;
			count = 0;
		}
		goto loop;
	}

	pending_io = buf_pool_check_no_pending_io();

	if (pending_io) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << pending_io << " buffer"
				" page I/Os to complete";
			count = 0;
		}

		goto loop;
	}

	if (srv_fast_shutdown == 2) {
		if (!srv_read_only_mode) {
			ib::info() << "MySQL has requested a very fast"
				" shutdown without flushing the InnoDB buffer"
				" pool to data files. At the next mysqld"
				" startup InnoDB will do a crash recovery!";

			/* In this fastest shutdown we do not flush the
			buffer pool:

			it is essentially a 'crash' of the InnoDB server.
			Make sure that the log is all flushed to disk, so
			that we can recover all committed transactions in
			a crash recovery. We must not write the lsn stamps
			to the data files, since at a startup InnoDB deduces
			from the stamps if the previous shutdown was clean. */

			log_buffer_flush_to_disk();

			/* Check that the background threads stay suspended */
			thread_name = srv_any_background_threads_are_active();

			if (thread_name != NULL) {
				ib::warn() << "Background thread "
					<< thread_name << " woke up during"
					" shutdown";
				goto loop;
			}
		}

		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

		fil_close_all_files();

		thread_name = srv_any_background_threads_are_active();

		ut_a(!thread_name);

		return;
	}

	if (!srv_read_only_mode) {
		/* Slow/normal shutdown: write a final checkpoint at the
		latest LSN, forcing a write even if no log was generated. */
		log_make_checkpoint_at(LSN_MAX, TRUE);
	}

	log_mutex_enter();

	lsn = log_sys->lsn;

	ut_ad(lsn >= log_sys->last_checkpoint_lsn);

	log_mutex_exit();

	/** If innodb_force_recovery is set to 6 then log_sys doesn't
	have recent checkpoint information. So last checkpoint lsn
	will never be equal to current lsn. */
	/* NOTE(review): last_checkpoint_lsn is read here without holding
	the log mutex — presumably safe because all log activity has
	quiesced at this point; verify. */
	const bool	is_last = ((srv_force_recovery == SRV_FORCE_NO_LOG_REDO
				    && lsn == log_sys->last_checkpoint_lsn
				    + LOG_BLOCK_HDR_SIZE)
				   || lsn == log_sys->last_checkpoint_lsn);

	if (!is_last) {
		/* More log was generated since the checkpoint: retry. */
		goto loop;
	}

	/* Check that the background threads stay suspended */
	thread_name = srv_any_background_threads_are_active();
	if (thread_name != NULL) {
		ib::warn() << "Background thread " << thread_name << " woke up"
			" during shutdown";

		goto loop;
	}

	if (!srv_read_only_mode) {
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
		fil_flush_file_spaces(FIL_TYPE_LOG);
	}

	/* The call fil_write_flushed_lsn() will bypass the buffer
	pool: therefore it is essential that the buffer pool has been
	completely flushed to disk! (We do not call fil_write... if the
	'very fast' shutdown is enabled.) */

	if (!buf_all_freed()) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for dirty buffer pages to be"
				" flushed";
			count = 0;
		}

		goto loop;
	}

	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

	/* Make some checks that the server really is quiet */
	srv_thread_type	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	bool	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);

	if (lsn < srv_start_lsn) {
		ib::error() << "Log sequence number at shutdown " << lsn
			<< " is lower than at startup " << srv_start_lsn
			<< "!";
	}

	srv_shutdown_lsn = lsn;

	if (!srv_read_only_mode) {
		/* Stamp the data files so the next startup can detect
		a clean shutdown. */
		fil_write_flushed_lsn_to_data_files(lsn);
	}

	fil_close_all_files();

	/* Make some checks that the server really is quiet */
	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);
}
2518
2519 /******************************************************//**
2520 Peeks the current lsn.
2521 @return TRUE if success, FALSE if could not get the log system mutex */
2522 ibool
log_peek_lsn(lsn_t * lsn)2523 log_peek_lsn(
2524 /*=========*/
2525 lsn_t* lsn) /*!< out: if returns TRUE, current lsn is here */
2526 {
2527 if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
2528 *lsn = log_sys->lsn;
2529
2530 log_mutex_exit();
2531
2532 return(TRUE);
2533 }
2534
2535 return(FALSE);
2536 }
2537
2538 /******************************************************//**
2539 Prints info of the log. */
2540 void
log_print(FILE * file)2541 log_print(
2542 /*======*/
2543 FILE* file) /*!< in: file where to print */
2544 {
2545 double time_elapsed;
2546 time_t current_time;
2547
2548 log_mutex_enter();
2549
2550 fprintf(file,
2551 "Log sequence number " LSN_PF "\n"
2552 "Log flushed up to " LSN_PF "\n"
2553 "Pages flushed up to " LSN_PF "\n"
2554 "Last checkpoint at " LSN_PF "\n",
2555 log_sys->lsn,
2556 log_sys->flushed_to_disk_lsn,
2557 log_buf_pool_get_oldest_modification(),
2558 log_sys->last_checkpoint_lsn);
2559
2560 current_time = time(NULL);
2561
2562 time_elapsed = difftime(current_time,
2563 log_sys->last_printout_time);
2564
2565 if (time_elapsed <= 0) {
2566 time_elapsed = 1;
2567 }
2568
2569 fprintf(file,
2570 ULINTPF " pending log flushes, "
2571 ULINTPF " pending chkp writes\n"
2572 ULINTPF " log i/o's done, %.2f log i/o's/second\n",
2573 log_sys->n_pending_flushes,
2574 log_sys->n_pending_checkpoint_writes,
2575 log_sys->n_log_ios,
2576 static_cast<double>(
2577 log_sys->n_log_ios - log_sys->n_log_ios_old)
2578 / time_elapsed);
2579
2580 log_sys->n_log_ios_old = log_sys->n_log_ios;
2581 log_sys->last_printout_time = current_time;
2582
2583 log_mutex_exit();
2584 }
2585
2586 /**********************************************************************//**
2587 Refreshes the statistics used to print per-second averages. */
2588 void
log_refresh_stats(void)2589 log_refresh_stats(void)
2590 /*===================*/
2591 {
2592 log_sys->n_log_ios_old = log_sys->n_log_ios;
2593 log_sys->last_printout_time = time(NULL);
2594 }
2595
2596 /********************************************************//**
2597 Closes a log group. */
2598 static
2599 void
log_group_close(log_group_t * group)2600 log_group_close(
2601 /*===========*/
2602 log_group_t* group) /* in,own: log group to close */
2603 {
2604 ulint i;
2605
2606 for (i = 0; i < group->n_files; i++) {
2607 ut_free(group->file_header_bufs_ptr[i]);
2608 }
2609
2610 ut_free(group->file_header_bufs_ptr);
2611 ut_free(group->file_header_bufs);
2612 ut_free(group->checkpoint_buf_ptr);
2613 ut_free(group);
2614 }
2615
2616 /********************************************************//**
2617 Closes all log groups. */
2618 void
log_group_close_all(void)2619 log_group_close_all(void)
2620 /*=====================*/
2621 {
2622 log_group_t* group;
2623
2624 group = UT_LIST_GET_FIRST(log_sys->log_groups);
2625
2626 while (UT_LIST_GET_LEN(log_sys->log_groups) > 0) {
2627 log_group_t* prev_group = group;
2628
2629 group = UT_LIST_GET_NEXT(log_groups, group);
2630
2631 UT_LIST_REMOVE(log_sys->log_groups, prev_group);
2632
2633 log_group_close(prev_group);
2634 }
2635 }
2636
/********************************************************//**
Shutdown the log system but do not release all the memory.
Frees the log groups, the log and checkpoint buffers, and destroys
the log system's events, latches and mutexes; the log_sys struct
itself is freed later by log_mem_free(). */
void
log_shutdown(void)
/*==============*/
{
	/* Close every log group first: they reference buffers that
	belong to the groups, not to log_sys. */
	log_group_close_all();

	/* Free the (aligned) log buffer and clear the pointers so a
	stale reference cannot be dereferenced afterwards. */
	ut_free(log_sys->buf_ptr);
	log_sys->buf_ptr = NULL;
	log_sys->buf = NULL;
	ut_free(log_sys->checkpoint_buf_ptr);
	log_sys->checkpoint_buf_ptr = NULL;
	log_sys->checkpoint_buf = NULL;

	os_event_destroy(log_sys->flush_event);

	rw_lock_free(&log_sys->checkpoint_lock);

	mutex_free(&log_sys->mutex);
	mutex_free(&log_sys->write_mutex);
	mutex_free(&log_sys->log_flush_order_mutex);

	/* Shut down the recovery subsystem as well. */
	recv_sys_close();
}
2662
2663 /********************************************************//**
2664 Free the log system data structures. */
2665 void
log_mem_free(void)2666 log_mem_free(void)
2667 /*==============*/
2668 {
2669 if (log_sys != NULL) {
2670 recv_sys_mem_free();
2671 ut_free(log_sys);
2672
2673 log_sys = NULL;
2674 }
2675 }
2676 #endif /* !UNIV_HOTBACKUP */
2677