1 /*****************************************************************************
2
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, Google Inc.
5
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License, version 2.0,
14 as published by the Free Software Foundation.
15
16 This program is also distributed with certain software (including
17 but not limited to OpenSSL) that is licensed under separate terms,
18 as designated in a particular file or component or in included license
19 documentation. The authors of MySQL hereby grant you an additional
20 permission to link the program and your derivative works with the
21 separately licensed software that they have included with MySQL.
22
23 This program is distributed in the hope that it will be useful,
24 but WITHOUT ANY WARRANTY; without even the implied warranty of
25 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 GNU General Public License, version 2.0, for more details.
27
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
31
32 *****************************************************************************/
33
34 /**************************************************//**
35 @file log/log0log.cc
36 Database log
37
38 Created 12/9/1995 Heikki Tuuri
39 *******************************************************/
40
41 #include "ha_prototypes.h"
42 #include <debug_sync.h>
43
44 #include "log0log.h"
45
46 #ifdef UNIV_NONINL
47 #include "log0log.ic"
48 #endif
49
50 #include "mem0mem.h"
51 #include "buf0buf.h"
52 #ifndef UNIV_HOTBACKUP
53 #include "buf0flu.h"
54 #include "srv0srv.h"
55 #include "log0recv.h"
56 #include "fil0fil.h"
57 #include "dict0boot.h"
58 #include "dict0stats_bg.h"
59 #include "srv0srv.h"
60 #include "srv0start.h"
61 #include "trx0sys.h"
62 #include "trx0trx.h"
63 #include "trx0roll.h"
64 #include "srv0mon.h"
65 #include "sync0sync.h"
66 #endif /* !UNIV_HOTBACKUP */
67
68 /*
69 General philosophy of InnoDB redo-logs:
70
1) Every change to the contents of a data page must be done
through mtr, which in mtr_commit() writes log records
to the InnoDB redo log.
74
75 2) Normally these changes are performed using a mlog_write_ulint()
76 or similar function.
77
78 3) In some page level operations only a code number of a
79 c-function and its parameters are written to the log to
80 reduce the size of the log.
81
82 3a) You should not add parameters to these kind of functions
83 (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse())
84
3b) You should not add functionality which changes the behavior
compared with the old implementation, or which depends on data
outside of the page. These kinds of functions should implement a
self-contained page transformation, and they should remain unchanged
unless you have very essential reasons to change the log
semantics or format.
91
92 */
93
/** Redo log system; created in log_init() */
log_t*	log_sys = NULL;

/** Whether to generate and require checksums on the redo log pages */
my_bool	innodb_log_checksums;

/** Pointer to the log checksum calculation function */
log_checksum_func_t log_checksum_algorithm_ptr;

/* These control how often we print warnings if the last checkpoint is too
old: the flag records that a warning was already printed, and the timestamp
rate-limits repeats (see log_close()) */
bool	log_has_printed_chkp_warning = false;
time_t	log_last_warning_time;

/* Same rate-limiting pair for the checkpoint-margin warning printed by
log_margin_checkpoint_age() */
bool	log_has_printed_chkp_margine_warning = false;
time_t	log_last_margine_warning_time;

/* A margin for free space in the log buffer before a log entry is catenated */
#define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)

/* Margins for free space in the log buffer after a log entry is catenated */
#define LOG_BUF_FLUSH_RATIO	2
#define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE)

/* This parameter controls asynchronous making of a new checkpoint; the value
should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */

#define LOG_POOL_CHECKPOINT_RATIO_ASYNC	32

/* This parameter controls synchronous preflushing of modified buffer pages */
#define LOG_POOL_PREFLUSH_RATIO_SYNC	16

/* The same ratio for asynchronous preflushing; this value should be less than
the previous */
#define LOG_POOL_PREFLUSH_RATIO_ASYNC	8

/* Codes used in unlocking flush latches */
#define LOG_UNLOCK_NONE_FLUSHED_LOCK	1
#define LOG_UNLOCK_FLUSH_LOCK		2

/******************************************************//**
Completes a checkpoint write i/o to a log file. */
static
void
log_io_complete_checkpoint(void);
/*============================*/
140
141 #ifndef UNIV_HOTBACKUP
142 /****************************************************************//**
143 Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
144 exists.
145 @return LSN of oldest modification */
146 static
147 lsn_t
log_buf_pool_get_oldest_modification(void)148 log_buf_pool_get_oldest_modification(void)
149 /*======================================*/
150 {
151 lsn_t lsn;
152
153 ut_ad(log_mutex_own());
154
155 lsn = buf_pool_get_oldest_modification();
156
157 if (!lsn) {
158
159 lsn = log_sys->lsn;
160 }
161
162 return(lsn);
163 }
164 #endif /* !UNIV_HOTBACKUP */
165
/** Extends the log buffer.
Called when a single mtr log record is too large to fit into the current
log buffer. Takes all log mutexes, flushes out any pending blocks, then
reallocates the buffer so that at least len bytes fit.
@param[in]	len	requested minimum size in bytes */
void
log_buffer_extend(
	ulint	len)
{
	ulint	move_start;
	ulint	move_end;
	byte	tmp_buf[OS_FILE_LOG_BLOCK_SIZE];

	log_mutex_enter_all();

	while (log_sys->is_extending) {
		/* Another thread is already extending the buffer:
		release the mutexes, help by flushing the buffer to
		disk, and re-check whether that extension already
		made the buffer large enough. */
		log_mutex_exit_all();

		log_buffer_flush_to_disk();

		log_mutex_enter_all();

		if (srv_log_buffer_size > len / UNIV_PAGE_SIZE) {
			/* Already extended enough by the others */
			log_mutex_exit_all();
			return;
		}
	}

	if (len >= log_sys->buf_size / 2) {
		DBUG_EXECUTE_IF("ib_log_buffer_is_short_crash",
				DBUG_SUICIDE(););

		/* log_buffer is too small. try to extend instead of crash. */
		ib::warn() << "The transaction log size is too large"
			" for innodb_log_buffer_size (" << len << " >= "
			<< LOG_BUFFER_SIZE << " / 2). Trying to extend it.";
	}

	log_sys->is_extending = true;

	while (ut_calc_align_down(log_sys->buf_free,
				  OS_FILE_LOG_BLOCK_SIZE)
	       != ut_calc_align_down(log_sys->buf_next_to_write,
				     OS_FILE_LOG_BLOCK_SIZE)) {
		/* Buffer might have >1 blocks to write still: keep
		flushing until only the current (last) block remains
		unwritten, so that it alone needs to be preserved. */
		log_mutex_exit_all();

		log_buffer_flush_to_disk();

		log_mutex_enter_all();
	}

	move_start = ut_calc_align_down(
		log_sys->buf_free,
		OS_FILE_LOG_BLOCK_SIZE);
	move_end = log_sys->buf_free;

	/* store the last (partially filled) log block in a stack buffer
	so it survives the reallocation below */
	ut_memcpy(tmp_buf, log_sys->buf + move_start,
		  move_end - move_start);

	/* Rebase the buffer offsets so they refer to the start of the
	preserved block */
	log_sys->buf_free -= move_start;
	log_sys->buf_next_to_write -= move_start;

	/* reallocate log buffer: publish the new size first, since
	LOG_BUFFER_SIZE below is derived from srv_log_buffer_size */
	srv_log_buffer_size = len / UNIV_PAGE_SIZE + 1;
	ut_free(log_sys->buf_ptr);

	log_sys->buf_size = LOG_BUFFER_SIZE;

	/* Allocate twice the buffer size plus one block for alignment;
	first_in_use below marks that the first half is active
	(same layout as in log_init()) */
	log_sys->buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE));
	log_sys->buf = static_cast<byte*>(
		ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	log_sys->first_in_use = true;

	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
		- LOG_BUF_FLUSH_MARGIN;

	/* restore the last log block at the start of the new buffer */
	ut_memcpy(log_sys->buf, tmp_buf, move_end - move_start);

	ut_ad(log_sys->is_extending);
	log_sys->is_extending = false;

	log_mutex_exit_all();

	ib::info() << "innodb_log_buffer_size was extended to "
		<< LOG_BUFFER_SIZE << ".";
}
257
258 #ifndef UNIV_HOTBACKUP
259 /** Calculate actual length in redo buffer and file including
260 block header and trailer.
261 @param[in] len length to write
262 @return actual length to write including header and trailer. */
263 static inline
264 ulint
log_calculate_actual_len(ulint len)265 log_calculate_actual_len(
266 ulint len)
267 {
268 ut_ad(log_mutex_own());
269
270 /* actual length stored per block */
271 const ulint len_per_blk = OS_FILE_LOG_BLOCK_SIZE
272 - (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
273
274 /* actual data length in last block already written */
275 ulint extra_len = (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE);
276
277 ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
278 extra_len -= LOG_BLOCK_HDR_SIZE;
279
280 /* total extra length for block header and trailer */
281 extra_len = ((len + extra_len) / len_per_blk)
282 * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
283
284 return(len + extra_len);
285 }
286
/** Check margin not to overwrite transaction log from the last checkpoint.
If the estimated log write would exceed the log_group_capacity,
tries to make a checkpoint so that enough of the log is freed.
Called with the log mutex held; temporarily releases it.
@param[in]	len	length of the data to be written */

void
log_margin_checkpoint_age(
	ulint	len)
{
	/* len plus the block header/trailer overhead it will incur */
	ulint	margin = log_calculate_actual_len(len);

	ut_ad(log_mutex_own());

	if (margin > log_sys->log_group_capacity) {
		/* The single record cannot fit in the log files at all:
		return with a (rate-limited, at most every 15 s) warning
		output to avoid deadlock */
		if (!log_has_printed_chkp_margine_warning
		    || difftime(time(NULL),
				log_last_margine_warning_time) > 15) {
			log_has_printed_chkp_margine_warning = true;
			log_last_margine_warning_time = time(NULL);

			ib::error() << "The transaction log files are too"
				" small for the single transaction log (size="
				<< len << "). So, the last checkpoint age"
				" might exceed the log group capacity "
				<< log_sys->log_group_capacity << ".";
		}

		return;
	}

	/* Our margin check should ensure that we never reach this condition.
	Try to do checkpoint once. We cannot keep waiting here as it might
	result in hang in case the current mtr has latch on oldest lsn */
	if (log_sys->lsn - log_sys->last_checkpoint_lsn + margin
	    > log_sys->log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Makes checkpoint. */

		bool	flushed_enough = false;

		/* If the pages up to the needed lsn are already flushed,
		a checkpoint can advance immediately without page flushing */
		if (log_sys->lsn - log_buf_pool_get_oldest_modification()
		    + margin
		    <= log_sys->log_group_capacity) {
			flushed_enough = true;
		}

		log_sys->check_flush_or_checkpoint = true;
		log_mutex_exit();

		DEBUG_SYNC_C("margin_checkpoint_age_rescue");

		if (!flushed_enough) {
			/* Give the page cleaner a chance to flush */
			os_thread_sleep(100000);
		}
		log_checkpoint(true, false);

		log_mutex_enter();
	}

	return;
}
349 #endif /* !UNIV_HOTBACKUP */
/** Open the log for log_write_low. The log must be closed with log_close.
Waits (by releasing and re-taking the log mutex) while the buffer is being
extended or while there is not enough free space in it.
@param[in]	len	length of the data to be written
@return start lsn of the log record */
lsn_t
log_reserve_and_open(
	ulint	len)
{
	ulint	len_upper_limit;
#ifdef UNIV_DEBUG
	/* retry counter; the debug assertions below flag a livelock */
	ulint	count			= 0;
#endif /* UNIV_DEBUG */

loop:
	ut_ad(log_mutex_own());
	ut_ad(!recv_no_log_write);

	if (log_sys->is_extending) {
		log_mutex_exit();

		/* Log buffer size is extending. Writing up to the next block
		should wait for the extending finished. */

		os_thread_sleep(100000);

		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	/* Calculate an upper limit for the space the string may take in the
	log buffer (includes write-ahead padding and block overhead) */

	len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
			  + (5 * len) / 4;

	if (log_sys->buf_free + len_upper_limit > log_sys->buf_size) {
		log_mutex_exit();

		DEBUG_SYNC_C("log_buf_size_exceeded");

		/* Not enough free space, do a write of the log buffer */

		log_buffer_sync_in_background(false);

		srv_stats.log_waits.inc();

		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	return(log_sys->lsn);
}
405
406 /************************************************************//**
407 Writes to the log the string given. It is assumed that the caller holds the
408 log mutex. */
409 void
log_write_low(const byte * str,ulint str_len)410 log_write_low(
411 /*==========*/
412 const byte* str, /*!< in: string */
413 ulint str_len) /*!< in: string length */
414 {
415 log_t* log = log_sys;
416 ulint len;
417 ulint data_len;
418 byte* log_block;
419
420 ut_ad(log_mutex_own());
421 part_loop:
422 ut_ad(!recv_no_log_write);
423 /* Calculate a part length */
424
425 data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
426
427 if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
428
429 /* The string fits within the current log block */
430
431 len = str_len;
432 } else {
433 data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
434
435 len = OS_FILE_LOG_BLOCK_SIZE
436 - (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
437 - LOG_BLOCK_TRL_SIZE;
438 }
439
440 ut_memcpy(log->buf + log->buf_free, str, len);
441
442 str_len -= len;
443 str = str + len;
444
445 log_block = static_cast<byte*>(
446 ut_align_down(
447 log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
448
449 log_block_set_data_len(log_block, data_len);
450
451 if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
452 /* This block became full */
453 log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
454 log_block_set_checkpoint_no(log_block,
455 log_sys->next_checkpoint_no);
456 len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
457
458 log->lsn += len;
459
460 /* Initialize the next block header */
461 log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
462 } else {
463 log->lsn += len;
464 }
465
466 log->buf_free += len;
467
468 ut_ad(log->buf_free <= log->buf_size);
469
470 if (str_len > 0) {
471 goto part_loop;
472 }
473
474 srv_stats.log_write_requests.inc();
475 }
476
477 /************************************************************//**
478 Closes the log.
479 @return lsn */
480 lsn_t
log_close(void)481 log_close(void)
482 /*===========*/
483 {
484 byte* log_block;
485 ulint first_rec_group;
486 lsn_t oldest_lsn;
487 lsn_t lsn;
488 log_t* log = log_sys;
489 lsn_t checkpoint_age;
490
491 ut_ad(log_mutex_own());
492 ut_ad(!recv_no_log_write);
493
494 lsn = log->lsn;
495
496 log_block = static_cast<byte*>(
497 ut_align_down(
498 log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
499
500 first_rec_group = log_block_get_first_rec_group(log_block);
501
502 if (first_rec_group == 0) {
503 /* We initialized a new log block which was not written
504 full by the current mtr: the next mtr log record group
505 will start within this block at the offset data_len */
506
507 log_block_set_first_rec_group(
508 log_block, log_block_get_data_len(log_block));
509 }
510
511 if (log->buf_free > log->max_buf_free) {
512
513 log->check_flush_or_checkpoint = true;
514 }
515
516 checkpoint_age = lsn - log->last_checkpoint_lsn;
517
518 if (checkpoint_age >= log->log_group_capacity) {
519 DBUG_EXECUTE_IF(
520 "print_all_chkp_warnings",
521 log_has_printed_chkp_warning = false;);
522
523 if (!log_has_printed_chkp_warning
524 || difftime(time(NULL), log_last_warning_time) > 15) {
525
526 log_has_printed_chkp_warning = true;
527 log_last_warning_time = time(NULL);
528
529 ib::error() << "The age of the last checkpoint is "
530 << checkpoint_age << ", which exceeds the log"
531 " group capacity " << log->log_group_capacity
532 << ".";
533 }
534 }
535
536 if (checkpoint_age <= log->max_modified_age_sync) {
537
538 goto function_exit;
539 }
540
541 oldest_lsn = buf_pool_get_oldest_modification();
542
543 if (!oldest_lsn
544 || lsn - oldest_lsn > log->max_modified_age_sync
545 || checkpoint_age > log->max_checkpoint_age_async) {
546
547 log->check_flush_or_checkpoint = true;
548 }
549 function_exit:
550
551 return(lsn);
552 }
553
554 /******************************************************//**
555 Calculates the data capacity of a log group, when the log file headers are not
556 included.
557 @return capacity in bytes */
558 lsn_t
log_group_get_capacity(const log_group_t * group)559 log_group_get_capacity(
560 /*===================*/
561 const log_group_t* group) /*!< in: log group */
562 {
563 /* The lsn parameters are updated while holding both the mutexes
564 and it is ok to have either of them while reading */
565 ut_ad(log_mutex_own() || log_write_mutex_own());
566
567 return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files);
568 }
569
570 /******************************************************//**
571 Calculates the offset within a log group, when the log file headers are not
572 included.
573 @return size offset (<= offset) */
574 UNIV_INLINE
575 lsn_t
log_group_calc_size_offset(lsn_t offset,const log_group_t * group)576 log_group_calc_size_offset(
577 /*=======================*/
578 lsn_t offset, /*!< in: real offset within the
579 log group */
580 const log_group_t* group) /*!< in: log group */
581 {
582 /* The lsn parameters are updated while holding both the mutexes
583 and it is ok to have either of them while reading */
584 ut_ad(log_mutex_own() || log_write_mutex_own());
585
586 return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size));
587 }
588
589 /******************************************************//**
590 Calculates the offset within a log group, when the log file headers are
591 included.
592 @return real offset (>= offset) */
593 UNIV_INLINE
594 lsn_t
log_group_calc_real_offset(lsn_t offset,const log_group_t * group)595 log_group_calc_real_offset(
596 /*=======================*/
597 lsn_t offset, /*!< in: size offset within the
598 log group */
599 const log_group_t* group) /*!< in: log group */
600 {
601 /* The lsn parameters are updated while holding both the mutexes
602 and it is ok to have either of them while reading */
603 ut_ad(log_mutex_own() || log_write_mutex_own());
604
605 return(offset + LOG_FILE_HDR_SIZE
606 * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE)));
607 }
608
609 /** Calculate the offset of an lsn within a log group.
610 @param[in] lsn log sequence number
611 @param[in] group log group
612 @return offset within the log group */
613 lsn_t
log_group_calc_lsn_offset(lsn_t lsn,const log_group_t * group)614 log_group_calc_lsn_offset(
615 lsn_t lsn,
616 const log_group_t* group)
617 {
618 lsn_t gr_lsn;
619 lsn_t gr_lsn_size_offset;
620 lsn_t difference;
621 lsn_t group_size;
622 lsn_t offset;
623
624 /* The lsn parameters are updated while holding both the mutexes
625 and it is ok to have either of them while reading */
626 ut_ad(log_mutex_own() || log_write_mutex_own());
627
628 gr_lsn = group->lsn;
629
630 gr_lsn_size_offset = log_group_calc_size_offset(
631 group->lsn_offset, group);
632
633 group_size = log_group_get_capacity(group);
634
635 if (lsn >= gr_lsn) {
636
637 difference = lsn - gr_lsn;
638 } else {
639 difference = gr_lsn - lsn;
640
641 difference = difference % group_size;
642
643 difference = group_size - difference;
644 }
645
646 offset = (gr_lsn_size_offset + difference) % group_size;
647
648 /* fprintf(stderr,
649 "Offset is " LSN_PF " gr_lsn_offset is " LSN_PF
650 " difference is " LSN_PF "\n",
651 offset, gr_lsn_size_offset, difference);
652 */
653
654 return(log_group_calc_real_offset(offset, group));
655 }
656
657 /*******************************************************************//**
658 Calculates where in log files we find a specified lsn.
659 @return log file number */
660 ulint
log_calc_where_lsn_is(int64_t * log_file_offset,ib_uint64_t first_header_lsn,ib_uint64_t lsn,ulint n_log_files,int64_t log_file_size)661 log_calc_where_lsn_is(
662 /*==================*/
663 int64_t* log_file_offset, /*!< out: offset in that file
664 (including the header) */
665 ib_uint64_t first_header_lsn, /*!< in: first log file start
666 lsn */
667 ib_uint64_t lsn, /*!< in: lsn whose position to
668 determine */
669 ulint n_log_files, /*!< in: total number of log
670 files */
671 int64_t log_file_size) /*!< in: log file size
672 (including the header) */
673 {
674 int64_t capacity = log_file_size - LOG_FILE_HDR_SIZE;
675 ulint file_no;
676 int64_t add_this_many;
677
678 if (lsn < first_header_lsn) {
679 add_this_many = 1 + (first_header_lsn - lsn)
680 / (capacity * static_cast<int64_t>(n_log_files));
681 lsn += add_this_many
682 * capacity * static_cast<int64_t>(n_log_files);
683 }
684
685 ut_a(lsn >= first_header_lsn);
686
687 file_no = ((ulint)((lsn - first_header_lsn) / capacity))
688 % n_log_files;
689 *log_file_offset = (lsn - first_header_lsn) % capacity;
690
691 *log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE;
692
693 return(file_no);
694 }
695
696
697 /********************************************************//**
698 Sets the field values in group to correspond to a given lsn. For this function
699 to work, the values must already be correctly initialized to correspond to
700 some lsn, for instance, a checkpoint lsn. */
701 void
log_group_set_fields(log_group_t * group,lsn_t lsn)702 log_group_set_fields(
703 /*=================*/
704 log_group_t* group, /*!< in/out: group */
705 lsn_t lsn) /*!< in: lsn for which the values should be
706 set */
707 {
708 group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
709 group->lsn = lsn;
710 }
711 #ifndef UNIV_HOTBACKUP
712 /*****************************************************************//**
713 Calculates the recommended highest values for lsn - last_checkpoint_lsn
714 and lsn - buf_get_oldest_modification().
715 @retval true on success
716 @retval false if the smallest log group is too small to
717 accommodate the number of OS threads in the database server */
718 static MY_ATTRIBUTE((warn_unused_result))
719 bool
log_calc_max_ages(void)720 log_calc_max_ages(void)
721 /*===================*/
722 {
723 log_group_t* group;
724 lsn_t margin;
725 ulint free;
726 bool success = true;
727 lsn_t smallest_capacity;
728
729 log_mutex_enter();
730
731 group = UT_LIST_GET_FIRST(log_sys->log_groups);
732
733 ut_ad(group);
734
735 smallest_capacity = LSN_MAX;
736
737 while (group) {
738 if (log_group_get_capacity(group) < smallest_capacity) {
739
740 smallest_capacity = log_group_get_capacity(group);
741 }
742
743 group = UT_LIST_GET_NEXT(log_groups, group);
744 }
745
746 /* Add extra safety */
747 smallest_capacity = smallest_capacity - smallest_capacity / 10;
748
749 /* For each OS thread we must reserve so much free space in the
750 smallest log group that it can accommodate the log entries produced
751 by single query steps: running out of free log space is a serious
752 system error which requires rebooting the database. */
753
754 free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
755 + LOG_CHECKPOINT_EXTRA_FREE;
756 if (free >= smallest_capacity / 2) {
757 success = false;
758
759 goto failure;
760 } else {
761 margin = smallest_capacity - free;
762 }
763
764 margin = margin - margin / 10; /* Add still some extra safety */
765
766 log_sys->log_group_capacity = smallest_capacity;
767
768 log_sys->max_modified_age_async = margin
769 - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
770 log_sys->max_modified_age_sync = margin
771 - margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
772
773 log_sys->max_checkpoint_age_async = margin - margin
774 / LOG_POOL_CHECKPOINT_RATIO_ASYNC;
775 log_sys->max_checkpoint_age = margin;
776
777 failure:
778 log_mutex_exit();
779
780 if (!success) {
781 ib::error() << "Cannot continue operation. ib_logfiles are too"
782 " small for innodb_thread_concurrency "
783 << srv_thread_concurrency << ". The combined size of"
784 " ib_logfiles should be bigger than"
785 " 200 kB * innodb_thread_concurrency. To get mysqld"
786 " to start up, set innodb_thread_concurrency in"
787 " my.cnf to a lower value, for example, to 8. After"
788 " an ERROR-FREE shutdown of mysqld you can adjust"
789 " the size of ib_logfiles. " << INNODB_PARAMETERS_MSG;
790 }
791
792 return(success);
793 }
794
795 /******************************************************//**
796 Initializes the log. */
797 void
log_init(void)798 log_init(void)
799 /*==========*/
800 {
801 log_sys = static_cast<log_t*>(ut_zalloc_nokey(sizeof(log_t)));
802
803 mutex_create(LATCH_ID_LOG_SYS, &log_sys->mutex);
804 mutex_create(LATCH_ID_LOG_WRITE, &log_sys->write_mutex);
805
806 mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_sys->log_flush_order_mutex);
807
808 /* Start the lsn from one log block from zero: this way every
809 log record has a start lsn != zero, a fact which we will use */
810
811 log_sys->lsn = LOG_START_LSN;
812
813 ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
814 ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);
815
816 log_sys->buf_size = LOG_BUFFER_SIZE;
817
818 log_sys->buf_ptr = static_cast<byte*>(
819 ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE));
820 log_sys->buf = static_cast<byte*>(
821 ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
822
823 log_sys->first_in_use = true;
824
825 log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
826 - LOG_BUF_FLUSH_MARGIN;
827 log_sys->check_flush_or_checkpoint = true;
828 UT_LIST_INIT(log_sys->log_groups, &log_group_t::log_groups);
829
830 log_sys->n_log_ios_old = log_sys->n_log_ios;
831 log_sys->last_printout_time = time(NULL);
832 /*----------------------------*/
833
834 log_sys->write_lsn = log_sys->lsn;
835
836 log_sys->flush_event = os_event_create(0);
837
838 os_event_set(log_sys->flush_event);
839
840 /*----------------------------*/
841
842 log_sys->last_checkpoint_lsn = log_sys->lsn;
843
844 rw_lock_create(
845 checkpoint_lock_key, &log_sys->checkpoint_lock,
846 SYNC_NO_ORDER_CHECK);
847
848 log_sys->checkpoint_buf_ptr = static_cast<byte*>(
849 ut_zalloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));
850
851 log_sys->checkpoint_buf = static_cast<byte*>(
852 ut_align(log_sys->checkpoint_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
853
854 /*----------------------------*/
855
856 log_block_init(log_sys->buf, log_sys->lsn);
857 log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
858
859 log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
860 log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;
861
862 MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
863 log_sys->lsn - log_sys->last_checkpoint_lsn);
864 }
865
866 /******************************************************************//**
867 Inits a log group to the log system.
868 @return true if success, false if not */
869 MY_ATTRIBUTE((warn_unused_result))
870 bool
log_group_init(ulint id,ulint n_files,lsn_t file_size,ulint space_id)871 log_group_init(
872 /*===========*/
873 ulint id, /*!< in: group id */
874 ulint n_files, /*!< in: number of log files */
875 lsn_t file_size, /*!< in: log file size in bytes */
876 ulint space_id) /*!< in: space id of the file space
877 which contains the log files of this
878 group */
879 {
880 ulint i;
881 log_group_t* group;
882
883 group = static_cast<log_group_t*>(ut_malloc_nokey(sizeof(log_group_t)));
884
885 group->id = id;
886 group->n_files = n_files;
887 group->format = LOG_HEADER_FORMAT_CURRENT;
888 group->file_size = file_size;
889 group->space_id = space_id;
890 group->state = LOG_GROUP_OK;
891 group->lsn = LOG_START_LSN;
892 group->lsn_offset = LOG_FILE_HDR_SIZE;
893
894 group->file_header_bufs_ptr = static_cast<byte**>(
895 ut_zalloc_nokey(sizeof(byte*) * n_files));
896
897 group->file_header_bufs = static_cast<byte**>(
898 ut_zalloc_nokey(sizeof(byte**) * n_files));
899
900 for (i = 0; i < n_files; i++) {
901 group->file_header_bufs_ptr[i] = static_cast<byte*>(
902 ut_zalloc_nokey(LOG_FILE_HDR_SIZE
903 + OS_FILE_LOG_BLOCK_SIZE));
904
905 group->file_header_bufs[i] = static_cast<byte*>(
906 ut_align(group->file_header_bufs_ptr[i],
907 OS_FILE_LOG_BLOCK_SIZE));
908 }
909
910 group->checkpoint_buf_ptr = static_cast<byte*>(
911 ut_zalloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));
912
913 group->checkpoint_buf = static_cast<byte*>(
914 ut_align(group->checkpoint_buf_ptr,OS_FILE_LOG_BLOCK_SIZE));
915
916 UT_LIST_ADD_LAST(log_sys->log_groups, group);
917
918 return(log_calc_max_ages());
919 }
920 #endif /* !UNIV_HOTBACKUP */
921 /******************************************************//**
922 Completes an i/o to a log file. */
923 void
log_io_complete(log_group_t * group)924 log_io_complete(
925 /*============*/
926 log_group_t* group) /*!< in: log group or a dummy pointer */
927 {
928 if ((ulint) group & 0x1UL) {
929 /* It was a checkpoint write */
930 group = (log_group_t*)((ulint) group - 1);
931
932 #ifdef _WIN32
933 fil_flush(group->space_id);
934 #else
935 switch (srv_unix_file_flush_method) {
936 case SRV_UNIX_O_DSYNC:
937 case SRV_UNIX_NOSYNC:
938 break;
939 case SRV_UNIX_FSYNC:
940 case SRV_UNIX_LITTLESYNC:
941 case SRV_UNIX_O_DIRECT:
942 case SRV_UNIX_O_DIRECT_NO_FSYNC:
943 fil_flush(group->space_id);
944 }
945 #endif /* _WIN32 */
946
947 DBUG_PRINT("ib_log", ("checkpoint info written to group %u",
948 unsigned(group->id)));
949 log_io_complete_checkpoint();
950
951 return;
952 }
953
954 ut_error; /*!< We currently use synchronous writing of the
955 logs and cannot end up here! */
956 }
957
958 /******************************************************//**
959 Writes a log file header to a log file space. */
960 static
961 void
log_group_file_header_flush(log_group_t * group,ulint nth_file,lsn_t start_lsn)962 log_group_file_header_flush(
963 /*========================*/
964 log_group_t* group, /*!< in: log group */
965 ulint nth_file, /*!< in: header to the nth file in the
966 log file space */
967 lsn_t start_lsn) /*!< in: log file data starts at this
968 lsn */
969 {
970 byte* buf;
971 lsn_t dest_offset;
972
973 ut_ad(log_write_mutex_own());
974 ut_ad(!recv_no_log_write);
975 ut_ad(group->id == 0);
976 ut_a(nth_file < group->n_files);
977
978 buf = *(group->file_header_bufs + nth_file);
979
980 memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
981 mach_write_to_4(buf + LOG_HEADER_FORMAT, LOG_HEADER_FORMAT_CURRENT);
982 mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
983 strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
984 LOG_HEADER_CREATOR_CURRENT);
985 ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
986 >= sizeof LOG_HEADER_CREATOR_CURRENT);
987 log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
988
989 dest_offset = nth_file * group->file_size;
990
991 DBUG_PRINT("ib_log", ("write " LSN_PF
992 " group " ULINTPF
993 " file " ULINTPF " header",
994 start_lsn, group->id, nth_file));
995
996 log_sys->n_log_ios++;
997
998 MONITOR_INC(MONITOR_LOG_IO);
999
1000 srv_stats.os_log_pending_writes.inc();
1001
1002 const ulint page_no
1003 = (ulint) (dest_offset / univ_page_size.physical());
1004
1005 fil_io(IORequestLogWrite, true,
1006 page_id_t(group->space_id, page_no),
1007 univ_page_size,
1008 (ulint) (dest_offset % univ_page_size.physical()),
1009 OS_FILE_LOG_BLOCK_SIZE, buf, group);
1010
1011 srv_stats.os_log_pending_writes.dec();
1012 }
1013
1014 /******************************************************//**
1015 Stores a 4-byte checksum to the trailer checksum field of a log block
1016 before writing it to a log file. This checksum is used in recovery to
1017 check the consistency of a log block. */
1018 static
1019 void
log_block_store_checksum(byte * block)1020 log_block_store_checksum(
1021 /*=====================*/
1022 byte* block) /*!< in/out: pointer to a log block */
1023 {
1024 log_block_set_checksum(block, log_block_calc_checksum(block));
1025 }
1026
1027 /******************************************************//**
1028 Writes a buffer to a log file group. */
1029 static
1030 void
log_group_write_buf(log_group_t * group,byte * buf,ulint len,ulint pad_len,lsn_t start_lsn,ulint new_data_offset)1031 log_group_write_buf(
1032 /*================*/
1033 log_group_t* group, /*!< in: log group */
1034 byte* buf, /*!< in: buffer */
1035 ulint len, /*!< in: buffer len; must be divisible
1036 by OS_FILE_LOG_BLOCK_SIZE */
1037 #ifdef UNIV_DEBUG
1038 ulint pad_len, /*!< in: pad len in the buffer len */
1039 #endif /* UNIV_DEBUG */
1040 lsn_t start_lsn, /*!< in: start lsn of the buffer; must
1041 be divisible by
1042 OS_FILE_LOG_BLOCK_SIZE */
1043 ulint new_data_offset)/*!< in: start offset of new data in
1044 buf: this parameter is used to decide
1045 if we have to write a new log file
1046 header */
1047 {
1048 ulint write_len;
1049 bool write_header = new_data_offset == 0;
1050 lsn_t next_offset;
1051 ulint i;
1052
1053 ut_ad(log_write_mutex_own());
1054 ut_ad(!recv_no_log_write);
1055 ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
1056 ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
1057
1058 loop:
1059 if (len == 0) {
1060
1061 return;
1062 }
1063
1064 next_offset = log_group_calc_lsn_offset(start_lsn, group);
1065
1066 if (write_header
1067 && next_offset % group->file_size == LOG_FILE_HDR_SIZE) {
1068 /* We start to write a new log file instance in the group */
1069
1070 ut_a(next_offset / group->file_size <= ULINT_MAX);
1071
1072 log_group_file_header_flush(group, (ulint)
1073 (next_offset / group->file_size),
1074 start_lsn);
1075 srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);
1076
1077 srv_stats.log_writes.inc();
1078 }
1079
1080 if ((next_offset % group->file_size) + len > group->file_size) {
1081
1082 /* if the above condition holds, then the below expression
1083 is < len which is ulint, so the typecast is ok */
1084 write_len = (ulint)
1085 (group->file_size - (next_offset % group->file_size));
1086 } else {
1087 write_len = len;
1088 }
1089
1090 DBUG_PRINT("ib_log",
1091 ("write " LSN_PF " to " LSN_PF
1092 ": group " ULINTPF " len " ULINTPF
1093 " blocks " ULINTPF ".." ULINTPF,
1094 start_lsn, next_offset,
1095 group->id, write_len,
1096 log_block_get_hdr_no(buf),
1097 log_block_get_hdr_no(
1098 buf + write_len
1099 - OS_FILE_LOG_BLOCK_SIZE)));
1100
1101 ut_ad(pad_len >= len
1102 || log_block_get_hdr_no(buf)
1103 == log_block_convert_lsn_to_no(start_lsn));
1104
1105 /* Calculate the checksums for each log block and write them to
1106 the trailer fields of the log blocks */
1107
1108 for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
1109 ut_ad(pad_len >= len
1110 || i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
1111 || log_block_get_hdr_no(
1112 buf + i * OS_FILE_LOG_BLOCK_SIZE)
1113 == log_block_get_hdr_no(buf) + i);
1114 log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
1115 }
1116
1117 log_sys->n_log_ios++;
1118
1119 MONITOR_INC(MONITOR_LOG_IO);
1120
1121 srv_stats.os_log_pending_writes.inc();
1122
1123 ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
1124
1125 const ulint page_no
1126 = (ulint) (next_offset / univ_page_size.physical());
1127
1128 fil_io(IORequestLogWrite, true,
1129 page_id_t(group->space_id, page_no),
1130 univ_page_size,
1131 (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
1132 group);
1133
1134 srv_stats.os_log_pending_writes.dec();
1135
1136 srv_stats.os_log_written.add(write_len);
1137 srv_stats.log_writes.inc();
1138
1139 if (write_len < len) {
1140 start_lsn += write_len;
1141 len -= write_len;
1142 buf += write_len;
1143
1144 write_header = true;
1145
1146 goto loop;
1147 }
1148 }
1149
1150 /** Flush the log has been written to the log file. */
1151 static
1152 void
log_write_flush_to_disk_low()1153 log_write_flush_to_disk_low()
1154 {
1155 ut_a(log_sys->n_pending_flushes == 1); /* No other threads here */
1156
1157 #ifndef _WIN32
1158 bool do_flush = srv_unix_file_flush_method != SRV_UNIX_O_DSYNC;
1159 #else
1160 bool do_flush = true;
1161 #endif
1162 if (do_flush) {
1163 log_group_t* group = UT_LIST_GET_FIRST(log_sys->log_groups);
1164 fil_flush(group->space_id);
1165 log_sys->flushed_to_disk_lsn = log_sys->current_flush_lsn;
1166 }
1167
1168 log_sys->n_pending_flushes--;
1169 MONITOR_DEC(MONITOR_PENDING_LOG_FLUSH);
1170
1171 os_event_set(log_sys->flush_event);
1172 }
1173
1174 /** Switch the log buffer in use, and copy the content of last block
1175 from old log buffer to the head of the to be used one. Thus, buf_free and
1176 buf_next_to_write would be changed accordingly */
1177 static inline
1178 void
log_buffer_switch()1179 log_buffer_switch()
1180 {
1181 ut_ad(log_mutex_own());
1182 ut_ad(log_write_mutex_own());
1183
1184 const byte* old_buf = log_sys->buf;
1185 ulint area_end = ut_calc_align(log_sys->buf_free,
1186 OS_FILE_LOG_BLOCK_SIZE);
1187
1188 if (log_sys->first_in_use) {
1189 ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr,
1190 OS_FILE_LOG_BLOCK_SIZE));
1191 log_sys->buf += log_sys->buf_size;
1192 } else {
1193 log_sys->buf -= log_sys->buf_size;
1194 ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr,
1195 OS_FILE_LOG_BLOCK_SIZE));
1196 }
1197
1198 log_sys->first_in_use = !log_sys->first_in_use;
1199
1200 /* Copy the last block to new buf */
1201 ut_memcpy(log_sys->buf,
1202 old_buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
1203 OS_FILE_LOG_BLOCK_SIZE);
1204
1205 log_sys->buf_free %= OS_FILE_LOG_BLOCK_SIZE;
1206 log_sys->buf_next_to_write = log_sys->buf_free;
1207 }
1208
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@param[in]	lsn		log sequence number that should be
included in the redo log file write
@param[in]	flush_to_disk	whether the written log should also
be flushed to the file system */
void
log_write_up_to(
	lsn_t	lsn,
	bool	flush_to_disk)
{
#ifdef UNIV_DEBUG
	ulint		loop_count	= 0;
#endif /* UNIV_DEBUG */
	byte*		write_buf;
	lsn_t		write_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_no_ibuf_operations) {
		/* Recovery is running and no operations on the log files are
		allowed yet (the variable name .._no_ibuf_.. is misleading) */

		return;
	}

loop:
	/* Bounded retry loop: we come back here when we had to wait for
	a concurrent write/flush and must re-evaluate the state. */
	ut_ad(++loop_count < 128);

#if UNIV_WORD_SIZE > 7
	/* We can do a dirty read of LSN. */
	/* NOTE: Currently doesn't do dirty read for
	(flush_to_disk == true) case, because the log_mutex
	contention also works as the arbitrator for write-IO
	(fsync) bandwidth between log files and data files. */
	os_rmb;
	if (!flush_to_disk && log_sys->write_lsn >= lsn) {
		return;
	}
#endif

	log_write_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* If flushing is requested, the request is satisfied only once
	the LSN is durably on disk, not merely written. */
	lsn_t	limit_lsn = flush_to_disk
		? log_sys->flushed_to_disk_lsn
		: log_sys->write_lsn;

	if (limit_lsn >= lsn) {
		log_write_mutex_exit();
		return;
	}

#ifdef _WIN32
# ifndef UNIV_HOTBACKUP
	/* write requests during fil_flush() might not be good for Windows */
	if (log_sys->n_pending_flushes > 0
	    || !os_event_is_set(log_sys->flush_event)) {
		log_write_mutex_exit();
		os_event_wait(log_sys->flush_event);
		goto loop;
	}
# else
	if (log_sys->n_pending_flushes > 0) {
		goto loop;
	}
# endif /* !UNIV_HOTBACKUP */
#endif /* _WIN32 */

	/* If it is a write call we should just go ahead and do it
	as we checked that write_lsn is not where we'd like it to
	be. If we have to flush as well then we check if there is a
	pending flush and based on that we wait for it to finish
	before proceeding further. */
	if (flush_to_disk
	    && (log_sys->n_pending_flushes > 0
		|| !os_event_is_set(log_sys->flush_event))) {

		/* Figure out if the current flush will do the job
		for us. */
		bool work_done = log_sys->current_flush_lsn >= lsn;

		log_write_mutex_exit();

		os_event_wait(log_sys->flush_event);

		if (work_done) {
			return;
		} else {
			goto loop;
		}
	}

	log_mutex_enter();
	if (!flush_to_disk
	    && log_sys->buf_free == log_sys->buf_next_to_write) {
		/* Nothing to write and no flush to disk requested */
		log_mutex_exit_all();
		return;
	}

	log_group_t*	group;
	ulint		start_offset;
	ulint		end_offset;
	ulint		area_start;
	ulint		area_end;
	ulong		write_ahead_size = srv_log_write_ahead_size;
	ulint		pad_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
			      log_sys->write_lsn,
			      log_sys->lsn));

	if (flush_to_disk) {
		/* Register this thread as the sole pending flusher; the
		event is reset so that later callers wait on it. */
		log_sys->n_pending_flushes++;
		log_sys->current_flush_lsn = log_sys->lsn;
		MONITOR_INC(MONITOR_PENDING_LOG_FLUSH);
		os_event_reset(log_sys->flush_event);

		if (log_sys->buf_free == log_sys->buf_next_to_write) {
			/* Nothing to write, flush only */
			log_mutex_exit_all();
			log_write_flush_to_disk_low();
			return;
		}
	}

	start_offset = log_sys->buf_next_to_write;
	end_offset = log_sys->buf_free;

	/* Round the write area out to whole log blocks. */
	area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE);
	area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE);

	ut_ad(area_end - area_start > 0);

	log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
	log_block_set_checkpoint_no(
		log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		log_sys->next_checkpoint_no);

	/* Snapshot the buffer and LSN before switching buffers, so the
	I/O below can proceed without the log mutex. */
	write_lsn = log_sys->lsn;
	write_buf = log_sys->buf;

	log_buffer_switch();

	group = UT_LIST_GET_FIRST(log_sys->log_groups);

	log_group_set_fields(group, log_sys->write_lsn);

	log_mutex_exit();

	/* Calculate pad_size if needed. */
	pad_size = 0;
	if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
		lsn_t	end_offset;
		ulint	end_offset_in_unit;

		end_offset = log_group_calc_lsn_offset(
			ut_uint64_align_up(write_lsn,
					   OS_FILE_LOG_BLOCK_SIZE),
			group);
		end_offset_in_unit = (ulint) (end_offset % write_ahead_size);

		if (end_offset_in_unit > 0
		    && (area_end - area_start) > end_offset_in_unit) {
			/* The first block in the unit was initialized
			after the last writing.
			Needs to be written padded data once. */
			pad_size = write_ahead_size - end_offset_in_unit;

			if (area_end + pad_size > log_sys->buf_size) {
				pad_size = log_sys->buf_size - area_end;
			}

			::memset(write_buf + area_end, 0, pad_size);
		}
	}

	/* Do the write to the log files */
	log_group_write_buf(
		group, write_buf + area_start,
		area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
		pad_size,
#endif /* UNIV_DEBUG */
		ut_uint64_align_down(log_sys->write_lsn,
				     OS_FILE_LOG_BLOCK_SIZE),
		start_offset - area_start);

	srv_stats.log_padded.add(pad_size);

	log_sys->write_lsn = write_lsn;

#ifndef _WIN32
	if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
		/* O_SYNC means the OS did not buffer the log file at all:
		so we have also flushed to disk what we have written */
		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
	}
#endif /* !_WIN32 */

	log_write_mutex_exit();

	if (flush_to_disk) {
		log_write_flush_to_disk_low();
	}
}
1417
1418 /** write to the log file up to the last log entry.
1419 @param[in] sync whether we want the written log
1420 also to be flushed to disk. */
1421 void
log_buffer_flush_to_disk(bool sync)1422 log_buffer_flush_to_disk(
1423 bool sync)
1424 {
1425 ut_ad(!srv_read_only_mode);
1426 log_write_up_to(log_get_lsn(), sync);
1427 }
1428
1429 /****************************************************************//**
1430 This functions writes the log buffer to the log file and if 'flush'
1431 is set it forces a flush of the log file as well. This is meant to be
1432 called from background master thread only as it does not wait for
1433 the write (+ possible flush) to finish. */
1434 void
log_buffer_sync_in_background(bool flush)1435 log_buffer_sync_in_background(
1436 /*==========================*/
1437 bool flush) /*!< in: flush the logs to disk */
1438 {
1439 lsn_t lsn;
1440
1441 log_mutex_enter();
1442
1443 lsn = log_sys->lsn;
1444
1445 if (flush
1446 && log_sys->n_pending_flushes > 0
1447 && log_sys->current_flush_lsn >= lsn) {
1448 /* The write + flush will write enough */
1449 log_mutex_exit();
1450 return;
1451 }
1452
1453 log_mutex_exit();
1454
1455 log_write_up_to(lsn, flush);
1456 }
1457
1458 /********************************************************************
1459
1460 Tries to establish a big enough margin of free space in the log buffer, such
1461 that a new log entry can be catenated without an immediate need for a flush. */
1462 static
1463 void
log_flush_margin(void)1464 log_flush_margin(void)
1465 /*==================*/
1466 {
1467 log_t* log = log_sys;
1468 lsn_t lsn = 0;
1469
1470 log_mutex_enter();
1471
1472 if (log->buf_free > log->max_buf_free) {
1473 /* We can write during flush */
1474 lsn = log->lsn;
1475 }
1476
1477 log_mutex_exit();
1478
1479 if (lsn) {
1480 log_write_up_to(lsn, false);
1481 }
1482 }
1483 #ifndef UNIV_HOTBACKUP
/** Advances the smallest lsn for which there are unflushed dirty blocks in the
buffer pool.
NOTE: this function may only be called if the calling thread owns no
synchronization objects!
@param[in]	new_oldest	try to advance oldest_modified_lsn at least to
this lsn
@return false if there was a flush batch of the same type running,
which means that we could not start this flush batch */
static
bool
log_preflush_pool_modified_pages(
	lsn_t	new_oldest)
{
	bool	success;

	if (recv_recovery_on) {
		/* If the recovery is running, we must first apply all
		log records to their respective file pages to get the
		right modify lsn values to these pages: otherwise, there
		might be pages on disk which are not yet recovered to the
		current lsn, and even after calling this function, we could
		not know how up-to-date the disk version of the database is,
		and we could not make a new checkpoint on the basis of the
		info on the buffer pool only. */

		recv_apply_hashed_log_recs(TRUE);
	}

	if (new_oldest == LSN_MAX
	    || !buf_page_cleaner_is_active
	    || srv_is_being_started) {

		/* Flush synchronously in this thread: either everything
		was requested (LSN_MAX), or the page cleaner is not
		available to do it for us. */
		ulint	n_pages;

		success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages);

		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);

		if (!success) {
			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
		}

		MONITOR_INC_VALUE_CUMULATIVE(
			MONITOR_FLUSH_SYNC_TOTAL_PAGE,
			MONITOR_FLUSH_SYNC_COUNT,
			MONITOR_FLUSH_SYNC_PAGES,
			n_pages);
	} else {
		/* better to wait for flushed by page cleaner */

		if (srv_flush_sync) {
			/* wake page cleaner for IO burst */
			buf_flush_request_force(new_oldest);
		}

		buf_flush_wait_flushed(new_oldest);

		success = true;
	}

	return(success);
}
1546 #endif /* !UNIV_HOTBACKUP */
1547 /******************************************************//**
1548 Completes a checkpoint. */
1549 static
1550 void
log_complete_checkpoint(void)1551 log_complete_checkpoint(void)
1552 /*=========================*/
1553 {
1554 ut_ad(log_mutex_own());
1555 ut_ad(log_sys->n_pending_checkpoint_writes == 0);
1556
1557 log_sys->next_checkpoint_no++;
1558
1559 log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
1560 MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
1561 log_sys->lsn - log_sys->last_checkpoint_lsn);
1562
1563 DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
1564 ", flushed to " LSN_PF,
1565 log_sys->last_checkpoint_lsn,
1566 log_sys->flushed_to_disk_lsn));
1567
1568 rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
1569 }
1570
1571 /******************************************************//**
1572 Completes an asynchronous checkpoint info write i/o to a log file. */
1573 static
1574 void
log_io_complete_checkpoint(void)1575 log_io_complete_checkpoint(void)
1576 /*============================*/
1577 {
1578 MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
1579
1580 log_mutex_enter();
1581
1582 ut_ad(log_sys->n_pending_checkpoint_writes > 0);
1583
1584 if (--log_sys->n_pending_checkpoint_writes == 0) {
1585 log_complete_checkpoint();
1586 }
1587
1588 log_mutex_exit();
1589 }
1590
1591 /******************************************************//**
1592 Writes the checkpoint info to a log group header. */
1593 static
1594 void
log_group_checkpoint(log_group_t * group)1595 log_group_checkpoint(
1596 /*=================*/
1597 log_group_t* group) /*!< in: log group */
1598 {
1599 lsn_t lsn_offset;
1600 byte* buf;
1601
1602 ut_ad(!srv_read_only_mode);
1603 ut_ad(log_mutex_own());
1604 #if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE
1605 # error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE"
1606 #endif
1607
1608 DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
1609 " written to group " ULINTPF,
1610 log_sys->next_checkpoint_no,
1611 log_sys->next_checkpoint_lsn,
1612 group->id));
1613
1614 buf = group->checkpoint_buf;
1615 memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
1616
1617 mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
1618 mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
1619
1620 lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
1621 group);
1622 mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
1623 mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
1624
1625 log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
1626
1627 MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);
1628
1629 log_sys->n_log_ios++;
1630
1631 MONITOR_INC(MONITOR_LOG_IO);
1632
1633 ut_ad(LOG_CHECKPOINT_1 < univ_page_size.physical());
1634 ut_ad(LOG_CHECKPOINT_2 < univ_page_size.physical());
1635
1636 if (log_sys->n_pending_checkpoint_writes++ == 0) {
1637 rw_lock_x_lock_gen(&log_sys->checkpoint_lock,
1638 LOG_CHECKPOINT);
1639 }
1640
1641 /* Note: We alternate the physical place of the checkpoint info.
1642 See the (next_checkpoint_no & 1) below. */
1643
1644 /* We send as the last parameter the group machine address
1645 added with 1, as we want to distinguish between a normal log
1646 file write and a checkpoint field write */
1647
1648 fil_io(IORequestLogWrite, false,
1649 page_id_t(group->space_id, 0),
1650 univ_page_size,
1651 (log_sys->next_checkpoint_no & 1)
1652 ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
1653 OS_FILE_LOG_BLOCK_SIZE,
1654 buf, (byte*) group + 1);
1655
1656 ut_ad(((ulint) group & 0x1UL) == 0);
1657 }
1658
1659 #ifdef UNIV_HOTBACKUP
1660 /******************************************************//**
1661 Writes info to a buffer of a log group when log files are created in
1662 backup restoration. */
1663 void
log_reset_first_header_and_checkpoint(byte * hdr_buf,ib_uint64_t start)1664 log_reset_first_header_and_checkpoint(
1665 /*==================================*/
1666 byte* hdr_buf,/*!< in: buffer which will be written to the
1667 start of the first log file */
1668 ib_uint64_t start) /*!< in: lsn of the start of the first log file;
1669 we pretend that there is a checkpoint at
1670 start + LOG_BLOCK_HDR_SIZE */
1671 {
1672 byte* buf;
1673 ib_uint64_t lsn;
1674
1675 mach_write_to_4(hdr_buf + LOG_HEADER_FORMAT,
1676 LOG_HEADER_FORMAT_CURRENT);
1677 mach_write_to_8(hdr_buf + LOG_HEADER_START_LSN, start);
1678
1679 lsn = start + LOG_BLOCK_HDR_SIZE;
1680
1681 /* Write the label of mysqlbackup --restore */
1682 strcpy((char*)hdr_buf + LOG_HEADER_CREATOR, LOG_HEADER_CREATOR_CURRENT);
1683 ut_sprintf_timestamp((char*) hdr_buf
1684 + (LOG_HEADER_CREATOR
1685 + (sizeof LOG_HEADER_CREATOR_CURRENT) - 1));
1686 buf = hdr_buf + LOG_CHECKPOINT_1;
1687 memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
1688
1689 /*mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0);*/
1690 mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);
1691
1692 mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET,
1693 LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
1694 mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);
1695
1696 log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
1697 }
1698 #endif /* UNIV_HOTBACKUP */
1699
1700 #ifndef UNIV_HOTBACKUP
1701 /** Read a log group header page to log_sys->checkpoint_buf.
1702 @param[in] group log group
1703 @param[in] header 0 or LOG_CHEKCPOINT_1 or LOG_CHECKPOINT2 */
1704 void
log_group_header_read(const log_group_t * group,ulint header)1705 log_group_header_read(
1706 const log_group_t* group,
1707 ulint header)
1708 {
1709 ut_ad(log_mutex_own());
1710
1711 log_sys->n_log_ios++;
1712
1713 MONITOR_INC(MONITOR_LOG_IO);
1714
1715 fil_io(IORequestLogRead, true,
1716 page_id_t(group->space_id, header / univ_page_size.physical()),
1717 univ_page_size, header % univ_page_size.physical(),
1718 OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
1719 }
1720
1721 /** Write checkpoint info to the log header and invoke log_mutex_exit().
1722 @param[in] sync whether to wait for the write to complete */
1723 void
log_write_checkpoint_info(bool sync)1724 log_write_checkpoint_info(
1725 bool sync)
1726 {
1727 log_group_t* group;
1728
1729 ut_ad(log_mutex_own());
1730
1731 if (!srv_read_only_mode) {
1732 for (group = UT_LIST_GET_FIRST(log_sys->log_groups);
1733 group;
1734 group = UT_LIST_GET_NEXT(log_groups, group)) {
1735
1736 log_group_checkpoint(group);
1737 }
1738 }
1739
1740 log_mutex_exit();
1741
1742 MONITOR_INC(MONITOR_NUM_CHECKPOINT);
1743
1744 if (sync) {
1745 /* Wait for the checkpoint write to complete */
1746 rw_lock_s_lock(&log_sys->checkpoint_lock);
1747 rw_lock_s_unlock(&log_sys->checkpoint_lock);
1748
1749 DEBUG_SYNC_C("checkpoint_completed");
1750
1751 DBUG_EXECUTE_IF(
1752 "crash_after_checkpoint",
1753 DBUG_SUICIDE(););
1754 }
1755 }
1756
1757 /** Set extra data to be written to the redo log during checkpoint.
1758 @param[in] buf data to be appended on checkpoint, or NULL
1759 @return pointer to previous data to be appended on checkpoint */
1760 mtr_buf_t*
log_append_on_checkpoint(mtr_buf_t * buf)1761 log_append_on_checkpoint(
1762 mtr_buf_t* buf)
1763 {
1764 log_mutex_enter();
1765 mtr_buf_t* old = log_sys->append_on_checkpoint;
1766 log_sys->append_on_checkpoint = buf;
1767 log_mutex_exit();
1768 return(old);
1769 }
1770
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log files. Use log_make_checkpoint_at() to flush also the pool.
@param[in]	sync		whether to wait for the write to complete
@param[in]	write_always	force a write even if no log
has been generated since the latest checkpoint
@return true if success, false if a checkpoint write was already running */
bool
log_checkpoint(
	bool	sync,
	bool	write_always)
{
	lsn_t	oldest_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_recovery_is_on()) {
		/* Apply all pending redo records first so that page
		modification lsns are up to date. */
		recv_apply_hashed_log_recs(TRUE);
	}

#ifndef _WIN32
	switch (srv_unix_file_flush_method) {
	case SRV_UNIX_NOSYNC:
		break;
	case SRV_UNIX_O_DSYNC:
	case SRV_UNIX_FSYNC:
	case SRV_UNIX_LITTLESYNC:
	case SRV_UNIX_O_DIRECT:
	case SRV_UNIX_O_DIRECT_NO_FSYNC:
		/* Make sure previously flushed data pages are durable
		before advancing the checkpoint past them. */
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
	}
#endif /* !_WIN32 */

	log_mutex_enter();

	ut_ad(!recv_no_log_write);
	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Because log also contains headers and dummy log records,
	log_buf_pool_get_oldest_modification() will return log_sys->lsn
	if the buffer pool contains no dirty buffers.
	We must make sure that the log is flushed up to that lsn.
	If there are dirty buffers in the buffer pool, then our
	write-ahead-logging algorithm ensures that the log has been
	flushed up to oldest_lsn. */

	ut_ad(oldest_lsn >= log_sys->last_checkpoint_lsn);
	if (!write_always
	    && oldest_lsn
	    <= log_sys->last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
		/* Do nothing, because nothing was logged (other than
		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
		log_mutex_exit();
		return(true);
	}

	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
	case some log records between the checkpoint and log_sys->lsn
	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
	except on clean shutdown, where the log will be empty after
	the checkpoint.

	It is important that we write out the redo log before any
	further dirty pages are flushed to the tablespace files. At
	this point, because log_mutex_own(), mtr_commit() in other
	threads will be blocked, and no pages can be added to the
	flush lists. */
	lsn_t	flush_lsn	= oldest_lsn;
	const bool	do_write
		= srv_shutdown_state == SRV_SHUTDOWN_NONE
		|| flush_lsn != log_sys->lsn;

	if (fil_names_clear(flush_lsn, do_write)) {
		ut_ad(log_sys->lsn >= flush_lsn + SIZE_OF_MLOG_CHECKPOINT);
		flush_lsn = log_sys->lsn;
	}

	log_mutex_exit();

	/* Make the log durable up to flush_lsn before writing the
	checkpoint record that refers to it. */
	log_write_up_to(flush_lsn, true);

	DBUG_EXECUTE_IF(
		"using_wa_checkpoint_middle",
		if (write_always) {
			DEBUG_SYNC_C("wa_checkpoint_middle");

			const my_bool b = TRUE;
			buf_flush_page_cleaner_disabled_debug_update(
				NULL, NULL, NULL, &b);
			dict_stats_disabled_debug_update(
				NULL, NULL, NULL, &b);
			srv_master_thread_disabled_debug_update(
				NULL, NULL, NULL, &b);
		});

	log_mutex_enter();

	ut_ad(log_sys->flushed_to_disk_lsn >= flush_lsn);
	ut_ad(flush_lsn >= oldest_lsn);

	if (log_sys->last_checkpoint_lsn >= oldest_lsn) {
		/* Another thread already checkpointed past oldest_lsn
		while we released the mutex. */
		log_mutex_exit();
		return(true);
	}

	if (log_sys->n_pending_checkpoint_writes > 0) {
		/* A checkpoint write is running */
		log_mutex_exit();

		if (sync) {
			/* Wait for the checkpoint write to complete */
			rw_lock_s_lock(&log_sys->checkpoint_lock);
			rw_lock_s_unlock(&log_sys->checkpoint_lock);
		}

		return(false);
	}

	log_sys->next_checkpoint_lsn = oldest_lsn;
	/* log_write_checkpoint_info() releases the log mutex. */
	log_write_checkpoint_info(sync);
	ut_ad(!log_mutex_own());

	return(true);
}
1897
1898 /** Make a checkpoint at or after a specified LSN.
1899 @param[in] lsn the log sequence number, or LSN_MAX
1900 for the latest LSN
1901 @param[in] write_always force a write even if no log
1902 has been generated since the latest checkpoint */
1903 void
log_make_checkpoint_at(lsn_t lsn,bool write_always)1904 log_make_checkpoint_at(
1905 lsn_t lsn,
1906 bool write_always)
1907 {
1908 /* Preflush pages synchronously */
1909
1910 while (!log_preflush_pool_modified_pages(lsn)) {
1911 /* Flush as much as we can */
1912 }
1913
1914 while (!log_checkpoint(true, write_always)) {
1915 /* Force a checkpoint */
1916 }
1917 }
1918
1919 /****************************************************************//**
1920 Tries to establish a big enough margin of free space in the log groups, such
1921 that a new log entry can be catenated without an immediate need for a
1922 checkpoint. NOTE: this function may only be called if the calling thread
1923 owns no synchronization objects! */
1924 static
1925 void
log_checkpoint_margin(void)1926 log_checkpoint_margin(void)
1927 /*=======================*/
1928 {
1929 log_t* log = log_sys;
1930 lsn_t age;
1931 lsn_t checkpoint_age;
1932 ib_uint64_t advance;
1933 lsn_t oldest_lsn;
1934 bool success;
1935 loop:
1936 advance = 0;
1937
1938 log_mutex_enter();
1939 ut_ad(!recv_no_log_write);
1940
1941 if (!log->check_flush_or_checkpoint) {
1942 log_mutex_exit();
1943 return;
1944 }
1945
1946 oldest_lsn = log_buf_pool_get_oldest_modification();
1947
1948 age = log->lsn - oldest_lsn;
1949
1950 if (age > log->max_modified_age_sync) {
1951
1952 /* A flush is urgent: we have to do a synchronous preflush */
1953 advance = age - log->max_modified_age_sync;
1954 }
1955
1956 checkpoint_age = log->lsn - log->last_checkpoint_lsn;
1957
1958 bool checkpoint_sync;
1959 bool do_checkpoint;
1960
1961 if (checkpoint_age > log->max_checkpoint_age) {
1962 /* A checkpoint is urgent: we do it synchronously */
1963 checkpoint_sync = true;
1964 do_checkpoint = true;
1965 } else if (checkpoint_age > log->max_checkpoint_age_async) {
1966 /* A checkpoint is not urgent: do it asynchronously */
1967 do_checkpoint = true;
1968 checkpoint_sync = false;
1969 log->check_flush_or_checkpoint = false;
1970 } else {
1971 do_checkpoint = false;
1972 checkpoint_sync = false;
1973 log->check_flush_or_checkpoint = false;
1974 }
1975
1976 log_mutex_exit();
1977
1978 if (advance) {
1979 lsn_t new_oldest = oldest_lsn + advance;
1980
1981 success = log_preflush_pool_modified_pages(new_oldest);
1982
1983 /* If the flush succeeded, this thread has done its part
1984 and can proceed. If it did not succeed, there was another
1985 thread doing a flush at the same time. */
1986 if (!success) {
1987 log_mutex_enter();
1988
1989 log->check_flush_or_checkpoint = true;
1990
1991 log_mutex_exit();
1992 goto loop;
1993 }
1994 }
1995
1996 if (do_checkpoint) {
1997 log_checkpoint(checkpoint_sync, FALSE);
1998
1999 if (checkpoint_sync) {
2000
2001 goto loop;
2002 }
2003 }
2004 }
2005
2006 /******************************************************//**
2007 Reads a specified log segment to a buffer. */
2008 void
log_group_read_log_seg(byte * buf,log_group_t * group,lsn_t start_lsn,lsn_t end_lsn)2009 log_group_read_log_seg(
2010 /*===================*/
2011 byte* buf, /*!< in: buffer where to read */
2012 log_group_t* group, /*!< in: log group */
2013 lsn_t start_lsn, /*!< in: read area start */
2014 lsn_t end_lsn) /*!< in: read area end */
2015 {
2016 ulint len;
2017 lsn_t source_offset;
2018
2019 ut_ad(log_mutex_own());
2020
2021 loop:
2022 source_offset = log_group_calc_lsn_offset(start_lsn, group);
2023
2024 ut_a(end_lsn - start_lsn <= ULINT_MAX);
2025 len = (ulint) (end_lsn - start_lsn);
2026
2027 ut_ad(len != 0);
2028
2029 if ((source_offset % group->file_size) + len > group->file_size) {
2030
2031 /* If the above condition is true then len (which is ulint)
2032 is > the expression below, so the typecast is ok */
2033 len = (ulint) (group->file_size -
2034 (source_offset % group->file_size));
2035 }
2036
2037 log_sys->n_log_ios++;
2038
2039 MONITOR_INC(MONITOR_LOG_IO);
2040
2041 ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
2042
2043 const ulint page_no
2044 = (ulint) (source_offset / univ_page_size.physical());
2045
2046 fil_io(IORequestLogRead, true,
2047 page_id_t(group->space_id, page_no),
2048 univ_page_size,
2049 (ulint) (source_offset % univ_page_size.physical()),
2050 len, buf, NULL);
2051
2052 start_lsn += len;
2053 buf += len;
2054
2055 if (start_lsn != end_lsn) {
2056
2057 goto loop;
2058 }
2059 }
2060
2061 /**
2062 Checks that there is enough free space in the log to start a new query step.
2063 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
2064 function may only be called if the calling thread owns no synchronization
2065 objects! */
2066 void
log_check_margins(void)2067 log_check_margins(void)
2068 {
2069 bool check;
2070
2071 do {
2072 log_flush_margin();
2073 log_checkpoint_margin();
2074 log_mutex_enter();
2075 ut_ad(!recv_no_log_write);
2076 check = log_sys->check_flush_or_checkpoint;
2077 log_mutex_exit();
2078 } while (check);
2079 }
2080
2081 /****************************************************************//**
2082 Makes a checkpoint at the latest lsn and writes it to first page of each
2083 data file in the database, so that we know that the file spaces contain
2084 all modifications up to that lsn. This can only be called at database
2085 shutdown. This function also writes all log in log files to the log archive. */
void
logs_empty_and_mark_files_at_shutdown(void)
/*=======================================*/
{
	lsn_t			lsn;
	ulint			count = 0;	/* 100 ms sleep periods waited;
						> 600 means roughly a minute */
	ulint			total_trx;
	ulint			pending_io;
	enum srv_thread_type	active_thd;
	const char*		thread_name;

	ib::info() << "Starting shutdown...";

	/* For a slow (innodb_fast_shutdown=0) shutdown, first let the
	rollback-after-recovery thread finish its work. */
	while (srv_fast_shutdown == 0 && trx_rollback_or_clean_is_active) {
		/* we should wait until rollback after recovery end
		for slow shutdown */
		os_thread_sleep(100000);
	}

	/* Wait until the master thread and all other operations are idle: our
	algorithm only works if the server is idle at shutdown */

	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
loop:
	os_thread_sleep(100000);

	count++;

	/* We need the monitor threads to stop before we proceed with
	a shutdown. */

	thread_name = srv_any_background_threads_are_active();

	if (thread_name != NULL) {
		/* Print a message every 60 seconds if we are waiting
		for the monitor thread to exit. Master and worker
		threads check will be done later. */

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << thread_name
				<< " to exit";
			count = 0;
		}

		goto loop;
	}

	/* Check that there are no longer transactions, except for
	PREPARED ones. We need this wait even for the 'very fast'
	shutdown, because the InnoDB layer may have committed or
	prepared transactions and we don't want to lose them. */

	total_trx = trx_sys_any_active_transactions();

	if (total_trx > 0) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << total_trx << " active"
				<< " transactions to finish";

			count = 0;
		}

		goto loop;
	}

	/* Check that the background threads are suspended */

	active_thd = srv_get_active_thread_type();

	if (active_thd != SRV_NONE) {

		if (active_thd == SRV_PURGE) {
			/* Nudge the purge coordinator so it can notice the
			shutdown state and suspend itself. */
			srv_purge_wakeup();
		}

		/* The srv_lock_timeout_thread, srv_error_monitor_thread
		and srv_monitor_thread should already exit by now. The
		only threads to be suspended are the master threads
		and worker threads (purge threads). Print the thread
		type if any of such threads not in suspended mode */
		if (srv_print_verbose_log && count > 600) {
			const char*	thread_type = "<null>";

			switch (active_thd) {
			case SRV_NONE:
				/* This shouldn't happen because we've
				already checked for this case before
				entering the if(). We handle it here
				to avoid a compiler warning. */
				ut_error;
			case SRV_WORKER:
				thread_type = "worker threads";
				break;
			case SRV_MASTER:
				thread_type = "master thread";
				break;
			case SRV_PURGE:
				thread_type = "purge thread";
				break;
			}

			ib::info() << "Waiting for " << thread_type
				<< " to be suspended";

			count = 0;
		}

		goto loop;
	}

	/* At this point only page_cleaner should be active. We wait
	here to let it complete the flushing of the buffer pools
	before proceeding further. */
	srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
	count = 0;
	while (buf_page_cleaner_is_active) {
		++count;
		os_thread_sleep(100000);
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for page_cleaner to"
				" finish flushing of buffer pool";
			count = 0;
		}
	}

	/* Sample the pending log I/O counters under the log mutex;
	both must drain to zero before we can proceed. */
	log_mutex_enter();
	const ulint	n_write	= log_sys->n_pending_checkpoint_writes;
	const ulint	n_flush	= log_sys->n_pending_flushes;
	log_mutex_exit();

	if (n_write != 0 || n_flush != 0) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Pending checkpoint_writes: " << n_write
				<< ". Pending log flush writes: " << n_flush;
			count = 0;
		}
		goto loop;
	}

	pending_io = buf_pool_check_no_pending_io();

	if (pending_io) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << pending_io << " buffer"
				" page I/Os to complete";
			count = 0;
		}

		goto loop;
	}

	if (srv_fast_shutdown == 2) {
		if (!srv_read_only_mode) {
			ib::info() << "MySQL has requested a very fast"
				" shutdown without flushing the InnoDB buffer"
				" pool to data files. At the next mysqld"
				" startup InnoDB will do a crash recovery!";

			/* In this fastest shutdown we do not flush the
			buffer pool:

			it is essentially a 'crash' of the InnoDB server.
			Make sure that the log is all flushed to disk, so
			that we can recover all committed transactions in
			a crash recovery. We must not write the lsn stamps
			to the data files, since at a startup InnoDB deduces
			from the stamps if the previous shutdown was clean. */

			log_buffer_flush_to_disk();

			/* Check that the background threads stay suspended */
			thread_name = srv_any_background_threads_are_active();

			if (thread_name != NULL) {
				ib::warn() << "Background thread "
					<< thread_name << " woke up during"
					" shutdown";
				goto loop;
			}
		}

		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

		fil_close_all_files();

		thread_name = srv_any_background_threads_are_active();

		ut_a(!thread_name);

		return;
	}

	/* Normal or 'fast' shutdown: write one final checkpoint at the
	current lsn, then verify below that no new log was generated
	meanwhile (otherwise we go back to the loop and retry). */
	if (!srv_read_only_mode) {
		log_make_checkpoint_at(LSN_MAX, TRUE);
	}

	log_mutex_enter();

	lsn = log_sys->lsn;

	ut_ad(lsn >= log_sys->last_checkpoint_lsn);

	log_mutex_exit();

	/** If innodb_force_recovery is set to 6 then log_sys doesn't
	have recent checkpoint information. So last checkpoint lsn
	will never be equal to current lsn. */
	const bool	is_last = ((srv_force_recovery == SRV_FORCE_NO_LOG_REDO
				    && lsn == log_sys->last_checkpoint_lsn
				    + LOG_BLOCK_HDR_SIZE)
				   || lsn == log_sys->last_checkpoint_lsn);

	if (!is_last) {
		goto loop;
	}

	/* Check that the background threads stay suspended */
	thread_name = srv_any_background_threads_are_active();
	if (thread_name != NULL) {
		ib::warn() << "Background thread " << thread_name << " woke up"
			" during shutdown";

		goto loop;
	}

	if (!srv_read_only_mode) {
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
		fil_flush_file_spaces(FIL_TYPE_LOG);
	}

	/* The call fil_write_flushed_lsn() will bypass the buffer
	pool: therefore it is essential that the buffer pool has been
	completely flushed to disk! (We do not call fil_write... if the
	'very fast' shutdown is enabled.) */

	if (!buf_all_freed()) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for dirty buffer pages to be"
				" flushed";
			count = 0;
		}

		goto loop;
	}

	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

	/* Make some checks that the server really is quiet */
	srv_thread_type	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	bool	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);

	if (lsn < srv_start_lsn) {
		ib::error() << "Log sequence number at shutdown " << lsn
			<< " is lower than at startup " << srv_start_lsn
			<< "!";
	}

	srv_shutdown_lsn = lsn;

	/* Stamp the shutdown lsn to the data files so that at the next
	startup InnoDB can deduce the shutdown was clean. */
	if (!srv_read_only_mode) {
		fil_write_flushed_lsn(lsn);
	}

	fil_close_all_files();

	/* Make some checks that the server really is quiet */
	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);
}
2367
2368 /******************************************************//**
2369 Peeks the current lsn.
2370 @return TRUE if success, FALSE if could not get the log system mutex */
2371 ibool
log_peek_lsn(lsn_t * lsn)2372 log_peek_lsn(
2373 /*=========*/
2374 lsn_t* lsn) /*!< out: if returns TRUE, current lsn is here */
2375 {
2376 if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
2377 *lsn = log_sys->lsn;
2378
2379 log_mutex_exit();
2380
2381 return(TRUE);
2382 }
2383
2384 return(FALSE);
2385 }
2386
2387 /******************************************************//**
2388 Prints info of the log. */
2389 void
log_print(FILE * file)2390 log_print(
2391 /*======*/
2392 FILE* file) /*!< in: file where to print */
2393 {
2394 double time_elapsed;
2395 time_t current_time;
2396
2397 log_mutex_enter();
2398
2399 fprintf(file,
2400 "Log sequence number " LSN_PF "\n"
2401 "Log flushed up to " LSN_PF "\n"
2402 "Pages flushed up to " LSN_PF "\n"
2403 "Last checkpoint at " LSN_PF "\n",
2404 log_sys->lsn,
2405 log_sys->flushed_to_disk_lsn,
2406 log_buf_pool_get_oldest_modification(),
2407 log_sys->last_checkpoint_lsn);
2408
2409 current_time = time(NULL);
2410
2411 time_elapsed = difftime(current_time,
2412 log_sys->last_printout_time);
2413
2414 if (time_elapsed <= 0) {
2415 time_elapsed = 1;
2416 }
2417
2418 fprintf(file,
2419 ULINTPF " pending log flushes, "
2420 ULINTPF " pending chkp writes\n"
2421 ULINTPF " log i/o's done, %.2f log i/o's/second\n",
2422 log_sys->n_pending_flushes,
2423 log_sys->n_pending_checkpoint_writes,
2424 log_sys->n_log_ios,
2425 static_cast<double>(
2426 log_sys->n_log_ios - log_sys->n_log_ios_old)
2427 / time_elapsed);
2428
2429 log_sys->n_log_ios_old = log_sys->n_log_ios;
2430 log_sys->last_printout_time = current_time;
2431
2432 log_mutex_exit();
2433 }
2434
2435 /**********************************************************************//**
2436 Refreshes the statistics used to print per-second averages. */
2437 void
log_refresh_stats(void)2438 log_refresh_stats(void)
2439 /*===================*/
2440 {
2441 log_sys->n_log_ios_old = log_sys->n_log_ios;
2442 log_sys->last_printout_time = time(NULL);
2443 }
2444
2445 /********************************************************//**
2446 Closes a log group. */
2447 static
2448 void
log_group_close(log_group_t * group)2449 log_group_close(
2450 /*===========*/
2451 log_group_t* group) /* in,own: log group to close */
2452 {
2453 ulint i;
2454
2455 for (i = 0; i < group->n_files; i++) {
2456 ut_free(group->file_header_bufs_ptr[i]);
2457 }
2458
2459 ut_free(group->file_header_bufs_ptr);
2460 ut_free(group->file_header_bufs);
2461 ut_free(group->checkpoint_buf_ptr);
2462 ut_free(group);
2463 }
2464
2465 /********************************************************//**
2466 Closes all log groups. */
2467 void
log_group_close_all(void)2468 log_group_close_all(void)
2469 /*=====================*/
2470 {
2471 log_group_t* group;
2472
2473 group = UT_LIST_GET_FIRST(log_sys->log_groups);
2474
2475 while (UT_LIST_GET_LEN(log_sys->log_groups) > 0) {
2476 log_group_t* prev_group = group;
2477
2478 group = UT_LIST_GET_NEXT(log_groups, group);
2479
2480 UT_LIST_REMOVE(log_sys->log_groups, prev_group);
2481
2482 log_group_close(prev_group);
2483 }
2484 }
2485
2486 /********************************************************//**
2487 Shutdown the log system but do not release all the memory. */
void
log_shutdown(void)
/*==============*/
{
	/* Tear down the log subsystem in dependency order: the log
	groups first, then the log and checkpoint buffers, then the
	synchronization objects. The log_sys struct itself is released
	later by log_mem_free(). */
	log_group_close_all();

	ut_free(log_sys->buf_ptr);
	log_sys->buf_ptr = NULL;
	log_sys->buf = NULL;
	ut_free(log_sys->checkpoint_buf_ptr);
	log_sys->checkpoint_buf_ptr = NULL;
	log_sys->checkpoint_buf = NULL;

	os_event_destroy(log_sys->flush_event);

	rw_lock_free(&log_sys->checkpoint_lock);

	mutex_free(&log_sys->mutex);
	mutex_free(&log_sys->write_mutex);
	mutex_free(&log_sys->log_flush_order_mutex);

	/* Close the recovery subsystem last; it may still reference
	log state while active. */
	recv_sys_close();
}
2511
2512 /********************************************************//**
2513 Free the log system data structures. */
2514 void
log_mem_free(void)2515 log_mem_free(void)
2516 /*==============*/
2517 {
2518 if (log_sys != NULL) {
2519 recv_sys_mem_free();
2520 ut_free(log_sys);
2521
2522 log_sys = NULL;
2523 }
2524 }
2525 #endif /* !UNIV_HOTBACKUP */
2526