1 /*****************************************************************************
2
3 Copyright (c) 1995, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Google Inc.
5
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11
12 This program is free software; you can redistribute it and/or modify it under
13 the terms of the GNU General Public License, version 2.0, as published by the
14 Free Software Foundation.
15
16 This program is also distributed with certain software (including but not
17 limited to OpenSSL) that is licensed under separate terms, as designated in a
18 particular file or component or in included license documentation. The authors
19 of MySQL hereby grant you an additional permission to link the program and
20 your derivative works with the separately licensed software that they have
21 included with MySQL.
22
23 This program is distributed in the hope that it will be useful, but WITHOUT
24 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
25 FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
26 for more details.
27
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
31
32 *****************************************************************************/
33
34 /**************************************************/ /**
35 @file log/log0log.cc
36
37 Redo log system - provides durability for unflushed modifications
38 to contents of data pages.
39
40 This file covers general maintenance, including:
41 -# Allocation and deallocation of the redo log data structures.
42 -# Initialization and shutdown of the redo log.
43 -# Start / stop for the log background threads.
44 -# Runtime updates of server variables.
45 -# Extending size of the redo log buffers.
46 -# Locking redo position (for replication).
47
Code responsible for writing to the redo log can be found in log0buf.cc,
log0write.cc, and log0log.ic. The log writer, flusher, write notifier,
flush notifier, and closer threads are implemented in log0write.cc.
51
Code responsible for checkpoints can be found in log0chkp.cc.
The log checkpointer thread is implemented there.
54
55 Created 12/9/1995 Heikki Tuuri
56 *******************************************************/
57
58 #include "log0types.h"
59
60 /** Pointer to the log checksum calculation function. */
61 log_checksum_func_t log_checksum_algorithm_ptr;
62
63 #ifndef UNIV_HOTBACKUP
64
65 #include <debug_sync.h>
66 #include <sys/types.h>
67 #include <time.h>
68 #include "dict0boot.h"
69 #include "ha_prototypes.h"
70 #include "log0meb.h"
71 #include "os0thread-create.h"
72 #include "trx0sys.h"
73
74 /**
75 @page PAGE_INNODB_REDO_LOG Innodb redo log
76
77 @section sect_redo_log_general General idea of redo log
78
79 The redo log is a write ahead log of changes applied to contents of data pages.
80 It provides durability for all changes applied to the pages. In case of crash,
81 it is used to recover modifications to pages that were modified but have not
82 been flushed to disk.
83
84 @note In case of clean shutdown, the redo log should be logically empty.
85 This means that after the checkpoint lsn there should be no records to apply.
86 However the log files still could contain some old data (which is not used
87 during the recovery process).
88
89 Every change to content of a data page must be done through a mini transaction
90 (so called mtr - mtr_t), which in mtr_commit() writes all its log records
91 to the redo log.
92
93 @remarks Normally these changes are performed using the mlog_write_ulint()
94 or similar function. In some page-level operations, only a code number of
95 a c-function and its parameters are written to the redo log, to reduce the
96 size of the redo log. You should not add parameters to such functions
97 (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse()).
98 You should not add functionality which can either change when compared to older
99 versions, or which is dependent on data outside of the page being modified.
100 Therefore all functions must implement self-contained page transformation
101 and it should be unchanged if you don't have very essential reasons to change
102 the log semantics or format.
103
104 Single mtr can cover changes to multiple pages. In case of crash, either the
105 whole set of changes from a given mtr is recovered or none of the changes.
106
107 During life time of a mtr, a log of changes is collected inside an internal
108 buffer of the mtr. It contains multiple log records, which describe changes
109 applied to possibly different modified pages. When the mtr is committed, all
110 the log records are written to the log buffer within a single group of the log
111 records. Procedure:
112
113 -# Total number of data bytes of log records is calculated.
114 -# Space for the log records is reserved. Range of lsn values is assigned for
115 a group of log records.
116 -# %Log records are written to the reserved space in the log buffer.
117 -# Modified pages are marked as dirty and moved to flush lists.
118 All the dirty pages are marked with the same range of lsn values.
119 -# Reserved space is closed.
120
121 Background threads are responsible for writing of new changes in the log buffer
122 to the log files. User threads that require durability for the logged records,
123 have to wait until the log gets flushed up to the required point.
124
125 During recovery only complete groups of log records are recovered and applied.
For example, if we had a rotation in a tree which resulted in changes to three
nodes (pages), we have a guarantee that either the whole rotation is recovered
or none of it, so we will not end up with a tree that has an incorrect
structure.
129
130 Consecutive bytes written to the redo log are enumerated by the lsn values.
131 Every single byte written to the log buffer corresponds to current lsn
132 increased by one.
133
134 Data in the redo log is structured in consecutive blocks of 512 bytes
135 (_OS_FILE_LOG_BLOCK_SIZE_). Each block contains a header of 12 bytes
136 (_LOG_BLOCK_HDR_SIZE_) and a footer of 4 bytes (_LOG_BLOCK_TRL_SIZE_).
137 These extra bytes are also enumerated by lsn values. Whenever we refer to
138 data bytes, we mean actual bytes of log records - not bytes of headers and
139 footers of log blocks. The sequence of enumerated data bytes, is called the
140 sn values. All headers and footers of log blocks are added within the log
141 buffer, where data is actually stored in proper redo format.
142
143 When a user transaction commits, extra mtr is committed (related to undo log),
144 and then user thread waits until the redo log is flushed up to the point,
145 where log records of that mtr end.
146
147 When a dirty page is being flushed, a thread doing the flush, first needs to
148 wait until the redo log gets flushed up to the newest modification of the page.
149 Afterwards the page might be flushed. In case of crash, we might end up with
150 the newest version of the page and without any earlier versions of the page.
151 Then other pages, which potentially have not been flushed before the crash,
152 need to be recovered to that version. This applies to:
153 * pages modified within the same group of log records,
154 * and pages modified within any earlier group of log records.
155
156 @section sect_redo_log_architecture Architecture of redo log
157
158 @subsection subsect_redo_log_data_layers Data layers
159
160 Redo log consists of following data layers:
161
162 -# %Log files (typically 4 - 32 GB) - physical redo files that reside on
163 the disk.
164
165 -# %Log buffer (64 MB by default) - groups data to write to log files,
166 formats data in proper way: include headers/footers of log blocks,
167 calculates checksums, maintains boundaries of record groups.
168
-# %Log recent written buffer (e.g. 4MB) - tracks recent writes to the
   log buffer. Allows concurrent writes to the log buffer and tracks up
   to which lsn all such writes have already been finished.
172
173 -# %Log recent closed buffer (e.g. 4MB) - tracks for which recent writes,
174 corresponding dirty pages have been already added to the flush lists.
175 Allows to relax order in which dirty pages have to be added to the flush
176 lists and tracks up to which lsn, all dirty pages have been added.
177 This is required to not make checkpoint at lsn which is larger than
178 oldest_modification of some dirty page, which still has not been added
179 to the flush list (because user thread was scheduled out).
180
181 -# %Log write ahead buffer (e.g. 4kB) - used to write ahead more bytes
182 to the redo files, to avoid read-on-write problem. This buffer is also
183 used when we need to write an incomplete log block, which might
184 concurrently be receiving even more data from next user threads. In such
185 case we first copy the incomplete block to the write ahead buffer.
186
187 @subsection subsect_redo_log_general_rules General rules
188
189 -# User threads write their redo data only to the log buffer.
190
191 -# User threads write concurrently to the log buffer, without synchronization
192 between each other.
193
194 -# The log recent written buffer is maintained to track concurrent writes.
195
196 -# Background log threads write and flush the log buffer to disk.
197
-# User threads do not touch log files. Background log threads are the only
   ones allowed to touch the log files.
200
201 -# User threads wait for the background threads when they need flushed redo.
202
203 -# %Events per log block are exposed by redo log for users interested in waiting
204 for the flushed redo.
205
206 -# Users can see up to which point log has been written / flushed.
207
208 -# User threads need to wait if there is no space in the log buffer.
209
210 @diafile storage/innobase/log/arch_writing.dia "Writing to the redo log"
211
212 -# User threads add dirty pages to flush lists in the relaxed order.
213
214 -# Order in which user threads reserve ranges of lsn values, order in which
215 they write to the log buffer, and order in which they add dirty pages to
216 flush lists, could all be three completely different orders.
217
218 -# User threads do not write checkpoints (are not allowed to touch log files).
219
220 -# Checkpoint is automatically written from time to time by a background thread.
221
222 -# User threads can request a forced write of checkpoint and wait.
223
224 -# User threads need to wait if there is no space in the log files.
225
226 @diafile storage/innobase/log/arch_deleting.dia "Reclaiming space in the redo
227 log"
228
229 -# Well thought out and tested set of _MONITOR_ counters is maintained and
230 documented.
231
232 -# All settings are configurable through server variables, but the new server
233 variables are hidden unless a special _EXPERIMENTAL_ mode has been defined
234 when running cmake.
235
236 -# All the new buffers could be resized dynamically during runtime. In practice,
237 only size of the log buffer is accessible without the _EXPERIMENTAL_ mode.
238
239 @note
240 This is a functional change - the log buffer could be resized dynamically
241 by users (also decreased).
242
243 @section sect_redo_log_lsn_values Glossary of lsn values
244
245 Different fragments of head of the redo log are tracked by different values:
246 - @ref subsect_redo_log_write_lsn,
247 - @ref subsect_redo_log_buf_ready_for_write_lsn,
248 - @ref subsect_redo_log_sn.
249
250 Different fragments of the redo log's tail are tracked by different values:
251 - @ref subsect_redo_log_buf_dirty_pages_added_up_to_lsn,
252 - @ref subsect_redo_log_available_for_checkpoint_lsn,
253 - @ref subsect_redo_log_last_checkpoint_lsn.
254
255 @subsection subsect_redo_log_write_lsn log.write_lsn
256
257 Up to this lsn we have written all data to log files. It's the beginning of
258 the unwritten log buffer. Older bytes in the buffer are not required and might
259 be overwritten in cyclic manner for lsn values larger by _log.buf_size_.
260
261 Value is updated by: [log writer thread](@ref sect_redo_log_writer).
262
263 @subsection subsect_redo_log_buf_ready_for_write_lsn log.buf_ready_for_write_lsn
264
265 Up to this lsn, all concurrent writes to log buffer have been finished.
266 We don't need older part of the log recent-written buffer.
267
268 It obviously holds:
269
270 log.buf_ready_for_write_lsn >= log.write_lsn
271
272 Value is updated by: [log writer thread](@ref sect_redo_log_writer).
273
274 @subsection subsect_redo_log_flushed_to_disk_lsn log.flushed_to_disk_lsn
275
276 Up to this lsn, we have written and flushed data to log files.
277
278 It obviously holds:
279
280 log.flushed_to_disk_lsn <= log.write_lsn
281
282 Value is updated by: [log flusher thread](@ref sect_redo_log_flusher).
283
284 @subsection subsect_redo_log_sn log.sn
285
286 Corresponds to current lsn. Maximum assigned sn value (enumerates only
287 data bytes).
288
289 It obviously holds:
290
291 log.sn >= log_translate_lsn_to_sn(log.buf_ready_for_write_lsn)
292
293 Value is updated by: user threads during reservation of space.
294
295 @subsection subsect_redo_log_buf_dirty_pages_added_up_to_lsn
296 log.buf_dirty_pages_added_up_to_lsn
297
298 Up to this lsn user threads have added all dirty pages to flush lists.
299
300 The redo log records are allowed to be deleted not further than up to this lsn.
301 That's because there could be a page with _oldest_modification_ smaller than
302 the minimum _oldest_modification_ available in flush lists. Note that such page
303 is just about to be added to flush list by a user thread, but there is no mutex
304 protecting access to the minimum _oldest_modification_, which would be acquired
305 by the user thread before writing to redo log. Hence for any lsn greater than
306 _buf_dirty_pages_added_up_to_lsn_ we cannot trust that flush lists are complete
307 and minimum calculated value (or its approximation) is valid.
308
309 @note
310 Note that we do not delete redo log records physically, but we still can delete
311 them logically by doing checkpoint at given lsn.
312
313 It holds (unless the log writer thread misses an update of the
314 @ref subsect_redo_log_buf_ready_for_write_lsn):
315
316 log.buf_dirty_pages_added_up_to_lsn <= log.buf_ready_for_write_lsn.
317
318 Value is updated by: [log closer thread](@ref sect_redo_log_closer).
319
320 @subsection subsect_redo_log_available_for_checkpoint_lsn
321 log.available_for_checkpoint_lsn
322
323 Up to this lsn all dirty pages have been flushed to disk. However, this value
324 is not guaranteed to be the maximum such value. As insertion order to flush
325 lists is relaxed, the buf_pool_get_oldest_modification_approx() returns
326 modification time of some page that was inserted the earliest, it doesn't
327 have to be the oldest modification though. However, the maximum difference
328 between the first page in flush list, and one with the oldest modification
329 lsn is limited by the number of entries in the log recent closed buffer.
330
331 That's why from result of buf_pool_get_oldest_modification_approx() size of
332 the log recent closed buffer is subtracted. The result is used to update the
333 lsn available for a next checkpoint.
334
335 This has impact on the redo format, because the checkpoint_lsn can now point
336 to the middle of some group of log records (even to the middle of a single
337 log record). Log files with such checkpoint are not recoverable by older
338 versions of InnoDB by default.
339
340 Value is updated by:
341 [log checkpointer thread](@ref sect_redo_log_checkpointer).
342
343 @see @ref sect_redo_log_add_dirty_pages
344
345 @subsection subsect_redo_log_last_checkpoint_lsn log.last_checkpoint_lsn
346
347 Up to this lsn all dirty pages have been flushed to disk and the lsn value
348 has been flushed to header of the first log file (_ib_logfile0_).
349
350 The lsn value points to place where recovery is supposed to start. Data bytes
351 for smaller lsn values are not required and might be overwritten (log files
352 are circular). One could consider them logically deleted.
353
354 Value is updated by:
355 [log checkpointer thread](@ref sect_redo_log_checkpointer).
356
357 It holds:
358
359 log.last_checkpoint_lsn
360 <= log.available_for_checkpoint_lsn
361 <= log.buf_dirty_pages_added_up_to_lsn.
362
363
364 Read more about redo log details:
365 - @subpage PAGE_INNODB_REDO_LOG_BUF
366 - @subpage PAGE_INNODB_REDO_LOG_THREADS
367 - @subpage PAGE_INNODB_REDO_LOG_FORMAT
368
369 *******************************************************/
370
371 /** Redo log system. Singleton used to populate global pointer. */
372 aligned_pointer<log_t> *log_sys_object;
373
374 /** Redo log system (singleton). */
375 log_t *log_sys;
376
377 #ifdef UNIV_PFS_THREAD
378
379 /** PFS key for the log writer thread. */
380 mysql_pfs_key_t log_writer_thread_key;
381
382 /** PFS key for the log closer thread. */
383 mysql_pfs_key_t log_closer_thread_key;
384
385 /** PFS key for the log checkpointer thread. */
386 mysql_pfs_key_t log_checkpointer_thread_key;
387
388 /** PFS key for the log flusher thread. */
389 mysql_pfs_key_t log_flusher_thread_key;
390
391 /** PFS key for the log flush notifier thread. */
392 mysql_pfs_key_t log_flush_notifier_thread_key;
393
394 /** PFS key for the log write notifier thread. */
395 mysql_pfs_key_t log_write_notifier_thread_key;
396
397 #endif /* UNIV_PFS_THREAD */
398
399 /** Calculates proper size for the log buffer and allocates the log buffer.
400 @param[out] log redo log */
401 static void log_allocate_buffer(log_t &log);
402
403 /** Deallocates the log buffer.
404 @param[out] log redo log */
405 static void log_deallocate_buffer(log_t &log);
406
407 /** Allocates the log write-ahead buffer (aligned to system page for
408 easier migrations between NUMA nodes).
409 @param[out] log redo log */
410 static void log_allocate_write_ahead_buffer(log_t &log);
411
412 /** Deallocates the log write-ahead buffer.
413 @param[out] log redo log */
414 static void log_deallocate_write_ahead_buffer(log_t &log);
415
416 /** Allocates the log checkpoint buffer (used to write checkpoint headers).
417 @param[out] log redo log */
418 static void log_allocate_checkpoint_buffer(log_t &log);
419
420 /** Deallocates the log checkpoint buffer.
421 @param[out] log redo log */
422 static void log_deallocate_checkpoint_buffer(log_t &log);
423
424 /** Allocates the array with flush events.
425 @param[out] log redo log */
426 static void log_allocate_flush_events(log_t &log);
427
428 /** Deallocates the array with flush events.
429 @param[out] log redo log */
430 static void log_deallocate_flush_events(log_t &log);
431
432 /** Deallocates the array with write events.
433 @param[out] log redo log */
434 static void log_deallocate_write_events(log_t &log);
435
436 /** Allocates the array with write events.
437 @param[out] log redo log */
438 static void log_allocate_write_events(log_t &log);
439
440 /** Allocates the log recent written buffer.
441 @param[out] log redo log */
442 static void log_allocate_recent_written(log_t &log);
443
444 /** Deallocates the log recent written buffer.
445 @param[out] log redo log */
446 static void log_deallocate_recent_written(log_t &log);
447
448 /** Allocates the log recent closed buffer.
449 @param[out] log redo log */
450 static void log_allocate_recent_closed(log_t &log);
451
452 /** Deallocates the log recent closed buffer.
453 @param[out] log redo log */
454 static void log_deallocate_recent_closed(log_t &log);
455
456 /** Allocates buffers for headers of the log files.
457 @param[out] log redo log */
458 static void log_allocate_file_header_buffers(log_t &log);
459
460 /** Deallocates buffers for headers of the log files.
461 @param[out] log redo log */
462 static void log_deallocate_file_header_buffers(log_t &log);
463
464 /** Calculates proper size of the log buffer and updates related fields.
465 Calculations are based on current value of srv_log_buffer_size. Note,
466 that the proper size of the log buffer should be a power of two.
467 @param[out] log redo log */
468 static void log_calc_buf_size(log_t &log);
469
470 uint32_t log_detected_format = UINT32_MAX;
471
472 /**************************************************/ /**
473
474 @name Initialization and finalization of log_sys
475
476 *******************************************************/
477
478 /* @{ */
479
log_sys_init(uint32_t n_files,uint64_t file_size,space_id_t space_id)480 bool log_sys_init(uint32_t n_files, uint64_t file_size, space_id_t space_id) {
481 ut_a(log_sys == nullptr);
482
483 /* The log_sys_object is pointer to aligned_pointer. That's
484 temporary solution until we refactor redo log more.
485
486 That's required for now, because the aligned_pointer, has dtor
487 which tries to free the memory and as long as this is global
488 variable it will have the dtor called. However because we can
489 exit without proper cleanup for redo log in some cases, we
490 need to forbid dtor calls then. */
491
492 log_sys_object = UT_NEW_NOKEY(aligned_pointer<log_t>{});
493
494 log_sys_object->create();
495 log_sys = *log_sys_object;
496
497 log_t &log = *log_sys;
498
499 /* Initialize simple value fields. */
500 log.dict_persist_margin.store(0);
501 log.periodical_checkpoints_enabled = false;
502 if (log_detected_format != UINT32_MAX) {
503 ut_a(log_detected_format <= LOG_HEADER_FORMAT_CURRENT);
504 log.format = log_detected_format;
505 } else {
506 log.format = LOG_HEADER_FORMAT_CURRENT;
507 }
508
509 log.files_space_id = space_id;
510 log.state = log_state_t::OK;
511 log.n_log_ios_old = log.n_log_ios;
512 log.last_printout_time = time(nullptr);
513 ut_d(log.first_block_is_correct_for_lsn = 0);
514
515 ut_a(file_size <= std::numeric_limits<uint64_t>::max() / n_files);
516 log.file_size = file_size;
517 log.n_files = n_files;
518 log.files_real_capacity = file_size * n_files;
519
520 log.current_file_lsn = LOG_START_LSN;
521 log.current_file_real_offset = LOG_FILE_HDR_SIZE;
522 log_files_update_offsets(log, log.current_file_lsn);
523
524 log.checkpointer_event = os_event_create();
525 log.closer_event = os_event_create();
526 log.write_notifier_event = os_event_create();
527 log.flush_notifier_event = os_event_create();
528 log.writer_event = os_event_create();
529 log.flusher_event = os_event_create();
530
531 mutex_create(LATCH_ID_LOG_CHECKPOINTER, &log.checkpointer_mutex);
532 mutex_create(LATCH_ID_LOG_CLOSER, &log.closer_mutex);
533 mutex_create(LATCH_ID_LOG_WRITER, &log.writer_mutex);
534 mutex_create(LATCH_ID_LOG_FLUSHER, &log.flusher_mutex);
535 mutex_create(LATCH_ID_LOG_WRITE_NOTIFIER, &log.write_notifier_mutex);
536 mutex_create(LATCH_ID_LOG_FLUSH_NOTIFIER, &log.flush_notifier_mutex);
537 mutex_create(LATCH_ID_LOG_LIMITS, &log.limits_mutex);
538
539 log.sn_lock.create(
540 #ifdef UNIV_PFS_RWLOCK
541 log_sn_lock_key,
542 #else
543 PSI_NOT_INSTRUMENTED,
544 #endif
545 SYNC_LOG_SN, 64);
546
547 /* Allocate buffers. */
548 log_allocate_buffer(log);
549 log_allocate_write_ahead_buffer(log);
550 log_allocate_checkpoint_buffer(log);
551 log_allocate_recent_written(log);
552 log_allocate_recent_closed(log);
553 log_allocate_flush_events(log);
554 log_allocate_write_events(log);
555 log_allocate_file_header_buffers(log);
556
557 log_calc_buf_size(log);
558 log_calc_max_ages(log);
559
560 log.m_crash_unsafe = false;
561 log.m_disable = false;
562 log.m_first_file_lsn = LOG_START_LSN;
563
564 if (!log_calc_concurrency_margin(log)) {
565 ib::error(ER_IB_MSG_1267)
566 << "Cannot continue operation. ib_logfiles are too"
567 << " small for innodb_thread_concurrency " << srv_thread_concurrency
568 << ". The combined size of"
569 << " ib_logfiles should be bigger than"
570 << " 200 kB * innodb_thread_concurrency. To get mysqld"
571 << " to start up, set innodb_thread_concurrency in"
572 << " my.cnf to a lower value, for example, to 8. After"
573 << " an ERROR-FREE shutdown of mysqld you can adjust"
574 << " the size of ib_logfiles. " << INNODB_PARAMETERS_MSG;
575
576 return (false);
577 }
578
579 return (true);
580 }
581
/** Starts the redo log system at the given position. Sets lsn-related
fields, initializes the header of the first (possibly incomplete) log
block in the log buffer, and seeds the recent-written / recent-closed
link buffers so that concurrent writes may resume from start_lsn.
@param[in,out]  log             redo log
@param[in]      checkpoint_no   checkpoint number to continue from
@param[in]      checkpoint_lsn  lsn of the last checkpoint
@param[in]      start_lsn       lsn at which new writes will start
                                (end of recovered redo) */
void log_start(log_t &log, checkpoint_no_t checkpoint_no, lsn_t checkpoint_lsn,
               lsn_t start_lsn) {
  ut_a(log_sys != nullptr);
  ut_a(checkpoint_lsn >= OS_FILE_LOG_BLOCK_SIZE);
  ut_a(checkpoint_lsn >= LOG_START_LSN);
  ut_a(start_lsn >= checkpoint_lsn);

  log.write_to_file_requests_total.store(0);
  log.write_to_file_requests_interval.store(0);

  log.recovered_lsn = start_lsn;
  log.last_checkpoint_lsn = checkpoint_lsn;
  log.next_checkpoint_no = checkpoint_no;
  log.available_for_checkpoint_lsn = checkpoint_lsn;

  log.sn = log_translate_lsn_to_sn(log.recovered_lsn);

  /* If start_lsn points exactly at the first byte of a block trailer,
  skip over the trailer and the header of the next block; if it points
  at a block boundary, skip over the block header - so that start_lsn
  always points to a data byte inside some log block. */
  if ((start_lsn + LOG_BLOCK_TRL_SIZE) % OS_FILE_LOG_BLOCK_SIZE == 0) {
    start_lsn += LOG_BLOCK_TRL_SIZE + LOG_BLOCK_HDR_SIZE;
  } else if (start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0) {
    start_lsn += LOG_BLOCK_HDR_SIZE;
  }
  ut_a(start_lsn > LOG_START_LSN);

  /* Mark all lsn values up to start_lsn as already written in the
  recent-written buffer (no pending concurrent writes below it). */
  log.recent_written.add_link(0, start_lsn);
  log.recent_written.advance_tail();
  ut_a(log_buffer_ready_for_write_lsn(log) == start_lsn);

  /* Similarly, dirty pages for all lsn values below start_lsn are
  considered already added to flush lists. */
  log.recent_closed.add_link(0, start_lsn);
  log.recent_closed.advance_tail();
  ut_a(log_buffer_dirty_pages_added_up_to_lsn(log) == start_lsn);

  log.write_lsn = start_lsn;
  log.flushed_to_disk_lsn = start_lsn;

  log_files_update_offsets(log, start_lsn);

  /* Next write-ahead boundary: current file offset aligned up to the
  configured write-ahead size. */
  log.write_ahead_end_offset = ut_uint64_align_up(log.current_file_real_offset,
                                                  srv_log_write_ahead_size);

  lsn_t block_lsn;
  byte *block;

  /* Prepare the header of the log block which contains start_lsn,
  at its position in the circular log buffer. */
  block_lsn = ut_uint64_align_down(start_lsn, OS_FILE_LOG_BLOCK_SIZE);

  /* The whole block must fit in the buffer without wrapping. */
  ut_a(block_lsn % log.buf_size + OS_FILE_LOG_BLOCK_SIZE <= log.buf_size);

  block = static_cast<byte *>(log.buf) + block_lsn % log.buf_size;

  log_block_set_hdr_no(block, log_block_convert_lsn_to_no(block_lsn));

  log_block_set_flush_bit(block, true);

  /* Data length covers the bytes recovered so far in this block. */
  log_block_set_data_len(block, start_lsn - block_lsn);

  const auto first_rec_group = log_block_get_first_rec_group(block);

  ut_ad(log.first_block_is_correct_for_lsn == start_lsn);
  ut_ad(first_rec_group >= LOG_BLOCK_HDR_SIZE);
  ut_a(first_rec_group <= start_lsn - block_lsn);

  log_update_buf_limit(log, start_lsn);
  log_update_limits(log);

  /* Do not reorder writes above, below this line. For x86 this
  protects only from unlikely compile-time reordering. */
  std::atomic_thread_fence(std::memory_order_release);
}
650
log_sys_close()651 void log_sys_close() {
652 ut_a(log_sys != nullptr);
653
654 log_t &log = *log_sys;
655
656 log_deallocate_file_header_buffers(log);
657 log_deallocate_write_events(log);
658 log_deallocate_flush_events(log);
659 log_deallocate_recent_closed(log);
660 log_deallocate_recent_written(log);
661 log_deallocate_checkpoint_buffer(log);
662 log_deallocate_write_ahead_buffer(log);
663 log_deallocate_buffer(log);
664
665 log.sn_lock.free();
666
667 mutex_free(&log.limits_mutex);
668 mutex_free(&log.write_notifier_mutex);
669 mutex_free(&log.flush_notifier_mutex);
670 mutex_free(&log.flusher_mutex);
671 mutex_free(&log.writer_mutex);
672 mutex_free(&log.closer_mutex);
673 mutex_free(&log.checkpointer_mutex);
674
675 os_event_destroy(log.write_notifier_event);
676 os_event_destroy(log.flush_notifier_event);
677 os_event_destroy(log.closer_event);
678 os_event_destroy(log.checkpointer_event);
679 os_event_destroy(log.writer_event);
680 os_event_destroy(log.flusher_event);
681
682 log_sys_object->destroy();
683
684 UT_DELETE(log_sys_object);
685 log_sys_object = nullptr;
686
687 log_sys = nullptr;
688 }
689
690 /* @} */
691
692 /**************************************************/ /**
693
694 @name Start / stop of background threads
695
696 *******************************************************/
697
698 /* @{ */
699
/** Asserts that the log writer thread is active.
@param[in]  log  redo log (unused; kept for interface symmetry with the
                 other thread-validation functions) */
void log_writer_thread_active_validate(const log_t &log) {
  ut_a(log_writer_is_active());
}
703
/** Asserts that the log closer thread is active.
@param[in]  log  redo log (unused; kept for interface symmetry with the
                 other thread-validation functions) */
void log_closer_thread_active_validate(const log_t &log) {
  ut_a(log_closer_is_active());
}
707
/** Asserts that the threads responsible for writing and flushing the
log buffer to disk (log writer and log flusher) are active. Must not be
called when redo writes are disabled (debug-checked).
@param[in]  log  redo log */
void log_background_write_threads_active_validate(const log_t &log) {
  ut_ad(!log.disable_redo_writes);

  ut_a(log_writer_is_active());
  ut_a(log_flusher_is_active());
}
714
/** Asserts that all log background threads (writer, flusher, write
notifier, flush notifier, closer, checkpointer) are active.
@param[in]  log  redo log */
void log_background_threads_active_validate(const log_t &log) {
  log_background_write_threads_active_validate(log);

  ut_a(log_write_notifier_is_active());
  ut_a(log_flush_notifier_is_active());
  ut_a(log_closer_is_active());
  ut_a(log_checkpointer_is_active());
}
723
/** Asserts that none of the log background threads is active.
@param[in]  log  redo log (unused; kept for interface symmetry with the
                 other thread-validation functions) */
void log_background_threads_inactive_validate(const log_t &log) {
  ut_a(!log_checkpointer_is_active());
  ut_a(!log_closer_is_active());
  ut_a(!log_write_notifier_is_active());
  ut_a(!log_flush_notifier_is_active());
  ut_a(!log_writer_is_active());
  ut_a(!log_flusher_is_active());
}
732
/** Creates and starts all log background threads (checkpointer, closer,
flush notifier, flusher, write notifier, writer), then validates that
all of them are running. Also initializes the MEB redo log archiver.
Requires a writable server (not read-only) and a started log system.
@param[in,out]  log  redo log */
void log_start_background_threads(log_t &log) {
  ib::info(ER_IB_MSG_1258) << "Log background threads are being started...";

  log_background_threads_inactive_validate(log);

  ut_ad(!log.disable_redo_writes);
  ut_a(!srv_read_only_mode);
  ut_a(log.sn.load() > 0);

  /* Clear any pending stop request before spawning the threads. */
  log.should_stop_threads.store(false);

  srv_threads.m_log_checkpointer =
      os_thread_create(log_checkpointer_thread_key, log_checkpointer, &log);

  srv_threads.m_log_closer =
      os_thread_create(log_closer_thread_key, log_closer, &log);

  srv_threads.m_log_flush_notifier =
      os_thread_create(log_flush_notifier_thread_key, log_flush_notifier, &log);

  srv_threads.m_log_flusher =
      os_thread_create(log_flusher_thread_key, log_flusher, &log);

  srv_threads.m_log_write_notifier =
      os_thread_create(log_write_notifier_thread_key, log_write_notifier, &log);

  srv_threads.m_log_writer =
      os_thread_create(log_writer_thread_key, log_writer, &log);

  srv_threads.m_log_checkpointer.start();
  srv_threads.m_log_closer.start();
  srv_threads.m_log_flush_notifier.start();
  srv_threads.m_log_flusher.start();
  srv_threads.m_log_write_notifier.start();
  srv_threads.m_log_writer.start();

  log_background_threads_active_validate(log);

  /* NOTE(review): init is skipped under XTRABACKUP, yet
  log_stop_background_threads() calls redo_log_archive_deinit()
  unconditionally - presumably deinit is a safe no-op when init was
  never called; verify against meb::redo_log_archive_deinit(). */
#ifndef XTRABACKUP
  meb::redo_log_archive_init();
#endif
}
775
/** Requests all log background threads to stop and waits until each of
them has exited. Waiting is done by busy-looping with short sleeps,
repeatedly setting the thread's event so the thread wakes up and notices
the stop request. The caller must not hold the log.sn_lock x-latch
(debug-checked; see deadlock scenario below).
@param[in,out]  log  redo log */
void log_stop_background_threads(log_t &log) {
  /* We cannot stop threads when x-lock is acquired, because of scenario:
          * log_checkpointer starts log_checkpoint()
          * log_checkpoint() asks to persist dd dynamic metadata
          * dict_persist_dd_table_buffer() tries to write to redo
          * but cannot acquire shared lock on log.sn_lock
          * so log_checkpointer thread waits for this thread
            until the x-lock is released
          * but this thread waits until log background threads
            have been stopped - log_checkpointer is not stopped. */
  ut_ad(!log.sn_lock.x_own());

  ib::info(ER_IB_MSG_1259) << "Log background threads are being closed...";

  meb::redo_log_archive_deinit();

  log_background_threads_active_validate(log);

  ut_a(!srv_read_only_mode);

  log.should_stop_threads.store(true);

  /* Wait until threads are closed. Each loop keeps signalling the
  thread's event until the thread has observed should_stop_threads
  and exited. */
  while (log_writer_is_active()) {
    os_event_set(log.writer_event);
    os_thread_sleep(10);
  }
  while (log_write_notifier_is_active()) {
    os_event_set(log.write_notifier_event);
    os_thread_sleep(10);
  }
  while (log_flusher_is_active()) {
    os_event_set(log.flusher_event);
    os_thread_sleep(10);
  }
  while (log_flush_notifier_is_active()) {
    os_event_set(log.flush_notifier_event);
    os_thread_sleep(10);
  }
  while (log_closer_is_active()) {
    os_event_set(log.closer_event);
    os_thread_sleep(10);
  }
  while (log_checkpointer_is_active()) {
    os_event_set(log.checkpointer_event);
    os_thread_sleep(10);
  }

  log_background_threads_inactive_validate(log);
}
826
/** Marks the flag which tells log background threads to stop and wakes
them up, but does not wait until they have exited (compare with
log_stop_background_threads(), which does wait).
@param[in,out]  log   redo log */
void log_stop_background_threads_nowait(log_t &log) {
  /* The flag must be set before waking the threads, so a woken thread
  observes the stop request when it re-checks its loop condition. */
  log.should_stop_threads.store(true);
  log_wake_threads(log);
}
831
log_wake_threads(log_t & log)832 void log_wake_threads(log_t &log) {
833 if (log_closer_is_active()) {
834 os_event_set(log.closer_event);
835 }
836 if (log_checkpointer_is_active()) {
837 os_event_set(log.checkpointer_event);
838 }
839 if (log_writer_is_active()) {
840 os_event_set(log.writer_event);
841 }
842 if (log_flusher_is_active()) {
843 os_event_set(log.flusher_event);
844 }
845 if (log_write_notifier_is_active()) {
846 os_event_set(log.write_notifier_event);
847 }
848 }
849
850 /* @} */
851
852 /**************************************************/ /**
853
854 @name Status printing
855
856 *******************************************************/
857
858 /* @{ */
859
log_print(const log_t & log,FILE * file)860 void log_print(const log_t &log, FILE *file) {
861 lsn_t last_checkpoint_lsn;
862 lsn_t dirty_pages_added_up_to_lsn;
863 lsn_t ready_for_write_lsn;
864 lsn_t write_lsn;
865 lsn_t flush_lsn;
866 lsn_t max_assigned_lsn;
867 lsn_t current_lsn;
868 lsn_t oldest_lsn;
869
870 last_checkpoint_lsn = log.last_checkpoint_lsn.load();
871 dirty_pages_added_up_to_lsn = log_buffer_dirty_pages_added_up_to_lsn(log);
872 ready_for_write_lsn = log_buffer_ready_for_write_lsn(log);
873 write_lsn = log.write_lsn.load();
874 flush_lsn = log.flushed_to_disk_lsn.load();
875 max_assigned_lsn = log_get_lsn(log);
876 current_lsn = log_get_lsn(log);
877
878 log_limits_mutex_enter(log);
879 oldest_lsn = log.available_for_checkpoint_lsn;
880 log_limits_mutex_exit(log);
881
882 fprintf(file,
883 "Log sequence number " LSN_PF
884 "\n"
885 "Log buffer assigned up to " LSN_PF
886 "\n"
887 "Log buffer completed up to " LSN_PF
888 "\n"
889 "Log written up to " LSN_PF
890 "\n"
891 "Log flushed up to " LSN_PF
892 "\n"
893 "Added dirty pages up to " LSN_PF
894 "\n"
895 "Pages flushed up to " LSN_PF
896 "\n"
897 "Last checkpoint at " LSN_PF "\n",
898 current_lsn, max_assigned_lsn, ready_for_write_lsn, write_lsn,
899 flush_lsn, dirty_pages_added_up_to_lsn, oldest_lsn,
900 last_checkpoint_lsn);
901
902 time_t current_time = time(nullptr);
903
904 double time_elapsed = difftime(current_time, log.last_printout_time);
905
906 if (time_elapsed <= 0) {
907 time_elapsed = 1;
908 }
909
910 fprintf(
911 file, ULINTPF " log i/o's done, %.2f log i/o's/second\n",
912 ulint(log.n_log_ios),
913 static_cast<double>(log.n_log_ios - log.n_log_ios_old) / time_elapsed);
914
915 log.n_log_ios_old = log.n_log_ios;
916 log.last_printout_time = current_time;
917 }
918
log_refresh_stats(log_t & log)919 void log_refresh_stats(log_t &log) {
920 log.n_log_ios_old = log.n_log_ios;
921 log.last_printout_time = time(nullptr);
922 }
923
924 /* @} */
925
926 /**************************************************/ /**
927
928 @name Resizing of buffers
929
930 *******************************************************/
931
932 /* @{ */
933
/** Core of the log buffer resize: preserves the still-needed tail of
the redo data (from the last written block-aligned lsn up to end_lsn)
across deallocation/reallocation of the buffer.
Caller must hold the checkpointer and writer mutexes (asserted below),
which keeps write_lsn and the buffer contents stable during the copy.
@param[in,out]  log       redo log
@param[in]      new_size  requested new size of the log buffer (bytes)
@param[in]      end_lsn   preserve buffered data up to this lsn
@return true iff succeeded (false when the data that must be preserved
would not fit in new_size) */
bool log_buffer_resize_low(log_t &log, size_t new_size, lsn_t end_lsn) {
  ut_ad(log_checkpointer_mutex_own(log));
  ut_ad(log_writer_mutex_own(log));

  /* Copying happens in whole log blocks, so align the range down/up
  to OS_FILE_LOG_BLOCK_SIZE boundaries. */
  const lsn_t start_lsn =
      ut_uint64_align_down(log.write_lsn.load(), OS_FILE_LOG_BLOCK_SIZE);

  end_lsn = ut_uint64_align_up(end_lsn, OS_FILE_LOG_BLOCK_SIZE);

  /* Always preserve at least one block. */
  if (end_lsn == start_lsn) {
    end_lsn += OS_FILE_LOG_BLOCK_SIZE;
  }

  ut_ad(end_lsn - start_lsn <= log.buf_size);

  if (end_lsn - start_lsn > new_size) {
    return (false);
  }

  /* Save the contents. */
  byte *tmp_buf = UT_NEW_ARRAY_NOKEY(byte, end_lsn - start_lsn);
  /* The log buffer is used as a ring: a block for lsn i lives at
  offset i % buf_size (old size here, since log.buf_size is updated
  only later by log_calc_buf_size()). */
  for (auto i = start_lsn; i < end_lsn; i += OS_FILE_LOG_BLOCK_SIZE) {
    std::memcpy(&tmp_buf[i - start_lsn], &log.buf[i % log.buf_size],
                OS_FILE_LOG_BLOCK_SIZE);
  }

  /* Re-allocate log buffer. */
  srv_log_buffer_size = static_cast<ulong>(new_size);
  log_deallocate_buffer(log);
  log_allocate_buffer(log);

  /* Restore the contents. */
  for (auto i = start_lsn; i < end_lsn; i += OS_FILE_LOG_BLOCK_SIZE) {
    std::memcpy(&log.buf[i % new_size], &tmp_buf[i - start_lsn],
                OS_FILE_LOG_BLOCK_SIZE);
  }
  UT_DELETE_ARRAY(tmp_buf);

  /* Updates log.buf_size (and, last of all, buf_size_sn - which
  releases potential waiters for free space in the buffer). */
  log_calc_buf_size(log);

  log_update_buf_limit(log);

  ut_a(srv_log_buffer_size == log.buf_size);

  ib::info(ER_IB_MSG_1260) << "srv_log_buffer_size was extended to "
                           << log.buf_size << ".";

  return (true);
}
983
log_buffer_resize(log_t & log,size_t new_size)984 bool log_buffer_resize(log_t &log, size_t new_size) {
985 log_buffer_x_lock_enter(log);
986
987 const lsn_t end_lsn = log_get_lsn(log);
988
989 log_checkpointer_mutex_enter(log);
990 log_writer_mutex_enter(log);
991
992 const bool ret = log_buffer_resize_low(log, new_size, end_lsn);
993
994 log_writer_mutex_exit(log);
995 log_checkpointer_mutex_exit(log);
996 log_buffer_x_lock_exit(log);
997
998 return (ret);
999 }
1000
/** Changes the size of the write-ahead buffer, serialized with the log
writer thread via the writer mutex.
@param[in,out]  log       redo log
@param[in]      new_size  new size of the write-ahead buffer (bytes),
                          must be within the [MIN, MAX] range below */
void log_write_ahead_resize(log_t &log, size_t new_size) {
  ut_a(new_size >= INNODB_LOG_WRITE_AHEAD_SIZE_MIN);
  ut_a(new_size <= INNODB_LOG_WRITE_AHEAD_SIZE_MAX);

  log_writer_mutex_enter(log);

  log_deallocate_write_ahead_buffer(log);
  srv_log_write_ahead_size = static_cast<ulong>(new_size);

  /* Re-align the remembered end of the last write-ahead region down
  to a boundary of the new granularity. */
  log.write_ahead_end_offset =
      ut_uint64_align_down(log.write_ahead_end_offset, new_size);

  log_allocate_write_ahead_buffer(log);

  log_writer_mutex_exit(log);
}
1017
/** Recomputes the log buffer size fields (log.buf_size and its
sn-translated counterpart log.buf_size_sn) from srv_log_buffer_size.
Must be called after (re)allocating the log buffer.
@param[in,out]  log   redo log */
static void log_calc_buf_size(log_t &log) {
  ut_a(srv_log_buffer_size >= INNODB_LOG_BUFFER_SIZE_MIN);
  ut_a(srv_log_buffer_size <= INNODB_LOG_BUFFER_SIZE_MAX);

  log.buf_size = srv_log_buffer_size;

  /* The following update has to be the last operation during resize
  procedure of log buffer. That's because since this moment, possibly
  new concurrent writes for higher sn will start (which were waiting
  for free space in the log buffer). */

  log.buf_size_sn = log_translate_lsn_to_sn(log.buf_size);
}
1031
1032 /* @} */
1033
1034 /**************************************************/ /**
1035
1036 @name Allocation / deallocation of buffers
1037
1038 *******************************************************/
1039
1040 /* @{ */
1041
/** Allocates the redo log buffer of srv_log_buffer_size bytes, after
validating that the configured size is within the allowed range.
@param[in,out]  log   redo log */
static void log_allocate_buffer(log_t &log) {
  ut_a(srv_log_buffer_size >= INNODB_LOG_BUFFER_SIZE_MIN);
  ut_a(srv_log_buffer_size <= INNODB_LOG_BUFFER_SIZE_MAX);
  ut_a(srv_log_buffer_size >= 4 * UNIV_PAGE_SIZE);

  log.buf.create(srv_log_buffer_size);
}
1049
log_deallocate_buffer(log_t & log)1050 static void log_deallocate_buffer(log_t &log) { log.buf.destroy(); }
1051
/** Allocates the write-ahead buffer of srv_log_write_ahead_size bytes,
after validating that the configured size is within the allowed range.
@param[in,out]  log   redo log */
static void log_allocate_write_ahead_buffer(log_t &log) {
  ut_a(srv_log_write_ahead_size >= INNODB_LOG_WRITE_AHEAD_SIZE_MIN);
  ut_a(srv_log_write_ahead_size <= INNODB_LOG_WRITE_AHEAD_SIZE_MAX);

  log.write_ahead_buf_size = srv_log_write_ahead_size;
  log.write_ahead_buf.create(log.write_ahead_buf_size);
}
1059
/** Deallocates the write-ahead buffer.
@param[in,out]  log   redo log */
static void log_deallocate_write_ahead_buffer(log_t &log) {
  log.write_ahead_buf.destroy();
}
1063
/** Allocates the checkpoint buffer (a single log block, used for
writing checkpoint headers).
@param[in,out]  log   redo log */
static void log_allocate_checkpoint_buffer(log_t &log) {
  log.checkpoint_buf.create(OS_FILE_LOG_BLOCK_SIZE);
}
1067
/** Deallocates the checkpoint buffer.
@param[in,out]  log   redo log */
static void log_deallocate_checkpoint_buffer(log_t &log) {
  log.checkpoint_buf.destroy();
}
1071
log_allocate_flush_events(log_t & log)1072 static void log_allocate_flush_events(log_t &log) {
1073 const size_t n = srv_log_flush_events;
1074
1075 ut_a(log.flush_events == nullptr);
1076 ut_a(n >= 1);
1077 ut_a((n & (n - 1)) == 0);
1078
1079 log.flush_events_size = n;
1080 log.flush_events = UT_NEW_ARRAY_NOKEY(os_event_t, n);
1081
1082 for (size_t i = 0; i < log.flush_events_size; ++i) {
1083 log.flush_events[i] = os_event_create();
1084 }
1085 }
1086
log_deallocate_flush_events(log_t & log)1087 static void log_deallocate_flush_events(log_t &log) {
1088 ut_a(log.flush_events != nullptr);
1089
1090 for (size_t i = 0; i < log.flush_events_size; ++i) {
1091 os_event_destroy(log.flush_events[i]);
1092 }
1093
1094 UT_DELETE_ARRAY(log.flush_events);
1095 log.flush_events = nullptr;
1096 }
1097
log_allocate_write_events(log_t & log)1098 static void log_allocate_write_events(log_t &log) {
1099 const size_t n = srv_log_write_events;
1100
1101 ut_a(log.write_events == nullptr);
1102 ut_a(n >= 1);
1103 ut_a((n & (n - 1)) == 0);
1104
1105 log.write_events_size = n;
1106 log.write_events = UT_NEW_ARRAY_NOKEY(os_event_t, n);
1107
1108 for (size_t i = 0; i < log.write_events_size; ++i) {
1109 log.write_events[i] = os_event_create();
1110 }
1111 }
1112
log_deallocate_write_events(log_t & log)1113 static void log_deallocate_write_events(log_t &log) {
1114 ut_a(log.write_events != nullptr);
1115
1116 for (size_t i = 0; i < log.write_events_size; ++i) {
1117 os_event_destroy(log.write_events[i]);
1118 }
1119
1120 UT_DELETE_ARRAY(log.write_events);
1121 log.write_events = nullptr;
1122 }
1123
/** Creates the recent_written link buffer (tracks completed writes to
the log buffer) with capacity srv_log_recent_written_size.
@param[in,out]  log   redo log */
static void log_allocate_recent_written(log_t &log) {
  log.recent_written = Link_buf<lsn_t>{srv_log_recent_written_size};
}
/** Validates that the recent_written link buffer is empty (no pending
links) and replaces it with an empty (deallocated) one.
@param[in,out]  log   redo log */
static void log_deallocate_recent_written(log_t &log) {
  log.recent_written.validate_no_links();
  log.recent_written = {};
}
1131
/** Creates the recent_closed link buffer (tracks dirty pages added to
flush lists) with capacity srv_log_recent_closed_size.
@param[in,out]  log   redo log */
static void log_allocate_recent_closed(log_t &log) {
  log.recent_closed = Link_buf<lsn_t>{srv_log_recent_closed_size};
}
1135
/** Validates that the recent_closed link buffer is empty (no pending
links) and replaces it with an empty (deallocated) one.
@param[in,out]  log   redo log */
static void log_deallocate_recent_closed(log_t &log) {
  log.recent_closed.validate_no_links();
  log.recent_closed = {};
}
1140
log_allocate_file_header_buffers(log_t & log)1141 static void log_allocate_file_header_buffers(log_t &log) {
1142 const uint32_t n_files = log.n_files;
1143
1144 using Buf_ptr = aligned_array_pointer<byte, OS_FILE_LOG_BLOCK_SIZE>;
1145
1146 log.file_header_bufs = UT_NEW_ARRAY_NOKEY(Buf_ptr, n_files);
1147
1148 for (uint32_t i = 0; i < n_files; i++) {
1149 log.file_header_bufs[i].create(LOG_FILE_HDR_SIZE);
1150 }
1151 }
1152
/** Deallocates the array of per-file header buffers and resets the
pointer. Each element's destructor releases its own buffer.
@param[in,out]  log   redo log */
static void log_deallocate_file_header_buffers(log_t &log) {
  ut_a(log.n_files > 0);
  ut_a(log.file_header_bufs != nullptr);

  UT_DELETE_ARRAY(log.file_header_bufs);
  log.file_header_bufs = nullptr;
}
1160
1161 /* @} */
1162
1163 /**************************************************/ /**
1164
1165 @name Log position locking (for replication)
1166
1167 *******************************************************/
1168
1169 /* @{ */
1170
/** Locks the redo log position (current lsn and checkpoint lsn) so it
can be read consistently (used by replication). Lock order - buffer
x-lock first, then checkpointer mutex - is the reverse of
log_position_unlock().
@param[in,out]  log   redo log */
void log_position_lock(log_t &log) {
  log_buffer_x_lock_enter(log);

  log_checkpointer_mutex_enter(log);
}
1176
/** Unlocks the redo log position previously locked by
log_position_lock(), releasing in reverse order of acquisition.
@param[in,out]  log   redo log */
void log_position_unlock(log_t &log) {
  log_checkpointer_mutex_exit(log);

  log_buffer_x_lock_exit(log);
}
1182
log_position_collect_lsn_info(const log_t & log,lsn_t * current_lsn,lsn_t * checkpoint_lsn)1183 void log_position_collect_lsn_info(const log_t &log, lsn_t *current_lsn,
1184 lsn_t *checkpoint_lsn) {
1185 ut_ad(log_buffer_x_lock_own(log));
1186 ut_ad(log_checkpointer_mutex_own(log));
1187
1188 *checkpoint_lsn = log.last_checkpoint_lsn.load();
1189
1190 *current_lsn = log_get_lsn(log);
1191
1192 /* Ensure we have redo log started. */
1193 ut_a(*current_lsn >= LOG_START_LSN);
1194 ut_a(*checkpoint_lsn >= LOG_START_LSN);
1195
1196 /* Obviously current lsn cannot point to before checkpoint. */
1197 ut_a(*current_lsn >= *checkpoint_lsn);
1198 }
1199
1200 /* @} */
1201
1202 #endif /* !UNIV_HOTBACKUP */
1203