1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Google Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11 
12 This program is free software; you can redistribute it and/or modify it under
13 the terms of the GNU General Public License, version 2.0, as published by the
14 Free Software Foundation.
15 
16 This program is also distributed with certain software (including but not
17 limited to OpenSSL) that is licensed under separate terms, as designated in a
18 particular file or component or in included license documentation. The authors
19 of MySQL hereby grant you an additional permission to link the program and
20 your derivative works with the separately licensed software that they have
21 included with MySQL.
22 
23 This program is distributed in the hope that it will be useful, but WITHOUT
24 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
25 FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
26 for more details.
27 
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
31 
32 *****************************************************************************/
33 
34 /**************************************************/ /**
35  @file log/log0log.cc
36 
37  Redo log system - provides durability for unflushed modifications
38  to contents of data pages.
39 
40  This file covers general maintenance, including:
41  -# Allocation and deallocation of the redo log data structures.
42  -# Initialization and shutdown of the redo log.
43  -# Start / stop for the log background threads.
44  -# Runtime updates of server variables.
45  -# Extending size of the redo log buffers.
46  -# Locking redo position (for replication).
47 
48  Code responsible for writing to redo log could be found in log0buf.cc,
49  log0write.cc, and log0log.ic. The log writer, flusher, write notifier,
50  flush notifier, and closer threads are implemented in log0write.cc.
51 
52  Code responsible for checkpoints could be found in log0chkp.cc.
53  The log checkpointer thread is implemented there.
54 
55  Created 12/9/1995 Heikki Tuuri
56  *******************************************************/
57 
58 #include "log0types.h"
59 
60 /** Pointer to the log checksum calculation function. */
61 log_checksum_func_t log_checksum_algorithm_ptr;
62 
63 #ifndef UNIV_HOTBACKUP
64 
65 #include <debug_sync.h>
66 #include <sys/types.h>
67 #include <time.h>
68 #include "dict0boot.h"
69 #include "ha_prototypes.h"
70 #include "log0meb.h"
71 #include "os0thread-create.h"
72 #include "trx0sys.h"
73 
74 /**
75 @page PAGE_INNODB_REDO_LOG Innodb redo log
76 
77 @section sect_redo_log_general General idea of redo log
78 
79 The redo log is a write ahead log of changes applied to contents of data pages.
80 It provides durability for all changes applied to the pages. In case of crash,
81 it is used to recover modifications to pages that were modified but have not
82 been flushed to disk.
83 
84 @note In case of clean shutdown, the redo log should be logically empty.
85 This means that after the checkpoint lsn there should be no records to apply.
86 However the log files still could contain some old data (which is not used
87 during the recovery process).
88 
89 Every change to content of a data page must be done through a mini transaction
90 (so called mtr - mtr_t), which in mtr_commit() writes all its log records
91 to the redo log.
92 
93 @remarks Normally these changes are performed using the mlog_write_ulint()
94 or similar function. In some page-level operations, only a code number of
95 a c-function and its parameters are written to the redo log, to reduce the
96 size of the redo log. You should not add parameters to such functions
97 (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse()).
98 You should not add functionality which can either change when compared to older
99 versions, or which is dependent on data outside of the page being modified.
100 Therefore all functions must implement self-contained page transformation
101 and it should be unchanged if you don't have very essential reasons to change
102 the log semantics or format.
103 
104 Single mtr can cover changes to multiple pages. In case of crash, either the
105 whole set of changes from a given mtr is recovered or none of the changes.
106 
107 During life time of a mtr, a log of changes is collected inside an internal
108 buffer of the mtr. It contains multiple log records, which describe changes
109 applied to possibly different modified pages. When the mtr is committed, all
110 the log records are written to the log buffer within a single group of the log
111 records. Procedure:
112 
113 -# Total number of data bytes of log records is calculated.
114 -# Space for the log records is reserved. Range of lsn values is assigned for
115    a group of log records.
116 -# %Log records are written to the reserved space in the log buffer.
117 -# Modified pages are marked as dirty and moved to flush lists.
118    All the dirty pages are marked with the same range of lsn values.
119 -# Reserved space is closed.
120 
121 Background threads are responsible for writing of new changes in the log buffer
122 to the log files. User threads that require durability for the logged records,
123 have to wait until the log gets flushed up to the required point.
124 
125 During recovery only complete groups of log records are recovered and applied.
For example, if we had a rotation in a tree, which resulted in changes to three
nodes (pages), we have a guarantee that either the whole rotation is recovered
or nothing, so we will not end up with a tree that has an incorrect structure.
129 
130 Consecutive bytes written to the redo log are enumerated by the lsn values.
131 Every single byte written to the log buffer corresponds to current lsn
132 increased by one.
133 
134 Data in the redo log is structured in consecutive blocks of 512 bytes
135 (_OS_FILE_LOG_BLOCK_SIZE_). Each block contains a header of 12 bytes
136 (_LOG_BLOCK_HDR_SIZE_) and a footer of 4 bytes (_LOG_BLOCK_TRL_SIZE_).
137 These extra bytes are also enumerated by lsn values. Whenever we refer to
138 data bytes, we mean actual bytes of log records - not bytes of headers and
footers of log blocks. The sequence of enumerated data bytes is called the
140 sn values. All headers and footers of log blocks are added within the log
141 buffer, where data is actually stored in proper redo format.
142 
143 When a user transaction commits, extra mtr is committed (related to undo log),
144 and then user thread waits until the redo log is flushed up to the point,
145 where log records of that mtr end.
146 
147 When a dirty page is being flushed, a thread doing the flush, first needs to
148 wait until the redo log gets flushed up to the newest modification of the page.
149 Afterwards the page might be flushed. In case of crash, we might end up with
150 the newest version of the page and without any earlier versions of the page.
151 Then other pages, which potentially have not been flushed before the crash,
152 need to be recovered to that version. This applies to:
153         * pages modified within the same group of log records,
154         * and pages modified within any earlier group of log records.
155 
156 @section sect_redo_log_architecture Architecture of redo log
157 
158 @subsection subsect_redo_log_data_layers Data layers
159 
160 Redo log consists of following data layers:
161 
162 -# %Log files (typically 4 - 32 GB) - physical redo files that reside on
163    the disk.
164 
165 -# %Log buffer (64 MB by default) - groups data to write to log files,
166    formats data in proper way: include headers/footers of log blocks,
167    calculates checksums, maintains boundaries of record groups.
168 
169 -# %Log recent written buffer (e.g. 4MB) - tracks recent writes to the
170    log buffer. Allows to have concurrent writes to the log buffer and tracks
171    up to which lsn all such writes have been already finished.
172 
173 -# %Log recent closed buffer (e.g. 4MB) - tracks for which recent writes,
174    corresponding dirty pages have been already added to the flush lists.
175    Allows to relax order in which dirty pages have to be added to the flush
176    lists and tracks up to which lsn, all dirty pages have been added.
177    This is required to not make checkpoint at lsn which is larger than
178    oldest_modification of some dirty page, which still has not been added
179    to the flush list (because user thread was scheduled out).
180 
181 -# %Log write ahead buffer (e.g. 4kB) - used to write ahead more bytes
182    to the redo files, to avoid read-on-write problem. This buffer is also
183    used when we need to write an incomplete log block, which might
184    concurrently be receiving even more data from next user threads. In such
185    case we first copy the incomplete block to the write ahead buffer.
186 
187 @subsection subsect_redo_log_general_rules General rules
188 
189 -# User threads write their redo data only to the log buffer.
190 
191 -# User threads write concurrently to the log buffer, without synchronization
192    between each other.
193 
194 -# The log recent written buffer is maintained to track concurrent writes.
195 
196 -# Background log threads write and flush the log buffer to disk.
197 
-# User threads do not touch log files. Background log threads are the only
   ones allowed to touch the log files.
200 
201 -# User threads wait for the background threads when they need flushed redo.
202 
203 -# %Events per log block are exposed by redo log for users interested in waiting
204    for the flushed redo.
205 
206 -# Users can see up to which point log has been written / flushed.
207 
208 -# User threads need to wait if there is no space in the log buffer.
209 
210    @diafile storage/innobase/log/arch_writing.dia "Writing to the redo log"
211 
212 -# User threads add dirty pages to flush lists in the relaxed order.
213 
214 -# Order in which user threads reserve ranges of lsn values, order in which
215    they write to the log buffer, and order in which they add dirty pages to
216    flush lists, could all be three completely different orders.
217 
218 -# User threads do not write checkpoints (are not allowed to touch log files).
219 
220 -# Checkpoint is automatically written from time to time by a background thread.
221 
222 -# User threads can request a forced write of checkpoint and wait.
223 
224 -# User threads need to wait if there is no space in the log files.
225 
226    @diafile storage/innobase/log/arch_deleting.dia "Reclaiming space in the redo
227 log"
228 
229 -# Well thought out and tested set of _MONITOR_ counters is maintained and
230    documented.
231 
232 -# All settings are configurable through server variables, but the new server
233    variables are hidden unless a special _EXPERIMENTAL_ mode has been defined
234    when running cmake.
235 
236 -# All the new buffers could be resized dynamically during runtime. In practice,
237    only size of the log buffer is accessible without the _EXPERIMENTAL_ mode.
238 
239    @note
240    This is a functional change - the log buffer could be resized dynamically
241    by users (also decreased).
242 
243 @section sect_redo_log_lsn_values Glossary of lsn values
244 
245 Different fragments of head of the redo log are tracked by different values:
246   - @ref subsect_redo_log_write_lsn,
247   - @ref subsect_redo_log_buf_ready_for_write_lsn,
248   - @ref subsect_redo_log_sn.
249 
250 Different fragments of the redo log's tail are tracked by different values:
251   - @ref subsect_redo_log_buf_dirty_pages_added_up_to_lsn,
252   - @ref subsect_redo_log_available_for_checkpoint_lsn,
253   - @ref subsect_redo_log_last_checkpoint_lsn.
254 
255 @subsection subsect_redo_log_write_lsn log.write_lsn
256 
257 Up to this lsn we have written all data to log files. It's the beginning of
258 the unwritten log buffer. Older bytes in the buffer are not required and might
259 be overwritten in cyclic manner for lsn values larger by _log.buf_size_.
260 
261 Value is updated by: [log writer thread](@ref sect_redo_log_writer).
262 
263 @subsection subsect_redo_log_buf_ready_for_write_lsn log.buf_ready_for_write_lsn
264 
265 Up to this lsn, all concurrent writes to log buffer have been finished.
266 We don't need older part of the log recent-written buffer.
267 
268 It obviously holds:
269 
270         log.buf_ready_for_write_lsn >= log.write_lsn
271 
272 Value is updated by: [log writer thread](@ref sect_redo_log_writer).
273 
274 @subsection subsect_redo_log_flushed_to_disk_lsn log.flushed_to_disk_lsn
275 
276 Up to this lsn, we have written and flushed data to log files.
277 
278 It obviously holds:
279 
280         log.flushed_to_disk_lsn <= log.write_lsn
281 
282 Value is updated by: [log flusher thread](@ref sect_redo_log_flusher).
283 
284 @subsection subsect_redo_log_sn log.sn
285 
286 Corresponds to current lsn. Maximum assigned sn value (enumerates only
287 data bytes).
288 
289 It obviously holds:
290 
291         log.sn >= log_translate_lsn_to_sn(log.buf_ready_for_write_lsn)
292 
293 Value is updated by: user threads during reservation of space.
294 
295 @subsection subsect_redo_log_buf_dirty_pages_added_up_to_lsn
296 log.buf_dirty_pages_added_up_to_lsn
297 
298 Up to this lsn user threads have added all dirty pages to flush lists.
299 
300 The redo log records are allowed to be deleted not further than up to this lsn.
301 That's because there could be a page with _oldest_modification_ smaller than
302 the minimum _oldest_modification_ available in flush lists. Note that such page
303 is just about to be added to flush list by a user thread, but there is no mutex
304 protecting access to the minimum _oldest_modification_, which would be acquired
305 by the user thread before writing to redo log. Hence for any lsn greater than
306 _buf_dirty_pages_added_up_to_lsn_ we cannot trust that flush lists are complete
307 and minimum calculated value (or its approximation) is valid.
308 
309 @note
310 Note that we do not delete redo log records physically, but we still can delete
311 them logically by doing checkpoint at given lsn.
312 
313 It holds (unless the log writer thread misses an update of the
314 @ref subsect_redo_log_buf_ready_for_write_lsn):
315 
316         log.buf_dirty_pages_added_up_to_lsn <= log.buf_ready_for_write_lsn.
317 
318 Value is updated by: [log closer thread](@ref sect_redo_log_closer).
319 
320 @subsection subsect_redo_log_available_for_checkpoint_lsn
321 log.available_for_checkpoint_lsn
322 
323 Up to this lsn all dirty pages have been flushed to disk. However, this value
324 is not guaranteed to be the maximum such value. As insertion order to flush
325 lists is relaxed, the buf_pool_get_oldest_modification_approx() returns
326 modification time of some page that was inserted the earliest, it doesn't
327 have to be the oldest modification though. However, the maximum difference
328 between the first page in flush list, and one with the oldest modification
329 lsn is limited by the number of entries in the log recent closed buffer.
330 
331 That's why from result of buf_pool_get_oldest_modification_approx() size of
332 the log recent closed buffer is subtracted. The result is used to update the
333 lsn available for a next checkpoint.
334 
335 This has impact on the redo format, because the checkpoint_lsn can now point
336 to the middle of some group of log records (even to the middle of a single
337 log record). Log files with such checkpoint are not recoverable by older
338 versions of InnoDB by default.
339 
340 Value is updated by:
341 [log checkpointer thread](@ref sect_redo_log_checkpointer).
342 
343 @see @ref sect_redo_log_add_dirty_pages
344 
345 @subsection subsect_redo_log_last_checkpoint_lsn log.last_checkpoint_lsn
346 
347 Up to this lsn all dirty pages have been flushed to disk and the lsn value
348 has been flushed to header of the first log file (_ib_logfile0_).
349 
350 The lsn value points to place where recovery is supposed to start. Data bytes
351 for smaller lsn values are not required and might be overwritten (log files
352 are circular). One could consider them logically deleted.
353 
354 Value is updated by:
355 [log checkpointer thread](@ref sect_redo_log_checkpointer).
356 
357 It holds:
358 
359         log.last_checkpoint_lsn
360         <= log.available_for_checkpoint_lsn
361         <= log.buf_dirty_pages_added_up_to_lsn.
362 
363 
364 Read more about redo log details:
365 - @subpage PAGE_INNODB_REDO_LOG_BUF
366 - @subpage PAGE_INNODB_REDO_LOG_THREADS
367 - @subpage PAGE_INNODB_REDO_LOG_FORMAT
368 
369 *******************************************************/
370 
371 /** Redo log system. Singleton used to populate global pointer. */
372 aligned_pointer<log_t> *log_sys_object;
373 
374 /** Redo log system (singleton). */
375 log_t *log_sys;
376 
377 #ifdef UNIV_PFS_THREAD
378 
379 /** PFS key for the log writer thread. */
380 mysql_pfs_key_t log_writer_thread_key;
381 
382 /** PFS key for the log closer thread. */
383 mysql_pfs_key_t log_closer_thread_key;
384 
385 /** PFS key for the log checkpointer thread. */
386 mysql_pfs_key_t log_checkpointer_thread_key;
387 
388 /** PFS key for the log flusher thread. */
389 mysql_pfs_key_t log_flusher_thread_key;
390 
391 /** PFS key for the log flush notifier thread. */
392 mysql_pfs_key_t log_flush_notifier_thread_key;
393 
394 /** PFS key for the log write notifier thread. */
395 mysql_pfs_key_t log_write_notifier_thread_key;
396 
397 #endif /* UNIV_PFS_THREAD */
398 
399 /** Calculates proper size for the log buffer and allocates the log buffer.
400 @param[out]	log	redo log */
401 static void log_allocate_buffer(log_t &log);
402 
403 /** Deallocates the log buffer.
404 @param[out]	log	redo log */
405 static void log_deallocate_buffer(log_t &log);
406 
407 /** Allocates the log write-ahead buffer (aligned to system page for
408 easier migrations between NUMA nodes).
409 @param[out]	log	redo log */
410 static void log_allocate_write_ahead_buffer(log_t &log);
411 
412 /** Deallocates the log write-ahead buffer.
413 @param[out]	log	redo log */
414 static void log_deallocate_write_ahead_buffer(log_t &log);
415 
416 /** Allocates the log checkpoint buffer (used to write checkpoint headers).
417 @param[out]	log	redo log */
418 static void log_allocate_checkpoint_buffer(log_t &log);
419 
420 /** Deallocates the log checkpoint buffer.
421 @param[out]	log	redo log */
422 static void log_deallocate_checkpoint_buffer(log_t &log);
423 
424 /** Allocates the array with flush events.
425 @param[out]	log	redo log */
426 static void log_allocate_flush_events(log_t &log);
427 
428 /** Deallocates the array with flush events.
429 @param[out]	log	redo log */
430 static void log_deallocate_flush_events(log_t &log);
431 
432 /** Deallocates the array with write events.
433 @param[out]	log	redo log */
434 static void log_deallocate_write_events(log_t &log);
435 
436 /** Allocates the array with write events.
437 @param[out]	log	redo log */
438 static void log_allocate_write_events(log_t &log);
439 
440 /** Allocates the log recent written buffer.
441 @param[out]	log	redo log */
442 static void log_allocate_recent_written(log_t &log);
443 
444 /** Deallocates the log recent written buffer.
445 @param[out]	log	redo log */
446 static void log_deallocate_recent_written(log_t &log);
447 
448 /** Allocates the log recent closed buffer.
449 @param[out]	log	redo log */
450 static void log_allocate_recent_closed(log_t &log);
451 
452 /** Deallocates the log recent closed buffer.
453 @param[out]	log	redo log */
454 static void log_deallocate_recent_closed(log_t &log);
455 
456 /** Allocates buffers for headers of the log files.
457 @param[out]	log	redo log */
458 static void log_allocate_file_header_buffers(log_t &log);
459 
460 /** Deallocates buffers for headers of the log files.
461 @param[out]	log	redo log */
462 static void log_deallocate_file_header_buffers(log_t &log);
463 
464 /** Calculates proper size of the log buffer and updates related fields.
465 Calculations are based on current value of srv_log_buffer_size. Note,
466 that the proper size of the log buffer should be a power of two.
467 @param[out]	log		redo log */
468 static void log_calc_buf_size(log_t &log);
469 
470 uint32_t log_detected_format = UINT32_MAX;
471 
472 /**************************************************/ /**
473 
474  @name	Initialization and finalization of log_sys
475 
476  *******************************************************/
477 
478 /* @{ */
479 
log_sys_init(uint32_t n_files,uint64_t file_size,space_id_t space_id)480 bool log_sys_init(uint32_t n_files, uint64_t file_size, space_id_t space_id) {
481   ut_a(log_sys == nullptr);
482 
483   /* The log_sys_object is pointer to aligned_pointer. That's
484   temporary solution until we refactor redo log more.
485 
486   That's required for now, because the aligned_pointer, has dtor
487   which tries to free the memory and as long as this is global
488   variable it will have the dtor called. However because we can
489   exit without proper cleanup for redo log in some cases, we
490   need to forbid dtor calls then. */
491 
492   log_sys_object = UT_NEW_NOKEY(aligned_pointer<log_t>{});
493 
494   log_sys_object->create();
495   log_sys = *log_sys_object;
496 
497   log_t &log = *log_sys;
498 
499   /* Initialize simple value fields. */
500   log.dict_persist_margin.store(0);
501   log.periodical_checkpoints_enabled = false;
502   if (log_detected_format != UINT32_MAX) {
503     ut_a(log_detected_format <= LOG_HEADER_FORMAT_CURRENT);
504     log.format = log_detected_format;
505   } else {
506     log.format = LOG_HEADER_FORMAT_CURRENT;
507   }
508 
509   log.files_space_id = space_id;
510   log.state = log_state_t::OK;
511   log.n_log_ios_old = log.n_log_ios;
512   log.last_printout_time = time(nullptr);
513   ut_d(log.first_block_is_correct_for_lsn = 0);
514 
515   ut_a(file_size <= std::numeric_limits<uint64_t>::max() / n_files);
516   log.file_size = file_size;
517   log.n_files = n_files;
518   log.files_real_capacity = file_size * n_files;
519 
520   log.current_file_lsn = LOG_START_LSN;
521   log.current_file_real_offset = LOG_FILE_HDR_SIZE;
522   log_files_update_offsets(log, log.current_file_lsn);
523 
524   log.checkpointer_event = os_event_create();
525   log.closer_event = os_event_create();
526   log.write_notifier_event = os_event_create();
527   log.flush_notifier_event = os_event_create();
528   log.writer_event = os_event_create();
529   log.flusher_event = os_event_create();
530 
531   mutex_create(LATCH_ID_LOG_CHECKPOINTER, &log.checkpointer_mutex);
532   mutex_create(LATCH_ID_LOG_CLOSER, &log.closer_mutex);
533   mutex_create(LATCH_ID_LOG_WRITER, &log.writer_mutex);
534   mutex_create(LATCH_ID_LOG_FLUSHER, &log.flusher_mutex);
535   mutex_create(LATCH_ID_LOG_WRITE_NOTIFIER, &log.write_notifier_mutex);
536   mutex_create(LATCH_ID_LOG_FLUSH_NOTIFIER, &log.flush_notifier_mutex);
537   mutex_create(LATCH_ID_LOG_LIMITS, &log.limits_mutex);
538 
539   log.sn_lock.create(
540 #ifdef UNIV_PFS_RWLOCK
541       log_sn_lock_key,
542 #else
543       PSI_NOT_INSTRUMENTED,
544 #endif
545       SYNC_LOG_SN, 64);
546 
547   /* Allocate buffers. */
548   log_allocate_buffer(log);
549   log_allocate_write_ahead_buffer(log);
550   log_allocate_checkpoint_buffer(log);
551   log_allocate_recent_written(log);
552   log_allocate_recent_closed(log);
553   log_allocate_flush_events(log);
554   log_allocate_write_events(log);
555   log_allocate_file_header_buffers(log);
556 
557   log_calc_buf_size(log);
558   log_calc_max_ages(log);
559 
560   log.m_crash_unsafe = false;
561   log.m_disable = false;
562   log.m_first_file_lsn = LOG_START_LSN;
563 
564   if (!log_calc_concurrency_margin(log)) {
565     ib::error(ER_IB_MSG_1267)
566         << "Cannot continue operation. ib_logfiles are too"
567         << " small for innodb_thread_concurrency " << srv_thread_concurrency
568         << ". The combined size of"
569         << " ib_logfiles should be bigger than"
570         << " 200 kB * innodb_thread_concurrency. To get mysqld"
571         << " to start up, set innodb_thread_concurrency in"
572         << " my.cnf to a lower value, for example, to 8. After"
573         << " an ERROR-FREE shutdown of mysqld you can adjust"
574         << " the size of ib_logfiles. " << INNODB_PARAMETERS_MSG;
575 
576     return (false);
577   }
578 
579   return (true);
580 }
581 
/** Starts the redo log at the given lsn: seeds the lsn-related fields,
the recent written / recent closed buffers, and the header of the first
(possibly incomplete) block in the log buffer. Expected to be called
after log_sys_init() and before the log background threads are started.
@param[in,out]	log		redo log
@param[in]	checkpoint_no	checkpoint number to continue from
@param[in]	checkpoint_lsn	lsn of the last checkpoint
@param[in]	start_lsn	lsn from which new writes will continue */
void log_start(log_t &log, checkpoint_no_t checkpoint_no, lsn_t checkpoint_lsn,
               lsn_t start_lsn) {
  ut_a(log_sys != nullptr);
  /* The checkpoint cannot point into the very first log block, and the
  point we continue writing from cannot precede the checkpoint. */
  ut_a(checkpoint_lsn >= OS_FILE_LOG_BLOCK_SIZE);
  ut_a(checkpoint_lsn >= LOG_START_LSN);
  ut_a(start_lsn >= checkpoint_lsn);

  /* Reset write-request statistics. */
  log.write_to_file_requests_total.store(0);
  log.write_to_file_requests_interval.store(0);

  log.recovered_lsn = start_lsn;
  log.last_checkpoint_lsn = checkpoint_lsn;
  log.next_checkpoint_no = checkpoint_no;
  log.available_for_checkpoint_lsn = checkpoint_lsn;

  /* sn enumerates only data bytes; lsn additionally counts block
  headers and footers. */
  log.sn = log_translate_lsn_to_sn(log.recovered_lsn);

  /* If start_lsn points exactly at a block footer or a block header,
  advance it past those non-data bytes so it addresses a data byte. */
  if ((start_lsn + LOG_BLOCK_TRL_SIZE) % OS_FILE_LOG_BLOCK_SIZE == 0) {
    start_lsn += LOG_BLOCK_TRL_SIZE + LOG_BLOCK_HDR_SIZE;
  } else if (start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0) {
    start_lsn += LOG_BLOCK_HDR_SIZE;
  }
  ut_a(start_lsn > LOG_START_LSN);

  /* Seed the recent-written buffer: everything up to start_lsn counts
  as already written to the log buffer. */
  log.recent_written.add_link(0, start_lsn);
  log.recent_written.advance_tail();
  ut_a(log_buffer_ready_for_write_lsn(log) == start_lsn);

  /* Seed the recent-closed buffer: all dirty pages up to start_lsn
  count as already added to flush lists. */
  log.recent_closed.add_link(0, start_lsn);
  log.recent_closed.advance_tail();
  ut_a(log_buffer_dirty_pages_added_up_to_lsn(log) == start_lsn);

  log.write_lsn = start_lsn;
  log.flushed_to_disk_lsn = start_lsn;

  log_files_update_offsets(log, start_lsn);

  /* Align the write-ahead boundary up to the configured write-ahead
  chunk size. */
  log.write_ahead_end_offset = ut_uint64_align_up(log.current_file_real_offset,
                                                  srv_log_write_ahead_size);

  lsn_t block_lsn;
  byte *block;

  /* Locate (within the cyclic log buffer) the block that contains
  start_lsn and prepare its header. */
  block_lsn = ut_uint64_align_down(start_lsn, OS_FILE_LOG_BLOCK_SIZE);

  /* A whole log block must fit without wrapping inside the buffer. */
  ut_a(block_lsn % log.buf_size + OS_FILE_LOG_BLOCK_SIZE <= log.buf_size);

  block = static_cast<byte *>(log.buf) + block_lsn % log.buf_size;

  log_block_set_hdr_no(block, log_block_convert_lsn_to_no(block_lsn));

  log_block_set_flush_bit(block, true);

  /* Data length covers header bytes plus the data written so far. */
  log_block_set_data_len(block, start_lsn - block_lsn);

  const auto first_rec_group = log_block_get_first_rec_group(block);

  /* Debug builds: recovery must have verified that the first block is
  correct for exactly this start_lsn. */
  ut_ad(log.first_block_is_correct_for_lsn == start_lsn);
  ut_ad(first_rec_group >= LOG_BLOCK_HDR_SIZE);
  ut_a(first_rec_group <= start_lsn - block_lsn);

  log_update_buf_limit(log, start_lsn);
  log_update_limits(log);

  /* Do not reorder writes above, below this line. For x86 this
  protects only from unlikely compile-time reordering. */
  std::atomic_thread_fence(std::memory_order_release);
}
650 
log_sys_close()651 void log_sys_close() {
652   ut_a(log_sys != nullptr);
653 
654   log_t &log = *log_sys;
655 
656   log_deallocate_file_header_buffers(log);
657   log_deallocate_write_events(log);
658   log_deallocate_flush_events(log);
659   log_deallocate_recent_closed(log);
660   log_deallocate_recent_written(log);
661   log_deallocate_checkpoint_buffer(log);
662   log_deallocate_write_ahead_buffer(log);
663   log_deallocate_buffer(log);
664 
665   log.sn_lock.free();
666 
667   mutex_free(&log.limits_mutex);
668   mutex_free(&log.write_notifier_mutex);
669   mutex_free(&log.flush_notifier_mutex);
670   mutex_free(&log.flusher_mutex);
671   mutex_free(&log.writer_mutex);
672   mutex_free(&log.closer_mutex);
673   mutex_free(&log.checkpointer_mutex);
674 
675   os_event_destroy(log.write_notifier_event);
676   os_event_destroy(log.flush_notifier_event);
677   os_event_destroy(log.closer_event);
678   os_event_destroy(log.checkpointer_event);
679   os_event_destroy(log.writer_event);
680   os_event_destroy(log.flusher_event);
681 
682   log_sys_object->destroy();
683 
684   UT_DELETE(log_sys_object);
685   log_sys_object = nullptr;
686 
687   log_sys = nullptr;
688 }
689 
690 /* @} */
691 
692 /**************************************************/ /**
693 
694  @name	Start / stop of background threads
695 
696  *******************************************************/
697 
698 /* @{ */
699 
/** Asserts that the log writer thread is active.
@param[in]	log	redo log (unused in this body) */
void log_writer_thread_active_validate(const log_t &log) {
  ut_a(log_writer_is_active());
}
703 
/** Asserts that the log closer thread is active.
@param[in]	log	redo log (unused in this body) */
void log_closer_thread_active_validate(const log_t &log) {
  ut_a(log_closer_is_active());
}
707 
/** Asserts that the threads which move data from the log buffer to
disk (log writer and log flusher) are active, and (debug builds only)
that redo writes have not been disabled.
@param[in]	log	redo log */
void log_background_write_threads_active_validate(const log_t &log) {
  ut_ad(!log.disable_redo_writes);

  ut_a(log_writer_is_active());
  ut_a(log_flusher_is_active());
}
714 
/** Asserts that all six log background threads (writer, flusher,
write notifier, flush notifier, closer, checkpointer) are active.
@param[in]	log	redo log */
void log_background_threads_active_validate(const log_t &log) {
  /* Covers the writer and flusher threads. */
  log_background_write_threads_active_validate(log);

  ut_a(log_write_notifier_is_active());
  ut_a(log_flush_notifier_is_active());
  ut_a(log_closer_is_active());
  ut_a(log_checkpointer_is_active());
}
723 
/** Asserts that none of the six log background threads is active.
@param[in]	log	redo log (unused in this body) */
void log_background_threads_inactive_validate(const log_t &log) {
  ut_a(!log_checkpointer_is_active());
  ut_a(!log_closer_is_active());
  ut_a(!log_write_notifier_is_active());
  ut_a(!log_flush_notifier_is_active());
  ut_a(!log_writer_is_active());
  ut_a(!log_flusher_is_active());
}
732 
/** Creates and starts all log background threads (checkpointer,
closer, flush notifier, flusher, write notifier, writer) and validates
that they became active. Requires that no log background thread is
running, that the server is not in read-only mode, and that log_start()
has already been called (log.sn > 0).
@param[in,out]	log	redo log */
void log_start_background_threads(log_t &log) {
  ib::info(ER_IB_MSG_1258) << "Log background threads are being started...";

  log_background_threads_inactive_validate(log);

  ut_ad(!log.disable_redo_writes);
  ut_a(!srv_read_only_mode);
  ut_a(log.sn.load() > 0);

  /* Clear the stop request before any thread is spawned, so that none
  of them exits immediately after starting. */
  log.should_stop_threads.store(false);

  srv_threads.m_log_checkpointer =
      os_thread_create(log_checkpointer_thread_key, log_checkpointer, &log);

  srv_threads.m_log_closer =
      os_thread_create(log_closer_thread_key, log_closer, &log);

  srv_threads.m_log_flush_notifier =
      os_thread_create(log_flush_notifier_thread_key, log_flush_notifier, &log);

  srv_threads.m_log_flusher =
      os_thread_create(log_flusher_thread_key, log_flusher, &log);

  srv_threads.m_log_write_notifier =
      os_thread_create(log_write_notifier_thread_key, log_write_notifier, &log);

  srv_threads.m_log_writer =
      os_thread_create(log_writer_thread_key, log_writer, &log);

  /* All thread objects exist now - start them. */
  srv_threads.m_log_checkpointer.start();
  srv_threads.m_log_closer.start();
  srv_threads.m_log_flush_notifier.start();
  srv_threads.m_log_flusher.start();
  srv_threads.m_log_write_notifier.start();
  srv_threads.m_log_writer.start();

  log_background_threads_active_validate(log);

#ifndef XTRABACKUP
  /* Redo log archiving is initialized only in non-XtraBackup builds. */
  meb::redo_log_archive_init();
#endif
}
775 
log_stop_background_threads(log_t & log)776 void log_stop_background_threads(log_t &log) {
777   /* We cannot stop threads when x-lock is acquired, because of scenario:
778           * log_checkpointer starts log_checkpoint()
779           * log_checkpoint() asks to persist dd dynamic metadata
780           * dict_persist_dd_table_buffer() tries to write to redo
781           * but cannot acquire shared lock on log.sn_lock
782           * so log_checkpointer thread waits for this thread
783             until the x-lock is released
784           * but this thread waits until log background threads
785             have been stopped - log_checkpointer is not stopped. */
786   ut_ad(!log.sn_lock.x_own());
787 
788   ib::info(ER_IB_MSG_1259) << "Log background threads are being closed...";
789 
790   meb::redo_log_archive_deinit();
791 
792   log_background_threads_active_validate(log);
793 
794   ut_a(!srv_read_only_mode);
795 
796   log.should_stop_threads.store(true);
797 
798   /* Wait until threads are closed. */
799   while (log_writer_is_active()) {
800     os_event_set(log.writer_event);
801     os_thread_sleep(10);
802   }
803   while (log_write_notifier_is_active()) {
804     os_event_set(log.write_notifier_event);
805     os_thread_sleep(10);
806   }
807   while (log_flusher_is_active()) {
808     os_event_set(log.flusher_event);
809     os_thread_sleep(10);
810   }
811   while (log_flush_notifier_is_active()) {
812     os_event_set(log.flush_notifier_event);
813     os_thread_sleep(10);
814   }
815   while (log_closer_is_active()) {
816     os_event_set(log.closer_event);
817     os_thread_sleep(10);
818   }
819   while (log_checkpointer_is_active()) {
820     os_event_set(log.checkpointer_event);
821     os_thread_sleep(10);
822   }
823 
824   log_background_threads_inactive_validate(log);
825 }
826 
/** Requests the log background threads to stop, without waiting for them to
exit: sets the stop flag first, then wakes the threads so they observe it.
@param[in,out]  log  redo log */
void log_stop_background_threads_nowait(log_t &log) {
  /* The store must happen before the wake-up, so a woken thread is
  guaranteed to see the stop request. */
  log.should_stop_threads.store(true);
  log_wake_threads(log);
}
831 
log_wake_threads(log_t & log)832 void log_wake_threads(log_t &log) {
833   if (log_closer_is_active()) {
834     os_event_set(log.closer_event);
835   }
836   if (log_checkpointer_is_active()) {
837     os_event_set(log.checkpointer_event);
838   }
839   if (log_writer_is_active()) {
840     os_event_set(log.writer_event);
841   }
842   if (log_flusher_is_active()) {
843     os_event_set(log.flusher_event);
844   }
845   if (log_write_notifier_is_active()) {
846     os_event_set(log.write_notifier_event);
847   }
848 }
849 
850 /* @} */
851 
852 /**************************************************/ /**
853 
854  @name	Status printing
855 
856  *******************************************************/
857 
858 /* @{ */
859 
log_print(const log_t & log,FILE * file)860 void log_print(const log_t &log, FILE *file) {
861   lsn_t last_checkpoint_lsn;
862   lsn_t dirty_pages_added_up_to_lsn;
863   lsn_t ready_for_write_lsn;
864   lsn_t write_lsn;
865   lsn_t flush_lsn;
866   lsn_t max_assigned_lsn;
867   lsn_t current_lsn;
868   lsn_t oldest_lsn;
869 
870   last_checkpoint_lsn = log.last_checkpoint_lsn.load();
871   dirty_pages_added_up_to_lsn = log_buffer_dirty_pages_added_up_to_lsn(log);
872   ready_for_write_lsn = log_buffer_ready_for_write_lsn(log);
873   write_lsn = log.write_lsn.load();
874   flush_lsn = log.flushed_to_disk_lsn.load();
875   max_assigned_lsn = log_get_lsn(log);
876   current_lsn = log_get_lsn(log);
877 
878   log_limits_mutex_enter(log);
879   oldest_lsn = log.available_for_checkpoint_lsn;
880   log_limits_mutex_exit(log);
881 
882   fprintf(file,
883           "Log sequence number          " LSN_PF
884           "\n"
885           "Log buffer assigned up to    " LSN_PF
886           "\n"
887           "Log buffer completed up to   " LSN_PF
888           "\n"
889           "Log written up to            " LSN_PF
890           "\n"
891           "Log flushed up to            " LSN_PF
892           "\n"
893           "Added dirty pages up to      " LSN_PF
894           "\n"
895           "Pages flushed up to          " LSN_PF
896           "\n"
897           "Last checkpoint at           " LSN_PF "\n",
898           current_lsn, max_assigned_lsn, ready_for_write_lsn, write_lsn,
899           flush_lsn, dirty_pages_added_up_to_lsn, oldest_lsn,
900           last_checkpoint_lsn);
901 
902   time_t current_time = time(nullptr);
903 
904   double time_elapsed = difftime(current_time, log.last_printout_time);
905 
906   if (time_elapsed <= 0) {
907     time_elapsed = 1;
908   }
909 
910   fprintf(
911       file, ULINTPF " log i/o's done, %.2f log i/o's/second\n",
912       ulint(log.n_log_ios),
913       static_cast<double>(log.n_log_ios - log.n_log_ios_old) / time_elapsed);
914 
915   log.n_log_ios_old = log.n_log_ios;
916   log.last_printout_time = current_time;
917 }
918 
/** Resets the log i/o statistics baseline used by log_print(), so the next
printout reports a rate measured from now.
@param[in,out]  log  redo log */
void log_refresh_stats(log_t &log) {
  log.n_log_ios_old = log.n_log_ios;
  log.last_printout_time = time(nullptr);
}
923 
924 /* @} */
925 
926 /**************************************************/ /**
927 
928  @name	Resizing of buffers
929 
930  *******************************************************/
931 
932 /* @{ */
933 
/** Resizes the log buffer to new_size, preserving the still-needed content
between the last written lsn and end_lsn. Caller must hold both the
checkpointer and the writer mutex (asserted below), so neither writes nor
checkpoints can move the copied region underneath us.
@param[in,out]  log       redo log
@param[in]      new_size  requested log buffer size, in bytes
@param[in]      end_lsn   lsn up to which the buffer content must survive
@return true iff the resize succeeded (the preserved region fit in new_size) */
bool log_buffer_resize_low(log_t &log, size_t new_size, lsn_t end_lsn) {
  ut_ad(log_checkpointer_mutex_own(log));
  ut_ad(log_writer_mutex_own(log));

  /* Preserve whole log blocks: round the window down/up to block size. */
  const lsn_t start_lsn =
      ut_uint64_align_down(log.write_lsn.load(), OS_FILE_LOG_BLOCK_SIZE);

  end_lsn = ut_uint64_align_up(end_lsn, OS_FILE_LOG_BLOCK_SIZE);

  /* Always keep at least one block, even if nothing is pending. */
  if (end_lsn == start_lsn) {
    end_lsn += OS_FILE_LOG_BLOCK_SIZE;
  }

  ut_ad(end_lsn - start_lsn <= log.buf_size);

  /* The data that must survive does not fit into the requested size. */
  if (end_lsn - start_lsn > new_size) {
    return (false);
  }

  /* Save the contents. The log buffer is a ring buffer, hence the
  modulo indexing; both bounds are block-aligned, so each memcpy copies
  one whole block that lies contiguously in the buffer. */
  byte *tmp_buf = UT_NEW_ARRAY_NOKEY(byte, end_lsn - start_lsn);
  for (auto i = start_lsn; i < end_lsn; i += OS_FILE_LOG_BLOCK_SIZE) {
    std::memcpy(&tmp_buf[i - start_lsn], &log.buf[i % log.buf_size],
                OS_FILE_LOG_BLOCK_SIZE);
  }

  /* Re-allocate log buffer. log_allocate_buffer() reads the new size
  from srv_log_buffer_size, so it must be set first. */
  srv_log_buffer_size = static_cast<ulong>(new_size);
  log_deallocate_buffer(log);
  log_allocate_buffer(log);

  /* Restore the contents at their positions modulo the new size. */
  for (auto i = start_lsn; i < end_lsn; i += OS_FILE_LOG_BLOCK_SIZE) {
    std::memcpy(&log.buf[i % new_size], &tmp_buf[i - start_lsn],
                OS_FILE_LOG_BLOCK_SIZE);
  }
  UT_DELETE_ARRAY(tmp_buf);

  /* Publish the new size; this unblocks writers waiting for space, so it
  must happen only after the buffer content has been restored. */
  log_calc_buf_size(log);

  log_update_buf_limit(log);

  ut_a(srv_log_buffer_size == log.buf_size);

  /* NOTE(review): the message says "extended" although this path also
  handles shrinking - consider rewording upstream. */
  ib::info(ER_IB_MSG_1260) << "srv_log_buffer_size was extended to "
                           << log.buf_size << ".";

  return (true);
}
983 
/** Public entry point for resizing the log buffer: takes x-lock on the
log buffer (stopping concurrent user writes), then the checkpointer and
writer mutexes in that order, and delegates to log_buffer_resize_low().
@param[in,out]  log       redo log
@param[in]      new_size  requested log buffer size, in bytes
@return true iff the resize succeeded */
bool log_buffer_resize(log_t &log, size_t new_size) {
  log_buffer_x_lock_enter(log);

  /* Content up to the current lsn must survive the resize. */
  const lsn_t end_lsn = log_get_lsn(log);

  log_checkpointer_mutex_enter(log);
  log_writer_mutex_enter(log);

  const bool ret = log_buffer_resize_low(log, new_size, end_lsn);

  /* Release in reverse order of acquisition. */
  log_writer_mutex_exit(log);
  log_checkpointer_mutex_exit(log);
  log_buffer_x_lock_exit(log);

  return (ret);
}
1000 
/** Resizes the write-ahead buffer to new_size, under the writer mutex so the
log writer cannot use the buffer while it is being swapped.
@param[in,out]  log       redo log
@param[in]      new_size  new write-ahead buffer size, in bytes (must be
                          within the INNODB_LOG_WRITE_AHEAD_SIZE_* limits) */
void log_write_ahead_resize(log_t &log, size_t new_size) {
  ut_a(new_size >= INNODB_LOG_WRITE_AHEAD_SIZE_MIN);
  ut_a(new_size <= INNODB_LOG_WRITE_AHEAD_SIZE_MAX);

  log_writer_mutex_enter(log);

  /* log_allocate_write_ahead_buffer() reads srv_log_write_ahead_size,
  so the old buffer is freed first and the size set before re-allocating. */
  log_deallocate_write_ahead_buffer(log);
  srv_log_write_ahead_size = static_cast<ulong>(new_size);

  /* Keep the end offset aligned to the new write-ahead unit. */
  log.write_ahead_end_offset =
      ut_uint64_align_down(log.write_ahead_end_offset, new_size);

  log_allocate_write_ahead_buffer(log);

  log_writer_mutex_exit(log);
}
1017 
/** Recomputes log.buf_size (and its sn counterpart) from the current value
of srv_log_buffer_size. Called at startup and at the end of a resize.
@param[in,out]  log  redo log */
static void log_calc_buf_size(log_t &log) {
  ut_a(srv_log_buffer_size >= INNODB_LOG_BUFFER_SIZE_MIN);
  ut_a(srv_log_buffer_size <= INNODB_LOG_BUFFER_SIZE_MAX);

  log.buf_size = srv_log_buffer_size;

  /* The following update has to be the last operation during resize
  procedure of log buffer. That's because since this moment, possibly
  new concurrent writes for higher sn will start (which were waiting
  for free space in the log buffer). */

  log.buf_size_sn = log_translate_lsn_to_sn(log.buf_size);
}
1031 
1032 /* @} */
1033 
1034 /**************************************************/ /**
1035 
1036  @name	Allocation / deallocation of buffers
1037 
1038  *******************************************************/
1039 
1040 /* @{ */
1041 
/** Allocates the log (ring) buffer with the size taken from
srv_log_buffer_size, after validating that the size is within bounds.
@param[in,out]  log  redo log */
static void log_allocate_buffer(log_t &log) {
  ut_a(srv_log_buffer_size >= INNODB_LOG_BUFFER_SIZE_MIN);
  ut_a(srv_log_buffer_size <= INNODB_LOG_BUFFER_SIZE_MAX);
  /* Additionally the buffer must span at least a few pages. */
  ut_a(srv_log_buffer_size >= 4 * UNIV_PAGE_SIZE);

  log.buf.create(srv_log_buffer_size);
}
1049 
log_deallocate_buffer(log_t & log)1050 static void log_deallocate_buffer(log_t &log) { log.buf.destroy(); }
1051 
/** Allocates the write-ahead buffer with the size taken from
srv_log_write_ahead_size, after validating that the size is within bounds.
@param[in,out]  log  redo log */
static void log_allocate_write_ahead_buffer(log_t &log) {
  ut_a(srv_log_write_ahead_size >= INNODB_LOG_WRITE_AHEAD_SIZE_MIN);
  ut_a(srv_log_write_ahead_size <= INNODB_LOG_WRITE_AHEAD_SIZE_MAX);

  log.write_ahead_buf_size = srv_log_write_ahead_size;
  log.write_ahead_buf.create(log.write_ahead_buf_size);
}
1059 
log_deallocate_write_ahead_buffer(log_t & log)1060 static void log_deallocate_write_ahead_buffer(log_t &log) {
1061   log.write_ahead_buf.destroy();
1062 }
1063 
log_allocate_checkpoint_buffer(log_t & log)1064 static void log_allocate_checkpoint_buffer(log_t &log) {
1065   log.checkpoint_buf.create(OS_FILE_LOG_BLOCK_SIZE);
1066 }
1067 
log_deallocate_checkpoint_buffer(log_t & log)1068 static void log_deallocate_checkpoint_buffer(log_t &log) {
1069   log.checkpoint_buf.destroy();
1070 }
1071 
log_allocate_flush_events(log_t & log)1072 static void log_allocate_flush_events(log_t &log) {
1073   const size_t n = srv_log_flush_events;
1074 
1075   ut_a(log.flush_events == nullptr);
1076   ut_a(n >= 1);
1077   ut_a((n & (n - 1)) == 0);
1078 
1079   log.flush_events_size = n;
1080   log.flush_events = UT_NEW_ARRAY_NOKEY(os_event_t, n);
1081 
1082   for (size_t i = 0; i < log.flush_events_size; ++i) {
1083     log.flush_events[i] = os_event_create();
1084   }
1085 }
1086 
log_deallocate_flush_events(log_t & log)1087 static void log_deallocate_flush_events(log_t &log) {
1088   ut_a(log.flush_events != nullptr);
1089 
1090   for (size_t i = 0; i < log.flush_events_size; ++i) {
1091     os_event_destroy(log.flush_events[i]);
1092   }
1093 
1094   UT_DELETE_ARRAY(log.flush_events);
1095   log.flush_events = nullptr;
1096 }
1097 
log_allocate_write_events(log_t & log)1098 static void log_allocate_write_events(log_t &log) {
1099   const size_t n = srv_log_write_events;
1100 
1101   ut_a(log.write_events == nullptr);
1102   ut_a(n >= 1);
1103   ut_a((n & (n - 1)) == 0);
1104 
1105   log.write_events_size = n;
1106   log.write_events = UT_NEW_ARRAY_NOKEY(os_event_t, n);
1107 
1108   for (size_t i = 0; i < log.write_events_size; ++i) {
1109     log.write_events[i] = os_event_create();
1110   }
1111 }
1112 
log_deallocate_write_events(log_t & log)1113 static void log_deallocate_write_events(log_t &log) {
1114   ut_a(log.write_events != nullptr);
1115 
1116   for (size_t i = 0; i < log.write_events_size; ++i) {
1117     os_event_destroy(log.write_events[i]);
1118   }
1119 
1120   UT_DELETE_ARRAY(log.write_events);
1121   log.write_events = nullptr;
1122 }
1123 
/** Creates the link buffer that tracks recently finished (written) regions
of the log buffer, sized by srv_log_recent_written_size.
@param[in,out]  log  redo log */
static void log_allocate_recent_written(log_t &log) {
  log.recent_written = Link_buf<lsn_t>{srv_log_recent_written_size};
}
/** Destroys the recent-written link buffer after asserting it holds no
pending links (all writes have been consumed).
@param[in,out]  log  redo log */
static void log_deallocate_recent_written(log_t &log) {
  log.recent_written.validate_no_links();
  /* Replace with an empty Link_buf; the old one's storage is released. */
  log.recent_written = {};
}
1131 
/** Creates the link buffer that tracks recently closed (dirty-pages-added)
regions of the log, sized by srv_log_recent_closed_size.
@param[in,out]  log  redo log */
static void log_allocate_recent_closed(log_t &log) {
  log.recent_closed = Link_buf<lsn_t>{srv_log_recent_closed_size};
}
1135 
/** Destroys the recent-closed link buffer after asserting it holds no
pending links.
@param[in,out]  log  redo log */
static void log_deallocate_recent_closed(log_t &log) {
  log.recent_closed.validate_no_links();
  /* Replace with an empty Link_buf; the old one's storage is released. */
  log.recent_closed = {};
}
1140 
log_allocate_file_header_buffers(log_t & log)1141 static void log_allocate_file_header_buffers(log_t &log) {
1142   const uint32_t n_files = log.n_files;
1143 
1144   using Buf_ptr = aligned_array_pointer<byte, OS_FILE_LOG_BLOCK_SIZE>;
1145 
1146   log.file_header_bufs = UT_NEW_ARRAY_NOKEY(Buf_ptr, n_files);
1147 
1148   for (uint32_t i = 0; i < n_files; i++) {
1149     log.file_header_bufs[i].create(LOG_FILE_HDR_SIZE);
1150   }
1151 }
1152 
/** Frees the per-file header buffers array. The element destructors
(run by UT_DELETE_ARRAY) release each aligned buffer.
@param[in,out]  log  redo log */
static void log_deallocate_file_header_buffers(log_t &log) {
  ut_a(log.n_files > 0);
  ut_a(log.file_header_bufs != nullptr);

  UT_DELETE_ARRAY(log.file_header_bufs);
  /* Null the pointer so a later allocate sees a clean state. */
  log.file_header_bufs = nullptr;
}
1160 
1161 /* @} */
1162 
1163 /**************************************************/ /**
1164 
1165  @name	Log position locking (for replication)
1166 
1167  *******************************************************/
1168 
1169 /* @{ */
1170 
/** Locks the redo log position for reading a consistent pair of
(current lsn, checkpoint lsn) - used for replication. Takes x-lock on the
log buffer first (stopping new user writes), then the checkpointer mutex
(freezing the checkpoint lsn). Release with log_position_unlock().
@param[in,out]  log  redo log */
void log_position_lock(log_t &log) {
  log_buffer_x_lock_enter(log);

  log_checkpointer_mutex_enter(log);
}
1176 
/** Releases the locks taken by log_position_lock(), in reverse order of
acquisition.
@param[in,out]  log  redo log */
void log_position_unlock(log_t &log) {
  log_checkpointer_mutex_exit(log);

  log_buffer_x_lock_exit(log);
}
1182 
/** Reads the current lsn and the last checkpoint lsn. Caller must hold the
locks taken by log_position_lock() (asserted below), which makes the pair of
values consistent with each other.
@param[in]   log             redo log
@param[out]  current_lsn     current (max assigned) lsn
@param[out]  checkpoint_lsn  last checkpoint lsn */
void log_position_collect_lsn_info(const log_t &log, lsn_t *current_lsn,
                                   lsn_t *checkpoint_lsn) {
  ut_ad(log_buffer_x_lock_own(log));
  ut_ad(log_checkpointer_mutex_own(log));

  *checkpoint_lsn = log.last_checkpoint_lsn.load();

  *current_lsn = log_get_lsn(log);

  /* Ensure we have redo log started. */
  ut_a(*current_lsn >= LOG_START_LSN);
  ut_a(*checkpoint_lsn >= LOG_START_LSN);

  /* Obviously current lsn cannot point to before checkpoint. */
  ut_a(*current_lsn >= *checkpoint_lsn);
}
1199 
1200 /* @} */
1201 
1202 #endif /* !UNIV_HOTBACKUP */
1203