1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, Google Inc.
5 Copyright (c) 2016, Percona Inc. All Rights Reserved.
6 
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 *****************************************************************************/
34 
35 /**************************************************//**
36 @file log/log0log.cc
37 Database log
38 
39 Created 12/9/1995 Heikki Tuuri
40 *******************************************************/
41 
42 #include "ha_prototypes.h"
43 #include <debug_sync.h>
44 
45 #include "log0log.h"
46 
47 #ifdef UNIV_NONINL
48 #include "log0log.ic"
49 #endif
50 
51 #include "mem0mem.h"
52 #include "buf0buf.h"
53 #ifndef UNIV_HOTBACKUP
54 #include "buf0flu.h"
55 #include "srv0srv.h"
56 #include "log0recv.h"
57 #include "lock0lock.h"
58 #include "fil0fil.h"
59 #include "fil0crypt.h"
60 #include "dict0boot.h"
61 #include "dict0stats_bg.h"
62 #include "srv0srv.h"
63 #include "srv0start.h"
64 #include "trx0sys.h"
65 #include "trx0trx.h"
66 #include "trx0roll.h"
67 #include "srv0mon.h"
68 #include "sync0sync.h"
69 #endif /* !UNIV_HOTBACKUP */
70 
71 #include "system_key.h"
72 
/* NOTE(review): presumably records the redo log encryption mode already in
effect, so that a change of mode can be detected — confirm against callers. */
redo_log_encrypt_enum existing_redo_encryption_mode = REDO_LOG_ENCRYPT_OFF;
74 
75 /*
76 General philosophy of InnoDB redo-logs:
77 
1) Every change to the contents of a data page must be done
through mtr, which in mtr_commit() writes log records
to the InnoDB redo log.
81 
82 2) Normally these changes are performed using a mlog_write_ulint()
83 or similar function.
84 
85 3) In some page level operations only a code number of a
86 c-function and its parameters are written to the log to
87 reduce the size of the log.
88 
89   3a) You should not add parameters to these kind of functions
90   (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse())
91 
  3b) You should not add functionality which either changes the
  behavior compared with the old implementation or depends on data
  outside of the page. Such functions should implement a
  self-contained page transformation and should be left unchanged
  unless you have very essential reasons to change the log
  semantics or format.
98 
99 */
100 
/** Redo log system */
log_t*	log_sys	= NULL;

/** Whether to generate and require checksums on the redo log pages */
my_bool	innodb_log_checksums;

/** Pointer to the log checksum calculation function */
log_checksum_func_t log_checksum_algorithm_ptr;

/* Next log block number to do dummy record filling if no log records written
for a while */
static ulint		next_lbn_to_pad = 0;

/* These control how often we print warnings if the last checkpoint is too
old */
bool	log_has_printed_chkp_warning = false;
time_t	log_last_warning_time;

/* Rate limiting state for the checkpoint-margin warning printed by
log_margin_checkpoint_age(). */
bool	log_has_printed_chkp_margine_warning = false;
time_t	log_last_margine_warning_time;

/* A margin for free space in the log buffer before a log entry is catenated */
#define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)

/* Margins for free space in the log buffer after a log entry is catenated */
#define LOG_BUF_FLUSH_RATIO	2
#define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE)

/* This parameter controls asynchronous making of a new checkpoint; the value
should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */

#define LOG_POOL_CHECKPOINT_RATIO_ASYNC	32

/* This parameter controls synchronous preflushing of modified buffer pages */
#define LOG_POOL_PREFLUSH_RATIO_SYNC	16

/* The same ratio for asynchronous preflushing; this value should be less than
the previous */
#define LOG_POOL_PREFLUSH_RATIO_ASYNC	8

/* Codes used in unlocking flush latches */
#define LOG_UNLOCK_NONE_FLUSHED_LOCK	1
#define LOG_UNLOCK_FLUSH_LOCK		2

/** Event to wake up log_scrub_thread */
os_event_t      log_scrub_event;
/** Whether log_scrub_thread is active */
bool            log_scrub_thread_active;

/* Forward declaration of the log scrubbing thread entry point, defined
later in this file. */
extern "C"
os_thread_ret_t
DECLARE_THREAD(log_scrub_thread)(void*);


/******************************************************//**
Completes a checkpoint write i/o to a log file. */
static
void
log_io_complete_checkpoint(void);
/*============================*/
161 
162 #ifndef UNIV_HOTBACKUP
163 /****************************************************************//**
164 Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
165 exists.
166 @return LSN of oldest modification */
167 static
168 lsn_t
log_buf_pool_get_oldest_modification(void)169 log_buf_pool_get_oldest_modification(void)
170 /*======================================*/
171 {
172 	lsn_t	lsn;
173 
174 	ut_ad(log_mutex_own());
175 
176 	lsn = buf_pool_get_oldest_modification();
177 
178 	if (!lsn) {
179 
180 		lsn = log_sys->lsn;
181 	}
182 
183 	return(lsn);
184 }
185 #endif  /* !UNIV_HOTBACKUP */
186 
187 /****************************************************************//**
188 Checks if the log groups have a big enough margin of free space in
189 so that a new log entry can be written without overwriting log data
190 that is not read by the changed page bitmap thread.
191 @return true if there is not enough free space. */
192 static
193 bool
log_check_tracking_margin(ulint lsn_advance)194 log_check_tracking_margin(
195 	ulint	lsn_advance)	/*!< in: an upper limit on how much log data we
196 				plan to write.  If zero, the margin will be
197 				checked for the already-written log. */
198 {
199 	lsn_t	tracked_lsn;
200 	lsn_t	tracked_lsn_age;
201 
202 	if (!srv_track_changed_pages) {
203 		return false;
204 	}
205 
206 	ut_ad(mutex_own(&(log_sys->mutex)));
207 
208 	tracked_lsn = log_get_tracked_lsn();
209 	tracked_lsn_age = log_sys->lsn - tracked_lsn;
210 
211 	/* The overwrite would happen when log_sys->log_group_capacity is
212 	exceeded, but we use max_checkpoint_age for an extra safety margin. */
213 	return tracked_lsn_age + lsn_advance > log_sys->max_checkpoint_age;
214 }
215 
216 /** Extends the log buffer.
217 @param[in]	len	requested minimum size in bytes */
218 void
log_buffer_extend(ulint len)219 log_buffer_extend(
220 	ulint	len)
221 {
222 	ulint	move_start;
223 	ulint	move_end;
224 	byte*	tmp_buf[OS_FILE_LOG_BLOCK_SIZE];
225 
226 	log_mutex_enter_all();
227 
228 	while (log_sys->is_extending) {
229 		/* Another thread is trying to extend already.
230 		Needs to wait for. */
231 		log_mutex_exit_all();
232 
233 		log_buffer_flush_to_disk();
234 
235 		log_mutex_enter_all();
236 
237 		if (srv_log_buffer_size > len / UNIV_PAGE_SIZE) {
238 			/* Already extended enough by the others */
239 			log_mutex_exit_all();
240 			return;
241 		}
242 	}
243 
244 	if (len >= log_sys->buf_size / 2) {
245 		DBUG_EXECUTE_IF("ib_log_buffer_is_short_crash",
246 				DBUG_SUICIDE(););
247 
248 		/* log_buffer is too small. try to extend instead of crash. */
249 		ib::warn() << "The transaction log size is too large"
250 			" for innodb_log_buffer_size (" << len << " >= "
251 			<< LOG_BUFFER_SIZE << " / 2). Trying to extend it.";
252 	}
253 
254 	log_sys->is_extending = true;
255 
256 	while (ut_calc_align_down(log_sys->buf_free,
257 				  OS_FILE_LOG_BLOCK_SIZE)
258 	       != ut_calc_align_down(log_sys->buf_next_to_write,
259 				     OS_FILE_LOG_BLOCK_SIZE)) {
260 		/* Buffer might have >1 blocks to write still. */
261 		log_mutex_exit_all();
262 
263 		log_buffer_flush_to_disk();
264 
265 		log_mutex_enter_all();
266 	}
267 
268 	move_start = ut_calc_align_down(
269 		log_sys->buf_free,
270 		OS_FILE_LOG_BLOCK_SIZE);
271 	move_end = log_sys->buf_free;
272 
273 	/* store the last log block in buffer */
274 	ut_memcpy(tmp_buf, log_sys->buf + move_start,
275 		  move_end - move_start);
276 
277 	log_sys->buf_free -= move_start;
278 	log_sys->buf_next_to_write -= move_start;
279 
280 	/* reallocate log buffer */
281 	srv_log_buffer_size = len / UNIV_PAGE_SIZE + 1;
282 	ut_free(log_sys->buf_ptr);
283 
284 	log_sys->buf_size = LOG_BUFFER_SIZE;
285 
286 	log_sys->buf_ptr = static_cast<byte*>(
287 		ut_zalloc_nokey(log_sys->buf_size * 2
288 				+ MAX_SRV_LOG_WRITE_AHEAD_SIZE));
289 	log_sys->buf = static_cast<byte*>(
290 		ut_align(log_sys->buf_ptr, MAX_SRV_LOG_WRITE_AHEAD_SIZE));
291 
292 	log_sys->first_in_use = true;
293 
294 	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
295 		- LOG_BUF_FLUSH_MARGIN;
296 
297 	/* restore the last log block */
298 	ut_memcpy(log_sys->buf, tmp_buf, move_end - move_start);
299 
300 	ut_ad(log_sys->is_extending);
301 	log_sys->is_extending = false;
302 
303 	log_mutex_exit_all();
304 
305 	ib::info() << "innodb_log_buffer_size was extended to "
306 		<< LOG_BUFFER_SIZE << ".";
307 }
308 
309 #ifndef UNIV_HOTBACKUP
310 /** Calculate actual length in redo buffer and file including
311 block header and trailer.
312 @param[in]	len	length to write
313 @return actual length to write including header and trailer. */
314 static inline
315 ulint
log_calculate_actual_len(ulint len)316 log_calculate_actual_len(
317 	ulint len)
318 {
319 	ut_ad(log_mutex_own());
320 
321 	/* actual length stored per block */
322 	const ulint	len_per_blk = OS_FILE_LOG_BLOCK_SIZE
323 		- (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
324 
325 	/* actual data length in last block already written */
326 	ulint	extra_len = (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE);
327 
328 	ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
329 	extra_len -= LOG_BLOCK_HDR_SIZE;
330 
331 	/* total extra length for block header and trailer */
332 	extra_len = ((len + extra_len) / len_per_blk)
333 		* (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
334 
335 	return(len + extra_len);
336 }
337 
/** Check margin not to overwrite transaction log from the last checkpoint.
If would estimate the log write to exceed the log_group_capacity,
waits for the checkpoint is done enough.
@param[in]	len	length of the data to be written */

void
log_margin_checkpoint_age(
	ulint	len)
{
	/* Total bytes the write will occupy, including block header and
	trailer overhead. */
	ulint	margin = log_calculate_actual_len(len);

	ut_ad(log_mutex_own());

	if (margin > log_sys->log_group_capacity) {
		/* The single write is larger than the whole log group
		capacity, so no checkpoint can make room for it.
		Return with a rate-limited warning to avoid deadlock. */
		if (!log_has_printed_chkp_margine_warning
		    || difftime(time(NULL),
				log_last_margine_warning_time) > 15) {
			log_has_printed_chkp_margine_warning = true;
			log_last_margine_warning_time = time(NULL);

			ib::error() << "The transaction log files are too"
				" small for the single transaction log (size="
				<< len << "). So, the last checkpoint age"
				" might exceed the log group capacity "
				<< log_sys->log_group_capacity << ".";
		}

		return;
	}

	/* Our margin check should ensure that we never reach this condition.
	Try to do checkpoint once. We cannot keep waiting here as it might
	result in hang in case the current mtr has latch on oldest lsn */
	if (log_sys->lsn - log_sys->last_checkpoint_lsn + margin
	    > log_sys->log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Makes checkpoint. */

		bool	flushed_enough = false;

		/* If the oldest dirty page is recent enough, a checkpoint
		alone (without waiting for more page flushing) frees
		sufficient space. */
		if (log_sys->lsn - log_buf_pool_get_oldest_modification()
		    + margin
		    <= log_sys->log_group_capacity) {
			flushed_enough = true;
		}

		log_sys->check_flush_or_checkpoint = true;
		log_mutex_exit();

		DEBUG_SYNC_C("margin_checkpoint_age_rescue");

		if (!flushed_enough) {
			/* Give the page cleaner a moment before attempting
			the checkpoint. */
			os_thread_sleep(100000);
		}
		log_checkpoint(true, false);

		log_mutex_enter();
	}

	return;
}
400 #endif /* !UNIV_HOTBACKUP */
/** Open the log for log_write_low. The log must be closed with log_close.
@param[in]	len	length of the data to be written
@return start lsn of the log record */
lsn_t
log_reserve_and_open(
	ulint	len)
{
	ulint	len_upper_limit;
	ulint	count			= 0;	/* retries waiting for buffer
						space / extension */
	ulint	tcount			= 0;	/* retries waiting for the
						tracking margin */

loop:
	ut_ad(log_mutex_own());
	ut_ad(!recv_no_log_write);

	if (log_sys->is_extending) {
		log_mutex_exit();

		/* Log buffer size is extending. Writing up to the next block
		should wait for the extending finished. */

		os_thread_sleep(100000);

		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	/* Calculate an upper limit for the space the string may take in the
	log buffer */

	len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
			  + (5 * len) / 4;

	if (log_sys->buf_free + len_upper_limit > log_sys->buf_size) {
		log_mutex_exit();

		DEBUG_SYNC_C("log_buf_size_exceeded");

		/* Not enough free space, do a write of the log buffer */

		log_buffer_sync_in_background(false);

		srv_stats.log_waits.inc();

		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	if (log_check_tracking_margin(len_upper_limit) &&
		(++tcount + count < 50)) {

		/* This log write would violate the untracked LSN free space
		margin.  Limit this to 50 retries as there might be situations
		where we have no choice but to proceed anyway, i.e. if the log
		is about to be overflown, log tracking or not. */
		log_mutex_exit();

		os_thread_sleep(10000);

		log_mutex_enter();
		goto loop;
	}

	return(log_sys->lsn);
}
470 
/************************************************************//**
Writes to the log the string given. It is assumed that the caller holds the
log mutex. */
void
log_write_low(
/*==========*/
	const byte*	str,		/*!< in: string */
	ulint		str_len)	/*!< in: string length */
{
	log_t*	log	= log_sys;
	ulint	len;		/* payload bytes copied in this iteration */
	ulint	data_len;	/* resulting data length of current block */
	byte*	log_block;

	ut_ad(log_mutex_own());
part_loop:
	ut_ad(!recv_no_log_write);
	/* Calculate a part length */

	data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;

	if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {

		/* The string fits within the current log block */

		len = str_len;
	} else {
		/* Fill the current block up to its trailer; the rest of the
		string is written on the next iteration(s). */
		data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;

		len = OS_FILE_LOG_BLOCK_SIZE
			- (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
			- LOG_BLOCK_TRL_SIZE;
	}

	ut_memcpy(log->buf + log->buf_free, str, len);

	str_len -= len;
	str = str + len;

	log_block = static_cast<byte*>(
		ut_align_down(
			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));

	log_block_set_data_len(log_block, data_len);

	if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
		/* This block became full */
		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
		log_block_set_checkpoint_no(log_block,
					    log_sys->next_checkpoint_no);
		/* The lsn advance also covers this block's trailer and the
		next block's header. */
		len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;

		log->lsn += len;

		/* Initialize the next block header */
		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
	} else {
		log->lsn += len;
	}

	log->buf_free += len;

	ut_ad(log->buf_free <= log->buf_size);

	if (str_len > 0) {
		goto part_loop;
	}

	srv_stats.log_write_requests.inc();
}
541 
/************************************************************//**
Closes the log.
@return lsn */
lsn_t
log_close(void)
/*===========*/
{
	byte*		log_block;
	ulint		first_rec_group;
	lsn_t		oldest_lsn;
	lsn_t		lsn;
	log_t*		log	= log_sys;
	lsn_t		checkpoint_age;

	ut_ad(log_mutex_own());
	ut_ad(!recv_no_log_write);

	lsn = log->lsn;

	log_block = static_cast<byte*>(
		ut_align_down(
			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));

	first_rec_group = log_block_get_first_rec_group(log_block);

	if (first_rec_group == 0) {
		/* We initialized a new log block which was not written
		full by the current mtr: the next mtr log record group
		will start within this block at the offset data_len */

		log_block_set_first_rec_group(
			log_block, log_block_get_data_len(log_block));
	}

	if (log->buf_free > log->max_buf_free) {

		log->check_flush_or_checkpoint = true;
	}

	/* If changed-page tracking lags so far behind that untracked log
	would be overwritten, stop the tracking thread rather than let it
	produce incomplete data. */
	if (srv_track_changed_pages) {

		lsn_t tracked_lsn = log_get_tracked_lsn();
		ut_ad(tracked_lsn > 0);
		lsn_t tracked_lsn_age = lsn - tracked_lsn;

		if (tracked_lsn_age >= log->log_group_capacity) {

			ib::error() << "The age of the oldest untracked "
				"record exceeds the log group capacity!";
			ib::error() << "Stopping the log tracking thread at "
				"LSN " << tracked_lsn;
			srv_track_changed_pages = FALSE;
		}
	}

	checkpoint_age = lsn - log->last_checkpoint_lsn;

	if (checkpoint_age >= log->log_group_capacity) {
		DBUG_EXECUTE_IF(
			"print_all_chkp_warnings",
			log_has_printed_chkp_warning = false;);

		/* Rate-limit the warning to one per 15 seconds. */
		if (!log_has_printed_chkp_warning
		    || difftime(time(NULL), log_last_warning_time) > 15) {

			log_has_printed_chkp_warning = true;
			log_last_warning_time = time(NULL);

			ib::error() << "The age of the last checkpoint is "
				<< checkpoint_age << ", which exceeds the log"
				" group capacity " << log->log_group_capacity
				<< ".";
		}
	}

	if (checkpoint_age <= log->max_modified_age_sync) {

		goto function_exit;
	}

	oldest_lsn = buf_pool_get_oldest_modification();

	/* Request a flush/checkpoint when there are no dirty pages, the
	oldest modification is too old, or the checkpoint age approaches
	the asynchronous limit. */
	if (!oldest_lsn
	    || lsn - oldest_lsn > log->max_modified_age_sync
	    || checkpoint_age > log->max_checkpoint_age_async) {

		log->check_flush_or_checkpoint = true;
	}
function_exit:

	return(lsn);
}
634 
635 /******************************************************//**
636 Calculates the data capacity of a log group, when the log file headers are not
637 included.
638 @return capacity in bytes */
639 lsn_t
log_group_get_capacity(const log_group_t * group)640 log_group_get_capacity(
641 /*===================*/
642 	const log_group_t*	group)	/*!< in: log group */
643 {
644 	/* The lsn parameters are updated while holding both the mutexes
645 	and it is ok to have either of them while reading */
646 	ut_ad(log_mutex_own() || log_write_mutex_own());
647 
648 	return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files);
649 }
650 
651 /******************************************************//**
652 Calculates the offset within a log group, when the log file headers are not
653 included.
654 @return size offset (<= offset) */
655 UNIV_INLINE
656 lsn_t
log_group_calc_size_offset(lsn_t offset,const log_group_t * group)657 log_group_calc_size_offset(
658 /*=======================*/
659 	lsn_t			offset,	/*!< in: real offset within the
660 					log group */
661 	const log_group_t*	group)	/*!< in: log group */
662 {
663 	/* The lsn parameters are updated while holding both the mutexes
664 	and it is ok to have either of them while reading */
665 	ut_ad(log_mutex_own() || log_write_mutex_own());
666 
667 	return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size));
668 }
669 
670 /******************************************************//**
671 Calculates the offset within a log group, when the log file headers are
672 included.
673 @return real offset (>= offset) */
674 UNIV_INLINE
675 lsn_t
log_group_calc_real_offset(lsn_t offset,const log_group_t * group)676 log_group_calc_real_offset(
677 /*=======================*/
678 	lsn_t			offset,	/*!< in: size offset within the
679 					log group */
680 	const log_group_t*	group)	/*!< in: log group */
681 {
682 	/* The lsn parameters are updated while holding both the mutexes
683 	and it is ok to have either of them while reading */
684 	ut_ad(log_mutex_own() || log_write_mutex_own());
685 
686 	return(offset + LOG_FILE_HDR_SIZE
687 	       * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE)));
688 }
689 
690 /** Calculate the offset of an lsn within a log group.
691 @param[in]	lsn	log sequence number
692 @param[in]	group	log group
693 @return offset within the log group */
694 lsn_t
log_group_calc_lsn_offset(lsn_t lsn,const log_group_t * group)695 log_group_calc_lsn_offset(
696 	lsn_t			lsn,
697 	const log_group_t*	group)
698 {
699 	lsn_t	gr_lsn;
700 	lsn_t	gr_lsn_size_offset;
701 	lsn_t	difference;
702 	lsn_t	group_size;
703 	lsn_t	offset;
704 
705 	/* The lsn parameters are updated while holding both the mutexes
706 	and it is ok to have either of them while reading */
707 	ut_ad(log_mutex_own() || log_write_mutex_own());
708 
709 	gr_lsn = group->lsn;
710 
711 	gr_lsn_size_offset = log_group_calc_size_offset(
712 		group->lsn_offset, group);
713 
714 	group_size = log_group_get_capacity(group);
715 
716 	if (lsn >= gr_lsn) {
717 
718 		difference = lsn - gr_lsn;
719 	} else {
720 		difference = gr_lsn - lsn;
721 
722 		difference = difference % group_size;
723 
724 		difference = group_size - difference;
725 	}
726 
727 	offset = (gr_lsn_size_offset + difference) % group_size;
728 
729 	/* fprintf(stderr,
730 	"Offset is " LSN_PF " gr_lsn_offset is " LSN_PF
731 	" difference is " LSN_PF "\n",
732 	offset, gr_lsn_size_offset, difference);
733 	*/
734 
735 	return(log_group_calc_real_offset(offset, group));
736 }
737 
738 /*******************************************************************//**
739 Calculates where in log files we find a specified lsn.
740 @return log file number */
741 ulint
log_calc_where_lsn_is(int64_t * log_file_offset,ib_uint64_t first_header_lsn,ib_uint64_t lsn,ulint n_log_files,int64_t log_file_size)742 log_calc_where_lsn_is(
743 /*==================*/
744 	int64_t*	log_file_offset,	/*!< out: offset in that file
745 						(including the header) */
746 	ib_uint64_t	first_header_lsn,	/*!< in: first log file start
747 						lsn */
748 	ib_uint64_t	lsn,			/*!< in: lsn whose position to
749 						determine */
750 	ulint		n_log_files,		/*!< in: total number of log
751 						files */
752 	int64_t		log_file_size)		/*!< in: log file size
753 						(including the header) */
754 {
755 	int64_t		capacity	= log_file_size - LOG_FILE_HDR_SIZE;
756 	ulint		file_no;
757 	int64_t		add_this_many;
758 
759 	if (lsn < first_header_lsn) {
760 		add_this_many = 1 + (first_header_lsn - lsn)
761 			/ (capacity * static_cast<int64_t>(n_log_files));
762 		lsn += add_this_many
763 			* capacity * static_cast<int64_t>(n_log_files);
764 	}
765 
766 	ut_a(lsn >= first_header_lsn);
767 
768 	file_no = ((ulint)((lsn - first_header_lsn) / capacity))
769 		% n_log_files;
770 	*log_file_offset = (lsn - first_header_lsn) % capacity;
771 
772 	*log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE;
773 
774 	return(file_no);
775 }
776 
777 
778 /********************************************************//**
779 Sets the field values in group to correspond to a given lsn. For this function
780 to work, the values must already be correctly initialized to correspond to
781 some lsn, for instance, a checkpoint lsn. */
782 void
log_group_set_fields(log_group_t * group,lsn_t lsn)783 log_group_set_fields(
784 /*=================*/
785 	log_group_t*	group,	/*!< in/out: group */
786 	lsn_t		lsn)	/*!< in: lsn for which the values should be
787 				set */
788 {
789 	group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
790 	group->lsn = lsn;
791 }
792 #ifndef UNIV_HOTBACKUP
793 /*****************************************************************//**
794 Calculates the recommended highest values for lsn - last_checkpoint_lsn
795 and lsn - buf_get_oldest_modification().
796 @retval true on success
797 @retval false if the smallest log group is too small to
798 accommodate the number of OS threads in the database server */
799 static MY_ATTRIBUTE((warn_unused_result))
800 bool
log_calc_max_ages(void)801 log_calc_max_ages(void)
802 /*===================*/
803 {
804 	log_group_t*	group;
805 	lsn_t		margin;
806 	ulint		free;
807 	bool		success	= true;
808 	lsn_t		smallest_capacity;
809 
810 	log_mutex_enter();
811 
812 	group = UT_LIST_GET_FIRST(log_sys->log_groups);
813 
814 	ut_ad(group);
815 
816 	smallest_capacity = LSN_MAX;
817 
818 	while (group) {
819 		if (log_group_get_capacity(group) < smallest_capacity) {
820 
821 			smallest_capacity = log_group_get_capacity(group);
822 		}
823 
824 		group = UT_LIST_GET_NEXT(log_groups, group);
825 	}
826 
827 	/* Add extra safety */
828 	smallest_capacity = smallest_capacity - smallest_capacity / 10;
829 
830 	/* For each OS thread we must reserve so much free space in the
831 	smallest log group that it can accommodate the log entries produced
832 	by single query steps: running out of free log space is a serious
833 	system error which requires rebooting the database. */
834 
835 	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
836 		+ LOG_CHECKPOINT_EXTRA_FREE;
837 	if (free >= smallest_capacity / 2) {
838 		success = false;
839 
840 		goto failure;
841 	} else {
842 		margin = smallest_capacity - free;
843 	}
844 
845 	margin = margin - margin / 10;	/* Add still some extra safety */
846 
847 	log_sys->log_group_capacity = smallest_capacity;
848 
849 	log_sys->max_modified_age_async = margin
850 		- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
851 	log_sys->max_modified_age_sync = margin
852 		- margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
853 
854 	log_sys->max_checkpoint_age_async = margin - margin
855 		/ LOG_POOL_CHECKPOINT_RATIO_ASYNC;
856 	log_sys->max_checkpoint_age = margin;
857 
858 failure:
859 	log_mutex_exit();
860 
861 	if (!success) {
862 		ib::error() << "Cannot continue operation. ib_logfiles are too"
863 			" small for innodb_thread_concurrency "
864 			<< srv_thread_concurrency << ". The combined size of"
865 			" ib_logfiles should be bigger than"
866 			" 200 kB * innodb_thread_concurrency. To get mysqld"
867 			" to start up, set innodb_thread_concurrency in"
868 			" my.cnf to a lower value, for example, to 8. After"
869 			" an ERROR-FREE shutdown of mysqld you can adjust"
870 			" the size of ib_logfiles. " << INNODB_PARAMETERS_MSG;
871 	}
872 
873 	return(success);
874 }
875 
/******************************************************//**
Initializes the log. */
void
log_init(void)
/*==========*/
{
	log_sys = static_cast<log_t*>(ut_zalloc_nokey(sizeof(log_t)));

	mutex_create(LATCH_ID_LOG_SYS, &log_sys->mutex);
	mutex_create(LATCH_ID_LOG_WRITE, &log_sys->write_mutex);

	mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_sys->log_flush_order_mutex);

	/* Start the lsn from one log block from zero: this way every
	log record has a start lsn != zero, a fact which we will use */

	log_sys->lsn = LOG_START_LSN;

	ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
	ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);

	log_sys->buf_size = LOG_BUFFER_SIZE;

	/* Allocate twice the buffer size plus alignment slack.
	NOTE(review): presumably the two halves are alternated between
	writes (see first_in_use) — confirm against the write path. */
	log_sys->buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(log_sys->buf_size * 2
				+ MAX_SRV_LOG_WRITE_AHEAD_SIZE));
	log_sys->buf = static_cast<byte*>(
		ut_align(log_sys->buf_ptr, MAX_SRV_LOG_WRITE_AHEAD_SIZE));

	log_sys->first_in_use = true;

	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
		- LOG_BUF_FLUSH_MARGIN;
	log_sys->check_flush_or_checkpoint = true;
	UT_LIST_INIT(log_sys->log_groups, &log_group_t::log_groups);

	log_sys->n_log_ios_old = log_sys->n_log_ios;
	log_sys->last_printout_time = time(NULL);
	/*----------------------------*/

	log_sys->write_lsn = log_sys->lsn;

	log_sys->flush_event = os_event_create(0);

	os_event_set(log_sys->flush_event);

	/*----------------------------*/

	log_sys->last_checkpoint_lsn = log_sys->lsn;
	log_sys->next_checkpoint_lsn = log_sys->lsn;

	rw_lock_create(
		checkpoint_lock_key, &log_sys->checkpoint_lock,
		SYNC_NO_ORDER_CHECK);

	/* Aligned buffer for checkpoint header i/o. */
	log_sys->checkpoint_buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(OS_FILE_LOG_BLOCK_SIZE
				+ MAX_SRV_LOG_WRITE_AHEAD_SIZE));

	log_sys->checkpoint_buf = static_cast<byte*>(
		ut_align(log_sys->checkpoint_buf_ptr,
			 MAX_SRV_LOG_WRITE_AHEAD_SIZE));

	/*----------------------------*/

	/* Initialize the first log block and advance the lsn past its
	header, so the first record starts at a nonzero offset. */
	log_block_init(log_sys->buf, log_sys->lsn);
	log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);

	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
	log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;

	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
		    log_sys->lsn - log_sys->last_checkpoint_lsn);

	/* Start the log scrubbing thread unless in read-only mode or
	scrubbing is disabled. */
	log_scrub_thread_active= !srv_read_only_mode && srv_scrub_log;
	if (log_scrub_thread_active) {
		log_scrub_event= os_event_create("log_scrub_event");
		os_thread_create(log_scrub_thread, NULL, NULL);
	}
}
956 
957 /******************************************************************//**
958 Inits a log group to the log system.
959 @return true if success, false if not */
960 MY_ATTRIBUTE((warn_unused_result))
961 bool
log_group_init(ulint id,ulint n_files,lsn_t file_size,ulint space_id)962 log_group_init(
963 /*===========*/
964 	ulint	id,			/*!< in: group id */
965 	ulint	n_files,		/*!< in: number of log files */
966 	lsn_t	file_size,		/*!< in: log file size in bytes */
967 	ulint	space_id)		/*!< in: space id of the file space
968 					which contains the log files of this
969 					group */
970 {
971 	ulint	i;
972 	log_group_t*	group;
973 
974 	group = static_cast<log_group_t*>(ut_malloc_nokey(sizeof(log_group_t)));
975 
976 	group->id = id;
977 	group->n_files = n_files;
978 	group->format = LOG_HEADER_FORMAT_CURRENT;
979 	group->file_size = file_size;
980 	group->space_id = space_id;
981 	group->state = LOG_GROUP_OK;
982 	group->lsn = LOG_START_LSN;
983 	group->lsn_offset = LOG_FILE_HDR_SIZE;
984 
985 	group->file_header_bufs_ptr = static_cast<byte**>(
986 		ut_zalloc_nokey(sizeof(byte*) * n_files));
987 
988 	group->file_header_bufs = static_cast<byte**>(
989 		ut_zalloc_nokey(sizeof(byte**) * n_files));
990 
991 	for (i = 0; i < n_files; i++) {
992 		group->file_header_bufs_ptr[i] = static_cast<byte*>(
993 			ut_zalloc_nokey(LOG_FILE_HDR_SIZE
994 					+ MAX_SRV_LOG_WRITE_AHEAD_SIZE));
995 
996 		group->file_header_bufs[i] = static_cast<byte*>(
997 			ut_align(group->file_header_bufs_ptr[i],
998 				 MAX_SRV_LOG_WRITE_AHEAD_SIZE));
999 	}
1000 
1001 	group->checkpoint_buf_ptr = static_cast<byte*>(
1002 		ut_zalloc_nokey(OS_FILE_LOG_BLOCK_SIZE +
1003 				MAX_SRV_LOG_WRITE_AHEAD_SIZE));
1004 
1005 	group->checkpoint_buf = static_cast<byte*>(
1006 		ut_align(group->checkpoint_buf_ptr,
1007 			 MAX_SRV_LOG_WRITE_AHEAD_SIZE));
1008 
1009 	UT_LIST_ADD_LAST(log_sys->log_groups, group);
1010 
1011 	return(log_calc_max_ages());
1012 }
1013 #endif /* !UNIV_HOTBACKUP */
1014 /******************************************************//**
1015 Completes an i/o to a log file. */
1016 void
log_io_complete(log_group_t * group)1017 log_io_complete(
1018 /*============*/
1019 	log_group_t*	group)	/*!< in: log group or a dummy pointer */
1020 {
1021 	if ((ulint) group & 0x1UL) {
1022 		/* It was a checkpoint write */
1023 		group = (log_group_t*)((ulint) group - 1);
1024 
1025 #ifdef _WIN32
1026 		fil_flush(group->space_id);
1027 #else
1028 		switch (srv_unix_file_flush_method) {
1029 		case SRV_UNIX_O_DSYNC:
1030 		case SRV_UNIX_NOSYNC:
1031 		case SRV_UNIX_ALL_O_DIRECT:
1032 			break;
1033 		case SRV_UNIX_FSYNC:
1034 		case SRV_UNIX_LITTLESYNC:
1035 		case SRV_UNIX_O_DIRECT:
1036 		case SRV_UNIX_O_DIRECT_NO_FSYNC:
1037 			if (thd_flush_log_at_trx_commit(NULL) != 2)
1038 				fil_flush(group->space_id);
1039 		}
1040 #endif /* _WIN32 */
1041 
1042 		DBUG_PRINT("ib_log", ("checkpoint info written to group %u",
1043 				      unsigned(group->id)));
1044 		log_io_complete_checkpoint();
1045 
1046 		return;
1047 	}
1048 
1049 	ut_error;	/*!< We currently use synchronous writing of the
1050 			logs and cannot end up here! */
1051 }
1052 
1053 /******************************************************//**
1054 Writes a log file header to a log file space. */
1055 static
1056 void
log_group_file_header_flush(log_group_t * group,ulint nth_file,lsn_t start_lsn)1057 log_group_file_header_flush(
1058 /*========================*/
1059 	log_group_t*	group,		/*!< in: log group */
1060 	ulint		nth_file,	/*!< in: header to the nth file in the
1061 					log file space */
1062 	lsn_t		start_lsn)	/*!< in: log file data starts at this
1063 					lsn */
1064 {
1065 	byte*	buf;
1066 	lsn_t	dest_offset;
1067 
1068 	ut_ad(log_write_mutex_own());
1069 	ut_ad(!recv_no_log_write);
1070 	ut_ad(group->id == 0);
1071 	ut_a(nth_file < group->n_files);
1072 
1073 	buf = *(group->file_header_bufs + nth_file);
1074 
1075 	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
1076 	mach_write_to_4(buf + LOG_HEADER_FORMAT, LOG_HEADER_FORMAT_CURRENT);
1077 	mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
1078 	strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
1079 	       LOG_HEADER_CREATOR_CURRENT);
1080 	ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
1081 	      >= sizeof LOG_HEADER_CREATOR_CURRENT);
1082 	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
1083 
1084 	dest_offset = nth_file * group->file_size;
1085 
1086 	DBUG_PRINT("ib_log", ("write " LSN_PF
1087 			      " group " ULINTPF
1088 			      " file " ULINTPF " header",
1089 			      start_lsn, group->id, nth_file));
1090 
1091 	log_sys->n_log_ios++;
1092 
1093 	MONITOR_INC(MONITOR_LOG_IO);
1094 
1095 	srv_stats.os_log_pending_writes.inc();
1096 
1097 	const ulint	page_no
1098 		= (ulint) (dest_offset / univ_page_size.physical());
1099 
1100 	fil_io(IORequestLogWrite, true,
1101 	       page_id_t(group->space_id, page_no),
1102 	       univ_page_size,
1103 	       (ulint) (dest_offset % univ_page_size.physical()),
1104 	       OS_FILE_LOG_BLOCK_SIZE, buf, group);
1105 
1106 	srv_stats.os_log_pending_writes.dec();
1107 }
1108 
log_encrypt_name(redo_log_encrypt_enum val)1109 const char* log_encrypt_name(redo_log_encrypt_enum val) {
1110 	switch(val) {
1111 	case REDO_LOG_ENCRYPT_OFF:
1112 		return "off";
1113 	case REDO_LOG_ENCRYPT_MK:
1114 		return "master_key";
1115 	case REDO_LOG_ENCRYPT_RK:
1116 		return "keyring_key";
1117 	}
1118 	return "unknown";
1119 }
1120 
/** Read the redo log encryption information from the first log file
header; it is stored in the 3rd block (right after the first checkpoint
area). Depending on the magic bytes found there the log is encrypted
with a keyring key (RK) or a master key (MK); the matching key is
looked up and installed into the redo log tablespace object so that
old log blocks can be decrypted.
@return true if success */
bool
log_read_encryption() {
	byte key[ENCRYPTION_KEY_LEN];
	byte iv[ENCRYPTION_KEY_LEN];

	/* Over-allocate so the I/O buffer can be aligned to the log
	block size. */
	byte* log_block_buf_ptr =
	    static_cast<byte *>(ut_malloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));
	memset(log_block_buf_ptr, 0, 2 * OS_FILE_LOG_BLOCK_SIZE);
	byte* log_block_buf = static_cast<byte *>(
	    ut_align(log_block_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	ulint		log_space_id = SRV_LOG_SPACE_FIRST_ID;
	const page_id_t page_id(log_space_id, 0);
	/* Read the block following the first checkpoint header. */
	fil_io(IORequestLogRead, true, page_id, univ_page_size,
	       LOG_CHECKPOINT_1 + OS_FILE_LOG_BLOCK_SIZE,
	       OS_FILE_LOG_BLOCK_SIZE, log_block_buf, NULL);

	bool		 encryption_magic = false;	/* a known magic was found */
	bool		 encrypted_log = false;		/* a usable key was loaded */
	redo_log_key*    mkey = NULL;
	Encryption::Type encryption_type = Encryption::NONE;

	/* Keyring-key (RK) encryption? */
	if (memcmp(log_block_buf + LOG_HEADER_CREATOR_END,
		   ENCRYPTION_KEY_MAGIC_RK, ENCRYPTION_MAGIC_SIZE) == 0) {
		encryption_magic = true;
		existing_redo_encryption_mode = REDO_LOG_ENCRYPT_RK;

		/* Make sure the keyring is loaded. */
		if (!Encryption::check_keyring()) {
			ut_free(log_block_buf_ptr);
			ib::error() << "Redo log was encrypted,"
				    << " but keyring plugin is not loaded.";
			return(false);
		}

		/* Layout after the magic: 4-byte key version,
		server uuid, then the IV. */
		unsigned char* info_ptr = log_block_buf +
					  LOG_HEADER_CREATOR_END +
					  ENCRYPTION_MAGIC_SIZE;
		uint version = mach_read_from_4(info_ptr);

		memcpy(iv, info_ptr + ENCRYPTION_SERVER_UUID_LEN + 4,
		       ENCRYPTION_KEY_LEN);

#ifdef UNIV_ENCRYPT_DEBUG
		fprintf(stderr, "Using redo log encryption key version: %u\n",
			version);
#endif

		/* If the key cannot be loaded, encrypted_log stays
		false and the error path below is taken. */
		mkey = redo_log_key_mgr.load_key_version(NULL, version);
		if (mkey != NULL) {
			encrypted_log = true;
			memcpy(key, mkey->key, ENCRYPTION_KEY_LEN);
			encryption_type = Encryption::KEYRING;
			srv_redo_log_key_version = mkey->version;
		}
	}

	/* Master-key (MK) encryption (either info format version)? */
	if (memcmp(log_block_buf + LOG_HEADER_CREATOR_END,
		   ENCRYPTION_KEY_MAGIC_V2, ENCRYPTION_MAGIC_SIZE) == 0
	    ||
	    memcmp(log_block_buf + LOG_HEADER_CREATOR_END,
		   ENCRYPTION_KEY_MAGIC_V3, ENCRYPTION_MAGIC_SIZE) == 0
	    ) {
		encryption_magic = true;
		existing_redo_encryption_mode = REDO_LOG_ENCRYPT_MK;

		/* Make sure the keyring is loaded. */
		if (!Encryption::check_keyring()) {
			ib::error() << "Redo log was encrypted,"
				    << " but keyring plugin is not loaded.";
		} else if (Encryption::decode_encryption_info(
			       key, iv,
			       log_block_buf + LOG_HEADER_CREATOR_END)) {
			encrypted_log = true;
			encryption_type = Encryption::AES;
		}
	}
	if (encrypted_log) {
		/* Switching between RK and MK requires recreating the
		redo logs; refuse a conflicting request. */
		if (existing_redo_encryption_mode != srv_redo_log_encrypt &&
		    srv_redo_log_encrypt != REDO_LOG_ENCRYPT_OFF) {
			ib::error() <<
				" Redo log encryption mode"
				" can't be switched without stopping the server and"
				" recreating the redo logs. Current mode is "
				<< log_encrypt_name(existing_redo_encryption_mode)
				<< ", requested "
				<< log_encrypt_name(
				static_cast<redo_log_encrypt_enum>(srv_redo_log_encrypt))
				<< ".";

			return(false);
		}


		/* If redo log encryption is enabled, set the
		   space flag. Otherwise, we just fill the encryption
		   information to space object for decrypting old
		   redo log blocks. */
		fil_space_t* space = fil_space_get(log_space_id);
		space->encryption_redo_key = mkey;
		space->flags |= FSP_FLAGS_MASK_ENCRYPTION;
		dberr_t err =
		    fil_set_encryption(space->id, encryption_type, key, iv);

		if (err == DB_SUCCESS) {
			ut_free(log_block_buf_ptr);
			ib::info() << "Read redo log encryption"
				   << " metadata successful.";
			return(true);
		} else {
			ut_free(log_block_buf_ptr);
			ib::fatal() << "Can't set redo log tablespace"
				    << " encryption metadata.";
			return(false);
		}
	} else if (encryption_magic) {
		/* Magic present but no key could be loaded/decoded. */
		ut_free(log_block_buf_ptr);
		ib::error() << "Cannot read the encryption"
			       " information in log file header, please"
			       " check if keyring plugin loaded and"
			       " the key file exists.";
		return(false);
	}

	/* Log is not encrypted. */
	ut_free(log_block_buf_ptr);
	return(true);
}
1251 
1252 
1253 /** Writes encryption information to log header.
1254 @param[in,out]	buf	log file header
1255 @param[in]	key	encryption key
1256 @param[in]	iv	encryption iv */
1257 static bool
log_file_header_fill_encryption(byte * buf,byte * key,byte * iv)1258 log_file_header_fill_encryption(byte* buf, byte* key, byte* iv) {
1259 	byte encryption_info[ENCRYPTION_INFO_SIZE_V2];
1260 
1261 	if (!fsp_header_fill_encryption_info(key, iv, encryption_info)) {
1262 		return(false);
1263 	}
1264 
1265 	ut_ad(LOG_HEADER_CREATOR_END + ENCRYPTION_INFO_SIZE_V2 <
1266 	      OS_FILE_LOG_BLOCK_SIZE);
1267 
1268 	memcpy(buf + LOG_HEADER_CREATOR_END, encryption_info,
1269 	       ENCRYPTION_INFO_SIZE_V2);
1270 
1271 	return(true);
1272 }
1273 
1274 static bool
log_file_header_fill_encryption(byte * buf,ulint key_version,byte * iv)1275 log_file_header_fill_encryption(byte* buf, ulint key_version, byte* iv) {
1276 	byte encryption_info[ENCRYPTION_INFO_SIZE_V2] = {};
1277 
1278 	if (!fsp_header_fill_encryption_info(key_version, iv,
1279 					     encryption_info)) {
1280 		return(false);
1281 	}
1282 
1283 	ut_ad(LOG_HEADER_CREATOR_END + ENCRYPTION_INFO_SIZE_V2 <
1284 	      OS_FILE_LOG_BLOCK_SIZE);
1285 
1286 	memcpy(buf + LOG_HEADER_CREATOR_END, encryption_info,
1287 	       ENCRYPTION_INFO_SIZE_V2);
1288 
1289 	return(true);
1290 }
1291 
/** Write the encryption info into the log file header(the 3rd block).
It just need to flush the file header block with current master key.
@param[in]	key			encryption key
@param[in]	iv			encryption iv
@param[in]	redo_log_encrypt	encryption mode
@return true if success. */
bool
log_write_encryption(byte* key, byte* iv,
		     redo_log_encrypt_enum redo_log_encrypt) {
	const page_id_t page_id(SRV_LOG_SPACE_FIRST_ID, 0);
	byte 		*log_block_buf_ptr;
	byte 		*log_block_buf;
	ulint		version = 1;

	/* Over-allocate so the I/O buffer can be block-aligned. */
	log_block_buf_ptr =
	    static_cast<byte *>(ut_malloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));
	memset(log_block_buf_ptr, 0, 2 * OS_FILE_LOG_BLOCK_SIZE);
	log_block_buf = static_cast<byte *>(
	    ut_align(log_block_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	/* NULL key and iv means: rotate using the current key/iv of
	the redo log tablespace. */
	if (key == NULL && iv == NULL) {
		fil_space_t* space = fil_space_get(SRV_LOG_SPACE_FIRST_ID);

		key = space->encryption_key;
		iv = space->encryption_iv;
		version = space->encryption_key_version;
	}

	log_write_mutex_enter();
	/* Compose the header block according to the mode already on
	disk (or the requested one); MK and RK must never be mixed. */
	if (redo_log_encrypt == REDO_LOG_ENCRYPT_MK ||
	    existing_redo_encryption_mode == REDO_LOG_ENCRYPT_MK) {
		ut_ad(existing_redo_encryption_mode != REDO_LOG_ENCRYPT_RK);
		ut_ad(redo_log_encrypt != REDO_LOG_ENCRYPT_RK);
		if (!log_file_header_fill_encryption(log_block_buf, key,
						     iv)) {
			ut_free(log_block_buf_ptr);
			log_write_mutex_exit();
			return(false);
		}
		existing_redo_encryption_mode = REDO_LOG_ENCRYPT_MK;
	} else if (redo_log_encrypt == REDO_LOG_ENCRYPT_RK ||
		   existing_redo_encryption_mode == REDO_LOG_ENCRYPT_RK) {
		ut_ad(existing_redo_encryption_mode != REDO_LOG_ENCRYPT_MK);
		ut_ad(redo_log_encrypt != REDO_LOG_ENCRYPT_MK);
		if (!log_file_header_fill_encryption(log_block_buf, version,
						     iv)) {
			ut_free(log_block_buf_ptr);
			log_write_mutex_exit();
			return(false);
		}
		existing_redo_encryption_mode = REDO_LOG_ENCRYPT_RK;
	} else {
		/* Neither mode applies; should not be reachable. In a
		release build the zeroed block would be written below. */
		ut_ad(0);
	}

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	srv_stats.os_log_pending_writes.inc();

	/* Write the 3rd header block (after the first checkpoint). */
	fil_io(IORequestLogWrite, true, page_id, univ_page_size,
	       LOG_CHECKPOINT_1 + OS_FILE_LOG_BLOCK_SIZE,
	       OS_FILE_LOG_BLOCK_SIZE, log_block_buf, NULL);

	srv_stats.os_log_pending_writes.dec();
	log_write_mutex_exit();

	ut_free(log_block_buf_ptr);
	return(true);
}
1363 
1364 /** Rotate the redo log encryption
1365 It will re-encrypt the redo log encryption metadata and write it to
1366 redo log file header.
1367 @return true if success. */
1368 bool
log_rotate_encryption()1369 log_rotate_encryption() {
1370 	fil_space_t *space = fil_space_get(SRV_LOG_SPACE_FIRST_ID);
1371 	if (!FSP_FLAGS_GET_ENCRYPTION(space->flags)) {
1372 		return(true);
1373 	}
1374 	/* Rotate log tablespace */
1375 	return (log_write_encryption(
1376 	    NULL, NULL,
1377 	    static_cast<redo_log_encrypt_enum>(srv_redo_log_encrypt)));
1378 }
1379 
1380 void
log_check_new_key_version()1381 log_check_new_key_version() {
1382 	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
1383 		return;
1384 	}
1385 	fil_space_t* space = fil_space_get(SRV_LOG_SPACE_FIRST_ID);
1386 	if (!FSP_FLAGS_GET_ENCRYPTION(space->flags)) {
1387 		return;
1388 	}
1389 	if (srv_redo_log_encrypt == REDO_LOG_ENCRYPT_RK) {
1390 		/* re-fetch latest key */
1391 		redo_log_key* mkey = redo_log_key_mgr.load_latest_key(NULL, false);
1392 		if (mkey != NULL) {
1393 			space->encryption_redo_key = mkey;
1394 			srv_redo_log_key_version = mkey->version;
1395 		}
1396 	}
1397 }
1398 
1399 /** Check the redo log encryption is enabled or not.
1400 It will try to enable the redo log encryption and write the metadata to
1401 redo log file header. */
1402 void
log_rotate_default_key()1403 log_rotate_default_key() {
1404 	fil_space_t* space = fil_space_get(SRV_LOG_SPACE_FIRST_ID);
1405 
1406 	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
1407 		return;
1408 	}
1409 
1410 	/* If the redo log space is using default key, rotate it.
1411 	We also need the server_uuid initialized. */
1412 	if (space->encryption_type != Encryption::NONE &&
1413 	    Encryption::master_key_id == ENCRYPTION_DEFAULT_MASTER_KEY_ID &&
1414 	    !srv_read_only_mode &&
1415 	    srv_redo_log_encrypt == REDO_LOG_ENCRYPT_MK) {
1416 		ut_ad(strlen(server_uuid) > 0);
1417 		ut_ad(FSP_FLAGS_GET_ENCRYPTION(space->flags));
1418 
1419 		log_write_encryption(NULL, NULL, REDO_LOG_ENCRYPT_MK);
1420 	}
1421 
1422 	if (space->encryption_type != Encryption::NONE &&
1423 	    space->encryption_key_version == REDO_LOG_ENCRYPT_NO_VERSION &&
1424 	    !srv_read_only_mode &&
1425 	    srv_redo_log_encrypt == REDO_LOG_ENCRYPT_RK) {
1426 		/* This only happens when the server uuid was just generated, so we can
1427 		save the key to the keyring */
1428 		ut_ad(strlen(server_uuid) > 0);
1429 		if (!redo_log_key_mgr.store_used_keys()) {
1430 			srv_redo_log_encrypt = REDO_LOG_ENCRYPT_OFF;
1431 			ib::error() << "Can't store redo log encryption key.";
1432 		}
1433 		redo_log_key* key = redo_log_key_mgr.load_latest_key(NULL, true);
1434 		space->encryption_key_version = key->version;
1435 		space->encryption_redo_key = key;
1436 		srv_redo_log_key_version = key->version;
1437 	}
1438 }
1439 
1440 /******************************************************//**
1441 Stores a 4-byte checksum to the trailer checksum field of a log block
1442 before writing it to a log file. This checksum is used in recovery to
1443 check the consistency of a log block. */
1444 static
1445 void
log_block_store_checksum(byte * block)1446 log_block_store_checksum(
1447 /*=====================*/
1448 	byte*	block)	/*!< in/out: pointer to a log block */
1449 {
1450 	log_block_set_checksum(block, log_block_calc_checksum(block));
1451 }
1452 
1453 /******************************************************//**
1454 Writes a buffer to a log file group. */
1455 static
1456 void
log_group_write_buf(log_group_t * group,byte * buf,ulint len,ulint pad_len,lsn_t start_lsn,ulint new_data_offset)1457 log_group_write_buf(
1458 /*================*/
1459 	log_group_t*	group,		/*!< in: log group */
1460 	byte*		buf,		/*!< in: buffer */
1461 	ulint		len,		/*!< in: buffer len; must be divisible
1462 					by OS_FILE_LOG_BLOCK_SIZE */
1463 #ifdef UNIV_DEBUG
1464 	ulint		pad_len,	/*!< in: pad len in the buffer len */
1465 #endif /* UNIV_DEBUG */
1466 	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
1467 					be divisible by
1468 					OS_FILE_LOG_BLOCK_SIZE */
1469 	ulint		new_data_offset)/*!< in: start offset of new data in
1470 					buf: this parameter is used to decide
1471 					if we have to write a new log file
1472 					header */
1473 {
1474 	ulint		write_len;
1475 	bool		write_header	= new_data_offset == 0;
1476 	lsn_t		next_offset;
1477 	ulint		i;
1478 
1479 	ut_ad(log_write_mutex_own());
1480 	ut_ad(!recv_no_log_write);
1481 	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
1482 	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
1483 
1484 loop:
1485 	if (len == 0) {
1486 
1487 		return;
1488 	}
1489 
1490 	next_offset = log_group_calc_lsn_offset(start_lsn, group);
1491 
1492 	if (write_header
1493 	    && next_offset % group->file_size == LOG_FILE_HDR_SIZE) {
1494 		/* We start to write a new log file instance in the group */
1495 
1496 		ut_a(next_offset / group->file_size <= ULINT_MAX);
1497 
1498 		log_group_file_header_flush(group, (ulint)
1499 					    (next_offset / group->file_size),
1500 					    start_lsn);
1501 		srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);
1502 
1503 		srv_stats.log_writes.inc();
1504 	}
1505 
1506 	if ((next_offset % group->file_size) + len > group->file_size) {
1507 
1508 		/* if the above condition holds, then the below expression
1509 		is < len which is ulint, so the typecast is ok */
1510 		write_len = (ulint)
1511 			(group->file_size - (next_offset % group->file_size));
1512 	} else {
1513 		write_len = len;
1514 	}
1515 
1516 	DBUG_PRINT("ib_log",
1517 		   ("write " LSN_PF " to " LSN_PF
1518 		    ": group " ULINTPF " len " ULINTPF
1519 		    " blocks " ULINTPF ".." ULINTPF,
1520 		    start_lsn, next_offset,
1521 		    group->id, write_len,
1522 		    log_block_get_hdr_no(buf),
1523 		    log_block_get_hdr_no(
1524 			    buf + write_len
1525 			    - OS_FILE_LOG_BLOCK_SIZE)));
1526 
1527 	ut_ad(pad_len >= len
1528 	      || log_block_get_hdr_no(buf)
1529 		 == log_block_convert_lsn_to_no(start_lsn));
1530 
1531 	/* Calculate the checksums for each log block and write them to
1532 	the trailer fields of the log blocks */
1533 
1534 	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
1535 		ut_ad(pad_len >= len
1536 		      || i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
1537 		      || log_block_get_hdr_no(
1538 			      buf + i * OS_FILE_LOG_BLOCK_SIZE)
1539 			 == log_block_get_hdr_no(buf) + i);
1540 		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
1541 	}
1542 
1543 	log_sys->n_log_ios++;
1544 
1545 	MONITOR_INC(MONITOR_LOG_IO);
1546 
1547 	srv_stats.os_log_pending_writes.inc();
1548 
1549 	ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
1550 
1551 	const ulint	page_no
1552 		= (ulint) (next_offset / univ_page_size.physical());
1553 
1554 	fil_io(IORequestLogWrite, true,
1555 	       page_id_t(group->space_id, page_no),
1556 	       univ_page_size,
1557 	       (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
1558 	       group);
1559 
1560 	srv_stats.os_log_pending_writes.dec();
1561 
1562 	srv_stats.os_log_written.add(write_len);
1563 	srv_stats.log_writes.inc();
1564 
1565 	if (write_len < len) {
1566 		start_lsn += write_len;
1567 		len -= write_len;
1568 		buf += write_len;
1569 
1570 		write_header = true;
1571 
1572 		goto loop;
1573 	}
1574 }
1575 
1576 /** Flush the log has been written to the log file. */
1577 static
1578 void
log_write_flush_to_disk_low()1579 log_write_flush_to_disk_low()
1580 {
1581 	ut_a(log_sys->n_pending_flushes == 1); /* No other threads here */
1582 
1583 #ifndef _WIN32
1584 	bool	do_flush = srv_unix_file_flush_method != SRV_UNIX_O_DSYNC;
1585 #else
1586 	bool	do_flush = true;
1587 #endif
1588 	if (do_flush) {
1589 		log_group_t*	group = UT_LIST_GET_FIRST(log_sys->log_groups);
1590 		fil_flush(group->space_id);
1591 		log_sys->flushed_to_disk_lsn = log_sys->current_flush_lsn;
1592 	}
1593 
1594 	log_sys->n_pending_flushes--;
1595 	MONITOR_DEC(MONITOR_PENDING_LOG_FLUSH);
1596 
1597 	os_event_set(log_sys->flush_event);
1598 }
1599 
1600 /** Switch the log buffer in use, and copy the content of last block
1601 from old log buffer to the head of the to be used one. Thus, buf_free and
1602 buf_next_to_write would be changed accordingly */
1603 static inline
1604 void
log_buffer_switch()1605 log_buffer_switch()
1606 {
1607 	ut_ad(log_mutex_own());
1608 	ut_ad(log_write_mutex_own());
1609 
1610 	const byte*	old_buf = log_sys->buf;
1611 	ulint		area_end = ut_calc_align(log_sys->buf_free,
1612 						 OS_FILE_LOG_BLOCK_SIZE);
1613 
1614 	if (log_sys->first_in_use) {
1615 		ut_ad((reinterpret_cast<uintptr_t>(log_sys->buf)
1616 		       % srv_log_write_ahead_size) == 0);
1617 		log_sys->buf += log_sys->buf_size;
1618 	} else {
1619 		log_sys->buf -= log_sys->buf_size;
1620 		ut_ad((reinterpret_cast<uintptr_t>(log_sys->buf)
1621 		       % srv_log_write_ahead_size) == 0);
1622 	}
1623 
1624 	log_sys->first_in_use = !log_sys->first_in_use;
1625 
1626 	/* Copy the last block to new buf */
1627 	ut_memcpy(log_sys->buf,
1628 		  old_buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
1629 		  OS_FILE_LOG_BLOCK_SIZE);
1630 
1631 	log_sys->buf_free %= OS_FILE_LOG_BLOCK_SIZE;
1632 	log_sys->buf_next_to_write = log_sys->buf_free;
1633 }
1634 
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
Locking protocol: log_write_mutex serializes writers; log_mutex is
additionally taken while the buffer is switched and released before
the actual file I/O.
@param[in]	lsn		log sequence number that should be
included in the redo log file write
@param[in]	flush_to_disk	whether the written log should also
be flushed to the file system */
void
log_write_up_to(
	lsn_t	lsn,
	bool	flush_to_disk)
{
#ifdef UNIV_DEBUG
	ulint		loop_count	= 0;
#endif /* UNIV_DEBUG */
	byte*           write_buf;
	lsn_t           write_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_no_ibuf_operations) {
		/* Recovery is running and no operations on the log files are
		allowed yet (the variable name .._no_ibuf_.. is misleading) */

		return;
	}

loop:
	ut_ad(++loop_count < 128);

#if UNIV_WORD_SIZE > 7
	/* We can do a dirty read of LSN. */
	/* NOTE: Currently doesn't do dirty read for
	(flush_to_disk == true) case, because the log_mutex
	contention also works as the arbitrator for write-IO
	(fsync) bandwidth between log files and data files. */
	os_rmb;
	if (!flush_to_disk && log_sys->write_lsn >= lsn) {
		return;
	}
#endif

	log_write_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* Re-check under the mutex whether the request is already
	satisfied. */
	lsn_t	limit_lsn = flush_to_disk
		? log_sys->flushed_to_disk_lsn
		: log_sys->write_lsn;

	if (limit_lsn >= lsn) {
		log_write_mutex_exit();
		return;
	}

#ifdef _WIN32
# ifndef UNIV_HOTBACKUP
	/* write requests during fil_flush() might not be good for Windows */
	if (log_sys->n_pending_flushes > 0
	    || !os_event_is_set(log_sys->flush_event)) {
		log_write_mutex_exit();
		os_event_wait(log_sys->flush_event);
		goto loop;
	}
# else
	if (log_sys->n_pending_flushes > 0) {
		goto loop;
	}
# endif  /* !UNIV_HOTBACKUP */
#endif /* _WIN32 */

	/* If it is a write call we should just go ahead and do it
	as we checked that write_lsn is not where we'd like it to
	be. If we have to flush as well then we check if there is a
	pending flush and based on that we wait for it to finish
	before proceeding further. */
	if (flush_to_disk
	    && (log_sys->n_pending_flushes > 0
		|| !os_event_is_set(log_sys->flush_event))) {

		/* Figure out if the current flush will do the job
		for us. */
		bool work_done = log_sys->current_flush_lsn >= lsn;

		log_write_mutex_exit();

		os_event_wait(log_sys->flush_event);

		if (work_done) {
			return;
		} else {
			goto loop;
		}
	}

	log_mutex_enter();
	if (!flush_to_disk
	    && log_sys->buf_free == log_sys->buf_next_to_write) {
		/* Nothing to write and no flush to disk requested */
		log_mutex_exit_all();
		return;
	}

	log_group_t*	group;
	ulint		start_offset;
	ulint		end_offset;
	ulint		area_start;
	ulint		area_end;
	ulong		write_ahead_size = srv_log_write_ahead_size;
	ulint		pad_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
			      log_sys->write_lsn,
			      log_sys->lsn));

	if (flush_to_disk) {
		/* Claim the flush slot before releasing the mutexes. */
		log_sys->n_pending_flushes++;
		log_sys->current_flush_lsn = log_sys->lsn;
		MONITOR_INC(MONITOR_PENDING_LOG_FLUSH);
		os_event_reset(log_sys->flush_event);

		if (log_sys->buf_free == log_sys->buf_next_to_write) {
			/* Nothing to write, flush only */
			log_mutex_exit_all();
			log_write_flush_to_disk_low();
			return;
		}
	}

	start_offset = log_sys->buf_next_to_write;
	end_offset = log_sys->buf_free;

	/* Round the dirty region out to whole log blocks. */
	area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE);
	area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE);

	ut_ad(area_end - area_start > 0);

	log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
	log_block_set_checkpoint_no(
		log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		log_sys->next_checkpoint_no);

	/* Snapshot the buffer to write, then switch buffers so that
	mtr commits can continue into the other half while we do I/O. */
	write_lsn = log_sys->lsn;
	write_buf = log_sys->buf;

	log_buffer_switch();

	group = UT_LIST_GET_FIRST(log_sys->log_groups);

	log_group_set_fields(group, log_sys->write_lsn);

	log_mutex_exit();

	/* Calculate pad_size if needed. */
	pad_size = 0;
	if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
		lsn_t	end_offset;
		ulint	end_offset_in_unit;

		end_offset = log_group_calc_lsn_offset(
			ut_uint64_align_up(write_lsn,
					   OS_FILE_LOG_BLOCK_SIZE),
			group);
		end_offset_in_unit = (ulint) (end_offset % write_ahead_size);

		if (end_offset_in_unit > 0
		    && (area_end - area_start) > end_offset_in_unit) {
			/* The first block in the unit was initialized
			after the last writing.
			Needs to be written padded data once. */
			pad_size = write_ahead_size - end_offset_in_unit;

			if (area_end + pad_size > log_sys->buf_size) {
				pad_size = log_sys->buf_size - area_end;
			}

			::memset(write_buf + area_end, 0, pad_size);
		}
	}

	/* Do the write to the log files */
	log_group_write_buf(
		group, write_buf + area_start,
		area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
		pad_size,
#endif /* UNIV_DEBUG */
		ut_uint64_align_down(log_sys->write_lsn,
				     OS_FILE_LOG_BLOCK_SIZE),
		start_offset - area_start);

	srv_stats.log_padded.add(pad_size);

	log_sys->write_lsn = write_lsn;

#ifndef _WIN32
	if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC
	    || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
		/* O_SYNC and ALL_O_DIRECT mean the OS did not buffer the log
		file at all: so we have also flushed to disk what we have
		written */
		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
	}
#endif /* !_WIN32 */

	log_write_mutex_exit();

	if (flush_to_disk) {
		log_write_flush_to_disk_low();
	}
}
1845 
1846 /** write to the log file up to the last log entry.
1847 @param[in]	sync	whether we want the written log
1848 also to be flushed to disk. */
1849 void
log_buffer_flush_to_disk(bool sync)1850 log_buffer_flush_to_disk(
1851 	bool sync)
1852 {
1853 	ut_ad(!srv_read_only_mode);
1854 	log_write_up_to(log_get_lsn(), sync);
1855 }
1856 
1857 /****************************************************************//**
1858 This functions writes the log buffer to the log file and if 'flush'
1859 is set it forces a flush of the log file as well. This is meant to be
1860 called from background master thread only as it does not wait for
1861 the write (+ possible flush) to finish. */
1862 void
log_buffer_sync_in_background(bool flush)1863 log_buffer_sync_in_background(
1864 /*==========================*/
1865 	bool	flush)	/*!< in: flush the logs to disk */
1866 {
1867 	lsn_t	lsn;
1868 
1869 	log_mutex_enter();
1870 
1871 	lsn = log_sys->lsn;
1872 
1873 	if (flush
1874 	    && log_sys->n_pending_flushes > 0
1875 	    && log_sys->current_flush_lsn >= lsn) {
1876 		/* The write + flush will write enough */
1877 		log_mutex_exit();
1878 		return;
1879 	}
1880 
1881 	log_mutex_exit();
1882 
1883 	log_write_up_to(lsn, flush);
1884 }
1885 
1886 /********************************************************************
1887 
1888 Tries to establish a big enough margin of free space in the log buffer, such
1889 that a new log entry can be catenated without an immediate need for a flush. */
1890 static
1891 void
log_flush_margin(void)1892 log_flush_margin(void)
1893 /*==================*/
1894 {
1895 	log_t*	log	= log_sys;
1896 	lsn_t	lsn	= 0;
1897 
1898 	log_mutex_enter();
1899 
1900 	if (log->buf_free > log->max_buf_free) {
1901 		/* We can write during flush */
1902 		lsn = log->lsn;
1903 	}
1904 
1905 	log_mutex_exit();
1906 
1907 	if (lsn) {
1908 		log_write_up_to(lsn, false);
1909 	}
1910 }
1911 #ifndef UNIV_HOTBACKUP
/** Advances the smallest lsn for which there are unflushed dirty blocks in the
buffer pool.
NOTE: this function may only be called if the calling thread owns no
synchronization objects!
@param[in]	new_oldest	try to advance oldest_modified_lsn at least to
this lsn
@return always true in the current implementation. NOTE(review): the
historical "false if a flush batch of the same type was running"
contract no longer applies — flushing is delegated to, and waited on
from, the page cleaner. */
static
bool
log_preflush_pool_modified_pages(
	lsn_t			new_oldest)
{
	if (recv_recovery_on) {
		/* If the recovery is running, we must first apply all
		log records to their respective file pages to get the
		right modify lsn values to these pages: otherwise, there
		might be pages on disk which are not yet recovered to the
		current lsn, and even after calling this function, we could
		not know how up-to-date the disk version of the database is,
		and we could not make a new checkpoint on the basis of the
		info on the buffer pool only. */

		recv_apply_hashed_log_recs(TRUE);
	}

	/* better to wait for flushed by page cleaner */
	ut_ad(buf_page_cleaner_is_active);

	if (srv_flush_sync) {
		/* wake page cleaner for IO burst */
		buf_flush_request_force(new_oldest);
	}

	buf_flush_wait_flushed(new_oldest);

	return(true);
}
1950 #endif /* !UNIV_HOTBACKUP */
1951 /******************************************************//**
1952 Completes a checkpoint. */
1953 static
1954 void
log_complete_checkpoint(void)1955 log_complete_checkpoint(void)
1956 /*=========================*/
1957 {
1958 	ut_ad(log_mutex_own());
1959 	ut_ad(log_sys->n_pending_checkpoint_writes == 0);
1960 
1961 	log_sys->next_checkpoint_no++;
1962 
1963 	ut_ad(log_sys->next_checkpoint_lsn >= log_sys->last_checkpoint_lsn);
1964 	log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
1965 	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
1966 		    log_sys->lsn - log_sys->last_checkpoint_lsn);
1967 
1968 	DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
1969 			      ", flushed to " LSN_PF,
1970 			      log_sys->last_checkpoint_lsn,
1971 			      log_sys->flushed_to_disk_lsn));
1972 
1973 	rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
1974 }
1975 
1976 /******************************************************//**
1977 Completes an asynchronous checkpoint info write i/o to a log file. */
1978 static
1979 void
log_io_complete_checkpoint(void)1980 log_io_complete_checkpoint(void)
1981 /*============================*/
1982 {
1983 	MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
1984 
1985 	log_mutex_enter();
1986 
1987 	ut_ad(log_sys->n_pending_checkpoint_writes > 0);
1988 
1989 	if (--log_sys->n_pending_checkpoint_writes == 0) {
1990 		log_complete_checkpoint();
1991 	}
1992 
1993 	log_mutex_exit();
1994 
1995 	/* Wake the redo log watching thread to parse the log up to this
1996 	checkpoint. */
1997 	if (srv_track_changed_pages) {
1998 		os_event_reset(srv_redo_log_tracked_event);
1999 		os_event_set(srv_checkpoint_completed_event);
2000 	}
2001 }
2002 
2003 /******************************************************//**
2004 Writes the checkpoint info to a log group header. */
2005 static
2006 void
log_group_checkpoint(log_group_t * group)2007 log_group_checkpoint(
2008 /*=================*/
2009 	log_group_t*	group)	/*!< in: log group */
2010 {
2011 	lsn_t		lsn_offset;
2012 	byte*		buf;
2013 
2014 	ut_ad(!srv_read_only_mode);
2015 	ut_ad(log_mutex_own());
2016 	ut_ad(srv_shutdown_state != SRV_SHUTDOWN_LAST_PHASE);
2017 #if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE
2018 # error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE"
2019 #endif
2020 
2021 	DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
2022 			      " written to group " ULINTPF,
2023 			      log_sys->next_checkpoint_no,
2024 			      log_sys->next_checkpoint_lsn,
2025 			      group->id));
2026 
2027 	buf = group->checkpoint_buf;
2028 	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
2029 
2030 #ifdef UNIV_DEBUG
2031 	lsn_t		old_next_checkpoint_lsn
2032 		= mach_read_from_8(buf + LOG_CHECKPOINT_LSN);
2033 	ut_ad(old_next_checkpoint_lsn <= log_sys->next_checkpoint_lsn);
2034 #endif /* UNIV_DEBUG */
2035 	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
2036 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
2037 
2038 	lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
2039 					       group);
2040 	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
2041 	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
2042 
2043 	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
2044 
2045 	MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);
2046 
2047 	log_sys->n_log_ios++;
2048 
2049 	MONITOR_INC(MONITOR_LOG_IO);
2050 
2051 	ut_ad(LOG_CHECKPOINT_1 < univ_page_size.physical());
2052 	ut_ad(LOG_CHECKPOINT_2 < univ_page_size.physical());
2053 
2054 	if (log_sys->n_pending_checkpoint_writes++ == 0) {
2055 		rw_lock_x_lock_gen(&log_sys->checkpoint_lock,
2056 				   LOG_CHECKPOINT);
2057 	}
2058 
2059 	/* Note: We alternate the physical place of the checkpoint info.
2060 	See the (next_checkpoint_no & 1) below. */
2061 
2062 	/* We send as the last parameter the group machine address
2063 	added with 1, as we want to distinguish between a normal log
2064 	file write and a checkpoint field write */
2065 
2066 	fil_io(IORequestLogWrite, false,
2067 	       page_id_t(group->space_id, 0),
2068 	       univ_page_size,
2069 	       (log_sys->next_checkpoint_no & 1)
2070 	       ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
2071 	       OS_FILE_LOG_BLOCK_SIZE,
2072 	       buf, (byte*) group + 1);
2073 
2074 	ut_ad(((ulint) group & 0x1UL) == 0);
2075 }
2076 
2077 #ifdef UNIV_HOTBACKUP
2078 /******************************************************//**
2079 Writes info to a buffer of a log group when log files are created in
2080 backup restoration. */
2081 void
log_reset_first_header_and_checkpoint(byte * hdr_buf,ib_uint64_t start)2082 log_reset_first_header_and_checkpoint(
2083 /*==================================*/
2084 	byte*		hdr_buf,/*!< in: buffer which will be written to the
2085 				start of the first log file */
2086 	ib_uint64_t	start)	/*!< in: lsn of the start of the first log file;
2087 				we pretend that there is a checkpoint at
2088 				start + LOG_BLOCK_HDR_SIZE */
2089 {
2090 	byte*		buf;
2091 	ib_uint64_t	lsn;
2092 
2093 	mach_write_to_4(hdr_buf + LOG_HEADER_FORMAT,
2094 			LOG_HEADER_FORMAT_CURRENT);
2095 	mach_write_to_8(hdr_buf + LOG_HEADER_START_LSN, start);
2096 
2097 	lsn = start + LOG_BLOCK_HDR_SIZE;
2098 
2099 	/* Write the label of mysqlbackup --restore */
2100 	strcpy((char*)hdr_buf + LOG_HEADER_CREATOR, LOG_HEADER_CREATOR_CURRENT);
2101 	ut_sprintf_timestamp((char*) hdr_buf
2102 			     + (LOG_HEADER_CREATOR
2103 			     + (sizeof LOG_HEADER_CREATOR_CURRENT) - 1));
2104 	buf = hdr_buf + LOG_CHECKPOINT_1;
2105 	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
2106 
2107 	/*mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0);*/
2108 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);
2109 
2110 	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET,
2111 			LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
2112 	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);
2113 
2114 	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
2115 }
2116 #endif /* UNIV_HOTBACKUP */
2117 
2118 #ifndef UNIV_HOTBACKUP
2119 /** Read a log group header page to log_sys->checkpoint_buf.
2120 @param[in]	group	log group
2121 @param[in]	header	0 or LOG_CHEKCPOINT_1 or LOG_CHECKPOINT2 */
2122 void
log_group_header_read(const log_group_t * group,ulint header)2123 log_group_header_read(
2124 	const log_group_t*	group,
2125 	ulint			header)
2126 {
2127 	ut_ad(log_mutex_own());
2128 
2129 	log_sys->n_log_ios++;
2130 
2131 	MONITOR_INC(MONITOR_LOG_IO);
2132 
2133 	fil_io(IORequestLogRead, true,
2134 	       page_id_t(group->space_id, header / univ_page_size.physical()),
2135 	       univ_page_size, header % univ_page_size.physical(),
2136 	       OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
2137 }
2138 
2139 /** Write checkpoint info to the log header and invoke log_mutex_exit().
2140 @param[in]	sync	whether to wait for the write to complete */
2141 void
log_write_checkpoint_info(bool sync)2142 log_write_checkpoint_info(
2143 	bool	sync)
2144 {
2145 	log_group_t*	group;
2146 
2147 	ut_ad(log_mutex_own());
2148 
2149 	if (!srv_read_only_mode) {
2150 		for (group = UT_LIST_GET_FIRST(log_sys->log_groups);
2151 		     group;
2152 		     group = UT_LIST_GET_NEXT(log_groups, group)) {
2153 
2154 			log_group_checkpoint(group);
2155 		}
2156 	}
2157 
2158 	log_mutex_exit();
2159 
2160 	MONITOR_INC(MONITOR_NUM_CHECKPOINT);
2161 
2162 	if (sync) {
2163 		/* Wait for the checkpoint write to complete */
2164 		rw_lock_s_lock(&log_sys->checkpoint_lock);
2165 		rw_lock_s_unlock(&log_sys->checkpoint_lock);
2166 
2167 		DEBUG_SYNC_C("checkpoint_completed");
2168 
2169 		DBUG_EXECUTE_IF(
2170 			"crash_after_checkpoint",
2171 			DBUG_SUICIDE(););
2172 	}
2173 }
2174 
2175 /** Set extra data to be written to the redo log during checkpoint.
2176 @param[in]	buf	data to be appended on checkpoint, or NULL
2177 @return pointer to previous data to be appended on checkpoint */
2178 mtr_buf_t*
log_append_on_checkpoint(mtr_buf_t * buf)2179 log_append_on_checkpoint(
2180 	mtr_buf_t*	buf)
2181 {
2182 	log_mutex_enter();
2183 	mtr_buf_t*	old = log_sys->append_on_checkpoint;
2184 	log_sys->append_on_checkpoint = buf;
2185 	log_mutex_exit();
2186 	return(old);
2187 }
2188 
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log files. Use log_make_checkpoint_at() to flush also the pool.
@param[in]	sync		whether to wait for the write to complete
@param[in]	write_always	force a write even if no log
has been generated since the latest checkpoint
@return true if success, false if a checkpoint write was already running */
bool
log_checkpoint(
	bool	sync,
	bool	write_always)
{
	lsn_t	oldest_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_recovery_is_on()) {
		/* All redo records must be applied to their pages before
		a checkpoint may be taken; see the explanation in
		log_preflush_pool_modified_pages(). */
		recv_apply_hashed_log_recs(TRUE);
	}

#ifndef _WIN32
	/* Sync the data files first, except for the flush methods that
	deliberately skip this step (NOSYNC, ALL_O_DIRECT). */
	switch (srv_unix_file_flush_method) {
	case SRV_UNIX_NOSYNC:
	case SRV_UNIX_ALL_O_DIRECT:
		break;
	case SRV_UNIX_O_DSYNC:
	case SRV_UNIX_FSYNC:
	case SRV_UNIX_LITTLESYNC:
	case SRV_UNIX_O_DIRECT:
	case SRV_UNIX_O_DIRECT_NO_FSYNC:
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
	}
#endif /* !_WIN32 */

	log_mutex_enter();

	ut_ad(!recv_no_log_write);
	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Because log also contains headers and dummy log records,
	log_buf_pool_get_oldest_modification() will return log_sys->lsn
	if the buffer pool contains no dirty buffers.
	We must make sure that the log is flushed up to that lsn.
	If there are dirty buffers in the buffer pool, then our
	write-ahead-logging algorithm ensures that the log has been
	flushed up to oldest_lsn. */

	ut_ad(oldest_lsn >= log_sys->last_checkpoint_lsn);
	if (!write_always
	    && oldest_lsn
	    <= log_sys->last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
		/* Do nothing, because nothing was logged (other than
		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
		log_mutex_exit();
		return(true);
	}

	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
	case some log records between the checkpoint and log_sys->lsn
	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
	except on clean shutdown, where the log will be empty after
	the checkpoint.

	It is important that we write out the redo log before any
	further dirty pages are flushed to the tablespace files.  At
	this point, because log_mutex_own(), mtr_commit() in other
	threads will be blocked, and no pages can be added to the
	flush lists. */
	lsn_t		flush_lsn	= oldest_lsn;
	const bool	do_write
		= srv_shutdown_state == SRV_SHUTDOWN_NONE
		|| flush_lsn != log_sys->lsn;

	if (fil_names_clear(flush_lsn, do_write)) {
		/* fil_names_clear() appended records: the log must now be
		flushed all the way up to the current lsn. */
		ut_ad(log_sys->lsn >= flush_lsn + SIZE_OF_MLOG_CHECKPOINT);
		flush_lsn = log_sys->lsn;
	}

	log_mutex_exit();

	/* Durably write the log up to flush_lsn before the checkpoint
	info may refer to it. */
	log_write_up_to(flush_lsn, true);

	DBUG_EXECUTE_IF(
		"using_wa_checkpoint_middle",
		if (write_always) {
			DEBUG_SYNC_C("wa_checkpoint_middle");

			const my_bool b = TRUE;
			buf_flush_page_cleaner_disabled_debug_update(
				NULL, NULL, NULL, &b);
			dict_stats_disabled_debug_update(
				NULL, NULL, NULL, &b);
			srv_master_thread_disabled_debug_update(
				NULL, NULL, NULL, &b);
		});

	log_mutex_enter();

	ut_ad(log_sys->flushed_to_disk_lsn >= flush_lsn);
	ut_ad(flush_lsn >= oldest_lsn);

	/* Another thread may have advanced the checkpoint while the
	mutex was released above; then there is nothing left to do. */
	if (log_sys->last_checkpoint_lsn >= oldest_lsn) {
		log_mutex_exit();
		return(true);
	}

	if (log_sys->n_pending_checkpoint_writes > 0) {
		/* A checkpoint write is running */
		log_mutex_exit();

		if (sync) {
			/* Wait for the checkpoint write to complete */
			rw_lock_s_lock(&log_sys->checkpoint_lock);
			rw_lock_s_unlock(&log_sys->checkpoint_lock);
		}

		return(false);
	}

	/* Start the checkpoint write; log_write_checkpoint_info()
	releases the log mutex. */
	ut_ad(oldest_lsn >= log_sys->next_checkpoint_lsn);
	log_sys->next_checkpoint_lsn = oldest_lsn;
	log_write_checkpoint_info(sync);
	ut_ad(!log_mutex_own());

	return(true);
}
2317 
2318 /** Make a checkpoint at or after a specified LSN.
2319 @param[in]	lsn		the log sequence number, or LSN_MAX
2320 for the latest LSN
2321 @param[in]	write_always	force a write even if no log
2322 has been generated since the latest checkpoint */
2323 void
log_make_checkpoint_at(lsn_t lsn,bool write_always)2324 log_make_checkpoint_at(
2325 	lsn_t			lsn,
2326 	bool			write_always)
2327 {
2328 	/* Preflush pages synchronously */
2329 
2330 	if (srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE) {
2331 		while (!log_preflush_pool_modified_pages(lsn)) {
2332 			/* Flush as much as we can */
2333 		}
2334 	}
2335 
2336 	while (!log_checkpoint(true, write_always)) {
2337 		/* Force a checkpoint */
2338 	}
2339 }
2340 
2341 /****************************************************************//**
2342 Tries to establish a big enough margin of free space in the log groups, such
2343 that a new log entry can be catenated without an immediate need for a
2344 checkpoint. NOTE: this function may only be called if the calling thread
2345 owns no synchronization objects! */
2346 static
2347 void
log_checkpoint_margin(void)2348 log_checkpoint_margin(void)
2349 /*=======================*/
2350 {
2351 	log_t*		log		= log_sys;
2352 	lsn_t		age;
2353 	lsn_t		checkpoint_age;
2354 	ib_uint64_t	advance;
2355 	lsn_t		oldest_lsn;
2356 	bool		success;
2357 loop:
2358 	advance = 0;
2359 
2360 	log_mutex_enter();
2361 	ut_ad(!recv_no_log_write);
2362 
2363 	if (!log->check_flush_or_checkpoint) {
2364 		log_mutex_exit();
2365 		return;
2366 	}
2367 
2368 	oldest_lsn = log_buf_pool_get_oldest_modification();
2369 
2370 	age = log->lsn - oldest_lsn;
2371 
2372 	if (age > log->max_modified_age_sync) {
2373 
2374 		/* A flush is urgent: we have to do a synchronous preflush */
2375 		advance = age - log->max_modified_age_sync;
2376 	}
2377 
2378 	checkpoint_age = log->lsn - log->last_checkpoint_lsn;
2379 
2380 	bool	checkpoint_sync;
2381 	bool	do_checkpoint;
2382 
2383 	if (checkpoint_age > log->max_checkpoint_age) {
2384 		/* A checkpoint is urgent: we do it synchronously */
2385 		checkpoint_sync = true;
2386 		do_checkpoint = true;
2387 	} else if (checkpoint_age > log->max_checkpoint_age_async) {
2388 		/* A checkpoint is not urgent: do it asynchronously */
2389 		do_checkpoint = true;
2390 		checkpoint_sync = false;
2391 		log->check_flush_or_checkpoint = false;
2392 	} else {
2393 		do_checkpoint = false;
2394 		checkpoint_sync = false;
2395 		log->check_flush_or_checkpoint = false;
2396 	}
2397 
2398 	log_mutex_exit();
2399 
2400 	if (advance) {
2401 		lsn_t	new_oldest = oldest_lsn + advance;
2402 
2403 		success = log_preflush_pool_modified_pages(new_oldest);
2404 
2405 		/* If the flush succeeded, this thread has done its part
2406 		and can proceed. If it did not succeed, there was another
2407 		thread doing a flush at the same time. */
2408 		if (!success) {
2409 			log_mutex_enter();
2410 
2411 			log->check_flush_or_checkpoint = true;
2412 
2413 			log_mutex_exit();
2414 			goto loop;
2415 		}
2416 	}
2417 
2418 	if (do_checkpoint) {
2419 		log_checkpoint(checkpoint_sync, FALSE);
2420 
2421 		if (checkpoint_sync) {
2422 
2423 			goto loop;
2424 		}
2425 	}
2426 }
2427 
/******************************************************//**
Reads a specified log segment to a buffer. Optionally releases the log mutex
before the I/O.*/
void
log_group_read_log_seg(
/*===================*/
	byte*		buf,		/*!< in: buffer where to read */
	log_group_t*	group,		/*!< in: log group */
	lsn_t		start_lsn,	/*!< in: read area start */
	lsn_t		end_lsn,	/*!< in: read area end */
	bool		release_mutex)	/*!< in: whether the log_sys->mutex
					should be released before the read */
{
	ulint	len;
	lsn_t	source_offset;

	ut_ad(log_mutex_own());

loop:
	source_offset = log_group_calc_lsn_offset(start_lsn, group);

	ut_a(end_lsn - start_lsn <= ULINT_MAX);
	len = (ulint) (end_lsn - start_lsn);

	ut_ad(len != 0);

	/* A single read must not cross a log file boundary within the
	group: cap the length at the end of the current file and loop
	for the remainder. */
	if ((source_offset % group->file_size) + len > group->file_size) {

		/* If the above condition is true then len (which is ulint)
		is > the expression below, so the typecast is ok */
		len = (ulint) (group->file_size -
			(source_offset % group->file_size));
	}

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX);

	/* Optionally drop the mutex for the duration of the synchronous
	read below. */
	if (release_mutex) {
		log_mutex_exit();
	}

	const ulint	page_no
		= (ulint) (source_offset / univ_page_size.physical());

	fil_io(IORequestLogRead, true,
	       page_id_t(group->space_id, page_no),
	       univ_page_size,
	       (ulint) (source_offset % univ_page_size.physical()),
	       len, buf, NULL);

	start_lsn += len;
	buf += len;

	if (start_lsn != end_lsn) {

		/* More data remains (the read was capped at a file
		boundary): reacquire the mutex if it was released, since
		log_group_calc_lsn_offset() is called under it, and read
		the next piece. */
		if (release_mutex) {
			log_mutex_enter();
		}
		goto loop;
	}
}
2492 
2493 /**
2494 Checks that there is enough free space in the log to start a new query step.
2495 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
2496 function may only be called if the calling thread owns no synchronization
2497 objects! */
2498 void
log_check_margins(void)2499 log_check_margins(void)
2500 {
2501 	bool	check	= true;
2502 
2503 	do {
2504 		log_flush_margin();
2505 		log_checkpoint_margin();
2506 		log_mutex_enter();
2507 		if (log_check_tracking_margin(0)) {
2508 			log_mutex_exit();
2509 			os_thread_sleep(10000);
2510 			continue;
2511 		}
2512 		ut_ad(!recv_no_log_write);
2513 		check = log_sys->check_flush_or_checkpoint;
2514 		log_mutex_exit();
2515 	} while (check);
2516 }
2517 
/****************************************************************//**
Makes a checkpoint at the latest lsn and writes it to first page of each
data file in the database, so that we know that the file spaces contain
all modifications up to that lsn. This can only be called at database
shutdown. This function also writes all log in log files to the log archive.
The function drives the shutdown through the srv_shutdown_state phases:
CLEANUP -> FLUSH_PHASE -> LAST_PHASE, looping back to "loop" until each
precondition holds. */
void
logs_empty_and_mark_files_at_shutdown(void)
/*=======================================*/
{
	lsn_t			lsn;
	lsn_t			tracked_lsn;
	ulint			count = 0;	/* drives once-per-minute progress messages */
	ulint			total_trx;
	ulint			pending_io;
	enum srv_thread_type	active_thd;
	const char*		thread_name;

	ib::info() << "Starting shutdown...";

	while (srv_fast_shutdown == 0 && trx_rollback_or_clean_is_active) {
		/* we should wait until rollback after recovery end
		for slow shutdown */
		os_thread_sleep(100000);
	}

	/* Wait until the master thread and all other operations are idle: our
	algorithm only works if the server is idle at shutdown */

	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
loop:
	/* Poll every 100 ms; count reaches 600 after about a minute,
	which triggers the verbose "Waiting for ..." messages below. */
	os_thread_sleep(100000);

	count++;

	/* We need the monitor threads to stop before we proceed with
	a shutdown. */

	thread_name = srv_any_background_threads_are_active();

	if (thread_name != NULL) {
		/* Print a message every 60 seconds if we are waiting
		for the monitor thread to exit. Master and worker
		threads check will be done later. */

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << thread_name
				<< " to exit";
			count = 0;
		}

		goto loop;
	}

	/* Check that there are no longer transactions, except for
	PREPARED ones. We need this wait even for the 'very fast'
	shutdown, because the InnoDB layer may have committed or
	prepared transactions and we don't want to lose them. */

	total_trx = trx_sys_any_active_transactions();

	if (total_trx > 0) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << total_trx << " active"
				<< " transactions to finish";

			count = 0;
		}

		goto loop;
	}

	/* Check that the background threads are suspended */

	active_thd = srv_get_active_thread_type();

	if (active_thd != SRV_NONE || srv_n_fil_crypt_threads_started) {

		if (active_thd == SRV_PURGE) {
			srv_purge_wakeup();
		}

		/* The srv_lock_timeout_thread, srv_error_monitor_thread
		and srv_monitor_thread should already exit by now. The
		only threads to be suspended are the master threads
		and worker threads (purge threads). Print the thread
		type if any of such threads not in suspended mode */
		if (srv_print_verbose_log && count > 600) {
			const char*	thread_type = "<null>";

			switch (active_thd) {
			case SRV_NONE:
				/* Only possible when an encryption thread
				is still running (checked above). */
				ut_ad(srv_n_fil_crypt_threads_started);
				thread_type = "encryption thread";
				break;
			case SRV_WORKER:
				thread_type = "worker threads";
				break;
			case SRV_MASTER:
				thread_type = "master thread";
				break;
			case SRV_PURGE:
				thread_type = "purge thread";
				break;
			}

			ib::info() << "Waiting for " << thread_type
				<< " to be suspended";

			count = 0;
		}

		goto loop;
	}

	/* At this point only page_cleaner should be active. We wait
	here to let it complete the flushing of the buffer pools
	before proceeding further. */
	os_rmb;
	ut_ad(buf_lru_manager_running_threads == srv_buf_pool_instances
	      || buf_lru_manager_running_threads == 0);
	srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
	count = 0;
	while (buf_page_cleaner_is_active
	       || buf_lru_manager_running_threads > 0) {

		if (srv_print_verbose_log && count == 0) {
			ib::info() << "Waiting for page_cleaner to"
				" finish flushing of buffer pool";
		}
		++count;
		os_thread_sleep(100000);
		if (count > 600) {
			count = 0;
		}

		os_rmb;
	}

	/* Snapshot the pending log I/O counters under the mutex. */
	log_mutex_enter();
	const ulint	n_write	= log_sys->n_pending_checkpoint_writes;
	const ulint	n_flush	= log_sys->n_pending_flushes;
	log_mutex_exit();

	if (log_scrub_thread_active || n_write != 0 || n_flush != 0) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Pending checkpoint_writes: " << n_write
				<< ". Pending log flush writes: " << n_flush;
			count = 0;
		}
		goto loop;
	}

	pending_io = buf_pool_check_no_pending_io();

	/* NOTE(review): log_scrub_thread_active was checked just above;
	presumably the event wake-up lets the scrub thread exit before the
	assertion below is evaluated in debug builds — verify. */
        if (log_scrub_thread_active) {
                ut_ad(!srv_read_only_mode);
                os_event_set(log_scrub_event);
        }

        ut_ad(!log_scrub_thread_active);

	if (pending_io) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << pending_io << " buffer"
				" page I/Os to complete";
			count = 0;
		}

		goto loop;
	}

	if (srv_fast_shutdown == 2) {
		if (!srv_read_only_mode) {
			ib::info() << "MySQL has requested a very fast"
				" shutdown without flushing the InnoDB buffer"
				" pool to data files. At the next mysqld"
				" startup InnoDB will do a crash recovery!";

			/* In this fastest shutdown we do not flush the
			buffer pool:

			it is essentially a 'crash' of the InnoDB server.
			Make sure that the log is all flushed to disk, so
			that we can recover all committed transactions in
			a crash recovery. We must not write the lsn stamps
			to the data files, since at a startup InnoDB deduces
			from the stamps if the previous shutdown was clean. */

			log_buffer_flush_to_disk();

			/* Check that the background threads stay suspended */
			thread_name = srv_any_background_threads_are_active();

			if (thread_name != NULL) {
				ib::warn() << "Background thread "
					<< thread_name << " woke up during"
					" shutdown";
				goto loop;
			}
		}

		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

		/* Wake the log tracking thread which will then immediatelly
		quit because of srv_shutdown_state value */
		if (srv_redo_log_thread_started) {
			os_event_reset(srv_redo_log_tracked_event);
			os_event_set(srv_checkpoint_completed_event);
		}

		fil_close_all_files();

		thread_name = srv_any_background_threads_are_active();

		ut_a(!thread_name);

		return;
	}

	/* Slow or normal shutdown: take a final forced checkpoint. */
	if (!srv_read_only_mode) {
		log_make_checkpoint_at(LSN_MAX, TRUE);
	}

	log_mutex_enter();

	tracked_lsn = log_get_tracked_lsn();

	lsn = log_sys->lsn;

	/** If innodb_force_recovery is set to 6 then log_sys doesn't
	have recent checkpoint information. So last checkpoint lsn
	will never be equal to current lsn. */
	const bool      is_last =
		((srv_force_recovery == SRV_FORCE_NO_LOG_REDO
		  && lsn == log_sys->last_checkpoint_lsn
		  + LOG_BLOCK_HDR_SIZE)
		 || lsn == log_sys->last_checkpoint_lsn)
		&& (!srv_track_changed_pages
		    || tracked_lsn == log_sys->last_checkpoint_lsn);

	ut_ad(lsn >= log_sys->last_checkpoint_lsn);

	log_mutex_exit();

	/* If new log was generated after the checkpoint above (or the
	tracker is behind), start over. */
	if (!is_last) {
		goto loop;
	}

	/* Check that the background threads stay suspended */
	thread_name = srv_any_background_threads_are_active();
	if (thread_name != NULL) {
		ib::warn() << "Background thread " << thread_name << " woke up"
			" during shutdown";

		goto loop;
	}

	if (!srv_read_only_mode) {
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
		fil_flush_file_spaces(FIL_TYPE_LOG);
	}

	/* The call fil_write_flushed_lsn() will bypass the buffer
	pool: therefore it is essential that the buffer pool has been
	completely flushed to disk! (We do not call fil_write... if the
	'very fast' shutdown is enabled.) */

	if (!buf_all_freed()) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for dirty buffer pages to be"
				" flushed";
			count = 0;
		}

		goto loop;
	}

	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

	/* Signal the log following thread to quit */
	if (srv_redo_log_thread_started) {
		os_event_reset(srv_redo_log_tracked_event);
		os_event_set(srv_checkpoint_completed_event);
	}

	/* Make some checks that the server really is quiet */
	srv_thread_type	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	bool	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);
	ut_ad(srv_force_recovery >= SRV_FORCE_NO_LOG_REDO
	      || lsn == log_sys->last_checkpoint_lsn);

	if (lsn < srv_start_lsn) {
		ib::error() << "Log sequence number at shutdown " << lsn
			<< " is lower than at startup " << srv_start_lsn
			<< "!";
	}

	srv_shutdown_lsn = lsn;

	/* Stamp the flushed lsn into the data files so that the next
	startup can detect a clean shutdown. */
	if (!srv_read_only_mode) {
		fil_write_flushed_lsn(lsn);
	}

	fil_close_all_files();

	/* Make some checks that the server really is quiet */
	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);
}
2839 
2840 /******************************************************//**
2841 Peeks the current lsn.
2842 @return TRUE if success, FALSE if could not get the log system mutex */
2843 ibool
log_peek_lsn(lsn_t * lsn)2844 log_peek_lsn(
2845 /*=========*/
2846 	lsn_t*	lsn)	/*!< out: if returns TRUE, current lsn is here */
2847 {
2848 	if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
2849 		*lsn = log_sys->lsn;
2850 
2851 		log_mutex_exit();
2852 
2853 		return(TRUE);
2854 	}
2855 
2856 	return(FALSE);
2857 }
2858 
2859 /******************************************************//**
2860 Prints info of the log. */
2861 void
log_print(FILE * file)2862 log_print(
2863 /*======*/
2864 	FILE*	file)	/*!< in: file where to print */
2865 {
2866 	double	time_elapsed;
2867 	time_t	current_time;
2868 
2869 	log_mutex_enter();
2870 
2871 	fprintf(file,
2872 		"Log sequence number " LSN_PF "\n"
2873 		"Log flushed up to   " LSN_PF "\n"
2874 		"Pages flushed up to " LSN_PF "\n"
2875 		"Last checkpoint at  " LSN_PF "\n",
2876 		log_sys->lsn,
2877 		log_sys->flushed_to_disk_lsn,
2878 		log_buf_pool_get_oldest_modification(),
2879 		log_sys->last_checkpoint_lsn);
2880 
2881 	fprintf(file,
2882 		"Max checkpoint age    " LSN_PF "\n"
2883 		"Checkpoint age target " LSN_PF "\n"
2884 		"Modified age          " LSN_PF "\n"
2885 		"Checkpoint age        " LSN_PF "\n",
2886 		log_sys->max_checkpoint_age,
2887 		log_sys->max_checkpoint_age_async,
2888 		log_sys->lsn -log_buf_pool_get_oldest_modification(),
2889 		log_sys->lsn - log_sys->last_checkpoint_lsn);
2890 
2891 	current_time = time(NULL);
2892 
2893 	time_elapsed = difftime(current_time,
2894 				log_sys->last_printout_time);
2895 
2896 	if (time_elapsed <= 0) {
2897 		time_elapsed = 1;
2898 	}
2899 
2900 	fprintf(file,
2901 		ULINTPF " pending log flushes, "
2902 		ULINTPF " pending chkp writes\n"
2903 		ULINTPF " log i/o's done, %.2f log i/o's/second\n",
2904 		log_sys->n_pending_flushes,
2905 		log_sys->n_pending_checkpoint_writes,
2906 		log_sys->n_log_ios,
2907 		static_cast<double>(
2908 			log_sys->n_log_ios - log_sys->n_log_ios_old)
2909 		/ time_elapsed);
2910 
2911 	if (srv_track_changed_pages) {
2912 
2913 		/* The maximum tracked LSN age is equal to the maximum
2914 		checkpoint age */
2915 		fprintf(file,
2916 			"Log tracking enabled\n"
2917 			"Log tracked up to   " LSN_PF "\n"
2918 			"Max tracked LSN age " LSN_PF "\n",
2919 			log_get_tracked_lsn(),
2920 			log_sys->max_checkpoint_age);
2921 	}
2922 
2923 	log_sys->n_log_ios_old = log_sys->n_log_ios;
2924 	log_sys->last_printout_time = current_time;
2925 
2926 	log_mutex_exit();
2927 }
2928 
2929 /**********************************************************************//**
2930 Refreshes the statistics used to print per-second averages. */
2931 void
log_refresh_stats(void)2932 log_refresh_stats(void)
2933 /*===================*/
2934 {
2935 	log_sys->n_log_ios_old = log_sys->n_log_ios;
2936 	log_sys->last_printout_time = time(NULL);
2937 }
2938 
2939 /********************************************************//**
2940 Closes a log group. */
2941 static
2942 void
log_group_close(log_group_t * group)2943 log_group_close(
2944 /*===========*/
2945 	log_group_t*	group)		/* in,own: log group to close */
2946 {
2947 	ulint	i;
2948 
2949 	for (i = 0; i < group->n_files; i++) {
2950 		ut_free(group->file_header_bufs_ptr[i]);
2951 	}
2952 
2953 	ut_free(group->file_header_bufs_ptr);
2954 	ut_free(group->file_header_bufs);
2955 	ut_free(group->checkpoint_buf_ptr);
2956 	ut_free(group);
2957 }
2958 
2959 /********************************************************//**
2960 Closes all log groups. */
2961 void
log_group_close_all(void)2962 log_group_close_all(void)
2963 /*=====================*/
2964 {
2965 	log_group_t*	group;
2966 
2967 	group = UT_LIST_GET_FIRST(log_sys->log_groups);
2968 
2969 	while (UT_LIST_GET_LEN(log_sys->log_groups) > 0) {
2970 		log_group_t*	prev_group = group;
2971 
2972 		group = UT_LIST_GET_NEXT(log_groups, group);
2973 
2974 		UT_LIST_REMOVE(log_sys->log_groups, prev_group);
2975 
2976 		log_group_close(prev_group);
2977 	}
2978 }
2979 
/********************************************************//**
Shutdown the log system but do not release all the memory. */
void
log_shutdown(void)
/*==============*/
{
	/* Close and free every log group first, while the rest of
	log_sys is still intact. */
	log_group_close_all();

	/* Free the log buffer and the checkpoint buffer; NULL out the
	pointers (both the owning *_ptr and the derived aliases) so no
	stale reference survives this shutdown. */
	ut_free(log_sys->buf_ptr);
	log_sys->buf_ptr = NULL;
	log_sys->buf = NULL;
	ut_free(log_sys->checkpoint_buf_ptr);
	log_sys->checkpoint_buf_ptr = NULL;
	log_sys->checkpoint_buf = NULL;

	/* Destroy synchronization objects only after all users of the
	log system are gone; the order below matters. */
	os_event_destroy(log_sys->flush_event);

	rw_lock_free(&log_sys->checkpoint_lock);

	mutex_free(&log_sys->mutex);
	mutex_free(&log_sys->write_mutex);
	mutex_free(&log_sys->log_flush_order_mutex);

	/* The scrub event exists only when scrubbing was enabled in a
	read-write server (see log_scrub_thread). */
	if (!srv_read_only_mode && srv_scrub_log) {
		os_event_destroy(log_scrub_event);
	}

	recv_sys_close();
}
3009 
3010 /********************************************************//**
3011 Free the log system data structures. */
3012 void
log_mem_free(void)3013 log_mem_free(void)
3014 /*==============*/
3015 {
3016 	if (log_sys != NULL) {
3017 		recv_sys_mem_free();
3018 		ut_free(log_sys);
3019 
3020 		log_sys = NULL;
3021 	}
3022 }
3023 
/** Pad the rest of the current log block with MLOG_DUMMY_RECORD bytes
so that the block carries no stale payload (log scrubbing). Must not be
called when redo logging is disabled. */
static void
log_pad_current_log_block(void)
{
	ut_ad(!recv_no_log_write);
	/* Reserve at most one block's worth of log space and open the
	log for writing. */
	lsn_t lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);

	/* Payload bytes remaining in the current block before its
	trailer. */
	ulint pad_length = OS_FILE_LOG_BLOCK_SIZE -
			   log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE -
			   LOG_BLOCK_TRL_SIZE;
	if (pad_length == (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE -
			   LOG_BLOCK_TRL_SIZE)) {
		/* The block contains only its header: it is already
		empty, so there is nothing to scrub. */
		pad_length = 0;
	}

	if (pad_length) {
		srv_stats.n_log_scrubs.inc();
	}

	/* Fill the remaining payload area one dummy byte at a time. */
	for (ulint i = 0; i < pad_length; i++) {
		byte b = MLOG_DUMMY_RECORD;
		log_write_low(&b, 1);
	}

	lsn = log_sys->lsn;

	log_close();

	/* After padding, the LSN must sit exactly at the start of a new
	block's payload (just past its header). */
	ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
}
3053 
3054 /** If no log record has been written for a while, fill current log
3055 block with dummy records. */
3056 static void
log_scrub()3057 log_scrub()
3058 {
3059 	log_mutex_enter();
3060 	ulint cur_lbn = log_block_convert_lsn_to_no(log_sys->lsn);
3061 
3062 	if (next_lbn_to_pad == cur_lbn) {
3063 		log_pad_current_log_block();
3064 	}
3065 
3066 	next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys->lsn);
3067 	log_mutex_exit();
3068 }
3069 
/* Log scrubbing speed, in bytes/sec; determines how often the scrub
thread wakes up (see log_scrub_thread). */
ulonglong innodb_scrub_log_speed;
3072 
/** This is the main thread for log scrub. It waits for an event and
when waked up fills current log block with dummy records and sleeps again.
@return this function does not return, it calls os_thread_exit() */
extern "C" os_thread_ret_t
DECLARE_THREAD(log_scrub_thread)(void *) {
	ut_ad(!srv_read_only_mode);

	while (srv_shutdown_state < SRV_SHUTDOWN_FLUSH_PHASE) {
		/* Log scrubbing interval in microseconds: the time it
		takes to write one 512-byte block at the configured
		speed.
		NOTE(review): divides by innodb_scrub_log_speed without
		a zero check — presumably the sysvar enforces a minimum
		of 1; confirm against its definition. */
		ulonglong interval =
		    1000 * 1000 * 512 / innodb_scrub_log_speed;

		/* Sleep until either the interval elapses or the event
		is signalled (e.g. at shutdown). */
		os_event_wait_time(log_scrub_event,
				   static_cast<ulint>(interval));

		log_scrub();

		/* Re-arm the event so the next wait blocks again. */
		os_event_reset(log_scrub_event);
	}

	/* Signal that this thread has finished before exiting. */
	log_scrub_thread_active = false;

	/* We count the number of threads in os_thread_exit(). A created
        thread should always use that to exit and not use return() to exit. */
	os_thread_exit();

	OS_THREAD_DUMMY_RETURN;
}
3101 
/** Redo log key version, initialized to 0.
NOTE(review): exact semantics (redo log encryption key versioning?) are
not visible in this file — confirm against the code that reads it. */
uint srv_redo_log_key_version = 0;
3103 #endif /* !UNIV_HOTBACKUP */
3104