1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Google Inc.
5 Copyright (c) 2014, 2021, MariaDB Corporation.
6 
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12 
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; version 2 of the License.
16 
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file log/log0log.cc
29 Database log
30 
31 Created 12/9/1995 Heikki Tuuri
32 *******************************************************/
33 
34 #include "univ.i"
35 #include <debug_sync.h>
36 #include <my_service_manager.h>
37 
38 #include "log0log.h"
39 #include "log0crypt.h"
40 #include "mem0mem.h"
41 #include "buf0buf.h"
42 #include "buf0flu.h"
43 #include "lock0lock.h"
44 #include "log0recv.h"
45 #include "fil0fil.h"
46 #include "dict0boot.h"
47 #include "dict0stats_bg.h"
48 #include "btr0defragment.h"
49 #include "srv0srv.h"
50 #include "srv0start.h"
51 #include "trx0sys.h"
52 #include "trx0trx.h"
53 #include "trx0roll.h"
54 #include "srv0mon.h"
55 #include "sync0sync.h"
56 
57 /*
58 General philosophy of InnoDB redo-logs:
59 
60 1) Every change to a contents of a data page must be done
61 through mtr, which in mtr_commit() writes log records
62 to the InnoDB redo log.
63 
64 2) Normally these changes are performed using a mlog_write_ulint()
65 or similar function.
66 
67 3) In some page level operations only a code number of a
68 c-function and its parameters are written to the log to
69 reduce the size of the log.
70 
71   3a) You should not add parameters to these kind of functions
72   (e.g. trx_undo_header_create())
73 
74   3b) You should not add such functionality which either change
75   working when compared with the old or are dependent on data
76   outside of the page. These kind of functions should implement
77   self-contained page transformation and it should be unchanged
78   if you don't have very essential reasons to change log
79   semantics or format.
80 
81 */
82 
83 /** Redo log system */
84 log_t	log_sys;
85 
86 /** Whether to require checksums on the redo log pages */
87 my_bool	innodb_log_checksums;
88 
89 /* Next log block number to do dummy record filling if no log records written
90 for a while */
91 static ulint		next_lbn_to_pad = 0;
92 
93 /* These control how often we print warnings if the last checkpoint is too
94 old */
95 static bool	log_has_printed_chkp_warning = false;
96 static time_t	log_last_warning_time;
97 
98 static bool	log_has_printed_chkp_margine_warning = false;
99 static time_t	log_last_margine_warning_time;
100 
101 /* A margin for free space in the log buffer before a log entry is catenated */
102 #define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)
103 
104 /* Margins for free space in the log buffer after a log entry is catenated */
105 #define LOG_BUF_FLUSH_RATIO	2
106 #define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN		\
107 				 + (4U << srv_page_size_shift))
108 
109 /* This parameter controls asynchronous making of a new checkpoint; the value
110 should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
111 
112 #define LOG_POOL_CHECKPOINT_RATIO_ASYNC	32
113 
114 /* This parameter controls synchronous preflushing of modified buffer pages */
115 #define LOG_POOL_PREFLUSH_RATIO_SYNC	16
116 
117 /* The same ratio for asynchronous preflushing; this value should be less than
118 the previous */
119 #define LOG_POOL_PREFLUSH_RATIO_ASYNC	8
120 
121 /* Codes used in unlocking flush latches */
122 #define LOG_UNLOCK_NONE_FLUSHED_LOCK	1
123 #define LOG_UNLOCK_FLUSH_LOCK		2
124 
125 /** Event to wake up log_scrub_thread */
126 os_event_t	log_scrub_event;
127 /** Whether log_scrub_thread is active */
128 bool		log_scrub_thread_active;
129 
130 extern "C" UNIV_INTERN
131 os_thread_ret_t
132 DECLARE_THREAD(log_scrub_thread)(void*);
133 
134 /****************************************************************//**
135 Returns the oldest modified block lsn in the pool, or log_sys.lsn if none
136 exists.
137 @return LSN of oldest modification */
138 static
139 lsn_t
log_buf_pool_get_oldest_modification(void)140 log_buf_pool_get_oldest_modification(void)
141 /*======================================*/
142 {
143 	lsn_t	lsn;
144 
145 	ut_ad(log_mutex_own());
146 
147 	lsn = buf_pool_get_oldest_modification();
148 
149 	if (!lsn) {
150 
151 		lsn = log_sys.lsn;
152 	}
153 
154 	return(lsn);
155 }
156 
/** Extends the log buffer.
@param[in]	len	requested minimum size in bytes */
void log_buffer_extend(ulong len)
{
	/* Allocate the replacement buffers before taking the log mutex,
	so that concurrent log writers are not stalled by the allocation. */
	const ulong new_buf_size = ut_calc_align(len, srv_page_size);
	byte* new_buf = static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
	TRASH_ALLOC(new_buf, new_buf_size);
	byte* new_flush_buf =
		static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
	TRASH_ALLOC(new_flush_buf, new_buf_size);

	log_mutex_enter();

	if (len <= srv_log_buffer_size) {
		/* Already extended enough by the others */
		log_mutex_exit();
		ut_free_dodump(new_buf, new_buf_size);
		ut_free_dodump(new_flush_buf, new_buf_size);
		return;
	}

	/* NOTE(review): the message text says "/ 2)" but the check above
	compares len against the full srv_log_buffer_size — looks like a
	leftover from an older scheme that split the buffer in half;
	confirm whether the message should be updated. */
	ib::warn() << "The redo log transaction size " << len <<
		" exceeds innodb_log_buffer_size="
		<< srv_log_buffer_size << " / 2). Trying to extend it.";

	byte* old_buf = log_sys.buf;
	byte* old_flush_buf = log_sys.flush_buf;
	const ulong old_buf_size = srv_log_buffer_size;

	/* Publish the new buffers while holding the mutex, preserving
	the not-yet-written tail of the old buffer. */
	srv_log_buffer_size = new_buf_size;
	log_sys.buf = new_buf;
	log_sys.flush_buf = new_flush_buf;
	memcpy(new_buf, old_buf, log_sys.buf_free);

	log_sys.max_buf_free = new_buf_size / LOG_BUF_FLUSH_RATIO
		- LOG_BUF_FLUSH_MARGIN;

	log_mutex_exit();

	/* Free the superseded buffers outside the mutex. */
	ut_free_dodump(old_buf, old_buf_size);
	ut_free_dodump(old_flush_buf, old_buf_size);

	ib::info() << "innodb_log_buffer_size was extended to "
		<< new_buf_size << ".";
}
202 
203 /** Calculate actual length in redo buffer and file including
204 block header and trailer.
205 @param[in]	len	length to write
206 @return actual length to write including header and trailer. */
207 static inline
208 ulint
log_calculate_actual_len(ulint len)209 log_calculate_actual_len(
210 	ulint len)
211 {
212 	ut_ad(log_mutex_own());
213 
214 	/* actual length stored per block */
215 	const ulint	len_per_blk = OS_FILE_LOG_BLOCK_SIZE
216 		- (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
217 
218 	/* actual data length in last block already written */
219 	ulint	extra_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE);
220 
221 	ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
222 	extra_len -= LOG_BLOCK_HDR_SIZE;
223 
224 	/* total extra length for block header and trailer */
225 	extra_len = ((len + extra_len) / len_per_blk)
226 		* (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
227 
228 	return(len + extra_len);
229 }
230 
/** Check margin not to overwrite transaction log from the last checkpoint.
If would estimate the log write to exceed the log_group_capacity,
waits for the checkpoint is done enough.
@param[in]	len	length of the data to be written */

void
log_margin_checkpoint_age(
	ulint	len)
{
	/* Size of the write including block headers and trailers. */
	ulint	margin = log_calculate_actual_len(len);

	ut_ad(log_mutex_own());

	if (margin > log_sys.log_group_capacity) {
		/* return with warning output to avoid deadlock */
		/* Rate-limit the message to at most one per 15 seconds. */
		if (!log_has_printed_chkp_margine_warning
		    || difftime(time(NULL),
				log_last_margine_warning_time) > 15) {
			log_has_printed_chkp_margine_warning = true;
			log_last_margine_warning_time = time(NULL);

			ib::error() << "The transaction log files are too"
				" small for the single transaction log (size="
				<< len << "). So, the last checkpoint age"
				" might exceed the log group capacity "
				<< log_sys.log_group_capacity << ".";
		}

		return;
	}

	/* Our margin check should ensure that we never reach this condition.
	Try to do checkpoint once. We cannot keep waiting here as it might
	result in hang in case the current mtr has latch on oldest lsn */
	if (log_sys.lsn - log_sys.last_checkpoint_lsn + margin
	    > log_sys.log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Makes checkpoint. */

		bool	flushed_enough = false;

		/* If the oldest dirty page is already recent enough, a
		checkpoint alone creates sufficient headroom and no extra
		wait for the page cleaner is needed. */
		if (log_sys.lsn - log_buf_pool_get_oldest_modification()
		    + margin
		    <= log_sys.log_group_capacity) {
			flushed_enough = true;
		}

		log_sys.check_flush_or_checkpoint = true;
		/* Release the log mutex while checkpointing so that other
		log writers are not blocked meanwhile. */
		log_mutex_exit();

		DEBUG_SYNC_C("margin_checkpoint_age_rescue");

		if (!flushed_enough) {
			/* Give the page cleaner a moment to flush pages. */
			os_thread_sleep(100000);
		}
		log_checkpoint(true);

		log_mutex_enter();
	}

	return;
}
293 
294 /** Open the log for log_write_low. The log must be closed with log_close.
295 @param[in]	len	length of the data to be written
296 @return start lsn of the log record */
297 lsn_t
log_reserve_and_open(ulint len)298 log_reserve_and_open(
299 	ulint	len)
300 {
301 	ulint	len_upper_limit;
302 #ifdef UNIV_DEBUG
303 	ulint	count			= 0;
304 #endif /* UNIV_DEBUG */
305 
306 loop:
307 	ut_ad(log_mutex_own());
308 
309 	/* Calculate an upper limit for the space the string may take in the
310 	log buffer */
311 
312 	len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
313 			  + (5 * len) / 4;
314 
315 	if (log_sys.buf_free + len_upper_limit > srv_log_buffer_size) {
316 		log_mutex_exit();
317 
318 		DEBUG_SYNC_C("log_buf_size_exceeded");
319 
320 		/* Not enough free space, do a write of the log buffer */
321 		log_buffer_sync_in_background(false);
322 
323 		srv_stats.log_waits.inc();
324 
325 		ut_ad(++count < 50);
326 
327 		log_mutex_enter();
328 		goto loop;
329 	}
330 
331 	return(log_sys.lsn);
332 }
333 
334 /************************************************************//**
335 Writes to the log the string given. It is assumed that the caller holds the
336 log mutex. */
337 void
log_write_low(const byte * str,ulint str_len)338 log_write_low(
339 /*==========*/
340 	const byte*	str,		/*!< in: string */
341 	ulint		str_len)	/*!< in: string length */
342 {
343 	ulint	len;
344 	ulint	data_len;
345 	byte*	log_block;
346 
347 	ut_ad(log_mutex_own());
348 part_loop:
349 	/* Calculate a part length */
350 
351 	data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
352 
353 	if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
354 
355 		/* The string fits within the current log block */
356 
357 		len = str_len;
358 	} else {
359 		data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
360 
361 		len = OS_FILE_LOG_BLOCK_SIZE
362 			- (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE)
363 			- LOG_BLOCK_TRL_SIZE;
364 	}
365 
366 	memcpy(log_sys.buf + log_sys.buf_free, str, len);
367 
368 	str_len -= len;
369 	str = str + len;
370 
371 	log_block = static_cast<byte*>(
372 		ut_align_down(log_sys.buf + log_sys.buf_free,
373 			      OS_FILE_LOG_BLOCK_SIZE));
374 
375 	log_block_set_data_len(log_block, data_len);
376 
377 	if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
378 		/* This block became full */
379 		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
380 		log_block_set_checkpoint_no(log_block,
381 					    log_sys.next_checkpoint_no);
382 		len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
383 
384 		log_sys.lsn += len;
385 
386 		/* Initialize the next block header */
387 		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE,
388 			       log_sys.lsn);
389 	} else {
390 		log_sys.lsn += len;
391 	}
392 
393 	log_sys.buf_free += ulong(len);
394 
395 	ut_ad(log_sys.buf_free <= srv_log_buffer_size);
396 
397 	if (str_len > 0) {
398 		goto part_loop;
399 	}
400 
401 	srv_stats.log_write_requests.inc();
402 }
403 
404 /************************************************************//**
405 Closes the log.
406 @return lsn */
407 lsn_t
log_close(void)408 log_close(void)
409 /*===========*/
410 {
411 	byte*		log_block;
412 	ulint		first_rec_group;
413 	lsn_t		oldest_lsn;
414 	lsn_t		lsn;
415 	lsn_t		checkpoint_age;
416 
417 	ut_ad(log_mutex_own());
418 
419 	lsn = log_sys.lsn;
420 
421 	log_block = static_cast<byte*>(
422 		ut_align_down(log_sys.buf + log_sys.buf_free,
423 			      OS_FILE_LOG_BLOCK_SIZE));
424 
425 	first_rec_group = log_block_get_first_rec_group(log_block);
426 
427 	if (first_rec_group == 0) {
428 		/* We initialized a new log block which was not written
429 		full by the current mtr: the next mtr log record group
430 		will start within this block at the offset data_len */
431 
432 		log_block_set_first_rec_group(
433 			log_block, log_block_get_data_len(log_block));
434 	}
435 
436 	if (log_sys.buf_free > log_sys.max_buf_free) {
437 		log_sys.check_flush_or_checkpoint = true;
438 	}
439 
440 	checkpoint_age = lsn - log_sys.last_checkpoint_lsn;
441 
442 	if (checkpoint_age >= log_sys.log_group_capacity) {
443 		DBUG_EXECUTE_IF(
444 			"print_all_chkp_warnings",
445 			log_has_printed_chkp_warning = false;);
446 
447 		if (!log_has_printed_chkp_warning
448 		    || difftime(time(NULL), log_last_warning_time) > 15) {
449 
450 			log_has_printed_chkp_warning = true;
451 			log_last_warning_time = time(NULL);
452 
453 			ib::error() << "The age of the last checkpoint is "
454 				<< checkpoint_age << ", which exceeds the log"
455 				" group capacity "
456 				<< log_sys.log_group_capacity
457 				<< ".";
458 		}
459 	}
460 
461 	if (checkpoint_age <= log_sys.max_modified_age_sync) {
462 		goto function_exit;
463 	}
464 
465 	oldest_lsn = buf_pool_get_oldest_modification();
466 
467 	if (!oldest_lsn
468 	    || lsn - oldest_lsn > log_sys.max_modified_age_sync
469 	    || checkpoint_age > log_sys.max_checkpoint_age_async) {
470 		log_sys.check_flush_or_checkpoint = true;
471 	}
472 function_exit:
473 
474 	return(lsn);
475 }
476 
477 /** Calculate the recommended highest values for lsn - last_checkpoint_lsn
478 and lsn - buf_get_oldest_modification().
479 @param[in]	file_size	requested innodb_log_file_size
480 @retval true on success
481 @retval false if the smallest log group is too small to
482 accommodate the number of OS threads in the database server */
483 bool
log_set_capacity(ulonglong file_size)484 log_set_capacity(ulonglong file_size)
485 {
486 	lsn_t		margin;
487 	ulint		free;
488 
489 	lsn_t smallest_capacity = (file_size - LOG_FILE_HDR_SIZE)
490 		* srv_n_log_files;
491 	/* Add extra safety */
492 	smallest_capacity -= smallest_capacity / 10;
493 
494 	/* For each OS thread we must reserve so much free space in the
495 	smallest log group that it can accommodate the log entries produced
496 	by single query steps: running out of free log space is a serious
497 	system error which requires rebooting the database. */
498 
499 	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
500 		+ LOG_CHECKPOINT_EXTRA_FREE;
501 	if (free >= smallest_capacity / 2) {
502 		ib::error() << "Cannot continue operation. ib_logfiles are too"
503 			" small for innodb_thread_concurrency="
504 			<< srv_thread_concurrency << ". The combined size of"
505 			" ib_logfiles should be bigger than"
506 			" 200 kB * innodb_thread_concurrency. "
507 			<< INNODB_PARAMETERS_MSG;
508 		return(false);
509 	}
510 
511 	margin = smallest_capacity - free;
512 	margin = margin - margin / 10;	/* Add still some extra safety */
513 
514 	log_mutex_enter();
515 
516 	log_sys.log_group_capacity = smallest_capacity;
517 
518 	log_sys.max_modified_age_async = margin
519 		- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
520 	log_sys.max_modified_age_sync = margin
521 		- margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
522 
523 	log_sys.max_checkpoint_age_async = margin - margin
524 		/ LOG_POOL_CHECKPOINT_RATIO_ASYNC;
525 	log_sys.max_checkpoint_age = margin;
526 
527 	log_mutex_exit();
528 
529 	return(true);
530 }
531 
/** Initialize the redo log subsystem: mutexes, the two log buffers,
events, counters, and (optionally) the log scrubbing thread. */
void log_t::create()
{
  ut_ad(this == &log_sys);
  ut_ad(!is_initialised());
  m_initialised= true;

  mutex_create(LATCH_ID_LOG_SYS, &mutex);
  mutex_create(LATCH_ID_LOG_WRITE, &write_mutex);
  mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_flush_order_mutex);

  /* Start the lsn from one log block from zero: this way every
  log record has a non-zero start lsn, a fact which we will use */

  lsn= LOG_START_LSN;

  ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE);
  ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift);

  /* Two buffers: one being appended to, one being written out;
  log_buffer_switch() swaps them. */
  buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
  TRASH_ALLOC(buf, srv_log_buffer_size);
  flush_buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
  TRASH_ALLOC(flush_buf, srv_log_buffer_size);

  max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO -
    LOG_BUF_FLUSH_MARGIN;
  check_flush_or_checkpoint= true;

  n_log_ios_old= n_log_ios;
  last_printout_time= time(NULL);

  buf_next_to_write= 0;
  write_lsn= lsn;
  flushed_to_disk_lsn= 0;
  n_pending_flushes= 0;
  /* The flush event starts in the "set" state: no flush is pending. */
  flush_event = os_event_create("log_flush_event");
  os_event_set(flush_event);
  n_log_ios= 0;
  n_log_ios_old= 0;
  log_group_capacity= 0;
  max_modified_age_async= 0;
  max_modified_age_sync= 0;
  max_checkpoint_age_async= 0;
  max_checkpoint_age= 0;
  next_checkpoint_no= 0;
  next_checkpoint_lsn= 0;
  append_on_checkpoint= NULL;
  n_pending_checkpoint_writes= 0;

  last_checkpoint_lsn= lsn;
  rw_lock_create(checkpoint_lock_key, &checkpoint_lock, SYNC_NO_ORDER_CHECK);

  /* Initialize the first log block and position the append offset
  right after its header. */
  log_block_init(buf, lsn);
  log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE);

  buf_free= LOG_BLOCK_HDR_SIZE;
  lsn= LOG_START_LSN + LOG_BLOCK_HDR_SIZE;

  MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, lsn - last_checkpoint_lsn);

  log_scrub_thread_active= !srv_read_only_mode && srv_scrub_log;
  if (log_scrub_thread_active) {
    log_scrub_event= os_event_create("log_scrub_event");
    os_thread_create(log_scrub_thread, NULL, NULL);
  }
}
598 
599 /** Initialize the redo log.
600 @param[in]	n_files		number of files */
create(ulint n_files)601 void log_t::files::create(ulint n_files)
602 {
603   ut_ad(n_files <= SRV_N_LOG_FILES_MAX);
604   ut_ad(this == &log_sys.log);
605   ut_ad(log_sys.is_initialised());
606 
607   this->n_files= n_files;
608   format= srv_encrypt_log
609     ? LOG_HEADER_FORMAT_CURRENT | LOG_HEADER_FORMAT_ENCRYPTED
610     : LOG_HEADER_FORMAT_CURRENT;
611   subformat= 2;
612   file_size= srv_log_file_size;
613   lsn= LOG_START_LSN;
614   lsn_offset= LOG_FILE_HDR_SIZE;
615 }
616 
617 /******************************************************//**
618 Writes a log file header to a log file space. */
619 static
620 void
log_file_header_flush(ulint nth_file,lsn_t start_lsn)621 log_file_header_flush(
622 	ulint		nth_file,	/*!< in: header to the nth file in the
623 					log file space */
624 	lsn_t		start_lsn)	/*!< in: log file data starts at this
625 					lsn */
626 {
627 	lsn_t	dest_offset;
628 
629 	ut_ad(log_write_mutex_own());
630 	ut_ad(!recv_no_log_write);
631 	ut_a(nth_file < log_sys.log.n_files);
632 	ut_ad((log_sys.log.format & ~LOG_HEADER_FORMAT_ENCRYPTED)
633 	      == LOG_HEADER_FORMAT_CURRENT);
634 
635 	// man 2 open suggests this buffer to be aligned by 512 for O_DIRECT
636 	MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE)
637 	byte buf[OS_FILE_LOG_BLOCK_SIZE] = {0};
638 
639 	mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format);
640 	mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat);
641 	mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
642 	strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
643 	       LOG_HEADER_CREATOR_CURRENT);
644 	ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
645 	      >= sizeof LOG_HEADER_CREATOR_CURRENT);
646 	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
647 
648 	dest_offset = nth_file * log_sys.log.file_size;
649 
650 	DBUG_PRINT("ib_log", ("write " LSN_PF
651 			      " file " ULINTPF " header",
652 			      start_lsn, nth_file));
653 
654 	log_sys.n_log_ios++;
655 
656 	srv_stats.os_log_pending_writes.inc();
657 
658 	const ulint	page_no = ulint(dest_offset >> srv_page_size_shift);
659 
660 	fil_io(IORequestLogWrite, true,
661 	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
662 	       univ_page_size,
663 	       ulint(dest_offset & (srv_page_size - 1)),
664 	       OS_FILE_LOG_BLOCK_SIZE, buf, NULL);
665 
666 	srv_stats.os_log_pending_writes.dec();
667 }
668 
669 /******************************************************//**
670 Stores a 4-byte checksum to the trailer checksum field of a log block
671 before writing it to a log file. This checksum is used in recovery to
672 check the consistency of a log block. */
673 static
674 void
log_block_store_checksum(byte * block)675 log_block_store_checksum(
676 /*=====================*/
677 	byte*	block)	/*!< in/out: pointer to a log block */
678 {
679 	log_block_set_checksum(block, log_block_calc_checksum_crc32(block));
680 }
681 
682 /******************************************************//**
683 Writes a buffer to a log file. */
684 static
685 void
log_write_buf(byte * buf,ulint len,ulint pad_len,lsn_t start_lsn,ulint new_data_offset)686 log_write_buf(
687 	byte*		buf,		/*!< in: buffer */
688 	ulint		len,		/*!< in: buffer len; must be divisible
689 					by OS_FILE_LOG_BLOCK_SIZE */
690 #ifdef UNIV_DEBUG
691 	ulint		pad_len,	/*!< in: pad len in the buffer len */
692 #endif /* UNIV_DEBUG */
693 	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
694 					be divisible by
695 					OS_FILE_LOG_BLOCK_SIZE */
696 	ulint		new_data_offset)/*!< in: start offset of new data in
697 					buf: this parameter is used to decide
698 					if we have to write a new log file
699 					header */
700 {
701 	ulint		write_len;
702 	bool		write_header	= new_data_offset == 0;
703 	lsn_t		next_offset;
704 	ulint		i;
705 
706 	ut_ad(log_write_mutex_own());
707 	ut_ad(!recv_no_log_write);
708 	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
709 	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
710 
711 loop:
712 	if (len == 0) {
713 
714 		return;
715 	}
716 
717 	next_offset = log_sys.log.calc_lsn_offset(start_lsn);
718 
719 	if (write_header
720 	    && next_offset % log_sys.log.file_size == LOG_FILE_HDR_SIZE) {
721 		/* We start to write a new log file instance in the group */
722 
723 		ut_a(next_offset / log_sys.log.file_size <= ULINT_MAX);
724 
725 		log_file_header_flush(
726 			ulint(next_offset / log_sys.log.file_size), start_lsn);
727 		srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);
728 
729 		srv_stats.log_writes.inc();
730 	}
731 
732 	if ((next_offset % log_sys.log.file_size) + len
733 	    > log_sys.log.file_size) {
734 		/* if the above condition holds, then the below expression
735 		is < len which is ulint, so the typecast is ok */
736 		write_len = ulint(log_sys.log.file_size
737 				  - (next_offset % log_sys.log.file_size));
738 	} else {
739 		write_len = len;
740 	}
741 
742 	DBUG_PRINT("ib_log",
743 		   ("write " LSN_PF " to " LSN_PF
744 		    ": len " ULINTPF
745 		    " blocks " ULINTPF ".." ULINTPF,
746 		    start_lsn, next_offset,
747 		    write_len,
748 		    log_block_get_hdr_no(buf),
749 		    log_block_get_hdr_no(
750 			    buf + write_len
751 			    - OS_FILE_LOG_BLOCK_SIZE)));
752 
753 	ut_ad(pad_len >= len
754 	      || log_block_get_hdr_no(buf)
755 		 == log_block_convert_lsn_to_no(start_lsn));
756 
757 	/* Calculate the checksums for each log block and write them to
758 	the trailer fields of the log blocks */
759 
760 	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
761 #ifdef UNIV_DEBUG
762 		ulint hdr_no_2 = log_block_get_hdr_no(buf) + i;
763 		DBUG_EXECUTE_IF("innodb_small_log_block_no_limit",
764 				hdr_no_2 = ((hdr_no_2 - 1) & 0xFUL) + 1;);
765 #endif
766 		ut_ad(pad_len >= len
767 			|| i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
768 			|| log_block_get_hdr_no(buf + i * OS_FILE_LOG_BLOCK_SIZE) == hdr_no_2);
769 		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
770 	}
771 
772 	log_sys.n_log_ios++;
773 
774 	srv_stats.os_log_pending_writes.inc();
775 
776 	ut_a((next_offset >> srv_page_size_shift) <= ULINT_MAX);
777 
778 	const ulint	page_no = ulint(next_offset >> srv_page_size_shift);
779 
780 	fil_io(IORequestLogWrite, true,
781 	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
782 	       univ_page_size,
783 	       ulint(next_offset & (srv_page_size - 1)), write_len, buf, NULL);
784 
785 	srv_stats.os_log_pending_writes.dec();
786 
787 	srv_stats.os_log_written.add(write_len);
788 	srv_stats.log_writes.inc();
789 
790 	if (write_len < len) {
791 		start_lsn += write_len;
792 		len -= write_len;
793 		buf += write_len;
794 
795 		write_header = true;
796 
797 		goto loop;
798 	}
799 }
800 
801 /** Flush the recently written changes to the log file.
802 and invoke log_mutex_enter(). */
log_write_flush_to_disk_low()803 static void log_write_flush_to_disk_low()
804 {
805 	ut_a(log_sys.n_pending_flushes);
806 
807 	bool	do_flush = srv_file_flush_method != SRV_O_DSYNC;
808 
809 	if (do_flush) {
810 		fil_flush(SRV_LOG_SPACE_FIRST_ID);
811 	}
812 
813 	log_mutex_enter();
814 	if (do_flush) {
815 		log_sys.flushed_to_disk_lsn = log_sys.current_flush_lsn;
816 	}
817 
818 	log_sys.n_pending_flushes--;
819 
820 	os_event_set(log_sys.flush_event);
821 }
822 
823 /** Swap log buffers, and copy the content of last block
824 from old buf to the head of the new buf. Thus, buf_free and
825 buf_next_to_write would be changed accordingly */
826 static inline
827 void
log_buffer_switch()828 log_buffer_switch()
829 {
830 	ut_ad(log_mutex_own());
831 	ut_ad(log_write_mutex_own());
832 
833 	ulong		area_end = ut_calc_align(
834 		log_sys.buf_free, ulong(OS_FILE_LOG_BLOCK_SIZE));
835 
836 	/* Copy the last block to new buf */
837 	ut_memcpy(log_sys.flush_buf,
838 		  log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
839 		  OS_FILE_LOG_BLOCK_SIZE);
840 
841 	std::swap(log_sys.buf, log_sys.flush_buf);
842 
843 	log_sys.buf_free %= OS_FILE_LOG_BLOCK_SIZE;
844 	log_sys.buf_next_to_write = log_sys.buf_free;
845 }
846 
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@param[in]	lsn		log sequence number that should be
included in the redo log file write
@param[in]	flush_to_disk	whether the written log should also
be flushed to the file system */
void
log_write_up_to(
	lsn_t	lsn,
	bool	flush_to_disk)
{
#ifdef UNIV_DEBUG
	ulint		loop_count	= 0;
#endif /* UNIV_DEBUG */
	byte*           write_buf;
	lsn_t           write_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_no_ibuf_operations) {
		/* Recovery is running and no operations on the log files are
		allowed yet (the variable name .._no_ibuf_.. is misleading) */

		return;
	}

loop:
	ut_ad(++loop_count < 128);

#if UNIV_WORD_SIZE > 7
	/* We can do a dirty read of LSN. */
	/* NOTE: Currently doesn't do dirty read for
	(flush_to_disk == true) case, because the log_mutex
	contention also works as the arbitrator for write-IO
	(fsync) bandwidth between log files and data files. */
	if (!flush_to_disk && log_sys.write_lsn >= lsn) {
		return;
	}
#endif

	log_write_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* The LSN up to which the requested kind of durability has
	already been achieved */
	lsn_t	limit_lsn = flush_to_disk
		? log_sys.flushed_to_disk_lsn
		: log_sys.write_lsn;

	if (limit_lsn >= lsn) {
		log_write_mutex_exit();
		return;
	}

	/* If it is a write call we should just go ahead and do it
	as we checked that write_lsn is not where we'd like it to
	be. If we have to flush as well then we check if there is a
	pending flush and based on that we wait for it to finish
	before proceeding further. */
	if (flush_to_disk
	    && (log_sys.n_pending_flushes > 0
		|| !os_event_is_set(log_sys.flush_event))) {
		/* Figure out if the current flush will do the job
		for us. */
		bool work_done = log_sys.current_flush_lsn >= lsn;

		log_write_mutex_exit();

		os_event_wait(log_sys.flush_event);

		if (work_done) {
			return;
		} else {
			goto loop;
		}
	}

	log_mutex_enter();
	if (!flush_to_disk
	    && log_sys.buf_free == log_sys.buf_next_to_write) {
		/* Nothing to write and no flush to disk requested */
		log_mutex_exit_all();
		return;
	}

	ulint		start_offset;
	ulint		end_offset;
	ulint		area_start;
	ulint		area_end;
	ulong		write_ahead_size = srv_log_write_ahead_size;
	ulint		pad_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
			      log_sys.write_lsn,
			      log_sys.lsn));
	if (flush_to_disk) {
		/* Announce the flush before releasing the mutexes, so
		that concurrent callers can piggyback on it above. */
		log_sys.n_pending_flushes++;
		log_sys.current_flush_lsn = log_sys.lsn;
		os_event_reset(log_sys.flush_event);

		if (log_sys.buf_free == log_sys.buf_next_to_write) {
			/* Nothing to write, flush only */
			log_mutex_exit_all();
			log_write_flush_to_disk_low();
			log_mutex_exit();
			return;
		}
	}

	start_offset = log_sys.buf_next_to_write;
	end_offset = log_sys.buf_free;

	/* Round the write region out to whole log blocks */
	area_start = ut_2pow_round(start_offset,
				   ulint(OS_FILE_LOG_BLOCK_SIZE));
	area_end = ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));

	ut_ad(area_end - area_start > 0);

	log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
	log_block_set_checkpoint_no(
		log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		log_sys.next_checkpoint_no);

	/* Snapshot the buffer and LSN before switching buffers, so the
	write can proceed without the log mutex */
	write_lsn = log_sys.lsn;
	write_buf = log_sys.buf;

	log_buffer_switch();

	log_sys.log.set_fields(log_sys.write_lsn);

	log_mutex_exit();
	/* Erase the end of the last log block. */
	memset(write_buf + end_offset, 0,
	       ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));

	/* Calculate pad_size if needed. */
	pad_size = 0;
	if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
		ulint	end_offset_in_unit;
		lsn_t	end_offset = log_sys.log.calc_lsn_offset(
			ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE));
		end_offset_in_unit = (ulint) (end_offset % write_ahead_size);

		if (end_offset_in_unit > 0
		    && (area_end - area_start) > end_offset_in_unit) {
			/* The first block in the unit was initialized
			after the last writing.
			Needs to be written padded data once. */
			pad_size = std::min<ulint>(
				ulint(write_ahead_size) - end_offset_in_unit,
				srv_log_buffer_size - area_end);
			::memset(write_buf + area_end, 0, pad_size);
		}
	}

	if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) {
		/* Keep systemd (or similar) from timing out during a
		long shutdown write */
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
					       "InnoDB log write: "
					       LSN_PF "," LSN_PF,
					       log_sys.write_lsn, lsn);
	}

	if (log_sys.is_encrypted()) {
		log_crypt(write_buf + area_start, log_sys.write_lsn,
			  area_end - area_start);
	}

	/* Do the write to the log files */
	log_write_buf(
		write_buf + area_start, area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
		pad_size,
#endif /* UNIV_DEBUG */
		ut_uint64_align_down(log_sys.write_lsn,
				     OS_FILE_LOG_BLOCK_SIZE),
		start_offset - area_start);
	srv_stats.log_padded.add(pad_size);
	log_sys.write_lsn = write_lsn;


	if (srv_file_flush_method == SRV_O_DSYNC) {
		/* O_SYNC means the OS did not buffer the log file at all:
		so we have also flushed to disk what we have written */
		log_sys.flushed_to_disk_lsn = log_sys.write_lsn;
	}

	log_write_mutex_exit();

	if (flush_to_disk) {
		log_write_flush_to_disk_low();
		ib_uint64_t flush_lsn = log_sys.flushed_to_disk_lsn;
		log_mutex_exit();

		/* Notify the server layer (binlog group commit) */
		innobase_mysql_log_notify(flush_lsn);
	}
}
1042 
1043 /** write to the log file up to the last log entry.
1044 @param[in]	sync	whether we want the written log
1045 also to be flushed to disk. */
log_buffer_flush_to_disk(bool sync)1046 void log_buffer_flush_to_disk(bool sync)
1047 {
1048   ut_ad(!srv_read_only_mode);
1049   log_write_up_to(log_get_lsn(), sync);
1050 }
1051 
1052 
/** Durably write the log and release log_sys.mutex.
Both the write mutex and the general log mutex must be held on entry;
both are released before returning. The log buffer must contain
unwritten log (buf_free != buf_next_to_write). */
ATTRIBUTE_COLD void log_write_and_flush()
{
  ut_ad(!srv_read_only_mode);
  ut_ad(!recv_no_log_write);
  ut_ad(!recv_recovery_is_on());

  /* The following code is adapted from log_write_up_to(). */
  DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
                        log_sys.write_lsn, log_sys.lsn));
  /* Announce a pending flush up to the current LSN, so that other
  threads can wait on flush_event for its completion. */
  log_sys.n_pending_flushes++;
  log_sys.current_flush_lsn= log_sys.lsn;
  os_event_reset(log_sys.flush_event);
  ut_ad(log_sys.buf_free != log_sys.buf_next_to_write);
  ulint start_offset= log_sys.buf_next_to_write;
  ulint end_offset= log_sys.buf_free;
  /* Round the region down/up to whole OS_FILE_LOG_BLOCK_SIZE blocks;
  the log is always written in full blocks. */
  ulint area_start= ut_2pow_round(start_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
  ulint area_end= ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
  ulong write_ahead_size= srv_log_write_ahead_size;

  log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
  /* Stamp the last block with the current checkpoint number. */
  log_block_set_checkpoint_no(log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
                              log_sys.next_checkpoint_no);
  lsn_t write_lsn= log_sys.lsn;
  byte *write_buf= log_sys.buf;

  ut_ad(area_end - area_start > 0);

  /* NOTE(review): log_buffer_switch() presumably hands the active half
  of the log buffer over so new records go to the other half — confirm
  against its definition. */
  log_buffer_switch();

  log_sys.log.set_fields(log_sys.write_lsn);

  /* Erase the end of the last log block. */
  memset(write_buf + end_offset, 0,
         ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));
  /* Calculate pad_size if needed: with write-ahead units larger than a
  log block, pad the write out to the unit boundary to avoid a later
  read-modify-write of the first block in the unit. */
  ulint pad_size= 0;
  if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE)
  {
    lsn_t end_offset=
      log_sys.log.calc_lsn_offset(ut_uint64_align_up(write_lsn,
                                                     OS_FILE_LOG_BLOCK_SIZE));
    ulint end_offset_in_unit= (ulint) (end_offset % write_ahead_size);

    if (end_offset_in_unit && (area_end - area_start) > end_offset_in_unit)
    {
      /* The first block in the unit was initialized after the last
      writing. Needs to be written padded data once. */
      pad_size= std::min<ulint>(ulint(write_ahead_size) - end_offset_in_unit,
                                srv_log_buffer_size - area_end);
      memset(write_buf + area_end, 0, pad_size);
    }
  }

  if (log_sys.is_encrypted())
    log_crypt(write_buf + area_start, log_sys.write_lsn,
              area_end - area_start);

  /* Do the write to the log files */
  log_write_buf(write_buf + area_start, area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
                pad_size,
#endif /* UNIV_DEBUG */
                ut_uint64_align_down(log_sys.write_lsn,
                                     OS_FILE_LOG_BLOCK_SIZE),
                start_offset - area_start);
  srv_stats.log_padded.add(pad_size);
  log_sys.write_lsn= write_lsn;

  log_write_mutex_exit();

  /* Code adapted from log_write_flush_to_disk_low() */

  ut_a(log_sys.n_pending_flushes);

  /* With SRV_O_DSYNC the write itself was already durable;
  otherwise flush the redo log space explicitly. */
  if (srv_file_flush_method != SRV_O_DSYNC)
    fil_flush(SRV_LOG_SPACE_FIRST_ID);

  log_sys.flushed_to_disk_lsn= log_sys.current_flush_lsn;

  log_sys.n_pending_flushes--;

  /* Wake up any threads waiting for the flush to complete. */
  os_event_set(log_sys.flush_event);

  const lsn_t flush_lsn= log_sys.flushed_to_disk_lsn;
  log_mutex_exit();

  innobase_mysql_log_notify(flush_lsn);
}
1142 
1143 /****************************************************************//**
1144 This functions writes the log buffer to the log file and if 'flush'
1145 is set it forces a flush of the log file as well. This is meant to be
1146 called from background master thread only as it does not wait for
1147 the write (+ possible flush) to finish. */
1148 void
log_buffer_sync_in_background(bool flush)1149 log_buffer_sync_in_background(
1150 /*==========================*/
1151 	bool	flush)	/*!< in: flush the logs to disk */
1152 {
1153 	lsn_t	lsn;
1154 
1155 	log_mutex_enter();
1156 
1157 	lsn = log_sys.lsn;
1158 
1159 	if (flush
1160 	    && log_sys.n_pending_flushes > 0
1161 	    && log_sys.current_flush_lsn >= lsn) {
1162 		/* The write + flush will write enough */
1163 		log_mutex_exit();
1164 		return;
1165 	}
1166 
1167 	log_mutex_exit();
1168 
1169 	log_write_up_to(lsn, flush);
1170 }
1171 
1172 /********************************************************************
1173 
1174 Tries to establish a big enough margin of free space in the log buffer, such
1175 that a new log entry can be catenated without an immediate need for a flush. */
1176 static
1177 void
log_flush_margin(void)1178 log_flush_margin(void)
1179 /*==================*/
1180 {
1181 	lsn_t	lsn	= 0;
1182 
1183 	log_mutex_enter();
1184 
1185 	if (log_sys.buf_free > log_sys.max_buf_free) {
1186 		/* We can write during flush */
1187 		lsn = log_sys.lsn;
1188 	}
1189 
1190 	log_mutex_exit();
1191 
1192 	if (lsn) {
1193 		log_write_up_to(lsn, false);
1194 	}
1195 }
1196 
1197 /** Advances the smallest lsn for which there are unflushed dirty blocks in the
1198 buffer pool.
1199 NOTE: this function may only be called if the calling thread owns no
1200 synchronization objects!
1201 @param[in]	new_oldest	try to advance oldest_modified_lsn at least to
1202 this lsn
1203 @return false if there was a flush batch of the same type running,
1204 which means that we could not start this flush batch */
log_preflush_pool_modified_pages(lsn_t new_oldest)1205 static bool log_preflush_pool_modified_pages(lsn_t new_oldest)
1206 {
1207 	bool	success;
1208 
1209 	if (recv_recovery_is_on()) {
1210 		/* If the recovery is running, we must first apply all
1211 		log records to their respective file pages to get the
1212 		right modify lsn values to these pages: otherwise, there
1213 		might be pages on disk which are not yet recovered to the
1214 		current lsn, and even after calling this function, we could
1215 		not know how up-to-date the disk version of the database is,
1216 		and we could not make a new checkpoint on the basis of the
1217 		info on the buffer pool only. */
1218 		recv_apply_hashed_log_recs(true);
1219 	}
1220 
1221 	if (new_oldest == LSN_MAX
1222 	    || !buf_page_cleaner_is_active
1223 	    || srv_is_being_started) {
1224 
1225 		ulint	n_pages;
1226 
1227 		success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages);
1228 
1229 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
1230 
1231 		if (!success) {
1232 			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
1233 		}
1234 
1235 		MONITOR_INC_VALUE_CUMULATIVE(
1236 			MONITOR_FLUSH_SYNC_TOTAL_PAGE,
1237 			MONITOR_FLUSH_SYNC_COUNT,
1238 			MONITOR_FLUSH_SYNC_PAGES,
1239 			n_pages);
1240 	} else {
1241 		/* better to wait for flushed by page cleaner */
1242 
1243 		if (srv_flush_sync) {
1244 			/* wake page cleaner for IO burst */
1245 			buf_flush_request_force(new_oldest);
1246 		}
1247 
1248 		buf_flush_wait_flushed(new_oldest);
1249 
1250 		success = true;
1251 	}
1252 
1253 	return(success);
1254 }
1255 
/******************************************************//**
Completes a checkpoint: advances the checkpoint bookkeeping in log_sys
and releases the checkpoint rw-lock that log_group_checkpoint()
acquired when the write was initiated. The caller must hold the log
mutex, and no checkpoint write may be pending. */
static
void
log_complete_checkpoint(void)
/*=========================*/
{
	ut_ad(log_mutex_own());
	ut_ad(log_sys.n_pending_checkpoint_writes == 0);

	log_sys.next_checkpoint_no++;

	/* The checkpoint that was being written is now the latest one. */
	log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn;
	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
		    log_sys.lsn - log_sys.last_checkpoint_lsn);

	DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
			      ", flushed to " LSN_PF,
			      log_sys.last_checkpoint_lsn,
			      log_sys.flushed_to_disk_lsn));

	/* Release the lock taken in log_group_checkpoint(); this lets
	threads waiting in log_write_checkpoint_info() proceed. */
	rw_lock_x_unlock_gen(&(log_sys.checkpoint_lock), LOG_CHECKPOINT);
}
1279 
1280 /** Complete an asynchronous checkpoint write. */
complete_checkpoint()1281 void log_t::complete_checkpoint()
1282 {
1283 	ut_ad(this == &log_sys);
1284 	MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
1285 
1286 	log_mutex_enter();
1287 
1288 	ut_ad(n_pending_checkpoint_writes > 0);
1289 
1290 	if (!--n_pending_checkpoint_writes) {
1291 		log_complete_checkpoint();
1292 	}
1293 
1294 	log_mutex_exit();
1295 }
1296 
/** Write checkpoint info to the log header.
Serializes the checkpoint fields into log_sys.checkpoint_buf and
issues an asynchronous write to one of the two alternating checkpoint
slots in the first log file. The caller must hold the log mutex.
@param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
static
void
log_group_checkpoint(lsn_t end_lsn)
{
	lsn_t		lsn_offset;

	ut_ad(!srv_read_only_mode);
	ut_ad(log_mutex_own());
	ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn);
	ut_ad(end_lsn <= log_sys.lsn);
	ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys.lsn
	      || srv_shutdown_state > SRV_SHUTDOWN_INITIATED);

	DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
			      " written",
			      log_sys.next_checkpoint_no,
			      log_sys.next_checkpoint_lsn));

	/* Build the checkpoint block in a zero-filled buffer. */
	byte* buf = log_sys.checkpoint_buf;
	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);

	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no);
	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn);

	if (log_sys.is_encrypted()) {
		/* Store the encryption metadata in the checkpoint block. */
		log_crypt_write_checkpoint_buf(buf);
	}

	/* Record the byte offset of the checkpoint LSN within the
	log files, so that recovery can seek to it directly. */
	lsn_offset = log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn);
	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE,
			srv_log_buffer_size);
	mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn);

	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));

	MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);

	log_sys.n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	ut_ad(LOG_CHECKPOINT_1 < srv_page_size);
	ut_ad(LOG_CHECKPOINT_2 < srv_page_size);

	/* The first pending write x-locks the checkpoint lock; it is
	released in log_complete_checkpoint() when all writes finish. */
	if (log_sys.n_pending_checkpoint_writes++ == 0) {
		rw_lock_x_lock_gen(&log_sys.checkpoint_lock,
				   LOG_CHECKPOINT);
	}

	/* Note: We alternate the physical place of the checkpoint info.
	See the (next_checkpoint_no & 1) below. */

	fil_io(IORequestLogWrite, false,
	       page_id_t(SRV_LOG_SPACE_FIRST_ID, 0),
	       univ_page_size,
	       (log_sys.next_checkpoint_no & 1)
	       ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
	       OS_FILE_LOG_BLOCK_SIZE,
	       buf, reinterpret_cast<void*>(1) /* checkpoint write */);
}
1360 
1361 /** Read a log group header page to log_sys.checkpoint_buf.
1362 @param[in]	header	0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT2 */
log_header_read(ulint header)1363 void log_header_read(ulint header)
1364 {
1365 	ut_ad(log_mutex_own());
1366 
1367 	log_sys.n_log_ios++;
1368 
1369 	MONITOR_INC(MONITOR_LOG_IO);
1370 
1371 	fil_io(IORequestLogRead, true,
1372 	       page_id_t(SRV_LOG_SPACE_FIRST_ID,
1373 			 header >> srv_page_size_shift),
1374 	       univ_page_size, header & (srv_page_size - 1),
1375 	       OS_FILE_LOG_BLOCK_SIZE, log_sys.checkpoint_buf, NULL);
1376 }
1377 
1378 /** Write checkpoint info to the log header and invoke log_mutex_exit().
1379 @param[in]	sync	whether to wait for the write to complete
1380 @param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
1381 void
log_write_checkpoint_info(bool sync,lsn_t end_lsn)1382 log_write_checkpoint_info(bool sync, lsn_t end_lsn)
1383 {
1384 	ut_ad(log_mutex_own());
1385 	ut_ad(!srv_read_only_mode);
1386 
1387 	log_group_checkpoint(end_lsn);
1388 
1389 	log_mutex_exit();
1390 
1391 	MONITOR_INC(MONITOR_NUM_CHECKPOINT);
1392 
1393 	if (sync) {
1394 		/* Wait for the checkpoint write to complete */
1395 		rw_lock_s_lock(&log_sys.checkpoint_lock);
1396 		rw_lock_s_unlock(&log_sys.checkpoint_lock);
1397 
1398 		DBUG_EXECUTE_IF(
1399 			"crash_after_checkpoint",
1400 			DBUG_SUICIDE(););
1401 	}
1402 }
1403 
1404 /** Set extra data to be written to the redo log during checkpoint.
1405 @param[in]	buf	data to be appended on checkpoint, or NULL
1406 @return pointer to previous data to be appended on checkpoint */
1407 mtr_buf_t*
log_append_on_checkpoint(mtr_buf_t * buf)1408 log_append_on_checkpoint(
1409 	mtr_buf_t*	buf)
1410 {
1411 	log_mutex_enter();
1412 	mtr_buf_t*	old = log_sys.append_on_checkpoint;
1413 	log_sys.append_on_checkpoint = buf;
1414 	log_mutex_exit();
1415 	return(old);
1416 }
1417 
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log files. Use log_make_checkpoint() to flush also the pool.
@param[in]	sync		whether to wait for the write to complete
@return true if success, false if a checkpoint write was already running */
bool log_checkpoint(bool sync)
{
	lsn_t	oldest_lsn;

	ut_ad(!srv_read_only_mode);

	DBUG_EXECUTE_IF("no_checkpoint",
			/* We sleep for a long enough time, forcing
			the checkpoint doesn't happen any more. */
			os_thread_sleep(360000000););

	if (recv_recovery_is_on()) {
		/* A checkpoint can only reflect fully recovered pages;
		apply any buffered redo log first. */
		recv_apply_hashed_log_recs(true);
	}

	/* Make sure that previously written dirty pages are durable
	before the checkpoint claims them as flushed. */
	switch (srv_file_flush_method) {
	case SRV_NOSYNC:
		break;
	case SRV_O_DSYNC:
	case SRV_FSYNC:
	case SRV_LITTLESYNC:
	case SRV_O_DIRECT:
	case SRV_O_DIRECT_NO_FSYNC:
#ifdef _WIN32
	case SRV_ALL_O_DIRECT_FSYNC:
#endif
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
	}

	log_mutex_enter();

	ut_ad(!recv_no_log_write);
	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Because log also contains headers and dummy log records,
	log_buf_pool_get_oldest_modification() will return log_sys.lsn
	if the buffer pool contains no dirty buffers.
	We must make sure that the log is flushed up to that lsn.
	If there are dirty buffers in the buffer pool, then our
	write-ahead-logging algorithm ensures that the log has been
	flushed up to oldest_lsn. */

	ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
	if (oldest_lsn
	    > log_sys.last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
		/* Some log has been written since the previous checkpoint. */
	} else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
		/* MariaDB 10.3 startup expects the redo log file to be
		logically empty (not even containing a MLOG_CHECKPOINT record)
		after a clean shutdown. Perform an extra checkpoint at
		shutdown. */
	} else {
		/* Do nothing, because nothing was logged (other than
		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
		log_mutex_exit();
		return(true);
	}
	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
	case some log records between the checkpoint and log_sys.lsn
	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
	except on clean shutdown, where the log will be empty after
	the checkpoint.
	It is important that we write out the redo log before any
	further dirty pages are flushed to the tablespace files.  At
	this point, because log_mutex_own(), mtr_commit() in other
	threads will be blocked, and no pages can be added to the
	flush lists. */
	lsn_t		flush_lsn	= oldest_lsn;
	const lsn_t	end_lsn		= log_sys.lsn;
	const bool	do_write
		= srv_shutdown_state <= SRV_SHUTDOWN_INITIATED
		|| flush_lsn != end_lsn;

	if (fil_names_clear(flush_lsn, do_write)) {
		ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT);
		flush_lsn = log_sys.lsn;
	}

	/* Release the mutex while writing and flushing the log;
	other threads may generate and write log meanwhile. */
	log_mutex_exit();

	log_write_up_to(flush_lsn, true);

	log_mutex_enter();

	ut_ad(log_sys.flushed_to_disk_lsn >= flush_lsn);
	ut_ad(flush_lsn >= oldest_lsn);

	/* Another thread may have advanced the checkpoint while the
	mutex was released above. */
	if (log_sys.last_checkpoint_lsn >= oldest_lsn) {
		log_mutex_exit();
		return(true);
	}

	if (log_sys.n_pending_checkpoint_writes > 0) {
		/* A checkpoint write is running */
		log_mutex_exit();

		if (sync) {
			/* Wait for the checkpoint write to complete */
			rw_lock_s_lock(&log_sys.checkpoint_lock);
			rw_lock_s_unlock(&log_sys.checkpoint_lock);
		}

		return(false);
	}

	log_sys.next_checkpoint_lsn = oldest_lsn;
	/* This releases the log mutex. */
	log_write_checkpoint_info(sync, end_lsn);
	ut_ad(!log_mutex_own());

	return(true);
}
1536 
1537 /** Make a checkpoint */
log_make_checkpoint()1538 void log_make_checkpoint()
1539 {
1540 	/* Preflush pages synchronously */
1541 
1542 	while (!log_preflush_pool_modified_pages(LSN_MAX)) {
1543 		/* Flush as much as we can */
1544 	}
1545 
1546 	while (!log_checkpoint(true)) {
1547 		/* Force a checkpoint */
1548 	}
1549 }
1550 
/****************************************************************//**
Tries to establish a big enough margin of free space in the log groups, such
that a new log entry can be catenated without an immediate need for a
checkpoint. NOTE: this function may only be called if the calling thread
owns no synchronization objects! */
static
void
log_checkpoint_margin(void)
/*=======================*/
{
	lsn_t		age;
	lsn_t		checkpoint_age;
	ib_uint64_t	advance;
	lsn_t		oldest_lsn;
	bool		success;
loop:
	advance = 0;

	log_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* The flag is set by mtr_commit() when the log grows close to
	the limits; nothing to do until then. */
	if (!log_sys.check_flush_or_checkpoint) {
		log_mutex_exit();
		return;
	}

	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Age of the oldest unflushed modification in the buffer pool. */
	age = log_sys.lsn - oldest_lsn;

	if (age > log_sys.max_modified_age_sync) {

		/* A flush is urgent: we have to do a synchronous preflush */
		advance = age - log_sys.max_modified_age_sync;
	}

	checkpoint_age = log_sys.lsn - log_sys.last_checkpoint_lsn;

	bool	checkpoint_sync;
	bool	do_checkpoint;

	if (checkpoint_age > log_sys.max_checkpoint_age) {
		/* A checkpoint is urgent: we do it synchronously */
		checkpoint_sync = true;
		do_checkpoint = true;
	} else if (checkpoint_age > log_sys.max_checkpoint_age_async) {
		/* A checkpoint is not urgent: do it asynchronously */
		do_checkpoint = true;
		checkpoint_sync = false;
		log_sys.check_flush_or_checkpoint = false;
	} else {
		do_checkpoint = false;
		checkpoint_sync = false;
		log_sys.check_flush_or_checkpoint = false;
	}

	log_mutex_exit();

	if (advance) {
		lsn_t	new_oldest = oldest_lsn + advance;

		success = log_preflush_pool_modified_pages(new_oldest);

		/* If the flush succeeded, this thread has done its part
		and can proceed. If it did not succeed, there was another
		thread doing a flush at the same time. */
		if (!success) {
			/* Re-arm the flag so that some thread will
			retry, and start over ourselves. */
			log_mutex_enter();
			log_sys.check_flush_or_checkpoint = true;
			log_mutex_exit();
			goto loop;
		}
	}

	if (do_checkpoint) {
		log_checkpoint(checkpoint_sync);

		if (checkpoint_sync) {
			/* After an urgent synchronous checkpoint,
			re-evaluate the margins from scratch. */
			goto loop;
		}
	}
}
1634 
1635 /**
1636 Checks that there is enough free space in the log to start a new query step.
1637 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
1638 function may only be called if the calling thread owns no synchronization
1639 objects! */
1640 void
log_check_margins(void)1641 log_check_margins(void)
1642 {
1643 	bool	check;
1644 
1645 	do {
1646 		log_flush_margin();
1647 		log_checkpoint_margin();
1648 		log_mutex_enter();
1649 		ut_ad(!recv_no_log_write);
1650 		check = log_sys.check_flush_or_checkpoint;
1651 		log_mutex_exit();
1652 	} while (check);
1653 }
1654 
1655 /****************************************************************//**
1656 Makes a checkpoint at the latest lsn and writes it to first page of each
1657 data file in the database, so that we know that the file spaces contain
1658 all modifications up to that lsn. This can only be called at database
1659 shutdown. This function also writes all log in log files to the log archive. */
1660 void
logs_empty_and_mark_files_at_shutdown(void)1661 logs_empty_and_mark_files_at_shutdown(void)
1662 /*=======================================*/
1663 {
1664 	lsn_t			lsn;
1665 	ulint			count = 0;
1666 
1667 	ib::info() << "Starting shutdown...";
1668 
1669 	/* Wait until the master thread and all other operations are idle: our
1670 	algorithm only works if the server is idle at shutdown */
1671 
1672 	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
1673 loop:
1674 	ut_ad(lock_sys.is_initialised() || !srv_was_started);
1675 	ut_ad(log_sys.is_initialised() || !srv_was_started);
1676 	ut_ad(fil_system.is_initialised() || !srv_was_started);
1677 	os_event_set(srv_buf_resize_event);
1678 
1679 	if (!srv_read_only_mode) {
1680 		os_event_set(srv_error_event);
1681 		os_event_set(srv_monitor_event);
1682 		os_event_set(srv_buf_dump_event);
1683 		if (lock_sys.timeout_thread_active) {
1684 			os_event_set(lock_sys.timeout_event);
1685 		}
1686 		if (dict_stats_event) {
1687 			os_event_set(dict_stats_event);
1688 		} else {
1689 			ut_ad(!srv_dict_stats_thread_active);
1690 		}
1691 		if (recv_sys && recv_sys->flush_start) {
1692 			/* This is in case recv_writer_thread was never
1693 			started, or buf_flush_page_cleaner_coordinator
1694 			failed to notice its termination. */
1695 			os_event_set(recv_sys->flush_start);
1696 		}
1697 	}
1698 #define COUNT_INTERVAL 600U
1699 #define CHECK_INTERVAL 100000U
1700 	os_thread_sleep(CHECK_INTERVAL);
1701 
1702 	count++;
1703 
1704 	/* Check that there are no longer transactions, except for
1705 	PREPARED ones. We need this wait even for the 'very fast'
1706 	shutdown, because the InnoDB layer may have committed or
1707 	prepared transactions and we don't want to lose them. */
1708 
1709 	if (ulint total_trx = srv_was_started && !srv_read_only_mode
1710 	    && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
1711 	    ? trx_sys.any_active_transactions() : 0) {
1712 
1713 		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
1714 			service_manager_extend_timeout(
1715 				COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
1716 				"Waiting for %lu active transactions to finish",
1717 				(ulong) total_trx);
1718 			ib::info() << "Waiting for " << total_trx << " active"
1719 				<< " transactions to finish";
1720 
1721 			count = 0;
1722 		}
1723 
1724 		goto loop;
1725 	}
1726 
1727 	/* We need these threads to stop early in shutdown. */
1728 	const char* thread_name;
1729 
1730 	if (srv_error_monitor_active) {
1731 		thread_name = "srv_error_monitor_thread";
1732 	} else if (srv_monitor_active) {
1733 		thread_name = "srv_monitor_thread";
1734 	} else if (srv_buf_resize_thread_active) {
1735 		thread_name = "buf_resize_thread";
1736 		goto wait_suspend_loop;
1737 	} else if (srv_dict_stats_thread_active) {
1738 		thread_name = "dict_stats_thread";
1739 	} else if (lock_sys.timeout_thread_active) {
1740 		thread_name = "lock_wait_timeout_thread";
1741 	} else if (srv_buf_dump_thread_active) {
1742 		thread_name = "buf_dump_thread";
1743 		goto wait_suspend_loop;
1744 	} else if (btr_defragment_thread_active) {
1745 		thread_name = "btr_defragment_thread";
1746 	} else if (srv_fast_shutdown != 2 && trx_rollback_is_active) {
1747 		thread_name = "rollback of recovered transactions";
1748 	} else {
1749 		thread_name = NULL;
1750 	}
1751 
1752 	if (thread_name) {
1753 		ut_ad(!srv_read_only_mode);
1754 wait_suspend_loop:
1755 		service_manager_extend_timeout(
1756 			COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
1757 			"Waiting for %s to exit", thread_name);
1758 		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
1759 			ib::info() << "Waiting for " << thread_name
1760 				   << " to exit";
1761 			count = 0;
1762 		}
1763 		goto loop;
1764 	}
1765 
1766 	/* Check that the background threads are suspended */
1767 
1768 	switch (srv_get_active_thread_type()) {
1769 	case SRV_NONE:
1770 		if (!srv_n_fil_crypt_threads_started) {
1771 			srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
1772 			break;
1773 		}
1774 		os_event_set(fil_crypt_threads_event);
1775 		thread_name = "fil_crypt_thread";
1776 		goto wait_suspend_loop;
1777 	case SRV_PURGE:
1778 	case SRV_WORKER:
1779 		ut_ad(!"purge was not shut down");
1780 		srv_purge_wakeup();
1781 		thread_name = "purge thread";
1782 		goto wait_suspend_loop;
1783 	case SRV_MASTER:
1784 		thread_name = "master thread";
1785 		goto wait_suspend_loop;
1786 	}
1787 
1788 	/* At this point only page_cleaner should be active. We wait
1789 	here to let it complete the flushing of the buffer pools
1790 	before proceeding further. */
1791 
1792 	count = 0;
1793 	service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
1794 		"Waiting for page cleaner");
1795 	while (buf_page_cleaner_is_active) {
1796 		++count;
1797 		os_thread_sleep(CHECK_INTERVAL);
1798 		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
1799 			service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
1800 				"Waiting for page cleaner");
1801 			ib::info() << "Waiting for page_cleaner to "
1802 				"finish flushing of buffer pool";
1803 			/* This is a workaround to avoid the InnoDB hang
1804 			when OS datetime changed backwards */
1805 			os_event_set(buf_flush_event);
1806 			count = 0;
1807 		}
1808 	}
1809 
1810 	if (log_scrub_thread_active) {
1811 		ut_ad(!srv_read_only_mode);
1812 		os_event_set(log_scrub_event);
1813 	}
1814 
1815 	if (log_sys.is_initialised()) {
1816 		log_mutex_enter();
1817 		const ulint	n_write	= log_sys.n_pending_checkpoint_writes;
1818 		const ulint	n_flush	= log_sys.n_pending_flushes;
1819 		log_mutex_exit();
1820 
1821 		if (log_scrub_thread_active || n_write || n_flush) {
1822 			if (srv_print_verbose_log && count > 600) {
1823 				ib::info() << "Pending checkpoint_writes: "
1824 					<< n_write
1825 					<< ". Pending log flush writes: "
1826 					<< n_flush;
1827 				count = 0;
1828 			}
1829 			goto loop;
1830 		}
1831 	}
1832 
1833 	ut_ad(!log_scrub_thread_active);
1834 
1835 	if (!buf_pool_ptr) {
1836 		ut_ad(!srv_was_started);
1837 	} else if (ulint pending_io = buf_pool_check_no_pending_io()) {
1838 		if (srv_print_verbose_log && count > 600) {
1839 			ib::info() << "Waiting for " << pending_io << " buffer"
1840 				" page I/Os to complete";
1841 			count = 0;
1842 		}
1843 
1844 		goto loop;
1845 	}
1846 
1847 	if (srv_fast_shutdown == 2 || !srv_was_started) {
1848 		if (!srv_read_only_mode && srv_was_started) {
1849 			ib::info() << "MySQL has requested a very fast"
1850 				" shutdown without flushing the InnoDB buffer"
1851 				" pool to data files. At the next mysqld"
1852 				" startup InnoDB will do a crash recovery!";
1853 
1854 			/* In this fastest shutdown we do not flush the
1855 			buffer pool:
1856 
1857 			it is essentially a 'crash' of the InnoDB server.
1858 			Make sure that the log is all flushed to disk, so
1859 			that we can recover all committed transactions in
1860 			a crash recovery. We must not write the lsn stamps
1861 			to the data files, since at a startup InnoDB deduces
1862 			from the stamps if the previous shutdown was clean. */
1863 
1864 			log_buffer_flush_to_disk();
1865 		}
1866 
1867 		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
1868 
1869 		if (fil_system.is_initialised()) {
1870 			fil_close_all_files();
1871 		}
1872 		return;
1873 	}
1874 
1875 	if (!srv_read_only_mode) {
1876 		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
1877 			"ensuring dirty buffer pool are written to log");
1878 		log_make_checkpoint();
1879 
1880 		log_mutex_enter();
1881 
1882 		lsn = log_sys.lsn;
1883 
1884 		const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn;
1885 		ut_ad(lsn >= log_sys.last_checkpoint_lsn);
1886 
1887 		log_mutex_exit();
1888 
1889 		if (lsn_changed) {
1890 			goto loop;
1891 		}
1892 
1893 		/* Ensure that all buffered changes are written to the
1894 		redo log before fil_close_all_files(). */
1895 		fil_flush_file_spaces(FIL_TYPE_LOG);
1896 	} else {
1897 		lsn = srv_start_lsn;
1898 	}
1899 
1900 	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
1901 
1902 	/* Make some checks that the server really is quiet */
1903 	ut_a(srv_get_active_thread_type() == SRV_NONE);
1904 
1905 	service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
1906 				       "Free innodb buffer pool");
1907 	buf_all_freed();
1908 
1909 	ut_a(lsn == log_sys.lsn
1910 	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
1911 
1912 	if (lsn < srv_start_lsn) {
1913 		ib::error() << "Shutdown LSN=" << lsn
1914 			<< " is less than start LSN=" << srv_start_lsn;
1915 	}
1916 
1917 	srv_shutdown_lsn = lsn;
1918 
1919 	if (!srv_read_only_mode) {
1920 		dberr_t err = fil_write_flushed_lsn(lsn);
1921 
1922 		if (err != DB_SUCCESS) {
1923 			ib::error() << "Writing flushed lsn " << lsn
1924 				<< " failed; error=" << err;
1925 		}
1926 	}
1927 
1928 	fil_close_all_files();
1929 
1930 	/* Make some checks that the server really is quiet */
1931 	ut_a(srv_get_active_thread_type() == SRV_NONE);
1932 
1933 	ut_a(lsn == log_sys.lsn
1934 	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
1935 }
1936 
1937 /******************************************************//**
1938 Peeks the current lsn.
1939 @return TRUE if success, FALSE if could not get the log system mutex */
1940 ibool
log_peek_lsn(lsn_t * lsn)1941 log_peek_lsn(
1942 /*=========*/
1943 	lsn_t*	lsn)	/*!< out: if returns TRUE, current lsn is here */
1944 {
1945 	if (0 == mutex_enter_nowait(&(log_sys.mutex))) {
1946 		*lsn = log_sys.lsn;
1947 
1948 		log_mutex_exit();
1949 
1950 		return(TRUE);
1951 	}
1952 
1953 	return(FALSE);
1954 }
1955 
1956 /******************************************************//**
1957 Prints info of the log. */
1958 void
log_print(FILE * file)1959 log_print(
1960 /*======*/
1961 	FILE*	file)	/*!< in: file where to print */
1962 {
1963 	double	time_elapsed;
1964 	time_t	current_time;
1965 
1966 	log_mutex_enter();
1967 
1968 	fprintf(file,
1969 		"Log sequence number " LSN_PF "\n"
1970 		"Log flushed up to   " LSN_PF "\n"
1971 		"Pages flushed up to " LSN_PF "\n"
1972 		"Last checkpoint at  " LSN_PF "\n",
1973 		log_sys.lsn,
1974 		log_sys.flushed_to_disk_lsn,
1975 		log_buf_pool_get_oldest_modification(),
1976 		log_sys.last_checkpoint_lsn);
1977 
1978 	current_time = time(NULL);
1979 
1980 	time_elapsed = difftime(current_time,
1981 				log_sys.last_printout_time);
1982 
1983 	if (time_elapsed <= 0) {
1984 		time_elapsed = 1;
1985 	}
1986 
1987 	fprintf(file,
1988 		ULINTPF " pending log flushes, "
1989 		ULINTPF " pending chkp writes\n"
1990 		ULINTPF " log i/o's done, %.2f log i/o's/second\n",
1991 		log_sys.n_pending_flushes,
1992 		log_sys.n_pending_checkpoint_writes,
1993 		log_sys.n_log_ios,
1994 		static_cast<double>(
1995 			log_sys.n_log_ios - log_sys.n_log_ios_old)
1996 		/ time_elapsed);
1997 
1998 	log_sys.n_log_ios_old = log_sys.n_log_ios;
1999 	log_sys.last_printout_time = current_time;
2000 
2001 	log_mutex_exit();
2002 }
2003 
2004 /**********************************************************************//**
2005 Refreshes the statistics used to print per-second averages. */
2006 void
log_refresh_stats(void)2007 log_refresh_stats(void)
2008 /*===================*/
2009 {
2010 	log_sys.n_log_ios_old = log_sys.n_log_ios;
2011 	log_sys.last_printout_time = time(NULL);
2012 }
2013 
/** Shut down the redo log subsystem. Releases every resource that
log_t initialisation acquired; idempotent (a second call is a no-op
because is_initialised() becomes false). Teardown order matters:
buffers and events before latches, recv_sys last. */
void log_t::close()
{
  ut_ad(this == &log_sys);
  /* Nothing to do if the subsystem was never (or already) shut down. */
  if (!is_initialised()) return;
  m_initialised = false;
  log.close();

  /* Release the log buffers allocated with ut_malloc_dontdump(). */
  ut_free_dodump(buf, srv_log_buffer_size);
  buf = NULL;
  ut_free_dodump(flush_buf, srv_log_buffer_size);
  flush_buf = NULL;

  os_event_destroy(flush_event);
  rw_lock_free(&checkpoint_lock);
  mutex_free(&mutex);
  mutex_free(&write_mutex);
  mutex_free(&log_flush_order_mutex);

  /* The scrub event exists only when scrubbing was enabled at startup
  in read-write mode. */
  if (!srv_read_only_mode && srv_scrub_log)
    os_event_destroy(log_scrub_event);

  recv_sys_close();
}
2038 
2039 /******************************************************//**
2040 Pads the current log block full with dummy log records. Used in producing
2041 consistent archived log files and scrubbing redo log. */
2042 static
2043 void
log_pad_current_log_block(void)2044 log_pad_current_log_block(void)
2045 /*===========================*/
2046 {
2047 	byte		b		= MLOG_DUMMY_RECORD;
2048 	ulint		pad_length;
2049 	ulint		i;
2050 	lsn_t		lsn;
2051 
2052 	ut_ad(!recv_no_log_write);
2053 	/* We retrieve lsn only because otherwise gcc crashed on HP-UX */
2054 	lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);
2055 
2056 	pad_length = OS_FILE_LOG_BLOCK_SIZE
2057 		- (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE)
2058 		- LOG_BLOCK_TRL_SIZE;
2059 	if (pad_length
2060 	    == (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
2061 		- LOG_BLOCK_TRL_SIZE)) {
2062 
2063 		pad_length = 0;
2064 	}
2065 
2066 	if (pad_length) {
2067 		srv_stats.n_log_scrubs.inc();
2068 	}
2069 
2070 	for (i = 0; i < pad_length; i++) {
2071 		log_write_low(&b, 1);
2072 	}
2073 
2074 	lsn = log_sys.lsn;
2075 
2076 	log_close();
2077 
2078 	ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
2079 }
2080 
2081 /*****************************************************************//*
2082 If no log record has been written for a while, fill current log
2083 block with dummy records. */
2084 static
2085 void
log_scrub()2086 log_scrub()
2087 /*=========*/
2088 {
2089 	log_mutex_enter();
2090 	ulint cur_lbn = log_block_convert_lsn_to_no(log_sys.lsn);
2091 
2092 	if (next_lbn_to_pad == cur_lbn)
2093 	{
2094 		log_pad_current_log_block();
2095 	}
2096 
2097 	next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys.lsn);
2098 	log_mutex_exit();
2099 }
2100 
/* log scrubbing speed, in bytes/sec; controls how often the scrub
thread wakes up (see log_scrub_thread) */
UNIV_INTERN ulonglong innodb_scrub_log_speed;
2103 
2104 /*****************************************************************//**
2105 This is the main thread for log scrub. It waits for an event and
2106 when waked up fills current log block with dummy records and
2107 sleeps again.
2108 @return this function does not return, it calls os_thread_exit() */
2109 extern "C" UNIV_INTERN
2110 os_thread_ret_t
DECLARE_THREAD(log_scrub_thread)2111 DECLARE_THREAD(log_scrub_thread)(void*)
2112 {
2113 	ut_ad(!srv_read_only_mode);
2114 
2115 	while (srv_shutdown_state < SRV_SHUTDOWN_FLUSH_PHASE) {
2116 		/* log scrubbing interval in µs. */
2117 		ulonglong interval = 1000*1000*512/innodb_scrub_log_speed;
2118 
2119 		os_event_wait_time(log_scrub_event, static_cast<ulint>(interval));
2120 
2121 		log_scrub();
2122 
2123 		os_event_reset(log_scrub_event);
2124 	}
2125 
2126 	log_scrub_thread_active = false;
2127 
2128 	/* We count the number of threads in os_thread_exit(). A created
2129 	thread should always use that to exit and not use return() to exit. */
2130 	os_thread_exit();
2131 
2132 	OS_THREAD_DUMMY_RETURN;
2133 }
2134