1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Google Inc.
5 Copyright (c) 2014, 2021, MariaDB Corporation.
6 
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12 
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; version 2 of the License.
16 
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file log/log0log.cc
29 Database log
30 
31 Created 12/9/1995 Heikki Tuuri
32 *******************************************************/
33 
34 #include "univ.i"
35 #include <debug_sync.h>
36 #include <my_service_manager.h>
37 
38 #include "log0log.h"
39 #include "log0crypt.h"
40 #include "mem0mem.h"
41 #include "buf0buf.h"
42 #include "buf0flu.h"
43 #include "lock0lock.h"
44 #include "log0recv.h"
45 #include "fil0fil.h"
46 #include "dict0boot.h"
47 #include "dict0stats_bg.h"
48 #include "btr0defragment.h"
49 #include "srv0srv.h"
50 #include "srv0start.h"
51 #include "trx0sys.h"
52 #include "trx0trx.h"
53 #include "trx0roll.h"
54 #include "srv0mon.h"
55 #include "sync0sync.h"
56 
57 /*
58 General philosophy of InnoDB redo-logs:
59 
60 1) Every change to a contents of a data page must be done
61 through mtr, which in mtr_commit() writes log records
62 to the InnoDB redo log.
63 
64 2) Normally these changes are performed using a mlog_write_ulint()
65 or similar function.
66 
67 3) In some page level operations only a code number of a
68 c-function and its parameters are written to the log to
69 reduce the size of the log.
70 
71   3a) You should not add parameters to these kind of functions
72   (e.g. trx_undo_header_create())
73 
74   3b) You should not add such functionality which either change
75   working when compared with the old or are dependent on data
76   outside of the page. These kind of functions should implement
77   self-contained page transformation and it should be unchanged
78   if you don't have very essential reasons to change log
79   semantics or format.
80 
81 */
82 
83 /** Redo log system */
84 log_t	log_sys;
85 
86 /** Whether to require checksums on the redo log pages */
87 my_bool	innodb_log_checksums;
88 
89 /* Next log block number to do dummy record filling if no log records written
90 for a while */
91 static ulint		next_lbn_to_pad = 0;
92 
93 /* These control how often we print warnings if the last checkpoint is too
94 old */
95 static bool	log_has_printed_chkp_warning = false;
96 static time_t	log_last_warning_time;
97 
98 static bool	log_has_printed_chkp_margine_warning = false;
99 static time_t	log_last_margine_warning_time;
100 
101 /* A margin for free space in the log buffer before a log entry is catenated */
102 #define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)
103 
104 /* Margins for free space in the log buffer after a log entry is catenated */
105 #define LOG_BUF_FLUSH_RATIO	2
106 #define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN		\
107 				 + (4U << srv_page_size_shift))
108 
109 /* This parameter controls asynchronous making of a new checkpoint; the value
110 should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
111 
112 #define LOG_POOL_CHECKPOINT_RATIO_ASYNC	32
113 
114 /* This parameter controls synchronous preflushing of modified buffer pages */
115 #define LOG_POOL_PREFLUSH_RATIO_SYNC	16
116 
117 /* The same ratio for asynchronous preflushing; this value should be less than
118 the previous */
119 #define LOG_POOL_PREFLUSH_RATIO_ASYNC	8
120 
121 /* Codes used in unlocking flush latches */
122 #define LOG_UNLOCK_NONE_FLUSHED_LOCK	1
123 #define LOG_UNLOCK_FLUSH_LOCK		2
124 
125 /** Event to wake up log_scrub_thread */
126 os_event_t	log_scrub_event;
127 /** Whether log_scrub_thread is active */
128 bool		log_scrub_thread_active;
129 
130 extern "C" UNIV_INTERN
131 os_thread_ret_t
132 DECLARE_THREAD(log_scrub_thread)(void*);
133 
134 /****************************************************************//**
135 Returns the oldest modified block lsn in the pool, or log_sys.lsn if none
136 exists.
137 @return LSN of oldest modification */
138 static
139 lsn_t
log_buf_pool_get_oldest_modification(void)140 log_buf_pool_get_oldest_modification(void)
141 /*======================================*/
142 {
143 	lsn_t	lsn;
144 
145 	ut_ad(log_mutex_own());
146 
147 	lsn = buf_pool_get_oldest_modification();
148 
149 	if (!lsn) {
150 
151 		lsn = log_sys.lsn;
152 	}
153 
154 	return(lsn);
155 }
156 
157 /** Extends the log buffer.
158 @param[in]	len	requested minimum size in bytes */
log_buffer_extend(ulong len)159 void log_buffer_extend(ulong len)
160 {
161 	const ulong new_buf_size = ut_calc_align(len, srv_page_size);
162 	byte* new_buf = static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
163 	TRASH_ALLOC(new_buf, new_buf_size);
164 	byte* new_flush_buf =
165 		static_cast<byte*>(ut_malloc_dontdump(new_buf_size));
166 	TRASH_ALLOC(new_flush_buf, new_buf_size);
167 
168 	log_mutex_enter();
169 
170 	if (len <= srv_log_buffer_size) {
171 		/* Already extended enough by the others */
172 		log_mutex_exit();
173 		ut_free_dodump(new_buf, new_buf_size);
174 		ut_free_dodump(new_flush_buf, new_buf_size);
175 		return;
176 	}
177 
178 	ib::warn() << "The redo log transaction size " << len <<
179 		" exceeds innodb_log_buffer_size="
180 		<< srv_log_buffer_size << " / 2). Trying to extend it.";
181 
182 	byte* old_buf = log_sys.buf;
183 	byte* old_flush_buf = log_sys.flush_buf;
184 	const ulong old_buf_size = srv_log_buffer_size;
185 
186 	srv_log_buffer_size = new_buf_size;
187 	log_sys.buf = new_buf;
188 	log_sys.flush_buf = new_flush_buf;
189 	memcpy(new_buf, old_buf, log_sys.buf_free);
190 
191 	log_sys.max_buf_free = new_buf_size / LOG_BUF_FLUSH_RATIO
192 		- LOG_BUF_FLUSH_MARGIN;
193 
194 	log_mutex_exit();
195 
196 	ut_free_dodump(old_buf, old_buf_size);
197 	ut_free_dodump(old_flush_buf, old_buf_size);
198 
199 	ib::info() << "innodb_log_buffer_size was extended to "
200 		<< new_buf_size << ".";
201 }
202 
203 /** Calculate actual length in redo buffer and file including
204 block header and trailer.
205 @param[in]	len	length to write
206 @return actual length to write including header and trailer. */
207 static inline
208 ulint
log_calculate_actual_len(ulint len)209 log_calculate_actual_len(
210 	ulint len)
211 {
212 	ut_ad(log_mutex_own());
213 
214 	const ulint	framing_size = log_sys.framing_size();
215 	/* actual length stored per block */
216 	const ulint	len_per_blk = OS_FILE_LOG_BLOCK_SIZE - framing_size;
217 
218 	/* actual data length in last block already written */
219 	ulint	extra_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE);
220 
221 	ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
222 	extra_len -= LOG_BLOCK_HDR_SIZE;
223 
224 	/* total extra length for block header and trailer */
225 	extra_len = ((len + extra_len) / len_per_blk) * framing_size;
226 
227 	return(len + extra_len);
228 }
229 
/** Check margin not to overwrite transaction log from the last checkpoint.
If would estimate the log write to exceed the log_group_capacity,
waits for the checkpoint is done enough.
@param[in]	len	length of the data to be written */

void
log_margin_checkpoint_age(
	ulint	len)
{
	/* On-disk size of the write, including block headers/trailers. */
	ulint	margin = log_calculate_actual_len(len);

	ut_ad(log_mutex_own());

	if (margin > log_sys.log_group_capacity) {
		/* The single write is larger than the entire redo log
		capacity; no amount of checkpointing could make room.
		Return with a rate-limited warning to avoid deadlock. */
		if (!log_has_printed_chkp_margine_warning
		    || difftime(time(NULL),
				log_last_margine_warning_time) > 15) {
			log_has_printed_chkp_margine_warning = true;
			log_last_margine_warning_time = time(NULL);

			ib::error() << "The transaction log files are too"
				" small for the single transaction log (size="
				<< len << "). So, the last checkpoint age"
				" might exceed the log group capacity "
				<< log_sys.log_group_capacity << ".";
		}

		return;
	}

	/* Our margin check should ensure that we never reach this condition.
	Try to do checkpoint once. We cannot keep waiting here as it might
	result in hang in case the current mtr has latch on oldest lsn */
	if (log_sys.lsn - log_sys.last_checkpoint_lsn + margin
	    > log_sys.log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Makes checkpoint. */

		bool	flushed_enough = false;

		if (log_sys.lsn - log_buf_pool_get_oldest_modification()
		    + margin
		    <= log_sys.log_group_capacity) {
			/* All pages that would be overwritten have already
			been flushed: a checkpoint alone will suffice. */
			flushed_enough = true;
		}

		log_sys.check_flush_or_checkpoint = true;
		/* Release the mutex before the (slow) checkpoint so that
		other log writers are not blocked meanwhile. */
		log_mutex_exit();

		DEBUG_SYNC_C("margin_checkpoint_age_rescue");

		if (!flushed_enough) {
			/* Give the page cleaner a moment (100ms) to flush
			some of the oldest modified pages first. */
			os_thread_sleep(100000);
		}
		log_checkpoint(true);

		log_mutex_enter();
	}

	return;
}
292 
/** Open the log for log_write_low. The log must be closed with log_close.
Waits (by cycling the log mutex and writing out the buffer) until there
is enough free space in the log buffer for the record.
@param[in]	len	length of the data to be written
@return start lsn of the log record */
lsn_t
log_reserve_and_open(
	ulint	len)
{
	ulint	len_upper_limit;
#ifdef UNIV_DEBUG
	ulint	count			= 0;
#endif /* UNIV_DEBUG */

loop:
	ut_ad(log_mutex_own());

	/* Calculate an upper limit for the space the string may take in the
	log buffer: block framing, write-ahead padding, plus 25% slack. */

	len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
			  + (5 * len) / 4;

	if (log_sys.buf_free + len_upper_limit > srv_log_buffer_size) {
		/* The record might not fit: drop the mutex, drain the
		buffer in the background, and retry. */
		log_mutex_exit();

		DEBUG_SYNC_C("log_buf_size_exceeded");

		/* Not enough free space, do a write of the log buffer */
		log_buffer_sync_in_background(false);

		srv_stats.log_waits.inc();

		/* Debug guard against livelock: the buffer should drain
		within a bounded number of retries. */
		ut_ad(++count < 50);

		log_mutex_enter();
		goto loop;
	}

	return(log_sys.lsn);
}
332 
/************************************************************//**
Writes to the log the string given. It is assumed that the caller holds the
log mutex. The data is appended to the log buffer, splitting it across
log blocks and initializing a fresh block header whenever a block fills. */
void
log_write_low(
/*==========*/
	const byte*	str,		/*!< in: string */
	ulint		str_len)	/*!< in: string length */
{
	ulint	len;

	ut_ad(log_mutex_own());
	const ulint trailer_offset = log_sys.trailer_offset();
part_loop:
	/* Calculate a part length: how much of str fits into the
	current log block before its trailer */

	ulint data_len = (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;

	if (data_len <= trailer_offset) {

		/* The string fits within the current log block */

		len = str_len;
	} else {
		/* Fill the current block up to the trailer; the rest of
		str is handled by subsequent loop iterations. */
		data_len = trailer_offset;

		len = trailer_offset
			- log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
	}

	memcpy(log_sys.buf + log_sys.buf_free, str, len);

	str_len -= len;
	str = str + len;

	/* Locate the start of the block we just wrote into. */
	byte* log_block = static_cast<byte*>(
		ut_align_down(log_sys.buf + log_sys.buf_free,
			      OS_FILE_LOG_BLOCK_SIZE));

	log_block_set_data_len(log_block, data_len);

	if (data_len == trailer_offset) {
		/* This block became full */
		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
		log_block_set_checkpoint_no(log_block,
					    log_sys.next_checkpoint_no);
		/* Account for the completed block's header and trailer
		in the LSN advance. */
		len += log_sys.framing_size();

		log_sys.lsn += len;

		/* Initialize the next block header */
		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE,
			       log_sys.lsn);
	} else {
		log_sys.lsn += len;
	}

	log_sys.buf_free += ulong(len);

	ut_ad(log_sys.buf_free <= srv_log_buffer_size);

	if (str_len > 0) {
		goto part_loop;
	}

	srv_stats.log_write_requests.inc();
}
400 
/************************************************************//**
Closes the log. Finalizes the current block's first-record-group field,
and checks the checkpoint age, warning if it exceeds the log capacity and
requesting flushing/checkpointing when the age approaches the limits.
The caller must hold the log mutex.
@return lsn */
lsn_t
log_close(void)
/*===========*/
{
	byte*		log_block;
	ulint		first_rec_group;
	lsn_t		oldest_lsn;
	lsn_t		lsn;
	lsn_t		checkpoint_age;

	ut_ad(log_mutex_own());

	lsn = log_sys.lsn;

	log_block = static_cast<byte*>(
		ut_align_down(log_sys.buf + log_sys.buf_free,
			      OS_FILE_LOG_BLOCK_SIZE));

	first_rec_group = log_block_get_first_rec_group(log_block);

	if (first_rec_group == 0) {
		/* We initialized a new log block which was not written
		full by the current mtr: the next mtr log record group
		will start within this block at the offset data_len */

		log_block_set_first_rec_group(
			log_block, log_block_get_data_len(log_block));
	}

	if (log_sys.buf_free > log_sys.max_buf_free) {
		/* The buffer is getting full: ask for it to be written
		out and possibly for a checkpoint. */
		log_sys.check_flush_or_checkpoint = true;
	}

	checkpoint_age = lsn - log_sys.last_checkpoint_lsn;

	if (checkpoint_age >= log_sys.log_group_capacity) {
		DBUG_EXECUTE_IF(
			"print_all_chkp_warnings",
			log_has_printed_chkp_warning = false;);

		/* Rate-limit the warning to at most once per 15 seconds. */
		if (!log_has_printed_chkp_warning
		    || difftime(time(NULL), log_last_warning_time) > 15) {

			log_has_printed_chkp_warning = true;
			log_last_warning_time = time(NULL);

			ib::error() << "The age of the last checkpoint is "
				<< checkpoint_age << ", which exceeds the log"
				" group capacity "
				<< log_sys.log_group_capacity
				<< ".";
		}
	}

	if (checkpoint_age <= log_sys.max_modified_age_sync) {
		/* Still comfortably within limits: nothing more to do. */
		goto function_exit;
	}

	oldest_lsn = buf_pool_get_oldest_modification();

	if (!oldest_lsn
	    || lsn - oldest_lsn > log_sys.max_modified_age_sync
	    || checkpoint_age > log_sys.max_checkpoint_age_async) {
		/* Too many unflushed modifications or the checkpoint is
		too old: request asynchronous flushing/checkpointing. */
		log_sys.check_flush_or_checkpoint = true;
	}
function_exit:

	return(lsn);
}
473 
474 /** Calculate the recommended highest values for lsn - last_checkpoint_lsn
475 and lsn - buf_get_oldest_modification().
476 @param[in]	file_size	requested innodb_log_file_size
477 @retval true on success
478 @retval false if the smallest log group is too small to
479 accommodate the number of OS threads in the database server */
480 bool
log_set_capacity(ulonglong file_size)481 log_set_capacity(ulonglong file_size)
482 {
483 	lsn_t		margin;
484 	ulint		free;
485 
486 	lsn_t smallest_capacity = (file_size - LOG_FILE_HDR_SIZE)
487 		* srv_n_log_files;
488 	/* Add extra safety */
489 	smallest_capacity -= smallest_capacity / 10;
490 
491 	/* For each OS thread we must reserve so much free space in the
492 	smallest log group that it can accommodate the log entries produced
493 	by single query steps: running out of free log space is a serious
494 	system error which requires rebooting the database. */
495 
496 	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
497 		+ LOG_CHECKPOINT_EXTRA_FREE;
498 	if (free >= smallest_capacity / 2) {
499 		ib::error() << "Cannot continue operation. ib_logfiles are too"
500 			" small for innodb_thread_concurrency="
501 			<< srv_thread_concurrency << ". The combined size of"
502 			" ib_logfiles should be bigger than"
503 			" 200 kB * innodb_thread_concurrency. "
504 			<< INNODB_PARAMETERS_MSG;
505 		return(false);
506 	}
507 
508 	margin = smallest_capacity - free;
509 	margin = margin - margin / 10;	/* Add still some extra safety */
510 
511 	log_mutex_enter();
512 
513 	log_sys.log_group_capacity = smallest_capacity;
514 
515 	log_sys.max_modified_age_async = margin
516 		- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
517 	log_sys.max_modified_age_sync = margin
518 		- margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
519 
520 	log_sys.max_checkpoint_age_async = margin - margin
521 		/ LOG_POOL_CHECKPOINT_RATIO_ASYNC;
522 	log_sys.max_checkpoint_age = margin;
523 
524 	log_mutex_exit();
525 
526 	return(true);
527 }
528 
/** Initialize the redo log subsystem: mutexes, buffers, events, and
initial LSN bookkeeping. Must be called exactly once, before
log_t::files::create(). */
void log_t::create()
{
  ut_ad(this == &log_sys);
  ut_ad(!is_initialised());
  m_initialised= true;

  mutex_create(LATCH_ID_LOG_SYS, &mutex);
  mutex_create(LATCH_ID_LOG_WRITE, &write_mutex);
  mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_flush_order_mutex);

  /* Start the lsn from one log block from zero: this way every
  log record has a non-zero start lsn, a fact which we will use */

  lsn= LOG_START_LSN;

  ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE);
  ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift);

  /* Two buffers: one being filled by mtr commits, one being written
  out; log_buffer_switch() swaps them. */
  buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
  TRASH_ALLOC(buf, srv_log_buffer_size);
  flush_buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size));
  TRASH_ALLOC(flush_buf, srv_log_buffer_size);

  max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO -
    LOG_BUF_FLUSH_MARGIN;
  check_flush_or_checkpoint= true;

  n_log_ios_old= n_log_ios;
  last_printout_time= time(NULL);

  buf_next_to_write= 0;
  write_lsn= lsn;
  flushed_to_disk_lsn= 0;
  n_pending_flushes= 0;
  /* flush_event starts set: no flush is in progress yet. */
  flush_event = os_event_create("log_flush_event");
  os_event_set(flush_event);
  n_log_ios= 0;
  n_log_ios_old= 0;
  log_group_capacity= 0;
  max_modified_age_async= 0;
  max_modified_age_sync= 0;
  max_checkpoint_age_async= 0;
  max_checkpoint_age= 0;
  next_checkpoint_no= 0;
  next_checkpoint_lsn= 0;
  append_on_checkpoint= NULL;
  n_pending_checkpoint_writes= 0;

  last_checkpoint_lsn= lsn;
  rw_lock_create(checkpoint_lock_key, &checkpoint_lock, SYNC_NO_ORDER_CHECK);

  /* Initialize the first log block in the buffer. */
  log_block_init(buf, lsn);
  log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE);

  buf_free= LOG_BLOCK_HDR_SIZE;
  lsn= LOG_START_LSN + LOG_BLOCK_HDR_SIZE;

  MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, lsn - last_checkpoint_lsn);

  /* Optionally start the background thread that pads idle log blocks. */
  log_scrub_thread_active= !srv_read_only_mode && srv_scrub_log;
  if (log_scrub_thread_active) {
    log_scrub_event= os_event_create("log_scrub_event");
    os_thread_create(log_scrub_thread, NULL, NULL);
  }
}
595 
596 /** Initialize the redo log.
597 @param[in]	n_files		number of files */
create(ulint n_files)598 void log_t::files::create(ulint n_files)
599 {
600   ut_ad(n_files <= SRV_N_LOG_FILES_MAX);
601   ut_ad(this == &log_sys.log);
602   ut_ad(log_sys.is_initialised());
603 
604   this->n_files= n_files;
605   format= srv_encrypt_log ? log_t::FORMAT_ENC_10_4 : log_t::FORMAT_10_4;
606   subformat= 2;
607   file_size= srv_log_file_size;
608   lsn= LOG_START_LSN;
609   lsn_offset= LOG_FILE_HDR_SIZE;
610 }
611 
/******************************************************//**
Writes a log file header to a log file space. Builds a single
checksummed header block on the stack and writes it synchronously to
the start of the nth log file. */
static
void
log_file_header_flush(
	ulint		nth_file,	/*!< in: header to the nth file in the
					log file space */
	lsn_t		start_lsn)	/*!< in: log file data starts at this
					lsn */
{
	lsn_t	dest_offset;

	ut_ad(log_write_mutex_own());
	ut_ad(!recv_no_log_write);
	ut_a(nth_file < log_sys.log.n_files);
	ut_ad(log_sys.log.format == log_t::FORMAT_10_4
	      || log_sys.log.format == log_t::FORMAT_ENC_10_4);

	// man 2 open suggests this buffer to be aligned by 512 for O_DIRECT
	MY_ALIGNED(OS_FILE_LOG_BLOCK_SIZE)
	byte buf[OS_FILE_LOG_BLOCK_SIZE] = {0};

	/* Fill in the header fields: format, subformat, start LSN,
	creator string, and finally the checksum over the block. */
	mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format);
	mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat);
	mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
	strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
	       LOG_HEADER_CREATOR_CURRENT);
	ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
	      >= sizeof LOG_HEADER_CREATOR_CURRENT);
	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));

	/* Byte offset of the nth file's header within the log space. */
	dest_offset = nth_file * log_sys.log.file_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF
			      " file " ULINTPF " header",
			      start_lsn, nth_file));

	log_sys.n_log_ios++;

	srv_stats.os_log_pending_writes.inc();

	/* Translate the byte offset into a page number + in-page offset
	for fil_io(). */
	const ulint	page_no = ulint(dest_offset >> srv_page_size_shift);

	fil_io(IORequestLogWrite, true,
	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
	       0,
	       ulint(dest_offset & (srv_page_size - 1)),
	       OS_FILE_LOG_BLOCK_SIZE, buf, NULL);

	srv_stats.os_log_pending_writes.dec();
}
663 
664 /******************************************************//**
665 Stores a 4-byte checksum to the trailer checksum field of a log block
666 before writing it to a log file. This checksum is used in recovery to
667 check the consistency of a log block. */
668 static
669 void
log_block_store_checksum(byte * block)670 log_block_store_checksum(
671 /*=====================*/
672 	byte*	block)	/*!< in/out: pointer to a log block */
673 {
674 	log_block_set_checksum(block, log_block_calc_checksum_crc32(block));
675 }
676 
/******************************************************//**
Writes a buffer to a log file. Loops when the write wraps from one log
file to the next, writing a new file header at each file boundary and
checksumming every log block before it goes to disk. */
static
void
log_write_buf(
	byte*		buf,		/*!< in: buffer */
	ulint		len,		/*!< in: buffer len; must be divisible
					by OS_FILE_LOG_BLOCK_SIZE */
#ifdef UNIV_DEBUG
	ulint		pad_len,	/*!< in: pad len in the buffer len */
#endif /* UNIV_DEBUG */
	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
					be divisible by
					OS_FILE_LOG_BLOCK_SIZE */
	ulint		new_data_offset)/*!< in: start offset of new data in
					buf: this parameter is used to decide
					if we have to write a new log file
					header */
{
	ulint		write_len;
	bool		write_header	= new_data_offset == 0;
	lsn_t		next_offset;
	ulint		i;

	ut_ad(log_write_mutex_own());
	ut_ad(!recv_no_log_write);
	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);

loop:
	if (len == 0) {

		return;
	}

	next_offset = log_sys.log.calc_lsn_offset(start_lsn);

	if (write_header
	    && next_offset % log_sys.log.file_size == LOG_FILE_HDR_SIZE) {
		/* We start to write a new log file instance in the group */

		ut_a(next_offset / log_sys.log.file_size <= ULINT_MAX);

		log_file_header_flush(
			ulint(next_offset / log_sys.log.file_size), start_lsn);
		srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);

		srv_stats.log_writes.inc();
	}

	if ((next_offset % log_sys.log.file_size) + len
	    > log_sys.log.file_size) {
		/* if the above condition holds, then the below expression
		is < len which is ulint, so the typecast is ok */
		write_len = ulint(log_sys.log.file_size
				  - (next_offset % log_sys.log.file_size));
	} else {
		write_len = len;
	}

	DBUG_PRINT("ib_log",
		   ("write " LSN_PF " to " LSN_PF
		    ": len " ULINTPF
		    " blocks " ULINTPF ".." ULINTPF,
		    start_lsn, next_offset,
		    write_len,
		    log_block_get_hdr_no(buf),
		    log_block_get_hdr_no(
			    buf + write_len
			    - OS_FILE_LOG_BLOCK_SIZE)));

	/* The first block's header number must match the start LSN,
	unless the whole buffer is padding. */
	ut_ad(pad_len >= len
	      || log_block_get_hdr_no(buf)
		 == log_block_convert_lsn_to_no(start_lsn));

	/* Calculate the checksums for each log block and write them to
	the trailer fields of the log blocks */

	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
#ifdef UNIV_DEBUG
		ulint hdr_no_2 = log_block_get_hdr_no(buf) + i;
		DBUG_EXECUTE_IF("innodb_small_log_block_no_limit",
				hdr_no_2 = ((hdr_no_2 - 1) & 0xFUL) + 1;);
#endif
		ut_ad(pad_len >= len
			|| i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
			|| log_block_get_hdr_no(buf + i * OS_FILE_LOG_BLOCK_SIZE) == hdr_no_2);
		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
	}

	log_sys.n_log_ios++;

	srv_stats.os_log_pending_writes.inc();

	ut_a((next_offset >> srv_page_size_shift) <= ULINT_MAX);

	/* Translate the byte offset into a page number + in-page offset
	for fil_io(). */
	const ulint	page_no = ulint(next_offset >> srv_page_size_shift);

	fil_io(IORequestLogWrite, true,
	       page_id_t(SRV_LOG_SPACE_FIRST_ID, page_no),
	       0,
	       ulint(next_offset & (srv_page_size - 1)), write_len, buf, NULL);

	srv_stats.os_log_pending_writes.dec();

	srv_stats.os_log_written.add(write_len);
	srv_stats.log_writes.inc();

	if (write_len < len) {
		/* The write was truncated at a log file boundary:
		continue with the remainder in the next file. */
		start_lsn += write_len;
		len -= write_len;
		buf += write_len;

		write_header = true;

		goto loop;
	}
}
795 
/** Flush the recently written changes to the log file.
NOTE: this function acquires log_sys.mutex via log_mutex_enter() and
returns with it held; the caller is responsible for releasing it.
The caller must NOT hold the log mutex on entry. */
static void log_write_flush_to_disk_low()
{
	/* A flush must have been announced (n_pending_flushes was
	incremented) before this is called. */
	ut_a(log_sys.n_pending_flushes);

	/* With SRV_O_DSYNC the OS already made the write durable,
	so an explicit flush would be redundant. */
	bool	do_flush = srv_file_flush_method != SRV_O_DSYNC;

	if (do_flush) {
		fil_flush(SRV_LOG_SPACE_FIRST_ID);
	}

	log_mutex_enter();
	if (do_flush) {
		log_sys.flushed_to_disk_lsn = log_sys.current_flush_lsn;
	}

	log_sys.n_pending_flushes--;

	/* Wake up any threads waiting for this flush in log_write_up_to(). */
	os_event_set(log_sys.flush_event);
}
817 
/** Swap log buffers, and copy the content of last block
from old buf to the head of the new buf. Thus, buf_free and
buf_next_to_write would be changed accordingly.
Both the log mutex and the log write mutex must be held. */
static inline
void
log_buffer_switch()
{
	ut_ad(log_mutex_own());
	ut_ad(log_write_mutex_own());

	/* End of the (possibly partially filled) last block, rounded
	up to a whole block. */
	ulong		area_end = ut_calc_align(
		log_sys.buf_free, ulong(OS_FILE_LOG_BLOCK_SIZE));

	/* Copy the last block to new buf, so that appending can continue
	in the incomplete block after the swap */
	ut_memcpy(log_sys.flush_buf,
		  log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		  OS_FILE_LOG_BLOCK_SIZE);

	std::swap(log_sys.buf, log_sys.flush_buf);

	/* New appends start at the same in-block offset, now at the
	head of the swapped-in buffer. */
	log_sys.buf_free %= OS_FILE_LOG_BLOCK_SIZE;
	log_sys.buf_next_to_write = log_sys.buf_free;
}
841 
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@param[in]	lsn		log sequence number that should be
included in the redo log file write
@param[in]	flush_to_disk	whether the written log should also
be flushed to the file system
@param[in]	rotate_key	whether to rotate the encryption key */
void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
{
#ifdef UNIV_DEBUG
	ulint		loop_count	= 0;
#endif /* UNIV_DEBUG */
	byte*           write_buf;
	lsn_t           write_lsn;

	ut_ad(!srv_read_only_mode);
	ut_ad(!rotate_key || flush_to_disk);

	if (recv_no_ibuf_operations) {
		/* Recovery is running and no operations on the log files are
		allowed yet (the variable name .._no_ibuf_.. is misleading) */

		return;
	}

loop:
	ut_ad(++loop_count < 128);

#if UNIV_WORD_SIZE > 7
	/* We can do a dirty read of LSN. */
	/* NOTE: Currently doesn't do dirty read for
	(flush_to_disk == true) case, because the log_mutex
	contention also works as the arbitrator for write-IO
	(fsync) bandwidth between log files and data files. */
	if (!flush_to_disk && log_sys.write_lsn >= lsn) {
		return;
	}
#endif

	log_write_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* The LSN up to which the request is already satisfied. */
	lsn_t	limit_lsn = flush_to_disk
		? log_sys.flushed_to_disk_lsn
		: log_sys.write_lsn;

	if (limit_lsn >= lsn) {
		log_write_mutex_exit();
		return;
	}

	/* If it is a write call we should just go ahead and do it
	as we checked that write_lsn is not where we'd like it to
	be. If we have to flush as well then we check if there is a
	pending flush and based on that we wait for it to finish
	before proceeding further. */
	if (flush_to_disk
	    && (log_sys.n_pending_flushes > 0
		|| !os_event_is_set(log_sys.flush_event))) {
		/* Figure out if the current flush will do the job
		for us. */
		bool work_done = log_sys.current_flush_lsn >= lsn;

		log_write_mutex_exit();

		os_event_wait(log_sys.flush_event);

		if (work_done) {
			return;
		} else {
			goto loop;
		}
	}

	log_mutex_enter();
	if (!flush_to_disk
	    && log_sys.buf_free == log_sys.buf_next_to_write) {
		/* Nothing to write and no flush to disk requested */
		log_mutex_exit_all();
		return;
	}

	ulint		start_offset;
	ulint		end_offset;
	ulint		area_start;
	ulint		area_end;
	ulong		write_ahead_size = srv_log_write_ahead_size;
	ulint		pad_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
			      log_sys.write_lsn,
			      log_sys.lsn));
	if (flush_to_disk) {
		/* Announce the flush: waiters will block on flush_event
		until log_write_flush_to_disk_low() sets it. */
		log_sys.n_pending_flushes++;
		log_sys.current_flush_lsn = log_sys.lsn;
		os_event_reset(log_sys.flush_event);

		if (log_sys.buf_free == log_sys.buf_next_to_write) {
			/* Nothing to write, flush only */
			log_mutex_exit_all();
			log_write_flush_to_disk_low();
			/* log_write_flush_to_disk_low() returned with
			the log mutex held; release it. */
			log_mutex_exit();
			return;
		}
	}

	start_offset = log_sys.buf_next_to_write;
	end_offset = log_sys.buf_free;

	/* Round the write region outwards to whole log blocks. */
	area_start = ut_2pow_round(start_offset,
				   ulint(OS_FILE_LOG_BLOCK_SIZE));
	area_end = ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));

	ut_ad(area_end - area_start > 0);

	log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
	log_block_set_checkpoint_no(
		log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		log_sys.next_checkpoint_no);

	write_lsn = log_sys.lsn;
	write_buf = log_sys.buf;

	log_buffer_switch();

	log_sys.log.set_fields(log_sys.write_lsn);

	/* The log mutex can be released now: appends continue in the
	swapped-in buffer while we write out write_buf. */
	log_mutex_exit();
	/* Erase the end of the last log block. */
	memset(write_buf + end_offset, 0,
	       ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));

	/* Calculate pad_size if needed. */
	pad_size = 0;
	if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
		ulint	end_offset_in_unit;
		lsn_t	end_offset = log_sys.log.calc_lsn_offset(
			ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE));
		end_offset_in_unit = (ulint) (end_offset % write_ahead_size);

		if (end_offset_in_unit > 0
		    && (area_end - area_start) > end_offset_in_unit) {
			/* The first block in the unit was initialized
			after the last writing.
			Needs to be written padded data once. */
			pad_size = std::min<ulint>(
				ulint(write_ahead_size) - end_offset_in_unit,
				srv_log_buffer_size - area_end);
			::memset(write_buf + area_end, 0, pad_size);
		}
	}

	if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) {
		/* Keep systemd (or similar) from timing out the
		shutdown while we are still writing redo log. */
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
					       "InnoDB log write: "
					       LSN_PF "," LSN_PF,
					       log_sys.write_lsn, lsn);
	}

	if (log_sys.is_encrypted()) {
		log_crypt(write_buf + area_start, log_sys.write_lsn,
			  area_end - area_start,
			  rotate_key ? LOG_ENCRYPT_ROTATE_KEY : LOG_ENCRYPT);
	}

	/* Do the write to the log files */
	log_write_buf(
		write_buf + area_start, area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
		pad_size,
#endif /* UNIV_DEBUG */
		ut_uint64_align_down(log_sys.write_lsn,
				     OS_FILE_LOG_BLOCK_SIZE),
		start_offset - area_start);
	srv_stats.log_padded.add(pad_size);
	log_sys.write_lsn = write_lsn;


	if (srv_file_flush_method == SRV_O_DSYNC) {
		/* O_SYNC means the OS did not buffer the log file at all:
		so we have also flushed to disk what we have written */
		log_sys.flushed_to_disk_lsn = log_sys.write_lsn;
	}

	log_write_mutex_exit();

	if (flush_to_disk) {
		log_write_flush_to_disk_low();
		/* log_write_flush_to_disk_low() returned with the log
		mutex held; read the flushed LSN before releasing it. */
		ib_uint64_t flush_lsn = log_sys.flushed_to_disk_lsn;
		log_mutex_exit();

		innobase_mysql_log_notify(flush_lsn);
	}
}
1037 
1038 /** write to the log file up to the last log entry.
1039 @param[in]	sync	whether we want the written log
1040 also to be flushed to disk. */
log_buffer_flush_to_disk(bool sync)1041 void log_buffer_flush_to_disk(bool sync)
1042 {
1043   ut_ad(!srv_read_only_mode);
1044   log_write_up_to(log_get_lsn(), sync);
1045 }
1046 
1047 
/** Durably write the log up to log_sys.lsn and release log_sys.mutex.
NOTE(review): both log_sys.mutex and the log write mutex appear to be
held on entry (both are released below) — confirm against callers.
The caller must guarantee that there is something buffered to write
(see the assertion on buf_free != buf_next_to_write). */
ATTRIBUTE_COLD void log_write_and_flush()
{
  ut_ad(!srv_read_only_mode);
  ut_ad(!recv_no_log_write);
  ut_ad(!recv_recovery_is_on());

  /* The following code is adapted from log_write_up_to(). */
  DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
                        log_sys.write_lsn, log_sys.lsn));
  /* Announce a pending flush up to the current lsn; waiters block on
  flush_event until os_event_set() below. */
  log_sys.n_pending_flushes++;
  log_sys.current_flush_lsn= log_sys.lsn;
  os_event_reset(log_sys.flush_event);
  ut_ad(log_sys.buf_free != log_sys.buf_next_to_write);
  ulint start_offset= log_sys.buf_next_to_write;
  ulint end_offset= log_sys.buf_free;
  /* Round the dirty region of the log buffer outward to whole
  OS_FILE_LOG_BLOCK_SIZE blocks. */
  ulint area_start= ut_2pow_round(start_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
  ulint area_end= ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
  ulong write_ahead_size= srv_log_write_ahead_size;

  log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
  /* Stamp the checkpoint number into the last block to be written. */
  log_block_set_checkpoint_no(log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
                              log_sys.next_checkpoint_no);
  lsn_t write_lsn= log_sys.lsn;
  byte *write_buf= log_sys.buf;

  ut_ad(area_end - area_start > 0);

  /* Switch to the other half of the log buffer so that mtr_commit()
  in other threads can proceed while we write this half. */
  log_buffer_switch();

  log_sys.log.set_fields(log_sys.write_lsn);

  /* Erase the end of the last log block. */
  memset(write_buf + end_offset, 0,
         ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));
  /* Calculate pad_size if needed. */
  ulint pad_size= 0;
  if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE)
  {
    lsn_t end_offset=
      log_sys.log.calc_lsn_offset(ut_uint64_align_up(write_lsn,
                                                     OS_FILE_LOG_BLOCK_SIZE));
    ulint end_offset_in_unit= (ulint) (end_offset % write_ahead_size);

    if (end_offset_in_unit && (area_end - area_start) > end_offset_in_unit)
    {
      /* The first block in the unit was initialized after the last
      writing. Needs to be written padded data once. */
      pad_size= std::min<ulint>(ulint(write_ahead_size) - end_offset_in_unit,
                                srv_log_buffer_size - area_end);
      memset(write_buf + area_end, 0, pad_size);
    }
  }

  if (log_sys.is_encrypted())
    log_crypt(write_buf + area_start, log_sys.write_lsn,
              area_end - area_start);

  /* Do the write to the log files */
  log_write_buf(write_buf + area_start, area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
                pad_size,
#endif /* UNIV_DEBUG */
                ut_uint64_align_down(log_sys.write_lsn,
                                     OS_FILE_LOG_BLOCK_SIZE),
                start_offset - area_start);
  srv_stats.log_padded.add(pad_size);
  log_sys.write_lsn= write_lsn;

  log_write_mutex_exit();

  /* Code adapted from log_write_flush_to_disk_low() */

  ut_a(log_sys.n_pending_flushes);

  /* With SRV_O_DSYNC the write itself was already durable, so no
  separate flush is needed. */
  if (srv_file_flush_method != SRV_O_DSYNC)
    fil_flush(SRV_LOG_SPACE_FIRST_ID);

  log_sys.flushed_to_disk_lsn= log_sys.current_flush_lsn;

  log_sys.n_pending_flushes--;

  /* Wake any threads waiting for the flush to complete. */
  os_event_set(log_sys.flush_event);

  const lsn_t flush_lsn= log_sys.flushed_to_disk_lsn;
  log_mutex_exit();

  innobase_mysql_log_notify(flush_lsn);
}
1137 
1138 /****************************************************************//**
1139 This functions writes the log buffer to the log file and if 'flush'
1140 is set it forces a flush of the log file as well. This is meant to be
1141 called from background master thread only as it does not wait for
1142 the write (+ possible flush) to finish. */
1143 void
log_buffer_sync_in_background(bool flush)1144 log_buffer_sync_in_background(
1145 /*==========================*/
1146 	bool	flush)	/*!< in: flush the logs to disk */
1147 {
1148 	lsn_t	lsn;
1149 
1150 	log_mutex_enter();
1151 
1152 	lsn = log_sys.lsn;
1153 
1154 	if (flush
1155 	    && log_sys.n_pending_flushes > 0
1156 	    && log_sys.current_flush_lsn >= lsn) {
1157 		/* The write + flush will write enough */
1158 		log_mutex_exit();
1159 		return;
1160 	}
1161 
1162 	log_mutex_exit();
1163 
1164 	log_write_up_to(lsn, flush);
1165 }
1166 
1167 /********************************************************************
1168 
1169 Tries to establish a big enough margin of free space in the log buffer, such
1170 that a new log entry can be catenated without an immediate need for a flush. */
1171 static
1172 void
log_flush_margin(void)1173 log_flush_margin(void)
1174 /*==================*/
1175 {
1176 	lsn_t	lsn	= 0;
1177 
1178 	log_mutex_enter();
1179 
1180 	if (log_sys.buf_free > log_sys.max_buf_free) {
1181 		/* We can write during flush */
1182 		lsn = log_sys.lsn;
1183 	}
1184 
1185 	log_mutex_exit();
1186 
1187 	if (lsn) {
1188 		log_write_up_to(lsn, false);
1189 	}
1190 }
1191 
1192 /** Advances the smallest lsn for which there are unflushed dirty blocks in the
1193 buffer pool.
1194 NOTE: this function may only be called if the calling thread owns no
1195 synchronization objects!
1196 @param[in]	new_oldest	try to advance oldest_modified_lsn at least to
1197 this lsn
1198 @return false if there was a flush batch of the same type running,
1199 which means that we could not start this flush batch */
log_preflush_pool_modified_pages(lsn_t new_oldest)1200 static bool log_preflush_pool_modified_pages(lsn_t new_oldest)
1201 {
1202 	bool	success;
1203 
1204 	if (recv_recovery_is_on()) {
1205 		/* If the recovery is running, we must first apply all
1206 		log records to their respective file pages to get the
1207 		right modify lsn values to these pages: otherwise, there
1208 		might be pages on disk which are not yet recovered to the
1209 		current lsn, and even after calling this function, we could
1210 		not know how up-to-date the disk version of the database is,
1211 		and we could not make a new checkpoint on the basis of the
1212 		info on the buffer pool only. */
1213 		recv_apply_hashed_log_recs(true);
1214 	}
1215 
1216 	if (new_oldest == LSN_MAX
1217 	    || !buf_page_cleaner_is_active
1218 	    || srv_is_being_started) {
1219 
1220 		ulint	n_pages;
1221 
1222 		success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages);
1223 
1224 		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
1225 
1226 		if (!success) {
1227 			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
1228 		}
1229 
1230 		MONITOR_INC_VALUE_CUMULATIVE(
1231 			MONITOR_FLUSH_SYNC_TOTAL_PAGE,
1232 			MONITOR_FLUSH_SYNC_COUNT,
1233 			MONITOR_FLUSH_SYNC_PAGES,
1234 			n_pages);
1235 	} else {
1236 		/* better to wait for flushed by page cleaner */
1237 
1238 		if (srv_flush_sync) {
1239 			/* wake page cleaner for IO burst */
1240 			buf_flush_request_force(new_oldest);
1241 		}
1242 
1243 		buf_flush_wait_flushed(new_oldest);
1244 
1245 		success = true;
1246 	}
1247 
1248 	return(success);
1249 }
1250 
/******************************************************//**
Completes a checkpoint. The caller must hold log_sys.mutex, and no
checkpoint header write may be pending any more. */
static
void
log_complete_checkpoint(void)
/*=========================*/
{
	ut_ad(log_mutex_own());
	ut_ad(log_sys.n_pending_checkpoint_writes == 0);

	/* Advance the checkpoint counter; its low bit selects which of
	the two header slots (LOG_CHECKPOINT_1/LOG_CHECKPOINT_2) the
	next checkpoint will be written to. */
	log_sys.next_checkpoint_no++;

	log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn;
	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
		    log_sys.lsn - log_sys.last_checkpoint_lsn);

	DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
			      ", flushed to " LSN_PF,
			      log_sys.last_checkpoint_lsn,
			      log_sys.flushed_to_disk_lsn));

	/* Release the X-latch acquired in log_group_checkpoint() when
	the first pending write started; this lets threads waiting on
	an S-latch (e.g. in log_write_checkpoint_info()) proceed. */
	rw_lock_x_unlock_gen(&(log_sys.checkpoint_lock), LOG_CHECKPOINT);
}
1274 
1275 /** Complete an asynchronous checkpoint write. */
complete_checkpoint()1276 void log_t::complete_checkpoint()
1277 {
1278 	ut_ad(this == &log_sys);
1279 	MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
1280 
1281 	log_mutex_enter();
1282 
1283 	ut_ad(n_pending_checkpoint_writes > 0);
1284 
1285 	if (!--n_pending_checkpoint_writes) {
1286 		log_complete_checkpoint();
1287 	}
1288 
1289 	log_mutex_exit();
1290 }
1291 
/** Write checkpoint info to the log header.
The caller must hold log_sys.mutex. The write is asynchronous; its
completion is signalled through log_t::complete_checkpoint().
@param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
static
void
log_group_checkpoint(lsn_t end_lsn)
{
	lsn_t		lsn_offset;

	ut_ad(!srv_read_only_mode);
	ut_ad(log_mutex_own());
	ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn);
	ut_ad(end_lsn <= log_sys.lsn);
	ut_ad(end_lsn + SIZE_OF_MLOG_CHECKPOINT <= log_sys.lsn
	      || srv_shutdown_state > SRV_SHUTDOWN_INITIATED);

	DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
			      " written",
			      log_sys.next_checkpoint_no,
			      log_sys.next_checkpoint_lsn));

	/* Build the checkpoint block in the dedicated buffer. */
	byte* buf = log_sys.checkpoint_buf;
	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);

	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no);
	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn);

	if (log_sys.is_encrypted()) {
		log_crypt_write_checkpoint_buf(buf);
	}

	/* Record the byte offset of the checkpoint lsn within the log
	files, so that recovery can seek there directly. */
	lsn_offset = log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn);
	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE,
			srv_log_buffer_size);
	mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn);

	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));

	MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);

	log_sys.n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	ut_ad(LOG_CHECKPOINT_1 < srv_page_size);
	ut_ad(LOG_CHECKPOINT_2 < srv_page_size);

	/* Acquire the checkpoint X-latch on the first pending write;
	log_complete_checkpoint() releases it when the count drops
	back to zero. */
	if (log_sys.n_pending_checkpoint_writes++ == 0) {
		rw_lock_x_lock_gen(&log_sys.checkpoint_lock,
				   LOG_CHECKPOINT);
	}

	/* Note: We alternate the physical place of the checkpoint info.
	See the (next_checkpoint_no & 1) below. */

	fil_io(IORequestLogWrite, false,
	       page_id_t(SRV_LOG_SPACE_FIRST_ID, 0),
	       0,
	       (log_sys.next_checkpoint_no & 1)
	       ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
	       OS_FILE_LOG_BLOCK_SIZE,
	       buf, reinterpret_cast<void*>(1) /* checkpoint write */);
}
1355 
1356 /** Read a log group header page to log_sys.checkpoint_buf.
1357 @param[in]	header	0 or LOG_CHECKPOINT_1 or LOG_CHECKPOINT2 */
log_header_read(ulint header)1358 void log_header_read(ulint header)
1359 {
1360 	ut_ad(log_mutex_own());
1361 
1362 	log_sys.n_log_ios++;
1363 
1364 	MONITOR_INC(MONITOR_LOG_IO);
1365 
1366 	fil_io(IORequestLogRead, true,
1367 	       page_id_t(SRV_LOG_SPACE_FIRST_ID,
1368 			 header >> srv_page_size_shift),
1369 	       0, header & (srv_page_size - 1),
1370 	       OS_FILE_LOG_BLOCK_SIZE, log_sys.checkpoint_buf, NULL);
1371 }
1372 
/** Write checkpoint info to the log header and invoke log_mutex_exit().
@param[in]	sync	whether to wait for the write to complete
@param[in]	end_lsn	start LSN of the MLOG_CHECKPOINT mini-transaction */
void
log_write_checkpoint_info(bool sync, lsn_t end_lsn)
{
	ut_ad(log_mutex_own());
	ut_ad(!srv_read_only_mode);

	/* Start the asynchronous checkpoint header write. */
	log_group_checkpoint(end_lsn);

	log_mutex_exit();

	MONITOR_INC(MONITOR_NUM_CHECKPOINT);

	if (sync) {
		/* Wait for the checkpoint write to complete: the
		S-latch is only granted after log_complete_checkpoint()
		has released the X-latch that log_group_checkpoint()
		acquired. */
		rw_lock_s_lock(&log_sys.checkpoint_lock);
		rw_lock_s_unlock(&log_sys.checkpoint_lock);

		DBUG_EXECUTE_IF(
			"crash_after_checkpoint",
			DBUG_SUICIDE(););
	}
}
1398 
1399 /** Set extra data to be written to the redo log during checkpoint.
1400 @param[in]	buf	data to be appended on checkpoint, or NULL
1401 @return pointer to previous data to be appended on checkpoint */
1402 mtr_buf_t*
log_append_on_checkpoint(mtr_buf_t * buf)1403 log_append_on_checkpoint(
1404 	mtr_buf_t*	buf)
1405 {
1406 	log_mutex_enter();
1407 	mtr_buf_t*	old = log_sys.append_on_checkpoint;
1408 	log_sys.append_on_checkpoint = buf;
1409 	log_mutex_exit();
1410 	return(old);
1411 }
1412 
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log files. Use log_make_checkpoint() to flush also the pool.
@param[in]	sync		whether to wait for the write to complete
@return true if success, false if a checkpoint write was already running */
bool log_checkpoint(bool sync)
{
	lsn_t	oldest_lsn;

	ut_ad(!srv_read_only_mode);

	DBUG_EXECUTE_IF("no_checkpoint",
			/* We sleep for a long enough time, forcing
			the checkpoint doesn't happen any more. */
			os_thread_sleep(360000000););

	if (recv_recovery_is_on()) {
		recv_apply_hashed_log_recs(true);
	}

	/* Make the already-written data pages durable before the
	checkpoint is written, unless fsync is disabled entirely. */
	switch (srv_file_flush_method) {
	case SRV_NOSYNC:
		break;
	case SRV_O_DSYNC:
	case SRV_FSYNC:
	case SRV_LITTLESYNC:
	case SRV_O_DIRECT:
	case SRV_O_DIRECT_NO_FSYNC:
#ifdef _WIN32
	case SRV_ALL_O_DIRECT_FSYNC:
#endif
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
	}

	log_mutex_enter();

	ut_ad(!recv_no_log_write);
	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Because log also contains headers and dummy log records,
	log_buf_pool_get_oldest_modification() will return log_sys.lsn
	if the buffer pool contains no dirty buffers.
	We must make sure that the log is flushed up to that lsn.
	If there are dirty buffers in the buffer pool, then our
	write-ahead-logging algorithm ensures that the log has been
	flushed up to oldest_lsn. */

	ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
	if (oldest_lsn
	    > log_sys.last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
		/* Some log has been written since the previous checkpoint. */
	} else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
		/* MariaDB 10.3 startup expects the redo log file to be
		logically empty (not even containing a MLOG_CHECKPOINT record)
		after a clean shutdown. Perform an extra checkpoint at
		shutdown. */
	} else {
		/* Do nothing, because nothing was logged (other than
		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
		log_mutex_exit();
		return(true);
	}
	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
	case some log records between the checkpoint and log_sys.lsn
	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
	except on clean shutdown, where the log will be empty after
	the checkpoint.
	It is important that we write out the redo log before any
	further dirty pages are flushed to the tablespace files.  At
	this point, because log_mutex_own(), mtr_commit() in other
	threads will be blocked, and no pages can be added to the
	flush lists. */
	lsn_t		flush_lsn	= oldest_lsn;
	const lsn_t	end_lsn		= log_sys.lsn;
	const bool	do_write
		= srv_shutdown_state <= SRV_SHUTDOWN_INITIATED
		|| flush_lsn != end_lsn;

	if (fil_names_clear(flush_lsn, do_write)) {
		ut_ad(log_sys.lsn >= end_lsn + SIZE_OF_MLOG_CHECKPOINT);
		flush_lsn = log_sys.lsn;
	}

	log_mutex_exit();

	/* NOTE(review): the third argument appears to request
	encryption key rotation (rotate_key) in log_write_up_to() —
	confirm against its declaration. */
	log_write_up_to(flush_lsn, true, true);

	log_mutex_enter();

	ut_ad(log_sys.flushed_to_disk_lsn >= flush_lsn);
	ut_ad(flush_lsn >= oldest_lsn);

	if (log_sys.last_checkpoint_lsn >= oldest_lsn) {
		/* Another thread already advanced the checkpoint at
		least this far while we were writing the log. */
		log_mutex_exit();
		return(true);
	}

	if (log_sys.n_pending_checkpoint_writes > 0) {
		/* A checkpoint write is running */
		log_mutex_exit();

		if (sync) {
			/* Wait for the checkpoint write to complete */
			rw_lock_s_lock(&log_sys.checkpoint_lock);
			rw_lock_s_unlock(&log_sys.checkpoint_lock);
		}

		return(false);
	}

	log_sys.next_checkpoint_lsn = oldest_lsn;
	/* This releases log_sys.mutex. */
	log_write_checkpoint_info(sync, end_lsn);
	ut_ad(!log_mutex_own());

	return(true);
}
1531 
1532 /** Make a checkpoint */
log_make_checkpoint()1533 void log_make_checkpoint()
1534 {
1535 	/* Preflush pages synchronously */
1536 
1537 	while (!log_preflush_pool_modified_pages(LSN_MAX)) {
1538 		/* Flush as much as we can */
1539 	}
1540 
1541 	while (!log_checkpoint(true)) {
1542 		/* Force a checkpoint */
1543 	}
1544 }
1545 
/****************************************************************//**
Tries to establish a big enough margin of free space in the log groups, such
that a new log entry can be catenated without an immediate need for a
checkpoint. NOTE: this function may only be called if the calling thread
owns no synchronization objects! */
static
void
log_checkpoint_margin(void)
/*=======================*/
{
	lsn_t		age;		/* age of the oldest dirty page */
	lsn_t		checkpoint_age;	/* lsn distance since last checkpoint */
	ib_uint64_t	advance;	/* how far to advance the preflush */
	lsn_t		oldest_lsn;
	bool		success;
loop:
	advance = 0;

	log_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* Fast path: nothing requested a flush or a checkpoint. */
	if (!log_sys.check_flush_or_checkpoint) {
		log_mutex_exit();
		return;
	}

	oldest_lsn = log_buf_pool_get_oldest_modification();

	age = log_sys.lsn - oldest_lsn;

	if (age > log_sys.max_modified_age_sync) {

		/* A flush is urgent: we have to do a synchronous preflush */
		advance = age - log_sys.max_modified_age_sync;
	}

	checkpoint_age = log_sys.lsn - log_sys.last_checkpoint_lsn;

	bool	checkpoint_sync;
	bool	do_checkpoint;

	if (checkpoint_age > log_sys.max_checkpoint_age) {
		/* A checkpoint is urgent: we do it synchronously */
		checkpoint_sync = true;
		do_checkpoint = true;
	} else if (checkpoint_age > log_sys.max_checkpoint_age_async) {
		/* A checkpoint is not urgent: do it asynchronously */
		do_checkpoint = true;
		checkpoint_sync = false;
		log_sys.check_flush_or_checkpoint = false;
	} else {
		do_checkpoint = false;
		checkpoint_sync = false;
		log_sys.check_flush_or_checkpoint = false;
	}

	log_mutex_exit();

	if (advance) {
		lsn_t	new_oldest = oldest_lsn + advance;

		success = log_preflush_pool_modified_pages(new_oldest);

		/* If the flush succeeded, this thread has done its part
		and can proceed. If it did not succeed, there was another
		thread doing a flush at the same time. */
		if (!success) {
			/* Re-arm the flag and retry from the start. */
			log_mutex_enter();
			log_sys.check_flush_or_checkpoint = true;
			log_mutex_exit();
			goto loop;
		}
	}

	if (do_checkpoint) {
		log_checkpoint(checkpoint_sync);

		if (checkpoint_sync) {
			/* After a synchronous checkpoint, re-evaluate
			the margins from scratch. */
			goto loop;
		}
	}
}
1629 
1630 /**
1631 Checks that there is enough free space in the log to start a new query step.
1632 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
1633 function may only be called if the calling thread owns no synchronization
1634 objects! */
1635 void
log_check_margins(void)1636 log_check_margins(void)
1637 {
1638 	bool	check;
1639 
1640 	do {
1641 		log_flush_margin();
1642 		log_checkpoint_margin();
1643 		log_mutex_enter();
1644 		ut_ad(!recv_no_log_write);
1645 		check = log_sys.check_flush_or_checkpoint;
1646 		log_mutex_exit();
1647 	} while (check);
1648 }
1649 
/****************************************************************//**
Makes a checkpoint at the latest lsn and writes it to first page of each
data file in the database, so that we know that the file spaces contain
all modifications up to that lsn. This can only be called at database
shutdown. This function also writes all log in log files to the log archive.
This is a polling state machine: it repeatedly wakes background threads,
waits for them to exit, and loops back to 'loop:' until the server is
quiescent; only then does it write the final checkpoint and lsn stamps. */
void
logs_empty_and_mark_files_at_shutdown(void)
/*=======================================*/
{
	lsn_t			lsn;
	ulint			count = 0;

	ib::info() << "Starting shutdown...";

	/* Wait until the master thread and all other operations are idle: our
	algorithm only works if the server is idle at shutdown */

	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
loop:
	ut_ad(lock_sys.is_initialised() || !srv_was_started);
	ut_ad(log_sys.is_initialised() || !srv_was_started);
	ut_ad(fil_system.is_initialised() || !srv_was_started);

	/* Wake up the sleeping background threads so that they notice
	the shutdown state and terminate. */
	os_event_set(srv_buf_resize_event);

	if (!srv_read_only_mode) {
		os_event_set(srv_error_event);
		os_event_set(srv_monitor_event);
		os_event_set(srv_buf_dump_event);
		if (lock_sys.timeout_thread_active) {
			os_event_set(lock_sys.timeout_event);
		}
		if (dict_stats_event) {
			os_event_set(dict_stats_event);
		} else {
			ut_ad(!srv_dict_stats_thread_active);
		}
		if (recv_sys.flush_start) {
			/* This is in case recv_writer_thread was never
			started, or buf_flush_page_cleaner_coordinator
			failed to notice its termination. */
			os_event_set(recv_sys.flush_start);
		}
	}
#define COUNT_INTERVAL 600U
#define CHECK_INTERVAL 100000U
	os_thread_sleep(CHECK_INTERVAL);

	count++;

	/* Check that there are no longer transactions, except for
	PREPARED ones. We need this wait even for the 'very fast'
	shutdown, because the InnoDB layer may have committed or
	prepared transactions and we don't want to lose them. */

	if (ulint total_trx = srv_was_started && !srv_read_only_mode
	    && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
	    ? trx_sys.any_active_transactions() : 0) {

		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
			service_manager_extend_timeout(
				COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
				"Waiting for %lu active transactions to finish",
				(ulong) total_trx);
			ib::info() << "Waiting for " << total_trx << " active"
				<< " transactions to finish";

			count = 0;
		}

		goto loop;
	}

	/* We need these threads to stop early in shutdown. */
	const char* thread_name;

	if (srv_error_monitor_active) {
		thread_name = "srv_error_monitor_thread";
	} else if (srv_monitor_active) {
		thread_name = "srv_monitor_thread";
	} else if (srv_buf_resize_thread_active) {
		thread_name = "buf_resize_thread";
		goto wait_suspend_loop;
	} else if (srv_dict_stats_thread_active) {
		thread_name = "dict_stats_thread";
	} else if (lock_sys.timeout_thread_active) {
		thread_name = "lock_wait_timeout_thread";
	} else if (srv_buf_dump_thread_active) {
		thread_name = "buf_dump_thread";
		goto wait_suspend_loop;
	} else if (btr_defragment_thread_active) {
		thread_name = "btr_defragment_thread";
	} else if (srv_fast_shutdown != 2 && trx_rollback_is_active) {
		thread_name = "rollback of recovered transactions";
	} else {
		thread_name = NULL;
	}

	if (thread_name) {
		ut_ad(!srv_read_only_mode);
wait_suspend_loop:
		service_manager_extend_timeout(
			COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
			"Waiting for %s to exit", thread_name);
		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
			ib::info() << "Waiting for " << thread_name
				   << " to exit";
			count = 0;
		}
		goto loop;
	}

	/* Check that the background threads are suspended */

	switch (srv_get_active_thread_type()) {
	case SRV_NONE:
		if (!srv_n_fil_crypt_threads_started) {
			srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
			break;
		}
		os_event_set(fil_crypt_threads_event);
		thread_name = "fil_crypt_thread";
		goto wait_suspend_loop;
	case SRV_PURGE:
	case SRV_WORKER:
		ut_ad(!"purge was not shut down");
		srv_purge_wakeup();
		thread_name = "purge thread";
		goto wait_suspend_loop;
	case SRV_MASTER:
		thread_name = "master thread";
		goto wait_suspend_loop;
	}

	/* At this point only page_cleaner should be active. We wait
	here to let it complete the flushing of the buffer pools
	before proceeding further. */

	count = 0;
	service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
		"Waiting for page cleaner");
	while (buf_page_cleaner_is_active) {
		++count;
		os_thread_sleep(CHECK_INTERVAL);
		if (srv_print_verbose_log && count > COUNT_INTERVAL) {
			service_manager_extend_timeout(COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
				"Waiting for page cleaner");
			ib::info() << "Waiting for page_cleaner to "
				"finish flushing of buffer pool";
			/* This is a workaround to avoid the InnoDB hang
			when OS datetime changed backwards */
			os_event_set(buf_flush_event);
			count = 0;
		}
	}

	if (log_scrub_thread_active) {
		ut_ad(!srv_read_only_mode);
		os_event_set(log_scrub_event);
	}

	/* Wait for any pending checkpoint or log flush I/O to drain. */
	if (log_sys.is_initialised()) {
		log_mutex_enter();
		const ulint	n_write	= log_sys.n_pending_checkpoint_writes;
		const ulint	n_flush	= log_sys.n_pending_flushes;
		log_mutex_exit();

		if (log_scrub_thread_active || n_write || n_flush) {
			if (srv_print_verbose_log && count > 600) {
				ib::info() << "Pending checkpoint_writes: "
					<< n_write
					<< ". Pending log flush writes: "
					<< n_flush;
				count = 0;
			}
			goto loop;
		}
	}

	ut_ad(!log_scrub_thread_active);

	if (!buf_pool_ptr) {
		ut_ad(!srv_was_started);
	} else if (ulint pending_io = buf_pool_check_no_pending_io()) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << pending_io << " buffer"
				" page I/Os to complete";
			count = 0;
		}

		goto loop;
	}

	if (srv_fast_shutdown == 2 || !srv_was_started) {
		if (!srv_read_only_mode && srv_was_started) {
			ib::info() << "MySQL has requested a very fast"
				" shutdown without flushing the InnoDB buffer"
				" pool to data files. At the next mysqld"
				" startup InnoDB will do a crash recovery!";

			/* In this fastest shutdown we do not flush the
			buffer pool:

			it is essentially a 'crash' of the InnoDB server.
			Make sure that the log is all flushed to disk, so
			that we can recover all committed transactions in
			a crash recovery. We must not write the lsn stamps
			to the data files, since at a startup InnoDB deduces
			from the stamps if the previous shutdown was clean. */

			log_buffer_flush_to_disk();
		}

		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

		if (fil_system.is_initialised()) {
			fil_close_all_files();
		}
		return;
	}

	if (!srv_read_only_mode) {
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
			"ensuring dirty buffer pool are written to log");
		log_make_checkpoint();

		log_mutex_enter();

		lsn = log_sys.lsn;

		/* If anything was logged after the checkpoint started,
		the log is not yet empty: loop back and try again. */
		const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn;
		ut_ad(lsn >= log_sys.last_checkpoint_lsn);

		log_mutex_exit();

		if (lsn_changed) {
			goto loop;
		}

		/* Ensure that all buffered changes are written to the
		redo log before fil_close_all_files(). */
		fil_flush_file_spaces(FIL_TYPE_LOG);
	} else {
		lsn = srv_start_lsn;
	}

	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

	/* Make some checks that the server really is quiet */
	ut_a(srv_get_active_thread_type() == SRV_NONE);

	service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
				       "Free innodb buffer pool");
	buf_all_freed();

	ut_a(lsn == log_sys.lsn
	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);

	if (lsn < srv_start_lsn) {
		ib::error() << "Shutdown LSN=" << lsn
			<< " is less than start LSN=" << srv_start_lsn;
	}

	srv_shutdown_lsn = lsn;

	if (!srv_read_only_mode) {
		/* Stamp the shutdown lsn into the data files so that
		the next startup can recognize a clean shutdown. */
		dberr_t err = fil_write_flushed_lsn(lsn);

		if (err != DB_SUCCESS) {
			ib::error() << "Writing flushed lsn " << lsn
				<< " failed; error=" << err;
		}
	}

	fil_close_all_files();

	/* Make some checks that the server really is quiet */
	ut_a(srv_get_active_thread_type() == SRV_NONE);

	ut_a(lsn == log_sys.lsn
	     || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
}
1931 
1932 /******************************************************//**
1933 Peeks the current lsn.
1934 @return TRUE if success, FALSE if could not get the log system mutex */
1935 ibool
log_peek_lsn(lsn_t * lsn)1936 log_peek_lsn(
1937 /*=========*/
1938 	lsn_t*	lsn)	/*!< out: if returns TRUE, current lsn is here */
1939 {
1940 	if (0 == mutex_enter_nowait(&(log_sys.mutex))) {
1941 		*lsn = log_sys.lsn;
1942 
1943 		log_mutex_exit();
1944 
1945 		return(TRUE);
1946 	}
1947 
1948 	return(FALSE);
1949 }
1950 
1951 /******************************************************//**
1952 Prints info of the log. */
1953 void
log_print(FILE * file)1954 log_print(
1955 /*======*/
1956 	FILE*	file)	/*!< in: file where to print */
1957 {
1958 	double	time_elapsed;
1959 	time_t	current_time;
1960 
1961 	log_mutex_enter();
1962 
1963 	fprintf(file,
1964 		"Log sequence number " LSN_PF "\n"
1965 		"Log flushed up to   " LSN_PF "\n"
1966 		"Pages flushed up to " LSN_PF "\n"
1967 		"Last checkpoint at  " LSN_PF "\n",
1968 		log_sys.lsn,
1969 		log_sys.flushed_to_disk_lsn,
1970 		log_buf_pool_get_oldest_modification(),
1971 		log_sys.last_checkpoint_lsn);
1972 
1973 	current_time = time(NULL);
1974 
1975 	time_elapsed = difftime(current_time,
1976 				log_sys.last_printout_time);
1977 
1978 	if (time_elapsed <= 0) {
1979 		time_elapsed = 1;
1980 	}
1981 
1982 	fprintf(file,
1983 		ULINTPF " pending log flushes, "
1984 		ULINTPF " pending chkp writes\n"
1985 		ULINTPF " log i/o's done, %.2f log i/o's/second\n",
1986 		log_sys.n_pending_flushes,
1987 		log_sys.n_pending_checkpoint_writes,
1988 		log_sys.n_log_ios,
1989 		static_cast<double>(
1990 			log_sys.n_log_ios - log_sys.n_log_ios_old)
1991 		/ time_elapsed);
1992 
1993 	log_sys.n_log_ios_old = log_sys.n_log_ios;
1994 	log_sys.last_printout_time = current_time;
1995 
1996 	log_mutex_exit();
1997 }
1998 
1999 /**********************************************************************//**
2000 Refreshes the statistics used to print per-second averages. */
2001 void
log_refresh_stats(void)2002 log_refresh_stats(void)
2003 /*===================*/
2004 {
2005 	log_sys.n_log_ios_old = log_sys.n_log_ios;
2006 	log_sys.last_printout_time = time(NULL);
2007 }
2008 
/** Shut down the redo log subsystem. */
void log_t::close()
{
  ut_ad(this == &log_sys);
  /* Tolerate repeated or premature shutdown calls. */
  if (!is_initialised()) return;
  m_initialised = false;
  log.close();

  /* Release the two log buffers; the sizes must match the
  allocation size (srv_log_buffer_size). */
  ut_free_dodump(buf, srv_log_buffer_size);
  buf = NULL;
  ut_free_dodump(flush_buf, srv_log_buffer_size);
  flush_buf = NULL;

  /* Destroy the synchronization objects only after the buffers are
  gone; nobody may use the log system at this point. */
  os_event_destroy(flush_event);
  rw_lock_free(&checkpoint_lock);
  mutex_free(&mutex);
  mutex_free(&write_mutex);
  mutex_free(&log_flush_order_mutex);

  /* The scrub event exists only when scrubbing was enabled in a
  read-write instance (mirrors the creation condition). */
  if (!srv_read_only_mode && srv_scrub_log)
    os_event_destroy(log_scrub_event);

  recv_sys.close();
}
2033 
2034 /******************************************************//**
2035 Pads the current log block full with dummy log records. Used in producing
2036 consistent archived log files and scrubbing redo log. */
2037 static
2038 void
log_pad_current_log_block(void)2039 log_pad_current_log_block(void)
2040 /*===========================*/
2041 {
2042 	byte		b		= MLOG_DUMMY_RECORD;
2043 	ulint		pad_length;
2044 	ulint		i;
2045 	lsn_t		lsn;
2046 
2047 	ut_ad(!recv_no_log_write);
2048 	/* We retrieve lsn only because otherwise gcc crashed on HP-UX */
2049 	lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);
2050 
2051 	pad_length = log_sys.trailer_offset()
2052 		- log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
2053 	if (pad_length == log_sys.payload_size()) {
2054 		pad_length = 0;
2055 	}
2056 
2057 	if (pad_length) {
2058 		srv_stats.n_log_scrubs.inc();
2059 	}
2060 
2061 	for (i = 0; i < pad_length; i++) {
2062 		log_write_low(&b, 1);
2063 	}
2064 
2065 	lsn = log_sys.lsn;
2066 
2067 	log_close();
2068 
2069 	ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
2070 }
2071 
2072 /*****************************************************************//*
2073 If no log record has been written for a while, fill current log
2074 block with dummy records. */
2075 static
2076 void
log_scrub()2077 log_scrub()
2078 /*=========*/
2079 {
2080 	log_mutex_enter();
2081 	ulint cur_lbn = log_block_convert_lsn_to_no(log_sys.lsn);
2082 
2083 	if (next_lbn_to_pad == cur_lbn)
2084 	{
2085 		log_pad_current_log_block();
2086 	}
2087 
2088 	next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys.lsn);
2089 	log_mutex_exit();
2090 }
2091 
2092 /* log scrubbing speed, in bytes/sec */
2093 UNIV_INTERN ulonglong innodb_scrub_log_speed;
2094 
2095 /*****************************************************************//**
2096 This is the main thread for log scrub. It waits for an event and
2097 when waked up fills current log block with dummy records and
2098 sleeps again.
2099 @return this function does not return, it calls os_thread_exit() */
2100 extern "C" UNIV_INTERN
2101 os_thread_ret_t
DECLARE_THREAD(log_scrub_thread)2102 DECLARE_THREAD(log_scrub_thread)(void*)
2103 {
2104 	ut_ad(!srv_read_only_mode);
2105 
2106 	while (srv_shutdown_state < SRV_SHUTDOWN_FLUSH_PHASE) {
2107 		/* log scrubbing interval in µs. */
2108 		ulonglong interval = 1000*1000*512/innodb_scrub_log_speed;
2109 
2110 		os_event_wait_time(log_scrub_event, static_cast<ulint>(interval));
2111 
2112 		log_scrub();
2113 
2114 		os_event_reset(log_scrub_event);
2115 	}
2116 
2117 	log_scrub_thread_active = false;
2118 
2119 	/* We count the number of threads in os_thread_exit(). A created
2120 	thread should always use that to exit and not use return() to exit. */
2121 	os_thread_exit();
2122 
2123 	OS_THREAD_DUMMY_RETURN;
2124 }
2125