1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, Google Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11 
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License, version 2.0,
14 as published by the Free Software Foundation.
15 
16 This program is also distributed with certain software (including
17 but not limited to OpenSSL) that is licensed under separate terms,
18 as designated in a particular file or component or in included license
19 documentation.  The authors of MySQL hereby grant you an additional
20 permission to link the program and your derivative works with the
21 separately licensed software that they have included with MySQL.
22 
23 This program is distributed in the hope that it will be useful,
24 but WITHOUT ANY WARRANTY; without even the implied warranty of
25 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26 GNU General Public License, version 2.0, for more details.
27 
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
31 
32 *****************************************************************************/
33 
34 /**************************************************//**
35 @file log/log0log.cc
36 Database log
37 
38 Created 12/9/1995 Heikki Tuuri
39 *******************************************************/
40 
41 #include "ha_prototypes.h"
42 #include <debug_sync.h>
43 
44 #include "log0log.h"
45 
46 #ifdef UNIV_NONINL
47 #include "log0log.ic"
48 #endif
49 
50 #include "mem0mem.h"
51 #include "buf0buf.h"
52 #ifndef UNIV_HOTBACKUP
53 #include "buf0flu.h"
54 #include "srv0srv.h"
55 #include "log0recv.h"
56 #include "fil0fil.h"
57 #include "dict0boot.h"
58 #include "dict0stats_bg.h"
59 #include "srv0srv.h"
60 #include "srv0start.h"
61 #include "trx0sys.h"
62 #include "trx0trx.h"
63 #include "trx0roll.h"
64 #include "srv0mon.h"
65 #include "sync0sync.h"
66 #endif /* !UNIV_HOTBACKUP */
67 
68 /*
69 General philosophy of InnoDB redo-logs:
70 
1) Every change to the contents of a data page must be made
through a mini-transaction (mtr), which in mtr_commit() writes
its log records to the InnoDB redo log.
74 
75 2) Normally these changes are performed using a mlog_write_ulint()
76 or similar function.
77 
78 3) In some page level operations only a code number of a
79 c-function and its parameters are written to the log to
80 reduce the size of the log.
81 
82   3a) You should not add parameters to these kind of functions
83   (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse())
84 
  3b) You should not add functionality which either changes the
  behavior compared with the old implementation, or depends on data
  outside of the page. These kinds of functions should implement a
  self-contained page transformation, and they should be left
  unchanged unless you have very essential reasons to change the
  log semantics or format.
91 
92 */
93 
/** Redo log system; allocated and initialized in log_init() */
log_t*	log_sys	= NULL;

/** Whether to generate and require checksums on the redo log pages */
my_bool	innodb_log_checksums;

/** Pointer to the log checksum calculation function */
log_checksum_func_t log_checksum_algorithm_ptr;

/* These control how often we print warnings if the last checkpoint is too
old (the warning is rate-limited to one message per 15 seconds) */
bool	log_has_printed_chkp_warning = false;
time_t	log_last_warning_time;

/* Same rate-limiting state for the checkpoint-margin warning printed by
log_margin_checkpoint_age().  NOTE: "margine" is a historical typo kept
to avoid renaming identifiers. */
bool	log_has_printed_chkp_margine_warning = false;
time_t	log_last_margine_warning_time;

/* A margin for free space in the log buffer before a log entry is catenated */
#define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)

/* Margins for free space in the log buffer after a log entry is catenated */
#define LOG_BUF_FLUSH_RATIO	2
#define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE)

/* This parameter controls asynchronous making of a new checkpoint; the value
should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */

#define LOG_POOL_CHECKPOINT_RATIO_ASYNC	32

/* This parameter controls synchronous preflushing of modified buffer pages */
#define LOG_POOL_PREFLUSH_RATIO_SYNC	16

/* The same ratio for asynchronous preflushing; this value should be less than
the previous */
#define LOG_POOL_PREFLUSH_RATIO_ASYNC	8

/* Codes used in unlocking flush latches */
#define LOG_UNLOCK_NONE_FLUSHED_LOCK	1
#define LOG_UNLOCK_FLUSH_LOCK		2

/******************************************************//**
Completes a checkpoint write i/o to a log file. */
static
void
log_io_complete_checkpoint(void);
/*============================*/
140 
141 #ifndef UNIV_HOTBACKUP
142 /****************************************************************//**
143 Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
144 exists.
145 @return LSN of oldest modification */
146 static
147 lsn_t
log_buf_pool_get_oldest_modification(void)148 log_buf_pool_get_oldest_modification(void)
149 /*======================================*/
150 {
151 	lsn_t	lsn;
152 
153 	ut_ad(log_mutex_own());
154 
155 	lsn = buf_pool_get_oldest_modification();
156 
157 	if (!lsn) {
158 
159 		lsn = log_sys->lsn;
160 	}
161 
162 	return(lsn);
163 }
164 #endif  /* !UNIV_HOTBACKUP */
165 
/** Extends the log buffer.
@param[in]	len	requested minimum size in bytes */
void
log_buffer_extend(
	ulint	len)
{
	ulint	move_start;
	ulint	move_end;
	/* Scratch copy of the last, partially filled log block: it must
	survive the buffer reallocation below. */
	byte	tmp_buf[OS_FILE_LOG_BLOCK_SIZE];

	log_mutex_enter_all();

	while (log_sys->is_extending) {
		/* Another thread is trying to extend already.
		Needs to wait for. */
		log_mutex_exit_all();

		log_buffer_flush_to_disk();

		log_mutex_enter_all();

		/* srv_log_buffer_size is expressed in UNIV_PAGE_SIZE units
		(see the assignment further below). */
		if (srv_log_buffer_size > len / UNIV_PAGE_SIZE) {
			/* Already extended enough by the others */
			log_mutex_exit_all();
			return;
		}
	}

	if (len >= log_sys->buf_size / 2) {
		DBUG_EXECUTE_IF("ib_log_buffer_is_short_crash",
				DBUG_SUICIDE(););

		/* log_buffer is too small. try to extend instead of crash. */
		ib::warn() << "The transaction log size is too large"
			" for innodb_log_buffer_size (" << len << " >= "
			<< LOG_BUFFER_SIZE << " / 2). Trying to extend it.";
	}

	log_sys->is_extending = true;

	/* Flush until only the last (incomplete) block remains unwritten. */
	while (ut_calc_align_down(log_sys->buf_free,
				  OS_FILE_LOG_BLOCK_SIZE)
	       != ut_calc_align_down(log_sys->buf_next_to_write,
				     OS_FILE_LOG_BLOCK_SIZE)) {
		/* Buffer might have >1 blocks to write still. */
		log_mutex_exit_all();

		log_buffer_flush_to_disk();

		log_mutex_enter_all();
	}

	move_start = ut_calc_align_down(
		log_sys->buf_free,
		OS_FILE_LOG_BLOCK_SIZE);
	move_end = log_sys->buf_free;

	/* store the last log block in buffer */
	ut_memcpy(tmp_buf, log_sys->buf + move_start,
		  move_end - move_start);

	/* Rebase the offsets so they point into the preserved block, which
	will be copied to the start of the new buffer. */
	log_sys->buf_free -= move_start;
	log_sys->buf_next_to_write -= move_start;

	/* reallocate log buffer */
	srv_log_buffer_size = len / UNIV_PAGE_SIZE + 1;
	ut_free(log_sys->buf_ptr);

	log_sys->buf_size = LOG_BUFFER_SIZE;

	/* Allocate twice the size plus one block for alignment; the doubled
	allocation matches log_init() (cf. first_in_use). */
	log_sys->buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE));
	log_sys->buf = static_cast<byte*>(
		ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	log_sys->first_in_use = true;

	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
		- LOG_BUF_FLUSH_MARGIN;

	/* restore the last log block */
	ut_memcpy(log_sys->buf, tmp_buf, move_end - move_start);

	ut_ad(log_sys->is_extending);
	log_sys->is_extending = false;

	log_mutex_exit_all();

	ib::info() << "innodb_log_buffer_size was extended to "
		<< LOG_BUFFER_SIZE << ".";
}
257 
258 #ifndef UNIV_HOTBACKUP
259 /** Calculate actual length in redo buffer and file including
260 block header and trailer.
261 @param[in]	len	length to write
262 @return actual length to write including header and trailer. */
263 static inline
264 ulint
log_calculate_actual_len(ulint len)265 log_calculate_actual_len(
266 	ulint len)
267 {
268 	ut_ad(log_mutex_own());
269 
270 	/* actual length stored per block */
271 	const ulint	len_per_blk = OS_FILE_LOG_BLOCK_SIZE
272 		- (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
273 
274 	/* actual data length in last block already written */
275 	ulint	extra_len = (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE);
276 
277 	ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
278 	extra_len -= LOG_BLOCK_HDR_SIZE;
279 
280 	/* total extra length for block header and trailer */
281 	extra_len = ((len + extra_len) / len_per_blk)
282 		* (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
283 
284 	return(len + extra_len);
285 }
286 
/** Check margin not to overwrite transaction log from the last checkpoint.
If would estimate the log write to exceed the log_group_capacity,
waits for the checkpoint is done enough.
@param[in]	len	length of the data to be written */

void
log_margin_checkpoint_age(
	ulint	len)
{
	/* Worst-case bytes consumed once block headers/trailers are added. */
	ulint	margin = log_calculate_actual_len(len);

	ut_ad(log_mutex_own());

	if (margin > log_sys->log_group_capacity) {
		/* A single write of this size can never fit the log group:
		warn (rate-limited to one message per 15 seconds) and
		return with warning output to avoid deadlock */
		if (!log_has_printed_chkp_margine_warning
		    || difftime(time(NULL),
				log_last_margine_warning_time) > 15) {
			log_has_printed_chkp_margine_warning = true;
			log_last_margine_warning_time = time(NULL);

			ib::error() << "The transaction log files are too"
				" small for the single transaction log (size="
				<< len << "). So, the last checkpoint age"
				" might exceed the log group capacity "
				<< log_sys->log_group_capacity << ".";
		}

		return;
	}

	/* Our margin check should ensure that we never reach this condition.
	Try to do checkpoint once. We cannot keep waiting here as it might
	result in hang in case the current mtr has latch on oldest lsn */
	if (log_sys->lsn - log_sys->last_checkpoint_lsn + margin
	    > log_sys->log_group_capacity) {
		/* The log write of 'len' might overwrite the transaction log
		after the last checkpoint. Makes checkpoint. */

		bool	flushed_enough = false;

		/* If the dirtiest page is already recent enough, the
		checkpoint below can advance without an extra sleep. */
		if (log_sys->lsn - log_buf_pool_get_oldest_modification()
		    + margin
		    <= log_sys->log_group_capacity) {
			flushed_enough = true;
		}

		log_sys->check_flush_or_checkpoint = true;
		/* Release the log mutex around the (possibly slow)
		checkpoint; re-acquired before returning. */
		log_mutex_exit();

		DEBUG_SYNC_C("margin_checkpoint_age_rescue");

		if (!flushed_enough) {
			os_thread_sleep(100000);
		}
		log_checkpoint(true, false);

		log_mutex_enter();
	}

	return;
}
349 #endif /* !UNIV_HOTBACKUP */
350 /** Open the log for log_write_low. The log must be closed with log_close.
351 @param[in]	len	length of the data to be written
352 @return start lsn of the log record */
353 lsn_t
log_reserve_and_open(ulint len)354 log_reserve_and_open(
355 	ulint	len)
356 {
357 	ulint	len_upper_limit;
358 #ifdef UNIV_DEBUG
359 	ulint	count			= 0;
360 #endif /* UNIV_DEBUG */
361 
362 loop:
363 	ut_ad(log_mutex_own());
364 	ut_ad(!recv_no_log_write);
365 
366 	if (log_sys->is_extending) {
367 		log_mutex_exit();
368 
369 		/* Log buffer size is extending. Writing up to the next block
370 		should wait for the extending finished. */
371 
372 		os_thread_sleep(100000);
373 
374 		ut_ad(++count < 50);
375 
376 		log_mutex_enter();
377 		goto loop;
378 	}
379 
380 	/* Calculate an upper limit for the space the string may take in the
381 	log buffer */
382 
383 	len_upper_limit = LOG_BUF_WRITE_MARGIN + srv_log_write_ahead_size
384 			  + (5 * len) / 4;
385 
386 	if (log_sys->buf_free + len_upper_limit > log_sys->buf_size) {
387 		log_mutex_exit();
388 
389 		DEBUG_SYNC_C("log_buf_size_exceeded");
390 
391 		/* Not enough free space, do a write of the log buffer */
392 
393 		log_buffer_sync_in_background(false);
394 
395 		srv_stats.log_waits.inc();
396 
397 		ut_ad(++count < 50);
398 
399 		log_mutex_enter();
400 		goto loop;
401 	}
402 
403 	return(log_sys->lsn);
404 }
405 
406 /************************************************************//**
407 Writes to the log the string given. It is assumed that the caller holds the
408 log mutex. */
409 void
log_write_low(const byte * str,ulint str_len)410 log_write_low(
411 /*==========*/
412 	const byte*	str,		/*!< in: string */
413 	ulint		str_len)	/*!< in: string length */
414 {
415 	log_t*	log	= log_sys;
416 	ulint	len;
417 	ulint	data_len;
418 	byte*	log_block;
419 
420 	ut_ad(log_mutex_own());
421 part_loop:
422 	ut_ad(!recv_no_log_write);
423 	/* Calculate a part length */
424 
425 	data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
426 
427 	if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
428 
429 		/* The string fits within the current log block */
430 
431 		len = str_len;
432 	} else {
433 		data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
434 
435 		len = OS_FILE_LOG_BLOCK_SIZE
436 			- (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
437 			- LOG_BLOCK_TRL_SIZE;
438 	}
439 
440 	ut_memcpy(log->buf + log->buf_free, str, len);
441 
442 	str_len -= len;
443 	str = str + len;
444 
445 	log_block = static_cast<byte*>(
446 		ut_align_down(
447 			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
448 
449 	log_block_set_data_len(log_block, data_len);
450 
451 	if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
452 		/* This block became full */
453 		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
454 		log_block_set_checkpoint_no(log_block,
455 					    log_sys->next_checkpoint_no);
456 		len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
457 
458 		log->lsn += len;
459 
460 		/* Initialize the next block header */
461 		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
462 	} else {
463 		log->lsn += len;
464 	}
465 
466 	log->buf_free += len;
467 
468 	ut_ad(log->buf_free <= log->buf_size);
469 
470 	if (str_len > 0) {
471 		goto part_loop;
472 	}
473 
474 	srv_stats.log_write_requests.inc();
475 }
476 
/************************************************************//**
Closes the log.
@return lsn */
lsn_t
log_close(void)
/*===========*/
{
	byte*		log_block;
	ulint		first_rec_group;
	lsn_t		oldest_lsn;
	lsn_t		lsn;
	log_t*		log	= log_sys;
	lsn_t		checkpoint_age;

	ut_ad(log_mutex_own());
	ut_ad(!recv_no_log_write);

	lsn = log->lsn;

	/* Locate the block containing the current write position. */
	log_block = static_cast<byte*>(
		ut_align_down(
			log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));

	first_rec_group = log_block_get_first_rec_group(log_block);

	if (first_rec_group == 0) {
		/* We initialized a new log block which was not written
		full by the current mtr: the next mtr log record group
		will start within this block at the offset data_len */

		log_block_set_first_rec_group(
			log_block, log_block_get_data_len(log_block));
	}

	if (log->buf_free > log->max_buf_free) {

		/* The buffer is getting full: request a flush. */
		log->check_flush_or_checkpoint = true;
	}

	checkpoint_age = lsn - log->last_checkpoint_lsn;

	if (checkpoint_age >= log->log_group_capacity) {
		DBUG_EXECUTE_IF(
			"print_all_chkp_warnings",
			log_has_printed_chkp_warning = false;);

		/* Rate-limit the error to one message per 15 seconds. */
		if (!log_has_printed_chkp_warning
		    || difftime(time(NULL), log_last_warning_time) > 15) {

			log_has_printed_chkp_warning = true;
			log_last_warning_time = time(NULL);

			ib::error() << "The age of the last checkpoint is "
				<< checkpoint_age << ", which exceeds the log"
				" group capacity " << log->log_group_capacity
				<< ".";
		}
	}

	if (checkpoint_age <= log->max_modified_age_sync) {

		/* Checkpoint age is comfortably small: nothing to signal. */
		goto function_exit;
	}

	oldest_lsn = buf_pool_get_oldest_modification();

	/* Request a preflush/checkpoint when there are no dirty pages or
	one of the async/sync age limits has been crossed. */
	if (!oldest_lsn
	    || lsn - oldest_lsn > log->max_modified_age_sync
	    || checkpoint_age > log->max_checkpoint_age_async) {

		log->check_flush_or_checkpoint = true;
	}
function_exit:

	return(lsn);
}
553 
554 /******************************************************//**
555 Calculates the data capacity of a log group, when the log file headers are not
556 included.
557 @return capacity in bytes */
558 lsn_t
log_group_get_capacity(const log_group_t * group)559 log_group_get_capacity(
560 /*===================*/
561 	const log_group_t*	group)	/*!< in: log group */
562 {
563 	/* The lsn parameters are updated while holding both the mutexes
564 	and it is ok to have either of them while reading */
565 	ut_ad(log_mutex_own() || log_write_mutex_own());
566 
567 	return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files);
568 }
569 
570 /******************************************************//**
571 Calculates the offset within a log group, when the log file headers are not
572 included.
573 @return size offset (<= offset) */
574 UNIV_INLINE
575 lsn_t
log_group_calc_size_offset(lsn_t offset,const log_group_t * group)576 log_group_calc_size_offset(
577 /*=======================*/
578 	lsn_t			offset,	/*!< in: real offset within the
579 					log group */
580 	const log_group_t*	group)	/*!< in: log group */
581 {
582 	/* The lsn parameters are updated while holding both the mutexes
583 	and it is ok to have either of them while reading */
584 	ut_ad(log_mutex_own() || log_write_mutex_own());
585 
586 	return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size));
587 }
588 
589 /******************************************************//**
590 Calculates the offset within a log group, when the log file headers are
591 included.
592 @return real offset (>= offset) */
593 UNIV_INLINE
594 lsn_t
log_group_calc_real_offset(lsn_t offset,const log_group_t * group)595 log_group_calc_real_offset(
596 /*=======================*/
597 	lsn_t			offset,	/*!< in: size offset within the
598 					log group */
599 	const log_group_t*	group)	/*!< in: log group */
600 {
601 	/* The lsn parameters are updated while holding both the mutexes
602 	and it is ok to have either of them while reading */
603 	ut_ad(log_mutex_own() || log_write_mutex_own());
604 
605 	return(offset + LOG_FILE_HDR_SIZE
606 	       * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE)));
607 }
608 
609 /** Calculate the offset of an lsn within a log group.
610 @param[in]	lsn	log sequence number
611 @param[in]	group	log group
612 @return offset within the log group */
613 lsn_t
log_group_calc_lsn_offset(lsn_t lsn,const log_group_t * group)614 log_group_calc_lsn_offset(
615 	lsn_t			lsn,
616 	const log_group_t*	group)
617 {
618 	lsn_t	gr_lsn;
619 	lsn_t	gr_lsn_size_offset;
620 	lsn_t	difference;
621 	lsn_t	group_size;
622 	lsn_t	offset;
623 
624 	/* The lsn parameters are updated while holding both the mutexes
625 	and it is ok to have either of them while reading */
626 	ut_ad(log_mutex_own() || log_write_mutex_own());
627 
628 	gr_lsn = group->lsn;
629 
630 	gr_lsn_size_offset = log_group_calc_size_offset(
631 		group->lsn_offset, group);
632 
633 	group_size = log_group_get_capacity(group);
634 
635 	if (lsn >= gr_lsn) {
636 
637 		difference = lsn - gr_lsn;
638 	} else {
639 		difference = gr_lsn - lsn;
640 
641 		difference = difference % group_size;
642 
643 		difference = group_size - difference;
644 	}
645 
646 	offset = (gr_lsn_size_offset + difference) % group_size;
647 
648 	/* fprintf(stderr,
649 	"Offset is " LSN_PF " gr_lsn_offset is " LSN_PF
650 	" difference is " LSN_PF "\n",
651 	offset, gr_lsn_size_offset, difference);
652 	*/
653 
654 	return(log_group_calc_real_offset(offset, group));
655 }
656 
657 /*******************************************************************//**
658 Calculates where in log files we find a specified lsn.
659 @return log file number */
660 ulint
log_calc_where_lsn_is(int64_t * log_file_offset,ib_uint64_t first_header_lsn,ib_uint64_t lsn,ulint n_log_files,int64_t log_file_size)661 log_calc_where_lsn_is(
662 /*==================*/
663 	int64_t*	log_file_offset,	/*!< out: offset in that file
664 						(including the header) */
665 	ib_uint64_t	first_header_lsn,	/*!< in: first log file start
666 						lsn */
667 	ib_uint64_t	lsn,			/*!< in: lsn whose position to
668 						determine */
669 	ulint		n_log_files,		/*!< in: total number of log
670 						files */
671 	int64_t		log_file_size)		/*!< in: log file size
672 						(including the header) */
673 {
674 	int64_t		capacity	= log_file_size - LOG_FILE_HDR_SIZE;
675 	ulint		file_no;
676 	int64_t		add_this_many;
677 
678 	if (lsn < first_header_lsn) {
679 		add_this_many = 1 + (first_header_lsn - lsn)
680 			/ (capacity * static_cast<int64_t>(n_log_files));
681 		lsn += add_this_many
682 			* capacity * static_cast<int64_t>(n_log_files);
683 	}
684 
685 	ut_a(lsn >= first_header_lsn);
686 
687 	file_no = ((ulint)((lsn - first_header_lsn) / capacity))
688 		% n_log_files;
689 	*log_file_offset = (lsn - first_header_lsn) % capacity;
690 
691 	*log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE;
692 
693 	return(file_no);
694 }
695 
696 
/********************************************************//**
Sets the field values in group to correspond to a given lsn. For this function
to work, the values must already be correctly initialized to correspond to
some lsn, for instance, a checkpoint lsn. */
void
log_group_set_fields(
/*=================*/
	log_group_t*	group,	/*!< in/out: group */
	lsn_t		lsn)	/*!< in: lsn for which the values should be
				set */
{
	/* NOTE: the offset must be computed before group->lsn is updated,
	because log_group_calc_lsn_offset() uses the current group->lsn
	and group->lsn_offset as its reference point. */
	group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
	group->lsn = lsn;
}
711 #ifndef UNIV_HOTBACKUP
/*****************************************************************//**
Calculates the recommended highest values for lsn - last_checkpoint_lsn
and lsn - buf_get_oldest_modification().
@retval true on success
@retval false if the smallest log group is too small to
accommodate the number of OS threads in the database server */
static MY_ATTRIBUTE((warn_unused_result))
bool
log_calc_max_ages(void)
/*===================*/
{
	log_group_t*	group;
	lsn_t		margin;
	ulint		free;
	bool		success	= true;
	lsn_t		smallest_capacity;

	log_mutex_enter();

	group = UT_LIST_GET_FIRST(log_sys->log_groups);

	ut_ad(group);

	smallest_capacity = LSN_MAX;

	/* The limits must hold for every group, so derive them from the
	smallest group capacity. */
	while (group) {
		if (log_group_get_capacity(group) < smallest_capacity) {

			smallest_capacity = log_group_get_capacity(group);
		}

		group = UT_LIST_GET_NEXT(log_groups, group);
	}

	/* Add extra safety */
	smallest_capacity = smallest_capacity - smallest_capacity / 10;

	/* For each OS thread we must reserve so much free space in the
	smallest log group that it can accommodate the log entries produced
	by single query steps: running out of free log space is a serious
	system error which requires rebooting the database. */

	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
		+ LOG_CHECKPOINT_EXTRA_FREE;
	if (free >= smallest_capacity / 2) {
		/* The per-thread reservation would consume at least half of
		the smallest group: refuse to continue. */
		success = false;

		goto failure;
	} else {
		margin = smallest_capacity - free;
	}

	margin = margin - margin / 10;	/* Add still some extra safety */

	log_sys->log_group_capacity = smallest_capacity;

	/* The async limits are reached before the corresponding sync
	limits, triggering background action first. */
	log_sys->max_modified_age_async = margin
		- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
	log_sys->max_modified_age_sync = margin
		- margin / LOG_POOL_PREFLUSH_RATIO_SYNC;

	log_sys->max_checkpoint_age_async = margin - margin
		/ LOG_POOL_CHECKPOINT_RATIO_ASYNC;
	log_sys->max_checkpoint_age = margin;

failure:
	log_mutex_exit();

	if (!success) {
		ib::error() << "Cannot continue operation. ib_logfiles are too"
			" small for innodb_thread_concurrency "
			<< srv_thread_concurrency << ". The combined size of"
			" ib_logfiles should be bigger than"
			" 200 kB * innodb_thread_concurrency. To get mysqld"
			" to start up, set innodb_thread_concurrency in"
			" my.cnf to a lower value, for example, to 8. After"
			" an ERROR-FREE shutdown of mysqld you can adjust"
			" the size of ib_logfiles. " << INNODB_PARAMETERS_MSG;
	}

	return(success);
}
794 
/******************************************************//**
Initializes the log. */
void
log_init(void)
/*==========*/
{
	log_sys = static_cast<log_t*>(ut_zalloc_nokey(sizeof(log_t)));

	mutex_create(LATCH_ID_LOG_SYS, &log_sys->mutex);
	mutex_create(LATCH_ID_LOG_WRITE, &log_sys->write_mutex);

	mutex_create(LATCH_ID_LOG_FLUSH_ORDER, &log_sys->log_flush_order_mutex);

	/* Start the lsn from one log block from zero: this way every
	log record has a start lsn != zero, a fact which we will use */

	log_sys->lsn = LOG_START_LSN;

	/* Sanity limits on the configured buffer size. */
	ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
	ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);

	log_sys->buf_size = LOG_BUFFER_SIZE;

	/* Allocate twice the size plus one block for alignment; the same
	layout is recreated in log_buffer_extend() (cf. first_in_use). */
	log_sys->buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE));
	log_sys->buf = static_cast<byte*>(
		ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	log_sys->first_in_use = true;

	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
		- LOG_BUF_FLUSH_MARGIN;
	log_sys->check_flush_or_checkpoint = true;
	UT_LIST_INIT(log_sys->log_groups, &log_group_t::log_groups);

	log_sys->n_log_ios_old = log_sys->n_log_ios;
	log_sys->last_printout_time = time(NULL);
	/*----------------------------*/

	log_sys->write_lsn = log_sys->lsn;

	log_sys->flush_event = os_event_create(0);

	/* Initially signalled: no flush is in progress. */
	os_event_set(log_sys->flush_event);

	/*----------------------------*/

	log_sys->last_checkpoint_lsn = log_sys->lsn;

	rw_lock_create(
		checkpoint_lock_key, &log_sys->checkpoint_lock,
		SYNC_NO_ORDER_CHECK);

	/* Checkpoint buffer: two blocks allocated so that one aligned
	block is available. */
	log_sys->checkpoint_buf_ptr = static_cast<byte*>(
		ut_zalloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));

	log_sys->checkpoint_buf = static_cast<byte*>(
		ut_align(log_sys->checkpoint_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));

	/*----------------------------*/

	/* Initialize the first log block and account for its header in
	both the free offset and the lsn. */
	log_block_init(log_sys->buf, log_sys->lsn);
	log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);

	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
	log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;

	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
		    log_sys->lsn - log_sys->last_checkpoint_lsn);
}
865 
866 /******************************************************************//**
867 Inits a log group to the log system.
868 @return true if success, false if not */
869 MY_ATTRIBUTE((warn_unused_result))
870 bool
log_group_init(ulint id,ulint n_files,lsn_t file_size,ulint space_id)871 log_group_init(
872 /*===========*/
873 	ulint	id,			/*!< in: group id */
874 	ulint	n_files,		/*!< in: number of log files */
875 	lsn_t	file_size,		/*!< in: log file size in bytes */
876 	ulint	space_id)		/*!< in: space id of the file space
877 					which contains the log files of this
878 					group */
879 {
880 	ulint	i;
881 	log_group_t*	group;
882 
883 	group = static_cast<log_group_t*>(ut_malloc_nokey(sizeof(log_group_t)));
884 
885 	group->id = id;
886 	group->n_files = n_files;
887 	group->format = LOG_HEADER_FORMAT_CURRENT;
888 	group->file_size = file_size;
889 	group->space_id = space_id;
890 	group->state = LOG_GROUP_OK;
891 	group->lsn = LOG_START_LSN;
892 	group->lsn_offset = LOG_FILE_HDR_SIZE;
893 
894 	group->file_header_bufs_ptr = static_cast<byte**>(
895 		ut_zalloc_nokey(sizeof(byte*) * n_files));
896 
897 	group->file_header_bufs = static_cast<byte**>(
898 		ut_zalloc_nokey(sizeof(byte**) * n_files));
899 
900 	for (i = 0; i < n_files; i++) {
901 		group->file_header_bufs_ptr[i] = static_cast<byte*>(
902 			ut_zalloc_nokey(LOG_FILE_HDR_SIZE
903 					+ OS_FILE_LOG_BLOCK_SIZE));
904 
905 		group->file_header_bufs[i] = static_cast<byte*>(
906 			ut_align(group->file_header_bufs_ptr[i],
907 				 OS_FILE_LOG_BLOCK_SIZE));
908 	}
909 
910 	group->checkpoint_buf_ptr = static_cast<byte*>(
911 		ut_zalloc_nokey(2 * OS_FILE_LOG_BLOCK_SIZE));
912 
913 	group->checkpoint_buf = static_cast<byte*>(
914 		ut_align(group->checkpoint_buf_ptr,OS_FILE_LOG_BLOCK_SIZE));
915 
916 	UT_LIST_ADD_LAST(log_sys->log_groups, group);
917 
918 	return(log_calc_max_ages());
919 }
920 #endif /* !UNIV_HOTBACKUP */
/******************************************************//**
Completes an i/o to a log file. */
void
log_io_complete(
/*============*/
	log_group_t*	group)	/*!< in: log group or a dummy pointer */
{
	if ((ulint) group & 0x1UL) {
		/* It was a checkpoint write: the caller tagged the group
		pointer by setting its lowest bit; clear the tag to recover
		the real pointer. */
		group = (log_group_t*)((ulint) group - 1);

#ifdef _WIN32
		fil_flush(group->space_id);
#else
		/* Flush unless the configured method already syncs on
		write (O_DSYNC) or deliberately never syncs (NOSYNC). */
		switch (srv_unix_file_flush_method) {
		case SRV_UNIX_O_DSYNC:
		case SRV_UNIX_NOSYNC:
			break;
		case SRV_UNIX_FSYNC:
		case SRV_UNIX_LITTLESYNC:
		case SRV_UNIX_O_DIRECT:
		case SRV_UNIX_O_DIRECT_NO_FSYNC:
			fil_flush(group->space_id);
		}
#endif /* _WIN32 */

		DBUG_PRINT("ib_log", ("checkpoint info written to group %u",
				      unsigned(group->id)));
		log_io_complete_checkpoint();

		return;
	}

	ut_error;	/*!< We currently use synchronous writing of the
			logs and cannot end up here! */
}
957 
/******************************************************//**
Writes a log file header to a log file space. */
static
void
log_group_file_header_flush(
/*========================*/
	log_group_t*	group,		/*!< in: log group */
	ulint		nth_file,	/*!< in: header to the nth file in the
					log file space */
	lsn_t		start_lsn)	/*!< in: log file data starts at this
					lsn */
{
	byte*	buf;
	lsn_t	dest_offset;

	ut_ad(log_write_mutex_own());
	ut_ad(!recv_no_log_write);
	ut_ad(group->id == 0);
	ut_a(nth_file < group->n_files);

	/* Use the preallocated, block-aligned header buffer of this file. */
	buf = *(group->file_header_bufs + nth_file);

	/* Fill in the header: format version, start lsn, creator string,
	then the block checksum. */
	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
	mach_write_to_4(buf + LOG_HEADER_FORMAT, LOG_HEADER_FORMAT_CURRENT);
	mach_write_to_8(buf + LOG_HEADER_START_LSN, start_lsn);
	strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
	       LOG_HEADER_CREATOR_CURRENT);
	/* The creator field must be large enough for the string. */
	ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR
	      >= sizeof LOG_HEADER_CREATOR_CURRENT);
	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));

	/* The header occupies the very beginning of the nth file. */
	dest_offset = nth_file * group->file_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF
			      " group " ULINTPF
			      " file " ULINTPF " header",
			      start_lsn, group->id, nth_file));

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	srv_stats.os_log_pending_writes.inc();

	/* Convert the byte offset into a page number + in-page offset for
	fil_io(). */
	const ulint	page_no
		= (ulint) (dest_offset / univ_page_size.physical());

	fil_io(IORequestLogWrite, true,
	       page_id_t(group->space_id, page_no),
	       univ_page_size,
	       (ulint) (dest_offset % univ_page_size.physical()),
	       OS_FILE_LOG_BLOCK_SIZE, buf, group);

	srv_stats.os_log_pending_writes.dec();
}
1013 
1014 /******************************************************//**
1015 Stores a 4-byte checksum to the trailer checksum field of a log block
1016 before writing it to a log file. This checksum is used in recovery to
1017 check the consistency of a log block. */
1018 static
1019 void
log_block_store_checksum(byte * block)1020 log_block_store_checksum(
1021 /*=====================*/
1022 	byte*	block)	/*!< in/out: pointer to a log block */
1023 {
1024 	log_block_set_checksum(block, log_block_calc_checksum(block));
1025 }
1026 
1027 /******************************************************//**
1028 Writes a buffer to a log file group. */
1029 static
1030 void
log_group_write_buf(log_group_t * group,byte * buf,ulint len,ulint pad_len,lsn_t start_lsn,ulint new_data_offset)1031 log_group_write_buf(
1032 /*================*/
1033 	log_group_t*	group,		/*!< in: log group */
1034 	byte*		buf,		/*!< in: buffer */
1035 	ulint		len,		/*!< in: buffer len; must be divisible
1036 					by OS_FILE_LOG_BLOCK_SIZE */
1037 #ifdef UNIV_DEBUG
1038 	ulint		pad_len,	/*!< in: pad len in the buffer len */
1039 #endif /* UNIV_DEBUG */
1040 	lsn_t		start_lsn,	/*!< in: start lsn of the buffer; must
1041 					be divisible by
1042 					OS_FILE_LOG_BLOCK_SIZE */
1043 	ulint		new_data_offset)/*!< in: start offset of new data in
1044 					buf: this parameter is used to decide
1045 					if we have to write a new log file
1046 					header */
1047 {
1048 	ulint		write_len;
1049 	bool		write_header	= new_data_offset == 0;
1050 	lsn_t		next_offset;
1051 	ulint		i;
1052 
1053 	ut_ad(log_write_mutex_own());
1054 	ut_ad(!recv_no_log_write);
1055 	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
1056 	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
1057 
1058 loop:
1059 	if (len == 0) {
1060 
1061 		return;
1062 	}
1063 
1064 	next_offset = log_group_calc_lsn_offset(start_lsn, group);
1065 
1066 	if (write_header
1067 	    && next_offset % group->file_size == LOG_FILE_HDR_SIZE) {
1068 		/* We start to write a new log file instance in the group */
1069 
1070 		ut_a(next_offset / group->file_size <= ULINT_MAX);
1071 
1072 		log_group_file_header_flush(group, (ulint)
1073 					    (next_offset / group->file_size),
1074 					    start_lsn);
1075 		srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);
1076 
1077 		srv_stats.log_writes.inc();
1078 	}
1079 
1080 	if ((next_offset % group->file_size) + len > group->file_size) {
1081 
1082 		/* if the above condition holds, then the below expression
1083 		is < len which is ulint, so the typecast is ok */
1084 		write_len = (ulint)
1085 			(group->file_size - (next_offset % group->file_size));
1086 	} else {
1087 		write_len = len;
1088 	}
1089 
1090 	DBUG_PRINT("ib_log",
1091 		   ("write " LSN_PF " to " LSN_PF
1092 		    ": group " ULINTPF " len " ULINTPF
1093 		    " blocks " ULINTPF ".." ULINTPF,
1094 		    start_lsn, next_offset,
1095 		    group->id, write_len,
1096 		    log_block_get_hdr_no(buf),
1097 		    log_block_get_hdr_no(
1098 			    buf + write_len
1099 			    - OS_FILE_LOG_BLOCK_SIZE)));
1100 
1101 	ut_ad(pad_len >= len
1102 	      || log_block_get_hdr_no(buf)
1103 		 == log_block_convert_lsn_to_no(start_lsn));
1104 
1105 	/* Calculate the checksums for each log block and write them to
1106 	the trailer fields of the log blocks */
1107 
1108 	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
1109 		ut_ad(pad_len >= len
1110 		      || i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
1111 		      || log_block_get_hdr_no(
1112 			      buf + i * OS_FILE_LOG_BLOCK_SIZE)
1113 			 == log_block_get_hdr_no(buf) + i);
1114 		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
1115 	}
1116 
1117 	log_sys->n_log_ios++;
1118 
1119 	MONITOR_INC(MONITOR_LOG_IO);
1120 
1121 	srv_stats.os_log_pending_writes.inc();
1122 
1123 	ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
1124 
1125 	const ulint	page_no
1126 		= (ulint) (next_offset / univ_page_size.physical());
1127 
1128 	fil_io(IORequestLogWrite, true,
1129 	       page_id_t(group->space_id, page_no),
1130 	       univ_page_size,
1131 	       (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
1132 	       group);
1133 
1134 	srv_stats.os_log_pending_writes.dec();
1135 
1136 	srv_stats.os_log_written.add(write_len);
1137 	srv_stats.log_writes.inc();
1138 
1139 	if (write_len < len) {
1140 		start_lsn += write_len;
1141 		len -= write_len;
1142 		buf += write_len;
1143 
1144 		write_header = true;
1145 
1146 		goto loop;
1147 	}
1148 }
1149 
1150 /** Flush the log has been written to the log file. */
1151 static
1152 void
log_write_flush_to_disk_low()1153 log_write_flush_to_disk_low()
1154 {
1155 	ut_a(log_sys->n_pending_flushes == 1); /* No other threads here */
1156 
1157 #ifndef _WIN32
1158 	bool	do_flush = srv_unix_file_flush_method != SRV_UNIX_O_DSYNC;
1159 #else
1160 	bool	do_flush = true;
1161 #endif
1162 	if (do_flush) {
1163 		log_group_t*	group = UT_LIST_GET_FIRST(log_sys->log_groups);
1164 		fil_flush(group->space_id);
1165 		log_sys->flushed_to_disk_lsn = log_sys->current_flush_lsn;
1166 	}
1167 
1168 	log_sys->n_pending_flushes--;
1169 	MONITOR_DEC(MONITOR_PENDING_LOG_FLUSH);
1170 
1171 	os_event_set(log_sys->flush_event);
1172 }
1173 
/** Switch the log buffer in use, and copy the content of last block
from old log buffer to the head of the to be used one. Thus, buf_free and
buf_next_to_write would be changed accordingly */
static inline
void
log_buffer_switch()
{
	ut_ad(log_mutex_own());
	ut_ad(log_write_mutex_own());

	const byte*	old_buf = log_sys->buf;
	ulint		area_end = ut_calc_align(log_sys->buf_free,
						 OS_FILE_LOG_BLOCK_SIZE);

	/* log_sys->buf toggles between two halves of buf_ptr, each
	buf_size bytes apart; the debug assertions verify that the
	pointer is at the aligned base when expected. */
	if (log_sys->first_in_use) {
		ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr,
					       OS_FILE_LOG_BLOCK_SIZE));
		log_sys->buf += log_sys->buf_size;
	} else {
		log_sys->buf -= log_sys->buf_size;
		ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr,
					       OS_FILE_LOG_BLOCK_SIZE));
	}

	log_sys->first_in_use = !log_sys->first_in_use;

	/* Copy the last block to new buf */
	ut_memcpy(log_sys->buf,
		  old_buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		  OS_FILE_LOG_BLOCK_SIZE);

	/* Continue appending inside the (possibly partial) last block,
	now located at the head of the new buffer. */
	log_sys->buf_free %= OS_FILE_LOG_BLOCK_SIZE;
	log_sys->buf_next_to_write = log_sys->buf_free;
}
1208 
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
NOTE: acquires and releases both the log mutex and the log write mutex;
may wait on flush_event.
@param[in]	lsn		log sequence number that should be
included in the redo log file write
@param[in]	flush_to_disk	whether the written log should also
be flushed to the file system */
void
log_write_up_to(
	lsn_t	lsn,
	bool	flush_to_disk)
{
#ifdef UNIV_DEBUG
	ulint		loop_count	= 0;
#endif /* UNIV_DEBUG */
	byte*           write_buf;
	lsn_t           write_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_no_ibuf_operations) {
		/* Recovery is running and no operations on the log files are
		allowed yet (the variable name .._no_ibuf_.. is misleading) */

		return;
	}

loop:
	ut_ad(++loop_count < 128);

#if UNIV_WORD_SIZE > 7
	/* We can do a dirty read of LSN. */
	/* NOTE: Currently doesn't do dirty read for
	(flush_to_disk == true) case, because the log_mutex
	contention also works as the arbitrator for write-IO
	(fsync) bandwidth between log files and data files. */
	os_rmb;
	if (!flush_to_disk && log_sys->write_lsn >= lsn) {
		return;
	}
#endif

	log_write_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* The relevant progress marker depends on whether a flush or
	just a write was requested. */
	lsn_t	limit_lsn = flush_to_disk
		? log_sys->flushed_to_disk_lsn
		: log_sys->write_lsn;

	if (limit_lsn >= lsn) {
		log_write_mutex_exit();
		return;
	}

#ifdef _WIN32
# ifndef UNIV_HOTBACKUP
	/* write requests during fil_flush() might not be good for Windows */
	if (log_sys->n_pending_flushes > 0
	    || !os_event_is_set(log_sys->flush_event)) {
		log_write_mutex_exit();
		os_event_wait(log_sys->flush_event);
		goto loop;
	}
# else
	if (log_sys->n_pending_flushes > 0) {
		goto loop;
	}
# endif  /* !UNIV_HOTBACKUP */
#endif /* _WIN32 */

	/* If it is a write call we should just go ahead and do it
	as we checked that write_lsn is not where we'd like it to
	be. If we have to flush as well then we check if there is a
	pending flush and based on that we wait for it to finish
	before proceeding further. */
	if (flush_to_disk
	    && (log_sys->n_pending_flushes > 0
		|| !os_event_is_set(log_sys->flush_event))) {

		/* Figure out if the current flush will do the job
		for us. */
		bool work_done = log_sys->current_flush_lsn >= lsn;

		log_write_mutex_exit();

		os_event_wait(log_sys->flush_event);

		if (work_done) {
			return;
		} else {
			goto loop;
		}
	}

	/* Both mutexes are now held: the log mutex protects buf_free
	and lsn, the write mutex the actual file write. */
	log_mutex_enter();
	if (!flush_to_disk
	    && log_sys->buf_free == log_sys->buf_next_to_write) {
		/* Nothing to write and no flush to disk requested */
		log_mutex_exit_all();
		return;
	}

	log_group_t*	group;
	ulint		start_offset;
	ulint		end_offset;
	ulint		area_start;
	ulint		area_end;
	ulong		write_ahead_size = srv_log_write_ahead_size;
	ulint		pad_size;

	DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
			      log_sys->write_lsn,
			      log_sys->lsn));

	if (flush_to_disk) {
		/* Register this thread as the (single) pending flush and
		block concurrent flushers on flush_event. */
		log_sys->n_pending_flushes++;
		log_sys->current_flush_lsn = log_sys->lsn;
		MONITOR_INC(MONITOR_PENDING_LOG_FLUSH);
		os_event_reset(log_sys->flush_event);

		if (log_sys->buf_free == log_sys->buf_next_to_write) {
			/* Nothing to write, flush only */
			log_mutex_exit_all();
			log_write_flush_to_disk_low();
			return;
		}
	}

	start_offset = log_sys->buf_next_to_write;
	end_offset = log_sys->buf_free;

	/* Round the write area out to whole log blocks. */
	area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE);
	area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE);

	ut_ad(area_end - area_start > 0);

	log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
	log_block_set_checkpoint_no(
		log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
		log_sys->next_checkpoint_no);

	write_lsn = log_sys->lsn;
	write_buf = log_sys->buf;

	/* Switch to the other buffer half so that mtr commits can keep
	appending while we write the old half out. */
	log_buffer_switch();

	group = UT_LIST_GET_FIRST(log_sys->log_groups);

	log_group_set_fields(group, log_sys->write_lsn);

	/* The file write below is protected by the write mutex only. */
	log_mutex_exit();

	/* Calculate pad_size if needed. */
	pad_size = 0;
	if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
		lsn_t	end_offset;
		ulint	end_offset_in_unit;

		end_offset = log_group_calc_lsn_offset(
			ut_uint64_align_up(write_lsn,
					   OS_FILE_LOG_BLOCK_SIZE),
			group);
		end_offset_in_unit = (ulint) (end_offset % write_ahead_size);

		if (end_offset_in_unit > 0
		    && (area_end - area_start) > end_offset_in_unit) {
			/* The first block in the unit was initialized
			after the last writing.
			Needs to be written padded data once. */
			pad_size = write_ahead_size - end_offset_in_unit;

			if (area_end + pad_size > log_sys->buf_size) {
				pad_size = log_sys->buf_size - area_end;
			}

			::memset(write_buf + area_end, 0, pad_size);
		}
	}

	/* Do the write to the log files */
	log_group_write_buf(
		group, write_buf + area_start,
		area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
		pad_size,
#endif /* UNIV_DEBUG */
		ut_uint64_align_down(log_sys->write_lsn,
				     OS_FILE_LOG_BLOCK_SIZE),
		start_offset - area_start);

	srv_stats.log_padded.add(pad_size);

	log_sys->write_lsn = write_lsn;

#ifndef _WIN32
	if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
		/* O_SYNC means the OS did not buffer the log file at all:
		so we have also flushed to disk what we have written */
		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
	}
#endif /* !_WIN32 */

	log_write_mutex_exit();

	if (flush_to_disk) {
		log_write_flush_to_disk_low();
	}
}
1417 
1418 /** write to the log file up to the last log entry.
1419 @param[in]	sync	whether we want the written log
1420 also to be flushed to disk. */
1421 void
log_buffer_flush_to_disk(bool sync)1422 log_buffer_flush_to_disk(
1423 	bool sync)
1424 {
1425 	ut_ad(!srv_read_only_mode);
1426 	log_write_up_to(log_get_lsn(), sync);
1427 }
1428 
1429 /****************************************************************//**
1430 This functions writes the log buffer to the log file and if 'flush'
1431 is set it forces a flush of the log file as well. This is meant to be
1432 called from background master thread only as it does not wait for
1433 the write (+ possible flush) to finish. */
1434 void
log_buffer_sync_in_background(bool flush)1435 log_buffer_sync_in_background(
1436 /*==========================*/
1437 	bool	flush)	/*!< in: flush the logs to disk */
1438 {
1439 	lsn_t	lsn;
1440 
1441 	log_mutex_enter();
1442 
1443 	lsn = log_sys->lsn;
1444 
1445 	if (flush
1446 	    && log_sys->n_pending_flushes > 0
1447 	    && log_sys->current_flush_lsn >= lsn) {
1448 		/* The write + flush will write enough */
1449 		log_mutex_exit();
1450 		return;
1451 	}
1452 
1453 	log_mutex_exit();
1454 
1455 	log_write_up_to(lsn, flush);
1456 }
1457 
1458 /********************************************************************
1459 
1460 Tries to establish a big enough margin of free space in the log buffer, such
1461 that a new log entry can be catenated without an immediate need for a flush. */
1462 static
1463 void
log_flush_margin(void)1464 log_flush_margin(void)
1465 /*==================*/
1466 {
1467 	log_t*	log	= log_sys;
1468 	lsn_t	lsn	= 0;
1469 
1470 	log_mutex_enter();
1471 
1472 	if (log->buf_free > log->max_buf_free) {
1473 		/* We can write during flush */
1474 		lsn = log->lsn;
1475 	}
1476 
1477 	log_mutex_exit();
1478 
1479 	if (lsn) {
1480 		log_write_up_to(lsn, false);
1481 	}
1482 }
1483 #ifndef UNIV_HOTBACKUP
/** Advances the smallest lsn for which there are unflushed dirty blocks in the
buffer pool.
NOTE: this function may only be called if the calling thread owns no
synchronization objects!
@param[in]	new_oldest	try to advance oldest_modified_lsn at least to
this lsn
@return false if there was a flush batch of the same type running,
which means that we could not start this flush batch */
static
bool
log_preflush_pool_modified_pages(
	lsn_t			new_oldest)
{
	bool	success;

	if (recv_recovery_on) {
		/* If the recovery is running, we must first apply all
		log records to their respective file pages to get the
		right modify lsn values to these pages: otherwise, there
		might be pages on disk which are not yet recovered to the
		current lsn, and even after calling this function, we could
		not know how up-to-date the disk version of the database is,
		and we could not make a new checkpoint on the basis of the
		info on the buffer pool only. */

		recv_apply_hashed_log_recs(TRUE);
	}

	if (new_oldest == LSN_MAX
	    || !buf_page_cleaner_is_active
	    || srv_is_being_started) {

		/* Flush synchronously in this thread: either everything
		was requested, or the page cleaner is not available. */
		ulint	n_pages;

		success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages);

		buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);

		if (!success) {
			MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
		}

		MONITOR_INC_VALUE_CUMULATIVE(
			MONITOR_FLUSH_SYNC_TOTAL_PAGE,
			MONITOR_FLUSH_SYNC_COUNT,
			MONITOR_FLUSH_SYNC_PAGES,
			n_pages);
	} else {
		/* better to wait for flushed by page cleaner */

		if (srv_flush_sync) {
			/* wake page cleaner for IO burst */
			buf_flush_request_force(new_oldest);
		}

		buf_flush_wait_flushed(new_oldest);

		success = true;
	}

	return(success);
}
1546 #endif /* !UNIV_HOTBACKUP */
1547 /******************************************************//**
1548 Completes a checkpoint. */
1549 static
1550 void
log_complete_checkpoint(void)1551 log_complete_checkpoint(void)
1552 /*=========================*/
1553 {
1554 	ut_ad(log_mutex_own());
1555 	ut_ad(log_sys->n_pending_checkpoint_writes == 0);
1556 
1557 	log_sys->next_checkpoint_no++;
1558 
1559 	log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
1560 	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
1561 		    log_sys->lsn - log_sys->last_checkpoint_lsn);
1562 
1563 	DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
1564 			      ", flushed to " LSN_PF,
1565 			      log_sys->last_checkpoint_lsn,
1566 			      log_sys->flushed_to_disk_lsn));
1567 
1568 	rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
1569 }
1570 
1571 /******************************************************//**
1572 Completes an asynchronous checkpoint info write i/o to a log file. */
1573 static
1574 void
log_io_complete_checkpoint(void)1575 log_io_complete_checkpoint(void)
1576 /*============================*/
1577 {
1578 	MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
1579 
1580 	log_mutex_enter();
1581 
1582 	ut_ad(log_sys->n_pending_checkpoint_writes > 0);
1583 
1584 	if (--log_sys->n_pending_checkpoint_writes == 0) {
1585 		log_complete_checkpoint();
1586 	}
1587 
1588 	log_mutex_exit();
1589 }
1590 
1591 /******************************************************//**
1592 Writes the checkpoint info to a log group header. */
1593 static
1594 void
log_group_checkpoint(log_group_t * group)1595 log_group_checkpoint(
1596 /*=================*/
1597 	log_group_t*	group)	/*!< in: log group */
1598 {
1599 	lsn_t		lsn_offset;
1600 	byte*		buf;
1601 
1602 	ut_ad(!srv_read_only_mode);
1603 	ut_ad(log_mutex_own());
1604 #if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE
1605 # error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE"
1606 #endif
1607 
1608 	DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
1609 			      " written to group " ULINTPF,
1610 			      log_sys->next_checkpoint_no,
1611 			      log_sys->next_checkpoint_lsn,
1612 			      group->id));
1613 
1614 	buf = group->checkpoint_buf;
1615 	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
1616 
1617 	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
1618 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
1619 
1620 	lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
1621 					       group);
1622 	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
1623 	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
1624 
1625 	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
1626 
1627 	MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);
1628 
1629 	log_sys->n_log_ios++;
1630 
1631 	MONITOR_INC(MONITOR_LOG_IO);
1632 
1633 	ut_ad(LOG_CHECKPOINT_1 < univ_page_size.physical());
1634 	ut_ad(LOG_CHECKPOINT_2 < univ_page_size.physical());
1635 
1636 	if (log_sys->n_pending_checkpoint_writes++ == 0) {
1637 		rw_lock_x_lock_gen(&log_sys->checkpoint_lock,
1638 				   LOG_CHECKPOINT);
1639 	}
1640 
1641 	/* Note: We alternate the physical place of the checkpoint info.
1642 	See the (next_checkpoint_no & 1) below. */
1643 
1644 	/* We send as the last parameter the group machine address
1645 	added with 1, as we want to distinguish between a normal log
1646 	file write and a checkpoint field write */
1647 
1648 	fil_io(IORequestLogWrite, false,
1649 	       page_id_t(group->space_id, 0),
1650 	       univ_page_size,
1651 	       (log_sys->next_checkpoint_no & 1)
1652 	       ? LOG_CHECKPOINT_2 : LOG_CHECKPOINT_1,
1653 	       OS_FILE_LOG_BLOCK_SIZE,
1654 	       buf, (byte*) group + 1);
1655 
1656 	ut_ad(((ulint) group & 0x1UL) == 0);
1657 }
1658 
1659 #ifdef UNIV_HOTBACKUP
1660 /******************************************************//**
1661 Writes info to a buffer of a log group when log files are created in
1662 backup restoration. */
1663 void
log_reset_first_header_and_checkpoint(byte * hdr_buf,ib_uint64_t start)1664 log_reset_first_header_and_checkpoint(
1665 /*==================================*/
1666 	byte*		hdr_buf,/*!< in: buffer which will be written to the
1667 				start of the first log file */
1668 	ib_uint64_t	start)	/*!< in: lsn of the start of the first log file;
1669 				we pretend that there is a checkpoint at
1670 				start + LOG_BLOCK_HDR_SIZE */
1671 {
1672 	byte*		buf;
1673 	ib_uint64_t	lsn;
1674 
1675 	mach_write_to_4(hdr_buf + LOG_HEADER_FORMAT,
1676 			LOG_HEADER_FORMAT_CURRENT);
1677 	mach_write_to_8(hdr_buf + LOG_HEADER_START_LSN, start);
1678 
1679 	lsn = start + LOG_BLOCK_HDR_SIZE;
1680 
1681 	/* Write the label of mysqlbackup --restore */
1682 	strcpy((char*)hdr_buf + LOG_HEADER_CREATOR, LOG_HEADER_CREATOR_CURRENT);
1683 	ut_sprintf_timestamp((char*) hdr_buf
1684 			     + (LOG_HEADER_CREATOR
1685 			     + (sizeof LOG_HEADER_CREATOR_CURRENT) - 1));
1686 	buf = hdr_buf + LOG_CHECKPOINT_1;
1687 	memset(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
1688 
1689 	/*mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0);*/
1690 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);
1691 
1692 	mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET,
1693 			LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
1694 	mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);
1695 
1696 	log_block_set_checksum(buf, log_block_calc_checksum_crc32(buf));
1697 }
1698 #endif /* UNIV_HOTBACKUP */
1699 
1700 #ifndef UNIV_HOTBACKUP
1701 /** Read a log group header page to log_sys->checkpoint_buf.
1702 @param[in]	group	log group
1703 @param[in]	header	0 or LOG_CHEKCPOINT_1 or LOG_CHECKPOINT2 */
1704 void
log_group_header_read(const log_group_t * group,ulint header)1705 log_group_header_read(
1706 	const log_group_t*	group,
1707 	ulint			header)
1708 {
1709 	ut_ad(log_mutex_own());
1710 
1711 	log_sys->n_log_ios++;
1712 
1713 	MONITOR_INC(MONITOR_LOG_IO);
1714 
1715 	fil_io(IORequestLogRead, true,
1716 	       page_id_t(group->space_id, header / univ_page_size.physical()),
1717 	       univ_page_size, header % univ_page_size.physical(),
1718 	       OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
1719 }
1720 
/** Write checkpoint info to the log header and invoke log_mutex_exit().
NOTE: the caller must hold the log mutex on entry; this function
releases it in all cases.
@param[in]	sync	whether to wait for the write to complete */
void
log_write_checkpoint_info(
	bool	sync)
{
	log_group_t*	group;

	ut_ad(log_mutex_own());

	if (!srv_read_only_mode) {
		/* Issue the checkpoint write to every log group. */
		for (group = UT_LIST_GET_FIRST(log_sys->log_groups);
		     group;
		     group = UT_LIST_GET_NEXT(log_groups, group)) {

			log_group_checkpoint(group);
		}
	}

	log_mutex_exit();

	MONITOR_INC(MONITOR_NUM_CHECKPOINT);

	if (sync) {
		/* Wait for the checkpoint write to complete */
		rw_lock_s_lock(&log_sys->checkpoint_lock);
		rw_lock_s_unlock(&log_sys->checkpoint_lock);

		DEBUG_SYNC_C("checkpoint_completed");

		DBUG_EXECUTE_IF(
			"crash_after_checkpoint",
			DBUG_SUICIDE(););
	}
}
1756 
1757 /** Set extra data to be written to the redo log during checkpoint.
1758 @param[in]	buf	data to be appended on checkpoint, or NULL
1759 @return pointer to previous data to be appended on checkpoint */
1760 mtr_buf_t*
log_append_on_checkpoint(mtr_buf_t * buf)1761 log_append_on_checkpoint(
1762 	mtr_buf_t*	buf)
1763 {
1764 	log_mutex_enter();
1765 	mtr_buf_t*	old = log_sys->append_on_checkpoint;
1766 	log_sys->append_on_checkpoint = buf;
1767 	log_mutex_exit();
1768 	return(old);
1769 }
1770 
/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log files. Use log_make_checkpoint_at() to flush also the pool.
@param[in]	sync		whether to wait for the write to complete
@param[in]	write_always	force a write even if no log
has been generated since the latest checkpoint
@return true if success, false if a checkpoint write was already running */
bool
log_checkpoint(
	bool	sync,
	bool	write_always)
{
	lsn_t	oldest_lsn;

	ut_ad(!srv_read_only_mode);

	if (recv_recovery_is_on()) {
		recv_apply_hashed_log_recs(TRUE);
	}

#ifndef _WIN32
	/* Make sure data files are flushed before the checkpoint is
	written, except when NOSYNC explicitly opts out. */
	switch (srv_unix_file_flush_method) {
	case SRV_UNIX_NOSYNC:
		break;
	case SRV_UNIX_O_DSYNC:
	case SRV_UNIX_FSYNC:
	case SRV_UNIX_LITTLESYNC:
	case SRV_UNIX_O_DIRECT:
	case SRV_UNIX_O_DIRECT_NO_FSYNC:
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
	}
#endif /* !_WIN32 */

	log_mutex_enter();

	ut_ad(!recv_no_log_write);
	oldest_lsn = log_buf_pool_get_oldest_modification();

	/* Because log also contains headers and dummy log records,
	log_buf_pool_get_oldest_modification() will return log_sys->lsn
	if the buffer pool contains no dirty buffers.
	We must make sure that the log is flushed up to that lsn.
	If there are dirty buffers in the buffer pool, then our
	write-ahead-logging algorithm ensures that the log has been
	flushed up to oldest_lsn. */

	ut_ad(oldest_lsn >= log_sys->last_checkpoint_lsn);
	if (!write_always
	    && oldest_lsn
	    <= log_sys->last_checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT) {
		/* Do nothing, because nothing was logged (other than
		a MLOG_CHECKPOINT marker) since the previous checkpoint. */
		log_mutex_exit();
		return(true);
	}

	/* Repeat the MLOG_FILE_NAME records after the checkpoint, in
	case some log records between the checkpoint and log_sys->lsn
	need them. Finally, write a MLOG_CHECKPOINT marker. Redo log
	apply expects to see a MLOG_CHECKPOINT after the checkpoint,
	except on clean shutdown, where the log will be empty after
	the checkpoint.

	It is important that we write out the redo log before any
	further dirty pages are flushed to the tablespace files.  At
	this point, because log_mutex_own(), mtr_commit() in other
	threads will be blocked, and no pages can be added to the
	flush lists. */
	lsn_t		flush_lsn	= oldest_lsn;
	const bool	do_write
		= srv_shutdown_state == SRV_SHUTDOWN_NONE
		|| flush_lsn != log_sys->lsn;

	if (fil_names_clear(flush_lsn, do_write)) {
		ut_ad(log_sys->lsn >= flush_lsn + SIZE_OF_MLOG_CHECKPOINT);
		flush_lsn = log_sys->lsn;
	}

	log_mutex_exit();

	/* Write and flush the log up to the checkpoint lsn before
	recording the checkpoint. */
	log_write_up_to(flush_lsn, true);

	DBUG_EXECUTE_IF(
		"using_wa_checkpoint_middle",
		if (write_always) {
			DEBUG_SYNC_C("wa_checkpoint_middle");

			const my_bool b = TRUE;
			buf_flush_page_cleaner_disabled_debug_update(
				NULL, NULL, NULL, &b);
			dict_stats_disabled_debug_update(
				NULL, NULL, NULL, &b);
			srv_master_thread_disabled_debug_update(
				NULL, NULL, NULL, &b);
		});

	log_mutex_enter();

	ut_ad(log_sys->flushed_to_disk_lsn >= flush_lsn);
	ut_ad(flush_lsn >= oldest_lsn);

	/* While the mutex was released, another thread may already
	have made an equal or newer checkpoint. */
	if (log_sys->last_checkpoint_lsn >= oldest_lsn) {
		log_mutex_exit();
		return(true);
	}

	if (log_sys->n_pending_checkpoint_writes > 0) {
		/* A checkpoint write is running */
		log_mutex_exit();

		if (sync) {
			/* Wait for the checkpoint write to complete */
			rw_lock_s_lock(&log_sys->checkpoint_lock);
			rw_lock_s_unlock(&log_sys->checkpoint_lock);
		}

		return(false);
	}

	log_sys->next_checkpoint_lsn = oldest_lsn;
	/* This releases the log mutex. */
	log_write_checkpoint_info(sync);
	ut_ad(!log_mutex_own());

	return(true);
}
1897 
1898 /** Make a checkpoint at or after a specified LSN.
1899 @param[in]	lsn		the log sequence number, or LSN_MAX
1900 for the latest LSN
1901 @param[in]	write_always	force a write even if no log
1902 has been generated since the latest checkpoint */
1903 void
log_make_checkpoint_at(lsn_t lsn,bool write_always)1904 log_make_checkpoint_at(
1905 	lsn_t			lsn,
1906 	bool			write_always)
1907 {
1908 	/* Preflush pages synchronously */
1909 
1910 	while (!log_preflush_pool_modified_pages(lsn)) {
1911 		/* Flush as much as we can */
1912 	}
1913 
1914 	while (!log_checkpoint(true, write_always)) {
1915 		/* Force a checkpoint */
1916 	}
1917 }
1918 
/****************************************************************//**
Tries to establish a big enough margin of free space in the log groups, such
that a new log entry can be catenated without an immediate need for a
checkpoint. NOTE: this function may only be called if the calling thread
owns no synchronization objects!

Two margins are maintained in one pass:
1) a "preflush" margin: if the oldest dirty buffer-pool page is too far
   behind the current lsn, modified pages are flushed synchronously;
2) a "checkpoint" margin: if the last checkpoint is too old, a checkpoint
   is made (synchronously if urgent, else asynchronously).
The function retries from 'loop' until neither action is pending. */
static
void
log_checkpoint_margin(void)
/*=======================*/
{
	log_t*		log		= log_sys;
	lsn_t		age;		/* lsn distance from the oldest
					unflushed modification to log->lsn */
	lsn_t		checkpoint_age;	/* lsn distance from the last
					checkpoint to log->lsn */
	ib_uint64_t	advance;	/* how far the oldest modification
					must advance via preflushing;
					0 means no preflush needed */
	lsn_t		oldest_lsn;
	bool		success;
loop:
	advance = 0;

	log_mutex_enter();
	ut_ad(!recv_no_log_write);

	/* Fast path: another thread already established the margins. */
	if (!log->check_flush_or_checkpoint) {
		log_mutex_exit();
		return;
	}

	oldest_lsn = log_buf_pool_get_oldest_modification();

	age = log->lsn - oldest_lsn;

	if (age > log->max_modified_age_sync) {

		/* A flush is urgent: we have to do a synchronous preflush */
		advance = age - log->max_modified_age_sync;
	}

	checkpoint_age = log->lsn - log->last_checkpoint_lsn;

	bool	checkpoint_sync;
	bool	do_checkpoint;

	if (checkpoint_age > log->max_checkpoint_age) {
		/* A checkpoint is urgent: we do it synchronously.
		Note: the flag is deliberately left set so that other
		threads keep coming here until the margin is restored. */
		checkpoint_sync = true;
		do_checkpoint = true;
	} else if (checkpoint_age > log->max_checkpoint_age_async) {
		/* A checkpoint is not urgent: do it asynchronously */
		do_checkpoint = true;
		checkpoint_sync = false;
		log->check_flush_or_checkpoint = false;
	} else {
		do_checkpoint = false;
		checkpoint_sync = false;
		log->check_flush_or_checkpoint = false;
	}

	log_mutex_exit();

	if (advance) {
		lsn_t	new_oldest = oldest_lsn + advance;

		success = log_preflush_pool_modified_pages(new_oldest);

		/* If the flush succeeded, this thread has done its part
		and can proceed. If it did not succeed, there was another
		thread doing a flush at the same time. */
		if (!success) {
			/* Re-arm the flag (it may have been cleared above)
			and retry: the margin is not yet guaranteed. */
			log_mutex_enter();

			log->check_flush_or_checkpoint = true;

			log_mutex_exit();
			goto loop;
		}
	}

	if (do_checkpoint) {
		log_checkpoint(checkpoint_sync, FALSE);

		if (checkpoint_sync) {

			/* The urgent case: re-check the margins from
			scratch after the synchronous checkpoint. */
			goto loop;
		}
	}
}
2005 
/******************************************************//**
Reads a specified log segment to a buffer. The read of the lsn range
[start_lsn, end_lsn) may be split into several fil_io() calls when the
range crosses a log-file boundary within the group. The caller must hold
the log mutex. */
void
log_group_read_log_seg(
/*===================*/
	byte*		buf,		/*!< in: buffer where to read */
	log_group_t*	group,		/*!< in: log group */
	lsn_t		start_lsn,	/*!< in: read area start */
	lsn_t		end_lsn)	/*!< in: read area end */
{
	ulint	len;
	lsn_t	source_offset;

	ut_ad(log_mutex_own());

loop:
	/* Map the lsn to a byte offset within the (circular) log group. */
	source_offset = log_group_calc_lsn_offset(start_lsn, group);

	ut_a(end_lsn - start_lsn <= ULINT_MAX);
	len = (ulint) (end_lsn - start_lsn);

	ut_ad(len != 0);

	/* Do not read past the end of the current log file: clamp the
	length to the file boundary and iterate for the remainder. */
	if ((source_offset % group->file_size) + len > group->file_size) {

		/* If the above condition is true then len (which is ulint)
		is > the expression below, so the typecast is ok */
		len = (ulint) (group->file_size -
			(source_offset % group->file_size));
	}

	log_sys->n_log_ios++;

	MONITOR_INC(MONITOR_LOG_IO);

	/* NOTE(review): this assertion uses UNIV_PAGE_SIZE while the page
	number below is computed with univ_page_size.physical(); these are
	presumably equal for redo log pages -- confirm. */
	ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX);

	const ulint	page_no
		= (ulint) (source_offset / univ_page_size.physical());

	/* Synchronous read of 'len' bytes starting at the in-page offset. */
	fil_io(IORequestLogRead, true,
	       page_id_t(group->space_id, page_no),
	       univ_page_size,
	       (ulint) (source_offset % univ_page_size.physical()),
	       len, buf, NULL);

	start_lsn += len;
	buf += len;

	if (start_lsn != end_lsn) {

		goto loop;
	}
}
2060 
2061 /**
2062 Checks that there is enough free space in the log to start a new query step.
2063 Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
2064 function may only be called if the calling thread owns no synchronization
2065 objects! */
2066 void
log_check_margins(void)2067 log_check_margins(void)
2068 {
2069 	bool	check;
2070 
2071 	do {
2072 		log_flush_margin();
2073 		log_checkpoint_margin();
2074 		log_mutex_enter();
2075 		ut_ad(!recv_no_log_write);
2076 		check = log_sys->check_flush_or_checkpoint;
2077 		log_mutex_exit();
2078 	} while (check);
2079 }
2080 
/****************************************************************//**
Makes a checkpoint at the latest lsn and writes it to first page of each
data file in the database, so that we know that the file spaces contain
all modifications up to that lsn. This can only be called at database
shutdown. This function also writes all log in log files to the log archive.

High-level flow:
1) wait for recovered-transaction rollback (slow shutdown only);
2) enter SRV_SHUTDOWN_CLEANUP and wait until monitor threads, active
   transactions and background threads are gone/suspended;
3) enter SRV_SHUTDOWN_FLUSH_PHASE and wait for the page cleaner,
   pending checkpoint/log writes and buffer page I/O;
4) for innodb_fast_shutdown=2: just flush the redo log and return
   (crash-recovery will run at next startup);
5) otherwise checkpoint at LSN_MAX, verify the log is fully quiesced,
   flush file spaces, stamp the flushed lsn to data files and close
   all files. */
void
logs_empty_and_mark_files_at_shutdown(void)
/*=======================================*/
{
	lsn_t			lsn;
	ulint			count = 0;	/* 100ms ticks waited; used to
						rate-limit progress messages
						to roughly once a minute */
	ulint			total_trx;
	ulint			pending_io;
	enum srv_thread_type	active_thd;
	const char*		thread_name;

	ib::info() << "Starting shutdown...";

	while (srv_fast_shutdown == 0 && trx_rollback_or_clean_is_active) {
		/* we should wait until rollback after recovery end
		for slow shutdown */
		os_thread_sleep(100000);
	}

	/* Wait until the master thread and all other operations are idle: our
	algorithm only works if the server is idle at shutdown */

	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
loop:
	os_thread_sleep(100000);

	count++;

	/* We need the monitor threads to stop before we proceed with
	a shutdown. */

	thread_name = srv_any_background_threads_are_active();

	if (thread_name != NULL) {
		/* Print a message every 60 seconds if we are waiting
		for the monitor thread to exit. Master and worker
		threads check will be done later. */

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << thread_name
				<< " to exit";
			count = 0;
		}

		goto loop;
	}

	/* Check that there are no longer transactions, except for
	PREPARED ones. We need this wait even for the 'very fast'
	shutdown, because the InnoDB layer may have committed or
	prepared transactions and we don't want to lose them. */

	total_trx = trx_sys_any_active_transactions();

	if (total_trx > 0) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << total_trx << " active"
				<< " transactions to finish";

			count = 0;
		}

		goto loop;
	}

	/* Check that the background threads are suspended */

	active_thd = srv_get_active_thread_type();

	if (active_thd != SRV_NONE) {

		if (active_thd == SRV_PURGE) {
			/* Wake the purge threads so they can notice the
			shutdown state and suspend themselves. */
			srv_purge_wakeup();
		}

		/* The srv_lock_timeout_thread, srv_error_monitor_thread
		and srv_monitor_thread should already exit by now. The
		only threads to be suspended are the master threads
		and worker threads (purge threads). Print the thread
		type if any of such threads not in suspended mode */
		if (srv_print_verbose_log && count > 600) {
			const char*	thread_type = "<null>";

			switch (active_thd) {
			case SRV_NONE:
				/* This shouldn't happen because we've
				already checked for this case before
				entering the if(). We handle it here
				to avoid a compiler warning. */
				ut_error;
			case SRV_WORKER:
				thread_type = "worker threads";
				break;
			case SRV_MASTER:
				thread_type = "master thread";
				break;
			case SRV_PURGE:
				thread_type = "purge thread";
				break;
			}

			ib::info() << "Waiting for " << thread_type
				<< " to be suspended";

			count = 0;
		}

		goto loop;
	}

	/* At this point only page_cleaner should be active. We wait
	here to let it complete the flushing of the buffer pools
	before proceeding further. */
	srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
	count = 0;
	while (buf_page_cleaner_is_active) {
		++count;
		os_thread_sleep(100000);
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for page_cleaner to"
				" finish flushing of buffer pool";
			count = 0;
		}
	}

	/* Snapshot the pending log I/O counters under the log mutex. */
	log_mutex_enter();
	const ulint	n_write	= log_sys->n_pending_checkpoint_writes;
	const ulint	n_flush	= log_sys->n_pending_flushes;
	log_mutex_exit();

	if (n_write != 0 || n_flush != 0) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Pending checkpoint_writes: " << n_write
				<< ". Pending log flush writes: " << n_flush;
			count = 0;
		}
		goto loop;
	}

	pending_io = buf_pool_check_no_pending_io();

	if (pending_io) {
		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for " << pending_io << " buffer"
				" page I/Os to complete";
			count = 0;
		}

		goto loop;
	}

	if (srv_fast_shutdown == 2) {
		if (!srv_read_only_mode) {
			ib::info() << "MySQL has requested a very fast"
				" shutdown without flushing the InnoDB buffer"
				" pool to data files. At the next mysqld"
				" startup InnoDB will do a crash recovery!";

			/* In this fastest shutdown we do not flush the
			buffer pool:

			it is essentially a 'crash' of the InnoDB server.
			Make sure that the log is all flushed to disk, so
			that we can recover all committed transactions in
			a crash recovery. We must not write the lsn stamps
			to the data files, since at a startup InnoDB deduces
			from the stamps if the previous shutdown was clean. */

			log_buffer_flush_to_disk();

			/* Check that the background threads stay suspended */
			thread_name = srv_any_background_threads_are_active();

			if (thread_name != NULL) {
				ib::warn() << "Background thread "
					<< thread_name << " woke up during"
					" shutdown";
				goto loop;
			}
		}

		srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

		fil_close_all_files();

		thread_name = srv_any_background_threads_are_active();

		ut_a(!thread_name);

		return;
	}

	if (!srv_read_only_mode) {
		/* Synchronous checkpoint at the current lsn. */
		log_make_checkpoint_at(LSN_MAX, TRUE);
	}

	log_mutex_enter();

	lsn = log_sys->lsn;

	ut_ad(lsn >= log_sys->last_checkpoint_lsn);

	log_mutex_exit();

	/** If innodb_force_recovery is set to 6 then log_sys doesn't
	have recent checkpoint information. So last checkpoint lsn
	will never be equal to current lsn. */
	/* NOTE(review): last_checkpoint_lsn is read below without holding
	the log mutex -- presumably safe because all log-writing threads
	are suspended at this point; confirm. */
	const bool	is_last = ((srv_force_recovery == SRV_FORCE_NO_LOG_REDO
				    && lsn == log_sys->last_checkpoint_lsn
						+ LOG_BLOCK_HDR_SIZE)
				   || lsn == log_sys->last_checkpoint_lsn);

	if (!is_last) {
		/* Some log was still generated after the checkpoint:
		start the wait loop over. */
		goto loop;
	}

	/* Check that the background threads stay suspended */
	thread_name = srv_any_background_threads_are_active();
	if (thread_name != NULL) {
		ib::warn() << "Background thread " << thread_name << " woke up"
			" during shutdown";

		goto loop;
	}

	if (!srv_read_only_mode) {
		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
		fil_flush_file_spaces(FIL_TYPE_LOG);
	}

	/* The call fil_write_flushed_lsn() will bypass the buffer
	pool: therefore it is essential that the buffer pool has been
	completely flushed to disk! (We do not call fil_write... if the
	'very fast' shutdown is enabled.) */

	if (!buf_all_freed()) {

		if (srv_print_verbose_log && count > 600) {
			ib::info() << "Waiting for dirty buffer pages to be"
				" flushed";
			count = 0;
		}

		goto loop;
	}

	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;

	/* Make some checks that the server really is quiet */
	srv_thread_type	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	bool	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);

	if (lsn < srv_start_lsn) {
		ib::error() << "Log sequence number at shutdown " << lsn
			<< " is lower than at startup " << srv_start_lsn
			<< "!";
	}

	srv_shutdown_lsn = lsn;

	if (!srv_read_only_mode) {
		/* Stamp the flushed lsn into the data-file headers so the
		next startup can detect the shutdown was clean. */
		fil_write_flushed_lsn(lsn);
	}

	fil_close_all_files();

	/* Make some checks that the server really is quiet */
	type = srv_get_active_thread_type();
	ut_a(type == SRV_NONE);

	freed = buf_all_freed();
	ut_a(freed);

	ut_a(lsn == log_sys->lsn);
}
2367 
2368 /******************************************************//**
2369 Peeks the current lsn.
2370 @return TRUE if success, FALSE if could not get the log system mutex */
2371 ibool
log_peek_lsn(lsn_t * lsn)2372 log_peek_lsn(
2373 /*=========*/
2374 	lsn_t*	lsn)	/*!< out: if returns TRUE, current lsn is here */
2375 {
2376 	if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
2377 		*lsn = log_sys->lsn;
2378 
2379 		log_mutex_exit();
2380 
2381 		return(TRUE);
2382 	}
2383 
2384 	return(FALSE);
2385 }
2386 
2387 /******************************************************//**
2388 Prints info of the log. */
2389 void
log_print(FILE * file)2390 log_print(
2391 /*======*/
2392 	FILE*	file)	/*!< in: file where to print */
2393 {
2394 	double	time_elapsed;
2395 	time_t	current_time;
2396 
2397 	log_mutex_enter();
2398 
2399 	fprintf(file,
2400 		"Log sequence number " LSN_PF "\n"
2401 		"Log flushed up to   " LSN_PF "\n"
2402 		"Pages flushed up to " LSN_PF "\n"
2403 		"Last checkpoint at  " LSN_PF "\n",
2404 		log_sys->lsn,
2405 		log_sys->flushed_to_disk_lsn,
2406 		log_buf_pool_get_oldest_modification(),
2407 		log_sys->last_checkpoint_lsn);
2408 
2409 	current_time = time(NULL);
2410 
2411 	time_elapsed = difftime(current_time,
2412 				log_sys->last_printout_time);
2413 
2414 	if (time_elapsed <= 0) {
2415 		time_elapsed = 1;
2416 	}
2417 
2418 	fprintf(file,
2419 		ULINTPF " pending log flushes, "
2420 		ULINTPF " pending chkp writes\n"
2421 		ULINTPF " log i/o's done, %.2f log i/o's/second\n",
2422 		log_sys->n_pending_flushes,
2423 		log_sys->n_pending_checkpoint_writes,
2424 		log_sys->n_log_ios,
2425 		static_cast<double>(
2426 			log_sys->n_log_ios - log_sys->n_log_ios_old)
2427 		/ time_elapsed);
2428 
2429 	log_sys->n_log_ios_old = log_sys->n_log_ios;
2430 	log_sys->last_printout_time = current_time;
2431 
2432 	log_mutex_exit();
2433 }
2434 
2435 /**********************************************************************//**
2436 Refreshes the statistics used to print per-second averages. */
2437 void
log_refresh_stats(void)2438 log_refresh_stats(void)
2439 /*===================*/
2440 {
2441 	log_sys->n_log_ios_old = log_sys->n_log_ios;
2442 	log_sys->last_printout_time = time(NULL);
2443 }
2444 
2445 /********************************************************//**
2446 Closes a log group. */
2447 static
2448 void
log_group_close(log_group_t * group)2449 log_group_close(
2450 /*===========*/
2451 	log_group_t*	group)		/* in,own: log group to close */
2452 {
2453 	ulint	i;
2454 
2455 	for (i = 0; i < group->n_files; i++) {
2456 		ut_free(group->file_header_bufs_ptr[i]);
2457 	}
2458 
2459 	ut_free(group->file_header_bufs_ptr);
2460 	ut_free(group->file_header_bufs);
2461 	ut_free(group->checkpoint_buf_ptr);
2462 	ut_free(group);
2463 }
2464 
2465 /********************************************************//**
2466 Closes all log groups. */
2467 void
log_group_close_all(void)2468 log_group_close_all(void)
2469 /*=====================*/
2470 {
2471 	log_group_t*	group;
2472 
2473 	group = UT_LIST_GET_FIRST(log_sys->log_groups);
2474 
2475 	while (UT_LIST_GET_LEN(log_sys->log_groups) > 0) {
2476 		log_group_t*	prev_group = group;
2477 
2478 		group = UT_LIST_GET_NEXT(log_groups, group);
2479 
2480 		UT_LIST_REMOVE(log_sys->log_groups, prev_group);
2481 
2482 		log_group_close(prev_group);
2483 	}
2484 }
2485 
/********************************************************//**
Shutdown the log system but do not release all the memory: frees the log
and checkpoint buffers, destroys the synchronization primitives of
log_sys, and closes the recovery subsystem. The log_sys struct itself is
freed later by log_mem_free(). */
void
log_shutdown(void)
/*==============*/
{
	log_group_close_all();

	/* Free buffers via their _ptr originals and null both aliases so
	any stray late access fails fast instead of touching freed memory. */
	ut_free(log_sys->buf_ptr);
	log_sys->buf_ptr = NULL;
	log_sys->buf = NULL;
	ut_free(log_sys->checkpoint_buf_ptr);
	log_sys->checkpoint_buf_ptr = NULL;
	log_sys->checkpoint_buf = NULL;

	os_event_destroy(log_sys->flush_event);

	rw_lock_free(&log_sys->checkpoint_lock);

	mutex_free(&log_sys->mutex);
	mutex_free(&log_sys->write_mutex);
	mutex_free(&log_sys->log_flush_order_mutex);

	recv_sys_close();
}
2511 
2512 /********************************************************//**
2513 Free the log system data structures. */
2514 void
log_mem_free(void)2515 log_mem_free(void)
2516 /*==============*/
2517 {
2518 	if (log_sys != NULL) {
2519 		recv_sys_mem_free();
2520 		ut_free(log_sys);
2521 
2522 		log_sys = NULL;
2523 	}
2524 }
2525 #endif /* !UNIV_HOTBACKUP */
2526