1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file buf/buf0dblwr.cc
Doublewrite buffer module
30 
31 Created 2011/12/19
32 *******************************************************/
33 
34 #include "buf0dblwr.h"
35 
36 #ifdef UNIV_NONINL
37 #include "buf0buf.ic"
38 #endif
39 
40 #include "buf0buf.h"
41 #include "buf0checksum.h"
42 #include "srv0start.h"
43 #include "srv0srv.h"
44 #include "page0zip.h"
45 #include "trx0sys.h"
46 
47 #ifndef UNIV_HOTBACKUP
48 
#ifdef UNIV_PFS_MUTEX
/* Key to register the doublewrite buffer mutex (buf_dblwr->mutex) with
performance schema */
UNIV_INTERN mysql_pfs_key_t	buf_dblwr_mutex_key;
#endif /* UNIV_PFS_MUTEX */

/** The doublewrite buffer. NULL until buf_dblwr_init() allocates it;
buf_dblwr_free() resets it to NULL again. */
UNIV_INTERN buf_dblwr_t*	buf_dblwr = NULL;

/** Set to TRUE by buf_dblwr_create() while it is creating or loading the
doublewrite buffer, and back to FALSE before that function returns */
UNIV_INTERN ibool	buf_dblwr_being_created = FALSE;
59 
60 /****************************************************************//**
61 Determines if a page number is located inside the doublewrite buffer.
62 @return TRUE if the location is inside the two blocks of the
63 doublewrite buffer */
64 UNIV_INTERN
65 ibool
buf_dblwr_page_inside(ulint page_no)66 buf_dblwr_page_inside(
67 /*==================*/
68 	ulint	page_no)	/*!< in: page number */
69 {
70 	if (buf_dblwr == NULL) {
71 
72 		return(FALSE);
73 	}
74 
75 	if (page_no >= buf_dblwr->block1
76 	    && page_no < buf_dblwr->block1
77 	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
78 		return(TRUE);
79 	}
80 
81 	if (page_no >= buf_dblwr->block2
82 	    && page_no < buf_dblwr->block2
83 	    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
84 		return(TRUE);
85 	}
86 
87 	return(FALSE);
88 }
89 
90 /****************************************************************//**
91 Calls buf_page_get() on the TRX_SYS_PAGE and returns a pointer to the
92 doublewrite buffer within it.
93 @return	pointer to the doublewrite buffer within the filespace header
94 page. */
95 UNIV_INLINE
96 byte*
buf_dblwr_get(mtr_t * mtr)97 buf_dblwr_get(
98 /*==========*/
99 	mtr_t*	mtr)	/*!< in/out: MTR to hold the page latch */
100 {
101 	buf_block_t*	block;
102 
103 	block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
104 			     RW_X_LATCH, mtr);
105 	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
106 
107 	return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE);
108 }
109 
/********************************************************************//**
Flush a batch of writes to the datafiles that have already been
written to the dblwr buffer on disk. The three steps below must run in
this order: wake the simulated aio handlers, wait until no asynchronous
write is pending, and only then flush the tablespace files. */
UNIV_INLINE
void
buf_dblwr_sync_datafiles()
/*======================*/
{
	/* Wake possible simulated aio thread to actually post the
	writes to the operating system */
	os_aio_simulated_wake_handler_threads();

	/* Wait until all async writes to tablespaces have been posted to
	the OS */
	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */
	fil_flush_file_spaces(FIL_TABLESPACE);
}
129 
130 /****************************************************************//**
131 Creates or initialializes the doublewrite buffer at a database start. */
132 static
133 void
buf_dblwr_init(byte * doublewrite)134 buf_dblwr_init(
135 /*===========*/
136 	byte*	doublewrite)	/*!< in: pointer to the doublewrite buf
137 				header on trx sys page */
138 {
139 	ulint	buf_size;
140 
141 	buf_dblwr = static_cast<buf_dblwr_t*>(
142 		mem_zalloc(sizeof(buf_dblwr_t)));
143 
144 	/* There are two blocks of same size in the doublewrite
145 	buffer. */
146 	buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
147 
148 	/* There must be atleast one buffer for single page writes
149 	and one buffer for batch writes. */
150 	ut_a(srv_doublewrite_batch_size > 0
151 	     && srv_doublewrite_batch_size < buf_size);
152 
153 	mutex_create(buf_dblwr_mutex_key,
154 		     &buf_dblwr->mutex, SYNC_DOUBLEWRITE);
155 
156 	buf_dblwr->b_event = os_event_create();
157 	buf_dblwr->s_event = os_event_create();
158 	buf_dblwr->first_free = 0;
159 	buf_dblwr->s_reserved = 0;
160 	buf_dblwr->b_reserved = 0;
161 
162 	buf_dblwr->block1 = mach_read_from_4(
163 		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
164 	buf_dblwr->block2 = mach_read_from_4(
165 		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
166 
167 	buf_dblwr->in_use = static_cast<bool*>(
168 		mem_zalloc(buf_size * sizeof(bool)));
169 
170 	buf_dblwr->write_buf_unaligned = static_cast<byte*>(
171 		ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE));
172 
173 	buf_dblwr->write_buf = static_cast<byte*>(
174 		ut_align(buf_dblwr->write_buf_unaligned,
175 			 UNIV_PAGE_SIZE));
176 
177 	buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
178 		mem_zalloc(buf_size * sizeof(void*)));
179 }
180 
181 /****************************************************************//**
182 Creates the doublewrite buffer to a new InnoDB installation. The header of the
183 doublewrite buffer is placed on the trx system header page. */
184 UNIV_INTERN
185 void
buf_dblwr_create(void)186 buf_dblwr_create(void)
187 /*==================*/
188 {
189 	buf_block_t*	block2;
190 	buf_block_t*	new_block;
191 	byte*	doublewrite;
192 	byte*	fseg_header;
193 	ulint	page_no;
194 	ulint	prev_page_no;
195 	ulint	i;
196 	mtr_t	mtr;
197 
198 	if (buf_dblwr) {
199 		/* Already inited */
200 
201 		return;
202 	}
203 
204 start_again:
205 	mtr_start(&mtr);
206 	buf_dblwr_being_created = TRUE;
207 
208 	doublewrite = buf_dblwr_get(&mtr);
209 
210 	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
211 	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
212 		/* The doublewrite buffer has already been created:
213 		just read in some numbers */
214 
215 		buf_dblwr_init(doublewrite);
216 
217 		mtr_commit(&mtr);
218 		buf_dblwr_being_created = FALSE;
219 		return;
220 	}
221 
222 	ib_logf(IB_LOG_LEVEL_INFO,
223 		"Doublewrite buffer not found: creating new");
224 
225 	if (buf_pool_get_curr_size()
226 	    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
227 		+ FSP_EXTENT_SIZE / 2 + 100)
228 	       * UNIV_PAGE_SIZE)) {
229 
230 		ib_logf(IB_LOG_LEVEL_ERROR,
231 			"Cannot create doublewrite buffer: you must "
232 			"increase your buffer pool size. Cannot continue "
233 			"operation.");
234 
235 		exit(EXIT_FAILURE);
236 	}
237 
238 	block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
239 			     TRX_SYS_DOUBLEWRITE
240 			     + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
241 
242 	/* fseg_create acquires a second latch on the page,
243 	therefore we must declare it: */
244 
245 	buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
246 
247 	if (block2 == NULL) {
248 		ib_logf(IB_LOG_LEVEL_ERROR,
249 			"Cannot create doublewrite buffer: you must "
250 			"increase your tablespace size. "
251 			"Cannot continue operation.");
252 
253 		/* We exit without committing the mtr to prevent
254 		its modifications to the database getting to disk */
255 
256 		exit(EXIT_FAILURE);
257 	}
258 
259 	fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
260 	prev_page_no = 0;
261 
262 	for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
263 		     + FSP_EXTENT_SIZE / 2; i++) {
264 		new_block = fseg_alloc_free_page(
265 			fseg_header, prev_page_no + 1, FSP_UP, &mtr);
266 		if (new_block == NULL) {
267 			ib_logf(IB_LOG_LEVEL_ERROR,
268 				"Cannot create doublewrite buffer: you must "
269 				"increase your tablespace size. "
270 				"Cannot continue operation.");
271 
272 			exit(EXIT_FAILURE);
273 		}
274 
275 		/* We read the allocated pages to the buffer pool;
276 		when they are written to disk in a flush, the space
277 		id and page number fields are also written to the
278 		pages. When we at database startup read pages
279 		from the doublewrite buffer, we know that if the
280 		space id and page number in them are the same as
281 		the page position in the tablespace, then the page
282 		has not been written to in doublewrite. */
283 
284 		ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
285 		page_no = buf_block_get_page_no(new_block);
286 
287 		if (i == FSP_EXTENT_SIZE / 2) {
288 			ut_a(page_no == FSP_EXTENT_SIZE);
289 			mlog_write_ulint(doublewrite
290 					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
291 					 page_no, MLOG_4BYTES, &mtr);
292 			mlog_write_ulint(doublewrite
293 					 + TRX_SYS_DOUBLEWRITE_REPEAT
294 					 + TRX_SYS_DOUBLEWRITE_BLOCK1,
295 					 page_no, MLOG_4BYTES, &mtr);
296 
297 		} else if (i == FSP_EXTENT_SIZE / 2
298 			   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
299 			ut_a(page_no == 2 * FSP_EXTENT_SIZE);
300 			mlog_write_ulint(doublewrite
301 					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
302 					 page_no, MLOG_4BYTES, &mtr);
303 			mlog_write_ulint(doublewrite
304 					 + TRX_SYS_DOUBLEWRITE_REPEAT
305 					 + TRX_SYS_DOUBLEWRITE_BLOCK2,
306 					 page_no, MLOG_4BYTES, &mtr);
307 
308 		} else if (i > FSP_EXTENT_SIZE / 2) {
309 			ut_a(page_no == prev_page_no + 1);
310 		}
311 
312 		if (((i + 1) & 15) == 0) {
313 			/* rw_locks can only be recursively x-locked
314 			2048 times. (on 32 bit platforms,
315 			(lint) 0 - (X_LOCK_DECR * 2049)
316 			is no longer a negative number, and thus
317 			lock_word becomes like a shared lock).
318 			For 4k page size this loop will
319 			lock the fseg header too many times. Since
320 			this code is not done while any other threads
321 			are active, restart the MTR occasionally. */
322 			mtr_commit(&mtr);
323 			mtr_start(&mtr);
324 			doublewrite = buf_dblwr_get(&mtr);
325 			fseg_header = doublewrite
326 				      + TRX_SYS_DOUBLEWRITE_FSEG;
327 		}
328 
329 		prev_page_no = page_no;
330 	}
331 
332 	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
333 			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
334 			 MLOG_4BYTES, &mtr);
335 	mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
336 			 + TRX_SYS_DOUBLEWRITE_REPEAT,
337 			 TRX_SYS_DOUBLEWRITE_MAGIC_N,
338 			 MLOG_4BYTES, &mtr);
339 
340 	mlog_write_ulint(doublewrite
341 			 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
342 			 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
343 			 MLOG_4BYTES, &mtr);
344 	mtr_commit(&mtr);
345 
346 	/* Flush the modified pages to disk and make a checkpoint */
347 	log_make_checkpoint_at(LSN_MAX, TRUE);
348 
349 	/* Remove doublewrite pages from LRU */
350 	buf_pool_invalidate();
351 
352 	ib_logf(IB_LOG_LEVEL_INFO, "Doublewrite buffer created");
353 
354 	goto start_again;
355 }
356 
357 /****************************************************************//**
358 At a database startup initializes the doublewrite buffer memory structure if
359 we already have a doublewrite buffer created in the data files. If we are
360 upgrading to an InnoDB version which supports multiple tablespaces, then this
361 function performs the necessary update operations. If we are in a crash
362 recovery, this function loads the pages from double write buffer into memory. */
363 void
buf_dblwr_init_or_load_pages(pfs_os_file_t file,char * path,bool load_corrupt_pages)364 buf_dblwr_init_or_load_pages(
365 /*=========================*/
366 	pfs_os_file_t	file,
367 	char*		path,
368 	bool		load_corrupt_pages)
369 {
370 	byte*	buf;
371 	byte*	read_buf;
372 	byte*	unaligned_read_buf;
373 	ulint	block1;
374 	ulint	block2;
375 	byte*	page;
376 	ibool	reset_space_ids = FALSE;
377 	byte*	doublewrite;
378 	ulint	space_id;
379 	ulint	i;
380         ulint	block_bytes = 0;
381 	recv_dblwr_t& recv_dblwr = recv_sys->dblwr;
382 
383 	/* We do the file i/o past the buffer pool */
384 
385 	unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
386 
387 	read_buf = static_cast<byte*>(
388 		ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
389 
390 	/* Read the trx sys header to check if we are using the doublewrite
391 	buffer */
392 	off_t  trx_sys_page = TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE;
393 	os_file_read(file, read_buf, trx_sys_page, UNIV_PAGE_SIZE);
394 
395 	doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
396 
397 	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
398 	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
399 		/* The doublewrite buffer has been created */
400 
401 		buf_dblwr_init(doublewrite);
402 
403 		block1 = buf_dblwr->block1;
404 		block2 = buf_dblwr->block2;
405 
406 		buf = buf_dblwr->write_buf;
407 	} else {
408 		goto leave_func;
409 	}
410 
411 	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
412 	    != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
413 
414 		/* We are upgrading from a version < 4.1.x to a version where
415 		multiple tablespaces are supported. We must reset the space id
416 		field in the pages in the doublewrite buffer because starting
417 		from this version the space id is stored to
418 		FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
419 
420 		reset_space_ids = TRUE;
421 
422 		ib_logf(IB_LOG_LEVEL_INFO,
423 			"Resetting space id's in the doublewrite buffer");
424 	}
425 
426 	/* Read the pages from the doublewrite buffer to memory */
427 
428         block_bytes = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
429 
430 	os_file_read(file, buf, block1 * UNIV_PAGE_SIZE, block_bytes);
431 	os_file_read(file, buf + block_bytes, block2 * UNIV_PAGE_SIZE,
432 		     block_bytes);
433 
434 	/* Check if any of these pages is half-written in data files, in the
435 	intended position */
436 
437 	page = buf;
438 
439 	for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
440 
441 		ulint source_page_no;
442 
443 		if (reset_space_ids) {
444 
445 			space_id = 0;
446 			mach_write_to_4(page
447 					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
448 			/* We do not need to calculate new checksums for the
449 			pages because the field .._SPACE_ID does not affect
450 			them. Write the page back to where we read it from. */
451 
452 			if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
453 				source_page_no = block1 + i;
454 			} else {
455 				source_page_no = block2
456 					+ i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
457 			}
458 
459 			os_file_write(path, file, page,
460 				      source_page_no * UNIV_PAGE_SIZE,
461 				      UNIV_PAGE_SIZE);
462 
463 		} else if (load_corrupt_pages) {
464 
465 			recv_dblwr.add(page);
466 		}
467 
468 		page += UNIV_PAGE_SIZE;
469 	}
470 
471 	if (reset_space_ids) {
472 		os_file_flush(file);
473 	}
474 
475 leave_func:
476 	ut_free(unaligned_read_buf);
477 }
478 
479 /****************************************************************//**
480 Process the double write buffer pages. */
481 void
buf_dblwr_process()482 buf_dblwr_process()
483 /*===============*/
484 {
485 	ulint	space_id;
486 	ulint	page_no;
487 	ulint	page_no_dblwr = 0;
488 	byte*	page;
489 	byte*	read_buf;
490 	byte*	unaligned_read_buf;
491 	recv_dblwr_t& recv_dblwr = recv_sys->dblwr;
492 
493 	unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
494 
495 	read_buf = static_cast<byte*>(
496 		ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
497 
498 	for (std::list<byte*>::iterator i = recv_dblwr.pages.begin();
499 	     i != recv_dblwr.pages.end(); ++i, ++page_no_dblwr ) {
500 
501 		page = *i;
502 		page_no  = mach_read_from_4(page + FIL_PAGE_OFFSET);
503 		space_id = mach_read_from_4(page + FIL_PAGE_SPACE_ID);
504 
505 		if (!fil_tablespace_exists_in_mem(space_id)) {
506 			/* Maybe we have dropped the single-table tablespace
507 			and this page once belonged to it: do nothing */
508 
509 		} else if (!fil_check_adress_in_tablespace(space_id,
510 							   page_no)) {
511 			ib_logf(IB_LOG_LEVEL_WARN,
512 				"A page in the doublewrite buffer is not "
513 				"within space bounds; space id %lu "
514 				"page number %lu, page %lu in "
515 				"doublewrite buf.",
516 				(ulong) space_id, (ulong) page_no,
517 				page_no_dblwr);
518 		} else {
519 			ulint	zip_size = fil_space_get_zip_size(space_id);
520 
521 			/* Read in the actual page from the file */
522 			fil_io(OS_FILE_READ, true, space_id, zip_size,
523 			       page_no, 0,
524 			       zip_size ? zip_size : UNIV_PAGE_SIZE,
525 			       read_buf, NULL);
526 
527 			/* Check if the page is corrupt */
528 
529 			if (buf_page_is_corrupted(true, read_buf, zip_size)) {
530 
531 				fprintf(stderr,
532 					"InnoDB: Database page"
533 					" corruption or a failed\n"
534 					"InnoDB: file read of"
535 					" space %lu page %lu.\n"
536 					"InnoDB: Trying to recover it from"
537 					" the doublewrite buffer.\n",
538 					(ulong) space_id, (ulong) page_no);
539 
540 				if (buf_page_is_corrupted(true,
541 							  page, zip_size)) {
542 					fprintf(stderr,
543 						"InnoDB: Dump of the page:\n");
544 					buf_page_print(
545 						read_buf, zip_size,
546 						BUF_PAGE_PRINT_NO_CRASH);
547 					fprintf(stderr,
548 						"InnoDB: Dump of"
549 						" corresponding page"
550 						" in doublewrite buffer:\n");
551 					buf_page_print(
552 						page, zip_size,
553 						BUF_PAGE_PRINT_NO_CRASH);
554 
555 					fprintf(stderr,
556 						"InnoDB: Also the page in the"
557 						" doublewrite buffer"
558 						" is corrupt.\n"
559 						"InnoDB: Cannot continue"
560 						" operation.\n"
561 						"InnoDB: You can try to"
562 						" recover the database"
563 						" with the my.cnf\n"
564 						"InnoDB: option:\n"
565 						"InnoDB:"
566 						" innodb_force_recovery=6\n");
567 					ut_error;
568 				}
569 
570 				/* Write the good page from the
571 				doublewrite buffer to the intended
572 				position */
573 
574 				fil_io(OS_FILE_WRITE, true, space_id,
575 				       zip_size, page_no, 0,
576 				       zip_size ? zip_size : UNIV_PAGE_SIZE,
577 				       page, NULL);
578 
579 				ib_logf(IB_LOG_LEVEL_INFO,
580 					"Recovered the page from"
581 					" the doublewrite buffer.");
582 
583 			} else if (buf_page_is_zeroes(read_buf, zip_size)) {
584 
585 				if (!buf_page_is_zeroes(page, zip_size)
586 				    && !buf_page_is_corrupted(true, page,
587 							      zip_size)) {
588 
589 					/* Database page contained only
590 					zeroes, while a valid copy is
591 					available in dblwr buffer. */
592 
593 					fil_io(OS_FILE_WRITE, true, space_id,
594 					       zip_size, page_no, 0,
595 					       zip_size ? zip_size
596 							: UNIV_PAGE_SIZE,
597 					       page, NULL);
598 				}
599 			}
600 		}
601 	}
602 
603 	fil_flush_file_spaces(FIL_TABLESPACE);
604 	ut_free(unaligned_read_buf);
605 }
606 
607 /****************************************************************//**
608 Frees doublewrite buffer. */
609 UNIV_INTERN
610 void
buf_dblwr_free(void)611 buf_dblwr_free(void)
612 /*================*/
613 {
614 	/* Free the double write data structures. */
615 	ut_a(buf_dblwr != NULL);
616 	ut_ad(buf_dblwr->s_reserved == 0);
617 	ut_ad(buf_dblwr->b_reserved == 0);
618 
619 	os_event_free(buf_dblwr->b_event);
620 	os_event_free(buf_dblwr->s_event);
621 	ut_free(buf_dblwr->write_buf_unaligned);
622 	buf_dblwr->write_buf_unaligned = NULL;
623 
624 	mem_free(buf_dblwr->buf_block_arr);
625 	buf_dblwr->buf_block_arr = NULL;
626 
627 	mem_free(buf_dblwr->in_use);
628 	buf_dblwr->in_use = NULL;
629 
630 	mutex_free(&buf_dblwr->mutex);
631 	mem_free(buf_dblwr);
632 	buf_dblwr = NULL;
633 }
634 
635 /********************************************************************//**
636 Updates the doublewrite buffer when an IO request is completed. */
637 UNIV_INTERN
638 void
buf_dblwr_update(const buf_page_t * bpage,buf_flush_t flush_type)639 buf_dblwr_update(
640 /*=============*/
641 	const buf_page_t*	bpage,	/*!< in: buffer block descriptor */
642 	buf_flush_t		flush_type)/*!< in: flush type */
643 {
644 	if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
645 		return;
646 	}
647 
648 	switch (flush_type) {
649 	case BUF_FLUSH_LIST:
650 	case BUF_FLUSH_LRU:
651 		mutex_enter(&buf_dblwr->mutex);
652 
653 		ut_ad(buf_dblwr->batch_running);
654 		ut_ad(buf_dblwr->b_reserved > 0);
655 		ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free);
656 
657 		buf_dblwr->b_reserved--;
658 
659 		if (buf_dblwr->b_reserved == 0) {
660 			mutex_exit(&buf_dblwr->mutex);
661 			/* This will finish the batch. Sync data files
662 			to the disk. */
663 			fil_flush_file_spaces(FIL_TABLESPACE);
664 			mutex_enter(&buf_dblwr->mutex);
665 
666 			/* We can now reuse the doublewrite memory buffer: */
667 			buf_dblwr->first_free = 0;
668 			buf_dblwr->batch_running = false;
669 			os_event_set(buf_dblwr->b_event);
670 		}
671 
672 		mutex_exit(&buf_dblwr->mutex);
673 		break;
674 	case BUF_FLUSH_SINGLE_PAGE:
675 		{
676 			const ulint size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
677 			ulint i;
678 			mutex_enter(&buf_dblwr->mutex);
679 			for (i = srv_doublewrite_batch_size; i < size; ++i) {
680 				if (buf_dblwr->buf_block_arr[i] == bpage) {
681 					buf_dblwr->s_reserved--;
682 					buf_dblwr->buf_block_arr[i] = NULL;
683 					buf_dblwr->in_use[i] = false;
684 					break;
685 				}
686 			}
687 
688 			/* The block we are looking for must exist as a
689 			reserved block. */
690 			ut_a(i < size);
691 		}
692 		os_event_set(buf_dblwr->s_event);
693 		mutex_exit(&buf_dblwr->mutex);
694 		break;
695 	case BUF_FLUSH_N_TYPES:
696 		ut_error;
697 	}
698 }
699 
700 /********************************************************************//**
701 Check the LSN values on the page. */
702 static
703 void
buf_dblwr_check_page_lsn(const page_t * page)704 buf_dblwr_check_page_lsn(
705 /*=====================*/
706 	const page_t*	page)		/*!< in: page to check */
707 {
708 	if (memcmp(page + (FIL_PAGE_LSN + 4),
709 		   page + (UNIV_PAGE_SIZE
710 			   - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
711 		   4)) {
712 
713 		ut_print_timestamp(stderr);
714 		fprintf(stderr,
715 			" InnoDB: ERROR: The page to be written"
716 			" seems corrupt!\n"
717 			"InnoDB: The low 4 bytes of LSN fields do not match "
718 			"(" ULINTPF " != " ULINTPF ")!"
719 			" Noticed in the buffer pool.\n",
720 			mach_read_from_4(
721 				page + FIL_PAGE_LSN + 4),
722 			mach_read_from_4(
723 				page + UNIV_PAGE_SIZE
724 				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4));
725 	}
726 }
727 
728 /********************************************************************//**
729 Asserts when a corrupt block is find during writing out data to the
730 disk. */
731 static
732 void
buf_dblwr_assert_on_corrupt_block(const buf_block_t * block)733 buf_dblwr_assert_on_corrupt_block(
734 /*==============================*/
735 	const buf_block_t*	block)	/*!< in: block to check */
736 {
737 	buf_page_print(block->frame, 0, BUF_PAGE_PRINT_NO_CRASH);
738 
739 	ut_print_timestamp(stderr);
740 	fprintf(stderr,
741 		"  InnoDB: Apparent corruption of an"
742 		" index page n:o %lu in space %lu\n"
743 		"InnoDB: to be written to data file."
744 		" We intentionally crash server\n"
745 		"InnoDB: to prevent corrupt data"
746 		" from ending up in data\n"
747 		"InnoDB: files.\n",
748 		(ulong) buf_block_get_page_no(block),
749 		(ulong) buf_block_get_space(block));
750 
751 	ut_error;
752 }
753 
754 /********************************************************************//**
755 Check the LSN values on the page with which this block is associated.
756 Also validate the page if the option is set. */
757 static
758 void
buf_dblwr_check_block(const buf_block_t * block)759 buf_dblwr_check_block(
760 /*==================*/
761 	const buf_block_t*	block)	/*!< in: block to check */
762 {
763 	if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
764 	    || block->page.zip.data) {
765 		/* No simple validate for compressed pages exists. */
766 		return;
767 	}
768 
769 	buf_dblwr_check_page_lsn(block->frame);
770 
771 	if (!block->check_index_page_at_flush) {
772 		return;
773 	}
774 
775 	if (page_is_comp(block->frame)) {
776 		if (!page_simple_validate_new(block->frame)) {
777 			buf_dblwr_assert_on_corrupt_block(block);
778 		}
779 	} else if (!page_simple_validate_old(block->frame)) {
780 
781 		buf_dblwr_assert_on_corrupt_block(block);
782 	}
783 }
784 
785 /********************************************************************//**
786 Writes a page that has already been written to the doublewrite buffer
787 to the datafile. It is the job of the caller to sync the datafile. */
788 static
789 void
buf_dblwr_write_block_to_datafile(const buf_page_t * bpage,bool sync)790 buf_dblwr_write_block_to_datafile(
791 /*==============================*/
792 	const buf_page_t*	bpage,	/*!< in: page to write */
793 	bool			sync)	/*!< in: true if sync IO
794 					is requested */
795 {
796 	ut_a(bpage);
797 	ut_a(buf_page_in_file(bpage));
798 
799 	const ulint flags = sync
800 		? OS_FILE_WRITE
801 		: OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER;
802 
803 	if (bpage->zip.data) {
804 		fil_io(flags, sync, buf_page_get_space(bpage),
805 		       buf_page_get_zip_size(bpage),
806 		       buf_page_get_page_no(bpage), 0,
807 		       buf_page_get_zip_size(bpage),
808 		       (void*) bpage->zip.data,
809 		       (void*) bpage);
810 
811 		return;
812 	}
813 
814 
815 	const buf_block_t* block = (buf_block_t*) bpage;
816 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
817 	buf_dblwr_check_page_lsn(block->frame);
818 
819 	fil_io(flags, sync, buf_block_get_space(block), 0,
820 	       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
821 	       (void*) block->frame, (void*) block);
822 
823 }
824 
825 /********************************************************************//**
826 Flushes possible buffered writes from the doublewrite memory buffer to disk,
827 and also wakes up the aio thread if simulated aio is used. It is very
828 important to call this function after a batch of writes has been posted,
829 and also when we may have to wait for a page latch! Otherwise a deadlock
830 of threads can occur. */
831 UNIV_INTERN
832 void
buf_dblwr_flush_buffered_writes(void)833 buf_dblwr_flush_buffered_writes(void)
834 /*=================================*/
835 {
836 	byte*		write_buf;
837 	ulint		first_free;
838 	ulint		len;
839 
840 	if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
841 		/* Sync the writes to the disk. */
842 		buf_dblwr_sync_datafiles();
843 		return;
844 	}
845 
846 try_again:
847 	mutex_enter(&buf_dblwr->mutex);
848 
849 	/* Write first to doublewrite buffer blocks. We use synchronous
850 	aio and thus know that file write has been completed when the
851 	control returns. */
852 
853 	if (buf_dblwr->first_free == 0) {
854 
855 		mutex_exit(&buf_dblwr->mutex);
856 
857 		return;
858 	}
859 
860 	if (buf_dblwr->batch_running) {
861 		/* Another thread is running the batch right now. Wait
862 		for it to finish. */
863 		ib_int64_t	sig_count = os_event_reset(buf_dblwr->b_event);
864 		mutex_exit(&buf_dblwr->mutex);
865 
866 		os_event_wait_low(buf_dblwr->b_event, sig_count);
867 		goto try_again;
868 	}
869 
870 	ut_a(!buf_dblwr->batch_running);
871 	ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
872 
873 	/* Disallow anyone else to post to doublewrite buffer or to
874 	start another batch of flushing. */
875 	buf_dblwr->batch_running = true;
876 	first_free = buf_dblwr->first_free;
877 
878 	/* Now safe to release the mutex. Note that though no other
879 	thread is allowed to post to the doublewrite batch flushing
880 	but any threads working on single page flushes are allowed
881 	to proceed. */
882 	mutex_exit(&buf_dblwr->mutex);
883 
884 	write_buf = buf_dblwr->write_buf;
885 
886 	for (ulint len2 = 0, i = 0;
887 	     i < buf_dblwr->first_free;
888 	     len2 += UNIV_PAGE_SIZE, i++) {
889 
890 		const buf_block_t*	block;
891 
892 		block = (buf_block_t*) buf_dblwr->buf_block_arr[i];
893 
894 		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
895 		    || block->page.zip.data) {
896 			/* No simple validate for compressed
897 			pages exists. */
898 			continue;
899 		}
900 
901 		/* Check that the actual page in the buffer pool is
902 		not corrupt and the LSN values are sane. */
903 		buf_dblwr_check_block(block);
904 
905 		/* Check that the page as written to the doublewrite
906 		buffer has sane LSN values. */
907 		buf_dblwr_check_page_lsn(write_buf + len2);
908 	}
909 
910 	/* Write out the first block of the doublewrite buffer */
911 	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
912 		     buf_dblwr->first_free) * UNIV_PAGE_SIZE;
913 
914 	fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
915 	       buf_dblwr->block1, 0, len,
916 	       (void*) write_buf, NULL);
917 
918 	if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
919 		/* No unwritten pages in the second block. */
920 		goto flush;
921 	}
922 
923 	/* Write out the second block of the doublewrite buffer. */
924 	len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
925 	       * UNIV_PAGE_SIZE;
926 
927 	write_buf = buf_dblwr->write_buf
928 		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
929 
930 	fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
931 	       buf_dblwr->block2, 0, len,
932 	       (void*) write_buf, NULL);
933 
934 flush:
935 	/* increment the doublewrite flushed pages counter */
936 	srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
937 	srv_stats.dblwr_writes.inc();
938 
939 	/* Now flush the doublewrite buffer data to disk */
940 	fil_flush(TRX_SYS_SPACE);
941 
942 	/* We know that the writes have been flushed to disk now
943 	and in recovery we will find them in the doublewrite buffer
944 	blocks. Next do the writes to the intended positions. */
945 
946 	/* Up to this point first_free and buf_dblwr->first_free are
947 	same because we have set the buf_dblwr->batch_running flag
948 	disallowing any other thread to post any request but we
949 	can't safely access buf_dblwr->first_free in the loop below.
950 	This is so because it is possible that after we are done with
951 	the last iteration and before we terminate the loop, the batch
952 	gets finished in the IO helper thread and another thread posts
953 	a new batch setting buf_dblwr->first_free to a higher value.
954 	If this happens and we are using buf_dblwr->first_free in the
955 	loop termination condition then we'll end up dispatching
956 	the same block twice from two different threads. */
957 	ut_ad(first_free == buf_dblwr->first_free);
958 	for (ulint i = 0; i < first_free; i++) {
959 		buf_dblwr_write_block_to_datafile(
960 			buf_dblwr->buf_block_arr[i], false);
961 	}
962 
963 	/* Wake possible simulated aio thread to actually post the
964 	writes to the operating system. We don't flush the files
965 	at this point. We leave it to the IO helper thread to flush
966 	datafiles when the whole batch has been processed. */
967 	os_aio_simulated_wake_handler_threads();
968 }
969 
970 /********************************************************************//**
971 Posts a buffer page for writing. If the doublewrite memory buffer is
972 full, calls buf_dblwr_flush_buffered_writes and waits for for free
973 space to appear. */
974 UNIV_INTERN
975 void
buf_dblwr_add_to_batch(buf_page_t * bpage)976 buf_dblwr_add_to_batch(
977 /*====================*/
978 	buf_page_t*	bpage)	/*!< in: buffer block to write */
979 {
980 	ulint	zip_size;
981 
982 	ut_a(buf_page_in_file(bpage));
983 	ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex));
984 
985 try_again:
986 	mutex_enter(&buf_dblwr->mutex);
987 
988 	ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size);
989 
990 	if (buf_dblwr->batch_running) {
991 
992 		/* This not nearly as bad as it looks. There is only
993 		page_cleaner thread which does background flushing
994 		in batches therefore it is unlikely to be a contention
995 		point. The only exception is when a user thread is
996 		forced to do a flush batch because of a sync
997 		checkpoint. */
998 		ib_int64_t	sig_count = os_event_reset(buf_dblwr->b_event);
999 		mutex_exit(&buf_dblwr->mutex);
1000 
1001 		os_event_wait_low(buf_dblwr->b_event, sig_count);
1002 		goto try_again;
1003 	}
1004 
1005 	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
1006 		mutex_exit(&(buf_dblwr->mutex));
1007 
1008 		buf_dblwr_flush_buffered_writes();
1009 
1010 		goto try_again;
1011 	}
1012 
1013 	zip_size = buf_page_get_zip_size(bpage);
1014 
1015 	if (zip_size) {
1016 		UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
1017 		/* Copy the compressed page and clear the rest. */
1018 		memcpy(buf_dblwr->write_buf
1019 		       + UNIV_PAGE_SIZE * buf_dblwr->first_free,
1020 		       bpage->zip.data, zip_size);
1021 		memset(buf_dblwr->write_buf
1022 		       + UNIV_PAGE_SIZE * buf_dblwr->first_free
1023 		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
1024 	} else {
1025 		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1026 		UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
1027 				   UNIV_PAGE_SIZE);
1028 
1029 		memcpy(buf_dblwr->write_buf
1030 		       + UNIV_PAGE_SIZE * buf_dblwr->first_free,
1031 		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
1032 	}
1033 
1034 	buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage;
1035 
1036 	buf_dblwr->first_free++;
1037 	buf_dblwr->b_reserved++;
1038 
1039 	ut_ad(!buf_dblwr->batch_running);
1040 	ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
1041 	ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size);
1042 
1043 	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
1044 		mutex_exit(&(buf_dblwr->mutex));
1045 
1046 		buf_dblwr_flush_buffered_writes();
1047 
1048 		return;
1049 	}
1050 
1051 	mutex_exit(&(buf_dblwr->mutex));
1052 }
1053 
1054 /********************************************************************//**
1055 Writes a page to the doublewrite buffer on disk, sync it, then write
1056 the page to the datafile and sync the datafile. This function is used
1057 for single page flushes. If all the buffers allocated for single page
1058 flushes in the doublewrite buffer are in use we wait here for one to
1059 become free. We are guaranteed that a slot will become free because any
1060 thread that is using a slot must also release the slot before leaving
1061 this function. */
1062 UNIV_INTERN
1063 void
buf_dblwr_write_single_page(buf_page_t * bpage,bool sync)1064 buf_dblwr_write_single_page(
1065 /*========================*/
1066 	buf_page_t*	bpage,	/*!< in: buffer block to write */
1067 	bool		sync)	/*!< in: true if sync IO requested */
1068 {
1069 	ulint		n_slots;
1070 	ulint		size;
1071 	ulint		zip_size;
1072 	ulint		offset;
1073 	ulint		i;
1074 
1075 	ut_a(buf_page_in_file(bpage));
1076 	ut_a(srv_use_doublewrite_buf);
1077 	ut_a(buf_dblwr != NULL);
1078 
1079 	/* total number of slots available for single page flushes
1080 	starts from srv_doublewrite_batch_size to the end of the
1081 	buffer. */
1082 	size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
1083 	ut_a(size > srv_doublewrite_batch_size);
1084 	n_slots = size - srv_doublewrite_batch_size;
1085 
1086 	if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
1087 
1088 		/* Check that the actual page in the buffer pool is
1089 		not corrupt and the LSN values are sane. */
1090 		buf_dblwr_check_block((buf_block_t*) bpage);
1091 
1092 		/* Check that the page as written to the doublewrite
1093 		buffer has sane LSN values. */
1094 		if (!bpage->zip.data) {
1095 			buf_dblwr_check_page_lsn(
1096 				((buf_block_t*) bpage)->frame);
1097 		}
1098 	}
1099 
1100 retry:
1101 	mutex_enter(&buf_dblwr->mutex);
1102 	if (buf_dblwr->s_reserved == n_slots) {
1103 
1104 		/* All slots are reserved. */
1105 		ib_int64_t	sig_count =
1106 			os_event_reset(buf_dblwr->s_event);
1107 		mutex_exit(&buf_dblwr->mutex);
1108 		os_event_wait_low(buf_dblwr->s_event, sig_count);
1109 
1110 		goto retry;
1111 	}
1112 
1113 	for (i = srv_doublewrite_batch_size; i < size; ++i) {
1114 
1115 		if (!buf_dblwr->in_use[i]) {
1116 			break;
1117 		}
1118 	}
1119 
1120 	/* We are guaranteed to find a slot. */
1121 	ut_a(i < size);
1122 	buf_dblwr->in_use[i] = true;
1123 	buf_dblwr->s_reserved++;
1124 	buf_dblwr->buf_block_arr[i] = bpage;
1125 
1126 	/* increment the doublewrite flushed pages counter */
1127 	srv_stats.dblwr_pages_written.inc();
1128 	srv_stats.dblwr_writes.inc();
1129 
1130 	mutex_exit(&buf_dblwr->mutex);
1131 
1132 	/* Lets see if we are going to write in the first or second
1133 	block of the doublewrite buffer. */
1134 	if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
1135 		offset = buf_dblwr->block1 + i;
1136 	} else {
1137 		offset = buf_dblwr->block2 + i
1138 			 - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
1139 	}
1140 
1141 	/* We deal with compressed and uncompressed pages a little
1142 	differently here. In case of uncompressed pages we can
1143 	directly write the block to the allocated slot in the
1144 	doublewrite buffer in the system tablespace and then after
1145 	syncing the system table space we can proceed to write the page
1146 	in the datafile.
1147 	In case of compressed page we first do a memcpy of the block
1148 	to the in-memory buffer of doublewrite before proceeding to
1149 	write it. This is so because we want to pad the remaining
1150 	bytes in the doublewrite page with zeros. */
1151 
1152 	zip_size = buf_page_get_zip_size(bpage);
1153 	if (zip_size) {
1154 		memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i,
1155 		       bpage->zip.data, zip_size);
1156 		memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i
1157 		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
1158 
1159 		fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
1160 		       offset, 0, UNIV_PAGE_SIZE,
1161 		       (void*) (buf_dblwr->write_buf
1162 				+ UNIV_PAGE_SIZE * i), NULL);
1163 	} else {
1164 		/* It is a regular page. Write it directly to the
1165 		doublewrite buffer */
1166 		fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
1167 		       offset, 0, UNIV_PAGE_SIZE,
1168 		       (void*) ((buf_block_t*) bpage)->frame,
1169 		       NULL);
1170 	}
1171 
1172 	/* Now flush the doublewrite buffer data to disk */
1173 	fil_flush(TRX_SYS_SPACE);
1174 
1175 	/* We know that the write has been flushed to disk now
1176 	and during recovery we will find it in the doublewrite buffer
1177 	blocks. Next do the write to the intended position. */
1178 	buf_dblwr_write_block_to_datafile(bpage, sync);
1179 }
1180 #endif /* !UNIV_HOTBACKUP */
1181