1 /*****************************************************************************
2
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file buf/buf0dblwr.cc
Doublewrite buffer module
30
31 Created 2011/12/19
32 *******************************************************/
33
34 #include "buf0dblwr.h"
35
36 #ifdef UNIV_NONINL
37 #include "buf0buf.ic"
38 #endif
39
40 #include "buf0buf.h"
41 #include "buf0checksum.h"
42 #include "srv0start.h"
43 #include "srv0srv.h"
44 #include "page0zip.h"
45 #include "trx0sys.h"
46
47 #ifndef UNIV_HOTBACKUP
48
#ifdef UNIV_PFS_MUTEX
/* Key to register the mutex with performance schema */
UNIV_INTERN mysql_pfs_key_t	buf_dblwr_mutex_key;
#endif /* UNIV_PFS_MUTEX */

/** The doublewrite buffer. NULL until buf_dblwr_init() allocates it, and
reset to NULL by buf_dblwr_free(). */
UNIV_INTERN buf_dblwr_t*	buf_dblwr = NULL;

/** Set to TRUE while buf_dblwr_create() is running, FALSE once it has
found an existing doublewrite buffer and initialized from it */
UNIV_INTERN ibool	buf_dblwr_being_created = FALSE;
59
60 /****************************************************************//**
61 Determines if a page number is located inside the doublewrite buffer.
62 @return TRUE if the location is inside the two blocks of the
63 doublewrite buffer */
64 UNIV_INTERN
65 ibool
buf_dblwr_page_inside(ulint page_no)66 buf_dblwr_page_inside(
67 /*==================*/
68 ulint page_no) /*!< in: page number */
69 {
70 if (buf_dblwr == NULL) {
71
72 return(FALSE);
73 }
74
75 if (page_no >= buf_dblwr->block1
76 && page_no < buf_dblwr->block1
77 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
78 return(TRUE);
79 }
80
81 if (page_no >= buf_dblwr->block2
82 && page_no < buf_dblwr->block2
83 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
84 return(TRUE);
85 }
86
87 return(FALSE);
88 }
89
90 /****************************************************************//**
91 Calls buf_page_get() on the TRX_SYS_PAGE and returns a pointer to the
92 doublewrite buffer within it.
93 @return pointer to the doublewrite buffer within the filespace header
94 page. */
95 UNIV_INLINE
96 byte*
buf_dblwr_get(mtr_t * mtr)97 buf_dblwr_get(
98 /*==========*/
99 mtr_t* mtr) /*!< in/out: MTR to hold the page latch */
100 {
101 buf_block_t* block;
102
103 block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
104 RW_X_LATCH, mtr);
105 buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
106
107 return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE);
108 }
109
/********************************************************************//**
Flush a batch of writes to the datafiles that have already been
written to the dblwr buffer on disk. The three steps run strictly in
order: wake the simulated aio handlers, wait until no writes are
pending, then flush the tablespace files. */
UNIV_INLINE
void
buf_dblwr_sync_datafiles()
/*======================*/
{
	/* Wake possible simulated aio thread to actually post the
	writes to the operating system */
	os_aio_simulated_wake_handler_threads();

	/* Wait until all async writes to tablespaces have been posted to
	the OS */
	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */
	fil_flush_file_spaces(FIL_TABLESPACE);
}
129
130 /****************************************************************//**
131 Creates or initialializes the doublewrite buffer at a database start. */
132 static
133 void
buf_dblwr_init(byte * doublewrite)134 buf_dblwr_init(
135 /*===========*/
136 byte* doublewrite) /*!< in: pointer to the doublewrite buf
137 header on trx sys page */
138 {
139 ulint buf_size;
140
141 buf_dblwr = static_cast<buf_dblwr_t*>(
142 mem_zalloc(sizeof(buf_dblwr_t)));
143
144 /* There are two blocks of same size in the doublewrite
145 buffer. */
146 buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
147
148 /* There must be atleast one buffer for single page writes
149 and one buffer for batch writes. */
150 ut_a(srv_doublewrite_batch_size > 0
151 && srv_doublewrite_batch_size < buf_size);
152
153 mutex_create(buf_dblwr_mutex_key,
154 &buf_dblwr->mutex, SYNC_DOUBLEWRITE);
155
156 buf_dblwr->b_event = os_event_create();
157 buf_dblwr->s_event = os_event_create();
158 buf_dblwr->first_free = 0;
159 buf_dblwr->s_reserved = 0;
160 buf_dblwr->b_reserved = 0;
161
162 buf_dblwr->block1 = mach_read_from_4(
163 doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
164 buf_dblwr->block2 = mach_read_from_4(
165 doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
166
167 buf_dblwr->in_use = static_cast<bool*>(
168 mem_zalloc(buf_size * sizeof(bool)));
169
170 buf_dblwr->write_buf_unaligned = static_cast<byte*>(
171 ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE));
172
173 buf_dblwr->write_buf = static_cast<byte*>(
174 ut_align(buf_dblwr->write_buf_unaligned,
175 UNIV_PAGE_SIZE));
176
177 buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
178 mem_zalloc(buf_size * sizeof(void*)));
179 }
180
181 /****************************************************************//**
182 Creates the doublewrite buffer to a new InnoDB installation. The header of the
183 doublewrite buffer is placed on the trx system header page. */
184 UNIV_INTERN
185 void
buf_dblwr_create(void)186 buf_dblwr_create(void)
187 /*==================*/
188 {
189 buf_block_t* block2;
190 buf_block_t* new_block;
191 byte* doublewrite;
192 byte* fseg_header;
193 ulint page_no;
194 ulint prev_page_no;
195 ulint i;
196 mtr_t mtr;
197
198 if (buf_dblwr) {
199 /* Already inited */
200
201 return;
202 }
203
204 start_again:
205 mtr_start(&mtr);
206 buf_dblwr_being_created = TRUE;
207
208 doublewrite = buf_dblwr_get(&mtr);
209
210 if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
211 == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
212 /* The doublewrite buffer has already been created:
213 just read in some numbers */
214
215 buf_dblwr_init(doublewrite);
216
217 mtr_commit(&mtr);
218 buf_dblwr_being_created = FALSE;
219 return;
220 }
221
222 ib_logf(IB_LOG_LEVEL_INFO,
223 "Doublewrite buffer not found: creating new");
224
225 if (buf_pool_get_curr_size()
226 < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
227 + FSP_EXTENT_SIZE / 2 + 100)
228 * UNIV_PAGE_SIZE)) {
229
230 ib_logf(IB_LOG_LEVEL_ERROR,
231 "Cannot create doublewrite buffer: you must "
232 "increase your buffer pool size. Cannot continue "
233 "operation.");
234
235 exit(EXIT_FAILURE);
236 }
237
238 block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
239 TRX_SYS_DOUBLEWRITE
240 + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
241
242 /* fseg_create acquires a second latch on the page,
243 therefore we must declare it: */
244
245 buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
246
247 if (block2 == NULL) {
248 ib_logf(IB_LOG_LEVEL_ERROR,
249 "Cannot create doublewrite buffer: you must "
250 "increase your tablespace size. "
251 "Cannot continue operation.");
252
253 /* We exit without committing the mtr to prevent
254 its modifications to the database getting to disk */
255
256 exit(EXIT_FAILURE);
257 }
258
259 fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
260 prev_page_no = 0;
261
262 for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
263 + FSP_EXTENT_SIZE / 2; i++) {
264 new_block = fseg_alloc_free_page(
265 fseg_header, prev_page_no + 1, FSP_UP, &mtr);
266 if (new_block == NULL) {
267 ib_logf(IB_LOG_LEVEL_ERROR,
268 "Cannot create doublewrite buffer: you must "
269 "increase your tablespace size. "
270 "Cannot continue operation.");
271
272 exit(EXIT_FAILURE);
273 }
274
275 /* We read the allocated pages to the buffer pool;
276 when they are written to disk in a flush, the space
277 id and page number fields are also written to the
278 pages. When we at database startup read pages
279 from the doublewrite buffer, we know that if the
280 space id and page number in them are the same as
281 the page position in the tablespace, then the page
282 has not been written to in doublewrite. */
283
284 ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
285 page_no = buf_block_get_page_no(new_block);
286
287 if (i == FSP_EXTENT_SIZE / 2) {
288 ut_a(page_no == FSP_EXTENT_SIZE);
289 mlog_write_ulint(doublewrite
290 + TRX_SYS_DOUBLEWRITE_BLOCK1,
291 page_no, MLOG_4BYTES, &mtr);
292 mlog_write_ulint(doublewrite
293 + TRX_SYS_DOUBLEWRITE_REPEAT
294 + TRX_SYS_DOUBLEWRITE_BLOCK1,
295 page_no, MLOG_4BYTES, &mtr);
296
297 } else if (i == FSP_EXTENT_SIZE / 2
298 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
299 ut_a(page_no == 2 * FSP_EXTENT_SIZE);
300 mlog_write_ulint(doublewrite
301 + TRX_SYS_DOUBLEWRITE_BLOCK2,
302 page_no, MLOG_4BYTES, &mtr);
303 mlog_write_ulint(doublewrite
304 + TRX_SYS_DOUBLEWRITE_REPEAT
305 + TRX_SYS_DOUBLEWRITE_BLOCK2,
306 page_no, MLOG_4BYTES, &mtr);
307
308 } else if (i > FSP_EXTENT_SIZE / 2) {
309 ut_a(page_no == prev_page_no + 1);
310 }
311
312 if (((i + 1) & 15) == 0) {
313 /* rw_locks can only be recursively x-locked
314 2048 times. (on 32 bit platforms,
315 (lint) 0 - (X_LOCK_DECR * 2049)
316 is no longer a negative number, and thus
317 lock_word becomes like a shared lock).
318 For 4k page size this loop will
319 lock the fseg header too many times. Since
320 this code is not done while any other threads
321 are active, restart the MTR occasionally. */
322 mtr_commit(&mtr);
323 mtr_start(&mtr);
324 doublewrite = buf_dblwr_get(&mtr);
325 fseg_header = doublewrite
326 + TRX_SYS_DOUBLEWRITE_FSEG;
327 }
328
329 prev_page_no = page_no;
330 }
331
332 mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
333 TRX_SYS_DOUBLEWRITE_MAGIC_N,
334 MLOG_4BYTES, &mtr);
335 mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
336 + TRX_SYS_DOUBLEWRITE_REPEAT,
337 TRX_SYS_DOUBLEWRITE_MAGIC_N,
338 MLOG_4BYTES, &mtr);
339
340 mlog_write_ulint(doublewrite
341 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
342 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
343 MLOG_4BYTES, &mtr);
344 mtr_commit(&mtr);
345
346 /* Flush the modified pages to disk and make a checkpoint */
347 log_make_checkpoint_at(LSN_MAX, TRUE);
348
349 /* Remove doublewrite pages from LRU */
350 buf_pool_invalidate();
351
352 ib_logf(IB_LOG_LEVEL_INFO, "Doublewrite buffer created");
353
354 goto start_again;
355 }
356
357 /****************************************************************//**
358 At a database startup initializes the doublewrite buffer memory structure if
359 we already have a doublewrite buffer created in the data files. If we are
360 upgrading to an InnoDB version which supports multiple tablespaces, then this
361 function performs the necessary update operations. If we are in a crash
362 recovery, this function loads the pages from double write buffer into memory. */
363 void
buf_dblwr_init_or_load_pages(pfs_os_file_t file,char * path,bool load_corrupt_pages)364 buf_dblwr_init_or_load_pages(
365 /*=========================*/
366 pfs_os_file_t file,
367 char* path,
368 bool load_corrupt_pages)
369 {
370 byte* buf;
371 byte* read_buf;
372 byte* unaligned_read_buf;
373 ulint block1;
374 ulint block2;
375 byte* page;
376 ibool reset_space_ids = FALSE;
377 byte* doublewrite;
378 ulint space_id;
379 ulint i;
380 ulint block_bytes = 0;
381 recv_dblwr_t& recv_dblwr = recv_sys->dblwr;
382
383 /* We do the file i/o past the buffer pool */
384
385 unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
386
387 read_buf = static_cast<byte*>(
388 ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
389
390 /* Read the trx sys header to check if we are using the doublewrite
391 buffer */
392 off_t trx_sys_page = TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE;
393 os_file_read(file, read_buf, trx_sys_page, UNIV_PAGE_SIZE);
394
395 doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
396
397 if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
398 == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
399 /* The doublewrite buffer has been created */
400
401 buf_dblwr_init(doublewrite);
402
403 block1 = buf_dblwr->block1;
404 block2 = buf_dblwr->block2;
405
406 buf = buf_dblwr->write_buf;
407 } else {
408 goto leave_func;
409 }
410
411 if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
412 != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
413
414 /* We are upgrading from a version < 4.1.x to a version where
415 multiple tablespaces are supported. We must reset the space id
416 field in the pages in the doublewrite buffer because starting
417 from this version the space id is stored to
418 FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
419
420 reset_space_ids = TRUE;
421
422 ib_logf(IB_LOG_LEVEL_INFO,
423 "Resetting space id's in the doublewrite buffer");
424 }
425
426 /* Read the pages from the doublewrite buffer to memory */
427
428 block_bytes = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
429
430 os_file_read(file, buf, block1 * UNIV_PAGE_SIZE, block_bytes);
431 os_file_read(file, buf + block_bytes, block2 * UNIV_PAGE_SIZE,
432 block_bytes);
433
434 /* Check if any of these pages is half-written in data files, in the
435 intended position */
436
437 page = buf;
438
439 for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
440
441 ulint source_page_no;
442
443 if (reset_space_ids) {
444
445 space_id = 0;
446 mach_write_to_4(page
447 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
448 /* We do not need to calculate new checksums for the
449 pages because the field .._SPACE_ID does not affect
450 them. Write the page back to where we read it from. */
451
452 if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
453 source_page_no = block1 + i;
454 } else {
455 source_page_no = block2
456 + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
457 }
458
459 os_file_write(path, file, page,
460 source_page_no * UNIV_PAGE_SIZE,
461 UNIV_PAGE_SIZE);
462
463 } else if (load_corrupt_pages) {
464
465 recv_dblwr.add(page);
466 }
467
468 page += UNIV_PAGE_SIZE;
469 }
470
471 if (reset_space_ids) {
472 os_file_flush(file);
473 }
474
475 leave_func:
476 ut_free(unaligned_read_buf);
477 }
478
479 /****************************************************************//**
480 Process the double write buffer pages. */
481 void
buf_dblwr_process()482 buf_dblwr_process()
483 /*===============*/
484 {
485 ulint space_id;
486 ulint page_no;
487 ulint page_no_dblwr = 0;
488 byte* page;
489 byte* read_buf;
490 byte* unaligned_read_buf;
491 recv_dblwr_t& recv_dblwr = recv_sys->dblwr;
492
493 unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
494
495 read_buf = static_cast<byte*>(
496 ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
497
498 for (std::list<byte*>::iterator i = recv_dblwr.pages.begin();
499 i != recv_dblwr.pages.end(); ++i, ++page_no_dblwr ) {
500
501 page = *i;
502 page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
503 space_id = mach_read_from_4(page + FIL_PAGE_SPACE_ID);
504
505 if (!fil_tablespace_exists_in_mem(space_id)) {
506 /* Maybe we have dropped the single-table tablespace
507 and this page once belonged to it: do nothing */
508
509 } else if (!fil_check_adress_in_tablespace(space_id,
510 page_no)) {
511 ib_logf(IB_LOG_LEVEL_WARN,
512 "A page in the doublewrite buffer is not "
513 "within space bounds; space id %lu "
514 "page number %lu, page %lu in "
515 "doublewrite buf.",
516 (ulong) space_id, (ulong) page_no,
517 page_no_dblwr);
518 } else {
519 ulint zip_size = fil_space_get_zip_size(space_id);
520
521 /* Read in the actual page from the file */
522 fil_io(OS_FILE_READ, true, space_id, zip_size,
523 page_no, 0,
524 zip_size ? zip_size : UNIV_PAGE_SIZE,
525 read_buf, NULL);
526
527 /* Check if the page is corrupt */
528
529 if (buf_page_is_corrupted(true, read_buf, zip_size)) {
530
531 fprintf(stderr,
532 "InnoDB: Database page"
533 " corruption or a failed\n"
534 "InnoDB: file read of"
535 " space %lu page %lu.\n"
536 "InnoDB: Trying to recover it from"
537 " the doublewrite buffer.\n",
538 (ulong) space_id, (ulong) page_no);
539
540 if (buf_page_is_corrupted(true,
541 page, zip_size)) {
542 fprintf(stderr,
543 "InnoDB: Dump of the page:\n");
544 buf_page_print(
545 read_buf, zip_size,
546 BUF_PAGE_PRINT_NO_CRASH);
547 fprintf(stderr,
548 "InnoDB: Dump of"
549 " corresponding page"
550 " in doublewrite buffer:\n");
551 buf_page_print(
552 page, zip_size,
553 BUF_PAGE_PRINT_NO_CRASH);
554
555 fprintf(stderr,
556 "InnoDB: Also the page in the"
557 " doublewrite buffer"
558 " is corrupt.\n"
559 "InnoDB: Cannot continue"
560 " operation.\n"
561 "InnoDB: You can try to"
562 " recover the database"
563 " with the my.cnf\n"
564 "InnoDB: option:\n"
565 "InnoDB:"
566 " innodb_force_recovery=6\n");
567 ut_error;
568 }
569
570 /* Write the good page from the
571 doublewrite buffer to the intended
572 position */
573
574 fil_io(OS_FILE_WRITE, true, space_id,
575 zip_size, page_no, 0,
576 zip_size ? zip_size : UNIV_PAGE_SIZE,
577 page, NULL);
578
579 ib_logf(IB_LOG_LEVEL_INFO,
580 "Recovered the page from"
581 " the doublewrite buffer.");
582
583 } else if (buf_page_is_zeroes(read_buf, zip_size)) {
584
585 if (!buf_page_is_zeroes(page, zip_size)
586 && !buf_page_is_corrupted(true, page,
587 zip_size)) {
588
589 /* Database page contained only
590 zeroes, while a valid copy is
591 available in dblwr buffer. */
592
593 fil_io(OS_FILE_WRITE, true, space_id,
594 zip_size, page_no, 0,
595 zip_size ? zip_size
596 : UNIV_PAGE_SIZE,
597 page, NULL);
598 }
599 }
600 }
601 }
602
603 fil_flush_file_spaces(FIL_TABLESPACE);
604 ut_free(unaligned_read_buf);
605 }
606
607 /****************************************************************//**
608 Frees doublewrite buffer. */
609 UNIV_INTERN
610 void
buf_dblwr_free(void)611 buf_dblwr_free(void)
612 /*================*/
613 {
614 /* Free the double write data structures. */
615 ut_a(buf_dblwr != NULL);
616 ut_ad(buf_dblwr->s_reserved == 0);
617 ut_ad(buf_dblwr->b_reserved == 0);
618
619 os_event_free(buf_dblwr->b_event);
620 os_event_free(buf_dblwr->s_event);
621 ut_free(buf_dblwr->write_buf_unaligned);
622 buf_dblwr->write_buf_unaligned = NULL;
623
624 mem_free(buf_dblwr->buf_block_arr);
625 buf_dblwr->buf_block_arr = NULL;
626
627 mem_free(buf_dblwr->in_use);
628 buf_dblwr->in_use = NULL;
629
630 mutex_free(&buf_dblwr->mutex);
631 mem_free(buf_dblwr);
632 buf_dblwr = NULL;
633 }
634
635 /********************************************************************//**
636 Updates the doublewrite buffer when an IO request is completed. */
637 UNIV_INTERN
638 void
buf_dblwr_update(const buf_page_t * bpage,buf_flush_t flush_type)639 buf_dblwr_update(
640 /*=============*/
641 const buf_page_t* bpage, /*!< in: buffer block descriptor */
642 buf_flush_t flush_type)/*!< in: flush type */
643 {
644 if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
645 return;
646 }
647
648 switch (flush_type) {
649 case BUF_FLUSH_LIST:
650 case BUF_FLUSH_LRU:
651 mutex_enter(&buf_dblwr->mutex);
652
653 ut_ad(buf_dblwr->batch_running);
654 ut_ad(buf_dblwr->b_reserved > 0);
655 ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free);
656
657 buf_dblwr->b_reserved--;
658
659 if (buf_dblwr->b_reserved == 0) {
660 mutex_exit(&buf_dblwr->mutex);
661 /* This will finish the batch. Sync data files
662 to the disk. */
663 fil_flush_file_spaces(FIL_TABLESPACE);
664 mutex_enter(&buf_dblwr->mutex);
665
666 /* We can now reuse the doublewrite memory buffer: */
667 buf_dblwr->first_free = 0;
668 buf_dblwr->batch_running = false;
669 os_event_set(buf_dblwr->b_event);
670 }
671
672 mutex_exit(&buf_dblwr->mutex);
673 break;
674 case BUF_FLUSH_SINGLE_PAGE:
675 {
676 const ulint size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
677 ulint i;
678 mutex_enter(&buf_dblwr->mutex);
679 for (i = srv_doublewrite_batch_size; i < size; ++i) {
680 if (buf_dblwr->buf_block_arr[i] == bpage) {
681 buf_dblwr->s_reserved--;
682 buf_dblwr->buf_block_arr[i] = NULL;
683 buf_dblwr->in_use[i] = false;
684 break;
685 }
686 }
687
688 /* The block we are looking for must exist as a
689 reserved block. */
690 ut_a(i < size);
691 }
692 os_event_set(buf_dblwr->s_event);
693 mutex_exit(&buf_dblwr->mutex);
694 break;
695 case BUF_FLUSH_N_TYPES:
696 ut_error;
697 }
698 }
699
700 /********************************************************************//**
701 Check the LSN values on the page. */
702 static
703 void
buf_dblwr_check_page_lsn(const page_t * page)704 buf_dblwr_check_page_lsn(
705 /*=====================*/
706 const page_t* page) /*!< in: page to check */
707 {
708 if (memcmp(page + (FIL_PAGE_LSN + 4),
709 page + (UNIV_PAGE_SIZE
710 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
711 4)) {
712
713 ut_print_timestamp(stderr);
714 fprintf(stderr,
715 " InnoDB: ERROR: The page to be written"
716 " seems corrupt!\n"
717 "InnoDB: The low 4 bytes of LSN fields do not match "
718 "(" ULINTPF " != " ULINTPF ")!"
719 " Noticed in the buffer pool.\n",
720 mach_read_from_4(
721 page + FIL_PAGE_LSN + 4),
722 mach_read_from_4(
723 page + UNIV_PAGE_SIZE
724 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4));
725 }
726 }
727
728 /********************************************************************//**
729 Asserts when a corrupt block is find during writing out data to the
730 disk. */
731 static
732 void
buf_dblwr_assert_on_corrupt_block(const buf_block_t * block)733 buf_dblwr_assert_on_corrupt_block(
734 /*==============================*/
735 const buf_block_t* block) /*!< in: block to check */
736 {
737 buf_page_print(block->frame, 0, BUF_PAGE_PRINT_NO_CRASH);
738
739 ut_print_timestamp(stderr);
740 fprintf(stderr,
741 " InnoDB: Apparent corruption of an"
742 " index page n:o %lu in space %lu\n"
743 "InnoDB: to be written to data file."
744 " We intentionally crash server\n"
745 "InnoDB: to prevent corrupt data"
746 " from ending up in data\n"
747 "InnoDB: files.\n",
748 (ulong) buf_block_get_page_no(block),
749 (ulong) buf_block_get_space(block));
750
751 ut_error;
752 }
753
754 /********************************************************************//**
755 Check the LSN values on the page with which this block is associated.
756 Also validate the page if the option is set. */
757 static
758 void
buf_dblwr_check_block(const buf_block_t * block)759 buf_dblwr_check_block(
760 /*==================*/
761 const buf_block_t* block) /*!< in: block to check */
762 {
763 if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
764 || block->page.zip.data) {
765 /* No simple validate for compressed pages exists. */
766 return;
767 }
768
769 buf_dblwr_check_page_lsn(block->frame);
770
771 if (!block->check_index_page_at_flush) {
772 return;
773 }
774
775 if (page_is_comp(block->frame)) {
776 if (!page_simple_validate_new(block->frame)) {
777 buf_dblwr_assert_on_corrupt_block(block);
778 }
779 } else if (!page_simple_validate_old(block->frame)) {
780
781 buf_dblwr_assert_on_corrupt_block(block);
782 }
783 }
784
785 /********************************************************************//**
786 Writes a page that has already been written to the doublewrite buffer
787 to the datafile. It is the job of the caller to sync the datafile. */
788 static
789 void
buf_dblwr_write_block_to_datafile(const buf_page_t * bpage,bool sync)790 buf_dblwr_write_block_to_datafile(
791 /*==============================*/
792 const buf_page_t* bpage, /*!< in: page to write */
793 bool sync) /*!< in: true if sync IO
794 is requested */
795 {
796 ut_a(bpage);
797 ut_a(buf_page_in_file(bpage));
798
799 const ulint flags = sync
800 ? OS_FILE_WRITE
801 : OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER;
802
803 if (bpage->zip.data) {
804 fil_io(flags, sync, buf_page_get_space(bpage),
805 buf_page_get_zip_size(bpage),
806 buf_page_get_page_no(bpage), 0,
807 buf_page_get_zip_size(bpage),
808 (void*) bpage->zip.data,
809 (void*) bpage);
810
811 return;
812 }
813
814
815 const buf_block_t* block = (buf_block_t*) bpage;
816 ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
817 buf_dblwr_check_page_lsn(block->frame);
818
819 fil_io(flags, sync, buf_block_get_space(block), 0,
820 buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
821 (void*) block->frame, (void*) block);
822
823 }
824
/********************************************************************//**
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur.

Sequence: (1) if doublewrite is disabled or not created, just sync the
datafiles and return; (2) under the mutex, return if nothing is buffered,
or wait on b_event and retry if a batch is already running; (3) mark the
batch as running and release the mutex; (4) check the buffered pages;
(5) write block 1 and, if needed, block 2 of the doublewrite buffer and
flush the system tablespace; (6) post the writes of the pages to their
real positions and wake the simulated aio handlers. */
UNIV_INTERN
void
buf_dblwr_flush_buffered_writes(void)
/*=================================*/
{
	byte*		write_buf;
	ulint		first_free;	/* snapshot of buf_dblwr->first_free
					taken under the mutex */
	ulint		len;

	if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
		/* Sync the writes to the disk. */
		buf_dblwr_sync_datafiles();
		return;
	}

try_again:
	mutex_enter(&buf_dblwr->mutex);

	/* Write first to doublewrite buffer blocks. We use synchronous
	aio and thus know that file write has been completed when the
	control returns. */

	if (buf_dblwr->first_free == 0) {
		/* Nothing has been posted to the batch area. */

		mutex_exit(&buf_dblwr->mutex);

		return;
	}

	if (buf_dblwr->batch_running) {
		/* Another thread is running the batch right now. Wait
		for it to finish. The event is reset while the mutex is
		still held, and the returned signal count is passed to
		the wait. */
		ib_int64_t	sig_count = os_event_reset(buf_dblwr->b_event);
		mutex_exit(&buf_dblwr->mutex);

		os_event_wait_low(buf_dblwr->b_event, sig_count);
		goto try_again;
	}

	ut_a(!buf_dblwr->batch_running);
	ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);

	/* Disallow anyone else to post to doublewrite buffer or to
	start another batch of flushing. */
	buf_dblwr->batch_running = true;
	first_free = buf_dblwr->first_free;

	/* Now safe to release the mutex. Note that though no other
	thread is allowed to post to the doublewrite batch flushing
	but any threads working on single page flushes are allowed
	to proceed. */
	mutex_exit(&buf_dblwr->mutex);

	write_buf = buf_dblwr->write_buf;

	/* len2 is the byte offset within write_buf of the copy that
	belongs to slot i. */
	for (ulint len2 = 0, i = 0;
	     i < buf_dblwr->first_free;
	     len2 += UNIV_PAGE_SIZE, i++) {

		const buf_block_t*	block;

		block = (buf_block_t*) buf_dblwr->buf_block_arr[i];

		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
		    || block->page.zip.data) {
			/* No simple validate for compressed
			pages exists. */
			continue;
		}

		/* Check that the actual page in the buffer pool is
		not corrupt and the LSN values are sane. */
		buf_dblwr_check_block(block);

		/* Check that the page as written to the doublewrite
		buffer has sane LSN values. */
		buf_dblwr_check_page_lsn(write_buf + len2);
	}

	/* Write out the first block of the doublewrite buffer */
	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
		     buf_dblwr->first_free) * UNIV_PAGE_SIZE;

	fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
	       buf_dblwr->block1, 0, len,
	       (void*) write_buf, NULL);

	if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		/* No unwritten pages in the second block. */
		goto flush;
	}

	/* Write out the second block of the doublewrite buffer. */
	len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
	       * UNIV_PAGE_SIZE;

	write_buf = buf_dblwr->write_buf
		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;

	fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
	       buf_dblwr->block2, 0, len,
	       (void*) write_buf, NULL);

flush:
	/* increment the doublewrite flushed pages counter */
	srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
	srv_stats.dblwr_writes.inc();

	/* Now flush the doublewrite buffer data to disk */
	fil_flush(TRX_SYS_SPACE);

	/* We know that the writes have been flushed to disk now
	and in recovery we will find them in the doublewrite buffer
	blocks. Next do the writes to the intended positions. */

	/* Up to this point first_free and buf_dblwr->first_free are
	same because we have set the buf_dblwr->batch_running flag
	disallowing any other thread to post any request but we
	can't safely access buf_dblwr->first_free in the loop below.
	This is so because it is possible that after we are done with
	the last iteration and before we terminate the loop, the batch
	gets finished in the IO helper thread and another thread posts
	a new batch setting buf_dblwr->first_free to a higher value.
	If this happens and we are using buf_dblwr->first_free in the
	loop termination condition then we'll end up dispatching
	the same block twice from two different threads. */
	ut_ad(first_free == buf_dblwr->first_free);
	for (ulint i = 0; i < first_free; i++) {
		/* sync == false: the write is only posted here. */
		buf_dblwr_write_block_to_datafile(
			buf_dblwr->buf_block_arr[i], false);
	}

	/* Wake possible simulated aio thread to actually post the
	writes to the operating system. We don't flush the files
	at this point. We leave it to the IO helper thread to flush
	datafiles when the whole batch has been processed. */
	os_aio_simulated_wake_handler_threads();
}
969
/********************************************************************//**
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_dblwr_flush_buffered_writes and waits for free
space to appear. If a batch is currently running, waits on b_event and
retries. If adding this page fills the batch area, the buffered writes
are flushed before returning. */
UNIV_INTERN
void
buf_dblwr_add_to_batch(
/*====================*/
	buf_page_t*	bpage)	/*!< in: buffer block to write */
{
	ulint	zip_size;

	ut_a(buf_page_in_file(bpage));
	ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex));

try_again:
	mutex_enter(&buf_dblwr->mutex);

	ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size);

	if (buf_dblwr->batch_running) {

		/* This not nearly as bad as it looks. There is only
		page_cleaner thread which does background flushing
		in batches therefore it is unlikely to be a contention
		point. The only exception is when a user thread is
		forced to do a flush batch because of a sync
		checkpoint. */
		ib_int64_t	sig_count = os_event_reset(buf_dblwr->b_event);
		mutex_exit(&buf_dblwr->mutex);

		os_event_wait_low(buf_dblwr->b_event, sig_count);
		goto try_again;
	}

	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
		/* The batch area is full: flush it with the mutex
		released, then start over. */
		mutex_exit(&(buf_dblwr->mutex));

		buf_dblwr_flush_buffered_writes();

		goto try_again;
	}

	zip_size = buf_page_get_zip_size(bpage);

	/* Copy the page into slot first_free of the write buffer; each
	slot is UNIV_PAGE_SIZE bytes. */
	if (zip_size) {
		UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
		/* Copy the compressed page and clear the rest. */
		memcpy(buf_dblwr->write_buf
		       + UNIV_PAGE_SIZE * buf_dblwr->first_free,
		       bpage->zip.data, zip_size);
		memset(buf_dblwr->write_buf
		       + UNIV_PAGE_SIZE * buf_dblwr->first_free
		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
		UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
				   UNIV_PAGE_SIZE);

		memcpy(buf_dblwr->write_buf
		       + UNIV_PAGE_SIZE * buf_dblwr->first_free,
		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
	}

	/* Remember which page descriptor occupies the slot. */
	buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage;

	buf_dblwr->first_free++;
	buf_dblwr->b_reserved++;

	ut_ad(!buf_dblwr->batch_running);
	ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
	ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size);

	if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
		/* This page filled the batch area: flush right away. */
		mutex_exit(&(buf_dblwr->mutex));

		buf_dblwr_flush_buffered_writes();

		return;
	}

	mutex_exit(&(buf_dblwr->mutex));
}
1053
1054 /********************************************************************//**
1055 Writes a page to the doublewrite buffer on disk, sync it, then write
1056 the page to the datafile and sync the datafile. This function is used
1057 for single page flushes. If all the buffers allocated for single page
1058 flushes in the doublewrite buffer are in use we wait here for one to
1059 become free. We are guaranteed that a slot will become free because any
1060 thread that is using a slot must also release the slot before leaving
1061 this function. */
1062 UNIV_INTERN
1063 void
buf_dblwr_write_single_page(buf_page_t * bpage,bool sync)1064 buf_dblwr_write_single_page(
1065 /*========================*/
1066 buf_page_t* bpage, /*!< in: buffer block to write */
1067 bool sync) /*!< in: true if sync IO requested */
1068 {
1069 ulint n_slots;
1070 ulint size;
1071 ulint zip_size;
1072 ulint offset;
1073 ulint i;
1074
1075 ut_a(buf_page_in_file(bpage));
1076 ut_a(srv_use_doublewrite_buf);
1077 ut_a(buf_dblwr != NULL);
1078
1079 /* total number of slots available for single page flushes
1080 starts from srv_doublewrite_batch_size to the end of the
1081 buffer. */
1082 size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
1083 ut_a(size > srv_doublewrite_batch_size);
1084 n_slots = size - srv_doublewrite_batch_size;
1085
1086 if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
1087
1088 /* Check that the actual page in the buffer pool is
1089 not corrupt and the LSN values are sane. */
1090 buf_dblwr_check_block((buf_block_t*) bpage);
1091
1092 /* Check that the page as written to the doublewrite
1093 buffer has sane LSN values. */
1094 if (!bpage->zip.data) {
1095 buf_dblwr_check_page_lsn(
1096 ((buf_block_t*) bpage)->frame);
1097 }
1098 }
1099
1100 retry:
1101 mutex_enter(&buf_dblwr->mutex);
1102 if (buf_dblwr->s_reserved == n_slots) {
1103
1104 /* All slots are reserved. */
1105 ib_int64_t sig_count =
1106 os_event_reset(buf_dblwr->s_event);
1107 mutex_exit(&buf_dblwr->mutex);
1108 os_event_wait_low(buf_dblwr->s_event, sig_count);
1109
1110 goto retry;
1111 }
1112
1113 for (i = srv_doublewrite_batch_size; i < size; ++i) {
1114
1115 if (!buf_dblwr->in_use[i]) {
1116 break;
1117 }
1118 }
1119
1120 /* We are guaranteed to find a slot. */
1121 ut_a(i < size);
1122 buf_dblwr->in_use[i] = true;
1123 buf_dblwr->s_reserved++;
1124 buf_dblwr->buf_block_arr[i] = bpage;
1125
1126 /* increment the doublewrite flushed pages counter */
1127 srv_stats.dblwr_pages_written.inc();
1128 srv_stats.dblwr_writes.inc();
1129
1130 mutex_exit(&buf_dblwr->mutex);
1131
1132 /* Lets see if we are going to write in the first or second
1133 block of the doublewrite buffer. */
1134 if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
1135 offset = buf_dblwr->block1 + i;
1136 } else {
1137 offset = buf_dblwr->block2 + i
1138 - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
1139 }
1140
1141 /* We deal with compressed and uncompressed pages a little
1142 differently here. In case of uncompressed pages we can
1143 directly write the block to the allocated slot in the
1144 doublewrite buffer in the system tablespace and then after
1145 syncing the system table space we can proceed to write the page
1146 in the datafile.
1147 In case of compressed page we first do a memcpy of the block
1148 to the in-memory buffer of doublewrite before proceeding to
1149 write it. This is so because we want to pad the remaining
1150 bytes in the doublewrite page with zeros. */
1151
1152 zip_size = buf_page_get_zip_size(bpage);
1153 if (zip_size) {
1154 memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i,
1155 bpage->zip.data, zip_size);
1156 memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i
1157 + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
1158
1159 fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
1160 offset, 0, UNIV_PAGE_SIZE,
1161 (void*) (buf_dblwr->write_buf
1162 + UNIV_PAGE_SIZE * i), NULL);
1163 } else {
1164 /* It is a regular page. Write it directly to the
1165 doublewrite buffer */
1166 fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
1167 offset, 0, UNIV_PAGE_SIZE,
1168 (void*) ((buf_block_t*) bpage)->frame,
1169 NULL);
1170 }
1171
1172 /* Now flush the doublewrite buffer data to disk */
1173 fil_flush(TRX_SYS_SPACE);
1174
1175 /* We know that the write has been flushed to disk now
1176 and during recovery we will find it in the doublewrite buffer
1177 blocks. Next do the write to the intended position. */
1178 buf_dblwr_write_block_to_datafile(bpage, sync);
1179 }
1180 #endif /* !UNIV_HOTBACKUP */
1181