1 /*****************************************************************************
2
3 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2008, Google Inc.
5
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License, version 2.0,
14 as published by the Free Software Foundation.
15
16 This program is also distributed with certain software (including
17 but not limited to OpenSSL) that is licensed under separate terms,
18 as designated in a particular file or component or in included license
19 documentation. The authors of MySQL hereby grant you an additional
20 permission to link the program and your derivative works with the
21 separately licensed software that they have included with MySQL.
22
23 This program is distributed in the hope that it will be useful,
24 but WITHOUT ANY WARRANTY; without even the implied warranty of
25 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 GNU General Public License, version 2.0, for more details.
27
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
31
32 *****************************************************************************/
33
34 /**************************************************//**
35 @file buf/buf0buf.cc
36 The database buffer buf_pool
37
38 Created 11/5/1995 Heikki Tuuri
39 *******************************************************/
40
41 #include "buf0buf.h"
42
43 #ifdef UNIV_NONINL
44 #include "buf0buf.ic"
45 #endif
46
47 #include "mem0mem.h"
48 #include "btr0btr.h"
49 #include "fil0fil.h"
50 #ifndef UNIV_HOTBACKUP
51 #include "buf0buddy.h"
52 #include "lock0lock.h"
53 #include "btr0sea.h"
54 #include "ibuf0ibuf.h"
55 #include "trx0undo.h"
56 #include "log0log.h"
57 #endif /* !UNIV_HOTBACKUP */
58 #include "srv0srv.h"
59 #include "dict0dict.h"
60 #include "log0recv.h"
61 #include "page0zip.h"
62 #include "srv0mon.h"
63 #include "buf0checksum.h"
64 #ifdef HAVE_LIBNUMA
65 #include <numa.h>
66 #include <numaif.h>
67 #endif // HAVE_LIBNUMA
68
69 /*
70 IMPLEMENTATION OF THE BUFFER POOL
71 =================================
72
73 Performance improvement:
74 ------------------------
75 Thread scheduling in NT may be so slow that the OS wait mechanism should
76 not be used even in waiting for disk reads to complete.
77 Rather, we should put waiting query threads to the queue of
78 waiting jobs, and let the OS thread do something useful while the i/o
79 is processed. In this way we could remove most OS thread switches in
80 an i/o-intensive benchmark like TPC-C.
81
82 A possibility is to put a user space thread library between the database
83 and NT. User space thread libraries might be very fast.
84
85 SQL Server 7.0 can be configured to use 'fibers' which are lightweight
86 threads in NT. These should be studied.
87
88 Buffer frames and blocks
89 ------------------------
90 Following the terminology of Gray and Reuter, we call the memory
91 blocks where file pages are loaded buffer frames. For each buffer
92 frame there is a control block, or shortly, a block, in the buffer
93 control array. The control info which does not need to be stored
94 in the file along with the file page resides in the control block.
95
96 Buffer pool struct
97 ------------------
98 The buffer buf_pool contains a single mutex which protects all the
99 control data structures of the buf_pool. The content of a buffer frame is
100 protected by a separate read-write lock in its control block, though.
101 These locks can be locked and unlocked without owning the buf_pool->mutex.
102 The OS events in the buf_pool struct can be waited for without owning the
103 buf_pool->mutex.
104
105 The buf_pool->mutex is a hot-spot in main memory, causing a lot of
106 memory bus traffic on multiprocessor systems when processors
107 alternately access the mutex. On our Pentium, the mutex is accessed
108 maybe every 10 microseconds. We gave up the solution to have mutexes
109 for each control block, for instance, because it seemed to be
110 complicated.
111
112 A solution to reduce mutex contention of the buf_pool->mutex is to
113 create a separate mutex for the page hash table. On Pentium,
114 accessing the hash table takes 2 microseconds, about half
115 of the total buf_pool->mutex hold time.
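
A minimal sketch of this locking convention, using the wrappers that
appear later in this file (buf_pool_mutex_enter()/buf_pool_mutex_exit()
for the control data, the per-block rw-lock for the frame contents):

	buf_pool_mutex_enter(buf_pool);
	... manipulate buf_pool->LRU, buf_pool->free and other control data ...
	buf_pool_mutex_exit(buf_pool);

	rw_lock_s_lock(&block->lock);
	... read the contents of the buffer frame ...
	rw_lock_s_unlock(&block->lock);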
116
117 Control blocks
118 --------------
119
120 The control block contains, for instance, the bufferfix count
121 which is incremented when a thread wants a file page to be fixed
122 in a buffer frame. The bufferfix operation does not lock the
123 contents of the frame, however. For this purpose, the control
124 block contains a read-write lock.
125
126 The buffer frames have to be aligned so that the start memory
127 address of a frame is divisible by the universal page size, which
128 is a power of two.
129
130 We intend to make the buffer buf_pool size on-line reconfigurable,
131 that is, the buf_pool size can be changed without closing the database.
132 Then the database administrator may adjust it to be bigger
133 at night, for example. The control block array must
134 contain enough control blocks for the maximum buffer buf_pool size
135 which is used in the particular database.
136 If the buf_pool size is cut, we exploit the virtual memory mechanism of
137 the OS, and just refrain from using frames at high addresses. Then the OS
138 can swap them to disk.
139
140 The control blocks containing file pages are put to a hash table
141 according to the file address of the page.
142 We could speed up the access to an individual page by using
143 "pointer swizzling": we could replace the page references on
144 non-leaf index pages by direct pointers to the page, if it exists
145 in the buf_pool. We could make a separate hash table where we could
146 chain all the page references in non-leaf pages residing in the buf_pool,
147 using the page reference as the hash key,
148 and at the time of reading of a page update the pointers accordingly.
149 Drawbacks of this solution are added complexity and,
150 possibly, extra space required on non-leaf pages for memory pointers.
151 A simpler solution is just to speed up the hash table mechanism
152 in the database, using tables whose size is a power of 2.
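
For example, a lookup by (space id, page number) under this scheme is
roughly the following; buf_page_address_fold() and
buf_page_hash_get_low() are the helpers used later in this file, for
instance in buf_relocate():

	fold  = buf_page_address_fold(space, offset);
	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);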
153
154 Lists of blocks
155 ---------------
156
157 There are several lists of control blocks.
158
159 The free list (buf_pool->free) contains blocks which are currently not
160 used.
161
162 The common LRU list contains all the blocks holding a file page
163 except those for which the bufferfix count is non-zero.
164 The pages are in the LRU list roughly in the order of the last
165 access to the page, so that the oldest pages are at the end of the
166 list. We also keep a pointer to near the end of the LRU list,
167 which we can use when we want to artificially age a page in the
168 buf_pool. This is used if we know that some page is not needed
169 again for some time: we insert the block right after the pointer,
170 causing it to be replaced sooner than would normally be the case.
171 Currently this aging mechanism is used by the read-ahead mechanism
172 of pages, and it can also be used when there is a scan of a full
173 table which cannot fit in the memory. Putting the pages near the
174 end of the LRU list, we make sure that most of the buf_pool stays
175 in the main memory, undisturbed.
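
As a sketch, a page that should age out quickly is inserted "old", near
the LRU_old pointer instead of at the head of the list; the second
argument of buf_LRU_add_block() (defined in buf0lru.cc) requests this
artificial aging:

	buf_LRU_add_block(bpage, TRUE);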
176
177 The unzip_LRU list contains a subset of the common LRU list. The
178 blocks on the unzip_LRU list hold a compressed file page and the
179 corresponding uncompressed page frame. A block is in unzip_LRU if and
180 only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
181 holds. The blocks in unzip_LRU will be in same order as they are in
182 the common LRU list. That is, each manipulation of the common LRU
183 list will result in the same manipulation of the unzip_LRU list.
184
185 The chain of modified blocks (buf_pool->flush_list) contains the blocks
186 holding file pages that have been modified in the memory
187 but not written to disk yet. The block with the oldest modification
188 which has not yet been written to disk is at the end of the chain.
189 The access to this list is protected by buf_pool->flush_list_mutex.
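
A sketch of the convention for reading this list; this is essentially
what buf_pool_get_oldest_modification() below does for each buffer pool
instance:

	buf_flush_list_mutex_enter(buf_pool);
	bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
	... bpage, if not NULL, carries the oldest unflushed modification ...
	buf_flush_list_mutex_exit(buf_pool);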
190
191 The chain of unmodified compressed blocks (buf_pool->zip_clean)
192 contains the control blocks (buf_page_t) of those compressed pages
193 that are not in buf_pool->flush_list and for which no uncompressed
194 page has been allocated in the buffer pool. The control blocks for
195 uncompressed pages are accessible via buf_block_t objects that are
196 reachable via buf_pool->chunks[].
197
198 The chains of free memory blocks (buf_pool->zip_free[]) are used by
199 the buddy allocator (buf0buddy.cc) to keep track of currently unused
200 memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These
201 blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
202 BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
203 pool. The buddy allocator is solely used for allocating control
204 blocks for compressed pages (buf_page_t) and compressed page frames.
205
206 Loading a file page
207 -------------------
208
209 First, a victim block for replacement has to be found in the
210 buf_pool. It is taken from the free list or searched for from the
211 end of the LRU-list. An exclusive lock is reserved for the frame,
212 the io_fix field is set in the block fixing the block in buf_pool,
213 and the io-operation for loading the page is queued. The io-handler thread
214 releases the X-lock on the frame and resets the io_fix field
215 when the io operation completes.
216
217 A thread may request the above operation using the function
218 buf_page_get(). It may then continue to request a lock on the frame.
219 The lock is granted when the io-handler releases the x-lock.
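
A typical caller looks roughly like this (sketch only; the latch mode
and error handling depend on the caller):

	mtr_t	mtr;

	mtr_start(&mtr);
	block = buf_page_get(space, zip_size, offset, RW_S_LATCH, &mtr);
	... access block->frame while holding the S-latch ...
	mtr_commit(&mtr);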
220
221 Read-ahead
222 ----------
223
224 The read-ahead mechanism is intended to be intelligent and
225 isolated from the semantically higher levels of the database
226 index management. From the higher level we only need the
227 information if a file page has a natural successor or
228 predecessor page. On the leaf level of a B-tree index,
229 these are the next and previous pages in the natural
230 order of the pages.
231
232 Let us first explain the read-ahead mechanism when the leaves
233 of a B-tree are scanned in an ascending or descending order.
234 When a page is referenced in the buf_pool for the first time,
235 the buffer manager checks if it is at the border of a so-called
236 linear read-ahead area. The tablespace is divided into these
237 areas of size 64 blocks, for example. So if the page is at the
238 border of such an area, the read-ahead mechanism checks if
239 all the other blocks in the area have been accessed in an
240 ascending or descending order. If this is the case, the system
241 looks at the natural successor or predecessor of the page,
242 checks if that is at the border of another area, and in this case
243 issues read-requests for all the pages in that area. Maybe
244 we could relax the condition that all the pages in the area
245 have to be accessed: if data is deleted from a table, there may
246 appear holes of unused pages in the area.
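
As a sketch, with an area of 64 pages the border test amounts to the
following; the actual logic lives in buf_read_ahead_linear() in
buf0rea.cc:

	low  = (offset / 64) * 64;
	high = low + 64;

	if (offset == low || offset == high - 1) {
		... the page is at an area border, consider read-ahead ...
	}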
247
248 A different read-ahead mechanism is used when there appears
249 to be a random access pattern to a file.
250 If a new page is referenced in the buf_pool, and several pages
251 of its random access area (for instance, 32 consecutive pages
252 in a tablespace) have recently been referenced, we may predict
253 that the whole area may be needed in the near future, and issue
254 the read requests for the whole area.
255 */
256
257 #ifndef UNIV_HOTBACKUP
258 /** Value in microseconds */
259 static const int WAIT_FOR_READ = 100;
260 /** Number of attempts made to read in a page in the buffer pool */
261 static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
262
263 /** The buffer pools of the database */
264 UNIV_INTERN buf_pool_t* buf_pool_ptr;
265
266 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
267 static ulint buf_dbg_counter = 0; /*!< This is used to insert validation
268 operations in execution in the
269 debug version */
270 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
271 #ifdef UNIV_DEBUG
272 /** If this is set TRUE, the program prints info whenever
273 read-ahead or flush occurs */
274 UNIV_INTERN ibool buf_debug_prints = FALSE;
275 #endif /* UNIV_DEBUG */
276
277 #ifdef UNIV_PFS_RWLOCK
278 /* Keys to register buffer block related rwlocks and mutexes with
279 performance schema */
280 UNIV_INTERN mysql_pfs_key_t buf_block_lock_key;
281 # ifdef UNIV_SYNC_DEBUG
282 UNIV_INTERN mysql_pfs_key_t buf_block_debug_latch_key;
283 # endif /* UNIV_SYNC_DEBUG */
284 #endif /* UNIV_PFS_RWLOCK */
285
286 #ifdef UNIV_PFS_MUTEX
287 UNIV_INTERN mysql_pfs_key_t buffer_block_mutex_key;
288 UNIV_INTERN mysql_pfs_key_t buf_pool_mutex_key;
289 UNIV_INTERN mysql_pfs_key_t buf_pool_zip_mutex_key;
290 UNIV_INTERN mysql_pfs_key_t flush_list_mutex_key;
291 #endif /* UNIV_PFS_MUTEX */
292
293 #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
294 # ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
295
296 /* Buffer block mutexes and rwlocks can be registered
297 in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
298 is defined, register buffer block mutex and rwlock
299 in one group after their initialization. */
300 # define PFS_GROUP_BUFFER_SYNC
301
302 /* This define caps the number of mutexes/rwlocks that can
303 be registered with performance schema. Developers can
304 modify this define if necessary. Please note, this would
305 be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
306 # define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER ULINT_MAX
307
308 # endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
309 #endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
310
311 /** Macro to determine whether the read or write counter is used depending
312 on the io_type */
313 #define MONITOR_RW_COUNTER(io_type, counter) \
314 ((io_type == BUF_IO_READ) \
315 ? (counter##_READ) \
316 : (counter##_WRITTEN))
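
/* Usage sketch: the counter passed to this macro must have _READ and
_WRITTEN variants defined in srv0mon.h. MONITOR_SOME_PAGE below is a
placeholder, not a real counter name:

	MONITOR_INC(MONITOR_RW_COUNTER(io_type, MONITOR_SOME_PAGE));
*/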
317
318 /********************************************************************//**
319 Gets the smallest oldest_modification lsn for any page in the pool. Returns
320 zero if all modified pages have been flushed to disk.
321 @return oldest modification in pool, zero if none */
322 UNIV_INTERN
323 lsn_t
324 buf_pool_get_oldest_modification(void)
325 /*==================================*/
326 {
327 ulint i;
328 buf_page_t* bpage;
329 lsn_t lsn = 0;
330 lsn_t oldest_lsn = 0;
331
332 /* When we traverse all the flush lists we don't want another
333 thread to add a dirty page to any flush list. */
334 log_flush_order_mutex_enter();
335
336 for (i = 0; i < srv_buf_pool_instances; i++) {
337 buf_pool_t* buf_pool;
338
339 buf_pool = buf_pool_from_array(i);
340
341 buf_flush_list_mutex_enter(buf_pool);
342
343 bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
344
345 if (bpage != NULL) {
346 ut_ad(bpage->in_flush_list);
347 lsn = bpage->oldest_modification;
348 }
349
350 buf_flush_list_mutex_exit(buf_pool);
351
352 if (!oldest_lsn || oldest_lsn > lsn) {
353 oldest_lsn = lsn;
354 }
355 }
356
357 log_flush_order_mutex_exit();
358
359 /* The returned answer may be out of date: the flush_list can
360 change after the mutex has been released. */
361
362 return(oldest_lsn);
363 }
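
/* Usage sketch: a caller deciding whether all modifications up to some
target LSN have reached the data files might do the following
(target_lsn is illustrative only):

	lsn_t	oldest = buf_pool_get_oldest_modification();

	if (oldest == 0 || oldest > target_lsn) {
		... every modification up to target_lsn is on disk ...
	}
*/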
364
365 /********************************************************************//**
366 Get total buffer pool statistics. */
367 UNIV_INTERN
368 void
369 buf_get_total_list_len(
370 /*===================*/
371 ulint* LRU_len, /*!< out: length of all LRU lists */
372 ulint* free_len, /*!< out: length of all free lists */
373 ulint* flush_list_len) /*!< out: length of all flush lists */
374 {
375 ulint i;
376
377 *LRU_len = 0;
378 *free_len = 0;
379 *flush_list_len = 0;
380
381 for (i = 0; i < srv_buf_pool_instances; i++) {
382 buf_pool_t* buf_pool;
383
384 buf_pool = buf_pool_from_array(i);
385
386 *LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
387 *free_len += UT_LIST_GET_LEN(buf_pool->free);
388 *flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
389 }
390 }
391
392 /********************************************************************//**
393 Get total list size in bytes from all buffer pools. */
394 UNIV_INTERN
395 void
396 buf_get_total_list_size_in_bytes(
397 /*=============================*/
398 buf_pools_list_size_t* buf_pools_list_size) /*!< out: list sizes
399 in all buffer pools */
400 {
401 ut_ad(buf_pools_list_size);
402 memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));
403
404 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
405 buf_pool_t* buf_pool;
406
407 buf_pool = buf_pool_from_array(i);
408 /* We don't need mutex protection since this is
409 for statistics purposes */
410 buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes;
411 buf_pools_list_size->unzip_LRU_bytes +=
412 UT_LIST_GET_LEN(buf_pool->unzip_LRU) * UNIV_PAGE_SIZE;
413 buf_pools_list_size->flush_list_bytes +=
414 buf_pool->stat.flush_list_bytes;
415 }
416 }
417
418 /********************************************************************//**
419 Get total buffer pool statistics. */
420 UNIV_INTERN
421 void
422 buf_get_total_stat(
423 /*===============*/
424 buf_pool_stat_t* tot_stat) /*!< out: buffer pool stats */
425 {
426 ulint i;
427
428 memset(tot_stat, 0, sizeof(*tot_stat));
429
430 for (i = 0; i < srv_buf_pool_instances; i++) {
431 buf_pool_stat_t*buf_stat;
432 buf_pool_t* buf_pool;
433
434 buf_pool = buf_pool_from_array(i);
435
436 buf_stat = &buf_pool->stat;
437 tot_stat->n_page_gets += buf_stat->n_page_gets;
438 tot_stat->n_pages_read += buf_stat->n_pages_read;
439 tot_stat->n_pages_written += buf_stat->n_pages_written;
440 tot_stat->n_pages_created += buf_stat->n_pages_created;
441 tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd;
442 tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
443 tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
444 tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;
445
446 tot_stat->n_pages_not_made_young +=
447 buf_stat->n_pages_not_made_young;
448 }
449 }
450
451 /********************************************************************//**
452 Allocates a buffer block.
453 @return own: the allocated block, in state BUF_BLOCK_MEMORY */
454 UNIV_INTERN
455 buf_block_t*
456 buf_block_alloc(
457 /*============*/
458 buf_pool_t* buf_pool) /*!< in/out: buffer pool instance,
459 or NULL for round-robin selection
460 of the buffer pool */
461 {
462 buf_block_t* block;
463 ulint index;
464 static ulint buf_pool_index;
465
466 if (buf_pool == NULL) {
467 /* We are allocating memory from any buffer pool, ensure
468 we spread the grace on all buffer pool instances. */
469 index = buf_pool_index++ % srv_buf_pool_instances;
470 buf_pool = buf_pool_from_array(index);
471 }
472
473 block = buf_LRU_get_free_block(buf_pool);
474
475 buf_block_set_state(block, BUF_BLOCK_MEMORY);
476
477 return(block);
478 }
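
/* Usage sketch: the returned block is owned by the caller and should be
returned with buf_block_free() (declared in buf0buf.h) once it is no
longer needed:

	buf_block_t*	block = buf_block_alloc(NULL);

	... use block->frame as scratch memory ...

	buf_block_free(block);
*/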
479 #endif /* !UNIV_HOTBACKUP */
480
481 /********************************************************************//**
482 Checks if a page is all zeroes.
483 @return TRUE if the page is all zeroes */
484 bool
485 buf_page_is_zeroes(
486 /*===============*/
487 const byte* read_buf, /*!< in: a database page */
488 const ulint zip_size) /*!< in: size of compressed page;
489 0 for uncompressed pages */
490 {
491 const ulint page_size = zip_size ? zip_size : UNIV_PAGE_SIZE;
492
493 for (ulint i = 0; i < page_size; i++) {
494 if (read_buf[i] != 0) {
495 return(false);
496 }
497 }
498 return(true);
499 }
500
501 /** Checks if the page is in crc32 checksum format.
502 @param[in] read_buf database page
503 @param[in] checksum_field1 new checksum field
504 @param[in] checksum_field2 old checksum field
505 @return true if the page is in crc32 checksum format */
506 UNIV_INLINE
507 bool
508 buf_page_is_checksum_valid_crc32(
509 const byte* read_buf,
510 ulint checksum_field1,
511 ulint checksum_field2)
512 {
513 ib_uint32_t crc32 = buf_calc_page_crc32(read_buf);
514
515 return(checksum_field1 == crc32 && checksum_field2 == crc32);
516 }
517
518 /** Checks if the page is in innodb checksum format.
519 @param[in] read_buf database page
520 @param[in] checksum_field1 new checksum field
521 @param[in] checksum_field2 old checksum field
522 @return true if the page is in innodb checksum format */
523 UNIV_INLINE
524 bool
525 buf_page_is_checksum_valid_innodb(
526 const byte* read_buf,
527 ulint checksum_field1,
528 ulint checksum_field2)
529 {
530 /* There are 2 valid formulas for
531 checksum_field2 (old checksum field) which algo=innodb could have
532 written to the page:
533
534 1. Very old versions of InnoDB only stored 8 byte lsn to the
535 start and the end of the page.
536
537 2. Newer InnoDB versions store the old formula checksum
538 (buf_calc_page_old_checksum()). */
539
540 if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
541 && checksum_field2 != buf_calc_page_old_checksum(read_buf)) {
542 return(false);
543 }
544
545 /* old field is fine, check the new field */
546
547 /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
548 (always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
549
550 if (checksum_field1 != 0
551 && checksum_field1 != buf_calc_page_new_checksum(read_buf)) {
552 return(false);
553 }
554
555 return(true);
556 }
557
558 /** Checks if the page is in none checksum format.
559 @param[in] read_buf database page
560 @param[in] checksum_field1 new checksum field
561 @param[in] checksum_field2 old checksum field
562 @return true if the page is in none checksum format */
563 UNIV_INLINE
564 bool
565 buf_page_is_checksum_valid_none(
566 const byte* read_buf,
567 ulint checksum_field1,
568 ulint checksum_field2)
569 {
570 return(checksum_field1 == checksum_field2
571 && checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
572 }
573
574 /********************************************************************//**
575 Checks if a page is corrupt.
576 @return TRUE if corrupted */
577 UNIV_INTERN
578 ibool
579 buf_page_is_corrupted(
580 /*==================*/
581 bool check_lsn, /*!< in: true if we need to check
582 and complain about the LSN */
583 const byte* read_buf, /*!< in: a database page */
584 ulint zip_size) /*!< in: size of compressed page;
585 0 for uncompressed pages */
586 {
587 ulint checksum_field1;
588 ulint checksum_field2;
589
590 if (!zip_size
591 && memcmp(read_buf + FIL_PAGE_LSN + 4,
592 read_buf + UNIV_PAGE_SIZE
593 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
594
595 /* Stored log sequence numbers at the start and the end
596 of page do not match */
597
598 return(TRUE);
599 }
600
601 #ifndef UNIV_HOTBACKUP
602 if (check_lsn && recv_lsn_checks_on) {
603 lsn_t current_lsn;
604
605 /* Since we are going to reset the page LSN during the import
606 phase it makes no sense to spam the log with error messages. */
607
608 if (log_peek_lsn(&current_lsn)
609 && current_lsn
610 < mach_read_from_8(read_buf + FIL_PAGE_LSN)) {
611 ut_print_timestamp(stderr);
612
613 fprintf(stderr,
614 " InnoDB: Error: page %lu log sequence number"
615 " " LSN_PF "\n"
616 "InnoDB: is in the future! Current system "
617 "log sequence number " LSN_PF ".\n"
618 "InnoDB: Your database may be corrupt or "
619 "you may have copied the InnoDB\n"
620 "InnoDB: tablespace but not the InnoDB "
621 "log files. See\n"
622 "InnoDB: " REFMAN
623 "forcing-innodb-recovery.html\n"
624 "InnoDB: for more information.\n",
625 (ulong) mach_read_from_4(
626 read_buf + FIL_PAGE_OFFSET),
627 (lsn_t) mach_read_from_8(
628 read_buf + FIL_PAGE_LSN),
629 current_lsn);
630 }
631 }
632 #endif
633
634 /* Check whether the checksum fields have correct values */
635
636 if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) {
637 return(FALSE);
638 }
639
640 if (zip_size) {
641 return(!page_zip_verify_checksum(read_buf, zip_size));
642 }
643
644 checksum_field1 = mach_read_from_4(
645 read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
646
647 checksum_field2 = mach_read_from_4(
648 read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM);
649
650 #if FIL_PAGE_LSN % 8
651 #error "FIL_PAGE_LSN must be 64 bit aligned"
652 #endif
653
654 /* declare empty pages non-corrupted */
655 if (checksum_field1 == 0 && checksum_field2 == 0
656 && *reinterpret_cast<const ib_uint64_t*>(read_buf +
657 FIL_PAGE_LSN) == 0) {
658 /* make sure that the page is really empty */
659 for (ulint i = 0; i < UNIV_PAGE_SIZE; i++) {
660 if (read_buf[i] != 0) {
661 return(TRUE);
662 }
663 }
664
665 return(FALSE);
666 }
667
668 DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); );
669
670 ulint page_no = mach_read_from_4(read_buf + FIL_PAGE_OFFSET);
671 ulint space_id = mach_read_from_4(read_buf + FIL_PAGE_SPACE_ID);
672 const srv_checksum_algorithm_t curr_algo =
673 static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
674
675 switch (curr_algo) {
676 case SRV_CHECKSUM_ALGORITHM_CRC32:
677 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
678
679 if (buf_page_is_checksum_valid_crc32(read_buf,
680 checksum_field1, checksum_field2)) {
681 return(FALSE);
682 }
683
684 if (buf_page_is_checksum_valid_none(read_buf,
685 checksum_field1, checksum_field2)) {
686 if (curr_algo
687 == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
688 page_warn_strict_checksum(
689 curr_algo,
690 SRV_CHECKSUM_ALGORITHM_NONE,
691 space_id, page_no);
692 }
693
694 return(FALSE);
695 }
696
697 if (buf_page_is_checksum_valid_innodb(read_buf,
698 checksum_field1, checksum_field2)) {
699 if (curr_algo
700 == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
701 page_warn_strict_checksum(
702 curr_algo,
703 SRV_CHECKSUM_ALGORITHM_INNODB,
704 space_id, page_no);
705 }
706
707 return(FALSE);
708 }
709
710 return(TRUE);
711
712 case SRV_CHECKSUM_ALGORITHM_INNODB:
713 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
714
715 if (buf_page_is_checksum_valid_innodb(read_buf,
716 checksum_field1, checksum_field2)) {
717 return(FALSE);
718 }
719
720 if (buf_page_is_checksum_valid_none(read_buf,
721 checksum_field1, checksum_field2)) {
722 if (curr_algo
723 == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
724 page_warn_strict_checksum(
725 curr_algo,
726 SRV_CHECKSUM_ALGORITHM_NONE,
727 space_id, page_no);
728 }
729
730 return(FALSE);
731 }
732
733 if (buf_page_is_checksum_valid_crc32(read_buf,
734 checksum_field1, checksum_field2)) {
735 if (curr_algo
736 == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
737 page_warn_strict_checksum(
738 curr_algo,
739 SRV_CHECKSUM_ALGORITHM_CRC32,
740 space_id, page_no);
741 }
742
743 return(FALSE);
744 }
745
746 return(TRUE);
747
748 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
749
750 if (buf_page_is_checksum_valid_none(read_buf,
751 checksum_field1, checksum_field2)) {
752 return(FALSE);
753 }
754
755 if (buf_page_is_checksum_valid_crc32(read_buf,
756 checksum_field1, checksum_field2)) {
757 page_warn_strict_checksum(
758 curr_algo,
759 SRV_CHECKSUM_ALGORITHM_CRC32,
760 space_id, page_no);
761 return(FALSE);
762 }
763
764 if (buf_page_is_checksum_valid_innodb(read_buf,
765 checksum_field1, checksum_field2)) {
766 page_warn_strict_checksum(
767 curr_algo,
768 SRV_CHECKSUM_ALGORITHM_INNODB,
769 space_id, page_no);
770 return(FALSE);
771 }
772
773 return(TRUE);
774
775 case SRV_CHECKSUM_ALGORITHM_NONE:
776 /* should have returned FALSE earlier */
777 break;
778 /* no default so the compiler will emit a warning if new enum
779 is added and not handled here */
780 }
781
782 ut_error;
783 return(FALSE);
784 }
785
786 /********************************************************************//**
787 Prints a page to stderr. */
788 UNIV_INTERN
789 void
790 buf_page_print(
791 /*===========*/
792 const byte* read_buf, /*!< in: a database page */
793 ulint zip_size, /*!< in: compressed page size, or
794 0 for uncompressed pages */
795 ulint flags) /*!< in: 0 or
796 BUF_PAGE_PRINT_NO_CRASH or
797 BUF_PAGE_PRINT_NO_FULL */
798
799 {
800 #ifndef UNIV_HOTBACKUP
801 dict_index_t* index;
802 #endif /* !UNIV_HOTBACKUP */
803 ulint size = zip_size;
804
805 if (!size) {
806 size = UNIV_PAGE_SIZE;
807 }
808
809 if (!(flags & BUF_PAGE_PRINT_NO_FULL)) {
810 ut_print_timestamp(stderr);
811 fprintf(stderr,
812 " InnoDB: Page dump in ascii and hex (%lu bytes):\n",
813 (ulong) size);
814 ut_print_buf(stderr, read_buf, size);
815 fputs("\nInnoDB: End of page dump\n", stderr);
816 }
817
818 if (zip_size) {
819 /* Print compressed page. */
820 ut_print_timestamp(stderr);
821 fprintf(stderr,
822 " InnoDB: Compressed page type (" ULINTPF "); "
823 "stored checksum in field1 " ULINTPF "; "
824 "calculated checksums for field1: "
825 "%s " ULINTPF ", "
826 "%s " ULINTPF ", "
827 "%s " ULINTPF "; "
828 "page LSN " LSN_PF "; "
829 "page number (if stored to page already) " ULINTPF "; "
830 "space id (if stored to page already) " ULINTPF "\n",
831 fil_page_get_type(read_buf),
832 mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
833 buf_checksum_algorithm_name(
834 SRV_CHECKSUM_ALGORITHM_CRC32),
835 page_zip_calc_checksum(read_buf, zip_size,
836 SRV_CHECKSUM_ALGORITHM_CRC32),
837 buf_checksum_algorithm_name(
838 SRV_CHECKSUM_ALGORITHM_INNODB),
839 page_zip_calc_checksum(read_buf, zip_size,
840 SRV_CHECKSUM_ALGORITHM_INNODB),
841 buf_checksum_algorithm_name(
842 SRV_CHECKSUM_ALGORITHM_NONE),
843 page_zip_calc_checksum(read_buf, zip_size,
844 SRV_CHECKSUM_ALGORITHM_NONE),
845 mach_read_from_8(read_buf + FIL_PAGE_LSN),
846 mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
847 mach_read_from_4(read_buf
848 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
849 } else {
850 ut_print_timestamp(stderr);
851 fprintf(stderr, " InnoDB: uncompressed page, "
852 "stored checksum in field1 " ULINTPF ", "
853 "calculated checksums for field1: "
854 "%s " UINT32PF ", "
855 "%s " ULINTPF ", "
856 "%s " ULINTPF ", "
857
858 "stored checksum in field2 " ULINTPF ", "
859 "calculated checksums for field2: "
860 "%s " UINT32PF ", "
861 "%s " ULINTPF ", "
862 "%s " ULINTPF ", "
863
864 "page LSN " ULINTPF " " ULINTPF ", "
865 "low 4 bytes of LSN at page end " ULINTPF ", "
866 "page number (if stored to page already) " ULINTPF ", "
867 "space id (if created with >= MySQL-4.1.1 "
868 "and stored already) %lu\n",
869 mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
870 buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
871 buf_calc_page_crc32(read_buf),
872 buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
873 buf_calc_page_new_checksum(read_buf),
874 buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
875 BUF_NO_CHECKSUM_MAGIC,
876
877 mach_read_from_4(read_buf + UNIV_PAGE_SIZE
878 - FIL_PAGE_END_LSN_OLD_CHKSUM),
879 buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
880 buf_calc_page_crc32(read_buf),
881 buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
882 buf_calc_page_old_checksum(read_buf),
883 buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
884 BUF_NO_CHECKSUM_MAGIC,
885
886 mach_read_from_4(read_buf + FIL_PAGE_LSN),
887 mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
888 mach_read_from_4(read_buf + UNIV_PAGE_SIZE
889 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
890 mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
891 mach_read_from_4(read_buf
892 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
893 }
894
895 #ifndef UNIV_HOTBACKUP
896 if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
897 == TRX_UNDO_INSERT) {
898 fprintf(stderr,
899 "InnoDB: Page may be an insert undo log page\n");
900 } else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR
901 + TRX_UNDO_PAGE_TYPE)
902 == TRX_UNDO_UPDATE) {
903 fprintf(stderr,
904 "InnoDB: Page may be an update undo log page\n");
905 }
906 #endif /* !UNIV_HOTBACKUP */
907
908 switch (fil_page_get_type(read_buf)) {
909 index_id_t index_id;
910 case FIL_PAGE_INDEX:
911 index_id = btr_page_get_index_id(read_buf);
912 fprintf(stderr,
913 "InnoDB: Page may be an index page where"
914 " index id is %llu\n",
915 (ullint) index_id);
916 #ifndef UNIV_HOTBACKUP
917 index = dict_index_find_on_id_low(index_id);
918 if (index) {
919 fputs("InnoDB: (", stderr);
920 dict_index_name_print(stderr, NULL, index);
921 fputs(")\n", stderr);
922 }
923 #endif /* !UNIV_HOTBACKUP */
924 break;
925 case FIL_PAGE_INODE:
926 fputs("InnoDB: Page may be an 'inode' page\n", stderr);
927 break;
928 case FIL_PAGE_IBUF_FREE_LIST:
929 fputs("InnoDB: Page may be an insert buffer free list page\n",
930 stderr);
931 break;
932 case FIL_PAGE_TYPE_ALLOCATED:
933 fputs("InnoDB: Page may be a freshly allocated page\n",
934 stderr);
935 break;
936 case FIL_PAGE_IBUF_BITMAP:
937 fputs("InnoDB: Page may be an insert buffer bitmap page\n",
938 stderr);
939 break;
940 case FIL_PAGE_TYPE_SYS:
941 fputs("InnoDB: Page may be a system page\n",
942 stderr);
943 break;
944 case FIL_PAGE_TYPE_TRX_SYS:
945 fputs("InnoDB: Page may be a transaction system page\n",
946 stderr);
947 break;
948 case FIL_PAGE_TYPE_FSP_HDR:
949 fputs("InnoDB: Page may be a file space header page\n",
950 stderr);
951 break;
952 case FIL_PAGE_TYPE_XDES:
953 fputs("InnoDB: Page may be an extent descriptor page\n",
954 stderr);
955 break;
956 case FIL_PAGE_TYPE_BLOB:
957 fputs("InnoDB: Page may be a BLOB page\n",
958 stderr);
959 break;
960 case FIL_PAGE_TYPE_ZBLOB:
961 case FIL_PAGE_TYPE_ZBLOB2:
962 fputs("InnoDB: Page may be a compressed BLOB page\n",
963 stderr);
964 break;
965 }
966
967 ut_ad(flags & BUF_PAGE_PRINT_NO_CRASH);
968 }
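
/* Usage sketch: callers that want the diagnostic dump without crashing
afterwards pass BUF_PAGE_PRINT_NO_CRASH, e.g.:

	buf_page_print(frame, zip_size, BUF_PAGE_PRINT_NO_CRASH);
*/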
969
970 #ifndef UNIV_HOTBACKUP
971
972 # ifdef PFS_GROUP_BUFFER_SYNC
973 /********************************************************************//**
974 This function registers mutexes and rwlocks in buffer blocks with
975 performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is
976 defined to be a value less than chunk->size, then only mutexes
977 and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER
978 blocks are registered. */
979 static
980 void
981 pfs_register_buffer_block(
982 /*======================*/
983 buf_chunk_t* chunk) /*!< in/out: chunk of buffers */
984 {
985 ulint i;
986 ulint num_to_register;
987 buf_block_t* block;
988
989 block = chunk->blocks;
990
991 num_to_register = ut_min(chunk->size,
992 PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);
993
994 for (i = 0; i < num_to_register; i++) {
995 ib_mutex_t* mutex;
996 rw_lock_t* rwlock;
997
998 # ifdef UNIV_PFS_MUTEX
999 mutex = &block->mutex;
1000 ut_a(!mutex->pfs_psi);
1001 mutex->pfs_psi = (PSI_server)
1002 ? PSI_server->init_mutex(buffer_block_mutex_key, mutex)
1003 : NULL;
1004 # endif /* UNIV_PFS_MUTEX */
1005
1006 # ifdef UNIV_PFS_RWLOCK
1007 rwlock = &block->lock;
1008 ut_a(!rwlock->pfs_psi);
1009 rwlock->pfs_psi = (PSI_server)
1010 ? PSI_server->init_rwlock(buf_block_lock_key, rwlock)
1011 : NULL;
1012
1013 # ifdef UNIV_SYNC_DEBUG
1014 rwlock = &block->debug_latch;
1015 ut_a(!rwlock->pfs_psi);
1016 rwlock->pfs_psi = (PSI_server)
1017 ? PSI_server->init_rwlock(buf_block_debug_latch_key,
1018 rwlock)
1019 : NULL;
1020 # endif /* UNIV_SYNC_DEBUG */
1021
1022 # endif /* UNIV_PFS_RWLOCK */
1023 block++;
1024 }
1025 }
1026 # endif /* PFS_GROUP_BUFFER_SYNC */
1027
1028 /********************************************************************//**
1029 Initializes a buffer control block when the buf_pool is created. */
1030 static
1031 void
1032 buf_block_init(
1033 /*===========*/
1034 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1035 buf_block_t* block, /*!< in: pointer to control block */
1036 byte* frame) /*!< in: pointer to buffer frame */
1037 {
1038 UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
1039
1040 block->frame = frame;
1041
1042 block->page.buf_pool_index = buf_pool_index(buf_pool);
1043 block->page.state = BUF_BLOCK_NOT_USED;
1044 block->page.buf_fix_count = 0;
1045 block->page.io_fix = BUF_IO_NONE;
1046
1047 block->modify_clock = 0;
1048
1049 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
1050 block->page.file_page_was_freed = FALSE;
1051 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
1052
1053 block->check_index_page_at_flush = FALSE;
1054 block->index = NULL;
1055
1056 #ifdef UNIV_DEBUG
1057 block->page.in_page_hash = FALSE;
1058 block->page.in_zip_hash = FALSE;
1059 block->page.in_flush_list = FALSE;
1060 block->page.in_free_list = FALSE;
1061 block->page.in_LRU_list = FALSE;
1062 block->in_unzip_LRU_list = FALSE;
1063 #endif /* UNIV_DEBUG */
1064 #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
1065 block->n_pointers = 0;
1066 #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
1067 page_zip_des_init(&block->page.zip);
1068
1069 #if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
1070 /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
1071 of buffer block mutex/rwlock with performance schema. If
1072 PFS_GROUP_BUFFER_SYNC is defined, skip the registration
1073 since buffer block mutex/rwlock will be registered later in
1074 pfs_register_buffer_block() */
1075
1076 mutex_create(PFS_NOT_INSTRUMENTED, &block->mutex, SYNC_BUF_BLOCK);
1077 rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
1078
1079 # ifdef UNIV_SYNC_DEBUG
1080 rw_lock_create(PFS_NOT_INSTRUMENTED,
1081 &block->debug_latch, SYNC_NO_ORDER_CHECK);
1082 # endif /* UNIV_SYNC_DEBUG */
1083
1084 #else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
1085 mutex_create(buffer_block_mutex_key, &block->mutex, SYNC_BUF_BLOCK);
1086 rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);
1087
1088 # ifdef UNIV_SYNC_DEBUG
1089 rw_lock_create(buf_block_debug_latch_key,
1090 &block->debug_latch, SYNC_NO_ORDER_CHECK);
1091 # endif /* UNIV_SYNC_DEBUG */
1092 #endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
1093
1094 ut_ad(rw_lock_validate(&(block->lock)));
1095 }
1096
1097 /********************************************************************//**
1098 Allocates a chunk of buffer frames.
1099 @return chunk, or NULL on failure */
1100 static
1101 buf_chunk_t*
1102 buf_chunk_init(
1103 /*===========*/
1104 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1105 buf_chunk_t* chunk, /*!< out: chunk of buffers */
1106 ulint mem_size) /*!< in: requested size in bytes */
1107 {
1108 buf_block_t* block;
1109 byte* frame;
1110 ulint i;
1111
1112 /* Round down to a multiple of page size,
1113 although it already should be. */
1114 mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
1115 /* Reserve space for the block descriptors. */
1116 mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
1117 + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
1118
1119 chunk->mem_size = mem_size;
1120 chunk->mem = os_mem_alloc_large(&chunk->mem_size);
1121
1122 if (UNIV_UNLIKELY(chunk->mem == NULL)) {
1123
1124 return(NULL);
1125 }
1126
1127 #ifdef HAVE_LIBNUMA
1128 if (srv_numa_interleave) {
1129 int st = mbind(chunk->mem, chunk->mem_size,
1130 MPOL_INTERLEAVE,
1131 numa_all_nodes_ptr->maskp,
1132 numa_all_nodes_ptr->size,
1133 MPOL_MF_MOVE);
1134 if (st != 0) {
1135 ib_logf(IB_LOG_LEVEL_WARN,
1136 "Failed to set NUMA memory policy of buffer"
1137 " pool page frames to MPOL_INTERLEAVE"
1138 " (error: %s).", strerror(errno));
1139 }
1140 }
1141 #endif // HAVE_LIBNUMA
1142
1143 /* Allocate the block descriptors from
1144 the start of the memory block. */
1145 chunk->blocks = (buf_block_t*) chunk->mem;
1146
1147 /* Align a pointer to the first frame. Note that when
1148 os_large_page_size is smaller than UNIV_PAGE_SIZE,
1149 we may allocate one fewer block than requested. When
1150 it is bigger, we may allocate more blocks than requested. */
1151
1152 frame = (byte*) ut_align(chunk->mem, UNIV_PAGE_SIZE);
1153 chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
1154 - (frame != chunk->mem);
1155
1156 /* Subtract the space needed for block descriptors. */
1157 {
1158 ulint size = chunk->size;
1159
1160 while (frame < (byte*) (chunk->blocks + size)) {
1161 frame += UNIV_PAGE_SIZE;
1162 size--;
1163 }
1164
1165 chunk->size = size;
1166 }
1167
1168 /* Init block structs and assign frames for them. Then we
1169 assign the frames to the first blocks (we already mapped the
1170 memory above). */
1171
1172 block = chunk->blocks;
1173
1174 for (i = chunk->size; i--; ) {
1175
1176 buf_block_init(buf_pool, block, frame);
1177 UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
1178
1179 /* Add the block to the free list */
1180 UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));
1181
1182 ut_d(block->page.in_free_list = TRUE);
1183 ut_ad(buf_pool_from_block(block) == buf_pool);
1184
1185 block++;
1186 frame += UNIV_PAGE_SIZE;
1187 }
1188
1189 #ifdef PFS_GROUP_BUFFER_SYNC
1190 pfs_register_buffer_block(chunk);
1191 #endif
1192 return(chunk);
1193 }
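
/* Illustration of the chunk layout set up above (not to scale):

	chunk->mem
	v
	[ buf_block_t descriptors ][ pad ][ frame ][ frame ] ... [ frame ]
	                                  ^
	                                  first UNIV_PAGE_SIZE-aligned
	                                  address at or past the end of
	                                  the descriptor array

   Frames that would overlap the descriptor array are given up by the
   "Subtract the space needed for block descriptors" loop, so chunk->size
   ends up slightly smaller than chunk->mem_size / UNIV_PAGE_SIZE. */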
1194
1195 #ifdef UNIV_DEBUG
1196 /*********************************************************************//**
1197 Finds a block in the given buffer chunk that points to a
1198 given compressed page.
1199 @return buffer block pointing to the compressed page, or NULL */
1200 static
1201 buf_block_t*
1202 buf_chunk_contains_zip(
1203 /*===================*/
1204 buf_chunk_t* chunk, /*!< in: chunk being checked */
1205 const void* data) /*!< in: pointer to compressed page */
1206 {
1207 buf_block_t* block;
1208 ulint i;
1209
1210 block = chunk->blocks;
1211
1212 for (i = chunk->size; i--; block++) {
1213 if (block->page.zip.data == data) {
1214
1215 return(block);
1216 }
1217 }
1218
1219 return(NULL);
1220 }
1221
1222 /*********************************************************************//**
1223 Finds a block in the buffer pool that points to a
1224 given compressed page.
1225 @return buffer block pointing to the compressed page, or NULL */
1226 UNIV_INTERN
1227 buf_block_t*
1228 buf_pool_contains_zip(
1229 /*==================*/
1230 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1231 const void* data) /*!< in: pointer to compressed page */
1232 {
1233 ulint n;
1234 buf_chunk_t* chunk = buf_pool->chunks;
1235
1236 ut_ad(buf_pool);
1237 ut_ad(buf_pool_mutex_own(buf_pool));
1238 for (n = buf_pool->n_chunks; n--; chunk++) {
1239
1240 buf_block_t* block = buf_chunk_contains_zip(chunk, data);
1241
1242 if (block) {
1243 return(block);
1244 }
1245 }
1246
1247 return(NULL);
1248 }
1249 #endif /* UNIV_DEBUG */
1250
1251 /*********************************************************************//**
1252 Checks that all file pages in the buffer chunk are in a replaceable state.
1253 @return address of a non-free block, or NULL if all freed */
1254 static
1255 const buf_block_t*
1256 buf_chunk_not_freed(
1257 /*================*/
1258 buf_chunk_t* chunk) /*!< in: chunk being checked */
1259 {
1260 buf_block_t* block;
1261 ulint i;
1262
1263 block = chunk->blocks;
1264
1265 for (i = chunk->size; i--; block++) {
1266 ibool ready;
1267
1268 switch (buf_block_get_state(block)) {
1269 case BUF_BLOCK_POOL_WATCH:
1270 case BUF_BLOCK_ZIP_PAGE:
1271 case BUF_BLOCK_ZIP_DIRTY:
1272 /* The uncompressed buffer pool should never
1273 contain compressed block descriptors. */
1274 ut_error;
1275 break;
1276 case BUF_BLOCK_NOT_USED:
1277 case BUF_BLOCK_READY_FOR_USE:
1278 case BUF_BLOCK_MEMORY:
1279 case BUF_BLOCK_REMOVE_HASH:
1280 /* Skip blocks that are not being used for
1281 file pages. */
1282 break;
1283 case BUF_BLOCK_FILE_PAGE:
1284 mutex_enter(&block->mutex);
1285 ready = buf_flush_ready_for_replace(&block->page);
1286 mutex_exit(&block->mutex);
1287
1288 if (!ready) {
1289
1290 return(block);
1291 }
1292
1293 break;
1294 }
1295 }
1296
1297 return(NULL);
1298 }
1299
1300 /********************************************************************//**
1301 Set buffer pool size variables after resizing it */
1302 static
1303 void
1304 buf_pool_set_sizes(void)
1305 /*====================*/
1306 {
1307 ulint i;
1308 ulint curr_size = 0;
1309
1310 buf_pool_mutex_enter_all();
1311
1312 for (i = 0; i < srv_buf_pool_instances; i++) {
1313 buf_pool_t* buf_pool;
1314
1315 buf_pool = buf_pool_from_array(i);
1316 curr_size += buf_pool->curr_pool_size;
1317 }
1318
1319 srv_buf_pool_curr_size = curr_size;
1320 srv_buf_pool_old_size = srv_buf_pool_size;
1321
1322 buf_pool_mutex_exit_all();
1323 }
1324
1325 /********************************************************************//**
1326 Initialize a buffer pool instance.
1327 @return DB_SUCCESS if all goes well. */
1328 UNIV_INTERN
1329 ulint
1330 buf_pool_init_instance(
1331 /*===================*/
1332 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1333 ulint buf_pool_size, /*!< in: size in bytes */
1334 ulint instance_no) /*!< in: id of the instance */
1335 {
1336 ulint i;
1337 buf_chunk_t* chunk;
1338
1339 /* 1. Initialize general fields
1340 ------------------------------- */
1341 mutex_create(buf_pool_mutex_key,
1342 &buf_pool->mutex, SYNC_BUF_POOL);
1343 mutex_create(buf_pool_zip_mutex_key,
1344 &buf_pool->zip_mutex, SYNC_BUF_BLOCK);
1345
1346 buf_pool_mutex_enter(buf_pool);
1347
1348 if (buf_pool_size > 0) {
1349 buf_pool->n_chunks = 1;
1350
1351 buf_pool->chunks = chunk =
1352 (buf_chunk_t*) mem_zalloc(sizeof *chunk);
1353
1354 UT_LIST_INIT(buf_pool->free);
1355
1356 if (!buf_chunk_init(buf_pool, chunk, buf_pool_size)) {
1357 mem_free(chunk);
1358 mem_free(buf_pool);
1359
1360 buf_pool_mutex_exit(buf_pool);
1361
1362 return(DB_ERROR);
1363 }
1364
1365 buf_pool->instance_no = instance_no;
1366 buf_pool->old_pool_size = buf_pool_size;
1367 buf_pool->curr_size = chunk->size;
1368 buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
1369
1370 /* Number of locks protecting page_hash must be a
1371 power of two */
1372 srv_n_page_hash_locks = static_cast<ulong>(
1373 ut_2_power_up(srv_n_page_hash_locks));
1374 ut_a(srv_n_page_hash_locks != 0);
1375 ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
1376
1377 buf_pool->page_hash = ib_create(2 * buf_pool->curr_size,
1378 srv_n_page_hash_locks,
1379 MEM_HEAP_FOR_PAGE_HASH,
1380 SYNC_BUF_PAGE_HASH);
1381
1382 buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
1383
1384 buf_pool->last_printout_time = ut_time();
1385 }
1386 /* 2. Initialize flushing fields
1387 -------------------------------- */
1388
1389 mutex_create(flush_list_mutex_key, &buf_pool->flush_list_mutex,
1390 SYNC_BUF_FLUSH_LIST);
1391
1392 for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
1393 buf_pool->no_flush[i] = os_event_create();
1394 }
1395
1396 buf_pool->watch = (buf_page_t*) mem_zalloc(
1397 sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
1398
1399 /* All fields are initialized by mem_zalloc(). */
1400
1401 buf_pool->try_LRU_scan = TRUE;
1402
1403 buf_pool_mutex_exit(buf_pool);
1404
1405 return(DB_SUCCESS);
1406 }
1407
1408 /********************************************************************//**
1409 free one buffer pool instance */
1410 static
1411 void
1412 buf_pool_free_instance(
1413 /*===================*/
1414 buf_pool_t* buf_pool) /* in,own: buffer pool instance
1415 to free */
1416 {
1417 buf_chunk_t* chunk;
1418 buf_chunk_t* chunks;
1419 buf_page_t* bpage;
1420
1421 bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1422 while (bpage != NULL) {
1423 buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
1424 enum buf_page_state state = buf_page_get_state(bpage);
1425
1426 ut_ad(buf_page_in_file(bpage));
1427 ut_ad(bpage->in_LRU_list);
1428
1429 if (state != BUF_BLOCK_FILE_PAGE) {
1430 /* We must not have any dirty block except
1431 when doing a fast shutdown. */
1432 ut_ad(state == BUF_BLOCK_ZIP_PAGE
1433 || srv_fast_shutdown == 2);
1434 buf_page_free_descriptor(bpage);
1435 }
1436
1437 bpage = prev_bpage;
1438 }
1439
1440 mem_free(buf_pool->watch);
1441 buf_pool->watch = NULL;
1442
1443 chunks = buf_pool->chunks;
1444 chunk = chunks + buf_pool->n_chunks;
1445
1446 while (--chunk >= chunks) {
1447 os_mem_free_large(chunk->mem, chunk->mem_size);
1448 }
1449
1450 mem_free(buf_pool->chunks);
1451 ha_clear(buf_pool->page_hash);
1452 hash_table_free(buf_pool->page_hash);
1453 hash_table_free(buf_pool->zip_hash);
1454 }
1455
1456 /********************************************************************//**
1457 Creates the buffer pool.
1458 @return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
1459 UNIV_INTERN
1460 dberr_t
1461 buf_pool_init(
1462 /*==========*/
1463 ulint total_size, /*!< in: size of the total pool in bytes */
1464 ulint n_instances) /*!< in: number of instances */
1465 {
1466 ulint i;
1467 const ulint size = total_size / n_instances;
1468
1469 ut_ad(n_instances > 0);
1470 ut_ad(n_instances <= MAX_BUFFER_POOLS);
1471 ut_ad(n_instances == srv_buf_pool_instances);
1472
1473 #ifdef HAVE_LIBNUMA
1474 if (srv_numa_interleave) {
1475 ib_logf(IB_LOG_LEVEL_INFO,
1476 "Setting NUMA memory policy to MPOL_INTERLEAVE");
1477 if (set_mempolicy(MPOL_INTERLEAVE,
1478 numa_all_nodes_ptr->maskp,
1479 numa_all_nodes_ptr->size) != 0) {
1480 ib_logf(IB_LOG_LEVEL_WARN,
1481 "Failed to set NUMA memory policy to"
1482 " MPOL_INTERLEAVE (error: %s).",
1483 strerror(errno));
1484 }
1485 }
1486 #endif // HAVE_LIBNUMA
1487
1488 buf_pool_ptr = (buf_pool_t*) mem_zalloc(
1489 n_instances * sizeof *buf_pool_ptr);
1490
1491 for (i = 0; i < n_instances; i++) {
1492 buf_pool_t* ptr = &buf_pool_ptr[i];
1493
1494 if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {
1495
1496 /* Free all the instances created so far. */
1497 buf_pool_free(i);
1498
1499 return(DB_ERROR);
1500 }
1501 }
1502
1503 buf_pool_set_sizes();
1504 buf_LRU_old_ratio_update(100 * 3/ 8, FALSE);
1505
1506 btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);
1507
1508 #ifdef HAVE_LIBNUMA
1509 if (srv_numa_interleave) {
1510 ib_logf(IB_LOG_LEVEL_INFO,
1511 "Setting NUMA memory policy to MPOL_DEFAULT");
1512 if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
1513 ib_logf(IB_LOG_LEVEL_WARN,
1514 "Failed to set NUMA memory policy to"
1515 " MPOL_DEFAULT (error: %s).", strerror(errno));
1516 }
1517 }
1518 #endif // HAVE_LIBNUMA
1519
1520 return(DB_SUCCESS);
1521 }
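
/* Usage sketch: at startup the pool is created from the configured
total size and instance count; srv_buf_pool_size and
srv_buf_pool_instances are the server variables referenced elsewhere in
this file:

	dberr_t	err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances);

	if (err != DB_SUCCESS) {
		... treat as a fatal startup error ...
	}
*/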
1522
1523 /********************************************************************//**
1524 Frees the buffer pool at shutdown. This must not be invoked before
1525 freeing all mutexes. */
1526 UNIV_INTERN
1527 void
1528 buf_pool_free(
1529 /*==========*/
1530 ulint n_instances) /*!< in: number of instances to free */
1531 {
1532 ulint i;
1533
1534 for (i = 0; i < n_instances; i++) {
1535 buf_pool_free_instance(buf_pool_from_array(i));
1536 }
1537
1538 mem_free(buf_pool_ptr);
1539 buf_pool_ptr = NULL;
1540 }
1541
1542 /********************************************************************//**
1543 Clears the adaptive hash index on all pages in the buffer pool. */
1544 UNIV_INTERN
1545 void
1546 buf_pool_clear_hash_index(void)
1547 /*===========================*/
1548 {
1549 ulint p;
1550
1551 #ifdef UNIV_SYNC_DEBUG
1552 ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
1553 #endif /* UNIV_SYNC_DEBUG */
1554 ut_ad(!btr_search_enabled);
1555
1556 for (p = 0; p < srv_buf_pool_instances; p++) {
1557 buf_pool_t* buf_pool = buf_pool_from_array(p);
1558 buf_chunk_t* chunks = buf_pool->chunks;
1559 buf_chunk_t* chunk = chunks + buf_pool->n_chunks;
1560
1561 while (--chunk >= chunks) {
1562 buf_block_t* block = chunk->blocks;
1563 ulint i = chunk->size;
1564
1565 for (; i--; block++) {
1566 dict_index_t* index = block->index;
1567
1568 /* We can set block->index = NULL
1569 when we have an x-latch on btr_search_latch;
1570 see the comment in buf0buf.h */
1571
1572 if (!index) {
1573 /* Not hashed */
1574 continue;
1575 }
1576
1577 block->index = NULL;
1578 # if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
1579 block->n_pointers = 0;
1580 # endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
1581 }
1582 }
1583 }
1584 }
1585
1586 /********************************************************************//**
1587 Relocate a buffer control block. Relocates the block on the LRU list
1588 and in buf_pool->page_hash. Does not relocate bpage->list.
1589 The caller must take care of relocating bpage->list. */
1590 UNIV_INTERN
1591 void
1592 buf_relocate(
1593 /*=========*/
1594 buf_page_t* bpage, /*!< in/out: control block being relocated;
1595 buf_page_get_state(bpage) must be
1596 BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
1597 buf_page_t* dpage) /*!< in/out: destination control block */
1598 {
1599 buf_page_t* b;
1600 ulint fold;
1601 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1602
1603 fold = buf_page_address_fold(bpage->space, bpage->offset);
1604
1605 ut_ad(buf_pool_mutex_own(buf_pool));
1606 ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
1607 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
1608 ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
1609 ut_a(bpage->buf_fix_count == 0);
1610 ut_ad(bpage->in_LRU_list);
1611 ut_ad(!bpage->in_zip_hash);
1612 ut_ad(bpage->in_page_hash);
1613 ut_ad(bpage == buf_page_hash_get_low(buf_pool,
1614 bpage->space,
1615 bpage->offset,
1616 fold));
1617
1618 ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
1619 #ifdef UNIV_DEBUG
1620 switch (buf_page_get_state(bpage)) {
1621 case BUF_BLOCK_POOL_WATCH:
1622 case BUF_BLOCK_NOT_USED:
1623 case BUF_BLOCK_READY_FOR_USE:
1624 case BUF_BLOCK_FILE_PAGE:
1625 case BUF_BLOCK_MEMORY:
1626 case BUF_BLOCK_REMOVE_HASH:
1627 ut_error;
1628 case BUF_BLOCK_ZIP_DIRTY:
1629 case BUF_BLOCK_ZIP_PAGE:
1630 break;
1631 }
1632 #endif /* UNIV_DEBUG */
1633
1634 memcpy(dpage, bpage, sizeof *dpage);
1635
1636 ut_d(bpage->in_LRU_list = FALSE);
1637 ut_d(bpage->in_page_hash = FALSE);
1638
1639 /* relocate buf_pool->LRU */
1640 b = UT_LIST_GET_PREV(LRU, bpage);
1641 UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
1642
1643 if (b) {
1644 UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, b, dpage);
1645 } else {
1646 UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, dpage);
1647 }
1648
1649 if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
1650 buf_pool->LRU_old = dpage;
1651 #ifdef UNIV_LRU_DEBUG
1652 /* buf_pool->LRU_old must be the first item in the LRU list
1653 whose "old" flag is set. */
1654 ut_a(buf_pool->LRU_old->old);
1655 ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
1656 || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
1657 ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
1658 || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
1659 } else {
1660 /* Check that the "old" flag is consistent in
1661 the block and its neighbours. */
1662 buf_page_set_old(dpage, buf_page_is_old(dpage));
1663 #endif /* UNIV_LRU_DEBUG */
1664 }
1665
1666 ut_d(UT_LIST_VALIDATE(
1667 LRU, buf_page_t, buf_pool->LRU, CheckInLRUList()));
1668
1669 /* relocate buf_pool->page_hash */
1670 HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
1671 HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
1672 }
1673
1674 /********************************************************************//**
1675 Determine if a block is a sentinel for a buffer pool watch.
1676 @return TRUE if a sentinel for a buffer pool watch, FALSE if not */
1677 UNIV_INTERN
1678 ibool
1679 buf_pool_watch_is_sentinel(
1680 /*=======================*/
1681 buf_pool_t* buf_pool, /*!< buffer pool instance */
1682 const buf_page_t* bpage) /*!< in: block */
1683 {
1684 /* We must also own the appropriate hash lock. */
1685 ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
1686 ut_ad(buf_page_in_file(bpage));
1687
1688 if (bpage < &buf_pool->watch[0]
1689 || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {
1690
1691 ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
1692 || bpage->zip.data != NULL);
1693
1694 return(FALSE);
1695 }
1696
1697 ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
1698 ut_ad(!bpage->in_zip_hash);
1699 ut_ad(bpage->in_page_hash);
1700 ut_ad(bpage->zip.data == NULL);
1701 ut_ad(bpage->buf_fix_count > 0);
1702 return(TRUE);
1703 }
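
/* Illustrative sketch (not part of the original source): the usual way
a page_hash lookup filters out watch sentinels, as done in
buf_page_get_gen() below:

	block = (buf_block_t*) buf_page_hash_get_low(
		buf_pool, space, offset, fold);

	if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
		treat the page as not present in the buffer pool
	}
*/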
1704
1705 /****************************************************************//**
1706 Add watch for the given page to be read in. Caller must have
1707 appropriate hash_lock for the bpage. This function may release the
1708 hash_lock and reacquire it.
1709 @return NULL if watch set, block if the page is in the buffer pool */
1710 UNIV_INTERN
1711 buf_page_t*
1712 buf_pool_watch_set(
1713 /*===============*/
1714 ulint space, /*!< in: space id */
1715 ulint offset, /*!< in: page number */
1716 ulint fold) /*!< in: buf_page_address_fold(space, offset) */
1717 {
1718 buf_page_t* bpage;
1719 ulint i;
1720 buf_pool_t* buf_pool = buf_pool_get(space, offset);
1721 rw_lock_t* hash_lock;
1722
1723 hash_lock = buf_page_hash_lock_get(buf_pool, fold);
1724
1725 #ifdef UNIV_SYNC_DEBUG
1726 ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
1727 #endif /* UNIV_SYNC_DEBUG */
1728
1729 bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
1730
1731 if (bpage != NULL) {
1732 page_found:
1733 if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
1734 /* The page was loaded meanwhile. */
1735 return(bpage);
1736 }
1737
1738 /* Add to an existing watch. */
1739 #ifdef PAGE_ATOMIC_REF_COUNT
1740 os_atomic_increment_uint32(&bpage->buf_fix_count, 1);
1741 #else
1742 ++bpage->buf_fix_count;
1743 #endif /* PAGE_ATOMIC_REF_COUNT */
1744 return(NULL);
1745 }
1746
1747 /* From this point this function becomes fairly heavy in terms
1748 of latching. We acquire the buf_pool mutex as well as all the
1749 hash_locks. buf_pool mutex is needed because any changes to
1750 the page_hash must be covered by it and hash_locks are needed
1751 because we don't want to read any stale information in
1752 buf_pool->watch[]. However, it is not in the critical code path
1753 as this function will be called only by the purge thread. */
1754
1755
1756 /* To obey latching order first release the hash_lock. */
1757 rw_lock_x_unlock(hash_lock);
1758
1759 buf_pool_mutex_enter(buf_pool);
1760 hash_lock_x_all(buf_pool->page_hash);
1761
1762 /* We have to recheck that the page
1763 was not loaded or a watch set by some other
1764 purge thread. This is because of the small
1765 time window between releasing the hash_lock
1766 and acquiring the buf_pool mutex above. */
1767
1768 bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
1769 if (UNIV_LIKELY_NULL(bpage)) {
1770 buf_pool_mutex_exit(buf_pool);
1771 hash_unlock_x_all_but(buf_pool->page_hash, hash_lock);
1772 goto page_found;
1773 }
1774
1775 /* The maximum number of purge threads should never exceed
1776 BUF_POOL_WATCH_SIZE. So there is no way for a purge thread
1777 instance to hold a watch when setting another watch. */
1778 for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
1779 bpage = &buf_pool->watch[i];
1780
1781 ut_ad(bpage->access_time == 0);
1782 ut_ad(bpage->newest_modification == 0);
1783 ut_ad(bpage->oldest_modification == 0);
1784 ut_ad(bpage->zip.data == NULL);
1785 ut_ad(!bpage->in_zip_hash);
1786
1787 switch (bpage->state) {
1788 case BUF_BLOCK_POOL_WATCH:
1789 ut_ad(!bpage->in_page_hash);
1790 ut_ad(bpage->buf_fix_count == 0);
1791
1792 /* bpage is pointing to buf_pool->watch[],
1793 which is protected by buf_pool->mutex.
1794 Normally, buf_page_t objects are protected by
1795 buf_block_t::mutex or buf_pool->zip_mutex or both. */
1796
1797 bpage->state = BUF_BLOCK_ZIP_PAGE;
1798 bpage->space = static_cast<ib_uint32_t>(space);
1799 bpage->offset = static_cast<ib_uint32_t>(offset);
1800 bpage->buf_fix_count = 1;
1801
1802 ut_d(bpage->in_page_hash = TRUE);
1803 HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
1804 fold, bpage);
1805
1806 buf_pool_mutex_exit(buf_pool);
1807 /* Once the sentinel is in the page_hash we can
1808 safely release all locks except just the
1809 relevant hash_lock */
1810 hash_unlock_x_all_but(buf_pool->page_hash,
1811 hash_lock);
1812
1813 return(NULL);
1814 case BUF_BLOCK_ZIP_PAGE:
1815 ut_ad(bpage->in_page_hash);
1816 ut_ad(bpage->buf_fix_count > 0);
1817 break;
1818 default:
1819 ut_error;
1820 }
1821 }
1822
1823 /* Allocation failed. Either the maximum number of purge
1824 threads should never exceed BUF_POOL_WATCH_SIZE, or this code
1825 should be modified to return a special non-NULL value and the
1826 caller should purge the record directly. */
1827 ut_error;
1828
1829 /* Fix compiler warning */
1830 return(NULL);
1831 }
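
/* Illustrative sketch (not part of the original source): how the three
watch functions compose. The real caller is buf_page_get_gen() with
mode BUF_GET_IF_IN_POOL_OR_WATCH; the variable names below are
illustrative only:

	fold = buf_page_address_fold(space, offset);
	hash_lock = buf_page_hash_lock_get(buf_pool, fold);

	rw_lock_x_lock(hash_lock);
	bpage = buf_pool_watch_set(space, offset, fold);
	rw_lock_x_unlock(hash_lock);

	if (bpage == NULL) {
		the watch is armed; later, check whether the page
		was read in while we were not looking:

		if (!buf_pool_watch_occurred(space, offset)) {
			safe to act as if the page is not cached
		}

		buf_pool_watch_unset(space, offset);
	}
*/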
1832
1833 /****************************************************************//**
1834 Remove the sentinel block for the watch before replacing it with a real block.
1835 buf_pool_watch_unset() or buf_pool_watch_occurred() will notice that
1836 the block has been replaced with the real block. The sentinel's
1837 buf_fix_count is carried over to the replacement block by the caller. */
1838 static
1839 void
1840 buf_pool_watch_remove(
1841 /*==================*/
1842 buf_pool_t* buf_pool, /*!< buffer pool instance */
1843 ulint fold, /*!< in: buf_page_address_fold(
1844 space, offset) */
1845 buf_page_t* watch) /*!< in/out: sentinel for watch */
1846 {
1847 #ifdef UNIV_SYNC_DEBUG
1848 /* We must also own the appropriate hash_bucket mutex. */
1849 rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
1850 ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
1851 #endif /* UNIV_SYNC_DEBUG */
1852
1853 ut_ad(buf_pool_mutex_own(buf_pool));
1854
1855 HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch);
1856 ut_d(watch->in_page_hash = FALSE);
1857 watch->buf_fix_count = 0;
1858 watch->state = BUF_BLOCK_POOL_WATCH;
1859 }
1860
1861 /****************************************************************//**
1862 Stop watching if the page has been read in.
1863 buf_pool_watch_set(space,offset) must have returned NULL before. */
1864 UNIV_INTERN
1865 void
1866 buf_pool_watch_unset(
1867 /*=================*/
1868 ulint space, /*!< in: space id */
1869 ulint offset) /*!< in: page number */
1870 {
1871 buf_page_t* bpage;
1872 buf_pool_t* buf_pool = buf_pool_get(space, offset);
1873 ulint fold = buf_page_address_fold(space, offset);
1874 rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
1875
1876 /* We only need to have buf_pool mutex in case where we end
1877 up calling buf_pool_watch_remove but to obey latching order
1878 we acquire it here before acquiring hash_lock. This should
1879 not cause too much grief as this function is only ever
1880 called from the purge thread. */
1881 buf_pool_mutex_enter(buf_pool);
1882
1883 rw_lock_x_lock(hash_lock);
1884
1885 /* The page must exist because buf_pool_watch_set() increments
1886 buf_fix_count. */
1887
1888 bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
1889
1890 if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
1891 buf_block_unfix(reinterpret_cast<buf_block_t*>(bpage));
1892 } else {
1893
1894 ut_ad(bpage->buf_fix_count > 0);
1895
1896 #ifdef PAGE_ATOMIC_REF_COUNT
1897 os_atomic_decrement_uint32(&bpage->buf_fix_count, 1);
1898 #else
1899 --bpage->buf_fix_count;
1900 #endif /* PAGE_ATOMIC_REF_COUNT */
1901
1902 if (bpage->buf_fix_count == 0) {
1903 buf_pool_watch_remove(buf_pool, fold, bpage);
1904 }
1905 }
1906
1907 buf_pool_mutex_exit(buf_pool);
1908 rw_lock_x_unlock(hash_lock);
1909 }
1910
1911 /****************************************************************//**
1912 Check if the page has been read in.
1913 This may only be called after buf_pool_watch_set(space,offset)
1914 has returned NULL and before invoking buf_pool_watch_unset(space,offset).
1915 @return FALSE if the given page was not read in, TRUE if it was */
1916 UNIV_INTERN
1917 ibool
1918 buf_pool_watch_occurred(
1919 /*====================*/
1920 ulint space, /*!< in: space id */
1921 ulint offset) /*!< in: page number */
1922 {
1923 ibool ret;
1924 buf_page_t* bpage;
1925 buf_pool_t* buf_pool = buf_pool_get(space, offset);
1926 ulint fold = buf_page_address_fold(space, offset);
1927 rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool,
1928 fold);
1929
1930 rw_lock_s_lock(hash_lock);
1931
1932 /* The page must exist because buf_pool_watch_set()
1933 increments buf_fix_count. */
1934 bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
1935
1936 ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
1937 rw_lock_s_unlock(hash_lock);
1938
1939 return(ret);
1940 }
1941
1942 /********************************************************************//**
1943 Moves a page to the start of the buffer pool LRU list. This high-level
1944 function can be used to prevent an important page from slipping out of
1945 the buffer pool. */
1946 UNIV_INTERN
1947 void
1948 buf_page_make_young(
1949 /*================*/
1950 buf_page_t* bpage) /*!< in: buffer block of a file page */
1951 {
1952 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1953
1954 buf_pool_mutex_enter(buf_pool);
1955
1956 ut_a(buf_page_in_file(bpage));
1957
1958 buf_LRU_make_block_young(bpage);
1959
1960 buf_pool_mutex_exit(buf_pool);
1961 }
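
/* Illustrative sketch (not part of the original source): a caller that
wants to keep an important, buffer-fixed page from slipping towards the
tail of the LRU list can simply do

	buf_page_make_young(&block->page);

buf_pool->mutex is taken and released inside the call, so the caller
must not be holding it. */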
1962
1963 /********************************************************************//**
1964 Moves a page to the start of the buffer pool LRU list if it is too old.
1965 This high-level function can be used to prevent an important page from
1966 slipping out of the buffer pool. */
1967 static
1968 void
1969 buf_page_make_young_if_needed(
1970 /*==========================*/
1971 buf_page_t* bpage) /*!< in/out: buffer block of a
1972 file page */
1973 {
1974 #ifdef UNIV_DEBUG
1975 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1976 ut_ad(!buf_pool_mutex_own(buf_pool));
1977 #endif /* UNIV_DEBUG */
1978 ut_a(buf_page_in_file(bpage));
1979
1980 if (buf_page_peek_if_too_old(bpage)) {
1981 buf_page_make_young(bpage);
1982 }
1983 }
1984
1985 /********************************************************************//**
1986 Resets the check_index_page_at_flush field of a page if found in the buffer
1987 pool. */
1988 UNIV_INTERN
1989 void
1990 buf_reset_check_index_page_at_flush(
1991 /*================================*/
1992 ulint space, /*!< in: space id */
1993 ulint offset) /*!< in: page number */
1994 {
1995 buf_block_t* block;
1996 buf_pool_t* buf_pool = buf_pool_get(space, offset);
1997
1998 buf_pool_mutex_enter(buf_pool);
1999
2000 block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset);
2001
2002 if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
2003 ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
2004 block->check_index_page_at_flush = FALSE;
2005 }
2006
2007 buf_pool_mutex_exit(buf_pool);
2008 }
2009
2010 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
2011 /********************************************************************//**
2012 Sets file_page_was_freed TRUE if the page is found in the buffer pool.
2013 This function should be called when we free a file page and want the
2014 debug version to check that it is not accessed any more unless
2015 reallocated.
2016 @return control block if found in page hash table, otherwise NULL */
2017 UNIV_INTERN
2018 buf_page_t*
2019 buf_page_set_file_page_was_freed(
2020 /*=============================*/
2021 ulint space, /*!< in: space id */
2022 ulint offset) /*!< in: page number */
2023 {
2024 buf_page_t* bpage;
2025 buf_pool_t* buf_pool = buf_pool_get(space, offset);
2026 rw_lock_t* hash_lock;
2027
2028 bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
2029 &hash_lock);
2030
2031 if (bpage) {
2032 ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
2033 ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
2034 mutex_enter(block_mutex);
2035 rw_lock_s_unlock(hash_lock);
2036 /* bpage->file_page_was_freed can already be TRUE
2037 when this code is invoked from dict_drop_index_tree() */
2038 bpage->file_page_was_freed = TRUE;
2039 mutex_exit(block_mutex);
2040 }
2041
2042 return(bpage);
2043 }
2044
2045 /********************************************************************//**
2046 Sets file_page_was_freed FALSE if the page is found in the buffer pool.
2047 This function should be called when a previously freed file page is
2048 reallocated, so that the debug version no longer flags accesses to it
2049 as erroneous.
2050 @return control block if found in page hash table, otherwise NULL */
2051 UNIV_INTERN
2052 buf_page_t*
2053 buf_page_reset_file_page_was_freed(
2054 /*===============================*/
2055 ulint space, /*!< in: space id */
2056 ulint offset) /*!< in: page number */
2057 {
2058 buf_page_t* bpage;
2059 buf_pool_t* buf_pool = buf_pool_get(space, offset);
2060 rw_lock_t* hash_lock;
2061
2062 bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
2063 &hash_lock);
2064 if (bpage) {
2065 ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
2066 ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
2067 mutex_enter(block_mutex);
2068 rw_lock_s_unlock(hash_lock);
2069 bpage->file_page_was_freed = FALSE;
2070 mutex_exit(block_mutex);
2071 }
2072
2073 return(bpage);
2074 }
2075 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
2076
2077 /********************************************************************//**
2078 Attempts to discard the uncompressed frame of a compressed page. The
2079 caller should not be holding any mutexes when this function is called.
2080 The uncompressed frame is freed via buf_LRU_free_page() if the page is found. */
2081 static
2082 void
2083 buf_block_try_discard_uncompressed(
2084 /*===============================*/
2085 ulint space, /*!< in: space id */
2086 ulint offset) /*!< in: page number */
2087 {
2088 buf_page_t* bpage;
2089 buf_pool_t* buf_pool = buf_pool_get(space, offset);
2090
2091 /* Since we need to acquire the buf_pool mutex to discard
2092 the uncompressed frame, and the page_hash mutex resides
2093 below the buf_pool mutex in the latching order, we must
2094 first release the page_hash mutex. This means that the
2095 block in question can move out of page_hash, so
2096 we need to check again whether the block is still in page_hash. */
2097 buf_pool_mutex_enter(buf_pool);
2098
2099 bpage = buf_page_hash_get(buf_pool, space, offset);
2100
2101 if (bpage) {
2102 buf_LRU_free_page(bpage, false);
2103 }
2104
2105 buf_pool_mutex_exit(buf_pool);
2106 }
2107
2108 /********************************************************************//**
2109 Get read access to a compressed page (usually of type
2110 FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
2111 The page must be released with buf_page_release_zip().
2112 NOTE: the page is not protected by any latch. Mutual exclusion has to
2113 be implemented at a higher level. In other words, all possible
2114 accesses to a given page through this function must be protected by
2115 the same set of mutexes or latches.
2116 @return pointer to the block */
2117 UNIV_INTERN
2118 buf_page_t*
2119 buf_page_get_zip(
2120 /*=============*/
2121 ulint space, /*!< in: space id */
2122 ulint zip_size,/*!< in: compressed page size */
2123 ulint offset) /*!< in: page number */
2124 {
2125 buf_page_t* bpage;
2126 ib_mutex_t* block_mutex;
2127 rw_lock_t* hash_lock;
2128 ibool discard_attempted = FALSE;
2129 ibool must_read;
2130 buf_pool_t* buf_pool = buf_pool_get(space, offset);
2131
2132 buf_pool->stat.n_page_gets++;
2133
2134 for (;;) {
2135 lookup:
2136
2137 /* The following call will also grab the page_hash
2138 mutex if the page is found. */
2139 bpage = buf_page_hash_get_s_locked(buf_pool, space,
2140 offset, &hash_lock);
2141 if (bpage) {
2142 ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
2143 break;
2144 }
2145
2146 /* Page not in buf_pool: needs to be read from file */
2147
2148 ut_ad(!hash_lock);
2149 buf_read_page(space, zip_size, offset);
2150
2151 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2152 ut_a(++buf_dbg_counter % 5771 || buf_validate());
2153 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2154 }
2155
2156 ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));
2157
2158 if (!bpage->zip.data) {
2159 /* There is no compressed page. */
2160 err_exit:
2161 rw_lock_s_unlock(hash_lock);
2162 return(NULL);
2163 }
2164
2165 ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
2166
2167 switch (buf_page_get_state(bpage)) {
2168 case BUF_BLOCK_POOL_WATCH:
2169 case BUF_BLOCK_NOT_USED:
2170 case BUF_BLOCK_READY_FOR_USE:
2171 case BUF_BLOCK_MEMORY:
2172 case BUF_BLOCK_REMOVE_HASH:
2173 ut_error;
2174
2175 case BUF_BLOCK_ZIP_PAGE:
2176 case BUF_BLOCK_ZIP_DIRTY:
2177 block_mutex = &buf_pool->zip_mutex;
2178 mutex_enter(block_mutex);
2179 #ifdef PAGE_ATOMIC_REF_COUNT
2180 os_atomic_increment_uint32(&bpage->buf_fix_count, 1);
2181 #else
2182 ++bpage->buf_fix_count;
2183 #endif /* PAGE_ATOMIC_REF_COUNT */
2184 goto got_block;
2185 case BUF_BLOCK_FILE_PAGE:
2186 /* Discard the uncompressed page frame if possible. */
2187 if (!discard_attempted) {
2188 rw_lock_s_unlock(hash_lock);
2189 buf_block_try_discard_uncompressed(space, offset);
2190 discard_attempted = TRUE;
2191 goto lookup;
2192 }
2193
2194 block_mutex = &((buf_block_t*) bpage)->mutex;
2195
2196 mutex_enter(block_mutex);
2197
2198 buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__);
2199 goto got_block;
2200 }
2201
2202 ut_error;
2203 goto err_exit;
2204
2205 got_block:
2206 must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
2207
2208 rw_lock_s_unlock(hash_lock);
2209 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
2210 ut_a(!bpage->file_page_was_freed);
2211 #endif /* defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG */
2212
2213 buf_page_set_accessed(bpage);
2214
2215 mutex_exit(block_mutex);
2216
2217 buf_page_make_young_if_needed(bpage);
2218
2219 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2220 ut_a(++buf_dbg_counter % 5771 || buf_validate());
2221 ut_a(bpage->buf_fix_count > 0);
2222 ut_a(buf_page_in_file(bpage));
2223 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2224
2225 if (must_read) {
2226 /* Let us wait until the read operation
2227 completes */
2228
2229 for (;;) {
2230 enum buf_io_fix io_fix;
2231
2232 mutex_enter(block_mutex);
2233 io_fix = buf_page_get_io_fix(bpage);
2234 mutex_exit(block_mutex);
2235
2236 if (io_fix == BUF_IO_READ) {
2237
2238 os_thread_sleep(WAIT_FOR_READ);
2239 } else {
2240 break;
2241 }
2242 }
2243 }
2244
2245 #ifdef UNIV_IBUF_COUNT_DEBUG
2246 ut_a(ibuf_count_get(buf_page_get_space(bpage),
2247 buf_page_get_page_no(bpage)) == 0);
2248 #endif
2249 return(bpage);
2250 }
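
/* Illustrative sketch (not part of the original source): accessing a
compressed BLOB page. As stated above, the page is only buffer-fixed,
not latched, so the caller provides mutual exclusion at a higher level
and releases the page with buf_page_release_zip():

	bpage = buf_page_get_zip(space, zip_size, offset);

	if (bpage != NULL) {
		const byte*	data = bpage->zip.data;

		copy the needed bytes out of the compressed frame

		buf_page_release_zip(bpage);
	}
*/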
2251
2252 /********************************************************************//**
2253 Initialize some fields of a control block. */
2254 UNIV_INLINE
2255 void
2256 buf_block_init_low(
2257 /*===============*/
2258 buf_block_t* block) /*!< in: block to init */
2259 {
2260 block->check_index_page_at_flush = FALSE;
2261 block->index = NULL;
2262
2263 block->n_hash_helps = 0;
2264 block->n_fields = 1;
2265 block->n_bytes = 0;
2266 block->left_side = TRUE;
2267 }
2268 #endif /* !UNIV_HOTBACKUP */
2269
2270 /********************************************************************//**
2271 Decompress a block.
2272 @return TRUE if successful */
2273 UNIV_INTERN
2274 ibool
2275 buf_zip_decompress(
2276 /*===============*/
2277 buf_block_t* block, /*!< in/out: block */
2278 ibool check) /*!< in: TRUE=verify the page checksum */
2279 {
2280 const byte* frame = block->page.zip.data;
2281 ulint size = page_zip_get_size(&block->page.zip);
2282
2283 ut_ad(buf_block_get_zip_size(block));
2284 ut_a(buf_block_get_space(block) != 0);
2285
2286 if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
2287
2288 ut_print_timestamp(stderr);
2289 fprintf(stderr,
2290 " InnoDB: compressed page checksum mismatch"
2291 " (space %u page %u): stored: %lu, crc32: %lu "
2292 "innodb: %lu, none: %lu\n",
2293 block->page.space, block->page.offset,
2294 mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM),
2295 page_zip_calc_checksum(frame, size,
2296 SRV_CHECKSUM_ALGORITHM_CRC32),
2297 page_zip_calc_checksum(frame, size,
2298 SRV_CHECKSUM_ALGORITHM_INNODB),
2299 page_zip_calc_checksum(frame, size,
2300 SRV_CHECKSUM_ALGORITHM_NONE));
2301 return(FALSE);
2302 }
2303
2304 switch (fil_page_get_type(frame)) {
2305 case FIL_PAGE_INDEX:
2306 if (page_zip_decompress(&block->page.zip,
2307 block->frame, TRUE)) {
2308 return(TRUE);
2309 }
2310
2311 fprintf(stderr,
2312 "InnoDB: unable to decompress space %lu page %lu\n",
2313 (ulong) block->page.space,
2314 (ulong) block->page.offset);
2315 return(FALSE);
2316
2317 case FIL_PAGE_TYPE_ALLOCATED:
2318 case FIL_PAGE_INODE:
2319 case FIL_PAGE_IBUF_BITMAP:
2320 case FIL_PAGE_TYPE_FSP_HDR:
2321 case FIL_PAGE_TYPE_XDES:
2322 case FIL_PAGE_TYPE_ZBLOB:
2323 case FIL_PAGE_TYPE_ZBLOB2:
2324 /* Copy to uncompressed storage. */
2325 memcpy(block->frame, frame,
2326 buf_block_get_zip_size(block));
2327 return(TRUE);
2328 }
2329
2330 ut_print_timestamp(stderr);
2331 fprintf(stderr,
2332 " InnoDB: unknown compressed page"
2333 " type %lu\n",
2334 fil_page_get_type(frame));
2335 return(FALSE);
2336 }
2337
2338 #ifndef UNIV_HOTBACKUP
2339 /*******************************************************************//**
2340 Gets the block whose frame the pointer is pointing to, if it is found
2341 in this buffer pool instance.
2342 @return pointer to block */
2343 UNIV_INTERN
2344 buf_block_t*
2345 buf_block_align_instance(
2346 /*=====================*/
2347 buf_pool_t* buf_pool, /*!< in: buffer in which the block
2348 resides */
2349 const byte* ptr) /*!< in: pointer to a frame */
2350 {
2351 buf_chunk_t* chunk;
2352 ulint i;
2353
2354 /* TODO: protect buf_pool->chunks with a mutex (it will
2355 currently remain constant after buf_pool_init()) */
2356 for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) {
2357 ulint offs;
2358
2359 if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) {
2360
2361 continue;
2362 }
2363 /* else */
2364
2365 offs = ptr - chunk->blocks->frame;
2366
2367 offs >>= UNIV_PAGE_SIZE_SHIFT;
2368
2369 if (UNIV_LIKELY(offs < chunk->size)) {
2370 buf_block_t* block = &chunk->blocks[offs];
2371
2372 /* The function buf_chunk_init() invokes
2373 buf_block_init() so that block[n].frame ==
2374 block->frame + n * UNIV_PAGE_SIZE. Check it. */
2375 ut_ad(block->frame == page_align(ptr));
2376 #ifdef UNIV_DEBUG
2377 /* A thread that updates these fields must
2378 hold buf_pool->mutex and block->mutex. Acquire
2379 only the latter. */
2380 mutex_enter(&block->mutex);
2381
2382 switch (buf_block_get_state(block)) {
2383 case BUF_BLOCK_POOL_WATCH:
2384 case BUF_BLOCK_ZIP_PAGE:
2385 case BUF_BLOCK_ZIP_DIRTY:
2386 /* These types should only be used in
2387 the compressed buffer pool, whose
2388 memory is allocated from
2389 buf_pool->chunks, in UNIV_PAGE_SIZE
2390 blocks flagged as BUF_BLOCK_MEMORY. */
2391 ut_error;
2392 break;
2393 case BUF_BLOCK_NOT_USED:
2394 case BUF_BLOCK_READY_FOR_USE:
2395 case BUF_BLOCK_MEMORY:
2396 /* Some data structures contain
2397 "guess" pointers to file pages. The
2398 file pages may have been freed and
2399 reused. Do not complain. */
2400 break;
2401 case BUF_BLOCK_REMOVE_HASH:
2402 /* buf_LRU_block_remove_hashed_page()
2403 will overwrite the FIL_PAGE_OFFSET and
2404 FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with
2405 0xff and set the state to
2406 BUF_BLOCK_REMOVE_HASH. */
2407 ut_ad(page_get_space_id(page_align(ptr))
2408 == 0xffffffff);
2409 ut_ad(page_get_page_no(page_align(ptr))
2410 == 0xffffffff);
2411 break;
2412 case BUF_BLOCK_FILE_PAGE:
2413 ut_ad(block->page.space
2414 == page_get_space_id(page_align(ptr)));
2415 ut_ad(block->page.offset
2416 == page_get_page_no(page_align(ptr)));
2417 break;
2418 }
2419
2420 mutex_exit(&block->mutex);
2421 #endif /* UNIV_DEBUG */
2422
2423 return(block);
2424 }
2425 }
2426
2427 return(NULL);
2428 }
2429
2430 /*******************************************************************//**
2431 Gets the block whose frame the pointer is pointing to.
2432 @return pointer to block, never NULL */
2433 UNIV_INTERN
2434 buf_block_t*
2435 buf_block_align(
2436 /*============*/
2437 const byte* ptr) /*!< in: pointer to a frame */
2438 {
2439 ulint i;
2440
2441 for (i = 0; i < srv_buf_pool_instances; i++) {
2442 buf_block_t* block;
2443
2444 block = buf_block_align_instance(
2445 buf_pool_from_array(i), ptr);
2446 if (block) {
2447 return(block);
2448 }
2449 }
2450
2451 /* The block should always be found. */
2452 ut_error;
2453 return(NULL);
2454 }
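
/* Illustrative sketch (not part of the original source): mapping a
pointer that is known to point into some buffer frame (for example a
record pointer "rec") back to its control block:

	buf_block_t*	block = buf_block_align(rec);

	ut_ad(block->frame == page_align(rec));

The function never returns NULL; it asserts if the pointer does not
belong to any buffer pool chunk. */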
2455
2456 /********************************************************************//**
2457 Find out if a pointer belongs to a buf_block_t. It can be a pointer to
2458 the buf_block_t itself or a member of it. This function checks one of
2459 the buffer pool instances.
2460 @return TRUE if ptr belongs to a buf_block_t struct */
2461 static
2462 ibool
2463 buf_pointer_is_block_field_instance(
2464 /*================================*/
2465 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
2466 const void* ptr) /*!< in: pointer not dereferenced */
2467 {
2468 const buf_chunk_t* chunk = buf_pool->chunks;
2469 const buf_chunk_t* const echunk = chunk + buf_pool->n_chunks;
2470
2471 /* TODO: protect buf_pool->chunks with a mutex (it will
2472 currently remain constant after buf_pool_init()) */
2473 while (chunk < echunk) {
2474 if (ptr >= (void*) chunk->blocks
2475 && ptr < (void*) (chunk->blocks + chunk->size)) {
2476
2477 return(TRUE);
2478 }
2479
2480 chunk++;
2481 }
2482
2483 return(FALSE);
2484 }
2485
2486 /********************************************************************//**
2487 Find out if a pointer belongs to a buf_block_t. It can be a pointer to
2488 the buf_block_t itself or a member of it
2489 @return TRUE if ptr belongs to a buf_block_t struct */
2490 UNIV_INTERN
2491 ibool
2492 buf_pointer_is_block_field(
2493 /*=======================*/
2494 const void* ptr) /*!< in: pointer not dereferenced */
2495 {
2496 ulint i;
2497
2498 for (i = 0; i < srv_buf_pool_instances; i++) {
2499 ibool found;
2500
2501 found = buf_pointer_is_block_field_instance(
2502 buf_pool_from_array(i), ptr);
2503 if (found) {
2504 return(TRUE);
2505 }
2506 }
2507
2508 return(FALSE);
2509 }
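
/* Illustrative sketch (not part of the original source): this predicate
is intended for debug-style checks of the form

	ut_ad(!buf_pointer_is_block_field(ptr));

i.e. asserting that an arbitrary pointer does not point into the
buf_block_t arrays of any buffer pool instance. */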
2510
2511 /********************************************************************//**
2512 Find out if a buffer block was created by buf_chunk_init().
2513 @return TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */
2514 static
2515 ibool
2516 buf_block_is_uncompressed(
2517 /*======================*/
2518 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
2519 const buf_block_t* block) /*!< in: pointer to block,
2520 not dereferenced */
2521 {
2522 if ((((ulint) block) % sizeof *block) != 0) {
2523 /* The pointer should be aligned. */
2524 return(FALSE);
2525 }
2526
2527 return(buf_pointer_is_block_field_instance(buf_pool, (void*) block));
2528 }
2529
2530 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
2531 /********************************************************************//**
2532 Return true if probe is enabled.
2533 @return true if probe enabled. */
2534 static
2535 bool
2536 buf_debug_execute_is_force_flush()
2537 /*==============================*/
2538 {
2539 DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );
2540
2541 /* This is used during quiesce testing; we want to ensure maximum
2542 buffering by the change buffer. */
2543
2544 if (srv_ibuf_disable_background_merge) {
2545 return(true);
2546 }
2547
2548 return(false);
2549 }
2550 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
2551
2552 /**
2553 Wait for the block to be read in.
2554 @param block The block to check */
2555 static
2556 void
2557 buf_wait_for_read(buf_block_t* block)
2558 {
2559 /* Note: For the PAGE_ATOMIC_REF_COUNT case:
2560
2561 We are using the block->lock to check for IO state (and a dirty read).
2562 We set the IO_READ state under the protection of the hash_lock
2563 (and block->mutex). This is safe because another thread can only
2564 access the block (and check for IO state) after the block has been
2565 added to the page hashtable. */
2566
2567 if (buf_block_get_io_fix(block) == BUF_IO_READ) {
2568
2569 /* Wait until the read operation completes */
2570
2571 ib_mutex_t* mutex = buf_page_get_mutex(&block->page);
2572
2573 for (;;) {
2574 buf_io_fix io_fix;
2575
2576 mutex_enter(mutex);
2577
2578 io_fix = buf_block_get_io_fix(block);
2579
2580 mutex_exit(mutex);
2581
2582 if (io_fix == BUF_IO_READ) {
2583 /* Wait by temporarily taking an s-latch */
2584 rw_lock_s_lock(&block->lock);
2585 rw_lock_s_unlock(&block->lock);
2586 } else {
2587 break;
2588 }
2589 }
2590 }
2591 }
2592
2593 /********************************************************************//**
2594 This is the general function used to get access to a database page.
2595 @return pointer to the block or NULL */
2596 UNIV_INTERN
2597 buf_block_t*
2598 buf_page_get_gen(
2599 /*=============*/
2600 ulint space, /*!< in: space id */
2601 ulint zip_size,/*!< in: compressed page size in bytes
2602 or 0 for uncompressed pages */
2603 ulint offset, /*!< in: page number */
2604 ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
2605 buf_block_t* guess, /*!< in: guessed block or NULL */
2606 ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
2607 BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or
2608 BUF_GET_IF_IN_POOL_OR_WATCH */
2609 const char* file, /*!< in: file name */
2610 ulint line, /*!< in: line where called */
2611 mtr_t* mtr) /*!< in: mini-transaction */
2612 {
2613 buf_block_t* block;
2614 ulint fold;
2615 unsigned access_time;
2616 ulint fix_type;
2617 rw_lock_t* hash_lock;
2618 ulint retries = 0;
2619 buf_block_t* fix_block;
2620 ib_mutex_t* fix_mutex = NULL;
2621 buf_pool_t* buf_pool = buf_pool_get(space, offset);
2622
2623 ut_ad(mtr);
2624 ut_ad(mtr->state == MTR_ACTIVE);
2625 ut_ad((rw_latch == RW_S_LATCH)
2626 || (rw_latch == RW_X_LATCH)
2627 || (rw_latch == RW_NO_LATCH));
2628 #ifdef UNIV_DEBUG
2629 switch (mode) {
2630 case BUF_GET_NO_LATCH:
2631 ut_ad(rw_latch == RW_NO_LATCH);
2632 break;
2633 case BUF_GET:
2634 case BUF_GET_IF_IN_POOL:
2635 case BUF_PEEK_IF_IN_POOL:
2636 case BUF_GET_IF_IN_POOL_OR_WATCH:
2637 case BUF_GET_POSSIBLY_FREED:
2638 break;
2639 default:
2640 ut_error;
2641 }
2642 #endif /* UNIV_DEBUG */
2643 ut_ad(zip_size == fil_space_get_zip_size(space));
2644 ut_ad(ut_is_2pow(zip_size));
2645 #ifndef UNIV_LOG_DEBUG
2646 ut_ad(!ibuf_inside(mtr)
2647 || ibuf_page_low(space, zip_size, offset,
2648 FALSE, file, line, NULL));
2649 #endif
2650 buf_pool->stat.n_page_gets++;
2651 fold = buf_page_address_fold(space, offset);
2652 hash_lock = buf_page_hash_lock_get(buf_pool, fold);
2653 loop:
2654 block = guess;
2655
2656 rw_lock_s_lock(hash_lock);
2657
2658 if (block != NULL) {
2659
2660 /* If the guess is a compressed page descriptor that
2661 has been allocated by buf_page_alloc_descriptor(),
2662 it may have been freed by buf_relocate(). */
2663
2664 if (!buf_block_is_uncompressed(buf_pool, block)
2665 || offset != block->page.offset
2666 || space != block->page.space
2667 || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
2668
2669 /* Our guess was bogus or things have changed
2670 since. */
2671 block = guess = NULL;
2672 } else {
2673 ut_ad(!block->page.in_zip_hash);
2674 }
2675 }
2676
2677 if (block == NULL) {
2678 block = (buf_block_t*) buf_page_hash_get_low(
2679 buf_pool, space, offset, fold);
2680 }
2681
2682 if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
2683 rw_lock_s_unlock(hash_lock);
2684 block = NULL;
2685 }
2686
2687 if (block == NULL) {
2688 /* Page not in buf_pool: needs to be read from file */
2689
2690 if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
2691 rw_lock_x_lock(hash_lock);
2692 block = (buf_block_t*) buf_pool_watch_set(
2693 space, offset, fold);
2694
2695 if (UNIV_LIKELY_NULL(block)) {
2696 /* We can release hash_lock after we
2697 increment the fix count to make
2698 sure that no state change takes place. */
2699 fix_block = block;
2700 buf_block_fix(fix_block);
2701
2702 /* Now safe to release page_hash mutex */
2703 rw_lock_x_unlock(hash_lock);
2704 goto got_block;
2705 }
2706
2707 rw_lock_x_unlock(hash_lock);
2708 }
2709
2710 if (mode == BUF_GET_IF_IN_POOL
2711 || mode == BUF_PEEK_IF_IN_POOL
2712 || mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
2713 #ifdef UNIV_SYNC_DEBUG
2714 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
2715 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
2716 #endif /* UNIV_SYNC_DEBUG */
2717 return(NULL);
2718 }
2719
2720 if (buf_read_page(space, zip_size, offset)) {
2721 buf_read_ahead_random(space, zip_size, offset,
2722 ibuf_inside(mtr));
2723
2724 retries = 0;
2725 } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
2726 ++retries;
2727 DBUG_EXECUTE_IF(
2728 "innodb_page_corruption_retries",
2729 retries = BUF_PAGE_READ_MAX_RETRIES;
2730 );
2731 } else {
2732 fprintf(stderr, "InnoDB: Error: Unable"
2733 " to read tablespace %lu page no"
2734 " %lu into the buffer pool after"
2735 " %lu attempts\n"
2736 "InnoDB: The most probable cause"
2737 " of this error may be that the"
2738 " table has been corrupted.\n"
2739 "InnoDB: You can try to fix this"
2740 " problem by using"
2741 " innodb_force_recovery.\n"
2742 "InnoDB: Please see reference manual"
2743 " for more details.\n"
2744 "InnoDB: Aborting...\n",
2745 space, offset,
2746 BUF_PAGE_READ_MAX_RETRIES);
2747
2748 ut_error;
2749 }
2750
2751 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2752 ut_a(++buf_dbg_counter % 5771 || buf_validate());
2753 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2754 goto loop;
2755 } else {
2756 fix_block = block;
2757 }
2758
2759 buf_block_fix(fix_block);
2760
2761 /* Now safe to release page_hash mutex */
2762 rw_lock_s_unlock(hash_lock);
2763
2764 got_block:
2765
2766 fix_mutex = buf_page_get_mutex(&fix_block->page);
2767
2768 ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
2769
2770 if (mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL) {
2771
2772 bool must_read;
2773
2774 {
2775 buf_page_t* fix_page = &fix_block->page;
2776
2777 mutex_enter(fix_mutex);
2778
2779 buf_io_fix io_fix = buf_page_get_io_fix(fix_page);
2780
2781 must_read = (io_fix == BUF_IO_READ);
2782
2783 mutex_exit(fix_mutex);
2784 }
2785
2786 if (must_read) {
2787 /* The page is being read to buffer pool,
2788 but we cannot wait around for the read to
2789 complete. */
2790 buf_block_unfix(fix_block);
2791
2792 return(NULL);
2793 }
2794 }
2795
2796 switch(buf_block_get_state(fix_block)) {
2797 buf_page_t* bpage;
2798
2799 case BUF_BLOCK_FILE_PAGE:
2800 break;
2801
2802 case BUF_BLOCK_ZIP_PAGE:
2803 case BUF_BLOCK_ZIP_DIRTY:
2804 if (mode == BUF_PEEK_IF_IN_POOL) {
2805 /* This mode is only used for dropping an
2806 adaptive hash index. There cannot be an
2807 adaptive hash index for a compressed-only
2808 page, so do not bother decompressing the page. */
2809 buf_block_unfix(fix_block);
2810
2811 return(NULL);
2812 }
2813
2814 bpage = &block->page;
2815
2816 /* Note: We have already buffer fixed this block. */
2817 if (bpage->buf_fix_count > 1
2818 || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
2819
2820 /* This condition often occurs when the buffer
2821 is not buffer-fixed, but I/O-fixed by
2822 buf_page_init_for_read(). */
2823 buf_block_unfix(fix_block);
2824
2825 /* The block is buffer-fixed or I/O-fixed.
2826 Try again later. */
2827 os_thread_sleep(WAIT_FOR_READ);
2828
2829 goto loop;
2830 }
2831
2832 /* Buffer-fix the block so that it cannot be evicted
2833 or relocated while we are attempting to allocate an
2834 uncompressed page. */
2835
2836 block = buf_LRU_get_free_block(buf_pool);
2837
2838 buf_pool_mutex_enter(buf_pool);
2839
2840 rw_lock_x_lock(hash_lock);
2841
2842 /* Buffer-fixing prevents the page_hash from changing. */
2843 ut_ad(bpage == buf_page_hash_get_low(
2844 buf_pool, space, offset, fold));
2845
2846 buf_block_mutex_enter(block);
2847
2848 mutex_enter(&buf_pool->zip_mutex);
2849
2850 ut_ad(fix_block->page.buf_fix_count > 0);
2851
2852 #ifdef PAGE_ATOMIC_REF_COUNT
2853 os_atomic_decrement_uint32(&fix_block->page.buf_fix_count, 1);
2854 #else
2855 --fix_block->page.buf_fix_count;
2856 #endif /* PAGE_ATOMIC_REF_COUNT */
2857
2858 fix_block = block;
2859
2860 if (bpage->buf_fix_count > 0
2861 || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
2862
2863 mutex_exit(&buf_pool->zip_mutex);
2864 /* The block was buffer-fixed or I/O-fixed while
2865 buf_pool->mutex was not held by this thread.
2866 Free the block that was allocated and retry.
2867 This should be extremely unlikely, for example,
2868 if buf_page_get_zip() was invoked. */
2869
2870 buf_LRU_block_free_non_file_page(block);
2871 buf_pool_mutex_exit(buf_pool);
2872 rw_lock_x_unlock(hash_lock);
2873 buf_block_mutex_exit(block);
2874
2875 /* Try again */
2876 goto loop;
2877 }
2878
2879 /* Move the compressed page from bpage to block,
2880 and uncompress it. */
2881
2882 /* Note: this is the uncompressed block and it is not
2883 accessible by other threads yet because it is not in
2884 any list or hash table */
2885 buf_relocate(bpage, &block->page);
2886
2887 buf_block_init_low(block);
2888
2889 /* Set after relocate(). */
2890 block->page.buf_fix_count = 1;
2891
2892 block->lock_hash_val = lock_rec_hash(space, offset);
2893
2894 UNIV_MEM_DESC(&block->page.zip.data,
2895 page_zip_get_size(&block->page.zip));
2896
2897 if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) {
2898 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2899 UT_LIST_REMOVE(list, buf_pool->zip_clean,
2900 &block->page);
2901 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2902 ut_ad(!block->page.in_flush_list);
2903 } else {
2904 /* Relocate buf_pool->flush_list. */
2905 buf_flush_relocate_on_flush_list(bpage, &block->page);
2906 }
2907
2908 /* Buffer-fix, I/O-fix, and X-latch the block
2909 for the duration of the decompression.
2910 Also add the block to the unzip_LRU list. */
2911 block->page.state = BUF_BLOCK_FILE_PAGE;
2912
2913 /* Insert at the front of unzip_LRU list */
2914 buf_unzip_LRU_add_block(block, FALSE);
2915
2916 buf_block_set_io_fix(block, BUF_IO_READ);
2917 rw_lock_x_lock_inline(&block->lock, 0, file, line);
2918
2919 UNIV_MEM_INVALID(bpage, sizeof *bpage);
2920
2921 rw_lock_x_unlock(hash_lock);
2922
2923 ++buf_pool->n_pend_unzip;
2924
2925 mutex_exit(&buf_pool->zip_mutex);
2926 buf_pool_mutex_exit(buf_pool);
2927
2928 access_time = buf_page_is_accessed(&block->page);
2929
2930 buf_block_mutex_exit(block);
2931
2932 buf_page_free_descriptor(bpage);
2933
2934 /* Decompress the page while not holding
2935 buf_pool->mutex or block->mutex. */
2936
2937 /* Page checksum verification is already done when
2938 the page is read from disk. Hence page checksum
2939 verification is not necessary when decompressing the page. */
2940 {
2941 bool success = buf_zip_decompress(block, FALSE);
2942 ut_a(success);
2943 }
2944
2945 if (!recv_no_ibuf_operations) {
2946 if (access_time) {
2947 #ifdef UNIV_IBUF_COUNT_DEBUG
2948 ut_a(ibuf_count_get(space, offset) == 0);
2949 #endif /* UNIV_IBUF_COUNT_DEBUG */
2950 } else {
2951 ibuf_merge_or_delete_for_page(
2952 block, space, offset, zip_size, TRUE);
2953 }
2954 }
2955
2956 buf_pool_mutex_enter(buf_pool);
2957
2958 /* Unfix and unlatch the block. */
2959 buf_block_mutex_enter(fix_block);
2960
2961 buf_block_set_io_fix(fix_block, BUF_IO_NONE);
2962
2963 buf_block_mutex_exit(fix_block);
2964
2965 --buf_pool->n_pend_unzip;
2966
2967 buf_pool_mutex_exit(buf_pool);
2968
2969 rw_lock_x_unlock(&block->lock);
2970
2971 break;
2972
2973 case BUF_BLOCK_POOL_WATCH:
2974 case BUF_BLOCK_NOT_USED:
2975 case BUF_BLOCK_READY_FOR_USE:
2976 case BUF_BLOCK_MEMORY:
2977 case BUF_BLOCK_REMOVE_HASH:
2978 ut_error;
2979 break;
2980 }
2981
2982 ut_ad(block == fix_block);
2983 ut_ad(fix_block->page.buf_fix_count > 0);
2984
2985 #ifdef UNIV_SYNC_DEBUG
2986 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
2987 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
2988 #endif /* UNIV_SYNC_DEBUG */
2989
2990 ut_ad(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
2991
2992 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
2993
2994 if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
2995 && (ibuf_debug || buf_debug_execute_is_force_flush())) {
2996
2997 /* Try to evict the block from the buffer pool, to use the
2998 insert buffer (change buffer) as much as possible. */
2999
3000 buf_pool_mutex_enter(buf_pool);
3001
3002 buf_block_unfix(fix_block);
3003
3004 /* Now we are only holding the buf_pool->mutex,
3005 not block->mutex or hash_lock. Blocks cannot be
3006 relocated or enter or exit the buf_pool while we
3007 are holding the buf_pool->mutex. */
3008
3009 if (buf_LRU_free_page(&fix_block->page, true)) {
3010 buf_pool_mutex_exit(buf_pool);
3011 rw_lock_x_lock(hash_lock);
3012
3013 if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
3014 /* Set the watch, as it would have
3015 been set if the page were not in the
3016 buffer pool in the first place. */
3017 block = (buf_block_t*) buf_pool_watch_set(
3018 space, offset, fold);
3019 } else {
3020 block = (buf_block_t*) buf_page_hash_get_low(
3021 buf_pool, space, offset, fold);
3022 }
3023
3024 rw_lock_x_unlock(hash_lock);
3025
3026 if (block != NULL) {
3027 /* Either the page has been read in or
3028 a watch was set on it in the window
3029 between releasing the buf_pool::mutex
3030 and acquiring the hash_lock
3031 above. Try again. */
3032 guess = block;
3033 goto loop;
3034 }
3035
3036 fprintf(stderr,
3037 "innodb_change_buffering_debug evict %u %u\n",
3038 (unsigned) space, (unsigned) offset);
3039 return(NULL);
3040 }
3041
3042 mutex_enter(&fix_block->mutex);
3043
3044 if (buf_flush_page_try(buf_pool, fix_block)) {
3045 fprintf(stderr,
3046 "innodb_change_buffering_debug flush %u %u\n",
3047 (unsigned) space, (unsigned) offset);
3048 guess = fix_block;
3049 goto loop;
3050 }
3051
3052 buf_block_mutex_exit(fix_block);
3053
3054 buf_block_fix(fix_block);
3055
3056 /* Failed to evict the page; change it directly */
3057
3058 buf_pool_mutex_exit(buf_pool);
3059 }
3060 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
3061
3062 ut_ad(fix_block->page.buf_fix_count > 0);
3063
3064 #ifdef UNIV_SYNC_DEBUG
3065 /* We have already buffer fixed the page, and we are committed to
3066 returning this page to the caller. Register for debugging. */
3067 {
3068 ibool ret;
3069 ret = rw_lock_s_lock_nowait(&fix_block->debug_latch, file, line);
3070 ut_a(ret);
3071 }
3072 #endif /* UNIV_SYNC_DEBUG */
3073
3074 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3075 ut_a(mode == BUF_GET_POSSIBLY_FREED
3076 || !fix_block->page.file_page_was_freed);
3077 #endif
3078 /* Check if this is the first access to the page */
3079 access_time = buf_page_is_accessed(&fix_block->page);
3080
3081 /* This is a heuristic and we don't care about ordering issues. */
3082 if (access_time == 0) {
3083 buf_block_mutex_enter(fix_block);
3084
3085 buf_page_set_accessed(&fix_block->page);
3086
3087 buf_block_mutex_exit(fix_block);
3088 }
3089
3090 if (mode != BUF_PEEK_IF_IN_POOL) {
3091 buf_page_make_young_if_needed(&fix_block->page);
3092 }
3093
3094 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3095 ut_a(++buf_dbg_counter % 5771 || buf_validate());
3096 ut_a(fix_block->page.buf_fix_count > 0);
3097 ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
3098 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3099
3100 #ifdef PAGE_ATOMIC_REF_COUNT
3101 /* We have to wait here because the IO_READ state was set
3102 under the protection of the hash_lock and the block->mutex
3103 but not the block->lock. */
3104 buf_wait_for_read(fix_block);
3105 #endif /* PAGE_ATOMIC_REF_COUNT */
3106
3107 switch (rw_latch) {
3108 case RW_NO_LATCH:
3109
3110 #ifndef PAGE_ATOMIC_REF_COUNT
3111 buf_wait_for_read(fix_block);
3112 #endif /* !PAGE_ATOMIC_REF_COUNT */
3113
3114 fix_type = MTR_MEMO_BUF_FIX;
3115 break;
3116
3117 case RW_S_LATCH:
3118 rw_lock_s_lock_inline(&fix_block->lock, 0, file, line);
3119
3120 fix_type = MTR_MEMO_PAGE_S_FIX;
3121 break;
3122
3123 default:
3124 ut_ad(rw_latch == RW_X_LATCH);
3125 rw_lock_x_lock_inline(&fix_block->lock, 0, file, line);
3126
3127 fix_type = MTR_MEMO_PAGE_X_FIX;
3128 break;
3129 }
3130
3131 mtr_memo_push(mtr, fix_block, fix_type);
3132
3133 if (mode != BUF_PEEK_IF_IN_POOL && !access_time) {
3134 /* In the case of a first access, try to apply linear
3135 read-ahead */
3136
3137 buf_read_ahead_linear(
3138 space, zip_size, offset, ibuf_inside(mtr));
3139 }
3140
3141 #ifdef UNIV_IBUF_COUNT_DEBUG
3142 ut_a(ibuf_count_get(buf_block_get_space(fix_block),
3143 buf_block_get_page_no(fix_block)) == 0);
3144 #endif
3145 #ifdef UNIV_SYNC_DEBUG
3146 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
3147 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
3148 #endif /* UNIV_SYNC_DEBUG */
3149 return(fix_block);
3150 }
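
/* Illustrative sketch (not part of the original source): the common way
to fetch and latch a page inside a mini-transaction. The tablespace is
assumed uncompressed (zip_size == 0); a buf_page_get() convenience
macro wrapping this call with __FILE__ and __LINE__ is assumed to exist
in buf0buf.h:

	mtr_t		mtr;
	buf_block_t*	block;

	mtr_start(&mtr);

	block = buf_page_get_gen(space, 0, offset, RW_S_LATCH,
				 NULL, BUF_GET, __FILE__, __LINE__, &mtr);

	read from buf_block_get_frame(block) while the S-latch is held

	mtr_commit(&mtr);
	(commit releases the latch and the buffer fix)
*/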
3151
3152 /********************************************************************//**
3153 This is the general function used to get optimistic access to a database
3154 page.
3155 @return TRUE if success */
3156 UNIV_INTERN
3157 ibool
3158 buf_page_optimistic_get(
3159 /*====================*/
3160 ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
3161 buf_block_t* block, /*!< in: guessed buffer block */
3162 ib_uint64_t modify_clock,/*!< in: modify clock value */
3163 const char* file, /*!< in: file name */
3164 ulint line, /*!< in: line where called */
3165 mtr_t* mtr) /*!< in: mini-transaction */
3166 {
3167 buf_pool_t* buf_pool;
3168 unsigned access_time;
3169 ibool success;
3170 ulint fix_type;
3171
3172 ut_ad(block);
3173 ut_ad(mtr);
3174 ut_ad(mtr->state == MTR_ACTIVE);
3175 ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
3176
3177 mutex_enter(&block->mutex);
3178
3179 if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
3180
3181 mutex_exit(&block->mutex);
3182
3183 return(FALSE);
3184 }
3185
3186 buf_block_buf_fix_inc(block, file, line);
3187
3188 access_time = buf_page_is_accessed(&block->page);
3189
3190 buf_page_set_accessed(&block->page);
3191
3192 mutex_exit(&block->mutex);
3193
3194 buf_page_make_young_if_needed(&block->page);
3195
3196 ut_ad(!ibuf_inside(mtr)
3197 || ibuf_page(buf_block_get_space(block),
3198 buf_block_get_zip_size(block),
3199 buf_block_get_page_no(block), NULL));
3200
3201 if (rw_latch == RW_S_LATCH) {
3202 success = rw_lock_s_lock_nowait(&(block->lock),
3203 file, line);
3204 fix_type = MTR_MEMO_PAGE_S_FIX;
3205 } else {
3206 success = rw_lock_x_lock_func_nowait_inline(&(block->lock),
3207 file, line);
3208 fix_type = MTR_MEMO_PAGE_X_FIX;
3209 }
3210
3211 if (UNIV_UNLIKELY(!success)) {
3212 buf_block_buf_fix_dec(block);
3213
3214 return(FALSE);
3215 }
3216
3217 if (UNIV_UNLIKELY(modify_clock != block->modify_clock)) {
3218 buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
3219
3220 if (rw_latch == RW_S_LATCH) {
3221 rw_lock_s_unlock(&(block->lock));
3222 } else {
3223 rw_lock_x_unlock(&(block->lock));
3224 }
3225
3226 buf_block_buf_fix_dec(block);
3227
3228 return(FALSE);
3229 }
3230
3231 mtr_memo_push(mtr, block, fix_type);
3232
3233 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3234 ut_a(++buf_dbg_counter % 5771 || buf_validate());
3235 ut_a(block->page.buf_fix_count > 0);
3236 ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3237 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3238
3239 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3240 mutex_enter(&block->mutex);
3241 ut_a(!block->page.file_page_was_freed);
3242 mutex_exit(&block->mutex);
3243 #endif
3244
3245 if (!access_time) {
3246 /* In the case of a first access, try to apply linear
3247 read-ahead */
3248
3249 buf_read_ahead_linear(buf_block_get_space(block),
3250 buf_block_get_zip_size(block),
3251 buf_block_get_page_no(block),
3252 ibuf_inside(mtr));
3253 }
3254
3255 #ifdef UNIV_IBUF_COUNT_DEBUG
3256 ut_a(ibuf_count_get(buf_block_get_space(block),
3257 buf_block_get_page_no(block)) == 0);
3258 #endif
3259 buf_pool = buf_pool_from_block(block);
3260 buf_pool->stat.n_page_gets++;
3261
3262 return(TRUE);
3263 }
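
/* Illustrative sketch (not part of the original source): the optimistic
access pattern. A caller remembers a guessed block together with its
modify clock, releases its latches, and later revalidates the guess;
buf_block_get_modify_clock() is assumed to be the accessor for the
saved clock value:

	guess = block;
	modify_clock = buf_block_get_modify_clock(block);

	(latches released, other work done)

	if (buf_page_optimistic_get(RW_S_LATCH, guess, modify_clock,
				    __FILE__, __LINE__, &mtr)) {
		the same page, unmodified: reuse it
	} else {
		fall back to buf_page_get_gen()
	}
*/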
3264
3265 /********************************************************************//**
3266 This is used to get access to a known database page, when no waiting can be
3267 done. For example, if a search in an adaptive hash index leads us to this
3268 frame.
3269 @return TRUE if success */
3270 UNIV_INTERN
3271 ibool
3272 buf_page_get_known_nowait(
3273 /*======================*/
3274 ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
3275 buf_block_t* block, /*!< in: the known page */
3276 ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
3277 const char* file, /*!< in: file name */
3278 ulint line, /*!< in: line where called */
3279 mtr_t* mtr) /*!< in: mini-transaction */
3280 {
3281 buf_pool_t* buf_pool;
3282 ibool success;
3283 ulint fix_type;
3284
3285 ut_ad(mtr);
3286 ut_ad(mtr->state == MTR_ACTIVE);
3287 ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
3288
3289 mutex_enter(&block->mutex);
3290
3291 if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
3292 /* Another thread is just freeing the block from the LRU list
3293 of the buffer pool: do not try to access this page; this
3294 attempt to access the page can only come through the hash
3295 index because when the buffer block state is ..._REMOVE_HASH,
3296 we have already removed it from the page address hash table
3297 of the buffer pool. */
3298
3299 mutex_exit(&block->mutex);
3300
3301 return(FALSE);
3302 }
3303
3304 ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3305
3306 buf_block_buf_fix_inc(block, file, line);
3307
3308 buf_page_set_accessed(&block->page);
3309
3310 mutex_exit(&block->mutex);
3311
3312 buf_pool = buf_pool_from_block(block);
3313
3314 if (mode == BUF_MAKE_YOUNG) {
3315 buf_page_make_young_if_needed(&block->page);
3316 }
3317
3318 ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD);
3319
3320 if (rw_latch == RW_S_LATCH) {
3321 success = rw_lock_s_lock_nowait(&(block->lock),
3322 file, line);
3323 fix_type = MTR_MEMO_PAGE_S_FIX;
3324 } else {
3325 success = rw_lock_x_lock_func_nowait_inline(&(block->lock),
3326 file, line);
3327 fix_type = MTR_MEMO_PAGE_X_FIX;
3328 }
3329
3330 if (!success) {
3331 buf_block_buf_fix_dec(block);
3332
3333 return(FALSE);
3334 }
3335
3336 mtr_memo_push(mtr, block, fix_type);
3337
3338 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3339 ut_a(++buf_dbg_counter % 5771 || buf_validate());
3340 ut_a(block->page.buf_fix_count > 0);
3341 ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3342 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3343 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3344 if (mode != BUF_KEEP_OLD) {
3345 /* If mode == BUF_KEEP_OLD, we are executing an I/O
3346 completion routine. Avoid a bogus assertion failure
3347 when ibuf_merge_or_delete_for_page() is processing a
3348 page that was just freed due to DROP INDEX, or
3349 deleting a record from SYS_INDEXES. This check will be
3350 skipped in recv_recover_page() as well. */
3351
3352 mutex_enter(&block->mutex);
3353 ut_a(!block->page.file_page_was_freed);
3354 mutex_exit(&block->mutex);
3355 }
3356 #endif
3357
3358 #ifdef UNIV_IBUF_COUNT_DEBUG
3359 ut_a((mode == BUF_KEEP_OLD)
3360 || (ibuf_count_get(buf_block_get_space(block),
3361 buf_block_get_page_no(block)) == 0));
3362 #endif
3363 buf_pool->stat.n_page_gets++;
3364
3365 return(TRUE);
3366 }
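
/* Illustrative sketch (not part of the original source): latching a
block we already hold a pointer to, without ever waiting, e.g. when
following an adaptive hash index guess:

	if (buf_page_get_known_nowait(RW_S_LATCH, block, BUF_MAKE_YOUNG,
				      __FILE__, __LINE__, &mtr)) {
		the block is latched, buffer-fixed and registered
		in the mini-transaction
	}
*/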
3367
3368 /*******************************************************************//**
3369 Given a tablespace id and page number tries to get that page. If the
3370 page is not in the buffer pool it is not loaded and NULL is returned.
3371 Suitable for use when holding the lock_sys_t::mutex.
3372 @return pointer to a page or NULL */
3373 UNIV_INTERN
3374 const buf_block_t*
3375 buf_page_try_get_func(
3376 /*==================*/
3377 ulint space_id,/*!< in: tablespace id */
3378 ulint page_no,/*!< in: page number */
3379 const char* file, /*!< in: file name */
3380 ulint line, /*!< in: line where called */
3381 mtr_t* mtr) /*!< in: mini-transaction */
3382 {
3383 buf_block_t* block;
3384 ibool success;
3385 ulint fix_type;
3386 buf_pool_t* buf_pool = buf_pool_get(space_id, page_no);
3387 rw_lock_t* hash_lock;
3388
3389 ut_ad(mtr);
3390 ut_ad(mtr->state == MTR_ACTIVE);
3391
3392 block = buf_block_hash_get_s_locked(buf_pool, space_id,
3393 page_no, &hash_lock);
3394
3395 if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
3396 if (block) {
3397 rw_lock_s_unlock(hash_lock);
3398 }
3399 return(NULL);
3400 }
3401
3402 ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
3403
3404 mutex_enter(&block->mutex);
3405 rw_lock_s_unlock(hash_lock);
3406
3407 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3408 ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3409 ut_a(buf_block_get_space(block) == space_id);
3410 ut_a(buf_block_get_page_no(block) == page_no);
3411 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3412
3413 buf_block_buf_fix_inc(block, file, line);
3414 mutex_exit(&block->mutex);
3415
3416 fix_type = MTR_MEMO_PAGE_S_FIX;
3417 success = rw_lock_s_lock_nowait(&block->lock, file, line);
3418
3419 if (!success) {
3420 /* Let us try to get an X-latch. If the current thread
3421 is holding an X-latch on the page, we cannot get an
3422 S-latch. */
3423
3424 fix_type = MTR_MEMO_PAGE_X_FIX;
3425 success = rw_lock_x_lock_func_nowait_inline(&block->lock,
3426 file, line);
3427 }
3428
3429 if (!success) {
3430 buf_block_buf_fix_dec(block);
3431
3432 return(NULL);
3433 }
3434
3435 mtr_memo_push(mtr, block, fix_type);
3436 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3437 ut_a(++buf_dbg_counter % 5771 || buf_validate());
3438 ut_a(block->page.buf_fix_count > 0);
3439 ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3440 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3441 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3442 mutex_enter(&block->mutex);
3443 ut_a(!block->page.file_page_was_freed);
3444 mutex_exit(&block->mutex);
3445 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
3446 buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
3447
3448 buf_pool->stat.n_page_gets++;
3449
3450 #ifdef UNIV_IBUF_COUNT_DEBUG
3451 ut_a(ibuf_count_get(buf_block_get_space(block),
3452 buf_block_get_page_no(block)) == 0);
3453 #endif
3454
3455 return(block);
3456 }
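
/* Illustrative sketch (not part of the original source): peeking at a
page while holding lock_sys_t::mutex. A buf_page_try_get() wrapper
macro passing __FILE__ and __LINE__ is assumed to exist in buf0buf.h:

	const buf_block_t*	block;

	block = buf_page_try_get(space_id, page_no, &mtr);

	if (block != NULL) {
		the page is latched and fixed; both are released
		by mtr_commit()
	}
*/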
3457
3458 /********************************************************************//**
3459 Initialize some fields of a control block. */
3460 UNIV_INLINE
3461 void
3462 buf_page_init_low(
3463 /*==============*/
3464 buf_page_t* bpage) /*!< in: block to init */
3465 {
3466 bpage->flush_type = BUF_FLUSH_LRU;
3467 bpage->io_fix = BUF_IO_NONE;
3468 bpage->buf_fix_count = 0;
3469 bpage->freed_page_clock = 0;
3470 bpage->access_time = 0;
3471 bpage->newest_modification = 0;
3472 bpage->oldest_modification = 0;
3473 HASH_INVALIDATE(bpage, hash);
3474 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3475 bpage->file_page_was_freed = FALSE;
3476 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
3477 }
3478
3479 /********************************************************************//**
3480 Inits a page to the buffer buf_pool. */
3481 static MY_ATTRIBUTE((nonnull))
3482 void
3483 buf_page_init(
3484 /*==========*/
3485 buf_pool_t* buf_pool,/*!< in/out: buffer pool */
3486 ulint space, /*!< in: space id */
3487 ulint offset, /*!< in: offset of the page within space
3488 in units of a page */
3489 ulint fold, /*!< in: buf_page_address_fold(space,offset) */
3490 ulint zip_size,/*!< in: compressed page size, or 0 */
3491 buf_block_t* block) /*!< in/out: block to init */
3492 {
3493 buf_page_t* hash_page;
3494
3495 ut_ad(buf_pool == buf_pool_get(space, offset));
3496 ut_ad(buf_pool_mutex_own(buf_pool));
3497
3498 ut_ad(mutex_own(&(block->mutex)));
3499 ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
3500
3501 #ifdef UNIV_SYNC_DEBUG
3502 ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, fold),
3503 RW_LOCK_EX));
3504 #endif /* UNIV_SYNC_DEBUG */
3505
3506 /* Set the state of the block */
3507 buf_block_set_file_page(block, space, offset);
3508
3509 #ifdef UNIV_DEBUG_VALGRIND
3510 if (!space) {
3511 /* Silence valid Valgrind warnings about uninitialized
3512 data being written to data files. There are some unused
3513 bytes on some pages that InnoDB does not initialize. */
3514 UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE);
3515 }
3516 #endif /* UNIV_DEBUG_VALGRIND */
3517
3518 buf_block_init_low(block);
3519
3520 block->lock_hash_val = lock_rec_hash(space, offset);
3521
3522 buf_page_init_low(&block->page);
3523
3524 /* Insert into the hash table of file pages */
3525
3526 hash_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
3527
3528 if (hash_page == NULL) {
3529 /* Block not found in the hash table */
3530 } else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) {
3531 ib_uint32_t buf_fix_count = hash_page->buf_fix_count;
3532
3533 ut_a(buf_fix_count > 0);
3534
3535 #ifdef PAGE_ATOMIC_REF_COUNT
3536 os_atomic_increment_uint32(
3537 &block->page.buf_fix_count, buf_fix_count);
3538 #else
3539 block->page.buf_fix_count += ulint(buf_fix_count);
3540 #endif /* PAGE_ATOMIC_REF_COUNT */
3541
3542 buf_pool_watch_remove(buf_pool, fold, hash_page);
3543 } else {
3544 fprintf(stderr,
3545 "InnoDB: Error: page %lu %lu already found"
3546 " in the hash table: %p, %p\n",
3547 (ulong) space,
3548 (ulong) offset,
3549 (const void*) hash_page, (const void*) block);
3550 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3551 mutex_exit(&block->mutex);
3552 buf_pool_mutex_exit(buf_pool);
3553 buf_print();
3554 buf_LRU_print();
3555 buf_validate();
3556 buf_LRU_validate();
3557 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3558 ut_error;
3559 }
3560
3561 ut_ad(!block->page.in_zip_hash);
3562 ut_ad(!block->page.in_page_hash);
3563 ut_d(block->page.in_page_hash = TRUE);
3564
3565 HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, &block->page);
3566
3567 if (zip_size) {
3568 page_zip_set_size(&block->page.zip, zip_size);
3569 }
3570 }
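/* Illustrative sketch (assumption, not in the original source): the
page_hash access protocol that buf_page_init() relies on.  The caller
computes the address fold, X-latches the corresponding hash partition,
and only then looks up or inserts the page, exactly as the callers
below do.

	ulint		fold = buf_page_address_fold(space, offset);
	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, fold);

	rw_lock_x_lock(hash_lock);
	buf_page_t*	hash_page = buf_page_hash_get_low(
		buf_pool, space, offset, fold);
	// ... inspect hash_page or insert a new entry while the hash
	// partition stays X-latched ...
	rw_lock_x_unlock(hash_lock);
*/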
3571
3572 /********************************************************************//**
3573 Initializes a page for a read into the buffer pool buf_pool. If the page
3574 (1) is already in buf_pool, or
3575 (2) is not an ibuf page while we specified to read only ibuf pages, or
3576 (3) belongs to a tablespace that is deleted or being deleted,
3577 then this function does nothing.
3578 Otherwise it sets the io_fix flag to BUF_IO_READ and takes a non-recursive
3579 exclusive lock on the buffer frame. The io-handler must take care that the
3580 flag is cleared and the lock released later.
3581 @return pointer to the block or NULL */
3582 UNIV_INTERN
3583 buf_page_t*
3584 buf_page_init_for_read(
3585 /*===================*/
3586 dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
3587 ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
3588 ulint space, /*!< in: space id */
3589 ulint zip_size,/*!< in: compressed page size, or 0 */
3590 ibool unzip, /*!< in: TRUE=request uncompressed page */
3591 ib_int64_t tablespace_version,
3592 /*!< in: prevents reading from a wrong
3593 version of the tablespace in case we have done
3594 DISCARD + IMPORT */
3595 ulint offset) /*!< in: page number */
3596 {
3597 buf_block_t* block;
3598 buf_page_t* bpage = NULL;
3599 buf_page_t* watch_page;
3600 rw_lock_t* hash_lock;
3601 mtr_t mtr;
3602 ulint fold;
3603 ibool lru = FALSE;
3604 void* data;
3605 buf_pool_t* buf_pool = buf_pool_get(space, offset);
3606
3607 ut_ad(buf_pool);
3608
3609 *err = DB_SUCCESS;
3610
3611 if (mode == BUF_READ_IBUF_PAGES_ONLY) {
3612 /* It is a read-ahead within an ibuf routine */
3613
3614 ut_ad(!ibuf_bitmap_page(zip_size, offset));
3615
3616 ibuf_mtr_start(&mtr);
3617
3618 if (!recv_no_ibuf_operations
3619 && !ibuf_page(space, zip_size, offset, &mtr)) {
3620
3621 ibuf_mtr_commit(&mtr);
3622
3623 return(NULL);
3624 }
3625 } else {
3626 ut_ad(mode == BUF_READ_ANY_PAGE);
3627 }
3628
3629 if (zip_size && !unzip && !recv_recovery_is_on()) {
3630 block = NULL;
3631 } else {
3632 block = buf_LRU_get_free_block(buf_pool);
3633 ut_ad(block);
3634 ut_ad(buf_pool_from_block(block) == buf_pool);
3635 }
3636
3637 fold = buf_page_address_fold(space, offset);
3638 hash_lock = buf_page_hash_lock_get(buf_pool, fold);
3639
3640 buf_pool_mutex_enter(buf_pool);
3641 rw_lock_x_lock(hash_lock);
3642
3643 watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
3644 if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
3645 /* The page is already in the buffer pool. */
3646 watch_page = NULL;
3647 err_exit:
3648 rw_lock_x_unlock(hash_lock);
3649 if (block) {
3650 mutex_enter(&block->mutex);
3651 buf_LRU_block_free_non_file_page(block);
3652 mutex_exit(&block->mutex);
3653 }
3654
3655 bpage = NULL;
3656 goto func_exit;
3657 }
3658
3659 if (fil_tablespace_deleted_or_being_deleted_in_mem(
3660 space, tablespace_version)) {
3661 /* The page belongs to a space which has been
3662 deleted or is being deleted. */
3663 *err = DB_TABLESPACE_DELETED;
3664
3665 goto err_exit;
3666 }
3667
3668 if (block) {
3669 bpage = &block->page;
3670
3671 mutex_enter(&block->mutex);
3672
3673 ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
3674
3675 buf_page_init(buf_pool, space, offset, fold, zip_size, block);
3676
3677 #ifdef PAGE_ATOMIC_REF_COUNT
3678 /* Note: We set the io state without the protection of
3679 the block->lock. This is because other threads cannot
3680 access this block unless it is in the hash table. */
3681
3682 buf_page_set_io_fix(bpage, BUF_IO_READ);
3683 #endif /* PAGE_ATOMIC_REF_COUNT */
3684
3685 rw_lock_x_unlock(hash_lock);
3686
3687 /* The block must be put to the LRU list, to the old blocks */
3688 buf_LRU_add_block(bpage, TRUE/* to old blocks */);
3689
3690 /* We set a pass-type x-lock on the frame because then
3691 the same thread which called for the read operation
3692 (and is running now at this point of code) can wait
3693 for the read to complete by waiting for the x-lock on
3694 the frame; if the x-lock were recursive, the same
3695 thread would illegally get the x-lock before the page
3696 read is completed. The x-lock is cleared by the
3697 io-handler thread. */
3698
3699 rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
3700
3701 #ifndef PAGE_ATOMIC_REF_COUNT
3702 buf_page_set_io_fix(bpage, BUF_IO_READ);
3703 #endif /* !PAGE_ATOMIC_REF_COUNT */
3704
3705 if (zip_size) {
3706 /* buf_pool->mutex may be released and
3707 reacquired by buf_buddy_alloc(). Thus, we
3708 must release block->mutex in order not to
3709 break the latching order in the reacquisition
3710 of buf_pool->mutex. We also must defer this
3711 operation until after the block descriptor has
3712 been added to buf_pool->LRU and
3713 buf_pool->page_hash. */
3714 mutex_exit(&block->mutex);
3715 data = buf_buddy_alloc(buf_pool, zip_size, &lru);
3716 mutex_enter(&block->mutex);
3717 block->page.zip.data = (page_zip_t*) data;
3718
3719 /* To maintain the invariant
3720 block->in_unzip_LRU_list
3721 == buf_page_belongs_to_unzip_LRU(&block->page)
3722 we have to add this block to unzip_LRU
3723 after block->page.zip.data is set. */
3724 ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
3725 buf_unzip_LRU_add_block(block, TRUE);
3726 }
3727
3728 mutex_exit(&block->mutex);
3729 } else {
3730 rw_lock_x_unlock(hash_lock);
3731
3732 /* The compressed page must be allocated before the
3733 control block (bpage), in order to avoid the
3734 invocation of buf_buddy_relocate_block() on
3735 uninitialized data. */
3736 data = buf_buddy_alloc(buf_pool, zip_size, &lru);
3737
3738 rw_lock_x_lock(hash_lock);
3739
3740 /* If buf_buddy_alloc() allocated storage from the LRU list,
3741 it released and reacquired buf_pool->mutex. Thus, we must
3742 check the page_hash again, as it may have been modified. */
3743 if (UNIV_UNLIKELY(lru)) {
3744
3745 watch_page = buf_page_hash_get_low(
3746 buf_pool, space, offset, fold);
3747
3748 if (UNIV_UNLIKELY(watch_page
3749 && !buf_pool_watch_is_sentinel(buf_pool,
3750 watch_page))) {
3751
3752 /* The block was added by some other thread. */
3753 rw_lock_x_unlock(hash_lock);
3754 watch_page = NULL;
3755 buf_buddy_free(buf_pool, data, zip_size);
3756
3757 bpage = NULL;
3758 goto func_exit;
3759 }
3760 }
3761
3762 bpage = buf_page_alloc_descriptor();
3763
3764 /* Initialize the buf_pool pointer. */
3765 bpage->buf_pool_index = buf_pool_index(buf_pool);
3766
3767 page_zip_des_init(&bpage->zip);
3768 page_zip_set_size(&bpage->zip, zip_size);
3769 bpage->zip.data = (page_zip_t*) data;
3770
3771 mutex_enter(&buf_pool->zip_mutex);
3772 UNIV_MEM_DESC(bpage->zip.data,
3773 page_zip_get_size(&bpage->zip));
3774
3775 buf_page_init_low(bpage);
3776
3777 bpage->state = BUF_BLOCK_ZIP_PAGE;
3778 bpage->space = static_cast<ib_uint32_t>(space);
3779 bpage->offset = static_cast<ib_uint32_t>(offset);
3780
3781 #ifdef UNIV_DEBUG
3782 bpage->in_page_hash = FALSE;
3783 bpage->in_zip_hash = FALSE;
3784 bpage->in_flush_list = FALSE;
3785 bpage->in_free_list = FALSE;
3786 bpage->in_LRU_list = FALSE;
3787 #endif /* UNIV_DEBUG */
3788
3789 ut_d(bpage->in_page_hash = TRUE);
3790
3791 if (watch_page != NULL) {
3792
3793 /* Preserve the reference count. */
3794 ib_uint32_t buf_fix_count;
3795
3796 buf_fix_count = watch_page->buf_fix_count;
3797
3798 ut_a(buf_fix_count > 0);
3799
3800 #ifdef PAGE_ATOMIC_REF_COUNT
3801 os_atomic_increment_uint32(
3802 &bpage->buf_fix_count, buf_fix_count);
3803 #else
3804 bpage->buf_fix_count += buf_fix_count;
3805 #endif /* PAGE_ATOMIC_REF_COUNT */
3806
3807 ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
3808 buf_pool_watch_remove(buf_pool, fold, watch_page);
3809 }
3810
3811 HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold,
3812 bpage);
3813
3814 rw_lock_x_unlock(hash_lock);
3815
3816 /* The block must be put to the LRU list, to the old blocks.
3817 The zip_size is already set into the page zip */
3818 buf_LRU_add_block(bpage, TRUE/* to old blocks */);
3819 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3820 buf_LRU_insert_zip_clean(bpage);
3821 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3822
3823 buf_page_set_io_fix(bpage, BUF_IO_READ);
3824
3825 mutex_exit(&buf_pool->zip_mutex);
3826 }
3827
3828 buf_pool->n_pend_reads++;
3829 func_exit:
3830 buf_pool_mutex_exit(buf_pool);
3831
3832 if (mode == BUF_READ_IBUF_PAGES_ONLY) {
3833
3834 ibuf_mtr_commit(&mtr);
3835 }
3836
3837
3838 #ifdef UNIV_SYNC_DEBUG
3839 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
3840 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
3841 #endif /* UNIV_SYNC_DEBUG */
3842
3843 ut_ad(!bpage || buf_page_in_file(bpage));
3844 return(bpage);
3845 }
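/* Illustrative sketch (assumption, not in the original source): how a
thread that scheduled the read can wait for it to finish.  Because
buf_page_init_for_read() leaves the frame X-latched with
io_fix == BUF_IO_READ, and buf_page_io_complete() releases that
X-latch, merely acquiring and releasing an S-latch blocks until the
read has completed.

	static void
	wait_for_read_to_complete(buf_block_t* block)
	{
		// Blocks until the io-handler thread releases the
		// BUF_IO_READ X-latch in buf_page_io_complete().
		rw_lock_s_lock(&block->lock);
		rw_lock_s_unlock(&block->lock);
	}
*/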
3846
3847 /********************************************************************//**
3848 Initializes a page in the buffer pool buf_pool. The page is usually not
3849 read from a file even if it cannot be found in the buffer pool. This is
3850 one of the functions which perform the state transition NOT_USED =>
3851 FILE_PAGE on a block (the other is buf_page_get_gen).
3852 @return pointer to the block, page bufferfixed */
3853 UNIV_INTERN
3854 buf_block_t*
3855 buf_page_create(
3856 /*============*/
3857 ulint space, /*!< in: space id */
3858 ulint offset, /*!< in: offset of the page within space in units of
3859 a page */
3860 ulint zip_size,/*!< in: compressed page size, or 0 */
3861 mtr_t* mtr) /*!< in: mini-transaction handle */
3862 {
3863 buf_frame_t* frame;
3864 buf_block_t* block;
3865 ulint fold;
3866 buf_block_t* free_block = NULL;
3867 buf_pool_t* buf_pool = buf_pool_get(space, offset);
3868 rw_lock_t* hash_lock;
3869
3870 ut_ad(mtr);
3871 ut_ad(mtr->state == MTR_ACTIVE);
3872 ut_ad(space || !zip_size);
3873
3874 free_block = buf_LRU_get_free_block(buf_pool);
3875
3876 fold = buf_page_address_fold(space, offset);
3877 hash_lock = buf_page_hash_lock_get(buf_pool, fold);
3878
3879 buf_pool_mutex_enter(buf_pool);
3880 rw_lock_x_lock(hash_lock);
3881
3882 block = (buf_block_t*) buf_page_hash_get_low(
3883 buf_pool, space, offset, fold);
3884
3885 if (block
3886 && buf_page_in_file(&block->page)
3887 && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
3888 #ifdef UNIV_IBUF_COUNT_DEBUG
3889 ut_a(ibuf_count_get(space, offset) == 0);
3890 #endif
3891 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3892 block->page.file_page_was_freed = FALSE;
3893 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
3894
3895 /* Page can be found in buf_pool */
3896 buf_pool_mutex_exit(buf_pool);
3897 rw_lock_x_unlock(hash_lock);
3898
3899 buf_block_free(free_block);
3900
3901 return(buf_page_get_with_no_latch(space, zip_size, offset, mtr));
3902 }
3903
3904 /* If we get here, the page was not in buf_pool: init it there */
3905
3906 #ifdef UNIV_DEBUG
3907 if (buf_debug_prints) {
3908 fprintf(stderr, "Creating space %lu page %lu to buffer\n",
3909 (ulong) space, (ulong) offset);
3910 }
3911 #endif /* UNIV_DEBUG */
3912
3913 block = free_block;
3914
3915 mutex_enter(&block->mutex);
3916
3917 buf_page_init(buf_pool, space, offset, fold, zip_size, block);
3918
3919 rw_lock_x_unlock(hash_lock);
3920
3921 /* The block must be put to the LRU list */
3922 buf_LRU_add_block(&block->page, FALSE);
3923
3924 buf_block_buf_fix_inc(block, __FILE__, __LINE__);
3925 buf_pool->stat.n_pages_created++;
3926
3927 if (zip_size) {
3928 void* data;
3929 ibool lru;
3930
3931 /* Prevent race conditions during buf_buddy_alloc(),
3932 which may release and reacquire buf_pool->mutex,
3933 by IO-fixing and X-latching the block. */
3934
3935 buf_page_set_io_fix(&block->page, BUF_IO_READ);
3936 rw_lock_x_lock(&block->lock);
3937
3938 mutex_exit(&block->mutex);
3939 /* buf_pool->mutex may be released and reacquired by
3940 buf_buddy_alloc(). Thus, we must release block->mutex
3941 in order not to break the latching order in
3942 the reacquisition of buf_pool->mutex. We also must
3943 defer this operation until after the block descriptor
3944 has been added to buf_pool->LRU and buf_pool->page_hash. */
3945 data = buf_buddy_alloc(buf_pool, zip_size, &lru);
3946 mutex_enter(&block->mutex);
3947 block->page.zip.data = (page_zip_t*) data;
3948
3949 /* To maintain the invariant
3950 block->in_unzip_LRU_list
3951 == buf_page_belongs_to_unzip_LRU(&block->page)
3952 we have to add this block to unzip_LRU after
3953 block->page.zip.data is set. */
3954 ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
3955 buf_unzip_LRU_add_block(block, FALSE);
3956
3957 buf_page_set_io_fix(&block->page, BUF_IO_NONE);
3958 rw_lock_x_unlock(&block->lock);
3959 }
3960
3961 buf_pool_mutex_exit(buf_pool);
3962
3963 mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
3964
3965 buf_page_set_accessed(&block->page);
3966
3967 mutex_exit(&block->mutex);
3968
3969 /* Delete possible entries for the page from the insert buffer:
3970 such can exist if the page belonged to an index which was dropped */
3971
3972 ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);
3973
3974 frame = block->frame;
3975
3976 memset(frame + FIL_PAGE_PREV, 0xff, 4);
3977 memset(frame + FIL_PAGE_NEXT, 0xff, 4);
3978 mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
3979
3980 /* Reset to zero the file flush lsn field in the page; if the first
3981 page of an ibdata file is 'created' in this function into the buffer
3982 pool then we lose the original contents of the file flush lsn stamp.
3983 Then InnoDB could in a crash recovery print a big, false, corruption
3984 warning if the stamp contains an lsn bigger than the ib_logfile lsn. */
3985
3986 memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
3987
3988 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3989 ut_a(++buf_dbg_counter % 5771 || buf_validate());
3990 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3991 #ifdef UNIV_IBUF_COUNT_DEBUG
3992 ut_a(ibuf_count_get(buf_block_get_space(block),
3993 buf_block_get_page_no(block)) == 0);
3994 #endif
3995 return(block);
3996 }
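/* Illustrative usage sketch (assumption, not in the original source):
a typical caller creates the page inside a mini-transaction and then
formats the frame itself; no read from disk takes place.  The space id
and page number used here are placeholders.

	mtr_t		mtr;
	buf_block_t*	block;

	mtr_start(&mtr);
	block = buf_page_create(space, page_no, 0, &mtr);
	// zip_size == 0 means an uncompressed page.  The caller is
	// responsible for initializing the frame contents before the
	// mini-transaction commits.
	mtr_commit(&mtr);
*/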
3997
3998 /********************************************************************//**
3999 Monitors the buffer page read/write activity, and increments the
4000 corresponding counter value if the MONITOR_MODULE_BUF_PAGE
4001 (module_buf_page) module is enabled. */
4002 static
4003 void
4004 buf_page_monitor(
4005 /*=============*/
4006 const buf_page_t* bpage, /*!< in: pointer to the block */
4007 enum buf_io_fix io_type)/*!< in: io_fix types */
4008 {
4009 const byte* frame;
4010 monitor_id_t counter;
4011
4012 /* If the counter module is not turned on, just return */
4013 if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
4014 return;
4015 }
4016
4017 ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
4018
4019 frame = bpage->zip.data
4020 ? bpage->zip.data
4021 : ((buf_block_t*) bpage)->frame;
4022
4023 switch (fil_page_get_type(frame)) {
4024 ulint level;
4025
4026 case FIL_PAGE_INDEX:
4027 level = btr_page_get_level_low(frame);
4028
4029 /* Check if it is an index page for insert buffer */
4030 if (btr_page_get_index_id(frame)
4031 == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
4032 if (level == 0) {
4033 counter = MONITOR_RW_COUNTER(
4034 io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
4035 } else {
4036 counter = MONITOR_RW_COUNTER(
4037 io_type,
4038 MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
4039 }
4040 } else {
4041 if (level == 0) {
4042 counter = MONITOR_RW_COUNTER(
4043 io_type, MONITOR_INDEX_LEAF_PAGE);
4044 } else {
4045 counter = MONITOR_RW_COUNTER(
4046 io_type, MONITOR_INDEX_NON_LEAF_PAGE);
4047 }
4048 }
4049 break;
4050
4051 case FIL_PAGE_UNDO_LOG:
4052 counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
4053 break;
4054
4055 case FIL_PAGE_INODE:
4056 counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
4057 break;
4058
4059 case FIL_PAGE_IBUF_FREE_LIST:
4060 counter = MONITOR_RW_COUNTER(io_type,
4061 MONITOR_IBUF_FREELIST_PAGE);
4062 break;
4063
4064 case FIL_PAGE_IBUF_BITMAP:
4065 counter = MONITOR_RW_COUNTER(io_type,
4066 MONITOR_IBUF_BITMAP_PAGE);
4067 break;
4068
4069 case FIL_PAGE_TYPE_SYS:
4070 counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
4071 break;
4072
4073 case FIL_PAGE_TYPE_TRX_SYS:
4074 counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
4075 break;
4076
4077 case FIL_PAGE_TYPE_FSP_HDR:
4078 counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
4079 break;
4080
4081 case FIL_PAGE_TYPE_XDES:
4082 counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
4083 break;
4084
4085 case FIL_PAGE_TYPE_BLOB:
4086 counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
4087 break;
4088
4089 case FIL_PAGE_TYPE_ZBLOB:
4090 counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
4091 break;
4092
4093 case FIL_PAGE_TYPE_ZBLOB2:
4094 counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
4095 break;
4096
4097 default:
4098 counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
4099 }
4100
4101 MONITOR_INC_NOCHECK(counter);
4102 }
4103
4104 /********************************************************************//**
4105 Marks the table that uses the tablespace identified by bpage->space as
4106 corrupted. Also removes the bpage from the LRU list.
4107 @return TRUE if successful */
4108 static
4109 ibool
4110 buf_mark_space_corrupt(
4111 /*===================*/
4112 buf_page_t* bpage) /*!< in: pointer to the block in question */
4113 {
4114 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
4115 const ibool uncompressed = (buf_page_get_state(bpage)
4116 == BUF_BLOCK_FILE_PAGE);
4117 ulint space = bpage->space;
4118 ibool ret = TRUE;
4119
4120 /* First unfix and release lock on the bpage */
4121 buf_pool_mutex_enter(buf_pool);
4122 mutex_enter(buf_page_get_mutex(bpage));
4123 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
4124 ut_ad(bpage->buf_fix_count == 0);
4125
4126 /* Set BUF_IO_NONE before we remove the block from LRU list */
4127 buf_page_set_io_fix(bpage, BUF_IO_NONE);
4128
4129 if (uncompressed) {
4130 rw_lock_x_unlock_gen(
4131 &((buf_block_t*) bpage)->lock,
4132 BUF_IO_READ);
4133 }
4134
4135 mutex_exit(buf_page_get_mutex(bpage));
4136
4137 /* Find the table with specified space id, and mark it corrupted */
4138 if (dict_set_corrupted_by_space(space)) {
4139 buf_LRU_free_one_page(bpage);
4140 } else {
4141 ret = FALSE;
4142 }
4143
4144 ut_ad(buf_pool->n_pend_reads > 0);
4145 buf_pool->n_pend_reads--;
4146
4147 buf_pool_mutex_exit(buf_pool);
4148
4149 return(ret);
4150 }
4151
4152 /********************************************************************//**
4153 Completes an asynchronous read or write request of a file page to or from
4154 the buffer pool.
4155 @return true if successful */
4156 UNIV_INTERN
4157 bool
4158 buf_page_io_complete(
4159 /*=================*/
4160 buf_page_t* bpage) /*!< in: pointer to the block in question */
4161 {
4162 enum buf_io_fix io_type;
4163 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
4164 const ibool uncompressed = (buf_page_get_state(bpage)
4165 == BUF_BLOCK_FILE_PAGE);
4166
4167 ut_a(buf_page_in_file(bpage));
4168
4169 	/* We do not need to protect io_fix here with a mutex to read
4170 	it, because this is the only function where we can change the value
4171 from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
4172 ensures that this is the only thread that handles the i/o for this
4173 block. */
4174
4175 io_type = buf_page_get_io_fix(bpage);
4176 ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
4177
4178 if (io_type == BUF_IO_READ) {
4179 ulint read_page_no;
4180 ulint read_space_id;
4181 byte* frame;
4182
4183 if (buf_page_get_zip_size(bpage)) {
4184 frame = bpage->zip.data;
4185 buf_pool->n_pend_unzip++;
4186 if (uncompressed
4187 && !buf_zip_decompress((buf_block_t*) bpage,
4188 FALSE)) {
4189
4190 buf_pool->n_pend_unzip--;
4191 goto corrupt;
4192 }
4193 buf_pool->n_pend_unzip--;
4194 } else {
4195 ut_a(uncompressed);
4196 frame = ((buf_block_t*) bpage)->frame;
4197 }
4198
4199 /* If this page is not uninitialized and not in the
4200 doublewrite buffer, then the page number and space id
4201 should be the same as in block. */
4202 read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
4203 read_space_id = mach_read_from_4(
4204 frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4205
4206 if (bpage->space == TRX_SYS_SPACE
4207 && buf_dblwr_page_inside(bpage->offset)) {
4208
4209 ut_print_timestamp(stderr);
4210 fprintf(stderr,
4211 " InnoDB: Error: reading page %lu\n"
4212 "InnoDB: which is in the"
4213 " doublewrite buffer!\n",
4214 (ulong) bpage->offset);
4215 } else if (!read_space_id && !read_page_no) {
4216 /* This is likely an uninitialized page. */
4217 } else if ((bpage->space
4218 && bpage->space != read_space_id)
4219 || bpage->offset != read_page_no) {
4220 /* We did not compare space_id to read_space_id
4221 if bpage->space == 0, because the field on the
4222 page may contain garbage in MySQL < 4.1.1,
4223 which only supported bpage->space == 0. */
4224
4225 ut_print_timestamp(stderr);
4226 fprintf(stderr,
4227 " InnoDB: Error: space id and page n:o"
4228 " stored in the page\n"
4229 "InnoDB: read in are %lu:%lu,"
4230 " should be %lu:%lu!\n",
4231 (ulong) read_space_id, (ulong) read_page_no,
4232 (ulong) bpage->space,
4233 (ulong) bpage->offset);
4234 }
4235
4236 /* From version 3.23.38 up we store the page checksum
4237 to the 4 first bytes of the page end lsn field */
4238
4239 if (buf_page_is_corrupted(true, frame,
4240 buf_page_get_zip_size(bpage))) {
4241
4242 /* Not a real corruption if it was triggered by
4243 error injection */
4244 DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
4245 if (bpage->space > TRX_SYS_SPACE
4246 && buf_mark_space_corrupt(bpage)) {
4247 ib_logf(IB_LOG_LEVEL_INFO,
4248 "Simulated page corruption");
4249 return(true);
4250 }
4251 goto page_not_corrupt;
4252 ;);
4253 corrupt:
4254 fprintf(stderr,
4255 "InnoDB: Database page corruption on disk"
4256 " or a failed\n"
4257 "InnoDB: file read of page %lu.\n"
4258 "InnoDB: You may have to recover"
4259 " from a backup.\n",
4260 (ulong) bpage->offset);
4261 buf_page_print(frame, buf_page_get_zip_size(bpage),
4262 BUF_PAGE_PRINT_NO_CRASH);
4263 fprintf(stderr,
4264 "InnoDB: Database page corruption on disk"
4265 " or a failed\n"
4266 "InnoDB: file read of page %lu.\n"
4267 "InnoDB: You may have to recover"
4268 " from a backup.\n",
4269 (ulong) bpage->offset);
4270 fputs("InnoDB: It is also possible that"
4271 " your operating\n"
4272 "InnoDB: system has corrupted its"
4273 " own file cache\n"
4274 "InnoDB: and rebooting your computer"
4275 " removes the\n"
4276 "InnoDB: error.\n"
4277 "InnoDB: If the corrupt page is an index page\n"
4278 "InnoDB: you can also try to"
4279 " fix the corruption\n"
4280 "InnoDB: by dumping, dropping,"
4281 " and reimporting\n"
4282 "InnoDB: the corrupt table."
4283 " You can use CHECK\n"
4284 "InnoDB: TABLE to scan your"
4285 " table for corruption.\n"
4286 "InnoDB: See also "
4287 REFMAN "forcing-innodb-recovery.html\n"
4288 "InnoDB: about forcing recovery.\n", stderr);
4289
4290 if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
4291 /* If page space id is larger than TRX_SYS_SPACE
4292 (0), we will attempt to mark the corresponding
4293 table as corrupted instead of crashing server */
4294 if (bpage->space > TRX_SYS_SPACE
4295 && buf_mark_space_corrupt(bpage)) {
4296 return(false);
4297 } else {
4298 fputs("InnoDB: Ending processing"
4299 " because of"
4300 " a corrupt database page.\n",
4301 stderr);
4302
4303 ut_error;
4304 }
4305 }
4306 }
4307
4308 DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
4309 page_not_corrupt: bpage = bpage; );
4310
4311 if (recv_recovery_is_on()) {
4312 /* Pages must be uncompressed for crash recovery. */
4313 ut_a(uncompressed);
4314 recv_recover_page(TRUE, (buf_block_t*) bpage);
4315 }
4316
4317 if (uncompressed && !recv_no_ibuf_operations) {
4318 ibuf_merge_or_delete_for_page(
4319 (buf_block_t*) bpage, bpage->space,
4320 bpage->offset, buf_page_get_zip_size(bpage),
4321 TRUE);
4322 }
4323 }
4324
4325 buf_pool_mutex_enter(buf_pool);
4326 mutex_enter(buf_page_get_mutex(bpage));
4327
4328 #ifdef UNIV_IBUF_COUNT_DEBUG
4329 if (io_type == BUF_IO_WRITE || uncompressed) {
4330 /* For BUF_IO_READ of compressed-only blocks, the
4331 buffered operations will be merged by buf_page_get_gen()
4332 after the block has been uncompressed. */
4333 ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
4334 }
4335 #endif
4336 /* Because this thread which does the unlocking is not the same that
4337 did the locking, we use a pass value != 0 in unlock, which simply
4338 removes the newest lock debug record, without checking the thread
4339 id. */
4340
4341 buf_page_set_io_fix(bpage, BUF_IO_NONE);
4342
4343 switch (io_type) {
4344 case BUF_IO_READ:
4345 /* NOTE that the call to ibuf may have moved the ownership of
4346 the x-latch to this OS thread: do not let this confuse you in
4347 debugging! */
4348
4349 ut_ad(buf_pool->n_pend_reads > 0);
4350 buf_pool->n_pend_reads--;
4351 buf_pool->stat.n_pages_read++;
4352
4353 if (uncompressed) {
4354 rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
4355 BUF_IO_READ);
4356 }
4357
4358 break;
4359
4360 case BUF_IO_WRITE:
4361 /* Write means a flush operation: call the completion
4362 routine in the flush system */
4363
4364 buf_flush_write_complete(bpage);
4365
4366 if (uncompressed) {
4367 rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock,
4368 BUF_IO_WRITE);
4369 }
4370
4371 buf_pool->stat.n_pages_written++;
4372
4373 break;
4374
4375 default:
4376 ut_error;
4377 }
4378
4379 buf_page_monitor(bpage, io_type);
4380
4381 #ifdef UNIV_DEBUG
4382 if (buf_debug_prints) {
4383 fprintf(stderr, "Has %s page space %lu page no %lu\n",
4384 io_type == BUF_IO_READ ? "read" : "written",
4385 (ulong) buf_page_get_space(bpage),
4386 (ulong) buf_page_get_page_no(bpage));
4387 }
4388 #endif /* UNIV_DEBUG */
4389
4390 mutex_exit(buf_page_get_mutex(bpage));
4391 buf_pool_mutex_exit(buf_pool);
4392
4393 return(true);
4394 }
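/* Illustrative sketch (assumption, not in the original source): the
stamp check performed in the read-completion path above, extracted into
a stand-alone predicate.

	static ibool
	page_stamp_matches(const byte* frame, ulint space, ulint page_no)
	{
		ulint	stored_page_no = mach_read_from_4(
			frame + FIL_PAGE_OFFSET);
		ulint	stored_space_id = mach_read_from_4(
			frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);

		// As noted above, the space id field of space 0 pages
		// written by MySQL < 4.1.1 may contain garbage, so it is
		// only compared when space != 0.
		return(stored_page_no == page_no
		       && (!space || stored_space_id == space));
	}
*/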
4395
4396 /*********************************************************************//**
4397 Asserts that all file pages in the buffer are in a replaceable state.
4398 @return TRUE */
4399 static
4400 ibool
4401 buf_all_freed_instance(
4402 /*===================*/
4403 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
4404 {
4405 ulint i;
4406 buf_chunk_t* chunk;
4407
4408 ut_ad(buf_pool);
4409
4410 buf_pool_mutex_enter(buf_pool);
4411
4412 chunk = buf_pool->chunks;
4413
4414 for (i = buf_pool->n_chunks; i--; chunk++) {
4415
4416 const buf_block_t* block = buf_chunk_not_freed(chunk);
4417
4418 if (UNIV_LIKELY_NULL(block)) {
4419 fprintf(stderr,
4420 "Page %lu %lu still fixed or dirty\n",
4421 (ulong) block->page.space,
4422 (ulong) block->page.offset);
4423 ut_error;
4424 }
4425 }
4426
4427 buf_pool_mutex_exit(buf_pool);
4428
4429 return(TRUE);
4430 }
4431
4432 /*********************************************************************//**
4433 Invalidates file pages in one buffer pool instance */
4434 static
4435 void
4436 buf_pool_invalidate_instance(
4437 /*=========================*/
4438 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
4439 {
4440 ulint i;
4441
4442 buf_pool_mutex_enter(buf_pool);
4443
4444 for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
4445
4446 		/* As this function is called during startup and
4447 		during the redo-apply phase of recovery, InnoDB
4448 		is single threaded (apart from IO helper threads) at
4449 		this stage. No new write batch can be in the
4450 		initialization stage at this point. */
4451 ut_ad(buf_pool->init_flush[i] == FALSE);
4452
4453 /* However, it is possible that a write batch that has
4454 been posted earlier is still not complete. For buffer
4455 pool invalidation to proceed we must ensure there is NO
4456 write activity happening. */
4457 if (buf_pool->n_flush[i] > 0) {
4458 buf_flush_t type = static_cast<buf_flush_t>(i);
4459
4460 buf_pool_mutex_exit(buf_pool);
4461 buf_flush_wait_batch_end(buf_pool, type);
4462 buf_pool_mutex_enter(buf_pool);
4463 }
4464 }
4465
4466 buf_pool_mutex_exit(buf_pool);
4467
4468 ut_ad(buf_all_freed_instance(buf_pool));
4469
4470 buf_pool_mutex_enter(buf_pool);
4471
4472 while (buf_LRU_scan_and_free_block(buf_pool, TRUE)) {
4473 }
4474
4475 ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
4476 ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
4477
4478 buf_pool->freed_page_clock = 0;
4479 buf_pool->LRU_old = NULL;
4480 buf_pool->LRU_old_len = 0;
4481
4482 memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
4483 buf_refresh_io_stats(buf_pool);
4484
4485 buf_pool_mutex_exit(buf_pool);
4486 }
4487
4488 /*********************************************************************//**
4489 Invalidates the file pages in the buffer pool when an archive recovery is
4490 completed. All the file pages buffered must be in a replaceable state when
4491 this function is called: not latched and not modified. */
4492 UNIV_INTERN
4493 void
4494 buf_pool_invalidate(void)
4495 /*=====================*/
4496 {
4497 ulint i;
4498
4499 for (i = 0; i < srv_buf_pool_instances; i++) {
4500 buf_pool_invalidate_instance(buf_pool_from_array(i));
4501 }
4502 }
4503
4504 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
4505 /*********************************************************************//**
4506 Validates data in one buffer pool instance
4507 @return TRUE */
4508 static
4509 ibool
4510 buf_pool_validate_instance(
4511 /*=======================*/
4512 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
4513 {
4514 buf_page_t* b;
4515 buf_chunk_t* chunk;
4516 ulint i;
4517 ulint n_lru_flush = 0;
4518 ulint n_page_flush = 0;
4519 ulint n_list_flush = 0;
4520 ulint n_lru = 0;
4521 ulint n_flush = 0;
4522 ulint n_free = 0;
4523 ulint n_zip = 0;
4524 ulint fold = 0;
4525 ulint space = 0;
4526 ulint offset = 0;
4527
4528 ut_ad(buf_pool);
4529
4530 buf_pool_mutex_enter(buf_pool);
4531 hash_lock_x_all(buf_pool->page_hash);
4532
4533 chunk = buf_pool->chunks;
4534
4535 /* Check the uncompressed blocks. */
4536
4537 for (i = buf_pool->n_chunks; i--; chunk++) {
4538
4539 ulint j;
4540 buf_block_t* block = chunk->blocks;
4541
4542 for (j = chunk->size; j--; block++) {
4543
4544 mutex_enter(&block->mutex);
4545
4546 switch (buf_block_get_state(block)) {
4547 case BUF_BLOCK_POOL_WATCH:
4548 case BUF_BLOCK_ZIP_PAGE:
4549 case BUF_BLOCK_ZIP_DIRTY:
4550 /* These should only occur on
4551 zip_clean, zip_free[], or flush_list. */
4552 ut_error;
4553 break;
4554
4555 case BUF_BLOCK_FILE_PAGE:
4556 space = buf_block_get_space(block);
4557 offset = buf_block_get_page_no(block);
4558 fold = buf_page_address_fold(space, offset);
4559 ut_a(buf_page_hash_get_low(buf_pool,
4560 space,
4561 offset,
4562 fold)
4563 == &block->page);
4564
4565 #ifdef UNIV_IBUF_COUNT_DEBUG
4566 ut_a(buf_page_get_io_fix(&block->page)
4567 == BUF_IO_READ
4568 || !ibuf_count_get(buf_block_get_space(
4569 block),
4570 buf_block_get_page_no(
4571 block)));
4572 #endif
4573 switch (buf_page_get_io_fix(&block->page)) {
4574 case BUF_IO_NONE:
4575 break;
4576
4577 case BUF_IO_WRITE:
4578 switch (buf_page_get_flush_type(
4579 &block->page)) {
4580 case BUF_FLUSH_LRU:
4581 n_lru_flush++;
4582 goto assert_s_latched;
4583 case BUF_FLUSH_SINGLE_PAGE:
4584 n_page_flush++;
4585 assert_s_latched:
4586 ut_a(rw_lock_is_locked(
4587 &block->lock,
4588 RW_LOCK_SHARED));
4589 break;
4590 case BUF_FLUSH_LIST:
4591 n_list_flush++;
4592 break;
4593 default:
4594 ut_error;
4595 }
4596
4597 break;
4598
4599 case BUF_IO_READ:
4600
4601 ut_a(rw_lock_is_locked(&block->lock,
4602 RW_LOCK_EX));
4603 break;
4604
4605 case BUF_IO_PIN:
4606 break;
4607 }
4608
4609 n_lru++;
4610 break;
4611
4612 case BUF_BLOCK_NOT_USED:
4613 n_free++;
4614 break;
4615
4616 case BUF_BLOCK_READY_FOR_USE:
4617 case BUF_BLOCK_MEMORY:
4618 case BUF_BLOCK_REMOVE_HASH:
4619 /* do nothing */
4620 break;
4621 }
4622
4623 mutex_exit(&block->mutex);
4624 }
4625 }
4626
4627 mutex_enter(&buf_pool->zip_mutex);
4628
4629 /* Check clean compressed-only blocks. */
4630
4631 for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
4632 b = UT_LIST_GET_NEXT(list, b)) {
4633 ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
4634 switch (buf_page_get_io_fix(b)) {
4635 case BUF_IO_NONE:
4636 case BUF_IO_PIN:
4637 /* All clean blocks should be I/O-unfixed. */
4638 break;
4639 case BUF_IO_READ:
4640 /* In buf_LRU_free_page(), we temporarily set
4641 b->io_fix = BUF_IO_READ for a newly allocated
4642 control block in order to prevent
4643 buf_page_get_gen() from decompressing the block. */
4644 break;
4645 default:
4646 ut_error;
4647 break;
4648 }
4649
4650 /* It is OK to read oldest_modification here because
4651 we have acquired buf_pool->zip_mutex above which acts
4652 as the 'block->mutex' for these bpages. */
4653 ut_a(!b->oldest_modification);
4654 fold = buf_page_address_fold(b->space, b->offset);
4655 ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
4656 fold) == b);
4657 n_lru++;
4658 n_zip++;
4659 }
4660
4661 /* Check dirty blocks. */
4662
4663 buf_flush_list_mutex_enter(buf_pool);
4664 for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
4665 b = UT_LIST_GET_NEXT(list, b)) {
4666 ut_ad(b->in_flush_list);
4667 ut_a(b->oldest_modification);
4668 n_flush++;
4669
4670 switch (buf_page_get_state(b)) {
4671 case BUF_BLOCK_ZIP_DIRTY:
4672 n_lru++;
4673 n_zip++;
4674 switch (buf_page_get_io_fix(b)) {
4675 case BUF_IO_NONE:
4676 case BUF_IO_READ:
4677 case BUF_IO_PIN:
4678 break;
4679 case BUF_IO_WRITE:
4680 switch (buf_page_get_flush_type(b)) {
4681 case BUF_FLUSH_LRU:
4682 n_lru_flush++;
4683 break;
4684 case BUF_FLUSH_SINGLE_PAGE:
4685 n_page_flush++;
4686 break;
4687 case BUF_FLUSH_LIST:
4688 n_list_flush++;
4689 break;
4690 default:
4691 ut_error;
4692 }
4693 break;
4694 }
4695 break;
4696 case BUF_BLOCK_FILE_PAGE:
4697 /* uncompressed page */
4698 break;
4699 case BUF_BLOCK_POOL_WATCH:
4700 case BUF_BLOCK_ZIP_PAGE:
4701 case BUF_BLOCK_NOT_USED:
4702 case BUF_BLOCK_READY_FOR_USE:
4703 case BUF_BLOCK_MEMORY:
4704 case BUF_BLOCK_REMOVE_HASH:
4705 ut_error;
4706 break;
4707 }
4708 fold = buf_page_address_fold(b->space, b->offset);
4709 ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
4710 fold) == b);
4711 }
4712
4713 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
4714
4715 hash_unlock_x_all(buf_pool->page_hash);
4716 buf_flush_list_mutex_exit(buf_pool);
4717
4718 mutex_exit(&buf_pool->zip_mutex);
4719
4720 if (n_lru + n_free > buf_pool->curr_size + n_zip) {
4721 fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n",
4722 (ulong) n_lru, (ulong) n_free,
4723 (ulong) buf_pool->curr_size, (ulong) n_zip);
4724 ut_error;
4725 }
4726
4727 ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
4728 if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
4729 fprintf(stderr, "Free list len %lu, free blocks %lu\n",
4730 (ulong) UT_LIST_GET_LEN(buf_pool->free),
4731 (ulong) n_free);
4732 ut_error;
4733 }
4734
4735 ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
4736 ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
4737 ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
4738
4739 buf_pool_mutex_exit(buf_pool);
4740
4741 ut_a(buf_LRU_validate());
4742 ut_a(buf_flush_validate(buf_pool));
4743
4744 return(TRUE);
4745 }
4746
4747 /*********************************************************************//**
4748 Validates the buffer buf_pool data structure.
4749 @return TRUE */
4750 UNIV_INTERN
4751 ibool
4752 buf_validate(void)
4753 /*==============*/
4754 {
4755 ulint i;
4756
4757 for (i = 0; i < srv_buf_pool_instances; i++) {
4758 buf_pool_t* buf_pool;
4759
4760 buf_pool = buf_pool_from_array(i);
4761
4762 buf_pool_validate_instance(buf_pool);
4763 }
4764 return(TRUE);
4765 }
4766
4767 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
4768
4769 #if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
4770 /*********************************************************************//**
4771 Prints info of the buffer buf_pool data structure for one instance. */
4772 static
4773 void
4774 buf_print_instance(
4775 /*===============*/
4776 buf_pool_t* buf_pool)
4777 {
4778 index_id_t* index_ids;
4779 ulint* counts;
4780 ulint size;
4781 ulint i;
4782 ulint j;
4783 index_id_t id;
4784 ulint n_found;
4785 buf_chunk_t* chunk;
4786 dict_index_t* index;
4787
4788 ut_ad(buf_pool);
4789
4790 size = buf_pool->curr_size;
4791
4792 index_ids = static_cast<index_id_t*>(
4793 mem_alloc(size * sizeof *index_ids));
4794
4795 counts = static_cast<ulint*>(mem_alloc(sizeof(ulint) * size));
4796
4797 buf_pool_mutex_enter(buf_pool);
4798 buf_flush_list_mutex_enter(buf_pool);
4799
4800 fprintf(stderr,
4801 "buf_pool size %lu\n"
4802 "database pages %lu\n"
4803 "free pages %lu\n"
4804 "modified database pages %lu\n"
4805 "n pending decompressions %lu\n"
4806 "n pending reads %lu\n"
4807 "n pending flush LRU %lu list %lu single page %lu\n"
4808 "pages made young %lu, not young %lu\n"
4809 "pages read %lu, created %lu, written %lu\n",
4810 (ulong) size,
4811 (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
4812 (ulong) UT_LIST_GET_LEN(buf_pool->free),
4813 (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
4814 (ulong) buf_pool->n_pend_unzip,
4815 (ulong) buf_pool->n_pend_reads,
4816 (ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
4817 (ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
4818 (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE],
4819 (ulong) buf_pool->stat.n_pages_made_young,
4820 (ulong) buf_pool->stat.n_pages_not_made_young,
4821 (ulong) buf_pool->stat.n_pages_read,
4822 (ulong) buf_pool->stat.n_pages_created,
4823 (ulong) buf_pool->stat.n_pages_written);
4824
4825 buf_flush_list_mutex_exit(buf_pool);
4826
4827 /* Count the number of blocks belonging to each index in the buffer */
4828
4829 n_found = 0;
4830
4831 chunk = buf_pool->chunks;
4832
4833 for (i = buf_pool->n_chunks; i--; chunk++) {
4834 buf_block_t* block = chunk->blocks;
4835 ulint n_blocks = chunk->size;
4836
4837 for (; n_blocks--; block++) {
4838 const buf_frame_t* frame = block->frame;
4839
4840 if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
4841
4842 id = btr_page_get_index_id(frame);
4843
4844 /* Look for the id in the index_ids array */
4845 j = 0;
4846
4847 while (j < n_found) {
4848
4849 if (index_ids[j] == id) {
4850 counts[j]++;
4851
4852 break;
4853 }
4854 j++;
4855 }
4856
4857 if (j == n_found) {
4858 n_found++;
4859 index_ids[j] = id;
4860 counts[j] = 1;
4861 }
4862 }
4863 }
4864 }
4865
4866 buf_pool_mutex_exit(buf_pool);
4867
4868 for (i = 0; i < n_found; i++) {
4869 index = dict_index_get_if_in_cache(index_ids[i]);
4870
4871 fprintf(stderr,
4872 "Block count for index %llu in buffer is about %lu",
4873 (ullint) index_ids[i],
4874 (ulong) counts[i]);
4875
4876 if (index) {
4877 putc(' ', stderr);
4878 dict_index_name_print(stderr, NULL, index);
4879 }
4880
4881 putc('\n', stderr);
4882 }
4883
4884 mem_free(index_ids);
4885 mem_free(counts);
4886
4887 ut_a(buf_pool_validate_instance(buf_pool));
4888 }
4889
4890 /*********************************************************************//**
4891 Prints info of the buffer buf_pool data structure. */
4892 UNIV_INTERN
4893 void
4894 buf_print(void)
4895 /*===========*/
4896 {
4897 ulint i;
4898
4899 for (i = 0; i < srv_buf_pool_instances; i++) {
4900 buf_pool_t* buf_pool;
4901
4902 buf_pool = buf_pool_from_array(i);
4903 buf_print_instance(buf_pool);
4904 }
4905 }
4906 #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
4907
4908 #ifdef UNIV_DEBUG
4909 /*********************************************************************//**
4910 Returns the number of latched pages in the buffer pool.
4911 @return number of latched pages */
4912 UNIV_INTERN
4913 ulint
4914 buf_get_latched_pages_number_instance(
4915 /*==================================*/
4916 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
4917 {
4918 buf_page_t* b;
4919 ulint i;
4920 buf_chunk_t* chunk;
4921 ulint fixed_pages_number = 0;
4922
4923 buf_pool_mutex_enter(buf_pool);
4924
4925 chunk = buf_pool->chunks;
4926
4927 for (i = buf_pool->n_chunks; i--; chunk++) {
4928 buf_block_t* block;
4929 ulint j;
4930
4931 block = chunk->blocks;
4932
4933 for (j = chunk->size; j--; block++) {
4934 if (buf_block_get_state(block)
4935 != BUF_BLOCK_FILE_PAGE) {
4936
4937 continue;
4938 }
4939
4940 mutex_enter(&block->mutex);
4941
4942 if (block->page.buf_fix_count != 0
4943 || buf_page_get_io_fix(&block->page)
4944 != BUF_IO_NONE) {
4945 fixed_pages_number++;
4946 }
4947
4948 mutex_exit(&block->mutex);
4949 }
4950 }
4951
4952 mutex_enter(&buf_pool->zip_mutex);
4953
4954 /* Traverse the lists of clean and dirty compressed-only blocks. */
4955
4956 for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
4957 b = UT_LIST_GET_NEXT(list, b)) {
4958 ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
4959 ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
4960
4961 if (b->buf_fix_count != 0
4962 || buf_page_get_io_fix(b) != BUF_IO_NONE) {
4963 fixed_pages_number++;
4964 }
4965 }
4966
4967 buf_flush_list_mutex_enter(buf_pool);
4968 for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
4969 b = UT_LIST_GET_NEXT(list, b)) {
4970 ut_ad(b->in_flush_list);
4971
4972 switch (buf_page_get_state(b)) {
4973 case BUF_BLOCK_ZIP_DIRTY:
4974 if (b->buf_fix_count != 0
4975 || buf_page_get_io_fix(b) != BUF_IO_NONE) {
4976 fixed_pages_number++;
4977 }
4978 break;
4979 case BUF_BLOCK_FILE_PAGE:
4980 /* uncompressed page */
4981 break;
4982 case BUF_BLOCK_POOL_WATCH:
4983 case BUF_BLOCK_ZIP_PAGE:
4984 case BUF_BLOCK_NOT_USED:
4985 case BUF_BLOCK_READY_FOR_USE:
4986 case BUF_BLOCK_MEMORY:
4987 case BUF_BLOCK_REMOVE_HASH:
4988 ut_error;
4989 break;
4990 }
4991 }
4992
4993 buf_flush_list_mutex_exit(buf_pool);
4994 mutex_exit(&buf_pool->zip_mutex);
4995 buf_pool_mutex_exit(buf_pool);
4996
4997 return(fixed_pages_number);
4998 }
4999
5000 /*********************************************************************//**
5001 Returns the number of latched pages in all the buffer pools.
5002 @return number of latched pages */
5003 UNIV_INTERN
5004 ulint
5005 buf_get_latched_pages_number(void)
5006 /*==============================*/
5007 {
5008 ulint i;
5009 ulint total_latched_pages = 0;
5010
5011 for (i = 0; i < srv_buf_pool_instances; i++) {
5012 buf_pool_t* buf_pool;
5013
5014 buf_pool = buf_pool_from_array(i);
5015
5016 total_latched_pages += buf_get_latched_pages_number_instance(
5017 buf_pool);
5018 }
5019
5020 return(total_latched_pages);
5021 }
5022
5023 #endif /* UNIV_DEBUG */
5024
5025 /*********************************************************************//**
5026 Returns the number of pending buf pool read ios.
5027 @return number of pending read I/O operations */
5028 UNIV_INTERN
5029 ulint
5030 buf_get_n_pending_read_ios(void)
5031 /*============================*/
5032 {
5033 ulint i;
5034 ulint pend_ios = 0;
5035
5036 for (i = 0; i < srv_buf_pool_instances; i++) {
5037 pend_ios += buf_pool_from_array(i)->n_pend_reads;
5038 }
5039
5040 return(pend_ios);
5041 }
5042
5043 /*********************************************************************//**
5044 Returns the percentage of modified (dirty) pages relative to all database
5045 pages in the buffer pool.
5046 @return modified page percentage ratio */
5047 UNIV_INTERN
5048 ulint
5049 buf_get_modified_ratio_pct(void)
5050 /*============================*/
5051 {
5052 ulint ratio;
5053 ulint lru_len = 0;
5054 ulint free_len = 0;
5055 ulint flush_list_len = 0;
5056
5057 buf_get_total_list_len(&lru_len, &free_len, &flush_list_len);
5058
5059 ratio = (100 * flush_list_len) / (1 + lru_len + free_len);
5060
5061 /* 1 + is there to avoid division by zero */
5062
5063 return(ratio);
5064 }
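/* Worked example with illustrative numbers: for lru_len = 8000,
free_len = 1900 and flush_list_len = 330 the function returns
(100 * 330) / (1 + 8000 + 1900) = 33000 / 9901 = 3 with integer
division, i.e. roughly 3% of the pages in the pool are dirty. */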
5065
5066 /*******************************************************************//**
5067 Aggregates one buffer pool's stats into the total buffer pool stats. */
5068 static
5069 void
5070 buf_stats_aggregate_pool_info(
5071 /*==========================*/
5072 buf_pool_info_t* total_info, /*!< in/out: the buffer pool
5073 info to store aggregated
5074 result */
5075 const buf_pool_info_t* pool_info) /*!< in: individual buffer pool
5076 stats info */
5077 {
5078 ut_a(total_info && pool_info);
5079
5080 /* Nothing to copy if total_info is the same as pool_info */
5081 if (total_info == pool_info) {
5082 return;
5083 }
5084
5085 total_info->pool_size += pool_info->pool_size;
5086 total_info->lru_len += pool_info->lru_len;
5087 total_info->old_lru_len += pool_info->old_lru_len;
5088 total_info->free_list_len += pool_info->free_list_len;
5089 total_info->flush_list_len += pool_info->flush_list_len;
5090 total_info->n_pend_unzip += pool_info->n_pend_unzip;
5091 total_info->n_pend_reads += pool_info->n_pend_reads;
5092 total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
5093 total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
5094 total_info->n_pages_made_young += pool_info->n_pages_made_young;
5095 total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
5096 total_info->n_pages_read += pool_info->n_pages_read;
5097 total_info->n_pages_created += pool_info->n_pages_created;
5098 total_info->n_pages_written += pool_info->n_pages_written;
5099 total_info->n_page_gets += pool_info->n_page_gets;
5100 total_info->n_ra_pages_read_rnd += pool_info->n_ra_pages_read_rnd;
5101 total_info->n_ra_pages_read += pool_info->n_ra_pages_read;
5102 total_info->n_ra_pages_evicted += pool_info->n_ra_pages_evicted;
5103 total_info->page_made_young_rate += pool_info->page_made_young_rate;
5104 total_info->page_not_made_young_rate +=
5105 pool_info->page_not_made_young_rate;
5106 total_info->pages_read_rate += pool_info->pages_read_rate;
5107 total_info->pages_created_rate += pool_info->pages_created_rate;
5108 total_info->pages_written_rate += pool_info->pages_written_rate;
5109 total_info->n_page_get_delta += pool_info->n_page_get_delta;
5110 total_info->page_read_delta += pool_info->page_read_delta;
5111 total_info->young_making_delta += pool_info->young_making_delta;
5112 total_info->not_young_making_delta += pool_info->not_young_making_delta;
5113 total_info->pages_readahead_rnd_rate += pool_info->pages_readahead_rnd_rate;
5114 total_info->pages_readahead_rate += pool_info->pages_readahead_rate;
5115 total_info->pages_evicted_rate += pool_info->pages_evicted_rate;
5116 total_info->unzip_lru_len += pool_info->unzip_lru_len;
5117 total_info->io_sum += pool_info->io_sum;
5118 total_info->io_cur += pool_info->io_cur;
5119 total_info->unzip_sum += pool_info->unzip_sum;
5120 total_info->unzip_cur += pool_info->unzip_cur;
5121 }
5122 /*******************************************************************//**
5123 Collects buffer pool stats information for one buffer pool instance. Also
5124 records aggregated stats if there is more than one buffer pool
5125 in the server */
5126 UNIV_INTERN
5127 void
5128 buf_stats_get_pool_info(
5129 /*====================*/
5130 buf_pool_t* buf_pool, /*!< in: buffer pool */
5131 ulint pool_id, /*!< in: buffer pool ID */
5132 buf_pool_info_t* all_pool_info) /*!< in/out: buffer pool info
5133 to fill */
5134 {
5135 buf_pool_info_t* pool_info;
5136 time_t current_time;
5137 double time_elapsed;
5138
5139 /* Find appropriate pool_info to store stats for this buffer pool */
5140 pool_info = &all_pool_info[pool_id];
5141
5142 buf_pool_mutex_enter(buf_pool);
5143 buf_flush_list_mutex_enter(buf_pool);
5144
5145 pool_info->pool_unique_id = pool_id;
5146
5147 pool_info->pool_size = buf_pool->curr_size;
5148
5149 pool_info->lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
5150
5151 pool_info->old_lru_len = buf_pool->LRU_old_len;
5152
5153 pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool->free);
5154
5155 pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool->flush_list);
5156
5157 pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
5158
5159 pool_info->n_pend_reads = buf_pool->n_pend_reads;
5160
5161 pool_info->n_pending_flush_lru =
5162 (buf_pool->n_flush[BUF_FLUSH_LRU]
5163 + buf_pool->init_flush[BUF_FLUSH_LRU]);
5164
5165 pool_info->n_pending_flush_list =
5166 (buf_pool->n_flush[BUF_FLUSH_LIST]
5167 + buf_pool->init_flush[BUF_FLUSH_LIST]);
5168
5169 pool_info->n_pending_flush_single_page =
5170 (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
5171 + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
5172
5173 buf_flush_list_mutex_exit(buf_pool);
5174
5175 current_time = time(NULL);
5176 time_elapsed = 0.001 + difftime(current_time,
5177 buf_pool->last_printout_time);
5178
5179 pool_info->n_pages_made_young = buf_pool->stat.n_pages_made_young;
5180
5181 pool_info->n_pages_not_made_young =
5182 buf_pool->stat.n_pages_not_made_young;
5183
5184 pool_info->n_pages_read = buf_pool->stat.n_pages_read;
5185
5186 pool_info->n_pages_created = buf_pool->stat.n_pages_created;
5187
5188 pool_info->n_pages_written = buf_pool->stat.n_pages_written;
5189
5190 pool_info->n_page_gets = buf_pool->stat.n_page_gets;
5191
5192 pool_info->n_ra_pages_read_rnd = buf_pool->stat.n_ra_pages_read_rnd;
5193 pool_info->n_ra_pages_read = buf_pool->stat.n_ra_pages_read;
5194
5195 pool_info->n_ra_pages_evicted = buf_pool->stat.n_ra_pages_evicted;
5196
5197 pool_info->page_made_young_rate =
5198 (buf_pool->stat.n_pages_made_young
5199 - buf_pool->old_stat.n_pages_made_young) / time_elapsed;
5200
5201 pool_info->page_not_made_young_rate =
5202 (buf_pool->stat.n_pages_not_made_young
5203 - buf_pool->old_stat.n_pages_not_made_young) / time_elapsed;
5204
5205 pool_info->pages_read_rate =
5206 (buf_pool->stat.n_pages_read
5207 - buf_pool->old_stat.n_pages_read) / time_elapsed;
5208
5209 pool_info->pages_created_rate =
5210 (buf_pool->stat.n_pages_created
5211 - buf_pool->old_stat.n_pages_created) / time_elapsed;
5212
5213 pool_info->pages_written_rate =
5214 (buf_pool->stat.n_pages_written
5215 - buf_pool->old_stat.n_pages_written) / time_elapsed;
5216
5217 pool_info->n_page_get_delta = buf_pool->stat.n_page_gets
5218 - buf_pool->old_stat.n_page_gets;
5219
5220 if (pool_info->n_page_get_delta) {
5221 pool_info->page_read_delta = buf_pool->stat.n_pages_read
5222 - buf_pool->old_stat.n_pages_read;
5223
5224 pool_info->young_making_delta =
5225 buf_pool->stat.n_pages_made_young
5226 - buf_pool->old_stat.n_pages_made_young;
5227
5228 pool_info->not_young_making_delta =
5229 buf_pool->stat.n_pages_not_made_young
5230 - buf_pool->old_stat.n_pages_not_made_young;
5231 }
5232 pool_info->pages_readahead_rnd_rate =
5233 (buf_pool->stat.n_ra_pages_read_rnd
5234 - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed;
5235
5236
5237 pool_info->pages_readahead_rate =
5238 (buf_pool->stat.n_ra_pages_read
5239 - buf_pool->old_stat.n_ra_pages_read) / time_elapsed;
5240
5241 pool_info->pages_evicted_rate =
5242 (buf_pool->stat.n_ra_pages_evicted
5243 - buf_pool->old_stat.n_ra_pages_evicted) / time_elapsed;
5244
5245 pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
5246
5247 pool_info->io_sum = buf_LRU_stat_sum.io;
5248
5249 pool_info->io_cur = buf_LRU_stat_cur.io;
5250
5251 pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
5252
5253 pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
5254
5255 buf_refresh_io_stats(buf_pool);
5256 buf_pool_mutex_exit(buf_pool);
5257 }
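/* Illustrative sketch (assumption, not in the original source): the
generic shape of the per-second rate calculations above, shown for a
single counter.  The 0.001 s floor that the caller adds to time_elapsed
guarantees that the division is safe.

	static double
	per_second_rate(
		ulint	cur_count,	// counter value now
		ulint	old_count,	// counter value at last printout
		double	time_elapsed)	// seconds elapsed, >= 0.001
	{
		return((cur_count - old_count) / time_elapsed);
	}
*/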
5258
5259 /*********************************************************************//**
5260 Prints info of the buffer i/o. */
5261 UNIV_INTERN
5262 void
5263 buf_print_io_instance(
5264 /*==================*/
5265 buf_pool_info_t*pool_info, /*!< in: buffer pool info */
5266 FILE* file) /*!< in/out: buffer where to print */
5267 {
5268 ut_ad(pool_info);
5269
5270 fprintf(file,
5271 "Buffer pool size %lu\n"
5272 "Free buffers %lu\n"
5273 "Database pages %lu\n"
5274 "Old database pages %lu\n"
5275 "Modified db pages %lu\n"
5276 "Pending reads %lu\n"
5277 "Pending writes: LRU %lu, flush list %lu, single page %lu\n",
5278 pool_info->pool_size,
5279 pool_info->free_list_len,
5280 pool_info->lru_len,
5281 pool_info->old_lru_len,
5282 pool_info->flush_list_len,
5283 pool_info->n_pend_reads,
5284 pool_info->n_pending_flush_lru,
5285 pool_info->n_pending_flush_list,
5286 pool_info->n_pending_flush_single_page);
5287
5288 fprintf(file,
5289 "Pages made young %lu, not young %lu\n"
5290 "%.2f youngs/s, %.2f non-youngs/s\n"
5291 "Pages read %lu, created %lu, written %lu\n"
5292 "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
5293 pool_info->n_pages_made_young,
5294 pool_info->n_pages_not_made_young,
5295 pool_info->page_made_young_rate,
5296 pool_info->page_not_made_young_rate,
5297 pool_info->n_pages_read,
5298 pool_info->n_pages_created,
5299 pool_info->n_pages_written,
5300 pool_info->pages_read_rate,
5301 pool_info->pages_created_rate,
5302 pool_info->pages_written_rate);
5303
	if (pool_info->n_page_get_delta) {
		fprintf(file,
			"Buffer pool hit rate %lu / 1000,"
			" young-making rate %lu / 1000 not %lu / 1000\n",
			(ulong) (1000 - (1000 * pool_info->page_read_delta
					 / pool_info->n_page_get_delta)),
			(ulong) (1000 * pool_info->young_making_delta
				 / pool_info->n_page_get_delta),
			(ulong) (1000 * pool_info->not_young_making_delta
				 / pool_info->n_page_get_delta));
	} else {
		fputs("No buffer pool page gets since the last printout\n",
		      file);
	}

	/* Statistics about read ahead algorithm */
	fprintf(file, "Pages read ahead %.2f/s,"
		" evicted without access %.2f/s,"
		" Random read ahead %.2f/s\n",
		pool_info->pages_readahead_rate,
		pool_info->pages_evicted_rate,
		pool_info->pages_readahead_rnd_rate);

	/* Print some values to help us with visualizing what is
	happening with LRU eviction. */
	fprintf(file,
		"LRU len: %lu, unzip_LRU len: %lu\n"
		"I/O sum[%lu]:cur[%lu], unzip sum[%lu]:cur[%lu]\n",
		pool_info->lru_len, pool_info->unzip_lru_len,
		pool_info->io_sum, pool_info->io_cur,
		pool_info->unzip_sum, pool_info->unzip_cur);
}

/*********************************************************************//**
Prints info of the buffer i/o. */
UNIV_INTERN
void
buf_print_io(
/*=========*/
	FILE*	file)		/*!< in/out: buffer where to print */
{
	ulint			i;
	buf_pool_info_t*	pool_info;
	buf_pool_info_t*	pool_info_total;

	/* If srv_buf_pool_instances is greater than 1, allocate
	one extra buf_pool_info_t, the last one stores
	aggregated/total values from all pools */
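	/* mem_zalloc() returns zero-filled memory, so the counters in
	pool_info (in particular the aggregated totals in pool_info_total)
	start from zero and can be accumulated into directly. */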
	if (srv_buf_pool_instances > 1) {
		pool_info = (buf_pool_info_t*) mem_zalloc((
			srv_buf_pool_instances + 1) * sizeof *pool_info);

		pool_info_total = &pool_info[srv_buf_pool_instances];
	} else {
		ut_a(srv_buf_pool_instances == 1);

		pool_info_total = pool_info =
			static_cast<buf_pool_info_t*>(
				mem_zalloc(sizeof *pool_info));
	}

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		/* Fetch individual buffer pool info and calculate
		aggregated stats along the way */
		buf_stats_get_pool_info(buf_pool, i, pool_info);

		/* If we have more than one buffer pool, store
		the aggregated stats */
		if (srv_buf_pool_instances > 1) {
			buf_stats_aggregate_pool_info(pool_info_total,
						      &pool_info[i]);
		}
	}

	/* Print the aggregate buffer pool info */
	buf_print_io_instance(pool_info_total, file);

	/* If there is more than one buffer pool, print each individual pool
	info */
	if (srv_buf_pool_instances > 1) {
		fputs("----------------------\n"
		      "INDIVIDUAL BUFFER POOL INFO\n"
		      "----------------------\n", file);

		for (i = 0; i < srv_buf_pool_instances; i++) {
			fprintf(file, "---BUFFER POOL %lu\n", i);
			buf_print_io_instance(&pool_info[i], file);
		}
	}

	mem_free(pool_info);
}

/**********************************************************************//**
Refreshes the statistics used to print per-second averages. */
UNIV_INTERN
void
buf_refresh_io_stats(
/*=================*/
	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
{
	buf_pool->last_printout_time = ut_time();
	buf_pool->old_stat = buf_pool->stat;
}

/**********************************************************************//**
Refreshes the statistics used to print per-second averages, for all
buffer pool instances. */
UNIV_INTERN
void
buf_refresh_io_stats_all(void)
/*==========================*/
{
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_refresh_io_stats(buf_pool);
	}
}

/**********************************************************************//**
Checks whether all pages in all buffer pools are in a replaceable state.
@return FALSE if not */
UNIV_INTERN
ibool
buf_all_freed(void)
/*===============*/
{
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		if (!buf_all_freed_instance(buf_pool)) {
			return(FALSE);
		}
	}

	return(TRUE);
}

/*********************************************************************//**
Checks that there currently are no pending i/o-operations for the buffer
pool.
@return number of pending i/o */
UNIV_INTERN
ulint
buf_pool_check_no_pending_io(void)
/*==============================*/
{
	ulint	i;
	ulint	pending_io = 0;

	buf_pool_mutex_enter_all();

	for (i = 0; i < srv_buf_pool_instances; i++) {
		const buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

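		/* Sum the pending reads and the pending writes of all
		three flush types for this buffer pool instance. */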
		pending_io += buf_pool->n_pend_reads
			      + buf_pool->n_flush[BUF_FLUSH_LRU]
			      + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
			      + buf_pool->n_flush[BUF_FLUSH_LIST];
	}

	buf_pool_mutex_exit_all();

	return(pending_io);
}

#if 0
Code currently not used
/*********************************************************************//**
Gets the current length of the free list of buffer blocks.
@return length of the free list */
UNIV_INTERN
ulint
buf_get_free_list_len(void)
/*=======================*/
{
	ulint	len;

	buf_pool_mutex_enter(buf_pool);

	len = UT_LIST_GET_LEN(buf_pool->free);

	buf_pool_mutex_exit(buf_pool);

	return(len);
}
#endif

#else /* !UNIV_HOTBACKUP */
/********************************************************************//**
Inits a page in the buffer pool, for use in mysqlbackup --restore. */
UNIV_INTERN
void
buf_page_init_for_backup_restore(
/*=============================*/
	ulint		space,	/*!< in: space id */
	ulint		offset,	/*!< in: offset of the page within space
				in units of a page */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	buf_block_t*	block)	/*!< in: block to init */
{
	block->page.state = BUF_BLOCK_FILE_PAGE;
	block->page.space = space;
	block->page.offset = offset;

	page_zip_des_init(&block->page.zip);

	/* We assume that block->page.data has been allocated
	with zip_size == UNIV_PAGE_SIZE. */
	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
	ut_ad(ut_is_2pow(zip_size));
	page_zip_set_size(&block->page.zip, zip_size);
	if (zip_size) {
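		/* Place the compressed copy right after the uncompressed
		frame; this relies on the allocation assumption noted
		above. */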
		block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
	}
}
#endif /* !UNIV_HOTBACKUP */
