1 /*****************************************************************************
2
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2008, Google Inc.
5 Copyright (c) 2013, 2021, MariaDB Corporation.
6
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; version 2 of the License.
16
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file buf/buf0buf.cc
29 The database buffer buf_pool
30
31 Created 11/5/1995 Heikki Tuuri
32 *******************************************************/
33
34 #include "mtr0types.h"
35 #include "mach0data.h"
36 #include "page0size.h"
37 #include "buf0buf.h"
38 #include <string.h>
39
40 #ifdef UNIV_NONINL
41 #include "buf0buf.inl"
42 #endif
43
44 #ifndef UNIV_INNOCHECKSUM
45 #include "mem0mem.h"
46 #include "btr0btr.h"
47 #include "fil0fil.h"
48 #include "fil0crypt.h"
49 #include "buf0buddy.h"
50 #include "lock0lock.h"
51 #include "sync0rw.h"
52 #include "btr0sea.h"
53 #include "ibuf0ibuf.h"
54 #include "trx0undo.h"
55 #include "trx0purge.h"
56 #include "log0log.h"
57 #include "dict0stats_bg.h"
58 #include "srv0srv.h"
59 #include "srv0start.h"
60 #include "dict0dict.h"
61 #include "log0recv.h"
62 #include "srv0mon.h"
63 #include "log0crypt.h"
64 #endif /* !UNIV_INNOCHECKSUM */
65 #include "page0zip.h"
66 #include "sync0sync.h"
67 #include "buf0dump.h"
68 #include <new>
69 #include <map>
70 #include <sstream>
71 #ifndef UNIV_INNOCHECKSUM
72 #include "fil0pagecompress.h"
73 #include "fsp0pagecompress.h"
74 #endif
75 #include "ut0byte.h"
76 #include <new>
77
78 #ifdef UNIV_LINUX
79 #include <stdlib.h>
80 #endif
81
82 #ifdef HAVE_LZO
83 #include "lzo/lzo1x.h"
84 #endif
85
86 using st_::span;
87
88 #ifdef HAVE_LIBNUMA
89 #include <numa.h>
90 #include <numaif.h>
91 struct set_numa_interleave_t
92 {
93 set_numa_interleave_t()
94 {
95 if (srv_numa_interleave) {
96
97 struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
98 ib::info() << "Setting NUMA memory policy to"
99 " MPOL_INTERLEAVE";
100 if (set_mempolicy(MPOL_INTERLEAVE,
101 numa_mems_allowed->maskp,
102 numa_mems_allowed->size) != 0) {
103
104 ib::warn() << "Failed to set NUMA memory"
105 " policy to MPOL_INTERLEAVE: "
106 << strerror(errno);
107 }
108 numa_bitmask_free(numa_mems_allowed);
109 }
110 }
111
112 ~set_numa_interleave_t()
113 {
114 if (srv_numa_interleave) {
115
116 ib::info() << "Setting NUMA memory policy to"
117 " MPOL_DEFAULT";
118 if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
119 ib::warn() << "Failed to set NUMA memory"
120 " policy to MPOL_DEFAULT: "
121 << strerror(errno);
122 }
123 }
124 }
125 };
126
127 #define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa
128 #else
129 #define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
130 #endif /* HAVE_LIBNUMA */
131
132 #ifdef HAVE_SNAPPY
133 #include "snappy-c.h"
134 #endif
135
136 /*
137 IMPLEMENTATION OF THE BUFFER POOL
138 =================================
139
140 Performance improvement:
141 ------------------------
142 Thread scheduling in NT may be so slow that the OS wait mechanism should
143 not be used even in waiting for disk reads to complete.
144 Rather, we should put waiting query threads to the queue of
145 waiting jobs, and let the OS thread do something useful while the i/o
146 is processed. In this way we could remove most OS thread switches in
147 an i/o-intensive benchmark like TPC-C.
148
149 A possibility is to put a user space thread library between the database
150 and NT. User space thread libraries might be very fast.
151
152 SQL Server 7.0 can be configured to use 'fibers' which are lightweight
153 threads in NT. These should be studied.
154
155 Buffer frames and blocks
156 ------------------------
157 Following the terminology of Gray and Reuter, we call the memory
158 blocks where file pages are loaded buffer frames. For each buffer
159 frame there is a control block, or shortly, a block, in the buffer
160 control array. The control info which does not need to be stored
161 in the file along with the file page, resides in the control block.
162
163 Buffer pool struct
164 ------------------
165 The buffer buf_pool contains a single mutex which protects all the
166 control data structures of the buf_pool. The content of a buffer frame is
167 protected by a separate read-write lock in its control block, though.
168 These locks can be locked and unlocked without owning the buf_pool->mutex.
169 The OS events in the buf_pool struct can be waited for without owning the
170 buf_pool->mutex.
171
172 The buf_pool->mutex is a hot-spot in main memory, causing a lot of
173 memory bus traffic on multiprocessor systems when processors
174 alternately access the mutex. On our Pentium, the mutex is accessed
175 maybe every 10 microseconds. We gave up the solution to have mutexes
176 for each control block, for instance, because it seemed to be
177 complicated.
178
179 A solution to reduce mutex contention of the buf_pool->mutex is to
180 create a separate mutex for the page hash table. On Pentium,
181 accessing the hash table takes 2 microseconds, about half
182 of the total buf_pool->mutex hold time.
183
184 Control blocks
185 --------------
186
187 The control block contains, for instance, the bufferfix count
188 which is incremented when a thread wants a file page to be fixed
189 in a buffer frame. The bufferfix operation does not lock the
190 contents of the frame, however. For this purpose, the control
191 block contains a read-write lock.
192
193 The buffer frames have to be aligned so that the start memory
194 address of a frame is divisible by the universal page size, which
195 is a power of two.
196
197 We intend to make the buffer buf_pool size on-line reconfigurable,
198 that is, the buf_pool size can be changed without closing the database.
199 Then the database administrator may adjust it to be bigger
200 at night, for example. The control block array must
201 contain enough control blocks for the maximum buffer buf_pool size
202 which is used in the particular database.
203 If the buf_pool size is cut, we exploit the virtual memory mechanism of
204 the OS, and just refrain from using frames at high addresses. Then the OS
205 can swap them to disk.
206
207 The control blocks containing file pages are put to a hash table
208 according to the file address of the page.
209 We could speed up the access to an individual page by using
210 "pointer swizzling": we could replace the page references on
211 non-leaf index pages by direct pointers to the page, if it exists
212 in the buf_pool. We could make a separate hash table where we could
213 chain all the page references in non-leaf pages residing in the buf_pool,
214 using the page reference as the hash key,
215 and at the time of reading of a page update the pointers accordingly.
216 Drawbacks of this solution are added complexity and,
217 possibly, extra space required on non-leaf pages for memory pointers.
218 A simpler solution is just to speed up the hash table mechanism
219 in the database, using tables whose size is a power of 2.
220
221 Lists of blocks
222 ---------------
223
224 There are several lists of control blocks.
225
226 The free list (buf_pool->free) contains blocks which are currently not
227 used.
228
229 The common LRU list contains all the blocks holding a file page
230 except those for which the bufferfix count is non-zero.
231 The pages are in the LRU list roughly in the order of the last
232 access to the page, so that the oldest pages are at the end of the
233 list. We also keep a pointer to near the end of the LRU list,
234 which we can use when we want to artificially age a page in the
235 buf_pool. This is used if we know that some page is not needed
236 again for some time: we insert the block right after the pointer,
237 causing it to be replaced sooner than would normally be the case.
238 Currently this aging mechanism is used for read-ahead mechanism
239 of pages, and it can also be used when there is a scan of a full
240 table which cannot fit in the memory. Putting the pages near the
241 end of the LRU list, we make sure that most of the buf_pool stays
242 in the main memory, undisturbed.
243
244 The unzip_LRU list contains a subset of the common LRU list. The
245 blocks on the unzip_LRU list hold a compressed file page and the
246 corresponding uncompressed page frame. A block is in unzip_LRU if and
247 only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
248 holds. The blocks in unzip_LRU will be in same order as they are in
249 the common LRU list. That is, each manipulation of the common LRU
250 list will result in the same manipulation of the unzip_LRU list.
251
252 The chain of modified blocks (buf_pool->flush_list) contains the blocks
253 holding file pages that have been modified in the memory
254 but not written to disk yet. The block with the oldest modification
255 which has not yet been written to disk is at the end of the chain.
256 The access to this list is protected by buf_pool->flush_list_mutex.
257
258 The chain of unmodified compressed blocks (buf_pool->zip_clean)
259 contains the control blocks (buf_page_t) of those compressed pages
260 that are not in buf_pool->flush_list and for which no uncompressed
261 page has been allocated in the buffer pool. The control blocks for
262 uncompressed pages are accessible via buf_block_t objects that are
263 reachable via buf_pool->chunks[].
264
265 The chains of free memory blocks (buf_pool->zip_free[]) are used by
266 the buddy allocator (buf0buddy.cc) to keep track of currently unused
267 memory blocks of size sizeof(buf_page_t)..srv_page_size / 2. These
268 blocks are inside the srv_page_size-sized memory blocks of type
269 BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
270 pool. The buddy allocator is solely used for allocating control
271 blocks for compressed pages (buf_page_t) and compressed page frames.
272
273 Loading a file page
274 -------------------
275
276 First, a victim block for replacement has to be found in the
277 buf_pool. It is taken from the free list or searched for from the
278 end of the LRU-list. An exclusive lock is reserved for the frame,
279 the io_fix field is set in the block fixing the block in buf_pool,
280 and the io-operation for loading the page is queued. The io-handler thread
281 releases the X-lock on the frame and resets the io_fix field
282 when the io operation completes.
283
284 A thread may request the above operation using the function
285 buf_page_get(). It may then continue to request a lock on the frame.
286 The lock is granted when the io-handler releases the x-lock.
287
288 Read-ahead
289 ----------
290
291 The read-ahead mechanism is intended to be intelligent and
292 isolated from the semantically higher levels of the database
293 index management. From the higher level we only need the
294 information if a file page has a natural successor or
295 predecessor page. On the leaf level of a B-tree index,
296 these are the next and previous pages in the natural
297 order of the pages.
298
299 Let us first explain the read-ahead mechanism when the leafs
300 of a B-tree are scanned in an ascending or descending order.
301 When a read page is the first time referenced in the buf_pool,
302 the buffer manager checks if it is at the border of a so-called
303 linear read-ahead area. The tablespace is divided into these
304 areas of size 64 blocks, for example. So if the page is at the
305 border of such an area, the read-ahead mechanism checks if
306 all the other blocks in the area have been accessed in an
307 ascending or descending order. If this is the case, the system
308 looks at the natural successor or predecessor of the page,
309 checks if that is at the border of another area, and in this case
310 issues read-requests for all the pages in that area. Maybe
311 we could relax the condition that all the pages in the area
312 have to be accessed: if data is deleted from a table, there may
313 appear holes of unused pages in the area.
314
315 A different read-ahead mechanism is used when there appears
316 to be a random access pattern to a file.
317 If a new page is referenced in the buf_pool, and several pages
318 of its random access area (for instance, 32 consecutive pages
319 in a tablespace) have recently been referenced, we may predict
320 that the whole area may be needed in the near future, and issue
321 the read requests for the whole area.
322 */
323
324 #ifndef UNIV_INNOCHECKSUM
325 /** Value in microseconds */
326 static const int WAIT_FOR_READ = 100;
327 static const int WAIT_FOR_WRITE = 100;
328 /** Number of attempts made to read in a page in the buffer pool */
329 static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
330 /** Number of pages to read ahead */
331 static const ulint BUF_READ_AHEAD_PAGES = 64;
332 /** The maximum portion of the buffer pool that can be used for the
333 read-ahead buffer. (Divide buf_pool size by this amount) */
334 static const ulint BUF_READ_AHEAD_PORTION = 32;
335
336 /** The buffer pools of the database */
337 buf_pool_t* buf_pool_ptr;
338
339 /** true when resizing buffer pool is in the critical path. */
340 volatile bool buf_pool_resizing;
341
342 /** Map of buffer pool chunks by its first frame address
343 This is newly made by initialization of buffer pool and buf_resize_thread.
344 Currently, no need mutex protection for update. */
345 typedef std::map<
346 const byte*,
347 	buf_chunk_t*,
348 std::less<const byte*>,
349 ut_allocator<std::pair<const byte* const, buf_chunk_t*> > >
350 buf_pool_chunk_map_t;
351
352 static buf_pool_chunk_map_t* buf_chunk_map_reg;
353
354 /** Chunk map to be used to lookup.
355 The map pointed by this should not be updated */
356 static buf_pool_chunk_map_t* buf_chunk_map_ref = NULL;
357
358 #ifdef UNIV_DEBUG
359 /** Disable resizing buffer pool to make assertion code not expensive. */
360 my_bool buf_disable_resize_buffer_pool_debug = TRUE;
361 #endif /* UNIV_DEBUG */
362
363 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
364 /** This is used to insert validation operations in execution
365 in the debug version */
366 static ulint buf_dbg_counter = 0;
367 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
368
369 #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
370 # ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
371
372 /* Buffer block mutexes and rwlocks can be registered
373 in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
374 is defined, register buffer block mutex and rwlock
375 in one group after their initialization. */
376 # define PFS_GROUP_BUFFER_SYNC
377
378 /* This define caps the number of mutexes/rwlocks can
379 be registered with performance schema. Developers can
380 modify this define if necessary. Please note, this would
381 be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
382 # define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER ULINT_MAX
383
384 # endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
385 #endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
386
387 /** Macro to determine whether the read of write counter is used depending
388 on the io_type */
389 #define MONITOR_RW_COUNTER(io_type, counter) \
390 ((io_type == BUF_IO_READ) \
391 ? (counter##_READ) \
392 : (counter##_WRITTEN))
393
394
395 /** Reserve a buffer slot for encryption, decryption or page compression.
396 @param[in,out] buf_pool buffer pool
397 @return reserved buffer slot */
398 static buf_tmp_buffer_t* buf_pool_reserve_tmp_slot(buf_pool_t* buf_pool)
399 {
400 for (ulint i = 0; i < buf_pool->tmp_arr->n_slots; i++) {
401 buf_tmp_buffer_t* slot = &buf_pool->tmp_arr->slots[i];
402 if (slot->acquire()) {
403 return slot;
404 }
405 }
406
407 /* We assume that free slot is found */
408 ut_error;
409 return NULL;
410 }
411
412 /** Reserve a buffer for encryption, decryption or decompression.
413 @param[in,out] slot reserved slot */
414 static void buf_tmp_reserve_crypt_buf(buf_tmp_buffer_t* slot)
415 {
416 if (!slot->crypt_buf) {
417 slot->crypt_buf = static_cast<byte*>(
418 aligned_malloc(srv_page_size, srv_page_size));
419 }
BlockingGetStreamReadAvailable(PaStream * s)420 }
421
422 /** Reserve a buffer for compression.
423 @param[in,out] slot reserved slot */
424 static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
425 {
426 if (!slot->comp_buf) {
427 /* Both snappy and lzo compression methods require that
428 output buffer used for compression is bigger than input
429 buffer. Increase the allocated buffer size accordingly. */
430 ulint size = srv_page_size;
431 #ifdef HAVE_LZO
432 size += LZO1X_1_15_MEM_COMPRESS;
433 #elif defined HAVE_SNAPPY
434 size = snappy_max_compressed_length(size);
435 #endif
436 slot->comp_buf = static_cast<byte*>(
437 aligned_malloc(size, srv_page_size));
438 }
439 }
440
441 /** Registers a chunk to buf_pool_chunk_map
442 @param[in] chunk chunk of buffers */
443 static
444 void
445 buf_pool_register_chunk(
446 buf_chunk_t* chunk)
447 {
448 buf_chunk_map_reg->insert(buf_pool_chunk_map_t::value_type(
449 chunk->blocks->frame, chunk));
450 }
451
/** Decrypt a page for temporary tablespace.
@param[in,out]	tmp_frame	Temporary buffer
@param[in]	src_frame	Page to decrypt
@return true if temporary tablespace decrypted, false if not */
static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
{
	if (buf_is_zeroes(span<const byte>(src_frame, srv_page_size))) {
		/* An all-zero page is valid as is; nothing to decrypt. */
		return true;
	}

	/* read space & lsn */
	uint header_len = FIL_PAGE_DATA;

	/* Copy FIL page header, it is not encrypted */
	memcpy(tmp_frame, src_frame, header_len);

	/* Calculate the offset where decryption starts */
	const byte* src = src_frame + header_len;
	byte* dst = tmp_frame + header_len;
	uint srclen = uint(srv_page_size)
		- header_len - FIL_PAGE_DATA_END;
	/* The page number is used to derive the position-dependent
	decryption input below. */
	ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);

	if (!log_tmp_block_decrypt(src, srclen, dst,
				   (offset * srv_page_size))) {
		return false;
	}

	/* The page trailer is stored in clear text; copy it through
	unchanged. */
	memcpy(tmp_frame + srv_page_size - FIL_PAGE_DATA_END,
	       src_frame + srv_page_size - FIL_PAGE_DATA_END,
	       FIL_PAGE_DATA_END);

	/* Copy the fully decrypted page back over the source frame,
	then account for the decryption in the server statistics. */
	memcpy(src_frame, tmp_frame, srv_page_size);
	srv_stats.pages_decrypted.inc();
	srv_stats.n_temp_blocks_decrypted.inc();

	return true; /* page was decrypted */
}
490
/** Decrypt a page.
Dispatches on the page/tablespace state: temporary-tablespace pages are
decrypted with the temporary-log key; page_compressed pages are
decompressed; encrypted pages are checksum-verified, decrypted, and (if
also compressed) decompressed in the same reserved slot.
@param[in,out]	bpage	Page control block
@param[in,out]	space	tablespace
@return whether the operation was successful */
static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space)
{
	ut_ad(space->pending_io());
	ut_ad(space->id == bpage->id.space());

	/* Operate on the compressed frame when one exists, else on the
	uncompressed block frame. */
	byte* dst_frame = bpage->zip.data ? bpage->zip.data :
		((buf_block_t*) bpage)->frame;
	bool page_compressed = fil_page_is_compressed(dst_frame);
	buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);

	if (bpage->id.page_no() == 0) {
		/* File header pages are not encrypted/compressed */
		return (true);
	}

	if (space->purpose == FIL_TYPE_TEMPORARY
	    && innodb_encrypt_temporary_tables) {
		/* Temporary tablespace pages use their own key; decrypt
		in place via a reserved temporary slot. */
		buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool);
		buf_tmp_reserve_crypt_buf(slot);

		if (!buf_tmp_page_decrypt(slot->crypt_buf, dst_frame)) {
			slot->release();
			ib::error() << "Encrypted page " << bpage->id
				    << " in file " << space->chain.start->name;
			return false;
		}

		slot->release();
		return true;
	}

	/* Page is encrypted if encryption information is found from
	tablespace and page contains used key_version. This is true
	also for pages first compressed and then encrypted. */

	buf_tmp_buffer_t* slot;

	if (page_compressed) {
		/* the page we read is unencrypted */
		/* Find free slot from temporary memory array */
decompress:
		slot = buf_pool_reserve_tmp_slot(buf_pool);
		/* For decompression, use crypt_buf. */
		buf_tmp_reserve_crypt_buf(slot);
decompress_with_slot:
		/* Entered directly after decryption when the page was
		both compressed and encrypted; `slot` is already set. */
		ut_d(fil_page_type_validate(dst_frame));

		/* write_size == 0 signals a decompression failure. */
		ulint write_size = fil_page_decompress(slot->crypt_buf,
						       dst_frame);
		slot->release();

		ut_ad(!write_size || fil_page_type_validate(dst_frame));
		ut_ad(space->pending_io());
		return write_size != 0;
	}

	if (space->crypt_data
	    && mach_read_from_4(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
				+ dst_frame)) {
		/* Verify encryption checksum before we even try to
		decrypt. */
		if (!fil_space_verify_crypt_checksum(dst_frame, bpage->size)) {
decrypt_failed:
			/* Reached either on checksum mismatch or when
			fil_space_decrypt() fails below. */
			ib::error() << "Encrypted page " << bpage->id
				    << " in file " << space->chain.start->name
				    << " looks corrupted; key_version="
				    << mach_read_from_4(
					    FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
					    + dst_frame);
			return false;
		}

		/* Find free slot from temporary memory array */
		slot = buf_pool_reserve_tmp_slot(buf_pool);
		buf_tmp_reserve_crypt_buf(slot);

		ut_d(fil_page_type_validate(dst_frame));

		/* decrypt using crypt_buf to dst_frame */
		if (!fil_space_decrypt(space, slot->crypt_buf, dst_frame)) {
			slot->release();
			goto decrypt_failed;
		}

		ut_d(fil_page_type_validate(dst_frame));

		if (fil_page_is_compressed_encrypted(dst_frame)) {
			/* Reuse the already-reserved slot for the
			decompression step. */
			goto decompress_with_slot;
		}

		slot->release();
	} else if (fil_page_is_compressed_encrypted(dst_frame)) {
		/* Compressed+encrypted page type but no key version:
		treat as compressed only. */
		goto decompress;
	}

	ut_ad(space->pending_io());
	return true;
}
593
/********************************************************************//**
Gets the smallest oldest_modification lsn for any page in the pool. Returns
zero if all modified pages have been flushed to disk.
@return oldest modification in pool, zero if none */
lsn_t
buf_pool_get_oldest_modification(void)
/*==================================*/
{
	lsn_t	lsn = 0;
	lsn_t	oldest_lsn = 0;

	/* When we traverse all the flush lists we don't want another
	thread to add a dirty page to any flush list. */
	log_flush_order_mutex_enter();

	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_list_mutex_enter(buf_pool);

		buf_page_t*	bpage;

		/* We don't let log-checkpoint halt because pages from system
		temporary are not yet flushed to the disk. Anyway, object
		residing in system temporary doesn't generate REDO logging. */
		for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
		     bpage != NULL
			     && fsp_is_system_temporary(bpage->id.space());
		     bpage = UT_LIST_GET_PREV(list, bpage)) {
			/* Do nothing. */
		}

		if (bpage != NULL) {
			/* The flush list is ordered by
			oldest_modification, so the last non-temporary
			page carries this instance's minimum. */
			ut_ad(bpage->in_flush_list);
			lsn = bpage->oldest_modification;
		}

		buf_flush_list_mutex_exit(buf_pool);

		/* Track the minimum over all instances; 0 means no
		candidate has been seen yet. */
		if (!oldest_lsn || oldest_lsn > lsn) {
			oldest_lsn = lsn;
		}
	}

	log_flush_order_mutex_exit();

	/* The returned answer may be out of date: the flush_list can
	change after the mutex has been released. */

	return(oldest_lsn);
}
647
648 /********************************************************************//**
649 Get total buffer pool statistics. */
650 void
651 buf_get_total_list_len(
JackErrorCallback(const char * msg)652 /*===================*/
653 ulint* LRU_len, /*!< out: length of all LRU lists */
654 ulint* free_len, /*!< out: length of all free lists */
655 ulint* flush_list_len) /*!< out: length of all flush lists */
656 {
657 ulint i;
658
659 *LRU_len = 0;
660 *free_len = 0;
661 *flush_list_len = 0;
662
663 for (i = 0; i < srv_buf_pool_instances; i++) {
664 buf_pool_t* buf_pool;
665
666 buf_pool = buf_pool_from_array(i);
667
668 *LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
669 *free_len += UT_LIST_GET_LEN(buf_pool->free);
670 *flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
671 }
672 }
673
674 /********************************************************************//**
675 Get total list size in bytes from all buffer pools. */
676 void
677 buf_get_total_list_size_in_bytes(
678 /*=============================*/
679 buf_pools_list_size_t* buf_pools_list_size) /*!< out: list sizes
680 in all buffer pools */
JackSrCb(jack_nframes_t nframes,void * arg)681 {
682 ut_ad(buf_pools_list_size);
683 memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));
684
685 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
686 buf_pool_t* buf_pool;
687
688 buf_pool = buf_pool_from_array(i);
689 /* We don't need mutex protection since this is
690 for statistics purpose */
691 buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes;
692 buf_pools_list_size->unzip_LRU_bytes +=
693 UT_LIST_GET_LEN(buf_pool->unzip_LRU)
694 << srv_page_size_shift;
695 buf_pools_list_size->flush_list_bytes +=
696 buf_pool->stat.flush_list_bytes;
697 }
698 }
699
700 /********************************************************************//**
JackXRunCb(void * arg)701 Get total buffer pool statistics. */
702 void
703 buf_get_total_stat(
704 /*===============*/
705 buf_pool_stat_t* tot_stat) /*!< out: buffer pool stats */
706 {
707 ulint i;
708
709 memset(tot_stat, 0, sizeof(*tot_stat));
710
711 for (i = 0; i < srv_buf_pool_instances; i++) {
712 buf_pool_stat_t*buf_stat;
713 buf_pool_t* buf_pool;
714
715 buf_pool = buf_pool_from_array(i);
716
717 buf_stat = &buf_pool->stat;
718 tot_stat->n_page_gets += buf_stat->n_page_gets;
719 tot_stat->n_pages_read += buf_stat->n_pages_read;
720 tot_stat->n_pages_written += buf_stat->n_pages_written;
721 tot_stat->n_pages_created += buf_stat->n_pages_created;
722 tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd;
723 tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
724 tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
725 tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;
726
727 tot_stat->n_pages_not_made_young +=
728 buf_stat->n_pages_not_made_young;
729 }
730 }
731
732 /********************************************************************//**
733 Allocates a buffer block.
734 @return own: the allocated block, in state BUF_BLOCK_MEMORY */
735 buf_block_t*
736 buf_block_alloc(
737 /*============*/
738 buf_pool_t* buf_pool) /*!< in/out: buffer pool instance,
739 or NULL for round-robin selection
740 of the buffer pool */
741 {
742 buf_block_t* block;
743 ulint index;
744 static ulint buf_pool_index;
745
746 if (buf_pool == NULL) {
747 /* We are allocating memory from any buffer pool, ensure
748 we spread the grace on all buffer pool instances. */
749 index = buf_pool_index++ % srv_buf_pool_instances;
750 buf_pool = buf_pool_from_array(index);
751 }
752
753 block = buf_LRU_get_free_block(buf_pool);
754
755 buf_block_set_state(block, BUF_BLOCK_MEMORY);
756
757 return(block);
758 }
759 #endif /* !UNIV_INNOCHECKSUM */
760
761 /** Checks if the page is in crc32 checksum format.
762 @param[in] read_buf database page
763 @param[in] checksum_field1 new checksum field
764 @param[in] checksum_field2 old checksum field
765 @return true if the page is in crc32 checksum format. */
766 bool
767 buf_page_is_checksum_valid_crc32(
768 const byte* read_buf,
769 ulint checksum_field1,
770 ulint checksum_field2)
771 {
772 const uint32_t crc32 = buf_calc_page_crc32(read_buf);
773
774 #ifdef UNIV_INNOCHECKSUM
775 if (log_file
776 && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
777 fprintf(log_file, "page::" UINT32PF ";"
778 " crc32 calculated = " UINT32PF ";"
779 " recorded checksum field1 = " ULINTPF " recorded"
780 " checksum field2 =" ULINTPF "\n", cur_page_num,
781 crc32, checksum_field1, checksum_field2);
782 }
783 #endif /* UNIV_INNOCHECKSUM */
784
785 if (checksum_field1 != checksum_field2) {
786 return false;
787 }
788
789 return checksum_field1 == crc32
790 #ifdef INNODB_BUG_ENDIAN_CRC32
791 || checksum_field1 == buf_calc_page_crc32(read_buf, true)
792 #endif
793 ;
794 }
795
/** Checks if the page is in innodb checksum format.
@param[in]	read_buf	database page
@param[in]	checksum_field1	new checksum field
@param[in]	checksum_field2	old checksum field
@return true if the page is in innodb checksum format. */
bool
buf_page_is_checksum_valid_innodb(
	const byte*	read_buf,
	ulint		checksum_field1,
	ulint		checksum_field2)
{
	/* There are 2 valid formulas for
	checksum_field2 (old checksum field) which algo=innodb could have
	written to the page:

	1. Very old versions of InnoDB only stored 8 byte lsn to the
	start and the end of the page.

	2. Newer InnoDB versions store the old formula checksum
	(buf_calc_page_old_checksum()). */

	ulint	old_checksum = buf_calc_page_old_checksum(read_buf);
	ulint	new_checksum = buf_calc_page_new_checksum(read_buf);

#ifdef UNIV_INNOCHECKSUM
	if (log_file
	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
		fprintf(log_file, "page::" UINT32PF ";"
			" old style: calculated ="
			" " ULINTPF "; recorded = " ULINTPF "\n",
			cur_page_num, old_checksum,
			checksum_field2);
		fprintf(log_file, "page::" UINT32PF ";"
			" new style: calculated ="
			" " ULINTPF "; crc32 = " UINT32PF "; recorded = " ULINTPF "\n",
			cur_page_num, new_checksum,
			buf_calc_page_crc32(read_buf), checksum_field1);
	}

	if (log_file
	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
		fprintf(log_file, "page::" UINT32PF ";"
			" old style: calculated ="
			" " ULINTPF "; recorded checksum = " ULINTPF "\n",
			cur_page_num, old_checksum,
			checksum_field2);
		fprintf(log_file, "page::" UINT32PF ";"
			" new style: calculated ="
			" " ULINTPF "; recorded checksum = " ULINTPF "\n",
			cur_page_num, new_checksum,
			checksum_field1);
	}
#endif /* UNIV_INNOCHECKSUM */


	/* The old field may hold either the low 32 bits of the LSN
	(formula 1) or the old-style checksum (formula 2). */
	/* NOTE(review): the debug message text says "crc32" although
	this is the innodb formula check; kept as-is. */
	if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
	    && checksum_field2 != old_checksum) {
		DBUG_LOG("checksum",
			 "Page checksum crc32 not valid"
			 << " field1 " << checksum_field1
			 << " field2 " << checksum_field2
			 << " crc32 " << buf_calc_page_old_checksum(read_buf)
			 << " lsn " << mach_read_from_4(
				 read_buf + FIL_PAGE_LSN));
		return(false);
	}

	/* old field is fine, check the new field */

	/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
	(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */

	if (checksum_field1 != 0 && checksum_field1 != new_checksum) {
		DBUG_LOG("checksum",
			 "Page checksum crc32 not valid"
			 << " field1 " << checksum_field1
			 << " field2 " << checksum_field2
			 << " crc32 " << buf_calc_page_new_checksum(read_buf)
			 << " lsn " << mach_read_from_4(
				 read_buf + FIL_PAGE_LSN));
		return(false);
	}

	return(true);
}
881
/** Checks if the page is in none checksum format.
@param[in]	read_buf	database page
@param[in]	checksum_field1	new checksum field
@param[in]	checksum_field2	old checksum field
@return true if the page is in none checksum format. */
bool
buf_page_is_checksum_valid_none(
	const byte*	read_buf,
	ulint		checksum_field1,
	ulint		checksum_field2)
{
#ifndef DBUG_OFF
	/* Debug builds only: trace the mismatch before the final check,
	so that checksum failures can be diagnosed from the trace log.
	This block has no effect on the return value. */
	if (checksum_field1 != checksum_field2
	    && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
		DBUG_LOG("checksum",
			 "Page checksum crc32 not valid"
			 << " field1 " << checksum_field1
			 << " field2 " << checksum_field2
			 << " crc32 " << BUF_NO_CHECKSUM_MAGIC
			 << " lsn " << mach_read_from_4(read_buf
							+ FIL_PAGE_LSN));
	}
#endif /* DBUG_OFF */

#ifdef UNIV_INNOCHECKSUM
	/* innochecksum tool: optionally log the magic value and both
	recorded checksum fields for the page being verified. */
	if (log_file
	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) {
		fprintf(log_file,
			"page::" UINT32PF "; none checksum: calculated"
			" = %lu; recorded checksum_field1 = " ULINTPF
			" recorded checksum_field2 = " ULINTPF "\n",
			cur_page_num, BUF_NO_CHECKSUM_MAGIC,
			checksum_field1, checksum_field2);
	}
#endif /* UNIV_INNOCHECKSUM */

	/* The "none" format writes BUF_NO_CHECKSUM_MAGIC into both
	checksum fields; both must carry it for the page to qualify. */
	return(checksum_field1 == checksum_field2
	       && checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
}
921
#ifdef INNODB_BUG_ENDIAN_CRC32
/** Validate the CRC-32C checksum of a page.
@param[in]	page		buffer page (srv_page_size bytes)
@param[in]	checksum	CRC-32C checksum stored on page
@return computed checksum */
static uint32_t buf_page_check_crc32(const byte* page, uint32_t checksum)
{
	uint32_t crc32 = buf_calc_page_crc32(page);

	if (checksum != crc32) {
		/* NOTE(review): the second argument presumably selects a
		legacy byte-order variant of the checksum that buggy
		big-endian builds once wrote -- confirm against
		buf_calc_page_crc32(). Only tried when the normal
		checksum did not match. */
		crc32 = buf_calc_page_crc32(page, true);
	}

	return crc32;
}
#else /* INNODB_BUG_ENDIAN_CRC32 */
/** Validate the CRC-32C checksum of a page.
@param[in]	page		buffer page (srv_page_size bytes)
@param[in]	checksum	CRC-32C checksum stored on page
@return computed checksum */
# define buf_page_check_crc32(page, checksum) buf_calc_page_crc32(page)
#endif /* INNODB_BUG_ENDIAN_CRC32 */
944
945
946 /** Check if a buffer is all zeroes.
947 @param[in] buf data to check
948 @return whether the buffer is all zeroes */
949 bool buf_is_zeroes(span<const byte> buf)
950 {
951 ut_ad(buf.size() <= sizeof field_ref_zero);
952 return memcmp(buf.data(), field_ref_zero, buf.size()) == 0;
953 }
954
/** Check if a page is corrupt.
@param[in]	check_lsn	whether the LSN should be checked
@param[in]	read_buf	database page
@param[in]	page_size	page size
@param[in]	space		tablespace
@return whether the page is corrupted */
bool
buf_page_is_corrupted(
	bool			check_lsn,
	const byte*		read_buf,
	const page_size_t&	page_size,
#ifndef UNIV_INNOCHECKSUM
	const fil_space_t*	space)
#else
	const void*		space)
#endif
{
	ut_ad(page_size.logical() == srv_page_size);
#ifndef UNIV_INNOCHECKSUM
	DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(true); );
#endif
	size_t		checksum_field1 = 0;
	size_t		checksum_field2 = 0;
	uint32_t	crc32 = 0;
	/* crc32 is computed lazily; crc32_inited records whether the
	cached value above is valid yet. */
	bool		crc32_inited = false;

	ulint		page_type = mach_read_from_2(read_buf + FIL_PAGE_TYPE);

	/* We can trust page type if page compression is set on tablespace
	flags because page compression flag means file must have been
	created with 10.1 (later than 5.5 code base). In 10.1 page
	compressed tables do not contain post compression checksum and
	FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can
	be null if we are in fil_check_first_page() and first page
	is not compressed or encrypted. Page checksum is verified
	after decompression (i.e. normally pages are already
	decompressed at this stage). */
	if ((page_type == FIL_PAGE_PAGE_COMPRESSED ||
	     page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
#ifndef UNIV_INNOCHECKSUM
	    && space && FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags)
#endif
	) {
		return(false);
	}

	/* The low 32 bits of the page LSN are repeated in the page
	trailer; a mismatch means the page write was torn.  (Compressed
	pages have no such trailer copy, so skip the test for them.) */
	if (!page_size.is_compressed()
	    && memcmp(read_buf + FIL_PAGE_LSN + 4,
		      read_buf + page_size.logical()
		      - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {

		/* Stored log sequence numbers at the start and the end
		of page do not match */

		return(true);
	}

#ifndef UNIV_INNOCHECKSUM
	if (check_lsn && recv_lsn_checks_on) {
		lsn_t		current_lsn;
		const lsn_t	page_lsn
			= mach_read_from_8(read_buf + FIL_PAGE_LSN);

		/* Since we are going to reset the page LSN during the import
		phase it makes no sense to spam the log with error messages. */

		/* A page LSN beyond the current system LSN indicates the
		data files are newer than the redo log; warn but do not
		flag the page as corrupted for this alone. */
		if (log_peek_lsn(&current_lsn) && current_lsn < page_lsn) {

			const ulint	space_id = mach_read_from_4(
				read_buf + FIL_PAGE_SPACE_ID);
			const ulint	page_no = mach_read_from_4(
				read_buf + FIL_PAGE_OFFSET);

			ib::error() << "Page " << page_id_t(space_id, page_no)
				<< " log sequence number " << page_lsn
				<< " is in the future! Current system"
				<< " log sequence number "
				<< current_lsn << ".";

			ib::error() << "Your database may be corrupt or"
				" you may have copied the InnoDB"
				" tablespace but not the InnoDB"
				" log files. "
				<< FORCE_RECOVERY_MSG;

		}
	}
#endif /* !UNIV_INNOCHECKSUM */

	/* Check whether the checksum fields have correct values */

	const srv_checksum_algorithm_t curr_algo =
		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);

	if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
		/* Checksums disabled: never report corruption here. */
		return(false);
	}

	if (page_size.is_compressed()) {
		/* ROW_FORMAT=COMPRESSED pages have their own scheme. */
		return(!page_zip_verify_checksum(read_buf,
						 page_size.physical()));
	}

	/* "new style" checksum at the page head ... */
	checksum_field1 = mach_read_from_4(
		read_buf + FIL_PAGE_SPACE_OR_CHKSUM);

	/* ... and "old style" checksum in the page trailer. */
	checksum_field2 = mach_read_from_4(
		read_buf + page_size.logical() - FIL_PAGE_END_LSN_OLD_CHKSUM);

	compile_time_assert(!(FIL_PAGE_LSN % 8));

	/* A page filled with NUL bytes is considered not corrupted.
	The FIL_PAGE_FILE_FLUSH_LSN field may be written nonzero for
	the first page of the system tablespace.
	Ignore it for the system tablespace. */
	if (!checksum_field1 && !checksum_field2) {
		/* Checksum fields can have valid value as zero.
		If the page is not empty then do the checksum
		calculation for the page. */
		bool all_zeroes = true;
		for (size_t i = 0; i < srv_page_size; i++) {
#ifndef UNIV_INNOCHECKSUM
			/* On page 0 of the system tablespace, skip the
			8 bytes of FIL_PAGE_FILE_FLUSH_LSN, which may be
			nonzero on an otherwise all-zero page. */
			if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
			    && (!space || !space->id)) {
				i += 8;
			}
#endif
			if (read_buf[i]) {
				all_zeroes = false;
				break;
			}
		}

		if (all_zeroes) {
			return false;
		}
	}

	switch (curr_algo) {
	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
		return !buf_page_is_checksum_valid_crc32(
			read_buf, checksum_field1, checksum_field2);
	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
		return !buf_page_is_checksum_valid_innodb(
			read_buf, checksum_field1, checksum_field2);
	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
		return !buf_page_is_checksum_valid_none(
			read_buf, checksum_field1, checksum_field2);
	case SRV_CHECKSUM_ALGORITHM_CRC32:
	case SRV_CHECKSUM_ALGORITHM_INNODB:
		/* Non-strict modes accept pages written by any
		algorithm: first try "none", then validate each field
		against both crc32 and the legacy innodb checksum. */
		if (buf_page_is_checksum_valid_none(read_buf,
			checksum_field1, checksum_field2)) {
#ifdef UNIV_INNOCHECKSUM
			if (log_file) {
				fprintf(log_file, "page::" UINT32PF ";"
					" old style: calculated = %u;"
					" recorded = " ULINTPF ";\n",
					cur_page_num,
					buf_calc_page_old_checksum(read_buf),
					checksum_field2);
				fprintf(log_file, "page::" UINT32PF ";"
					" new style: calculated = " UINT32PF ";"
					" crc32 = " UINT32PF "; recorded = " ULINTPF ";\n",
					cur_page_num,
					buf_calc_page_new_checksum(read_buf),
					buf_calc_page_crc32(read_buf),
					checksum_field1);
			}
#endif /* UNIV_INNOCHECKSUM */
			return false;
		}

		/* Very old versions of InnoDB only stored 8 byte lsn to the
		start and the end of the page. */

		/* Since innodb_checksum_algorithm is not strict_* allow
		any of the algos to match for the old field */

		if (checksum_field2
		    != mach_read_from_4(read_buf + FIL_PAGE_LSN)
		    && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {

			if (curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32) {
				DBUG_EXECUTE_IF(
					"page_intermittent_checksum_mismatch", {
					static int page_counter;
					if (page_counter++ == 2) {
						checksum_field2++;
					}
				});

				crc32 = buf_page_check_crc32(read_buf,
							     checksum_field2);
				crc32_inited = true;

				/* Besides crc32, also accept the legacy
				innodb checksum in the old-style field. */
				if (checksum_field2 != crc32
				    && checksum_field2
				    != buf_calc_page_old_checksum(read_buf)) {
					return true;
				}
			} else {
				ut_ad(curr_algo
				      == SRV_CHECKSUM_ALGORITHM_INNODB);

				/* Besides innodb, also accept a crc32
				value in the old-style field. */
				if (checksum_field2
				    != buf_calc_page_old_checksum(read_buf)) {
					crc32 = buf_page_check_crc32(
						read_buf, checksum_field2);
					crc32_inited = true;

					if (checksum_field2 != crc32) {
						return true;
					}
				}
			}
		}

		if (checksum_field1 == 0
		    || checksum_field1 == BUF_NO_CHECKSUM_MAGIC) {
			/* Zero or "none" magic in the new-style field is
			accepted without further checks. */
		} else if (curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32) {
			if (!crc32_inited) {
				crc32 = buf_page_check_crc32(
					read_buf, checksum_field2);
				crc32_inited = true;
			}

			if (checksum_field1 != crc32
			    && checksum_field1
			    != buf_calc_page_new_checksum(read_buf)) {
				return true;
			}
		} else {
			ut_ad(curr_algo == SRV_CHECKSUM_ALGORITHM_INNODB);

			if (checksum_field1
			    != buf_calc_page_new_checksum(read_buf)) {

				if (!crc32_inited) {
					crc32 = buf_page_check_crc32(
						read_buf, checksum_field2);
					crc32_inited = true;
				}

				if (checksum_field1 != crc32) {
					return true;
				}
			}
		}

		/* A crc32 writer sets both fields to the same value, so
		a page where exactly one field matches crc32 is
		inconsistent and therefore corrupted. */
		if (crc32_inited
		    && ((checksum_field1 == crc32
			 && checksum_field2 != crc32)
			|| (checksum_field1 != crc32
			    && checksum_field2 == crc32))) {
			return true;
		}

		break;
	case SRV_CHECKSUM_ALGORITHM_NONE:
		/* should have returned false earlier */
		break;
	}

	return false;
}
1220
1221 #ifndef UNIV_INNOCHECKSUM
1222
#if defined(DBUG_OFF) && defined(HAVE_MADVISE) &&  defined(MADV_DODUMP)
/** Enable buffers to be dumped to core files

A convience function, not called anyhwere directly however
it is left available for gdb or any debugger to call
in the event that you want all of the memory to be dumped
to a core file.

Returns number of errors found in madvise calls. */
int
buf_madvise_do_dump()
{
	int	err_count = 0;

	/* mirrors allocation in log_t::create() */
	if (log_sys.buf) {
		err_count += madvise(log_sys.buf,
				     srv_log_buffer_size,
				     MADV_DODUMP);
		err_count += madvise(log_sys.flush_buf,
				     srv_log_buffer_size,
				     MADV_DODUMP);
	}

	/* mirrors recv_sys_init() */
	if (recv_sys->buf) {
		err_count += madvise(recv_sys->buf, recv_sys->len,
				     MADV_DODUMP);
	}

	/* Walk every chunk of every buffer pool instance while all
	instance mutexes are held. */
	buf_pool_mutex_enter_all();

	for (ulong instance = 0; instance < srv_buf_pool_instances;
	     instance++) {
		buf_pool_t*	pool = buf_pool_from_array(instance);
		buf_chunk_t*	chunk = pool->chunks;

		for (int remaining = pool->n_chunks; remaining--; chunk++) {
			err_count += madvise(chunk->mem, chunk->mem_size(),
					     MADV_DODUMP);
		}
	}

	buf_pool_mutex_exit_all();

	return err_count;
}
#endif
1272
/** Dump a page to stderr.
@param[in]	read_buf	database page
@param[in]	page_size	page size */
UNIV_INTERN
void
buf_page_print(const byte* read_buf, const page_size_t& page_size)
{
	dict_index_t*	index;

#ifndef UNIV_DEBUG
	/* NOTE(review): the full ascii/hex dump is compiled out of
	debug builds -- presumably to keep test logs small; confirm
	that this inversion (dump only when !UNIV_DEBUG) is intended. */
	ib::info() << "Page dump in ascii and hex ("
		<< page_size.physical() << " bytes):";

	ut_print_buf(stderr, read_buf, page_size.physical());
	fputs("\nInnoDB: End of page dump\n", stderr);
#endif

	if (page_size.is_compressed()) {
		/* Print compressed page. */
		ib::info() << "Compressed page type ("
			<< fil_page_get_type(read_buf)
			<< "); stored checksum in field1 "
			<< mach_read_from_4(
				read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
			<< "; calculated checksums for field1: "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_CRC32)
			<< " "
			<< page_zip_calc_checksum(
				read_buf, page_size.physical(),
				SRV_CHECKSUM_ALGORITHM_CRC32)
#ifdef INNODB_BUG_ENDIAN_CRC32
			/* Also print the legacy byte-order variant. */
			<< "/"
			<< page_zip_calc_checksum(
				read_buf, page_size.physical(),
				SRV_CHECKSUM_ALGORITHM_CRC32, true)
#endif
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_INNODB)
			<< " "
			<< page_zip_calc_checksum(
				read_buf, page_size.physical(),
				SRV_CHECKSUM_ALGORITHM_INNODB)
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_NONE)
			<< " "
			<< page_zip_calc_checksum(
				read_buf, page_size.physical(),
				SRV_CHECKSUM_ALGORITHM_NONE)
			<< "; page LSN "
			<< mach_read_from_8(read_buf + FIL_PAGE_LSN)
			<< "; page number (if stored to page"
			<< " already) "
			<< mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
			<< "; space id (if stored to page already) "
			<< mach_read_from_4(
				read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);

	} else {
		const uint32_t	crc32 = buf_calc_page_crc32(read_buf);
#ifdef INNODB_BUG_ENDIAN_CRC32
		/* Legacy byte-order variant, printed after a "/" for
		comparison with the correct crc32. */
		const uint32_t	crc32_legacy = buf_calc_page_crc32(read_buf,
								   true);
#endif /* INNODB_BUG_ENDIAN_CRC32 */
		ulint page_type = fil_page_get_type(read_buf);

		/* NOTE(review): the page type is interleaved into the
		middle of the checksum listing below, which makes the
		message read oddly; the text is preserved as-is. */
		ib::info() << "Uncompressed page, stored checksum in field1 "
			<< mach_read_from_4(
				read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
			<< ", calculated checksums for field1: "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_CRC32) << " "
			<< crc32
#ifdef INNODB_BUG_ENDIAN_CRC32
			<< "/" << crc32_legacy
#endif
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_INNODB) << " "
			<< buf_calc_page_new_checksum(read_buf)
			<< ", "
			<< " page type " << page_type << " == "
			<< fil_get_page_type_name(page_type) << "."
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_NONE) << " "
			<< BUF_NO_CHECKSUM_MAGIC
			<< ", stored checksum in field2 "
			<< mach_read_from_4(read_buf + page_size.logical()
					    - FIL_PAGE_END_LSN_OLD_CHKSUM)
			<< ", calculated checksums for field2: "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_CRC32) << " "
			<< crc32
#ifdef INNODB_BUG_ENDIAN_CRC32
			<< "/" << crc32_legacy
#endif
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_INNODB) << " "
			<< buf_calc_page_old_checksum(read_buf)
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_NONE) << " "
			<< BUF_NO_CHECKSUM_MAGIC
			<< ", page LSN "
			<< mach_read_from_4(read_buf + FIL_PAGE_LSN)
			<< " "
			<< mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
			<< ", low 4 bytes of LSN at page end "
			<< mach_read_from_4(read_buf + page_size.logical()
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)
			<< ", page number (if stored to page already) "
			<< mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
			<< ", space id (if created with >= MySQL-4.1.1"
			" and stored already) "
			<< mach_read_from_4(
				read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
	}

	/* Finally, print a human-readable hint of what kind of page
	this appears to be, based on the stored page type. */
	switch (fil_page_get_type(read_buf)) {
		index_id_t	index_id;
	case FIL_PAGE_INDEX:
	case FIL_PAGE_TYPE_INSTANT:
	case FIL_PAGE_RTREE:
		index_id = btr_page_get_index_id(read_buf);
		ib::info() << "Page may be an index page where"
			" index id is " << index_id;

		/* If the index is cached in the dictionary, report its
		name and table as well. */
		index = dict_index_find_on_id_low(index_id);
		if (index) {
			ib::info()
				<< "Index " << index_id
				<< " is " << index->name
				<< " in table " << index->table->name;
		}
		break;
	case FIL_PAGE_UNDO_LOG:
		fputs("InnoDB: Page may be an undo log page\n", stderr);
		break;
	case FIL_PAGE_INODE:
		fputs("InnoDB: Page may be an 'inode' page\n", stderr);
		break;
	case FIL_PAGE_IBUF_FREE_LIST:
		fputs("InnoDB: Page may be an insert buffer free list page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_ALLOCATED:
		fputs("InnoDB: Page may be a freshly allocated page\n",
		      stderr);
		break;
	case FIL_PAGE_IBUF_BITMAP:
		fputs("InnoDB: Page may be an insert buffer bitmap page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_SYS:
		fputs("InnoDB: Page may be a system page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_TRX_SYS:
		fputs("InnoDB: Page may be a transaction system page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_FSP_HDR:
		fputs("InnoDB: Page may be a file space header page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_XDES:
		fputs("InnoDB: Page may be an extent descriptor page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_BLOB:
		fputs("InnoDB: Page may be a BLOB page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_ZBLOB:
	case FIL_PAGE_TYPE_ZBLOB2:
		fputs("InnoDB: Page may be a compressed BLOB page\n",
		      stderr);
		break;
	}
}
1456
# ifdef PFS_GROUP_BUFFER_SYNC
extern	mysql_pfs_key_t	buffer_block_mutex_key;

/********************************************************************//**
This function registers mutexes and rwlocks in buffer blocks with
performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is
defined to be a value less than chunk->size, then only mutexes
and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER
blocks are registered. */
static
void
pfs_register_buffer_block(
/*======================*/
	buf_chunk_t*	chunk)		/*!< in/out: chunk of buffers */
{
	buf_block_t*	block;
	ulint		num_to_register;

	block = chunk->blocks;

	/* Cap the number of instrumented blocks so that very large
	buffer pools do not flood the performance schema. */
	num_to_register = ut_min(
		chunk->size, PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);

	for (ulint i = 0; i < num_to_register; i++) {
# ifdef UNIV_PFS_MUTEX
		BPageMutex*	mutex;

		/* Attach the PFS key to the block mutex. */
		mutex = &block->mutex;
		mutex->pfs_add(buffer_block_mutex_key);
# endif /* UNIV_PFS_MUTEX */

		rw_lock_t*	rwlock;

# ifdef UNIV_PFS_RWLOCK
		/* The rwlocks were created as PFS_NOT_INSTRUMENTED in
		buf_block_init(); attach PSI info here instead, if the
		PSI server is available. */
		rwlock = &block->lock;
		ut_a(!rwlock->pfs_psi);
		rwlock->pfs_psi = (PSI_server)
			? PSI_server->init_rwlock(buf_block_lock_key, rwlock)
			: NULL;

#  ifdef UNIV_DEBUG
		/* Debug builds also instrument the debug latch. */
		rwlock = &block->debug_latch;
		ut_a(!rwlock->pfs_psi);
		rwlock->pfs_psi = (PSI_server)
			? PSI_server->init_rwlock(buf_block_debug_latch_key,
						  rwlock)
			: NULL;
#  endif /* UNIV_DEBUG */

# endif /* UNIV_PFS_RWLOCK */
		block++;
	}
}
# endif /* PFS_GROUP_BUFFER_SYNC */
1511
/********************************************************************//**
Initializes a buffer control block when the buf_pool is created. */
static
void
buf_block_init(
/*===========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_block_t*	block,		/*!< in: pointer to control block */
	byte*		frame)		/*!< in: pointer to buffer frame */
{
	/* This function should only be executed at database startup or by
	buf_pool_resize(). Either way, adaptive hash index must not exist. */
	assert_block_ahi_empty_on_init(block);

	block->frame = frame;

	/* Reset the page descriptor to a pristine, unused state. */
	block->page.buf_pool_index = buf_pool_index(buf_pool);
	block->page.flush_type = BUF_FLUSH_LRU;
	block->page.state = BUF_BLOCK_NOT_USED;
	block->page.buf_fix_count = 0;
	block->page.io_fix = BUF_IO_NONE;
	block->page.flush_observer = NULL;
	block->page.real_size = 0;
	block->modify_clock = 0;
	block->page.slot = NULL;

	ut_d(block->page.file_page_was_freed = FALSE);

#ifdef BTR_CUR_HASH_ADAPT
	block->index = NULL;
#endif /* BTR_CUR_HASH_ADAPT */
	/* Debug-only flags tracking which lists the block is on. */
	ut_d(block->page.in_page_hash = FALSE);
	ut_d(block->page.in_zip_hash = FALSE);
	ut_d(block->page.in_flush_list = FALSE);
	ut_d(block->page.in_free_list = FALSE);
	ut_d(block->page.in_LRU_list = FALSE);
	ut_d(block->in_unzip_LRU_list = FALSE);
	ut_d(block->in_withdraw_list = FALSE);

	page_zip_des_init(&block->page.zip);

	mutex_create(LATCH_ID_BUF_BLOCK_MUTEX, &block->mutex);

#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
	/* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
	of buffer block rwlock with performance schema.

	If PFS_GROUP_BUFFER_SYNC is defined, skip the registration
	since buffer block rwlock will be registered later in
	pfs_register_buffer_block(). */

	rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);

	ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, &block->debug_latch,
			    SYNC_LEVEL_VARYING));

#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */

	rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);

	ut_d(rw_lock_create(buf_block_debug_latch_key,
			    &block->debug_latch, SYNC_LEVEL_VARYING));

#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */

	block->lock.is_block_lock = 1;

	ut_ad(rw_lock_validate(&(block->lock)));
}
1581
/********************************************************************//**
Allocates a chunk of buffer frames.
@return chunk, or NULL on failure */
static
buf_chunk_t*
buf_chunk_init(
/*===========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_chunk_t*	chunk,		/*!< out: chunk of buffers */
	ulint		mem_size)	/*!< in: requested size in bytes */
{
	buf_block_t*	block;
	byte*		frame;
	ulint		i;

	/* Round down to a multiple of page size,
	although it already should be. */
	mem_size = ut_2pow_round<ulint>(mem_size, srv_page_size);
	/* Reserve space for the block descriptors. */
	mem_size += ut_2pow_round<ulint>((mem_size >> srv_page_size_shift)
					 * (sizeof *block)
					 + (srv_page_size - 1),
					 srv_page_size);

	DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return(NULL););

	/* "dontdump" excludes the chunk memory from core dumps by
	default; buf_madvise_do_dump() can re-enable dumping. */
	chunk->mem = buf_pool->allocator.allocate_large_dontdump(mem_size, &chunk->mem_pfx);

	if (UNIV_UNLIKELY(chunk->mem == NULL)) {

		return(NULL);
	}

#ifdef HAVE_LIBNUMA
	if (srv_numa_interleave) {
		/* Spread the buffer pool pages evenly over all allowed
		NUMA nodes; failure is only a warning, not fatal. */
		struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
		int	st = mbind(chunk->mem, chunk->mem_size(),
				   MPOL_INTERLEAVE,
				   numa_mems_allowed->maskp,
				   numa_mems_allowed->size,
				   MPOL_MF_MOVE);
		if (st != 0) {
			ib::warn() << "Failed to set NUMA memory policy of"
				" buffer pool page frames to MPOL_INTERLEAVE"
				" (error: " << strerror(errno) << ").";
		}
		numa_bitmask_free(numa_mems_allowed);
	}
#endif /* HAVE_LIBNUMA */


	/* Allocate the block descriptors from
	the start of the memory block. */
	chunk->blocks = (buf_block_t*) chunk->mem;

	/* Align a pointer to the first frame. Note that when
	opt_large_page_size is smaller than srv_page_size,
	we may allocate one fewer block than requested. When
	it is bigger, we may allocate more blocks than requested. */

	frame = (byte*) ut_align(chunk->mem, srv_page_size);
	chunk->size = (chunk->mem_pfx.m_size >> srv_page_size_shift)
		- (frame != chunk->mem);

	/* Subtract the space needed for block descriptors. */
	{
		ulint	size = chunk->size;

		/* Drop frames from the front until the first frame no
		longer overlaps the descriptor array. */
		while (frame < (byte*) (chunk->blocks + size)) {
			frame += srv_page_size;
			size--;
		}

		chunk->size = size;
	}

	/* Init block structs and assign frames for them. Then we
	assign the frames to the first blocks (we already mapped the
	memory above). */

	block = chunk->blocks;

	for (i = chunk->size; i--; ) {

		buf_block_init(buf_pool, block, frame);
		MEM_UNDEFINED(block->frame, srv_page_size);

		/* Add the block to the free list */
		UT_LIST_ADD_LAST(buf_pool->free, &block->page);

		ut_d(block->page.in_free_list = TRUE);
		ut_ad(buf_pool_from_block(block) == buf_pool);

		block++;
		frame += srv_page_size;
	}

	buf_pool_register_chunk(chunk);

#ifdef PFS_GROUP_BUFFER_SYNC
	pfs_register_buffer_block(chunk);
#endif /* PFS_GROUP_BUFFER_SYNC */
	return(chunk);
}
1686
1687 #ifdef UNIV_DEBUG
1688 /*********************************************************************//**
1689 Finds a block in the given buffer chunk that points to a
1690 given compressed page.
1691 @return buffer block pointing to the compressed page, or NULL */
1692 static
1693 buf_block_t*
1694 buf_chunk_contains_zip(
1695 /*===================*/
1696 buf_chunk_t* chunk, /*!< in: chunk being checked */
1697 const void* data) /*!< in: pointer to compressed page */
1698 {
1699 buf_block_t* block;
1700 ulint i;
1701
1702 block = chunk->blocks;
1703
1704 for (i = chunk->size; i--; block++) {
1705 if (block->page.zip.data == data) {
1706
StopStream(PaStream * s)1707 return(block);
1708 }
1709 }
1710
1711 return(NULL);
1712 }
AbortStream(PaStream * s)1713
1714 /*********************************************************************//**
1715 Finds a block in the buffer pool that points to a
1716 given compressed page.
1717 @return buffer block pointing to the compressed page, or NULL */
1718 buf_block_t*
1719 buf_pool_contains_zip(
1720 /*==================*/
1721 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1722 const void* data) /*!< in: pointer to compressed page */
1723 {
1724 ulint n;
1725 buf_chunk_t* chunk = buf_pool->chunks;
1726
1727 ut_ad(buf_pool);
1728 ut_ad(buf_pool_mutex_own(buf_pool));
1729 for (n = buf_pool->n_chunks; n--; chunk++) {
1730
1731 buf_block_t* block = buf_chunk_contains_zip(chunk, data);
1732
1733 if (block) {
1734 return(block);
1735 }
1736 }
1737
1738 return(NULL);
1739 }
1740 #endif /* UNIV_DEBUG */
1741
/*********************************************************************//**
Checks that all file pages in the buffer chunk are in a replaceable state.
@return address of a non-free block, or NULL if all freed */
static
const buf_block_t*
buf_chunk_not_freed(
/*================*/
	buf_chunk_t*	chunk)	/*!< in: chunk being checked */
{
	buf_block_t*	block;
	ulint		i;

	block = chunk->blocks;

	for (i = chunk->size; i--; block++) {
		ibool	ready;

		switch (buf_block_get_state(block)) {
		case BUF_BLOCK_POOL_WATCH:
		case BUF_BLOCK_ZIP_PAGE:
		case BUF_BLOCK_ZIP_DIRTY:
			/* The uncompressed buffer pool should never
			contain compressed block descriptors. */
			ut_error;
			break;
		case BUF_BLOCK_NOT_USED:
		case BUF_BLOCK_READY_FOR_USE:
		case BUF_BLOCK_MEMORY:
		case BUF_BLOCK_REMOVE_HASH:
			/* Skip blocks that are not being used for
			file pages. */
			break;
		case BUF_BLOCK_FILE_PAGE:
			if (srv_read_only_mode) {
				/* The page cleaner is disabled in
				read-only mode. No pages can be
				dirtied, so all of them must be clean. */
				ut_ad(block->page.oldest_modification
				      == block->page.newest_modification);
				ut_ad(block->page.oldest_modification == 0
				      || block->page.oldest_modification
				      == recv_sys->recovered_lsn
				      || srv_force_recovery
				      == SRV_FORCE_NO_LOG_REDO);
				ut_ad(block->page.buf_fix_count == 0);
				ut_ad(block->page.io_fix == BUF_IO_NONE);
				break;
			}

			/* Hold the block mutex while checking whether
			the page could be evicted right now. */
			buf_page_mutex_enter(block);
			ready = buf_flush_ready_for_replace(&block->page);
			buf_page_mutex_exit(block);

			if (!ready) {
				/* Found a page that is still in use or
				dirty: report it to the caller. */
				return(block);
			}

			break;
		}
	}

	return(NULL);
}
1805
1806 /********************************************************************//**
1807 Set buffer pool size variables after resizing it */
1808 static
1809 void
1810 buf_pool_set_sizes(void)
1811 /*====================*/
1812 {
1813 ulint i;
1814 ulint curr_size = 0;
1815
1816 buf_pool_mutex_enter_all();
1817
1818 for (i = 0; i < srv_buf_pool_instances; i++) {
1819 buf_pool_t* buf_pool;
1820
1821 buf_pool = buf_pool_from_array(i);
1822 curr_size += buf_pool->curr_pool_size;
1823 }
1824
1825 srv_buf_pool_curr_size = curr_size;
1826 srv_buf_pool_old_size = srv_buf_pool_size;
1827 srv_buf_pool_base_size = srv_buf_pool_size;
1828
1829 buf_pool_mutex_exit_all();
1830 }
1831
/********************************************************************//**
Initialize a buffer pool instance.
Sets up the chunks, the free/LRU/flush/withdraw lists, page_hash and
zip_hash, the latches, the watch sentinel array and the temporary I/O
buffer slots of one instance.  On chunk allocation failure everything
created so far is torn down again and DB_ERROR is returned.
NOTE(review): the declared return type is ulint but the returned values
are the dberr_t constants DB_SUCCESS/DB_ERROR -- confirm all callers
compare against dberr_t only.
@return DB_SUCCESS if all goes well. */
static
ulint
buf_pool_init_instance(
/*===================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		buf_pool_size,	/*!< in: size in bytes */
	ulint		instance_no)	/*!< in: id of the instance */
{
	ulint		i;
	ulint		chunk_size;
	buf_chunk_t*	chunk;

	/* The instance size must be a whole number of chunks. */
	ut_ad(buf_pool_size % srv_buf_pool_chunk_unit == 0);

	/* 1. Initialize general fields
	------------------------------- */
	mutex_create(LATCH_ID_BUF_POOL, &buf_pool->mutex);

	mutex_create(LATCH_ID_BUF_POOL_ZIP, &buf_pool->zip_mutex);

	/* Placement-new: buf_pool came from zeroed memory, so the
	allocator member must be constructed explicitly. */
	new(&buf_pool->allocator)
		ut_allocator<unsigned char>(mem_key_buf_buf_pool);

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool_size > 0) {
		buf_pool->n_chunks
			= buf_pool_size / srv_buf_pool_chunk_unit;
		chunk_size = srv_buf_pool_chunk_unit;

		buf_pool->chunks =
			reinterpret_cast<buf_chunk_t*>(ut_zalloc_nokey(
				buf_pool->n_chunks * sizeof(*chunk)));
		buf_pool->chunks_old = NULL;

		UT_LIST_INIT(buf_pool->LRU, &buf_page_t::LRU);
		UT_LIST_INIT(buf_pool->free, &buf_page_t::list);
		UT_LIST_INIT(buf_pool->withdraw, &buf_page_t::list);
		buf_pool->withdraw_target = 0;
		UT_LIST_INIT(buf_pool->flush_list, &buf_page_t::list);
		UT_LIST_INIT(buf_pool->unzip_LRU, &buf_block_t::unzip_LRU);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		UT_LIST_INIT(buf_pool->zip_clean, &buf_page_t::list);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

		/* One buddy-allocator free list per block size class. */
		for (i = 0; i < UT_ARR_SIZE(buf_pool->zip_free); ++i) {
			UT_LIST_INIT(
				buf_pool->zip_free[i], &buf_buddy_free_t::list);
		}

		buf_pool->curr_size = 0;
		chunk = buf_pool->chunks;

		do {
			if (!buf_chunk_init(buf_pool, chunk, chunk_size)) {
				/* Chunk allocation failed: unwind by
				freeing the per-block latches and the
				memory of every chunk initialized so far,
				then the chunk array itself. */
				while (--chunk >= buf_pool->chunks) {
					buf_block_t*	block = chunk->blocks;

					for (i = chunk->size; i--; block++) {
						mutex_free(&block->mutex);
						rw_lock_free(&block->lock);

						ut_d(rw_lock_free(
							&block->debug_latch));
					}

					buf_pool->allocator.deallocate_large_dodump(
						chunk->mem, &chunk->mem_pfx, chunk->mem_size());
				}
				ut_free(buf_pool->chunks);
				buf_pool_mutex_exit(buf_pool);

				/* InnoDB should free the mutex which was
				created so far before freeing the instance */
				mutex_free(&buf_pool->mutex);
				mutex_free(&buf_pool->zip_mutex);
				return(DB_ERROR);
			}

			buf_pool->curr_size += chunk->size;
		} while (++chunk < buf_pool->chunks + buf_pool->n_chunks);

		buf_pool->instance_no = instance_no;
		/* Read-ahead window: next power of two of a fraction of
		the pool, capped at BUF_READ_AHEAD_PAGES. */
		buf_pool->read_ahead_area =
			ut_min(BUF_READ_AHEAD_PAGES,
			       ut_2_power_up(buf_pool->curr_size /
					     BUF_READ_AHEAD_PORTION));
		buf_pool->curr_pool_size = buf_pool->curr_size
			<< srv_page_size_shift;

		buf_pool->old_size = buf_pool->curr_size;
		buf_pool->n_chunks_new = buf_pool->n_chunks;

		/* Number of locks protecting page_hash must be a
		power of two */
		srv_n_page_hash_locks = static_cast<ulong>(
			ut_2_power_up(srv_n_page_hash_locks));
		ut_a(srv_n_page_hash_locks != 0);
		ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);

		/* page_hash gets its own array of rw-locks; zip_hash is
		created without them (it is protected by buf_pool->mutex). */
		buf_pool->page_hash = ib_create(
			2 * buf_pool->curr_size,
			LATCH_ID_HASH_TABLE_RW_LOCK,
			srv_n_page_hash_locks, MEM_HEAP_FOR_PAGE_HASH);

		buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);

		buf_pool->last_printout_time = time(NULL);
	}
	/* 2. Initialize flushing fields
	-------------------------------- */

	mutex_create(LATCH_ID_FLUSH_LIST, &buf_pool->flush_list_mutex);

	/* One "no flush batch in progress" event per flush type. */
	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
		buf_pool->no_flush[i] = os_event_create(0);
	}

	/* Sentinel page descriptors used by the buffer pool watch
	mechanism; only buf_pool_index needs a non-zero value here. */
	buf_pool->watch = (buf_page_t*) ut_zalloc_nokey(
		sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
		buf_pool->watch[i].buf_pool_index
			= unsigned(buf_pool->instance_no);
	}

	/* All fields are initialized by ut_zalloc_nokey(). */

	buf_pool->try_LRU_scan = TRUE;

	/* Initialize the hazard pointer for flush_list batches */
	new(&buf_pool->flush_hp)
		FlushHp(buf_pool, &buf_pool->flush_list_mutex);

	/* Initialize the hazard pointer for LRU batches */
	new(&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->mutex);

	/* Initialize the iterator for LRU scan search */
	new(&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);

	/* Initialize the iterator for single page scan search */
	new(&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);

	/* Initialize the temporary memory array and slots: scratch
	buffers used by the I/O threads, hence sized by the number of
	read + write threads times their pending-I/O capacity. */
	buf_pool->tmp_arr = (buf_tmp_array_t *)ut_malloc_nokey(sizeof(buf_tmp_array_t));
	memset(buf_pool->tmp_arr, 0, sizeof(buf_tmp_array_t));
	ulint n_slots = (srv_n_read_io_threads + srv_n_write_io_threads) * (8 * OS_AIO_N_PENDING_IOS_PER_THREAD);
	buf_pool->tmp_arr->n_slots = n_slots;
	buf_pool->tmp_arr->slots = (buf_tmp_buffer_t*)ut_malloc_nokey(sizeof(buf_tmp_buffer_t) * n_slots);
	memset(buf_pool->tmp_arr->slots, 0, (sizeof(buf_tmp_buffer_t) * n_slots));

	buf_pool_mutex_exit(buf_pool);

	/* NOTE(review): this debug-only failure injection fires after the
	instance is fully initialized; the caller frees only the instances
	created before this one, so the injected failure leaks this
	instance's resources -- acceptable for a test hook, but confirm. */
	DBUG_EXECUTE_IF("buf_pool_init_instance_force_oom",
			return(DB_ERROR); );

	return(DB_SUCCESS);
}
1993
/********************************************************************//**
free one buffer pool instance.
Runs at shutdown only: there must be no concurrent access, which is why
the latches can be freed before the structures they protect. */
static
void
buf_pool_free_instance(
/*===================*/
	buf_pool_t*	buf_pool)	/* in,own: buffer pool instance
					to free */
{
	buf_chunk_t*	chunk;
	buf_chunk_t*	chunks;
	buf_page_t*	bpage;
	buf_page_t*	prev_bpage = 0;

	mutex_free(&buf_pool->mutex);
	mutex_free(&buf_pool->zip_mutex);
	mutex_free(&buf_pool->flush_list_mutex);

	if (buf_pool->flush_rbt) {
		rbt_free(buf_pool->flush_rbt);
		buf_pool->flush_rbt = NULL;
	}

	/* Free the separately allocated descriptors of compressed-only
	pages.  BUF_BLOCK_FILE_PAGE descriptors live inside the chunks
	and are released together with the chunk memory below. */
	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
	     bpage != NULL;
	     bpage = prev_bpage) {

		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
		buf_page_state	state = buf_page_get_state(bpage);

		ut_ad(buf_page_in_file(bpage));
		ut_ad(bpage->in_LRU_list);

		if (state != BUF_BLOCK_FILE_PAGE) {
			/* We must not have any dirty block except
			when doing a fast shutdown. */
			ut_ad(state == BUF_BLOCK_ZIP_PAGE
			      || srv_fast_shutdown == 2);
			buf_page_free_descriptor(bpage);
		}
	}

	ut_free(buf_pool->watch);
	buf_pool->watch = NULL;

	/* Free the per-block latches and the chunk memory, walking the
	chunk array from the last chunk down. */
	chunks = buf_pool->chunks;
	chunk = chunks + buf_pool->n_chunks;

	while (--chunk >= chunks) {
		buf_block_t*	block = chunk->blocks;

		for (ulint i = chunk->size; i--; block++) {
			mutex_free(&block->mutex);
			rw_lock_free(&block->lock);

			ut_d(rw_lock_free(&block->debug_latch));
		}

		buf_pool->allocator.deallocate_large_dodump(
			chunk->mem, &chunk->mem_pfx, chunk->mem_size());
	}

	for (ulint i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; ++i) {
		os_event_destroy(buf_pool->no_flush[i]);
	}

	ut_free(buf_pool->chunks);
	/* Empty page_hash via ha_clear() before destroying the table. */
	ha_clear(buf_pool->page_hash);
	hash_table_free(buf_pool->page_hash);
	hash_table_free(buf_pool->zip_hash);

	/* Free all used temporary slots */
	if (buf_pool->tmp_arr) {
		for(ulint i = 0; i < buf_pool->tmp_arr->n_slots; i++) {
			buf_tmp_buffer_t* slot = &(buf_pool->tmp_arr->slots[i]);
			/* NOTE(review): slot is &array[i] and can never be
			NULL; the "slot &&" guards are redundant but
			harmless. */
			if (slot && slot->crypt_buf) {
				aligned_free(slot->crypt_buf);
				slot->crypt_buf = NULL;
			}

			if (slot && slot->comp_buf) {
				aligned_free(slot->comp_buf);
				slot->comp_buf = NULL;
			}
		}

		ut_free(buf_pool->tmp_arr->slots);
		ut_free(buf_pool->tmp_arr);
		buf_pool->tmp_arr = NULL;
	}

	/* Matches the placement-new construction of the allocator done
	when the instance was initialized. */
	buf_pool->allocator.~ut_allocator();
}
2087
2088 /********************************************************************//**
2089 Creates the buffer pool.
2090 @return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
2091 dberr_t
2092 buf_pool_init(
2093 /*==========*/
2094 ulint total_size, /*!< in: size of the total pool in bytes */
2095 ulint n_instances) /*!< in: number of instances */
2096 {
2097 ulint i;
2098 const ulint size = total_size / n_instances;
2099
2100 ut_ad(n_instances > 0);
2101 ut_ad(n_instances <= MAX_BUFFER_POOLS);
2102 ut_ad(n_instances == srv_buf_pool_instances);
2103
2104 NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
2105
2106 buf_pool_resizing = false;
2107
2108 buf_pool_ptr = (buf_pool_t*) ut_zalloc_nokey(
2109 n_instances * sizeof *buf_pool_ptr);
2110
2111 buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t());
2112
2113 for (i = 0; i < n_instances; i++) {
2114 buf_pool_t* ptr = &buf_pool_ptr[i];
2115
2116 if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {
2117
2118 /* Free all the instances created so far. */
2119 buf_pool_free(i);
2120
2121 return(DB_ERROR);
2122 }
2123 }
2124
2125 buf_chunk_map_ref = buf_chunk_map_reg;
2126
2127 buf_pool_set_sizes();
2128 buf_LRU_old_ratio_update(100 * 3/ 8, FALSE);
2129
2130 btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);
2131
2132 return(DB_SUCCESS);
2133 }
2134
2135 /********************************************************************//**
2136 Frees the buffer pool at shutdown. This must not be invoked before
2137 freeing all mutexes. */
2138 void
2139 buf_pool_free(
2140 /*==========*/
2141 ulint n_instances) /*!< in: numbere of instances to free */
2142 {
2143 for (ulint i = 0; i < n_instances; i++) {
2144 buf_pool_free_instance(buf_pool_from_array(i));
2145 }
2146
2147 UT_DELETE(buf_chunk_map_reg);
2148 buf_chunk_map_reg = buf_chunk_map_ref = NULL;
2149
2150 ut_free(buf_pool_ptr);
2151 buf_pool_ptr = NULL;
2152 }
2153
/** Reallocate a control block.
Copies the contents of "block" into a freshly allocated control block
taken from the free list and releases the old one; used during buffer
pool resizing to vacate blocks that reside in chunks being withdrawn.
@param[in]	buf_pool	buffer pool instance
@param[in]	block	pointer to control block
@retval false	if failed because of no free blocks. */
static
bool
buf_page_realloc(
	buf_pool_t*	buf_pool,
	buf_block_t*	block)
{
	buf_block_t*	new_block;

	ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	new_block = buf_LRU_get_free_only(buf_pool);

	if (new_block == NULL) {
		return(false); /* free_list was not enough */
	}

	/* Latch order: page_hash X-lock first, then both block mutexes. */
	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, block->page.id);

	rw_lock_x_lock(hash_lock);
	mutex_enter(&block->mutex);

	if (buf_page_can_relocate(&block->page)) {
		mutex_enter(&new_block->mutex);

		/* Copy both the page frame and the page descriptor. */
		memcpy(new_block->frame, block->frame, srv_page_size);
		new (&new_block->page) buf_page_t(block->page);

		/* relocate LRU list */
		ut_ad(block->page.in_LRU_list);
		ut_ad(!block->page.in_zip_hash);
		ut_d(block->page.in_LRU_list = FALSE);

		/* Move any hazard pointer off the block being freed. */
		buf_LRU_adjust_hp(buf_pool, &block->page);

		/* Splice new_block into the LRU at the old position. */
		buf_page_t*	prev_b = UT_LIST_GET_PREV(LRU, &block->page);
		UT_LIST_REMOVE(buf_pool->LRU, &block->page);

		if (prev_b != NULL) {
			UT_LIST_INSERT_AFTER(buf_pool->LRU, prev_b, &new_block->page);
		} else {
			UT_LIST_ADD_FIRST(buf_pool->LRU, &new_block->page);
		}

		if (buf_pool->LRU_old == &block->page) {
			buf_pool->LRU_old = &new_block->page;
		}

		ut_ad(new_block->page.in_LRU_list);

		/* relocate unzip_LRU list */
		if (block->page.zip.data != NULL) {
			ut_ad(block->in_unzip_LRU_list);
			ut_d(new_block->in_unzip_LRU_list = TRUE);

			buf_block_t*	prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
			UT_LIST_REMOVE(buf_pool->unzip_LRU, block);

			/* Ownership of the compressed page moved to
			new_block->page above; clear it here so the old
			block does not free it. */
			ut_d(block->in_unzip_LRU_list = FALSE);
			block->page.zip.data = NULL;
			page_zip_set_size(&block->page.zip, 0);

			if (prev_block != NULL) {
				UT_LIST_INSERT_AFTER(buf_pool->unzip_LRU, prev_block, new_block);
			} else {
				UT_LIST_ADD_FIRST(buf_pool->unzip_LRU, new_block);
			}
		} else {
			ut_ad(!block->in_unzip_LRU_list);
			ut_d(new_block->in_unzip_LRU_list = FALSE);
		}

		/* relocate buf_pool->page_hash */
		ut_ad(block->page.in_page_hash);
		ut_ad(&block->page == buf_page_hash_get_low(buf_pool,
							    block->page.id));
		ut_d(block->page.in_page_hash = FALSE);
		ulint	fold = block->page.id.fold();
		ut_ad(fold == new_block->page.id.fold());
		HASH_REPLACE(buf_page_t, hash, buf_pool->page_hash, fold,
			     &block->page, &new_block->page);

		ut_ad(new_block->page.in_page_hash);

		/* Poison the old frame: bump the modify clock so stale
		optimistic readers fail, overwrite the page number and
		space id, and mark the memory undefined for tools. */
		buf_block_modify_clock_inc(block);
		memset(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
		memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
		MEM_UNDEFINED(block->frame, srv_page_size);
		buf_block_set_state(block, BUF_BLOCK_REMOVE_HASH);
		block->page.id
			= page_id_t(ULINT32_UNDEFINED, ULINT32_UNDEFINED);

		/* Relocate buf_pool->flush_list. */
		if (block->page.oldest_modification) {
			buf_flush_relocate_on_flush_list(
				&block->page, &new_block->page);
		}

		/* set other flags of buf_block_t */

#ifdef BTR_CUR_HASH_ADAPT
		/* This code should only be executed by buf_pool_resize(),
		while the adaptive hash index is disabled. */
		assert_block_ahi_empty(block);
		assert_block_ahi_empty_on_init(new_block);
		ut_ad(!block->index);
		new_block->index = NULL;
		new_block->n_hash_helps = 0;
		new_block->n_fields = 1;
		new_block->left_side = TRUE;
#endif /* BTR_CUR_HASH_ADAPT */

		new_block->lock_hash_val = block->lock_hash_val;
		ut_ad(new_block->lock_hash_val == lock_rec_hash(
			new_block->page.id.space(),
			new_block->page.id.page_no()));

		rw_lock_x_unlock(hash_lock);
		mutex_exit(&new_block->mutex);

		/* free block */
		buf_block_set_state(block, BUF_BLOCK_MEMORY);
		buf_LRU_block_free_non_file_page(block);

		mutex_exit(&block->mutex);
	} else {
		/* The page is fixed or being flushed: cannot relocate
		now, so give the spare block back to the free list. */
		rw_lock_x_unlock(hash_lock);
		mutex_exit(&block->mutex);

		/* free new_block */
		mutex_enter(&new_block->mutex);
		buf_LRU_block_free_non_file_page(new_block);
		mutex_exit(&new_block->mutex);
	}

	return(true); /* free_list was enough */
}
2295
/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
to the specified string. The format and the following parameters are the
same as the ones used for printf(3).
@param[in]	fmt	format
@param[in]	...	extra parameters according to fmt */
static
void
buf_resize_status(
	const char*	fmt,
	...)
{
	va_list	ap;

	va_start(ap, fmt);

	/* vsnprintf() silently truncates messages that do not fit in
	the fixed-size status buffer. */
	vsnprintf(
		export_vars.innodb_buffer_pool_resize_status,
		sizeof(export_vars.innodb_buffer_pool_resize_status),
		fmt, ap);

	va_end(ap);

	/* Mirror the status string into the server error log. */
	ib::info() << export_vars.innodb_buffer_pool_resize_status;
}
2320
2321 /** Determines if a block is intended to be withdrawn.
2322 @param[in] buf_pool buffer pool instance
2323 @param[in] block pointer to control block
2324 @retval true if will be withdrawn */
2325 bool
2326 buf_block_will_withdrawn(
2327 buf_pool_t* buf_pool,
2328 const buf_block_t* block)
2329 {
2330 ut_ad(buf_pool->curr_size < buf_pool->old_size);
2331 ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool));
2332
2333 const buf_chunk_t* chunk
2334 = buf_pool->chunks + buf_pool->n_chunks_new;
2335 const buf_chunk_t* echunk
2336 = buf_pool->chunks + buf_pool->n_chunks;
2337
2338 while (chunk < echunk) {
2339 if (block >= chunk->blocks
2340 && block < chunk->blocks + chunk->size) {
2341 return(true);
2342 }
2343 ++chunk;
2344 }
2345
2346 return(false);
2347 }
2348
2349 /** Determines if a frame is intended to be withdrawn.
2350 @param[in] buf_pool buffer pool instance
2351 @param[in] ptr pointer to a frame
2352 @retval true if will be withdrawn */
2353 bool
2354 buf_frame_will_withdrawn(
2355 buf_pool_t* buf_pool,
2356 const byte* ptr)
2357 {
2358 ut_ad(buf_pool->curr_size < buf_pool->old_size);
2359 ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool));
2360
2361 const buf_chunk_t* chunk
2362 = buf_pool->chunks + buf_pool->n_chunks_new;
2363 const buf_chunk_t* echunk
2364 = buf_pool->chunks + buf_pool->n_chunks;
2365
2366 while (chunk < echunk) {
2367 if (ptr >= chunk->blocks->frame
2368 && ptr < (chunk->blocks + chunk->size - 1)->frame
2369 + srv_page_size) {
2370 return(true);
2371 }
2372 ++chunk;
2373 }
2374
2375 return(false);
2376 }
2377
/** Withdraw the buffer pool blocks from end of the buffer pool instance
until withdrawn by buf_pool->withdraw_target.
Each pass of the outer loop (1) moves withdrawable blocks from the free
list to the withdraw list, (2) flushes the LRU to refill the free list,
and (3) relocates in-use blocks/buddies that live in the withdrawn area.
After 10 passes it gives up and asks the caller to retry later.
@param[in]	buf_pool	buffer pool instance
@retval true	if retry is needed */
static
bool
buf_pool_withdraw_blocks(
	buf_pool_t*	buf_pool)
{
	buf_block_t*	block;
	ulint		loop_count = 0;
	ulint		i = buf_pool_index(buf_pool);

	ib::info() << "buffer pool " << i
		<< " : start to withdraw the last "
		<< buf_pool->withdraw_target << " blocks.";

	/* Minimize buf_pool->zip_free[i] lists */
	buf_pool_mutex_enter(buf_pool);
	buf_buddy_condense_free(buf_pool);
	buf_pool_mutex_exit(buf_pool);

	while (UT_LIST_GET_LEN(buf_pool->withdraw)
	       < buf_pool->withdraw_target) {

		/* try to withdraw from free_list */
		ulint	count1 = 0;

		buf_pool_mutex_enter(buf_pool);
		block = reinterpret_cast<buf_block_t*>(
			UT_LIST_GET_FIRST(buf_pool->free));
		while (block != NULL
		       && UT_LIST_GET_LEN(buf_pool->withdraw)
		       < buf_pool->withdraw_target) {
			ut_ad(block->page.in_free_list);
			ut_ad(!block->page.in_flush_list);
			ut_ad(!block->page.in_LRU_list);
			ut_a(!buf_page_in_file(&block->page));

			/* Remember the successor before possibly
			unlinking the current block. */
			buf_block_t*	next_block;
			next_block = reinterpret_cast<buf_block_t*>(
				UT_LIST_GET_NEXT(
					list, &block->page));

			if (buf_block_will_withdrawn(buf_pool, block)) {
				/* This should be withdrawn */
				UT_LIST_REMOVE(
					buf_pool->free,
					&block->page);
				UT_LIST_ADD_LAST(
					buf_pool->withdraw,
					&block->page);
				ut_d(block->in_withdraw_list = TRUE);
				count1++;
			}

			block = next_block;
		}
		buf_pool_mutex_exit(buf_pool);

		/* reserve free_list length */
		if (UT_LIST_GET_LEN(buf_pool->withdraw)
		    < buf_pool->withdraw_target) {
			ulint		scan_depth;
			flush_counters_t n;

			/* cap scan_depth with current LRU size. */
			buf_pool_mutex_enter(buf_pool);
			scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
			buf_pool_mutex_exit(buf_pool);

			/* Flush at least srv_LRU_scan_depth pages, or
			as many as are still missing, whichever is more,
			but never more than the LRU has. */
			scan_depth = ut_min(
				ut_max(buf_pool->withdraw_target
				       - UT_LIST_GET_LEN(buf_pool->withdraw),
				       static_cast<ulint>(srv_LRU_scan_depth)),
				scan_depth);

			buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU,
					   scan_depth, 0, &n);
			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);

			if (n.flushed) {
				MONITOR_INC_VALUE_CUMULATIVE(
					MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
					MONITOR_LRU_BATCH_FLUSH_COUNT,
					MONITOR_LRU_BATCH_FLUSH_PAGES,
					n.flushed);
			}
		}

		/* relocate blocks/buddies in withdrawn area */
		ulint	count2 = 0;

		buf_pool_mutex_enter(buf_pool);
		buf_page_t*	bpage;
		bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
		while (bpage != NULL) {
			BPageMutex*	block_mutex;
			buf_page_t*	next_bpage;

			block_mutex = buf_page_get_mutex(bpage);
			mutex_enter(block_mutex);

			next_bpage = UT_LIST_GET_NEXT(LRU, bpage);

			/* First move the compressed page, if it lives
			in the withdrawn area. */
			if (bpage->zip.data != NULL
			    && buf_frame_will_withdrawn(
				buf_pool,
				static_cast<byte*>(bpage->zip.data))) {

				if (buf_page_can_relocate(bpage)) {
					mutex_exit(block_mutex);
					buf_pool_mutex_exit_forbid(buf_pool);
					if(!buf_buddy_realloc(
						buf_pool, bpage->zip.data,
						page_zip_get_size(
							&bpage->zip))) {

						/* failed to allocate block */
						buf_pool_mutex_exit_allow(
							buf_pool);
						break;
					}
					buf_pool_mutex_exit_allow(buf_pool);
					mutex_enter(block_mutex);
					count2++;
				}
				/* NOTE: if the page is in use,
				not reallocated yet */
			}

			/* Then move the control block itself, if it is
			an uncompressed page in the withdrawn area. */
			if (buf_page_get_state(bpage)
			    == BUF_BLOCK_FILE_PAGE
			    && buf_block_will_withdrawn(
				buf_pool,
				reinterpret_cast<buf_block_t*>(bpage))) {

				if (buf_page_can_relocate(bpage)) {
					mutex_exit(block_mutex);
					buf_pool_mutex_exit_forbid(buf_pool);
					if(!buf_page_realloc(
						buf_pool,
						reinterpret_cast<buf_block_t*>(
							bpage))) {
						/* failed to allocate block */
						buf_pool_mutex_exit_allow(
							buf_pool);
						break;
					}
					buf_pool_mutex_exit_allow(buf_pool);
					count2++;
				} else {
					mutex_exit(block_mutex);
				}
				/* NOTE: if the page is in use,
				not reallocated yet */
			} else {
				mutex_exit(block_mutex);
			}

			bpage = next_bpage;
		}
		buf_pool_mutex_exit(buf_pool);

		/* NOTE(review): "%lu" paired with ulint arguments is only
		portable where ulint == unsigned long; elsewhere ULINTPF
		would be needed -- confirm the supported platforms. */
		buf_resize_status(
			"buffer pool %lu : withdrawing blocks. (%lu/%lu)",
			i, UT_LIST_GET_LEN(buf_pool->withdraw),
			buf_pool->withdraw_target);

		ib::info() << "buffer pool " << i << " : withdrew "
			<< count1 << " blocks from free list."
			<< " Tried to relocate " << count2 << " pages ("
			<< UT_LIST_GET_LEN(buf_pool->withdraw) << "/"
			<< buf_pool->withdraw_target << ").";

		if (++loop_count >= 10) {
			/* give up for now.
			retried after user threads paused. */

			ib::info() << "buffer pool " << i
				<< " : will retry to withdraw later.";

			/* need retry later */
			return(true);
		}
	}

	/* confirm withdrawn enough */
	const buf_chunk_t*	chunk
		= buf_pool->chunks + buf_pool->n_chunks_new;
	const buf_chunk_t*	echunk
		= buf_pool->chunks + buf_pool->n_chunks;

	while (chunk < echunk) {
		block = chunk->blocks;
		for (ulint j = chunk->size; j--; block++) {
			/* If !=BUF_BLOCK_NOT_USED block in the
			withdrawn area, it means corruption
			something */
			ut_a(buf_block_get_state(block)
			     == BUF_BLOCK_NOT_USED);
			ut_ad(block->in_withdraw_list);
		}
		++chunk;
	}

	ib::info() << "buffer pool " << i << " : withdrawn target "
		<< UT_LIST_GET_LEN(buf_pool->withdraw) << " blocks.";

	return(false);
}
2589
/** resize page_hash and zip_hash for a buffer pool instance.
Rebuilds both tables at 2 * curr_size cells, rehashing every entry.
page_hash is updated in place by swapping table contents (see below),
while zip_hash is simply replaced.
@param[in]	buf_pool	buffer pool instance */
static
void
buf_pool_resize_hash(
	buf_pool_t*	buf_pool)
{
	hash_table_t*	new_hash_table;

	/* recreate page_hash: ib_recreate() builds a new cell array
	that shares the sync objects with the existing table. */
	new_hash_table = ib_recreate(
		buf_pool->page_hash, 2 * buf_pool->curr_size);

	/* Move every page descriptor, cell by cell, from the old cell
	array into the new one keyed by its page-id fold. */
	for (ulint i = 0; i < hash_get_n_cells(buf_pool->page_hash); i++) {
		buf_page_t*	bpage;

		bpage = static_cast<buf_page_t*>(
			HASH_GET_FIRST(
				buf_pool->page_hash, i));

		while (bpage) {
			buf_page_t*	prev_bpage = bpage;
			ulint		fold;

			/* Advance before unlinking prev_bpage. */
			bpage = static_cast<buf_page_t*>(
				HASH_GET_NEXT(
					hash, prev_bpage));

			fold = prev_bpage->id.fold();

			HASH_DELETE(buf_page_t, hash,
				buf_pool->page_hash, fold,
				prev_bpage);

			HASH_INSERT(buf_page_t, hash,
				new_hash_table, fold,
				prev_bpage);
		}
	}

	/* Concurrent threads may be accessing
	buf_pool->page_hash->n_cells, n_sync_obj and try to latch
	sync_obj[i] while we are resizing. Therefore we never
	deallocate page_hash, instead we overwrite n_cells (and other
	fields) with the new values. The n_sync_obj and sync_obj are
	actually same in both. */
	std::swap(*buf_pool->page_hash, *new_hash_table);
	hash_table_free(new_hash_table);

	/* recreate zip_hash: it has no sync objects (protected by
	buf_pool->mutex), so a plain replace is safe. */
	new_hash_table = hash_create(2 * buf_pool->curr_size);

	for (ulint i = 0; i < hash_get_n_cells(buf_pool->zip_hash); i++) {
		buf_page_t*	bpage;

		bpage = static_cast<buf_page_t*>(
			HASH_GET_FIRST(buf_pool->zip_hash, i));

		while (bpage) {
			buf_page_t*	prev_bpage = bpage;
			ulint		fold;

			bpage = static_cast<buf_page_t*>(
				HASH_GET_NEXT(
					hash, prev_bpage));

			/* zip_hash is keyed by the block address fold,
			not the page id. */
			fold = BUF_POOL_ZIP_FOLD(
				reinterpret_cast<buf_block_t*>(
					prev_bpage));

			HASH_DELETE(buf_page_t, hash,
				buf_pool->zip_hash, fold,
				prev_bpage);

			HASH_INSERT(buf_page_t, hash,
				new_hash_table, fold,
				prev_bpage);
		}
	}

	hash_table_free(buf_pool->zip_hash);
	buf_pool->zip_hash = new_hash_table;
}
2673
2674 /** Resize the buffer pool based on srv_buf_pool_size from
2675 srv_buf_pool_old_size. */
2676 static
2677 void
2678 buf_pool_resize()
2679 {
2680 buf_pool_t* buf_pool;
2681 ulint new_instance_size;
2682 bool warning = false;
2683
2684 NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
2685
2686 ut_ad(!buf_pool_resizing);
2687 ut_ad(srv_buf_pool_chunk_unit > 0);
2688
2689 new_instance_size = srv_buf_pool_size / srv_buf_pool_instances;
2690 new_instance_size >>= srv_page_size_shift;
2691
2692 buf_resize_status("Resizing buffer pool from " ULINTPF " to "
2693 ULINTPF " (unit=" ULINTPF ").",
2694 srv_buf_pool_old_size, srv_buf_pool_size,
2695 srv_buf_pool_chunk_unit);
2696
2697 /* set new limit for all buffer pool for resizing */
2698 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2699 buf_pool = buf_pool_from_array(i);
2700 buf_pool_mutex_enter(buf_pool);
2701
2702 ut_ad(buf_pool->curr_size == buf_pool->old_size);
2703 ut_ad(buf_pool->n_chunks_new == buf_pool->n_chunks);
2704 ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0);
2705 ut_ad(buf_pool->flush_rbt == NULL);
2706
2707 buf_pool->curr_size = new_instance_size;
2708
2709 buf_pool->n_chunks_new =
2710 (new_instance_size << srv_page_size_shift)
2711 / srv_buf_pool_chunk_unit;
2712
2713 buf_pool_mutex_exit(buf_pool);
2714 }
2715 #ifdef BTR_CUR_HASH_ADAPT
2716 /* disable AHI if needed */
2717 bool btr_search_disabled = false;
2718
2719 buf_resize_status("Disabling adaptive hash index.");
2720
2721 btr_search_s_lock_all();
2722 if (btr_search_enabled) {
2723 btr_search_s_unlock_all();
2724 btr_search_disabled = true;
2725 } else {
2726 btr_search_s_unlock_all();
2727 }
2728
2729 btr_search_disable();
2730
2731 if (btr_search_disabled) {
2732 ib::info() << "disabled adaptive hash index.";
2733 }
2734 #endif /* BTR_CUR_HASH_ADAPT */
2735
2736 /* set withdraw target */
2737 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2738 buf_pool = buf_pool_from_array(i);
2739 if (buf_pool->curr_size < buf_pool->old_size) {
2740 ulint withdraw_target = 0;
2741
2742 const buf_chunk_t* chunk
2743 = buf_pool->chunks + buf_pool->n_chunks_new;
2744 const buf_chunk_t* echunk
2745 = buf_pool->chunks + buf_pool->n_chunks;
2746
2747 while (chunk < echunk) {
2748 withdraw_target += chunk->size;
2749 ++chunk;
2750 }
2751
2752 ut_ad(buf_pool->withdraw_target == 0);
2753 buf_pool->withdraw_target = withdraw_target;
2754 }
2755 }
2756
2757 buf_resize_status("Withdrawing blocks to be shrunken.");
2758
2759 time_t withdraw_started = time(NULL);
2760 ulint message_interval = 60;
2761 ulint retry_interval = 1;
2762
2763 withdraw_retry:
2764 bool should_retry_withdraw = false;
2765
2766 /* wait for the number of blocks fit to the new size (if needed)*/
2767 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2768 buf_pool = buf_pool_from_array(i);
2769 if (buf_pool->curr_size < buf_pool->old_size) {
2770
2771 should_retry_withdraw |=
2772 buf_pool_withdraw_blocks(buf_pool);
2773 }
2774 }
2775
2776 if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
2777 /* abort to resize for shutdown. */
2778 return;
2779 }
2780
2781 /* abort buffer pool load */
2782 buf_load_abort();
2783
2784 const time_t current_time = time(NULL);
2785
2786 if (should_retry_withdraw
2787 && difftime(current_time, withdraw_started) >= message_interval) {
2788
2789 if (message_interval > 900) {
2790 message_interval = 1800;
2791 } else {
2792 message_interval *= 2;
2793 }
2794
2795 lock_mutex_enter();
2796 mutex_enter(&trx_sys.mutex);
2797 bool found = false;
2798 for (trx_t* trx = UT_LIST_GET_FIRST(trx_sys.trx_list);
2799 trx != NULL;
2800 trx = UT_LIST_GET_NEXT(trx_list, trx)) {
2801 if (trx->state != TRX_STATE_NOT_STARTED
2802 && trx->mysql_thd != NULL
2803 && withdraw_started > trx->start_time) {
2804 if (!found) {
2805 ib::warn() <<
2806 "The following trx might hold"
2807 " the blocks in buffer pool to"
2808 " be withdrawn. Buffer pool"
2809 " resizing can complete only"
2810 " after all the transactions"
2811 " below release the blocks.";
2812 found = true;
2813 }
2814
2815 lock_trx_print_wait_and_mvcc_state(
2816 stderr, trx, current_time);
2817 }
2818 }
2819 mutex_exit(&trx_sys.mutex);
2820 lock_mutex_exit();
2821
2822 withdraw_started = current_time;
2823 }
2824
2825 if (should_retry_withdraw) {
2826 ib::info() << "Will retry to withdraw " << retry_interval
2827 << " seconds later.";
2828 os_thread_sleep(retry_interval * 1000000);
2829
2830 if (retry_interval > 5) {
2831 retry_interval = 10;
2832 } else {
2833 retry_interval *= 2;
2834 }
2835
2836 goto withdraw_retry;
2837 }
2838
2839
2840 buf_resize_status("Latching whole of buffer pool.");
2841
2842 #ifndef DBUG_OFF
2843 {
2844 bool should_wait = true;
2845
2846 while (should_wait) {
2847 should_wait = false;
2848 DBUG_EXECUTE_IF(
2849 "ib_buf_pool_resize_wait_before_resize",
2850 should_wait = true; os_thread_sleep(10000););
2851 }
2852 }
2853 #endif /* !DBUG_OFF */
2854
2855 if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
2856 return;
2857 }
2858
2859 /* Indicate critical path */
2860 buf_pool_resizing = true;
2861
2862 /* Acquire all buf_pool_mutex/hash_lock */
2863 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2864 buf_pool_t* buf_pool = buf_pool_from_array(i);
2865
2866 buf_pool_mutex_enter(buf_pool);
2867 }
2868 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2869 buf_pool_t* buf_pool = buf_pool_from_array(i);
2870
2871 hash_lock_x_all(buf_pool->page_hash);
2872 }
2873
2874 buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t());
2875
2876 /* add/delete chunks */
2877 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2878 buf_pool_t* buf_pool = buf_pool_from_array(i);
2879 buf_chunk_t* chunk;
2880 buf_chunk_t* echunk;
2881
2882 buf_resize_status("buffer pool %lu :"
2883 " resizing with chunks %lu to %lu.",
2884 i, buf_pool->n_chunks, buf_pool->n_chunks_new);
2885
2886 if (buf_pool->n_chunks_new < buf_pool->n_chunks) {
2887 /* delete chunks */
2888 chunk = buf_pool->chunks
2889 + buf_pool->n_chunks_new;
2890 echunk = buf_pool->chunks + buf_pool->n_chunks;
2891
2892 ulint sum_freed = 0;
2893
2894 while (chunk < echunk) {
2895 buf_block_t* block = chunk->blocks;
2896
2897 /* buf_LRU_block_free_non_file_page()
2898 invokes MEM_NOACCESS() on any blocks
2899 that are in free_list. We must
2900 cancel the effect of that. In MemorySanitizer,
2901 MEM_NOACCESS() is no-op, so we must not do
2902 anything special for it here. */
2903 #ifdef HAVE_valgrind
2904 # if !__has_feature(memory_sanitizer)
2905 MEM_MAKE_DEFINED(chunk->mem,
2906 chunk->mem_size());
2907 # endif
2908 #else
2909 MEM_MAKE_ADDRESSABLE(chunk->mem,
2910 chunk->mem_size());
2911 #endif
2912
2913 for (ulint j = chunk->size;
2914 j--; block++) {
2915 mutex_free(&block->mutex);
2916 rw_lock_free(&block->lock);
2917
2918 ut_d(rw_lock_free(
2919 &block->debug_latch));
2920 }
2921
2922 buf_pool->allocator.deallocate_large_dodump(
2923 chunk->mem, &chunk->mem_pfx, chunk->mem_size());
2924
2925 sum_freed += chunk->size;
2926
2927 ++chunk;
2928 }
2929
2930 /* discard withdraw list */
2931 UT_LIST_INIT(buf_pool->withdraw,
2932 &buf_page_t::list);
2933 buf_pool->withdraw_target = 0;
2934
2935 ib::info() << "buffer pool " << i << " : "
2936 << buf_pool->n_chunks - buf_pool->n_chunks_new
2937 << " chunks (" << sum_freed
2938 << " blocks) were freed.";
2939
2940 buf_pool->n_chunks = buf_pool->n_chunks_new;
2941 }
2942
2943 {
2944 /* reallocate buf_pool->chunks */
2945 const ulint new_chunks_size
2946 = buf_pool->n_chunks_new * sizeof(*chunk);
2947
2948 buf_chunk_t* new_chunks
2949 = reinterpret_cast<buf_chunk_t*>(
2950 ut_zalloc_nokey_nofatal(new_chunks_size));
2951
2952 DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
2953 ut_free(new_chunks);
2954 new_chunks = NULL;);
2955
2956 if (new_chunks == NULL) {
2957 ib::error() << "buffer pool " << i
2958 << " : failed to allocate"
2959 " the chunk array.";
2960 buf_pool->n_chunks_new
2961 = buf_pool->n_chunks;
2962 warning = true;
2963 buf_pool->chunks_old = NULL;
2964 for (ulint j = 0; j < buf_pool->n_chunks_new; j++) {
2965 buf_pool_register_chunk(&(buf_pool->chunks[j]));
2966 }
2967 goto calc_buf_pool_size;
2968 }
2969
2970 ulint n_chunks_copy = ut_min(buf_pool->n_chunks_new,
2971 buf_pool->n_chunks);
2972
2973 memcpy(new_chunks, buf_pool->chunks,
2974 n_chunks_copy * sizeof(*chunk));
2975
2976 for (ulint j = 0; j < n_chunks_copy; j++) {
2977 buf_pool_register_chunk(&new_chunks[j]);
2978 }
2979
2980 buf_pool->chunks_old = buf_pool->chunks;
2981 buf_pool->chunks = new_chunks;
2982 }
2983
2984
2985 if (buf_pool->n_chunks_new > buf_pool->n_chunks) {
2986 /* add chunks */
2987 chunk = buf_pool->chunks + buf_pool->n_chunks;
2988 echunk = buf_pool->chunks
2989 + buf_pool->n_chunks_new;
2990
2991 ulint sum_added = 0;
2992 ulint n_chunks = buf_pool->n_chunks;
2993
2994 while (chunk < echunk) {
2995 ulong unit = srv_buf_pool_chunk_unit;
2996
2997 if (!buf_chunk_init(buf_pool, chunk, unit)) {
2998
2999 ib::error() << "buffer pool " << i
3000 << " : failed to allocate"
3001 " new memory.";
3002
3003 warning = true;
3004
3005 buf_pool->n_chunks_new
3006 = n_chunks;
3007
3008 break;
3009 }
3010
3011 sum_added += chunk->size;
3012
3013 ++n_chunks;
3014 ++chunk;
3015 }
3016
3017 ib::info() << "buffer pool " << i << " : "
3018 << buf_pool->n_chunks_new - buf_pool->n_chunks
3019 << " chunks (" << sum_added
3020 << " blocks) were added.";
3021
3022 buf_pool->n_chunks = n_chunks;
3023 }
3024 calc_buf_pool_size:
3025
3026 /* recalc buf_pool->curr_size */
3027 ulint new_size = 0;
3028
3029 chunk = buf_pool->chunks;
3030 do {
3031 new_size += chunk->size;
3032 } while (++chunk < buf_pool->chunks
3033 + buf_pool->n_chunks);
3034
3035 buf_pool->curr_size = new_size;
3036 buf_pool->n_chunks_new = buf_pool->n_chunks;
3037
3038 if (buf_pool->chunks_old) {
3039 ut_free(buf_pool->chunks_old);
3040 buf_pool->chunks_old = NULL;
3041 }
3042 }
3043
3044 buf_pool_chunk_map_t* chunk_map_old = buf_chunk_map_ref;
3045 buf_chunk_map_ref = buf_chunk_map_reg;
3046
3047 /* set instance sizes */
3048 {
3049 ulint curr_size = 0;
3050
3051 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3052 buf_pool = buf_pool_from_array(i);
3053
3054 ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0);
3055
3056 buf_pool->read_ahead_area =
3057 ut_min(BUF_READ_AHEAD_PAGES,
3058 ut_2_power_up(buf_pool->curr_size /
3059 BUF_READ_AHEAD_PORTION));
3060 buf_pool->curr_pool_size
3061 = buf_pool->curr_size << srv_page_size_shift;
3062 curr_size += buf_pool->curr_pool_size;
3063 buf_pool->old_size = buf_pool->curr_size;
3064 }
3065 srv_buf_pool_curr_size = curr_size;
3066 innodb_set_buf_pool_size(buf_pool_size_align(curr_size));
3067 }
3068
3069 const bool new_size_too_diff
3070 = srv_buf_pool_base_size > srv_buf_pool_size * 2
3071 || srv_buf_pool_base_size * 2 < srv_buf_pool_size;
3072
3073 /* Normalize page_hash and zip_hash,
3074 if the new size is too different */
3075 if (!warning && new_size_too_diff) {
3076
3077 buf_resize_status("Resizing hash tables.");
3078
3079 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3080 buf_pool_t* buf_pool = buf_pool_from_array(i);
3081
3082 buf_pool_resize_hash(buf_pool);
3083
3084 ib::info() << "buffer pool " << i
3085 << " : hash tables were resized.";
3086 }
3087 }
3088
3089 /* Release all buf_pool_mutex/page_hash */
3090 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3091 buf_pool_t* buf_pool = buf_pool_from_array(i);
3092
3093 hash_unlock_x_all(buf_pool->page_hash);
3094 buf_pool_mutex_exit(buf_pool);
3095 }
3096
3097 UT_DELETE(chunk_map_old);
3098
3099 buf_pool_resizing = false;
3100
3101 /* Normalize other components, if the new size is too different */
3102 if (!warning && new_size_too_diff) {
3103 srv_buf_pool_base_size = srv_buf_pool_size;
3104
3105 buf_resize_status("Resizing also other hash tables.");
3106
3107 /* normalize lock_sys */
3108 srv_lock_table_size = 5
3109 * (srv_buf_pool_size >> srv_page_size_shift);
3110 lock_sys.resize(srv_lock_table_size);
3111
3112 /* normalize dict_sys */
3113 dict_resize();
3114
3115 ib::info() << "Resized hash tables at lock_sys,"
3116 #ifdef BTR_CUR_HASH_ADAPT
3117 " adaptive hash index,"
3118 #endif /* BTR_CUR_HASH_ADAPT */
3119 " dictionary.";
3120 }
3121
3122 /* normalize ibuf->max_size */
3123 ibuf_max_size_update(srv_change_buffer_max_size);
3124
3125 if (srv_buf_pool_old_size != srv_buf_pool_size) {
3126
3127 ib::info() << "Completed to resize buffer pool from "
3128 << srv_buf_pool_old_size
3129 << " to " << srv_buf_pool_size << ".";
3130 srv_buf_pool_old_size = srv_buf_pool_size;
3131 }
3132
3133 #ifdef BTR_CUR_HASH_ADAPT
3134 /* enable AHI if needed */
3135 if (btr_search_disabled) {
3136 btr_search_enable(true);
3137 ib::info() << "Re-enabled adaptive hash index.";
3138 }
3139 #endif /* BTR_CUR_HASH_ADAPT */
3140
3141 char now[32];
3142
3143 ut_sprintf_timestamp(now);
3144 if (!warning) {
3145 buf_resize_status("Completed resizing buffer pool at %s.",
3146 now);
3147 } else {
3148 buf_resize_status("Resizing buffer pool failed,"
3149 " finished resizing at %s.", now);
3150 }
3151
3152 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3153 ut_a(buf_validate());
3154 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3155
3156 return;
3157 }
3158
3159 /** This is the thread for resizing buffer pool. It waits for an event and
3160 when waked up either performs a resizing and sleeps again.
3161 @return this function does not return, calls os_thread_exit()
3162 */
3163 extern "C"
3164 os_thread_ret_t
3165 DECLARE_THREAD(buf_resize_thread)(void*)
3166 {
3167 my_thread_init();
3168
3169 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3170 os_event_wait(srv_buf_resize_event);
3171 os_event_reset(srv_buf_resize_event);
3172
3173 if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
3174 break;
3175 }
3176
3177 buf_pool_mutex_enter_all();
3178 if (srv_buf_pool_old_size == srv_buf_pool_size) {
3179 buf_pool_mutex_exit_all();
3180 std::ostringstream sout;
3181 sout << "Size did not change (old size = new size = "
3182 << srv_buf_pool_size << ". Nothing to do.";
3183 buf_resize_status(sout.str().c_str());
3184
3185 /* nothing to do */
3186 continue;
3187 }
3188 buf_pool_mutex_exit_all();
3189
3190 buf_pool_resize();
3191 }
3192
3193 srv_buf_resize_thread_active = false;
3194
3195 my_thread_end();
3196 os_thread_exit();
3197
3198 OS_THREAD_DUMMY_RETURN;
3199 }
3200
3201 /********************************************************************//**
3202 Relocate a buffer control block. Relocates the block on the LRU list
3203 and in buf_pool->page_hash. Does not relocate bpage->list.
3204 The caller must take care of relocating bpage->list. */
3205 static
3206 void
3207 buf_relocate(
3208 /*=========*/
3209 buf_page_t* bpage, /*!< in/out: control block being relocated;
3210 buf_page_get_state(bpage) must be
3211 BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
3212 buf_page_t* dpage) /*!< in/out: destination control block */
3213 {
3214 buf_page_t* b;
3215 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
3216
3217 ut_ad(buf_pool_mutex_own(buf_pool));
3218 ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
3219 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
3220 ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
3221 ut_a(bpage->buf_fix_count == 0);
3222 ut_ad(bpage->in_LRU_list);
3223 ut_ad(!bpage->in_zip_hash);
3224 ut_ad(bpage->in_page_hash);
3225 ut_ad(bpage == buf_page_hash_get_low(buf_pool, bpage->id));
3226
3227 ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
3228 #ifdef UNIV_DEBUG
3229 switch (buf_page_get_state(bpage)) {
3230 case BUF_BLOCK_POOL_WATCH:
3231 case BUF_BLOCK_NOT_USED:
3232 case BUF_BLOCK_READY_FOR_USE:
3233 case BUF_BLOCK_FILE_PAGE:
3234 case BUF_BLOCK_MEMORY:
3235 case BUF_BLOCK_REMOVE_HASH:
3236 ut_error;
3237 case BUF_BLOCK_ZIP_DIRTY:
3238 case BUF_BLOCK_ZIP_PAGE:
3239 break;
3240 }
3241 #endif /* UNIV_DEBUG */
3242
3243 new (dpage) buf_page_t(*bpage);
3244
3245 /* Important that we adjust the hazard pointer before
3246 removing bpage from LRU list. */
3247 buf_LRU_adjust_hp(buf_pool, bpage);
3248
3249 ut_d(bpage->in_LRU_list = FALSE);
3250 ut_d(bpage->in_page_hash = FALSE);
3251
3252 /* relocate buf_pool->LRU */
3253 b = UT_LIST_GET_PREV(LRU, bpage);
3254 UT_LIST_REMOVE(buf_pool->LRU, bpage);
3255
3256 if (b != NULL) {
3257 UT_LIST_INSERT_AFTER(buf_pool->LRU, b, dpage);
3258 } else {
3259 UT_LIST_ADD_FIRST(buf_pool->LRU, dpage);
3260 }
3261
3262 if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
3263 buf_pool->LRU_old = dpage;
3264 #ifdef UNIV_LRU_DEBUG
3265 /* buf_pool->LRU_old must be the first item in the LRU list
3266 whose "old" flag is set. */
3267 ut_a(buf_pool->LRU_old->old);
3268 ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
3269 || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
3270 ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
3271 || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
3272 } else {
3273 /* Check that the "old" flag is consistent in
3274 the block and its neighbours. */
3275 buf_page_set_old(dpage, buf_page_is_old(dpage));
3276 #endif /* UNIV_LRU_DEBUG */
3277 }
3278
3279 ut_d(CheckInLRUList::validate(buf_pool));
3280
3281 /* relocate buf_pool->page_hash */
3282 ulint fold = bpage->id.fold();
3283 ut_ad(fold == dpage->id.fold());
3284 HASH_REPLACE(buf_page_t, hash, buf_pool->page_hash, fold, bpage,
3285 dpage);
3286 }
3287
3288 /** Hazard Pointer implementation. */
3289
3290 /** Set current value
3291 @param bpage buffer block to be set as hp */
3292 void
3293 HazardPointer::set(buf_page_t* bpage)
3294 {
3295 ut_ad(mutex_own(m_mutex));
3296 ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);
3297 ut_ad(!bpage || buf_page_in_file(bpage));
3298
3299 m_hp = bpage;
3300 }
3301
3302 /** Checks if a bpage is the hp
3303 @param bpage buffer block to be compared
3304 @return true if it is hp */
3305
3306 bool
3307 HazardPointer::is_hp(const buf_page_t* bpage)
3308 {
3309 ut_ad(mutex_own(m_mutex));
3310 ut_ad(!m_hp || buf_pool_from_bpage(m_hp) == m_buf_pool);
3311 ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);
3312
3313 return(bpage == m_hp);
3314 }
3315
3316 /** Adjust the value of hp. This happens when some other thread working
3317 on the same list attempts to remove the hp from the list.
3318 @param bpage buffer block to be compared */
3319
3320 void
3321 FlushHp::adjust(const buf_page_t* bpage)
3322 {
3323 ut_ad(bpage != NULL);
3324
3325 /** We only support reverse traversal for now. */
3326 if (is_hp(bpage)) {
3327 m_hp = UT_LIST_GET_PREV(list, m_hp);
3328 }
3329
3330 ut_ad(!m_hp || m_hp->in_flush_list);
3331 }
3332
3333 /** Adjust the value of hp. This happens when some other thread working
3334 on the same list attempts to remove the hp from the list.
3335 @param bpage buffer block to be compared */
3336
3337 void
3338 LRUHp::adjust(const buf_page_t* bpage)
3339 {
3340 ut_ad(bpage);
3341
3342 /** We only support reverse traversal for now. */
3343 if (is_hp(bpage)) {
3344 m_hp = UT_LIST_GET_PREV(LRU, m_hp);
3345 }
3346
3347 ut_ad(!m_hp || m_hp->in_LRU_list);
3348 }
3349
3350 /** Selects from where to start a scan. If we have scanned too deep into
3351 the LRU list it resets the value to the tail of the LRU list.
3352 @return buf_page_t from where to start scan. */
3353
3354 buf_page_t*
3355 LRUItr::start()
3356 {
3357 ut_ad(mutex_own(m_mutex));
3358
3359 if (!m_hp || m_hp->old) {
3360 m_hp = UT_LIST_GET_LAST(m_buf_pool->LRU);
3361 }
3362
3363 return(m_hp);
3364 }
3365
3366 /** Determine if a block is a sentinel for a buffer pool watch.
3367 @param[in] buf_pool buffer pool instance
3368 @param[in] bpage block
3369 @return TRUE if a sentinel for a buffer pool watch, FALSE if not */
3370 ibool
3371 buf_pool_watch_is_sentinel(
3372 const buf_pool_t* buf_pool,
3373 const buf_page_t* bpage)
3374 {
3375 /* We must also own the appropriate hash lock. */
3376 ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
3377 ut_ad(buf_page_in_file(bpage));
3378
3379 if (bpage < &buf_pool->watch[0]
3380 || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {
3381
3382 ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
3383 || bpage->zip.data != NULL);
3384
3385 return(FALSE);
3386 }
3387
3388 ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
3389 ut_ad(!bpage->in_zip_hash);
3390 ut_ad(bpage->in_page_hash);
3391 ut_ad(bpage->zip.data == NULL);
3392 return(TRUE);
3393 }
3394
/** Add watch for the given page to be read in. Caller must have
appropriate hash_lock for the bpage. This function may release the
hash_lock and reacquire it.
@param[in]	page_id		page id
@param[in,out]	hash_lock	hash_lock currently latched
@return NULL if watch set, block if the page is in the buffer pool */
static
buf_page_t*
buf_pool_watch_set(
	const page_id_t		page_id,
	rw_lock_t**		hash_lock)
{
	buf_page_t*	bpage;
	ulint		i;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	ut_ad(*hash_lock == buf_page_hash_lock_get(buf_pool, page_id));

	ut_ad(rw_lock_own(*hash_lock, RW_LOCK_X));

	/* First, a cheap lookup under the single hash_lock we hold. */
	bpage = buf_page_hash_get_low(buf_pool, page_id);

	if (bpage != NULL) {
page_found:
		if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
			/* The page was loaded meanwhile. */
			return(bpage);
		}

		/* Add to an existing watch. */
		buf_block_fix(bpage);
		return(NULL);
	}

	/* From this point this function becomes fairly heavy in terms
	of latching. We acquire the buf_pool mutex as well as all the
	hash_locks. buf_pool mutex is needed because any changes to
	the page_hash must be covered by it and hash_locks are needed
	because we don't want to read any stale information in
	buf_pool->watch[]. However, it is not in the critical code path
	as this function will be called only by the purge thread. */

	/* To obey latching order first release the hash_lock. */
	rw_lock_x_unlock(*hash_lock);

	buf_pool_mutex_enter(buf_pool);
	hash_lock_x_all(buf_pool->page_hash);

	/* If not own buf_pool_mutex, page_hash can be changed. */
	*hash_lock = buf_page_hash_lock_get(buf_pool, page_id);

	/* We have to recheck that the page
	was not loaded or a watch set by some other
	purge thread. This is because of the small
	time window between when we release the
	hash_lock to acquire buf_pool mutex above. */

	bpage = buf_page_hash_get_low(buf_pool, page_id);
	if (UNIV_LIKELY_NULL(bpage)) {
		/* Someone beat us to it; keep only the caller's bucket
		lock and handle the hit as in the fast path above. */
		buf_pool_mutex_exit(buf_pool);
		hash_unlock_x_all_but(buf_pool->page_hash, *hash_lock);
		goto page_found;
	}

	/* The maximum number of purge threads should never exceed
	BUF_POOL_WATCH_SIZE. So there is no way for purge thread
	instance to hold a watch when setting another watch. */
	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
		bpage = &buf_pool->watch[i];

		ut_ad(bpage->access_time == 0);
		ut_ad(bpage->newest_modification == 0);
		ut_ad(bpage->oldest_modification == 0);
		ut_ad(bpage->zip.data == NULL);
		ut_ad(!bpage->in_zip_hash);

		switch (bpage->state) {
		case BUF_BLOCK_POOL_WATCH:
			/* A free watch slot: claim it. */
			ut_ad(!bpage->in_page_hash);
			ut_ad(bpage->buf_fix_count == 0);

			/* bpage is pointing to buf_pool->watch[],
			which is protected by buf_pool->mutex.
			Normally, buf_page_t objects are protected by
			buf_block_t::mutex or buf_pool->zip_mutex or both. */

			bpage->state = BUF_BLOCK_ZIP_PAGE;
			bpage->id = page_id;
			bpage->buf_fix_count = 1;

			ut_d(bpage->in_page_hash = TRUE);
			HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
				    page_id.fold(), bpage);

			buf_pool_mutex_exit(buf_pool);
			/* Once the sentinel is in the page_hash we can
			safely release all locks except just the
			relevant hash_lock */
			hash_unlock_x_all_but(buf_pool->page_hash,
					      *hash_lock);

			return(NULL);
		case BUF_BLOCK_ZIP_PAGE:
			/* Slot already in use by another watch. */
			ut_ad(bpage->in_page_hash);
			ut_ad(bpage->buf_fix_count > 0);
			break;
		default:
			ut_error;
		}
	}

	/* Allocation failed. Either the maximum number of purge
	threads should never exceed BUF_POOL_WATCH_SIZE, or this code
	should be modified to return a special non-NULL value and the
	caller should purge the record directly. */
	ut_error;

	/* Fix compiler warning */
	return(NULL);
}
3515
/** Remove the sentinel block for the watch before replacing it with a
real block. buf_page_watch_clear() or buf_page_watch_occurred() will notice
that the block has been replaced with the real block.
The sentinel is unhashed and returned to the BUF_BLOCK_POOL_WATCH free
state so that buf_pool_watch_set() can reuse the slot.
@param[in,out]	buf_pool	buffer pool instance
@param[in,out]	watch		sentinel for watch */
static
void
buf_pool_watch_remove(
	buf_pool_t*	buf_pool,
	buf_page_t*	watch)
{
#ifdef UNIV_DEBUG
	/* We must also own the appropriate hash_bucket mutex. */
	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, watch->id);
	ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
#endif /* UNIV_DEBUG */

	ut_ad(buf_pool_mutex_own(buf_pool));

	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, watch->id.fold(),
		    watch);
	ut_d(watch->in_page_hash = FALSE);
	watch->buf_fix_count = 0;
	watch->state = BUF_BLOCK_POOL_WATCH;
}
3542
/** Stop watching if the page has been read in.
buf_pool_watch_set(same_page_id) must have returned NULL before.
Releases the buf-fix taken by buf_pool_watch_set(); the sentinel is
removed only when this was the last reference and the page was never
read in (the hash entry is still the sentinel).
@param[in]	page_id	page id */
void buf_pool_watch_unset(const page_id_t page_id)
{
	buf_page_t*	bpage;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	/* We only need to have buf_pool mutex in case where we end
	up calling buf_pool_watch_remove but to obey latching order
	we acquire it here before acquiring hash_lock. This should
	not cause too much grief as this function is only ever
	called from the purge thread. */
	buf_pool_mutex_enter(buf_pool);

	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
	rw_lock_x_lock(hash_lock);

	/* The page must exist because buf_pool_watch_set()
	increments buf_fix_count. */
	bpage = buf_page_hash_get_low(buf_pool, page_id);

	/* Remove the sentinel only when the fix count drops to zero;
	if the real page was read in meanwhile, bpage is not a
	sentinel and must be left alone. */
	if (buf_block_unfix(bpage) == 0
	    && buf_pool_watch_is_sentinel(buf_pool, bpage)) {
		buf_pool_watch_remove(buf_pool, bpage);
	}

	buf_pool_mutex_exit(buf_pool);
	rw_lock_x_unlock(hash_lock);
}
3573
3574 /** Check if the page has been read in.
3575 This may only be called after buf_pool_watch_set(same_page_id)
3576 has returned NULL and before invoking buf_pool_watch_unset(same_page_id).
3577 @param[in] page_id page id
3578 @return false if the given page was not read in, true if it was */
3579 bool buf_pool_watch_occurred(const page_id_t page_id)
3580 {
3581 bool ret;
3582 buf_page_t* bpage;
3583 buf_pool_t* buf_pool = buf_pool_get(page_id);
3584 rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
3585
3586 rw_lock_s_lock(hash_lock);
3587
3588 /* If not own buf_pool_mutex, page_hash can be changed. */
3589 hash_lock = buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id);
3590
3591 /* The page must exist because buf_pool_watch_set()
3592 increments buf_fix_count. */
3593 bpage = buf_page_hash_get_low(buf_pool, page_id);
3594
3595 ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
3596 rw_lock_s_unlock(hash_lock);
3597
3598 return(ret);
3599 }
3600
3601 /********************************************************************//**
3602 Moves a page to the start of the buffer pool LRU list. This high-level
3603 function can be used to prevent an important page from slipping out of
3604 the buffer pool. */
3605 void
3606 buf_page_make_young(
3607 /*================*/
3608 buf_page_t* bpage) /*!< in: buffer block of a file page */
3609 {
3610 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
3611
3612 buf_pool_mutex_enter(buf_pool);
3613
3614 ut_a(buf_page_in_file(bpage));
3615
3616 buf_LRU_make_block_young(bpage);
3617
3618 buf_pool_mutex_exit(buf_pool);
3619 }
3620
3621 /********************************************************************//**
3622 Moves a page to the start of the buffer pool LRU list if it is too old.
3623 This high-level function can be used to prevent an important page from
3624 slipping out of the buffer pool. */
3625 static
3626 void
3627 buf_page_make_young_if_needed(
3628 /*==========================*/
3629 buf_page_t* bpage) /*!< in/out: buffer block of a
3630 file page */
3631 {
3632 #ifdef UNIV_DEBUG
3633 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
3634 ut_ad(!buf_pool_mutex_own(buf_pool));
3635 #endif /* UNIV_DEBUG */
3636 ut_a(buf_page_in_file(bpage));
3637
3638 if (buf_page_peek_if_too_old(bpage)) {
3639 buf_page_make_young(bpage);
3640 }
3641 }
3642
3643 #ifdef UNIV_DEBUG
3644
/** Sets file_page_was_freed TRUE if the page is found in the buffer pool.
This function should be called when we free a file page and want the
debug version to check that it is not accessed any more unless
reallocated.
@param[in]	page_id	page id
@return control block if found in page hash table, otherwise NULL */
buf_page_t* buf_page_set_file_page_was_freed(const page_id_t page_id)
{
	buf_page_t*	bpage;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	rw_lock_t*	hash_lock;

	/* Look up the page; on a hit the bucket lock is held s-latched. */
	bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock);

	if (bpage) {
		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
		/* Hand over protection from the hash lock to the block
		mutex before writing the flag. */
		mutex_enter(block_mutex);
		rw_lock_s_unlock(hash_lock);
		/* bpage->file_page_was_freed can already hold
		when this code is invoked from dict_drop_index_tree() */
		bpage->file_page_was_freed = TRUE;
		mutex_exit(block_mutex);
	}

	return(bpage);
}
3672
/** Sets file_page_was_freed FALSE if the page is found in the buffer pool.
This function should be called when we free a file page and want the
debug version to check that it is not accessed any more unless
reallocated.
@param[in]	page_id	page id
@return control block if found in page hash table, otherwise NULL */
buf_page_t* buf_page_reset_file_page_was_freed(const page_id_t page_id)
{
	buf_page_t*	bpage;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	rw_lock_t*	hash_lock;

	/* Look up the page; on a hit the bucket lock is held s-latched. */
	bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock);
	if (bpage) {
		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
		/* Hand over protection from the hash lock to the block
		mutex before writing the flag. */
		mutex_enter(block_mutex);
		rw_lock_s_unlock(hash_lock);
		bpage->file_page_was_freed = FALSE;
		mutex_exit(block_mutex);
	}

	return(bpage);
}
3697 #endif /* UNIV_DEBUG */
3698
3699 /** Attempts to discard the uncompressed frame of a compressed page.
3700 The caller should not be holding any mutexes when this function is called.
3701 @param[in] page_id page id */
3702 static void buf_block_try_discard_uncompressed(const page_id_t page_id)
3703 {
3704 buf_page_t* bpage;
3705 buf_pool_t* buf_pool = buf_pool_get(page_id);
3706
3707 /* Since we need to acquire buf_pool mutex to discard
3708 the uncompressed frame and because page_hash mutex resides
3709 below buf_pool mutex in sync ordering therefore we must
3710 first release the page_hash mutex. This means that the
3711 block in question can move out of page_hash. Therefore
3712 we need to check again if the block is still in page_hash. */
3713 buf_pool_mutex_enter(buf_pool);
3714
3715 bpage = buf_page_hash_get(buf_pool, page_id);
3716
3717 if (bpage) {
3718 buf_LRU_free_page(bpage, false);
3719 }
3720
3721 buf_pool_mutex_exit(buf_pool);
3722 }
3723
/** Get read access to a compressed page (usually of type
FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
The page must be released with buf_page_release_zip().
NOTE: the page is not protected by any latch. Mutual exclusion has to
be implemented at a higher level. In other words, all possible
accesses to a given page through this function must be protected by
the same set of mutexes or latches.
@param[in]	page_id		page id
@param[in]	page_size	page size
@return pointer to the block, buf-fixed, or NULL on failure */
buf_page_t*
buf_page_get_zip(
	const page_id_t		page_id,
	const page_size_t&	page_size)
{
	buf_page_t*	bpage;
	BPageMutex*	block_mutex;
	rw_lock_t*	hash_lock;
	ibool		discard_attempted = FALSE;
	ibool		must_read;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	buf_pool->stat.n_page_gets++;

	/* Loop: look the page up, and if it is not resident, trigger a
	read from file and retry until the lookup succeeds or the read
	fails. */
	for (;;) {
lookup:

		/* The following call will also grab the page_hash
		mutex if the page is found. */
		bpage = buf_page_hash_get_s_locked(buf_pool, page_id,
						   &hash_lock);
		if (bpage) {
			ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
			break;
		}

		/* Page not in buf_pool: needs to be read from file */

		ut_ad(!hash_lock);
		dberr_t	err = buf_read_page(page_id, page_size);

		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
			ib::error() << "Reading compressed page " << page_id
				<< " failed with error: " << err;

			goto err_exit;
		}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		ut_a(++buf_dbg_counter % 5771 || buf_validate());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
	}

	ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));

	if (!bpage->zip.data) {
		/* There is no compressed page. */
err_exit:
		rw_lock_s_unlock(hash_lock);
		return(NULL);
	}

	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
	case BUF_BLOCK_ZIP_DIRTY:
		/* Compressed-only page: fix it and use the shared
		zip_mutex that protects such pages. */
		buf_block_fix(bpage);
		block_mutex = &buf_pool->zip_mutex;
		goto got_block;
	case BUF_BLOCK_FILE_PAGE:
		/* Discard the uncompressed page frame if possible. */
		if (!discard_attempted) {
			/* Try once to evict the uncompressed copy so
			that only the compressed page remains; then
			retry the lookup. */
			rw_lock_s_unlock(hash_lock);
			buf_block_try_discard_uncompressed(page_id);
			discard_attempted = TRUE;
			goto lookup;
		}

		buf_block_buf_fix_inc((buf_block_t*) bpage,
				      __FILE__, __LINE__);

		block_mutex = &((buf_block_t*) bpage)->mutex;
		goto got_block;
	default:
		break;
	}

	/* Any other state is invalid here. */
	ut_error;
	goto err_exit;

got_block:
	mutex_enter(block_mutex);
	must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;

	rw_lock_s_unlock(hash_lock);

	ut_ad(!bpage->file_page_was_freed);

	buf_page_set_accessed(bpage);

	mutex_exit(block_mutex);

	buf_page_make_young_if_needed(bpage);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(bpage->buf_fix_count > 0);
	ut_a(buf_page_in_file(bpage));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	if (must_read) {
		/* Let us wait until the read operation
		completes */

		for (;;) {
			enum buf_io_fix	io_fix;

			/* Poll the io_fix state under the block mutex;
			the read thread clears BUF_IO_READ on completion. */
			mutex_enter(block_mutex);
			io_fix = buf_page_get_io_fix(bpage);
			mutex_exit(block_mutex);

			if (io_fix == BUF_IO_READ) {

				os_thread_sleep(WAIT_FOR_READ);
			} else {
				break;
			}
		}
	}

	return(bpage);
}
3857
3858 /********************************************************************//**
3859 Initialize some fields of a control block. */
3860 UNIV_INLINE
3861 void
3862 buf_block_init_low(
3863 /*===============*/
3864 buf_block_t* block) /*!< in: block to init */
3865 {
3866 #ifdef BTR_CUR_HASH_ADAPT
3867 /* No adaptive hash index entries may point to a previously
3868 unused (and now freshly allocated) block. */
3869 assert_block_ahi_empty_on_init(block);
3870 block->index = NULL;
3871
3872 block->n_hash_helps = 0;
3873 block->n_fields = 1;
3874 block->n_bytes = 0;
3875 block->left_side = TRUE;
3876 #endif /* BTR_CUR_HASH_ADAPT */
3877 }
3878
3879 /********************************************************************//**
3880 Decompress a block.
3881 @return TRUE if successful */
3882 ibool
3883 buf_zip_decompress(
3884 /*===============*/
3885 buf_block_t* block, /*!< in/out: block */
3886 ibool check) /*!< in: TRUE=verify the page checksum */
3887 {
3888 const byte* frame = block->page.zip.data;
3889 ulint size = page_zip_get_size(&block->page.zip);
3890 /* The tablespace will not be found if this function is called
3891 during IMPORT. */
3892 fil_space_t* space = fil_space_acquire_for_io(block->page.id.space());
3893 const unsigned key_version = mach_read_from_4(
3894 frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
3895 fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
3896 const bool encrypted = crypt_data
3897 && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
3898 && (!crypt_data->is_default_encryption()
3899 || srv_encrypt_tables);
3900
3901 ut_ad(block->page.size.is_compressed());
3902 ut_a(block->page.id.space() != 0);
3903
3904 if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
3905
3906 ib::error() << "Compressed page checksum mismatch for "
3907 << (space ? space->chain.start->name : "")
3908 << block->page.id << ": stored: "
3909 << mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
3910 << ", crc32: "
3911 << page_zip_calc_checksum(
3912 frame, size, SRV_CHECKSUM_ALGORITHM_CRC32)
3913 #ifdef INNODB_BUG_ENDIAN_CRC32
3914 << "/"
3915 << page_zip_calc_checksum(
3916 frame, size, SRV_CHECKSUM_ALGORITHM_CRC32,
3917 true)
3918 #endif
3919 << " innodb: "
3920 << page_zip_calc_checksum(
3921 frame, size, SRV_CHECKSUM_ALGORITHM_INNODB)
3922 << ", none: "
3923 << page_zip_calc_checksum(
3924 frame, size, SRV_CHECKSUM_ALGORITHM_NONE)
3925 << " (algorithm: " << srv_checksum_algorithm << ")";
3926
3927 goto err_exit;
3928 }
3929
3930 switch (fil_page_get_type(frame)) {
3931 case FIL_PAGE_INDEX:
3932 case FIL_PAGE_RTREE:
3933 if (page_zip_decompress(&block->page.zip,
3934 block->frame, TRUE)) {
3935 if (space) {
3936 space->release_for_io();
3937 }
3938 return(TRUE);
3939 }
3940
3941 ib::error() << "Unable to decompress "
3942 << (space ? space->chain.start->name : "")
3943 << block->page.id;
3944 goto err_exit;
3945 case FIL_PAGE_TYPE_ALLOCATED:
3946 case FIL_PAGE_INODE:
3947 case FIL_PAGE_IBUF_BITMAP:
3948 case FIL_PAGE_TYPE_FSP_HDR:
3949 case FIL_PAGE_TYPE_XDES:
3950 case FIL_PAGE_TYPE_ZBLOB:
3951 case FIL_PAGE_TYPE_ZBLOB2:
3952 /* Copy to uncompressed storage. */
3953 memcpy(block->frame, frame, block->page.size.physical());
3954 if (space) {
3955 space->release_for_io();
3956 }
3957
3958 return(TRUE);
3959 }
3960
3961 ib::error() << "Unknown compressed page type "
3962 << fil_page_get_type(frame)
3963 << " in " << (space ? space->chain.start->name : "")
3964 << block->page.id;
3965
3966 err_exit:
3967 if (encrypted) {
3968 ib::info() << "Row compressed page could be encrypted"
3969 " with key_version " << key_version;
3970 }
3971
3972 if (space) {
3973 if (encrypted) {
3974 dict_set_encrypted_by_space(space);
3975 } else {
3976 dict_set_corrupted_by_space(space);
3977 }
3978
3979 space->release_for_io();
3980 }
3981
3982 return(FALSE);
3983 }
3984
3985 #ifdef BTR_CUR_HASH_ADAPT
3986 /** Get a buffer block from an adaptive hash index pointer.
3987 This function does not return if the block is not identified.
3988 @param[in] ptr pointer to within a page frame
3989 @return pointer to block, never NULL */
3990 buf_block_t*
3991 buf_block_from_ahi(const byte* ptr)
3992 {
3993 buf_pool_chunk_map_t::iterator it;
3994
3995 buf_pool_chunk_map_t* chunk_map = buf_chunk_map_ref;
3996 ut_ad(buf_chunk_map_ref == buf_chunk_map_reg);
3997 ut_ad(!buf_pool_resizing);
3998
3999 buf_chunk_t* chunk;
4000 it = chunk_map->upper_bound(ptr);
4001
4002 ut_a(it != chunk_map->begin());
4003
4004 if (it == chunk_map->end()) {
4005 chunk = chunk_map->rbegin()->second;
4006 } else {
4007 chunk = (--it)->second;
4008 }
4009
4010 ulint offs = ulint(ptr - chunk->blocks->frame);
4011
4012 offs >>= srv_page_size_shift;
4013
4014 ut_a(offs < chunk->size);
4015
4016 buf_block_t* block = &chunk->blocks[offs];
4017
4018 /* The function buf_chunk_init() invokes buf_block_init() so that
4019 block[n].frame == block->frame + n * srv_page_size. Check it. */
4020 ut_ad(block->frame == page_align(ptr));
4021 /* Read the state of the block without holding a mutex.
4022 A state transition from BUF_BLOCK_FILE_PAGE to
4023 BUF_BLOCK_REMOVE_HASH is possible during this execution. */
4024 ut_d(const buf_page_state state = buf_block_get_state(block));
4025 ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
4026 return(block);
4027 }
4028 #endif /* BTR_CUR_HASH_ADAPT */
4029
4030 /********************************************************************//**
4031 Find out if a pointer belongs to a buf_block_t. It can be a pointer to
4032 the buf_block_t itself or a member of it
4033 @return TRUE if ptr belongs to a buf_block_t struct */
4034 ibool
4035 buf_pointer_is_block_field(
4036 /*=======================*/
4037 const void* ptr) /*!< in: pointer not dereferenced */
4038 {
4039 ulint i;
4040
4041 for (i = 0; i < srv_buf_pool_instances; i++) {
4042 if (buf_pool_from_array(i)->is_block_field(ptr)) {
4043 return(TRUE);
4044 }
4045 }
4046
4047 return(FALSE);
4048 }
4049
4050 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
4051 /********************************************************************//**
4052 Return true if probe is enabled.
4053 @return true if probe enabled. */
4054 static
4055 bool
4056 buf_debug_execute_is_force_flush()
4057 /*==============================*/
4058 {
4059 DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );
4060
4061 /* This is used during queisce testing, we want to ensure maximum
4062 buffering by the change buffer. */
4063
4064 if (srv_ibuf_disable_background_merge) {
4065 return(true);
4066 }
4067
4068 return(false);
4069 }
4070 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
4071
/** Wait for the block to be read in.
Spins until the block's I/O fix state is no longer BUF_IO_READ,
using a short S-latch acquisition on block->lock as the wait primitive.
@param[in]	block	The block to check */
static
void
buf_wait_for_read(
	buf_block_t*	block)
{
	/* Note:

	We are using the block->lock to check for IO state (and a dirty read).
	We set the IO_READ state under the protection of the hash_lock
	(and block->mutex). This is safe because another thread can only
	access the block (and check for IO state) after the block has been
	added to the page hashtable. */

	if (buf_block_get_io_fix(block) == BUF_IO_READ) {

		/* Wait until the read operation completes */

		BPageMutex*	mutex = buf_page_get_mutex(&block->page);

		for (;;) {
			buf_io_fix	io_fix;

			/* Re-read the I/O fix state under the block
			mutex to avoid acting on a stale value. */
			mutex_enter(mutex);

			io_fix = buf_block_get_io_fix(block);

			mutex_exit(mutex);

			if (io_fix == BUF_IO_READ) {
				/* Wait by temporarily taking and
				releasing an S-latch on the block. */
				rw_lock_s_lock(&block->lock);
				rw_lock_s_unlock(&block->lock);
			} else {
				break;
			}
		}
	}
}
4112
4113 #ifdef BTR_CUR_HASH_ADAPT
/** If a stale adaptive hash index exists on the block, drop it.
Multiple executions of btr_search_drop_page_hash_index() on the
same block must be prevented by exclusive page latch.
For S and SX latch holders, the held latch is temporarily released
so that the required X-latch can be acquired; the original latch
mode is restored before returning.
@param[in,out]	block		block that may carry a stale AHI
@param[in]	fix_type	latch that the caller holds on block */
ATTRIBUTE_COLD
static void buf_defer_drop_ahi(buf_block_t *block, mtr_memo_type_t fix_type)
{
  switch (fix_type) {
  case MTR_MEMO_BUF_FIX:
    /* We do not drop the adaptive hash index, because safely doing
    so would require acquiring block->lock, and that is not safe
    to acquire in some RW_NO_LATCH access paths. Those code paths
    should have no business accessing the adaptive hash index anyway. */
    break;
  case MTR_MEMO_PAGE_S_FIX:
    /* Temporarily release our S-latch. */
    rw_lock_s_unlock(&block->lock);
    rw_lock_x_lock(&block->lock);
    /* Re-check under the X-latch: block->index may have changed
    while the latch was released. */
    if (dict_index_t *index= block->index)
      if (index->freed())
        btr_search_drop_page_hash_index(block);
    rw_lock_x_unlock(&block->lock);
    rw_lock_s_lock(&block->lock);
    break;
  case MTR_MEMO_PAGE_SX_FIX:
    /* Temporarily release our SX-latch, as above. */
    rw_lock_sx_unlock(&block->lock);
    rw_lock_x_lock(&block->lock);
    if (dict_index_t *index= block->index)
      if (index->freed())
        btr_search_drop_page_hash_index(block);
    rw_lock_x_unlock(&block->lock);
    rw_lock_sx_lock(&block->lock);
    break;
  default:
    /* The caller already holds an exclusive latch; drop directly. */
    ut_ad(fix_type == MTR_MEMO_PAGE_X_FIX);
    btr_search_drop_page_hash_index(block);
  }
}
4151 #endif /* BTR_CUR_HASH_ADAPT */
4152
/** Lock the page with the given latch type.
Acquires the requested latch on block->lock (no latch is taken for
RW_NO_LATCH), drops any stale adaptive hash index for latched
access modes, and registers the block in the mini-transaction.
@param[in,out]	block	block to be locked
@param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in]	mtr	mini-transaction
@param[in]	file	file name
@param[in]	line	line where called
@return pointer to locked block */
static buf_block_t* buf_page_mtr_lock(buf_block_t *block,
                                      ulint rw_latch,
                                      mtr_t* mtr,
                                      const char *file,
                                      unsigned line)
{
  mtr_memo_type_t fix_type;
  switch (rw_latch)
  {
  case RW_NO_LATCH:
    /* Buffer-fix only; no page latch is taken, and no AHI
    maintenance is attempted (see buf_defer_drop_ahi()). */
    fix_type= MTR_MEMO_BUF_FIX;
    goto done;
  case RW_S_LATCH:
    rw_lock_s_lock_inline(&block->lock, 0, file, line);
    fix_type= MTR_MEMO_PAGE_S_FIX;
    break;
  case RW_SX_LATCH:
    rw_lock_sx_lock_inline(&block->lock, 0, file, line);
    fix_type= MTR_MEMO_PAGE_SX_FIX;
    break;
  default:
    ut_ad(rw_latch == RW_X_LATCH);
    rw_lock_x_lock_inline(&block->lock, 0, file, line);
    fix_type= MTR_MEMO_PAGE_X_FIX;
    break;
  }

#ifdef BTR_CUR_HASH_ADAPT
  {
    /* If the index that built the adaptive hash index on this
    block has been freed, the AHI entries are stale; drop them. */
    dict_index_t *index= block->index;
    if (index && index->freed())
      buf_defer_drop_ahi(block, fix_type);
  }
#endif /* BTR_CUR_HASH_ADAPT */

done:
  mtr_memo_push(mtr, block, fix_type);
  return block;
}
4199
/** This is the low level function used to get access to a database page.
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in]	guess		guessed block or NULL
@param[in]	mode		BUF_GET, BUF_GET_IF_IN_POOL,
BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
@param[in]	file		file name
@param[in]	line		line where called
@param[in]	mtr		mini-transaction
@param[out]	err		DB_SUCCESS or error code; may be NULL
@return pointer to the block or NULL */
buf_block_t*
buf_page_get_low(
	const page_id_t		page_id,
	const page_size_t&	page_size,
	ulint			rw_latch,
	buf_block_t*		guess,
	ulint			mode,
	const char*		file,
	unsigned		line,
	mtr_t*			mtr,
	dberr_t*		err)
{
	buf_block_t*	block;
	unsigned	access_time;
	rw_lock_t*	hash_lock;
	buf_block_t*	fix_block;
	ulint		retries = 0;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL));
	ut_ad(!mtr || mtr->is_active());
	ut_ad((rw_latch == RW_S_LATCH)
	      || (rw_latch == RW_X_LATCH)
	      || (rw_latch == RW_SX_LATCH)
	      || (rw_latch == RW_NO_LATCH));

	if (err) {
		*err = DB_SUCCESS;
	}

#ifdef UNIV_DEBUG
	switch (mode) {
	case BUF_EVICT_IF_IN_POOL:
		/* After DISCARD TABLESPACE, the tablespace would not exist,
		but in IMPORT TABLESPACE, PageConverter::operator() must
		replace any old pages, which were not evicted during DISCARD.
		Skip the assertion on space_page_size. */
		break;
	case BUF_PEEK_IF_IN_POOL:
	case BUF_GET_IF_IN_POOL:
		/* The caller may pass a dummy page size,
		because it does not really matter. */
		break;
	default:
		ut_error;
	case BUF_GET_NO_LATCH:
		ut_ad(rw_latch == RW_NO_LATCH);
		/* fall through */
	case BUF_GET:
	case BUF_GET_IF_IN_POOL_OR_WATCH:
	case BUF_GET_POSSIBLY_FREED:
		/* Verify that the caller's page size matches the
		tablespace's actual page size. */
		bool		found;
		const page_size_t&	space_page_size
			= fil_space_get_page_size(page_id.space(), &found);
		ut_ad(found);
		ut_ad(page_size.equals_to(space_page_size));
	}
#endif /* UNIV_DEBUG */

	ut_ad(!mtr || !ibuf_inside(mtr)
	      || ibuf_page_low(page_id, page_size, FALSE, file, line, NULL));

	buf_pool->stat.n_page_gets++;
	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
loop:
	/* Look up the page in the page hash, starting from the
	caller's guess if one was given.  This label is re-entered
	whenever a retry is needed (read failure, I/O-fixed block,
	zip-page relocation race, debug eviction). */
	block = guess;

	rw_lock_s_lock(hash_lock);

	/* If not own buf_pool_mutex, page_hash can be changed. */
	hash_lock = buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id);

	if (block != NULL) {

		/* If the guess is a compressed page descriptor that
		has been allocated by buf_page_alloc_descriptor(),
		it may have been freed by buf_relocate(). */

		if (!buf_pool->is_block_field(block)
		    || page_id != block->page.id
		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {

			/* Our guess was bogus or things have changed
			since. */
			block = guess = NULL;
		} else {
			ut_ad(!block->page.in_zip_hash);
		}
	}

	if (block == NULL) {
		block = (buf_block_t*) buf_page_hash_get_low(buf_pool, page_id);
	}

	if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
		rw_lock_s_unlock(hash_lock);
		block = NULL;
	}

	if (block == NULL) {

		/* Page not in buf_pool: needs to be read from file */

		if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
			rw_lock_x_lock(hash_lock);

			/* If not own buf_pool_mutex,
			page_hash can be changed. */
			hash_lock = buf_page_hash_lock_x_confirm(
				hash_lock, buf_pool, page_id);

			block = (buf_block_t*) buf_pool_watch_set(
				page_id, &hash_lock);

			if (block) {
				/* We can release hash_lock after we
				increment the fix count to make
				sure that no state change takes place. */
				fix_block = block;

				if (fsp_is_system_temporary(page_id.space())) {
					/* For temporary tablespace,
					the mutex is being used for
					synchronization between user
					thread and flush thread,
					instead of block->lock. See
					buf_flush_page() for the flush
					thread counterpart. */

					BPageMutex*	fix_mutex
						= buf_page_get_mutex(
							&fix_block->page);
					mutex_enter(fix_mutex);
					buf_block_fix(fix_block);
					mutex_exit(fix_mutex);
				} else {
					buf_block_fix(fix_block);
				}

				/* Now safe to release page_hash mutex */
				rw_lock_x_unlock(hash_lock);
				goto got_block;
			}

			rw_lock_x_unlock(hash_lock);
		}

		/* The "if in pool" modes must not trigger a read. */
		switch (mode) {
		case BUF_GET_IF_IN_POOL:
		case BUF_GET_IF_IN_POOL_OR_WATCH:
		case BUF_PEEK_IF_IN_POOL:
		case BUF_EVICT_IF_IN_POOL:
			ut_ad(!rw_lock_own_flagged(
				      hash_lock,
				      RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
			return(NULL);
		}

		/* The call path is buf_read_page() ->
		buf_read_page_low() (fil_io()) ->
		buf_page_io_complete() ->
		buf_decrypt_after_read(). Here fil_space_t* is used
		and we decrypt -> buf_page_check_corrupt() where page
		checksums are compared. Decryption, decompression as
		well as error handling takes place at a lower level.
		Here we only need to know whether the page really is
		corrupted, or if an encrypted page with a valid
		checksum cannot be decrypted. */

		dberr_t local_err = buf_read_page(page_id, page_size);

		if (local_err == DB_SUCCESS) {
			buf_read_ahead_random(page_id, page_size,
					      ibuf_inside(mtr));

			retries = 0;
		} else if (mode == BUF_GET_POSSIBLY_FREED) {
			if (err) {
				*err = local_err;
			}
			return NULL;
		} else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
			++retries;

			DBUG_EXECUTE_IF(
				"innodb_page_corruption_retries",
				retries = BUF_PAGE_READ_MAX_RETRIES;
			);
		} else {
			if (err) {
				*err = local_err;
			}

			/* Pages whose encryption key is unavailable or used
			key, encryption algorithm or encryption method is
			incorrect are marked as encrypted in
			buf_page_check_corrupt(). Unencrypted page could be
			corrupted in a way where the key_id field is
			nonzero. There is no checksum on field
			FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION. */
			if (local_err == DB_DECRYPTION_FAILED) {
				return (NULL);
			}

			if (local_err == DB_PAGE_CORRUPTED
			    && srv_force_recovery) {
				return NULL;
			}

			/* Try to set table as corrupted instead of
			asserting. */
			if (page_id.space() == TRX_SYS_SPACE) {
			} else if (page_id.space() == SRV_TMP_SPACE_ID) {
			} else if (fil_space_t* space
				   = fil_space_acquire_for_io(
					   page_id.space())) {
				bool set = dict_set_corrupted_by_space(space);
				space->release_for_io();
				if (set) {
					return NULL;
				}
			}

			if (local_err == DB_IO_ERROR) {
				return NULL;
			}

			ib::fatal() << "Unable to read page " << page_id
				<< " into the buffer pool after "
				<< BUF_PAGE_READ_MAX_RETRIES
				<< ". The most probable cause"
				" of this error may be that the"
				" table has been corrupted."
				" See https://mariadb.com/kb/en/library/innodb-recovery-modes/";
		}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		ut_a(++buf_dbg_counter % 5771 || buf_validate());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
		goto loop;
	} else {
		fix_block = block;
	}

	if (fsp_is_system_temporary(page_id.space())) {
		/* For temporary tablespace, the mutex is being used
		for synchronization between user thread and flush
		thread, instead of block->lock. See buf_flush_page()
		for the flush thread counterpart. */
		BPageMutex*	fix_mutex = buf_page_get_mutex(
			&fix_block->page);
		mutex_enter(fix_mutex);
		buf_block_fix(fix_block);
		mutex_exit(fix_mutex);
	} else {
		buf_block_fix(fix_block);
	}

	/* Now safe to release page_hash mutex */
	rw_lock_s_unlock(hash_lock);

got_block:
	/* At this point fix_block is buffer-fixed and no hash lock
	is held. */

	switch (mode) {
	case BUF_GET_IF_IN_POOL:
	case BUF_PEEK_IF_IN_POOL:
	case BUF_EVICT_IF_IN_POOL:
		buf_page_t*	fix_page = &fix_block->page;
		BPageMutex*	fix_mutex = buf_page_get_mutex(fix_page);
		mutex_enter(fix_mutex);
		const bool	must_read
			= (buf_page_get_io_fix(fix_page) == BUF_IO_READ);
		mutex_exit(fix_mutex);

		if (must_read) {
			/* The page is being read to buffer pool,
			but we cannot wait around for the read to
			complete. */
			buf_block_unfix(fix_block);

			return(NULL);
		}
	}

	switch (buf_block_get_state(fix_block)) {
		buf_page_t*	bpage;

	case BUF_BLOCK_FILE_PAGE:
		bpage = &block->page;
		if (fsp_is_system_temporary(page_id.space())
		    && buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
			/* This suggests that the page is being flushed.
			Avoid returning reference to this page.
			Instead wait for the flush action to complete. */
			buf_block_unfix(fix_block);
			os_thread_sleep(WAIT_FOR_WRITE);
			goto loop;
		}

		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
evict_from_pool:
			ut_ad(!fix_block->page.oldest_modification);
			buf_pool_mutex_enter(buf_pool);
			buf_block_unfix(fix_block);

			if (!buf_LRU_free_page(&fix_block->page, true)) {
				ut_ad(0);
			}

			buf_pool_mutex_exit(buf_pool);
			return(NULL);
		}

		break;

	case BUF_BLOCK_ZIP_PAGE:
	case BUF_BLOCK_ZIP_DIRTY:
		/* The page exists only in compressed form; it must be
		relocated into an uncompressed block and decompressed
		before it can be returned. */
		if (mode == BUF_PEEK_IF_IN_POOL) {
			/* This mode is only used for dropping an
			adaptive hash index. There cannot be an
			adaptive hash index for a compressed-only
			page, so do not bother decompressing the page. */
			buf_block_unfix(fix_block);

			return(NULL);
		}

		bpage = &block->page;

		/* Note: We have already buffer fixed this block. */
		if (bpage->buf_fix_count > 1
		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {

			/* This condition often occurs when the buffer
			is not buffer-fixed, but I/O-fixed by
			buf_page_init_for_read(). */
			buf_block_unfix(fix_block);

			/* The block is buffer-fixed or I/O-fixed.
			Try again later. */
			os_thread_sleep(WAIT_FOR_READ);

			goto loop;
		}

		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
			goto evict_from_pool;
		}

		/* Buffer-fix the block so that it cannot be evicted
		or relocated while we are attempting to allocate an
		uncompressed page. */

		block = buf_LRU_get_free_block(buf_pool);

		buf_pool_mutex_enter(buf_pool);

		/* If not own buf_pool_mutex, page_hash can be changed. */
		hash_lock = buf_page_hash_lock_get(buf_pool, page_id);

		rw_lock_x_lock(hash_lock);

		/* Buffer-fixing prevents the page_hash from changing. */
		ut_ad(bpage == buf_page_hash_get_low(buf_pool, page_id));

		buf_block_unfix(fix_block);

		buf_page_mutex_enter(block);
		mutex_enter(&buf_pool->zip_mutex);

		fix_block = block;

		if (bpage->buf_fix_count > 0
		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {

			mutex_exit(&buf_pool->zip_mutex);
			/* The block was buffer-fixed or I/O-fixed while
			buf_pool->mutex was not held by this thread.
			Free the block that was allocated and retry.
			This should be extremely unlikely, for example,
			if buf_page_get_zip() was invoked. */

			buf_LRU_block_free_non_file_page(block);
			buf_pool_mutex_exit(buf_pool);
			rw_lock_x_unlock(hash_lock);
			buf_page_mutex_exit(block);

			/* Try again */
			goto loop;
		}

		/* Move the compressed page from bpage to block,
		and uncompress it. */

		/* Note: this is the uncompressed block and it is not
		accessible by other threads yet because it is not in
		any list or hash table */
		buf_relocate(bpage, &block->page);

		buf_block_init_low(block);

		/* Set after buf_relocate(). */
		block->page.buf_fix_count = 1;

		block->lock_hash_val = lock_rec_hash(page_id.space(),
						     page_id.page_no());

		if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) {
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
			UT_LIST_REMOVE(buf_pool->zip_clean, &block->page);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
			ut_ad(!block->page.in_flush_list);
		} else {
			/* Relocate buf_pool->flush_list. */
			buf_flush_relocate_on_flush_list(bpage, &block->page);
		}

		/* Buffer-fix, I/O-fix, and X-latch the block
		for the duration of the decompression.
		Also add the block to the unzip_LRU list. */
		block->page.state = BUF_BLOCK_FILE_PAGE;

		/* Insert at the front of unzip_LRU list */
		buf_unzip_LRU_add_block(block, FALSE);

		buf_block_set_io_fix(block, BUF_IO_READ);
		rw_lock_x_lock_inline(&block->lock, 0, file, line);

		MEM_UNDEFINED(bpage, sizeof *bpage);

		rw_lock_x_unlock(hash_lock);
		buf_pool->n_pend_unzip++;
		mutex_exit(&buf_pool->zip_mutex);
		buf_pool_mutex_exit(buf_pool);

		access_time = buf_page_is_accessed(&block->page);

		buf_page_mutex_exit(block);

		buf_page_free_descriptor(bpage);

		/* Decompress the page while not holding
		buf_pool->mutex or block->mutex. */

		{
			bool	success = buf_zip_decompress(block, false);

			if (!success) {
				/* Decompression failed: undo the
				I/O fix and buffer fix, and report
				the page as corrupted. */
				buf_pool_mutex_enter(buf_pool);
				buf_page_mutex_enter(fix_block);
				buf_block_set_io_fix(fix_block, BUF_IO_NONE);
				buf_page_mutex_exit(fix_block);

				--buf_pool->n_pend_unzip;
				buf_block_unfix(fix_block);
				buf_pool_mutex_exit(buf_pool);
				rw_lock_x_unlock(&fix_block->lock);

				if (err) {
					*err = DB_PAGE_CORRUPTED;
				}
				return NULL;
			}
		}

		if (!access_time && !recv_no_ibuf_operations) {
			ibuf_merge_or_delete_for_page(
				block, page_id, page_size);
		}

		buf_pool_mutex_enter(buf_pool);

		buf_page_mutex_enter(fix_block);

		buf_block_set_io_fix(fix_block, BUF_IO_NONE);

		buf_page_mutex_exit(fix_block);

		--buf_pool->n_pend_unzip;

		buf_pool_mutex_exit(buf_pool);

		rw_lock_x_unlock(&block->lock);

		break;

	case BUF_BLOCK_POOL_WATCH:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;
		break;
	}

	ut_ad(block == fix_block);
	ut_ad(fix_block->page.buf_fix_count > 0);

	ut_ad(!rw_lock_own_flagged(hash_lock,
				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));

	ut_ad(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);

#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG

	if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
	    && (ibuf_debug || buf_debug_execute_is_force_flush())) {

		/* Try to evict the block from the buffer pool, to use the
		insert buffer (change buffer) as much as possible. */

		buf_pool_mutex_enter(buf_pool);

		buf_block_unfix(fix_block);

		/* Now we are only holding the buf_pool->mutex,
		not block->mutex or hash_lock. Blocks cannot be
		relocated or enter or exit the buf_pool while we
		are holding the buf_pool->mutex. */

		if (buf_LRU_free_page(&fix_block->page, true)) {

			buf_pool_mutex_exit(buf_pool);

			/* If not own buf_pool_mutex,
			page_hash can be changed. */
			hash_lock = buf_page_hash_lock_get(buf_pool, page_id);

			rw_lock_x_lock(hash_lock);

			/* If not own buf_pool_mutex,
			page_hash can be changed. */
			hash_lock = buf_page_hash_lock_x_confirm(
				hash_lock, buf_pool, page_id);

			if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
				/* Set the watch, as it would have
				been set if the page were not in the
				buffer pool in the first place. */
				block = (buf_block_t*) buf_pool_watch_set(
					page_id, &hash_lock);
			} else {
				block = (buf_block_t*) buf_page_hash_get_low(
					buf_pool, page_id);
			}

			rw_lock_x_unlock(hash_lock);

			if (block != NULL) {
				/* Either the page has been read in or
				a watch was set on that in the window
				where we released the buf_pool::mutex
				and before we acquire the hash_lock
				above. Try again. */
				guess = block;

				goto loop;
			}

			return(NULL);
		}

		buf_page_mutex_enter(fix_block);

		if (buf_flush_page_try(buf_pool, fix_block)) {
			guess = fix_block;

			goto loop;
		}

		buf_page_mutex_exit(fix_block);

		buf_block_fix(fix_block);

		/* Failed to evict the page; change it directly */

		buf_pool_mutex_exit(buf_pool);
	}
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */

	ut_ad(fix_block->page.buf_fix_count > 0);

#ifdef UNIV_DEBUG
	/* We have already buffer fixed the page, and we are committed to
	returning this page to the caller. Register for debugging.
	Avoid debug latching if page/block belongs to system temporary
	tablespace (Not much needed for table with single threaded access.). */
	if (!fsp_is_system_temporary(page_id.space())) {
		ibool   ret;
		ret = rw_lock_s_lock_nowait(
			&fix_block->debug_latch, file, line);
		ut_a(ret);
	}
#endif /* UNIV_DEBUG */

	/* While tablespace is reinited the indexes are already freed but the
	blocks related to it still resides in buffer pool. Trying to remove
	such blocks from buffer pool would invoke removal of AHI entries
	associated with these blocks. Logic to remove AHI entry will try to
	load the block but block is already in free state. Handle the said case
	with mode = BUF_PEEK_IF_IN_POOL that is invoked from
	"btr_search_drop_page_hash_when_freed". */
	ut_ad(mode == BUF_GET_POSSIBLY_FREED
	      || mode == BUF_PEEK_IF_IN_POOL
	      || !fix_block->page.file_page_was_freed);

	/* Check if this is the first access to the page */
	access_time = buf_page_is_accessed(&fix_block->page);

	/* This is a heuristic and we don't care about ordering issues. */
	if (access_time == 0) {
		buf_page_mutex_enter(fix_block);

		buf_page_set_accessed(&fix_block->page);

		buf_page_mutex_exit(fix_block);
	}

	if (mode != BUF_PEEK_IF_IN_POOL) {
		buf_page_make_young_if_needed(&fix_block->page);
	}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	/* We have to wait here because the IO_READ state was set
	under the protection of the hash_lock and not the block->mutex
	and block->lock. */
	buf_wait_for_read(fix_block);

	if (fix_block->page.id != page_id) {
		/* The block was relocated or replaced while we were
		not holding any latch on it; give up. */

		buf_block_unfix(fix_block);

#ifdef UNIV_DEBUG
		if (!fsp_is_system_temporary(page_id.space())) {
			rw_lock_s_unlock(&fix_block->debug_latch);
		}
#endif /* UNIV_DEBUG */

		if (err) {
			*err = DB_PAGE_CORRUPTED;
		}

		return NULL;
	}

	fix_block = buf_page_mtr_lock(fix_block, rw_latch, mtr, file, line);

	if (mode != BUF_PEEK_IF_IN_POOL && !access_time) {
		/* In the case of a first access, try to apply linear
		read-ahead */

		buf_read_ahead_linear(page_id, page_size, ibuf_inside(mtr));
	}

	ut_ad(!rw_lock_own_flagged(hash_lock,
				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));

	return(fix_block);
}
4873
4874 /** This is the general function used to get access to a database page.
4875 It does page initialization and applies the buffered redo logs.
4876 @param[in] page_id page id
4877 @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
4878 @param[in] guess guessed block or NULL
4879 @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
4880 BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
4881 @param[in] file file name
4882 @param[in] line line where called
4883 @param[in] mtr mini-transaction
4884 @param[out] err DB_SUCCESS or error code
4885 @return pointer to the block or NULL */
4886 buf_block_t*
4887 buf_page_get_gen(
4888 const page_id_t page_id,
4889 const page_size_t& page_size,
4890 ulint rw_latch,
4891 buf_block_t* guess,
4892 ulint mode,
4893 const char* file,
4894 unsigned line,
4895 mtr_t* mtr,
4896 dberr_t* err)
4897 {
4898 if (buf_block_t *block = recv_recovery_create_page(page_id))
4899 {
4900 buf_block_fix(block);
4901 ut_ad(rw_lock_s_lock_nowait(&block->debug_latch, file, line));
4902 block= buf_page_mtr_lock(block, rw_latch, mtr, file, line);
4903 return block;
4904 }
4905
4906 return buf_page_get_low(page_id, page_size, rw_latch,
4907 guess, mode, file, line, mtr, err);
4908 }
4909
/********************************************************************//**
This is the general function used to get optimistic access to a database
page. The caller supplies a block pointer it obtained earlier together
with the modify clock value it observed; access succeeds only if the
block still holds the same page (the modify clock is unchanged) and the
requested latch can be acquired without waiting.
@return TRUE if success */
ibool
buf_page_optimistic_get(
/*====================*/
	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
	buf_block_t*	block,	/*!< in: guessed buffer block */
	ib_uint64_t	modify_clock,/*!< in: modify clock value */
	const char*	file,	/*!< in: file name */
	unsigned	line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mini-transaction */
{
	buf_pool_t*	buf_pool;
	unsigned	access_time;
	ibool		success;

	ut_ad(block);
	ut_ad(mtr);
	ut_ad(mtr->is_active());
	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

	buf_page_mutex_enter(block);

	/* The guessed block may have been evicted or repurposed
	since the caller last saw it. */
	if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {

		buf_page_mutex_exit(block);

		return(FALSE);
	}

	/* Buffer-fix under the block mutex so that the block cannot
	be relocated while we attempt to latch it. */
	buf_block_buf_fix_inc(block, file, line);

	access_time = buf_page_is_accessed(&block->page);

	buf_page_set_accessed(&block->page);

	buf_page_mutex_exit(block);

	buf_page_make_young_if_needed(&block->page);

	ut_ad(!ibuf_inside(mtr)
	      || ibuf_page(block->page.id, block->page.size, NULL));

	mtr_memo_type_t	fix_type;

	switch (rw_latch) {
	case RW_S_LATCH:
		success = rw_lock_s_lock_nowait(&block->lock, file, line);

		fix_type = MTR_MEMO_PAGE_S_FIX;
		break;
	case RW_X_LATCH:
		success = rw_lock_x_lock_func_nowait_inline(
			&block->lock, file, line);

		fix_type = MTR_MEMO_PAGE_X_FIX;
		break;
	default:
		ut_error; /* RW_SX_LATCH is not implemented yet */
	}

	if (!success) {
		/* Could not latch without waiting: optimistic
		access fails. */
		buf_block_buf_fix_dec(block);
		return(FALSE);
	}

	if (modify_clock != block->modify_clock) {
		/* The page was modified (or replaced) since the
		caller recorded modify_clock: release and fail. */

		buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);

		if (rw_latch == RW_S_LATCH) {
			rw_lock_s_unlock(&block->lock);
		} else {
			rw_lock_x_unlock(&block->lock);
		}

		buf_block_buf_fix_dec(block);
		return(FALSE);
	}

	mtr_memo_push(mtr, block, fix_type);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(block->page.buf_fix_count > 0);
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	ut_d(buf_page_mutex_enter(block));
	ut_ad(!block->page.file_page_was_freed);
	ut_d(buf_page_mutex_exit(block));

	if (!access_time) {
		/* In the case of a first access, try to apply linear
		read-ahead */
		buf_read_ahead_linear(block->page.id, block->page.size,
				      ibuf_inside(mtr));
	}

	buf_pool = buf_pool_from_block(block);
	buf_pool->stat.n_page_gets++;

	return(TRUE);
}
5016
/********************************************************************//**
This is used to get access to a known database page, when no waiting can be
done. For example, if a search in an adaptive hash index leads us to this
frame. Fails (without waiting) if the block is being removed from the
page hash or if the requested latch cannot be acquired immediately.
@return TRUE if success */
ibool
buf_page_get_known_nowait(
/*======================*/
	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
	buf_block_t*	block,	/*!< in: the known page */
	ulint		mode,	/*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
	const char*	file,	/*!< in: file name */
	unsigned	line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mini-transaction */
{
	buf_pool_t*	buf_pool;
	ibool		success;

	ut_ad(mtr->is_active());
	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

	buf_page_mutex_enter(block);

	if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
		/* Another thread is just freeing the block from the LRU list
		of the buffer pool: do not try to access this page; this
		attempt to access the page can only come through the hash
		index because when the buffer block state is ..._REMOVE_HASH,
		we have already removed it from the page address hash table
		of the buffer pool. */

		buf_page_mutex_exit(block);

		return(FALSE);
	}

	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	/* Buffer-fix under the block mutex so that the block cannot
	be evicted while we attempt to latch it. */
	buf_block_buf_fix_inc(block, file, line);

	buf_page_set_accessed(&block->page);

	buf_page_mutex_exit(block);

	buf_pool = buf_pool_from_block(block);

#ifdef BTR_CUR_HASH_ADAPT
	if (mode == BUF_MAKE_YOUNG) {
		buf_page_make_young_if_needed(&block->page);
	}
#endif /* BTR_CUR_HASH_ADAPT */

	ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD);

	mtr_memo_type_t	fix_type;

	switch (rw_latch) {
	case RW_S_LATCH:
		success = rw_lock_s_lock_nowait(&block->lock, file, line);
		fix_type = MTR_MEMO_PAGE_S_FIX;
		break;
	case RW_X_LATCH:
		success = rw_lock_x_lock_func_nowait_inline(
			&block->lock, file, line);

		fix_type = MTR_MEMO_PAGE_X_FIX;
		break;
	default:
		ut_error; /* RW_SX_LATCH is not implemented yet */
	}

	if (!success) {
		/* Could not latch without waiting; undo the fix. */
		buf_block_buf_fix_dec(block);
		return(FALSE);
	}

	mtr_memo_push(mtr, block, fix_type);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(block->page.buf_fix_count > 0);
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

#ifdef UNIV_DEBUG
	if (mode != BUF_KEEP_OLD) {
		/* If mode == BUF_KEEP_OLD, we are executing an I/O
		completion routine. Avoid a bogus assertion failure
		when ibuf_merge_or_delete_for_page() is processing a
		page that was just freed due to DROP INDEX, or
		deleting a record from SYS_INDEXES. This check will be
		skipped in recv_recover_page() as well. */

# ifdef BTR_CUR_HASH_ADAPT
		ut_ad(!block->page.file_page_was_freed
		      || (block->index && block->index->freed()));
# else /* BTR_CUR_HASH_ADAPT */
		ut_ad(!block->page.file_page_was_freed);
# endif /* BTR_CUR_HASH_ADAPT */
	}
#endif /* UNIV_DEBUG */

	buf_pool->stat.n_page_gets++;

	return(TRUE);
}
5123
/** Given a tablespace id and page number tries to get that page. If the
page is not in the buffer pool it is not loaded and NULL is returned.
Suitable for use when holding the lock_sys_t::mutex, because only
no-wait latch attempts are made: this function never blocks.
@param[in]	page_id	page id
@param[in]	file	file name
@param[in]	line	line where called
@param[in]	mtr	mini-transaction
@return pointer to a page or NULL */
buf_block_t*
buf_page_try_get_func(
	const page_id_t		page_id,
	const char*		file,
	unsigned		line,
	mtr_t*			mtr)
{
	buf_block_t*	block;
	ibool		success;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	rw_lock_t*	hash_lock;

	ut_ad(mtr);
	ut_ad(mtr->is_active());

	/* Look up the page while holding the page_hash latch in S mode. */
	block = buf_block_hash_get_s_locked(buf_pool, page_id, &hash_lock);

	if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
		/* Page is not resident as an uncompressed file page
		(it may be absent, compressed-only, or a watch
		sentinel); give up without waiting. */
		if (block) {
			rw_lock_s_unlock(hash_lock);
		}
		return(NULL);
	}

	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));

	/* Acquire the block mutex before releasing the hash latch, so
	that the block cannot be freed or relocated in between. */
	buf_page_mutex_enter(block);
	rw_lock_s_unlock(hash_lock);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
	ut_a(page_id == block->page.id);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	/* Buffer-fix the block to keep it from being evicted while we
	attempt to latch it below. */
	buf_block_buf_fix_inc(block, file, line);
	buf_page_mutex_exit(block);

	mtr_memo_type_t	fix_type = MTR_MEMO_PAGE_S_FIX;
	success = rw_lock_s_lock_nowait(&block->lock, file, line);

	if (!success) {
		/* Let us try to get an X-latch. If the current thread
		is holding an X-latch on the page, we cannot get an
		S-latch. */

		fix_type = MTR_MEMO_PAGE_X_FIX;
		success = rw_lock_x_lock_func_nowait_inline(&block->lock,
							    file, line);
	}

	if (!success) {
		/* Neither latch was available without waiting: undo
		the buffer-fix and report failure. */
		buf_block_buf_fix_dec(block);
		return(NULL);
	}

	mtr_memo_push(mtr, block, fix_type);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(block->page.buf_fix_count > 0);
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	ut_d(buf_page_mutex_enter(block));
	ut_d(ut_a(!block->page.file_page_was_freed));
	ut_d(buf_page_mutex_exit(block));

	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);

	buf_pool->stat.n_page_gets++;

	return(block);
}
5205
5206 /********************************************************************//**
5207 Initialize some fields of a control block. */
5208 UNIV_INLINE
5209 void
5210 buf_page_init_low(
5211 /*==============*/
5212 buf_page_t* bpage) /*!< in: block to init */
5213 {
5214 bpage->flush_type = BUF_FLUSH_LRU;
5215 bpage->io_fix = BUF_IO_NONE;
5216 bpage->buf_fix_count = 0;
5217 bpage->old = 0;
5218 bpage->freed_page_clock = 0;
5219 bpage->access_time = 0;
5220 bpage->newest_modification = 0;
5221 bpage->oldest_modification = 0;
5222 bpage->real_size = 0;
5223 bpage->slot = NULL;
5224
5225 HASH_INVALIDATE(bpage, hash);
5226
5227 ut_d(bpage->file_page_was_freed = FALSE);
5228 }
5229
5230 /** Inits a page to the buffer buf_pool.
5231 @param[in,out] buf_pool buffer pool
5232 @param[in] page_id page id
5233 @param[in,out] block block to init */
5234 static
5235 void
5236 buf_page_init(
5237 buf_pool_t* buf_pool,
5238 const page_id_t page_id,
5239 const page_size_t& page_size,
5240 buf_block_t* block)
5241 {
5242 buf_page_t* hash_page;
5243
5244 ut_ad(buf_pool == buf_pool_get(page_id));
5245 ut_ad(buf_pool_mutex_own(buf_pool));
5246
5247 ut_ad(buf_page_mutex_own(block));
5248 ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
5249
5250 ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, page_id),
5251 RW_LOCK_X));
5252
5253 /* Set the state of the block */
5254 buf_block_set_file_page(block, page_id);
5255
5256 buf_block_init_low(block);
5257
5258 block->lock_hash_val = lock_rec_hash(page_id.space(),
5259 page_id.page_no());
5260
5261 buf_page_init_low(&block->page);
5262
5263 /* Insert into the hash table of file pages */
5264
5265 hash_page = buf_page_hash_get_low(buf_pool, page_id);
5266
5267 if (hash_page == NULL) {
5268 /* Block not found in hash table */
5269 } else if (UNIV_LIKELY(buf_pool_watch_is_sentinel(buf_pool,
5270 hash_page))) {
5271 /* Preserve the reference count. */
5272 ib_uint32_t buf_fix_count = hash_page->buf_fix_count;
5273
5274 ut_a(buf_fix_count > 0);
5275
5276 my_atomic_add32((int32*) &block->page.buf_fix_count, buf_fix_count);
5277
5278 buf_pool_watch_remove(buf_pool, hash_page);
5279 } else {
5280 ib::fatal() << "Page already foudn in the hash table: "
5281 << page_id;
5282 }
5283
5284 ut_ad(!block->page.in_zip_hash);
5285 ut_ad(!block->page.in_page_hash);
5286 ut_d(block->page.in_page_hash = TRUE);
5287
5288 block->page.id = page_id;
5289 block->page.size.copy_from(page_size);
5290
5291 HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
5292 page_id.fold(), &block->page);
5293
5294 if (page_size.is_compressed()) {
5295 page_zip_set_size(&block->page.zip, page_size.physical());
5296 }
5297 }
5298
/** Initialize a page for read to the buffer buf_pool. If the page is
(1) already in buf_pool, or
(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
(3) if the space is deleted or being deleted,
then this function does nothing.
Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
on the buffer frame. The io-handler must take care that the flag is cleared
and the lock released later.
@param[out]	err		DB_SUCCESS or DB_TABLESPACE_DELETED
@param[in]	mode		BUF_READ_IBUF_PAGES_ONLY, ...
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	unzip		whether the uncompressed page is
				requested (for ROW_FORMAT=COMPRESSED)
@return pointer to the block
@retval	NULL	in case of an error */
buf_page_t*
buf_page_init_for_read(
	dberr_t*		err,
	ulint			mode,
	const page_id_t		page_id,
	const page_size_t&	page_size,
	bool			unzip)
{
	buf_block_t*	block;
	buf_page_t*	bpage	= NULL;
	buf_page_t*	watch_page;
	rw_lock_t*	hash_lock;
	mtr_t		mtr;
	bool		lru	= false;
	void*		data;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	ut_ad(buf_pool);

	*err = DB_SUCCESS;

	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
		/* It is a read-ahead within an ibuf routine */

		ut_ad(!ibuf_bitmap_page(page_id, page_size));

		ibuf_mtr_start(&mtr);

		if (!recv_no_ibuf_operations &&
		    !ibuf_page(page_id, page_size, &mtr)) {
			/* Not a change buffer page: nothing to read. */

			ibuf_mtr_commit(&mtr);

			return(NULL);
		}
	} else {
		ut_ad(mode == BUF_READ_ANY_PAGE);
	}

	/* For a ROW_FORMAT=COMPRESSED page that need not be
	decompressed now, only a compressed-page descriptor (bpage)
	will be allocated later; otherwise reserve a complete block. */
	if (page_size.is_compressed() && !unzip && !recv_recovery_is_on()) {
		block = NULL;
	} else {
		block = buf_LRU_get_free_block(buf_pool);
		ut_ad(block);
		ut_ad(buf_pool_from_block(block) == buf_pool);
	}

	buf_pool_mutex_enter(buf_pool);

	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
	rw_lock_x_lock(hash_lock);

	watch_page = buf_page_hash_get_low(buf_pool, page_id);
	if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
		/* The page is already in the buffer pool. */
		watch_page = NULL;
		rw_lock_x_unlock(hash_lock);
		if (block) {
			/* Return the unused block to the free list. */
			buf_page_mutex_enter(block);
			buf_LRU_block_free_non_file_page(block);
			buf_page_mutex_exit(block);
		}

		bpage = NULL;
		goto func_exit;
	}

	if (block) {
		/* Uncompressed (or to-be-decompressed) page: initialize
		the full block and add it to the LRU list. */
		bpage = &block->page;

		buf_page_mutex_enter(block);

		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);

		buf_page_init(buf_pool, page_id, page_size, block);

		/* Note: We are using the hash_lock for protection. This is
		safe because no other thread can lookup the block from the
		page hashtable yet. */

		buf_page_set_io_fix(bpage, BUF_IO_READ);

		rw_lock_x_unlock(hash_lock);

		/* The block must be put to the LRU list, to the old blocks */
		buf_LRU_add_block(bpage, TRUE/* to old blocks */);

		/* We set a pass-type x-lock on the frame because then
		the same thread which called for the read operation
		(and is running now at this point of code) can wait
		for the read to complete by waiting for the x-lock on
		the frame; if the x-lock were recursive, the same
		thread would illegally get the x-lock before the page
		read is completed. The x-lock is cleared by the
		io-handler thread. */

		rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);

		if (page_size.is_compressed()) {
			/* buf_pool->mutex may be released and
			reacquired by buf_buddy_alloc(). Thus, we
			must release block->mutex in order not to
			break the latching order in the reacquisition
			of buf_pool->mutex. We also must defer this
			operation until after the block descriptor has
			been added to buf_pool->LRU and
			buf_pool->page_hash. */
			buf_page_mutex_exit(block);
			data = buf_buddy_alloc(buf_pool, page_size.physical(),
					       &lru);
			buf_page_mutex_enter(block);
			block->page.zip.data = (page_zip_t*) data;

			/* To maintain the invariant
			block->in_unzip_LRU_list
			== buf_page_belongs_to_unzip_LRU(&block->page)
			we have to add this block to unzip_LRU
			after block->page.zip.data is set. */
			ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
			buf_unzip_LRU_add_block(block, TRUE);
		}

		buf_page_mutex_exit(block);
	} else {
		/* Compressed-only page: allocate the compressed frame
		and a lightweight control block (bpage). */
		rw_lock_x_unlock(hash_lock);

		/* The compressed page must be allocated before the
		control block (bpage), in order to avoid the
		invocation of buf_buddy_relocate_block() on
		uninitialized data. */
		data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru);

		rw_lock_x_lock(hash_lock);

		/* If buf_buddy_alloc() allocated storage from the LRU list,
		it released and reacquired buf_pool->mutex. Thus, we must
		check the page_hash again, as it may have been modified. */
		if (UNIV_UNLIKELY(lru)) {

			watch_page = buf_page_hash_get_low(buf_pool, page_id);

			if (UNIV_UNLIKELY(watch_page
			    && !buf_pool_watch_is_sentinel(buf_pool,
							   watch_page))) {

				/* The block was added by some other thread. */
				rw_lock_x_unlock(hash_lock);
				watch_page = NULL;
				buf_buddy_free(buf_pool, data,
					       page_size.physical());

				bpage = NULL;
				goto func_exit;
			}
		}

		bpage = buf_page_alloc_descriptor();

		/* Initialize the buf_pool pointer. */
		bpage->buf_pool_index = buf_pool_index(buf_pool);

		page_zip_des_init(&bpage->zip);
		page_zip_set_size(&bpage->zip, page_size.physical());
		bpage->zip.data = (page_zip_t*) data;

		bpage->size.copy_from(page_size);

		mutex_enter(&buf_pool->zip_mutex);

		buf_page_init_low(bpage);

		bpage->state = BUF_BLOCK_ZIP_PAGE;
		bpage->id = page_id;
		bpage->flush_observer = NULL;

		ut_d(bpage->in_page_hash = FALSE);
		ut_d(bpage->in_zip_hash = FALSE);
		ut_d(bpage->in_flush_list = FALSE);
		ut_d(bpage->in_free_list = FALSE);
		ut_d(bpage->in_LRU_list = FALSE);

		ut_d(bpage->in_page_hash = TRUE);

		if (watch_page != NULL) {

			/* Transfer the buffer-fixes of the watch
			sentinel to the new control block, to
			preserve the reference count. */
			ib_uint32_t	buf_fix_count;

			buf_fix_count = watch_page->buf_fix_count;

			ut_a(buf_fix_count > 0);

			my_atomic_add32((int32*) &bpage->buf_fix_count, buf_fix_count);

			ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
			buf_pool_watch_remove(buf_pool, watch_page);
		}

		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
			    bpage->id.fold(), bpage);

		rw_lock_x_unlock(hash_lock);

		/* The block must be put to the LRU list, to the old blocks.
		The zip size is already set into the page zip */
		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		buf_LRU_insert_zip_clean(bpage);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

		buf_page_set_io_fix(bpage, BUF_IO_READ);

		mutex_exit(&buf_pool->zip_mutex);
	}

	buf_pool->n_pend_reads++;
func_exit:
	buf_pool_mutex_exit(buf_pool);

	if (mode == BUF_READ_IBUF_PAGES_ONLY) {

		ibuf_mtr_commit(&mtr);
	}

	ut_ad(!rw_lock_own_flagged(hash_lock,
				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
	ut_ad(!bpage || buf_page_in_file(bpage));

	return(bpage);
}
5544
/** Initializes a page to the buffer buf_pool. The page is usually not read
from a file even if it cannot be found in the buffer buf_pool. This is one
of the functions which perform to a block a state transition NOT_USED =>
FILE_PAGE (the other is buf_page_get_gen).
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	mtr		mini-transaction
@return pointer to the block, page bufferfixed */
buf_block_t*
buf_page_create(
	const page_id_t		page_id,
	const page_size_t&	page_size,
	mtr_t*			mtr)
{
	buf_frame_t*	frame;
	buf_block_t*	block;
	buf_block_t*	free_block	= NULL;
	buf_pool_t*	buf_pool= buf_pool_get(page_id);
	rw_lock_t*	hash_lock;

	ut_ad(mtr->is_active());
	ut_ad(page_id.space() != 0 || !page_size.is_compressed());
loop:
	/* Reserve a free block first, then check whether the page
	already exists in the buffer pool. */
	free_block = buf_LRU_get_free_block(buf_pool);
	buf_pool_mutex_enter(buf_pool);

	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
	rw_lock_x_lock(hash_lock);

	block = (buf_block_t*) buf_page_hash_get_low(buf_pool, page_id);

	if (block
	    && buf_page_in_file(&block->page)
	    && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
		ut_d(block->page.file_page_was_freed = FALSE);
		buf_page_state page_state = buf_block_get_state(block);
		bool have_x_latch = false;
#ifdef BTR_CUR_HASH_ADAPT
		const dict_index_t *drop_hash_entry= NULL;
#endif
		switch (page_state) {
		default:
			ut_ad(0);
			break;
		case BUF_BLOCK_ZIP_PAGE:
		case BUF_BLOCK_ZIP_DIRTY:
			/* The page exists only in compressed form:
			relocate it into the reserved free_block. */
			buf_block_init_low(free_block);
			mutex_enter(&buf_pool->zip_mutex);

			buf_page_mutex_enter(free_block);
			if (buf_page_get_io_fix(&block->page) != BUF_IO_NONE) {
				/* I/O is pending on the compressed
				page; back off and retry from scratch. */
				mutex_exit(&buf_pool->zip_mutex);
				rw_lock_x_unlock(hash_lock);
				buf_LRU_block_free_non_file_page(free_block);
				buf_pool_mutex_exit(buf_pool);
				buf_page_mutex_exit(free_block);

				goto loop;
			}

			rw_lock_x_lock(&free_block->lock);

			buf_relocate(&block->page, &free_block->page);
			if (page_state == BUF_BLOCK_ZIP_DIRTY) {
				/* Keep the flush list pointing at the
				relocated control block. */
				ut_ad(block->page.in_flush_list);
				ut_ad(block->page.oldest_modification > 0);
				buf_flush_relocate_on_flush_list(
					&block->page, &free_block->page);
			} else {
				ut_ad(block->page.oldest_modification == 0);
				ut_ad(!block->page.in_flush_list);
#ifdef UNIV_DEBUG
				UT_LIST_REMOVE(
					buf_pool->zip_clean, &block->page);
#endif
			}

			free_block->page.state = BUF_BLOCK_FILE_PAGE;
			mutex_exit(&buf_pool->zip_mutex);
			free_block->lock_hash_val = lock_rec_hash(
				page_id.space(), page_id.page_no());
			buf_unzip_LRU_add_block(free_block, false);
			buf_page_free_descriptor(&block->page);
			block = free_block;
			buf_block_fix(block);
			buf_page_mutex_exit(free_block);
			free_block = NULL;
			break;
		case BUF_BLOCK_FILE_PAGE:
			have_x_latch = mtr->have_x_latch(*block);
			if (!have_x_latch) {
				buf_block_fix(block);
				buf_page_mutex_enter(block);
				/* Wait until any pending I/O completes
				and all other buffer-fixes are released
				(our own fix keeps the count at 1)
				before X-latching the block. */
				while (buf_block_get_io_fix(block)
				       != BUF_IO_NONE
				       || block->page.buf_fix_count != 1) {
					buf_page_mutex_exit(block);
					buf_pool_mutex_exit(buf_pool);
					rw_lock_x_unlock(hash_lock);

					os_thread_sleep(1000);

					buf_pool_mutex_enter(buf_pool);
					rw_lock_x_lock(hash_lock);
					buf_page_mutex_enter(block);
				}
				rw_lock_x_lock(&block->lock);
				buf_page_mutex_exit(block);
			}
#ifdef BTR_CUR_HASH_ADAPT
			drop_hash_entry = block->index;
#endif
			break;
		}
		/* Page can be found in buf_pool */
		buf_pool_mutex_exit(buf_pool);
		rw_lock_x_unlock(hash_lock);

		if (free_block) {
			/* The reserved block was not needed. */
			buf_block_free(free_block);
		}
#ifdef BTR_CUR_HASH_ADAPT
		if (drop_hash_entry) {
			/* The page will be reinitialized: any adaptive
			hash index entries pointing into it are stale. */
			btr_search_drop_page_hash_index(block);
		}
#endif /* BTR_CUR_HASH_ADAPT */

		if (!have_x_latch) {
#ifdef UNIV_DEBUG
			if (!fsp_is_system_temporary(page_id.space())) {
				rw_lock_s_lock_nowait(
					&block->debug_latch,
					__FILE__, __LINE__);
			}
#endif /* UNIV_DEBUG */

			mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
		}
		return block;
	}

	/* If we get here, the page was not in buf_pool: init it there */

	DBUG_PRINT("ib_buf", ("create page %u:%u",
			      page_id.space(), page_id.page_no()));

	block = free_block;

	buf_page_mutex_enter(block);

	buf_page_init(buf_pool, page_id, page_size, block);

	rw_lock_x_lock(&block->lock);

	rw_lock_x_unlock(hash_lock);

	/* The block must be put to the LRU list */
	buf_LRU_add_block(&block->page, FALSE);

	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
	buf_pool->stat.n_pages_created++;

	if (page_size.is_compressed()) {
		void*	data;
		bool	lru;

		/* Prevent race conditions during buf_buddy_alloc(),
		which may release and reacquire buf_pool->mutex,
		by IO-fixing and X-latching the block. */

		buf_page_set_io_fix(&block->page, BUF_IO_READ);

		buf_page_mutex_exit(block);
		/* buf_pool->mutex may be released and reacquired by
		buf_buddy_alloc(). Thus, we must release block->mutex
		in order not to break the latching order in
		the reacquisition of buf_pool->mutex. We also must
		defer this operation until after the block descriptor
		has been added to buf_pool->LRU and buf_pool->page_hash. */
		data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru);
		buf_page_mutex_enter(block);
		block->page.zip.data = (page_zip_t*) data;

		/* To maintain the invariant
		block->in_unzip_LRU_list
		== buf_page_belongs_to_unzip_LRU(&block->page)
		we have to add this block to unzip_LRU after
		block->page.zip.data is set. */
		ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
		buf_unzip_LRU_add_block(block, FALSE);

		buf_page_set_io_fix(&block->page, BUF_IO_NONE);
	}

	buf_pool_mutex_exit(buf_pool);

	mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);

	buf_page_set_accessed(&block->page);

	buf_page_mutex_exit(block);

	/* Delete possible entries for the page from the insert buffer:
	such can exist if the page belonged to an index which was dropped */
	if (!recv_recovery_is_on()) {
		ibuf_merge_or_delete_for_page(NULL, page_id, page_size);
	}

	frame = block->frame;

	/* Reset the previous/next page pointers and mark the frame as
	freshly allocated. */
	memset(frame + FIL_PAGE_PREV, 0xff, 4);
	memset(frame + FIL_PAGE_NEXT, 0xff, 4);
	mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);

	/* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
	following pages:
	(1) The first page of the InnoDB system tablespace (page 0:0)
	(2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
	(3) key_version on encrypted pages (not page 0:0) */

	memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
	memset(frame + FIL_PAGE_LSN, 0, 8);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
	return(block);
}
5773
5774 /********************************************************************//**
5775 Monitor the buffer page read/write activity, and increment corresponding
5776 counter value if MONITOR_MODULE_BUF_PAGE (module_buf_page) module is
5777 enabled. */
5778 static
5779 void
5780 buf_page_monitor(
5781 /*=============*/
5782 const buf_page_t* bpage, /*!< in: pointer to the block */
5783 enum buf_io_fix io_type)/*!< in: io_fix types */
5784 {
5785 const byte* frame;
5786 monitor_id_t counter;
5787
5788 /* If the counter module is not turned on, just return */
5789 if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
5790 return;
5791 }
5792
5793 ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
5794
5795 frame = bpage->zip.data
5796 ? bpage->zip.data
5797 : ((buf_block_t*) bpage)->frame;
5798
5799 switch (fil_page_get_type(frame)) {
5800 ulint level;
5801 case FIL_PAGE_TYPE_INSTANT:
5802 case FIL_PAGE_INDEX:
5803 case FIL_PAGE_RTREE:
5804 level = btr_page_get_level(frame);
5805
5806 /* Check if it is an index page for insert buffer */
5807 if (fil_page_get_type(frame) == FIL_PAGE_INDEX
5808 && btr_page_get_index_id(frame)
5809 == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
5810 if (level == 0) {
5811 counter = MONITOR_RW_COUNTER(
5812 io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
5813 } else {
5814 counter = MONITOR_RW_COUNTER(
5815 io_type,
5816 MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
5817 }
5818 } else {
5819 if (level == 0) {
5820 counter = MONITOR_RW_COUNTER(
5821 io_type, MONITOR_INDEX_LEAF_PAGE);
5822 } else {
5823 counter = MONITOR_RW_COUNTER(
5824 io_type, MONITOR_INDEX_NON_LEAF_PAGE);
5825 }
5826 }
5827 break;
5828
5829 case FIL_PAGE_UNDO_LOG:
5830 counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
5831 break;
5832
5833 case FIL_PAGE_INODE:
5834 counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
5835 break;
5836
5837 case FIL_PAGE_IBUF_FREE_LIST:
5838 counter = MONITOR_RW_COUNTER(io_type,
5839 MONITOR_IBUF_FREELIST_PAGE);
5840 break;
5841
5842 case FIL_PAGE_IBUF_BITMAP:
5843 counter = MONITOR_RW_COUNTER(io_type,
5844 MONITOR_IBUF_BITMAP_PAGE);
5845 break;
5846
5847 case FIL_PAGE_TYPE_SYS:
5848 counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
5849 break;
5850
5851 case FIL_PAGE_TYPE_TRX_SYS:
5852 counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
5853 break;
5854
5855 case FIL_PAGE_TYPE_FSP_HDR:
5856 counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
5857 break;
5858
5859 case FIL_PAGE_TYPE_XDES:
5860 counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
5861 break;
5862
5863 case FIL_PAGE_TYPE_BLOB:
5864 counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
5865 break;
5866
5867 case FIL_PAGE_TYPE_ZBLOB:
5868 counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
5869 break;
5870
5871 case FIL_PAGE_TYPE_ZBLOB2:
5872 counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
5873 break;
5874
5875 default:
5876 counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
5877 }
5878
5879 MONITOR_INC_NOCHECK(counter);
5880 }
5881
5882 /** Mark a table corrupted.
5883 @param[in] bpage corrupted page
5884 @param[in] space tablespace of the corrupted page */
5885 ATTRIBUTE_COLD
5886 static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
5887 {
5888 /* If block is not encrypted find the table with specified
5889 space id, and mark it corrupted. Encrypted tables
5890 are marked unusable later e.g. in ::open(). */
5891 if (!space.crypt_data
5892 || space.crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) {
5893 dict_set_corrupted_by_space(&space);
5894 } else {
5895 dict_set_encrypted_by_space(&space);
5896 }
5897 }
5898
/** Release a page on which a corrupted read completed: clear the I/O fix,
release the latch, optionally mark the tablespace corrupted, and remove
the page from the buffer pool (LRU list).
@param[in,out]	bpage	corrupted page, I/O-fixed for read
@param[in]	space	tablespace the corrupted page belongs to */
static
void
buf_corrupt_page_release(buf_page_t* bpage, const fil_space_t* space)
{
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
	const ibool	uncompressed = (buf_page_get_state(bpage)
					== BUF_BLOCK_FILE_PAGE);
	page_id_t	old_page_id = bpage->id;

	/* First unfix and release lock on the bpage */
	buf_pool_mutex_enter(buf_pool);
	mutex_enter(buf_page_get_mutex(bpage));
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
	ut_ad(bpage->id.space() == space->id);

	/* buf_fix_count can be greater than zero. Because other thread
	can wait in buf_page_wait_read() for the page to be read. */

	bpage->id.set_corrupt_id();
	/* Set BUF_IO_NONE before we remove the block from LRU list */
	buf_page_set_io_fix(bpage, BUF_IO_NONE);

	if (uncompressed) {
		/* The read I/O held a pass-type X-latch taken in
		BUF_IO_READ mode; release it the same way. */
		rw_lock_x_unlock_gen(
			&((buf_block_t*) bpage)->lock,
			BUF_IO_READ);
	}

	mutex_exit(buf_page_get_mutex(bpage));

	if (!srv_force_recovery) {
		buf_mark_space_corrupt(bpage, *space);
	}

	/* After this point bpage can't be referenced. */
	buf_LRU_free_one_page(bpage, old_page_id);

	/* The failed read is no longer pending. */
	ut_ad(buf_pool->n_pend_reads > 0);
	buf_pool->n_pend_reads--;

	buf_pool_mutex_exit(buf_pool);
}
5945
/** Check if page is maybe compressed, encrypted or both when we encounter
a corrupted page. Note that we can't be 100% sure whether the page is
corrupted or whether decryption/decompression just failed.
@param[in,out]	bpage	page
@param[in,out]	space	tablespace from fil_space_acquire_for_io()
@return whether the operation succeeded
@retval	DB_SUCCESS if page has been read and is not corrupted
@retval	DB_PAGE_CORRUPTED if page based on checksum check is corrupted
@retval	DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match.
@retval	DB_TABLESPACE_DELETED if accessed tablespace is not found */
static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space)
{
	ut_ad(space->pending_io());

	/* For a compressed-only page, the compressed frame is what was
	read from disk; otherwise inspect the uncompressed frame. */
	byte* dst_frame = (bpage->zip.data) ? bpage->zip.data :
		((buf_block_t*) bpage)->frame;
	dberr_t err = DB_SUCCESS;

	/* In buf_decrypt_after_read we have either decrypted the page if
	page post encryption checksum matches and used key_id is found
	from the encryption plugin. If checksum did not match, the page was
	not decrypted and it could be either encrypted and corrupted
	or corrupted or a good page. If we decrypted, the page could
	still be corrupted if the used key does not match. */
	const bool seems_encrypted = mach_read_from_4(
		dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)
		&& space->crypt_data
		&& space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;

	/* If traditional checksums match, we assume that page is
	not anymore encrypted. */
	if (buf_page_is_corrupted(
		    true, dst_frame, bpage->size, space)) {
		err = DB_PAGE_CORRUPTED;
	}

	if (seems_encrypted && err == DB_PAGE_CORRUPTED
	    && bpage->id.page_no() != 0) {
		/* A failed checksum on a seemingly encrypted page
		(other than page 0, which is never encrypted) most
		likely means that decryption failed. */
		err = DB_DECRYPTION_FAILED;

		ib::error()
			<< "The page " << bpage->id << " in file '"
			<< space->chain.start->name
			<< "' cannot be decrypted.";

		ib::info()
			<< "However key management plugin or used key_version "
			<< mach_read_from_4(dst_frame
					    + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)
			<< " is not found or"
			" used encryption algorithm or method does not match.";

		if (bpage->id.space() != TRX_SYS_SPACE) {
			ib::info()
				<< "Marking tablespace as missing."
				" You may drop this table or"
				" install correct key management plugin"
				" and key file.";
		}
	}

	return (err);
}
6010
6011 /** Complete a read or write request of a file page to or from the buffer pool.
6012 @param[in,out] bpage page to complete
6013 @param[in] dblwr whether the doublewrite buffer was used (on write)
6014 @param[in] evict whether or not to evict the page from LRU list
6015 @return whether the operation succeeded
6016 @retval DB_SUCCESS always when writing, or if a read page was OK
6017 @retval DB_TABLESPACE_DELETED if the tablespace does not exist
6018 @retval DB_PAGE_CORRUPTED if the checksum fails on a page read
6019 @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
6020 after decryption normal page checksum does
6021 not match */
6022 UNIV_INTERN
6023 dberr_t
6024 buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict)
6025 {
6026 enum buf_io_fix io_type;
6027 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
6028 const bool uncompressed = (buf_page_get_state(bpage)
6029 == BUF_BLOCK_FILE_PAGE);
6030 ut_a(buf_page_in_file(bpage));
6031
6032 /* We do not need protect io_fix here by mutex to read
6033 it because this is the only function where we can change the value
6034 from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
6035 ensures that this is the only thread that handles the i/o for this
6036 block. */
6037
6038 io_type = buf_page_get_io_fix(bpage);
6039 ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
6040 ut_ad(bpage->size.is_compressed() == (bpage->zip.data != NULL));
6041 ut_ad(uncompressed || bpage->zip.data);
6042
6043 if (io_type == BUF_IO_READ) {
6044 ulint read_page_no = 0;
6045 ulint read_space_id = 0;
6046 byte* frame = bpage->zip.data
6047 ? bpage->zip.data
6048 : reinterpret_cast<buf_block_t*>(bpage)->frame;
6049 ut_ad(frame);
6050 fil_space_t* space = fil_space_acquire_for_io(
6051 bpage->id.space());
6052 if (!space) {
6053 return DB_TABLESPACE_DELETED;
6054 }
6055
6056 dberr_t err;
6057
6058 if (!buf_page_decrypt_after_read(bpage, space)) {
6059 err = DB_DECRYPTION_FAILED;
6060 goto database_corrupted;
6061 }
6062
6063 if (bpage->zip.data && uncompressed) {
6064 my_atomic_addlint(&buf_pool->n_pend_unzip, 1);
6065 ibool ok = buf_zip_decompress((buf_block_t*) bpage,
6066 FALSE);
6067 my_atomic_addlint(&buf_pool->n_pend_unzip, ulint(-1));
6068
6069 if (!ok) {
6070 ib::info() << "Page "
6071 << bpage->id
6072 << " zip_decompress failure.";
6073
6074 err = DB_PAGE_CORRUPTED;
6075 goto database_corrupted;
6076 }
6077 }
6078
6079 /* If this page is not uninitialized and not in the
6080 doublewrite buffer, then the page number and space id
6081 should be the same as in block. */
6082 read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
6083 read_space_id = mach_read_from_4(
6084 frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
6085
6086 if (bpage->id.space() == TRX_SYS_SPACE
6087 && buf_dblwr_page_inside(bpage->id.page_no())) {
6088
6089 ib::error() << "Reading page " << bpage->id
6090 << ", which is in the doublewrite buffer!";
6091
6092 } else if (read_space_id == 0 && read_page_no == 0) {
6093 /* This is likely an uninitialized page. */
6094 } else if ((bpage->id.space() != TRX_SYS_SPACE
6095 && bpage->id.space() != read_space_id)
6096 || bpage->id.page_no() != read_page_no) {
6097 /* We did not compare space_id to read_space_id
6098 in the system tablespace, because the field
6099 was written as garbage before MySQL 4.1.1,
6100 which did not support innodb_file_per_table. */
6101
6102 ib::error() << "Space id and page no stored in "
6103 "the page, read in are "
6104 << page_id_t(read_space_id, read_page_no)
6105 << ", should be " << bpage->id;
6106 }
6107
6108 err = buf_page_check_corrupt(bpage, space);
6109
6110 if (err != DB_SUCCESS) {
6111 database_corrupted:
6112 /* Not a real corruption if it was triggered by
6113 error injection */
6114 DBUG_EXECUTE_IF(
6115 "buf_page_import_corrupt_failure",
6116 if (!is_predefined_tablespace(
6117 bpage->id.space())) {
6118 buf_corrupt_page_release(bpage, space);
6119 ib::info() << "Simulated IMPORT "
6120 "corruption";
6121 space->release_for_io();
6122 return(err);
6123 }
6124 err = DB_SUCCESS;
6125 goto page_not_corrupt;
6126 );
6127
6128 if (uncompressed && bpage->zip.data) {
6129 memset(reinterpret_cast<buf_block_t*>(bpage)
6130 ->frame, 0, srv_page_size);
6131 }
6132
6133 if (err == DB_PAGE_CORRUPTED) {
6134 ib::error()
6135 << "Database page corruption on disk"
6136 " or a failed file read of tablespace "
6137 << space->name << " page " << bpage->id
6138 << ". You may have to recover from "
6139 << "a backup.";
6140
6141 buf_page_print(frame, bpage->size);
6142
6143 ib::info()
6144 << "It is also possible that your"
6145 " operating system has corrupted"
6146 " its own file cache and rebooting"
6147 " your computer removes the error."
6148 " If the corrupt page is an index page."
6149 " You can also try to fix the"
6150 " corruption by dumping, dropping,"
6151 " and reimporting the corrupt table."
6152 " You can use CHECK TABLE to scan"
6153 " your table for corruption. "
6154 << FORCE_RECOVERY_MSG;
6155 }
6156
6157 if (!srv_force_recovery) {
6158
6159 /* If page space id is larger than TRX_SYS_SPACE
6160 (0), we will attempt to mark the corresponding
6161 table as corrupted instead of crashing server */
6162 if (bpage->id.space() == TRX_SYS_SPACE) {
6163 ib::fatal() << "Aborting because of"
6164 " a corrupt database page.";
6165 }
6166
6167 buf_corrupt_page_release(bpage, space);
6168 space->release_for_io();
6169 return(err);
6170 }
6171 }
6172
6173 DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
6174 page_not_corrupt: bpage = bpage; );
6175
6176 if (err == DB_PAGE_CORRUPTED
6177 || err == DB_DECRYPTION_FAILED) {
6178 const page_id_t corrupt_page_id = bpage->id;
6179
6180 buf_corrupt_page_release(bpage, space);
6181
6182 if (recv_recovery_is_on()) {
6183 recv_recover_corrupt_page(corrupt_page_id);
6184 }
6185
6186 space->release_for_io();
6187 return err;
6188 }
6189
6190 if (recv_recovery_is_on()) {
6191 recv_recover_page(bpage);
6192 }
6193
6194 /* If space is being truncated then avoid ibuf operation.
6195 During re-init we have already freed ibuf entries. */
6196 if (uncompressed
6197 && !recv_no_ibuf_operations
6198 && (bpage->id.space() == 0
6199 || !is_predefined_tablespace(bpage->id.space()))
6200 && !srv_is_tablespace_truncated(bpage->id.space())
6201 && fil_page_get_type(frame) == FIL_PAGE_INDEX
6202 && page_is_leaf(frame)) {
6203
6204 ibuf_merge_or_delete_for_page(
6205 (buf_block_t*) bpage, bpage->id,
6206 bpage->size);
6207 }
6208
6209 space->release_for_io();
6210 } else {
6211 /* io_type == BUF_IO_WRITE */
6212 if (bpage->slot) {
6213 /* Mark slot free */
6214 bpage->slot->release();
6215 bpage->slot = NULL;
6216 }
6217 }
6218
6219 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
6220 buf_pool_mutex_enter(buf_pool);
6221 mutex_enter(block_mutex);
6222
6223 /* Because this thread which does the unlocking is not the same that
6224 did the locking, we use a pass value != 0 in unlock, which simply
6225 removes the newest lock debug record, without checking the thread
6226 id. */
6227
6228 buf_page_set_io_fix(bpage, BUF_IO_NONE);
6229 buf_page_monitor(bpage, io_type);
6230
6231 if (io_type == BUF_IO_READ) {
6232 /* NOTE that the call to ibuf may have moved the ownership of
6233 the x-latch to this OS thread: do not let this confuse you in
6234 debugging! */
6235
6236 ut_ad(buf_pool->n_pend_reads > 0);
6237 buf_pool->n_pend_reads--;
6238 buf_pool->stat.n_pages_read++;
6239
6240 if (uncompressed) {
6241 rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
6242 BUF_IO_READ);
6243 }
6244
6245 mutex_exit(block_mutex);
6246 } else {
6247 /* Write means a flush operation: call the completion
6248 routine in the flush system */
6249
6250 buf_flush_write_complete(bpage, dblwr);
6251
6252 if (uncompressed) {
6253 rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock,
6254 BUF_IO_WRITE);
6255 }
6256
6257 buf_pool->stat.n_pages_written++;
6258
6259 /* We decide whether or not to evict the page from the
6260 LRU list based on the flush_type.
6261 * BUF_FLUSH_LIST: don't evict
6262 * BUF_FLUSH_LRU: always evict
6263 * BUF_FLUSH_SINGLE_PAGE: eviction preference is passed
6264 by the caller explicitly. */
6265 if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) {
6266 evict = true;
6267 }
6268
6269 mutex_exit(block_mutex);
6270
6271 if (evict) {
6272 buf_LRU_free_page(bpage, true);
6273 }
6274 }
6275
6276 DBUG_PRINT("ib_buf", ("%s page %u:%u",
6277 io_type == BUF_IO_READ ? "read" : "wrote",
6278 bpage->id.space(), bpage->id.page_no()));
6279
6280 buf_pool_mutex_exit(buf_pool);
6281
6282 return DB_SUCCESS;
6283 }
6284
6285 /*********************************************************************//**
6286 Asserts that all file pages in the buffer are in a replaceable state.
6287 @return TRUE */
6288 static
6289 ibool
6290 buf_all_freed_instance(
6291 /*===================*/
6292 buf_pool_t* buf_pool) /*!< in: buffer pool instancce */
6293 {
6294 ulint i;
6295 buf_chunk_t* chunk;
6296
6297 ut_ad(buf_pool);
6298
6299 buf_pool_mutex_enter(buf_pool);
6300
6301 chunk = buf_pool->chunks;
6302
6303 for (i = buf_pool->n_chunks; i--; chunk++) {
6304
6305 if (const buf_block_t* block = buf_chunk_not_freed(chunk)) {
6306 ib::fatal() << "Page " << block->page.id
6307 << " still fixed or dirty";
6308 }
6309 }
6310
6311 buf_pool_mutex_exit(buf_pool);
6312
6313 return(TRUE);
6314 }
6315
6316 /** Refreshes the statistics used to print per-second averages.
6317 @param[in,out] buf_pool buffer pool instance */
6318 static
6319 void
6320 buf_refresh_io_stats(
6321 buf_pool_t* buf_pool)
6322 {
6323 buf_pool->last_printout_time = time(NULL);
6324 buf_pool->old_stat = buf_pool->stat;
6325 }
6326
6327 /*********************************************************************//**
6328 Invalidates file pages in one buffer pool instance */
6329 static
6330 void
6331 buf_pool_invalidate_instance(
6332 /*=========================*/
6333 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
6334 {
6335 ulint i;
6336
6337 buf_pool_mutex_enter(buf_pool);
6338
6339 for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
6340
6341 /* As this function is called during startup and
6342 during redo application phase during recovery, InnoDB
6343 is single threaded (apart from IO helper threads) at
6344 this stage. No new write batch can be in intialization
6345 stage at this point. */
6346 ut_ad(buf_pool->init_flush[i] == FALSE);
6347
6348 /* However, it is possible that a write batch that has
6349 been posted earlier is still not complete. For buffer
6350 pool invalidation to proceed we must ensure there is NO
6351 write activity happening. */
6352 if (buf_pool->n_flush[i] > 0) {
6353 buf_flush_t type = static_cast<buf_flush_t>(i);
6354
6355 buf_pool_mutex_exit(buf_pool);
6356 buf_flush_wait_batch_end(buf_pool, type);
6357 buf_pool_mutex_enter(buf_pool);
6358 }
6359 }
6360
6361 buf_pool_mutex_exit(buf_pool);
6362
6363 ut_ad(buf_all_freed_instance(buf_pool));
6364
6365 buf_pool_mutex_enter(buf_pool);
6366
6367 while (buf_LRU_scan_and_free_block(buf_pool, true)) {
6368 }
6369
6370 ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
6371 ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
6372
6373 buf_pool->freed_page_clock = 0;
6374 buf_pool->LRU_old = NULL;
6375 buf_pool->LRU_old_len = 0;
6376
6377 memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
6378 buf_refresh_io_stats(buf_pool);
6379
6380 buf_pool_mutex_exit(buf_pool);
6381 }
6382
6383 /*********************************************************************//**
6384 Invalidates the file pages in the buffer pool when an archive recovery is
6385 completed. All the file pages buffered must be in a replaceable state when
6386 this function is called: not latched and not modified. */
6387 void
6388 buf_pool_invalidate(void)
6389 /*=====================*/
6390 {
6391 ulint i;
6392
6393 for (i = 0; i < srv_buf_pool_instances; i++) {
6394 buf_pool_invalidate_instance(buf_pool_from_array(i));
6395 }
6396 }
6397
6398 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
6399 /*********************************************************************//**
6400 Validates data in one buffer pool instance
6401 @return TRUE */
6402 static
6403 ibool
6404 buf_pool_validate_instance(
6405 /*=======================*/
6406 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
6407 {
6408 buf_page_t* b;
6409 buf_chunk_t* chunk;
6410 ulint i;
6411 ulint n_lru_flush = 0;
6412 ulint n_page_flush = 0;
6413 ulint n_list_flush = 0;
6414 ulint n_lru = 0;
6415 ulint n_flush = 0;
6416 ulint n_free = 0;
6417 ulint n_zip = 0;
6418
6419 ut_ad(buf_pool);
6420
6421 buf_pool_mutex_enter(buf_pool);
6422 hash_lock_x_all(buf_pool->page_hash);
6423
6424 chunk = buf_pool->chunks;
6425
6426 /* Check the uncompressed blocks. */
6427
6428 for (i = buf_pool->n_chunks; i--; chunk++) {
6429
6430 ulint j;
6431 buf_block_t* block = chunk->blocks;
6432
6433 for (j = chunk->size; j--; block++) {
6434
6435 buf_page_mutex_enter(block);
6436
6437 switch (buf_block_get_state(block)) {
6438 case BUF_BLOCK_POOL_WATCH:
6439 case BUF_BLOCK_ZIP_PAGE:
6440 case BUF_BLOCK_ZIP_DIRTY:
6441 /* These should only occur on
6442 zip_clean, zip_free[], or flush_list. */
6443 ut_error;
6444 break;
6445
6446 case BUF_BLOCK_FILE_PAGE:
6447 ut_a(buf_page_hash_get_low(
6448 buf_pool, block->page.id)
6449 == &block->page);
6450
6451 switch (buf_page_get_io_fix(&block->page)) {
6452 case BUF_IO_NONE:
6453 break;
6454
6455 case BUF_IO_WRITE:
6456 switch (buf_page_get_flush_type(
6457 &block->page)) {
6458 case BUF_FLUSH_LRU:
6459 n_lru_flush++;
6460 goto assert_s_latched;
6461 case BUF_FLUSH_SINGLE_PAGE:
6462 n_page_flush++;
6463 assert_s_latched:
6464 ut_a(rw_lock_is_locked(
6465 &block->lock,
6466 RW_LOCK_S)
6467 || rw_lock_is_locked(
6468 &block->lock,
6469 RW_LOCK_SX));
6470 break;
6471 case BUF_FLUSH_LIST:
6472 n_list_flush++;
6473 break;
6474 default:
6475 ut_error;
6476 }
6477
6478 break;
6479
6480 case BUF_IO_READ:
6481
6482 ut_a(rw_lock_is_locked(&block->lock,
6483 RW_LOCK_X));
6484 break;
6485
6486 case BUF_IO_PIN:
6487 break;
6488 }
6489
6490 n_lru++;
6491 break;
6492
6493 case BUF_BLOCK_NOT_USED:
6494 n_free++;
6495 break;
6496
6497 case BUF_BLOCK_READY_FOR_USE:
6498 case BUF_BLOCK_MEMORY:
6499 case BUF_BLOCK_REMOVE_HASH:
6500 /* do nothing */
6501 break;
6502 }
6503
6504 buf_page_mutex_exit(block);
6505 }
6506 }
6507
6508 mutex_enter(&buf_pool->zip_mutex);
6509
6510 /* Check clean compressed-only blocks. */
6511
6512 for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
6513 b = UT_LIST_GET_NEXT(list, b)) {
6514 ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
6515 switch (buf_page_get_io_fix(b)) {
6516 case BUF_IO_NONE:
6517 case BUF_IO_PIN:
6518 /* All clean blocks should be I/O-unfixed. */
6519 break;
6520 case BUF_IO_READ:
6521 /* In buf_LRU_free_page(), we temporarily set
6522 b->io_fix = BUF_IO_READ for a newly allocated
6523 control block in order to prevent
6524 buf_page_get_gen() from decompressing the block. */
6525 break;
6526 default:
6527 ut_error;
6528 break;
6529 }
6530
6531 /* It is OK to read oldest_modification here because
6532 we have acquired buf_pool->zip_mutex above which acts
6533 as the 'block->mutex' for these bpages. */
6534 ut_a(!b->oldest_modification);
6535 ut_a(buf_page_hash_get_low(buf_pool, b->id) == b);
6536 n_lru++;
6537 n_zip++;
6538 }
6539
6540 /* Check dirty blocks. */
6541
6542 buf_flush_list_mutex_enter(buf_pool);
6543 for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
6544 b = UT_LIST_GET_NEXT(list, b)) {
6545 ut_ad(b->in_flush_list);
6546 ut_a(b->oldest_modification);
6547 n_flush++;
6548
6549 switch (buf_page_get_state(b)) {
6550 case BUF_BLOCK_ZIP_DIRTY:
6551 n_lru++;
6552 n_zip++;
6553 switch (buf_page_get_io_fix(b)) {
6554 case BUF_IO_NONE:
6555 case BUF_IO_READ:
6556 case BUF_IO_PIN:
6557 break;
6558 case BUF_IO_WRITE:
6559 switch (buf_page_get_flush_type(b)) {
6560 case BUF_FLUSH_LRU:
6561 n_lru_flush++;
6562 break;
6563 case BUF_FLUSH_SINGLE_PAGE:
6564 n_page_flush++;
6565 break;
6566 case BUF_FLUSH_LIST:
6567 n_list_flush++;
6568 break;
6569 default:
6570 ut_error;
6571 }
6572 break;
6573 }
6574 break;
6575 case BUF_BLOCK_FILE_PAGE:
6576 /* uncompressed page */
6577 break;
6578 case BUF_BLOCK_POOL_WATCH:
6579 case BUF_BLOCK_ZIP_PAGE:
6580 case BUF_BLOCK_NOT_USED:
6581 case BUF_BLOCK_READY_FOR_USE:
6582 case BUF_BLOCK_MEMORY:
6583 case BUF_BLOCK_REMOVE_HASH:
6584 ut_error;
6585 break;
6586 }
6587 ut_a(buf_page_hash_get_low(buf_pool, b->id) == b);
6588 }
6589
6590 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
6591
6592 hash_unlock_x_all(buf_pool->page_hash);
6593 buf_flush_list_mutex_exit(buf_pool);
6594
6595 mutex_exit(&buf_pool->zip_mutex);
6596
6597 if (buf_pool->curr_size == buf_pool->old_size
6598 && n_lru + n_free > buf_pool->curr_size + n_zip) {
6599
6600 ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
6601 << ", pool " << buf_pool->curr_size
6602 << " zip " << n_zip << ". Aborting...";
6603 }
6604
6605 ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
6606 if (buf_pool->curr_size == buf_pool->old_size
6607 && UT_LIST_GET_LEN(buf_pool->free) != n_free) {
6608
6609 ib::fatal() << "Free list len "
6610 << UT_LIST_GET_LEN(buf_pool->free)
6611 << ", free blocks " << n_free << ". Aborting...";
6612 }
6613
6614 ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
6615 ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
6616 ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
6617
6618 buf_pool_mutex_exit(buf_pool);
6619
6620 ut_a(buf_LRU_validate());
6621 ut_a(buf_flush_validate(buf_pool));
6622
6623 return(TRUE);
6624 }
6625
6626 /*********************************************************************//**
6627 Validates the buffer buf_pool data structure.
6628 @return TRUE */
6629 ibool
6630 buf_validate(void)
6631 /*==============*/
6632 {
6633 ulint i;
6634
6635 for (i = 0; i < srv_buf_pool_instances; i++) {
6636 buf_pool_t* buf_pool;
6637
6638 buf_pool = buf_pool_from_array(i);
6639
6640 buf_pool_validate_instance(buf_pool);
6641 }
6642 return(TRUE);
6643 }
6644
6645 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
6646
6647 #if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
6648 /*********************************************************************//**
6649 Prints info of the buffer buf_pool data structure for one instance. */
6650 static
6651 void
6652 buf_print_instance(
6653 /*===============*/
6654 buf_pool_t* buf_pool)
6655 {
6656 index_id_t* index_ids;
6657 ulint* counts;
6658 ulint size;
6659 ulint i;
6660 ulint j;
6661 index_id_t id;
6662 ulint n_found;
6663 buf_chunk_t* chunk;
6664 dict_index_t* index;
6665
6666 ut_ad(buf_pool);
6667
6668 size = buf_pool->curr_size;
6669
6670 index_ids = static_cast<index_id_t*>(
6671 ut_malloc_nokey(size * sizeof *index_ids));
6672
6673 counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
6674
6675 buf_pool_mutex_enter(buf_pool);
6676 buf_flush_list_mutex_enter(buf_pool);
6677
6678 ib::info() << *buf_pool;
6679
6680 buf_flush_list_mutex_exit(buf_pool);
6681
6682 /* Count the number of blocks belonging to each index in the buffer */
6683
6684 n_found = 0;
6685
6686 chunk = buf_pool->chunks;
6687
6688 for (i = buf_pool->n_chunks; i--; chunk++) {
6689 buf_block_t* block = chunk->blocks;
6690 ulint n_blocks = chunk->size;
6691
6692 for (; n_blocks--; block++) {
6693 const buf_frame_t* frame = block->frame;
6694
6695 if (fil_page_index_page_check(frame)) {
6696
6697 id = btr_page_get_index_id(frame);
6698
6699 /* Look for the id in the index_ids array */
6700 j = 0;
6701
6702 while (j < n_found) {
6703
6704 if (index_ids[j] == id) {
6705 counts[j]++;
6706
6707 break;
6708 }
6709 j++;
6710 }
6711
6712 if (j == n_found) {
6713 n_found++;
6714 index_ids[j] = id;
6715 counts[j] = 1;
6716 }
6717 }
6718 }
6719 }
6720
6721 buf_pool_mutex_exit(buf_pool);
6722
6723 for (i = 0; i < n_found; i++) {
6724 index = dict_index_get_if_in_cache(index_ids[i]);
6725
6726 if (!index) {
6727 ib::info() << "Block count for index "
6728 << index_ids[i] << " in buffer is about "
6729 << counts[i];
6730 } else {
6731 ib::info() << "Block count for index " << index_ids[i]
6732 << " in buffer is about " << counts[i]
6733 << ", index " << index->name
6734 << " of table " << index->table->name;
6735 }
6736 }
6737
6738 ut_free(index_ids);
6739 ut_free(counts);
6740
6741 ut_a(buf_pool_validate_instance(buf_pool));
6742 }
6743
6744 /*********************************************************************//**
6745 Prints info of the buffer buf_pool data structure. */
6746 void
6747 buf_print(void)
6748 /*===========*/
6749 {
6750 ulint i;
6751
6752 for (i = 0; i < srv_buf_pool_instances; i++) {
6753 buf_pool_t* buf_pool;
6754
6755 buf_pool = buf_pool_from_array(i);
6756 buf_print_instance(buf_pool);
6757 }
6758 }
6759 #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
6760
6761 #ifdef UNIV_DEBUG
6762 /*********************************************************************//**
6763 Returns the number of latched pages in the buffer pool.
6764 @return number of latched pages */
6765 static
6766 ulint
6767 buf_get_latched_pages_number_instance(
6768 /*==================================*/
6769 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
6770 {
6771 buf_page_t* b;
6772 ulint i;
6773 buf_chunk_t* chunk;
6774 ulint fixed_pages_number = 0;
6775
6776 buf_pool_mutex_enter(buf_pool);
6777
6778 chunk = buf_pool->chunks;
6779
6780 for (i = buf_pool->n_chunks; i--; chunk++) {
6781 buf_block_t* block;
6782 ulint j;
6783
6784 block = chunk->blocks;
6785
6786 for (j = chunk->size; j--; block++) {
6787 if (buf_block_get_state(block)
6788 != BUF_BLOCK_FILE_PAGE) {
6789
6790 continue;
6791 }
6792
6793 buf_page_mutex_enter(block);
6794
6795 if (block->page.buf_fix_count != 0
6796 || buf_page_get_io_fix(&block->page)
6797 != BUF_IO_NONE) {
6798 fixed_pages_number++;
6799 }
6800
6801 buf_page_mutex_exit(block);
6802 }
6803 }
6804
6805 mutex_enter(&buf_pool->zip_mutex);
6806
6807 /* Traverse the lists of clean and dirty compressed-only blocks. */
6808
6809 for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
6810 b = UT_LIST_GET_NEXT(list, b)) {
6811 ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
6812 ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
6813
6814 if (b->buf_fix_count != 0
6815 || buf_page_get_io_fix(b) != BUF_IO_NONE) {
6816 fixed_pages_number++;
6817 }
6818 }
6819
6820 buf_flush_list_mutex_enter(buf_pool);
6821 for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
6822 b = UT_LIST_GET_NEXT(list, b)) {
6823 ut_ad(b->in_flush_list);
6824
6825 switch (buf_page_get_state(b)) {
6826 case BUF_BLOCK_ZIP_DIRTY:
6827 if (b->buf_fix_count != 0
6828 || buf_page_get_io_fix(b) != BUF_IO_NONE) {
6829 fixed_pages_number++;
6830 }
6831 break;
6832 case BUF_BLOCK_FILE_PAGE:
6833 /* uncompressed page */
6834 break;
6835 case BUF_BLOCK_POOL_WATCH:
6836 case BUF_BLOCK_ZIP_PAGE:
6837 case BUF_BLOCK_NOT_USED:
6838 case BUF_BLOCK_READY_FOR_USE:
6839 case BUF_BLOCK_MEMORY:
6840 case BUF_BLOCK_REMOVE_HASH:
6841 ut_error;
6842 break;
6843 }
6844 }
6845
6846 buf_flush_list_mutex_exit(buf_pool);
6847 mutex_exit(&buf_pool->zip_mutex);
6848 buf_pool_mutex_exit(buf_pool);
6849
6850 return(fixed_pages_number);
6851 }
6852
6853 /*********************************************************************//**
6854 Returns the number of latched pages in all the buffer pools.
6855 @return number of latched pages */
6856 ulint
6857 buf_get_latched_pages_number(void)
6858 /*==============================*/
6859 {
6860 ulint i;
6861 ulint total_latched_pages = 0;
6862
6863 for (i = 0; i < srv_buf_pool_instances; i++) {
6864 buf_pool_t* buf_pool;
6865
6866 buf_pool = buf_pool_from_array(i);
6867
6868 total_latched_pages += buf_get_latched_pages_number_instance(
6869 buf_pool);
6870 }
6871
6872 return(total_latched_pages);
6873 }
6874
6875 #endif /* UNIV_DEBUG */
6876
6877 /*********************************************************************//**
6878 Returns the number of pending buf pool read ios.
6879 @return number of pending read I/O operations */
6880 ulint
6881 buf_get_n_pending_read_ios(void)
6882 /*============================*/
6883 {
6884 ulint pend_ios = 0;
6885
6886 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
6887 pend_ios += buf_pool_from_array(i)->n_pend_reads;
6888 }
6889
6890 return(pend_ios);
6891 }
6892
6893 /*********************************************************************//**
6894 Returns the ratio in percents of modified pages in the buffer pool /
6895 database pages in the buffer pool.
6896 @return modified page percentage ratio */
6897 double
6898 buf_get_modified_ratio_pct(void)
6899 /*============================*/
6900 {
6901 double ratio;
6902 ulint lru_len = 0;
6903 ulint free_len = 0;
6904 ulint flush_list_len = 0;
6905
6906 buf_get_total_list_len(&lru_len, &free_len, &flush_list_len);
6907
6908 ratio = static_cast<double>(100 * flush_list_len)
6909 / (1 + lru_len + free_len);
6910
6911 /* 1 + is there to avoid division by zero */
6912
6913 return(ratio);
6914 }
6915
6916 /*******************************************************************//**
6917 Aggregates a pool stats information with the total buffer pool stats */
6918 static
6919 void
6920 buf_stats_aggregate_pool_info(
6921 /*==========================*/
6922 buf_pool_info_t* total_info, /*!< in/out: the buffer pool
6923 info to store aggregated
6924 result */
6925 const buf_pool_info_t* pool_info) /*!< in: individual buffer pool
6926 stats info */
6927 {
6928 ut_a(total_info && pool_info);
6929
6930 /* Nothing to copy if total_info is the same as pool_info */
6931 if (total_info == pool_info) {
6932 return;
6933 }
6934
6935 total_info->pool_size += pool_info->pool_size;
6936 total_info->lru_len += pool_info->lru_len;
6937 total_info->old_lru_len += pool_info->old_lru_len;
6938 total_info->free_list_len += pool_info->free_list_len;
6939 total_info->flush_list_len += pool_info->flush_list_len;
6940 total_info->n_pend_unzip += pool_info->n_pend_unzip;
6941 total_info->n_pend_reads += pool_info->n_pend_reads;
6942 total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
6943 total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
6944 total_info->n_pages_made_young += pool_info->n_pages_made_young;
6945 total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
6946 total_info->n_pages_read += pool_info->n_pages_read;
6947 total_info->n_pages_created += pool_info->n_pages_created;
6948 total_info->n_pages_written += pool_info->n_pages_written;
6949 total_info->n_page_gets += pool_info->n_page_gets;
6950 total_info->n_ra_pages_read_rnd += pool_info->n_ra_pages_read_rnd;
6951 total_info->n_ra_pages_read += pool_info->n_ra_pages_read;
6952 total_info->n_ra_pages_evicted += pool_info->n_ra_pages_evicted;
6953 total_info->page_made_young_rate += pool_info->page_made_young_rate;
6954 total_info->page_not_made_young_rate +=
6955 pool_info->page_not_made_young_rate;
6956 total_info->pages_read_rate += pool_info->pages_read_rate;
6957 total_info->pages_created_rate += pool_info->pages_created_rate;
6958 total_info->pages_written_rate += pool_info->pages_written_rate;
6959 total_info->n_page_get_delta += pool_info->n_page_get_delta;
6960 total_info->page_read_delta += pool_info->page_read_delta;
6961 total_info->young_making_delta += pool_info->young_making_delta;
6962 total_info->not_young_making_delta += pool_info->not_young_making_delta;
6963 total_info->pages_readahead_rnd_rate += pool_info->pages_readahead_rnd_rate;
6964 total_info->pages_readahead_rate += pool_info->pages_readahead_rate;
6965 total_info->pages_evicted_rate += pool_info->pages_evicted_rate;
6966 total_info->unzip_lru_len += pool_info->unzip_lru_len;
6967 total_info->io_sum += pool_info->io_sum;
6968 total_info->io_cur += pool_info->io_cur;
6969 total_info->unzip_sum += pool_info->unzip_sum;
6970 total_info->unzip_cur += pool_info->unzip_cur;
6971 }
6972 /*******************************************************************//**
6973 Collect buffer pool stats information for a buffer pool. Also
6974 record aggregated stats if there are more than one buffer pool
6975 in the server */
6976 void
6977 buf_stats_get_pool_info(
6978 /*====================*/
6979 buf_pool_t* buf_pool, /*!< in: buffer pool */
6980 ulint pool_id, /*!< in: buffer pool ID */
6981 buf_pool_info_t* all_pool_info) /*!< in/out: buffer pool info
6982 to fill */
6983 {
6984 buf_pool_info_t* pool_info;
6985 time_t current_time;
6986 double time_elapsed;
6987
6988 /* Find appropriate pool_info to store stats for this buffer pool */
6989 pool_info = &all_pool_info[pool_id];
6990
6991 buf_pool_mutex_enter(buf_pool);
6992 buf_flush_list_mutex_enter(buf_pool);
6993
6994 pool_info->pool_unique_id = pool_id;
6995
6996 pool_info->pool_size = buf_pool->curr_size;
6997
6998 pool_info->lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
6999
7000 pool_info->old_lru_len = buf_pool->LRU_old_len;
7001
7002 pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool->free);
7003
7004 pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool->flush_list);
7005
7006 pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
7007
7008 pool_info->n_pend_reads = buf_pool->n_pend_reads;
7009
7010 pool_info->n_pending_flush_lru =
7011 (buf_pool->n_flush[BUF_FLUSH_LRU]
7012 + buf_pool->init_flush[BUF_FLUSH_LRU]);
7013
7014 pool_info->n_pending_flush_list =
7015 (buf_pool->n_flush[BUF_FLUSH_LIST]
7016 + buf_pool->init_flush[BUF_FLUSH_LIST]);
7017
7018 pool_info->n_pending_flush_single_page =
7019 (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
7020 + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
7021
7022 buf_flush_list_mutex_exit(buf_pool);
7023
7024 current_time = time(NULL);
7025 time_elapsed = 0.001 + difftime(current_time,
7026 buf_pool->last_printout_time);
7027
7028 pool_info->n_pages_made_young = buf_pool->stat.n_pages_made_young;
7029
7030 pool_info->n_pages_not_made_young =
7031 buf_pool->stat.n_pages_not_made_young;
7032
7033 pool_info->n_pages_read = buf_pool->stat.n_pages_read;
7034
7035 pool_info->n_pages_created = buf_pool->stat.n_pages_created;
7036
7037 pool_info->n_pages_written = buf_pool->stat.n_pages_written;
7038
7039 pool_info->n_page_gets = buf_pool->stat.n_page_gets;
7040
7041 pool_info->n_ra_pages_read_rnd = buf_pool->stat.n_ra_pages_read_rnd;
7042 pool_info->n_ra_pages_read = buf_pool->stat.n_ra_pages_read;
7043
7044 pool_info->n_ra_pages_evicted = buf_pool->stat.n_ra_pages_evicted;
7045
7046 pool_info->page_made_young_rate =
7047 (buf_pool->stat.n_pages_made_young
7048 - buf_pool->old_stat.n_pages_made_young) / time_elapsed;
7049
7050 pool_info->page_not_made_young_rate =
7051 (buf_pool->stat.n_pages_not_made_young
7052 - buf_pool->old_stat.n_pages_not_made_young) / time_elapsed;
7053
7054 pool_info->pages_read_rate =
7055 (buf_pool->stat.n_pages_read
7056 - buf_pool->old_stat.n_pages_read) / time_elapsed;
7057
7058 pool_info->pages_created_rate =
7059 (buf_pool->stat.n_pages_created
7060 - buf_pool->old_stat.n_pages_created) / time_elapsed;
7061
7062 pool_info->pages_written_rate =
7063 (buf_pool->stat.n_pages_written
7064 - buf_pool->old_stat.n_pages_written) / time_elapsed;
7065
7066 pool_info->n_page_get_delta = buf_pool->stat.n_page_gets
7067 - buf_pool->old_stat.n_page_gets;
7068
7069 if (pool_info->n_page_get_delta) {
7070 pool_info->page_read_delta = buf_pool->stat.n_pages_read
7071 - buf_pool->old_stat.n_pages_read;
7072
7073 pool_info->young_making_delta =
7074 buf_pool->stat.n_pages_made_young
7075 - buf_pool->old_stat.n_pages_made_young;
7076
7077 pool_info->not_young_making_delta =
7078 buf_pool->stat.n_pages_not_made_young
7079 - buf_pool->old_stat.n_pages_not_made_young;
7080 }
7081 pool_info->pages_readahead_rnd_rate =
7082 (buf_pool->stat.n_ra_pages_read_rnd
7083 - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed;
7084
7085
7086 pool_info->pages_readahead_rate =
7087 (buf_pool->stat.n_ra_pages_read
7088 - buf_pool->old_stat.n_ra_pages_read) / time_elapsed;
7089
7090 pool_info->pages_evicted_rate =
7091 (buf_pool->stat.n_ra_pages_evicted
7092 - buf_pool->old_stat.n_ra_pages_evicted) / time_elapsed;
7093
7094 pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
7095
7096 pool_info->io_sum = buf_LRU_stat_sum.io;
7097
7098 pool_info->io_cur = buf_LRU_stat_cur.io;
7099
7100 pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
7101
7102 pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
7103
7104 buf_refresh_io_stats(buf_pool);
7105 buf_pool_mutex_exit(buf_pool);
7106 }
7107
7108 /*********************************************************************//**
7109 Prints info of the buffer i/o. */
7110 static
7111 void
7112 buf_print_io_instance(
7113 /*==================*/
7114 buf_pool_info_t*pool_info, /*!< in: buffer pool info */
7115 FILE* file) /*!< in/out: buffer where to print */
7116 {
7117 ut_ad(pool_info);
7118
7119 fprintf(file,
7120 "Buffer pool size " ULINTPF "\n"
7121 "Free buffers " ULINTPF "\n"
7122 "Database pages " ULINTPF "\n"
7123 "Old database pages " ULINTPF "\n"
7124 "Modified db pages " ULINTPF "\n"
7125 "Percent of dirty pages(LRU & free pages): %.3f\n"
7126 "Max dirty pages percent: %.3f\n"
7127 "Pending reads " ULINTPF "\n"
7128 "Pending writes: LRU " ULINTPF ", flush list " ULINTPF
7129 ", single page " ULINTPF "\n",
7130 pool_info->pool_size,
7131 pool_info->free_list_len,
7132 pool_info->lru_len,
7133 pool_info->old_lru_len,
7134 pool_info->flush_list_len,
7135 (((double) pool_info->flush_list_len) /
7136 (pool_info->lru_len + pool_info->free_list_len + 1.0)) * 100.0,
7137 srv_max_buf_pool_modified_pct,
7138 pool_info->n_pend_reads,
7139 pool_info->n_pending_flush_lru,
7140 pool_info->n_pending_flush_list,
7141 pool_info->n_pending_flush_single_page);
7142
7143 fprintf(file,
7144 "Pages made young " ULINTPF ", not young " ULINTPF "\n"
7145 "%.2f youngs/s, %.2f non-youngs/s\n"
7146 "Pages read " ULINTPF ", created " ULINTPF
7147 ", written " ULINTPF "\n"
7148 "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
7149 pool_info->n_pages_made_young,
7150 pool_info->n_pages_not_made_young,
7151 pool_info->page_made_young_rate,
7152 pool_info->page_not_made_young_rate,
7153 pool_info->n_pages_read,
7154 pool_info->n_pages_created,
7155 pool_info->n_pages_written,
7156 pool_info->pages_read_rate,
7157 pool_info->pages_created_rate,
7158 pool_info->pages_written_rate);
7159
7160 if (pool_info->n_page_get_delta) {
7161 double hit_rate = double(pool_info->page_read_delta)
7162 / pool_info->n_page_get_delta;
7163
7164 if (hit_rate > 1) {
7165 hit_rate = 1;
7166 }
7167
7168 fprintf(file,
7169 "Buffer pool hit rate " ULINTPF " / 1000,"
7170 " young-making rate " ULINTPF " / 1000 not "
7171 ULINTPF " / 1000\n",
7172 ulint(1000 * (1 - hit_rate)),
7173 ulint(1000 * double(pool_info->young_making_delta)
7174 / pool_info->n_page_get_delta),
7175 ulint(1000 * double(pool_info->not_young_making_delta)
7176 / pool_info->n_page_get_delta));
7177 } else {
7178 fputs("No buffer pool page gets since the last printout\n",
7179 file);
7180 }
7181
7182 /* Statistics about read ahead algorithm */
7183 fprintf(file, "Pages read ahead %.2f/s,"
7184 " evicted without access %.2f/s,"
7185 " Random read ahead %.2f/s\n",
7186
7187 pool_info->pages_readahead_rate,
7188 pool_info->pages_evicted_rate,
7189 pool_info->pages_readahead_rnd_rate);
7190
7191 /* Print some values to help us with visualizing what is
7192 happening with LRU eviction. */
7193 fprintf(file,
7194 "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n"
7195 "I/O sum[" ULINTPF "]:cur[" ULINTPF "], "
7196 "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n",
7197 pool_info->lru_len, pool_info->unzip_lru_len,
7198 pool_info->io_sum, pool_info->io_cur,
7199 pool_info->unzip_sum, pool_info->unzip_cur);
7200 }
7201
7202 /*********************************************************************//**
7203 Prints info of the buffer i/o. */
7204 void
7205 buf_print_io(
7206 /*=========*/
7207 FILE* file) /*!< in/out: buffer where to print */
7208 {
7209 ulint i;
7210 buf_pool_info_t* pool_info;
7211 buf_pool_info_t* pool_info_total;
7212
7213 /* If srv_buf_pool_instances is greater than 1, allocate
7214 one extra buf_pool_info_t, the last one stores
7215 aggregated/total values from all pools */
7216 if (srv_buf_pool_instances > 1) {
7217 pool_info = (buf_pool_info_t*) ut_zalloc_nokey((
7218 srv_buf_pool_instances + 1) * sizeof *pool_info);
7219
7220 pool_info_total = &pool_info[srv_buf_pool_instances];
7221 } else {
7222 ut_a(srv_buf_pool_instances == 1);
7223
7224 pool_info_total = pool_info =
7225 static_cast<buf_pool_info_t*>(
7226 ut_zalloc_nokey(sizeof *pool_info));
7227 }
7228
7229 for (i = 0; i < srv_buf_pool_instances; i++) {
7230 buf_pool_t* buf_pool;
7231
7232 buf_pool = buf_pool_from_array(i);
7233
7234 /* Fetch individual buffer pool info and calculate
7235 aggregated stats along the way */
7236 buf_stats_get_pool_info(buf_pool, i, pool_info);
7237
7238 /* If we have more than one buffer pool, store
7239 the aggregated stats */
7240 if (srv_buf_pool_instances > 1) {
7241 buf_stats_aggregate_pool_info(pool_info_total,
7242 &pool_info[i]);
7243 }
7244 }
7245
7246 /* Print the aggreate buffer pool info */
7247 buf_print_io_instance(pool_info_total, file);
7248
7249 /* If there are more than one buffer pool, print each individual pool
7250 info */
7251 if (srv_buf_pool_instances > 1) {
7252 fputs("----------------------\n"
7253 "INDIVIDUAL BUFFER POOL INFO\n"
7254 "----------------------\n", file);
7255
7256 for (i = 0; i < srv_buf_pool_instances; i++) {
7257 fprintf(file, "---BUFFER POOL " ULINTPF "\n", i);
7258 buf_print_io_instance(&pool_info[i], file);
7259 }
7260 }
7261
7262 ut_free(pool_info);
7263 }
7264
7265 /**********************************************************************//**
7266 Refreshes the statistics used to print per-second averages. */
7267 void
7268 buf_refresh_io_stats_all(void)
7269 /*==========================*/
7270 {
7271 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
7272 buf_pool_t* buf_pool;
7273
7274 buf_pool = buf_pool_from_array(i);
7275
7276 buf_refresh_io_stats(buf_pool);
7277 }
7278 }
7279
7280 /**********************************************************************//**
7281 Check if all pages in all buffer pools are in a replacable state.
7282 @return FALSE if not */
7283 ibool
7284 buf_all_freed(void)
7285 /*===============*/
7286 {
7287 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
7288 buf_pool_t* buf_pool;
7289
7290 buf_pool = buf_pool_from_array(i);
7291
7292 if (!buf_all_freed_instance(buf_pool)) {
7293 return(FALSE);
7294 }
7295 }
7296
7297 return(TRUE);
7298 }
7299
7300 /*********************************************************************//**
7301 Checks that there currently are no pending i/o-operations for the buffer
7302 pool.
7303 @return number of pending i/o */
7304 ulint
7305 buf_pool_check_no_pending_io(void)
7306 /*==============================*/
7307 {
7308 ulint i;
7309 ulint pending_io = 0;
7310
7311 buf_pool_mutex_enter_all();
7312
7313 for (i = 0; i < srv_buf_pool_instances; i++) {
7314 const buf_pool_t* buf_pool;
7315
7316 buf_pool = buf_pool_from_array(i);
7317
7318 pending_io += buf_pool->n_pend_reads
7319 + buf_pool->n_flush[BUF_FLUSH_LRU]
7320 + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
7321 + buf_pool->n_flush[BUF_FLUSH_LIST];
7322
7323 }
7324
7325 buf_pool_mutex_exit_all();
7326
7327 return(pending_io);
7328 }
7329
7330 /** Print the given page_id_t object.
7331 @param[in,out] out the output stream
7332 @param[in] page_id the page_id_t object to be printed
7333 @return the output stream */
7334 std::ostream&
7335 operator<<(
7336 std::ostream& out,
7337 const page_id_t page_id)
7338 {
7339 out << "[page id: space=" << page_id.m_space
7340 << ", page number=" << page_id.m_page_no << "]";
7341 return(out);
7342 }
7343
7344 #if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
7345 /** Print the given buf_pool_t object.
7346 @param[in,out] out the output stream
7347 @param[in] buf_pool the buf_pool_t object to be printed
7348 @return the output stream */
7349 std::ostream&
7350 operator<<(
7351 std::ostream& out,
7352 const buf_pool_t& buf_pool)
7353 {
7354 out << "[buffer pool instance: "
7355 << "buf_pool size=" << buf_pool.curr_size
7356 << ", database pages=" << UT_LIST_GET_LEN(buf_pool.LRU)
7357 << ", free pages=" << UT_LIST_GET_LEN(buf_pool.free)
7358 << ", modified database pages="
7359 << UT_LIST_GET_LEN(buf_pool.flush_list)
7360 << ", n pending decompressions=" << buf_pool.n_pend_unzip
7361 << ", n pending reads=" << buf_pool.n_pend_reads
7362 << ", n pending flush LRU=" << buf_pool.n_flush[BUF_FLUSH_LRU]
7363 << " list=" << buf_pool.n_flush[BUF_FLUSH_LIST]
7364 << " single page=" << buf_pool.n_flush[BUF_FLUSH_SINGLE_PAGE]
7365 << ", pages made young=" << buf_pool.stat.n_pages_made_young
7366 << ", not young=" << buf_pool.stat.n_pages_not_made_young
7367 << ", pages read=" << buf_pool.stat.n_pages_read
7368 << ", created=" << buf_pool.stat.n_pages_created
7369 << ", written=" << buf_pool.stat.n_pages_written << "]";
7370 return(out);
7371 }
7372 #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
7373
7374 /** Encrypt a buffer of temporary tablespace
7375 @param[in] offset Page offset
7376 @param[in] src_frame Page to encrypt
7377 @param[in,out] dst_frame Output buffer
7378 @return encrypted buffer or NULL */
7379 static byte* buf_tmp_page_encrypt(
7380 ulint offset,
7381 byte* src_frame,
7382 byte* dst_frame)
7383 {
7384 uint header_len = FIL_PAGE_DATA;
7385 /* FIL page header is not encrypted */
7386 memcpy(dst_frame, src_frame, header_len);
7387
7388 /* Calculate the start offset in a page */
7389 uint unencrypted_bytes = header_len + FIL_PAGE_DATA_END;
7390 uint srclen = srv_page_size - unencrypted_bytes;
7391 const byte* src = src_frame + header_len;
7392 byte* dst = dst_frame + header_len;
7393
7394 if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size),
7395 true)) {
7396 return NULL;
7397 }
7398
7399 memcpy(dst_frame + srv_page_size - FIL_PAGE_DATA_END,
7400 src_frame + srv_page_size - FIL_PAGE_DATA_END,
7401 FIL_PAGE_DATA_END);
7402
7403 /* Handle post encryption checksum */
7404 mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
7405 buf_calc_page_crc32(dst_frame));
7406
7407 srv_stats.pages_encrypted.inc();
7408 srv_stats.n_temp_blocks_encrypted.inc();
7409 return dst_frame;
7410 }
7411
7412 /** Encryption and page_compression hook that is called just before
7413 a page is written to disk.
7414 @param[in,out] space tablespace
7415 @param[in,out] bpage buffer page
7416 @param[in] src_frame physical page frame that is being encrypted
7417 @return page frame to be written to file
7418 (may be src_frame or an encrypted/compressed copy of it) */
7419 UNIV_INTERN
7420 byte*
7421 buf_page_encrypt_before_write(
7422 fil_space_t* space,
7423 buf_page_t* bpage,
7424 byte* src_frame)
7425 {
7426 ut_ad(space->id == bpage->id.space());
7427 bpage->real_size = srv_page_size;
7428
7429 fil_page_type_validate(src_frame);
7430
7431 switch (bpage->id.page_no()) {
7432 case 0:
7433 /* Page 0 of a tablespace is not encrypted/compressed */
7434 return src_frame;
7435 case TRX_SYS_PAGE_NO:
7436 if (bpage->id.space() == TRX_SYS_SPACE) {
7437 /* don't encrypt/compress page as it contains
7438 address to dblwr buffer */
7439 return src_frame;
7440 }
7441 }
7442
7443 fil_space_crypt_t* crypt_data = space->crypt_data;
7444
7445 bool encrypted, page_compressed;
7446
7447 if (space->purpose == FIL_TYPE_TEMPORARY) {
7448 ut_ad(!crypt_data);
7449 encrypted = innodb_encrypt_temporary_tables;
7450 page_compressed = false;
7451 } else {
7452 encrypted = crypt_data
7453 && !crypt_data->not_encrypted()
7454 && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
7455 && (!crypt_data->is_default_encryption()
7456 || srv_encrypt_tables);
7457
7458 page_compressed = FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags);
7459 }
7460
7461 if (!encrypted && !page_compressed) {
7462 /* No need to encrypt or page compress the page.
7463 Clear key-version & crypt-checksum. */
7464 memset(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
7465 return src_frame;
7466 }
7467
7468 ut_ad(!bpage->size.is_compressed() || !page_compressed);
7469 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
7470 /* Find free slot from temporary memory array */
7471 buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool);
7472 slot->out_buf = NULL;
7473 bpage->slot = slot;
7474
7475 buf_tmp_reserve_crypt_buf(slot);
7476 byte *dst_frame = slot->crypt_buf;
7477
7478 if (!page_compressed) {
7479 not_compressed:
7480 byte* tmp;
7481 if (space->purpose == FIL_TYPE_TEMPORARY) {
7482 /* Encrypt temporary tablespace page content */
7483 tmp = buf_tmp_page_encrypt(bpage->id.page_no(),
7484 src_frame, dst_frame);
7485 } else {
7486 /* Encrypt page content */
7487 tmp = fil_space_encrypt(
7488 space, bpage->id.page_no(),
7489 bpage->newest_modification,
7490 src_frame, dst_frame);
7491 }
7492
7493 bpage->real_size = srv_page_size;
7494 slot->out_buf = dst_frame = tmp;
7495
7496 ut_d(fil_page_type_validate(tmp));
7497 } else {
7498 ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
7499 /* First we compress the page content */
7500 buf_tmp_reserve_compression_buf(slot);
7501 byte* tmp = slot->comp_buf;
7502 ulint out_len = fil_page_compress(
7503 src_frame, tmp,
7504 fsp_flags_get_page_compression_level(space->flags),
7505 fil_space_get_block_size(space, bpage->id.page_no()),
7506 encrypted);
7507 if (!out_len) {
7508 goto not_compressed;
7509 }
7510
7511 bpage->real_size = out_len;
7512
7513 /* Workaround for MDEV-15527. */
7514 memset(tmp + out_len, 0 , srv_page_size - out_len);
7515 ut_d(fil_page_type_validate(tmp));
7516
7517 if (encrypted) {
7518 /* And then we encrypt the page content */
7519 tmp = fil_space_encrypt(space,
7520 bpage->id.page_no(),
7521 bpage->newest_modification,
7522 tmp,
7523 dst_frame);
7524 }
7525
7526 slot->out_buf = dst_frame = tmp;
7527 }
7528
7529 ut_d(fil_page_type_validate(dst_frame));
7530
7531 // return dst_frame which will be written
7532 return dst_frame;
7533 }
7534
7535 /**
7536 Should we punch hole to deallocate unused portion of the page.
7537 @param[in] bpage Page control block
7538 @return true if punch hole should be used, false if not */
7539 bool
7540 buf_page_should_punch_hole(
7541 const buf_page_t* bpage)
7542 {
7543 return (bpage->real_size != bpage->size.physical());
7544 }
7545
7546 /**
7547 Calculate the length of trim (punch_hole) operation.
7548 @param[in] bpage Page control block
7549 @param[in] write_length Write length
7550 @return length of the trim or zero. */
7551 ulint
7552 buf_page_get_trim_length(
7553 const buf_page_t* bpage,
7554 ulint write_length)
7555 {
7556 return (bpage->size.physical() - write_length);
7557 }
7558 #endif /* !UNIV_INNOCHECKSUM */
7559