1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2008, Google Inc.
5 Copyright (c) 2013, 2021, MariaDB Corporation.
6 
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12 
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; version 2 of the License.
16 
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file buf/buf0buf.cc
29 The database buffer buf_pool
30 
31 Created 11/5/1995 Heikki Tuuri
32 *******************************************************/
33 
34 #include "mtr0types.h"
35 #include "mach0data.h"
36 #include "page0size.h"
37 #include "buf0buf.h"
38 #include <string.h>
39 
40 #ifdef UNIV_NONINL
41 #include "buf0buf.inl"
42 #endif
43 
44 #ifndef UNIV_INNOCHECKSUM
45 #include "mem0mem.h"
46 #include "btr0btr.h"
47 #include "fil0fil.h"
48 #include "fil0crypt.h"
49 #include "buf0buddy.h"
50 #include "lock0lock.h"
51 #include "sync0rw.h"
52 #include "btr0sea.h"
53 #include "ibuf0ibuf.h"
54 #include "trx0undo.h"
55 #include "trx0purge.h"
56 #include "log0log.h"
57 #include "dict0stats_bg.h"
58 #include "srv0srv.h"
59 #include "srv0start.h"
60 #include "dict0dict.h"
61 #include "log0recv.h"
62 #include "srv0mon.h"
63 #include "log0crypt.h"
64 #endif /* !UNIV_INNOCHECKSUM */
65 #include "page0zip.h"
66 #include "sync0sync.h"
67 #include "buf0dump.h"
68 #include <new>
69 #include <map>
70 #include <sstream>
71 #ifndef UNIV_INNOCHECKSUM
72 #include "fil0pagecompress.h"
73 #include "fsp0pagecompress.h"
74 #endif
75 #include "ut0byte.h"
76 #include <new>
77 
78 #ifdef UNIV_LINUX
79 #include <stdlib.h>
80 #endif
81 
82 #ifdef HAVE_LZO
83 #include "lzo/lzo1x.h"
84 #endif
85 
86 using st_::span;
87 
88 #ifdef HAVE_LIBNUMA
89 #include <numa.h>
90 #include <numaif.h>
91 struct set_numa_interleave_t
92 {
93 	set_numa_interleave_t()
94 	{
95 		if (srv_numa_interleave) {
96 
97 			struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
98 			ib::info() << "Setting NUMA memory policy to"
99 				" MPOL_INTERLEAVE";
100 			if (set_mempolicy(MPOL_INTERLEAVE,
101 					  numa_mems_allowed->maskp,
102 					  numa_mems_allowed->size) != 0) {
103 
104 				ib::warn() << "Failed to set NUMA memory"
105 					" policy to MPOL_INTERLEAVE: "
106 					<< strerror(errno);
107 			}
108 			numa_bitmask_free(numa_mems_allowed);
109 		}
110 	}
111 
112 	~set_numa_interleave_t()
113 	{
114 		if (srv_numa_interleave) {
115 
116 			ib::info() << "Setting NUMA memory policy to"
117 				" MPOL_DEFAULT";
118 			if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
119 				ib::warn() << "Failed to set NUMA memory"
120 					" policy to MPOL_DEFAULT: "
121 					<< strerror(errno);
122 			}
123 		}
124 	}
125 };
126 
127 #define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa
128 #else
129 #define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
130 #endif /* HAVE_LIBNUMA */
131 
132 #ifdef HAVE_SNAPPY
133 #include "snappy-c.h"
134 #endif
135 
136 /*
137 		IMPLEMENTATION OF THE BUFFER POOL
138 		=================================
139 
140 Performance improvement:
141 ------------------------
142 Thread scheduling in NT may be so slow that the OS wait mechanism should
143 not be used even in waiting for disk reads to complete.
144 Rather, we should put waiting query threads to the queue of
145 waiting jobs, and let the OS thread do something useful while the i/o
146 is processed. In this way we could remove most OS thread switches in
147 an i/o-intensive benchmark like TPC-C.
148 
149 A possibility is to put a user space thread library between the database
150 and NT. User space thread libraries might be very fast.
151 
152 SQL Server 7.0 can be configured to use 'fibers' which are lightweight
153 threads in NT. These should be studied.
154 
155 		Buffer frames and blocks
156 		------------------------
157 Following the terminology of Gray and Reuter, we call the memory
158 blocks where file pages are loaded buffer frames. For each buffer
159 frame there is a control block, or shortly, a block, in the buffer
160 control array. The control info which does not need to be stored
161 in the file along with the file page, resides in the control block.
162 
163 		Buffer pool struct
164 		------------------
165 The buffer buf_pool contains a single mutex which protects all the
166 control data structures of the buf_pool. The content of a buffer frame is
167 protected by a separate read-write lock in its control block, though.
168 These locks can be locked and unlocked without owning the buf_pool->mutex.
169 The OS events in the buf_pool struct can be waited for without owning the
170 buf_pool->mutex.
171 
172 The buf_pool->mutex is a hot-spot in main memory, causing a lot of
173 memory bus traffic on multiprocessor systems when processors
174 alternately access the mutex. On our Pentium, the mutex is accessed
175 maybe every 10 microseconds. We gave up the solution to have mutexes
176 for each control block, for instance, because it seemed to be
177 complicated.
178 
179 A solution to reduce mutex contention of the buf_pool->mutex is to
180 create a separate mutex for the page hash table. On Pentium,
181 accessing the hash table takes 2 microseconds, about half
182 of the total buf_pool->mutex hold time.
183 
184 		Control blocks
185 		--------------
186 
187 The control block contains, for instance, the bufferfix count
188 which is incremented when a thread wants a file page to be fixed
189 in a buffer frame. The bufferfix operation does not lock the
190 contents of the frame, however. For this purpose, the control
191 block contains a read-write lock.
192 
193 The buffer frames have to be aligned so that the start memory
194 address of a frame is divisible by the universal page size, which
195 is a power of two.
196 
197 We intend to make the buffer buf_pool size on-line reconfigurable,
198 that is, the buf_pool size can be changed without closing the database.
199 Then the database administarator may adjust it to be bigger
200 at night, for example. The control block array must
201 contain enough control blocks for the maximum buffer buf_pool size
202 which is used in the particular database.
203 If the buf_pool size is cut, we exploit the virtual memory mechanism of
204 the OS, and just refrain from using frames at high addresses. Then the OS
205 can swap them to disk.
206 
207 The control blocks containing file pages are put to a hash table
208 according to the file address of the page.
209 We could speed up the access to an individual page by using
210 "pointer swizzling": we could replace the page references on
211 non-leaf index pages by direct pointers to the page, if it exists
212 in the buf_pool. We could make a separate hash table where we could
213 chain all the page references in non-leaf pages residing in the buf_pool,
214 using the page reference as the hash key,
215 and at the time of reading of a page update the pointers accordingly.
216 Drawbacks of this solution are added complexity and,
217 possibly, extra space required on non-leaf pages for memory pointers.
218 A simpler solution is just to speed up the hash table mechanism
219 in the database, using tables whose size is a power of 2.
220 
221 		Lists of blocks
222 		---------------
223 
224 There are several lists of control blocks.
225 
226 The free list (buf_pool->free) contains blocks which are currently not
227 used.
228 
229 The common LRU list contains all the blocks holding a file page
230 except those for which the bufferfix count is non-zero.
231 The pages are in the LRU list roughly in the order of the last
232 access to the page, so that the oldest pages are at the end of the
233 list. We also keep a pointer to near the end of the LRU list,
234 which we can use when we want to artificially age a page in the
235 buf_pool. This is used if we know that some page is not needed
236 again for some time: we insert the block right after the pointer,
237 causing it to be replaced sooner than would normally be the case.
238 Currently this aging mechanism is used for read-ahead mechanism
239 of pages, and it can also be used when there is a scan of a full
240 table which cannot fit in the memory. Putting the pages near the
241 end of the LRU list, we make sure that most of the buf_pool stays
242 in the main memory, undisturbed.
243 
244 The unzip_LRU list contains a subset of the common LRU list.  The
245 blocks on the unzip_LRU list hold a compressed file page and the
246 corresponding uncompressed page frame.  A block is in unzip_LRU if and
247 only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
248 holds.  The blocks in unzip_LRU will be in same order as they are in
249 the common LRU list.  That is, each manipulation of the common LRU
250 list will result in the same manipulation of the unzip_LRU list.
251 
252 The chain of modified blocks (buf_pool->flush_list) contains the blocks
253 holding file pages that have been modified in the memory
254 but not written to disk yet. The block with the oldest modification
255 which has not yet been written to disk is at the end of the chain.
256 The access to this list is protected by buf_pool->flush_list_mutex.
257 
BlockingInitFIFO(PaUtilRingBuffer * rbuf,long numFrames,long bytesPerFrame)258 The chain of unmodified compressed blocks (buf_pool->zip_clean)
259 contains the control blocks (buf_page_t) of those compressed pages
260 that are not in buf_pool->flush_list and for which no uncompressed
261 page has been allocated in the buffer pool.  The control blocks for
262 uncompressed pages are accessible via buf_block_t objects that are
263 reachable via buf_pool->chunks[].
264 
265 The chains of free memory blocks (buf_pool->zip_free[]) are used by
266 the buddy allocator (buf0buddy.cc) to keep track of currently unused
267 memory blocks of size sizeof(buf_page_t)..srv_page_size / 2.  These
268 blocks are inside the srv_page_size-sized memory blocks of type
269 BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
270 pool.  The buddy allocator is solely used for allocating control
271 blocks for compressed pages (buf_page_t) and compressed page frames.
272 
273 		Loading a file page
274 		-------------------
275 
276 First, a victim block for replacement has to be found in the
277 buf_pool. It is taken from the free list or searched for from the
278 end of the LRU-list. An exclusive lock is reserved for the frame,
279 the io_fix field is set in the block fixing the block in buf_pool,
280 and the io-operation for loading the page is queued. The io-handler thread
281 releases the X-lock on the frame and resets the io_fix field
282 when the io operation completes.
283 
284 A thread may request the above operation using the function
285 buf_page_get(). It may then continue to request a lock on the frame.
286 The lock is granted when the io-handler releases the x-lock.
287 
288 		Read-ahead
289 		----------
290 
291 The read-ahead mechanism is intended to be intelligent and
292 isolated from the semantically higher levels of the database
293 index management. From the higher level we only need the
294 information if a file page has a natural successor or
295 predecessor page. On the leaf level of a B-tree index,
296 these are the next and previous pages in the natural
297 order of the pages.
298 
299 Let us first explain the read-ahead mechanism when the leafs
300 of a B-tree are scanned in an ascending or descending order.
301 When a read page is the first time referenced in the buf_pool,
302 the buffer manager checks if it is at the border of a so-called
303 linear read-ahead area. The tablespace is divided into these
304 areas of size 64 blocks, for example. So if the page is at the
305 border of such an area, the read-ahead mechanism checks if
306 all the other blocks in the area have been accessed in an
307 ascending or descending order. If this is the case, the system
308 looks at the natural successor or predecessor of the page,
309 checks if that is at the border of another area, and in this case
310 issues read-requests for all the pages in that area. Maybe
311 we could relax the condition that all the pages in the area
312 have to be accessed: if data is deleted from a table, there may
313 appear holes of unused pages in the area.
314 
315 A different read-ahead mechanism is used when there appears
316 to be a random access pattern to a file.
317 If a new page is referenced in the buf_pool, and several pages
318 of its random access area (for instance, 32 consecutive pages
319 in a tablespace) have recently been referenced, we may predict
320 that the whole area may be needed in the near future, and issue
321 the read requests for the whole area.
322 */
323 
324 #ifndef UNIV_INNOCHECKSUM
325 /** Value in microseconds */
326 static const int WAIT_FOR_READ	= 100;
327 static const int WAIT_FOR_WRITE = 100;
328 /** Number of attempts made to read in a page in the buffer pool */
329 static const ulint	BUF_PAGE_READ_MAX_RETRIES = 100;
330 /** Number of pages to read ahead */
331 static const ulint	BUF_READ_AHEAD_PAGES = 64;
332 /** The maximum portion of the buffer pool that can be used for the
333 read-ahead buffer.  (Divide buf_pool size by this amount) */
334 static const ulint	BUF_READ_AHEAD_PORTION = 32;
335 
336 /** The buffer pools of the database */
337 buf_pool_t*	buf_pool_ptr;
338 
339 /** true when resizing buffer pool is in the critical path. */
340 volatile bool	buf_pool_resizing;
341 
342 /** Map of buffer pool chunks by its first frame address
343 This is newly made by initialization of buffer pool and buf_resize_thread.
344 Currently, no need mutex protection for update. */
345 typedef std::map<
346 	const byte*,
BlockingEnd(PaJackStream * stream)347 	buf_chunk_t*,
348 	std::less<const byte*>,
349 	ut_allocator<std::pair<const byte* const, buf_chunk_t*> > >
350 	buf_pool_chunk_map_t;
351 
352 static buf_pool_chunk_map_t*			buf_chunk_map_reg;
353 
354 /** Chunk map to be used to lookup.
BlockingReadStream(PaStream * s,void * data,unsigned long numFrames)355 The map pointed by this should not be updated */
356 static buf_pool_chunk_map_t*	buf_chunk_map_ref = NULL;
357 
358 #ifdef UNIV_DEBUG
359 /** Disable resizing buffer pool to make assertion code not expensive. */
360 my_bool			buf_disable_resize_buffer_pool_debug = TRUE;
361 #endif /* UNIV_DEBUG */
362 
363 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
364 /** This is used to insert validation operations in execution
365 in the debug version */
366 static ulint	buf_dbg_counter	= 0;
367 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
368 
369 #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
370 # ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
371 
372 /* Buffer block mutexes and rwlocks can be registered
373 in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
374 is defined, register buffer block mutex and rwlock
375 in one group after their initialization. */
376 #  define PFS_GROUP_BUFFER_SYNC
377 
378 /* This define caps the number of mutexes/rwlocks can
379 be registered with performance schema. Developers can
380 modify this define if necessary. Please note, this would
BlockingWriteStream(PaStream * s,const void * data,unsigned long numFrames)381 be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
382 #  define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER	ULINT_MAX
383 
384 # endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
385 #endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
386 
387 /** Macro to determine whether the read of write counter is used depending
388 on the io_type */
389 #define MONITOR_RW_COUNTER(io_type, counter)		\
390 	((io_type == BUF_IO_READ)			\
391 	 ? (counter##_READ)				\
392 	 : (counter##_WRITTEN))
393 
394 
395 /** Reserve a buffer slot for encryption, decryption or page compression.
396 @param[in,out]	buf_pool	buffer pool
397 @return reserved buffer slot */
398 static buf_tmp_buffer_t* buf_pool_reserve_tmp_slot(buf_pool_t* buf_pool)
399 {
400 	for (ulint i = 0; i < buf_pool->tmp_arr->n_slots; i++) {
401 		buf_tmp_buffer_t* slot = &buf_pool->tmp_arr->slots[i];
402 		if (slot->acquire()) {
403 			return slot;
404 		}
405 	}
406 
407 	/* We assume that free slot is found */
408 	ut_error;
409 	return NULL;
410 }
411 
412 /** Reserve a buffer for encryption, decryption or decompression.
413 @param[in,out]	slot	reserved slot */
414 static void buf_tmp_reserve_crypt_buf(buf_tmp_buffer_t* slot)
415 {
416 	if (!slot->crypt_buf) {
417 		slot->crypt_buf = static_cast<byte*>(
418 			aligned_malloc(srv_page_size, srv_page_size));
419 	}
BlockingGetStreamReadAvailable(PaStream * s)420 }
421 
422 /** Reserve a buffer for compression.
423 @param[in,out]	slot	reserved slot */
424 static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
425 {
426 	if (!slot->comp_buf) {
427 		/* Both snappy and lzo compression methods require that
428 		output buffer used for compression is bigger than input
429 		buffer. Increase the allocated buffer size accordingly. */
430 		ulint size = srv_page_size;
431 #ifdef HAVE_LZO
432 		size += LZO1X_1_15_MEM_COMPRESS;
433 #elif defined HAVE_SNAPPY
434 		size = snappy_max_compressed_length(size);
435 #endif
436 		slot->comp_buf = static_cast<byte*>(
437 			aligned_malloc(size, srv_page_size));
438 	}
439 }
440 
441 /** Registers a chunk to buf_pool_chunk_map
442 @param[in]	chunk	chunk of buffers */
443 static
444 void
445 buf_pool_register_chunk(
446 	buf_chunk_t*	chunk)
447 {
448 	buf_chunk_map_reg->insert(buf_pool_chunk_map_t::value_type(
449 		chunk->blocks->frame, chunk));
450 }
451 
452 /** Decrypt a page for temporary tablespace.
453 @param[in,out]	tmp_frame	Temporary buffer
454 @param[in]	src_frame	Page to decrypt
455 @return true if temporary tablespace decrypted, false if not */
456 static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
457 {
458 	if (buf_is_zeroes(span<const byte>(src_frame, srv_page_size))) {
BuildDeviceList(PaJackHostApiRepresentation * jackApi)459 		return true;
460 	}
461 
462 	/* read space & lsn */
463 	uint header_len = FIL_PAGE_DATA;
464 
465 	/* Copy FIL page header, it is not encrypted */
466 	memcpy(tmp_frame, src_frame, header_len);
467 
468 	/* Calculate the offset where decryption starts */
469 	const byte* src = src_frame + header_len;
470 	byte* dst = tmp_frame + header_len;
471 	uint srclen = uint(srv_page_size)
472 		- header_len - FIL_PAGE_DATA_END;
473 	ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
474 
475 	if (!log_tmp_block_decrypt(src, srclen, dst,
476 				   (offset * srv_page_size))) {
477 		return false;
478 	}
479 
480 	memcpy(tmp_frame + srv_page_size - FIL_PAGE_DATA_END,
481 	       src_frame + srv_page_size - FIL_PAGE_DATA_END,
482 	       FIL_PAGE_DATA_END);
483 
484 	memcpy(src_frame, tmp_frame, srv_page_size);
485 	srv_stats.pages_decrypted.inc();
486 	srv_stats.n_temp_blocks_decrypted.inc();
487 
488 	return true; /* page was decrypted */
489 }
490 
491 /** Decrypt a page.
492 @param[in,out]	bpage	Page control block
493 @param[in,out]	space	tablespace
494 @return whether the operation was successful */
495 static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space)
496 {
497 	ut_ad(space->pending_io());
498 	ut_ad(space->id == bpage->id.space());
499 
500 	byte* dst_frame = bpage->zip.data ? bpage->zip.data :
501 		((buf_block_t*) bpage)->frame;
502 	bool page_compressed = fil_page_is_compressed(dst_frame);
503 	buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
504 
505 	if (bpage->id.page_no() == 0) {
506 		/* File header pages are not encrypted/compressed */
507 		return (true);
508 	}
509 
510 	if (space->purpose == FIL_TYPE_TEMPORARY
511 	    && innodb_encrypt_temporary_tables) {
512 		buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool);
513 		buf_tmp_reserve_crypt_buf(slot);
514 
515 		if (!buf_tmp_page_decrypt(slot->crypt_buf, dst_frame)) {
516 			slot->release();
517 			ib::error() << "Encrypted page " << bpage->id
518 				    << " in file " << space->chain.start->name;
519 			return false;
520 		}
521 
522 		slot->release();
523 		return true;
524 	}
525 
526 	/* Page is encrypted if encryption information is found from
527 	tablespace and page contains used key_version. This is true
528 	also for pages first compressed and then encrypted. */
529 
530 	buf_tmp_buffer_t* slot;
531 
532 	if (page_compressed) {
533 		/* the page we read is unencrypted */
534 		/* Find free slot from temporary memory array */
535 decompress:
536 		slot = buf_pool_reserve_tmp_slot(buf_pool);
537 		/* For decompression, use crypt_buf. */
538 		buf_tmp_reserve_crypt_buf(slot);
539 decompress_with_slot:
540 		ut_d(fil_page_type_validate(dst_frame));
541 
542 		ulint write_size = fil_page_decompress(slot->crypt_buf,
543 						       dst_frame);
544 		slot->release();
545 
546 		ut_ad(!write_size || fil_page_type_validate(dst_frame));
547 		ut_ad(space->pending_io());
548 		return write_size != 0;
549 	}
550 
551 	if (space->crypt_data
552 	    && mach_read_from_4(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
553 			       + dst_frame)) {
554 		/* Verify encryption checksum before we even try to
555 		decrypt. */
556 		if (!fil_space_verify_crypt_checksum(dst_frame, bpage->size)) {
557 decrypt_failed:
558 			ib::error() << "Encrypted page " << bpage->id
559 				    << " in file " << space->chain.start->name
560 				    << " looks corrupted; key_version="
561 				    << mach_read_from_4(
562 					    FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
563 					    + dst_frame);
564 			return false;
565 		}
566 
567 		/* Find free slot from temporary memory array */
568 		slot = buf_pool_reserve_tmp_slot(buf_pool);
569 		buf_tmp_reserve_crypt_buf(slot);
570 
571 		ut_d(fil_page_type_validate(dst_frame));
572 
573 		/* decrypt using crypt_buf to dst_frame */
574 		if (!fil_space_decrypt(space, slot->crypt_buf, dst_frame)) {
575 			slot->release();
576 			goto decrypt_failed;
577 		}
578 
579 		ut_d(fil_page_type_validate(dst_frame));
580 
581 		if (fil_page_is_compressed_encrypted(dst_frame)) {
582 			goto decompress_with_slot;
583 		}
584 
585 		slot->release();
586 	} else if (fil_page_is_compressed_encrypted(dst_frame)) {
587 		goto decompress;
588 	}
589 
590 	ut_ad(space->pending_io());
591 	return true;
592 }
593 
594 /********************************************************************//**
595 Gets the smallest oldest_modification lsn for any page in the pool. Returns
596 zero if all modified pages have been flushed to disk.
597 @return oldest modification in pool, zero if none */
598 lsn_t
599 buf_pool_get_oldest_modification(void)
600 /*==================================*/
601 {
602 	lsn_t		lsn = 0;
603 	lsn_t		oldest_lsn = 0;
604 
605 	/* When we traverse all the flush lists we don't want another
606 	thread to add a dirty page to any flush list. */
607 	log_flush_order_mutex_enter();
608 
609 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
610 		buf_pool_t*	buf_pool;
611 
612 		buf_pool = buf_pool_from_array(i);
613 
614 		buf_flush_list_mutex_enter(buf_pool);
615 
616 		buf_page_t*	bpage;
617 
618 		/* We don't let log-checkpoint halt because pages from system
619 		temporary are not yet flushed to the disk. Anyway, object
620 		residing in system temporary doesn't generate REDO logging. */
621 		for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
622 		     bpage != NULL
623 			&& fsp_is_system_temporary(bpage->id.space());
624 		     bpage = UT_LIST_GET_PREV(list, bpage)) {
625 			/* Do nothing. */
626 		}
627 
628 		if (bpage != NULL) {
629 			ut_ad(bpage->in_flush_list);
630 			lsn = bpage->oldest_modification;
631 		}
632 
633 		buf_flush_list_mutex_exit(buf_pool);
634 
635 		if (!oldest_lsn || oldest_lsn > lsn) {
636 			oldest_lsn = lsn;
637 		}
638 	}
639 
640 	log_flush_order_mutex_exit();
641 
642 	/* The returned answer may be out of date: the flush_list can
643 	change after the mutex has been released. */
644 
UpdateSampleRate(PaJackStream * stream,double sampleRate)645 	return(oldest_lsn);
646 }
647 
648 /********************************************************************//**
649 Get total buffer pool statistics. */
650 void
651 buf_get_total_list_len(
JackErrorCallback(const char * msg)652 /*===================*/
653 	ulint*		LRU_len,	/*!< out: length of all LRU lists */
654 	ulint*		free_len,	/*!< out: length of all free lists */
655 	ulint*		flush_list_len)	/*!< out: length of all flush lists */
656 {
657 	ulint		i;
658 
659 	*LRU_len = 0;
660 	*free_len = 0;
661 	*flush_list_len = 0;
662 
663 	for (i = 0; i < srv_buf_pool_instances; i++) {
664 		buf_pool_t*	buf_pool;
665 
666 		buf_pool = buf_pool_from_array(i);
667 
668 		*LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
669 		*free_len += UT_LIST_GET_LEN(buf_pool->free);
670 		*flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
671 	}
672 }
673 
674 /********************************************************************//**
675 Get total list size in bytes from all buffer pools. */
676 void
677 buf_get_total_list_size_in_bytes(
678 /*=============================*/
679 	buf_pools_list_size_t*	buf_pools_list_size)	/*!< out: list sizes
680 							in all buffer pools */
JackSrCb(jack_nframes_t nframes,void * arg)681 {
682 	ut_ad(buf_pools_list_size);
683 	memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));
684 
685 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
686 		buf_pool_t*	buf_pool;
687 
688 		buf_pool = buf_pool_from_array(i);
689 		/* We don't need mutex protection since this is
690 		for statistics purpose */
691 		buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes;
692 		buf_pools_list_size->unzip_LRU_bytes +=
693 			UT_LIST_GET_LEN(buf_pool->unzip_LRU)
694 			<< srv_page_size_shift;
695 		buf_pools_list_size->flush_list_bytes +=
696 			buf_pool->stat.flush_list_bytes;
697 	}
698 }
699 
700 /********************************************************************//**
JackXRunCb(void * arg)701 Get total buffer pool statistics. */
702 void
703 buf_get_total_stat(
704 /*===============*/
705 	buf_pool_stat_t*	tot_stat)	/*!< out: buffer pool stats */
706 {
707 	ulint			i;
708 
709 	memset(tot_stat, 0, sizeof(*tot_stat));
710 
711 	for (i = 0; i < srv_buf_pool_instances; i++) {
712 		buf_pool_stat_t*buf_stat;
713 		buf_pool_t*	buf_pool;
714 
715 		buf_pool = buf_pool_from_array(i);
716 
717 		buf_stat = &buf_pool->stat;
718 		tot_stat->n_page_gets += buf_stat->n_page_gets;
719 		tot_stat->n_pages_read += buf_stat->n_pages_read;
720 		tot_stat->n_pages_written += buf_stat->n_pages_written;
721 		tot_stat->n_pages_created += buf_stat->n_pages_created;
722 		tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd;
723 		tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
724 		tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
725 		tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;
726 
727 		tot_stat->n_pages_not_made_young +=
728 			buf_stat->n_pages_not_made_young;
729 	}
730 }
731 
732 /********************************************************************//**
733 Allocates a buffer block.
734 @return own: the allocated block, in state BUF_BLOCK_MEMORY */
735 buf_block_t*
736 buf_block_alloc(
737 /*============*/
738 	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance,
739 					or NULL for round-robin selection
740 					of the buffer pool */
741 {
742 	buf_block_t*	block;
743 	ulint		index;
744 	static ulint	buf_pool_index;
745 
746 	if (buf_pool == NULL) {
747 		/* We are allocating memory from any buffer pool, ensure
748 		we spread the grace on all buffer pool instances. */
749 		index = buf_pool_index++ % srv_buf_pool_instances;
750 		buf_pool = buf_pool_from_array(index);
751 	}
752 
753 	block = buf_LRU_get_free_block(buf_pool);
754 
755 	buf_block_set_state(block, BUF_BLOCK_MEMORY);
756 
757 	return(block);
758 }
759 #endif /* !UNIV_INNOCHECKSUM */
760 
761 /** Checks if the page is in crc32 checksum format.
762 @param[in]	read_buf		database page
763 @param[in]	checksum_field1		new checksum field
764 @param[in]	checksum_field2		old checksum field
765 @return true if the page is in crc32 checksum format. */
766 bool
767 buf_page_is_checksum_valid_crc32(
768 	const byte*			read_buf,
769 	ulint				checksum_field1,
770 	ulint				checksum_field2)
771 {
772 	const uint32_t	crc32 = buf_calc_page_crc32(read_buf);
773 
774 #ifdef UNIV_INNOCHECKSUM
775 	if (log_file
776 	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
777 		fprintf(log_file, "page::" UINT32PF ";"
778 			" crc32 calculated = " UINT32PF ";"
779 			" recorded checksum field1 = " ULINTPF " recorded"
780 			" checksum field2 =" ULINTPF "\n", cur_page_num,
781 			crc32, checksum_field1, checksum_field2);
782 	}
783 #endif /* UNIV_INNOCHECKSUM */
784 
785 	if (checksum_field1 != checksum_field2) {
786 		return false;
787 	}
788 
789 	return checksum_field1 == crc32
790 #ifdef INNODB_BUG_ENDIAN_CRC32
791 		|| checksum_field1 == buf_calc_page_crc32(read_buf, true)
792 #endif
793 		;
794 }
795 
796 /** Checks if the page is in innodb checksum format.
797 @param[in]	read_buf	database page
798 @param[in]	checksum_field1	new checksum field
799 @param[in]	checksum_field2	old checksum field
800 @return true if the page is in innodb checksum format. */
801 bool
802 buf_page_is_checksum_valid_innodb(
803 	const byte*			read_buf,
804 	ulint				checksum_field1,
805 	ulint				checksum_field2)
806 {
807 	/* There are 2 valid formulas for
808 	checksum_field2 (old checksum field) which algo=innodb could have
809 	written to the page:
810 
811 	1. Very old versions of InnoDB only stored 8 byte lsn to the
812 	start and the end of the page.
813 
Terminate(struct PaUtilHostApiRepresentation * hostApi)814 	2. Newer InnoDB versions store the old formula checksum
815 	(buf_calc_page_old_checksum()). */
816 
817 	ulint	old_checksum = buf_calc_page_old_checksum(read_buf);
818 	ulint	new_checksum = buf_calc_page_new_checksum(read_buf);
819 
820 #ifdef UNIV_INNOCHECKSUM
821 	if (log_file
822 	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
823 		fprintf(log_file, "page::" UINT32PF ";"
824 			" old style: calculated ="
825 			" " ULINTPF "; recorded = " ULINTPF "\n",
826 			cur_page_num, old_checksum,
827 			checksum_field2);
828 		fprintf(log_file, "page::" UINT32PF ";"
829 			" new style: calculated ="
830 			" " ULINTPF "; crc32 = " UINT32PF "; recorded = " ULINTPF "\n",
831 			cur_page_num, new_checksum,
832 			buf_calc_page_crc32(read_buf), checksum_field1);
833 	}
834 
835 	if (log_file
836 	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
837 		fprintf(log_file, "page::" UINT32PF ";"
838 			" old style: calculated ="
IsFormatSupported(struct PaUtilHostApiRepresentation * hostApi,const PaStreamParameters * inputParameters,const PaStreamParameters * outputParameters,double sampleRate)839 			" " ULINTPF "; recorded checksum = " ULINTPF "\n",
840 			cur_page_num, old_checksum,
841 			checksum_field2);
842 		fprintf(log_file, "page::" UINT32PF ";"
843 			" new style: calculated ="
844 			" " ULINTPF "; recorded checksum  = " ULINTPF "\n",
845 			cur_page_num, new_checksum,
846 			checksum_field1);
847 	}
848 #endif /* UNIV_INNOCHECKSUM */
849 
850 
851 	if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
852 	    && checksum_field2 != old_checksum) {
853 		DBUG_LOG("checksum",
854 			 "Page checksum crc32 not valid"
855 			 << " field1 " << checksum_field1
856 			 << " field2 " << checksum_field2
857 			 << " crc32 " << buf_calc_page_old_checksum(read_buf)
858 			 << " lsn " << mach_read_from_4(
859 				 read_buf + FIL_PAGE_LSN));
860 		return(false);
861 	}
862 
863 	/* old field is fine, check the new field */
864 
865 	/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
866 	(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
867 
868 	if (checksum_field1 != 0 && checksum_field1 != new_checksum) {
869 		DBUG_LOG("checksum",
870 			 "Page checksum crc32 not valid"
871 			 << " field1 " << checksum_field1
872 			 << " field2 " << checksum_field2
873 			 << " crc32 " << buf_calc_page_new_checksum(read_buf)
874 			 << " lsn " << mach_read_from_4(
875 				 read_buf + FIL_PAGE_LSN));
876 		return(false);
877 	}
878 
879 	return(true);
880 }
881 
882 /** Checks if the page is in none checksum format.
883 @param[in]	read_buf	database page
884 @param[in]	checksum_field1	new checksum field
885 @param[in]	checksum_field2	old checksum field
886 @return true if the page is in none checksum format. */
887 bool
888 buf_page_is_checksum_valid_none(
889 	const byte*			read_buf,
890 	ulint				checksum_field1,
891 	ulint				checksum_field2)
892 {
893 #ifndef DBUG_OFF
894 	if (checksum_field1 != checksum_field2
895 	    && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
896 		DBUG_LOG("checksum",
897 			 "Page checksum crc32 not valid"
898 			 << " field1 " << checksum_field1
899 			 << " field2 " << checksum_field2
900 			 << " crc32 " << BUF_NO_CHECKSUM_MAGIC
901 			 << " lsn " << mach_read_from_4(read_buf
902 							+ FIL_PAGE_LSN));
903 	}
904 #endif /* DBUG_OFF */
905 
906 #ifdef UNIV_INNOCHECKSUM
907 	if (log_file
908 	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) {
909 		fprintf(log_file,
910 			"page::" UINT32PF "; none checksum: calculated"
911 			" = %lu; recorded checksum_field1 = " ULINTPF
912 			" recorded checksum_field2 = " ULINTPF "\n",
913 			cur_page_num, BUF_NO_CHECKSUM_MAGIC,
914 			checksum_field1, checksum_field2);
915 	}
916 #endif /* UNIV_INNOCHECKSUM */
917 
918 	return(checksum_field1 == checksum_field2
919 	       && checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
920 }
921 
922 #ifdef INNODB_BUG_ENDIAN_CRC32
923 /** Validate the CRC-32C checksum of a page.
924 @param[in]	page		buffer page (srv_page_size bytes)
925 @param[in]	checksum	CRC-32C checksum stored on page
InitializeStream(PaJackStream * stream,PaJackHostApiRepresentation * hostApi,int numInputChannels,int numOutputChannels)926 @return	computed checksum */
927 static uint32_t buf_page_check_crc32(const byte* page, uint32_t checksum)
928 {
929 	uint32_t crc32 = buf_calc_page_crc32(page);
930 
931 	if (checksum != crc32) {
932 		crc32 = buf_calc_page_crc32(page, true);
933 	}
934 
935 	return crc32;
936 }
937 #else /* INNODB_BUG_ENDIAN_CRC32 */
938 /** Validate the CRC-32C checksum of a page.
939 @param[in]	page		buffer page (srv_page_size bytes)
940 @param[in]	checksum	CRC-32C checksum stored on page
941 @return	computed checksum */
942 # define buf_page_check_crc32(page, checksum) buf_calc_page_crc32(page)
943 #endif /* INNODB_BUG_ENDIAN_CRC32 */
944 
945 
946 /** Check if a buffer is all zeroes.
947 @param[in]	buf	data to check
948 @return whether the buffer is all zeroes */
949 bool buf_is_zeroes(span<const byte> buf)
950 {
951   ut_ad(buf.size() <= sizeof field_ref_zero);
952   return memcmp(buf.data(), field_ref_zero, buf.size()) == 0;
953 }
954 
955 /** Check if a page is corrupt.
956 @param[in]	check_lsn	whether the LSN should be checked
957 @param[in]	read_buf	database page
958 @param[in]	page_size	page size
959 @param[in]	space		tablespace
960 @return whether the page is corrupted */
961 bool
962 buf_page_is_corrupted(
963 	bool			check_lsn,
964 	const byte*		read_buf,
965 	const page_size_t&	page_size,
966 #ifndef UNIV_INNOCHECKSUM
967 	const fil_space_t* 	space)
968 #else
969 	const void* 	 	space)
970 #endif
971 {
CleanUpStream(PaJackStream * stream,int terminateStreamRepresentation,int terminateBufferProcessor)972 	ut_ad(page_size.logical() == srv_page_size);
973 #ifndef UNIV_INNOCHECKSUM
974 	DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(true); );
975 #endif
976 	size_t		checksum_field1 = 0;
977 	size_t		checksum_field2 = 0;
978 	uint32_t	crc32 = 0;
979 	bool		crc32_inited = false;
980 
981 	ulint page_type = mach_read_from_2(read_buf + FIL_PAGE_TYPE);
982 
983 	/* We can trust page type if page compression is set on tablespace
984 	flags because page compression flag means file must have been
985 	created with 10.1 (later than 5.5 code base). In 10.1 page
986 	compressed tables do not contain post compression checksum and
987 	FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can
988 	be null if we are in fil_check_first_page() and first page
989 	is not compressed or encrypted. Page checksum is verified
990 	after decompression (i.e. normally pages are already
991 	decompressed at this stage). */
992 	if ((page_type == FIL_PAGE_PAGE_COMPRESSED ||
993 	     page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
994 #ifndef UNIV_INNOCHECKSUM
995 	    && space && FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags)
996 #endif
997 	) {
998 		return(false);
999 	}
1000 
1001 	if (!page_size.is_compressed()
1002 	    && memcmp(read_buf + FIL_PAGE_LSN + 4,
1003 		      read_buf + page_size.logical()
WaitCondition(PaJackHostApiRepresentation * hostApi)1004 		      - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
1005 
1006 		/* Stored log sequence numbers at the start and the end
1007 		of page do not match */
1008 
1009 		return(true);
1010 	}
1011 
1012 #ifndef UNIV_INNOCHECKSUM
1013 	if (check_lsn && recv_lsn_checks_on) {
1014 		lsn_t		current_lsn;
1015 		const lsn_t	page_lsn
1016 			= mach_read_from_8(read_buf + FIL_PAGE_LSN);
1017 
1018 		/* Since we are going to reset the page LSN during the import
1019 		phase it makes no sense to spam the log with error messages. */
1020 
1021 		if (log_peek_lsn(&current_lsn) && current_lsn < page_lsn) {
1022 
1023 			const ulint	space_id = mach_read_from_4(
AddStream(PaJackStream * stream)1024 				read_buf + FIL_PAGE_SPACE_ID);
1025 			const ulint	page_no = mach_read_from_4(
1026 				read_buf + FIL_PAGE_OFFSET);
1027 
1028 			ib::error() << "Page " << page_id_t(space_id, page_no)
1029 				<< " log sequence number " << page_lsn
1030 				<< " is in the future! Current system"
1031 				<< " log sequence number "
1032 				<< current_lsn << ".";
1033 
1034 			ib::error() << "Your database may be corrupt or"
1035 				" you may have copied the InnoDB"
1036 				" tablespace but not the InnoDB"
1037 				" log files. "
1038 				<< FORCE_RECOVERY_MSG;
1039 
1040 		}
1041 	}
1042 #endif /* !UNIV_INNOCHECKSUM */
1043 
1044 	/* Check whether the checksum fields have correct values */
1045 
RemoveStream(PaJackStream * stream)1046 	const srv_checksum_algorithm_t curr_algo =
1047 		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
1048 
1049 	if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
1050 		return(false);
1051 	}
1052 
1053 	if (page_size.is_compressed()) {
1054 		return(!page_zip_verify_checksum(read_buf,
1055 						 page_size.physical()));
1056 	}
1057 
1058 	checksum_field1 = mach_read_from_4(
1059 		read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
1060 
1061 	checksum_field2 = mach_read_from_4(
1062 		read_buf + page_size.logical() - FIL_PAGE_END_LSN_OLD_CHKSUM);
1063 
1064 	compile_time_assert(!(FIL_PAGE_LSN % 8));
1065 
1066 	/* A page filled with NUL bytes is considered not corrupted.
OpenStream(struct PaUtilHostApiRepresentation * hostApi,PaStream ** s,const PaStreamParameters * inputParameters,const PaStreamParameters * outputParameters,double sampleRate,unsigned long framesPerBuffer,PaStreamFlags streamFlags,PaStreamCallback * streamCallback,void * userData)1067 	The FIL_PAGE_FILE_FLUSH_LSN field may be written nonzero for
1068 	the first page of the system tablespace.
1069 	Ignore it for the system tablespace. */
1070 	if (!checksum_field1 && !checksum_field2) {
1071 		/* Checksum fields can have valid value as zero.
1072 		If the page is not empty then do the checksum
1073 		calculation for the page. */
1074 		bool all_zeroes = true;
1075 		for (size_t i = 0; i < srv_page_size; i++) {
1076 #ifndef UNIV_INNOCHECKSUM
1077 			if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
1078 			    && (!space || !space->id)) {
1079 				i += 8;
1080 			}
1081 #endif
1082 			if (read_buf[i]) {
1083 				all_zeroes = false;
1084 				break;
1085 			}
1086 		}
1087 
1088 		if (all_zeroes) {
1089 			return false;
1090 		}
1091 	}
1092 
1093 	switch (curr_algo) {
1094 	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
1095 		return !buf_page_is_checksum_valid_crc32(
1096 			read_buf, checksum_field1, checksum_field2);
1097 	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
1098 		return !buf_page_is_checksum_valid_innodb(
1099 			read_buf, checksum_field1, checksum_field2);
1100 	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
1101 		return !buf_page_is_checksum_valid_none(
1102 			read_buf, checksum_field1, checksum_field2);
1103 	case SRV_CHECKSUM_ALGORITHM_CRC32:
1104 	case SRV_CHECKSUM_ALGORITHM_INNODB:
1105 		if (buf_page_is_checksum_valid_none(read_buf,
1106 			checksum_field1, checksum_field2)) {
1107 #ifdef UNIV_INNOCHECKSUM
1108 			if (log_file) {
1109 				fprintf(log_file, "page::" UINT32PF ";"
1110 					" old style: calculated = %u;"
1111 					" recorded = " ULINTPF ";\n",
1112 					cur_page_num,
1113 					buf_calc_page_old_checksum(read_buf),
1114 					checksum_field2);
1115 				fprintf(log_file, "page::" UINT32PF ";"
1116 					" new style: calculated = " UINT32PF ";"
1117 					" crc32 = " UINT32PF "; recorded = " ULINTPF ";\n",
1118 					cur_page_num,
1119 					buf_calc_page_new_checksum(read_buf),
1120 					buf_calc_page_crc32(read_buf),
1121 					checksum_field1);
1122 			}
1123 #endif /* UNIV_INNOCHECKSUM */
1124 			return false;
1125 		}
1126 
1127 		/* Very old versions of InnoDB only stored 8 byte lsn to the
1128 		start and the end of the page. */
1129 
1130 		/* Since innodb_checksum_algorithm is not strict_* allow
1131 		any of the algos to match for the old field */
1132 
1133 		if (checksum_field2
1134 		    != mach_read_from_4(read_buf + FIL_PAGE_LSN)
1135 		    && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
1136 
1137 			if (curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32) {
1138 				DBUG_EXECUTE_IF(
1139 					"page_intermittent_checksum_mismatch", {
1140 					static int page_counter;
1141 					if (page_counter++ == 2) {
1142 						checksum_field2++;
1143 					}
1144 				});
1145 
1146 				crc32 = buf_page_check_crc32(read_buf,
1147 							     checksum_field2);
1148 				crc32_inited = true;
1149 
1150 				if (checksum_field2 != crc32
1151 				    && checksum_field2
1152 				       != buf_calc_page_old_checksum(read_buf)) {
1153 					return true;
1154 				}
1155 			} else {
1156 				ut_ad(curr_algo
1157 				      == SRV_CHECKSUM_ALGORITHM_INNODB);
1158 
1159 				if (checksum_field2
1160 				    != buf_calc_page_old_checksum(read_buf)) {
1161 					crc32 = buf_page_check_crc32(
1162 						read_buf, checksum_field2);
1163 					crc32_inited = true;
1164 
1165 					if (checksum_field2 != crc32) {
1166 						return true;
1167 					}
1168 				}
1169 			}
1170 		}
1171 
1172 		if (checksum_field1 == 0
1173 		    || checksum_field1 == BUF_NO_CHECKSUM_MAGIC) {
1174 		} else if (curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32) {
1175 			if (!crc32_inited) {
1176 				crc32 = buf_page_check_crc32(
1177 					read_buf, checksum_field2);
1178 				crc32_inited = true;
1179 			}
1180 
1181 			if (checksum_field1 != crc32
1182 			    && checksum_field1
1183 			    != buf_calc_page_new_checksum(read_buf)) {
1184 				return true;
1185 			}
1186 		} else {
1187 			ut_ad(curr_algo == SRV_CHECKSUM_ALGORITHM_INNODB);
1188 
1189 			if (checksum_field1
1190 			    != buf_calc_page_new_checksum(read_buf)) {
1191 
1192 				if (!crc32_inited) {
1193 					crc32 = buf_page_check_crc32(
1194 						read_buf, checksum_field2);
1195 					crc32_inited = true;
1196 				}
1197 
1198 				if (checksum_field1 != crc32) {
1199 					return true;
1200 				}
1201 			}
1202 		}
1203 
1204 		if (crc32_inited
1205 		    && ((checksum_field1 == crc32
1206 			 && checksum_field2 != crc32)
1207 			|| (checksum_field1 != crc32
1208 			    && checksum_field2 == crc32))) {
1209 			return true;
1210 		}
1211 
1212 		break;
1213 	case SRV_CHECKSUM_ALGORITHM_NONE:
1214 		/* should have returned false earlier */
1215 		break;
1216 	}
1217 
1218 	return false;
1219 }
1220 
1221 #ifndef UNIV_INNOCHECKSUM
1222 
1223 #if defined(DBUG_OFF) && defined(HAVE_MADVISE) &&  defined(MADV_DODUMP)
1224 /** Enable buffers to be dumped to core files
1225 
1226 A convience function, not called anyhwere directly however
1227 it is left available for gdb or any debugger to call
1228 in the event that you want all of the memory to be dumped
1229 to a core file.
1230 
1231 Returns number of errors found in madvise calls. */
1232 int
1233 buf_madvise_do_dump()
1234 {
1235 	int ret= 0;
1236 	buf_pool_t*	buf_pool;
1237 	buf_chunk_t*	chunk;
1238 
1239 	/* mirrors allocation in log_t::create() */
1240 	if (log_sys.buf) {
1241 		ret += madvise(log_sys.buf,
1242 			       srv_log_buffer_size,
1243 			       MADV_DODUMP);
1244 		ret += madvise(log_sys.flush_buf,
1245 			       srv_log_buffer_size,
1246 			       MADV_DODUMP);
1247 	}
1248 	/* mirrors recv_sys_init() */
1249 	if (recv_sys->buf)
1250 	{
1251 		ret+= madvise(recv_sys->buf, recv_sys->len, MADV_DODUMP);
1252 	}
1253 
1254 	buf_pool_mutex_enter_all();
1255 
1256 	for (ulong i= 0; i < srv_buf_pool_instances; i++)
1257 	{
1258 		buf_pool = buf_pool_from_array(i);
1259 		chunk = buf_pool->chunks;
1260 
1261 		for (int n = buf_pool->n_chunks; n--; chunk++)
1262 		{
1263 			ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP);
1264 		}
1265 	}
1266 
1267 	buf_pool_mutex_exit_all();
1268 
1269 	return ret;
1270 }
1271 #endif
1272 
1273 /** Dump a page to stderr.
1274 @param[in]	read_buf	database page
1275 @param[in]	page_size	page size */
1276 UNIV_INTERN
1277 void
1278 buf_page_print(const byte* read_buf, const page_size_t& page_size)
1279 {
1280 	dict_index_t*	index;
1281 
1282 #ifndef UNIV_DEBUG
1283 	ib::info() << "Page dump in ascii and hex ("
1284 		<< page_size.physical() << " bytes):";
1285 
1286 	ut_print_buf(stderr, read_buf, page_size.physical());
1287 	fputs("\nInnoDB: End of page dump\n", stderr);
1288 #endif
1289 
1290 	if (page_size.is_compressed()) {
1291 		/* Print compressed page. */
1292 		ib::info() << "Compressed page type ("
1293 			<< fil_page_get_type(read_buf)
1294 			<< "); stored checksum in field1 "
1295 			<< mach_read_from_4(
1296 				read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
1297 			<< "; calculated checksums for field1: "
1298 			<< buf_checksum_algorithm_name(
1299 				SRV_CHECKSUM_ALGORITHM_CRC32)
1300 			<< " "
1301 			<< page_zip_calc_checksum(
1302 				read_buf, page_size.physical(),
1303 				SRV_CHECKSUM_ALGORITHM_CRC32)
1304 #ifdef INNODB_BUG_ENDIAN_CRC32
1305 			<< "/"
1306 			<< page_zip_calc_checksum(
1307 				read_buf, page_size.physical(),
1308 				SRV_CHECKSUM_ALGORITHM_CRC32, true)
1309 #endif
1310 			<< ", "
1311 			<< buf_checksum_algorithm_name(
1312 				SRV_CHECKSUM_ALGORITHM_INNODB)
1313 			<< " "
1314 			<< page_zip_calc_checksum(
1315 				read_buf, page_size.physical(),
1316 				SRV_CHECKSUM_ALGORITHM_INNODB)
1317 			<< ", "
1318 			<< buf_checksum_algorithm_name(
1319 				SRV_CHECKSUM_ALGORITHM_NONE)
1320 			<< " "
1321 			<< page_zip_calc_checksum(
1322 				read_buf, page_size.physical(),
1323 				SRV_CHECKSUM_ALGORITHM_NONE)
1324 			<< "; page LSN "
1325 			<< mach_read_from_8(read_buf + FIL_PAGE_LSN)
1326 			<< "; page number (if stored to page"
1327 			<< " already) "
1328 			<< mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
1329 			<< "; space id (if stored to page already) "
CloseStream(PaStream * s)1330 			<< mach_read_from_4(
1331 				read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
1332 
1333 	} else {
1334 		const uint32_t	crc32 = buf_calc_page_crc32(read_buf);
1335 #ifdef INNODB_BUG_ENDIAN_CRC32
1336 		const uint32_t	crc32_legacy = buf_calc_page_crc32(read_buf,
1337 								   true);
1338 #endif /* INNODB_BUG_ENDIAN_CRC32 */
1339 		ulint page_type = fil_page_get_type(read_buf);
1340 
1341 		ib::info() << "Uncompressed page, stored checksum in field1 "
1342 			<< mach_read_from_4(
RealProcess(PaJackStream * stream,jack_nframes_t frames)1343 				read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
1344 			<< ", calculated checksums for field1: "
1345 			<< buf_checksum_algorithm_name(
1346 				SRV_CHECKSUM_ALGORITHM_CRC32) << " "
1347 			<< crc32
1348 #ifdef INNODB_BUG_ENDIAN_CRC32
1349 			<< "/" << crc32_legacy
1350 #endif
1351 			<< ", "
1352 			<< buf_checksum_algorithm_name(
1353 				SRV_CHECKSUM_ALGORITHM_INNODB) << " "
1354 			<< buf_calc_page_new_checksum(read_buf)
1355 			<< ", "
1356 			<< " page type " << page_type << " == "
1357 			<< fil_get_page_type_name(page_type) << "."
1358 			<< buf_checksum_algorithm_name(
1359 				SRV_CHECKSUM_ALGORITHM_NONE) << " "
1360 			<< BUF_NO_CHECKSUM_MAGIC
1361 			<< ", stored checksum in field2 "
1362 			<< mach_read_from_4(read_buf + page_size.logical()
1363 					    - FIL_PAGE_END_LSN_OLD_CHKSUM)
1364 			<< ", calculated checksums for field2: "
1365 			<< buf_checksum_algorithm_name(
1366 				SRV_CHECKSUM_ALGORITHM_CRC32) << " "
1367 			<< crc32
1368 #ifdef INNODB_BUG_ENDIAN_CRC32
1369 			<< "/" << crc32_legacy
1370 #endif
1371 			<< ", "
1372 			<< buf_checksum_algorithm_name(
1373 				SRV_CHECKSUM_ALGORITHM_INNODB) << " "
1374 			<< buf_calc_page_old_checksum(read_buf)
1375 			<< ", "
1376 			<< buf_checksum_algorithm_name(
1377 				SRV_CHECKSUM_ALGORITHM_NONE) << " "
1378 			<< BUF_NO_CHECKSUM_MAGIC
1379 			<< ",  page LSN "
1380 			<< mach_read_from_4(read_buf + FIL_PAGE_LSN)
1381 			<< " "
1382 			<< mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
1383 			<< ", low 4 bytes of LSN at page end "
1384 			<< mach_read_from_4(read_buf + page_size.logical()
1385 					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)
1386 			<< ", page number (if stored to page already) "
1387 			<< mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
1388 			<< ", space id (if created with >= MySQL-4.1.1"
1389 			   " and stored already) "
1390 			<< mach_read_from_4(
1391 				read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
1392 	}
1393 
1394 	switch (fil_page_get_type(read_buf)) {
1395 		index_id_t	index_id;
1396 	case FIL_PAGE_INDEX:
1397 	case FIL_PAGE_TYPE_INSTANT:
1398 	case FIL_PAGE_RTREE:
1399 		index_id = btr_page_get_index_id(read_buf);
1400 		ib::info() << "Page may be an index page where"
1401 			" index id is " << index_id;
1402 
1403 		index = dict_index_find_on_id_low(index_id);
1404 		if (index) {
1405 			ib::info()
1406 				<< "Index " << index_id
1407 				<< " is " << index->name
1408 				<< " in table " << index->table->name;
1409 		}
1410 		break;
1411 	case FIL_PAGE_UNDO_LOG:
1412 		fputs("InnoDB: Page may be an undo log page\n", stderr);
1413 		break;
1414 	case FIL_PAGE_INODE:
1415 		fputs("InnoDB: Page may be an 'inode' page\n", stderr);
1416 		break;
1417 	case FIL_PAGE_IBUF_FREE_LIST:
1418 		fputs("InnoDB: Page may be an insert buffer free list page\n",
1419 		      stderr);
1420 		break;
1421 	case FIL_PAGE_TYPE_ALLOCATED:
1422 		fputs("InnoDB: Page may be a freshly allocated page\n",
UpdateQueue(PaJackHostApiRepresentation * hostApi)1423 		      stderr);
1424 		break;
1425 	case FIL_PAGE_IBUF_BITMAP:
1426 		fputs("InnoDB: Page may be an insert buffer bitmap page\n",
1427 		      stderr);
1428 		break;
1429 	case FIL_PAGE_TYPE_SYS:
1430 		fputs("InnoDB: Page may be a system page\n",
1431 		      stderr);
1432 		break;
1433 	case FIL_PAGE_TYPE_TRX_SYS:
1434 		fputs("InnoDB: Page may be a transaction system page\n",
1435 		      stderr);
1436 		break;
1437 	case FIL_PAGE_TYPE_FSP_HDR:
1438 		fputs("InnoDB: Page may be a file space header page\n",
1439 		      stderr);
1440 		break;
1441 	case FIL_PAGE_TYPE_XDES:
1442 		fputs("InnoDB: Page may be an extent descriptor page\n",
1443 		      stderr);
1444 		break;
1445 	case FIL_PAGE_TYPE_BLOB:
1446 		fputs("InnoDB: Page may be a BLOB page\n",
1447 		      stderr);
1448 		break;
1449 	case FIL_PAGE_TYPE_ZBLOB:
1450 	case FIL_PAGE_TYPE_ZBLOB2:
1451 		fputs("InnoDB: Page may be a compressed BLOB page\n",
1452 		      stderr);
1453 		break;
1454 	}
1455 }
1456 
1457 # ifdef PFS_GROUP_BUFFER_SYNC
1458 extern mysql_pfs_key_t	buffer_block_mutex_key;
1459 
1460 /********************************************************************//**
1461 This function registers mutexes and rwlocks in buffer blocks with
1462 performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is
1463 defined to be a value less than chunk->size, then only mutexes
1464 and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER
1465 blocks are registered. */
1466 static
1467 void
1468 pfs_register_buffer_block(
1469 /*======================*/
1470 	buf_chunk_t*	chunk)		/*!< in/out: chunk of buffers */
1471 {
1472 	buf_block_t*    block;
1473 	ulint		num_to_register;
1474 
1475 	block = chunk->blocks;
1476 
1477 	num_to_register = ut_min(
1478 		chunk->size, PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);
1479 
1480 	for (ulint i = 0; i < num_to_register; i++) {
1481 #  ifdef UNIV_PFS_MUTEX
1482 		BPageMutex*	mutex;
1483 
1484 		mutex = &block->mutex;
1485 		mutex->pfs_add(buffer_block_mutex_key);
1486 #  endif /* UNIV_PFS_MUTEX */
1487 
1488 		rw_lock_t*	rwlock;
1489 
1490 #  ifdef UNIV_PFS_RWLOCK
1491 		rwlock = &block->lock;
1492 		ut_a(!rwlock->pfs_psi);
1493 		rwlock->pfs_psi = (PSI_server)
1494 			? PSI_server->init_rwlock(buf_block_lock_key, rwlock)
1495 			: NULL;
1496 
1497 #   ifdef UNIV_DEBUG
1498 		rwlock = &block->debug_latch;
1499 		ut_a(!rwlock->pfs_psi);
1500 		rwlock->pfs_psi = (PSI_server)
JackCallback(jack_nframes_t frames,void * userData)1501 			? PSI_server->init_rwlock(buf_block_debug_latch_key,
1502 						  rwlock)
1503 			: NULL;
1504 #   endif /* UNIV_DEBUG */
1505 
1506 #  endif /* UNIV_PFS_RWLOCK */
1507 		block++;
1508 	}
1509 }
1510 # endif /* PFS_GROUP_BUFFER_SYNC */
1511 
1512 /********************************************************************//**
1513 Initializes a buffer control block when the buf_pool is created. */
1514 static
1515 void
1516 buf_block_init(
1517 /*===========*/
1518 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1519 	buf_block_t*	block,		/*!< in: pointer to control block */
1520 	byte*		frame)		/*!< in: pointer to buffer frame */
1521 {
1522 	/* This function should only be executed at database startup or by
1523 	buf_pool_resize(). Either way, adaptive hash index must not exist. */
1524 	assert_block_ahi_empty_on_init(block);
1525 
1526 	block->frame = frame;
1527 
1528 	block->page.buf_pool_index = buf_pool_index(buf_pool);
1529 	block->page.flush_type = BUF_FLUSH_LRU;
1530 	block->page.state = BUF_BLOCK_NOT_USED;
1531 	block->page.buf_fix_count = 0;
1532 	block->page.io_fix = BUF_IO_NONE;
1533 	block->page.flush_observer = NULL;
1534 	block->page.real_size = 0;
1535 	block->modify_clock = 0;
1536 	block->page.slot = NULL;
1537 
1538 	ut_d(block->page.file_page_was_freed = FALSE);
1539 
1540 #ifdef BTR_CUR_HASH_ADAPT
1541 	block->index = NULL;
1542 #endif /* BTR_CUR_HASH_ADAPT */
1543 	ut_d(block->page.in_page_hash = FALSE);
1544 	ut_d(block->page.in_zip_hash = FALSE);
1545 	ut_d(block->page.in_flush_list = FALSE);
1546 	ut_d(block->page.in_free_list = FALSE);
1547 	ut_d(block->page.in_LRU_list = FALSE);
1548 	ut_d(block->in_unzip_LRU_list = FALSE);
1549 	ut_d(block->in_withdraw_list = FALSE);
1550 
1551 	page_zip_des_init(&block->page.zip);
1552 
1553 	mutex_create(LATCH_ID_BUF_BLOCK_MUTEX, &block->mutex);
1554 
1555 #if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
1556 	/* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
1557 	of buffer block rwlock with performance schema.
1558 
1559 	If PFS_GROUP_BUFFER_SYNC is defined, skip the registration
1560 	since buffer block rwlock will be registered later in
1561 	pfs_register_buffer_block(). */
1562 
1563 	rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
1564 
1565 	ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, &block->debug_latch,
1566 			    SYNC_LEVEL_VARYING));
1567 
1568 #else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
1569 
1570 	rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);
1571 
1572 	ut_d(rw_lock_create(buf_block_debug_latch_key,
1573 			    &block->debug_latch, SYNC_LEVEL_VARYING));
1574 
1575 #endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
1576 
1577 	block->lock.is_block_lock = 1;
1578 
1579 	ut_ad(rw_lock_validate(&(block->lock)));
1580 }
1581 
1582 /********************************************************************//**
1583 Allocates a chunk of buffer frames.
1584 @return chunk, or NULL on failure */
1585 static
1586 buf_chunk_t*
1587 buf_chunk_init(
1588 /*===========*/
1589 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1590 	buf_chunk_t*	chunk,		/*!< out: chunk of buffers */
1591 	ulint		mem_size)	/*!< in: requested size in bytes */
1592 {
StartStream(PaStream * s)1593 	buf_block_t*	block;
1594 	byte*		frame;
1595 	ulint		i;
1596 
1597 	/* Round down to a multiple of page size,
1598 	although it already should be. */
1599 	mem_size = ut_2pow_round<ulint>(mem_size, srv_page_size);
1600 	/* Reserve space for the block descriptors. */
1601 	mem_size += ut_2pow_round<ulint>((mem_size >> srv_page_size_shift)
1602 					 * (sizeof *block)
1603 					 + (srv_page_size - 1),
1604 					 srv_page_size);
1605 
1606 	DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return(NULL););
1607 
1608 	chunk->mem = buf_pool->allocator.allocate_large_dontdump(mem_size, &chunk->mem_pfx);
1609 
1610 	if (UNIV_UNLIKELY(chunk->mem == NULL)) {
1611 
1612 		return(NULL);
1613 	}
1614 
1615 #ifdef HAVE_LIBNUMA
1616 	if (srv_numa_interleave) {
1617 		struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
1618 		int	st = mbind(chunk->mem, chunk->mem_size(),
1619 				   MPOL_INTERLEAVE,
1620 				   numa_mems_allowed->maskp,
1621 				   numa_mems_allowed->size,
1622 				   MPOL_MF_MOVE);
1623 		if (st != 0) {
1624 			ib::warn() << "Failed to set NUMA memory policy of"
1625 				" buffer pool page frames to MPOL_INTERLEAVE"
1626 				" (error: " << strerror(errno) << ").";
1627 		}
1628 		numa_bitmask_free(numa_mems_allowed);
1629 	}
1630 #endif /* HAVE_LIBNUMA */
1631 
1632 
1633 	/* Allocate the block descriptors from
1634 	the start of the memory block. */
1635 	chunk->blocks = (buf_block_t*) chunk->mem;
1636 
1637 	/* Align a pointer to the first frame.  Note that when
1638 	opt_large_page_size is smaller than srv_page_size,
1639 	we may allocate one fewer block than requested.  When
1640 	it is bigger, we may allocate more blocks than requested. */
1641 
1642 	frame = (byte*) ut_align(chunk->mem, srv_page_size);
1643 	chunk->size = (chunk->mem_pfx.m_size >> srv_page_size_shift)
1644 		- (frame != chunk->mem);
1645 
1646 	/* Subtract the space needed for block descriptors. */
1647 	{
1648 		ulint	size = chunk->size;
1649 
1650 		while (frame < (byte*) (chunk->blocks + size)) {
1651 			frame += srv_page_size;
1652 			size--;
1653 		}
1654 
1655 		chunk->size = size;
RealStop(PaJackStream * stream,int abort)1656 	}
1657 
1658 	/* Init block structs and assign frames for them. Then we
1659 	assign the frames to the first blocks (we already mapped the
1660 	memory above). */
1661 
1662 	block = chunk->blocks;
1663 
1664 	for (i = chunk->size; i--; ) {
1665 
1666 		buf_block_init(buf_pool, block, frame);
1667 		MEM_UNDEFINED(block->frame, srv_page_size);
1668 
1669 		/* Add the block to the free list */
1670 		UT_LIST_ADD_LAST(buf_pool->free, &block->page);
1671 
1672 		ut_d(block->page.in_free_list = TRUE);
1673 		ut_ad(buf_pool_from_block(block) == buf_pool);
1674 
1675 		block++;
1676 		frame += srv_page_size;
1677 	}
1678 
1679 	buf_pool_register_chunk(chunk);
1680 
1681 #ifdef PFS_GROUP_BUFFER_SYNC
1682 	pfs_register_buffer_block(chunk);
1683 #endif /* PFS_GROUP_BUFFER_SYNC */
1684 	return(chunk);
1685 }
1686 
1687 #ifdef UNIV_DEBUG
1688 /*********************************************************************//**
1689 Finds a block in the given buffer chunk that points to a
1690 given compressed page.
1691 @return buffer block pointing to the compressed page, or NULL */
1692 static
1693 buf_block_t*
1694 buf_chunk_contains_zip(
1695 /*===================*/
1696 	buf_chunk_t*	chunk,	/*!< in: chunk being checked */
1697 	const void*	data)	/*!< in: pointer to compressed page */
1698 {
1699 	buf_block_t*	block;
1700 	ulint		i;
1701 
1702 	block = chunk->blocks;
1703 
1704 	for (i = chunk->size; i--; block++) {
1705 		if (block->page.zip.data == data) {
1706 
StopStream(PaStream * s)1707 			return(block);
1708 		}
1709 	}
1710 
1711 	return(NULL);
1712 }
AbortStream(PaStream * s)1713 
1714 /*********************************************************************//**
1715 Finds a block in the buffer pool that points to a
1716 given compressed page.
1717 @return buffer block pointing to the compressed page, or NULL */
1718 buf_block_t*
1719 buf_pool_contains_zip(
1720 /*==================*/
1721 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1722 	const void*	data)		/*!< in: pointer to compressed page */
1723 {
1724 	ulint		n;
1725 	buf_chunk_t*	chunk = buf_pool->chunks;
1726 
1727 	ut_ad(buf_pool);
1728 	ut_ad(buf_pool_mutex_own(buf_pool));
1729 	for (n = buf_pool->n_chunks; n--; chunk++) {
1730 
1731 		buf_block_t* block = buf_chunk_contains_zip(chunk, data);
1732 
1733 		if (block) {
1734 			return(block);
1735 		}
1736 	}
1737 
1738 	return(NULL);
1739 }
1740 #endif /* UNIV_DEBUG */
1741 
GetStreamCpuLoad(PaStream * s)1742 /*********************************************************************//**
1743 Checks that all file pages in the buffer chunk are in a replaceable state.
1744 @return address of a non-free block, or NULL if all freed */
1745 static
1746 const buf_block_t*
1747 buf_chunk_not_freed(
1748 /*================*/
1749 	buf_chunk_t*	chunk)	/*!< in: chunk being checked */
1750 {
1751 	buf_block_t*	block;
1752 	ulint		i;
1753 
1754 	block = chunk->blocks;
1755 
1756 	for (i = chunk->size; i--; block++) {
1757 		ibool	ready;
1758 
1759 		switch (buf_block_get_state(block)) {
1760 		case BUF_BLOCK_POOL_WATCH:
1761 		case BUF_BLOCK_ZIP_PAGE:
1762 		case BUF_BLOCK_ZIP_DIRTY:
1763 			/* The uncompressed buffer pool should never
1764 			contain compressed block descriptors. */
1765 			ut_error;
1766 			break;
1767 		case BUF_BLOCK_NOT_USED:
1768 		case BUF_BLOCK_READY_FOR_USE:
1769 		case BUF_BLOCK_MEMORY:
1770 		case BUF_BLOCK_REMOVE_HASH:
1771 			/* Skip blocks that are not being used for
1772 			file pages. */
1773 			break;
1774 		case BUF_BLOCK_FILE_PAGE:
1775 			if (srv_read_only_mode) {
1776 				/* The page cleaner is disabled in
1777 				read-only mode.  No pages can be
1778 				dirtied, so all of them must be clean. */
1779 				ut_ad(block->page.oldest_modification
1780 				      == block->page.newest_modification);
1781 				ut_ad(block->page.oldest_modification == 0
1782 				      || block->page.oldest_modification
1783 				      == recv_sys->recovered_lsn
1784 				      || srv_force_recovery
1785 				      == SRV_FORCE_NO_LOG_REDO);
1786 				ut_ad(block->page.buf_fix_count == 0);
1787 				ut_ad(block->page.io_fix == BUF_IO_NONE);
1788 				break;
1789 			}
1790 
1791 			buf_page_mutex_enter(block);
1792 			ready = buf_flush_ready_for_replace(&block->page);
1793 			buf_page_mutex_exit(block);
1794 
1795 			if (!ready) {
1796 				return(block);
1797 			}
1798 
1799 			break;
1800 		}
1801 	}
1802 
1803 	return(NULL);
1804 }
1805 
1806 /********************************************************************//**
1807 Set buffer pool size variables after resizing it */
1808 static
1809 void
1810 buf_pool_set_sizes(void)
1811 /*====================*/
1812 {
1813 	ulint	i;
1814 	ulint	curr_size = 0;
1815 
1816 	buf_pool_mutex_enter_all();
1817 
1818 	for (i = 0; i < srv_buf_pool_instances; i++) {
1819 		buf_pool_t*	buf_pool;
1820 
1821 		buf_pool = buf_pool_from_array(i);
1822 		curr_size += buf_pool->curr_pool_size;
1823 	}
1824 
1825 	srv_buf_pool_curr_size = curr_size;
1826 	srv_buf_pool_old_size = srv_buf_pool_size;
1827 	srv_buf_pool_base_size = srv_buf_pool_size;
1828 
1829 	buf_pool_mutex_exit_all();
1830 }
1831 
1832 /********************************************************************//**
1833 Initialize a buffer pool instance.
1834 @return DB_SUCCESS if all goes well. */
1835 static
1836 ulint
1837 buf_pool_init_instance(
1838 /*===================*/
1839 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
1840 	ulint		buf_pool_size,	/*!< in: size in bytes */
1841 	ulint		instance_no)	/*!< in: id of the instance */
1842 {
1843 	ulint		i;
1844 	ulint		chunk_size;
1845 	buf_chunk_t*	chunk;
1846 
1847 	ut_ad(buf_pool_size % srv_buf_pool_chunk_unit == 0);
1848 
1849 	/* 1. Initialize general fields
1850 	------------------------------- */
1851 	mutex_create(LATCH_ID_BUF_POOL, &buf_pool->mutex);
1852 
1853 	mutex_create(LATCH_ID_BUF_POOL_ZIP, &buf_pool->zip_mutex);
1854 
1855 	new(&buf_pool->allocator)
1856 		ut_allocator<unsigned char>(mem_key_buf_buf_pool);
1857 
1858 	buf_pool_mutex_enter(buf_pool);
1859 
1860 	if (buf_pool_size > 0) {
1861 		buf_pool->n_chunks
1862 			= buf_pool_size / srv_buf_pool_chunk_unit;
1863 		chunk_size = srv_buf_pool_chunk_unit;
1864 
1865 		buf_pool->chunks =
1866 			reinterpret_cast<buf_chunk_t*>(ut_zalloc_nokey(
1867 				buf_pool->n_chunks * sizeof(*chunk)));
1868 		buf_pool->chunks_old = NULL;
1869 
1870 		UT_LIST_INIT(buf_pool->LRU, &buf_page_t::LRU);
1871 		UT_LIST_INIT(buf_pool->free, &buf_page_t::list);
1872 		UT_LIST_INIT(buf_pool->withdraw, &buf_page_t::list);
1873 		buf_pool->withdraw_target = 0;
1874 		UT_LIST_INIT(buf_pool->flush_list, &buf_page_t::list);
1875 		UT_LIST_INIT(buf_pool->unzip_LRU, &buf_block_t::unzip_LRU);
1876 
1877 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
1878 		UT_LIST_INIT(buf_pool->zip_clean, &buf_page_t::list);
1879 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
1880 
1881 		for (i = 0; i < UT_ARR_SIZE(buf_pool->zip_free); ++i) {
1882 			UT_LIST_INIT(
1883 				buf_pool->zip_free[i], &buf_buddy_free_t::list);
1884 		}
1885 
1886 		buf_pool->curr_size = 0;
1887 		chunk = buf_pool->chunks;
1888 
1889 		do {
1890 			if (!buf_chunk_init(buf_pool, chunk, chunk_size)) {
1891 				while (--chunk >= buf_pool->chunks) {
1892 					buf_block_t*	block = chunk->blocks;
1893 
1894 					for (i = chunk->size; i--; block++) {
1895 						mutex_free(&block->mutex);
1896 						rw_lock_free(&block->lock);
1897 
1898 						ut_d(rw_lock_free(
1899 							&block->debug_latch));
1900 					}
1901 
1902 					buf_pool->allocator.deallocate_large_dodump(
1903 						chunk->mem, &chunk->mem_pfx, chunk->mem_size());
1904 				}
1905 				ut_free(buf_pool->chunks);
1906 				buf_pool_mutex_exit(buf_pool);
1907 
1908 				/* InnoDB should free the mutex which was
1909 				created so far before freeing the instance */
1910 				mutex_free(&buf_pool->mutex);
1911 				mutex_free(&buf_pool->zip_mutex);
1912 				return(DB_ERROR);
1913 			}
1914 
1915 			buf_pool->curr_size += chunk->size;
1916 		} while (++chunk < buf_pool->chunks + buf_pool->n_chunks);
1917 
1918 		buf_pool->instance_no = instance_no;
1919 		buf_pool->read_ahead_area =
1920 			ut_min(BUF_READ_AHEAD_PAGES,
1921 			       ut_2_power_up(buf_pool->curr_size /
1922 					     BUF_READ_AHEAD_PORTION));
1923 		buf_pool->curr_pool_size = buf_pool->curr_size
1924 			<< srv_page_size_shift;
1925 
1926 		buf_pool->old_size = buf_pool->curr_size;
1927 		buf_pool->n_chunks_new = buf_pool->n_chunks;
1928 
1929 		/* Number of locks protecting page_hash must be a
1930 		power of two */
1931 		srv_n_page_hash_locks = static_cast<ulong>(
1932 			 ut_2_power_up(srv_n_page_hash_locks));
1933 		ut_a(srv_n_page_hash_locks != 0);
1934 		ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
1935 
1936 		buf_pool->page_hash = ib_create(
1937 			2 * buf_pool->curr_size,
1938 			LATCH_ID_HASH_TABLE_RW_LOCK,
1939 			srv_n_page_hash_locks, MEM_HEAP_FOR_PAGE_HASH);
1940 
1941 		buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
1942 
1943 		buf_pool->last_printout_time = time(NULL);
1944 	}
1945 	/* 2. Initialize flushing fields
1946 	-------------------------------- */
1947 
1948 	mutex_create(LATCH_ID_FLUSH_LIST, &buf_pool->flush_list_mutex);
1949 
1950 	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
1951 		buf_pool->no_flush[i] = os_event_create(0);
1952 	}
1953 
1954 	buf_pool->watch = (buf_page_t*) ut_zalloc_nokey(
1955 		sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
1956 	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
1957 		buf_pool->watch[i].buf_pool_index
1958 			= unsigned(buf_pool->instance_no);
1959 	}
1960 
1961 	/* All fields are initialized by ut_zalloc_nokey(). */
1962 
1963 	buf_pool->try_LRU_scan = TRUE;
1964 
1965 	/* Initialize the hazard pointer for flush_list batches */
1966 	new(&buf_pool->flush_hp)
1967 		FlushHp(buf_pool, &buf_pool->flush_list_mutex);
1968 
1969 	/* Initialize the hazard pointer for LRU batches */
1970 	new(&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->mutex);
1971 
1972 	/* Initialize the iterator for LRU scan search */
1973 	new(&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);
1974 
1975 	/* Initialize the iterator for single page scan search */
1976 	new(&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);
1977 
1978 	/* Initialize the temporal memory array and slots */
1979 	buf_pool->tmp_arr = (buf_tmp_array_t *)ut_malloc_nokey(sizeof(buf_tmp_array_t));
1980 	memset(buf_pool->tmp_arr, 0, sizeof(buf_tmp_array_t));
1981 	ulint n_slots = (srv_n_read_io_threads + srv_n_write_io_threads) * (8 * OS_AIO_N_PENDING_IOS_PER_THREAD);
1982 	buf_pool->tmp_arr->n_slots = n_slots;
1983 	buf_pool->tmp_arr->slots = (buf_tmp_buffer_t*)ut_malloc_nokey(sizeof(buf_tmp_buffer_t) * n_slots);
1984 	memset(buf_pool->tmp_arr->slots, 0, (sizeof(buf_tmp_buffer_t) * n_slots));
1985 
1986 	buf_pool_mutex_exit(buf_pool);
1987 
1988 	DBUG_EXECUTE_IF("buf_pool_init_instance_force_oom",
1989 		return(DB_ERROR); );
1990 
1991 	return(DB_SUCCESS);
1992 }
1993 
1994 /********************************************************************//**
1995 free one buffer pool instance */
1996 static
1997 void
1998 buf_pool_free_instance(
1999 /*===================*/
2000 	buf_pool_t*	buf_pool)	/* in,own: buffer pool instance
2001 					to free */
2002 {
2003 	buf_chunk_t*	chunk;
2004 	buf_chunk_t*	chunks;
2005 	buf_page_t*	bpage;
2006 	buf_page_t*	prev_bpage = 0;
2007 
2008 	mutex_free(&buf_pool->mutex);
2009 	mutex_free(&buf_pool->zip_mutex);
2010 	mutex_free(&buf_pool->flush_list_mutex);
2011 
2012 	if (buf_pool->flush_rbt) {
2013 		rbt_free(buf_pool->flush_rbt);
2014 		buf_pool->flush_rbt = NULL;
2015 	}
2016 
2017 	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
2018 	     bpage != NULL;
2019 	     bpage = prev_bpage) {
2020 
2021 		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
2022 		buf_page_state	state = buf_page_get_state(bpage);
2023 
2024 		ut_ad(buf_page_in_file(bpage));
2025 		ut_ad(bpage->in_LRU_list);
2026 
2027 		if (state != BUF_BLOCK_FILE_PAGE) {
2028 			/* We must not have any dirty block except
2029 			when doing a fast shutdown. */
2030 			ut_ad(state == BUF_BLOCK_ZIP_PAGE
2031 			      || srv_fast_shutdown == 2);
2032 			buf_page_free_descriptor(bpage);
2033 		}
2034 	}
2035 
2036 	ut_free(buf_pool->watch);
2037 	buf_pool->watch = NULL;
2038 
2039 	chunks = buf_pool->chunks;
2040 	chunk = chunks + buf_pool->n_chunks;
2041 
2042 	while (--chunk >= chunks) {
2043 		buf_block_t*	block = chunk->blocks;
2044 
2045 		for (ulint i = chunk->size; i--; block++) {
2046 			mutex_free(&block->mutex);
2047 			rw_lock_free(&block->lock);
2048 
2049 			ut_d(rw_lock_free(&block->debug_latch));
2050 		}
2051 
2052 		buf_pool->allocator.deallocate_large_dodump(
2053 			chunk->mem, &chunk->mem_pfx, chunk->mem_size());
2054 	}
2055 
2056 	for (ulint i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; ++i) {
2057 		os_event_destroy(buf_pool->no_flush[i]);
2058 	}
2059 
2060 	ut_free(buf_pool->chunks);
2061 	ha_clear(buf_pool->page_hash);
2062 	hash_table_free(buf_pool->page_hash);
2063 	hash_table_free(buf_pool->zip_hash);
2064 
2065 	/* Free all used temporary slots */
2066 	if (buf_pool->tmp_arr) {
2067 		for(ulint i = 0; i < buf_pool->tmp_arr->n_slots; i++) {
2068 			buf_tmp_buffer_t* slot = &(buf_pool->tmp_arr->slots[i]);
2069 			if (slot && slot->crypt_buf) {
2070 				aligned_free(slot->crypt_buf);
2071 				slot->crypt_buf = NULL;
2072 			}
2073 
2074 			if (slot && slot->comp_buf) {
2075 				aligned_free(slot->comp_buf);
2076 				slot->comp_buf = NULL;
2077 			}
2078 		}
2079 
2080 		ut_free(buf_pool->tmp_arr->slots);
2081 		ut_free(buf_pool->tmp_arr);
2082 		buf_pool->tmp_arr = NULL;
2083 	}
2084 
2085 	buf_pool->allocator.~ut_allocator();
2086 }
2087 
2088 /********************************************************************//**
2089 Creates the buffer pool.
2090 @return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
2091 dberr_t
2092 buf_pool_init(
2093 /*==========*/
2094 	ulint	total_size,	/*!< in: size of the total pool in bytes */
2095 	ulint	n_instances)	/*!< in: number of instances */
2096 {
2097 	ulint		i;
2098 	const ulint	size	= total_size / n_instances;
2099 
2100 	ut_ad(n_instances > 0);
2101 	ut_ad(n_instances <= MAX_BUFFER_POOLS);
2102 	ut_ad(n_instances == srv_buf_pool_instances);
2103 
2104 	NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
2105 
2106 	buf_pool_resizing = false;
2107 
2108 	buf_pool_ptr = (buf_pool_t*) ut_zalloc_nokey(
2109 		n_instances * sizeof *buf_pool_ptr);
2110 
2111 	buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t());
2112 
2113 	for (i = 0; i < n_instances; i++) {
2114 		buf_pool_t*	ptr	= &buf_pool_ptr[i];
2115 
2116 		if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {
2117 
2118 			/* Free all the instances created so far. */
2119 			buf_pool_free(i);
2120 
2121 			return(DB_ERROR);
2122 		}
2123 	}
2124 
2125 	buf_chunk_map_ref = buf_chunk_map_reg;
2126 
2127 	buf_pool_set_sizes();
2128 	buf_LRU_old_ratio_update(100 * 3/ 8, FALSE);
2129 
2130 	btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);
2131 
2132 	return(DB_SUCCESS);
2133 }
2134 
2135 /********************************************************************//**
2136 Frees the buffer pool at shutdown.  This must not be invoked before
2137 freeing all mutexes. */
2138 void
2139 buf_pool_free(
2140 /*==========*/
2141 	ulint	n_instances)	/*!< in: numbere of instances to free */
2142 {
2143 	for (ulint i = 0; i < n_instances; i++) {
2144 		buf_pool_free_instance(buf_pool_from_array(i));
2145 	}
2146 
2147 	UT_DELETE(buf_chunk_map_reg);
2148 	buf_chunk_map_reg = buf_chunk_map_ref = NULL;
2149 
2150 	ut_free(buf_pool_ptr);
2151 	buf_pool_ptr = NULL;
2152 }
2153 
2154 /** Reallocate a control block.
2155 @param[in]	buf_pool	buffer pool instance
2156 @param[in]	block		pointer to control block
2157 @retval false	if failed because of no free blocks. */
2158 static
2159 bool
2160 buf_page_realloc(
2161 	buf_pool_t*	buf_pool,
2162 	buf_block_t*	block)
2163 {
2164 	buf_block_t*	new_block;
2165 
2166 	ut_ad(buf_pool_mutex_own(buf_pool));
2167 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
2168 
2169 	new_block = buf_LRU_get_free_only(buf_pool);
2170 
2171 	if (new_block == NULL) {
2172 		return(false); /* free_list was not enough */
2173 	}
2174 
2175 	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, block->page.id);
2176 
2177 	rw_lock_x_lock(hash_lock);
2178 	mutex_enter(&block->mutex);
2179 
2180 	if (buf_page_can_relocate(&block->page)) {
2181 		mutex_enter(&new_block->mutex);
2182 
2183 		memcpy(new_block->frame, block->frame, srv_page_size);
2184 		new (&new_block->page) buf_page_t(block->page);
2185 
2186 		/* relocate LRU list */
2187 		ut_ad(block->page.in_LRU_list);
2188 		ut_ad(!block->page.in_zip_hash);
2189 		ut_d(block->page.in_LRU_list = FALSE);
2190 
2191 		buf_LRU_adjust_hp(buf_pool, &block->page);
2192 
2193 		buf_page_t*	prev_b = UT_LIST_GET_PREV(LRU, &block->page);
2194 		UT_LIST_REMOVE(buf_pool->LRU, &block->page);
2195 
2196 		if (prev_b != NULL) {
2197 			UT_LIST_INSERT_AFTER(buf_pool->LRU, prev_b, &new_block->page);
2198 		} else {
2199 			UT_LIST_ADD_FIRST(buf_pool->LRU, &new_block->page);
2200 		}
2201 
2202 		if (buf_pool->LRU_old == &block->page) {
2203 			buf_pool->LRU_old = &new_block->page;
2204 		}
2205 
2206 		ut_ad(new_block->page.in_LRU_list);
2207 
2208 		/* relocate unzip_LRU list */
2209 		if (block->page.zip.data != NULL) {
2210 			ut_ad(block->in_unzip_LRU_list);
2211 			ut_d(new_block->in_unzip_LRU_list = TRUE);
2212 
2213 			buf_block_t*	prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
2214 			UT_LIST_REMOVE(buf_pool->unzip_LRU, block);
2215 
2216 			ut_d(block->in_unzip_LRU_list = FALSE);
2217 			block->page.zip.data = NULL;
2218 			page_zip_set_size(&block->page.zip, 0);
2219 
2220 			if (prev_block != NULL) {
2221 				UT_LIST_INSERT_AFTER(buf_pool->unzip_LRU, prev_block, new_block);
2222 			} else {
2223 				UT_LIST_ADD_FIRST(buf_pool->unzip_LRU, new_block);
2224 			}
2225 		} else {
2226 			ut_ad(!block->in_unzip_LRU_list);
2227 			ut_d(new_block->in_unzip_LRU_list = FALSE);
2228 		}
2229 
2230 		/* relocate buf_pool->page_hash */
2231 		ut_ad(block->page.in_page_hash);
2232 		ut_ad(&block->page == buf_page_hash_get_low(buf_pool,
2233 							    block->page.id));
2234 		ut_d(block->page.in_page_hash = FALSE);
2235 		ulint	fold = block->page.id.fold();
2236 		ut_ad(fold == new_block->page.id.fold());
2237 		HASH_REPLACE(buf_page_t, hash, buf_pool->page_hash, fold,
2238 			     &block->page, &new_block->page);
2239 
2240 		ut_ad(new_block->page.in_page_hash);
2241 
2242 		buf_block_modify_clock_inc(block);
2243 		memset(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
2244 		memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
2245 		MEM_UNDEFINED(block->frame, srv_page_size);
2246 		buf_block_set_state(block, BUF_BLOCK_REMOVE_HASH);
2247 		block->page.id
2248 		    = page_id_t(ULINT32_UNDEFINED, ULINT32_UNDEFINED);
2249 
2250 		/* Relocate buf_pool->flush_list. */
2251 		if (block->page.oldest_modification) {
2252 			buf_flush_relocate_on_flush_list(
2253 				&block->page, &new_block->page);
2254 		}
2255 
2256 		/* set other flags of buf_block_t */
2257 
2258 #ifdef BTR_CUR_HASH_ADAPT
2259 		/* This code should only be executed by buf_pool_resize(),
2260 		while the adaptive hash index is disabled. */
2261 		assert_block_ahi_empty(block);
2262 		assert_block_ahi_empty_on_init(new_block);
2263 		ut_ad(!block->index);
2264 		new_block->index	= NULL;
2265 		new_block->n_hash_helps	= 0;
2266 		new_block->n_fields	= 1;
2267 		new_block->left_side	= TRUE;
2268 #endif /* BTR_CUR_HASH_ADAPT */
2269 
2270 		new_block->lock_hash_val = block->lock_hash_val;
2271 		ut_ad(new_block->lock_hash_val == lock_rec_hash(
2272 			new_block->page.id.space(),
2273 			new_block->page.id.page_no()));
2274 
2275 		rw_lock_x_unlock(hash_lock);
2276 		mutex_exit(&new_block->mutex);
2277 
2278 		/* free block */
2279 		buf_block_set_state(block, BUF_BLOCK_MEMORY);
2280 		buf_LRU_block_free_non_file_page(block);
2281 
2282 		mutex_exit(&block->mutex);
2283 	} else {
2284 		rw_lock_x_unlock(hash_lock);
2285 		mutex_exit(&block->mutex);
2286 
2287 		/* free new_block */
2288 		mutex_enter(&new_block->mutex);
2289 		buf_LRU_block_free_non_file_page(new_block);
2290 		mutex_exit(&new_block->mutex);
2291 	}
2292 
2293 	return(true); /* free_list was enough */
2294 }
2295 
2296 /** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
2297 to the specified string. The format and the following parameters are the
2298 same as the ones used for printf(3).
2299 @param[in]	fmt	format
2300 @param[in]	...	extra parameters according to fmt */
2301 static
2302 void
2303 buf_resize_status(
2304 	const char*	fmt,
2305 	...)
2306 {
2307 	va_list	ap;
2308 
2309 	va_start(ap, fmt);
2310 
2311 	vsnprintf(
2312 		export_vars.innodb_buffer_pool_resize_status,
2313 		sizeof(export_vars.innodb_buffer_pool_resize_status),
2314 		fmt, ap);
2315 
2316 	va_end(ap);
2317 
2318 	ib::info() << export_vars.innodb_buffer_pool_resize_status;
2319 }
2320 
2321 /** Determines if a block is intended to be withdrawn.
2322 @param[in]	buf_pool	buffer pool instance
2323 @param[in]	block		pointer to control block
2324 @retval true	if will be withdrawn */
2325 bool
2326 buf_block_will_withdrawn(
2327 	buf_pool_t*		buf_pool,
2328 	const buf_block_t*	block)
2329 {
2330 	ut_ad(buf_pool->curr_size < buf_pool->old_size);
2331 	ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool));
2332 
2333 	const buf_chunk_t*	chunk
2334 		= buf_pool->chunks + buf_pool->n_chunks_new;
2335 	const buf_chunk_t*	echunk
2336 		= buf_pool->chunks + buf_pool->n_chunks;
2337 
2338 	while (chunk < echunk) {
2339 		if (block >= chunk->blocks
2340 		    && block < chunk->blocks + chunk->size) {
2341 			return(true);
2342 		}
2343 		++chunk;
2344 	}
2345 
2346 	return(false);
2347 }
2348 
2349 /** Determines if a frame is intended to be withdrawn.
2350 @param[in]	buf_pool	buffer pool instance
2351 @param[in]	ptr		pointer to a frame
2352 @retval true	if will be withdrawn */
2353 bool
2354 buf_frame_will_withdrawn(
2355 	buf_pool_t*	buf_pool,
2356 	const byte*	ptr)
2357 {
2358 	ut_ad(buf_pool->curr_size < buf_pool->old_size);
2359 	ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool));
2360 
2361 	const buf_chunk_t*	chunk
2362 		= buf_pool->chunks + buf_pool->n_chunks_new;
2363 	const buf_chunk_t*	echunk
2364 		= buf_pool->chunks + buf_pool->n_chunks;
2365 
2366 	while (chunk < echunk) {
2367 		if (ptr >= chunk->blocks->frame
2368 		    && ptr < (chunk->blocks + chunk->size - 1)->frame
2369 			     + srv_page_size) {
2370 			return(true);
2371 		}
2372 		++chunk;
2373 	}
2374 
2375 	return(false);
2376 }
2377 
2378 /** Withdraw the buffer pool blocks from end of the buffer pool instance
2379 until withdrawn by buf_pool->withdraw_target.
2380 @param[in]	buf_pool	buffer pool instance
2381 @retval true	if retry is needed */
2382 static
2383 bool
2384 buf_pool_withdraw_blocks(
2385 	buf_pool_t*	buf_pool)
2386 {
2387 	buf_block_t*	block;
2388 	ulint		loop_count = 0;
2389 	ulint		i = buf_pool_index(buf_pool);
2390 
2391 	ib::info() << "buffer pool " << i
2392 		<< " : start to withdraw the last "
2393 		<< buf_pool->withdraw_target << " blocks.";
2394 
2395 	/* Minimize buf_pool->zip_free[i] lists */
2396 	buf_pool_mutex_enter(buf_pool);
2397 	buf_buddy_condense_free(buf_pool);
2398 	buf_pool_mutex_exit(buf_pool);
2399 
2400 	while (UT_LIST_GET_LEN(buf_pool->withdraw)
2401 	       < buf_pool->withdraw_target) {
2402 
2403 		/* try to withdraw from free_list */
2404 		ulint	count1 = 0;
2405 
2406 		buf_pool_mutex_enter(buf_pool);
2407 		block = reinterpret_cast<buf_block_t*>(
2408 			UT_LIST_GET_FIRST(buf_pool->free));
2409 		while (block != NULL
2410 		       && UT_LIST_GET_LEN(buf_pool->withdraw)
2411 			  < buf_pool->withdraw_target) {
2412 			ut_ad(block->page.in_free_list);
2413 			ut_ad(!block->page.in_flush_list);
2414 			ut_ad(!block->page.in_LRU_list);
2415 			ut_a(!buf_page_in_file(&block->page));
2416 
2417 			buf_block_t*	next_block;
2418 			next_block = reinterpret_cast<buf_block_t*>(
2419 				UT_LIST_GET_NEXT(
2420 					list, &block->page));
2421 
2422 			if (buf_block_will_withdrawn(buf_pool, block)) {
2423 				/* This should be withdrawn */
2424 				UT_LIST_REMOVE(
2425 					buf_pool->free,
2426 					&block->page);
2427 				UT_LIST_ADD_LAST(
2428 					buf_pool->withdraw,
2429 					&block->page);
2430 				ut_d(block->in_withdraw_list = TRUE);
2431 				count1++;
2432 			}
2433 
2434 			block = next_block;
2435 		}
2436 		buf_pool_mutex_exit(buf_pool);
2437 
2438 		/* reserve free_list length */
2439 		if (UT_LIST_GET_LEN(buf_pool->withdraw)
2440 		    < buf_pool->withdraw_target) {
2441 			ulint	scan_depth;
2442 			flush_counters_t n;
2443 
2444 			/* cap scan_depth with current LRU size. */
2445 			buf_pool_mutex_enter(buf_pool);
2446 			scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2447 			buf_pool_mutex_exit(buf_pool);
2448 
2449 			scan_depth = ut_min(
2450 				ut_max(buf_pool->withdraw_target
2451 				       - UT_LIST_GET_LEN(buf_pool->withdraw),
2452 				       static_cast<ulint>(srv_LRU_scan_depth)),
2453 				scan_depth);
2454 
2455 			buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU,
2456 				scan_depth, 0, &n);
2457 			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2458 
2459 			if (n.flushed) {
2460 				MONITOR_INC_VALUE_CUMULATIVE(
2461 					MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
2462 					MONITOR_LRU_BATCH_FLUSH_COUNT,
2463 					MONITOR_LRU_BATCH_FLUSH_PAGES,
2464 					n.flushed);
2465 			}
2466 		}
2467 
2468 		/* relocate blocks/buddies in withdrawn area */
2469 		ulint	count2 = 0;
2470 
2471 		buf_pool_mutex_enter(buf_pool);
2472 		buf_page_t*	bpage;
2473 		bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
2474 		while (bpage != NULL) {
2475 			BPageMutex*	block_mutex;
2476 			buf_page_t*	next_bpage;
2477 
2478 			block_mutex = buf_page_get_mutex(bpage);
2479 			mutex_enter(block_mutex);
2480 
2481 			next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
2482 
2483 			if (bpage->zip.data != NULL
2484 			    && buf_frame_will_withdrawn(
2485 				buf_pool,
2486 				static_cast<byte*>(bpage->zip.data))) {
2487 
2488 				if (buf_page_can_relocate(bpage)) {
2489 					mutex_exit(block_mutex);
2490 					buf_pool_mutex_exit_forbid(buf_pool);
2491 					if(!buf_buddy_realloc(
2492 						buf_pool, bpage->zip.data,
2493 						page_zip_get_size(
2494 							&bpage->zip))) {
2495 
2496 						/* failed to allocate block */
2497 						buf_pool_mutex_exit_allow(
2498 							buf_pool);
2499 						break;
2500 					}
2501 					buf_pool_mutex_exit_allow(buf_pool);
2502 					mutex_enter(block_mutex);
2503 					count2++;
2504 				}
2505 				/* NOTE: if the page is in use,
2506 				not reallocated yet */
2507 			}
2508 
2509 			if (buf_page_get_state(bpage)
2510 			    == BUF_BLOCK_FILE_PAGE
2511 			    && buf_block_will_withdrawn(
2512 				buf_pool,
2513 				reinterpret_cast<buf_block_t*>(bpage))) {
2514 
2515 				if (buf_page_can_relocate(bpage)) {
2516 					mutex_exit(block_mutex);
2517 					buf_pool_mutex_exit_forbid(buf_pool);
2518 					if(!buf_page_realloc(
2519 						buf_pool,
2520 						reinterpret_cast<buf_block_t*>(
2521 							bpage))) {
2522 						/* failed to allocate block */
2523 						buf_pool_mutex_exit_allow(
2524 							buf_pool);
2525 						break;
2526 					}
2527 					buf_pool_mutex_exit_allow(buf_pool);
2528 					count2++;
2529 				} else {
2530 					mutex_exit(block_mutex);
2531 				}
2532 				/* NOTE: if the page is in use,
2533 				not reallocated yet */
2534 			} else {
2535 				mutex_exit(block_mutex);
2536 			}
2537 
2538 			bpage = next_bpage;
2539 		}
2540 		buf_pool_mutex_exit(buf_pool);
2541 
2542 		buf_resize_status(
2543 			"buffer pool %lu : withdrawing blocks. (%lu/%lu)",
2544 			i, UT_LIST_GET_LEN(buf_pool->withdraw),
2545 			buf_pool->withdraw_target);
2546 
2547 		ib::info() << "buffer pool " << i << " : withdrew "
2548 			<< count1 << " blocks from free list."
2549 			<< " Tried to relocate " << count2 << " pages ("
2550 			<< UT_LIST_GET_LEN(buf_pool->withdraw) << "/"
2551 			<< buf_pool->withdraw_target << ").";
2552 
2553 		if (++loop_count >= 10) {
2554 			/* give up for now.
2555 			retried after user threads paused. */
2556 
2557 			ib::info() << "buffer pool " << i
2558 				<< " : will retry to withdraw later.";
2559 
2560 			/* need retry later */
2561 			return(true);
2562 		}
2563 	}
2564 
2565 	/* confirm withdrawn enough */
2566 	const buf_chunk_t*	chunk
2567 		= buf_pool->chunks + buf_pool->n_chunks_new;
2568 	const buf_chunk_t*	echunk
2569 		= buf_pool->chunks + buf_pool->n_chunks;
2570 
2571 	while (chunk < echunk) {
2572 		block = chunk->blocks;
2573 		for (ulint j = chunk->size; j--; block++) {
2574 			/* If !=BUF_BLOCK_NOT_USED block in the
2575 			withdrawn area, it means corruption
2576 			something */
2577 			ut_a(buf_block_get_state(block)
2578 				== BUF_BLOCK_NOT_USED);
2579 			ut_ad(block->in_withdraw_list);
2580 		}
2581 		++chunk;
2582 	}
2583 
2584 	ib::info() << "buffer pool " << i << " : withdrawn target "
2585 		<< UT_LIST_GET_LEN(buf_pool->withdraw) << " blocks.";
2586 
2587 	return(false);
2588 }
2589 
2590 /** resize page_hash and zip_hash for a buffer pool instance.
2591 @param[in]	buf_pool	buffer pool instance */
2592 static
2593 void
2594 buf_pool_resize_hash(
2595 	buf_pool_t*	buf_pool)
2596 {
2597 	hash_table_t*	new_hash_table;
2598 
2599 	/* recreate page_hash */
2600 	new_hash_table = ib_recreate(
2601 		buf_pool->page_hash, 2 * buf_pool->curr_size);
2602 
2603 	for (ulint i = 0; i < hash_get_n_cells(buf_pool->page_hash); i++) {
2604 		buf_page_t*	bpage;
2605 
2606 		bpage = static_cast<buf_page_t*>(
2607 			HASH_GET_FIRST(
2608 				buf_pool->page_hash, i));
2609 
2610 		while (bpage) {
2611 			buf_page_t*	prev_bpage = bpage;
2612 			ulint		fold;
2613 
2614 			bpage = static_cast<buf_page_t*>(
2615 				HASH_GET_NEXT(
2616 					hash, prev_bpage));
2617 
2618 			fold = prev_bpage->id.fold();
2619 
2620 			HASH_DELETE(buf_page_t, hash,
2621 				buf_pool->page_hash, fold,
2622 				prev_bpage);
2623 
2624 			HASH_INSERT(buf_page_t, hash,
2625 				new_hash_table, fold,
2626 				prev_bpage);
2627 		}
2628 	}
2629 
2630 	/* Concurrent threads may be accessing
2631 	buf_pool->page_hash->n_cells, n_sync_obj and try to latch
2632 	sync_obj[i] while we are resizing. Therefore we never
2633 	deallocate page_hash, instead we overwrite n_cells (and other
2634 	fields) with the new values. The n_sync_obj and sync_obj are
2635 	actually same in both. */
2636 	std::swap(*buf_pool->page_hash, *new_hash_table);
2637 	hash_table_free(new_hash_table);
2638 
2639 	/* recreate zip_hash */
2640 	new_hash_table = hash_create(2 * buf_pool->curr_size);
2641 
2642 	for (ulint i = 0; i < hash_get_n_cells(buf_pool->zip_hash); i++) {
2643 		buf_page_t*	bpage;
2644 
2645 		bpage = static_cast<buf_page_t*>(
2646 			HASH_GET_FIRST(buf_pool->zip_hash, i));
2647 
2648 		while (bpage) {
2649 			buf_page_t*	prev_bpage = bpage;
2650 			ulint		fold;
2651 
2652 			bpage = static_cast<buf_page_t*>(
2653 				HASH_GET_NEXT(
2654 					hash, prev_bpage));
2655 
2656 			fold = BUF_POOL_ZIP_FOLD(
2657 				reinterpret_cast<buf_block_t*>(
2658 					prev_bpage));
2659 
2660 			HASH_DELETE(buf_page_t, hash,
2661 				buf_pool->zip_hash, fold,
2662 				prev_bpage);
2663 
2664 			HASH_INSERT(buf_page_t, hash,
2665 				new_hash_table, fold,
2666 				prev_bpage);
2667 		}
2668 	}
2669 
2670 	hash_table_free(buf_pool->zip_hash);
2671 	buf_pool->zip_hash = new_hash_table;
2672 }
2673 
2674 /** Resize the buffer pool based on srv_buf_pool_size from
2675 srv_buf_pool_old_size. */
2676 static
2677 void
2678 buf_pool_resize()
2679 {
2680 	buf_pool_t*	buf_pool;
2681 	ulint		new_instance_size;
2682 	bool		warning = false;
2683 
2684 	NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
2685 
2686 	ut_ad(!buf_pool_resizing);
2687 	ut_ad(srv_buf_pool_chunk_unit > 0);
2688 
2689 	new_instance_size = srv_buf_pool_size / srv_buf_pool_instances;
2690 	new_instance_size >>= srv_page_size_shift;
2691 
2692 	buf_resize_status("Resizing buffer pool from " ULINTPF " to "
2693 			  ULINTPF " (unit=" ULINTPF ").",
2694 			  srv_buf_pool_old_size, srv_buf_pool_size,
2695 			  srv_buf_pool_chunk_unit);
2696 
2697 	/* set new limit for all buffer pool for resizing */
2698 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2699 		buf_pool = buf_pool_from_array(i);
2700 		buf_pool_mutex_enter(buf_pool);
2701 
2702 		ut_ad(buf_pool->curr_size == buf_pool->old_size);
2703 		ut_ad(buf_pool->n_chunks_new == buf_pool->n_chunks);
2704 		ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0);
2705 		ut_ad(buf_pool->flush_rbt == NULL);
2706 
2707 		buf_pool->curr_size = new_instance_size;
2708 
2709 		buf_pool->n_chunks_new =
2710 			(new_instance_size << srv_page_size_shift)
2711 			/ srv_buf_pool_chunk_unit;
2712 
2713 		buf_pool_mutex_exit(buf_pool);
2714 	}
2715 #ifdef BTR_CUR_HASH_ADAPT
2716 	/* disable AHI if needed */
2717 	bool	btr_search_disabled = false;
2718 
2719 	buf_resize_status("Disabling adaptive hash index.");
2720 
2721 	btr_search_s_lock_all();
2722 	if (btr_search_enabled) {
2723 		btr_search_s_unlock_all();
2724 		btr_search_disabled = true;
2725 	} else {
2726 		btr_search_s_unlock_all();
2727 	}
2728 
2729 	btr_search_disable();
2730 
2731 	if (btr_search_disabled) {
2732 		ib::info() << "disabled adaptive hash index.";
2733 	}
2734 #endif /* BTR_CUR_HASH_ADAPT */
2735 
2736 	/* set withdraw target */
2737 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2738 		buf_pool = buf_pool_from_array(i);
2739 		if (buf_pool->curr_size < buf_pool->old_size) {
2740 			ulint	withdraw_target = 0;
2741 
2742 			const buf_chunk_t*	chunk
2743 				= buf_pool->chunks + buf_pool->n_chunks_new;
2744 			const buf_chunk_t*	echunk
2745 				= buf_pool->chunks + buf_pool->n_chunks;
2746 
2747 			while (chunk < echunk) {
2748 				withdraw_target += chunk->size;
2749 				++chunk;
2750 			}
2751 
2752 			ut_ad(buf_pool->withdraw_target == 0);
2753 			buf_pool->withdraw_target = withdraw_target;
2754 		}
2755 	}
2756 
2757 	buf_resize_status("Withdrawing blocks to be shrunken.");
2758 
2759 	time_t		withdraw_started = time(NULL);
2760 	ulint		message_interval = 60;
2761 	ulint		retry_interval = 1;
2762 
2763 withdraw_retry:
2764 	bool	should_retry_withdraw = false;
2765 
2766 	/* wait for the number of blocks fit to the new size (if needed)*/
2767 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2768 		buf_pool = buf_pool_from_array(i);
2769 		if (buf_pool->curr_size < buf_pool->old_size) {
2770 
2771 			should_retry_withdraw |=
2772 				buf_pool_withdraw_blocks(buf_pool);
2773 		}
2774 	}
2775 
2776 	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
2777 		/* abort to resize for shutdown. */
2778 		return;
2779 	}
2780 
2781 	/* abort buffer pool load */
2782 	buf_load_abort();
2783 
2784 	const time_t current_time = time(NULL);
2785 
2786 	if (should_retry_withdraw
2787 	    && difftime(current_time, withdraw_started) >= message_interval) {
2788 
2789 		if (message_interval > 900) {
2790 			message_interval = 1800;
2791 		} else {
2792 			message_interval *= 2;
2793 		}
2794 
2795 		lock_mutex_enter();
2796 		mutex_enter(&trx_sys.mutex);
2797 		bool	found = false;
2798 		for (trx_t* trx = UT_LIST_GET_FIRST(trx_sys.trx_list);
2799 		     trx != NULL;
2800 		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
2801 			if (trx->state != TRX_STATE_NOT_STARTED
2802 			    && trx->mysql_thd != NULL
2803 			    && withdraw_started > trx->start_time) {
2804 				if (!found) {
2805 					ib::warn() <<
2806 						"The following trx might hold"
2807 						" the blocks in buffer pool to"
2808 					        " be withdrawn. Buffer pool"
2809 						" resizing can complete only"
2810 						" after all the transactions"
2811 						" below release the blocks.";
2812 					found = true;
2813 				}
2814 
2815 				lock_trx_print_wait_and_mvcc_state(
2816 					stderr, trx, current_time);
2817 			}
2818 		}
2819 		mutex_exit(&trx_sys.mutex);
2820 		lock_mutex_exit();
2821 
2822 		withdraw_started = current_time;
2823 	}
2824 
2825 	if (should_retry_withdraw) {
2826 		ib::info() << "Will retry to withdraw " << retry_interval
2827 			<< " seconds later.";
2828 		os_thread_sleep(retry_interval * 1000000);
2829 
2830 		if (retry_interval > 5) {
2831 			retry_interval = 10;
2832 		} else {
2833 			retry_interval *= 2;
2834 		}
2835 
2836 		goto withdraw_retry;
2837 	}
2838 
2839 
2840 	buf_resize_status("Latching whole of buffer pool.");
2841 
2842 #ifndef DBUG_OFF
2843 	{
2844 		bool	should_wait = true;
2845 
2846 		while (should_wait) {
2847 			should_wait = false;
2848 			DBUG_EXECUTE_IF(
2849 				"ib_buf_pool_resize_wait_before_resize",
2850 				should_wait = true; os_thread_sleep(10000););
2851 		}
2852 	}
2853 #endif /* !DBUG_OFF */
2854 
2855 	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
2856 		return;
2857 	}
2858 
2859 	/* Indicate critical path */
2860 	buf_pool_resizing = true;
2861 
2862 	/* Acquire all buf_pool_mutex/hash_lock */
2863 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2864 		buf_pool_t*	buf_pool = buf_pool_from_array(i);
2865 
2866 		buf_pool_mutex_enter(buf_pool);
2867 	}
2868 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2869 		buf_pool_t*	buf_pool = buf_pool_from_array(i);
2870 
2871 		hash_lock_x_all(buf_pool->page_hash);
2872 	}
2873 
2874 	buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t());
2875 
2876 	/* add/delete chunks */
2877 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2878 		buf_pool_t*	buf_pool = buf_pool_from_array(i);
2879 		buf_chunk_t*	chunk;
2880 		buf_chunk_t*	echunk;
2881 
2882 		buf_resize_status("buffer pool %lu :"
2883 			" resizing with chunks %lu to %lu.",
2884 			i, buf_pool->n_chunks, buf_pool->n_chunks_new);
2885 
2886 		if (buf_pool->n_chunks_new < buf_pool->n_chunks) {
2887 			/* delete chunks */
2888 			chunk = buf_pool->chunks
2889 				+ buf_pool->n_chunks_new;
2890 			echunk = buf_pool->chunks + buf_pool->n_chunks;
2891 
2892 			ulint	sum_freed = 0;
2893 
2894 			while (chunk < echunk) {
2895 				buf_block_t*	block = chunk->blocks;
2896 
2897 				/* buf_LRU_block_free_non_file_page()
2898 				invokes MEM_NOACCESS() on any blocks
2899 				that are in free_list. We must
2900 				cancel the effect of that. In MemorySanitizer,
2901 				MEM_NOACCESS() is no-op, so we must not do
2902 				anything special for it here. */
2903 #ifdef HAVE_valgrind
2904 # if !__has_feature(memory_sanitizer)
2905 				MEM_MAKE_DEFINED(chunk->mem,
2906 						 chunk->mem_size());
2907 # endif
2908 #else
2909 				MEM_MAKE_ADDRESSABLE(chunk->mem,
2910 						     chunk->mem_size());
2911 #endif
2912 
2913 				for (ulint j = chunk->size;
2914 				     j--; block++) {
2915 					mutex_free(&block->mutex);
2916 					rw_lock_free(&block->lock);
2917 
2918 					ut_d(rw_lock_free(
2919 						&block->debug_latch));
2920 				}
2921 
2922 				buf_pool->allocator.deallocate_large_dodump(
2923 					chunk->mem, &chunk->mem_pfx, chunk->mem_size());
2924 
2925 				sum_freed += chunk->size;
2926 
2927 				++chunk;
2928 			}
2929 
2930 			/* discard withdraw list */
2931 			UT_LIST_INIT(buf_pool->withdraw,
2932 				     &buf_page_t::list);
2933 			buf_pool->withdraw_target = 0;
2934 
2935 			ib::info() << "buffer pool " << i << " : "
2936 				<< buf_pool->n_chunks - buf_pool->n_chunks_new
2937 				<< " chunks (" << sum_freed
2938 				<< " blocks) were freed.";
2939 
2940 			buf_pool->n_chunks = buf_pool->n_chunks_new;
2941 		}
2942 
2943 		{
2944 			/* reallocate buf_pool->chunks */
2945 			const ulint	new_chunks_size
2946 				= buf_pool->n_chunks_new * sizeof(*chunk);
2947 
2948 			buf_chunk_t*	new_chunks
2949 				= reinterpret_cast<buf_chunk_t*>(
2950 					ut_zalloc_nokey_nofatal(new_chunks_size));
2951 
2952 			DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
2953 					ut_free(new_chunks);
2954 					new_chunks = NULL;);
2955 
2956 			if (new_chunks == NULL) {
2957 				ib::error() << "buffer pool " << i
2958 					<< " : failed to allocate"
2959 					" the chunk array.";
2960 				buf_pool->n_chunks_new
2961 					= buf_pool->n_chunks;
2962 				warning = true;
2963 				buf_pool->chunks_old = NULL;
2964 				for (ulint j = 0; j < buf_pool->n_chunks_new; j++) {
2965 					buf_pool_register_chunk(&(buf_pool->chunks[j]));
2966 				}
2967 				goto calc_buf_pool_size;
2968 			}
2969 
2970 			ulint	n_chunks_copy = ut_min(buf_pool->n_chunks_new,
2971 						       buf_pool->n_chunks);
2972 
2973 			memcpy(new_chunks, buf_pool->chunks,
2974 			       n_chunks_copy * sizeof(*chunk));
2975 
2976 			for (ulint j = 0; j < n_chunks_copy; j++) {
2977 				buf_pool_register_chunk(&new_chunks[j]);
2978 			}
2979 
2980 			buf_pool->chunks_old = buf_pool->chunks;
2981 			buf_pool->chunks = new_chunks;
2982 		}
2983 
2984 
2985 		if (buf_pool->n_chunks_new > buf_pool->n_chunks) {
2986 			/* add chunks */
2987 			chunk = buf_pool->chunks + buf_pool->n_chunks;
2988 			echunk = buf_pool->chunks
2989 				+ buf_pool->n_chunks_new;
2990 
2991 			ulint	sum_added = 0;
2992 			ulint	n_chunks = buf_pool->n_chunks;
2993 
2994 			while (chunk < echunk) {
2995 				ulong	unit = srv_buf_pool_chunk_unit;
2996 
2997 				if (!buf_chunk_init(buf_pool, chunk, unit)) {
2998 
2999 					ib::error() << "buffer pool " << i
3000 						<< " : failed to allocate"
3001 						" new memory.";
3002 
3003 					warning = true;
3004 
3005 					buf_pool->n_chunks_new
3006 						= n_chunks;
3007 
3008 					break;
3009 				}
3010 
3011 				sum_added += chunk->size;
3012 
3013 				++n_chunks;
3014 				++chunk;
3015 			}
3016 
3017 			ib::info() << "buffer pool " << i << " : "
3018 				<< buf_pool->n_chunks_new - buf_pool->n_chunks
3019 				<< " chunks (" << sum_added
3020 				<< " blocks) were added.";
3021 
3022 			buf_pool->n_chunks = n_chunks;
3023 		}
3024 calc_buf_pool_size:
3025 
3026 		/* recalc buf_pool->curr_size */
3027 		ulint	new_size = 0;
3028 
3029 		chunk = buf_pool->chunks;
3030 		do {
3031 			new_size += chunk->size;
3032 		} while (++chunk < buf_pool->chunks
3033 				   + buf_pool->n_chunks);
3034 
3035 		buf_pool->curr_size = new_size;
3036 		buf_pool->n_chunks_new = buf_pool->n_chunks;
3037 
3038 		if (buf_pool->chunks_old) {
3039 			ut_free(buf_pool->chunks_old);
3040 			buf_pool->chunks_old = NULL;
3041 		}
3042 	}
3043 
3044 	buf_pool_chunk_map_t*	chunk_map_old = buf_chunk_map_ref;
3045 	buf_chunk_map_ref = buf_chunk_map_reg;
3046 
3047 	/* set instance sizes */
3048 	{
3049 		ulint	curr_size = 0;
3050 
3051 		for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3052 			buf_pool = buf_pool_from_array(i);
3053 
3054 			ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0);
3055 
3056 			buf_pool->read_ahead_area =
3057 				ut_min(BUF_READ_AHEAD_PAGES,
3058 				       ut_2_power_up(buf_pool->curr_size /
3059 						      BUF_READ_AHEAD_PORTION));
3060 			buf_pool->curr_pool_size
3061 				= buf_pool->curr_size << srv_page_size_shift;
3062 			curr_size += buf_pool->curr_pool_size;
3063 			buf_pool->old_size = buf_pool->curr_size;
3064 		}
3065 		srv_buf_pool_curr_size = curr_size;
3066 		innodb_set_buf_pool_size(buf_pool_size_align(curr_size));
3067 	}
3068 
3069 	const bool	new_size_too_diff
3070 		= srv_buf_pool_base_size > srv_buf_pool_size * 2
3071 			|| srv_buf_pool_base_size * 2 < srv_buf_pool_size;
3072 
3073 	/* Normalize page_hash and zip_hash,
3074 	if the new size is too different */
3075 	if (!warning && new_size_too_diff) {
3076 
3077 		buf_resize_status("Resizing hash tables.");
3078 
3079 		for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3080 			buf_pool_t*	buf_pool = buf_pool_from_array(i);
3081 
3082 			buf_pool_resize_hash(buf_pool);
3083 
3084 			ib::info() << "buffer pool " << i
3085 				<< " : hash tables were resized.";
3086 		}
3087 	}
3088 
3089 	/* Release all buf_pool_mutex/page_hash */
3090 	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
3091 		buf_pool_t*	buf_pool = buf_pool_from_array(i);
3092 
3093 		hash_unlock_x_all(buf_pool->page_hash);
3094 		buf_pool_mutex_exit(buf_pool);
3095 	}
3096 
3097 	UT_DELETE(chunk_map_old);
3098 
3099 	buf_pool_resizing = false;
3100 
3101 	/* Normalize other components, if the new size is too different */
3102 	if (!warning && new_size_too_diff) {
3103 		srv_buf_pool_base_size = srv_buf_pool_size;
3104 
3105 		buf_resize_status("Resizing also other hash tables.");
3106 
3107 		/* normalize lock_sys */
3108 		srv_lock_table_size = 5
3109 			* (srv_buf_pool_size >> srv_page_size_shift);
3110 		lock_sys.resize(srv_lock_table_size);
3111 
3112 		/* normalize dict_sys */
3113 		dict_resize();
3114 
3115 		ib::info() << "Resized hash tables at lock_sys,"
3116 #ifdef BTR_CUR_HASH_ADAPT
3117 			" adaptive hash index,"
3118 #endif /* BTR_CUR_HASH_ADAPT */
3119 			" dictionary.";
3120 	}
3121 
3122 	/* normalize ibuf->max_size */
3123 	ibuf_max_size_update(srv_change_buffer_max_size);
3124 
3125 	if (srv_buf_pool_old_size != srv_buf_pool_size) {
3126 
3127 		ib::info() << "Completed to resize buffer pool from "
3128 			<< srv_buf_pool_old_size
3129 			<< " to " << srv_buf_pool_size << ".";
3130 		srv_buf_pool_old_size = srv_buf_pool_size;
3131 	}
3132 
3133 #ifdef BTR_CUR_HASH_ADAPT
3134 	/* enable AHI if needed */
3135 	if (btr_search_disabled) {
3136 		btr_search_enable(true);
3137 		ib::info() << "Re-enabled adaptive hash index.";
3138 	}
3139 #endif /* BTR_CUR_HASH_ADAPT */
3140 
3141 	char	now[32];
3142 
3143 	ut_sprintf_timestamp(now);
3144 	if (!warning) {
3145 		buf_resize_status("Completed resizing buffer pool at %s.",
3146 			now);
3147 	} else {
3148 		buf_resize_status("Resizing buffer pool failed,"
3149 			" finished resizing at %s.", now);
3150 	}
3151 
3152 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3153 	ut_a(buf_validate());
3154 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3155 
3156 	return;
3157 }
3158 
3159 /** This is the thread for resizing buffer pool. It waits for an event and
3160 when waked up either performs a resizing and sleeps again.
3161 @return	this function does not return, calls os_thread_exit()
3162 */
3163 extern "C"
3164 os_thread_ret_t
3165 DECLARE_THREAD(buf_resize_thread)(void*)
3166 {
3167 	my_thread_init();
3168 
3169 	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3170 		os_event_wait(srv_buf_resize_event);
3171 		os_event_reset(srv_buf_resize_event);
3172 
3173 		if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
3174 			break;
3175 		}
3176 
3177 		buf_pool_mutex_enter_all();
3178 		if (srv_buf_pool_old_size == srv_buf_pool_size) {
3179 			buf_pool_mutex_exit_all();
3180 			std::ostringstream sout;
3181 			sout << "Size did not change (old size = new size = "
3182 				<< srv_buf_pool_size << ". Nothing to do.";
3183 			buf_resize_status(sout.str().c_str());
3184 
3185 			/* nothing to do */
3186 			continue;
3187 		}
3188 		buf_pool_mutex_exit_all();
3189 
3190 		buf_pool_resize();
3191 	}
3192 
3193 	srv_buf_resize_thread_active = false;
3194 
3195 	my_thread_end();
3196 	os_thread_exit();
3197 
3198 	OS_THREAD_DUMMY_RETURN;
3199 }
3200 
3201 /********************************************************************//**
3202 Relocate a buffer control block.  Relocates the block on the LRU list
3203 and in buf_pool->page_hash.  Does not relocate bpage->list.
3204 The caller must take care of relocating bpage->list. */
3205 static
3206 void
3207 buf_relocate(
3208 /*=========*/
3209 	buf_page_t*	bpage,	/*!< in/out: control block being relocated;
3210 				buf_page_get_state(bpage) must be
3211 				BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
3212 	buf_page_t*	dpage)	/*!< in/out: destination control block */
3213 {
3214 	buf_page_t*	b;
3215 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
3216 
3217 	ut_ad(buf_pool_mutex_own(buf_pool));
3218 	ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
3219 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
3220 	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
3221 	ut_a(bpage->buf_fix_count == 0);
3222 	ut_ad(bpage->in_LRU_list);
3223 	ut_ad(!bpage->in_zip_hash);
3224 	ut_ad(bpage->in_page_hash);
3225 	ut_ad(bpage == buf_page_hash_get_low(buf_pool, bpage->id));
3226 
3227 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
3228 #ifdef UNIV_DEBUG
3229 	switch (buf_page_get_state(bpage)) {
3230 	case BUF_BLOCK_POOL_WATCH:
3231 	case BUF_BLOCK_NOT_USED:
3232 	case BUF_BLOCK_READY_FOR_USE:
3233 	case BUF_BLOCK_FILE_PAGE:
3234 	case BUF_BLOCK_MEMORY:
3235 	case BUF_BLOCK_REMOVE_HASH:
3236 		ut_error;
3237 	case BUF_BLOCK_ZIP_DIRTY:
3238 	case BUF_BLOCK_ZIP_PAGE:
3239 		break;
3240 	}
3241 #endif /* UNIV_DEBUG */
3242 
3243 	new (dpage) buf_page_t(*bpage);
3244 
3245 	/* Important that we adjust the hazard pointer before
3246 	removing bpage from LRU list. */
3247 	buf_LRU_adjust_hp(buf_pool, bpage);
3248 
3249 	ut_d(bpage->in_LRU_list = FALSE);
3250 	ut_d(bpage->in_page_hash = FALSE);
3251 
3252 	/* relocate buf_pool->LRU */
3253 	b = UT_LIST_GET_PREV(LRU, bpage);
3254 	UT_LIST_REMOVE(buf_pool->LRU, bpage);
3255 
3256 	if (b != NULL) {
3257 		UT_LIST_INSERT_AFTER(buf_pool->LRU, b, dpage);
3258 	} else {
3259 		UT_LIST_ADD_FIRST(buf_pool->LRU, dpage);
3260 	}
3261 
3262 	if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
3263 		buf_pool->LRU_old = dpage;
3264 #ifdef UNIV_LRU_DEBUG
3265 		/* buf_pool->LRU_old must be the first item in the LRU list
3266 		whose "old" flag is set. */
3267 		ut_a(buf_pool->LRU_old->old);
3268 		ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
3269 		     || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
3270 		ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
3271 		     || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
3272 	} else {
3273 		/* Check that the "old" flag is consistent in
3274 		the block and its neighbours. */
3275 		buf_page_set_old(dpage, buf_page_is_old(dpage));
3276 #endif /* UNIV_LRU_DEBUG */
3277 	}
3278 
3279         ut_d(CheckInLRUList::validate(buf_pool));
3280 
3281 	/* relocate buf_pool->page_hash */
3282 	ulint	fold = bpage->id.fold();
3283 	ut_ad(fold == dpage->id.fold());
3284 	HASH_REPLACE(buf_page_t, hash, buf_pool->page_hash, fold, bpage,
3285 		     dpage);
3286 }
3287 
3288 /** Hazard Pointer implementation. */
3289 
3290 /** Set current value
3291 @param bpage	buffer block to be set as hp */
3292 void
3293 HazardPointer::set(buf_page_t* bpage)
3294 {
3295 	ut_ad(mutex_own(m_mutex));
3296 	ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);
3297 	ut_ad(!bpage || buf_page_in_file(bpage));
3298 
3299 	m_hp = bpage;
3300 }
3301 
3302 /** Checks if a bpage is the hp
3303 @param bpage    buffer block to be compared
3304 @return true if it is hp */
3305 
3306 bool
3307 HazardPointer::is_hp(const buf_page_t* bpage)
3308 {
3309 	ut_ad(mutex_own(m_mutex));
3310 	ut_ad(!m_hp || buf_pool_from_bpage(m_hp) == m_buf_pool);
3311 	ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);
3312 
3313 	return(bpage == m_hp);
3314 }
3315 
3316 /** Adjust the value of hp. This happens when some other thread working
3317 on the same list attempts to remove the hp from the list.
3318 @param bpage	buffer block to be compared */
3319 
3320 void
3321 FlushHp::adjust(const buf_page_t* bpage)
3322 {
3323 	ut_ad(bpage != NULL);
3324 
3325 	/** We only support reverse traversal for now. */
3326 	if (is_hp(bpage)) {
3327 		m_hp = UT_LIST_GET_PREV(list, m_hp);
3328 	}
3329 
3330 	ut_ad(!m_hp || m_hp->in_flush_list);
3331 }
3332 
3333 /** Adjust the value of hp. This happens when some other thread working
3334 on the same list attempts to remove the hp from the list.
3335 @param bpage	buffer block to be compared */
3336 
3337 void
3338 LRUHp::adjust(const buf_page_t* bpage)
3339 {
3340 	ut_ad(bpage);
3341 
3342 	/** We only support reverse traversal for now. */
3343 	if (is_hp(bpage)) {
3344 		m_hp = UT_LIST_GET_PREV(LRU, m_hp);
3345 	}
3346 
3347 	ut_ad(!m_hp || m_hp->in_LRU_list);
3348 }
3349 
3350 /** Selects from where to start a scan. If we have scanned too deep into
3351 the LRU list it resets the value to the tail of the LRU list.
3352 @return buf_page_t from where to start scan. */
3353 
3354 buf_page_t*
3355 LRUItr::start()
3356 {
3357 	ut_ad(mutex_own(m_mutex));
3358 
3359 	if (!m_hp || m_hp->old) {
3360 		m_hp = UT_LIST_GET_LAST(m_buf_pool->LRU);
3361 	}
3362 
3363 	return(m_hp);
3364 }
3365 
3366 /** Determine if a block is a sentinel for a buffer pool watch.
3367 @param[in]	buf_pool	buffer pool instance
3368 @param[in]	bpage		block
3369 @return TRUE if a sentinel for a buffer pool watch, FALSE if not */
3370 ibool
3371 buf_pool_watch_is_sentinel(
3372 	const buf_pool_t*	buf_pool,
3373 	const buf_page_t*	bpage)
3374 {
3375 	/* We must also own the appropriate hash lock. */
3376 	ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
3377 	ut_ad(buf_page_in_file(bpage));
3378 
3379 	if (bpage < &buf_pool->watch[0]
3380 	    || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {
3381 
3382 		ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
3383 		      || bpage->zip.data != NULL);
3384 
3385 		return(FALSE);
3386 	}
3387 
3388 	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
3389 	ut_ad(!bpage->in_zip_hash);
3390 	ut_ad(bpage->in_page_hash);
3391 	ut_ad(bpage->zip.data == NULL);
3392 	return(TRUE);
3393 }
3394 
3395 /** Add watch for the given page to be read in. Caller must have
3396 appropriate hash_lock for the bpage. This function may release the
3397 hash_lock and reacquire it.
3398 @param[in]	page_id		page id
3399 @param[in,out]	hash_lock	hash_lock currently latched
3400 @return NULL if watch set, block if the page is in the buffer pool */
3401 static
3402 buf_page_t*
3403 buf_pool_watch_set(
3404 	const page_id_t		page_id,
3405 	rw_lock_t**		hash_lock)
3406 {
3407 	buf_page_t*	bpage;
3408 	ulint		i;
3409 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
3410 
3411 	ut_ad(*hash_lock == buf_page_hash_lock_get(buf_pool, page_id));
3412 
3413 	ut_ad(rw_lock_own(*hash_lock, RW_LOCK_X));
3414 
3415 	bpage = buf_page_hash_get_low(buf_pool, page_id);
3416 
3417 	if (bpage != NULL) {
3418 page_found:
3419 		if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
3420 			/* The page was loaded meanwhile. */
3421 			return(bpage);
3422 		}
3423 
3424 		/* Add to an existing watch. */
3425 		buf_block_fix(bpage);
3426 		return(NULL);
3427 	}
3428 
3429 	/* From this point this function becomes fairly heavy in terms
3430 	of latching. We acquire the buf_pool mutex as well as all the
3431 	hash_locks. buf_pool mutex is needed because any changes to
3432 	the page_hash must be covered by it and hash_locks are needed
3433 	because we don't want to read any stale information in
3434 	buf_pool->watch[]. However, it is not in the critical code path
3435 	as this function will be called only by the purge thread. */
3436 
3437 	/* To obey latching order first release the hash_lock. */
3438 	rw_lock_x_unlock(*hash_lock);
3439 
3440 	buf_pool_mutex_enter(buf_pool);
3441 	hash_lock_x_all(buf_pool->page_hash);
3442 
3443 	/* If not own buf_pool_mutex, page_hash can be changed. */
3444 	*hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
3445 
3446 	/* We have to recheck that the page
3447 	was not loaded or a watch set by some other
3448 	purge thread. This is because of the small
3449 	time window between when we release the
3450 	hash_lock to acquire buf_pool mutex above. */
3451 
3452 	bpage = buf_page_hash_get_low(buf_pool, page_id);
3453 	if (UNIV_LIKELY_NULL(bpage)) {
3454 		buf_pool_mutex_exit(buf_pool);
3455 		hash_unlock_x_all_but(buf_pool->page_hash, *hash_lock);
3456 		goto page_found;
3457 	}
3458 
3459 	/* The maximum number of purge threads should never exceed
3460 	BUF_POOL_WATCH_SIZE. So there is no way for purge thread
3461 	instance to hold a watch when setting another watch. */
3462 	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
3463 		bpage = &buf_pool->watch[i];
3464 
3465 		ut_ad(bpage->access_time == 0);
3466 		ut_ad(bpage->newest_modification == 0);
3467 		ut_ad(bpage->oldest_modification == 0);
3468 		ut_ad(bpage->zip.data == NULL);
3469 		ut_ad(!bpage->in_zip_hash);
3470 
3471 		switch (bpage->state) {
3472 		case BUF_BLOCK_POOL_WATCH:
3473 			ut_ad(!bpage->in_page_hash);
3474 			ut_ad(bpage->buf_fix_count == 0);
3475 
3476 			/* bpage is pointing to buf_pool->watch[],
3477 			which is protected by buf_pool->mutex.
3478 			Normally, buf_page_t objects are protected by
3479 			buf_block_t::mutex or buf_pool->zip_mutex or both. */
3480 
3481 			bpage->state = BUF_BLOCK_ZIP_PAGE;
3482 			bpage->id = page_id;
3483 			bpage->buf_fix_count = 1;
3484 
3485 			ut_d(bpage->in_page_hash = TRUE);
3486 			HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
3487 				    page_id.fold(), bpage);
3488 
3489 			buf_pool_mutex_exit(buf_pool);
3490 			/* Once the sentinel is in the page_hash we can
3491 			safely release all locks except just the
3492 			relevant hash_lock */
3493 			hash_unlock_x_all_but(buf_pool->page_hash,
3494 						*hash_lock);
3495 
3496 			return(NULL);
3497 		case BUF_BLOCK_ZIP_PAGE:
3498 			ut_ad(bpage->in_page_hash);
3499 			ut_ad(bpage->buf_fix_count > 0);
3500 			break;
3501 		default:
3502 			ut_error;
3503 		}
3504 	}
3505 
3506 	/* Allocation failed.  Either the maximum number of purge
3507 	threads should never exceed BUF_POOL_WATCH_SIZE, or this code
3508 	should be modified to return a special non-NULL value and the
3509 	caller should purge the record directly. */
3510 	ut_error;
3511 
3512 	/* Fix compiler warning */
3513 	return(NULL);
3514 }
3515 
3516 /** Remove the sentinel block for the watch before replacing it with a
3517 real block. buf_page_watch_clear() or buf_page_watch_occurred() will notice
3518 that the block has been replaced with the real block.
3519 @param[in,out]	buf_pool	buffer pool instance
3520 @param[in,out]	watch		sentinel for watch
3521 @return reference count, to be added to the replacement block */
3522 static
3523 void
3524 buf_pool_watch_remove(
3525 	buf_pool_t*	buf_pool,
3526 	buf_page_t*	watch)
3527 {
3528 #ifdef UNIV_DEBUG
3529 	/* We must also own the appropriate hash_bucket mutex. */
3530 	rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, watch->id);
3531 	ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
3532 #endif /* UNIV_DEBUG */
3533 
3534 	ut_ad(buf_pool_mutex_own(buf_pool));
3535 
3536 	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, watch->id.fold(),
3537 		    watch);
3538 	ut_d(watch->in_page_hash = FALSE);
3539 	watch->buf_fix_count = 0;
3540 	watch->state = BUF_BLOCK_POOL_WATCH;
3541 }
3542 
3543 /** Stop watching if the page has been read in.
3544 buf_pool_watch_set(same_page_id) must have returned NULL before.
3545 @param[in]	page_id	page id */
3546 void buf_pool_watch_unset(const page_id_t page_id)
3547 {
3548 	buf_page_t*	bpage;
3549 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
3550 
3551 	/* We only need to have buf_pool mutex in case where we end
3552 	up calling buf_pool_watch_remove but to obey latching order
3553 	we acquire it here before acquiring hash_lock. This should
3554 	not cause too much grief as this function is only ever
3555 	called from the purge thread. */
3556 	buf_pool_mutex_enter(buf_pool);
3557 
3558 	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
3559 	rw_lock_x_lock(hash_lock);
3560 
3561 	/* The page must exist because buf_pool_watch_set()
3562 	increments buf_fix_count. */
3563 	bpage = buf_page_hash_get_low(buf_pool, page_id);
3564 
3565 	if (buf_block_unfix(bpage) == 0
3566 	    && buf_pool_watch_is_sentinel(buf_pool, bpage)) {
3567 		buf_pool_watch_remove(buf_pool, bpage);
3568 	}
3569 
3570 	buf_pool_mutex_exit(buf_pool);
3571 	rw_lock_x_unlock(hash_lock);
3572 }
3573 
3574 /** Check if the page has been read in.
3575 This may only be called after buf_pool_watch_set(same_page_id)
3576 has returned NULL and before invoking buf_pool_watch_unset(same_page_id).
3577 @param[in]	page_id	page id
3578 @return false if the given page was not read in, true if it was */
3579 bool buf_pool_watch_occurred(const page_id_t page_id)
3580 {
3581 	bool		ret;
3582 	buf_page_t*	bpage;
3583 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
3584 	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
3585 
3586 	rw_lock_s_lock(hash_lock);
3587 
3588 	/* If not own buf_pool_mutex, page_hash can be changed. */
3589 	hash_lock = buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id);
3590 
3591 	/* The page must exist because buf_pool_watch_set()
3592 	increments buf_fix_count. */
3593 	bpage = buf_page_hash_get_low(buf_pool, page_id);
3594 
3595 	ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
3596 	rw_lock_s_unlock(hash_lock);
3597 
3598 	return(ret);
3599 }
3600 
3601 /********************************************************************//**
3602 Moves a page to the start of the buffer pool LRU list. This high-level
3603 function can be used to prevent an important page from slipping out of
3604 the buffer pool. */
3605 void
3606 buf_page_make_young(
3607 /*================*/
3608 	buf_page_t*	bpage)	/*!< in: buffer block of a file page */
3609 {
3610 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
3611 
3612 	buf_pool_mutex_enter(buf_pool);
3613 
3614 	ut_a(buf_page_in_file(bpage));
3615 
3616 	buf_LRU_make_block_young(bpage);
3617 
3618 	buf_pool_mutex_exit(buf_pool);
3619 }
3620 
3621 /********************************************************************//**
3622 Moves a page to the start of the buffer pool LRU list if it is too old.
3623 This high-level function can be used to prevent an important page from
3624 slipping out of the buffer pool. */
3625 static
3626 void
3627 buf_page_make_young_if_needed(
3628 /*==========================*/
3629 	buf_page_t*	bpage)		/*!< in/out: buffer block of a
3630 					file page */
3631 {
3632 #ifdef UNIV_DEBUG
3633 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
3634 	ut_ad(!buf_pool_mutex_own(buf_pool));
3635 #endif /* UNIV_DEBUG */
3636 	ut_a(buf_page_in_file(bpage));
3637 
3638 	if (buf_page_peek_if_too_old(bpage)) {
3639 		buf_page_make_young(bpage);
3640 	}
3641 }
3642 
3643 #ifdef UNIV_DEBUG
3644 
3645 /** Sets file_page_was_freed TRUE if the page is found in the buffer pool.
3646 This function should be called when we free a file page and want the
3647 debug version to check that it is not accessed any more unless
3648 reallocated.
3649 @param[in]	page_id	page id
3650 @return control block if found in page hash table, otherwise NULL */
3651 buf_page_t* buf_page_set_file_page_was_freed(const page_id_t page_id)
3652 {
3653 	buf_page_t*	bpage;
3654 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
3655 	rw_lock_t*	hash_lock;
3656 
3657 	bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock);
3658 
3659 	if (bpage) {
3660 		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
3661 		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
3662 		mutex_enter(block_mutex);
3663 		rw_lock_s_unlock(hash_lock);
3664 		/* bpage->file_page_was_freed can already hold
3665 		when this code is invoked from dict_drop_index_tree() */
3666 		bpage->file_page_was_freed = TRUE;
3667 		mutex_exit(block_mutex);
3668 	}
3669 
3670 	return(bpage);
3671 }
3672 
3673 /** Sets file_page_was_freed FALSE if the page is found in the buffer pool.
3674 This function should be called when we free a file page and want the
3675 debug version to check that it is not accessed any more unless
3676 reallocated.
3677 @param[in]	page_id	page id
3678 @return control block if found in page hash table, otherwise NULL */
3679 buf_page_t* buf_page_reset_file_page_was_freed(const page_id_t page_id)
3680 {
3681 	buf_page_t*	bpage;
3682 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
3683 	rw_lock_t*	hash_lock;
3684 
3685 	bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock);
3686 	if (bpage) {
3687 		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
3688 		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
3689 		mutex_enter(block_mutex);
3690 		rw_lock_s_unlock(hash_lock);
3691 		bpage->file_page_was_freed = FALSE;
3692 		mutex_exit(block_mutex);
3693 	}
3694 
3695 	return(bpage);
3696 }
3697 #endif /* UNIV_DEBUG */
3698 
3699 /** Attempts to discard the uncompressed frame of a compressed page.
3700 The caller should not be holding any mutexes when this function is called.
3701 @param[in]	page_id	page id */
3702 static void buf_block_try_discard_uncompressed(const page_id_t page_id)
3703 {
3704 	buf_page_t*	bpage;
3705 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
3706 
3707 	/* Since we need to acquire buf_pool mutex to discard
3708 	the uncompressed frame and because page_hash mutex resides
3709 	below buf_pool mutex in sync ordering therefore we must
3710 	first release the page_hash mutex. This means that the
3711 	block in question can move out of page_hash. Therefore
3712 	we need to check again if the block is still in page_hash. */
3713 	buf_pool_mutex_enter(buf_pool);
3714 
3715 	bpage = buf_page_hash_get(buf_pool, page_id);
3716 
3717 	if (bpage) {
3718 		buf_LRU_free_page(bpage, false);
3719 	}
3720 
3721 	buf_pool_mutex_exit(buf_pool);
3722 }
3723 
3724 /** Get read access to a compressed page (usually of type
3725 FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
3726 The page must be released with buf_page_release_zip().
3727 NOTE: the page is not protected by any latch.  Mutual exclusion has to
3728 be implemented at a higher level.  In other words, all possible
3729 accesses to a given page through this function must be protected by
3730 the same set of mutexes or latches.
3731 @param[in]	page_id		page id
3732 @param[in]	page_size	page size
3733 @return pointer to the block */
3734 buf_page_t*
3735 buf_page_get_zip(
3736 	const page_id_t		page_id,
3737 	const page_size_t&	page_size)
3738 {
3739 	buf_page_t*	bpage;
3740 	BPageMutex*	block_mutex;
3741 	rw_lock_t*	hash_lock;
3742 	ibool		discard_attempted = FALSE;
3743 	ibool		must_read;
3744 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
3745 
3746 	buf_pool->stat.n_page_gets++;
3747 
3748 	for (;;) {
3749 lookup:
3750 
3751 		/* The following call will also grab the page_hash
3752 		mutex if the page is found. */
3753 		bpage = buf_page_hash_get_s_locked(buf_pool, page_id,
3754 						   &hash_lock);
3755 		if (bpage) {
3756 			ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
3757 			break;
3758 		}
3759 
3760 		/* Page not in buf_pool: needs to be read from file */
3761 
3762 		ut_ad(!hash_lock);
3763 		dberr_t err = buf_read_page(page_id, page_size);
3764 
3765 		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
3766 			ib::error() << "Reading compressed page " << page_id
3767 				<< " failed with error: " << err;
3768 
3769 			goto err_exit;
3770 		}
3771 
3772 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3773 		ut_a(++buf_dbg_counter % 5771 || buf_validate());
3774 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3775 	}
3776 
3777 	ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));
3778 
3779 	if (!bpage->zip.data) {
3780 		/* There is no compressed page. */
3781 err_exit:
3782 		rw_lock_s_unlock(hash_lock);
3783 		return(NULL);
3784 	}
3785 
3786 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
3787 
3788 	switch (buf_page_get_state(bpage)) {
3789 	case BUF_BLOCK_ZIP_PAGE:
3790 	case BUF_BLOCK_ZIP_DIRTY:
3791 		buf_block_fix(bpage);
3792 		block_mutex = &buf_pool->zip_mutex;
3793 		goto got_block;
3794 	case BUF_BLOCK_FILE_PAGE:
3795 		/* Discard the uncompressed page frame if possible. */
3796 		if (!discard_attempted) {
3797 			rw_lock_s_unlock(hash_lock);
3798 			buf_block_try_discard_uncompressed(page_id);
3799 			discard_attempted = TRUE;
3800 			goto lookup;
3801 		}
3802 
3803 		buf_block_buf_fix_inc((buf_block_t*) bpage,
3804 				      __FILE__, __LINE__);
3805 
3806 		block_mutex = &((buf_block_t*) bpage)->mutex;
3807 		goto got_block;
3808 	default:
3809 		break;
3810 	}
3811 
3812 	ut_error;
3813 	goto err_exit;
3814 
3815 got_block:
3816 	mutex_enter(block_mutex);
3817 	must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
3818 
3819 	rw_lock_s_unlock(hash_lock);
3820 
3821 	ut_ad(!bpage->file_page_was_freed);
3822 
3823 	buf_page_set_accessed(bpage);
3824 
3825 	mutex_exit(block_mutex);
3826 
3827 	buf_page_make_young_if_needed(bpage);
3828 
3829 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3830 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
3831 	ut_a(bpage->buf_fix_count > 0);
3832 	ut_a(buf_page_in_file(bpage));
3833 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3834 
3835 	if (must_read) {
3836 		/* Let us wait until the read operation
3837 		completes */
3838 
3839 		for (;;) {
3840 			enum buf_io_fix	io_fix;
3841 
3842 			mutex_enter(block_mutex);
3843 			io_fix = buf_page_get_io_fix(bpage);
3844 			mutex_exit(block_mutex);
3845 
3846 			if (io_fix == BUF_IO_READ) {
3847 
3848 				os_thread_sleep(WAIT_FOR_READ);
3849 			} else {
3850 				break;
3851 			}
3852 		}
3853 	}
3854 
3855 	return(bpage);
3856 }
3857 
3858 /********************************************************************//**
3859 Initialize some fields of a control block. */
3860 UNIV_INLINE
3861 void
3862 buf_block_init_low(
3863 /*===============*/
3864 	buf_block_t*	block)	/*!< in: block to init */
3865 {
3866 #ifdef BTR_CUR_HASH_ADAPT
3867 	/* No adaptive hash index entries may point to a previously
3868 	unused (and now freshly allocated) block. */
3869 	assert_block_ahi_empty_on_init(block);
3870 	block->index		= NULL;
3871 
3872 	block->n_hash_helps	= 0;
3873 	block->n_fields		= 1;
3874 	block->n_bytes		= 0;
3875 	block->left_side	= TRUE;
3876 #endif /* BTR_CUR_HASH_ADAPT */
3877 }
3878 
3879 /********************************************************************//**
3880 Decompress a block.
3881 @return TRUE if successful */
3882 ibool
3883 buf_zip_decompress(
3884 /*===============*/
3885 	buf_block_t*	block,	/*!< in/out: block */
3886 	ibool		check)	/*!< in: TRUE=verify the page checksum */
3887 {
3888 	const byte*	frame = block->page.zip.data;
3889 	ulint		size = page_zip_get_size(&block->page.zip);
3890 	/* The tablespace will not be found if this function is called
3891 	during IMPORT. */
3892 	fil_space_t* space = fil_space_acquire_for_io(block->page.id.space());
3893 	const unsigned key_version = mach_read_from_4(
3894 		frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
3895 	fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
3896 	const bool encrypted = crypt_data
3897 		&& crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
3898 		&& (!crypt_data->is_default_encryption()
3899 		    || srv_encrypt_tables);
3900 
3901 	ut_ad(block->page.size.is_compressed());
3902 	ut_a(block->page.id.space() != 0);
3903 
3904 	if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
3905 
3906 		ib::error() << "Compressed page checksum mismatch for "
3907 			<< (space ? space->chain.start->name : "")
3908 			<< block->page.id << ": stored: "
3909 			<< mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
3910 			<< ", crc32: "
3911 			<< page_zip_calc_checksum(
3912 				frame, size, SRV_CHECKSUM_ALGORITHM_CRC32)
3913 #ifdef INNODB_BUG_ENDIAN_CRC32
3914 			<< "/"
3915 			<< page_zip_calc_checksum(
3916 				frame, size, SRV_CHECKSUM_ALGORITHM_CRC32,
3917 				true)
3918 #endif
3919 			<< " innodb: "
3920 			<< page_zip_calc_checksum(
3921 				frame, size, SRV_CHECKSUM_ALGORITHM_INNODB)
3922 			<< ", none: "
3923 			<< page_zip_calc_checksum(
3924 				frame, size, SRV_CHECKSUM_ALGORITHM_NONE)
3925 			<< " (algorithm: " << srv_checksum_algorithm << ")";
3926 
3927 		goto err_exit;
3928 	}
3929 
3930 	switch (fil_page_get_type(frame)) {
3931 	case FIL_PAGE_INDEX:
3932 	case FIL_PAGE_RTREE:
3933 		if (page_zip_decompress(&block->page.zip,
3934 					block->frame, TRUE)) {
3935 			if (space) {
3936 				space->release_for_io();
3937 			}
3938 			return(TRUE);
3939 		}
3940 
3941 		ib::error() << "Unable to decompress "
3942 			<< (space ? space->chain.start->name : "")
3943 			<< block->page.id;
3944 		goto err_exit;
3945 	case FIL_PAGE_TYPE_ALLOCATED:
3946 	case FIL_PAGE_INODE:
3947 	case FIL_PAGE_IBUF_BITMAP:
3948 	case FIL_PAGE_TYPE_FSP_HDR:
3949 	case FIL_PAGE_TYPE_XDES:
3950 	case FIL_PAGE_TYPE_ZBLOB:
3951 	case FIL_PAGE_TYPE_ZBLOB2:
3952 		/* Copy to uncompressed storage. */
3953 		memcpy(block->frame, frame, block->page.size.physical());
3954 		if (space) {
3955 			space->release_for_io();
3956 		}
3957 
3958 		return(TRUE);
3959 	}
3960 
3961 	ib::error() << "Unknown compressed page type "
3962 		<< fil_page_get_type(frame)
3963 		<< " in " << (space ? space->chain.start->name : "")
3964 		<< block->page.id;
3965 
3966 err_exit:
3967 	if (encrypted) {
3968 		ib::info() << "Row compressed page could be encrypted"
3969 			" with key_version " << key_version;
3970 	}
3971 
3972 	if (space) {
3973 		if (encrypted) {
3974 			dict_set_encrypted_by_space(space);
3975 		} else {
3976 			dict_set_corrupted_by_space(space);
3977 		}
3978 
3979 		space->release_for_io();
3980 	}
3981 
3982 	return(FALSE);
3983 }
3984 
3985 #ifdef BTR_CUR_HASH_ADAPT
3986 /** Get a buffer block from an adaptive hash index pointer.
3987 This function does not return if the block is not identified.
3988 @param[in]	ptr	pointer to within a page frame
3989 @return pointer to block, never NULL */
3990 buf_block_t*
3991 buf_block_from_ahi(const byte* ptr)
3992 {
3993 	buf_pool_chunk_map_t::iterator it;
3994 
3995 	buf_pool_chunk_map_t*	chunk_map = buf_chunk_map_ref;
3996 	ut_ad(buf_chunk_map_ref == buf_chunk_map_reg);
3997 	ut_ad(!buf_pool_resizing);
3998 
3999 	buf_chunk_t*	chunk;
4000 	it = chunk_map->upper_bound(ptr);
4001 
4002 	ut_a(it != chunk_map->begin());
4003 
4004 	if (it == chunk_map->end()) {
4005 		chunk = chunk_map->rbegin()->second;
4006 	} else {
4007 		chunk = (--it)->second;
4008 	}
4009 
4010 	ulint		offs = ulint(ptr - chunk->blocks->frame);
4011 
4012 	offs >>= srv_page_size_shift;
4013 
4014 	ut_a(offs < chunk->size);
4015 
4016 	buf_block_t*	block = &chunk->blocks[offs];
4017 
4018 	/* The function buf_chunk_init() invokes buf_block_init() so that
4019 	block[n].frame == block->frame + n * srv_page_size.  Check it. */
4020 	ut_ad(block->frame == page_align(ptr));
4021 	/* Read the state of the block without holding a mutex.
4022 	A state transition from BUF_BLOCK_FILE_PAGE to
4023 	BUF_BLOCK_REMOVE_HASH is possible during this execution. */
4024 	ut_d(const buf_page_state state = buf_block_get_state(block));
4025 	ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
4026 	return(block);
4027 }
4028 #endif /* BTR_CUR_HASH_ADAPT */
4029 
4030 /********************************************************************//**
4031 Find out if a pointer belongs to a buf_block_t. It can be a pointer to
4032 the buf_block_t itself or a member of it
4033 @return TRUE if ptr belongs to a buf_block_t struct */
4034 ibool
4035 buf_pointer_is_block_field(
4036 /*=======================*/
4037 	const void*	ptr)	/*!< in: pointer not dereferenced */
4038 {
4039 	ulint	i;
4040 
4041 	for (i = 0; i < srv_buf_pool_instances; i++) {
4042 		if (buf_pool_from_array(i)->is_block_field(ptr)) {
4043 			return(TRUE);
4044 		}
4045 	}
4046 
4047 	return(FALSE);
4048 }
4049 
4050 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
4051 /********************************************************************//**
4052 Return true if probe is enabled.
4053 @return true if probe enabled. */
4054 static
4055 bool
4056 buf_debug_execute_is_force_flush()
4057 /*==============================*/
4058 {
4059 	DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );
4060 
4061 	/* This is used during queisce testing, we want to ensure maximum
4062 	buffering by the change buffer. */
4063 
4064 	if (srv_ibuf_disable_background_merge) {
4065 		return(true);
4066 	}
4067 
4068 	return(false);
4069 }
4070 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
4071 
4072 /** Wait for the block to be read in.
4073 @param[in]	block	The block to check */
4074 static
4075 void
4076 buf_wait_for_read(
4077 	buf_block_t*	block)
4078 {
4079 	/* Note:
4080 
4081 	We are using the block->lock to check for IO state (and a dirty read).
4082 	We set the IO_READ state under the protection of the hash_lock
4083 	(and block->mutex). This is safe because another thread can only
4084 	access the block (and check for IO state) after the block has been
4085 	added to the page hashtable. */
4086 
4087 	if (buf_block_get_io_fix(block) == BUF_IO_READ) {
4088 
4089 		/* Wait until the read operation completes */
4090 
4091 		BPageMutex*	mutex = buf_page_get_mutex(&block->page);
4092 
4093 		for (;;) {
4094 			buf_io_fix	io_fix;
4095 
4096 			mutex_enter(mutex);
4097 
4098 			io_fix = buf_block_get_io_fix(block);
4099 
4100 			mutex_exit(mutex);
4101 
4102 			if (io_fix == BUF_IO_READ) {
4103 				/* Wait by temporaly s-latch */
4104 				rw_lock_s_lock(&block->lock);
4105 				rw_lock_s_unlock(&block->lock);
4106 			} else {
4107 				break;
4108 			}
4109 		}
4110 	}
4111 }
4112 
4113 #ifdef BTR_CUR_HASH_ADAPT
4114 /** If a stale adaptive hash index exists on the block, drop it.
4115 Multiple executions of btr_search_drop_page_hash_index() on the
4116 same block must be prevented by exclusive page latch. */
4117 ATTRIBUTE_COLD
4118 static void buf_defer_drop_ahi(buf_block_t *block, mtr_memo_type_t fix_type)
4119 {
4120   switch (fix_type) {
4121   case MTR_MEMO_BUF_FIX:
4122     /* We do not drop the adaptive hash index, because safely doing
4123     so would require acquiring block->lock, and that is not safe
4124     to acquire in some RW_NO_LATCH access paths. Those code paths
4125     should have no business accessing the adaptive hash index anyway. */
4126     break;
4127   case MTR_MEMO_PAGE_S_FIX:
4128     /* Temporarily release our S-latch. */
4129     rw_lock_s_unlock(&block->lock);
4130     rw_lock_x_lock(&block->lock);
4131     if (dict_index_t *index= block->index)
4132       if (index->freed())
4133         btr_search_drop_page_hash_index(block);
4134     rw_lock_x_unlock(&block->lock);
4135     rw_lock_s_lock(&block->lock);
4136     break;
4137   case MTR_MEMO_PAGE_SX_FIX:
4138     rw_lock_sx_unlock(&block->lock);
4139     rw_lock_x_lock(&block->lock);
4140     if (dict_index_t *index= block->index)
4141       if (index->freed())
4142         btr_search_drop_page_hash_index(block);
4143     rw_lock_x_unlock(&block->lock);
4144     rw_lock_sx_lock(&block->lock);
4145     break;
4146   default:
4147     ut_ad(fix_type == MTR_MEMO_PAGE_X_FIX);
4148     btr_search_drop_page_hash_index(block);
4149   }
4150 }
4151 #endif /* BTR_CUR_HASH_ADAPT */
4152 
4153 /** Lock the page with the given latch type.
4154 @param[in,out]	block		block to be locked
4155 @param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
4156 @param[in]	mtr		mini-transaction
4157 @param[in]	file		file name
4158 @param[in]	line		line where called
4159 @return pointer to locked block */
4160 static buf_block_t* buf_page_mtr_lock(buf_block_t *block,
4161                                       ulint rw_latch,
4162                                       mtr_t* mtr,
4163                                       const char *file,
4164                                       unsigned line)
4165 {
4166   mtr_memo_type_t fix_type;
4167   switch (rw_latch)
4168   {
4169   case RW_NO_LATCH:
4170     fix_type= MTR_MEMO_BUF_FIX;
4171     goto done;
4172   case RW_S_LATCH:
4173     rw_lock_s_lock_inline(&block->lock, 0, file, line);
4174     fix_type= MTR_MEMO_PAGE_S_FIX;
4175     break;
4176   case RW_SX_LATCH:
4177     rw_lock_sx_lock_inline(&block->lock, 0, file, line);
4178     fix_type= MTR_MEMO_PAGE_SX_FIX;
4179     break;
4180   default:
4181     ut_ad(rw_latch == RW_X_LATCH);
4182     rw_lock_x_lock_inline(&block->lock, 0, file, line);
4183     fix_type= MTR_MEMO_PAGE_X_FIX;
4184     break;
4185   }
4186 
4187 #ifdef BTR_CUR_HASH_ADAPT
4188   {
4189     dict_index_t *index= block->index;
4190     if (index && index->freed())
4191       buf_defer_drop_ahi(block, fix_type);
4192   }
4193 #endif /* BTR_CUR_HASH_ADAPT */
4194 
4195 done:
4196   mtr_memo_push(mtr, block, fix_type);
4197   return block;
4198 }
4199 
4200 /** This is the low level function used to get access to a database page.
4201 @param[in]	page_id		page id
4202 @param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
4203 @param[in]	guess		guessed block or NULL
4204 @param[in]	mode		BUF_GET, BUF_GET_IF_IN_POOL,
4205 BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
4206 @param[in]	file		file name
4207 @param[in]	line		line where called
4208 @param[in]	mtr		mini-transaction
4209 @return pointer to the block or NULL */
4210 buf_block_t*
4211 buf_page_get_low(
4212 	const page_id_t		page_id,
4213 	const page_size_t&	page_size,
4214 	ulint			rw_latch,
4215 	buf_block_t*		guess,
4216 	ulint			mode,
4217 	const char*		file,
4218 	unsigned		line,
4219 	mtr_t*			mtr,
4220 	dberr_t*		err)
4221 {
4222 	buf_block_t*	block;
4223 	unsigned	access_time;
4224 	rw_lock_t*	hash_lock;
4225 	buf_block_t*	fix_block;
4226 	ulint		retries = 0;
4227 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
4228 
4229 	ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL));
4230 	ut_ad(!mtr || mtr->is_active());
4231 	ut_ad((rw_latch == RW_S_LATCH)
4232 	      || (rw_latch == RW_X_LATCH)
4233 	      || (rw_latch == RW_SX_LATCH)
4234 	      || (rw_latch == RW_NO_LATCH));
4235 
4236 	if (err) {
4237 		*err = DB_SUCCESS;
4238 	}
4239 
4240 #ifdef UNIV_DEBUG
4241 	switch (mode) {
4242 	case BUF_EVICT_IF_IN_POOL:
4243 		/* After DISCARD TABLESPACE, the tablespace would not exist,
4244 		but in IMPORT TABLESPACE, PageConverter::operator() must
4245 		replace any old pages, which were not evicted during DISCARD.
4246 		Skip the assertion on space_page_size. */
4247 		break;
4248 	case BUF_PEEK_IF_IN_POOL:
4249 	case BUF_GET_IF_IN_POOL:
4250 		/* The caller may pass a dummy page size,
4251 		because it does not really matter. */
4252 		break;
4253 	default:
4254 		ut_error;
4255 	case BUF_GET_NO_LATCH:
4256 		ut_ad(rw_latch == RW_NO_LATCH);
4257 		/* fall through */
4258 	case BUF_GET:
4259 	case BUF_GET_IF_IN_POOL_OR_WATCH:
4260 	case BUF_GET_POSSIBLY_FREED:
4261 		bool			found;
4262 		const page_size_t&	space_page_size
4263 			= fil_space_get_page_size(page_id.space(), &found);
4264 		ut_ad(found);
4265 		ut_ad(page_size.equals_to(space_page_size));
4266 	}
4267 #endif /* UNIV_DEBUG */
4268 
4269 	ut_ad(!mtr || !ibuf_inside(mtr)
4270 	      || ibuf_page_low(page_id, page_size, FALSE, file, line, NULL));
4271 
4272 	buf_pool->stat.n_page_gets++;
4273 	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
4274 loop:
4275 	block = guess;
4276 
4277 	rw_lock_s_lock(hash_lock);
4278 
4279 	/* If not own buf_pool_mutex, page_hash can be changed. */
4280 	hash_lock = buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id);
4281 
4282 	if (block != NULL) {
4283 
4284 		/* If the guess is a compressed page descriptor that
4285 		has been allocated by buf_page_alloc_descriptor(),
4286 		it may have been freed by buf_relocate(). */
4287 
4288 		if (!buf_pool->is_block_field(block)
4289 		    || page_id != block->page.id
4290 		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
4291 
4292 			/* Our guess was bogus or things have changed
4293 			since. */
4294 			block = guess = NULL;
4295 		} else {
4296 			ut_ad(!block->page.in_zip_hash);
4297 		}
4298 	}
4299 
4300 	if (block == NULL) {
4301 		block = (buf_block_t*) buf_page_hash_get_low(buf_pool, page_id);
4302 	}
4303 
4304 	if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
4305 		rw_lock_s_unlock(hash_lock);
4306 		block = NULL;
4307 	}
4308 
4309 	if (block == NULL) {
4310 
4311 		/* Page not in buf_pool: needs to be read from file */
4312 
4313 		if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
4314 			rw_lock_x_lock(hash_lock);
4315 
4316 			/* If not own buf_pool_mutex,
4317 			page_hash can be changed. */
4318 			hash_lock = buf_page_hash_lock_x_confirm(
4319 				hash_lock, buf_pool, page_id);
4320 
4321 			block = (buf_block_t*) buf_pool_watch_set(
4322 				page_id, &hash_lock);
4323 
4324 			if (block) {
4325 				/* We can release hash_lock after we
4326 				increment the fix count to make
4327 				sure that no state change takes place. */
4328 				fix_block = block;
4329 
4330 				if (fsp_is_system_temporary(page_id.space())) {
4331 					/* For temporary tablespace,
4332 					the mutex is being used for
4333 					synchronization between user
4334 					thread and flush thread,
4335 					instead of block->lock. See
4336 					buf_flush_page() for the flush
4337 					thread counterpart. */
4338 
4339 					BPageMutex*	fix_mutex
4340 						= buf_page_get_mutex(
4341 							&fix_block->page);
4342 					mutex_enter(fix_mutex);
4343 					buf_block_fix(fix_block);
4344 					mutex_exit(fix_mutex);
4345 				} else {
4346 					buf_block_fix(fix_block);
4347 				}
4348 
4349 				/* Now safe to release page_hash mutex */
4350 				rw_lock_x_unlock(hash_lock);
4351 				goto got_block;
4352 			}
4353 
4354 			rw_lock_x_unlock(hash_lock);
4355 		}
4356 
4357 		switch (mode) {
4358 		case BUF_GET_IF_IN_POOL:
4359 		case BUF_GET_IF_IN_POOL_OR_WATCH:
4360 		case BUF_PEEK_IF_IN_POOL:
4361 		case BUF_EVICT_IF_IN_POOL:
4362 			ut_ad(!rw_lock_own_flagged(
4363 				      hash_lock,
4364 				      RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
4365 			return(NULL);
4366 		}
4367 
4368 		/* The call path is buf_read_page() ->
4369 		buf_read_page_low() (fil_io()) ->
4370 		buf_page_io_complete() ->
4371 		buf_decrypt_after_read(). Here fil_space_t* is used
4372 		and we decrypt -> buf_page_check_corrupt() where page
4373 		checksums are compared. Decryption, decompression as
4374 		well as error handling takes place at a lower level.
4375 		Here we only need to know whether the page really is
4376 		corrupted, or if an encrypted page with a valid
4377 		checksum cannot be decypted. */
4378 
4379 		dberr_t local_err = buf_read_page(page_id, page_size);
4380 
4381 		if (local_err == DB_SUCCESS) {
4382 			buf_read_ahead_random(page_id, page_size,
4383 					      ibuf_inside(mtr));
4384 
4385 			retries = 0;
4386 		} else if (mode == BUF_GET_POSSIBLY_FREED) {
4387 			if (err) {
4388 				*err = local_err;
4389 			}
4390 			return NULL;
4391 		} else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
4392 			++retries;
4393 
4394 			DBUG_EXECUTE_IF(
4395 				"innodb_page_corruption_retries",
4396 				retries = BUF_PAGE_READ_MAX_RETRIES;
4397 			);
4398 		} else {
4399 			if (err) {
4400 				*err = local_err;
4401 			}
4402 
4403 			/* Pages whose encryption key is unavailable or used
4404 			key, encryption algorithm or encryption method is
4405 			incorrect are marked as encrypted in
4406 			buf_page_check_corrupt(). Unencrypted page could be
4407 			corrupted in a way where the key_id field is
4408 			nonzero. There is no checksum on field
4409 			FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION. */
4410 			if (local_err == DB_DECRYPTION_FAILED) {
4411 				return (NULL);
4412 			}
4413 
4414 			if (local_err == DB_PAGE_CORRUPTED
4415 			    && srv_force_recovery) {
4416 				return NULL;
4417 			}
4418 
4419 			/* Try to set table as corrupted instead of
4420 			asserting. */
4421 			if (page_id.space() == TRX_SYS_SPACE) {
4422 			} else if (page_id.space() == SRV_TMP_SPACE_ID) {
4423 			} else if (fil_space_t* space
4424 				   = fil_space_acquire_for_io(
4425 					   page_id.space())) {
4426 				bool set = dict_set_corrupted_by_space(space);
4427 				space->release_for_io();
4428 				if (set) {
4429 					return NULL;
4430 				}
4431 			}
4432 
4433 			if (local_err == DB_IO_ERROR) {
4434 				return NULL;
4435 			}
4436 
4437 			ib::fatal() << "Unable to read page " << page_id
4438 				<< " into the buffer pool after "
4439 				<< BUF_PAGE_READ_MAX_RETRIES
4440 				<< ". The most probable cause"
4441 				" of this error may be that the"
4442 				" table has been corrupted."
4443 				" See https://mariadb.com/kb/en/library/innodb-recovery-modes/";
4444 		}
4445 
4446 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
4447 		ut_a(++buf_dbg_counter % 5771 || buf_validate());
4448 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
4449 		goto loop;
4450 	} else {
4451 		fix_block = block;
4452 	}
4453 
4454 	if (fsp_is_system_temporary(page_id.space())) {
4455 		/* For temporary tablespace, the mutex is being used
4456 		for synchronization between user thread and flush
4457 		thread, instead of block->lock. See buf_flush_page()
4458 		for the flush thread counterpart. */
4459 		BPageMutex*	fix_mutex = buf_page_get_mutex(
4460 			&fix_block->page);
4461 		mutex_enter(fix_mutex);
4462 		buf_block_fix(fix_block);
4463 		mutex_exit(fix_mutex);
4464 	} else {
4465 		buf_block_fix(fix_block);
4466 	}
4467 
4468 	/* Now safe to release page_hash mutex */
4469 	rw_lock_s_unlock(hash_lock);
4470 
4471 got_block:
4472 
4473 	switch (mode) {
4474 	case BUF_GET_IF_IN_POOL:
4475 	case BUF_PEEK_IF_IN_POOL:
4476 	case BUF_EVICT_IF_IN_POOL:
4477 		buf_page_t*	fix_page = &fix_block->page;
4478 		BPageMutex*	fix_mutex = buf_page_get_mutex(fix_page);
4479 		mutex_enter(fix_mutex);
4480 		const bool	must_read
4481 			= (buf_page_get_io_fix(fix_page) == BUF_IO_READ);
4482 		mutex_exit(fix_mutex);
4483 
4484 		if (must_read) {
4485 			/* The page is being read to buffer pool,
4486 			but we cannot wait around for the read to
4487 			complete. */
4488 			buf_block_unfix(fix_block);
4489 
4490 			return(NULL);
4491 		}
4492 	}
4493 
4494 	switch (buf_block_get_state(fix_block)) {
4495 		buf_page_t*	bpage;
4496 
4497 	case BUF_BLOCK_FILE_PAGE:
4498 		bpage = &block->page;
4499 		if (fsp_is_system_temporary(page_id.space())
4500 		    && buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
4501 			/* This suggests that the page is being flushed.
4502 			Avoid returning reference to this page.
4503 			Instead wait for the flush action to complete. */
4504 			buf_block_unfix(fix_block);
4505 			os_thread_sleep(WAIT_FOR_WRITE);
4506 			goto loop;
4507 		}
4508 
4509 		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
4510 evict_from_pool:
4511 			ut_ad(!fix_block->page.oldest_modification);
4512 			buf_pool_mutex_enter(buf_pool);
4513 			buf_block_unfix(fix_block);
4514 
4515 			if (!buf_LRU_free_page(&fix_block->page, true)) {
4516 				ut_ad(0);
4517 			}
4518 
4519 			buf_pool_mutex_exit(buf_pool);
4520 			return(NULL);
4521 		}
4522 
4523 		break;
4524 
4525 	case BUF_BLOCK_ZIP_PAGE:
4526 	case BUF_BLOCK_ZIP_DIRTY:
4527 		if (mode == BUF_PEEK_IF_IN_POOL) {
4528 			/* This mode is only used for dropping an
4529 			adaptive hash index.  There cannot be an
4530 			adaptive hash index for a compressed-only
4531 			page, so do not bother decompressing the page. */
4532 			buf_block_unfix(fix_block);
4533 
4534 			return(NULL);
4535 		}
4536 
4537 		bpage = &block->page;
4538 
4539 		/* Note: We have already buffer fixed this block. */
4540 		if (bpage->buf_fix_count > 1
4541 		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
4542 
4543 			/* This condition often occurs when the buffer
4544 			is not buffer-fixed, but I/O-fixed by
4545 			buf_page_init_for_read(). */
4546 			buf_block_unfix(fix_block);
4547 
4548 			/* The block is buffer-fixed or I/O-fixed.
4549 			Try again later. */
4550 			os_thread_sleep(WAIT_FOR_READ);
4551 
4552 			goto loop;
4553 		}
4554 
4555 		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
4556 			goto evict_from_pool;
4557 		}
4558 
4559 		/* Buffer-fix the block so that it cannot be evicted
4560 		or relocated while we are attempting to allocate an
4561 		uncompressed page. */
4562 
4563 		block = buf_LRU_get_free_block(buf_pool);
4564 
4565 		buf_pool_mutex_enter(buf_pool);
4566 
4567 		/* If not own buf_pool_mutex, page_hash can be changed. */
4568 		hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
4569 
4570 		rw_lock_x_lock(hash_lock);
4571 
4572 		/* Buffer-fixing prevents the page_hash from changing. */
4573 		ut_ad(bpage == buf_page_hash_get_low(buf_pool, page_id));
4574 
4575 		buf_block_unfix(fix_block);
4576 
4577 		buf_page_mutex_enter(block);
4578 		mutex_enter(&buf_pool->zip_mutex);
4579 
4580 		fix_block = block;
4581 
4582 		if (bpage->buf_fix_count > 0
4583 		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
4584 
4585 			mutex_exit(&buf_pool->zip_mutex);
4586 			/* The block was buffer-fixed or I/O-fixed while
4587 			buf_pool->mutex was not held by this thread.
4588 			Free the block that was allocated and retry.
4589 			This should be extremely unlikely, for example,
4590 			if buf_page_get_zip() was invoked. */
4591 
4592 			buf_LRU_block_free_non_file_page(block);
4593 			buf_pool_mutex_exit(buf_pool);
4594 			rw_lock_x_unlock(hash_lock);
4595 			buf_page_mutex_exit(block);
4596 
4597 			/* Try again */
4598 			goto loop;
4599 		}
4600 
4601 		/* Move the compressed page from bpage to block,
4602 		and uncompress it. */
4603 
4604 		/* Note: this is the uncompressed block and it is not
4605 		accessible by other threads yet because it is not in
4606 		any list or hash table */
4607 		buf_relocate(bpage, &block->page);
4608 
4609 		buf_block_init_low(block);
4610 
4611 		/* Set after buf_relocate(). */
4612 		block->page.buf_fix_count = 1;
4613 
4614 		block->lock_hash_val = lock_rec_hash(page_id.space(),
4615 						     page_id.page_no());
4616 
4617 		if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) {
4618 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
4619 			UT_LIST_REMOVE(buf_pool->zip_clean, &block->page);
4620 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
4621 			ut_ad(!block->page.in_flush_list);
4622 		} else {
4623 			/* Relocate buf_pool->flush_list. */
4624 			buf_flush_relocate_on_flush_list(bpage, &block->page);
4625 		}
4626 
4627 		/* Buffer-fix, I/O-fix, and X-latch the block
4628 		for the duration of the decompression.
4629 		Also add the block to the unzip_LRU list. */
4630 		block->page.state = BUF_BLOCK_FILE_PAGE;
4631 
4632 		/* Insert at the front of unzip_LRU list */
4633 		buf_unzip_LRU_add_block(block, FALSE);
4634 
4635 		buf_block_set_io_fix(block, BUF_IO_READ);
4636 		rw_lock_x_lock_inline(&block->lock, 0, file, line);
4637 
4638 		MEM_UNDEFINED(bpage, sizeof *bpage);
4639 
4640 		rw_lock_x_unlock(hash_lock);
4641 		buf_pool->n_pend_unzip++;
4642 		mutex_exit(&buf_pool->zip_mutex);
4643 		buf_pool_mutex_exit(buf_pool);
4644 
4645 		access_time = buf_page_is_accessed(&block->page);
4646 
4647 		buf_page_mutex_exit(block);
4648 
4649 		buf_page_free_descriptor(bpage);
4650 
4651 		/* Decompress the page while not holding
4652 		buf_pool->mutex or block->mutex. */
4653 
4654 		{
4655 			bool	success = buf_zip_decompress(block, false);
4656 
4657 			if (!success) {
4658 				buf_pool_mutex_enter(buf_pool);
4659 				buf_page_mutex_enter(fix_block);
4660 				buf_block_set_io_fix(fix_block, BUF_IO_NONE);
4661 				buf_page_mutex_exit(fix_block);
4662 
4663 				--buf_pool->n_pend_unzip;
4664 				buf_block_unfix(fix_block);
4665 				buf_pool_mutex_exit(buf_pool);
4666 				rw_lock_x_unlock(&fix_block->lock);
4667 
4668 				if (err) {
4669 					*err = DB_PAGE_CORRUPTED;
4670 				}
4671 				return NULL;
4672 			}
4673 		}
4674 
4675 		if (!access_time && !recv_no_ibuf_operations) {
4676 			ibuf_merge_or_delete_for_page(
4677 				block, page_id, page_size);
4678 		}
4679 
4680 		buf_pool_mutex_enter(buf_pool);
4681 
4682 		buf_page_mutex_enter(fix_block);
4683 
4684 		buf_block_set_io_fix(fix_block, BUF_IO_NONE);
4685 
4686 		buf_page_mutex_exit(fix_block);
4687 
4688 		--buf_pool->n_pend_unzip;
4689 
4690 		buf_pool_mutex_exit(buf_pool);
4691 
4692 		rw_lock_x_unlock(&block->lock);
4693 
4694 		break;
4695 
4696 	case BUF_BLOCK_POOL_WATCH:
4697 	case BUF_BLOCK_NOT_USED:
4698 	case BUF_BLOCK_READY_FOR_USE:
4699 	case BUF_BLOCK_MEMORY:
4700 	case BUF_BLOCK_REMOVE_HASH:
4701 		ut_error;
4702 		break;
4703 	}
4704 
4705 	ut_ad(block == fix_block);
4706 	ut_ad(fix_block->page.buf_fix_count > 0);
4707 
4708 	ut_ad(!rw_lock_own_flagged(hash_lock,
4709 				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
4710 
4711 	ut_ad(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
4712 
4713 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
4714 
4715 	if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
4716 	    && (ibuf_debug || buf_debug_execute_is_force_flush())) {
4717 
4718 		/* Try to evict the block from the buffer pool, to use the
4719 		insert buffer (change buffer) as much as possible. */
4720 
4721 		buf_pool_mutex_enter(buf_pool);
4722 
4723 		buf_block_unfix(fix_block);
4724 
4725 		/* Now we are only holding the buf_pool->mutex,
4726 		not block->mutex or hash_lock. Blocks cannot be
4727 		relocated or enter or exit the buf_pool while we
4728 		are holding the buf_pool->mutex. */
4729 
4730 		if (buf_LRU_free_page(&fix_block->page, true)) {
4731 
4732 			buf_pool_mutex_exit(buf_pool);
4733 
4734 			/* If not own buf_pool_mutex,
4735 			page_hash can be changed. */
4736 			hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
4737 
4738 			rw_lock_x_lock(hash_lock);
4739 
4740 			/* If not own buf_pool_mutex,
4741 			page_hash can be changed. */
4742 			hash_lock = buf_page_hash_lock_x_confirm(
4743 				hash_lock, buf_pool, page_id);
4744 
4745 			if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
4746 				/* Set the watch, as it would have
4747 				been set if the page were not in the
4748 				buffer pool in the first place. */
4749 				block = (buf_block_t*) buf_pool_watch_set(
4750 					page_id, &hash_lock);
4751 			} else {
4752 				block = (buf_block_t*) buf_page_hash_get_low(
4753 					buf_pool, page_id);
4754 			}
4755 
4756 			rw_lock_x_unlock(hash_lock);
4757 
4758 			if (block != NULL) {
4759 				/* Either the page has been read in or
4760 				a watch was set on that in the window
4761 				where we released the buf_pool::mutex
4762 				and before we acquire the hash_lock
4763 				above. Try again. */
4764 				guess = block;
4765 
4766 				goto loop;
4767 			}
4768 
4769 			return(NULL);
4770 		}
4771 
4772 		buf_page_mutex_enter(fix_block);
4773 
4774 		if (buf_flush_page_try(buf_pool, fix_block)) {
4775 			guess = fix_block;
4776 
4777 			goto loop;
4778 		}
4779 
4780 		buf_page_mutex_exit(fix_block);
4781 
4782 		buf_block_fix(fix_block);
4783 
4784 		/* Failed to evict the page; change it directly */
4785 
4786 		buf_pool_mutex_exit(buf_pool);
4787 	}
4788 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
4789 
4790 	ut_ad(fix_block->page.buf_fix_count > 0);
4791 
4792 #ifdef UNIV_DEBUG
4793 	/* We have already buffer fixed the page, and we are committed to
4794 	returning this page to the caller. Register for debugging.
4795 	Avoid debug latching if page/block belongs to system temporary
4796 	tablespace (Not much needed for table with single threaded access.). */
4797 	if (!fsp_is_system_temporary(page_id.space())) {
4798 		ibool   ret;
4799 		ret = rw_lock_s_lock_nowait(
4800 			&fix_block->debug_latch, file, line);
4801 		ut_a(ret);
4802 	}
4803 #endif /* UNIV_DEBUG */
4804 
4805 	/* While tablespace is reinited the indexes are already freed but the
4806 	blocks related to it still resides in buffer pool. Trying to remove
4807 	such blocks from buffer pool would invoke removal of AHI entries
4808 	associated with these blocks. Logic to remove AHI entry will try to
4809 	load the block but block is already in free state. Handle the said case
4810 	with mode = BUF_PEEK_IF_IN_POOL that is invoked from
4811 	"btr_search_drop_page_hash_when_freed". */
4812 	ut_ad(mode == BUF_GET_POSSIBLY_FREED
4813 	      || mode == BUF_PEEK_IF_IN_POOL
4814 	      || !fix_block->page.file_page_was_freed);
4815 
4816 	/* Check if this is the first access to the page */
4817 	access_time = buf_page_is_accessed(&fix_block->page);
4818 
4819 	/* This is a heuristic and we don't care about ordering issues. */
4820 	if (access_time == 0) {
4821 		buf_page_mutex_enter(fix_block);
4822 
4823 		buf_page_set_accessed(&fix_block->page);
4824 
4825 		buf_page_mutex_exit(fix_block);
4826 	}
4827 
4828 	if (mode != BUF_PEEK_IF_IN_POOL) {
4829 		buf_page_make_young_if_needed(&fix_block->page);
4830 	}
4831 
4832 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
4833 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
4834 	ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
4835 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
4836 
4837 	/* We have to wait here because the IO_READ state was set
4838 	under the protection of the hash_lock and not the block->mutex
4839 	and block->lock. */
4840 	buf_wait_for_read(fix_block);
4841 
4842 	if (fix_block->page.id != page_id) {
4843 
4844 		buf_block_unfix(fix_block);
4845 
4846 #ifdef UNIV_DEBUG
4847 		if (!fsp_is_system_temporary(page_id.space())) {
4848 			rw_lock_s_unlock(&fix_block->debug_latch);
4849 		}
4850 #endif /* UNIV_DEBUG */
4851 
4852 		if (err) {
4853 			*err = DB_PAGE_CORRUPTED;
4854 		}
4855 
4856 		return NULL;
4857 	}
4858 
4859 	fix_block = buf_page_mtr_lock(fix_block, rw_latch, mtr, file, line);
4860 
4861 	if (mode != BUF_PEEK_IF_IN_POOL && !access_time) {
4862 		/* In the case of a first access, try to apply linear
4863 		read-ahead */
4864 
4865 		buf_read_ahead_linear(page_id, page_size, ibuf_inside(mtr));
4866 	}
4867 
4868 	ut_ad(!rw_lock_own_flagged(hash_lock,
4869 				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
4870 
4871 	return(fix_block);
4872 }
4873 
4874 /** This is the general function used to get access to a database page.
4875 It does page initialization and applies the buffered redo logs.
4876 @param[in]	page_id		page id
4877 @param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
4878 @param[in]	guess		guessed block or NULL
4879 @param[in]	mode		BUF_GET, BUF_GET_IF_IN_POOL,
4880 BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
4881 @param[in]	file		file name
4882 @param[in]	line		line where called
4883 @param[in]	mtr		mini-transaction
4884 @param[out]	err		DB_SUCCESS or error code
4885 @return pointer to the block or NULL */
4886 buf_block_t*
4887 buf_page_get_gen(
4888 	const page_id_t		page_id,
4889 	const page_size_t&	page_size,
4890 	ulint			rw_latch,
4891 	buf_block_t*		guess,
4892 	ulint			mode,
4893 	const char*		file,
4894 	unsigned		line,
4895 	mtr_t*			mtr,
4896 	dberr_t*		err)
4897 {
4898   if (buf_block_t *block = recv_recovery_create_page(page_id))
4899   {
4900     buf_block_fix(block);
4901     ut_ad(rw_lock_s_lock_nowait(&block->debug_latch, file, line));
4902     block= buf_page_mtr_lock(block, rw_latch, mtr, file, line);
4903     return block;
4904   }
4905 
4906   return buf_page_get_low(page_id, page_size, rw_latch,
4907                           guess, mode, file, line, mtr, err);
4908 }
4909 
4910 /********************************************************************//**
4911 This is the general function used to get optimistic access to a database
4912 page.
4913 @return TRUE if success */
4914 ibool
4915 buf_page_optimistic_get(
4916 /*====================*/
4917 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
4918 	buf_block_t*	block,	/*!< in: guessed buffer block */
4919 	ib_uint64_t	modify_clock,/*!< in: modify clock value */
4920 	const char*	file,	/*!< in: file name */
4921 	unsigned	line,	/*!< in: line where called */
4922 	mtr_t*		mtr)	/*!< in: mini-transaction */
4923 {
4924 	buf_pool_t*	buf_pool;
4925 	unsigned	access_time;
4926 	ibool		success;
4927 
4928 	ut_ad(block);
4929 	ut_ad(mtr);
4930 	ut_ad(mtr->is_active());
4931 	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
4932 
4933 	buf_page_mutex_enter(block);
4934 
4935 	if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
4936 
4937 		buf_page_mutex_exit(block);
4938 
4939 		return(FALSE);
4940 	}
4941 
4942 	buf_block_buf_fix_inc(block, file, line);
4943 
4944 	access_time = buf_page_is_accessed(&block->page);
4945 
4946 	buf_page_set_accessed(&block->page);
4947 
4948 	buf_page_mutex_exit(block);
4949 
4950 	buf_page_make_young_if_needed(&block->page);
4951 
4952 	ut_ad(!ibuf_inside(mtr)
4953 	      || ibuf_page(block->page.id, block->page.size, NULL));
4954 
4955 	mtr_memo_type_t	fix_type;
4956 
4957 	switch (rw_latch) {
4958 	case RW_S_LATCH:
4959 		success = rw_lock_s_lock_nowait(&block->lock, file, line);
4960 
4961 		fix_type = MTR_MEMO_PAGE_S_FIX;
4962 		break;
4963 	case RW_X_LATCH:
4964 		success = rw_lock_x_lock_func_nowait_inline(
4965 			&block->lock, file, line);
4966 
4967 		fix_type = MTR_MEMO_PAGE_X_FIX;
4968 		break;
4969 	default:
4970 		ut_error; /* RW_SX_LATCH is not implemented yet */
4971 	}
4972 
4973 	if (!success) {
4974 		buf_block_buf_fix_dec(block);
4975 		return(FALSE);
4976 	}
4977 
4978 	if (modify_clock != block->modify_clock) {
4979 
4980 		buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
4981 
4982 		if (rw_latch == RW_S_LATCH) {
4983 			rw_lock_s_unlock(&block->lock);
4984 		} else {
4985 			rw_lock_x_unlock(&block->lock);
4986 		}
4987 
4988 		buf_block_buf_fix_dec(block);
4989 		return(FALSE);
4990 	}
4991 
4992 	mtr_memo_push(mtr, block, fix_type);
4993 
4994 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
4995 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
4996 	ut_a(block->page.buf_fix_count > 0);
4997 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
4998 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
4999 
5000 	ut_d(buf_page_mutex_enter(block));
5001 	ut_ad(!block->page.file_page_was_freed);
5002 	ut_d(buf_page_mutex_exit(block));
5003 
5004 	if (!access_time) {
5005 		/* In the case of a first access, try to apply linear
5006 		read-ahead */
5007 		buf_read_ahead_linear(block->page.id, block->page.size,
5008 				      ibuf_inside(mtr));
5009 	}
5010 
5011 	buf_pool = buf_pool_from_block(block);
5012 	buf_pool->stat.n_page_gets++;
5013 
5014 	return(TRUE);
5015 }
5016 
5017 /********************************************************************//**
5018 This is used to get access to a known database page, when no waiting can be
5019 done. For example, if a search in an adaptive hash index leads us to this
5020 frame.
5021 @return TRUE if success */
5022 ibool
5023 buf_page_get_known_nowait(
5024 /*======================*/
5025 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
5026 	buf_block_t*	block,	/*!< in: the known page */
5027 	ulint		mode,	/*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
5028 	const char*	file,	/*!< in: file name */
5029 	unsigned	line,	/*!< in: line where called */
5030 	mtr_t*		mtr)	/*!< in: mini-transaction */
5031 {
5032 	buf_pool_t*	buf_pool;
5033 	ibool		success;
5034 
5035 	ut_ad(mtr->is_active());
5036 	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
5037 
5038 	buf_page_mutex_enter(block);
5039 
5040 	if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
5041 		/* Another thread is just freeing the block from the LRU list
5042 		of the buffer pool: do not try to access this page; this
5043 		attempt to access the page can only come through the hash
5044 		index because when the buffer block state is ..._REMOVE_HASH,
5045 		we have already removed it from the page address hash table
5046 		of the buffer pool. */
5047 
5048 		buf_page_mutex_exit(block);
5049 
5050 		return(FALSE);
5051 	}
5052 
5053 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
5054 
5055 	buf_block_buf_fix_inc(block, file, line);
5056 
5057 	buf_page_set_accessed(&block->page);
5058 
5059 	buf_page_mutex_exit(block);
5060 
5061 	buf_pool = buf_pool_from_block(block);
5062 
5063 #ifdef BTR_CUR_HASH_ADAPT
5064 	if (mode == BUF_MAKE_YOUNG) {
5065 		buf_page_make_young_if_needed(&block->page);
5066 	}
5067 #endif /* BTR_CUR_HASH_ADAPT */
5068 
5069 	ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD);
5070 
5071 	mtr_memo_type_t	fix_type;
5072 
5073 	switch (rw_latch) {
5074 	case RW_S_LATCH:
5075 		success = rw_lock_s_lock_nowait(&block->lock, file, line);
5076 		fix_type = MTR_MEMO_PAGE_S_FIX;
5077 		break;
5078 	case RW_X_LATCH:
5079 		success = rw_lock_x_lock_func_nowait_inline(
5080 			&block->lock, file, line);
5081 
5082 		fix_type = MTR_MEMO_PAGE_X_FIX;
5083 		break;
5084 	default:
5085 		ut_error; /* RW_SX_LATCH is not implemented yet */
5086 	}
5087 
5088 	if (!success) {
5089 		buf_block_buf_fix_dec(block);
5090 		return(FALSE);
5091 	}
5092 
5093 	mtr_memo_push(mtr, block, fix_type);
5094 
5095 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
5096 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
5097 	ut_a(block->page.buf_fix_count > 0);
5098 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
5099 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
5100 
5101 #ifdef UNIV_DEBUG
5102 	if (mode != BUF_KEEP_OLD) {
5103 		/* If mode == BUF_KEEP_OLD, we are executing an I/O
5104 		completion routine.  Avoid a bogus assertion failure
5105 		when ibuf_merge_or_delete_for_page() is processing a
5106 		page that was just freed due to DROP INDEX, or
5107 		deleting a record from SYS_INDEXES. This check will be
5108 		skipped in recv_recover_page() as well. */
5109 
5110 # ifdef BTR_CUR_HASH_ADAPT
5111 		ut_ad(!block->page.file_page_was_freed
5112 		      || (block->index && block->index->freed()));
5113 # else /* BTR_CUR_HASH_ADAPT */
5114 		ut_ad(!block->page.file_page_was_freed);
5115 # endif /* BTR_CUR_HASH_ADAPT */
5116 	}
5117 #endif /* UNIV_DEBUG */
5118 
5119 	buf_pool->stat.n_page_gets++;
5120 
5121 	return(TRUE);
5122 }
5123 
5124 /** Given a tablespace id and page number tries to get that page. If the
5125 page is not in the buffer pool it is not loaded and NULL is returned.
5126 Suitable for using when holding the lock_sys_t::mutex.
5127 @param[in]	page_id	page id
5128 @param[in]	file	file name
5129 @param[in]	line	line where called
5130 @param[in]	mtr	mini-transaction
5131 @return pointer to a page or NULL */
5132 buf_block_t*
5133 buf_page_try_get_func(
5134 	const page_id_t		page_id,
5135 	const char*		file,
5136 	unsigned		line,
5137 	mtr_t*			mtr)
5138 {
5139 	buf_block_t*	block;
5140 	ibool		success;
5141 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
5142 	rw_lock_t*	hash_lock;
5143 
5144 	ut_ad(mtr);
5145 	ut_ad(mtr->is_active());
5146 
5147 	block = buf_block_hash_get_s_locked(buf_pool, page_id, &hash_lock);
5148 
5149 	if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
5150 		if (block) {
5151 			rw_lock_s_unlock(hash_lock);
5152 		}
5153 		return(NULL);
5154 	}
5155 
5156 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
5157 
5158 	buf_page_mutex_enter(block);
5159 	rw_lock_s_unlock(hash_lock);
5160 
5161 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
5162 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
5163 	ut_a(page_id == block->page.id);
5164 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
5165 
5166 	buf_block_buf_fix_inc(block, file, line);
5167 	buf_page_mutex_exit(block);
5168 
5169 	mtr_memo_type_t	fix_type = MTR_MEMO_PAGE_S_FIX;
5170 	success = rw_lock_s_lock_nowait(&block->lock, file, line);
5171 
5172 	if (!success) {
5173 		/* Let us try to get an X-latch. If the current thread
5174 		is holding an X-latch on the page, we cannot get an
5175 		S-latch. */
5176 
5177 		fix_type = MTR_MEMO_PAGE_X_FIX;
5178 		success = rw_lock_x_lock_func_nowait_inline(&block->lock,
5179 							    file, line);
5180 	}
5181 
5182 	if (!success) {
5183 		buf_block_buf_fix_dec(block);
5184 		return(NULL);
5185 	}
5186 
5187 	mtr_memo_push(mtr, block, fix_type);
5188 
5189 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
5190 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
5191 	ut_a(block->page.buf_fix_count > 0);
5192 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
5193 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
5194 
5195 	ut_d(buf_page_mutex_enter(block));
5196 	ut_d(ut_a(!block->page.file_page_was_freed));
5197 	ut_d(buf_page_mutex_exit(block));
5198 
5199 	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
5200 
5201 	buf_pool->stat.n_page_gets++;
5202 
5203 	return(block);
5204 }
5205 
5206 /********************************************************************//**
5207 Initialize some fields of a control block. */
5208 UNIV_INLINE
5209 void
5210 buf_page_init_low(
5211 /*==============*/
5212 	buf_page_t*	bpage)	/*!< in: block to init */
5213 {
5214 	bpage->flush_type = BUF_FLUSH_LRU;
5215 	bpage->io_fix = BUF_IO_NONE;
5216 	bpage->buf_fix_count = 0;
5217 	bpage->old = 0;
5218 	bpage->freed_page_clock = 0;
5219 	bpage->access_time = 0;
5220 	bpage->newest_modification = 0;
5221 	bpage->oldest_modification = 0;
5222 	bpage->real_size = 0;
5223 	bpage->slot = NULL;
5224 
5225 	HASH_INVALIDATE(bpage, hash);
5226 
5227 	ut_d(bpage->file_page_was_freed = FALSE);
5228 }
5229 
5230 /** Inits a page to the buffer buf_pool.
5231 @param[in,out]	buf_pool	buffer pool
5232 @param[in]	page_id		page id
5233 @param[in,out]	block		block to init */
5234 static
5235 void
5236 buf_page_init(
5237 	buf_pool_t*		buf_pool,
5238 	const page_id_t		page_id,
5239 	const page_size_t&	page_size,
5240 	buf_block_t*		block)
5241 {
5242 	buf_page_t*	hash_page;
5243 
5244 	ut_ad(buf_pool == buf_pool_get(page_id));
5245 	ut_ad(buf_pool_mutex_own(buf_pool));
5246 
5247 	ut_ad(buf_page_mutex_own(block));
5248 	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
5249 
5250 	ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, page_id),
5251 			  RW_LOCK_X));
5252 
5253 	/* Set the state of the block */
5254 	buf_block_set_file_page(block, page_id);
5255 
5256 	buf_block_init_low(block);
5257 
5258 	block->lock_hash_val = lock_rec_hash(page_id.space(),
5259 					     page_id.page_no());
5260 
5261 	buf_page_init_low(&block->page);
5262 
5263 	/* Insert into the hash table of file pages */
5264 
5265 	hash_page = buf_page_hash_get_low(buf_pool, page_id);
5266 
5267 	if (hash_page == NULL) {
5268 		/* Block not found in hash table */
5269 	} else if (UNIV_LIKELY(buf_pool_watch_is_sentinel(buf_pool,
5270 							  hash_page))) {
5271 		/* Preserve the reference count. */
5272 		ib_uint32_t	buf_fix_count = hash_page->buf_fix_count;
5273 
5274 		ut_a(buf_fix_count > 0);
5275 
5276 		my_atomic_add32((int32*) &block->page.buf_fix_count, buf_fix_count);
5277 
5278 		buf_pool_watch_remove(buf_pool, hash_page);
5279 	} else {
5280 		ib::fatal() << "Page already foudn in the hash table: "
5281 			    << page_id;
5282 	}
5283 
5284 	ut_ad(!block->page.in_zip_hash);
5285 	ut_ad(!block->page.in_page_hash);
5286 	ut_d(block->page.in_page_hash = TRUE);
5287 
5288 	block->page.id = page_id;
5289 	block->page.size.copy_from(page_size);
5290 
5291 	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
5292 		    page_id.fold(), &block->page);
5293 
5294 	if (page_size.is_compressed()) {
5295 		page_zip_set_size(&block->page.zip, page_size.physical());
5296 	}
5297 }
5298 
5299 /** Initialize a page for read to the buffer buf_pool. If the page is
5300 (1) already in buf_pool, or
5301 (2) if we specify to read only ibuf pages and the page is not an ibuf page, or
5302 (3) if the space is deleted or being deleted,
5303 then this function does nothing.
5304 Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
5305 on the buffer frame. The io-handler must take care that the flag is cleared
5306 and the lock released later.
5307 @param[out]	err			DB_SUCCESS or DB_TABLESPACE_DELETED
5308 @param[in]	mode			BUF_READ_IBUF_PAGES_ONLY, ...
5309 @param[in]	page_id			page id
5310 @param[in]	unzip			whether the uncompressed page is
5311 					requested (for ROW_FORMAT=COMPRESSED)
5312 @return pointer to the block
5313 @retval	NULL	in case of an error */
5314 buf_page_t*
5315 buf_page_init_for_read(
5316 	dberr_t*		err,
5317 	ulint			mode,
5318 	const page_id_t		page_id,
5319 	const page_size_t&	page_size,
5320 	bool			unzip)
5321 {
5322 	buf_block_t*	block;
5323 	buf_page_t*	bpage	= NULL;
5324 	buf_page_t*	watch_page;
5325 	rw_lock_t*	hash_lock;
5326 	mtr_t		mtr;
5327 	bool		lru	= false;
5328 	void*		data;
5329 	buf_pool_t*	buf_pool = buf_pool_get(page_id);
5330 
5331 	ut_ad(buf_pool);
5332 
5333 	*err = DB_SUCCESS;
5334 
5335 	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
5336 		/* It is a read-ahead within an ibuf routine */
5337 
5338 		ut_ad(!ibuf_bitmap_page(page_id, page_size));
5339 
5340 		ibuf_mtr_start(&mtr);
5341 
5342 		if (!recv_no_ibuf_operations &&
5343 		    !ibuf_page(page_id, page_size, &mtr)) {
5344 
5345 			ibuf_mtr_commit(&mtr);
5346 
5347 			return(NULL);
5348 		}
5349 	} else {
5350 		ut_ad(mode == BUF_READ_ANY_PAGE);
5351 	}
5352 
5353 	if (page_size.is_compressed() && !unzip && !recv_recovery_is_on()) {
5354 		block = NULL;
5355 	} else {
5356 		block = buf_LRU_get_free_block(buf_pool);
5357 		ut_ad(block);
5358 		ut_ad(buf_pool_from_block(block) == buf_pool);
5359 	}
5360 
5361 	buf_pool_mutex_enter(buf_pool);
5362 
5363 	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
5364 	rw_lock_x_lock(hash_lock);
5365 
5366 	watch_page = buf_page_hash_get_low(buf_pool, page_id);
5367 	if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
5368 		/* The page is already in the buffer pool. */
5369 		watch_page = NULL;
5370 		rw_lock_x_unlock(hash_lock);
5371 		if (block) {
5372 			buf_page_mutex_enter(block);
5373 			buf_LRU_block_free_non_file_page(block);
5374 			buf_page_mutex_exit(block);
5375 		}
5376 
5377 		bpage = NULL;
5378 		goto func_exit;
5379 	}
5380 
5381 	if (block) {
5382 		bpage = &block->page;
5383 
5384 		buf_page_mutex_enter(block);
5385 
5386 		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
5387 
5388 		buf_page_init(buf_pool, page_id, page_size, block);
5389 
5390 		/* Note: We are using the hash_lock for protection. This is
5391 		safe because no other thread can lookup the block from the
5392 		page hashtable yet. */
5393 
5394 		buf_page_set_io_fix(bpage, BUF_IO_READ);
5395 
5396 		rw_lock_x_unlock(hash_lock);
5397 
5398 		/* The block must be put to the LRU list, to the old blocks */
5399 		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
5400 
5401 		/* We set a pass-type x-lock on the frame because then
5402 		the same thread which called for the read operation
5403 		(and is running now at this point of code) can wait
5404 		for the read to complete by waiting for the x-lock on
5405 		the frame; if the x-lock were recursive, the same
5406 		thread would illegally get the x-lock before the page
5407 		read is completed.  The x-lock is cleared by the
5408 		io-handler thread. */
5409 
5410 		rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
5411 
5412 		if (page_size.is_compressed()) {
5413 			/* buf_pool->mutex may be released and
5414 			reacquired by buf_buddy_alloc().  Thus, we
5415 			must release block->mutex in order not to
5416 			break the latching order in the reacquisition
5417 			of buf_pool->mutex.  We also must defer this
5418 			operation until after the block descriptor has
5419 			been added to buf_pool->LRU and
5420 			buf_pool->page_hash. */
5421 			buf_page_mutex_exit(block);
5422 			data = buf_buddy_alloc(buf_pool, page_size.physical(),
5423 					       &lru);
5424 			buf_page_mutex_enter(block);
5425 			block->page.zip.data = (page_zip_t*) data;
5426 
5427 			/* To maintain the invariant
5428 			block->in_unzip_LRU_list
5429 			== buf_page_belongs_to_unzip_LRU(&block->page)
5430 			we have to add this block to unzip_LRU
5431 			after block->page.zip.data is set. */
5432 			ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
5433 			buf_unzip_LRU_add_block(block, TRUE);
5434 		}
5435 
5436 		buf_page_mutex_exit(block);
5437 	} else {
5438 		rw_lock_x_unlock(hash_lock);
5439 
5440 		/* The compressed page must be allocated before the
5441 		control block (bpage), in order to avoid the
5442 		invocation of buf_buddy_relocate_block() on
5443 		uninitialized data. */
5444 		data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru);
5445 
5446 		rw_lock_x_lock(hash_lock);
5447 
5448 		/* If buf_buddy_alloc() allocated storage from the LRU list,
5449 		it released and reacquired buf_pool->mutex.  Thus, we must
5450 		check the page_hash again, as it may have been modified. */
5451 		if (UNIV_UNLIKELY(lru)) {
5452 
5453 			watch_page = buf_page_hash_get_low(buf_pool, page_id);
5454 
5455 			if (UNIV_UNLIKELY(watch_page
5456 			    && !buf_pool_watch_is_sentinel(buf_pool,
5457 							   watch_page))) {
5458 
5459 				/* The block was added by some other thread. */
5460 				rw_lock_x_unlock(hash_lock);
5461 				watch_page = NULL;
5462 				buf_buddy_free(buf_pool, data,
5463 					       page_size.physical());
5464 
5465 				bpage = NULL;
5466 				goto func_exit;
5467 			}
5468 		}
5469 
5470 		bpage = buf_page_alloc_descriptor();
5471 
5472 		/* Initialize the buf_pool pointer. */
5473 		bpage->buf_pool_index = buf_pool_index(buf_pool);
5474 
5475 		page_zip_des_init(&bpage->zip);
5476 		page_zip_set_size(&bpage->zip, page_size.physical());
5477 		bpage->zip.data = (page_zip_t*) data;
5478 
5479 		bpage->size.copy_from(page_size);
5480 
5481 		mutex_enter(&buf_pool->zip_mutex);
5482 
5483 		buf_page_init_low(bpage);
5484 
5485 		bpage->state = BUF_BLOCK_ZIP_PAGE;
5486 		bpage->id = page_id;
5487 		bpage->flush_observer = NULL;
5488 
5489 		ut_d(bpage->in_page_hash = FALSE);
5490 		ut_d(bpage->in_zip_hash = FALSE);
5491 		ut_d(bpage->in_flush_list = FALSE);
5492 		ut_d(bpage->in_free_list = FALSE);
5493 		ut_d(bpage->in_LRU_list = FALSE);
5494 
5495 		ut_d(bpage->in_page_hash = TRUE);
5496 
5497 		if (watch_page != NULL) {
5498 
5499 			/* Preserve the reference count. */
5500 			ib_uint32_t	buf_fix_count;
5501 
5502 			buf_fix_count = watch_page->buf_fix_count;
5503 
5504 			ut_a(buf_fix_count > 0);
5505 
5506 			my_atomic_add32((int32*) &bpage->buf_fix_count, buf_fix_count);
5507 
5508 			ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
5509 			buf_pool_watch_remove(buf_pool, watch_page);
5510 		}
5511 
5512 		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
5513 			    bpage->id.fold(), bpage);
5514 
5515 		rw_lock_x_unlock(hash_lock);
5516 
5517 		/* The block must be put to the LRU list, to the old blocks.
5518 		The zip size is already set into the page zip */
5519 		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
5520 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
5521 		buf_LRU_insert_zip_clean(bpage);
5522 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
5523 
5524 		buf_page_set_io_fix(bpage, BUF_IO_READ);
5525 
5526 		mutex_exit(&buf_pool->zip_mutex);
5527 	}
5528 
5529 	buf_pool->n_pend_reads++;
5530 func_exit:
5531 	buf_pool_mutex_exit(buf_pool);
5532 
5533 	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
5534 
5535 		ibuf_mtr_commit(&mtr);
5536 	}
5537 
5538 	ut_ad(!rw_lock_own_flagged(hash_lock,
5539 				   RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
5540 	ut_ad(!bpage || buf_page_in_file(bpage));
5541 
5542 	return(bpage);
5543 }
5544 
5545 /** Initializes a page to the buffer buf_pool. The page is usually not read
5546 from a file even if it cannot be found in the buffer buf_pool. This is one
5547 of the functions which perform to a block a state transition NOT_USED =>
5548 FILE_PAGE (the other is buf_page_get_gen).
5549 @param[in]	page_id		page id
5550 @param[in]	page_size	page size
5551 @param[in]	mtr		mini-transaction
5552 @return pointer to the block, page bufferfixed */
5553 buf_block_t*
5554 buf_page_create(
5555 	const page_id_t		page_id,
5556 	const page_size_t&	page_size,
5557 	mtr_t*			mtr)
5558 {
5559 	buf_frame_t*	frame;
5560 	buf_block_t*	block;
5561 	buf_block_t*	free_block	= NULL;
5562 	buf_pool_t*	buf_pool= buf_pool_get(page_id);
5563 	rw_lock_t*	hash_lock;
5564 
5565 	ut_ad(mtr->is_active());
5566 	ut_ad(page_id.space() != 0 || !page_size.is_compressed());
5567 loop:
5568 	free_block = buf_LRU_get_free_block(buf_pool);
5569 	buf_pool_mutex_enter(buf_pool);
5570 
5571 	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
5572 	rw_lock_x_lock(hash_lock);
5573 
5574 	block = (buf_block_t*) buf_page_hash_get_low(buf_pool, page_id);
5575 
5576 	if (block
5577 	    && buf_page_in_file(&block->page)
5578 	    && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
5579 		ut_d(block->page.file_page_was_freed = FALSE);
5580 		buf_page_state page_state = buf_block_get_state(block);
5581 		bool have_x_latch = false;
5582 #ifdef BTR_CUR_HASH_ADAPT
5583 		const dict_index_t *drop_hash_entry= NULL;
5584 #endif
5585 		switch (page_state) {
5586 		default:
5587 			ut_ad(0);
5588 			break;
5589 		case BUF_BLOCK_ZIP_PAGE:
5590 		case BUF_BLOCK_ZIP_DIRTY:
5591 			buf_block_init_low(free_block);
5592 			mutex_enter(&buf_pool->zip_mutex);
5593 
5594 			buf_page_mutex_enter(free_block);
5595 			if (buf_page_get_io_fix(&block->page) != BUF_IO_NONE) {
5596 				mutex_exit(&buf_pool->zip_mutex);
5597 				rw_lock_x_unlock(hash_lock);
5598 				buf_LRU_block_free_non_file_page(free_block);
5599 				buf_pool_mutex_exit(buf_pool);
5600 				buf_page_mutex_exit(free_block);
5601 
5602 				goto loop;
5603 			}
5604 
5605 			rw_lock_x_lock(&free_block->lock);
5606 
5607 			buf_relocate(&block->page, &free_block->page);
5608 			if (page_state == BUF_BLOCK_ZIP_DIRTY) {
5609 				ut_ad(block->page.in_flush_list);
5610 				ut_ad(block->page.oldest_modification > 0);
5611 				buf_flush_relocate_on_flush_list(
5612 					&block->page, &free_block->page);
5613 			} else {
5614 				ut_ad(block->page.oldest_modification == 0);
5615 				ut_ad(!block->page.in_flush_list);
5616 #ifdef UNIV_DEBUG
5617 				UT_LIST_REMOVE(
5618 					buf_pool->zip_clean, &block->page);
5619 #endif
5620 			}
5621 
5622 			free_block->page.state = BUF_BLOCK_FILE_PAGE;
5623 			mutex_exit(&buf_pool->zip_mutex);
5624 			free_block->lock_hash_val = lock_rec_hash(
5625 					page_id.space(), page_id.page_no());
5626 			buf_unzip_LRU_add_block(free_block, false);
5627 			buf_page_free_descriptor(&block->page);
5628 			block = free_block;
5629 			buf_block_fix(block);
5630 			buf_page_mutex_exit(free_block);
5631 			free_block = NULL;
5632 			break;
5633 		case BUF_BLOCK_FILE_PAGE:
5634 			have_x_latch = mtr->have_x_latch(*block);
5635 			if (!have_x_latch) {
5636 				buf_block_fix(block);
5637 				buf_page_mutex_enter(block);
5638 				while (buf_block_get_io_fix(block)
5639 				       != BUF_IO_NONE
5640 				       || block->page.buf_fix_count != 1) {
5641 					buf_page_mutex_exit(block);
5642 					buf_pool_mutex_exit(buf_pool);
5643 					rw_lock_x_unlock(hash_lock);
5644 
5645 					os_thread_sleep(1000);
5646 
5647 					buf_pool_mutex_enter(buf_pool);
5648 					rw_lock_x_lock(hash_lock);
5649 					buf_page_mutex_enter(block);
5650 				}
5651 				rw_lock_x_lock(&block->lock);
5652 				buf_page_mutex_exit(block);
5653 			}
5654 #ifdef BTR_CUR_HASH_ADAPT
5655 			drop_hash_entry = block->index;
5656 #endif
5657 			break;
5658 		}
5659 		/* Page can be found in buf_pool */
5660 		buf_pool_mutex_exit(buf_pool);
5661 		rw_lock_x_unlock(hash_lock);
5662 
5663 		if (free_block) {
5664 			buf_block_free(free_block);
5665 		}
5666 #ifdef BTR_CUR_HASH_ADAPT
5667 		if (drop_hash_entry) {
5668 			btr_search_drop_page_hash_index(block);
5669 		}
5670 #endif /* BTR_CUR_HASH_ADAPT */
5671 
5672 		if (!have_x_latch) {
5673 #ifdef UNIV_DEBUG
5674 			if (!fsp_is_system_temporary(page_id.space())) {
5675 				rw_lock_s_lock_nowait(
5676 					&block->debug_latch,
5677 					__FILE__, __LINE__);
5678 			}
5679 #endif /* UNIV_DEBUG */
5680 
5681 			mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
5682 		}
5683 		return block;
5684 	}
5685 
5686 	/* If we get here, the page was not in buf_pool: init it there */
5687 
5688 	DBUG_PRINT("ib_buf", ("create page %u:%u",
5689 			      page_id.space(), page_id.page_no()));
5690 
5691 	block = free_block;
5692 
5693 	buf_page_mutex_enter(block);
5694 
5695 	buf_page_init(buf_pool, page_id, page_size, block);
5696 
5697 	rw_lock_x_lock(&block->lock);
5698 
5699 	rw_lock_x_unlock(hash_lock);
5700 
5701 	/* The block must be put to the LRU list */
5702 	buf_LRU_add_block(&block->page, FALSE);
5703 
5704 	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
5705 	buf_pool->stat.n_pages_created++;
5706 
5707 	if (page_size.is_compressed()) {
5708 		void*	data;
5709 		bool	lru;
5710 
5711 		/* Prevent race conditions during buf_buddy_alloc(),
5712 		which may release and reacquire buf_pool->mutex,
5713 		by IO-fixing and X-latching the block. */
5714 
5715 		buf_page_set_io_fix(&block->page, BUF_IO_READ);
5716 
5717 		buf_page_mutex_exit(block);
5718 		/* buf_pool->mutex may be released and reacquired by
5719 		buf_buddy_alloc().  Thus, we must release block->mutex
5720 		in order not to break the latching order in
5721 		the reacquisition of buf_pool->mutex.  We also must
5722 		defer this operation until after the block descriptor
5723 		has been added to buf_pool->LRU and buf_pool->page_hash. */
5724 		data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru);
5725 		buf_page_mutex_enter(block);
5726 		block->page.zip.data = (page_zip_t*) data;
5727 
5728 		/* To maintain the invariant
5729 		block->in_unzip_LRU_list
5730 		== buf_page_belongs_to_unzip_LRU(&block->page)
5731 		we have to add this block to unzip_LRU after
5732 		block->page.zip.data is set. */
5733 		ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
5734 		buf_unzip_LRU_add_block(block, FALSE);
5735 
5736 		buf_page_set_io_fix(&block->page, BUF_IO_NONE);
5737 	}
5738 
5739 	buf_pool_mutex_exit(buf_pool);
5740 
5741 	mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
5742 
5743 	buf_page_set_accessed(&block->page);
5744 
5745 	buf_page_mutex_exit(block);
5746 
5747 	/* Delete possible entries for the page from the insert buffer:
5748 	such can exist if the page belonged to an index which was dropped */
5749 	if (!recv_recovery_is_on()) {
5750 		ibuf_merge_or_delete_for_page(NULL, page_id, page_size);
5751 	}
5752 
5753 	frame = block->frame;
5754 
5755 	memset(frame + FIL_PAGE_PREV, 0xff, 4);
5756 	memset(frame + FIL_PAGE_NEXT, 0xff, 4);
5757 	mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
5758 
5759 	/* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
5760 	following pages:
5761 	(1) The first page of the InnoDB system tablespace (page 0:0)
5762 	(2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
5763 	(3) key_version on encrypted pages (not page 0:0) */
5764 
5765 	memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
5766 	memset(frame + FIL_PAGE_LSN, 0, 8);
5767 
5768 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
5769 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
5770 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
5771 	return(block);
5772 }
5773 
5774 /********************************************************************//**
5775 Monitor the buffer page read/write activity, and increment corresponding
5776 counter value if MONITOR_MODULE_BUF_PAGE (module_buf_page) module is
5777 enabled. */
5778 static
5779 void
5780 buf_page_monitor(
5781 /*=============*/
5782 	const buf_page_t*	bpage,	/*!< in: pointer to the block */
5783 	enum buf_io_fix		io_type)/*!< in: io_fix types */
5784 {
5785 	const byte*	frame;
5786 	monitor_id_t	counter;
5787 
5788 	/* If the counter module is not turned on, just return */
5789 	if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
5790 		return;
5791 	}
5792 
5793 	ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
5794 
5795 	frame = bpage->zip.data
5796 		? bpage->zip.data
5797 		: ((buf_block_t*) bpage)->frame;
5798 
5799 	switch (fil_page_get_type(frame)) {
5800 		ulint	level;
5801 	case FIL_PAGE_TYPE_INSTANT:
5802 	case FIL_PAGE_INDEX:
5803 	case FIL_PAGE_RTREE:
5804 		level = btr_page_get_level(frame);
5805 
5806 		/* Check if it is an index page for insert buffer */
5807 		if (fil_page_get_type(frame) == FIL_PAGE_INDEX
5808 		    && btr_page_get_index_id(frame)
5809 		    == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
5810 			if (level == 0) {
5811 				counter = MONITOR_RW_COUNTER(
5812 					io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
5813 			} else {
5814 				counter = MONITOR_RW_COUNTER(
5815 					io_type,
5816 					MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
5817 			}
5818 		} else {
5819 			if (level == 0) {
5820 				counter = MONITOR_RW_COUNTER(
5821 					io_type, MONITOR_INDEX_LEAF_PAGE);
5822 			} else {
5823 				counter = MONITOR_RW_COUNTER(
5824 					io_type, MONITOR_INDEX_NON_LEAF_PAGE);
5825 			}
5826 		}
5827 		break;
5828 
5829 	case FIL_PAGE_UNDO_LOG:
5830 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
5831 		break;
5832 
5833 	case FIL_PAGE_INODE:
5834 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
5835 		break;
5836 
5837 	case FIL_PAGE_IBUF_FREE_LIST:
5838 		counter = MONITOR_RW_COUNTER(io_type,
5839 					     MONITOR_IBUF_FREELIST_PAGE);
5840 		break;
5841 
5842 	case FIL_PAGE_IBUF_BITMAP:
5843 		counter = MONITOR_RW_COUNTER(io_type,
5844 					     MONITOR_IBUF_BITMAP_PAGE);
5845 		break;
5846 
5847 	case FIL_PAGE_TYPE_SYS:
5848 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
5849 		break;
5850 
5851 	case FIL_PAGE_TYPE_TRX_SYS:
5852 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
5853 		break;
5854 
5855 	case FIL_PAGE_TYPE_FSP_HDR:
5856 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
5857 		break;
5858 
5859 	case FIL_PAGE_TYPE_XDES:
5860 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
5861 		break;
5862 
5863 	case FIL_PAGE_TYPE_BLOB:
5864 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
5865 		break;
5866 
5867 	case FIL_PAGE_TYPE_ZBLOB:
5868 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
5869 		break;
5870 
5871 	case FIL_PAGE_TYPE_ZBLOB2:
5872 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
5873 		break;
5874 
5875 	default:
5876 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
5877 	}
5878 
5879 	MONITOR_INC_NOCHECK(counter);
5880 }
5881 
5882 /** Mark a table corrupted.
5883 @param[in]	bpage	corrupted page
5884 @param[in]	space	tablespace of the corrupted page */
5885 ATTRIBUTE_COLD
5886 static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
5887 {
5888 	/* If block is not encrypted find the table with specified
5889 	space id, and mark it corrupted. Encrypted tables
5890 	are marked unusable later e.g. in ::open(). */
5891 	if (!space.crypt_data
5892 	    || space.crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) {
5893 		dict_set_corrupted_by_space(&space);
5894 	} else {
5895 		dict_set_encrypted_by_space(&space);
5896 	}
5897 }
5898 
5899 /** Mark a table corrupted.
5900 @param[in]	bpage	Corrupted page
5901 @param[in]	space	Corrupted page belongs to tablespace
5902 Also remove the bpage from LRU list. */
5903 static
5904 void
5905 buf_corrupt_page_release(buf_page_t* bpage, const fil_space_t* space)
5906 {
5907 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
5908 	const ibool	uncompressed = (buf_page_get_state(bpage)
5909 					== BUF_BLOCK_FILE_PAGE);
5910 	page_id_t	old_page_id = bpage->id;
5911 
5912 	/* First unfix and release lock on the bpage */
5913 	buf_pool_mutex_enter(buf_pool);
5914 	mutex_enter(buf_page_get_mutex(bpage));
5915 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
5916 	ut_ad(bpage->id.space() == space->id);
5917 
5918 	/* buf_fix_count can be greater than zero. Because other thread
5919 	can wait in buf_page_wait_read() for the page to be read. */
5920 
5921 	bpage->id.set_corrupt_id();
5922 	/* Set BUF_IO_NONE before we remove the block from LRU list */
5923 	buf_page_set_io_fix(bpage, BUF_IO_NONE);
5924 
5925 	if (uncompressed) {
5926 		rw_lock_x_unlock_gen(
5927 			&((buf_block_t*) bpage)->lock,
5928 			BUF_IO_READ);
5929 	}
5930 
5931 	mutex_exit(buf_page_get_mutex(bpage));
5932 
5933 	if (!srv_force_recovery) {
5934 		buf_mark_space_corrupt(bpage, *space);
5935 	}
5936 
5937 	/* After this point bpage can't be referenced. */
5938 	buf_LRU_free_one_page(bpage, old_page_id);
5939 
5940 	ut_ad(buf_pool->n_pend_reads > 0);
5941 	buf_pool->n_pend_reads--;
5942 
5943 	buf_pool_mutex_exit(buf_pool);
5944 }
5945 
5946 /** Check if page is maybe compressed, encrypted or both when we encounter
5947 corrupted page. Note that we can't be 100% sure if page is corrupted
5948 or decrypt/decompress just failed.
5949 @param[in,out]	bpage		page
5950 @param[in,out]	space		tablespace from fil_space_acquire_for_io()
5951 @return	whether the operation succeeded
5952 @retval	DB_SUCCESS		if page has been read and is not corrupted
5953 @retval	DB_PAGE_CORRUPTED	if page based on checksum check is corrupted
5954 @retval	DB_DECRYPTION_FAILED	if page post encryption checksum matches but
5955 after decryption normal page checksum does not match.
5956 @retval	DB_TABLESPACE_DELETED	if accessed tablespace is not found */
5957 static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space)
5958 {
5959 	ut_ad(space->pending_io());
5960 
5961 	byte* dst_frame = (bpage->zip.data) ? bpage->zip.data :
5962 		((buf_block_t*) bpage)->frame;
5963 	dberr_t err = DB_SUCCESS;
5964 
5965 	/* In buf_decrypt_after_read we have either decrypted the page if
5966 	page post encryption checksum matches and used key_id is found
5967 	from the encryption plugin. If checksum did not match page was
5968 	not decrypted and it could be either encrypted and corrupted
5969 	or corrupted or good page. If we decrypted, there page could
5970 	still be corrupted if used key does not match. */
5971 	const bool seems_encrypted = mach_read_from_4(
5972 		dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)
5973 		&& space->crypt_data
5974 		&& space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
5975 
5976 	/* If traditional checksums match, we assume that page is
5977 	not anymore encrypted. */
5978 	if (buf_page_is_corrupted(
5979 		true, dst_frame, bpage->size, space)) {
5980 		err = DB_PAGE_CORRUPTED;
5981 	}
5982 
5983 	if (seems_encrypted && err == DB_PAGE_CORRUPTED
5984 	    && bpage->id.page_no() != 0) {
5985 		err = DB_DECRYPTION_FAILED;
5986 
5987 		ib::error()
5988 			<< "The page " << bpage->id << " in file '"
5989 			<< space->chain.start->name
5990 			<< "' cannot be decrypted.";
5991 
5992 		ib::info()
5993 			<< "However key management plugin or used key_version "
5994 			<< mach_read_from_4(dst_frame
5995 					    + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)
5996 			<< " is not found or"
5997 			" used encryption algorithm or method does not match.";
5998 
5999 		if (bpage->id.space() != TRX_SYS_SPACE) {
6000 			ib::info()
6001 				<< "Marking tablespace as missing."
6002 				" You may drop this table or"
6003 				" install correct key management plugin"
6004 				" and key file.";
6005 		}
6006 	}
6007 
6008 	return (err);
6009 }
6010 
6011 /** Complete a read or write request of a file page to or from the buffer pool.
6012 @param[in,out]	bpage	page to complete
6013 @param[in]	dblwr	whether the doublewrite buffer was used (on write)
6014 @param[in]	evict	whether or not to evict the page from LRU list
6015 @return whether the operation succeeded
6016 @retval	DB_SUCCESS		always when writing, or if a read page was OK
6017 @retval	DB_TABLESPACE_DELETED	if the tablespace does not exist
6018 @retval	DB_PAGE_CORRUPTED	if the checksum fails on a page read
6019 @retval	DB_DECRYPTION_FAILED	if page post encryption checksum matches but
6020 				after decryption normal page checksum does
6021 				not match */
6022 UNIV_INTERN
6023 dberr_t
6024 buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict)
6025 {
6026 	enum buf_io_fix	io_type;
6027 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
6028 	const bool	uncompressed = (buf_page_get_state(bpage)
6029 					== BUF_BLOCK_FILE_PAGE);
6030 	ut_a(buf_page_in_file(bpage));
6031 
6032 	/* We do not need protect io_fix here by mutex to read
6033 	it because this is the only function where we can change the value
6034 	from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
6035 	ensures that this is the only thread that handles the i/o for this
6036 	block. */
6037 
6038 	io_type = buf_page_get_io_fix(bpage);
6039 	ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
6040 	ut_ad(bpage->size.is_compressed() == (bpage->zip.data != NULL));
6041 	ut_ad(uncompressed || bpage->zip.data);
6042 
6043 	if (io_type == BUF_IO_READ) {
6044 		ulint	read_page_no = 0;
6045 		ulint	read_space_id = 0;
6046 		byte*	frame = bpage->zip.data
6047 			? bpage->zip.data
6048 			: reinterpret_cast<buf_block_t*>(bpage)->frame;
6049 		ut_ad(frame);
6050 		fil_space_t* space = fil_space_acquire_for_io(
6051 			bpage->id.space());
6052 		if (!space) {
6053 			return DB_TABLESPACE_DELETED;
6054 		}
6055 
6056 		dberr_t	err;
6057 
6058 		if (!buf_page_decrypt_after_read(bpage, space)) {
6059 			err = DB_DECRYPTION_FAILED;
6060 			goto database_corrupted;
6061 		}
6062 
6063 		if (bpage->zip.data && uncompressed) {
6064 			my_atomic_addlint(&buf_pool->n_pend_unzip, 1);
6065 			ibool ok = buf_zip_decompress((buf_block_t*) bpage,
6066 						      FALSE);
6067 			my_atomic_addlint(&buf_pool->n_pend_unzip, ulint(-1));
6068 
6069 			if (!ok) {
6070 				ib::info() << "Page "
6071 					   << bpage->id
6072 					   << " zip_decompress failure.";
6073 
6074 				err = DB_PAGE_CORRUPTED;
6075 				goto database_corrupted;
6076 			}
6077 		}
6078 
6079 		/* If this page is not uninitialized and not in the
6080 		doublewrite buffer, then the page number and space id
6081 		should be the same as in block. */
6082 		read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
6083 		read_space_id = mach_read_from_4(
6084 			frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
6085 
6086 		if (bpage->id.space() == TRX_SYS_SPACE
6087 		    && buf_dblwr_page_inside(bpage->id.page_no())) {
6088 
6089 			ib::error() << "Reading page " << bpage->id
6090 				<< ", which is in the doublewrite buffer!";
6091 
6092 		} else if (read_space_id == 0 && read_page_no == 0) {
6093 			/* This is likely an uninitialized page. */
6094 		} else if ((bpage->id.space() != TRX_SYS_SPACE
6095 			    && bpage->id.space() != read_space_id)
6096 			   || bpage->id.page_no() != read_page_no) {
6097 			/* We did not compare space_id to read_space_id
6098 			in the system tablespace, because the field
6099 			was written as garbage before MySQL 4.1.1,
6100 			which did not support innodb_file_per_table. */
6101 
6102 			ib::error() << "Space id and page no stored in "
6103 				"the page, read in are "
6104 				<< page_id_t(read_space_id, read_page_no)
6105 				<< ", should be " << bpage->id;
6106 		}
6107 
6108 		err = buf_page_check_corrupt(bpage, space);
6109 
6110 		if (err != DB_SUCCESS) {
6111 database_corrupted:
6112 			/* Not a real corruption if it was triggered by
6113 			error injection */
6114 			DBUG_EXECUTE_IF(
6115 				"buf_page_import_corrupt_failure",
6116 				if (!is_predefined_tablespace(
6117 					    bpage->id.space())) {
6118 					buf_corrupt_page_release(bpage, space);
6119 					ib::info() << "Simulated IMPORT "
6120 						"corruption";
6121 					space->release_for_io();
6122 					return(err);
6123 				}
6124 				err = DB_SUCCESS;
6125 				goto page_not_corrupt;
6126 			);
6127 
6128 			if (uncompressed && bpage->zip.data) {
6129 				memset(reinterpret_cast<buf_block_t*>(bpage)
6130 				       ->frame, 0, srv_page_size);
6131 			}
6132 
6133 			if (err == DB_PAGE_CORRUPTED) {
6134 				ib::error()
6135 					<< "Database page corruption on disk"
6136 					" or a failed file read of tablespace "
6137 					<< space->name << " page " << bpage->id
6138 					<< ". You may have to recover from "
6139 					<< "a backup.";
6140 
6141 				buf_page_print(frame, bpage->size);
6142 
6143 				ib::info()
6144 					<< "It is also possible that your"
6145 					" operating system has corrupted"
6146 					" its own file cache and rebooting"
6147 					" your computer removes the error."
6148 					" If the corrupt page is an index page."
6149 					" You can also try to fix the"
6150 					" corruption by dumping, dropping,"
6151 					" and reimporting the corrupt table."
6152 					" You can use CHECK TABLE to scan"
6153 					" your table for corruption. "
6154 					<< FORCE_RECOVERY_MSG;
6155 			}
6156 
6157 			if (!srv_force_recovery) {
6158 
6159 				/* If page space id is larger than TRX_SYS_SPACE
6160 				(0), we will attempt to mark the corresponding
6161 				table as corrupted instead of crashing server */
6162 				if (bpage->id.space() == TRX_SYS_SPACE) {
6163 					ib::fatal() << "Aborting because of"
6164 						" a corrupt database page.";
6165 				}
6166 
6167 				buf_corrupt_page_release(bpage, space);
6168 				space->release_for_io();
6169 				return(err);
6170 			}
6171 		}
6172 
6173 		DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
6174 				page_not_corrupt: bpage = bpage; );
6175 
6176 		if (err == DB_PAGE_CORRUPTED
6177 		    || err == DB_DECRYPTION_FAILED) {
6178 			const page_id_t corrupt_page_id = bpage->id;
6179 
6180 			buf_corrupt_page_release(bpage, space);
6181 
6182 			if (recv_recovery_is_on()) {
6183 				recv_recover_corrupt_page(corrupt_page_id);
6184 			}
6185 
6186 			space->release_for_io();
6187 			return err;
6188 		}
6189 
6190 		if (recv_recovery_is_on()) {
6191 			recv_recover_page(bpage);
6192 		}
6193 
6194 		/* If space is being truncated then avoid ibuf operation.
6195 		During re-init we have already freed ibuf entries. */
6196 		if (uncompressed
6197 		    && !recv_no_ibuf_operations
6198 		    && (bpage->id.space() == 0
6199 			|| !is_predefined_tablespace(bpage->id.space()))
6200 		    && !srv_is_tablespace_truncated(bpage->id.space())
6201 		    && fil_page_get_type(frame) == FIL_PAGE_INDEX
6202 		    && page_is_leaf(frame)) {
6203 
6204 			ibuf_merge_or_delete_for_page(
6205 				(buf_block_t*) bpage, bpage->id,
6206 				bpage->size);
6207 		}
6208 
6209 		space->release_for_io();
6210 	} else {
6211 		/* io_type == BUF_IO_WRITE */
6212 		if (bpage->slot) {
6213 			/* Mark slot free */
6214 			bpage->slot->release();
6215 			bpage->slot = NULL;
6216 		}
6217 	}
6218 
6219 	BPageMutex* block_mutex = buf_page_get_mutex(bpage);
6220 	buf_pool_mutex_enter(buf_pool);
6221 	mutex_enter(block_mutex);
6222 
6223 	/* Because this thread which does the unlocking is not the same that
6224 	did the locking, we use a pass value != 0 in unlock, which simply
6225 	removes the newest lock debug record, without checking the thread
6226 	id. */
6227 
6228 	buf_page_set_io_fix(bpage, BUF_IO_NONE);
6229 	buf_page_monitor(bpage, io_type);
6230 
6231 	if (io_type == BUF_IO_READ) {
6232 		/* NOTE that the call to ibuf may have moved the ownership of
6233 		the x-latch to this OS thread: do not let this confuse you in
6234 		debugging! */
6235 
6236 		ut_ad(buf_pool->n_pend_reads > 0);
6237 		buf_pool->n_pend_reads--;
6238 		buf_pool->stat.n_pages_read++;
6239 
6240 		if (uncompressed) {
6241 			rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
6242 					     BUF_IO_READ);
6243 		}
6244 
6245 		mutex_exit(block_mutex);
6246 	} else {
6247 		/* Write means a flush operation: call the completion
6248 		routine in the flush system */
6249 
6250 		buf_flush_write_complete(bpage, dblwr);
6251 
6252 		if (uncompressed) {
6253 			rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock,
6254 					      BUF_IO_WRITE);
6255 		}
6256 
6257 		buf_pool->stat.n_pages_written++;
6258 
6259 		/* We decide whether or not to evict the page from the
6260 		LRU list based on the flush_type.
6261 		* BUF_FLUSH_LIST: don't evict
6262 		* BUF_FLUSH_LRU: always evict
6263 		* BUF_FLUSH_SINGLE_PAGE: eviction preference is passed
6264 		by the caller explicitly. */
6265 		if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) {
6266 			evict = true;
6267 		}
6268 
6269 		mutex_exit(block_mutex);
6270 
6271 		if (evict) {
6272 			buf_LRU_free_page(bpage, true);
6273 		}
6274 	}
6275 
6276 	DBUG_PRINT("ib_buf", ("%s page %u:%u",
6277 			      io_type == BUF_IO_READ ? "read" : "wrote",
6278 			      bpage->id.space(), bpage->id.page_no()));
6279 
6280 	buf_pool_mutex_exit(buf_pool);
6281 
6282 	return DB_SUCCESS;
6283 }
6284 
6285 /*********************************************************************//**
6286 Asserts that all file pages in the buffer are in a replaceable state.
6287 @return TRUE */
6288 static
6289 ibool
6290 buf_all_freed_instance(
6291 /*===================*/
6292 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instancce */
6293 {
6294 	ulint		i;
6295 	buf_chunk_t*	chunk;
6296 
6297 	ut_ad(buf_pool);
6298 
6299 	buf_pool_mutex_enter(buf_pool);
6300 
6301 	chunk = buf_pool->chunks;
6302 
6303 	for (i = buf_pool->n_chunks; i--; chunk++) {
6304 
6305 		if (const buf_block_t* block = buf_chunk_not_freed(chunk)) {
6306 			ib::fatal() << "Page " << block->page.id
6307 				<< " still fixed or dirty";
6308 		}
6309 	}
6310 
6311 	buf_pool_mutex_exit(buf_pool);
6312 
6313 	return(TRUE);
6314 }
6315 
6316 /** Refreshes the statistics used to print per-second averages.
6317 @param[in,out]	buf_pool	buffer pool instance */
6318 static
6319 void
6320 buf_refresh_io_stats(
6321 	buf_pool_t*	buf_pool)
6322 {
6323 	buf_pool->last_printout_time = time(NULL);
6324 	buf_pool->old_stat = buf_pool->stat;
6325 }
6326 
6327 /*********************************************************************//**
6328 Invalidates file pages in one buffer pool instance */
6329 static
6330 void
6331 buf_pool_invalidate_instance(
6332 /*=========================*/
6333 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
6334 {
6335 	ulint		i;
6336 
6337 	buf_pool_mutex_enter(buf_pool);
6338 
6339 	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
6340 
6341 		/* As this function is called during startup and
6342 		during redo application phase during recovery, InnoDB
6343 		is single threaded (apart from IO helper threads) at
6344 		this stage. No new write batch can be in intialization
6345 		stage at this point. */
6346 		ut_ad(buf_pool->init_flush[i] == FALSE);
6347 
6348 		/* However, it is possible that a write batch that has
6349 		been posted earlier is still not complete. For buffer
6350 		pool invalidation to proceed we must ensure there is NO
6351 		write activity happening. */
6352 		if (buf_pool->n_flush[i] > 0) {
6353 			buf_flush_t	type = static_cast<buf_flush_t>(i);
6354 
6355 			buf_pool_mutex_exit(buf_pool);
6356 			buf_flush_wait_batch_end(buf_pool, type);
6357 			buf_pool_mutex_enter(buf_pool);
6358 		}
6359 	}
6360 
6361 	buf_pool_mutex_exit(buf_pool);
6362 
6363 	ut_ad(buf_all_freed_instance(buf_pool));
6364 
6365 	buf_pool_mutex_enter(buf_pool);
6366 
6367 	while (buf_LRU_scan_and_free_block(buf_pool, true)) {
6368 	}
6369 
6370 	ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
6371 	ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
6372 
6373 	buf_pool->freed_page_clock = 0;
6374 	buf_pool->LRU_old = NULL;
6375 	buf_pool->LRU_old_len = 0;
6376 
6377 	memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
6378 	buf_refresh_io_stats(buf_pool);
6379 
6380 	buf_pool_mutex_exit(buf_pool);
6381 }
6382 
6383 /*********************************************************************//**
6384 Invalidates the file pages in the buffer pool when an archive recovery is
6385 completed. All the file pages buffered must be in a replaceable state when
6386 this function is called: not latched and not modified. */
6387 void
6388 buf_pool_invalidate(void)
6389 /*=====================*/
6390 {
6391 	ulint   i;
6392 
6393 	for (i = 0; i < srv_buf_pool_instances; i++) {
6394 		buf_pool_invalidate_instance(buf_pool_from_array(i));
6395 	}
6396 }
6397 
6398 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
6399 /*********************************************************************//**
6400 Validates data in one buffer pool instance
6401 @return TRUE */
6402 static
6403 ibool
6404 buf_pool_validate_instance(
6405 /*=======================*/
6406 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
6407 {
6408 	buf_page_t*	b;
6409 	buf_chunk_t*	chunk;
6410 	ulint		i;
6411 	ulint		n_lru_flush	= 0;
6412 	ulint		n_page_flush	= 0;
6413 	ulint		n_list_flush	= 0;
6414 	ulint		n_lru		= 0;
6415 	ulint		n_flush		= 0;
6416 	ulint		n_free		= 0;
6417 	ulint		n_zip		= 0;
6418 
6419 	ut_ad(buf_pool);
6420 
6421 	buf_pool_mutex_enter(buf_pool);
6422 	hash_lock_x_all(buf_pool->page_hash);
6423 
6424 	chunk = buf_pool->chunks;
6425 
6426 	/* Check the uncompressed blocks. */
6427 
6428 	for (i = buf_pool->n_chunks; i--; chunk++) {
6429 
6430 		ulint		j;
6431 		buf_block_t*	block = chunk->blocks;
6432 
6433 		for (j = chunk->size; j--; block++) {
6434 
6435 			buf_page_mutex_enter(block);
6436 
6437 			switch (buf_block_get_state(block)) {
6438 			case BUF_BLOCK_POOL_WATCH:
6439 			case BUF_BLOCK_ZIP_PAGE:
6440 			case BUF_BLOCK_ZIP_DIRTY:
6441 				/* These should only occur on
6442 				zip_clean, zip_free[], or flush_list. */
6443 				ut_error;
6444 				break;
6445 
6446 			case BUF_BLOCK_FILE_PAGE:
6447 				ut_a(buf_page_hash_get_low(
6448 						buf_pool, block->page.id)
6449 				     == &block->page);
6450 
6451 				switch (buf_page_get_io_fix(&block->page)) {
6452 				case BUF_IO_NONE:
6453 					break;
6454 
6455 				case BUF_IO_WRITE:
6456 					switch (buf_page_get_flush_type(
6457 							&block->page)) {
6458 					case BUF_FLUSH_LRU:
6459 						n_lru_flush++;
6460 						goto assert_s_latched;
6461 					case BUF_FLUSH_SINGLE_PAGE:
6462 						n_page_flush++;
6463 assert_s_latched:
6464 						ut_a(rw_lock_is_locked(
6465 							     &block->lock,
6466 								     RW_LOCK_S)
6467 						     || rw_lock_is_locked(
6468 								&block->lock,
6469 								RW_LOCK_SX));
6470 						break;
6471 					case BUF_FLUSH_LIST:
6472 						n_list_flush++;
6473 						break;
6474 					default:
6475 						ut_error;
6476 					}
6477 
6478 					break;
6479 
6480 				case BUF_IO_READ:
6481 
6482 					ut_a(rw_lock_is_locked(&block->lock,
6483 							       RW_LOCK_X));
6484 					break;
6485 
6486 				case BUF_IO_PIN:
6487 					break;
6488 				}
6489 
6490 				n_lru++;
6491 				break;
6492 
6493 			case BUF_BLOCK_NOT_USED:
6494 				n_free++;
6495 				break;
6496 
6497 			case BUF_BLOCK_READY_FOR_USE:
6498 			case BUF_BLOCK_MEMORY:
6499 			case BUF_BLOCK_REMOVE_HASH:
6500 				/* do nothing */
6501 				break;
6502 			}
6503 
6504 			buf_page_mutex_exit(block);
6505 		}
6506 	}
6507 
6508 	mutex_enter(&buf_pool->zip_mutex);
6509 
6510 	/* Check clean compressed-only blocks. */
6511 
6512 	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
6513 	     b = UT_LIST_GET_NEXT(list, b)) {
6514 		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
6515 		switch (buf_page_get_io_fix(b)) {
6516 		case BUF_IO_NONE:
6517 		case BUF_IO_PIN:
6518 			/* All clean blocks should be I/O-unfixed. */
6519 			break;
6520 		case BUF_IO_READ:
6521 			/* In buf_LRU_free_page(), we temporarily set
6522 			b->io_fix = BUF_IO_READ for a newly allocated
6523 			control block in order to prevent
6524 			buf_page_get_gen() from decompressing the block. */
6525 			break;
6526 		default:
6527 			ut_error;
6528 			break;
6529 		}
6530 
6531 		/* It is OK to read oldest_modification here because
6532 		we have acquired buf_pool->zip_mutex above which acts
6533 		as the 'block->mutex' for these bpages. */
6534 		ut_a(!b->oldest_modification);
6535 		ut_a(buf_page_hash_get_low(buf_pool, b->id) == b);
6536 		n_lru++;
6537 		n_zip++;
6538 	}
6539 
6540 	/* Check dirty blocks. */
6541 
6542 	buf_flush_list_mutex_enter(buf_pool);
6543 	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
6544 	     b = UT_LIST_GET_NEXT(list, b)) {
6545 		ut_ad(b->in_flush_list);
6546 		ut_a(b->oldest_modification);
6547 		n_flush++;
6548 
6549 		switch (buf_page_get_state(b)) {
6550 		case BUF_BLOCK_ZIP_DIRTY:
6551 			n_lru++;
6552 			n_zip++;
6553 			switch (buf_page_get_io_fix(b)) {
6554 			case BUF_IO_NONE:
6555 			case BUF_IO_READ:
6556 			case BUF_IO_PIN:
6557 				break;
6558 			case BUF_IO_WRITE:
6559 				switch (buf_page_get_flush_type(b)) {
6560 				case BUF_FLUSH_LRU:
6561 					n_lru_flush++;
6562 					break;
6563 				case BUF_FLUSH_SINGLE_PAGE:
6564 					n_page_flush++;
6565 					break;
6566 				case BUF_FLUSH_LIST:
6567 					n_list_flush++;
6568 					break;
6569 				default:
6570 					ut_error;
6571 				}
6572 				break;
6573 			}
6574 			break;
6575 		case BUF_BLOCK_FILE_PAGE:
6576 			/* uncompressed page */
6577 			break;
6578 		case BUF_BLOCK_POOL_WATCH:
6579 		case BUF_BLOCK_ZIP_PAGE:
6580 		case BUF_BLOCK_NOT_USED:
6581 		case BUF_BLOCK_READY_FOR_USE:
6582 		case BUF_BLOCK_MEMORY:
6583 		case BUF_BLOCK_REMOVE_HASH:
6584 			ut_error;
6585 			break;
6586 		}
6587 		ut_a(buf_page_hash_get_low(buf_pool, b->id) == b);
6588 	}
6589 
6590 	ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
6591 
6592 	hash_unlock_x_all(buf_pool->page_hash);
6593 	buf_flush_list_mutex_exit(buf_pool);
6594 
6595 	mutex_exit(&buf_pool->zip_mutex);
6596 
6597 	if (buf_pool->curr_size == buf_pool->old_size
6598 	    && n_lru + n_free > buf_pool->curr_size + n_zip) {
6599 
6600 		ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
6601 			<< ", pool " << buf_pool->curr_size
6602 			<< " zip " << n_zip << ". Aborting...";
6603 	}
6604 
6605 	ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
6606 	if (buf_pool->curr_size == buf_pool->old_size
6607 	    && UT_LIST_GET_LEN(buf_pool->free) != n_free) {
6608 
6609 		ib::fatal() << "Free list len "
6610 			<< UT_LIST_GET_LEN(buf_pool->free)
6611 			<< ", free blocks " << n_free << ". Aborting...";
6612 	}
6613 
6614 	ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
6615 	ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
6616 	ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
6617 
6618 	buf_pool_mutex_exit(buf_pool);
6619 
6620 	ut_a(buf_LRU_validate());
6621 	ut_a(buf_flush_validate(buf_pool));
6622 
6623 	return(TRUE);
6624 }
6625 
6626 /*********************************************************************//**
6627 Validates the buffer buf_pool data structure.
6628 @return TRUE */
6629 ibool
6630 buf_validate(void)
6631 /*==============*/
6632 {
6633 	ulint	i;
6634 
6635 	for (i = 0; i < srv_buf_pool_instances; i++) {
6636 		buf_pool_t*	buf_pool;
6637 
6638 		buf_pool = buf_pool_from_array(i);
6639 
6640 		buf_pool_validate_instance(buf_pool);
6641 	}
6642 	return(TRUE);
6643 }
6644 
6645 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
6646 
6647 #if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
6648 /*********************************************************************//**
6649 Prints info of the buffer buf_pool data structure for one instance. */
6650 static
6651 void
6652 buf_print_instance(
6653 /*===============*/
6654 	buf_pool_t*	buf_pool)
6655 {
6656 	index_id_t*	index_ids;
6657 	ulint*		counts;
6658 	ulint		size;
6659 	ulint		i;
6660 	ulint		j;
6661 	index_id_t	id;
6662 	ulint		n_found;
6663 	buf_chunk_t*	chunk;
6664 	dict_index_t*	index;
6665 
6666 	ut_ad(buf_pool);
6667 
6668 	size = buf_pool->curr_size;
6669 
6670 	index_ids = static_cast<index_id_t*>(
6671 		ut_malloc_nokey(size * sizeof *index_ids));
6672 
6673 	counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
6674 
6675 	buf_pool_mutex_enter(buf_pool);
6676 	buf_flush_list_mutex_enter(buf_pool);
6677 
6678 	ib::info() << *buf_pool;
6679 
6680 	buf_flush_list_mutex_exit(buf_pool);
6681 
6682 	/* Count the number of blocks belonging to each index in the buffer */
6683 
6684 	n_found = 0;
6685 
6686 	chunk = buf_pool->chunks;
6687 
6688 	for (i = buf_pool->n_chunks; i--; chunk++) {
6689 		buf_block_t*	block		= chunk->blocks;
6690 		ulint		n_blocks	= chunk->size;
6691 
6692 		for (; n_blocks--; block++) {
6693 			const buf_frame_t* frame = block->frame;
6694 
6695 			if (fil_page_index_page_check(frame)) {
6696 
6697 				id = btr_page_get_index_id(frame);
6698 
6699 				/* Look for the id in the index_ids array */
6700 				j = 0;
6701 
6702 				while (j < n_found) {
6703 
6704 					if (index_ids[j] == id) {
6705 						counts[j]++;
6706 
6707 						break;
6708 					}
6709 					j++;
6710 				}
6711 
6712 				if (j == n_found) {
6713 					n_found++;
6714 					index_ids[j] = id;
6715 					counts[j] = 1;
6716 				}
6717 			}
6718 		}
6719 	}
6720 
6721 	buf_pool_mutex_exit(buf_pool);
6722 
6723 	for (i = 0; i < n_found; i++) {
6724 		index = dict_index_get_if_in_cache(index_ids[i]);
6725 
6726 		if (!index) {
6727 			ib::info() << "Block count for index "
6728 				<< index_ids[i] << " in buffer is about "
6729 				<< counts[i];
6730 		} else {
6731 			ib::info() << "Block count for index " << index_ids[i]
6732 				<< " in buffer is about " << counts[i]
6733 				<< ", index " << index->name
6734 				<< " of table " << index->table->name;
6735 		}
6736 	}
6737 
6738 	ut_free(index_ids);
6739 	ut_free(counts);
6740 
6741 	ut_a(buf_pool_validate_instance(buf_pool));
6742 }
6743 
6744 /*********************************************************************//**
6745 Prints info of the buffer buf_pool data structure. */
6746 void
6747 buf_print(void)
6748 /*===========*/
6749 {
6750 	ulint   i;
6751 
6752 	for (i = 0; i < srv_buf_pool_instances; i++) {
6753 		buf_pool_t*	buf_pool;
6754 
6755 		buf_pool = buf_pool_from_array(i);
6756 		buf_print_instance(buf_pool);
6757 	}
6758 }
6759 #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
6760 
6761 #ifdef UNIV_DEBUG
6762 /*********************************************************************//**
6763 Returns the number of latched pages in the buffer pool.
6764 @return number of latched pages */
6765 static
6766 ulint
6767 buf_get_latched_pages_number_instance(
6768 /*==================================*/
6769 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
6770 {
6771 	buf_page_t*	b;
6772 	ulint		i;
6773 	buf_chunk_t*	chunk;
6774 	ulint		fixed_pages_number = 0;
6775 
6776 	buf_pool_mutex_enter(buf_pool);
6777 
6778 	chunk = buf_pool->chunks;
6779 
6780 	for (i = buf_pool->n_chunks; i--; chunk++) {
6781 		buf_block_t*	block;
6782 		ulint		j;
6783 
6784 		block = chunk->blocks;
6785 
6786 		for (j = chunk->size; j--; block++) {
6787 			if (buf_block_get_state(block)
6788 			    != BUF_BLOCK_FILE_PAGE) {
6789 
6790 				continue;
6791 			}
6792 
6793 			buf_page_mutex_enter(block);
6794 
6795 			if (block->page.buf_fix_count != 0
6796 			    || buf_page_get_io_fix(&block->page)
6797 			    != BUF_IO_NONE) {
6798 				fixed_pages_number++;
6799 			}
6800 
6801 			buf_page_mutex_exit(block);
6802 		}
6803 	}
6804 
6805 	mutex_enter(&buf_pool->zip_mutex);
6806 
6807 	/* Traverse the lists of clean and dirty compressed-only blocks. */
6808 
6809 	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
6810 	     b = UT_LIST_GET_NEXT(list, b)) {
6811 		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
6812 		ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
6813 
6814 		if (b->buf_fix_count != 0
6815 		    || buf_page_get_io_fix(b) != BUF_IO_NONE) {
6816 			fixed_pages_number++;
6817 		}
6818 	}
6819 
6820 	buf_flush_list_mutex_enter(buf_pool);
6821 	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
6822 	     b = UT_LIST_GET_NEXT(list, b)) {
6823 		ut_ad(b->in_flush_list);
6824 
6825 		switch (buf_page_get_state(b)) {
6826 		case BUF_BLOCK_ZIP_DIRTY:
6827 			if (b->buf_fix_count != 0
6828 			    || buf_page_get_io_fix(b) != BUF_IO_NONE) {
6829 				fixed_pages_number++;
6830 			}
6831 			break;
6832 		case BUF_BLOCK_FILE_PAGE:
6833 			/* uncompressed page */
6834 			break;
6835 		case BUF_BLOCK_POOL_WATCH:
6836 		case BUF_BLOCK_ZIP_PAGE:
6837 		case BUF_BLOCK_NOT_USED:
6838 		case BUF_BLOCK_READY_FOR_USE:
6839 		case BUF_BLOCK_MEMORY:
6840 		case BUF_BLOCK_REMOVE_HASH:
6841 			ut_error;
6842 			break;
6843 		}
6844 	}
6845 
6846 	buf_flush_list_mutex_exit(buf_pool);
6847 	mutex_exit(&buf_pool->zip_mutex);
6848 	buf_pool_mutex_exit(buf_pool);
6849 
6850 	return(fixed_pages_number);
6851 }
6852 
6853 /*********************************************************************//**
6854 Returns the number of latched pages in all the buffer pools.
6855 @return number of latched pages */
6856 ulint
6857 buf_get_latched_pages_number(void)
6858 /*==============================*/
6859 {
6860 	ulint	i;
6861 	ulint	total_latched_pages = 0;
6862 
6863 	for (i = 0; i < srv_buf_pool_instances; i++) {
6864 		buf_pool_t*	buf_pool;
6865 
6866 		buf_pool = buf_pool_from_array(i);
6867 
6868 		total_latched_pages += buf_get_latched_pages_number_instance(
6869 			buf_pool);
6870 	}
6871 
6872 	return(total_latched_pages);
6873 }
6874 
6875 #endif /* UNIV_DEBUG */
6876 
6877 /*********************************************************************//**
6878 Returns the number of pending buf pool read ios.
6879 @return number of pending read I/O operations */
6880 ulint
6881 buf_get_n_pending_read_ios(void)
6882 /*============================*/
6883 {
6884 	ulint	pend_ios = 0;
6885 
6886 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
6887 		pend_ios += buf_pool_from_array(i)->n_pend_reads;
6888 	}
6889 
6890 	return(pend_ios);
6891 }
6892 
6893 /*********************************************************************//**
6894 Returns the ratio in percents of modified pages in the buffer pool /
6895 database pages in the buffer pool.
6896 @return modified page percentage ratio */
6897 double
6898 buf_get_modified_ratio_pct(void)
6899 /*============================*/
6900 {
6901 	double		ratio;
6902 	ulint		lru_len = 0;
6903 	ulint		free_len = 0;
6904 	ulint		flush_list_len = 0;
6905 
6906 	buf_get_total_list_len(&lru_len, &free_len, &flush_list_len);
6907 
6908 	ratio = static_cast<double>(100 * flush_list_len)
6909 		/ (1 + lru_len + free_len);
6910 
6911 	/* 1 + is there to avoid division by zero */
6912 
6913 	return(ratio);
6914 }
6915 
6916 /*******************************************************************//**
6917 Aggregates a pool stats information with the total buffer pool stats  */
6918 static
6919 void
6920 buf_stats_aggregate_pool_info(
6921 /*==========================*/
6922 	buf_pool_info_t*	total_info,	/*!< in/out: the buffer pool
6923 						info to store aggregated
6924 						result */
6925 	const buf_pool_info_t*	pool_info)	/*!< in: individual buffer pool
6926 						stats info */
6927 {
6928 	ut_a(total_info && pool_info);
6929 
6930 	/* Nothing to copy if total_info is the same as pool_info */
6931 	if (total_info == pool_info) {
6932 		return;
6933 	}
6934 
6935 	total_info->pool_size += pool_info->pool_size;
6936 	total_info->lru_len += pool_info->lru_len;
6937 	total_info->old_lru_len += pool_info->old_lru_len;
6938 	total_info->free_list_len += pool_info->free_list_len;
6939 	total_info->flush_list_len += pool_info->flush_list_len;
6940 	total_info->n_pend_unzip += pool_info->n_pend_unzip;
6941 	total_info->n_pend_reads += pool_info->n_pend_reads;
6942 	total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
6943 	total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
6944 	total_info->n_pages_made_young += pool_info->n_pages_made_young;
6945 	total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
6946 	total_info->n_pages_read += pool_info->n_pages_read;
6947 	total_info->n_pages_created += pool_info->n_pages_created;
6948 	total_info->n_pages_written += pool_info->n_pages_written;
6949 	total_info->n_page_gets += pool_info->n_page_gets;
6950 	total_info->n_ra_pages_read_rnd += pool_info->n_ra_pages_read_rnd;
6951 	total_info->n_ra_pages_read += pool_info->n_ra_pages_read;
6952 	total_info->n_ra_pages_evicted += pool_info->n_ra_pages_evicted;
6953 	total_info->page_made_young_rate += pool_info->page_made_young_rate;
6954 	total_info->page_not_made_young_rate +=
6955 		pool_info->page_not_made_young_rate;
6956 	total_info->pages_read_rate += pool_info->pages_read_rate;
6957 	total_info->pages_created_rate += pool_info->pages_created_rate;
6958 	total_info->pages_written_rate += pool_info->pages_written_rate;
6959 	total_info->n_page_get_delta += pool_info->n_page_get_delta;
6960 	total_info->page_read_delta += pool_info->page_read_delta;
6961 	total_info->young_making_delta += pool_info->young_making_delta;
6962 	total_info->not_young_making_delta += pool_info->not_young_making_delta;
6963 	total_info->pages_readahead_rnd_rate += pool_info->pages_readahead_rnd_rate;
6964 	total_info->pages_readahead_rate += pool_info->pages_readahead_rate;
6965 	total_info->pages_evicted_rate += pool_info->pages_evicted_rate;
6966 	total_info->unzip_lru_len += pool_info->unzip_lru_len;
6967 	total_info->io_sum += pool_info->io_sum;
6968 	total_info->io_cur += pool_info->io_cur;
6969 	total_info->unzip_sum += pool_info->unzip_sum;
6970 	total_info->unzip_cur += pool_info->unzip_cur;
6971 }
6972 /*******************************************************************//**
6973 Collect buffer pool stats information for a buffer pool. Also
6974 record aggregated stats if there are more than one buffer pool
6975 in the server */
6976 void
6977 buf_stats_get_pool_info(
6978 /*====================*/
6979 	buf_pool_t*		buf_pool,	/*!< in: buffer pool */
6980 	ulint			pool_id,	/*!< in: buffer pool ID */
6981 	buf_pool_info_t*	all_pool_info)	/*!< in/out: buffer pool info
6982 						to fill */
6983 {
6984 	buf_pool_info_t*	pool_info;
6985 	time_t			current_time;
6986 	double			time_elapsed;
6987 
6988 	/* Find appropriate pool_info to store stats for this buffer pool */
6989 	pool_info = &all_pool_info[pool_id];
6990 
6991 	buf_pool_mutex_enter(buf_pool);
6992 	buf_flush_list_mutex_enter(buf_pool);
6993 
6994 	pool_info->pool_unique_id = pool_id;
6995 
6996 	pool_info->pool_size = buf_pool->curr_size;
6997 
6998 	pool_info->lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
6999 
7000 	pool_info->old_lru_len = buf_pool->LRU_old_len;
7001 
7002 	pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool->free);
7003 
7004 	pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool->flush_list);
7005 
7006 	pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
7007 
7008 	pool_info->n_pend_reads = buf_pool->n_pend_reads;
7009 
7010 	pool_info->n_pending_flush_lru =
7011 		 (buf_pool->n_flush[BUF_FLUSH_LRU]
7012 		  + buf_pool->init_flush[BUF_FLUSH_LRU]);
7013 
7014 	pool_info->n_pending_flush_list =
7015 		 (buf_pool->n_flush[BUF_FLUSH_LIST]
7016 		  + buf_pool->init_flush[BUF_FLUSH_LIST]);
7017 
7018 	pool_info->n_pending_flush_single_page =
7019 		 (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
7020 		  + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
7021 
7022 	buf_flush_list_mutex_exit(buf_pool);
7023 
7024 	current_time = time(NULL);
7025 	time_elapsed = 0.001 + difftime(current_time,
7026 					buf_pool->last_printout_time);
7027 
7028 	pool_info->n_pages_made_young = buf_pool->stat.n_pages_made_young;
7029 
7030 	pool_info->n_pages_not_made_young =
7031 		buf_pool->stat.n_pages_not_made_young;
7032 
7033 	pool_info->n_pages_read = buf_pool->stat.n_pages_read;
7034 
7035 	pool_info->n_pages_created = buf_pool->stat.n_pages_created;
7036 
7037 	pool_info->n_pages_written = buf_pool->stat.n_pages_written;
7038 
7039 	pool_info->n_page_gets = buf_pool->stat.n_page_gets;
7040 
7041 	pool_info->n_ra_pages_read_rnd = buf_pool->stat.n_ra_pages_read_rnd;
7042 	pool_info->n_ra_pages_read = buf_pool->stat.n_ra_pages_read;
7043 
7044 	pool_info->n_ra_pages_evicted = buf_pool->stat.n_ra_pages_evicted;
7045 
7046 	pool_info->page_made_young_rate =
7047 		 (buf_pool->stat.n_pages_made_young
7048 		  - buf_pool->old_stat.n_pages_made_young) / time_elapsed;
7049 
7050 	pool_info->page_not_made_young_rate =
7051 		 (buf_pool->stat.n_pages_not_made_young
7052 		  - buf_pool->old_stat.n_pages_not_made_young) / time_elapsed;
7053 
7054 	pool_info->pages_read_rate =
7055 		(buf_pool->stat.n_pages_read
7056 		  - buf_pool->old_stat.n_pages_read) / time_elapsed;
7057 
7058 	pool_info->pages_created_rate =
7059 		(buf_pool->stat.n_pages_created
7060 		 - buf_pool->old_stat.n_pages_created) / time_elapsed;
7061 
7062 	pool_info->pages_written_rate =
7063 		(buf_pool->stat.n_pages_written
7064 		 - buf_pool->old_stat.n_pages_written) / time_elapsed;
7065 
7066 	pool_info->n_page_get_delta = buf_pool->stat.n_page_gets
7067 				      - buf_pool->old_stat.n_page_gets;
7068 
7069 	if (pool_info->n_page_get_delta) {
7070 		pool_info->page_read_delta = buf_pool->stat.n_pages_read
7071 					     - buf_pool->old_stat.n_pages_read;
7072 
7073 		pool_info->young_making_delta =
7074 			buf_pool->stat.n_pages_made_young
7075 			- buf_pool->old_stat.n_pages_made_young;
7076 
7077 		pool_info->not_young_making_delta =
7078 			buf_pool->stat.n_pages_not_made_young
7079 			- buf_pool->old_stat.n_pages_not_made_young;
7080 	}
7081 	pool_info->pages_readahead_rnd_rate =
7082 		 (buf_pool->stat.n_ra_pages_read_rnd
7083 		  - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed;
7084 
7085 
7086 	pool_info->pages_readahead_rate =
7087 		 (buf_pool->stat.n_ra_pages_read
7088 		  - buf_pool->old_stat.n_ra_pages_read) / time_elapsed;
7089 
7090 	pool_info->pages_evicted_rate =
7091 		(buf_pool->stat.n_ra_pages_evicted
7092 		 - buf_pool->old_stat.n_ra_pages_evicted) / time_elapsed;
7093 
7094 	pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
7095 
7096 	pool_info->io_sum = buf_LRU_stat_sum.io;
7097 
7098 	pool_info->io_cur = buf_LRU_stat_cur.io;
7099 
7100 	pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
7101 
7102 	pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
7103 
7104 	buf_refresh_io_stats(buf_pool);
7105 	buf_pool_mutex_exit(buf_pool);
7106 }
7107 
7108 /*********************************************************************//**
7109 Prints info of the buffer i/o. */
7110 static
7111 void
7112 buf_print_io_instance(
7113 /*==================*/
7114 	buf_pool_info_t*pool_info,	/*!< in: buffer pool info */
7115 	FILE*		file)		/*!< in/out: buffer where to print */
7116 {
7117 	ut_ad(pool_info);
7118 
7119 	fprintf(file,
7120 		"Buffer pool size   " ULINTPF "\n"
7121 		"Free buffers       " ULINTPF "\n"
7122 		"Database pages     " ULINTPF "\n"
7123 		"Old database pages " ULINTPF "\n"
7124 		"Modified db pages  " ULINTPF "\n"
7125 		"Percent of dirty pages(LRU & free pages): %.3f\n"
7126 		"Max dirty pages percent: %.3f\n"
7127 		"Pending reads " ULINTPF "\n"
7128 		"Pending writes: LRU " ULINTPF ", flush list " ULINTPF
7129 		", single page " ULINTPF "\n",
7130 		pool_info->pool_size,
7131 		pool_info->free_list_len,
7132 		pool_info->lru_len,
7133 		pool_info->old_lru_len,
7134 		pool_info->flush_list_len,
7135 		(((double) pool_info->flush_list_len) /
7136 		  (pool_info->lru_len + pool_info->free_list_len + 1.0)) * 100.0,
7137 		srv_max_buf_pool_modified_pct,
7138 		pool_info->n_pend_reads,
7139 		pool_info->n_pending_flush_lru,
7140 		pool_info->n_pending_flush_list,
7141 		pool_info->n_pending_flush_single_page);
7142 
7143 	fprintf(file,
7144 		"Pages made young " ULINTPF ", not young " ULINTPF "\n"
7145 		"%.2f youngs/s, %.2f non-youngs/s\n"
7146 		"Pages read " ULINTPF ", created " ULINTPF
7147 		", written " ULINTPF "\n"
7148 		"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
7149 		pool_info->n_pages_made_young,
7150 		pool_info->n_pages_not_made_young,
7151 		pool_info->page_made_young_rate,
7152 		pool_info->page_not_made_young_rate,
7153 		pool_info->n_pages_read,
7154 		pool_info->n_pages_created,
7155 		pool_info->n_pages_written,
7156 		pool_info->pages_read_rate,
7157 		pool_info->pages_created_rate,
7158 		pool_info->pages_written_rate);
7159 
7160 	if (pool_info->n_page_get_delta) {
7161 		double hit_rate = double(pool_info->page_read_delta)
7162 			/ pool_info->n_page_get_delta;
7163 
7164 		if (hit_rate > 1) {
7165 			hit_rate = 1;
7166 		}
7167 
7168 		fprintf(file,
7169 			"Buffer pool hit rate " ULINTPF " / 1000,"
7170 			" young-making rate " ULINTPF " / 1000 not "
7171 			ULINTPF " / 1000\n",
7172 			ulint(1000 * (1 - hit_rate)),
7173 			ulint(1000 * double(pool_info->young_making_delta)
7174 			      / pool_info->n_page_get_delta),
7175 			ulint(1000 * double(pool_info->not_young_making_delta)
7176 			      / pool_info->n_page_get_delta));
7177 	} else {
7178 		fputs("No buffer pool page gets since the last printout\n",
7179 		      file);
7180 	}
7181 
7182 	/* Statistics about read ahead algorithm */
7183 	fprintf(file, "Pages read ahead %.2f/s,"
7184 		" evicted without access %.2f/s,"
7185 		" Random read ahead %.2f/s\n",
7186 
7187 		pool_info->pages_readahead_rate,
7188 		pool_info->pages_evicted_rate,
7189 		pool_info->pages_readahead_rnd_rate);
7190 
7191 	/* Print some values to help us with visualizing what is
7192 	happening with LRU eviction. */
7193 	fprintf(file,
7194 		"LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n"
7195 		"I/O sum[" ULINTPF "]:cur[" ULINTPF "], "
7196 		"unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n",
7197 		pool_info->lru_len, pool_info->unzip_lru_len,
7198 		pool_info->io_sum, pool_info->io_cur,
7199 		pool_info->unzip_sum, pool_info->unzip_cur);
7200 }
7201 
7202 /*********************************************************************//**
7203 Prints info of the buffer i/o. */
7204 void
7205 buf_print_io(
7206 /*=========*/
7207 	FILE*	file)	/*!< in/out: buffer where to print */
7208 {
7209 	ulint			i;
7210 	buf_pool_info_t*	pool_info;
7211 	buf_pool_info_t*	pool_info_total;
7212 
7213 	/* If srv_buf_pool_instances is greater than 1, allocate
7214 	one extra buf_pool_info_t, the last one stores
7215 	aggregated/total values from all pools */
7216 	if (srv_buf_pool_instances > 1) {
7217 		pool_info = (buf_pool_info_t*) ut_zalloc_nokey((
7218 			srv_buf_pool_instances + 1) * sizeof *pool_info);
7219 
7220 		pool_info_total = &pool_info[srv_buf_pool_instances];
7221 	} else {
7222 		ut_a(srv_buf_pool_instances == 1);
7223 
7224 		pool_info_total = pool_info =
7225 			static_cast<buf_pool_info_t*>(
7226 				ut_zalloc_nokey(sizeof *pool_info));
7227 	}
7228 
7229 	for (i = 0; i < srv_buf_pool_instances; i++) {
7230 		buf_pool_t*	buf_pool;
7231 
7232 		buf_pool = buf_pool_from_array(i);
7233 
7234 		/* Fetch individual buffer pool info and calculate
7235 		aggregated stats along the way */
7236 		buf_stats_get_pool_info(buf_pool, i, pool_info);
7237 
7238 		/* If we have more than one buffer pool, store
7239 		the aggregated stats  */
7240 		if (srv_buf_pool_instances > 1) {
7241 			buf_stats_aggregate_pool_info(pool_info_total,
7242 						      &pool_info[i]);
7243 		}
7244 	}
7245 
7246 	/* Print the aggreate buffer pool info */
7247 	buf_print_io_instance(pool_info_total, file);
7248 
7249 	/* If there are more than one buffer pool, print each individual pool
7250 	info */
7251 	if (srv_buf_pool_instances > 1) {
7252 		fputs("----------------------\n"
7253 		"INDIVIDUAL BUFFER POOL INFO\n"
7254 		"----------------------\n", file);
7255 
7256 		for (i = 0; i < srv_buf_pool_instances; i++) {
7257 			fprintf(file, "---BUFFER POOL " ULINTPF "\n", i);
7258 			buf_print_io_instance(&pool_info[i], file);
7259 		}
7260 	}
7261 
7262 	ut_free(pool_info);
7263 }
7264 
7265 /**********************************************************************//**
7266 Refreshes the statistics used to print per-second averages. */
7267 void
7268 buf_refresh_io_stats_all(void)
7269 /*==========================*/
7270 {
7271 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
7272 		buf_pool_t*	buf_pool;
7273 
7274 		buf_pool = buf_pool_from_array(i);
7275 
7276 		buf_refresh_io_stats(buf_pool);
7277 	}
7278 }
7279 
7280 /**********************************************************************//**
7281 Check if all pages in all buffer pools are in a replacable state.
7282 @return FALSE if not */
7283 ibool
7284 buf_all_freed(void)
7285 /*===============*/
7286 {
7287 	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
7288 		buf_pool_t*	buf_pool;
7289 
7290 		buf_pool = buf_pool_from_array(i);
7291 
7292 		if (!buf_all_freed_instance(buf_pool)) {
7293 			return(FALSE);
7294 		}
7295 	}
7296 
7297 	return(TRUE);
7298 }
7299 
7300 /*********************************************************************//**
7301 Checks that there currently are no pending i/o-operations for the buffer
7302 pool.
7303 @return number of pending i/o */
7304 ulint
7305 buf_pool_check_no_pending_io(void)
7306 /*==============================*/
7307 {
7308 	ulint		i;
7309 	ulint		pending_io = 0;
7310 
7311 	buf_pool_mutex_enter_all();
7312 
7313 	for (i = 0; i < srv_buf_pool_instances; i++) {
7314 		const buf_pool_t*	buf_pool;
7315 
7316 		buf_pool = buf_pool_from_array(i);
7317 
7318 		pending_io += buf_pool->n_pend_reads
7319 			      + buf_pool->n_flush[BUF_FLUSH_LRU]
7320 			      + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
7321 			      + buf_pool->n_flush[BUF_FLUSH_LIST];
7322 
7323 	}
7324 
7325 	buf_pool_mutex_exit_all();
7326 
7327 	return(pending_io);
7328 }
7329 
7330 /** Print the given page_id_t object.
7331 @param[in,out]	out	the output stream
7332 @param[in]	page_id	the page_id_t object to be printed
7333 @return the output stream */
7334 std::ostream&
7335 operator<<(
7336 	std::ostream&		out,
7337 	const page_id_t		page_id)
7338 {
7339 	out << "[page id: space=" << page_id.m_space
7340 		<< ", page number=" << page_id.m_page_no << "]";
7341 	return(out);
7342 }
7343 
7344 #if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
7345 /** Print the given buf_pool_t object.
7346 @param[in,out]	out		the output stream
7347 @param[in]	buf_pool	the buf_pool_t object to be printed
7348 @return the output stream */
7349 std::ostream&
7350 operator<<(
7351 	std::ostream&		out,
7352 	const buf_pool_t&	buf_pool)
7353 {
7354 	out << "[buffer pool instance: "
7355 		<< "buf_pool size=" << buf_pool.curr_size
7356 		<< ", database pages=" << UT_LIST_GET_LEN(buf_pool.LRU)
7357 		<< ", free pages=" << UT_LIST_GET_LEN(buf_pool.free)
7358 		<< ", modified database pages="
7359 		<< UT_LIST_GET_LEN(buf_pool.flush_list)
7360 		<< ", n pending decompressions=" << buf_pool.n_pend_unzip
7361 		<< ", n pending reads=" << buf_pool.n_pend_reads
7362 		<< ", n pending flush LRU=" << buf_pool.n_flush[BUF_FLUSH_LRU]
7363 		<< " list=" << buf_pool.n_flush[BUF_FLUSH_LIST]
7364 		<< " single page=" << buf_pool.n_flush[BUF_FLUSH_SINGLE_PAGE]
7365 		<< ", pages made young=" << buf_pool.stat.n_pages_made_young
7366 		<< ", not young=" << buf_pool.stat.n_pages_not_made_young
7367 		<< ", pages read=" << buf_pool.stat.n_pages_read
7368 		<< ", created=" << buf_pool.stat.n_pages_created
7369 		<< ", written=" << buf_pool.stat.n_pages_written << "]";
7370 	return(out);
7371 }
7372 #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
7373 
7374 /** Encrypt a buffer of temporary tablespace
7375 @param[in]	offset		Page offset
7376 @param[in]	src_frame	Page to encrypt
7377 @param[in,out]	dst_frame	Output buffer
7378 @return encrypted buffer or NULL */
7379 static byte* buf_tmp_page_encrypt(
7380 	ulint	offset,
7381 	byte*	src_frame,
7382 	byte*	dst_frame)
7383 {
7384 	uint header_len = FIL_PAGE_DATA;
7385 	/* FIL page header is not encrypted */
7386 	memcpy(dst_frame, src_frame, header_len);
7387 
7388 	/* Calculate the start offset in a page */
7389 	uint unencrypted_bytes = header_len + FIL_PAGE_DATA_END;
7390 	uint srclen = srv_page_size - unencrypted_bytes;
7391 	const byte* src = src_frame + header_len;
7392 	byte* dst = dst_frame + header_len;
7393 
7394 	if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size),
7395 				   true)) {
7396 		return NULL;
7397 	}
7398 
7399 	memcpy(dst_frame + srv_page_size - FIL_PAGE_DATA_END,
7400 	       src_frame + srv_page_size - FIL_PAGE_DATA_END,
7401 	       FIL_PAGE_DATA_END);
7402 
7403 	/* Handle post encryption checksum */
7404 	mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
7405 			buf_calc_page_crc32(dst_frame));
7406 
7407 	srv_stats.pages_encrypted.inc();
7408 	srv_stats.n_temp_blocks_encrypted.inc();
7409 	return dst_frame;
7410 }
7411 
7412 /** Encryption and page_compression hook that is called just before
7413 a page is written to disk.
7414 @param[in,out]	space		tablespace
7415 @param[in,out]	bpage		buffer page
7416 @param[in]	src_frame	physical page frame that is being encrypted
7417 @return	page frame to be written to file
7418 (may be src_frame or an encrypted/compressed copy of it) */
7419 UNIV_INTERN
7420 byte*
7421 buf_page_encrypt_before_write(
7422 	fil_space_t*	space,
7423 	buf_page_t*	bpage,
7424 	byte*		src_frame)
7425 {
7426 	ut_ad(space->id == bpage->id.space());
7427 	bpage->real_size = srv_page_size;
7428 
7429 	fil_page_type_validate(src_frame);
7430 
7431 	switch (bpage->id.page_no()) {
7432 	case 0:
7433 		/* Page 0 of a tablespace is not encrypted/compressed */
7434 		return src_frame;
7435 	case TRX_SYS_PAGE_NO:
7436 		if (bpage->id.space() == TRX_SYS_SPACE) {
7437 			/* don't encrypt/compress page as it contains
7438 			address to dblwr buffer */
7439 			return src_frame;
7440 		}
7441 	}
7442 
7443 	fil_space_crypt_t* crypt_data = space->crypt_data;
7444 
7445 	bool encrypted, page_compressed;
7446 
7447 	if (space->purpose == FIL_TYPE_TEMPORARY) {
7448 		ut_ad(!crypt_data);
7449 		encrypted = innodb_encrypt_temporary_tables;
7450 		page_compressed = false;
7451 	} else {
7452 		encrypted = crypt_data
7453 			&& !crypt_data->not_encrypted()
7454 			&& crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
7455 			&& (!crypt_data->is_default_encryption()
7456 			    || srv_encrypt_tables);
7457 
7458 		page_compressed = FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags);
7459 	}
7460 
7461 	if (!encrypted && !page_compressed) {
7462 		/* No need to encrypt or page compress the page.
7463 		Clear key-version & crypt-checksum. */
7464 		memset(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
7465 		return src_frame;
7466 	}
7467 
7468 	ut_ad(!bpage->size.is_compressed() || !page_compressed);
7469 	buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
7470 	/* Find free slot from temporary memory array */
7471 	buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool);
7472 	slot->out_buf = NULL;
7473 	bpage->slot = slot;
7474 
7475 	buf_tmp_reserve_crypt_buf(slot);
7476 	byte *dst_frame = slot->crypt_buf;
7477 
7478 	if (!page_compressed) {
7479 not_compressed:
7480 		byte* tmp;
7481 		if (space->purpose == FIL_TYPE_TEMPORARY) {
7482 			/* Encrypt temporary tablespace page content */
7483 			tmp = buf_tmp_page_encrypt(bpage->id.page_no(),
7484 						   src_frame, dst_frame);
7485 		} else {
7486 			/* Encrypt page content */
7487 			tmp = fil_space_encrypt(
7488 					space, bpage->id.page_no(),
7489 					bpage->newest_modification,
7490 					src_frame, dst_frame);
7491 		}
7492 
7493 		bpage->real_size = srv_page_size;
7494 		slot->out_buf = dst_frame = tmp;
7495 
7496 		ut_d(fil_page_type_validate(tmp));
7497 	} else {
7498 		ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
7499 		/* First we compress the page content */
7500 		buf_tmp_reserve_compression_buf(slot);
7501 		byte* tmp = slot->comp_buf;
7502 		ulint out_len = fil_page_compress(
7503 			src_frame, tmp,
7504 			fsp_flags_get_page_compression_level(space->flags),
7505 			fil_space_get_block_size(space, bpage->id.page_no()),
7506 			encrypted);
7507 		if (!out_len) {
7508 			goto not_compressed;
7509 		}
7510 
7511 		bpage->real_size = out_len;
7512 
7513 		/* Workaround for MDEV-15527. */
7514 		memset(tmp + out_len, 0 , srv_page_size - out_len);
7515 		ut_d(fil_page_type_validate(tmp));
7516 
7517 		if (encrypted) {
7518 			/* And then we encrypt the page content */
7519 			tmp = fil_space_encrypt(space,
7520 						bpage->id.page_no(),
7521 						bpage->newest_modification,
7522 						tmp,
7523 						dst_frame);
7524 		}
7525 
7526 		slot->out_buf = dst_frame = tmp;
7527 	}
7528 
7529 	ut_d(fil_page_type_validate(dst_frame));
7530 
7531 	// return dst_frame which will be written
7532 	return dst_frame;
7533 }
7534 
7535 /**
7536 Should we punch hole to deallocate unused portion of the page.
7537 @param[in]	bpage		Page control block
7538 @return true if punch hole should be used, false if not */
7539 bool
7540 buf_page_should_punch_hole(
7541 	const buf_page_t* bpage)
7542 {
7543 	return (bpage->real_size != bpage->size.physical());
7544 }
7545 
7546 /**
7547 Calculate the length of trim (punch_hole) operation.
7548 @param[in]	bpage		Page control block
7549 @param[in]	write_length	Write length
7550 @return length of the trim or zero. */
7551 ulint
7552 buf_page_get_trim_length(
7553 	const buf_page_t*	bpage,
7554 	ulint			write_length)
7555 {
7556 	return (bpage->size.physical() - write_length);
7557 }
7558 #endif /* !UNIV_INNOCHECKSUM */
7559