/*****************************************************************************

Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.

This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation.  The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License, version 2.0, for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0buf.cc
The database buffer buf_pool

Created 11/5/1995 Heikki Tuuri
*******************************************************/

#include "buf0buf.h"

#ifdef UNIV_NONINL
#include "buf0buf.ic"
#endif

#include "mem0mem.h"
#include "btr0btr.h"
#include "fil0fil.h"
#ifndef UNIV_HOTBACKUP
#include "buf0buddy.h"
#include "lock0lock.h"
#include "btr0sea.h"
#include "ibuf0ibuf.h"
#include "trx0undo.h"
#include "log0log.h"
#endif /* !UNIV_HOTBACKUP */
#include "srv0srv.h"
#include "dict0dict.h"
#include "log0recv.h"
#include "page0zip.h"
#include "srv0mon.h"
#include "buf0checksum.h"
#ifdef HAVE_LIBNUMA
#include <numa.h>
#include <numaif.h>
#endif // HAVE_LIBNUMA

/*
		IMPLEMENTATION OF THE BUFFER POOL
		=================================

Performance improvement:
------------------------
Thread scheduling in NT may be so slow that the OS wait mechanism should
not be used even in waiting for disk reads to complete.
Rather, we should put waiting query threads to the queue of
waiting jobs, and let the OS thread do something useful while the i/o
is processed. In this way we could remove most OS thread switches in
an i/o-intensive benchmark like TPC-C.

A possibility is to put a user space thread library between the database
and NT. User space thread libraries might be very fast.

SQL Server 7.0 can be configured to use 'fibers' which are lightweight
threads in NT. These should be studied.

		Buffer frames and blocks
		------------------------
Following the terminology of Gray and Reuter, we call the memory
blocks where file pages are loaded buffer frames. For each buffer
frame there is a control block, or shortly, a block, in the buffer
control array. The control info which does not need to be stored
in the file along with the file page, resides in the control block.

		Buffer pool struct
		------------------
The buffer buf_pool contains a single mutex which protects all the
control data structures of the buf_pool. The content of a buffer frame is
protected by a separate read-write lock in its control block, though.
These locks can be locked and unlocked without owning the buf_pool->mutex.
The OS events in the buf_pool struct can be waited for without owning the
buf_pool->mutex.

The buf_pool->mutex is a hot-spot in main memory, causing a lot of
memory bus traffic on multiprocessor systems when processors
alternately access the mutex. On our Pentium, the mutex is accessed
maybe every 10 microseconds. We gave up on the solution of having a
separate mutex for each control block, for instance, because it seemed
too complicated.

A solution to reduce mutex contention of the buf_pool->mutex is to
create a separate mutex for the page hash table. On Pentium,
accessing the hash table takes 2 microseconds, about half
of the total buf_pool->mutex hold time.

		Control blocks
		--------------

The control block contains, for instance, the bufferfix count
which is incremented when a thread wants a file page to be fixed
in a buffer frame. The bufferfix operation does not lock the
contents of the frame, however. For this purpose, the control
block contains a read-write lock.

The buffer frames have to be aligned so that the start memory
address of a frame is divisible by the universal page size, which
is a power of two.

We intend to make the buffer buf_pool size on-line reconfigurable,
that is, the buf_pool size can be changed without closing the database.
Then the database administrator may adjust it to be bigger
at night, for example. The control block array must
contain enough control blocks for the maximum buffer buf_pool size
which is used in the particular database.
If the buf_pool size is cut, we exploit the virtual memory mechanism of
the OS, and just refrain from using frames at high addresses. Then the OS
can swap them to disk.

The control blocks containing file pages are put to a hash table
according to the file address of the page.
We could speed up the access to an individual page by using
"pointer swizzling": we could replace the page references on
non-leaf index pages by direct pointers to the page, if it exists
in the buf_pool. We could make a separate hash table where we could
chain all the page references in non-leaf pages residing in the buf_pool,
using the page reference as the hash key,
and at the time of reading of a page update the pointers accordingly.
Drawbacks of this solution are added complexity and,
possibly, extra space required on non-leaf pages for memory pointers.
A simpler solution is just to speed up the hash table mechanism
in the database, using tables whose size is a power of 2.
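For illustration: with a power-of-two table size, the modulo operation
of a hash lookup reduces to a cheap bit mask (a sketch of the idea, not
the actual hash table code):

	ulint	n_cells = 2048;			(must be a power of 2)
	ulint	cell_no = fold & (n_cells - 1);	(same as fold % n_cells)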

		Lists of blocks
		---------------

There are several lists of control blocks.

The free list (buf_pool->free) contains blocks which are currently not
used.

The common LRU list contains all the blocks holding a file page
except those for which the bufferfix count is non-zero.
The pages are in the LRU list roughly in the order of the last
access to the page, so that the oldest pages are at the end of the
list. We also keep a pointer near the end of the LRU list,
which we can use when we want to artificially age a page in the
buf_pool. This is used if we know that some page is not needed
again for some time: we insert the block right after the pointer,
causing it to be replaced sooner than would normally be the case.
Currently this aging mechanism is used by the read-ahead mechanism,
and it can also be used when there is a scan of a full table
which cannot fit in the memory. By putting such pages near the
end of the LRU list, we make sure that most of the buf_pool stays
in the main memory, undisturbed.
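
As a sketch, a caller that knows a page will not be needed again soon
could insert its block "old" rather than at the head of the LRU list
(assuming the buf_LRU_add_block() interface; illustrative only):

	buf_LRU_add_block(bpage, TRUE);		inserted near the LRU_old
						pointer, evicted sooner
	buf_LRU_add_block(bpage, FALSE);	inserted at the LRU list head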

The unzip_LRU list contains a subset of the common LRU list.  The
blocks on the unzip_LRU list hold a compressed file page and the
corresponding uncompressed page frame.  A block is in unzip_LRU if and
only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
holds.  The blocks in unzip_LRU will be in the same order as they are
in the common LRU list.  That is, each manipulation of the common LRU
list will result in the same manipulation of the unzip_LRU list.

The chain of modified blocks (buf_pool->flush_list) contains the blocks
holding file pages that have been modified in the memory
but not written to disk yet. The block with the oldest modification
which has not yet been written to disk is at the end of the chain.
The access to this list is protected by buf_pool->flush_list_mutex.

The chain of unmodified compressed blocks (buf_pool->zip_clean)
contains the control blocks (buf_page_t) of those compressed pages
that are not in buf_pool->flush_list and for which no uncompressed
page has been allocated in the buffer pool.  The control blocks for
uncompressed pages are accessible via buf_block_t objects that are
reachable via buf_pool->chunks[].

The chains of free memory blocks (buf_pool->zip_free[]) are used by
the buddy allocator (buf0buddy.cc) to keep track of currently unused
memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2.  These
blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
pool.  The buddy allocator is solely used for allocating control
blocks for compressed pages (buf_page_t) and compressed page frames.
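
For illustration, an allocation of size bytes is served from the
smallest power-of-two size class that fits it; a sketch of the slot
computation (assuming BUF_BUDDY_LOW is the smallest class; see
buf0buddy.cc for the real code):

	ulint	i = 0;
	ulint	s = BUF_BUDDY_LOW;

	while (s < size) {
		s <<= 1;
		i++;
	}
	the block is then taken from buf_pool->zip_free[i]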

		Loading a file page
		-------------------

First, a victim block for replacement has to be found in the
buf_pool. It is taken from the free list or searched for from the
end of the LRU-list. An exclusive lock is reserved for the frame,
the io_fix field in the block is set, fixing the block in the buf_pool,
and the io-operation for loading the page is queued. The io-handler thread
releases the X-lock on the frame and resets the io_fix field
when the io operation completes.

A thread may request the above operation using the function
buf_page_get(). It may then continue to request a lock on the frame.
The lock is granted when the io-handler releases the x-lock.
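
A typical caller pattern, as a hedged sketch (buf_page_get() is the
real entry point; the identifiers around it are illustrative):

	mtr_t		mtr;
	buf_block_t*	block;

	mtr_start(&mtr);
	block = buf_page_get(space_id, zip_size, page_no, RW_S_LATCH, &mtr);
	... read data through block->frame ...
	mtr_commit(&mtr);			releases the page latch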

		Read-ahead
		----------

The read-ahead mechanism is intended to be intelligent and
isolated from the semantically higher levels of the database
index management. From the higher level we only need to know
whether a file page has a natural successor or
predecessor page. On the leaf level of a B-tree index,
these are the next and previous pages in the natural
order of the pages.

Let us first explain the read-ahead mechanism when the leaves
of a B-tree are scanned in an ascending or descending order.
When a page is referenced in the buf_pool for the first time,
the buffer manager checks if it is at the border of a so-called
linear read-ahead area. The tablespace is divided into these
areas of size 64 blocks, for example. So if the page is at the
border of such an area, the read-ahead mechanism checks if
all the other blocks in the area have been accessed in an
ascending or descending order. If this is the case, the system
looks at the natural successor or predecessor of the page,
checks if that is at the border of another area, and in this case
issues read-requests for all the pages in that area. Maybe
we could relax the condition that all the pages in the area
have to be accessed: if data is deleted from a table, there may
appear holes of unused pages in the area.
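
An illustrative border test, as a sketch (the real logic lives in
buf_read_ahead_linear() in buf0rea.cc):

	area = BUF_READ_AHEAD_AREA(buf_pool);	e.g. 64 pages

	if (page_no % area == 0			low border of the area
	    || (page_no + 1) % area == 0) {	high border of the area
		check the access pattern of the area and possibly
		issue read requests for the neighboring area
	}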

A different read-ahead mechanism is used when there appears
to be a random access pattern to a file.
If a new page is referenced in the buf_pool, and several pages
of its random access area (for instance, 32 consecutive pages
in a tablespace) have recently been referenced, we may predict
that the whole area may be needed in the near future, and issue
the read requests for the whole area.
*/

#ifndef UNIV_HOTBACKUP
/** Value in microseconds */
static const int WAIT_FOR_READ	= 100;
/** Number of attempts made to read in a page in the buffer pool */
static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;

/** The buffer pools of the database */
UNIV_INTERN buf_pool_t*	buf_pool_ptr;

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
static ulint	buf_dbg_counter	= 0; /*!< This is used to insert validation
					operations in execution in the
					debug version */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#ifdef UNIV_DEBUG
/** If this is set TRUE, the program prints info whenever
read-ahead or flush occurs */
UNIV_INTERN ibool		buf_debug_prints = FALSE;
#endif /* UNIV_DEBUG */

#ifdef UNIV_PFS_RWLOCK
/* Keys to register buffer block related rwlocks and mutexes with
performance schema */
UNIV_INTERN mysql_pfs_key_t	buf_block_lock_key;
# ifdef UNIV_SYNC_DEBUG
UNIV_INTERN mysql_pfs_key_t	buf_block_debug_latch_key;
# endif /* UNIV_SYNC_DEBUG */
#endif /* UNIV_PFS_RWLOCK */

#ifdef UNIV_PFS_MUTEX
UNIV_INTERN mysql_pfs_key_t	buffer_block_mutex_key;
UNIV_INTERN mysql_pfs_key_t	buf_pool_mutex_key;
UNIV_INTERN mysql_pfs_key_t	buf_pool_zip_mutex_key;
UNIV_INTERN mysql_pfs_key_t	flush_list_mutex_key;
#endif /* UNIV_PFS_MUTEX */

#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK

/* Buffer block mutexes and rwlocks can be registered
in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
is defined, register buffer block mutex and rwlock
in one group after their initialization. */
#  define PFS_GROUP_BUFFER_SYNC

/* This define caps the number of mutexes/rwlocks that can
be registered with performance schema. Developers can
modify this define if necessary. Note that this is
effective only if PFS_GROUP_BUFFER_SYNC is defined. */
#  define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER	ULINT_MAX

# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */

/** Macro to determine whether the read or write counter is used, depending
on the io_type */
#define MONITOR_RW_COUNTER(io_type, counter)		\
	((io_type == BUF_IO_READ)			\
	 ? (counter##_READ)				\
	 : (counter##_WRITTEN))
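
/* For example (the counter name below is hypothetical):
MONITOR_RW_COUNTER(BUF_IO_READ, MONITOR_INDEX_LEAF_PAGE) evaluates to
MONITOR_INDEX_LEAF_PAGE_READ, while an io_type of BUF_IO_WRITE would
select MONITOR_INDEX_LEAF_PAGE_WRITTEN. */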

/********************************************************************//**
Gets the smallest oldest_modification lsn for any page in the pool. Returns
zero if all modified pages have been flushed to disk.
@return oldest modification in pool, zero if none */
UNIV_INTERN
lsn_t
buf_pool_get_oldest_modification(void)
/*==================================*/
{
	ulint		i;
	buf_page_t*	bpage;
	lsn_t		lsn = 0;
	lsn_t		oldest_lsn = 0;

	/* When we traverse all the flush lists we don't want another
	thread to add a dirty page to any flush list. */
	log_flush_order_mutex_enter();

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_list_mutex_enter(buf_pool);

		bpage = UT_LIST_GET_LAST(buf_pool->flush_list);

		if (bpage != NULL) {
			ut_ad(bpage->in_flush_list);
			lsn = bpage->oldest_modification;
		}

		buf_flush_list_mutex_exit(buf_pool);

		if (!oldest_lsn || oldest_lsn > lsn) {
			oldest_lsn = lsn;
		}
	}

	log_flush_order_mutex_exit();

	/* The returned answer may be out of date: the flush_list can
	change after the mutex has been released. */

	return(oldest_lsn);
}

/********************************************************************//**
Get total buffer pool statistics. */
UNIV_INTERN
void
buf_get_total_list_len(
/*===================*/
	ulint*		LRU_len,	/*!< out: length of all LRU lists */
	ulint*		free_len,	/*!< out: length of all free lists */
	ulint*		flush_list_len)	/*!< out: length of all flush lists */
{
	ulint		i;

	*LRU_len = 0;
	*free_len = 0;
	*flush_list_len = 0;

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		*LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
		*free_len += UT_LIST_GET_LEN(buf_pool->free);
		*flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
	}
}

/********************************************************************//**
Get total list size in bytes from all buffer pools. */
UNIV_INTERN
void
buf_get_total_list_size_in_bytes(
/*=============================*/
	buf_pools_list_size_t*	buf_pools_list_size)	/*!< out: list sizes
							in all buffer pools */
{
	ut_ad(buf_pools_list_size);
	memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));

	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);
		/* We don't need mutex protection since this is
		for statistics purposes only */
		buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes;
		buf_pools_list_size->unzip_LRU_bytes +=
			UT_LIST_GET_LEN(buf_pool->unzip_LRU) * UNIV_PAGE_SIZE;
		buf_pools_list_size->flush_list_bytes +=
			buf_pool->stat.flush_list_bytes;
	}
}

/********************************************************************//**
Get total buffer pool statistics. */
UNIV_INTERN
void
buf_get_total_stat(
/*===============*/
	buf_pool_stat_t*	tot_stat)	/*!< out: buffer pool stats */
{
	ulint			i;

	memset(tot_stat, 0, sizeof(*tot_stat));

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_stat_t*	buf_stat;
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_stat = &buf_pool->stat;
		tot_stat->n_page_gets += buf_stat->n_page_gets;
		tot_stat->n_pages_read += buf_stat->n_pages_read;
		tot_stat->n_pages_written += buf_stat->n_pages_written;
		tot_stat->n_pages_created += buf_stat->n_pages_created;
		tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd;
		tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
		tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
		tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;

		tot_stat->n_pages_not_made_young +=
			buf_stat->n_pages_not_made_young;
	}
}

/********************************************************************//**
Allocates a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
UNIV_INTERN
buf_block_t*
buf_block_alloc(
/*============*/
	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance,
					or NULL for round-robin selection
					of the buffer pool */
{
	buf_block_t*	block;
	ulint		index;
	static ulint	buf_pool_index;

	if (buf_pool == NULL) {
		/* We are allocating memory from any buffer pool; ensure
		we spread the load across all buffer pool instances. */
		index = buf_pool_index++ % srv_buf_pool_instances;
		buf_pool = buf_pool_from_array(index);
	}

	block = buf_LRU_get_free_block(buf_pool);

	buf_block_set_state(block, BUF_BLOCK_MEMORY);

	return(block);
}
#endif /* !UNIV_HOTBACKUP */

/********************************************************************//**
Checks if a page is all zeroes.
@return	TRUE if the page is all zeroes */
bool
buf_page_is_zeroes(
/*===============*/
	const byte*	read_buf,	/*!< in: a database page */
	const ulint	zip_size)	/*!< in: size of compressed page;
					0 for uncompressed pages */
{
	const ulint page_size = zip_size ? zip_size : UNIV_PAGE_SIZE;

	for (ulint i = 0; i < page_size; i++) {
		if (read_buf[i] != 0) {
			return(false);
		}
	}
	return(true);
}

/** Checks if the page is in crc32 checksum format.
@param[in]	read_buf	database page
@param[in]	checksum_field1	new checksum field
@param[in]	checksum_field2	old checksum field
@return true if the page is in crc32 checksum format */
UNIV_INLINE
bool
buf_page_is_checksum_valid_crc32(
	const byte*	read_buf,
	ulint		checksum_field1,
	ulint		checksum_field2)
{
	ib_uint32_t	crc32 = buf_calc_page_crc32(read_buf);

	return(checksum_field1 == crc32 && checksum_field2 == crc32);
}

/** Checks if the page is in innodb checksum format.
@param[in]	read_buf	database page
@param[in]	checksum_field1	new checksum field
@param[in]	checksum_field2	old checksum field
@return true if the page is in innodb checksum format */
UNIV_INLINE
bool
buf_page_is_checksum_valid_innodb(
	const byte*	read_buf,
	ulint		checksum_field1,
	ulint		checksum_field2)
{
	/* There are 2 valid formulas for
	checksum_field2 (old checksum field) which algo=innodb could have
	written to the page:

	1. Very old versions of InnoDB only stored the 8-byte LSN at the
	start and the end of the page.

	2. Newer InnoDB versions store the old formula checksum
	(buf_calc_page_old_checksum()). */

	if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
	    && checksum_field2 != buf_calc_page_old_checksum(read_buf)) {
		return(false);
	}

	/* old field is fine, check the new field */

	/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
	(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */

	if (checksum_field1 != 0
	    && checksum_field1 != buf_calc_page_new_checksum(read_buf)) {
		return(false);
	}

	return(true);
}

/** Checks if the page is in the 'none' checksum format.
@param[in]	read_buf	database page
@param[in]	checksum_field1	new checksum field
@param[in]	checksum_field2	old checksum field
@return true if the page is in the 'none' checksum format */
UNIV_INLINE
bool
buf_page_is_checksum_valid_none(
	const byte*	read_buf,
	ulint		checksum_field1,
	ulint		checksum_field2)
{
	return(checksum_field1 == checksum_field2
	       && checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
}

/********************************************************************//**
Checks if a page is corrupt.
@return	TRUE if corrupted */
UNIV_INTERN
ibool
buf_page_is_corrupted(
/*==================*/
	bool		check_lsn,	/*!< in: true if we need to check
					and complain about the LSN */
	const byte*	read_buf,	/*!< in: a database page */
	ulint		zip_size)	/*!< in: size of compressed page;
					0 for uncompressed pages */
{
	ulint		checksum_field1;
	ulint		checksum_field2;

	if (!zip_size
	    && memcmp(read_buf + FIL_PAGE_LSN + 4,
		      read_buf + UNIV_PAGE_SIZE
		      - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {

		/* Stored log sequence numbers at the start and the end
		of page do not match */

		return(TRUE);
	}

#ifndef UNIV_HOTBACKUP
	if (check_lsn && recv_lsn_checks_on) {
		lsn_t	current_lsn;

		/* Since we are going to reset the page LSN during the import
		phase it makes no sense to spam the log with error messages. */

		if (log_peek_lsn(&current_lsn)
		    && current_lsn
		    < mach_read_from_8(read_buf + FIL_PAGE_LSN)) {
			ut_print_timestamp(stderr);

			fprintf(stderr,
				" InnoDB: Error: page %lu log sequence number"
				" " LSN_PF "\n"
				"InnoDB: is in the future! Current system "
				"log sequence number " LSN_PF ".\n"
				"InnoDB: Your database may be corrupt or "
				"you may have copied the InnoDB\n"
				"InnoDB: tablespace but not the InnoDB "
				"log files. See\n"
				"InnoDB: " REFMAN
				"forcing-innodb-recovery.html\n"
				"InnoDB: for more information.\n",
				(ulong) mach_read_from_4(
					read_buf + FIL_PAGE_OFFSET),
				(lsn_t) mach_read_from_8(
					read_buf + FIL_PAGE_LSN),
				current_lsn);
		}
	}
#endif

	/* Check whether the checksum fields have correct values */

	if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) {
		return(FALSE);
	}

	if (zip_size) {
		return(!page_zip_verify_checksum(read_buf, zip_size));
	}

	checksum_field1 = mach_read_from_4(
		read_buf + FIL_PAGE_SPACE_OR_CHKSUM);

	checksum_field2 = mach_read_from_4(
		read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM);

#if FIL_PAGE_LSN % 8
#error "FIL_PAGE_LSN must be 64 bit aligned"
#endif

	/* declare empty pages non-corrupted */
	if (checksum_field1 == 0 && checksum_field2 == 0
	    && *reinterpret_cast<const ib_uint64_t*>(read_buf +
						     FIL_PAGE_LSN) == 0) {
		/* make sure that the page is really empty */
		for (ulint i = 0; i < UNIV_PAGE_SIZE; i++) {
			if (read_buf[i] != 0) {
				return(TRUE);
			}
		}

		return(FALSE);
	}

	DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); );

	ulint	page_no = mach_read_from_4(read_buf + FIL_PAGE_OFFSET);
	ulint	space_id = mach_read_from_4(read_buf + FIL_PAGE_SPACE_ID);
	const srv_checksum_algorithm_t	curr_algo =
		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);

	switch (curr_algo) {
	case SRV_CHECKSUM_ALGORITHM_CRC32:
	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:

		if (buf_page_is_checksum_valid_crc32(read_buf,
			checksum_field1, checksum_field2)) {
			return(FALSE);
		}

		if (buf_page_is_checksum_valid_none(read_buf,
			checksum_field1, checksum_field2)) {
			if (curr_algo
			    == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
				page_warn_strict_checksum(
					curr_algo,
					SRV_CHECKSUM_ALGORITHM_NONE,
					space_id, page_no);
			}

			return(FALSE);
		}

		if (buf_page_is_checksum_valid_innodb(read_buf,
			checksum_field1, checksum_field2)) {
			if (curr_algo
			    == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
				page_warn_strict_checksum(
					curr_algo,
					SRV_CHECKSUM_ALGORITHM_INNODB,
					space_id, page_no);
			}

			return(FALSE);
		}

		return(TRUE);

	case SRV_CHECKSUM_ALGORITHM_INNODB:
	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:

		if (buf_page_is_checksum_valid_innodb(read_buf,
			checksum_field1, checksum_field2)) {
			return(FALSE);
		}

		if (buf_page_is_checksum_valid_none(read_buf,
			checksum_field1, checksum_field2)) {
			if (curr_algo
			    == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
				page_warn_strict_checksum(
					curr_algo,
					SRV_CHECKSUM_ALGORITHM_NONE,
					space_id, page_no);
			}

			return(FALSE);
		}

		if (buf_page_is_checksum_valid_crc32(read_buf,
			checksum_field1, checksum_field2)) {
			if (curr_algo
			    == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
				page_warn_strict_checksum(
					curr_algo,
					SRV_CHECKSUM_ALGORITHM_CRC32,
					space_id, page_no);
			}

			return(FALSE);
		}

		return(TRUE);

	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:

		if (buf_page_is_checksum_valid_none(read_buf,
			checksum_field1, checksum_field2)) {
			return(FALSE);
		}

		if (buf_page_is_checksum_valid_crc32(read_buf,
			checksum_field1, checksum_field2)) {
			page_warn_strict_checksum(
				curr_algo,
				SRV_CHECKSUM_ALGORITHM_CRC32,
				space_id, page_no);
			return(FALSE);
		}

		if (buf_page_is_checksum_valid_innodb(read_buf,
			checksum_field1, checksum_field2)) {
			page_warn_strict_checksum(
				curr_algo,
				SRV_CHECKSUM_ALGORITHM_INNODB,
				space_id, page_no);
			return(FALSE);
		}

		return(TRUE);

	case SRV_CHECKSUM_ALGORITHM_NONE:
		/* should have returned FALSE earlier */
		break;
	/* no default so the compiler will emit a warning if new enum
	is added and not handled here */
	}

	ut_error;
	return(FALSE);
}

/********************************************************************//**
Prints a page to stderr. */
UNIV_INTERN
void
buf_page_print(
/*===========*/
	const byte*	read_buf,	/*!< in: a database page */
	ulint		zip_size,	/*!< in: compressed page size, or
					0 for uncompressed pages */
	ulint		flags)		/*!< in: 0 or
					BUF_PAGE_PRINT_NO_CRASH or
					BUF_PAGE_PRINT_NO_FULL */

{
#ifndef UNIV_HOTBACKUP
	dict_index_t*	index;
#endif /* !UNIV_HOTBACKUP */
	ulint		size = zip_size;

	if (!size) {
		size = UNIV_PAGE_SIZE;
	}

	if (!(flags & BUF_PAGE_PRINT_NO_FULL)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: Page dump in ascii and hex (%lu bytes):\n",
			(ulong) size);
		ut_print_buf(stderr, read_buf, size);
		fputs("\nInnoDB: End of page dump\n", stderr);
	}

	if (zip_size) {
		/* Print compressed page. */
		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: Compressed page type (" ULINTPF "); "
			"stored checksum in field1 " ULINTPF "; "
			"calculated checksums for field1: "
			"%s " ULINTPF ", "
			"%s " ULINTPF ", "
			"%s " ULINTPF "; "
			"page LSN " LSN_PF "; "
			"page number (if stored to page already) " ULINTPF "; "
			"space id (if stored to page already) " ULINTPF "\n",
			fil_page_get_type(read_buf),
			mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
			buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_CRC32),
			page_zip_calc_checksum(read_buf, zip_size,
				SRV_CHECKSUM_ALGORITHM_CRC32),
			buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_INNODB),
			page_zip_calc_checksum(read_buf, zip_size,
				SRV_CHECKSUM_ALGORITHM_INNODB),
			buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_NONE),
			page_zip_calc_checksum(read_buf, zip_size,
				SRV_CHECKSUM_ALGORITHM_NONE),
			mach_read_from_8(read_buf + FIL_PAGE_LSN),
			mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
			mach_read_from_4(read_buf
					 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
	} else {
		ut_print_timestamp(stderr);
		fprintf(stderr, " InnoDB: uncompressed page, "
			"stored checksum in field1 " ULINTPF ", "
			"calculated checksums for field1: "
			"%s " UINT32PF ", "
			"%s " ULINTPF ", "
			"%s " ULINTPF ", "

			"stored checksum in field2 " ULINTPF ", "
			"calculated checksums for field2: "
			"%s " UINT32PF ", "
			"%s " ULINTPF ", "
			"%s " ULINTPF ", "

			"page LSN " ULINTPF " " ULINTPF ", "
			"low 4 bytes of LSN at page end " ULINTPF ", "
			"page number (if stored to page already) " ULINTPF ", "
			"space id (if created with >= MySQL-4.1.1 "
			"and stored already) %lu\n",
			mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
			buf_calc_page_crc32(read_buf),
			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
			buf_calc_page_new_checksum(read_buf),
			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
			BUF_NO_CHECKSUM_MAGIC,

			mach_read_from_4(read_buf + UNIV_PAGE_SIZE
					 - FIL_PAGE_END_LSN_OLD_CHKSUM),
			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
			buf_calc_page_crc32(read_buf),
			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
			buf_calc_page_old_checksum(read_buf),
			buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
			BUF_NO_CHECKSUM_MAGIC,

			mach_read_from_4(read_buf + FIL_PAGE_LSN),
			mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
			mach_read_from_4(read_buf + UNIV_PAGE_SIZE
					 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
			mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
			mach_read_from_4(read_buf
					 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
	}

#ifndef UNIV_HOTBACKUP
	if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
	    == TRX_UNDO_INSERT) {
		fprintf(stderr,
			"InnoDB: Page may be an insert undo log page\n");
	} else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR
				    + TRX_UNDO_PAGE_TYPE)
		   == TRX_UNDO_UPDATE) {
		fprintf(stderr,
			"InnoDB: Page may be an update undo log page\n");
	}
#endif /* !UNIV_HOTBACKUP */

	switch (fil_page_get_type(read_buf)) {
		index_id_t	index_id;
	case FIL_PAGE_INDEX:
		index_id = btr_page_get_index_id(read_buf);
		fprintf(stderr,
			"InnoDB: Page may be an index page where"
			" index id is %llu\n",
			(ullint) index_id);
#ifndef UNIV_HOTBACKUP
		index = dict_index_find_on_id_low(index_id);
		if (index) {
			fputs("InnoDB: (", stderr);
			dict_index_name_print(stderr, NULL, index);
			fputs(")\n", stderr);
		}
#endif /* !UNIV_HOTBACKUP */
		break;
	case FIL_PAGE_INODE:
		fputs("InnoDB: Page may be an 'inode' page\n", stderr);
		break;
	case FIL_PAGE_IBUF_FREE_LIST:
		fputs("InnoDB: Page may be an insert buffer free list page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_ALLOCATED:
		fputs("InnoDB: Page may be a freshly allocated page\n",
		      stderr);
		break;
	case FIL_PAGE_IBUF_BITMAP:
		fputs("InnoDB: Page may be an insert buffer bitmap page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_SYS:
		fputs("InnoDB: Page may be a system page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_TRX_SYS:
		fputs("InnoDB: Page may be a transaction system page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_FSP_HDR:
		fputs("InnoDB: Page may be a file space header page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_XDES:
		fputs("InnoDB: Page may be an extent descriptor page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_BLOB:
		fputs("InnoDB: Page may be a BLOB page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_ZBLOB:
	case FIL_PAGE_TYPE_ZBLOB2:
		fputs("InnoDB: Page may be a compressed BLOB page\n",
		      stderr);
		break;
	}

	ut_ad(flags & BUF_PAGE_PRINT_NO_CRASH);
}

#ifndef UNIV_HOTBACKUP

# ifdef PFS_GROUP_BUFFER_SYNC
/********************************************************************//**
This function registers mutexes and rwlocks in buffer blocks with
performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is
defined to be a value less than chunk->size, then only mutexes
and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER
blocks are registered. */
static
void
pfs_register_buffer_block(
/*======================*/
	buf_chunk_t*	chunk)		/*!< in/out: chunk of buffers */
{
	ulint		i;
	ulint		num_to_register;
	buf_block_t*	block;

	block = chunk->blocks;

	num_to_register = ut_min(chunk->size,
				 PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);

	for (i = 0; i < num_to_register; i++) {
		ib_mutex_t*	mutex;
		rw_lock_t*	rwlock;

#  ifdef UNIV_PFS_MUTEX
		mutex = &block->mutex;
		ut_a(!mutex->pfs_psi);
		mutex->pfs_psi = (PSI_server)
			? PSI_server->init_mutex(buffer_block_mutex_key, mutex)
			: NULL;
#  endif /* UNIV_PFS_MUTEX */

#  ifdef UNIV_PFS_RWLOCK
		rwlock = &block->lock;
		ut_a(!rwlock->pfs_psi);
		rwlock->pfs_psi = (PSI_server)
			? PSI_server->init_rwlock(buf_block_lock_key, rwlock)
			: NULL;

#   ifdef UNIV_SYNC_DEBUG
		rwlock = &block->debug_latch;
		ut_a(!rwlock->pfs_psi);
		rwlock->pfs_psi = (PSI_server)
			? PSI_server->init_rwlock(buf_block_debug_latch_key,
						  rwlock)
			: NULL;
#   endif /* UNIV_SYNC_DEBUG */

#  endif /* UNIV_PFS_RWLOCK */
		block++;
	}
}
# endif /* PFS_GROUP_BUFFER_SYNC */

/********************************************************************//**
Initializes a buffer control block when the buf_pool is created. */
static
void
buf_block_init(
/*===========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_block_t*	block,		/*!< in: pointer to control block */
	byte*		frame)		/*!< in: pointer to buffer frame */
{
	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);

	block->frame = frame;

	block->page.buf_pool_index = buf_pool_index(buf_pool);
	block->page.state = BUF_BLOCK_NOT_USED;
	block->page.buf_fix_count = 0;
	block->page.io_fix = BUF_IO_NONE;

	block->modify_clock = 0;

#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
	block->page.file_page_was_freed = FALSE;
#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */

	block->check_index_page_at_flush = FALSE;
	block->index = NULL;

#ifdef UNIV_DEBUG
	block->page.in_page_hash = FALSE;
	block->page.in_zip_hash = FALSE;
	block->page.in_flush_list = FALSE;
	block->page.in_free_list = FALSE;
	block->page.in_LRU_list = FALSE;
	block->in_unzip_LRU_list = FALSE;
#endif /* UNIV_DEBUG */
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
	block->n_pointers = 0;
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
	page_zip_des_init(&block->page.zip);

#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
	/* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
	of buffer block mutex/rwlock with performance schema. If
	PFS_GROUP_BUFFER_SYNC is defined, skip the registration
	since buffer block mutex/rwlock will be registered later in
	pfs_register_buffer_block() */

	mutex_create(PFS_NOT_INSTRUMENTED, &block->mutex, SYNC_BUF_BLOCK);
	rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);

# ifdef UNIV_SYNC_DEBUG
	rw_lock_create(PFS_NOT_INSTRUMENTED,
		       &block->debug_latch, SYNC_NO_ORDER_CHECK);
# endif /* UNIV_SYNC_DEBUG */

#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
	mutex_create(buffer_block_mutex_key, &block->mutex, SYNC_BUF_BLOCK);
	rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);

# ifdef UNIV_SYNC_DEBUG
	rw_lock_create(buf_block_debug_latch_key,
		       &block->debug_latch, SYNC_NO_ORDER_CHECK);
# endif /* UNIV_SYNC_DEBUG */
#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */

	ut_ad(rw_lock_validate(&(block->lock)));
}

/********************************************************************//**
Allocates a chunk of buffer frames.
@return	chunk, or NULL on failure */
static
buf_chunk_t*
buf_chunk_init(
/*===========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_chunk_t*	chunk,		/*!< out: chunk of buffers */
	ulint		mem_size)	/*!< in: requested size in bytes */
{
	buf_block_t*	block;
	byte*		frame;
	ulint		i;

	/* Round down to a multiple of page size,
	although it already should be. */
	mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
	/* Reserve space for the block descriptors. */
	mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
				  + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);

	chunk->mem_size = mem_size;
	chunk->mem = os_mem_alloc_large(&chunk->mem_size);

	if (UNIV_UNLIKELY(chunk->mem == NULL)) {

		return(NULL);
	}

#ifdef HAVE_LIBNUMA
	if (srv_numa_interleave) {
		int	st = mbind(chunk->mem, chunk->mem_size,
				   MPOL_INTERLEAVE,
				   numa_all_nodes_ptr->maskp,
				   numa_all_nodes_ptr->size,
				   MPOL_MF_MOVE);
		if (st != 0) {
			ib_logf(IB_LOG_LEVEL_WARN,
				"Failed to set NUMA memory policy of buffer"
				" pool page frames to MPOL_INTERLEAVE"
				" (error: %s).", strerror(errno));
		}
	}
#endif // HAVE_LIBNUMA

	/* Allocate the block descriptors from
	the start of the memory block. */
	chunk->blocks = (buf_block_t*) chunk->mem;

	/* Align a pointer to the first frame.  Note that when
	os_large_page_size is smaller than UNIV_PAGE_SIZE,
	we may allocate one fewer block than requested.  When
	it is bigger, we may allocate more blocks than requested. */

	frame = (byte*) ut_align(chunk->mem, UNIV_PAGE_SIZE);
	chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
		- (frame != chunk->mem);

	/* Subtract the space needed for block descriptors. */
	{
		ulint	size = chunk->size;

		while (frame < (byte*) (chunk->blocks + size)) {
			frame += UNIV_PAGE_SIZE;
			size--;
		}

		chunk->size = size;
	}

	/* Init block structs and assign frames for them. Then we
	assign the frames to the first blocks (we already mapped the
	memory above). */

	block = chunk->blocks;

	for (i = chunk->size; i--; ) {

		buf_block_init(buf_pool, block, frame);
		UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);

		/* Add the block to the free list */
		UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));

		ut_d(block->page.in_free_list = TRUE);
		ut_ad(buf_pool_from_block(block) == buf_pool);

		block++;
		frame += UNIV_PAGE_SIZE;
	}

#ifdef PFS_GROUP_BUFFER_SYNC
	pfs_register_buffer_block(chunk);
#endif
	return(chunk);
}

#ifdef UNIV_DEBUG
/*********************************************************************//**
Finds a block in the given buffer chunk that points to a
given compressed page.
@return	buffer block pointing to the compressed page, or NULL */
static
buf_block_t*
buf_chunk_contains_zip(
/*===================*/
	buf_chunk_t*	chunk,	/*!< in: chunk being checked */
	const void*	data)	/*!< in: pointer to compressed page */
{
	buf_block_t*	block;
	ulint		i;

	block = chunk->blocks;

	for (i = chunk->size; i--; block++) {
		if (block->page.zip.data == data) {

			return(block);
		}
	}

	return(NULL);
}

/*********************************************************************//**
Finds a block in the buffer pool that points to a
given compressed page.
@return	buffer block pointing to the compressed page, or NULL */
UNIV_INTERN
buf_block_t*
buf_pool_contains_zip(
/*==================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	const void*	data)		/*!< in: pointer to compressed page */
{
	ulint		n;
	buf_chunk_t*	chunk = buf_pool->chunks;

	ut_ad(buf_pool);
	ut_ad(buf_pool_mutex_own(buf_pool));
	for (n = buf_pool->n_chunks; n--; chunk++) {

		buf_block_t* block = buf_chunk_contains_zip(chunk, data);

		if (block) {
			return(block);
		}
	}

	return(NULL);
}
#endif /* UNIV_DEBUG */

/*********************************************************************//**
Checks that all file pages in the buffer chunk are in a replaceable state.
@return	address of a non-free block, or NULL if all freed */
static
const buf_block_t*
buf_chunk_not_freed(
/*================*/
	buf_chunk_t*	chunk)	/*!< in: chunk being checked */
{
	buf_block_t*	block;
	ulint		i;

	block = chunk->blocks;

	for (i = chunk->size; i--; block++) {
		ibool	ready;

		switch (buf_block_get_state(block)) {
		case BUF_BLOCK_POOL_WATCH:
		case BUF_BLOCK_ZIP_PAGE:
		case BUF_BLOCK_ZIP_DIRTY:
			/* The uncompressed buffer pool should never
			contain compressed block descriptors. */
			ut_error;
			break;
		case BUF_BLOCK_NOT_USED:
		case BUF_BLOCK_READY_FOR_USE:
		case BUF_BLOCK_MEMORY:
		case BUF_BLOCK_REMOVE_HASH:
			/* Skip blocks that are not being used for
			file pages. */
			break;
		case BUF_BLOCK_FILE_PAGE:
			mutex_enter(&block->mutex);
			ready = buf_flush_ready_for_replace(&block->page);
			mutex_exit(&block->mutex);

			if (!ready) {

				return(block);
			}

			break;
		}
	}

	return(NULL);
}

/********************************************************************//**
Set buffer pool size variables after resizing it. */
static
void
buf_pool_set_sizes(void)
/*====================*/
{
	ulint	i;
	ulint	curr_size = 0;

	buf_pool_mutex_enter_all();

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);
		curr_size += buf_pool->curr_pool_size;
	}

	srv_buf_pool_curr_size = curr_size;
	srv_buf_pool_old_size = srv_buf_pool_size;

	buf_pool_mutex_exit_all();
}

/********************************************************************//**
Initialize a buffer pool instance.
@return DB_SUCCESS if all goes well. */
UNIV_INTERN
ulint
buf_pool_init_instance(
/*===================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		buf_pool_size,	/*!< in: size in bytes */
	ulint		instance_no)	/*!< in: id of the instance */
{
	ulint		i;
	buf_chunk_t*	chunk;

	/* 1. Initialize general fields
	------------------------------- */
	mutex_create(buf_pool_mutex_key,
		     &buf_pool->mutex, SYNC_BUF_POOL);
	mutex_create(buf_pool_zip_mutex_key,
		     &buf_pool->zip_mutex, SYNC_BUF_BLOCK);

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool_size > 0) {
		buf_pool->n_chunks = 1;

		buf_pool->chunks = chunk =
			(buf_chunk_t*) mem_zalloc(sizeof *chunk);

		UT_LIST_INIT(buf_pool->free);

		if (!buf_chunk_init(buf_pool, chunk, buf_pool_size)) {
			mem_free(chunk);
			mem_free(buf_pool);

			buf_pool_mutex_exit(buf_pool);

			return(DB_ERROR);
		}

		buf_pool->instance_no = instance_no;
		buf_pool->old_pool_size = buf_pool_size;
		buf_pool->curr_size = chunk->size;
		buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;

		/* Number of locks protecting page_hash must be a
		power of two */
		srv_n_page_hash_locks = static_cast<ulong>(
				 ut_2_power_up(srv_n_page_hash_locks));
		ut_a(srv_n_page_hash_locks != 0);
		ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);

		buf_pool->page_hash = ib_create(2 * buf_pool->curr_size,
						srv_n_page_hash_locks,
						MEM_HEAP_FOR_PAGE_HASH,
						SYNC_BUF_PAGE_HASH);

		buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);

		buf_pool->last_printout_time = ut_time();
	}
	/* 2. Initialize flushing fields
	-------------------------------- */

	mutex_create(flush_list_mutex_key, &buf_pool->flush_list_mutex,
		     SYNC_BUF_FLUSH_LIST);

	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
		buf_pool->no_flush[i] = os_event_create();
	}

	buf_pool->watch = (buf_page_t*) mem_zalloc(
		sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);

	/* All fields are initialized by mem_zalloc(). */

	buf_pool->try_LRU_scan = TRUE;

	buf_pool_mutex_exit(buf_pool);

	return(DB_SUCCESS);
}

/********************************************************************//**
Free one buffer pool instance. */
static
void
buf_pool_free_instance(
/*===================*/
	buf_pool_t*	buf_pool)	/* in,own: buffer pool instance
					to free */
{
	buf_chunk_t*	chunk;
	buf_chunk_t*	chunks;
	buf_page_t*	bpage;

	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
	while (bpage != NULL) {
		buf_page_t*	prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
		enum buf_page_state	state = buf_page_get_state(bpage);

		ut_ad(buf_page_in_file(bpage));
		ut_ad(bpage->in_LRU_list);

		if (state != BUF_BLOCK_FILE_PAGE) {
			/* We must not have any dirty block except
			when doing a fast shutdown. */
			ut_ad(state == BUF_BLOCK_ZIP_PAGE
			      || srv_fast_shutdown == 2);
			buf_page_free_descriptor(bpage);
		}

		bpage = prev_bpage;
	}

	mem_free(buf_pool->watch);
	buf_pool->watch = NULL;

	chunks = buf_pool->chunks;
	chunk = chunks + buf_pool->n_chunks;

	while (--chunk >= chunks) {
		os_mem_free_large(chunk->mem, chunk->mem_size);
	}

	mem_free(buf_pool->chunks);
	ha_clear(buf_pool->page_hash);
	hash_table_free(buf_pool->page_hash);
	hash_table_free(buf_pool->zip_hash);
}

/********************************************************************//**
Creates the buffer pool.
@return	DB_SUCCESS if success, DB_ERROR if not enough memory or error */
UNIV_INTERN
dberr_t
buf_pool_init(
/*==========*/
	ulint	total_size,	/*!< in: size of the total pool in bytes */
	ulint	n_instances)	/*!< in: number of instances */
{
	ulint		i;
	const ulint	size	= total_size / n_instances;

	ut_ad(n_instances > 0);
	ut_ad(n_instances <= MAX_BUFFER_POOLS);
	ut_ad(n_instances == srv_buf_pool_instances);

#ifdef HAVE_LIBNUMA
	if (srv_numa_interleave) {
		ib_logf(IB_LOG_LEVEL_INFO,
			"Setting NUMA memory policy to MPOL_INTERLEAVE");
		if (set_mempolicy(MPOL_INTERLEAVE,
				  numa_all_nodes_ptr->maskp,
				  numa_all_nodes_ptr->size) != 0) {
			ib_logf(IB_LOG_LEVEL_WARN,
				"Failed to set NUMA memory policy to"
				" MPOL_INTERLEAVE (error: %s).",
				strerror(errno));
		}
	}
#endif // HAVE_LIBNUMA

	buf_pool_ptr = (buf_pool_t*) mem_zalloc(
		n_instances * sizeof *buf_pool_ptr);

	for (i = 0; i < n_instances; i++) {
		buf_pool_t*	ptr	= &buf_pool_ptr[i];

		if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {

			/* Free all the instances created so far. */
			buf_pool_free(i);

			return(DB_ERROR);
		}
	}

	buf_pool_set_sizes();
	buf_LRU_old_ratio_update(100 * 3 / 8, FALSE);

	btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);

#ifdef HAVE_LIBNUMA
	if (srv_numa_interleave) {
		ib_logf(IB_LOG_LEVEL_INFO,
			"Setting NUMA memory policy to MPOL_DEFAULT");
		if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
			ib_logf(IB_LOG_LEVEL_WARN,
				"Failed to set NUMA memory policy to"
				" MPOL_DEFAULT (error: %s).", strerror(errno));
		}
	}
#endif // HAVE_LIBNUMA

	return(DB_SUCCESS);
}

/********************************************************************//**
Frees the buffer pool at shutdown.  This must not be invoked before
freeing all mutexes. */
UNIV_INTERN
void
buf_pool_free(
/*==========*/
	ulint	n_instances)	/*!< in: number of instances to free */
{
	ulint	i;

	for (i = 0; i < n_instances; i++) {
		buf_pool_free_instance(buf_pool_from_array(i));
	}

	mem_free(buf_pool_ptr);
	buf_pool_ptr = NULL;
}
1541 
1542 /********************************************************************//**
1543 Clears the adaptive hash index on all pages in the buffer pool. */
1544 UNIV_INTERN
1545 void
buf_pool_clear_hash_index(void)1546 buf_pool_clear_hash_index(void)
1547 /*===========================*/
1548 {
1549 	ulint	p;
1550 
1551 #ifdef UNIV_SYNC_DEBUG
1552 	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
1553 #endif /* UNIV_SYNC_DEBUG */
1554 	ut_ad(!btr_search_enabled);
1555 
1556 	for (p = 0; p < srv_buf_pool_instances; p++) {
1557 		buf_pool_t*	buf_pool = buf_pool_from_array(p);
1558 		buf_chunk_t*	chunks	= buf_pool->chunks;
1559 		buf_chunk_t*	chunk	= chunks + buf_pool->n_chunks;
1560 
1561 		while (--chunk >= chunks) {
1562 			buf_block_t*	block	= chunk->blocks;
1563 			ulint		i	= chunk->size;
1564 
1565 			for (; i--; block++) {
1566 				dict_index_t*	index	= block->index;
1567 
1568 				/* We can set block->index = NULL
1569 				when we have an x-latch on btr_search_latch;
1570 				see the comment in buf0buf.h */
1571 
1572 				if (!index) {
1573 					/* Not hashed */
1574 					continue;
1575 				}
1576 
1577 				block->index = NULL;
1578 # if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
1579 				block->n_pointers = 0;
1580 # endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
1581 			}
1582 		}
1583 	}
1584 }
1585 
1586 /********************************************************************//**
1587 Relocate a buffer control block.  Relocates the block on the LRU list
1588 and in buf_pool->page_hash.  Does not relocate bpage->list.
1589 The caller must take care of relocating bpage->list. */
1590 UNIV_INTERN
1591 void
buf_relocate(buf_page_t * bpage,buf_page_t * dpage)1592 buf_relocate(
1593 /*=========*/
1594 	buf_page_t*	bpage,	/*!< in/out: control block being relocated;
1595 				buf_page_get_state(bpage) must be
1596 				BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
1597 	buf_page_t*	dpage)	/*!< in/out: destination control block */
1598 {
1599 	buf_page_t*	b;
1600 	ulint		fold;
1601 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1602 
1603 	fold = buf_page_address_fold(bpage->space, bpage->offset);
1604 
1605 	ut_ad(buf_pool_mutex_own(buf_pool));
1606 	ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
1607 	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
1608 	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
1609 	ut_a(bpage->buf_fix_count == 0);
1610 	ut_ad(bpage->in_LRU_list);
1611 	ut_ad(!bpage->in_zip_hash);
1612 	ut_ad(bpage->in_page_hash);
1613 	ut_ad(bpage == buf_page_hash_get_low(buf_pool,
1614 					     bpage->space,
1615 					     bpage->offset,
1616 					     fold));
1617 
1618 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
1619 #ifdef UNIV_DEBUG
1620 	switch (buf_page_get_state(bpage)) {
1621 	case BUF_BLOCK_POOL_WATCH:
1622 	case BUF_BLOCK_NOT_USED:
1623 	case BUF_BLOCK_READY_FOR_USE:
1624 	case BUF_BLOCK_FILE_PAGE:
1625 	case BUF_BLOCK_MEMORY:
1626 	case BUF_BLOCK_REMOVE_HASH:
1627 		ut_error;
1628 	case BUF_BLOCK_ZIP_DIRTY:
1629 	case BUF_BLOCK_ZIP_PAGE:
1630 		break;
1631 	}
1632 #endif /* UNIV_DEBUG */
1633 
1634 	memcpy(dpage, bpage, sizeof *dpage);
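	/* The struct copy above duplicates all fields of bpage,
	including its list nodes; the LRU and page_hash links are
	repaired below, while relocating bpage->list is left to the
	caller, as stated in the function comment. */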
1635 
1636 	ut_d(bpage->in_LRU_list = FALSE);
1637 	ut_d(bpage->in_page_hash = FALSE);
1638 
1639 	/* relocate buf_pool->LRU */
1640 	b = UT_LIST_GET_PREV(LRU, bpage);
1641 	UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
1642 
1643 	if (b) {
1644 		UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, b, dpage);
1645 	} else {
1646 		UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, dpage);
1647 	}
1648 
1649 	if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
1650 		buf_pool->LRU_old = dpage;
1651 #ifdef UNIV_LRU_DEBUG
1652 		/* buf_pool->LRU_old must be the first item in the LRU list
1653 		whose "old" flag is set. */
1654 		ut_a(buf_pool->LRU_old->old);
1655 		ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
1656 		     || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
1657 		ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
1658 		     || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
1659 	} else {
1660 		/* Check that the "old" flag is consistent in
1661 		the block and its neighbours. */
1662 		buf_page_set_old(dpage, buf_page_is_old(dpage));
1663 #endif /* UNIV_LRU_DEBUG */
1664 	}
1665 
1666 	ut_d(UT_LIST_VALIDATE(
1667 		LRU, buf_page_t, buf_pool->LRU, CheckInLRUList()));
1668 
1669 	/* relocate buf_pool->page_hash */
1670 	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
1671 	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
1672 }
1673 
1674 /********************************************************************//**
1675 Determine if a block is a sentinel for a buffer pool watch.
1676 @return	TRUE if a sentinel for a buffer pool watch, FALSE if not */
1677 UNIV_INTERN
1678 ibool
1679 buf_pool_watch_is_sentinel(
1680 /*=======================*/
1681 	buf_pool_t*		buf_pool,	/*!< buffer pool instance */
1682 	const buf_page_t*	bpage)		/*!< in: block */
1683 {
1684 	/* We must also own the appropriate hash lock. */
1685 	ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
1686 	ut_ad(buf_page_in_file(bpage));
1687 
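	/* Watch sentinels live in the fixed array buf_pool->watch[];
	a simple address range check therefore identifies them. */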
1688 	if (bpage < &buf_pool->watch[0]
1689 	    || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {
1690 
1691 		ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
1692 		      || bpage->zip.data != NULL);
1693 
1694 		return(FALSE);
1695 	}
1696 
1697 	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
1698 	ut_ad(!bpage->in_zip_hash);
1699 	ut_ad(bpage->in_page_hash);
1700 	ut_ad(bpage->zip.data == NULL);
1701 	ut_ad(bpage->buf_fix_count > 0);
1702 	return(TRUE);
1703 }
1704 
1705 /****************************************************************//**
1706 Add watch for the given page to be read in. Caller must hold the
1707 appropriate hash_lock for the page. This function may release the
1708 hash_lock and reacquire it.
1709 @return NULL if watch set, block if the page is in the buffer pool */
1710 UNIV_INTERN
1711 buf_page_t*
1712 buf_pool_watch_set(
1713 /*===============*/
1714 	ulint	space,	/*!< in: space id */
1715 	ulint	offset,	/*!< in: page number */
1716 	ulint	fold)	/*!< in: buf_page_address_fold(space, offset) */
1717 {
1718 	buf_page_t*	bpage;
1719 	ulint		i;
1720 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
1721 	rw_lock_t*	hash_lock;
1722 
1723 	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
1724 
1725 #ifdef UNIV_SYNC_DEBUG
1726 	ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
1727 #endif /* UNIV_SYNC_DEBUG */
1728 
1729 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
1730 
1731 	if (bpage != NULL) {
1732 page_found:
1733 		if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
1734 			/* The page was loaded meanwhile. */
1735 			return(bpage);
1736 		}
1737 
1738 		/* Add to an existing watch. */
1739 #ifdef PAGE_ATOMIC_REF_COUNT
1740 		os_atomic_increment_uint32(&bpage->buf_fix_count, 1);
1741 #else
1742 		++bpage->buf_fix_count;
1743 #endif /* PAGE_ATOMIC_REF_COUNT */
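
		/* Returning NULL reports that the watch is set; here we
		merely added our reference to an existing sentinel. */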
1744 		return(NULL);
1745 	}
1746 
1747 	/* From this point on, this function becomes fairly heavy in terms
1748 	of latching. We acquire the buf_pool mutex as well as all the
1749 	hash_locks. buf_pool mutex is needed because any changes to
1750 	the page_hash must be covered by it and hash_locks are needed
1751 	because we don't want to read any stale information in
1752 	buf_pool->watch[]. However, it is not in the critical code path
1753 	as this function will be called only by the purge thread. */
1754 
1755 
1756 	/* To obey latching order first release the hash_lock. */
1757 	rw_lock_x_unlock(hash_lock);
1758 
1759 	buf_pool_mutex_enter(buf_pool);
1760 	hash_lock_x_all(buf_pool->page_hash);
1761 
1762 	/* We have to recheck that the page was not
1763 	loaded or a watch set by some other purge
1764 	thread. This is because of the small time
1765 	window between releasing the hash_lock above
1766 	and acquiring the buf_pool mutex. */
1767 
1768 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
1769 	if (UNIV_LIKELY_NULL(bpage)) {
1770 		buf_pool_mutex_exit(buf_pool);
1771 		hash_unlock_x_all_but(buf_pool->page_hash, hash_lock);
1772 		goto page_found;
1773 	}
1774 
1775 	/* The maximum number of purge threads should never exceed
1776 	BUF_POOL_WATCH_SIZE. So there is no way for a purge thread
1777 	instance to hold a watch when setting another watch. */
1778 	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
1779 		bpage = &buf_pool->watch[i];
1780 
1781 		ut_ad(bpage->access_time == 0);
1782 		ut_ad(bpage->newest_modification == 0);
1783 		ut_ad(bpage->oldest_modification == 0);
1784 		ut_ad(bpage->zip.data == NULL);
1785 		ut_ad(!bpage->in_zip_hash);
1786 
1787 		switch (bpage->state) {
1788 		case BUF_BLOCK_POOL_WATCH:
1789 			ut_ad(!bpage->in_page_hash);
1790 			ut_ad(bpage->buf_fix_count == 0);
1791 
1792 			/* bpage is pointing to buf_pool->watch[],
1793 			which is protected by buf_pool->mutex.
1794 			Normally, buf_page_t objects are protected by
1795 			buf_block_t::mutex or buf_pool->zip_mutex or both. */
1796 
1797 			bpage->state = BUF_BLOCK_ZIP_PAGE;
1798 			bpage->space = static_cast<ib_uint32_t>(space);
1799 			bpage->offset = static_cast<ib_uint32_t>(offset);
1800 			bpage->buf_fix_count = 1;
1801 
1802 			ut_d(bpage->in_page_hash = TRUE);
1803 			HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
1804 				    fold, bpage);
1805 
1806 			buf_pool_mutex_exit(buf_pool);
1807 			/* Once the sentinel is in the page_hash we can
1808 			safely release all locks except just the
1809 			relevant hash_lock */
1810 			hash_unlock_x_all_but(buf_pool->page_hash,
1811 						hash_lock);
1812 
1813 			return(NULL);
1814 		case BUF_BLOCK_ZIP_PAGE:
1815 			ut_ad(bpage->in_page_hash);
1816 			ut_ad(bpage->buf_fix_count > 0);
1817 			break;
1818 		default:
1819 			ut_error;
1820 		}
1821 	}
1822 
1823 	/* Allocation failed.  Either the maximum number of purge
1824 	threads should never exceed BUF_POOL_WATCH_SIZE, or this code
1825 	should be modified to return a special non-NULL value and the
1826 	caller should purge the record directly. */
1827 	ut_error;
1828 
1829 	/* Fix compiler warning */
1830 	return(NULL);
1831 }
1832 
1833 /****************************************************************//**
1834 Remove the sentinel block for the watch before replacing it with a real block.
1835 buf_pool_watch_unset() or buf_pool_watch_occurred() will notice that
1836 the block has been replaced with the real block. */
1838 static
1839 void
1840 buf_pool_watch_remove(
1841 /*==================*/
1842 	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
1843 	ulint		fold,		/*!< in: buf_page_address_fold(
1844 					space, offset) */
1845 	buf_page_t*	watch)		/*!< in/out: sentinel for watch */
1846 {
1847 #ifdef UNIV_SYNC_DEBUG
1848 	/* We must also own the appropriate hash_bucket mutex. */
1849 	rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
1850 	ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
1851 #endif /* UNIV_SYNC_DEBUG */
1852 
1853 	ut_ad(buf_pool_mutex_own(buf_pool));
1854 
1855 	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch);
1856 	ut_d(watch->in_page_hash = FALSE);
1857 	watch->buf_fix_count = 0;
1858 	watch->state = BUF_BLOCK_POOL_WATCH;
1859 }
1860 
1861 /****************************************************************//**
1862 Stop watching if the page has been read in.
1863 buf_pool_watch_set(space,offset) must have returned NULL before. */
1864 UNIV_INTERN
1865 void
1866 buf_pool_watch_unset(
1867 /*=================*/
1868 	ulint	space,	/*!< in: space id */
1869 	ulint	offset)	/*!< in: page number */
1870 {
1871 	buf_page_t*	bpage;
1872 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
1873 	ulint		fold = buf_page_address_fold(space, offset);
1874 	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
1875 
1876 	/* We only need the buf_pool mutex in the case where we end
1877 	up calling buf_pool_watch_remove, but to obey the latching
1878 	order we acquire it here, before acquiring the hash_lock.
1879 	This should not cause too much grief as this function is
1880 	only ever called from the purge thread. */
1881 	buf_pool_mutex_enter(buf_pool);
1882 
1883 	rw_lock_x_lock(hash_lock);
1884 
1885 	/* The page must exist because buf_pool_watch_set() increments
1886 	buf_fix_count. */
1887 
1888 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
1889 
1890 	if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
1891 		buf_block_unfix(reinterpret_cast<buf_block_t*>(bpage));
1892 	} else {
1893 
1894 		ut_ad(bpage->buf_fix_count > 0);
1895 
1896 #ifdef PAGE_ATOMIC_REF_COUNT
1897 		os_atomic_decrement_uint32(&bpage->buf_fix_count, 1);
1898 #else
1899 		--bpage->buf_fix_count;
1900 #endif /* PAGE_ATOMIC_REF_COUNT */
1901 
1902 		if (bpage->buf_fix_count == 0) {
1903 			buf_pool_watch_remove(buf_pool, fold, bpage);
1904 		}
1905 	}
1906 
1907 	buf_pool_mutex_exit(buf_pool);
1908 	rw_lock_x_unlock(hash_lock);
1909 }
1910 
1911 /****************************************************************//**
1912 Check if the page has been read in.
1913 This may only be called after buf_pool_watch_set(space,offset)
1914 has returned NULL and before invoking buf_pool_watch_unset(space,offset).
1915 @return	FALSE if the given page was not read in, TRUE if it was */
1916 UNIV_INTERN
1917 ibool
1918 buf_pool_watch_occurred(
1919 /*====================*/
1920 	ulint	space,	/*!< in: space id */
1921 	ulint	offset)	/*!< in: page number */
1922 {
1923 	ibool		ret;
1924 	buf_page_t*	bpage;
1925 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
1926 	ulint		fold	= buf_page_address_fold(space, offset);
1927 	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool,
1928 							     fold);
1929 
1930 	rw_lock_s_lock(hash_lock);
1931 
1932 	/* The page must exist because buf_pool_watch_set()
1933 	increments buf_fix_count. */
1934 	bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
1935 
1936 	ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
1937 	rw_lock_s_unlock(hash_lock);
1938 
1939 	return(ret);
1940 }
1941 
1942 /********************************************************************//**
1943 Moves a page to the start of the buffer pool LRU list. This high-level
1944 function can be used to prevent an important page from slipping out of
1945 the buffer pool. */
1946 UNIV_INTERN
1947 void
1948 buf_page_make_young(
1949 /*================*/
1950 	buf_page_t*	bpage)	/*!< in: buffer block of a file page */
1951 {
1952 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1953 
1954 	buf_pool_mutex_enter(buf_pool);
1955 
1956 	ut_a(buf_page_in_file(bpage));
1957 
1958 	buf_LRU_make_block_young(bpage);
1959 
1960 	buf_pool_mutex_exit(buf_pool);
1961 }
1962 
1963 /********************************************************************//**
1964 Moves a page to the start of the buffer pool LRU list if it is too old.
1965 This high-level function can be used to prevent an important page from
1966 slipping out of the buffer pool. */
1967 static
1968 void
1969 buf_page_make_young_if_needed(
1970 /*==========================*/
1971 	buf_page_t*	bpage)		/*!< in/out: buffer block of a
1972 					file page */
1973 {
1974 #ifdef UNIV_DEBUG
1975 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
1976 	ut_ad(!buf_pool_mutex_own(buf_pool));
1977 #endif /* UNIV_DEBUG */
1978 	ut_a(buf_page_in_file(bpage));
1979 
1980 	if (buf_page_peek_if_too_old(bpage)) {
1981 		buf_page_make_young(bpage);
1982 	}
1983 }
1984 
1985 /********************************************************************//**
1986 Resets the check_index_page_at_flush field of a page if found in the buffer
1987 pool. */
1988 UNIV_INTERN
1989 void
1990 buf_reset_check_index_page_at_flush(
1991 /*================================*/
1992 	ulint	space,	/*!< in: space id */
1993 	ulint	offset)	/*!< in: page number */
1994 {
1995 	buf_block_t*	block;
1996 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
1997 
1998 	buf_pool_mutex_enter(buf_pool);
1999 
2000 	block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset);
2001 
2002 	if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
2003 		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
2004 		block->check_index_page_at_flush = FALSE;
2005 	}
2006 
2007 	buf_pool_mutex_exit(buf_pool);
2008 }
2009 
2010 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
2011 /********************************************************************//**
2012 Sets file_page_was_freed TRUE if the page is found in the buffer pool.
2013 This function should be called when we free a file page and want the
2014 debug version to check that it is not accessed any more unless
2015 reallocated.
2016 @return	control block if found in page hash table, otherwise NULL */
2017 UNIV_INTERN
2018 buf_page_t*
2019 buf_page_set_file_page_was_freed(
2020 /*=============================*/
2021 	ulint	space,	/*!< in: space id */
2022 	ulint	offset)	/*!< in: page number */
2023 {
2024 	buf_page_t*	bpage;
2025 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
2026 	rw_lock_t*	hash_lock;
2027 
2028 	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
2029 					   &hash_lock);
2030 
2031 	if (bpage) {
2032 		ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
2033 		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
2034 		mutex_enter(block_mutex);
2035 		rw_lock_s_unlock(hash_lock);
2036 		/* bpage->file_page_was_freed may already be TRUE
2037 		when this code is invoked from dict_drop_index_tree() */
2038 		bpage->file_page_was_freed = TRUE;
2039 		mutex_exit(block_mutex);
2040 	}
2041 
2042 	return(bpage);
2043 }
2044 
2045 /********************************************************************//**
2046 Sets file_page_was_freed FALSE if the page is found in the buffer pool.
2047 This function should be called when a file page is reallocated, so
2048 that the debug version no longer treats accesses to the page as
2049 errors.
2050 @return	control block if found in page hash table, otherwise NULL */
2051 UNIV_INTERN
2052 buf_page_t*
2053 buf_page_reset_file_page_was_freed(
2054 /*===============================*/
2055 	ulint	space,	/*!< in: space id */
2056 	ulint	offset)	/*!< in: page number */
2057 {
2058 	buf_page_t*	bpage;
2059 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
2060 	rw_lock_t*	hash_lock;
2061 
2062 	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
2063 					   &hash_lock);
2064 	if (bpage) {
2065 		ib_mutex_t*	block_mutex = buf_page_get_mutex(bpage);
2066 		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
2067 		mutex_enter(block_mutex);
2068 		rw_lock_s_unlock(hash_lock);
2069 		bpage->file_page_was_freed = FALSE;
2070 		mutex_exit(block_mutex);
2071 	}
2072 
2073 	return(bpage);
2074 }
2075 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
2076 
2077 /********************************************************************//**
2078 Attempts to discard the uncompressed frame of a compressed page. The
2079 caller should not be holding any mutexes when this function is called. */
2081 static
2082 void
2083 buf_block_try_discard_uncompressed(
2084 /*===============================*/
2085 	ulint		space,	/*!< in: space id */
2086 	ulint		offset)	/*!< in: page number */
2087 {
2088 	buf_page_t*	bpage;
2089 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
2090 
2091 	/* Since we need to acquire the buf_pool mutex to discard
2092 	the uncompressed frame, and the page_hash mutex resides
2093 	below the buf_pool mutex in the sync ordering, we must
2094 	first release the page_hash mutex. This means that the
2095 	block in question can move out of page_hash. Therefore
2096 	we need to check again if the block is still in page_hash. */
2097 	buf_pool_mutex_enter(buf_pool);
2098 
2099 	bpage = buf_page_hash_get(buf_pool, space, offset);
2100 
2101 	if (bpage) {
2102 		buf_LRU_free_page(bpage, false);
2103 	}
2104 
2105 	buf_pool_mutex_exit(buf_pool);
2106 }
2107 
2108 /********************************************************************//**
2109 Get read access to a compressed page (usually of type
2110 FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
2111 The page must be released with buf_page_release_zip().
2112 NOTE: the page is not protected by any latch.  Mutual exclusion has to
2113 be implemented at a higher level.  In other words, all possible
2114 accesses to a given page through this function must be protected by
2115 the same set of mutexes or latches.
2116 @return	pointer to the block */
2117 UNIV_INTERN
2118 buf_page_t*
2119 buf_page_get_zip(
2120 /*=============*/
2121 	ulint		space,	/*!< in: space id */
2122 	ulint		zip_size,/*!< in: compressed page size */
2123 	ulint		offset)	/*!< in: page number */
2124 {
2125 	buf_page_t*	bpage;
2126 	ib_mutex_t*	block_mutex;
2127 	rw_lock_t*	hash_lock;
2128 	ibool		discard_attempted = FALSE;
2129 	ibool		must_read;
2130 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
2131 
2132 	buf_pool->stat.n_page_gets++;
2133 
2134 	for (;;) {
2135 lookup:
2136 
2137 		/* The following call will also grab the page_hash
2138 		mutex if the page is found. */
2139 		bpage = buf_page_hash_get_s_locked(buf_pool, space,
2140 						offset, &hash_lock);
2141 		if (bpage) {
2142 			ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
2143 			break;
2144 		}
2145 
2146 		/* Page not in buf_pool: needs to be read from file */
2147 
2148 		ut_ad(!hash_lock);
2149 		buf_read_page(space, zip_size, offset);
2150 
2151 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2152 		ut_a(++buf_dbg_counter % 5771 || buf_validate());
2153 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2154 	}
2155 
2156 	ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));
2157 
2158 	if (!bpage->zip.data) {
2159 		/* There is no compressed page. */
2160 err_exit:
2161 		rw_lock_s_unlock(hash_lock);
2162 		return(NULL);
2163 	}
2164 
2165 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
2166 
2167 	switch (buf_page_get_state(bpage)) {
2168 	case BUF_BLOCK_POOL_WATCH:
2169 	case BUF_BLOCK_NOT_USED:
2170 	case BUF_BLOCK_READY_FOR_USE:
2171 	case BUF_BLOCK_MEMORY:
2172 	case BUF_BLOCK_REMOVE_HASH:
2173 		ut_error;
2174 
2175 	case BUF_BLOCK_ZIP_PAGE:
2176 	case BUF_BLOCK_ZIP_DIRTY:
2177 		block_mutex = &buf_pool->zip_mutex;
2178 		mutex_enter(block_mutex);
2179 #ifdef PAGE_ATOMIC_REF_COUNT
2180 		os_atomic_increment_uint32(&bpage->buf_fix_count, 1);
2181 #else
2182 		++bpage->buf_fix_count;
2183 #endif /* PAGE_ATOMIC_REF_COUNT */
2184 		goto got_block;
2185 	case BUF_BLOCK_FILE_PAGE:
2186 		/* Discard the uncompressed page frame if possible. */
2187 		if (!discard_attempted) {
2188 			rw_lock_s_unlock(hash_lock);
2189 			buf_block_try_discard_uncompressed(space, offset);
2190 			discard_attempted = TRUE;
2191 			goto lookup;
2192 		}
2193 
2194 		block_mutex = &((buf_block_t*) bpage)->mutex;
2195 
2196 		mutex_enter(block_mutex);
2197 
2198 		buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__);
2199 		goto got_block;
2200 	}
2201 
2202 	ut_error;
2203 	goto err_exit;
2204 
2205 got_block:
2206 	must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
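	/* must_read == TRUE means an asynchronous read of this page is
	still in flight; we poll below until the I/O completes before
	returning the block to the caller. */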
2207 
2208 	rw_lock_s_unlock(hash_lock);
2209 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
2210 	ut_a(!bpage->file_page_was_freed);
2211 #endif /* defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG */
2212 
2213 	buf_page_set_accessed(bpage);
2214 
2215 	mutex_exit(block_mutex);
2216 
2217 	buf_page_make_young_if_needed(bpage);
2218 
2219 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2220 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
2221 	ut_a(bpage->buf_fix_count > 0);
2222 	ut_a(buf_page_in_file(bpage));
2223 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2224 
2225 	if (must_read) {
2226 		/* Let us wait until the read operation
2227 		completes */
2228 
2229 		for (;;) {
2230 			enum buf_io_fix	io_fix;
2231 
2232 			mutex_enter(block_mutex);
2233 			io_fix = buf_page_get_io_fix(bpage);
2234 			mutex_exit(block_mutex);
2235 
2236 			if (io_fix == BUF_IO_READ) {
2237 
2238 				os_thread_sleep(WAIT_FOR_READ);
2239 			} else {
2240 				break;
2241 			}
2242 		}
2243 	}
2244 
2245 #ifdef UNIV_IBUF_COUNT_DEBUG
2246 	ut_a(ibuf_count_get(buf_page_get_space(bpage),
2247 			    buf_page_get_page_no(bpage)) == 0);
2248 #endif
2249 	return(bpage);
2250 }
2251 
2252 /********************************************************************//**
2253 Initialize some fields of a control block. */
2254 UNIV_INLINE
2255 void
2256 buf_block_init_low(
2257 /*===============*/
2258 	buf_block_t*	block)	/*!< in: block to init */
2259 {
2260 	block->check_index_page_at_flush = FALSE;
2261 	block->index		= NULL;
2262 
2263 	block->n_hash_helps	= 0;
2264 	block->n_fields		= 1;
2265 	block->n_bytes		= 0;
2266 	block->left_side	= TRUE;
2267 }
2268 #endif /* !UNIV_HOTBACKUP */
2269 
2270 /********************************************************************//**
2271 Decompress a block.
2272 @return	TRUE if successful */
2273 UNIV_INTERN
2274 ibool
2275 buf_zip_decompress(
2276 /*===============*/
2277 	buf_block_t*	block,	/*!< in/out: block */
2278 	ibool		check)	/*!< in: TRUE=verify the page checksum */
2279 {
2280 	const byte*	frame = block->page.zip.data;
2281 	ulint		size = page_zip_get_size(&block->page.zip);
2282 
2283 	ut_ad(buf_block_get_zip_size(block));
2284 	ut_a(buf_block_get_space(block) != 0);
2285 
2286 	if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
2287 
2288 		ut_print_timestamp(stderr);
2289 		fprintf(stderr,
2290 			"  InnoDB: compressed page checksum mismatch"
2291 			" (space %u page %u): stored: %lu, crc32: %lu "
2292 			"innodb: %lu, none: %lu\n",
2293 			block->page.space, block->page.offset,
2294 			mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM),
2295 			page_zip_calc_checksum(frame, size,
2296 					       SRV_CHECKSUM_ALGORITHM_CRC32),
2297 			page_zip_calc_checksum(frame, size,
2298 					       SRV_CHECKSUM_ALGORITHM_INNODB),
2299 			page_zip_calc_checksum(frame, size,
2300 					       SRV_CHECKSUM_ALGORITHM_NONE));
2301 		return(FALSE);
2302 	}
2303 
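	/* Only B-tree index pages are stored compressed; the other
	page types accepted below keep their data verbatim in the
	"compressed" frame and are simply copied. */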
2304 	switch (fil_page_get_type(frame)) {
2305 	case FIL_PAGE_INDEX:
2306 		if (page_zip_decompress(&block->page.zip,
2307 					block->frame, TRUE)) {
2308 			return(TRUE);
2309 		}
2310 
2311 		fprintf(stderr,
2312 			"InnoDB: unable to decompress space %lu page %lu\n",
2313 			(ulong) block->page.space,
2314 			(ulong) block->page.offset);
2315 		return(FALSE);
2316 
2317 	case FIL_PAGE_TYPE_ALLOCATED:
2318 	case FIL_PAGE_INODE:
2319 	case FIL_PAGE_IBUF_BITMAP:
2320 	case FIL_PAGE_TYPE_FSP_HDR:
2321 	case FIL_PAGE_TYPE_XDES:
2322 	case FIL_PAGE_TYPE_ZBLOB:
2323 	case FIL_PAGE_TYPE_ZBLOB2:
2324 		/* Copy to uncompressed storage. */
2325 		memcpy(block->frame, frame,
2326 		       buf_block_get_zip_size(block));
2327 		return(TRUE);
2328 	}
2329 
2330 	ut_print_timestamp(stderr);
2331 	fprintf(stderr,
2332 		"  InnoDB: unknown compressed page"
2333 		" type %lu\n",
2334 		fil_page_get_type(frame));
2335 	return(FALSE);
2336 }
2337 
2338 #ifndef UNIV_HOTBACKUP
2339 /*******************************************************************//**
2340 Gets the block to whose frame the pointer is pointing, if found
2341 in this buffer pool instance.
2342 @return	pointer to block, or NULL if not found in this instance */
2343 UNIV_INTERN
2344 buf_block_t*
2345 buf_block_align_instance(
2346 /*=====================*/
2347 	buf_pool_t*	buf_pool,	/*!< in: buffer pool in which the block
2348 					resides */
2349 	const byte*	ptr)		/*!< in: pointer to a frame */
2350 {
2351 	buf_chunk_t*	chunk;
2352 	ulint		i;
2353 
2354 	/* TODO: protect buf_pool->chunks with a mutex (it will
2355 	currently remain constant after buf_pool_init()) */
2356 	for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) {
2357 		ulint	offs;
2358 
2359 		if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) {
2360 
2361 			continue;
2362 		}
2363 		/* else */
2364 
2365 		offs = ptr - chunk->blocks->frame;
2366 
2367 		offs >>= UNIV_PAGE_SIZE_SHIFT;
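		/* Frames within a chunk are contiguous and spaced
		UNIV_PAGE_SIZE apart, so shifting the byte offset right by
		UNIV_PAGE_SIZE_SHIFT yields the index of the block whose
		frame contains ptr. */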
2368 
2369 		if (UNIV_LIKELY(offs < chunk->size)) {
2370 			buf_block_t*	block = &chunk->blocks[offs];
2371 
2372 			/* The function buf_chunk_init() invokes
2373 			buf_block_init() so that block[n].frame ==
2374 			block->frame + n * UNIV_PAGE_SIZE.  Check it. */
2375 			ut_ad(block->frame == page_align(ptr));
2376 #ifdef UNIV_DEBUG
2377 			/* A thread that updates these fields must
2378 			hold buf_pool->mutex and block->mutex.  Acquire
2379 			only the latter. */
2380 			mutex_enter(&block->mutex);
2381 
2382 			switch (buf_block_get_state(block)) {
2383 			case BUF_BLOCK_POOL_WATCH:
2384 			case BUF_BLOCK_ZIP_PAGE:
2385 			case BUF_BLOCK_ZIP_DIRTY:
2386 				/* These types should only be used in
2387 				the compressed buffer pool, whose
2388 				memory is allocated from
2389 				buf_pool->chunks, in UNIV_PAGE_SIZE
2390 				blocks flagged as BUF_BLOCK_MEMORY. */
2391 				ut_error;
2392 				break;
2393 			case BUF_BLOCK_NOT_USED:
2394 			case BUF_BLOCK_READY_FOR_USE:
2395 			case BUF_BLOCK_MEMORY:
2396 				/* Some data structures contain
2397 				"guess" pointers to file pages.  The
2398 				file pages may have been freed and
2399 				reused.  Do not complain. */
2400 				break;
2401 			case BUF_BLOCK_REMOVE_HASH:
2402 				/* buf_LRU_block_remove_hashed_page()
2403 				will overwrite the FIL_PAGE_OFFSET and
2404 				FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with
2405 				0xff and set the state to
2406 				BUF_BLOCK_REMOVE_HASH. */
2407 				ut_ad(page_get_space_id(page_align(ptr))
2408 				      == 0xffffffff);
2409 				ut_ad(page_get_page_no(page_align(ptr))
2410 				      == 0xffffffff);
2411 				break;
2412 			case BUF_BLOCK_FILE_PAGE:
2413 				ut_ad(block->page.space
2414 				      == page_get_space_id(page_align(ptr)));
2415 				ut_ad(block->page.offset
2416 				      == page_get_page_no(page_align(ptr)));
2417 				break;
2418 			}
2419 
2420 			mutex_exit(&block->mutex);
2421 #endif /* UNIV_DEBUG */
2422 
2423 			return(block);
2424 		}
2425 	}
2426 
2427 	return(NULL);
2428 }
2429 
2430 /*******************************************************************//**
2431 Gets the block to whose frame the pointer is pointing.
2432 @return	pointer to block, never NULL */
2433 UNIV_INTERN
2434 buf_block_t*
2435 buf_block_align(
2436 /*============*/
2437 	const byte*	ptr)	/*!< in: pointer to a frame */
2438 {
2439 	ulint		i;
2440 
2441 	for (i = 0; i < srv_buf_pool_instances; i++) {
2442 		buf_block_t*	block;
2443 
2444 		block = buf_block_align_instance(
2445 			buf_pool_from_array(i), ptr);
2446 		if (block) {
2447 			return(block);
2448 		}
2449 	}
2450 
2451 	/* The block should always be found. */
2452 	ut_error;
2453 	return(NULL);
2454 }
2455 
2456 /********************************************************************//**
2457 Find out if a pointer belongs to a buf_block_t. It can be a pointer to
2458 the buf_block_t itself or a member of it. This function checks one of
2459 the buffer pool instances.
2460 @return	TRUE if ptr belongs to a buf_block_t struct */
2461 static
2462 ibool
2463 buf_pointer_is_block_field_instance(
2464 /*================================*/
2465 	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
2466 	const void*	ptr)		/*!< in: pointer not dereferenced */
2467 {
2468 	const buf_chunk_t*		chunk	= buf_pool->chunks;
2469 	const buf_chunk_t* const	echunk	= chunk + buf_pool->n_chunks;
2470 
2471 	/* TODO: protect buf_pool->chunks with a mutex (it will
2472 	currently remain constant after buf_pool_init()) */
2473 	while (chunk < echunk) {
2474 		if (ptr >= (void*) chunk->blocks
2475 		    && ptr < (void*) (chunk->blocks + chunk->size)) {
2476 
2477 			return(TRUE);
2478 		}
2479 
2480 		chunk++;
2481 	}
2482 
2483 	return(FALSE);
2484 }
2485 
2486 /********************************************************************//**
2487 Find out if a pointer belongs to a buf_block_t. It can be a pointer to
2488 the buf_block_t itself or a member of it
2489 @return	TRUE if ptr belongs to a buf_block_t struct */
2490 UNIV_INTERN
2491 ibool
2492 buf_pointer_is_block_field(
2493 /*=======================*/
2494 	const void*	ptr)	/*!< in: pointer not dereferenced */
2495 {
2496 	ulint	i;
2497 
2498 	for (i = 0; i < srv_buf_pool_instances; i++) {
2499 		ibool	found;
2500 
2501 		found = buf_pointer_is_block_field_instance(
2502 			buf_pool_from_array(i), ptr);
2503 		if (found) {
2504 			return(TRUE);
2505 		}
2506 	}
2507 
2508 	return(FALSE);
2509 }
2510 
2511 /********************************************************************//**
2512 Find out if a buffer block was created by buf_chunk_init().
2513 @return	TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */
2514 static
2515 ibool
2516 buf_block_is_uncompressed(
2517 /*======================*/
2518 	buf_pool_t*		buf_pool,	/*!< in: buffer pool instance */
2519 	const buf_block_t*	block)		/*!< in: pointer to block,
2520 						not dereferenced */
2521 {
2522 	if ((((ulint) block) % sizeof *block) != 0) {
2523 		/* The pointer should be aligned. */
2524 		return(FALSE);
2525 	}
2526 
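	/* The alignment test above is only a cheap necessary condition;
	the chunk range scan is the authoritative check. */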
2527 	return(buf_pointer_is_block_field_instance(buf_pool, (void*) block));
2528 }
2529 
2530 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
2531 /********************************************************************//**
2532 Return true if probe is enabled.
2533 @return true if probe enabled. */
2534 static
2535 bool
2536 buf_debug_execute_is_force_flush()
2537 /*==============================*/
2538 {
2539 	DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );
2540 
2541 	/* This is used during quiesce testing; we want to ensure maximum
2542 	buffering by the change buffer. */
2543 
2544 	if (srv_ibuf_disable_background_merge) {
2545 		return(true);
2546 	}
2547 
2548 	return(false);
2549 }
2550 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
2551 
2552 /**
2553 Wait for the block to be read in.
2554 @param block	The block to check */
2555 static
2556 void
2557 buf_wait_for_read(buf_block_t* block)
2558 {
2559 	/* Note: For the PAGE_ATOMIC_REF_COUNT case:
2560 
2561 	We are using the block->lock to check for IO state (and a dirty read).
2562 	We set the IO_READ state under the protection of the hash_lock
2563 	(and block->mutex). This is safe because another thread can only
2564 	access the block (and check for IO state) after the block has been
2565 	added to the page hashtable. */
2566 
2567 	if (buf_block_get_io_fix(block) == BUF_IO_READ) {
2568 
2569 		/* Wait until the read operation completes */
2570 
2571 		ib_mutex_t*	mutex = buf_page_get_mutex(&block->page);
2572 
2573 		for (;;) {
2574 			buf_io_fix	io_fix;
2575 
2576 			mutex_enter(mutex);
2577 
2578 			io_fix = buf_block_get_io_fix(block);
2579 
2580 			mutex_exit(mutex);
2581 
2582 			if (io_fix == BUF_IO_READ) {
2583 				/* Wait by temporarily taking an s-latch */
2584 				rw_lock_s_lock(&block->lock);
2585 				rw_lock_s_unlock(&block->lock);
2586 			} else {
2587 				break;
2588 			}
2589 		}
2590 	}
2591 }
2592 
2593 /********************************************************************//**
2594 This is the general function used to get access to a database page.
2595 @return	pointer to the block or NULL */
2596 UNIV_INTERN
2597 buf_block_t*
2598 buf_page_get_gen(
2599 /*=============*/
2600 	ulint		space,	/*!< in: space id */
2601 	ulint		zip_size,/*!< in: compressed page size in bytes
2602 				or 0 for uncompressed pages */
2603 	ulint		offset,	/*!< in: page number */
2604 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
2605 	buf_block_t*	guess,	/*!< in: guessed block or NULL */
2606 	ulint		mode,	/*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
2607 				BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or
2608 				BUF_GET_IF_IN_POOL_OR_WATCH */
2609 	const char*	file,	/*!< in: file name */
2610 	ulint		line,	/*!< in: line where called */
2611 	mtr_t*		mtr)	/*!< in: mini-transaction */
2612 {
2613 	buf_block_t*	block;
2614 	ulint		fold;
2615 	unsigned	access_time;
2616 	ulint		fix_type;
2617 	rw_lock_t*	hash_lock;
2618 	ulint		retries = 0;
2619 	buf_block_t*	fix_block;
2620 	ib_mutex_t*	fix_mutex = NULL;
2621 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
2622 
2623 	ut_ad(mtr);
2624 	ut_ad(mtr->state == MTR_ACTIVE);
2625 	ut_ad((rw_latch == RW_S_LATCH)
2626 	      || (rw_latch == RW_X_LATCH)
2627 	      || (rw_latch == RW_NO_LATCH));
2628 #ifdef UNIV_DEBUG
2629 	switch (mode) {
2630 	case BUF_GET_NO_LATCH:
2631 		ut_ad(rw_latch == RW_NO_LATCH);
2632 		break;
2633 	case BUF_GET:
2634 	case BUF_GET_IF_IN_POOL:
2635 	case BUF_PEEK_IF_IN_POOL:
2636 	case BUF_GET_IF_IN_POOL_OR_WATCH:
2637 	case BUF_GET_POSSIBLY_FREED:
2638 		break;
2639 	default:
2640 		ut_error;
2641 	}
2642 #endif /* UNIV_DEBUG */
2643 	ut_ad(zip_size == fil_space_get_zip_size(space));
2644 	ut_ad(ut_is_2pow(zip_size));
2645 #ifndef UNIV_LOG_DEBUG
2646 	ut_ad(!ibuf_inside(mtr)
2647 	      || ibuf_page_low(space, zip_size, offset,
2648 			       FALSE, file, line, NULL));
2649 #endif
2650 	buf_pool->stat.n_page_gets++;
2651 	fold = buf_page_address_fold(space, offset);
2652 	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
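	/* fold hashes (space, offset) to a page_hash cell; hash_lock is
	the rw-lock that protects that partition of the hash table and
	is re-used on every pass through the retry loop below. */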
2653 loop:
2654 	block = guess;
2655 
2656 	rw_lock_s_lock(hash_lock);
2657 
2658 	if (block != NULL) {
2659 
2660 		/* If the guess is a compressed page descriptor that
2661 		has been allocated by buf_page_alloc_descriptor(),
2662 		it may have been freed by buf_relocate(). */
2663 
2664 		if (!buf_block_is_uncompressed(buf_pool, block)
2665 		    || offset != block->page.offset
2666 		    || space != block->page.space
2667 		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
2668 
2669 			/* Our guess was bogus or things have changed
2670 			since. */
2671 			block = guess = NULL;
2672 		} else {
2673 			ut_ad(!block->page.in_zip_hash);
2674 		}
2675 	}
2676 
2677 	if (block == NULL) {
2678 		block = (buf_block_t*) buf_page_hash_get_low(
2679 			buf_pool, space, offset, fold);
2680 	}
2681 
2682 	if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
2683 		rw_lock_s_unlock(hash_lock);
2684 		block = NULL;
2685 	}
2686 
2687 	if (block == NULL) {
2688 		/* Page not in buf_pool: needs to be read from file */
2689 
2690 		if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
2691 			rw_lock_x_lock(hash_lock);
2692 			block = (buf_block_t*) buf_pool_watch_set(
2693 				space, offset, fold);
2694 
2695 			if (UNIV_LIKELY_NULL(block)) {
2696 				/* We can release hash_lock after we
2697 				increment the fix count to make
2698 				sure that no state change takes place. */
2699 				fix_block = block;
2700 				buf_block_fix(fix_block);
2701 
2702 				/* Now safe to release page_hash mutex */
2703 				rw_lock_x_unlock(hash_lock);
2704 				goto got_block;
2705 			}
2706 
2707 			rw_lock_x_unlock(hash_lock);
2708 		}
2709 
2710 		if (mode == BUF_GET_IF_IN_POOL
2711 		    || mode == BUF_PEEK_IF_IN_POOL
2712 		    || mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
2713 #ifdef UNIV_SYNC_DEBUG
2714 			ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
2715 			ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
2716 #endif /* UNIV_SYNC_DEBUG */
2717 			return(NULL);
2718 		}
2719 
2720 		if (buf_read_page(space, zip_size, offset)) {
2721 			buf_read_ahead_random(space, zip_size, offset,
2722 					      ibuf_inside(mtr));
2723 
2724 			retries = 0;
2725 		} else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
2726 			++retries;
2727 			DBUG_EXECUTE_IF(
2728 				"innodb_page_corruption_retries",
2729 				retries = BUF_PAGE_READ_MAX_RETRIES;
2730 			);
2731 		} else {
2732 			fprintf(stderr, "InnoDB: Error: Unable"
2733 				" to read tablespace %lu page no"
2734 				" %lu into the buffer pool after"
2735 				" %lu attempts\n"
2736 				"InnoDB: The most probable cause"
2737 				" of this error may be that the"
2738 				" table has been corrupted.\n"
2739 				"InnoDB: You can try to fix this"
2740 				" problem by using"
2741 				" innodb_force_recovery.\n"
2742 				"InnoDB: Please see reference manual"
2743 				" for more details.\n"
2744 				"InnoDB: Aborting...\n",
2745 				space, offset,
2746 				BUF_PAGE_READ_MAX_RETRIES);
2747 
2748 			ut_error;
2749 		}
2750 
2751 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2752 		ut_a(++buf_dbg_counter % 5771 || buf_validate());
2753 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2754 		goto loop;
2755 	} else {
2756 		fix_block = block;
2757 	}
2758 
2759 	buf_block_fix(fix_block);
2760 
2761 	/* Now safe to release page_hash mutex */
2762 	rw_lock_s_unlock(hash_lock);
2763 
2764 got_block:
2765 
2766 	fix_mutex = buf_page_get_mutex(&fix_block->page);
2767 
2768 	ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
2769 
2770 	if (mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL) {
2771 
2772 		bool	must_read;
2773 
2774 		{
2775 			buf_page_t*	fix_page = &fix_block->page;
2776 
2777 			mutex_enter(fix_mutex);
2778 
2779 			buf_io_fix	io_fix = buf_page_get_io_fix(fix_page);
2780 
2781 			must_read = (io_fix == BUF_IO_READ);
2782 
2783 			mutex_exit(fix_mutex);
2784 		}
2785 
2786 		if (must_read) {
2787 			/* The page is being read to buffer pool,
2788 			but we cannot wait around for the read to
2789 			complete. */
2790 			buf_block_unfix(fix_block);
2791 
2792 			return(NULL);
2793 		}
2794 	}
2795 
2796 	switch(buf_block_get_state(fix_block)) {
2797 		buf_page_t*	bpage;
2798 
2799 	case BUF_BLOCK_FILE_PAGE:
2800 		break;
2801 
2802 	case BUF_BLOCK_ZIP_PAGE:
2803 	case BUF_BLOCK_ZIP_DIRTY:
2804 		if (mode == BUF_PEEK_IF_IN_POOL) {
2805 			/* This mode is only used for dropping an
2806 			adaptive hash index.  There cannot be an
2807 			adaptive hash index for a compressed-only
2808 			page, so do not bother decompressing the page. */
2809 			buf_block_unfix(fix_block);
2810 
2811 			return(NULL);
2812 		}
2813 
2814 		bpage = &block->page;
2815 
2816 		/* Note: We have already buffer fixed this block. */
2817 		if (bpage->buf_fix_count > 1
2818 		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
2819 
2820 			/* This condition often occurs when the buffer
2821 			is not buffer-fixed, but I/O-fixed by
2822 			buf_page_init_for_read(). */
2823 			buf_block_unfix(fix_block);
2824 
2825 			/* The block is buffer-fixed or I/O-fixed.
2826 			Try again later. */
2827 			os_thread_sleep(WAIT_FOR_READ);
2828 
2829 			goto loop;
2830 		}
2831 
2832 		/* Buffer-fix the block so that it cannot be evicted
2833 		or relocated while we are attempting to allocate an
2834 		uncompressed page. */
2835 
2836 		block = buf_LRU_get_free_block(buf_pool);
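		/* block is a free uncompressed frame that will receive the
		page; it remains private to this thread until buf_relocate()
		publishes it in the page_hash below. */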
2837 
2838 		buf_pool_mutex_enter(buf_pool);
2839 
2840 		rw_lock_x_lock(hash_lock);
2841 
2842 		/* Buffer-fixing prevents the page_hash from changing. */
2843 		ut_ad(bpage == buf_page_hash_get_low(
2844 			      buf_pool, space, offset, fold));
2845 
2846 		buf_block_mutex_enter(block);
2847 
2848 		mutex_enter(&buf_pool->zip_mutex);
2849 
2850 		ut_ad(fix_block->page.buf_fix_count > 0);
2851 
2852 #ifdef PAGE_ATOMIC_REF_COUNT
2853 		os_atomic_decrement_uint32(&fix_block->page.buf_fix_count, 1);
2854 #else
2855 		--fix_block->page.buf_fix_count;
2856 #endif /* PAGE_ATOMIC_REF_COUNT */
2857 
2858 		fix_block = block;
2859 
2860 		if (bpage->buf_fix_count > 0
2861 		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
2862 
2863 			mutex_exit(&buf_pool->zip_mutex);
2864 			/* The block was buffer-fixed or I/O-fixed while
2865 			buf_pool->mutex was not held by this thread.
2866 			Free the block that was allocated and retry.
2867 			This should be extremely unlikely, for example,
2868 			if buf_page_get_zip() was invoked. */
2869 
2870 			buf_LRU_block_free_non_file_page(block);
2871 			buf_pool_mutex_exit(buf_pool);
2872 			rw_lock_x_unlock(hash_lock);
2873 			buf_block_mutex_exit(block);
2874 
2875 			/* Try again */
2876 			goto loop;
2877 		}
2878 
2879 		/* Move the compressed page from bpage to block,
2880 		and uncompress it. */
2881 
2882 		/* Note: this is the uncompressed block and it is not
2883 		accessible by other threads yet because it is not in
2884 		any list or hash table */
2885 		buf_relocate(bpage, &block->page);
2886 
2887 		buf_block_init_low(block);
2888 
2889 		/* Set after relocate(). */
2890 		block->page.buf_fix_count = 1;
2891 
2892 		block->lock_hash_val = lock_rec_hash(space, offset);
2893 
2894 		UNIV_MEM_DESC(&block->page.zip.data,
2895 			page_zip_get_size(&block->page.zip));
2896 
2897 		if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) {
2898 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2899 			UT_LIST_REMOVE(list, buf_pool->zip_clean,
2900 				       &block->page);
2901 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2902 			ut_ad(!block->page.in_flush_list);
2903 		} else {
2904 			/* Relocate buf_pool->flush_list. */
2905 			buf_flush_relocate_on_flush_list(bpage, &block->page);
2906 		}
2907 
2908 		/* Buffer-fix, I/O-fix, and X-latch the block
2909 		for the duration of the decompression.
2910 		Also add the block to the unzip_LRU list. */
2911 		block->page.state = BUF_BLOCK_FILE_PAGE;
2912 
2913 		/* Insert at the front of unzip_LRU list */
2914 		buf_unzip_LRU_add_block(block, FALSE);
2915 
2916 		buf_block_set_io_fix(block, BUF_IO_READ);
2917 		rw_lock_x_lock_inline(&block->lock, 0, file, line);
2918 
2919 		UNIV_MEM_INVALID(bpage, sizeof *bpage);
2920 
2921 		rw_lock_x_unlock(hash_lock);
2922 
2923 		++buf_pool->n_pend_unzip;
2924 
2925 		mutex_exit(&buf_pool->zip_mutex);
2926 		buf_pool_mutex_exit(buf_pool);
2927 
2928 		access_time = buf_page_is_accessed(&block->page);
2929 
2930 		buf_block_mutex_exit(block);
2931 
2932 		buf_page_free_descriptor(bpage);
2933 
2934 		/* Decompress the page while not holding
2935 		buf_pool->mutex or block->mutex. */
2936 
2937 		/* Page checksum verification is already done when
2938 		the page is read from disk. Hence page checksum
2939 		verification is not necessary when decompressing the page. */
2940 		{
2941 			bool	success = buf_zip_decompress(block, FALSE);
2942 			ut_a(success);
2943 		}
2944 
2945 		if (!recv_no_ibuf_operations) {
2946 			if (access_time) {
2947 #ifdef UNIV_IBUF_COUNT_DEBUG
2948 				ut_a(ibuf_count_get(space, offset) == 0);
2949 #endif /* UNIV_IBUF_COUNT_DEBUG */
2950 			} else {
2951 				ibuf_merge_or_delete_for_page(
2952 					block, space, offset, zip_size, TRUE);
2953 			}
2954 		}
2955 
2956 		buf_pool_mutex_enter(buf_pool);
2957 
2958 		/* Unfix and unlatch the block. */
2959 		buf_block_mutex_enter(fix_block);
2960 
2961 		buf_block_set_io_fix(fix_block, BUF_IO_NONE);
2962 
2963 		buf_block_mutex_exit(fix_block);
2964 
2965 		--buf_pool->n_pend_unzip;
2966 
2967 		buf_pool_mutex_exit(buf_pool);
2968 
2969 		rw_lock_x_unlock(&block->lock);
2970 
2971 		break;
2972 
2973 	case BUF_BLOCK_POOL_WATCH:
2974 	case BUF_BLOCK_NOT_USED:
2975 	case BUF_BLOCK_READY_FOR_USE:
2976 	case BUF_BLOCK_MEMORY:
2977 	case BUF_BLOCK_REMOVE_HASH:
2978 		ut_error;
2979 		break;
2980 	}
2981 
2982 	ut_ad(block == fix_block);
2983 	ut_ad(fix_block->page.buf_fix_count > 0);
2984 
2985 #ifdef UNIV_SYNC_DEBUG
2986 	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
2987 	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
2988 #endif /* UNIV_SYNC_DEBUG */
2989 
2990 	ut_ad(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
2991 
2992 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
2993 
2994 	if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
2995 	    && (ibuf_debug || buf_debug_execute_is_force_flush())) {
2996 
2997 		/* Try to evict the block from the buffer pool, to use the
2998 		insert buffer (change buffer) as much as possible. */
2999 
3000 		buf_pool_mutex_enter(buf_pool);
3001 
3002 		buf_block_unfix(fix_block);
3003 
3004 		/* Now we are only holding the buf_pool->mutex,
3005 		not block->mutex or hash_lock. Blocks cannot be
3006 		relocated or enter or exit the buf_pool while we
3007 		are holding the buf_pool->mutex. */
3008 
3009 		if (buf_LRU_free_page(&fix_block->page, true)) {
3010 			buf_pool_mutex_exit(buf_pool);
3011 			rw_lock_x_lock(hash_lock);
3012 
3013 			if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
3014 				/* Set the watch, as it would have
3015 				been set if the page were not in the
3016 				buffer pool in the first place. */
3017 				block = (buf_block_t*) buf_pool_watch_set(
3018 					space, offset, fold);
3019 			} else {
3020 				block = (buf_block_t*) buf_page_hash_get_low(
3021 					buf_pool, space, offset, fold);
3022 			}
3023 
3024 			rw_lock_x_unlock(hash_lock);
3025 
3026 			if (block != NULL) {
3027 				/* Either the page has been read in, or
3028 				a watch was set on it in the window
3029 				between releasing the buf_pool::mutex
3030 				and acquiring the hash_lock above.
3031 				Try again. */
3032 				guess = block;
3033 				goto loop;
3034 			}
3035 
3036 			fprintf(stderr,
3037 				"innodb_change_buffering_debug evict %u %u\n",
3038 				(unsigned) space, (unsigned) offset);
3039 			return(NULL);
3040 		}
3041 
3042 		mutex_enter(&fix_block->mutex);
3043 
3044 		if (buf_flush_page_try(buf_pool, fix_block)) {
3045 			fprintf(stderr,
3046 				"innodb_change_buffering_debug flush %u %u\n",
3047 				(unsigned) space, (unsigned) offset);
3048 			guess = fix_block;
3049 			goto loop;
3050 		}
3051 
3052 		buf_block_mutex_exit(fix_block);
3053 
3054 		buf_block_fix(fix_block);
3055 
3056 		/* Failed to evict the page; change it directly */
3057 
3058 		buf_pool_mutex_exit(buf_pool);
3059 	}
3060 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
3061 
3062 	ut_ad(fix_block->page.buf_fix_count > 0);
3063 
3064 #ifdef UNIV_SYNC_DEBUG
3065 	/* We have already buffer fixed the page, and we are committed to
3066 	returning this page to the caller. Register for debugging. */
3067 	{
3068 		ibool	ret;
3069 		ret = rw_lock_s_lock_nowait(&fix_block->debug_latch, file, line);
3070 		ut_a(ret);
3071 	}
3072 #endif /* UNIV_SYNC_DEBUG */
3073 
3074 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3075 	ut_a(mode == BUF_GET_POSSIBLY_FREED
3076 	     || !fix_block->page.file_page_was_freed);
3077 #endif
3078 	/* Check if this is the first access to the page */
3079 	access_time = buf_page_is_accessed(&fix_block->page);
3080 
3081 	/* This is a heuristic and we don't care about ordering issues. */
3082 	if (access_time == 0) {
3083 		buf_block_mutex_enter(fix_block);
3084 
3085 		buf_page_set_accessed(&fix_block->page);
3086 
3087 		buf_block_mutex_exit(fix_block);
3088 	}
3089 
3090 	if (mode != BUF_PEEK_IF_IN_POOL) {
3091 		buf_page_make_young_if_needed(&fix_block->page);
3092 	}
3093 
3094 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3095 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
3096 	ut_a(fix_block->page.buf_fix_count > 0);
3097 	ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
3098 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3099 
3100 #ifdef PAGE_ATOMIC_REF_COUNT
3101 	/* We have to wait here because the IO_READ state was set
3102 	under the protection of the hash_lock and the block->mutex
3103 	but not the block->lock. */
3104 	buf_wait_for_read(fix_block);
3105 #endif /* PAGE_ATOMIC_REF_COUNT */
3106 
3107 	switch (rw_latch) {
3108 	case RW_NO_LATCH:
3109 
3110 #ifndef PAGE_ATOMIC_REF_COUNT
3111 		buf_wait_for_read(fix_block);
3112 #endif /* !PAGE_ATOMIC_REF_COUNT */
3113 
3114 		fix_type = MTR_MEMO_BUF_FIX;
3115 		break;
3116 
3117 	case RW_S_LATCH:
3118 		rw_lock_s_lock_inline(&fix_block->lock, 0, file, line);
3119 
3120 		fix_type = MTR_MEMO_PAGE_S_FIX;
3121 		break;
3122 
3123 	default:
3124 		ut_ad(rw_latch == RW_X_LATCH);
3125 		rw_lock_x_lock_inline(&fix_block->lock, 0, file, line);
3126 
3127 		fix_type = MTR_MEMO_PAGE_X_FIX;
3128 		break;
3129 	}
3130 
3131 	mtr_memo_push(mtr, fix_block, fix_type);
3132 
3133 	if (mode != BUF_PEEK_IF_IN_POOL && !access_time) {
3134 		/* In the case of a first access, try to apply linear
3135 		read-ahead */
3136 
3137 		buf_read_ahead_linear(
3138 			space, zip_size, offset, ibuf_inside(mtr));
3139 	}
3140 
3141 #ifdef UNIV_IBUF_COUNT_DEBUG
3142 	ut_a(ibuf_count_get(buf_block_get_space(fix_block),
3143 			    buf_block_get_page_no(fix_block)) == 0);
3144 #endif
3145 #ifdef UNIV_SYNC_DEBUG
3146 	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
3147 	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
3148 #endif /* UNIV_SYNC_DEBUG */
3149 	return(fix_block);
3150 }
3151 
3152 /********************************************************************//**
3153 This is the general function used to get optimistic access to a database
3154 page.
3155 @return	TRUE if success */
3156 UNIV_INTERN
3157 ibool
3158 buf_page_optimistic_get(
3159 /*====================*/
3160 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
3161 	buf_block_t*	block,	/*!< in: guessed buffer block */
3162 	ib_uint64_t	modify_clock,/*!< in: modify clock value */
3163 	const char*	file,	/*!< in: file name */
3164 	ulint		line,	/*!< in: line where called */
3165 	mtr_t*		mtr)	/*!< in: mini-transaction */
3166 {
3167 	buf_pool_t*	buf_pool;
3168 	unsigned	access_time;
3169 	ibool		success;
3170 	ulint		fix_type;
3171 
3172 	ut_ad(block);
3173 	ut_ad(mtr);
3174 	ut_ad(mtr->state == MTR_ACTIVE);
3175 	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
3176 
3177 	mutex_enter(&block->mutex);
3178 
3179 	if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
3180 
3181 		mutex_exit(&block->mutex);
3182 
3183 		return(FALSE);
3184 	}
3185 
3186 	buf_block_buf_fix_inc(block, file, line);
3187 
3188 	access_time = buf_page_is_accessed(&block->page);
3189 
3190 	buf_page_set_accessed(&block->page);
3191 
3192 	mutex_exit(&block->mutex);
3193 
3194 	buf_page_make_young_if_needed(&block->page);
3195 
3196 	ut_ad(!ibuf_inside(mtr)
3197 	      || ibuf_page(buf_block_get_space(block),
3198 			   buf_block_get_zip_size(block),
3199 			   buf_block_get_page_no(block), NULL));
3200 
3201 	if (rw_latch == RW_S_LATCH) {
3202 		success = rw_lock_s_lock_nowait(&(block->lock),
3203 						file, line);
3204 		fix_type = MTR_MEMO_PAGE_S_FIX;
3205 	} else {
3206 		success = rw_lock_x_lock_func_nowait_inline(&(block->lock),
3207 							    file, line);
3208 		fix_type = MTR_MEMO_PAGE_X_FIX;
3209 	}
3210 
3211 	if (UNIV_UNLIKELY(!success)) {
3212 		buf_block_buf_fix_dec(block);
3213 
3214 		return(FALSE);
3215 	}
3216 
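	/* block->modify_clock is advanced whenever the block is modified
	in a way that could invalidate pointers into its frame; if it no
	longer matches the value the caller recorded, the guess is stale. */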
3217 	if (UNIV_UNLIKELY(modify_clock != block->modify_clock)) {
3218 		buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
3219 
3220 		if (rw_latch == RW_S_LATCH) {
3221 			rw_lock_s_unlock(&(block->lock));
3222 		} else {
3223 			rw_lock_x_unlock(&(block->lock));
3224 		}
3225 
3226 		buf_block_buf_fix_dec(block);
3227 
3228 		return(FALSE);
3229 	}
3230 
3231 	mtr_memo_push(mtr, block, fix_type);
3232 
3233 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3234 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
3235 	ut_a(block->page.buf_fix_count > 0);
3236 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3237 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3238 
3239 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3240 	mutex_enter(&block->mutex);
3241 	ut_a(!block->page.file_page_was_freed);
3242 	mutex_exit(&block->mutex);
3243 #endif
3244 
3245 	if (!access_time) {
3246 		/* In the case of a first access, try to apply linear
3247 		read-ahead */
3248 
3249 		buf_read_ahead_linear(buf_block_get_space(block),
3250 				      buf_block_get_zip_size(block),
3251 				      buf_block_get_page_no(block),
3252 				      ibuf_inside(mtr));
3253 	}
3254 
3255 #ifdef UNIV_IBUF_COUNT_DEBUG
3256 	ut_a(ibuf_count_get(buf_block_get_space(block),
3257 			    buf_block_get_page_no(block)) == 0);
3258 #endif
3259 	buf_pool = buf_pool_from_block(block);
3260 	buf_pool->stat.n_page_gets++;
3261 
3262 	return(TRUE);
3263 }
3264 
3265 /********************************************************************//**
3266 This is used to get access to a known database page, when no waiting can be
3267 done. For example, if a search in an adaptive hash index leads us to this
3268 frame.
3269 @return	TRUE if success */
3270 UNIV_INTERN
3271 ibool
3272 buf_page_get_known_nowait(
3273 /*======================*/
3274 	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
3275 	buf_block_t*	block,	/*!< in: the known page */
3276 	ulint		mode,	/*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
3277 	const char*	file,	/*!< in: file name */
3278 	ulint		line,	/*!< in: line where called */
3279 	mtr_t*		mtr)	/*!< in: mini-transaction */
3280 {
3281 	buf_pool_t*	buf_pool;
3282 	ibool		success;
3283 	ulint		fix_type;
3284 
3285 	ut_ad(mtr);
3286 	ut_ad(mtr->state == MTR_ACTIVE);
3287 	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
3288 
3289 	mutex_enter(&block->mutex);
3290 
3291 	if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
3292 		/* Another thread is just freeing the block from the LRU list
3293 		of the buffer pool: do not try to access this page; this
3294 		attempt to access the page can only come through the hash
3295 		index because when the buffer block state is ..._REMOVE_HASH,
3296 		we have already removed it from the page address hash table
3297 		of the buffer pool. */
3298 
3299 		mutex_exit(&block->mutex);
3300 
3301 		return(FALSE);
3302 	}
3303 
3304 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3305 
3306 	buf_block_buf_fix_inc(block, file, line);
3307 
3308 	buf_page_set_accessed(&block->page);
3309 
3310 	mutex_exit(&block->mutex);
3311 
3312 	buf_pool = buf_pool_from_block(block);
3313 
3314 	if (mode == BUF_MAKE_YOUNG) {
3315 		buf_page_make_young_if_needed(&block->page);
3316 	}
3317 
3318 	ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD);
3319 
3320 	if (rw_latch == RW_S_LATCH) {
3321 		success = rw_lock_s_lock_nowait(&(block->lock),
3322 						file, line);
3323 		fix_type = MTR_MEMO_PAGE_S_FIX;
3324 	} else {
3325 		success = rw_lock_x_lock_func_nowait_inline(&(block->lock),
3326 							    file, line);
3327 		fix_type = MTR_MEMO_PAGE_X_FIX;
3328 	}
3329 
3330 	if (!success) {
3331 		buf_block_buf_fix_dec(block);
3332 
3333 		return(FALSE);
3334 	}
3335 
3336 	mtr_memo_push(mtr, block, fix_type);
3337 
3338 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3339 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
3340 	ut_a(block->page.buf_fix_count > 0);
3341 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3342 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3343 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3344 	if (mode != BUF_KEEP_OLD) {
3345 		/* If mode == BUF_KEEP_OLD, we are executing an I/O
3346 		completion routine.  Avoid a bogus assertion failure
3347 		when ibuf_merge_or_delete_for_page() is processing a
3348 		page that was just freed due to DROP INDEX, or
3349 		deleting a record from SYS_INDEXES. This check will be
3350 		skipped in recv_recover_page() as well. */
3351 
3352 		mutex_enter(&block->mutex);
3353 		ut_a(!block->page.file_page_was_freed);
3354 		mutex_exit(&block->mutex);
3355 	}
3356 #endif
3357 
3358 #ifdef UNIV_IBUF_COUNT_DEBUG
3359 	ut_a((mode == BUF_KEEP_OLD)
3360 	     || (ibuf_count_get(buf_block_get_space(block),
3361 				buf_block_get_page_no(block)) == 0));
3362 #endif
3363 	buf_pool->stat.n_page_gets++;
3364 
3365 	return(TRUE);
3366 }
3367 
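/* Illustrative usage sketch (added; not part of the original source).
A caller that has obtained a block pointer, e.g. from the adaptive hash
index, could latch it without waiting roughly as follows; "block" is
assumed to come from the caller's context:

	mtr_t	mtr;

	mtr_start(&mtr);

	if (buf_page_get_known_nowait(RW_S_LATCH, block, BUF_MAKE_YOUNG,
				      __FILE__, __LINE__, &mtr)) {
		access block->frame under the S-latch here
	}

	mtr_commit(&mtr);	releases the latch and the buffer-fix
*/
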
3368 /*******************************************************************//**
3369 Given a tablespace id and page number, tries to get that page. If the
3370 page is not in the buffer pool it is not loaded; NULL is returned instead.
3371 Suitable for use while holding the lock_sys_t::mutex.
3372 @return	pointer to a page or NULL */
3373 UNIV_INTERN
3374 const buf_block_t*
3375 buf_page_try_get_func(
3376 /*==================*/
3377 	ulint		space_id,/*!< in: tablespace id */
3378 	ulint		page_no,/*!< in: page number */
3379 	const char*	file,	/*!< in: file name */
3380 	ulint		line,	/*!< in: line where called */
3381 	mtr_t*		mtr)	/*!< in: mini-transaction */
3382 {
3383 	buf_block_t*	block;
3384 	ibool		success;
3385 	ulint		fix_type;
3386 	buf_pool_t*	buf_pool = buf_pool_get(space_id, page_no);
3387 	rw_lock_t*	hash_lock;
3388 
3389 	ut_ad(mtr);
3390 	ut_ad(mtr->state == MTR_ACTIVE);
3391 
3392 	block = buf_block_hash_get_s_locked(buf_pool, space_id,
3393 					    page_no, &hash_lock);
3394 
3395 	if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
3396 		if (block) {
3397 			rw_lock_s_unlock(hash_lock);
3398 		}
3399 		return(NULL);
3400 	}
3401 
3402 	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
3403 
3404 	mutex_enter(&block->mutex);
3405 	rw_lock_s_unlock(hash_lock);
3406 
3407 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3408 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3409 	ut_a(buf_block_get_space(block) == space_id);
3410 	ut_a(buf_block_get_page_no(block) == page_no);
3411 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3412 
3413 	buf_block_buf_fix_inc(block, file, line);
3414 	mutex_exit(&block->mutex);
3415 
3416 	fix_type = MTR_MEMO_PAGE_S_FIX;
3417 	success = rw_lock_s_lock_nowait(&block->lock, file, line);
3418 
3419 	if (!success) {
3420 		/* Let us try to get an X-latch. If the current thread
3421 		is holding an X-latch on the page, we cannot get an
3422 		S-latch. */
3423 
3424 		fix_type = MTR_MEMO_PAGE_X_FIX;
3425 		success = rw_lock_x_lock_func_nowait_inline(&block->lock,
3426 							    file, line);
3427 	}
3428 
3429 	if (!success) {
3430 		buf_block_buf_fix_dec(block);
3431 
3432 		return(NULL);
3433 	}
3434 
3435 	mtr_memo_push(mtr, block, fix_type);
3436 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3437 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
3438 	ut_a(block->page.buf_fix_count > 0);
3439 	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
3440 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3441 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3442 	mutex_enter(&block->mutex);
3443 	ut_a(!block->page.file_page_was_freed);
3444 	mutex_exit(&block->mutex);
3445 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
3446 	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
3447 
3448 	buf_pool->stat.n_page_gets++;
3449 
3450 #ifdef UNIV_IBUF_COUNT_DEBUG
3451 	ut_a(ibuf_count_get(buf_block_get_space(block),
3452 			    buf_block_get_page_no(block)) == 0);
3453 #endif
3454 
3455 	return(block);
3456 }
3457 
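/* Illustrative usage sketch (added; not part of the original source).
The lock system can peek at a resident page without i/o while holding
lock_sys_t::mutex; the buf_page_try_get() wrapper assumed below is
expected to supply __FILE__ and __LINE__ to buf_page_try_get_func():

	const buf_block_t*	block;
	mtr_t			mtr;

	mtr_start(&mtr);

	block = buf_page_try_get(space_id, page_no, &mtr);

	if (block != NULL) {
		the page was in the pool; it stays buffer-fixed and
		latched until the mini-transaction commits
	}

	mtr_commit(&mtr);
*/
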
3458 /********************************************************************//**
3459 Initialize some fields of a control block. */
3460 UNIV_INLINE
3461 void
3462 buf_page_init_low(
3463 /*==============*/
3464 	buf_page_t*	bpage)	/*!< in: block to init */
3465 {
3466 	bpage->flush_type = BUF_FLUSH_LRU;
3467 	bpage->io_fix = BUF_IO_NONE;
3468 	bpage->buf_fix_count = 0;
3469 	bpage->freed_page_clock = 0;
3470 	bpage->access_time = 0;
3471 	bpage->newest_modification = 0;
3472 	bpage->oldest_modification = 0;
3473 	HASH_INVALIDATE(bpage, hash);
3474 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3475 	bpage->file_page_was_freed = FALSE;
3476 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
3477 }
3478 
3479 /********************************************************************//**
3480 Inits a page to the buffer buf_pool. */
3481 static MY_ATTRIBUTE((nonnull))
3482 void
3483 buf_page_init(
3484 /*==========*/
3485 	buf_pool_t*	buf_pool,/*!< in/out: buffer pool */
3486 	ulint		space,	/*!< in: space id */
3487 	ulint		offset,	/*!< in: offset of the page within space
3488 				in units of a page */
3489 	ulint		fold,	/*!< in: buf_page_address_fold(space,offset) */
3490 	ulint		zip_size,/*!< in: compressed page size, or 0 */
3491 	buf_block_t*	block)	/*!< in/out: block to init */
3492 {
3493 	buf_page_t*	hash_page;
3494 
3495 	ut_ad(buf_pool == buf_pool_get(space, offset));
3496 	ut_ad(buf_pool_mutex_own(buf_pool));
3497 
3498 	ut_ad(mutex_own(&(block->mutex)));
3499 	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
3500 
3501 #ifdef UNIV_SYNC_DEBUG
3502 	ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, fold),
3503 			  RW_LOCK_EX));
3504 #endif /* UNIV_SYNC_DEBUG */
3505 
3506 	/* Set the state of the block */
3507 	buf_block_set_file_page(block, space, offset);
3508 
3509 #ifdef UNIV_DEBUG_VALGRIND
3510 	if (!space) {
3511 		/* Silence valid Valgrind warnings about uninitialized
3512 		data being written to data files.  There are some unused
3513 		bytes on some pages that InnoDB does not initialize. */
3514 		UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE);
3515 	}
3516 #endif /* UNIV_DEBUG_VALGRIND */
3517 
3518 	buf_block_init_low(block);
3519 
3520 	block->lock_hash_val = lock_rec_hash(space, offset);
3521 
3522 	buf_page_init_low(&block->page);
3523 
3524 	/* Insert into the hash table of file pages */
3525 
3526 	hash_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
3527 
3528 	if (hash_page == NULL) {
3529 		/* Block not found in the hash table */
3530 	} else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) {
3531 		ib_uint32_t	buf_fix_count = hash_page->buf_fix_count;
3532 
3533 		ut_a(buf_fix_count > 0);
3534 
3535 #ifdef PAGE_ATOMIC_REF_COUNT
3536 		os_atomic_increment_uint32(
3537 			&block->page.buf_fix_count, buf_fix_count);
3538 #else
3539 		block->page.buf_fix_count += ulint(buf_fix_count);
3540 #endif /* PAGE_ATOMIC_REF_COUNT */
3541 
3542 		buf_pool_watch_remove(buf_pool, fold, hash_page);
3543 	} else {
3544 		fprintf(stderr,
3545 			"InnoDB: Error: page %lu %lu already found"
3546 			" in the hash table: %p, %p\n",
3547 			(ulong) space,
3548 			(ulong) offset,
3549 			(const void*) hash_page, (const void*) block);
3550 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3551 		mutex_exit(&block->mutex);
3552 		buf_pool_mutex_exit(buf_pool);
3553 		buf_print();
3554 		buf_LRU_print();
3555 		buf_validate();
3556 		buf_LRU_validate();
3557 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3558 		ut_error;
3559 	}
3560 
3561 	ut_ad(!block->page.in_zip_hash);
3562 	ut_ad(!block->page.in_page_hash);
3563 	ut_d(block->page.in_page_hash = TRUE);
3564 
3565 	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, &block->page);
3566 
3567 	if (zip_size) {
3568 		page_zip_set_size(&block->page.zip, zip_size);
3569 	}
3570 }
3571 
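/* Note (added): buf_page_init() relies on its callers to hold
buf_pool->mutex, block->mutex and the x-latched page hash lock for
"fold", as the assertions at the top of the function check. Both
callers in this file, buf_page_init_for_read() and buf_page_create(),
follow this protocol. */
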
3572 /********************************************************************//**
3573 Inits a page for read to the buffer buf_pool. If the page is
3574 (1) already in buf_pool, or
3575 (2) if we specify to read only ibuf pages and the page is not an ibuf page, or
3576 (3) if the space is deleted or being deleted,
3577 then this function does nothing.
3578 Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
3579 on the buffer frame. The io-handler must take care that the flag is cleared
3580 and the lock released later.
3581 @return	pointer to the block or NULL */
3582 UNIV_INTERN
3583 buf_page_t*
3584 buf_page_init_for_read(
3585 /*===================*/
3586 	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
3587 	ulint		mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
3588 	ulint		space,	/*!< in: space id */
3589 	ulint		zip_size,/*!< in: compressed page size, or 0 */
3590 	ibool		unzip,	/*!< in: TRUE=request uncompressed page */
3591 	ib_int64_t	tablespace_version,
3592 				/*!< in: prevents reading from a wrong
3593 				version of the tablespace in case we have done
3594 				DISCARD + IMPORT */
3595 	ulint		offset)	/*!< in: page number */
3596 {
3597 	buf_block_t*	block;
3598 	buf_page_t*	bpage	= NULL;
3599 	buf_page_t*	watch_page;
3600 	rw_lock_t*	hash_lock;
3601 	mtr_t		mtr;
3602 	ulint		fold;
3603 	ibool		lru	= FALSE;
3604 	void*		data;
3605 	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
3606 
3607 	ut_ad(buf_pool);
3608 
3609 	*err = DB_SUCCESS;
3610 
3611 	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
3612 		/* It is a read-ahead within an ibuf routine */
3613 
3614 		ut_ad(!ibuf_bitmap_page(zip_size, offset));
3615 
3616 		ibuf_mtr_start(&mtr);
3617 
3618 		if (!recv_no_ibuf_operations
3619 		    && !ibuf_page(space, zip_size, offset, &mtr)) {
3620 
3621 			ibuf_mtr_commit(&mtr);
3622 
3623 			return(NULL);
3624 		}
3625 	} else {
3626 		ut_ad(mode == BUF_READ_ANY_PAGE);
3627 	}
3628 
3629 	if (zip_size && !unzip && !recv_recovery_is_on()) {
3630 		block = NULL;
3631 	} else {
3632 		block = buf_LRU_get_free_block(buf_pool);
3633 		ut_ad(block);
3634 		ut_ad(buf_pool_from_block(block) == buf_pool);
3635 	}
3636 
3637 	fold = buf_page_address_fold(space, offset);
3638 	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
3639 
3640 	buf_pool_mutex_enter(buf_pool);
3641 	rw_lock_x_lock(hash_lock);
3642 
3643 	watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
3644 	if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
3645 		/* The page is already in the buffer pool. */
3646 		watch_page = NULL;
3647 err_exit:
3648 		rw_lock_x_unlock(hash_lock);
3649 		if (block) {
3650 			mutex_enter(&block->mutex);
3651 			buf_LRU_block_free_non_file_page(block);
3652 			mutex_exit(&block->mutex);
3653 		}
3654 
3655 		bpage = NULL;
3656 		goto func_exit;
3657 	}
3658 
3659 	if (fil_tablespace_deleted_or_being_deleted_in_mem(
3660 		    space, tablespace_version)) {
3661 		/* The page belongs to a space which has been
3662 		deleted or is being deleted. */
3663 		*err = DB_TABLESPACE_DELETED;
3664 
3665 		goto err_exit;
3666 	}
3667 
3668 	if (block) {
3669 		bpage = &block->page;
3670 
3671 		mutex_enter(&block->mutex);
3672 
3673 		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
3674 
3675 		buf_page_init(buf_pool, space, offset, fold, zip_size, block);
3676 
3677 #ifdef PAGE_ATOMIC_REF_COUNT
3678 		/* Note: We set the io state without the protection of
3679 		the block->lock. This is because other threads cannot
3680 		access this block unless it is in the hash table. */
3681 
3682 		buf_page_set_io_fix(bpage, BUF_IO_READ);
3683 #endif /* PAGE_ATOMIC_REF_COUNT */
3684 
3685 		rw_lock_x_unlock(hash_lock);
3686 
3687 		/* The block must be put to the LRU list, to the old blocks */
3688 		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
3689 
3690 		/* We set a pass-type x-lock on the frame because then
3691 		the same thread which called for the read operation
3692 		(and is running now at this point of code) can wait
3693 		for the read to complete by waiting for the x-lock on
3694 		the frame; if the x-lock were recursive, the same
3695 		thread would illegally get the x-lock before the page
3696 		read is completed.  The x-lock is cleared by the
3697 		io-handler thread. */
3698 
3699 		rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
3700 
3701 #ifndef PAGE_ATOMIC_REF_COUNT
3702 		buf_page_set_io_fix(bpage, BUF_IO_READ);
3703 #endif /* !PAGE_ATOMIC_REF_COUNT */
3704 
3705 		if (zip_size) {
3706 			/* buf_pool->mutex may be released and
3707 			reacquired by buf_buddy_alloc().  Thus, we
3708 			must release block->mutex in order not to
3709 			break the latching order in the reacquisition
3710 			of buf_pool->mutex.  We also must defer this
3711 			operation until after the block descriptor has
3712 			been added to buf_pool->LRU and
3713 			buf_pool->page_hash. */
3714 			mutex_exit(&block->mutex);
3715 			data = buf_buddy_alloc(buf_pool, zip_size, &lru);
3716 			mutex_enter(&block->mutex);
3717 			block->page.zip.data = (page_zip_t*) data;
3718 
3719 			/* To maintain the invariant
3720 			block->in_unzip_LRU_list
3721 			== buf_page_belongs_to_unzip_LRU(&block->page)
3722 			we have to add this block to unzip_LRU
3723 			after block->page.zip.data is set. */
3724 			ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
3725 			buf_unzip_LRU_add_block(block, TRUE);
3726 		}
3727 
3728 		mutex_exit(&block->mutex);
3729 	} else {
3730 		rw_lock_x_unlock(hash_lock);
3731 
3732 		/* The compressed page must be allocated before the
3733 		control block (bpage), in order to avoid the
3734 		invocation of buf_buddy_relocate_block() on
3735 		uninitialized data. */
3736 		data = buf_buddy_alloc(buf_pool, zip_size, &lru);
3737 
3738 		rw_lock_x_lock(hash_lock);
3739 
3740 		/* If buf_buddy_alloc() allocated storage from the LRU list,
3741 		it released and reacquired buf_pool->mutex.  Thus, we must
3742 		check the page_hash again, as it may have been modified. */
3743 		if (UNIV_UNLIKELY(lru)) {
3744 
3745 			watch_page = buf_page_hash_get_low(
3746 				buf_pool, space, offset, fold);
3747 
3748 			if (UNIV_UNLIKELY(watch_page
3749 			    && !buf_pool_watch_is_sentinel(buf_pool,
3750 							   watch_page))) {
3751 
3752 				/* The block was added by some other thread. */
3753 				rw_lock_x_unlock(hash_lock);
3754 				watch_page = NULL;
3755 				buf_buddy_free(buf_pool, data, zip_size);
3756 
3757 				bpage = NULL;
3758 				goto func_exit;
3759 			}
3760 		}
3761 
3762 		bpage = buf_page_alloc_descriptor();
3763 
3764 		/* Initialize the buf_pool pointer. */
3765 		bpage->buf_pool_index = buf_pool_index(buf_pool);
3766 
3767 		page_zip_des_init(&bpage->zip);
3768 		page_zip_set_size(&bpage->zip, zip_size);
3769 		bpage->zip.data = (page_zip_t*) data;
3770 
3771 		mutex_enter(&buf_pool->zip_mutex);
3772 		UNIV_MEM_DESC(bpage->zip.data,
3773 			      page_zip_get_size(&bpage->zip));
3774 
3775 		buf_page_init_low(bpage);
3776 
3777 		bpage->state	= BUF_BLOCK_ZIP_PAGE;
3778 		bpage->space	= static_cast<ib_uint32_t>(space);
3779 		bpage->offset	= static_cast<ib_uint32_t>(offset);
3780 
3781 #ifdef UNIV_DEBUG
3782 		bpage->in_page_hash = FALSE;
3783 		bpage->in_zip_hash = FALSE;
3784 		bpage->in_flush_list = FALSE;
3785 		bpage->in_free_list = FALSE;
3786 		bpage->in_LRU_list = FALSE;
3787 #endif /* UNIV_DEBUG */
3788 
3789 		ut_d(bpage->in_page_hash = TRUE);
3790 
3791 		if (watch_page != NULL) {
3792 
3793 			/* Preserve the reference count. */
3794 			ib_uint32_t	buf_fix_count;
3795 
3796 			buf_fix_count = watch_page->buf_fix_count;
3797 
3798 			ut_a(buf_fix_count > 0);
3799 
3800 #ifdef PAGE_ATOMIC_REF_COUNT
3801 			os_atomic_increment_uint32(
3802 				&bpage->buf_fix_count, buf_fix_count);
3803 #else
3804 			bpage->buf_fix_count += buf_fix_count;
3805 #endif /* PAGE_ATOMIC_REF_COUNT */
3806 
3807 			ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
3808 			buf_pool_watch_remove(buf_pool, fold, watch_page);
3809 		}
3810 
3811 		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold,
3812 			    bpage);
3813 
3814 		rw_lock_x_unlock(hash_lock);
3815 
3816 		/* The block must be put to the LRU list, to the old blocks.
3817 		The zip_size is already set into the page zip */
3818 		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
3819 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3820 		buf_LRU_insert_zip_clean(bpage);
3821 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3822 
3823 		buf_page_set_io_fix(bpage, BUF_IO_READ);
3824 
3825 		mutex_exit(&buf_pool->zip_mutex);
3826 	}
3827 
3828 	buf_pool->n_pend_reads++;
3829 func_exit:
3830 	buf_pool_mutex_exit(buf_pool);
3831 
3832 	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
3833 
3834 		ibuf_mtr_commit(&mtr);
3835 	}
3836 
3838 #ifdef UNIV_SYNC_DEBUG
3839 	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
3840 	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
3841 #endif /* UNIV_SYNC_DEBUG */
3842 
3843 	ut_ad(!bpage || buf_page_in_file(bpage));
3844 	return(bpage);
3845 }
3846 
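/* Illustrative read-path sketch (added; not part of the original
source). An asynchronous read would use the function above roughly as
follows before posting the i/o request; the surrounding variables are
the caller's own:

	dberr_t		err;
	buf_page_t*	bpage;

	bpage = buf_page_init_for_read(&err, BUF_READ_ANY_PAGE, space,
				       zip_size, FALSE,
				       tablespace_version, offset);

	if (bpage != NULL) {
		post the fil_io() read request; the i/o handler thread
		later calls buf_page_io_complete(bpage), which clears
		the BUF_IO_READ fix and releases the x-latch
	}
*/
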
3847 /********************************************************************//**
3848 Initializes a page to the buffer buf_pool. The page is usually not read
3849 from a file even if it cannot be found in the buffer buf_pool. This is one
3850 of the functions which perform the state transition NOT_USED =>
3851 FILE_PAGE on a block (the other is buf_page_get_gen).
3852 @return	pointer to the block, page bufferfixed */
3853 UNIV_INTERN
3854 buf_block_t*
3855 buf_page_create(
3856 /*============*/
3857 	ulint	space,	/*!< in: space id */
3858 	ulint	offset,	/*!< in: offset of the page within space in units of
3859 			a page */
3860 	ulint	zip_size,/*!< in: compressed page size, or 0 */
3861 	mtr_t*	mtr)	/*!< in: mini-transaction handle */
3862 {
3863 	buf_frame_t*	frame;
3864 	buf_block_t*	block;
3865 	ulint		fold;
3866 	buf_block_t*	free_block	= NULL;
3867 	buf_pool_t*	buf_pool	= buf_pool_get(space, offset);
3868 	rw_lock_t*	hash_lock;
3869 
3870 	ut_ad(mtr);
3871 	ut_ad(mtr->state == MTR_ACTIVE);
3872 	ut_ad(space || !zip_size);
3873 
3874 	free_block = buf_LRU_get_free_block(buf_pool);
3875 
3876 	fold = buf_page_address_fold(space, offset);
3877 	hash_lock = buf_page_hash_lock_get(buf_pool, fold);
3878 
3879 	buf_pool_mutex_enter(buf_pool);
3880 	rw_lock_x_lock(hash_lock);
3881 
3882 	block = (buf_block_t*) buf_page_hash_get_low(
3883 		buf_pool, space, offset, fold);
3884 
3885 	if (block
3886 	    && buf_page_in_file(&block->page)
3887 	    && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
3888 #ifdef UNIV_IBUF_COUNT_DEBUG
3889 		ut_a(ibuf_count_get(space, offset) == 0);
3890 #endif
3891 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
3892 		block->page.file_page_was_freed = FALSE;
3893 #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
3894 
3895 		/* Page can be found in buf_pool */
3896 		buf_pool_mutex_exit(buf_pool);
3897 		rw_lock_x_unlock(hash_lock);
3898 
3899 		buf_block_free(free_block);
3900 
3901 		return(buf_page_get_with_no_latch(space, zip_size, offset, mtr));
3902 	}
3903 
3904 	/* If we get here, the page was not in buf_pool: init it there */
3905 
3906 #ifdef UNIV_DEBUG
3907 	if (buf_debug_prints) {
3908 		fprintf(stderr, "Creating space %lu page %lu to buffer\n",
3909 			(ulong) space, (ulong) offset);
3910 	}
3911 #endif /* UNIV_DEBUG */
3912 
3913 	block = free_block;
3914 
3915 	mutex_enter(&block->mutex);
3916 
3917 	buf_page_init(buf_pool, space, offset, fold, zip_size, block);
3918 
3919 	rw_lock_x_unlock(hash_lock);
3920 
3921 	/* The block must be put to the LRU list */
3922 	buf_LRU_add_block(&block->page, FALSE);
3923 
3924 	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
3925 	buf_pool->stat.n_pages_created++;
3926 
3927 	if (zip_size) {
3928 		void*	data;
3929 		ibool	lru;
3930 
3931 		/* Prevent race conditions during buf_buddy_alloc(),
3932 		which may release and reacquire buf_pool->mutex,
3933 		by IO-fixing and X-latching the block. */
3934 
3935 		buf_page_set_io_fix(&block->page, BUF_IO_READ);
3936 		rw_lock_x_lock(&block->lock);
3937 
3938 		mutex_exit(&block->mutex);
3939 		/* buf_pool->mutex may be released and reacquired by
3940 		buf_buddy_alloc().  Thus, we must release block->mutex
3941 		in order not to break the latching order in
3942 		the reacquisition of buf_pool->mutex.  We also must
3943 		defer this operation until after the block descriptor
3944 		has been added to buf_pool->LRU and buf_pool->page_hash. */
3945 		data = buf_buddy_alloc(buf_pool, zip_size, &lru);
3946 		mutex_enter(&block->mutex);
3947 		block->page.zip.data = (page_zip_t*) data;
3948 
3949 		/* To maintain the invariant
3950 		block->in_unzip_LRU_list
3951 		== buf_page_belongs_to_unzip_LRU(&block->page)
3952 		we have to add this block to unzip_LRU after
3953 		block->page.zip.data is set. */
3954 		ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
3955 		buf_unzip_LRU_add_block(block, FALSE);
3956 
3957 		buf_page_set_io_fix(&block->page, BUF_IO_NONE);
3958 		rw_lock_x_unlock(&block->lock);
3959 	}
3960 
3961 	buf_pool_mutex_exit(buf_pool);
3962 
3963 	mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
3964 
3965 	buf_page_set_accessed(&block->page);
3966 
3967 	mutex_exit(&block->mutex);
3968 
3969 	/* Delete possible entries for the page from the insert buffer:
3970 	such can exist if the page belonged to an index which was dropped */
3971 
3972 	ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);
3973 
3974 	frame = block->frame;
3975 
3976 	memset(frame + FIL_PAGE_PREV, 0xff, 4);
3977 	memset(frame + FIL_PAGE_NEXT, 0xff, 4);
3978 	mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
3979 
3980 	/* Reset to zero the file flush lsn field in the page; if the first
3981 	page of an ibdata file is 'created' in this function into the buffer
3982 	pool then we lose the original contents of the file flush lsn stamp.
3983 	Then InnoDB could in a crash recovery print a big, false, corruption
3984 	warning if the stamp contains an lsn bigger than the ib_logfile lsn. */
3985 
3986 	memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
3987 
3988 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3989 	ut_a(++buf_dbg_counter % 5771 || buf_validate());
3990 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3991 #ifdef UNIV_IBUF_COUNT_DEBUG
3992 	ut_a(ibuf_count_get(buf_block_get_space(block),
3993 			    buf_block_get_page_no(block)) == 0);
3994 #endif
3995 	return(block);
3996 }
3997 
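/* Illustrative usage sketch (added; not part of the original source).
A caller that extends a tablespace creates the page in the pool and
formats the frame itself, so no read from disk is needed:

	mtr_t		mtr;
	buf_block_t*	block;

	mtr_start(&mtr);

	block = buf_page_create(space, page_no, 0, &mtr);

	initialize block->frame here; the block stays buffer-fixed
	until mtr_commit(&mtr)
*/
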
3998 /********************************************************************//**
3999 Monitors the buffer page read/write activity, and increments the
4000 corresponding counter if the MONITOR_MODULE_BUF_PAGE (module_buf_page)
4001 module is enabled. */
4002 static
4003 void
4004 buf_page_monitor(
4005 /*=============*/
4006 	const buf_page_t*	bpage,	/*!< in: pointer to the block */
4007 	enum buf_io_fix		io_type)/*!< in: io_fix types */
4008 {
4009 	const byte*	frame;
4010 	monitor_id_t	counter;
4011 
4012 	/* If the counter module is not turned on, just return */
4013 	if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
4014 		return;
4015 	}
4016 
4017 	ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
4018 
4019 	frame = bpage->zip.data
4020 		? bpage->zip.data
4021 		: ((buf_block_t*) bpage)->frame;
4022 
4023 	switch (fil_page_get_type(frame)) {
4024 		ulint	level;
4025 
4026 	case FIL_PAGE_INDEX:
4027 		level = btr_page_get_level_low(frame);
4028 
4029 		/* Check if it is an index page for insert buffer */
4030 		if (btr_page_get_index_id(frame)
4031 		    == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
4032 			if (level == 0) {
4033 				counter = MONITOR_RW_COUNTER(
4034 					io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
4035 			} else {
4036 				counter = MONITOR_RW_COUNTER(
4037 					io_type,
4038 					MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
4039 			}
4040 		} else {
4041 			if (level == 0) {
4042 				counter = MONITOR_RW_COUNTER(
4043 					io_type, MONITOR_INDEX_LEAF_PAGE);
4044 			} else {
4045 				counter = MONITOR_RW_COUNTER(
4046 					io_type, MONITOR_INDEX_NON_LEAF_PAGE);
4047 			}
4048 		}
4049 		break;
4050 
4051 	case FIL_PAGE_UNDO_LOG:
4052 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
4053 		break;
4054 
4055 	case FIL_PAGE_INODE:
4056 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
4057 		break;
4058 
4059 	case FIL_PAGE_IBUF_FREE_LIST:
4060 		counter = MONITOR_RW_COUNTER(io_type,
4061 					     MONITOR_IBUF_FREELIST_PAGE);
4062 		break;
4063 
4064 	case FIL_PAGE_IBUF_BITMAP:
4065 		counter = MONITOR_RW_COUNTER(io_type,
4066 					     MONITOR_IBUF_BITMAP_PAGE);
4067 		break;
4068 
4069 	case FIL_PAGE_TYPE_SYS:
4070 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
4071 		break;
4072 
4073 	case FIL_PAGE_TYPE_TRX_SYS:
4074 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
4075 		break;
4076 
4077 	case FIL_PAGE_TYPE_FSP_HDR:
4078 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
4079 		break;
4080 
4081 	case FIL_PAGE_TYPE_XDES:
4082 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
4083 		break;
4084 
4085 	case FIL_PAGE_TYPE_BLOB:
4086 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
4087 		break;
4088 
4089 	case FIL_PAGE_TYPE_ZBLOB:
4090 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
4091 		break;
4092 
4093 	case FIL_PAGE_TYPE_ZBLOB2:
4094 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
4095 		break;
4096 
4097 	default:
4098 		counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
4099 	}
4100 
4101 	MONITOR_INC_NOCHECK(counter);
4102 }
4103 
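/* Note (added): MONITOR_RW_COUNTER() is assumed to select the _READ or
_WRITTEN variant of the base counter according to io_type, so e.g. a
leaf-page read above would increment MONITOR_INDEX_LEAF_PAGE_READ. */
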
4104 /********************************************************************//**
4105 Marks the table whose space id is given by bpage->space as corrupted.
4106 Also removes the bpage from the LRU list.
4107 @return TRUE if successful */
4108 static
4109 ibool
4110 buf_mark_space_corrupt(
4111 /*===================*/
4112 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
4113 {
4114 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
4115 	const ibool	uncompressed = (buf_page_get_state(bpage)
4116 					== BUF_BLOCK_FILE_PAGE);
4117 	ulint		space = bpage->space;
4118 	ibool		ret = TRUE;
4119 
4120 	/* First unfix and release lock on the bpage */
4121 	buf_pool_mutex_enter(buf_pool);
4122 	mutex_enter(buf_page_get_mutex(bpage));
4123 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
4124 	ut_ad(bpage->buf_fix_count == 0);
4125 
4126 	/* Set BUF_IO_NONE before we remove the block from LRU list */
4127 	buf_page_set_io_fix(bpage, BUF_IO_NONE);
4128 
4129 	if (uncompressed) {
4130 		rw_lock_x_unlock_gen(
4131 			&((buf_block_t*) bpage)->lock,
4132 			BUF_IO_READ);
4133 	}
4134 
4135 	mutex_exit(buf_page_get_mutex(bpage));
4136 
4137 	/* Find the table with specified space id, and mark it corrupted */
4138 	if (dict_set_corrupted_by_space(space)) {
4139 		buf_LRU_free_one_page(bpage);
4140 	} else {
4141 		ret = FALSE;
4142 	}
4143 
4144 	ut_ad(buf_pool->n_pend_reads > 0);
4145 	buf_pool->n_pend_reads--;
4146 
4147 	buf_pool_mutex_exit(buf_pool);
4148 
4149 	return(ret);
4150 }
4151 
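/* Note (added): the return value matters in buf_page_io_complete()
below: when srv_force_recovery does not allow ignoring corruption and
no table can be marked corrupted for the space id, that caller falls
through to ut_error and aborts the server. */
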
4152 /********************************************************************//**
4153 Completes an asynchronous read or write request of a file page to or from
4154 the buffer pool.
4155 @return true if successful */
4156 UNIV_INTERN
4157 bool
4158 buf_page_io_complete(
4159 /*=================*/
4160 	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
4161 {
4162 	enum buf_io_fix	io_type;
4163 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
4164 	const ibool	uncompressed = (buf_page_get_state(bpage)
4165 					== BUF_BLOCK_FILE_PAGE);
4166 
4167 	ut_a(buf_page_in_file(bpage));
4168 
4169 	/* We do not need to protect io_fix here with a mutex in order to
4170 	read it, because this is the only function where we can change the value
4171 	from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
4172 	ensures that this is the only thread that handles the i/o for this
4173 	block. */
4174 
4175 	io_type = buf_page_get_io_fix(bpage);
4176 	ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
4177 
4178 	if (io_type == BUF_IO_READ) {
4179 		ulint	read_page_no;
4180 		ulint	read_space_id;
4181 		byte*	frame;
4182 
4183 		if (buf_page_get_zip_size(bpage)) {
4184 			frame = bpage->zip.data;
4185 			buf_pool->n_pend_unzip++;
4186 			if (uncompressed
4187 			    && !buf_zip_decompress((buf_block_t*) bpage,
4188 						   FALSE)) {
4189 
4190 				buf_pool->n_pend_unzip--;
4191 				goto corrupt;
4192 			}
4193 			buf_pool->n_pend_unzip--;
4194 		} else {
4195 			ut_a(uncompressed);
4196 			frame = ((buf_block_t*) bpage)->frame;
4197 		}
4198 
4199 		/* If this page is not uninitialized and not in the
4200 		doublewrite buffer, then the page number and space id
4201 		should be the same as in block. */
4202 		read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
4203 		read_space_id = mach_read_from_4(
4204 			frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4205 
4206 		if (bpage->space == TRX_SYS_SPACE
4207 		    && buf_dblwr_page_inside(bpage->offset)) {
4208 
4209 			ut_print_timestamp(stderr);
4210 			fprintf(stderr,
4211 				"  InnoDB: Error: reading page %lu\n"
4212 				"InnoDB: which is in the"
4213 				" doublewrite buffer!\n",
4214 				(ulong) bpage->offset);
4215 		} else if (!read_space_id && !read_page_no) {
4216 			/* This is likely an uninitialized page. */
4217 		} else if ((bpage->space
4218 			    && bpage->space != read_space_id)
4219 			   || bpage->offset != read_page_no) {
4220 			/* We did not compare space_id to read_space_id
4221 			if bpage->space == 0, because the field on the
4222 			page may contain garbage in MySQL < 4.1.1,
4223 			which only supported bpage->space == 0. */
4224 
4225 			ut_print_timestamp(stderr);
4226 			fprintf(stderr,
4227 				"  InnoDB: Error: space id and page n:o"
4228 				" stored in the page\n"
4229 				"InnoDB: read in are %lu:%lu,"
4230 				" should be %lu:%lu!\n",
4231 				(ulong) read_space_id, (ulong) read_page_no,
4232 				(ulong) bpage->space,
4233 				(ulong) bpage->offset);
4234 		}
4235 
4236 		/* From version 3.23.38 up we store the page checksum
4237 	to the first 4 bytes of the page end lsn field */
4238 
4239 		if (buf_page_is_corrupted(true, frame,
4240 					  buf_page_get_zip_size(bpage))) {
4241 
4242 			/* Not a real corruption if it was triggered by
4243 			error injection */
4244 			DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
4245 				if (bpage->space > TRX_SYS_SPACE
4246 				    && buf_mark_space_corrupt(bpage)) {
4247 					ib_logf(IB_LOG_LEVEL_INFO,
4248 						"Simulated page corruption");
4249 					return(true);
4250 				}
4251 				goto page_not_corrupt;
4252 				;);
4253 corrupt:
4254 			fprintf(stderr,
4255 				"InnoDB: Database page corruption on disk"
4256 				" or a failed\n"
4257 				"InnoDB: file read of page %lu.\n"
4258 				"InnoDB: You may have to recover"
4259 				" from a backup.\n",
4260 				(ulong) bpage->offset);
4261 			buf_page_print(frame, buf_page_get_zip_size(bpage),
4262 				       BUF_PAGE_PRINT_NO_CRASH);
4263 			fprintf(stderr,
4264 				"InnoDB: Database page corruption on disk"
4265 				" or a failed\n"
4266 				"InnoDB: file read of page %lu.\n"
4267 				"InnoDB: You may have to recover"
4268 				" from a backup.\n",
4269 				(ulong) bpage->offset);
4270 			fputs("InnoDB: It is also possible that"
4271 			      " your operating\n"
4272 			      "InnoDB: system has corrupted its"
4273 			      " own file cache\n"
4274 			      "InnoDB: and rebooting your computer"
4275 			      " removes the\n"
4276 			      "InnoDB: error.\n"
4277 			      "InnoDB: If the corrupt page is an index page\n"
4278 			      "InnoDB: you can also try to"
4279 			      " fix the corruption\n"
4280 			      "InnoDB: by dumping, dropping,"
4281 			      " and reimporting\n"
4282 			      "InnoDB: the corrupt table."
4283 			      " You can use CHECK\n"
4284 			      "InnoDB: TABLE to scan your"
4285 			      " table for corruption.\n"
4286 			      "InnoDB: See also "
4287 			      REFMAN "forcing-innodb-recovery.html\n"
4288 			      "InnoDB: about forcing recovery.\n", stderr);
4289 
4290 			if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
4291 				/* If page space id is larger than TRX_SYS_SPACE
4292 				(0), we will attempt to mark the corresponding
4293 				table as corrupted instead of crashing server */
4294 				if (bpage->space > TRX_SYS_SPACE
4295 				    && buf_mark_space_corrupt(bpage)) {
4296 					return(false);
4297 				} else {
4298 					fputs("InnoDB: Ending processing"
4299 					      " because of"
4300 					      " a corrupt database page.\n",
4301 					      stderr);
4302 
4303 					ut_error;
4304 				}
4305 			}
4306 		}
4307 
4308 		DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
4309 				page_not_corrupt:  bpage = bpage; );
4310 
4311 		if (recv_recovery_is_on()) {
4312 			/* Pages must be uncompressed for crash recovery. */
4313 			ut_a(uncompressed);
4314 			recv_recover_page(TRUE, (buf_block_t*) bpage);
4315 		}
4316 
4317 		if (uncompressed && !recv_no_ibuf_operations) {
4318 			ibuf_merge_or_delete_for_page(
4319 				(buf_block_t*) bpage, bpage->space,
4320 				bpage->offset, buf_page_get_zip_size(bpage),
4321 				TRUE);
4322 		}
4323 	}
4324 
4325 	buf_pool_mutex_enter(buf_pool);
4326 	mutex_enter(buf_page_get_mutex(bpage));
4327 
4328 #ifdef UNIV_IBUF_COUNT_DEBUG
4329 	if (io_type == BUF_IO_WRITE || uncompressed) {
4330 		/* For BUF_IO_READ of compressed-only blocks, the
4331 		buffered operations will be merged by buf_page_get_gen()
4332 		after the block has been uncompressed. */
4333 		ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
4334 	}
4335 #endif
4336 	/* Because the thread which does the unlocking is not the same one
4337 	that did the locking, we use a pass value != 0 in unlock, which simply
4338 	removes the newest lock debug record, without checking the thread
4339 	id. */
4340 
4341 	buf_page_set_io_fix(bpage, BUF_IO_NONE);
4342 
4343 	switch (io_type) {
4344 	case BUF_IO_READ:
4345 		/* NOTE that the call to ibuf may have moved the ownership of
4346 		the x-latch to this OS thread: do not let this confuse you in
4347 		debugging! */
4348 
4349 		ut_ad(buf_pool->n_pend_reads > 0);
4350 		buf_pool->n_pend_reads--;
4351 		buf_pool->stat.n_pages_read++;
4352 
4353 		if (uncompressed) {
4354 			rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
4355 					     BUF_IO_READ);
4356 		}
4357 
4358 		break;
4359 
4360 	case BUF_IO_WRITE:
4361 		/* Write means a flush operation: call the completion
4362 		routine in the flush system */
4363 
4364 		buf_flush_write_complete(bpage);
4365 
4366 		if (uncompressed) {
4367 			rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock,
4368 					     BUF_IO_WRITE);
4369 		}
4370 
4371 		buf_pool->stat.n_pages_written++;
4372 
4373 		break;
4374 
4375 	default:
4376 		ut_error;
4377 	}
4378 
4379 	buf_page_monitor(bpage, io_type);
4380 
4381 #ifdef UNIV_DEBUG
4382 	if (buf_debug_prints) {
4383 		fprintf(stderr, "Has %s page space %lu page no %lu\n",
4384 			io_type == BUF_IO_READ ? "read" : "written",
4385 			(ulong) buf_page_get_space(bpage),
4386 			(ulong) buf_page_get_page_no(bpage));
4387 	}
4388 #endif /* UNIV_DEBUG */
4389 
4390 	mutex_exit(buf_page_get_mutex(bpage));
4391 	buf_pool_mutex_exit(buf_pool);
4392 
4393 	return(true);
4394 }
4395 
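/* Illustrative completion sketch (added; not part of the original
source). An i/o handler thread that has matched a finished aio request
to its block would finish it roughly as follows:

	buf_page_t*	bpage;

	map the completed request to its bpage, then:

	if (buf_page_io_complete(bpage)) {
		the io_fix was cleared and the page latch released;
		any waiter on the frame latch can now proceed
	}
*/
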
4396 /*********************************************************************//**
4397 Asserts that all file pages in the buffer are in a replaceable state.
4398 @return	TRUE */
4399 static
4400 ibool
4401 buf_all_freed_instance(
4402 /*===================*/
4403 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
4404 {
4405 	ulint		i;
4406 	buf_chunk_t*	chunk;
4407 
4408 	ut_ad(buf_pool);
4409 
4410 	buf_pool_mutex_enter(buf_pool);
4411 
4412 	chunk = buf_pool->chunks;
4413 
4414 	for (i = buf_pool->n_chunks; i--; chunk++) {
4415 
4416 		const buf_block_t* block = buf_chunk_not_freed(chunk);
4417 
4418 		if (UNIV_LIKELY_NULL(block)) {
4419 			fprintf(stderr,
4420 				"Page %lu %lu still fixed or dirty\n",
4421 				(ulong) block->page.space,
4422 				(ulong) block->page.offset);
4423 			ut_error;
4424 		}
4425 	}
4426 
4427 	buf_pool_mutex_exit(buf_pool);
4428 
4429 	return(TRUE);
4430 }
4431 
4432 /*********************************************************************//**
4433 Invalidates file pages in one buffer pool instance */
4434 static
4435 void
4436 buf_pool_invalidate_instance(
4437 /*=========================*/
4438 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
4439 {
4440 	ulint		i;
4441 
4442 	buf_pool_mutex_enter(buf_pool);
4443 
4444 	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
4445 
4446 		/* As this function is called during startup and
4447 		during the redo application phase of recovery, InnoDB
4448 		is single threaded (apart from IO helper threads) at
4449 		this stage. No new write batch can be in the
4450 		initialization stage at this point. */
4451 		ut_ad(buf_pool->init_flush[i] == FALSE);
4452 
4453 		/* However, it is possible that a write batch that has
4454 		been posted earlier is still not complete. For buffer
4455 		pool invalidation to proceed we must ensure there is NO
4456 		write activity happening. */
4457 		if (buf_pool->n_flush[i] > 0) {
4458 			buf_flush_t	type = static_cast<buf_flush_t>(i);
4459 
4460 			buf_pool_mutex_exit(buf_pool);
4461 			buf_flush_wait_batch_end(buf_pool, type);
4462 			buf_pool_mutex_enter(buf_pool);
4463 		}
4464 	}
4465 
4466 	buf_pool_mutex_exit(buf_pool);
4467 
4468 	ut_ad(buf_all_freed_instance(buf_pool));
4469 
4470 	buf_pool_mutex_enter(buf_pool);
4471 
4472 	while (buf_LRU_scan_and_free_block(buf_pool, TRUE)) {
4473 	}
4474 
4475 	ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
4476 	ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
4477 
4478 	buf_pool->freed_page_clock = 0;
4479 	buf_pool->LRU_old = NULL;
4480 	buf_pool->LRU_old_len = 0;
4481 
4482 	memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
4483 	buf_refresh_io_stats(buf_pool);
4484 
4485 	buf_pool_mutex_exit(buf_pool);
4486 }
4487 
4488 /*********************************************************************//**
4489 Invalidates the file pages in the buffer pool when an archive recovery is
4490 completed. All the file pages buffered must be in a replaceable state when
4491 this function is called: not latched and not modified. */
4492 UNIV_INTERN
4493 void
4494 buf_pool_invalidate(void)
4495 /*=====================*/
4496 {
4497 	ulint   i;
4498 
4499 	for (i = 0; i < srv_buf_pool_instances; i++) {
4500 		buf_pool_invalidate_instance(buf_pool_from_array(i));
4501 	}
4502 }
4503 
4504 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
4505 /*********************************************************************//**
4506 Validates data in one buffer pool instance
4507 @return	TRUE */
4508 static
4509 ibool
4510 buf_pool_validate_instance(
4511 /*=======================*/
4512 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
4513 {
4514 	buf_page_t*	b;
4515 	buf_chunk_t*	chunk;
4516 	ulint		i;
4517 	ulint		n_lru_flush	= 0;
4518 	ulint		n_page_flush	= 0;
4519 	ulint		n_list_flush	= 0;
4520 	ulint		n_lru		= 0;
4521 	ulint		n_flush		= 0;
4522 	ulint		n_free		= 0;
4523 	ulint		n_zip		= 0;
4524 	ulint		fold		= 0;
4525 	ulint		space		= 0;
4526 	ulint		offset		= 0;
4527 
4528 	ut_ad(buf_pool);
4529 
4530 	buf_pool_mutex_enter(buf_pool);
4531 	hash_lock_x_all(buf_pool->page_hash);
4532 
4533 	chunk = buf_pool->chunks;
4534 
4535 	/* Check the uncompressed blocks. */
4536 
4537 	for (i = buf_pool->n_chunks; i--; chunk++) {
4538 
4539 		ulint		j;
4540 		buf_block_t*	block = chunk->blocks;
4541 
4542 		for (j = chunk->size; j--; block++) {
4543 
4544 			mutex_enter(&block->mutex);
4545 
4546 			switch (buf_block_get_state(block)) {
4547 			case BUF_BLOCK_POOL_WATCH:
4548 			case BUF_BLOCK_ZIP_PAGE:
4549 			case BUF_BLOCK_ZIP_DIRTY:
4550 				/* These should only occur on
4551 				zip_clean, zip_free[], or flush_list. */
4552 				ut_error;
4553 				break;
4554 
4555 			case BUF_BLOCK_FILE_PAGE:
4556 				space = buf_block_get_space(block);
4557 				offset = buf_block_get_page_no(block);
4558 				fold = buf_page_address_fold(space, offset);
4559 				ut_a(buf_page_hash_get_low(buf_pool,
4560 							   space,
4561 							   offset,
4562 							   fold)
4563 				     == &block->page);
4564 
4565 #ifdef UNIV_IBUF_COUNT_DEBUG
4566 				ut_a(buf_page_get_io_fix(&block->page)
4567 				     == BUF_IO_READ
4568 				     || !ibuf_count_get(buf_block_get_space(
4569 								block),
4570 							buf_block_get_page_no(
4571 								block)));
4572 #endif
4573 				switch (buf_page_get_io_fix(&block->page)) {
4574 				case BUF_IO_NONE:
4575 					break;
4576 
4577 				case BUF_IO_WRITE:
4578 					switch (buf_page_get_flush_type(
4579 							&block->page)) {
4580 					case BUF_FLUSH_LRU:
4581 						n_lru_flush++;
4582 						goto assert_s_latched;
4583 					case BUF_FLUSH_SINGLE_PAGE:
4584 						n_page_flush++;
4585 assert_s_latched:
4586 						ut_a(rw_lock_is_locked(
4587 							     &block->lock,
4588 							     RW_LOCK_SHARED));
4589 						break;
4590 					case BUF_FLUSH_LIST:
4591 						n_list_flush++;
4592 						break;
4593 					default:
4594 						ut_error;
4595 					}
4596 
4597 					break;
4598 
4599 				case BUF_IO_READ:
4600 
4601 					ut_a(rw_lock_is_locked(&block->lock,
4602 							       RW_LOCK_EX));
4603 					break;
4604 
4605 				case BUF_IO_PIN:
4606 					break;
4607 				}
4608 
4609 				n_lru++;
4610 				break;
4611 
4612 			case BUF_BLOCK_NOT_USED:
4613 				n_free++;
4614 				break;
4615 
4616 			case BUF_BLOCK_READY_FOR_USE:
4617 			case BUF_BLOCK_MEMORY:
4618 			case BUF_BLOCK_REMOVE_HASH:
4619 				/* do nothing */
4620 				break;
4621 			}
4622 
4623 			mutex_exit(&block->mutex);
4624 		}
4625 	}
4626 
4627 	mutex_enter(&buf_pool->zip_mutex);
4628 
4629 	/* Check clean compressed-only blocks. */
4630 
4631 	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
4632 	     b = UT_LIST_GET_NEXT(list, b)) {
4633 		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
4634 		switch (buf_page_get_io_fix(b)) {
4635 		case BUF_IO_NONE:
4636 		case BUF_IO_PIN:
4637 			/* All clean blocks should be I/O-unfixed. */
4638 			break;
4639 		case BUF_IO_READ:
4640 			/* In buf_LRU_free_page(), we temporarily set
4641 			b->io_fix = BUF_IO_READ for a newly allocated
4642 			control block in order to prevent
4643 			buf_page_get_gen() from decompressing the block. */
4644 			break;
4645 		default:
4646 			ut_error;
4647 			break;
4648 		}
4649 
4650 		/* It is OK to read oldest_modification here because
4651 		we have acquired buf_pool->zip_mutex above which acts
4652 		as the 'block->mutex' for these bpages. */
4653 		ut_a(!b->oldest_modification);
4654 		fold = buf_page_address_fold(b->space, b->offset);
4655 		ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
4656 					   fold) == b);
4657 		n_lru++;
4658 		n_zip++;
4659 	}
4660 
4661 	/* Check dirty blocks. */
4662 
4663 	buf_flush_list_mutex_enter(buf_pool);
4664 	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
4665 	     b = UT_LIST_GET_NEXT(list, b)) {
4666 		ut_ad(b->in_flush_list);
4667 		ut_a(b->oldest_modification);
4668 		n_flush++;
4669 
4670 		switch (buf_page_get_state(b)) {
4671 		case BUF_BLOCK_ZIP_DIRTY:
4672 			n_lru++;
4673 			n_zip++;
4674 			switch (buf_page_get_io_fix(b)) {
4675 			case BUF_IO_NONE:
4676 			case BUF_IO_READ:
4677 			case BUF_IO_PIN:
4678 				break;
4679 			case BUF_IO_WRITE:
4680 				switch (buf_page_get_flush_type(b)) {
4681 				case BUF_FLUSH_LRU:
4682 					n_lru_flush++;
4683 					break;
4684 				case BUF_FLUSH_SINGLE_PAGE:
4685 					n_page_flush++;
4686 					break;
4687 				case BUF_FLUSH_LIST:
4688 					n_list_flush++;
4689 					break;
4690 				default:
4691 					ut_error;
4692 				}
4693 				break;
4694 			}
4695 			break;
4696 		case BUF_BLOCK_FILE_PAGE:
4697 			/* uncompressed page */
4698 			break;
4699 		case BUF_BLOCK_POOL_WATCH:
4700 		case BUF_BLOCK_ZIP_PAGE:
4701 		case BUF_BLOCK_NOT_USED:
4702 		case BUF_BLOCK_READY_FOR_USE:
4703 		case BUF_BLOCK_MEMORY:
4704 		case BUF_BLOCK_REMOVE_HASH:
4705 			ut_error;
4706 			break;
4707 		}
4708 		fold = buf_page_address_fold(b->space, b->offset);
4709 		ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
4710 					   fold) == b);
4711 	}
4712 
4713 	ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
4714 
4715 	hash_unlock_x_all(buf_pool->page_hash);
4716 	buf_flush_list_mutex_exit(buf_pool);
4717 
4718 	mutex_exit(&buf_pool->zip_mutex);
4719 
4720 	if (n_lru + n_free > buf_pool->curr_size + n_zip) {
4721 		fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n",
4722 			(ulong) n_lru, (ulong) n_free,
4723 			(ulong) buf_pool->curr_size, (ulong) n_zip);
4724 		ut_error;
4725 	}
4726 
4727 	ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
4728 	if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
4729 		fprintf(stderr, "Free list len %lu, free blocks %lu\n",
4730 			(ulong) UT_LIST_GET_LEN(buf_pool->free),
4731 			(ulong) n_free);
4732 		ut_error;
4733 	}
4734 
4735 	ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
4736 	ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
4737 	ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
4738 
4739 	buf_pool_mutex_exit(buf_pool);
4740 
4741 	ut_a(buf_LRU_validate());
4742 	ut_a(buf_flush_validate(buf_pool));
4743 
4744 	return(TRUE);
4745 }
4746 
4747 /*********************************************************************//**
4748 Validates the buffer buf_pool data structure.
4749 @return	TRUE */
4750 UNIV_INTERN
4751 ibool
4752 buf_validate(void)
4753 /*==============*/
4754 {
4755 	ulint	i;
4756 
4757 	for (i = 0; i < srv_buf_pool_instances; i++) {
4758 		buf_pool_t*	buf_pool;
4759 
4760 		buf_pool = buf_pool_from_array(i);
4761 
4762 		buf_pool_validate_instance(buf_pool);
4763 	}
4764 	return(TRUE);
4765 }
4766 
4767 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
4768 
4769 #if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
4770 /*********************************************************************//**
4771 Prints info of the buffer buf_pool data structure for one instance. */
4772 static
4773 void
4774 buf_print_instance(
4775 /*===============*/
4776 	buf_pool_t*	buf_pool)
4777 {
4778 	index_id_t*	index_ids;
4779 	ulint*		counts;
4780 	ulint		size;
4781 	ulint		i;
4782 	ulint		j;
4783 	index_id_t	id;
4784 	ulint		n_found;
4785 	buf_chunk_t*	chunk;
4786 	dict_index_t*	index;
4787 
4788 	ut_ad(buf_pool);
4789 
4790 	size = buf_pool->curr_size;
4791 
4792 	index_ids = static_cast<index_id_t*>(
4793 		mem_alloc(size * sizeof *index_ids));
4794 
4795 	counts = static_cast<ulint*>(mem_alloc(sizeof(ulint) * size));
4796 
4797 	buf_pool_mutex_enter(buf_pool);
4798 	buf_flush_list_mutex_enter(buf_pool);
4799 
4800 	fprintf(stderr,
4801 		"buf_pool size %lu\n"
4802 		"database pages %lu\n"
4803 		"free pages %lu\n"
4804 		"modified database pages %lu\n"
4805 		"n pending decompressions %lu\n"
4806 		"n pending reads %lu\n"
4807 		"n pending flush LRU %lu list %lu single page %lu\n"
4808 		"pages made young %lu, not young %lu\n"
4809 		"pages read %lu, created %lu, written %lu\n",
4810 		(ulong) size,
4811 		(ulong) UT_LIST_GET_LEN(buf_pool->LRU),
4812 		(ulong) UT_LIST_GET_LEN(buf_pool->free),
4813 		(ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
4814 		(ulong) buf_pool->n_pend_unzip,
4815 		(ulong) buf_pool->n_pend_reads,
4816 		(ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
4817 		(ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
4818 		(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE],
4819 		(ulong) buf_pool->stat.n_pages_made_young,
4820 		(ulong) buf_pool->stat.n_pages_not_made_young,
4821 		(ulong) buf_pool->stat.n_pages_read,
4822 		(ulong) buf_pool->stat.n_pages_created,
4823 		(ulong) buf_pool->stat.n_pages_written);
4824 
4825 	buf_flush_list_mutex_exit(buf_pool);
4826 
4827 	/* Count the number of blocks belonging to each index in the buffer */
4828 
4829 	n_found = 0;
4830 
4831 	chunk = buf_pool->chunks;
4832 
4833 	for (i = buf_pool->n_chunks; i--; chunk++) {
4834 		buf_block_t*	block		= chunk->blocks;
4835 		ulint		n_blocks	= chunk->size;
4836 
4837 		for (; n_blocks--; block++) {
4838 			const buf_frame_t* frame = block->frame;
4839 
4840 			if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
4841 
4842 				id = btr_page_get_index_id(frame);
4843 
4844 				/* Look for the id in the index_ids array */
4845 				j = 0;
4846 
4847 				while (j < n_found) {
4848 
4849 					if (index_ids[j] == id) {
4850 						counts[j]++;
4851 
4852 						break;
4853 					}
4854 					j++;
4855 				}
4856 
4857 				if (j == n_found) {
4858 					n_found++;
4859 					index_ids[j] = id;
4860 					counts[j] = 1;
4861 				}
4862 			}
4863 		}
4864 	}
4865 
4866 	buf_pool_mutex_exit(buf_pool);
4867 
4868 	for (i = 0; i < n_found; i++) {
4869 		index = dict_index_get_if_in_cache(index_ids[i]);
4870 
4871 		fprintf(stderr,
4872 			"Block count for index %llu in buffer is about %lu",
4873 			(ullint) index_ids[i],
4874 			(ulong) counts[i]);
4875 
4876 		if (index) {
4877 			putc(' ', stderr);
4878 			dict_index_name_print(stderr, NULL, index);
4879 		}
4880 
4881 		putc('\n', stderr);
4882 	}
4883 
4884 	mem_free(index_ids);
4885 	mem_free(counts);
4886 
4887 	ut_a(buf_pool_validate_instance(buf_pool));
4888 }
4889 
4890 /*********************************************************************//**
4891 Prints info of the buffer buf_pool data structure. */
4892 UNIV_INTERN
4893 void
4894 buf_print(void)
4895 /*===========*/
4896 {
4897 	ulint   i;
4898 
4899 	for (i = 0; i < srv_buf_pool_instances; i++) {
4900 		buf_pool_t*	buf_pool;
4901 
4902 		buf_pool = buf_pool_from_array(i);
4903 		buf_print_instance(buf_pool);
4904 	}
4905 }
4906 #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
4907 
4908 #ifdef UNIV_DEBUG
4909 /*********************************************************************//**
4910 Returns the number of latched pages in the buffer pool.
4911 @return	number of latched pages */
4912 UNIV_INTERN
4913 ulint
4914 buf_get_latched_pages_number_instance(
4915 /*==================================*/
4916 	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
4917 {
4918 	buf_page_t*	b;
4919 	ulint		i;
4920 	buf_chunk_t*	chunk;
4921 	ulint		fixed_pages_number = 0;
4922 
4923 	buf_pool_mutex_enter(buf_pool);
4924 
4925 	chunk = buf_pool->chunks;
4926 
4927 	for (i = buf_pool->n_chunks; i--; chunk++) {
4928 		buf_block_t*	block;
4929 		ulint		j;
4930 
4931 		block = chunk->blocks;
4932 
4933 		for (j = chunk->size; j--; block++) {
4934 			if (buf_block_get_state(block)
4935 			    != BUF_BLOCK_FILE_PAGE) {
4936 
4937 				continue;
4938 			}
4939 
4940 			mutex_enter(&block->mutex);
4941 
4942 			if (block->page.buf_fix_count != 0
4943 			    || buf_page_get_io_fix(&block->page)
4944 			    != BUF_IO_NONE) {
4945 				fixed_pages_number++;
4946 			}
4947 
4948 			mutex_exit(&block->mutex);
4949 		}
4950 	}
4951 
4952 	mutex_enter(&buf_pool->zip_mutex);
4953 
4954 	/* Traverse the lists of clean and dirty compressed-only blocks. */
4955 
4956 	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
4957 	     b = UT_LIST_GET_NEXT(list, b)) {
4958 		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
4959 		ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
4960 
4961 		if (b->buf_fix_count != 0
4962 		    || buf_page_get_io_fix(b) != BUF_IO_NONE) {
4963 			fixed_pages_number++;
4964 		}
4965 	}
4966 
4967 	buf_flush_list_mutex_enter(buf_pool);
4968 	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
4969 	     b = UT_LIST_GET_NEXT(list, b)) {
4970 		ut_ad(b->in_flush_list);
4971 
4972 		switch (buf_page_get_state(b)) {
4973 		case BUF_BLOCK_ZIP_DIRTY:
4974 			if (b->buf_fix_count != 0
4975 			    || buf_page_get_io_fix(b) != BUF_IO_NONE) {
4976 				fixed_pages_number++;
4977 			}
4978 			break;
4979 		case BUF_BLOCK_FILE_PAGE:
4980 			/* uncompressed page */
4981 			break;
4982 		case BUF_BLOCK_POOL_WATCH:
4983 		case BUF_BLOCK_ZIP_PAGE:
4984 		case BUF_BLOCK_NOT_USED:
4985 		case BUF_BLOCK_READY_FOR_USE:
4986 		case BUF_BLOCK_MEMORY:
4987 		case BUF_BLOCK_REMOVE_HASH:
4988 			ut_error;
4989 			break;
4990 		}
4991 	}
4992 
4993 	buf_flush_list_mutex_exit(buf_pool);
4994 	mutex_exit(&buf_pool->zip_mutex);
4995 	buf_pool_mutex_exit(buf_pool);
4996 
4997 	return(fixed_pages_number);
4998 }
4999 
5000 /*********************************************************************//**
5001 Returns the number of latched pages in all the buffer pools.
5002 @return	number of latched pages */
5003 UNIV_INTERN
5004 ulint
5005 buf_get_latched_pages_number(void)
5006 /*==============================*/
5007 {
5008 	ulint	i;
5009 	ulint	total_latched_pages = 0;
5010 
5011 	for (i = 0; i < srv_buf_pool_instances; i++) {
5012 		buf_pool_t*	buf_pool;
5013 
5014 		buf_pool = buf_pool_from_array(i);
5015 
5016 		total_latched_pages += buf_get_latched_pages_number_instance(
5017 			buf_pool);
5018 	}
5019 
5020 	return(total_latched_pages);
5021 }
5022 
5023 #endif /* UNIV_DEBUG */
5024 
5025 /*********************************************************************//**
5026 Returns the number of pending buf pool read ios.
5027 @return	number of pending read I/O operations */
5028 UNIV_INTERN
5029 ulint
5030 buf_get_n_pending_read_ios(void)
5031 /*============================*/
5032 {
5033 	ulint	i;
5034 	ulint	pend_ios = 0;
5035 
5036 	for (i = 0; i < srv_buf_pool_instances; i++) {
5037 		pend_ios += buf_pool_from_array(i)->n_pend_reads;
5038 	}
5039 
5040 	return(pend_ios);
5041 }
5042 
5043 /*********************************************************************//**
5044 Returns the ratio, as a percentage, of modified pages in the buffer pool
5045 to database pages in the buffer pool.
5046 @return	modified page percentage ratio */
5047 UNIV_INTERN
5048 ulint
5049 buf_get_modified_ratio_pct(void)
5050 /*============================*/
5051 {
5052 	ulint		ratio;
5053 	ulint		lru_len = 0;
5054 	ulint		free_len = 0;
5055 	ulint		flush_list_len = 0;
5056 
5057 	buf_get_total_list_len(&lru_len, &free_len, &flush_list_len);
5058 
5059 	ratio = (100 * flush_list_len) / (1 + lru_len + free_len);
5060 
5061 	/* 1 + is there to avoid division by zero */
5062 
5063 	return(ratio);
5064 }
5065 
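/* Worked example (added): with flush_list_len = 300, lru_len = 900 and
free_len = 100, the ratio is (100 * 300) / (1 + 900 + 100) = 29 by
integer division, i.e. about 29 per cent of the pool is modified. */
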
5066 /*******************************************************************//**
5067 Aggregates one buffer pool instance's stats into the total buffer pool stats. */
5068 static
5069 void
5070 buf_stats_aggregate_pool_info(
5071 /*==========================*/
5072 	buf_pool_info_t*	total_info,	/*!< in/out: the buffer pool
5073 						info to store aggregated
5074 						result */
5075 	const buf_pool_info_t*	pool_info)	/*!< in: individual buffer pool
5076 						stats info */
5077 {
5078 	ut_a(total_info && pool_info);
5079 
5080 	/* Nothing to copy if total_info is the same as pool_info */
5081 	if (total_info == pool_info) {
5082 		return;
5083 	}
5084 
5085 	total_info->pool_size += pool_info->pool_size;
5086 	total_info->lru_len += pool_info->lru_len;
5087 	total_info->old_lru_len += pool_info->old_lru_len;
5088 	total_info->free_list_len += pool_info->free_list_len;
5089 	total_info->flush_list_len += pool_info->flush_list_len;
5090 	total_info->n_pend_unzip += pool_info->n_pend_unzip;
5091 	total_info->n_pend_reads += pool_info->n_pend_reads;
5092 	total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
5093 	total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
5094 	total_info->n_pages_made_young += pool_info->n_pages_made_young;
5095 	total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
5096 	total_info->n_pages_read += pool_info->n_pages_read;
5097 	total_info->n_pages_created += pool_info->n_pages_created;
5098 	total_info->n_pages_written += pool_info->n_pages_written;
5099 	total_info->n_page_gets += pool_info->n_page_gets;
5100 	total_info->n_ra_pages_read_rnd += pool_info->n_ra_pages_read_rnd;
5101 	total_info->n_ra_pages_read += pool_info->n_ra_pages_read;
5102 	total_info->n_ra_pages_evicted += pool_info->n_ra_pages_evicted;
5103 	total_info->page_made_young_rate += pool_info->page_made_young_rate;
5104 	total_info->page_not_made_young_rate +=
5105 		pool_info->page_not_made_young_rate;
5106 	total_info->pages_read_rate += pool_info->pages_read_rate;
5107 	total_info->pages_created_rate += pool_info->pages_created_rate;
5108 	total_info->pages_written_rate += pool_info->pages_written_rate;
5109 	total_info->n_page_get_delta += pool_info->n_page_get_delta;
5110 	total_info->page_read_delta += pool_info->page_read_delta;
5111 	total_info->young_making_delta += pool_info->young_making_delta;
5112 	total_info->not_young_making_delta += pool_info->not_young_making_delta;
5113 	total_info->pages_readahead_rnd_rate += pool_info->pages_readahead_rnd_rate;
5114 	total_info->pages_readahead_rate += pool_info->pages_readahead_rate;
5115 	total_info->pages_evicted_rate += pool_info->pages_evicted_rate;
5116 	total_info->unzip_lru_len += pool_info->unzip_lru_len;
5117 	total_info->io_sum += pool_info->io_sum;
5118 	total_info->io_cur += pool_info->io_cur;
5119 	total_info->unzip_sum += pool_info->unzip_sum;
5120 	total_info->unzip_cur += pool_info->unzip_cur;
5121 }
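/* Note that in the aggregation above the per-second rates
(page_made_young_rate, pages_read_rate and friends) are summed across
instances rather than averaged, so the totals reflect server-wide
throughput. */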
/*******************************************************************//**
Collect buffer pool stats information for a buffer pool. Also
record aggregated stats if there is more than one buffer pool
in the server */
UNIV_INTERN
void
buf_stats_get_pool_info(
/*====================*/
	buf_pool_t*		buf_pool,	/*!< in: buffer pool */
	ulint			pool_id,	/*!< in: buffer pool ID */
	buf_pool_info_t*	all_pool_info)	/*!< in/out: buffer pool info
						to fill */
{
	buf_pool_info_t*	pool_info;
	time_t			current_time;
	double			time_elapsed;

	/* Find appropriate pool_info to store stats for this buffer pool */
	pool_info = &all_pool_info[pool_id];

	buf_pool_mutex_enter(buf_pool);
	buf_flush_list_mutex_enter(buf_pool);
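
	/* Both mutexes are held so that the list lengths and flush
	counters read below form a consistent snapshot of this
	instance. */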

	pool_info->pool_unique_id = pool_id;

	pool_info->pool_size = buf_pool->curr_size;

	pool_info->lru_len = UT_LIST_GET_LEN(buf_pool->LRU);

	pool_info->old_lru_len = buf_pool->LRU_old_len;

	pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool->free);

	pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool->flush_list);

	pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool->unzip_LRU);

	pool_info->n_pend_reads = buf_pool->n_pend_reads;

	pool_info->n_pending_flush_lru =
		 (buf_pool->n_flush[BUF_FLUSH_LRU]
		  + buf_pool->init_flush[BUF_FLUSH_LRU]);

	pool_info->n_pending_flush_list =
		 (buf_pool->n_flush[BUF_FLUSH_LIST]
		  + buf_pool->init_flush[BUF_FLUSH_LIST]);

	pool_info->n_pending_flush_single_page =
		 (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
		  + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);

	buf_flush_list_mutex_exit(buf_pool);

	current_time = time(NULL);
	time_elapsed = 0.001 + difftime(current_time,
					buf_pool->last_printout_time);
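
	/* The 0.001 above guards against division by zero in the rate
	calculations below when no measurable time has passed since
	the last printout. */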

	pool_info->n_pages_made_young = buf_pool->stat.n_pages_made_young;

	pool_info->n_pages_not_made_young =
		buf_pool->stat.n_pages_not_made_young;

	pool_info->n_pages_read = buf_pool->stat.n_pages_read;

	pool_info->n_pages_created = buf_pool->stat.n_pages_created;

	pool_info->n_pages_written = buf_pool->stat.n_pages_written;

	pool_info->n_page_gets = buf_pool->stat.n_page_gets;

	pool_info->n_ra_pages_read_rnd = buf_pool->stat.n_ra_pages_read_rnd;
	pool_info->n_ra_pages_read = buf_pool->stat.n_ra_pages_read;

	pool_info->n_ra_pages_evicted = buf_pool->stat.n_ra_pages_evicted;

	pool_info->page_made_young_rate =
		 (buf_pool->stat.n_pages_made_young
		  - buf_pool->old_stat.n_pages_made_young) / time_elapsed;

	pool_info->page_not_made_young_rate =
		 (buf_pool->stat.n_pages_not_made_young
		  - buf_pool->old_stat.n_pages_not_made_young) / time_elapsed;

	pool_info->pages_read_rate =
		(buf_pool->stat.n_pages_read
		 - buf_pool->old_stat.n_pages_read) / time_elapsed;

	pool_info->pages_created_rate =
		(buf_pool->stat.n_pages_created
		 - buf_pool->old_stat.n_pages_created) / time_elapsed;

	pool_info->pages_written_rate =
		(buf_pool->stat.n_pages_written
		 - buf_pool->old_stat.n_pages_written) / time_elapsed;

	pool_info->n_page_get_delta = buf_pool->stat.n_page_gets
				      - buf_pool->old_stat.n_page_gets;

	if (pool_info->n_page_get_delta) {
		pool_info->page_read_delta = buf_pool->stat.n_pages_read
					     - buf_pool->old_stat.n_pages_read;

		pool_info->young_making_delta =
			buf_pool->stat.n_pages_made_young
			- buf_pool->old_stat.n_pages_made_young;

		pool_info->not_young_making_delta =
			buf_pool->stat.n_pages_not_made_young
			- buf_pool->old_stat.n_pages_not_made_young;
	}
	pool_info->pages_readahead_rnd_rate =
		 (buf_pool->stat.n_ra_pages_read_rnd
		  - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed;

	pool_info->pages_readahead_rate =
		 (buf_pool->stat.n_ra_pages_read
		  - buf_pool->old_stat.n_ra_pages_read) / time_elapsed;

	pool_info->pages_evicted_rate =
		(buf_pool->stat.n_ra_pages_evicted
		 - buf_pool->old_stat.n_ra_pages_evicted) / time_elapsed;

	pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);

	pool_info->io_sum = buf_LRU_stat_sum.io;

	pool_info->io_cur = buf_LRU_stat_cur.io;

	pool_info->unzip_sum = buf_LRU_stat_sum.unzip;

	pool_info->unzip_cur = buf_LRU_stat_cur.unzip;

	buf_refresh_io_stats(buf_pool);
	buf_pool_mutex_exit(buf_pool);
}

/*********************************************************************//**
Prints info of the buffer i/o. */
UNIV_INTERN
void
buf_print_io_instance(
/*==================*/
	buf_pool_info_t* pool_info,	/*!< in: buffer pool info */
	FILE*		file)		/*!< in/out: buffer where to print */
{
	ut_ad(pool_info);

	fprintf(file,
		"Buffer pool size   %lu\n"
		"Free buffers       %lu\n"
		"Database pages     %lu\n"
		"Old database pages %lu\n"
		"Modified db pages  %lu\n"
		"Pending reads %lu\n"
		"Pending writes: LRU %lu, flush list %lu, single page %lu\n",
		pool_info->pool_size,
		pool_info->free_list_len,
		pool_info->lru_len,
		pool_info->old_lru_len,
		pool_info->flush_list_len,
		pool_info->n_pend_reads,
		pool_info->n_pending_flush_lru,
		pool_info->n_pending_flush_list,
		pool_info->n_pending_flush_single_page);

	fprintf(file,
		"Pages made young %lu, not young %lu\n"
		"%.2f youngs/s, %.2f non-youngs/s\n"
		"Pages read %lu, created %lu, written %lu\n"
		"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
		pool_info->n_pages_made_young,
		pool_info->n_pages_not_made_young,
		pool_info->page_made_young_rate,
		pool_info->page_not_made_young_rate,
		pool_info->n_pages_read,
		pool_info->n_pages_created,
		pool_info->n_pages_written,
		pool_info->pages_read_rate,
		pool_info->pages_created_rate,
		pool_info->pages_written_rate);
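
	/* The hit rate below is reported per mille: 1000 * (1 - pages
	read / page gets) since the last printout. */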
	if (pool_info->n_page_get_delta) {
		fprintf(file,
			"Buffer pool hit rate %lu / 1000,"
			" young-making rate %lu / 1000 not %lu / 1000\n",
			(ulong) (1000 - (1000 * pool_info->page_read_delta
					 / pool_info->n_page_get_delta)),
			(ulong) (1000 * pool_info->young_making_delta
				 / pool_info->n_page_get_delta),
			(ulong) (1000 * pool_info->not_young_making_delta
				 / pool_info->n_page_get_delta));
	} else {
		fputs("No buffer pool page gets since the last printout\n",
		      file);
	}

	/* Statistics about read ahead algorithm */
	fprintf(file, "Pages read ahead %.2f/s,"
		" evicted without access %.2f/s,"
		" Random read ahead %.2f/s\n",
		pool_info->pages_readahead_rate,
		pool_info->pages_evicted_rate,
		pool_info->pages_readahead_rnd_rate);

	/* Print some values to help us with visualizing what is
	happening with LRU eviction. */
	fprintf(file,
		"LRU len: %lu, unzip_LRU len: %lu\n"
		"I/O sum[%lu]:cur[%lu], unzip sum[%lu]:cur[%lu]\n",
		pool_info->lru_len, pool_info->unzip_lru_len,
		pool_info->io_sum, pool_info->io_cur,
		pool_info->unzip_sum, pool_info->unzip_cur);
}

/*********************************************************************//**
Prints info of the buffer i/o. */
UNIV_INTERN
void
buf_print_io(
/*=========*/
	FILE*	file)	/*!< in/out: buffer where to print */
{
	ulint			i;
	buf_pool_info_t*	pool_info;
	buf_pool_info_t*	pool_info_total;

	/* If srv_buf_pool_instances is greater than 1, allocate
	one extra buf_pool_info_t, the last one stores
	aggregated/total values from all pools */
	if (srv_buf_pool_instances > 1) {
		pool_info = (buf_pool_info_t*) mem_zalloc((
			srv_buf_pool_instances + 1) * sizeof *pool_info);

		pool_info_total = &pool_info[srv_buf_pool_instances];
	} else {
		ut_a(srv_buf_pool_instances == 1);

		pool_info_total = pool_info =
			static_cast<buf_pool_info_t*>(
				mem_zalloc(sizeof *pool_info));
	}

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		/* Fetch individual buffer pool info and calculate
		aggregated stats along the way */
		buf_stats_get_pool_info(buf_pool, i, pool_info);

		/* If we have more than one buffer pool, store
		the aggregated stats */
		if (srv_buf_pool_instances > 1) {
			buf_stats_aggregate_pool_info(pool_info_total,
						      &pool_info[i]);
		}
	}

	/* Print the aggregate buffer pool info */
	buf_print_io_instance(pool_info_total, file);

	/* If there is more than one buffer pool, print each individual
	pool's info */
	if (srv_buf_pool_instances > 1) {
		fputs("----------------------\n"
		      "INDIVIDUAL BUFFER POOL INFO\n"
		      "----------------------\n", file);

		for (i = 0; i < srv_buf_pool_instances; i++) {
			fprintf(file, "---BUFFER POOL %lu\n", i);
			buf_print_io_instance(&pool_info[i], file);
		}
	}

	mem_free(pool_info);
}
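
/* A minimal usage sketch (illustrative only): to dump the statistics
to the error log one could call

	buf_print_io(stderr);

which prints the aggregate section first, followed by one block per
instance whenever srv_buf_pool_instances > 1. */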

/**********************************************************************//**
Refreshes the statistics used to print per-second averages. */
UNIV_INTERN
void
buf_refresh_io_stats(
/*=================*/
	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
{
	buf_pool->last_printout_time = ut_time();
	buf_pool->old_stat = buf_pool->stat;
}

/**********************************************************************//**
Refreshes, for every buffer pool instance, the statistics used to
print per-second averages. */
UNIV_INTERN
void
buf_refresh_io_stats_all(void)
/*==========================*/
{
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_refresh_io_stats(buf_pool);
	}
}

/**********************************************************************//**
Check if all pages in all buffer pools are in a replaceable state.
@return	FALSE if not */
UNIV_INTERN
ibool
buf_all_freed(void)
/*===============*/
{
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		if (!buf_all_freed_instance(buf_pool)) {
			return(FALSE);
		}
	}

	return(TRUE);
}

/*********************************************************************//**
Checks how many i/o operations are currently pending for the buffer
pool; the caller typically requires this count to be zero.
@return	number of pending i/o operations */
UNIV_INTERN
ulint
buf_pool_check_no_pending_io(void)
/*==============================*/
{
	ulint		i;
	ulint		pending_io = 0;

	buf_pool_mutex_enter_all();

	for (i = 0; i < srv_buf_pool_instances; i++) {
		const buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		pending_io += buf_pool->n_pend_reads
			      + buf_pool->n_flush[BUF_FLUSH_LRU]
			      + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
			      + buf_pool->n_flush[BUF_FLUSH_LIST];
	}

	buf_pool_mutex_exit_all();

	return(pending_io);
}
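
/* A usage sketch (illustrative, not the exact server call site):
during shutdown one can poll until all pending i/o has drained:

	while (buf_pool_check_no_pending_io() > 0) {
		os_thread_sleep(100000);
	}
*/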

#if 0
Code currently not used
/*********************************************************************//**
Gets the current length of the free list of buffer blocks.
@return	length of the free list */
UNIV_INTERN
ulint
buf_get_free_list_len(void)
/*=======================*/
{
	ulint	len;

	buf_pool_mutex_enter(buf_pool);

	len = UT_LIST_GET_LEN(buf_pool->free);

	buf_pool_mutex_exit(buf_pool);

	return(len);
}
#endif

#else /* !UNIV_HOTBACKUP */
/********************************************************************//**
Initializes a page in the buffer pool, for use in mysqlbackup --restore. */
UNIV_INTERN
void
buf_page_init_for_backup_restore(
/*=============================*/
	ulint		space,	/*!< in: space id */
	ulint		offset,	/*!< in: offset of the page within space
				in units of a page */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	buf_block_t*	block)	/*!< in: block to init */
{
	block->page.state	= BUF_BLOCK_FILE_PAGE;
	block->page.space	= space;
	block->page.offset	= offset;

	page_zip_des_init(&block->page.zip);

	/* We assume that block->page.data has been allocated
	with zip_size == UNIV_PAGE_SIZE. */
	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
	ut_ad(ut_is_2pow(zip_size));
	page_zip_set_size(&block->page.zip, zip_size);
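
	/* For a compressed page, point the compressed data at the
	memory directly after the uncompressed frame, relying on the
	allocation assumption noted above. */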
	if (zip_size) {
		block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
	}
}
#endif /* !UNIV_HOTBACKUP */