/*****************************************************************************

Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2013, 2021, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/**************************************************//**
@file include/buf0buf.h
The database buffer pool high-level routines

Created 11/5/1995 Heikki Tuuri
*******************************************************/

#ifndef buf0buf_h
#define buf0buf_h

/** Magic value to use instead of checksums when they are disabled */
#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL

#include "fil0fil.h"
#include "mtr0types.h"
#include "buf0types.h"
#include "span.h"
#include "assume_aligned.h"
#ifndef UNIV_INNOCHECKSUM
#include "hash0hash.h"
#include "ut0byte.h"
#include "page0types.h"
#include "log0log.h"
#include "srv0srv.h"
#include <ostream>

// Forward declaration
struct fil_addr_t;

/** @name Modes for buf_page_get_gen */
/* @{ */
#define BUF_GET			10	/*!< get always */
#define	BUF_GET_IF_IN_POOL	11	/*!< get if in pool */
#define BUF_PEEK_IF_IN_POOL	12	/*!< get if in pool, do not make
					the block young in the LRU list */
#define BUF_GET_NO_LATCH	14	/*!< get and bufferfix, but
					set no latch; we have
					separated this case, because
					it is error-prone programming
					not to set a latch, and it
					should be used with care */
#define BUF_GET_IF_IN_POOL_OR_WATCH	15
					/*!< Get the page only if it's in the
					buffer pool, if not then set a watch
					on the page. */
#define BUF_GET_POSSIBLY_FREED		16
					/*!< Like BUF_GET, but do not mind
					if the file page has been freed. */
#define BUF_EVICT_IF_IN_POOL	20	/*!< evict a clean block if found */
/* @} */

/** If LRU list of a buf_pool is less than this size then LRU eviction
should not happen. This is because when we do LRU flushing we also put
the blocks on free list. If LRU list is very small then we can end up
in thrashing. */
#define BUF_LRU_MIN_LEN		256

# ifdef UNIV_DEBUG
extern my_bool	buf_disable_resize_buffer_pool_debug; /*!< if TRUE, resizing
					buffer pool is not allowed. */
# endif /* UNIV_DEBUG */

/** buf_page_t::state() values, distinguishing buf_page_t and buf_block_t */
enum buf_page_state
{
  /** available in buf_pool.free or buf_pool.watch */
  BUF_BLOCK_NOT_USED,
  /** allocated for something else than a file page */
  BUF_BLOCK_MEMORY,
  /** a previously allocated file page, in transit to NOT_USED */
  BUF_BLOCK_REMOVE_HASH,
  /** a buf_block_t that is also in buf_pool.LRU */
  BUF_BLOCK_FILE_PAGE,
  /** the buf_page_t of a ROW_FORMAT=COMPRESSED page
  whose uncompressed page frame has been evicted */
  BUF_BLOCK_ZIP_PAGE
};
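
/* A sketch of the usual lifecycle, inferred from the comments above and
from buf_page_create(): a block leaves buf_pool.free as BUF_BLOCK_NOT_USED,
becomes BUF_BLOCK_FILE_PAGE via buf_page_create() or buf_page_get_gen(),
and on eviction passes through BUF_BLOCK_REMOVE_HASH back to
BUF_BLOCK_NOT_USED. BUF_BLOCK_ZIP_PAGE covers compressed-only pages, and
BUF_BLOCK_MEMORY covers blocks allocated for something else than file
pages. */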

/** This structure defines information we will fetch from each buffer pool. It
will be used to print table IO stats */
struct buf_pool_info_t
{
	/* General buffer pool info */
	ulint	pool_size;		/*!< Buffer Pool size in pages */
	ulint	lru_len;		/*!< Length of buf_pool.LRU */
	ulint	old_lru_len;		/*!< buf_pool.LRU_old_len */
	ulint	free_list_len;		/*!< Length of buf_pool.free list */
	ulint	flush_list_len;		/*!< Length of buf_pool.flush_list */
	ulint	n_pend_unzip;		/*!< buf_pool.n_pend_unzip, pages
					pending decompress */
	ulint	n_pend_reads;		/*!< buf_pool.n_pend_reads, pages
					pending read */
	ulint	n_pending_flush_lru;	/*!< Pages pending flush in LRU */
	ulint	n_pending_flush_list;	/*!< Pages pending flush in FLUSH
					LIST */
	ulint	n_pages_made_young;	/*!< number of pages made young */
	ulint	n_pages_not_made_young;	/*!< number of pages not made young */
	ulint	n_pages_read;		/*!< buf_pool.n_pages_read */
	ulint	n_pages_created;	/*!< buf_pool.n_pages_created */
	ulint	n_pages_written;	/*!< buf_pool.n_pages_written */
	ulint	n_page_gets;		/*!< buf_pool.n_page_gets */
	ulint	n_ra_pages_read_rnd;	/*!< buf_pool.n_ra_pages_read_rnd,
					number of pages read ahead */
	ulint	n_ra_pages_read;	/*!< buf_pool.n_ra_pages_read, number
					of pages read ahead */
	ulint	n_ra_pages_evicted;	/*!< buf_pool.n_ra_pages_evicted,
					number of read-ahead pages evicted
					without access */
	ulint	n_page_get_delta;	/*!< num of buffer pool page gets since
					last printout */

	/* Buffer pool access stats */
	double	page_made_young_rate;	/*!< page made young rate in pages
					per second */
	double	page_not_made_young_rate;/*!< page not made young rate
					in pages per second */
	double	pages_read_rate;	/*!< num of pages read per second */
	double	pages_created_rate;	/*!< num of pages created per second */
	double	pages_written_rate;	/*!< num of pages written per second */
	ulint	page_read_delta;	/*!< num of pages read since last
					printout */
	ulint	young_making_delta;	/*!< num of pages made young since
					last printout */
	ulint	not_young_making_delta;	/*!< num of pages not made young since
					last printout */

	/* Statistics about read ahead algorithm.  */
	double	pages_readahead_rnd_rate;/*!< random readahead rate in pages per
					second */
	double	pages_readahead_rate;	/*!< readahead rate in pages per
					second */
	double	pages_evicted_rate;	/*!< rate of readahead page evicted
					without access, in pages per second */

	/* Stats about LRU eviction */
	ulint	unzip_lru_len;		/*!< length of buf_pool.unzip_LRU
					list */
	/* Counters for LRU policy */
	ulint	io_sum;			/*!< buf_LRU_stat_sum.io */
	ulint	io_cur;			/*!< buf_LRU_stat_cur.io, num of IO
					for current interval */
	ulint	unzip_sum;		/*!< buf_LRU_stat_sum.unzip */
	ulint	unzip_cur;		/*!< buf_LRU_stat_cur.unzip, num
					pages decompressed in current
					interval */
};
#endif /* !UNIV_INNOCHECKSUM */

/** Print the given page_id_t object.
@param[in,out]	out	the output stream
@param[in]	page_id	the page_id_t object to be printed
@return the output stream */
std::ostream&
operator<<(
	std::ostream&		out,
	const page_id_t		page_id);

#ifndef UNIV_INNOCHECKSUM
/*********************************************************************//**
Gets the current size of buffer buf_pool in bytes.
@return size in bytes */
UNIV_INLINE
ulint
buf_pool_get_curr_size(void);
/*========================*/

/********************************************************************//**
Allocates a buf_page_t descriptor. This function must succeed. In case
of failure we assert in this function. */
UNIV_INLINE
buf_page_t*
buf_page_alloc_descriptor(void)
/*===========================*/
	MY_ATTRIBUTE((malloc));
/********************************************************************//**
Free a buf_page_t descriptor. */
UNIV_INLINE
void
buf_page_free_descriptor(
/*=====================*/
	buf_page_t*	bpage)	/*!< in: bpage descriptor to free. */
	MY_ATTRIBUTE((nonnull));

/** Allocate a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
inline buf_block_t *buf_block_alloc();
/********************************************************************//**
Frees a buffer block which does not contain a file page. */
UNIV_INLINE
void
buf_block_free(
/*===========*/
	buf_block_t*	block);	/*!< in, own: block to be freed */

/**************************************************************//**
NOTE! The following macros should be used instead of buf_page_get_gen,
to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
in LA! */
#define buf_page_get(ID, SIZE, LA, MTR)					\
	buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, __FILE__, __LINE__, MTR)
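
/* Illustrative sketch only (space_id and page_no are made-up identifiers):
latching a page for reading inside a mini-transaction. */
#if 0
	mtr_t	mtr;
	mtr.start();
	if (buf_block_t* block = buf_page_get(page_id_t(space_id, page_no),
					      0, RW_S_LATCH, &mtr)) {
		/* read the page via buf_block_get_frame(block) */
	}
	mtr.commit();	/* commits the mini-transaction, releasing the latch */
#endif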

/**************************************************************//**
Use these macros to bufferfix a page with no latching. Remember not to
read the contents of the page unless you know it is safe. Do not modify
the contents of the page! We have separated this case, because it is
error-prone programming not to set a latch, and it should be used
with care. */
#define buf_page_get_with_no_latch(ID, SIZE, MTR)	\
	buf_page_get_gen(ID, SIZE, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, \
			 __FILE__, __LINE__, MTR)
/********************************************************************//**
This is the general function used to get optimistic access to a database
page.
@return TRUE if success */
ibool
buf_page_optimistic_get(
/*====================*/
	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
	buf_block_t*	block,	/*!< in: guessed block */
	ib_uint64_t	modify_clock,/*!< in: modify clock value */
	const char*	file,	/*!< in: file name */
	unsigned	line,	/*!< in: line where called */
	mtr_t*		mtr);	/*!< in: mini-transaction */
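
/* A hedged sketch of the optimistic pattern: remember a block and its
modify clock while it is latched; later, attempt to re-latch it without a
page_hash lookup. If the block was evicted or modified in between, the
call returns FALSE and the caller must fall back to buf_page_get(). */
#if 0
	ib_uint64_t	saved_clock = buf_block_get_modify_clock(block);
	/* ... release the latch and do other work ... */
	if (!buf_page_optimistic_get(RW_S_LATCH, block, saved_clock,
				     __FILE__, __LINE__, &mtr)) {
		/* the guess failed; repeat the full lookup */
	}
#endif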

/** Given a tablespace id and page number, tries to get that page. If the
page is not in the buffer pool it is not loaded and NULL is returned.
Suitable for using when holding the lock_sys_t::mutex.
@param[in]	page_id	page id
@param[in]	file	file name
@param[in]	line	line where called
@param[in]	mtr	mini-transaction
@return pointer to a page or NULL */
buf_block_t*
buf_page_try_get_func(
	const page_id_t		page_id,
	const char*		file,
	unsigned		line,
	mtr_t*			mtr);

/** Tries to get a page.
If the page is not in the buffer pool it is not loaded. Suitable for using
when holding the lock_sys_t::mutex.
@param[in]	page_id	page identifier
@param[in]	mtr	mini-transaction
@return the page if in buffer pool, NULL if not */
#define buf_page_try_get(page_id, mtr)	\
	buf_page_try_get_func((page_id), __FILE__, __LINE__, mtr);

/** Get read access to a compressed page (usually of type
FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
The page must be released with buf_page_release_zip().
NOTE: the page is not protected by any latch.  Mutual exclusion has to
be implemented at a higher level.  In other words, all possible
accesses to a given page through this function must be protected by
the same set of mutexes or latches.
@param[in]	page_id		page id
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size
@return pointer to the block */
buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size);

/** Get access to a database page. Buffered redo log may be applied.
@param[in]	page_id			page id
@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in]	guess			guessed block or NULL
@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
@param[in]	file			file name
@param[in]	line			line where called
@param[in]	mtr			mini-transaction
@param[out]	err			DB_SUCCESS or error code
@param[in]	allow_ibuf_merge	Allow change buffer merge while
reading the pages from file.
@return pointer to the block or NULL */
buf_block_t*
buf_page_get_gen(
	const page_id_t		page_id,
	ulint			zip_size,
	ulint			rw_latch,
	buf_block_t*		guess,
	ulint			mode,
	const char*		file,
	unsigned		line,
	mtr_t*			mtr,
	dberr_t*		err = NULL,
	bool			allow_ibuf_merge = false);

/** This is the low level function used to get access to a database page.
@param[in]	page_id			page id
@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in]	guess			guessed block or NULL
@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
@param[in]	file			file name
@param[in]	line			line where called
@param[in]	mtr			mini-transaction
@param[out]	err			DB_SUCCESS or error code
@param[in]	allow_ibuf_merge	Allow change buffer merge while
reading the page from file
@return pointer to the block or NULL */
buf_block_t*
buf_page_get_low(
	const page_id_t		page_id,
	ulint			zip_size,
	ulint			rw_latch,
	buf_block_t*		guess,
	ulint			mode,
	const char*		file,
	unsigned		line,
	mtr_t*			mtr,
	dberr_t*		err,
	bool			allow_ibuf_merge);

/** Initialize a page in the buffer pool. The page is usually not read
from a file even if it cannot be found in the buffer pool. This is one
of the functions which perform a block state transition NOT_USED =>
FILE_PAGE (the other is buf_page_get_gen).
@param[in,out]	space		space object
@param[in]	offset		page number within the tablespace
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out]	mtr		mini-transaction
@param[in,out]	free_block	pre-allocated buffer block
@return pointer to the block, page bufferfixed */
buf_block_t*
buf_page_create(fil_space_t *space, uint32_t offset,
                ulint zip_size, mtr_t *mtr, buf_block_t *free_block);

/********************************************************************//**
Releases a compressed-only page acquired with buf_page_get_zip(). */
UNIV_INLINE
void
buf_page_release_zip(
/*=================*/
	buf_page_t*	bpage);		/*!< in: buffer block */
/********************************************************************//**
Releases a latch, if specified. */
UNIV_INLINE
void
buf_page_release_latch(
/*=====================*/
	buf_block_t*	block,		/*!< in: buffer block */
	ulint		rw_latch);	/*!< in: RW_S_LATCH, RW_X_LATCH,
					RW_NO_LATCH */
/** Move a block to the start of the LRU list. */
void buf_page_make_young(buf_page_t *bpage);
/** Mark the page status as FREED for the given tablespace id and
page number. If the page is not in buffer pool then ignore it.
@param[in,out]	space	tablespace
@param[in]	page	page number
@param[in,out]	mtr	mini-transaction
@param[in]	file	file name
@param[in]	line	line where called */
void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
                   const char *file, unsigned line);

/********************************************************************//**
Reads the freed_page_clock of a buffer block.
@return freed_page_clock */
UNIV_INLINE
unsigned
buf_page_get_freed_page_clock(
/*==========================*/
	const buf_page_t*	bpage)	/*!< in: block */
	MY_ATTRIBUTE((warn_unused_result));
/********************************************************************//**
Reads the freed_page_clock of a buffer block.
@return freed_page_clock */
UNIV_INLINE
unsigned
buf_block_get_freed_page_clock(
/*===========================*/
	const buf_block_t*	block)	/*!< in: block */
	MY_ATTRIBUTE((warn_unused_result));

/** Determine if a block is still close enough to the MRU end of the LRU list
that it is not in danger of getting evicted, which also implies
that it has been accessed recently.
Note that this is for heuristics only and does not reserve the buffer pool
mutex.
@param[in]	bpage		buffer pool page
@return whether bpage is close to MRU end of LRU */
inline bool buf_page_peek_if_young(const buf_page_t *bpage);

/** Determine if a block should be moved to the start of the LRU list if
there is danger of dropping from the buffer pool.
@param[in]	bpage		buffer pool page
@return true if bpage should be made younger */
inline bool buf_page_peek_if_too_old(const buf_page_t *bpage);

/** Move a page to the start of the buffer pool LRU list if it is too old.
@param[in,out]	bpage		buffer pool page */
inline void buf_page_make_young_if_needed(buf_page_t *bpage)
{
	if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) {
		buf_page_make_young(bpage);
	}
}

/********************************************************************//**
Increments the modify clock of a frame by 1. The caller must either
(1) own the buf_pool.mutex while the block bufferfix count is zero, or
(2) own an x-lock on the block. */
UNIV_INLINE
void
buf_block_modify_clock_inc(
/*=======================*/
	buf_block_t*	block);	/*!< in: block */
/********************************************************************//**
Returns the value of the modify clock. The caller must have an s-lock
or x-lock on the block.
@return value */
UNIV_INLINE
ib_uint64_t
buf_block_get_modify_clock(
/*=======================*/
	buf_block_t*	block);	/*!< in: block */
/*******************************************************************//**
Increments the bufferfix count. */
UNIV_INLINE
void
buf_block_buf_fix_inc_func(
/*=======================*/
# ifdef UNIV_DEBUG
	const char*	file,	/*!< in: file name */
	unsigned	line,	/*!< in: line */
# endif /* UNIV_DEBUG */
	buf_block_t*	block)	/*!< in/out: block to bufferfix */
	MY_ATTRIBUTE((nonnull));

# ifdef UNIV_DEBUG
/** Increments the bufferfix count.
@param[in,out]	b	block to bufferfix
@param[in]	f	file name where requested
@param[in]	l	line number where requested */
#  define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
# else /* UNIV_DEBUG */
/** Increments the bufferfix count.
@param[in,out]	b	block to bufferfix
@param[in]	f	file name where requested
@param[in]	l	line number where requested */
#  define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
# endif /* UNIV_DEBUG */
#endif /* !UNIV_INNOCHECKSUM */

/** Check if a buffer is all zeroes.
@param[in]	buf	data to check
@return whether the buffer is all zeroes */
bool buf_is_zeroes(st_::span<const byte> buf);

/** Checks if the page is in crc32 checksum format.
@param[in]	read_buf		database page
@param[in]	checksum_field1		new checksum field
@param[in]	checksum_field2		old checksum field
@return true if the page is in crc32 checksum format. */
bool
buf_page_is_checksum_valid_crc32(
	const byte*			read_buf,
	ulint				checksum_field1,
	ulint				checksum_field2)
	MY_ATTRIBUTE((nonnull(1), warn_unused_result));

/** Checks if the page is in innodb checksum format.
@param[in]	read_buf	database page
@param[in]	checksum_field1	new checksum field
@param[in]	checksum_field2	old checksum field
@return true if the page is in innodb checksum format. */
bool
buf_page_is_checksum_valid_innodb(
	const byte*			read_buf,
	ulint				checksum_field1,
	ulint				checksum_field2)
	MY_ATTRIBUTE((nonnull(1), warn_unused_result));

/** Checks if the page is in none checksum format.
@param[in]	read_buf	database page
@param[in]	checksum_field1	new checksum field
@param[in]	checksum_field2	old checksum field
@return true if the page is in none checksum format. */
bool
buf_page_is_checksum_valid_none(
	const byte*			read_buf,
	ulint				checksum_field1,
	ulint				checksum_field2)
	MY_ATTRIBUTE((nonnull(1), warn_unused_result));

/** Check if a page is corrupt.
@param[in]	check_lsn	whether the LSN should be checked
@param[in]	read_buf	database page
@param[in]	fsp_flags	tablespace flags
@return whether the page is corrupted */
bool
buf_page_is_corrupted(
	bool			check_lsn,
	const byte*		read_buf,
	ulint			fsp_flags)
	MY_ATTRIBUTE((warn_unused_result));

inline void *aligned_malloc(size_t size, size_t align)
{
#ifdef _MSC_VER
  return _aligned_malloc(size, align);
#else
  void *result;
  if (posix_memalign(&result, align, size))
    result= NULL;
  return result;
#endif
}

inline void aligned_free(void *ptr)
{
#ifdef _MSC_VER
  _aligned_free(ptr);
#else
  free(ptr);
#endif
}
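
/* Illustrative only: allocating a page-aligned I/O buffer. The matching
aligned_free() must be used, because on Windows memory from
_aligned_malloc() must not be passed to plain free(). */
#if 0
	byte* io_buf = static_cast<byte*>(
		aligned_malloc(srv_page_size, srv_page_size));
	if (io_buf) {
		/* ... perform page-sized, aligned I/O ... */
		aligned_free(io_buf);
	}
#endif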

/** Read the key version from the page. In full crc32 format, the
key version is stored at bytes 0..3. In other formats, it is stored
at byte offset 26 (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION).
@param[in]	read_buf	database page
@param[in]	fsp_flags	tablespace flags
@return key version of the page. */
inline uint32_t buf_page_get_key_version(const byte* read_buf, ulint fsp_flags)
{
  static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "compatibility");
  return fil_space_t::full_crc32(fsp_flags)
    ? mach_read_from_4(my_assume_aligned<4>(read_buf))
    : mach_read_from_4(my_assume_aligned<2>
		       (read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION));
}

/** Read the compression info from the page. In full crc32 format, the
compression info is in the MSB of the page type. In other formats, it is
stored in the page type itself.
@param[in]	read_buf	database page
@param[in]	fsp_flags	tablespace flags
@return true if page is compressed. */
inline bool buf_page_is_compressed(const byte* read_buf, ulint fsp_flags)
{
  uint16_t page_type= fil_page_get_type(read_buf);
  return fil_space_t::full_crc32(fsp_flags)
    ? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)
    : page_type == FIL_PAGE_PAGE_COMPRESSED;
}

/** Get the compressed or uncompressed size of a full_crc32 page.
@param[in]	buf	page_compressed or uncompressed page
@param[out]	comp	whether the page could be compressed
@param[out]	cr	whether the page could be corrupted
@return the payload size in the file page */
inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr)
{
	uint t = fil_page_get_type(buf);
	uint page_size = uint(srv_page_size);

	if (!(t & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)) {
		return page_size;
	}

	t &= ~(1U << FIL_PAGE_COMPRESS_FCRC32_MARKER);
	t <<= 8;

	if (t < page_size) {
		page_size = t;
		if (comp) {
			*comp = true;
		}
	} else if (cr) {
		*cr = true;
	}

	return page_size;
}
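
/* Worked example (assuming srv_page_size == 16384): if fil_page_get_type()
returns (1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) | 5, the encoded payload
length is 5 << 8 = 1280 bytes, so the function returns 1280 and sets *comp.
If the encoded length were 16384 or more, *cr would be set instead, because
a "compressed" stream at least as large as the page indicates corruption. */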

#ifndef UNIV_INNOCHECKSUM
/** Dump a page to stderr.
@param[in]	read_buf	database page
@param[in]	zip_size	compressed page size, or 0 */
void buf_page_print(const byte* read_buf, ulint zip_size = 0)
	ATTRIBUTE_COLD __attribute__((nonnull));
/********************************************************************//**
Decompress a block.
@return TRUE if successful */
ibool
buf_zip_decompress(
/*===============*/
	buf_block_t*	block,	/*!< in/out: block */
	ibool		check);	/*!< in: TRUE=verify the page checksum */

#ifdef UNIV_DEBUG
/** @return the number of latched pages in the buffer pool */
ulint buf_get_latched_pages_number();
#endif /* UNIV_DEBUG */
/*********************************************************************//**
Prints info of the buffer i/o. */
void
buf_print_io(
/*=========*/
	FILE*	file);	/*!< in: file where to print */
/** Collect buffer pool metadata.
@param[out]	pool_info	buffer pool metadata */
void buf_stats_get_pool_info(buf_pool_info_t *pool_info);

/** Refresh the statistics used to print per-second averages. */
void buf_refresh_io_stats();

/** Invalidate all pages in the buffer pool.
All pages must be in a replaceable state (not modified or latched). */
void buf_pool_invalidate();

/*========================================================================
--------------------------- LOWER LEVEL ROUTINES -------------------------
=========================================================================*/

#ifdef UNIV_DEBUG
/*********************************************************************//**
Adds latch level info for the rw-lock protecting the buffer frame. This
should be called in the debug version after a successful latching of a
page if we know the latching order level of the acquired latch. */
UNIV_INLINE
void
buf_block_dbg_add_level(
/*====================*/
	buf_block_t*	block,	/*!< in: buffer page
				where we have acquired latch */
	latch_level_t	level);	/*!< in: latching order level */
#else /* UNIV_DEBUG */
# define buf_block_dbg_add_level(block, level) /* nothing */
#endif /* UNIV_DEBUG */

#ifdef UNIV_DEBUG
/*********************************************************************//**
Gets a pointer to the memory frame of a block.
@return pointer to the frame */
UNIV_INLINE
buf_frame_t*
buf_block_get_frame(
/*================*/
	const buf_block_t*	block)	/*!< in: pointer to the control block */
	MY_ATTRIBUTE((warn_unused_result));
#else /* UNIV_DEBUG */
# define buf_block_get_frame(block) (block)->frame
#endif /* UNIV_DEBUG */

/*********************************************************************//**
Gets the compressed page descriptor corresponding to an uncompressed page
if applicable. */
#define buf_block_get_page_zip(block) \
	(UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
#define is_buf_block_get_page_zip(block) \
        UNIV_LIKELY_NULL((block)->page.zip.data)

/** Monitor the buffer page read/write activity, and increment corresponding
counter value in MONITOR_MODULE_BUF_PAGE.
@param bpage   buffer page whose read or write was completed
@param io_type BUF_IO_READ or BUF_IO_WRITE */
ATTRIBUTE_COLD __attribute__((nonnull))
void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type);

/** Complete a read request of a file page to buf_pool.
@param bpage    recently read page
@param node     data file
@return whether the operation succeeded
@retval DB_SUCCESS              if the page was read and is not corrupted
@retval DB_PAGE_CORRUPTED       if the checksum fails on a page read
@retval DB_DECRYPTION_FAILED    if the page cannot be decrypted */
dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node);

/** Calculate the aligned buffer pool size based on srv_buf_pool_chunk_unit,
if needed.
@param[in]	size	size in bytes
@return	aligned size */
ulint
buf_pool_size_align(
	ulint	size);

/** Verify that the post-encryption checksum matches the calculated checksum.
This function should be called only if the tablespace contains crypt data
metadata.
@param[in]	page		page frame
@param[in]	fsp_flags	tablespace flags
@return true if page is encrypted and OK, false otherwise */
bool buf_page_verify_crypt_checksum(
	const byte*	page,
	ulint		fsp_flags);

/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
@param[in,out]	page		page to update
@param[in]	size		compressed page size */
void buf_flush_update_zip_checksum(buf_frame_t* page, ulint size);

/** @brief The temporary memory structure.

NOTE! The definition appears here only for other modules of this
directory (buf) to see it. Do not use from outside! */

class buf_tmp_buffer_t
{
  /** whether this slot is reserved */
  std::atomic<bool> reserved;
public:
  /** For encryption, the data needs to be copied to a separate buffer
  before it is encrypted and written. The buffer block itself can be replaced
  while a write of crypt_buf to file is in progress. */
  byte *crypt_buf;
  /** buffer for fil_page_compress(), for flushing page_compressed pages */
  byte *comp_buf;
  /** pointer to resulting buffer after encryption or compression;
  not separately allocated memory */
  byte *out_buf;

  /** Release the slot */
  void release() { reserved.store(false, std::memory_order_relaxed); }

  /** Acquire the slot
  @return whether the slot was acquired */
  bool acquire() { return !reserved.exchange(true, std::memory_order_relaxed);}

  /** Allocate a buffer for encryption, decryption or decompression. */
  void allocate()
  {
    if (!crypt_buf)
      crypt_buf= static_cast<byte*>
      (aligned_malloc(srv_page_size, srv_page_size));
  }
};
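
/* A hedged usage sketch; the slot array and its size (slots, n_slots) are
hypothetical names here, as the real slots are managed inside the buffer
pool implementation. A slot is claimed with acquire(), lazily given a
buffer, and released when the I/O is done. */
#if 0
	buf_tmp_buffer_t* slot = nullptr;
	for (ulint i = 0; i < n_slots; i++) {
		if (slots[i].acquire()) {
			slot = &slots[i];
			break;
		}
	}
	if (slot) {
		slot->allocate();	/* ensure crypt_buf is allocated */
		/* ... encrypt or compress the page into the slot ... */
		slot->release();
	}
#endif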

/** The common buffer control block structure
for compressed and uncompressed frames */

class buf_pool_t;

class buf_page_t
{
  friend buf_pool_t;
  friend buf_block_t;
  /** @name General fields */
  /* @{ */

public: // FIXME: fix fil_iterate()
  /** Page id. Protected by buf_pool.hash_lock_get(id) when
  the page is in buf_pool.page_hash. */
  page_id_t id_;
private:
  /** Count of how many times this block is currently bufferfixed. */
  Atomic_counter<uint32_t> buf_fix_count_;

  /** log sequence number of the START of the log entry written of the
  oldest modification to this block which has not yet been written
  to the data file;

  0 if no modifications are pending;
  1 if no modifications are pending, but the block is in buf_pool.flush_list;
  2 if modifications are pending, but the block is not in buf_pool.flush_list
  (because id().space() is the temporary tablespace). */
  Atomic_relaxed<lsn_t> oldest_modification_;

  /** type of pending I/O operation; protected by buf_pool.mutex
  if in_LRU_list */
  Atomic_relaxed<buf_io_fix> io_fix_;
  /** Block state. @see in_file().
  State transitions between in_file() states and to
  BUF_BLOCK_REMOVE_HASH are protected by buf_pool.hash_lock_get(id)
  when the block is in buf_pool.page_hash.
  Other transitions when in_LRU_list are protected by buf_pool.mutex. */
  buf_page_state state_;

public:
  /** buf_pool.page_hash link; protected by buf_pool.hash_lock_get(id) */
  buf_page_t *hash;
  /* @} */
	page_zip_des_t	zip;		/*!< compressed page; zip.data
					(but not the data it points to) is
					also protected by buf_pool.mutex;
					state == BUF_BLOCK_ZIP_PAGE and
					zip.data == NULL means an active
					buf_pool.watch */

	buf_tmp_buffer_t* slot;		/*!< Slot for temporary memory
					used for encryption/compression
					or NULL */
#ifdef UNIV_DEBUG
  /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
  bool in_zip_hash;
  /** whether this->LRU is in buf_pool.LRU (in_file() holds);
  protected by buf_pool.mutex */
  bool in_LRU_list;
  /** whether this is in buf_pool.page_hash (in_file() holds);
  protected by buf_pool.mutex */
  bool in_page_hash;
  /** whether this->list is in buf_pool.free (state() == BUF_BLOCK_NOT_USED);
  protected by buf_pool.flush_list_mutex */
  bool in_free_list;
#endif /* UNIV_DEBUG */
  /** list member in one of the lists of buf_pool; protected by
  buf_pool.mutex or buf_pool.flush_list_mutex

  state() == BUF_BLOCK_NOT_USED: buf_pool.free or buf_pool.withdraw

  in_file() && oldest_modification():
  buf_pool.flush_list (protected by buf_pool.flush_list_mutex)

  The contents are undefined if in_file() && !oldest_modification(),
  or if state() is BUF_BLOCK_MEMORY or BUF_BLOCK_REMOVE_HASH. */
  UT_LIST_NODE_T(buf_page_t) list;

	/** @name LRU replacement algorithm fields.
	Protected by buf_pool.mutex. */
	/* @{ */

	UT_LIST_NODE_T(buf_page_t) LRU;
					/*!< node of the LRU list */
	unsigned	old:1;		/*!< TRUE if the block is in the old
					blocks in buf_pool.LRU_old */
	unsigned	freed_page_clock:31;/*!< the value of
					buf_pool.freed_page_clock
					when this block was the last
					time put to the head of the
					LRU list; a thread is allowed
					to read this for heuristic
					purposes without holding any
					mutex or latch */
	/* @} */
	Atomic_counter<unsigned> access_time;	/*!< time of first access, or
					0 if the block was never accessed
					in the buffer pool.

					For state==BUF_BLOCK_MEMORY
					blocks, this field can be repurposed
					for something else.

					When this field counts log records
					and bytes allocated for recv_sys.pages,
					the field is protected by
					recv_sys_t::mutex. */
  /** Change buffer entries for the page exist.
  Protected by io_fix()==BUF_IO_READ or by buf_block_t::lock. */
  bool ibuf_exist;

  /** Block initialization status. Can be modified while holding io_fix()
  or buf_block_t::lock X-latch */
  enum {
    /** the page was read normally and should be flushed normally */
    NORMAL = 0,
    /** the page was (re)initialized, and the doublewrite buffer can be
    skipped on the next flush */
    INIT_ON_FLUSH,
    /** the page was freed and needs to be flushed.
    For page_compressed, page flush will punch a hole to free space.
    Else if innodb_immediate_scrub_data_uncompressed, the page will
    be overwritten with zeroes. */
    FREED
  } status;

  buf_page_t() : id_(0)
  {
    static_assert(BUF_BLOCK_NOT_USED == 0, "compatibility");
    memset((void*) this, 0, sizeof *this);
  }

  /** Initialize some fields */
  void init()
  {
    io_fix_= BUF_IO_NONE;
    buf_fix_count_= 0;
    old= 0;
    freed_page_clock= 0;
    access_time= 0;
    oldest_modification_= 0;
    slot= nullptr;
    ibuf_exist= false;
    status= NORMAL;
    ut_d(in_zip_hash= false);
    ut_d(in_free_list= false);
    ut_d(in_LRU_list= false);
    ut_d(in_page_hash= false);
    HASH_INVALIDATE(this, hash);
  }

  /** Initialize some more fields */
  void init(buf_page_state state, page_id_t id, uint32_t buf_fix_count= 0)
  {
    init();
    state_= state;
    id_= id;
    buf_fix_count_= buf_fix_count;
  }

  /** Initialize some more fields */
  void init(page_id_t id, uint32_t buf_fix_count= 0)
  {
    init();
    id_= id;
    buf_fix_count_= buf_fix_count;
  }

public:
  const page_id_t &id() const { return id_; }
  buf_page_state state() const { return state_; }
  uint32_t buf_fix_count() const { return buf_fix_count_; }
  buf_io_fix io_fix() const { return io_fix_; }
  void io_unfix()
  {
    ut_d(const auto old_io_fix= io_fix());
    ut_ad(old_io_fix == BUF_IO_READ || old_io_fix == BUF_IO_PIN);
    io_fix_= BUF_IO_NONE;
  }

  /** @return whether this belongs to buf_pool.unzip_LRU */
  bool belongs_to_unzip_LRU() const
  {
    return zip.data && state() != BUF_BLOCK_ZIP_PAGE;
  }

  inline void add_buf_fix_count(uint32_t count);
  inline void set_buf_fix_count(uint32_t count);
  inline void set_state(buf_page_state state);
  inline void set_io_fix(buf_io_fix io_fix);
  inline void set_corrupt_id();

  /** @return the log sequence number of the oldest pending modification
  @retval 0 if the block is being removed from (or not in) buf_pool.flush_list
  @retval 1 if the block is in buf_pool.flush_list but not modified
  @retval 2 if the block belongs to the temporary tablespace and
  has unwritten changes */
  lsn_t oldest_modification() const { return oldest_modification_; }
  /** @return the log sequence number of the oldest pending modification,
  @retval 0 if the block is definitely not in buf_pool.flush_list
  @retval 1 if the block is in buf_pool.flush_list but not modified
  @retval 2 if the block belongs to the temporary tablespace and
  has unwritten changes */
  lsn_t oldest_modification_acquire() const
  { return oldest_modification_.load(std::memory_order_acquire); }
  /** Set oldest_modification when adding to buf_pool.flush_list */
  inline void set_oldest_modification(lsn_t lsn);
  /** Clear oldest_modification after removing from buf_pool.flush_list */
  inline void clear_oldest_modification();
  /** Note that a block is no longer dirty, while not removing
  it from buf_pool.flush_list */
  inline void clear_oldest_modification(bool temporary);

  /** Notify that a page in a temporary tablespace has been modified. */
  void set_temp_modified()
  {
    ut_ad(fsp_is_system_temporary(id().space()));
    ut_ad(state() == BUF_BLOCK_FILE_PAGE);
    ut_ad(!oldest_modification());
    oldest_modification_= 2;
  }

  /** Prepare to release a file page to buf_pool.free. */
  void free_file_page()
  {
    ut_ad(state() == BUF_BLOCK_REMOVE_HASH);
    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
    ut_d(oldest_modification_= 0;)
    set_corrupt_id();
    ut_d(set_state(BUF_BLOCK_MEMORY));
  }

  void fix() { buf_fix_count_++; }
  uint32_t unfix()
  {
    uint32_t count= buf_fix_count_--;
    ut_ad(count != 0);
    return count - 1;
  }

  /** @return the physical size, in bytes */
  ulint physical_size() const
  {
    return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : srv_page_size;
  }

  /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
  @retval 0 if not compressed */
  ulint zip_size() const
  {
    return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0;
  }

  /** @return the byte offset of the page within a file */
  os_offset_t physical_offset() const
  {
    os_offset_t o= id().page_no();
    return zip.ssize
      ? o << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1))
      : o << srv_page_size_shift;
  }

  /** @return whether the block is mapped to a data file */
  bool in_file() const
  {
    switch (state_) {
    case BUF_BLOCK_ZIP_PAGE:
    case BUF_BLOCK_FILE_PAGE:
      return true;
    case BUF_BLOCK_NOT_USED:
    case BUF_BLOCK_MEMORY:
    case BUF_BLOCK_REMOVE_HASH:
      return false;
    }

    ut_error;
    return false;
  }

  /** @return whether the block is modified and ready for flushing */
  inline bool ready_for_flush() const;
  /** @return whether the state can be changed to BUF_BLOCK_NOT_USED */
  bool ready_for_replace() const
  { return !oldest_modification() && can_relocate(); }
  /** @return whether the block can be relocated in memory.
  The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
  inline bool can_relocate() const;
  /** @return whether the block has been flagged old in buf_pool.LRU */
  inline bool is_old() const;
  /** Set whether a block is old in buf_pool.LRU */
  inline void set_old(bool old);
  /** Flag a page accessed in buf_pool
  @return whether this is not the first access */
  bool set_accessed()
  {
    if (is_accessed()) return true;
    access_time= static_cast<uint32_t>(ut_time_ms());
    return false;
  }
  /** @return ut_time_ms() at the time of first access of a block in buf_pool
  @retval 0 if not accessed */
  unsigned is_accessed() const { ut_ad(in_file()); return access_time; }
};

/** The buffer control block structure */

struct buf_block_t{

	/** @name General fields */
	/* @{ */

	buf_page_t	page;		/*!< page information; this must
					be the first field, so that
					buf_pool.page_hash can point
					to buf_page_t or buf_block_t */
	byte*		frame;		/*!< pointer to buffer frame which
					is of size srv_page_size, and
					aligned to an address divisible by
					srv_page_size */
	rw_lock_t	lock;		/*!< read-write lock of the buffer
					frame */
#ifdef UNIV_DEBUG
  /** whether page.list is in buf_pool.withdraw
  (state() == BUF_BLOCK_NOT_USED) and the buffer pool is being shrunk;
  protected by buf_pool.mutex */
  bool in_withdraw_list;
  /** whether unzip_LRU is in buf_pool.unzip_LRU
  (state() == BUF_BLOCK_FILE_PAGE and zip.data != nullptr);
  protected by buf_pool.mutex */
  bool in_unzip_LRU_list;
#endif
	UT_LIST_NODE_T(buf_block_t) unzip_LRU;
					/*!< node of the decompressed LRU list;
					a block is in the unzip_LRU list
					if page.state() == BUF_BLOCK_FILE_PAGE
					and page.zip.data != NULL */
	/* @} */
	/** @name Optimistic search field */
	/* @{ */

	ib_uint64_t	modify_clock;	/*!< this clock is incremented every
					time a pointer to a record on the
					page may become obsolete; this is
					used in the optimistic cursor
					positioning: if the modify clock has
					not changed, we know that the pointer
					is still valid; this field may be
					changed if the thread (1) owns the
					pool mutex and the page is not
					bufferfixed, or (2) the thread has an
					x-latch on the block */
	/* @} */
#ifdef BTR_CUR_HASH_ADAPT
	/** @name Hash search fields (unprotected)
	NOTE that these fields are NOT protected by any semaphore! */
	/* @{ */

	volatile uint16_t n_bytes;	/*!< recommended prefix length for hash
					search: number of bytes in
					an incomplete last field */
	volatile uint16_t n_fields;	/*!< recommended prefix length for hash
					search: number of full fields */
	uint16_t	n_hash_helps;	/*!< counter which controls building
					of a new hash index for the page */
	volatile bool	left_side;	/*!< true or false, depending on
					whether the leftmost record of several
					records with the same prefix should be
					indexed in the hash index */
	/* @} */

	/** @name Hash search fields
	These 5 fields may only be modified when:
	we are holding the appropriate x-latch in btr_search_latches[], and
	one of the following holds:
	(1) the block state is BUF_BLOCK_FILE_PAGE, and
	we are holding an s-latch or x-latch on buf_block_t::lock, or
	(2) buf_block_t::buf_fix_count == 0, or
	(3) the block state is BUF_BLOCK_REMOVE_HASH.

	An exception to this is when we init or create a page
	in the buffer pool in buf0buf.cc.

	Another exception for buf_pool_t::clear_hash_index() is that
	assigning block->index = NULL (and block->n_pointers = 0)
	is allowed whenever btr_search_own_all(RW_LOCK_X).

	Another exception is that ha_insert_for_fold() may
	decrement n_pointers without holding the appropriate latch
	in btr_search_latches[]. Thus, n_pointers must be
	protected by atomic memory access.

	This implies that the fields may be read without race
	condition whenever any of the following hold:
	- the btr_search_latches[] s-latch or x-latch is being held, or
	- the block state is not BUF_BLOCK_FILE_PAGE or BUF_BLOCK_REMOVE_HASH,
	and holding some latch prevents the state from changing to that.

	Some use of assert_block_ahi_empty() or assert_block_ahi_valid()
	is prone to race conditions while buf_pool_t::clear_hash_index() is
	executing (the adaptive hash index is being disabled). Such use
	is explicitly commented. */

	/* @{ */

# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
	Atomic_counter<ulint>
			n_pointers;	/*!< used in debugging: the number of
					pointers in the adaptive hash index
					pointing to this frame;
					protected by atomic memory access
					or btr_search_own_all(). */
#  define assert_block_ahi_empty(block)					\
	ut_a((block)->n_pointers == 0)
#  define assert_block_ahi_empty_on_init(block) do {			\
	MEM_MAKE_DEFINED(&(block)->n_pointers, sizeof (block)->n_pointers); \
	assert_block_ahi_empty(block);					\
} while (0)
#  define assert_block_ahi_valid(block)					\
	ut_a((block)->index || (block)->n_pointers == 0)
# else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
#  define assert_block_ahi_empty(block) /* nothing */
#  define assert_block_ahi_empty_on_init(block) /* nothing */
#  define assert_block_ahi_valid(block) /* nothing */
# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
	unsigned	curr_n_fields:10;/*!< prefix length for hash indexing:
					number of full fields */
	unsigned	curr_n_bytes:15;/*!< number of bytes in hash
					indexing */
	unsigned	curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
	dict_index_t*	index;		/*!< Index for which the
					adaptive hash index has been
					created, or NULL if the page
					does not exist in the
					index. Note that it does not
					guarantee that the index is
					complete, though: there may
					have been hash collisions,
					record deletions, etc. */
	/* @} */
#else /* BTR_CUR_HASH_ADAPT */
# define assert_block_ahi_empty(block) /* nothing */
# define assert_block_ahi_empty_on_init(block) /* nothing */
# define assert_block_ahi_valid(block) /* nothing */
#endif /* BTR_CUR_HASH_ADAPT */
# ifdef UNIV_DEBUG
	/** @name Debug fields */
	/* @{ */
	rw_lock_t*	debug_latch;	/*!< in the debug version, each thread
					which bufferfixes the block acquires
					an s-latch here; so we can use the
					debug utilities in sync0rw */
	/* @} */
# endif
  void fix() { page.fix(); }
  uint32_t unfix()
  {
    ut_ad(page.buf_fix_count() || page.io_fix() != BUF_IO_NONE ||
          page.state() == BUF_BLOCK_ZIP_PAGE ||
          !rw_lock_own_flagged(&lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S |
                               RW_LOCK_FLAG_SX));
    return page.unfix();
  }

  /** @return the physical size, in bytes */
  ulint physical_size() const { return page.physical_size(); }

  /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
  @retval 0 if not compressed */
  ulint zip_size() const { return page.zip_size(); }

  /** Initialize the block.
  @param page_id  page identifier
  @param zip_size ROW_FORMAT=COMPRESSED page size, or 0
  @param fix      initial buf_fix_count() */
  void initialise(const page_id_t page_id, ulint zip_size, uint32_t fix= 0);
};

/**********************************************************************//**
Compute the hash fold value for blocks in buf_pool.zip_hash. */
/* @{ */
#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift)
#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
/* @} */

/** A "Hazard Pointer" class used to iterate over page lists
inside the buffer pool. A hazard pointer is a buf_page_t pointer
which we intend to iterate over next and which we want to remain valid
even after we release the buffer pool mutex. */
class HazardPointer
{
public:
  virtual ~HazardPointer() {}

  /** @return current value */
  buf_page_t *get() const { mysql_mutex_assert_owner(m_mutex); return m_hp; }

  /** Set current value
  @param bpage buffer block to be set as hp */
  void set(buf_page_t *bpage)
  {
    mysql_mutex_assert_owner(m_mutex);
    ut_ad(!bpage || bpage->in_file());
    m_hp= bpage;
  }

  /** Checks if a bpage is the hp
  @param bpage  buffer block to be compared
  @return true if it is hp */
  bool is_hp(const buf_page_t *bpage) const
  { mysql_mutex_assert_owner(m_mutex); return bpage == m_hp; }

  /** Adjust the value of hp. This happens when some
  other thread working on the same list attempts to
  remove the hp from the list. */
  virtual void adjust(const buf_page_t*) = 0;

#ifdef UNIV_DEBUG
  /** mutex that protects access to the m_hp. */
  const mysql_mutex_t *m_mutex= nullptr;
#endif /* UNIV_DEBUG */

protected:
  /** hazard pointer */
  buf_page_t *m_hp= nullptr;
};

/** Class implementing buf_pool.flush_list hazard pointer */
class FlushHp : public HazardPointer
{
public:
  ~FlushHp() override {}

  /** Adjust the value of hp. This happens when some
  other thread working on the same list attempts to
  remove the hp from the list.
  @param bpage  buffer block to be compared */
  void adjust(const buf_page_t *bpage) override
  {
    ut_ad(bpage != NULL);

    /* We only support reverse traversal for now. */
    if (is_hp(bpage))
      m_hp= UT_LIST_GET_PREV(list, m_hp);

    ut_ad(!m_hp || m_hp->oldest_modification());
  }
};
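
/* A hedged sketch of the hazard pointer protocol for a reverse scan of
buf_pool.flush_list (the loop structure is illustrative, not a real code
path): the scanner publishes the element it intends to visit next, so a
concurrent thread that removes that page from the list calls adjust(),
which moves the pointer to the previous list element. */
#if 0
	mysql_mutex_lock(&buf_pool.flush_list_mutex);
	for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list); b;
	     b = hp.get()) {
		hp.set(UT_LIST_GET_PREV(list, b));
		/* The mutex may be released here for slow work on b; any
		concurrent removal of the published page ends up in
		hp.adjust(), keeping the iteration position valid. */
	}
	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
#endif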

/** Class implementing buf_pool.LRU hazard pointer */
class LRUHp : public HazardPointer {
public:
  ~LRUHp() override {}

  /** Adjust the value of hp. This happens when some
  other thread working on the same list attempts to
  remove the hp from the list.
  @param bpage  buffer block to be compared */
  void adjust(const buf_page_t *bpage) override
  {
    ut_ad(bpage);
    /* We only support reverse traversal for now. */
    if (is_hp(bpage))
      m_hp= UT_LIST_GET_PREV(LRU, m_hp);

    ut_ad(!m_hp || m_hp->in_LRU_list);
  }
};

/** Special purpose iterators to be used when scanning the LRU list.
The idea is that when one thread finishes the scan it leaves the
itr in that position and the other thread can start scan from
there */
class LRUItr : public LRUHp {
public:
  LRUItr() : LRUHp() {}
  ~LRUItr() override {}

  /** Select from where to start a scan. If we have scanned
  too deep into the LRU list it resets the value to the tail
  of the LRU list.
  @return buf_page_t from where to start scan. */
  inline buf_page_t *start();
};

/** Struct that is embedded in the free zip blocks */
struct buf_buddy_free_t {
	union {
		ulint	size;	/*!< size of the block */
		byte	bytes[FIL_PAGE_DATA];
				/*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID]
				== BUF_BUDDY_FREE_STAMP denotes a free
				block. If the space_id field of buddy
				block != BUF_BUDDY_FREE_STAMP, the block
				is not in any zip_free list. If the
				space_id is BUF_BUDDY_FREE_STAMP then
				stamp[0] will contain the
				buddy block size. */
	} stamp;

	buf_page_t	bpage;	/*!< Embedded bpage descriptor */
	UT_LIST_NODE_T(buf_buddy_free_t) list;
				/*!< Node of zip_free list */
};

/** @brief The buffer pool statistics structure. */
struct buf_pool_stat_t{
	ulint	n_page_gets;	/*!< number of page gets performed;
				also successful searches through
				the adaptive hash index are
				counted as page gets; this field
				is NOT protected by the buffer
				pool mutex */
	ulint	n_pages_read;	/*!< number of read operations */
	ulint	n_pages_written;/*!< number of write operations */
	ulint	n_pages_created;/*!< number of pages created
				in the pool with no read */
	ulint	n_ra_pages_read_rnd;/*!< number of pages read in
				as part of random read ahead */
	ulint	n_ra_pages_read;/*!< number of pages read in
				as part of read ahead */
	ulint	n_ra_pages_evicted;/*!< number of read ahead
				pages that are evicted without
				being accessed */
	ulint	n_pages_made_young; /*!< number of pages made young, in
				buf_page_make_young() */
	ulint	n_pages_not_made_young; /*!< number of pages not made
				young because the first access
				was not long enough ago, in
				buf_page_peek_if_too_old() */
	/** number of waits for eviction; writes protected by buf_pool.mutex */
	ulint	LRU_waits;
	ulint	LRU_bytes;	/*!< LRU size in bytes */
	ulint	flush_list_bytes;/*!< flush_list size in bytes */
};

/** Statistics of buddy blocks of a given size. */
struct buf_buddy_stat_t {
	/** Number of blocks allocated from the buddy system. */
	ulint		used;
	/** Number of blocks relocated by the buddy system. */
	ib_uint64_t	relocated;
	/** Total duration of block relocations, in microseconds. */
	ib_uint64_t	relocated_usec;
};

/** The buffer pool */
class buf_pool_t
{
  /** A chunk of buffers */
  struct chunk_t
  {
    /** number of elements in blocks[] */
    size_t size;
    /** memory allocated for the page frames */
    unsigned char *mem;
    /** descriptor of mem */
    ut_new_pfx_t mem_pfx;
    /** array of buffer control blocks */
    buf_block_t *blocks;

    /** Map of first page frame address to chunks[] */
    using map= std::map<const void*, chunk_t*, std::less<const void*>,
                        ut_allocator<std::pair<const void* const,chunk_t*>>>;
    /** Chunk map that may be under construction by buf_resize_thread() */
    static map *map_reg;
    /** Current chunk map for lookup only */
    static map *map_ref;

    /** @return the memory size in bytes */
    size_t mem_size() const { return mem_pfx.m_size; }

    /** Register the chunk */
    void reg() { map_reg->emplace(map::value_type(blocks->frame, this)); }

    /** Allocate a chunk of buffer frames.
    @param bytes    requested size
    @return whether the allocation succeeded */
    inline bool create(size_t bytes);

#ifdef UNIV_DEBUG
    /** Find a block that points to a ROW_FORMAT=COMPRESSED page
    @param data  pointer to the start of a ROW_FORMAT=COMPRESSED page frame
    @return the block
    @retval nullptr  if not found */
    const buf_block_t *contains_zip(const void *data) const
    {
      const buf_block_t *block= blocks;
      for (auto i= size; i--; block++)
        if (block->page.zip.data == data)
          return block;
      return nullptr;
    }

    /** Check that all blocks are in a replaceable state.
    @return address of a non-free block
    @retval nullptr if all freed */
    inline const buf_block_t *not_freed() const;
#endif /* UNIV_DEBUG */
  };

  /** Withdraw blocks from the buffer pool until meeting withdraw_target.
  @return whether retry is needed */
  inline bool withdraw_blocks();

  /** Determine if a pointer belongs to a buf_block_t. It can be a pointer to
  the buf_block_t itself or a member of it.
  @param ptr    a pointer that will not be dereferenced
  @return whether the ptr belongs to a buf_block_t struct */
  bool is_block_field(const void *ptr) const
  {
    const chunk_t *chunk= chunks;
    const chunk_t *const echunk= chunk + ut_min(n_chunks, n_chunks_new);

    /* TODO: protect chunks with a mutex (the older pointer will
    currently remain during resize()) */
    for (; chunk < echunk; chunk++)
      if (ptr >= reinterpret_cast<const void*>(chunk->blocks) &&
          ptr < reinterpret_cast<const void*>(chunk->blocks + chunk->size))
        return true;
    return false;
  }

  /** Try to reallocate a control block.
  @param block  control block to reallocate
  @return whether the reallocation succeeded */
  inline bool realloc(buf_block_t *block);

public:
  bool is_initialised() const { return chunks != nullptr; }

  /** Create the buffer pool.
  @return whether the creation failed */
  bool create();

  /** Clean up after successful create() */
  void close();

  /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
  inline void resize();

  /** @return whether resize() is in progress */
  bool resize_in_progress() const
  {
    return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed));
  }

  /** @return the current size in blocks */
  size_t get_n_pages() const
  {
    ut_ad(is_initialised());
    size_t size= 0;
    for (auto j= n_chunks; j--; )
      size+= chunks[j].size;
    return size;
  }

  /** Determine whether a frame is intended to be withdrawn during resize().
  @param ptr    pointer within a buf_block_t::frame
  @return whether the frame will be withdrawn */
  bool will_be_withdrawn(const byte *ptr) const
  {
    ut_ad(curr_size < old_size);
#ifdef SAFE_MUTEX
    if (resizing.load(std::memory_order_relaxed))
      mysql_mutex_assert_owner(&mutex);
#endif /* SAFE_MUTEX */

    for (const chunk_t *chunk= chunks + n_chunks_new,
         * const echunk= chunks + n_chunks;
         chunk != echunk; chunk++)
      if (ptr >= chunk->blocks->frame &&
          ptr < (chunk->blocks + chunk->size - 1)->frame + srv_page_size)
        return true;
    return false;
  }

  /** Determine whether a block is intended to be withdrawn during resize().
  @param bpage  buffer pool block
  @return whether the frame will be withdrawn */
  bool will_be_withdrawn(const buf_page_t &bpage) const
  {
    ut_ad(curr_size < old_size);
#ifdef SAFE_MUTEX
    if (resizing.load(std::memory_order_relaxed))
      mysql_mutex_assert_owner(&mutex);
#endif /* SAFE_MUTEX */

    for (const chunk_t *chunk= chunks + n_chunks_new,
         * const echunk= chunks + n_chunks;
         chunk != echunk; chunk++)
      if (&bpage >= &chunk->blocks->page &&
          &bpage < &chunk->blocks[chunk->size].page)
        return true;
    return false;
  }

  /** Release and evict a corrupted page.
  @param bpage    page that was being read */
  ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage);

  /** Release a memory block to the buffer pool. */
  ATTRIBUTE_COLD void free_block(buf_block_t *block);

#ifdef UNIV_DEBUG
  /** Find a block that points to a ROW_FORMAT=COMPRESSED page
  @param data  pointer to the start of a ROW_FORMAT=COMPRESSED page frame
  @return the block
  @retval nullptr  if not found */
  const buf_block_t *contains_zip(const void *data) const
  {
    mysql_mutex_assert_owner(&mutex);
    for (const chunk_t *chunk= chunks, * const end= chunks + n_chunks;
         chunk != end; chunk++)
      if (const buf_block_t *block= chunk->contains_zip(data))
        return block;
    return nullptr;
  }

  /** Assert that all buffer pool pages are in a replaceable state */
  void assert_all_freed();
#endif /* UNIV_DEBUG */

#ifdef BTR_CUR_HASH_ADAPT
  /** Clear the adaptive hash index on all pages in the buffer pool. */
  inline void clear_hash_index();

  /** Get a buffer block from an adaptive hash index pointer.
  This function does not return if the block is not identified.
  @param ptr  pointer to within a page frame
  @return pointer to block, never NULL */
  inline buf_block_t *block_from_ahi(const byte *ptr) const;
#endif /* BTR_CUR_HASH_ADAPT */

  bool is_block_lock(const rw_lock_t *l) const
  { return is_block_field(static_cast<const void*>(l)); }

  /**
  @return the smallest oldest_modification lsn for any page
  @retval empty_lsn if all modified persistent pages have been flushed */
  lsn_t get_oldest_modification(lsn_t empty_lsn)
  {
    mysql_mutex_assert_owner(&flush_list_mutex);
    while (buf_page_t *bpage= UT_LIST_GET_LAST(flush_list))
    {
      ut_ad(!fsp_is_system_temporary(bpage->id().space()));
      lsn_t lsn= bpage->oldest_modification();
      if (lsn != 1)
      {
        ut_ad(lsn > 2);
        return lsn;
      }
      delete_from_flush_list(bpage);
    }
    return empty_lsn;
  }
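
  /* Illustrative usage sketch (hypothetical member, compiled out):
  a checkpoint computation would read the oldest modification while
  holding flush_list_mutex, substituting a caller-chosen LSN when the
  flush list is empty; log_sys.get_lsn() is one plausible choice. */
#if 0
  lsn_t checkpoint_candidate_lsn()
  {
    mysql_mutex_lock(&flush_list_mutex);
    /* Blocks carrying the transient value 1 are lazily detached here. */
    const lsn_t lsn= get_oldest_modification(log_sys.get_lsn());
    mysql_mutex_unlock(&flush_list_mutex);
    return lsn;
  }
#endif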

  /** Determine if a buffer block was created by chunk_t::create().
  @param block  block descriptor (not dereferenced)
  @return whether block has been created by chunk_t::create() */
  bool is_uncompressed(const buf_block_t *block) const
  {
    return is_block_field(reinterpret_cast<const void*>(block));
  }

  /** Get the page_hash latch for a page */
  page_hash_latch *hash_lock_get(const page_id_t id) const
  {
    return page_hash.lock_get(id.fold());
  }

  /** Look up a block descriptor.
  @param id    page identifier
  @param fold  id.fold()
  @return block descriptor, possibly in watch[]
  @retval nullptr  if not found */
  buf_page_t *page_hash_get_low(const page_id_t id, const ulint fold)
  {
    ut_ad(id.fold() == fold);
#ifdef SAFE_MUTEX
    DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
                page_hash.lock_get(fold)->is_locked());
#endif /* SAFE_MUTEX */
    buf_page_t *bpage;
    /* Look for the page in the hash table */
    HASH_SEARCH(hash, &page_hash, fold, buf_page_t*, bpage,
                ut_ad(bpage->in_page_hash), id == bpage->id());
    return bpage;
  }
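
  /* Illustrative usage sketch (hypothetical member, compiled out):
  page_hash_get_low() requires buf_pool.mutex or the page_hash latch
  for the fold value, and the fold must come from the same identifier.
  This is essentially what page_hash_contains() does. */
#if 0
  bool is_cached(const page_id_t id)
  {
    const ulint fold= id.fold();
    page_hash_latch *latch= page_hash.lock<false>(fold);
    const buf_page_t *bpage= page_hash_get_low(id, fold);
    const bool found= bpage && !watch_is_sentinel(*bpage);
    latch->read_unlock();
    return found;
  }
#endif
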
private:
  /** Look up a block descriptor.
  @tparam exclusive  whether the latch is to be acquired exclusively
  @tparam watch      whether to allow watch_is_sentinel()
  @param page_id     page identifier
  @param fold        page_id.fold()
  @param hash_lock   pointer to the acquired latch (to be released by caller)
  @return pointer to the block
  @retval nullptr  if no block was found; !hash_lock || !*hash_lock
  will also hold */
  template<bool exclusive,bool watch>
  buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
                                   page_hash_latch **hash_lock)
  {
    ut_ad(hash_lock || !exclusive);
    page_hash_latch *latch= page_hash.lock<exclusive>(fold);
    buf_page_t *bpage= page_hash_get_low(page_id, fold);
    if (!bpage || watch_is_sentinel(*bpage))
    {
      latch->release<exclusive>();
      if (hash_lock)
        *hash_lock= nullptr;
      return watch ? bpage : nullptr;
    }

    ut_ad(bpage->in_file());
    ut_ad(page_id == bpage->id());

    if (hash_lock)
      *hash_lock= latch; /* to be released by the caller */
    else
      latch->release<exclusive>();
    return bpage;
  }
public:
  /** Look up a block descriptor.
  @tparam exclusive  whether the latch is to be acquired exclusively
  @param page_id     page identifier
  @param fold        page_id.fold()
  @param hash_lock   pointer to the acquired latch (to be released by caller)
  @return pointer to the block
  @retval nullptr  if no block was found; !hash_lock || !*hash_lock
  will also hold */
  template<bool exclusive>
  buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
                                   page_hash_latch **hash_lock)
  { return page_hash_get_locked<exclusive,false>(page_id, fold, hash_lock); }

  /** @return whether the buffer pool contains a page
  @tparam watch      whether to allow watch_is_sentinel()
  @param page_id     page identifier */
  template<bool watch= false>
  bool page_hash_contains(const page_id_t page_id)
  {
    return page_hash_get_locked<false,watch>(page_id, page_id.fold(), nullptr);
  }

  /** Determine if a block is a sentinel for a buffer pool watch.
  @param bpage page descriptor
  @return whether bpage is a sentinel for a buffer pool watch */
  bool watch_is_sentinel(const buf_page_t &bpage)
  {
#ifdef SAFE_MUTEX
    DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
                hash_lock_get(bpage.id())->is_locked());
#endif /* SAFE_MUTEX */
    ut_ad(bpage.in_file());

    if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)])
    {
      ut_ad(bpage.state() != BUF_BLOCK_ZIP_PAGE || bpage.zip.data);
      return false;
    }

    ut_ad(bpage.state() == BUF_BLOCK_ZIP_PAGE);
    ut_ad(!bpage.in_zip_hash);
    ut_ad(!bpage.zip.data);
    return true;
  }

  /** Check if a watched page has been read.
  This may only be called after watch_set() has returned nullptr and
  before invoking watch_unset().
  @param id   page identifier
  @return whether the page was read to the buffer pool */
  bool watch_occurred(const page_id_t id)
  {
    const ulint fold= id.fold();
    page_hash_latch *hash_lock= page_hash.lock<false>(fold);
    /* The page must exist because watch_set() increments buf_fix_count. */
    buf_page_t *bpage= page_hash_get_low(id, fold);
    const bool is_sentinel= watch_is_sentinel(*bpage);
    hash_lock->read_unlock();
    return !is_sentinel;
  }

  /** Register a watch for a page identifier. The caller must hold an
  exclusive page hash latch. The *hash_lock may be released,
  relocated, and reacquired.
  @param id         page identifier
  @param hash_lock  exclusively held page_hash latch
  @return a buffer pool block corresponding to id
  @retval nullptr   if the block was not present, and a watch was installed */
  inline buf_page_t *watch_set(const page_id_t id,
                               page_hash_latch **hash_lock);

  /** Stop watching whether a page has been read in.
  watch_set(id) must have returned nullptr before.
  @param id   page identifier */
  void watch_unset(const page_id_t id);

  /** Remove the sentinel block for the watch before replacing it with a
  real block. watch_unset() or watch_occurred() will notice
  that the block has been replaced with the real block.
  @param watch   sentinel */
  inline void watch_remove(buf_page_t *watch);
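
  /* Illustrative sketch of the watch protocol (hypothetical member,
  compiled out): roughly how a caller can detect whether a page was
  read in while a buffered operation was pending. */
#if 0
  void watch_protocol_example(const page_id_t id)
  {
    page_hash_latch *hash_lock= page_hash.lock<true>(id.fold());
    buf_page_t *bpage= watch_set(id, &hash_lock);
    if (hash_lock)
      hash_lock->write_unlock();
    if (bpage)
      return; /* the page was already cached; no watch was installed */
    /* ... buffer the operation against the page ... */
    const bool was_read= watch_occurred(id);
    watch_unset(id);
    /* was_read: the page appeared and the buffered work must be merged */
  }
#endif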

  /** @return whether less than 1/4 of the buffer pool is available */
  bool running_out() const
  {
    return !recv_recovery_is_on() &&
      UNIV_UNLIKELY(UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
                    std::min(curr_size, old_size) / 4);
  }

#ifdef UNIV_DEBUG
  /** Validate the buffer pool. */
  void validate();
#endif /* UNIV_DEBUG */
#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
  /** Write information of the buf_pool to the error log. */
  void print();
#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */

  /** Remove a block from the LRU list.
  @return the predecessor in the LRU list */
  buf_page_t *LRU_remove(buf_page_t *bpage)
  {
    mysql_mutex_assert_owner(&mutex);
    ut_ad(bpage->in_LRU_list);
    ut_ad(bpage->in_page_hash);
    ut_ad(!bpage->in_zip_hash);
    ut_ad(bpage->in_file());
    lru_hp.adjust(bpage);
    lru_scan_itr.adjust(bpage);
    ut_d(bpage->in_LRU_list= false);
    buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
    UT_LIST_REMOVE(LRU, bpage);
    return prev;
  }

  /** Number of pages to read ahead */
  static constexpr uint32_t READ_AHEAD_PAGES= 64;

  /** Buffer pool mutex */
  MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
  /** Number of pending LRU flush; protected by mutex. */
  ulint n_flush_LRU_;
  /** broadcast when n_flush_LRU reaches 0; protected by mutex */
  pthread_cond_t done_flush_LRU;
  /** Number of pending flush_list flush; protected by mutex */
  ulint n_flush_list_;
  /** broadcast when n_flush_list reaches 0; protected by mutex */
  pthread_cond_t done_flush_list;

  TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; }
  TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; }

	/** @name General fields */
	/* @{ */
	ulint		curr_pool_size;	/*!< Current pool size in bytes */
	ulint		LRU_old_ratio;  /*!< Reserve this much of the buffer
					pool for "old" blocks */
#ifdef UNIV_DEBUG
	ulint		buddy_n_frames; /*!< Number of frames allocated from
					the buffer pool to the buddy system */
	ulint		mutex_exit_forbidden; /*!< forbid release of the
					mutex while nonzero */
#endif
	ut_allocator<unsigned char>	allocator;	/*!< Allocator used for
					allocating memory for the "chunks"
					member. */
	volatile ulint	n_chunks;	/*!< number of buffer pool chunks */
	volatile ulint	n_chunks_new;	/*!< new number of buffer pool chunks */
	chunk_t*	chunks;		/*!< buffer pool chunks */
	chunk_t*	chunks_old;	/*!< old buffer pool chunks to be freed
					after resizing buffer pool */
	/** current pool size in pages */
	Atomic_counter<ulint> curr_size;
	/** previous pool size in pages */
	Atomic_counter<ulint> old_size;
	/** read-ahead request size in pages */
	Atomic_counter<uint32_t> read_ahead_area;

  /** Hash table with singly-linked overflow lists. @see hash_table_t */
  struct page_hash_table
  {
    /** Number of array[] elements per page_hash_latch.
    Must be one less than a power of 2. */
    static constexpr size_t ELEMENTS_PER_LATCH= CPU_LEVEL1_DCACHE_LINESIZE /
      sizeof(void*) - 1;

    /** number of payload elements in array[] */
    Atomic_relaxed<ulint> n_cells;
    /** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
    hash_cell_t *array;

    /** Create the hash table.
    @param n  the lower bound of n_cells */
    void create(ulint n);

    /** Free the hash table. */
    void free() { aligned_free(array); array= nullptr; }

    /** @return the index of an array element */
    ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
    /** @return raw array index converted to padded index */
    static ulint pad(ulint h) { return 1 + (h / ELEMENTS_PER_LATCH) + h; }
  private:
    /** @return the hash value before any ELEMENTS_PER_LATCH padding */
    static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }

    /** @return the index of an array element */
    static ulint calc_hash(ulint fold, ulint n_cells)
    {
      return pad(hash(fold, n_cells));
    }
    /** Get a page_hash latch. */
    page_hash_latch *lock_get(ulint fold, ulint n) const
    {
      static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
                    "must be one less than a power of 2");
      return reinterpret_cast<page_hash_latch*>
        (&array[calc_hash(fold, n) & ~ELEMENTS_PER_LATCH]);
    }
  public:
    /** Get a page_hash latch. */
    page_hash_latch *lock_get(ulint fold) const
    { return lock_get(fold, n_cells); }

    /** Acquire an array latch.
    @tparam exclusive  whether the latch is to be acquired exclusively
    @param fold    hash bucket key */
    template<bool exclusive> page_hash_latch *lock(ulint fold)
    {
      page_hash_latch *latch= lock_get(fold, n_cells);
      latch->acquire<exclusive>();
      return latch;
    }

    /** Exclusively acquire all latches */
    inline void write_lock_all();

    /** Release all latches */
    inline void write_unlock_all();
  };
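
  /* Worked example of the padding scheme above, assuming a 64-byte
  cache line and 8-byte pointers (both are assumptions of this note):
  ELEMENTS_PER_LATCH = 64/8 - 1 = 7, so pad(h) = 1 + h/7 + h skips one
  cell per group of 7 payload cells; array[0], array[8], array[16], ...
  hold the page_hash_latch objects, and lock_get() clears the low bits
  with ~ELEMENTS_PER_LATCH to land on the latch covering a given cell. */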

  /** Hash table of file pages (buf_page_t::in_file() holds),
  indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). */
  page_hash_table page_hash;

  /** map of block->frame to buf_block_t blocks that belong
  to buf_buddy_alloc(); protected by buf_pool.mutex */
  hash_table_t zip_hash;
	/** number of pending read operations */
	Atomic_counter<ulint> n_pend_reads;
	Atomic_counter<ulint>
			n_pend_unzip;	/*!< number of pending decompressions */

	time_t		last_printout_time;
					/*!< when buf_print_io was last time
					called */
	buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
					/*!< Statistics of buddy system,
					indexed by block size */
	buf_pool_stat_t	stat;		/*!< current statistics */
	buf_pool_stat_t	old_stat;	/*!< old statistics */

	/* @} */

  /** @name Page flushing algorithm fields */
  /* @{ */

  /** mutex protecting flush_list, buf_page_t::set_oldest_modification()
  and buf_page_t::list pointers when !oldest_modification() */
  MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
  /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
  FlushHp flush_hp;
  /** modified blocks (a subset of LRU) */
  UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
private:
  /** whether the page cleaner needs wakeup from indefinite sleep */
  bool page_cleaner_is_idle;
  /** track server activity count for signaling idle flushing */
  ulint last_activity_count;
public:
  /** signalled to wake up the page_cleaner; protected by flush_list_mutex */
  pthread_cond_t do_flush_list;

  /** @return whether the page cleaner must sleep due to being idle */
  bool page_cleaner_idle() const
  {
    mysql_mutex_assert_owner(&flush_list_mutex);
    return page_cleaner_is_idle;
  }
  /** Wake up the page cleaner if needed */
  void page_cleaner_wakeup();

  /** Register whether an explicit wakeup of the page cleaner is needed */
  void page_cleaner_set_idle(bool deep_sleep)
  {
    mysql_mutex_assert_owner(&flush_list_mutex);
    page_cleaner_is_idle= deep_sleep;
  }

  /** Update server last activity count */
  void update_last_activity_count(ulint activity_count)
  {
    mysql_mutex_assert_owner(&flush_list_mutex);
    last_activity_count= activity_count;
  }

  // n_flush_LRU() + n_flush_list()
  // is approximately COUNT(io_fix()==BUF_IO_WRITE) in flush_list

	unsigned	freed_page_clock;/*!< a sequence number used
					to count the number of buffer
					blocks removed from the end of
					the LRU list; NOTE that this
					counter may wrap around at 4
					billion! A thread is allowed
					to read this for heuristic
					purposes without holding any
					mutex or latch */
	bool		try_LRU_scan;	/*!< Cleared when an LRU
					scan for free block fails. This
					flag is used to avoid repeated
					scans of LRU list when we know
					that there is no free block
					available in the scan depth for
					eviction. Set whenever
					we flush a batch from the
					buffer pool. Protected by the
					buf_pool.mutex */
	/* @} */

	/** @name LRU replacement algorithm fields */
	/* @{ */

	UT_LIST_BASE_NODE_T(buf_page_t) free;
					/*!< base node of the free
					block list */
  /** signaled each time when the free list grows; protected by mutex */
  pthread_cond_t done_free;

	UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
					/*!< base node of the withdraw
					block list. It is only used while
					shrinking the buffer pool; the
					withdrawn blocks are not reused,
					but removed when the resize
					completes */

	ulint		withdraw_target;/*!< target length of withdraw
					block list, when withdrawing */

	/** "hazard pointer" used during scan of LRU while doing
	LRU list batch.  Protected by buf_pool_t::mutex. */
	LRUHp		lru_hp;

	/** Iterator used to scan the LRU list when searching for
	a replaceable victim. Protected by buf_pool_t::mutex. */
	LRUItr		lru_scan_itr;

	UT_LIST_BASE_NODE_T(buf_page_t) LRU;
					/*!< base node of the LRU list */

	buf_page_t*	LRU_old;	/*!< pointer to the about
					LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
					oldest blocks in the LRU list;
					NULL if LRU length less than
					BUF_LRU_OLD_MIN_LEN;
					NOTE: when LRU_old != NULL, its length
					should always equal LRU_old_len */
	ulint		LRU_old_len;	/*!< length of the LRU list from
					the block to which LRU_old points
					onward, including that block;
					see buf0lru.cc for the restrictions
					on this value; 0 if LRU_old == NULL;
					NOTE: LRU_old_len must be adjusted
					whenever LRU_old shrinks or grows! */

	UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
					/*!< base node of the
					unzip_LRU list */

	/* @} */
  /** free ROW_FORMAT=COMPRESSED page frames */
  UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
#endif

  /** Sentinels to detect if pages are read into the buffer pool while
  a delete-buffering operation is pending. Protected by mutex. */
  buf_page_t watch[innodb_purge_threads_MAX + 1];
  /** Reserve a buffer. */
  buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }

  /** @return whether any I/O is pending */
  bool any_io_pending() const
  {
    return n_pend_reads || n_flush_LRU() || n_flush_list();
  }
  /** @return total amount of pending I/O */
  ulint io_pending() const
  {
    return n_pend_reads + n_flush_LRU() + n_flush_list();
  }

private:
  /** Remove a block from the flush list. */
  inline void delete_from_flush_list_low(buf_page_t *bpage);
  /** Remove a block from flush_list.
  @param bpage   buffer pool page
  @param clear   whether to invoke buf_page_t::clear_oldest_modification() */
  void delete_from_flush_list(buf_page_t *bpage, bool clear);
public:
  /** Remove a block from flush_list.
  @param bpage   buffer pool page */
  void delete_from_flush_list(buf_page_t *bpage)
  { delete_from_flush_list(bpage, true); }

  /** Insert a modified block into the flush list.
  @param block    modified block
  @param lsn      start LSN of the mini-transaction that modified the block */
  void insert_into_flush_list(buf_block_t *block, lsn_t lsn);

  /** Free a page whose underlying file page has been freed. */
  inline void release_freed_page(buf_page_t *bpage);

private:
  /** Temporary memory for page_compressed and encrypted I/O */
  struct io_buf_t
  {
    /** number of elements in slots[] */
    ulint n_slots;
    /** array of slots */
    buf_tmp_buffer_t *slots;

    void create(ulint n_slots)
    {
      this->n_slots= n_slots;
      slots= static_cast<buf_tmp_buffer_t*>
        (ut_malloc_nokey(n_slots * sizeof *slots));
      memset((void*) slots, 0, n_slots * sizeof *slots);
    }

    void close()
    {
      for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
      {
        aligned_free(s->crypt_buf);
        aligned_free(s->comp_buf);
      }
      ut_free(slots);
      slots= nullptr;
      n_slots= 0;
    }

    /** Reserve a buffer */
    buf_tmp_buffer_t *reserve()
    {
      for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
        if (s->acquire())
          return s;
      return nullptr;
    }
  } io_buf;

  /** whether resize() is in the critical path */
  std::atomic<bool> resizing;
};

/** The InnoDB buffer pool */
extern buf_pool_t buf_pool;

inline void page_hash_latch::read_lock()
{
  mysql_mutex_assert_not_owner(&buf_pool.mutex);
  if (!read_trylock())
    read_lock_wait();
}

inline void page_hash_latch::write_lock()
{
  if (!write_trylock())
    write_lock_wait();
}

inline void buf_page_t::add_buf_fix_count(uint32_t count)
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  buf_fix_count_+= count;
}

inline void buf_page_t::set_buf_fix_count(uint32_t count)
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  buf_fix_count_= count;
}

inline void buf_page_t::set_state(buf_page_state state)
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
#ifdef UNIV_DEBUG
  switch (state) {
  case BUF_BLOCK_REMOVE_HASH:
    /* buf_pool_t::corrupted_evict() invokes set_corrupt_id()
    before buf_LRU_free_one_page(), so we cannot assert that
    we are holding the hash_lock. */
    break;
  case BUF_BLOCK_MEMORY:
    if (!in_file()) break;
    /* fall through */
  case BUF_BLOCK_FILE_PAGE:
    ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
    break;
  case BUF_BLOCK_NOT_USED:
    if (!in_file()) break;
    /* fall through */
  case BUF_BLOCK_ZIP_PAGE:
    ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked() ||
          (this >= &buf_pool.watch[0] &&
           this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]));
    break;
  }
#endif
  state_= state;
}

inline void buf_page_t::set_io_fix(buf_io_fix io_fix)
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  io_fix_= io_fix;
}

inline void buf_page_t::set_corrupt_id()
{
#ifdef UNIV_DEBUG
  switch (oldest_modification()) {
  case 0:
    break;
  case 2:
    ut_ad(fsp_is_system_temporary(id().space()));
    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
    ut_d(oldest_modification_= 0;)
    break;
  default:
    ut_ad("block is dirty" == 0);
  }
  switch (state()) {
  case BUF_BLOCK_REMOVE_HASH:
    break;
  case BUF_BLOCK_ZIP_PAGE:
  case BUF_BLOCK_FILE_PAGE:
    ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
    break;
  case BUF_BLOCK_NOT_USED:
  case BUF_BLOCK_MEMORY:
    ut_ad("invalid state" == 0);
  }
#endif
  id_= page_id_t(~0ULL);
}

/** Set oldest_modification when adding to buf_pool.flush_list */
inline void buf_page_t::set_oldest_modification(lsn_t lsn)
{
  mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
  ut_ad(oldest_modification() <= 1);
  ut_ad(lsn > 2);
  oldest_modification_= lsn;
}

/** Clear oldest_modification after removing from buf_pool.flush_list */
inline void buf_page_t::clear_oldest_modification()
{
  mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
  ut_d(const auto state= state_);
  ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_ZIP_PAGE ||
        state == BUF_BLOCK_REMOVE_HASH);
  ut_ad(oldest_modification());
  ut_ad(!list.prev);
  ut_ad(!list.next);
  /* We must use release memory order to guarantee that callers of
  oldest_modification_acquire() will observe the block as
  being detached from buf_pool.flush_list, after reading the value 0. */
  oldest_modification_.store(0, std::memory_order_release);
}
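
/* Illustrative sketch (compiled out) of the reader side that the
release store above pairs with; oldest_modification_acquire() is the
acquire-ordered accessor that the comment refers to. */
#if 0
static bool is_dirty_observed(const buf_page_t &bpage)
{
  /* After reading 0 here, the reader observes the block as detached
  from buf_pool.flush_list, without holding flush_list_mutex. */
  return bpage.oldest_modification_acquire() != 0;
}
#endif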

/** Note that a block is no longer dirty, while not removing
it from buf_pool.flush_list */
inline void buf_page_t::clear_oldest_modification(bool temporary)
{
  ut_ad(temporary == fsp_is_system_temporary(id().space()));
  if (temporary)
  {
    ut_ad(oldest_modification() == 2);
    oldest_modification_= 0;
  }
  else
  {
    /* We use release memory order to guarantee that callers of
    oldest_modification_acquire() will observe the block as
    being detached from buf_pool.flush_list, after reading the value 0. */
    ut_ad(oldest_modification() > 2);
    oldest_modification_.store(1, std::memory_order_release);
  }
}

/** @return whether the block is modified and ready for flushing */
inline bool buf_page_t::ready_for_flush() const
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  ut_ad(in_LRU_list);
  ut_a(in_file());
  ut_ad(fsp_is_system_temporary(id().space())
        ? oldest_modification() == 2
        : oldest_modification() > 2);
  return io_fix_ == BUF_IO_NONE;
}

/** @return whether the block can be relocated in memory.
The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
inline bool buf_page_t::can_relocate() const
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  ut_ad(in_file());
  ut_ad(in_LRU_list);
  return io_fix_ == BUF_IO_NONE && !buf_fix_count_;
}

/** @return whether the block has been flagged old in buf_pool.LRU */
inline bool buf_page_t::is_old() const
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  ut_ad(in_file());
  ut_ad(in_LRU_list);
  return old;
}

/** Set whether a block is old in buf_pool.LRU */
inline void buf_page_t::set_old(bool old)
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  ut_ad(in_LRU_list);

#ifdef UNIV_LRU_DEBUG
  ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == nullptr));
  /* If a block is flagged "old", the LRU_old list must exist. */
  ut_a(!old || buf_pool.LRU_old);

  if (UT_LIST_GET_PREV(LRU, this) && UT_LIST_GET_NEXT(LRU, this))
  {
    const buf_page_t *prev= UT_LIST_GET_PREV(LRU, this);
    const buf_page_t *next= UT_LIST_GET_NEXT(LRU, this);
    if (prev->old == next->old)
      ut_a(prev->old == old);
    else
    {
      ut_a(!prev->old);
      ut_a(buf_pool.LRU_old == (old ? this : next));
    }
  }
#endif /* UNIV_LRU_DEBUG */

  this->old= old;
}

#ifdef UNIV_DEBUG
/** Forbid the release of the buffer pool mutex. */
# define buf_pool_mutex_exit_forbid() do {		\
	mysql_mutex_assert_owner(&buf_pool.mutex);	\
	buf_pool.mutex_exit_forbidden++;		\
} while (0)
/** Allow the release of the buffer pool mutex. */
# define buf_pool_mutex_exit_allow() do {		\
	mysql_mutex_assert_owner(&buf_pool.mutex);	\
	ut_ad(buf_pool.mutex_exit_forbidden--);		\
} while (0)
#else
/** Forbid the release of the buffer pool mutex. */
# define buf_pool_mutex_exit_forbid() ((void) 0)
/** Allow the release of the buffer pool mutex. */
# define buf_pool_mutex_exit_allow() ((void) 0)
#endif
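
/* Illustrative usage sketch (hypothetical function, compiled out):
the forbid/allow pair brackets a region in which buf_pool.mutex must
not be released, for example around buf_buddy relocation. */
#if 0
static void no_mutex_exit_example()
{
  mysql_mutex_lock(&buf_pool.mutex);
  buf_pool_mutex_exit_forbid();
  /* ... work that must not release buf_pool.mutex ... */
  buf_pool_mutex_exit_allow();
  mysql_mutex_unlock(&buf_pool.mutex);
}
#endif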

/**********************************************************************
Let us list the consistency conditions for different control block states.

NOT_USED:	is in free list, not in LRU list, not in flush list, nor
		page hash table
MEMORY:		is not in free list, LRU list, or flush list, nor page
		hash table
FILE_PAGE:	space and offset are defined, is in page hash table
		if io_fix == BUF_IO_WRITE,
			buf_pool.n_flush_LRU() || buf_pool.n_flush_list()

		(1) if buf_fix_count == 0, then
			is in LRU list, not in free list
			is in flush list,
				if and only if oldest_modification > 0
			is x-locked,
				if and only if io_fix == BUF_IO_READ
			is s-locked,
				if and only if io_fix == BUF_IO_WRITE

		(2) if buf_fix_count > 0, then
			is not in LRU list, not in free list
			is in flush list,
				if and only if oldest_modification > 0
			if io_fix == BUF_IO_READ,
				is x-locked
			if io_fix == BUF_IO_WRITE,
				is s-locked

State transitions:

NOT_USED => MEMORY
MEMORY => FILE_PAGE
MEMORY => NOT_USED
FILE_PAGE => NOT_USED	NOTE: This transition is allowed if and only if
				(1) buf_fix_count == 0,
				(2) oldest_modification == 0, and
				(3) io_fix == 0.
*/

/** Select from where to start a scan. If we have scanned
too deep into the LRU list, restart from the tail of the LRU list.
@return buf_page_t from where to start the scan */
inline buf_page_t *LRUItr::start()
{
  mysql_mutex_assert_owner(m_mutex);

  if (!m_hp || m_hp->old)
    m_hp= UT_LIST_GET_LAST(buf_pool.LRU);

  return m_hp;
}

#ifdef UNIV_DEBUG
/** Functor to validate the LRU list. */
struct	CheckInLRUList {
	void	operator()(const buf_page_t* elem) const
	{
		ut_a(elem->in_LRU_list);
	}

	static void validate()
	{
		ut_list_validate(buf_pool.LRU, CheckInLRUList());
	}
};

/** Functor to validate the free list. */
struct	CheckInFreeList {
	void	operator()(const buf_page_t* elem) const
	{
		ut_a(elem->in_free_list);
	}

	static void validate()
	{
		ut_list_validate(buf_pool.free, CheckInFreeList());
	}
};

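/** Functor to validate the unzip_LRU list and that its elements
are also in the LRU list. */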
struct	CheckUnzipLRUAndLRUList {
	void	operator()(const buf_block_t* elem) const
	{
		ut_a(elem->page.in_LRU_list);
		ut_a(elem->in_unzip_LRU_list);
	}

	static void validate()
	{
		ut_list_validate(buf_pool.unzip_LRU,
				 CheckUnzipLRUAndLRUList());
	}
};
#endif /* UNIV_DEBUG */

#include "buf0buf.inl"

#endif /* !UNIV_INNOCHECKSUM */

#endif