/*****************************************************************************

Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2015, 2021, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0rea.cc
The database buffer read

Created 11/5/1995 Heikki Tuuri
*******************************************************/

#include "univ.i"
#include <mysql/service_thd_wait.h>

#include "buf0rea.h"
#include "fil0fil.h"
#include "mtr0mtr.h"
#include "buf0buf.h"
#include "buf0flu.h"
#include "buf0lru.h"
#include "buf0buddy.h"
#include "buf0dblwr.h"
#include "ibuf0ibuf.h"
#include "log0recv.h"
#include "trx0sys.h"
#include "os0file.h"
#include "srv0start.h"
#include "srv0srv.h"

/** If the number of pending reads exceeds buf_pool.curr_size divided by
this constant, read-ahead is not done: this is to prevent flooding the
buffer pool with i/o-fixed buffer blocks */
#define BUF_READ_AHEAD_PEND_LIMIT	2

/** Remove the sentinel block for the watch before replacing it with a
real block. watch_unset() or watch_occurred() will notice
that the block has been replaced with the real block.
@param watch   sentinel */
inline void buf_pool_t::watch_remove(buf_page_t *watch)
{
  mysql_mutex_assert_owner(&buf_pool.mutex);
  ut_ad(hash_lock_get(watch->id())->is_write_locked());
  ut_a(watch_is_sentinel(*watch));
  if (watch->buf_fix_count())
  {
    ut_ad(watch->in_page_hash);
    ut_d(watch->in_page_hash= false);
    HASH_DELETE(buf_page_t, hash, &page_hash, watch->id().fold(), watch);
    watch->set_buf_fix_count(0);
  }
  ut_ad(!watch->in_page_hash);
  watch->set_state(BUF_BLOCK_NOT_USED);
  watch->id_= page_id_t(~0ULL);
}

/** Initialize a page for read to the buffer buf_pool. If the page is
(1) already in buf_pool, or
(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
(3) if the space is deleted or being deleted,
then this function does nothing.
Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
on the buffer frame. The io-handler must take care that the flag is cleared
and the lock released later.
@param[in]	mode			BUF_READ_IBUF_PAGES_ONLY, ...
@param[in]	page_id			page id
@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	unzip			whether the uncompressed page is
					requested (for ROW_FORMAT=COMPRESSED)
@return pointer to the block
@retval	NULL	in case of an error */
static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
                                          ulint zip_size, bool unzip)
{
  mtr_t mtr;

  if (mode == BUF_READ_IBUF_PAGES_ONLY)
  {
    /* It is a read-ahead within an ibuf routine */
    ut_ad(!ibuf_bitmap_page(page_id, zip_size));
    ibuf_mtr_start(&mtr);

    if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr))
    {
      ibuf_mtr_commit(&mtr);
      return nullptr;
    }
  }
  else
    ut_ad(mode == BUF_READ_ANY_PAGE);

  buf_page_t *bpage= nullptr;
  buf_block_t *block= nullptr;
  if (!zip_size || unzip || recv_recovery_is_on())
  {
    block= buf_LRU_get_free_block(false);
    block->initialise(page_id, zip_size);
    /* We set a pass-type x-lock on the frame because then
    the same thread which called for the read operation
    (and is running now at this point of code) can wait
    for the read to complete by waiting for the x-lock on
    the frame; if the x-lock were recursive, the same
    thread would illegally get the x-lock before the page
    read is completed.  The x-lock will be released
    in buf_page_read_complete() by the io-handler thread. */
    rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
  }

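  /* The fold value is the hash key with which the page will be looked
  up in buf_pool.page_hash. */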
  const ulint fold= page_id.fold();

  mysql_mutex_lock(&buf_pool.mutex);

  buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
  if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
  {
    /* The page is already in the buffer pool. */
    if (block)
    {
      rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
      buf_LRU_block_free_non_file_page(block);
    }
    goto func_exit;
  }

  if (UNIV_LIKELY(block != nullptr))
  {
    bpage= &block->page;

    /* Insert into the hash table of file pages */
    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
    hash_lock->write_lock();

    if (hash_page)
    {
      /* Preserve the reference count. */
      auto buf_fix_count= hash_page->buf_fix_count();
      ut_a(buf_fix_count > 0);
      block->page.add_buf_fix_count(buf_fix_count);
      buf_pool.watch_remove(hash_page);
    }

    block->page.set_io_fix(BUF_IO_READ);
    block->page.set_state(BUF_BLOCK_FILE_PAGE);
    ut_ad(!block->page.in_page_hash);
    ut_d(block->page.in_page_hash= true);
    HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
    hash_lock->write_unlock();

    /* The block must be put to the LRU list, to the old blocks */
    buf_LRU_add_block(bpage, true/* to old blocks */);

    if (UNIV_UNLIKELY(zip_size))
    {
      /* buf_pool.mutex may be released and reacquired by
      buf_buddy_alloc(). We must defer this operation until after the
      block descriptor has been added to buf_pool.LRU and
      buf_pool.page_hash. */
      block->page.zip.data= static_cast<page_zip_t*>
        (buf_buddy_alloc(zip_size));

      /* To maintain the invariant
      block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU()
      we have to add this block to unzip_LRU
      after block->page.zip.data is set. */
      ut_ad(block->page.belongs_to_unzip_LRU());
      buf_unzip_LRU_add_block(block, TRUE);
    }
  }
  else
  {
    /* The compressed page must be allocated before the
    control block (bpage), in order to avoid the
    invocation of buf_buddy_relocate_block() on
    uninitialized data. */
    bool lru= false;
    void *data= buf_buddy_alloc(zip_size, &lru);

    /* If buf_buddy_alloc() allocated storage from the LRU list,
    it released and reacquired buf_pool.mutex.  Thus, we must
    check the page_hash again, as it may have been modified. */
    if (UNIV_UNLIKELY(lru))
    {
      hash_page= buf_pool.page_hash_get_low(page_id, fold);

      if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
      {
        /* The block was added by some other thread. */
        buf_buddy_free(data, zip_size);
        goto func_exit;
      }
    }

    bpage= buf_page_alloc_descriptor();

    page_zip_des_init(&bpage->zip);
    page_zip_set_size(&bpage->zip, zip_size);
    bpage->zip.data = (page_zip_t*) data;

    bpage->init(BUF_BLOCK_ZIP_PAGE, page_id);

    page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
    hash_lock->write_lock();

    if (hash_page)
    {
      /* Preserve the reference count. It can be 0 if
      buf_pool_t::watch_unset() is executing concurrently,
      waiting for buf_pool.mutex, which we are holding. */
      bpage->add_buf_fix_count(hash_page->buf_fix_count());
      buf_pool.watch_remove(hash_page);
    }

    ut_ad(!bpage->in_page_hash);
    ut_d(bpage->in_page_hash= true);
    HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
    bpage->set_io_fix(BUF_IO_READ);
    hash_lock->write_unlock();

    /* The block must be put to the LRU list, to the old blocks.
    The zip size is already set into the page zip */
    buf_LRU_add_block(bpage, true/* to old blocks */);
  }

  mysql_mutex_unlock(&buf_pool.mutex);
  buf_pool.n_pend_reads++;
  goto func_exit_no_mutex;
func_exit:
  mysql_mutex_unlock(&buf_pool.mutex);
func_exit_no_mutex:
  if (mode == BUF_READ_IBUF_PAGES_ONLY)
    ibuf_mtr_commit(&mtr);

  ut_ad(!bpage || bpage->in_file());

  return bpage;
}

/** Low-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.

@param[out] err		DB_SUCCESS or DB_TABLESPACE_DELETED
			if we are trying
			to read from a non-existent tablespace
@param[in,out] space	tablespace
@param[in] sync		true if synchronous aio is desired
@param[in] mode		BUF_READ_IBUF_PAGES_ONLY, ...,
@param[in] page_id	page id
@param[in] zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in] unzip	true=request uncompressed page
@return whether a read request was queued */
static
bool
buf_read_page_low(
	dberr_t*		err,
	fil_space_t*		space,
	bool			sync,
	ulint			mode,
	const page_id_t		page_id,
	ulint			zip_size,
	bool			unzip)
{
	buf_page_t*	bpage;

	*err = DB_SUCCESS;

	if (buf_dblwr.is_inside(page_id)) {
		ib::error() << "Trying to read doublewrite buffer page "
			<< page_id;
		ut_ad(0);
nothing_read:
		space->release();
		return false;
	}

	if (sync) {
	} else if (trx_sys_hdr_page(page_id)
		   || ibuf_bitmap_page(page_id, zip_size)
		   || (!recv_no_ibuf_operations
		       && ibuf_page(page_id, zip_size, nullptr))) {

		/* Trx sys header is so low in the latching order that we play
		safe and do not leave the i/o-completion to an asynchronous
		i/o-thread. Change buffer pages must always be read with
		synchronous i/o, to make sure they do not get involved in
		thread deadlocks. */
		sync = true;
	}

	/* The following call will also check if the tablespace does not
	exist or is being dropped; if we succeed in initializing the page
	in the buffer pool for read, then DISCARD cannot proceed until
	the read has completed */
	bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);

	if (bpage == NULL) {
		goto nothing_read;
	}

	ut_ad(bpage->in_file());

	if (sync) {
		thd_wait_begin(nullptr, THD_WAIT_DISKIO);
	}

	DBUG_LOG("ib_buf",
		 "read page " << page_id << " zip_size=" << zip_size
		 << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));

	void*	dst;

	if (zip_size) {
		dst = bpage->zip.data;
	} else {
		ut_a(bpage->state() == BUF_BLOCK_FILE_PAGE);

		dst = ((buf_block_t*) bpage)->frame;
	}

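	/* The number of bytes to read: a whole ROW_FORMAT=COMPRESSED page,
	or a whole uncompressed page. */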
	const ulint len = zip_size ? zip_size : srv_page_size;

	auto fio = space->io(IORequest(sync
				       ? IORequest::READ_SYNC
				       : IORequest::READ_ASYNC),
			     page_id.page_no() * len, len, dst, bpage);
	*err= fio.err;

	if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
		if (!sync || fio.err == DB_TABLESPACE_DELETED
		    || fio.err == DB_IO_ERROR) {
			buf_pool.corrupted_evict(bpage);
			return false;
		}

		ut_error;
	}

	if (sync) {
		thd_wait_end(NULL);

		/* The i/o was already completed in space->io() */
		*err = buf_page_read_complete(bpage, *fio.node);
		space->release();

		if (*err != DB_SUCCESS) {
			return false;
		}
	}

	return true;
}

/** Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
page, not even the one at the position (space, offset), if the read-ahead
mechanism is not activated. NOTE 1: the calling thread may own latches on
pages: to avoid deadlocks this function must be written such that it cannot
end up waiting for these latches! NOTE 2: the calling thread must want
access to the page given: this rule is set to prevent unintended read-aheads
performed by ibuf routines, a situation which could result in a deadlock if
the OS does not support asynchronous i/o.
@param[in]	page_id		page id of a page which the current thread
wants to access
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	ibuf		whether we are inside ibuf routine
@return number of page read requests issued; NOTE that if we read ibuf
pages, it may happen that the page at the given page number does not
get read even if we return a positive value! */
ulint
buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
{
  if (!srv_random_read_ahead)
    return 0;

  if (srv_startup_is_before_trx_rollback_phase)
    /* No read-ahead to avoid thread deadlocks */
    return 0;

  if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
    /* If it is an ibuf bitmap page or trx sys hdr, we do no
    read-ahead, as that could break the ibuf page access order */
    return 0;

  if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
    return 0;

  fil_space_t* space= fil_space_t::get(page_id.space());
  if (!space)
    return 0;

  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
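  /* Read-ahead will only be triggered if at least this many pages in
  the area have been accessed recently, that is, are still young in
  the buffer pool LRU list. */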
  ulint count= 5 + buf_read_ahead_area / 8;
  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
  page_id_t high= low + buf_read_ahead_area;
  high.set_page_no(std::min(high.page_no(), space->last_page_number()));

  /* Count how many blocks in the area have been recently accessed,
  that is, reside near the start of the LRU list. */

  for (page_id_t i= low; i < high; ++i)
  {
    const ulint fold= i.fold();
    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
    const buf_page_t *bpage= buf_pool.page_hash_get_low(i, fold);
    bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage);
    hash_lock->read_unlock();
    if (found && !--count)
      goto read_ahead;
  }

no_read_ahead:
  space->release();
  return 0;

read_ahead:
  if (space->is_stopping())
    goto no_read_ahead;

  /* Read all the suitable blocks within the area */
  const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;

  for (page_id_t i= low; i < high; ++i)
  {
    if (ibuf_bitmap_page(i, zip_size))
      continue;
    if (space->is_stopping())
      break;
    dberr_t err;
    space->reacquire();
    if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false))
      count++;
  }

  if (count)
    DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
			  count, space->chain.start->name,
			  low.page_no()));
  space->release();

  /* Read ahead is considered one I/O operation for the purpose of
  LRU policy decision. */
  buf_LRU_stat_inc_io();

  buf_pool.stat.n_ra_pages_read_rnd+= count;
  srv_stats.buf_pool_reads.add(count);
  return count;
}

/** High-level function which reads a page from a file to buf_pool
if it is not already there. Sets the io_fix and an exclusive lock
on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in]	page_id		page id
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@retval DB_SUCCESS if the page was read and is not corrupted,
@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
{
  fil_space_t *space= fil_space_t::get(page_id.space());
  if (!space)
  {
    ib::info() << "trying to read page " << page_id
               << " in nonexisting or being-dropped tablespace";
    return DB_TABLESPACE_DELETED;
  }

  dberr_t err;
  if (buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE,
			page_id, zip_size, false))
    srv_stats.buf_pool_reads.add(1);

  buf_LRU_stat_inc_io();
  return err;
}

/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in,out]	space		tablespace
@param[in]	page_id		page id
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0 */
void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
                              ulint zip_size)
{
	dberr_t		err;

	if (buf_read_page_low(&err, space, false, BUF_READ_ANY_PAGE,
			      page_id, zip_size, false)) {
		srv_stats.buf_pool_reads.add(1);
	}

	switch (err) {
	case DB_SUCCESS:
	case DB_ERROR:
		break;
	case DB_TABLESPACE_DELETED:
		ib::info() << "trying to read page " << page_id
			<< " in the background"
			" in a non-existing or being-dropped tablespace";
		break;
	case DB_PAGE_CORRUPTED:
	case DB_DECRYPTION_FAILED:
		ib::error()
			<< "Background Page read failed to "
			"read or decrypt " << page_id;
		break;
	default:
		ib::fatal() << "Error " << err << " in background read of "
			<< page_id;
	}

	/* We do not increment the number of I/O operations used for the LRU
	policy here (buf_LRU_stat_inc_io()). That count is used in heuristics
	to decide about evicting uncompressed versions of compressed pages
	from the buffer pool. Since this function is called from buffer pool
	load, these reads are deliberate and not part of the normal workload,
	so we can ignore them in those heuristics. */
}

/** Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
Does not read any page if the read-ahead mechanism is not activated. Note
that the algorithm looks at the 'natural' adjacent successor and
predecessor of the page, which on the leaf level of a B-tree are the next
and previous page in the chain of leaves. To know these, the page specified
in (space, offset) must already be present in the buf_pool. Thus, the
natural way to use this function is to call it when a page in the buf_pool
is accessed the first time, calling this function just after it has been
bufferfixed.
NOTE 1: as this function looks at the natural predecessor and successor
fields on the page, what happens if these are not initialized to any
sensible value? No problem: before applying read-ahead we check that the
area to read is within the span of the space; if it is not, read-ahead is
not applied. An uninitialized value may result in a useless read operation,
but only very improbably.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
NOTE 3: the calling thread must want access to the page given: this rule is
set to prevent unintended read-aheads performed by ibuf routines, a situation
which could result in a deadlock if the OS does not support asynchronous io.
@param[in]	page_id		page id; see NOTE 3 above
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	ibuf		whether we are inside ibuf routine
@return number of page read requests issued */
ulint
buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
{
  /* check if readahead is disabled */
  if (!srv_read_ahead_threshold)
    return 0;

  if (srv_startup_is_before_trx_rollback_phase)
    /* No read-ahead to avoid thread deadlocks */
    return 0;

  if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
    return 0;

  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
  const page_id_t high_1= low + (buf_read_ahead_area - 1);

  /* We will check that almost all pages in the area have been accessed
  in the desired order. */
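  /* If the requested page is the low end of the area, we assume a
  descending access pattern (toward smaller page numbers); otherwise the
  page must be the high end of the area, and the pattern is ascending. */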
  const bool descending= page_id == low;

  if (!descending && page_id != high_1)
    /* This is not a border page of the area */
    return 0;

  if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
    /* If it is an ibuf bitmap page or trx sys hdr, we do no
    read-ahead, as that could break the ibuf page access order */
    return 0;

  fil_space_t *space= fil_space_t::get(page_id.space());
  if (!space)
    return 0;

  if (high_1.page_no() > space->last_page_number())
  {
    /* The area is not whole. */
fail:
    space->release();
    return 0;
  }

  /* How many out of order accessed pages can we ignore
  when working out the access pattern for linear readahead */
  ulint count= std::min<ulint>(buf_pool_t::READ_AHEAD_PAGES -
                               srv_read_ahead_threshold,
                               uint32_t{buf_pool.read_ahead_area});
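  /* The area to read may be re-centred below on the natural successor
  or predecessor of the requested page, once that has been read from
  the page header. */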
  page_id_t new_low= low, new_high_1= high_1;
  unsigned prev_accessed= 0;
  for (page_id_t i= low; i != high_1; ++i)
  {
    const ulint fold= i.fold();
    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
    const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
    if (i == page_id)
    {
      /* Read the natural predecessor and successor page addresses from
      the page; NOTE that because the calling thread may have an x-latch
      on the page, we do not acquire an s-latch on the page, this is to
      prevent deadlocks. The hash_lock is only protecting the
      buf_pool.page_hash for page i, not the bpage contents itself. */
      if (!bpage)
      {
hard_fail:
        hash_lock->read_unlock();
        goto fail;
      }
      const byte *f;
      switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) {
      case BUF_BLOCK_FILE_PAGE:
        f= reinterpret_cast<const buf_block_t*>(bpage)->frame;
        break;
      case BUF_BLOCK_ZIP_PAGE:
        f= bpage->zip.data;
        break;
      default:
        goto hard_fail;
      }

      uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
      uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
      if (prev == FIL_NULL || next == FIL_NULL)
        goto hard_fail;
      page_id_t id= page_id;
      if (descending && next - 1 == page_id.page_no())
        id.set_page_no(prev);
      else if (!descending && prev + 1 == page_id.page_no())
        id.set_page_no(next);
      else
        goto hard_fail; /* Successor or predecessor not in the right order */

      new_low= id - (id.page_no() % buf_read_ahead_area);
      new_high_1= new_low + (buf_read_ahead_area - 1);

      if (id != new_low && id != new_high_1)
        /* This is not a border page of the area: return */
        goto hard_fail;
      if (new_high_1.page_no() > space->last_page_number())
        /* The area is not whole */
        goto hard_fail;
    }
    else if (!bpage)
    {
failed:
      hash_lock->read_unlock();
      if (--count)
        continue;
      goto fail;
    }

    const unsigned accessed= bpage->is_accessed();
    if (!accessed)
      goto failed;
    /* Note that buf_page_t::is_accessed() returns the time of the
    first access. If some blocks of the extent existed in the buffer
    pool at the time of a linear access pattern, the first access
    times may be nonmonotonic, even though the latest access times
    were linear. The threshold (srv_read_ahead_threshold) should help
    a little against this. */
    bool fail= prev_accessed &&
      (descending ? prev_accessed > accessed : prev_accessed < accessed);
    prev_accessed= accessed;
    if (fail)
      goto failed;
    hash_lock->read_unlock();
  }

  /* If we got this far, read-ahead can be sensible: do it */
  count= 0;
  for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
       new_low != new_high_1; ++new_low)
  {
    if (ibuf_bitmap_page(new_low, zip_size))
      continue;
    if (space->is_stopping())
      break;
    dberr_t err;
    space->reacquire();
    count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size,
                              false);
  }

  if (count)
    DBUG_PRINT("ib_buf", ("linear read-ahead %zu pages from %s: %u",
                          count, space->chain.start->name,
                          new_low.page_no()));
  space->release();

  /* Read ahead is considered one I/O operation for the purpose of
  LRU policy decision. */
  buf_LRU_stat_inc_io();

  buf_pool.stat.n_ra_pages_read+= count;
  return count;
}

/** Issues read requests for pages which recovery wants to read in.
@param[in]	space_id	tablespace id
@param[in]	page_nos	array of page numbers to read, with the
highest page number the last in the array
@param[in]	n		number of page numbers in the array */
void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n)
{
	fil_space_t* space = fil_space_t::get(space_id);

	if (!space) {
		/* The tablespace is missing or unreadable: do nothing */
		return;
	}

	const ulint zip_size = space->zip_size();

	for (ulint i = 0; i < n; i++) {

		/* Ignore the page if it is already in a freed range. */
		if (space->freed_ranges.contains(page_nos[i])) {
			continue;
		}

		const page_id_t	cur_page_id(space_id, page_nos[i]);

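		/* Throttle recovery reads: do not let pending reads
		exceed half of the buffer pool size, to avoid flooding
		it with i/o-fixed pages. */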
		ulint limit = 0;
		for (ulint j = 0; j < buf_pool.n_chunks; j++) {
			limit += buf_pool.chunks[j].size / 2;
		}

		for (ulint count = 0; buf_pool.n_pend_reads >= limit; ) {
			os_thread_sleep(10000);

			if (!(++count % 1000)) {

				ib::error()
					<< "Waited for " << count / 100
					<< " seconds for "
					<< buf_pool.n_pend_reads
					<< " pending reads";
			}
		}

		dberr_t err;
		space->reacquire();
		buf_read_page_low(&err, space, false,
				  BUF_READ_ANY_PAGE, cur_page_id, zip_size,
				  true);

		if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
			ib::error() << "Recovery failed to read or decrypt "
				<< cur_page_id;
		}
	}

	DBUG_PRINT("ib_buf", ("recovery read (%u pages) for %s", n,
			      space->chain.start->name));
	space->release();
}