/*****************************************************************************

Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2015, 2021, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0rea.cc
The database buffer read

Created 11/5/1995 Heikki Tuuri
*******************************************************/

#include "univ.i"
#include <mysql/service_thd_wait.h>

#include "buf0rea.h"
#include "fil0fil.h"
#include "mtr0mtr.h"
#include "buf0buf.h"
#include "buf0flu.h"
#include "buf0lru.h"
#include "buf0dblwr.h"
#include "ibuf0ibuf.h"
#include "log0recv.h"
#include "trx0sys.h"
#include "os0file.h"
#include "srv0start.h"
#include "srv0srv.h"

/** There must be at least this many pages in buf_pool in the area to start
a random read-ahead */
#define BUF_READ_AHEAD_RANDOM_THRESHOLD(b)	\
				(5 + BUF_READ_AHEAD_AREA(b) / 8)
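/* Illustrative note: with a typical read-ahead area of 64 pages,
random read-ahead triggers once 5 + 64 / 8 = 13 pages of the area
have been recently accessed. */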

/** If the number of pending read operations exceeds buf_pool->curr_size
divided by this constant, read-ahead is not done: this is to prevent
flooding the buffer pool with i/o-fixed buffer blocks */
#define BUF_READ_AHEAD_PEND_LIMIT	2
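/* For instance, with buf_pool->curr_size == 8192 pages, read-ahead is
skipped while more than 8192 / 2 == 4096 reads are pending. */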

/********************************************************************//**
Unfixes the page, unlatches it,
removes it from page_hash and removes it from the LRU list. */
static
void
buf_read_page_handle_error(
/*=======================*/
	buf_page_t*	bpage)	/*!< in: pointer to the block */
{
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
	const bool	uncompressed = (buf_page_get_state(bpage)
					== BUF_BLOCK_FILE_PAGE);
	const page_id_t	old_page_id = bpage->id;

	/* First unfix and release the lock on the bpage */
	buf_pool_mutex_enter(buf_pool);
	mutex_enter(buf_page_get_mutex(bpage));
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);

	bpage->id.set_corrupt_id();
	/* Set BUF_IO_NONE before we remove the block from the LRU list */
	buf_page_set_io_fix(bpage, BUF_IO_NONE);

	if (uncompressed) {
		rw_lock_x_unlock_gen(
			&((buf_block_t*) bpage)->lock,
			BUF_IO_READ);
	}

	mutex_exit(buf_page_get_mutex(bpage));

	/* Remove the block from the LRU list */
	buf_LRU_free_one_page(bpage, old_page_id);

	ut_ad(buf_pool->n_pend_reads > 0);
	buf_pool->n_pend_reads--;

	buf_pool_mutex_exit(buf_pool);
}

/** Low-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there, in which case it does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.

@param[out] err		DB_SUCCESS, DB_TABLESPACE_DELETED if we are
			trying to read from a non-existent tablespace
			or one that is just now being dropped, or
			DB_TABLESPACE_TRUNCATED if the page is beyond
			the truncated tablespace bounds
@param[in] sync		true if synchronous aio is desired
@param[in] type		IO type, SIMULATED, IGNORE_MISSING
@param[in] mode		BUF_READ_IBUF_PAGES_ONLY, ...,
@param[in] page_id	page id
@param[in] unzip	true=request uncompressed page
@param[in] ignore_missing_space  true=ignore missing space when reading
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped */
static
ulint
buf_read_page_low(
	dberr_t*		err,
	bool			sync,
	ulint			type,
	ulint			mode,
	const page_id_t		page_id,
	const page_size_t&	page_size,
	bool			unzip,
	bool			ignore_missing_space = false)
{
	buf_page_t*	bpage;

	*err = DB_SUCCESS;

	if (page_id.space() == TRX_SYS_SPACE
	    && buf_dblwr_page_inside(page_id.page_no())) {

		ib::error() << "Trying to read doublewrite buffer page "
			<< page_id;
		return(0);
	}

	if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) {

		/* The trx sys header is so low in the latching order that we
		play safe and do not leave the i/o-completion to an
		asynchronous i/o-thread. Ibuf bitmap pages must always be
		read with synchronous i/o, to make sure they do not get
		involved in thread deadlocks. */

		sync = true;
	}

	/* The following call will also check if the tablespace does not exist
	or is being dropped; if we succeed in initing the page in the buffer
	pool for read, then DISCARD cannot proceed until the read has
	completed */
	bpage = buf_page_init_for_read(err, mode, page_id, page_size, unzip);

	if (bpage == NULL) {

		return(0);
	}

	DBUG_LOG("ib_buf",
		 "read page " << page_id << " size=" << page_size.physical()
		 << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));

	ut_ad(buf_page_in_file(bpage));

	if (sync) {
		thd_wait_begin(NULL, THD_WAIT_DISKIO);
	}

	void*	dst;

	if (page_size.is_compressed()) {
		dst = bpage->zip.data;
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		dst = ((buf_block_t*) bpage)->frame;
	}

	IORequest	request(type | IORequest::READ);

	*err = fil_io(
		request, sync, page_id, page_size, 0, page_size.physical(),
		dst, bpage, ignore_missing_space);

	if (sync) {
		thd_wait_end(NULL);
	}

	if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
		if (*err == DB_TABLESPACE_TRUNCATED) {
			/* Remove the page which is outside the
			truncated tablespace bounds when recovering
			from a crash that happened during a truncation */
			buf_read_page_handle_error(bpage);
			if (recv_recovery_is_on()) {
				mutex_enter(&recv_sys->mutex);
				ut_ad(recv_sys->n_addrs > 0);
				recv_sys->n_addrs--;
				mutex_exit(&recv_sys->mutex);
			}
			return(0);
		} else if (IORequest::ignore_missing(type)
			   || *err == DB_TABLESPACE_DELETED
			   || *err == DB_IO_ERROR) {
			buf_read_page_handle_error(bpage);
			return(0);
		}

		ut_error;
	}

	if (sync) {
		/* The i/o is already completed when we arrive from
		fil_io */
		*err = buf_page_io_complete(bpage);

		if (*err != DB_SUCCESS) {
			return(0);
		}
	}

	return(1);
}

/** Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
page, not even the one at the position (space, offset), if the read-ahead
mechanism is not activated. NOTE 1: the calling thread may own latches on
pages: to avoid deadlocks this function must be written such that it cannot
end up waiting for these latches! NOTE 2: the calling thread must want
access to the page given: this rule is set to prevent unintended read-aheads
performed by ibuf routines, a situation which could result in a deadlock if
the OS does not support asynchronous i/o.
@param[in]	page_id		page id of a page which the current thread
wants to access
@param[in]	page_size	page size
@param[in]	inside_ibuf	TRUE if we are inside ibuf routine
@return number of page read requests issued; NOTE that if we read ibuf
pages, it may happen that the page at the given page number does not
get read even if we return a positive value! */
ulint
buf_read_ahead_random(
	const page_id_t		page_id,
	const page_size_t&	page_size,
	ibool			inside_ibuf)
{
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	ulint		recent_blocks	= 0;
	ulint		ibuf_mode;
	ulint		count;
	ulint		low, high;
	dberr_t		err = DB_SUCCESS;
	ulint		i;
	const ulint	buf_read_ahead_random_area
				= BUF_READ_AHEAD_AREA(buf_pool);

	if (!srv_random_read_ahead) {
		/* Disabled by user */
		return(0);
	}

	if (srv_startup_is_before_trx_rollback_phase) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

	if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

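	/* Round the page number down and up to the read-ahead area
	boundaries. E.g. with a 64-page area, page 100 yields
	low = 64, high = 128. */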
	low  = (page_id.page_no() / buf_read_ahead_random_area)
		* buf_read_ahead_random_area;

	high = (page_id.page_no() / buf_read_ahead_random_area + 1)
		* buf_read_ahead_random_area;

	if (fil_space_t* space = fil_space_acquire(page_id.space())) {
		high = space->max_page_number_for_io(high);
		space->release();
	} else {
		return(0);
	}

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	/* Count how many blocks in the area have been recently accessed,
	that is, reside near the start of the LRU list. */

	for (i = low; i < high; i++) {
		const buf_page_t*	bpage = buf_page_hash_get(
			buf_pool, page_id_t(page_id.space(), i));

		if (bpage != NULL
		    && buf_page_is_accessed(bpage)
		    && buf_page_peek_if_young(bpage)) {

			recent_blocks++;

			if (recent_blocks
			    >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {

				buf_pool_mutex_exit(buf_pool);
				goto read_ahead;
			}
		}
	}

	buf_pool_mutex_exit(buf_pool);
	/* Do nothing */
	return(0);

read_ahead:
	/* Read all the suitable blocks within the area */

	if (inside_ibuf) {
		ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
	} else {
		ibuf_mode = BUF_READ_ANY_PAGE;
	}

	count = 0;

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync aio
		mode: hence false as the sync parameter */

		const page_id_t	cur_page_id(page_id.space(), i);

		if (!ibuf_bitmap_page(cur_page_id, page_size)) {
			count += buf_read_page_low(
				&err, false,
				IORequest::DO_NOT_WAKE,
				ibuf_mode,
				cur_page_id, page_size, false);

			switch (err) {
			case DB_SUCCESS:
			case DB_TABLESPACE_TRUNCATED:
			case DB_ERROR:
				break;
			case DB_TABLESPACE_DELETED:
				ib::info() << "Random read-ahead trying to"
					" access page " << cur_page_id
					<< " in a non-existing or"
					" being-dropped tablespace";
				break;
			default:
				ut_error;
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

	if (count) {
		DBUG_PRINT("ib_buf", ("random read-ahead %u pages, %u:%u",
				      (unsigned) count,
				      (unsigned) page_id.space(),
				      (unsigned) page_id.page_no()));
	}

	/* Read-ahead is considered one I/O operation for the purpose of
	LRU policy decisions. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read_rnd += count;
	srv_stats.buf_pool_reads.add(count);
	return(count);
}

/** High-level function which reads a page synchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released when the read completes.
@param[in]	page_id		page id
@param[in]	page_size	page size
@retval DB_SUCCESS if the page was read and is not corrupted,
@retval DB_PAGE_CORRUPTED if the page is corrupted based on the checksum
check,
@retval DB_DECRYPTION_FAILED if the page post-encryption checksum matches
but the normal page checksum does not match after decryption,
@retval DB_TABLESPACE_DELETED if the tablespace .ibd file is missing */
dberr_t
buf_read_page(
	const page_id_t		page_id,
	const page_size_t&	page_size)
{
	ulint		count;
	dberr_t		err = DB_SUCCESS;

	/* We do synchronous IO because our AIO completion code
	is sub-optimal. See buf_page_io_complete(): we have to
	acquire the buffer pool mutex before acquiring the block
	mutex, required for updating the page state. The acquisition
	of the buffer pool mutex becomes an expensive bottleneck. */

	count = buf_read_page_low(
		&err, true,
		0, BUF_READ_ANY_PAGE, page_id, page_size, false);

	srv_stats.buf_pool_reads.add(count);

	if (err == DB_TABLESPACE_DELETED) {
		ib::info() << "trying to read page " << page_id
			<< " in a non-existing or being-dropped tablespace";
	}

	/* Increment the number of I/O operations used for LRU policy. */
	buf_LRU_stat_inc_io();

	return(err);
}

/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	sync		true if synchronous aio is desired */
void
buf_read_page_background(
	const page_id_t		page_id,
	const page_size_t&	page_size,
	bool			sync)
{
	ulint		count;
	dberr_t		err;

	count = buf_read_page_low(
		&err, sync,
		IORequest::DO_NOT_WAKE | IORequest::IGNORE_MISSING,
		BUF_READ_ANY_PAGE,
		page_id, page_size, false);

	switch (err) {
	case DB_SUCCESS:
	case DB_TABLESPACE_TRUNCATED:
	case DB_ERROR:
		break;
	case DB_TABLESPACE_DELETED:
		ib::info() << "trying to read page " << page_id
			<< " in the background"
			" in a non-existing or being-dropped tablespace";
		break;
	case DB_PAGE_CORRUPTED:
	case DB_DECRYPTION_FAILED:
		ib::error()
			<< "Background page read failed to "
			"read or decrypt " << page_id;
		break;
	default:
		ib::fatal() << "Error " << err << " in background read of "
			<< page_id;
	}

	srv_stats.buf_pool_reads.add(count);

	/* We do not increment the number of I/O operations used for LRU
	policy here (buf_LRU_stat_inc_io()). We use that statistic in
	heuristics to decide about evicting the uncompressed version of
	compressed pages from the buffer pool. Since this function is called
	from buffer pool load, these I/Os are deliberate and not part of the
	normal workload, so we can ignore them in our heuristics. */
}

/** Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
Does not read any page if the read-ahead mechanism is not activated. Note
that the algorithm looks at the 'natural' adjacent successor and
predecessor of the page, which on the leaf level of a B-tree are the next
and previous page in the chain of leaves. To know these, the page specified
in (space, offset) must already be present in the buf_pool. Thus, the
natural way to use this function is to call it when a page in the buf_pool
is accessed the first time, calling this function just after it has been
buffer-fixed.
NOTE 1: as this function looks at the natural predecessor and successor
fields on the page, what happens if these are not initialized to any
sensible value? No problem: before applying read-ahead we check that the
area to read is within the span of the space, and if it is not, read-ahead
is not applied. An uninitialized value may result in a useless read
operation, but only very rarely.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
NOTE 3: the calling thread must want access to the page given: this rule is
set to prevent unintended read-aheads performed by ibuf routines, a situation
which could result in a deadlock if the OS does not support asynchronous io.
@param[in]	page_id		page id; see NOTE 3 above
@param[in]	page_size	page size
@param[in]	inside_ibuf	TRUE if we are inside ibuf routine
@return number of page read requests issued */
ulint
buf_read_ahead_linear(
	const page_id_t		page_id,
	const page_size_t&	page_size,
	ibool			inside_ibuf)
{
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	buf_page_t*	bpage;
	buf_frame_t*	frame;
	buf_page_t*	pred_bpage	= NULL;
	ulint		pred_offset;
	ulint		succ_offset;
	int		asc_or_desc;
	ulint		new_offset;
	ulint		fail_count;
	ulint		low, high;
	dberr_t		err = DB_SUCCESS;
	ulint		i;
	const ulint	buf_read_ahead_linear_area
		= BUF_READ_AHEAD_AREA(buf_pool);
	ulint		threshold;

	/* Check if read-ahead is disabled */
	if (!srv_read_ahead_threshold) {
		return(0);
	}

	if (srv_startup_is_before_trx_rollback_phase) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

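	/* As in random read-ahead, low and high delimit the aligned
	read-ahead area containing page_id; only accesses to its border
	pages can trigger linear read-ahead. */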
	low  = (page_id.page_no() / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (page_id.page_no() / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((page_id.page_no() != low) && (page_id.page_no() != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	ulint	space_size;

	if (fil_space_t* space = fil_space_acquire(page_id.space())) {
		space_size = space->committed_size;
		space->release();

		if (high > space_size) {
			/* The area is not whole */
			return(0);
		}
	} else {
		return(0);
	}

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	/* Check that almost all pages in the area have been accessed; if
	offset == low, the accesses must be in a descending order, otherwise,
	in an ascending order. */

	asc_or_desc = 1;

	if (page_id.page_no() == low) {
		asc_or_desc = -1;
	}

	/* How many out-of-order accessed pages can we ignore
	when working out the access pattern for linear read-ahead */
	threshold = ut_min(static_cast<ulint>(64 - srv_read_ahead_threshold),
			   BUF_READ_AHEAD_AREA(buf_pool));
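	/* With the default srv_read_ahead_threshold of 56 and a 64-page
	area, up to 64 - 56 = 8 out-of-order accesses are tolerated. */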

	fail_count = 0;

	for (i = low; i < high; i++) {
		bpage = buf_page_hash_get(buf_pool,
					  page_id_t(page_id.space(), i));

		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
			/* Not accessed */
			fail_count++;

		} else if (pred_bpage) {
			/* Note that buf_page_is_accessed() returns
			the time of the first access.  If some blocks
			of the extent existed in the buffer pool at
			the time of a linear access pattern, the first
			access times may be nonmonotonic, even though
			the latest access times were linear.  The
			threshold (srv_read_ahead_threshold) should help
			a little against this. */
			int res = ut_ulint_cmp(
				buf_page_is_accessed(bpage),
				buf_page_is_accessed(pred_bpage));
			/* Accesses not in the right order */
			if (res != 0 && res != asc_or_desc) {
				fail_count++;
			}
		}

		if (fail_count > threshold) {
			/* Too many failures: return */
			buf_pool_mutex_exit(buf_pool);
			return(0);
		}

		if (bpage && buf_page_is_accessed(bpage)) {
			pred_bpage = bpage;
		}
	}

	/* If we got this far, we know that enough pages in the area have
	been accessed in the right order: linear read-ahead can be sensible */

	bpage = buf_page_hash_get(buf_pool, page_id);

	if (bpage == NULL) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		frame = bpage->zip.data;
		break;
	case BUF_BLOCK_FILE_PAGE:
		frame = ((buf_block_t*) bpage)->frame;
		break;
	default:
		ut_error;
		break;
	}

	/* Read the natural predecessor and successor page addresses from
	the page; NOTE that because the calling thread may have an x-latch
	on the page, we do not acquire an s-latch on the page, this is to
	prevent deadlocks. Even if we read values which are nonsense, the
	algorithm will work. */

	pred_offset = fil_page_get_prev(frame);
	succ_offset = fil_page_get_next(frame);

	buf_pool_mutex_exit(buf_pool);

	if ((page_id.page_no() == low)
	    && (succ_offset == page_id.page_no() + 1)) {

		/* The scan is moving backwards through a linear chain:
		read ahead towards the predecessor */
		new_offset = pred_offset;

	} else if ((page_id.page_no() == high - 1)
		   && (pred_offset == page_id.page_no() - 1)) {

		/* The scan is moving forwards through a linear chain:
		read ahead towards the successor */
		new_offset = succ_offset;
	} else {
		/* Successor or predecessor not in the right order */

		return(0);
	}

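	/* new_offset points into the adjacent area in the direction of
	the scan; recompute that area's boundaries and check below that
	the whole area lies within the tablespace. */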
	low  = (new_offset / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (new_offset / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((new_offset != low) && (new_offset != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (high > space_size) {
		/* The area is not whole, return */

		return(0);
	}

	ulint	count = 0;

	/* If we got this far, read-ahead can be sensible: do it */

	ulint	ibuf_mode;

	ibuf_mode = inside_ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;

	/* Since Windows XP seems to schedule the i/o handler thread
	very eagerly, and consequently it does not wait for the
	full read batch to be posted, we use special heuristics here */

	os_aio_simulated_put_read_threads_to_sleep();

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync
		aio mode: hence false as the sync parameter */

		const page_id_t	cur_page_id(page_id.space(), i);

		if (!ibuf_bitmap_page(cur_page_id, page_size)) {
			count += buf_read_page_low(
				&err, false,
				IORequest::DO_NOT_WAKE,
				ibuf_mode, cur_page_id, page_size, false);

			switch (err) {
			case DB_SUCCESS:
			case DB_TABLESPACE_TRUNCATED:
			case DB_TABLESPACE_DELETED:
			case DB_ERROR:
				break;
			case DB_PAGE_CORRUPTED:
			case DB_DECRYPTION_FAILED:
				ib::error() << "Linear read-ahead failed to"
					" read or decrypt "
					<< page_id_t(page_id.space(), i);
				break;
			default:
				ut_error;
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

	if (count) {
		DBUG_PRINT("ib_buf", ("linear read-ahead " ULINTPF " pages, "
				      "%u:%u",
				      count,
				      page_id.space(),
				      page_id.page_no()));
	}

	/* Read-ahead is considered one I/O operation for the purpose of
	LRU policy decisions. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read += count;
	return(count);
}

/********************************************************************//**
Issues read requests for pages which the ibuf module wants to read in, in
order to contract the insert buffer tree. Technically, this function is like
a read-ahead function. */
void
buf_read_ibuf_merge_pages(
/*======================*/
	bool		sync,		/*!< in: true if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	const ulint*	space_ids,	/*!< in: array of space ids */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of elements
					in the arrays */
{
#ifdef UNIV_IBUF_DEBUG
	ut_a(n_stored < srv_page_size);
#endif

	for (ulint i = 0; i < n_stored; i++) {
		fil_space_t* space = fil_space_acquire_silent(space_ids[i]);
		if (!space) {
tablespace_deleted:
			/* The tablespace was not found: remove all
			entries for it */
			ibuf_delete_for_discarded_space(space_ids[i]);
			while (i + 1 < n_stored
			       && space_ids[i + 1] == space_ids[i]) {
				i++;
			}
			continue;
		}

		ulint size = space->size;
		if (!size) {
			size = fil_space_get_size(space->id);
		}

		if (UNIV_UNLIKELY(page_nos[i] >= size)) {
			do {
				ibuf_delete_recs(page_id_t(space_ids[i],
							   page_nos[i]));
			} while (++i < n_stored
				 && space_ids[i - 1] == space_ids[i]
				 && page_nos[i] >= size);
			i--;
next:
			space->release();
			continue;
		}

		const page_id_t	page_id(space_ids[i], page_nos[i]);

		buf_pool_t*	buf_pool = buf_pool_get(page_id);

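		/* Throttle: with BUF_READ_AHEAD_PEND_LIMIT == 2, wait
		while pending reads keep more than half of the buffer
		pool io-fixed. */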
		while (buf_pool->n_pend_reads
		       > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
			os_thread_sleep(500000);
		}

		dberr_t	err;

		buf_read_page_low(&err,
				  sync && (i + 1 == n_stored),
				  0,
				  BUF_READ_ANY_PAGE, page_id,
				  page_size_t(space->flags), true);

		switch (err) {
		case DB_SUCCESS:
		case DB_TABLESPACE_TRUNCATED:
		case DB_ERROR:
			break;
		case DB_TABLESPACE_DELETED:
			space->release();
			goto tablespace_deleted;
		case DB_PAGE_CORRUPTED:
		case DB_DECRYPTION_FAILED:
			ib::error() << "Failed to read or decrypt page "
				    << page_nos[i]
				    << " of '" << space->chain.start->name
				    << "' for change buffer merge";
			break;
		default:
			ut_error;
		}

		goto next;
	}

	os_aio_simulated_wake_handler_threads();

	if (n_stored) {
		DBUG_PRINT("ib_buf",
			   ("ibuf merge read-ahead %u pages, space %u",
			    unsigned(n_stored), unsigned(space_ids[0])));
	}
}

/** Issues read requests for pages which recovery wants to read in.
@param[in]	sync		true if the caller wants this function to wait
for the highest address page to get read in, before this function returns
@param[in]	space_id	tablespace id
@param[in]	page_nos	array of page numbers to read, with the
highest page number the last in the array
@param[in]	n_stored	number of page numbers in the array */
void
buf_read_recv_pages(
	bool		sync,
	ulint		space_id,
	const ulint*	page_nos,
	ulint		n_stored)
{
	fil_space_t*		space	= fil_space_get(space_id);

	if (space == NULL) {
		/* The tablespace is missing: do nothing */
		return;
	}

	fil_space_open_if_needed(space);

	const page_size_t	page_size(space->flags);

	for (ulint i = 0; i < n_stored; i++) {
		buf_pool_t*		buf_pool;
		const page_id_t	cur_page_id(space_id, page_nos[i]);

		ulint			count = 0;

		buf_pool = buf_pool_get(cur_page_id);
		ulint limit = 0;
		for (ulint j = 0; j < buf_pool->n_chunks; j++) {
			limit += buf_pool->chunks[j].size / 2;
		}

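		/* Throttle: 'limit' computed above is about half of the
		buffer pool size in pages; wait until the pending reads
		drop below it before issuing more. */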
		while (buf_pool->n_pend_reads >= limit) {
			os_aio_simulated_wake_handler_threads();
			os_thread_sleep(10000);

			count++;

			if (!(count % 1000)) {

				ib::error()
					<< "Waited for " << count / 100
					<< " seconds for "
					<< buf_pool->n_pend_reads
					<< " pending reads";
			}
		}

		dberr_t err;

		if (sync && i + 1 == n_stored) {
			buf_read_page_low(
				&err, true,
				0,
				BUF_READ_ANY_PAGE,
				cur_page_id, page_size, true);
		} else {
			buf_read_page_low(
				&err, false,
				IORequest::DO_NOT_WAKE,
				BUF_READ_ANY_PAGE,
				cur_page_id, page_size, true);
		}

		if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
			ib::error() << "Recovery failed to read or decrypt "
				<< cur_page_id;
		}
	}

	os_aio_simulated_wake_handler_threads();

	DBUG_PRINT("ib_buf", ("recovery read-ahead (%u pages)",
			      unsigned(n_stored)));
}