1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file buf/buf0rea.cc
29 The database buffer read
30 
31 Created 11/5/1995 Heikki Tuuri
32 *******************************************************/
33 
34 #include "ha_prototypes.h"
35 #include <mysql/service_thd_wait.h>
36 
37 #include "buf0rea.h"
38 #include "fil0fil.h"
39 #include "mtr0mtr.h"
40 #include "buf0buf.h"
41 #include "buf0flu.h"
42 #include "buf0lru.h"
43 #include "buf0dblwr.h"
44 #include "ibuf0ibuf.h"
45 #include "log0recv.h"
46 #include "trx0sys.h"
47 #include "os0file.h"
48 #include "srv0start.h"
49 #include "srv0srv.h"
50 
51 /** There must be at least this many pages in buf_pool in the area to start
52 a random read-ahead */
53 #define BUF_READ_AHEAD_RANDOM_THRESHOLD(b)	\
54 				(5 + BUF_READ_AHEAD_AREA(b) / 8)
55 
56 /** If there are buf_pool->curr_size per the number below pending reads, then
57 read-ahead is not done: this is to prevent flooding the buffer pool with
58 i/o-fixed buffer blocks */
59 #define BUF_READ_AHEAD_PEND_LIMIT	2
60 
61 /********************************************************************//**
62 Unfixes the pages, unlatches the page,
63 removes it from page_hash and removes it from LRU. */
64 static
65 void
buf_read_page_handle_error(buf_page_t * bpage)66 buf_read_page_handle_error(
67 /*=======================*/
68 	buf_page_t*	bpage)	/*!< in: pointer to the block */
69 {
70 	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
71 	const bool	uncompressed = (buf_page_get_state(bpage)
72 					== BUF_BLOCK_FILE_PAGE);
73 
74 	/* First unfix and release lock on the bpage */
75 	buf_pool_mutex_enter(buf_pool);
76 	mutex_enter(buf_page_get_mutex(bpage));
77 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
78 	ut_ad(bpage->buf_fix_count == 0);
79 
80 	/* Set BUF_IO_NONE before we remove the block from LRU list */
81 	buf_page_set_io_fix(bpage, BUF_IO_NONE);
82 
83 	if (uncompressed) {
84 		rw_lock_x_unlock_gen(
85 			&((buf_block_t*) bpage)->lock,
86 			BUF_IO_READ);
87 	}
88 
89 	mutex_exit(buf_page_get_mutex(bpage));
90 
91 	/* remove the block from LRU list */
92 	buf_LRU_free_one_page(bpage);
93 
94 	ut_ad(buf_pool->n_pend_reads > 0);
95 	ut_ad(mutex_own(&buf_pool->mutex));
96 	buf_pool->n_pend_reads--;
97 
98 	buf_pool_mutex_exit(buf_pool);
99 }
100 
/** Low-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.

@param[out] err		DB_SUCCESS, DB_TABLESPACE_DELETED or
			DB_TABLESPACE_TRUNCATED if we are trying
			to read from a non-existent tablespace, a
			tablespace which is just now being dropped,
			or a tablespace which is truncated
@param[in] sync		true if synchronous aio is desired
@param[in] type		IO type, SIMULATED, IGNORE_MISSING
@param[in] mode		BUF_READ_IBUF_PAGES_ONLY, ...,
@param[in] page_id	page id
@param[in] page_size	page size
@param[in] unzip	true=request uncompressed page
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped */
static
ulint
buf_read_page_low(
	dberr_t*		err,
	bool			sync,
	ulint			type,
	ulint			mode,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	bool			unzip)
{
	buf_page_t*	bpage;

	*err = DB_SUCCESS;

	/* Doublewrite buffer pages are never read into the pool; they
	are accessed directly by the doublewrite code. */
	if (page_id.space() == TRX_SYS_SPACE
	    && buf_dblwr_page_inside(page_id.page_no())) {

		ib::error() << "Trying to read doublewrite buffer page "
			<< page_id;
		return(0);
	}

	if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) {

		/* Trx sys header is so low in the latching order that we play
		safe and do not leave the i/o-completion to an asynchronous
		i/o-thread. Ibuf bitmap pages must always be read with
		synchronous i/o, to make sure they do not get involved in
		thread deadlocks. */

		sync = true;
	}

	/* The following call will also check if the tablespace does not exist
	or is being dropped; if we succeed in initing the page in the buffer
	pool for read, then DISCARD cannot proceed until the read has
	completed */
	bpage = buf_page_init_for_read(err, mode, page_id, page_size, unzip);

	if (bpage == NULL) {
		/* Page already in the pool, or the tablespace is missing
		or being dropped: nothing to read. */
		return(0);
	}

	DBUG_PRINT("ib_buf", ("read page %u:%u size=%u unzip=%u,%s",
			      (unsigned) page_id.space(),
			      (unsigned) page_id.page_no(),
			      (unsigned) page_size.physical(),
			      (unsigned) unzip,
			      sync ? "sync" : "async"));

	ut_ad(buf_page_in_file(bpage));

	if (sync) {
		thd_wait_begin(NULL, THD_WAIT_DISKIO);
	}

	/* Choose the destination buffer: the compressed-page frame for
	compressed tablespaces, the uncompressed frame otherwise. */
	void*	dst;

	if (page_size.is_compressed()) {
		dst = bpage->zip.data;
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		dst = ((buf_block_t*) bpage)->frame;
	}

	/* This debug code is only for 5.7. In trunk, with newDD,
	the space->name is no longer same as table name. */
	DBUG_EXECUTE_IF("innodb_invalid_read_after_truncate",
		fil_space_t*	space = fil_space_get(page_id.space());

		if (space != NULL && strcmp(space->name, "test/t1") == 0
		    && page_id.page_no() == space->size - 1) {
			type = IORequest::READ;
			sync = true;
		}
	);

	IORequest	request(type | IORequest::READ);

	*err = fil_io(
		request, sync, page_id, page_size, 0, page_size.physical(),
		dst, bpage);

	if (sync) {
		thd_wait_end(NULL);
	}

	if (*err != DB_SUCCESS) {
		if (*err == DB_TABLESPACE_TRUNCATED) {
			/* Remove the page which is outside the
			truncated tablespace bounds when recovering
			from a crash happened during a truncation */
			buf_read_page_handle_error(bpage);
			if (recv_recovery_on) {
				/* Recovery will never see this page
				arrive: drop it from the count of
				pending recovery reads. */
				mutex_enter(&recv_sys->mutex);
				ut_ad(recv_sys->n_addrs > 0);
				recv_sys->n_addrs--;
				mutex_exit(&recv_sys->mutex);
			}
			return(0);
		} else if (IORequest::ignore_missing(type)
			   || *err == DB_TABLESPACE_DELETED) {
			buf_read_page_handle_error(bpage);
			return(0);
		}

		ut_error;
	}

	if (sync) {
		/* The i/o is already completed when we arrive from
		fil_read */
		if (!buf_page_io_complete(bpage)) {
			return(0);
		}
	}

	return(1);
}
242 
/** Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
page, not even the one at the position (space, offset), if the read-ahead
mechanism is not activated. NOTE 1: the calling thread may own latches on
pages: to avoid deadlocks this function must be written such that it cannot
end up waiting for these latches! NOTE 2: the calling thread must want
access to the page given: this rule is set to prevent unintended read-aheads
performed by ibuf routines, a situation which could result in a deadlock if
the OS does not support asynchronous i/o.
@param[in]	page_id		page id of a page which the current thread
wants to access
@param[in]	page_size	page size
@param[in]	inside_ibuf	TRUE if we are inside ibuf routine
@return number of page read requests issued; NOTE that if we read ibuf
pages, it may happen that the page at the given page number does not
get read even if we return a positive value! */
ulint
buf_read_ahead_random(
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	ibool			inside_ibuf)
{
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	ulint		recent_blocks	= 0;
	ulint		ibuf_mode;
	ulint		count;
	ulint		low, high;
	dberr_t		err;
	ulint		i;
	const ulint	buf_read_ahead_random_area
				= BUF_READ_AHEAD_AREA(buf_pool);

	if (!srv_random_read_ahead) {
		/* Disabled by user */
		return(0);
	}

	if (srv_startup_is_before_trx_rollback_phase) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

	if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	/* [low, high) is the read-ahead area containing page_id. */
	low  = (page_id.page_no() / buf_read_ahead_random_area)
		* buf_read_ahead_random_area;

	high = (page_id.page_no() / buf_read_ahead_random_area + 1)
		* buf_read_ahead_random_area;

	/* Remember the tablespace version before we ask the tablespace size
	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
	do not try to read outside the bounds of the tablespace! */
	if (fil_space_t* space = fil_space_acquire(page_id.space())) {

#ifdef UNIV_DEBUG
		/* NOTE(review): this debug-only size accumulation is
		computed but never used or asserted against — looks like
		the remains of a removed consistency check. */
		if (srv_file_per_table) {
			ulint	size = 0;

			for (const fil_node_t*	node =
				UT_LIST_GET_FIRST(space->chain);
			     node != NULL;
			     node = UT_LIST_GET_NEXT(chain, node)) {

				size += os_file_get_size(node->handle)
					/ page_size.physical();
			}
		}
#endif /* UNIV_DEBUG */

		/* Clip the area to the current tablespace size. */
		if (high > space->size) {
			high = space->size;
		}
		fil_space_release(space);
	} else {
		return(0);
	}

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
		/* Too many reads already pending: do not flood the
		pool with i/o-fixed blocks. */
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	/* Count how many blocks in the area have been recently accessed,
	that is, reside near the start of the LRU list. */

	for (i = low; i < high; i++) {
		/* This debug code is only for 5.7. In trunk, with newDD,
		the space->name is no longer same as table name. */
		DBUG_EXECUTE_IF("innodb_invalid_read_after_truncate",
			fil_space_t*	space = fil_space_get(page_id.space());

			if (space != NULL
			    && strcmp(space->name, "test/t1") == 0) {
				high = space->size;
				buf_pool_mutex_exit(buf_pool);
				goto read_ahead;
			}
		);

		const buf_page_t*	bpage = buf_page_hash_get(
			buf_pool, page_id_t(page_id.space(), i));

		if (bpage != NULL
		    && buf_page_is_accessed(bpage)
		    && buf_page_peek_if_young(bpage)) {

			recent_blocks++;

			if (recent_blocks
			    >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {

				/* Enough recently-accessed pages in the
				area: trigger the read-ahead. Release the
				mutex before issuing any i/o. */
				buf_pool_mutex_exit(buf_pool);
				goto read_ahead;
			}
		}
	}

	buf_pool_mutex_exit(buf_pool);
	/* Do nothing */
	return(0);

read_ahead:
	/* Read all the suitable blocks within the area */

	if (inside_ibuf) {
		ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
	} else {
		ibuf_mode = BUF_READ_ANY_PAGE;
	}

	count = 0;

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync aio
		mode: hence FALSE as the first parameter */

		const page_id_t	cur_page_id(page_id.space(), i);

		if (!ibuf_bitmap_page(cur_page_id, page_size)) {

			count += buf_read_page_low(
				&err, false,
				IORequest::DO_NOT_WAKE,
				ibuf_mode,
				cur_page_id, page_size, false);

			if (err == DB_TABLESPACE_DELETED) {
				ib::warn() << "Random readahead trying to"
					" access page " << cur_page_id
					<< " in nonexisting or"
					" being-dropped tablespace";
				break;
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

	if (count) {
		DBUG_PRINT("ib_buf", ("random read-ahead %u pages, %u:%u",
				      (unsigned) count,
				      (unsigned) page_id.space(),
				      (unsigned) page_id.page_no()));
	}

	/* Read ahead is considered one I/O operation for the purpose of
	LRU policy decision. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read_rnd += count;
	srv_stats.buf_pool_reads.add(count);
	return(count);
}
432 
433 /** High-level function which reads a page asynchronously from a file to the
434 buffer buf_pool if it is not already there. Sets the io_fix flag and sets
435 an exclusive lock on the buffer frame. The flag is cleared and the x-lock
436 released by the i/o-handler thread.
437 @param[in]	page_id		page id
438 @param[in]	page_size	page size
439 @return TRUE if page has been read in, FALSE in case of failure */
440 ibool
buf_read_page(const page_id_t & page_id,const page_size_t & page_size)441 buf_read_page(
442 	const page_id_t&	page_id,
443 	const page_size_t&	page_size)
444 {
445 	ulint		count;
446 	dberr_t		err;
447 
448 	/* We do synchronous IO because our AIO completion code
449 	is sub-optimal. See buf_page_io_complete(), we have to
450 	acquire the buffer pool mutex before acquiring the block
451 	mutex, required for updating the page state. The acquire
452 	of the buffer pool mutex becomes an expensive bottleneck. */
453 
454 	count = buf_read_page_low(
455 		&err, true,
456 		0, BUF_READ_ANY_PAGE, page_id, page_size, false);
457 
458 	srv_stats.buf_pool_reads.add(count);
459 
460 	if (err == DB_TABLESPACE_DELETED) {
461 		ib::error() << "trying to read page " << page_id
462 			<< " in nonexisting or being-dropped tablespace";
463 	}
464 
465 	/* Increment number of I/O operations used for LRU policy. */
466 	buf_LRU_stat_inc_io();
467 
468 	return(count > 0);
469 }
470 
471 /** High-level function which reads a page asynchronously from a file to the
472 buffer buf_pool if it is not already there. Sets the io_fix flag and sets
473 an exclusive lock on the buffer frame. The flag is cleared and the x-lock
474 released by the i/o-handler thread.
475 @param[in]	page_id		page id
476 @param[in]	page_size	page size
477 @param[in]	sync		true if synchronous aio is desired
478 @return TRUE if page has been read in, FALSE in case of failure */
479 ibool
buf_read_page_background(const page_id_t & page_id,const page_size_t & page_size,bool sync)480 buf_read_page_background(
481 	const page_id_t&	page_id,
482 	const page_size_t&	page_size,
483 	bool			sync)
484 {
485 	ulint		count;
486 	dberr_t		err;
487 
488 	count = buf_read_page_low(
489 		&err, sync,
490 		IORequest::DO_NOT_WAKE | IORequest::IGNORE_MISSING,
491 		BUF_READ_ANY_PAGE,
492 		page_id, page_size, false);
493 
494 	srv_stats.buf_pool_reads.add(count);
495 
496 	/* We do not increment number of I/O operations used for LRU policy
497 	here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
498 	about evicting uncompressed version of compressed pages from the
499 	buffer pool. Since this function is called from buffer pool load
500 	these IOs are deliberate and are not part of normal workload we can
501 	ignore these in our heuristics. */
502 
503 	return(count > 0);
504 }
505 
/** Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
Does not read any page if the read-ahead mechanism is not activated. Note
that the algorithm looks at the 'natural' adjacent successor and
predecessor of the page, which on the leaf level of a B-tree are the next
and previous page in the chain of leaves. To know these, the page specified
in (space, offset) must already be present in the buf_pool. Thus, the
natural way to use this function is to call it when a page in the buf_pool
is accessed the first time, calling this function just after it has been
bufferfixed.
NOTE 1: as this function looks at the natural predecessor and successor
fields on the page, what happens, if these are not initialized to any
sensible value? No problem, before applying read-ahead we check that the
area to read is within the span of the space, if not, read-ahead is not
applied. An uninitialized value may result in a useless read operation, but
only very improbably.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
NOTE 3: the calling thread must want access to the page given: this rule is
set to prevent unintended read-aheads performed by ibuf routines, a situation
which could result in a deadlock if the OS does not support asynchronous io.
@param[in]	page_id		page id; see NOTE 3 above
@param[in]	page_size	page size
@param[in]	inside_ibuf	TRUE if we are inside ibuf routine
@return number of page read requests issued */
ulint
buf_read_ahead_linear(
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	ibool			inside_ibuf)
{
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	buf_page_t*	bpage;
	buf_frame_t*	frame;
	buf_page_t*	pred_bpage	= NULL;
	ulint		pred_offset;
	ulint		succ_offset;
	int		asc_or_desc;
	ulint		new_offset;
	ulint		fail_count;
	ulint		low, high;
	dberr_t		err;
	ulint		i;
	const ulint	buf_read_ahead_linear_area
		= BUF_READ_AHEAD_AREA(buf_pool);
	ulint		threshold;

	/* check if readahead is disabled */
	if (!srv_read_ahead_threshold) {
		return(0);
	}

	if (srv_startup_is_before_trx_rollback_phase) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

	/* [low, high) is the read-ahead area containing page_id. */
	low  = (page_id.page_no() / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (page_id.page_no() / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((page_id.page_no() != low) && (page_id.page_no() != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	/* Remember the tablespace version before we ask the tablespace size
	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
	do not try to read outside the bounds of the tablespace! */
	ulint	space_size;

	if (fil_space_t* space = fil_space_acquire(page_id.space())) {
		space_size = space->size;
		fil_space_release(space);

		if (high > space_size) {
			/* The area is not whole */
			return(0);
		}
	} else {
		return(0);
	}

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
		/* Too many reads already pending: do not flood the
		pool with i/o-fixed blocks. */
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	/* Check that almost all pages in the area have been accessed; if
	offset == low, the accesses must be in a descending order, otherwise,
	in an ascending order. */

	asc_or_desc = 1;

	if (page_id.page_no() == low) {
		asc_or_desc = -1;
	}

	/* How many out of order accessed pages can we ignore
	when working out the access pattern for linear readahead */
	threshold = ut_min(static_cast<ulint>(64 - srv_read_ahead_threshold),
			   BUF_READ_AHEAD_AREA(buf_pool));

	fail_count = 0;

	for (i = low; i < high; i++) {
		bpage = buf_page_hash_get(buf_pool,
					  page_id_t(page_id.space(), i));

		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
			/* Not accessed */
			fail_count++;

		} else if (pred_bpage) {
			/* Note that buf_page_is_accessed() returns
			the time of the first access.  If some blocks
			of the extent existed in the buffer pool at
			the time of a linear access pattern, the first
			access times may be nonmonotonic, even though
			the latest access times were linear.  The
			threshold (srv_read_ahead_factor) should help
			a little against this. */
			int res = ut_ulint_cmp(
				buf_page_is_accessed(bpage),
				buf_page_is_accessed(pred_bpage));
			/* Accesses not in the right order */
			if (res != 0 && res != asc_or_desc) {
				fail_count++;
			}
		}

		if (fail_count > threshold) {
			/* Too many failures: return */
			buf_pool_mutex_exit(buf_pool);
			return(0);
		}

		if (bpage && buf_page_is_accessed(bpage)) {
			pred_bpage = bpage;
		}
	}

	/* If we got this far, we know that enough pages in the area have
	been accessed in the right order: linear read-ahead can be sensible */

	bpage = buf_page_hash_get(buf_pool, page_id);

	if (bpage == NULL) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		frame = bpage->zip.data;
		break;
	case BUF_BLOCK_FILE_PAGE:
		frame = ((buf_block_t*) bpage)->frame;
		break;
	default:
		ut_error;
		break;
	}

	/* Read the natural predecessor and successor page addresses from
	the page; NOTE that because the calling thread may have an x-latch
	on the page, we do not acquire an s-latch on the page, this is to
	prevent deadlocks. Even if we read values which are nonsense, the
	algorithm will work. */

	pred_offset = fil_page_get_prev(frame);
	succ_offset = fil_page_get_next(frame);

	buf_pool_mutex_exit(buf_pool);

	/* Decide which direction to read ahead in: the predecessor area
	when accesses descend to the low border, the successor area when
	they ascend to the high border. */
	if ((page_id.page_no() == low)
	    && (succ_offset == page_id.page_no() + 1)) {

		/* This is ok, we can continue */
		new_offset = pred_offset;

	} else if ((page_id.page_no() == high - 1)
		   && (pred_offset == page_id.page_no() - 1)) {

		/* This is ok, we can continue */
		new_offset = succ_offset;
	} else {
		/* Successor or predecessor not in the right order */

		return(0);
	}

	/* [low, high) becomes the area around the page we jump to. */
	low  = (new_offset / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (new_offset / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((new_offset != low) && (new_offset != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (high > space_size) {
		/* The area is not whole, return */

		return(0);
	}

	ulint	count = 0;

	/* If we got this far, read-ahead can be sensible: do it */

	ulint	ibuf_mode;

	ibuf_mode = inside_ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;

	/* Since Windows XP seems to schedule the i/o handler thread
	very eagerly, and consequently it does not wait for the
	full read batch to be posted, we use special heuristics here */

	os_aio_simulated_put_read_threads_to_sleep();

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync
		aio mode: hence FALSE as the first parameter */

		const page_id_t	cur_page_id(page_id.space(), i);

		if (!ibuf_bitmap_page(cur_page_id, page_size)) {

			count += buf_read_page_low(
				&err, false,
				IORequest::DO_NOT_WAKE,
				ibuf_mode, cur_page_id, page_size, false);

			if (err == DB_TABLESPACE_DELETED) {
				ib::warn() << "linear readahead trying to"
					" access page "
					<< page_id_t(page_id.space(), i)
					<< " in nonexisting or being-dropped"
					" tablespace";
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

	if (count) {
		DBUG_PRINT("ib_buf", ("linear read-ahead %lu pages, "
				      UINT32PF ":" UINT32PF,
				      count,
				      page_id.space(),
				      page_id.page_no()));
	}

	/* Read ahead is considered one I/O operation for the purpose of
	LRU policy decision. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read += count;
	return(count);
}
790 
791 /********************************************************************//**
792 Issues read requests for pages which the ibuf module wants to read in, in
793 order to contract the insert buffer tree. Technically, this function is like
794 a read-ahead function. */
795 void
buf_read_ibuf_merge_pages(bool sync,const ulint * space_ids,const ulint * page_nos,ulint n_stored)796 buf_read_ibuf_merge_pages(
797 /*======================*/
798 	bool		sync,		/*!< in: true if the caller
799 					wants this function to wait
800 					for the highest address page
801 					to get read in, before this
802 					function returns */
803 	const ulint*	space_ids,	/*!< in: array of space ids */
804 	const ulint*	page_nos,	/*!< in: array of page numbers
805 					to read, with the highest page
806 					number the last in the
807 					array */
808 	ulint		n_stored)	/*!< in: number of elements
809 					in the arrays */
810 {
811 #ifdef UNIV_IBUF_DEBUG
812 	ut_a(n_stored < UNIV_PAGE_SIZE);
813 #endif
814 
815 	for (ulint i = 0; i < n_stored; i++) {
816 		const page_id_t	page_id(space_ids[i], page_nos[i]);
817 
818 		buf_pool_t*	buf_pool = buf_pool_get(page_id);
819 
820 		bool			found;
821 		const page_size_t	page_size(fil_space_get_page_size(
822 			space_ids[i], &found));
823 
824 		if (!found) {
825 			/* The tablespace was not found, remove the
826 			entries for that page */
827 			ibuf_merge_or_delete_for_page(NULL, page_id,
828 						      NULL, FALSE);
829 			continue;
830 		}
831 
832 		while (buf_pool->n_pend_reads
833 		       > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
834 			os_thread_sleep(500000);
835 		}
836 
837 		dberr_t	err;
838 
839 		buf_read_page_low(&err,
840 				  sync && (i + 1 == n_stored),
841 				  0,
842 				  BUF_READ_ANY_PAGE, page_id, page_size,
843 				  true);
844 
845 		if (err == DB_TABLESPACE_DELETED) {
846 			/* We have deleted or are deleting the single-table
847 			tablespace: remove the entries for the tablespace */
848 
849 			ibuf_delete_for_discarded_space(space_ids[i]);
850 		}
851 	}
852 
853 	os_aio_simulated_wake_handler_threads();
854 
855 	if (n_stored) {
856 		DBUG_PRINT("ib_buf",
857 			   ("ibuf merge read-ahead %u pages, space %u",
858 			    unsigned(n_stored), unsigned(space_ids[0])));
859 	}
860 }
861 
/** Issues read requests for pages which recovery wants to read in.
@param[in]	sync		true if the caller wants this function to wait
for the highest address page to get read in, before this function returns
@param[in]	space_id	tablespace id
@param[in]	page_nos	array of page numbers to read, with the
highest page number the last in the array
@param[in]	n_stored	number of page numbers in the array */
void
buf_read_recv_pages(
	bool		sync,
	ulint		space_id,
	const ulint*	page_nos,
	ulint		n_stored)
{
	ulint			count;
	dberr_t			err;
	ulint			i;
	fil_space_t*		space	= fil_space_get(space_id);

	if (space == NULL) {
		/* The tablespace is missing: do nothing */
		return;
	}

	fil_space_open_if_needed(space);

	const page_size_t	page_size(space->flags);

	for (i = 0; i < n_stored; i++) {
		buf_pool_t*		buf_pool;
		const page_id_t	cur_page_id(space_id, page_nos[i]);

		/* count here is the number of 10 ms sleeps performed
		while waiting below, not a page count. */
		count = 0;

		buf_pool = buf_pool_get(cur_page_id);

		/* Throttle: recovery must not exhaust the pool's free
		frames with pending reads. */
		while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {

			os_aio_simulated_wake_handler_threads();
			os_thread_sleep(10000);

			count++;

			/* 1000 iterations of a 10 ms sleep = ~10 s;
			hence count / 100 below is the wait in seconds. */
			if (!(count % 1000)) {

				ib::error()
					<< "Waited for " << count / 100
					<< " seconds for "
					<< buf_pool->n_pend_reads
					<< " pending reads";
			}
		}

		/* Read the last page synchronously when requested, so the
		caller can rely on it being in the pool on return; all
		other pages are queued asynchronously. */
		if ((i + 1 == n_stored) && sync) {
			buf_read_page_low(
				&err, true,
				0,
				BUF_READ_ANY_PAGE,
				cur_page_id, page_size, true);
		} else {
			buf_read_page_low(
				&err, false,
				IORequest::DO_NOT_WAKE,
				BUF_READ_ANY_PAGE,
				cur_page_id, page_size, true);
		}
	}

	os_aio_simulated_wake_handler_threads();

	DBUG_PRINT("ib_buf", ("recovery read-ahead (%u pages)",
			      unsigned(n_stored)));
}
934 
935