/*****************************************************************************

Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.

This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation.  The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License, version 2.0, for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0rea.cc
The database buffer read

Created 11/5/1995 Heikki Tuuri
*******************************************************/

#include "buf0rea.h"

#include "fil0fil.h"
#include "mtr0mtr.h"

#include "buf0buf.h"
#include "buf0flu.h"
#include "buf0lru.h"
#include "buf0dblwr.h"
#include "ibuf0ibuf.h"
#include "log0recv.h"
#include "trx0sys.h"
#include "os0file.h"
#include "srv0start.h"
#include "srv0srv.h"
#include "mysql/plugin.h"
#include "mysql/service_thd_wait.h"

/** There must be at least this many pages in buf_pool in the area to start
a random read-ahead */
#define BUF_READ_AHEAD_RANDOM_THRESHOLD(b)	\
				(5 + BUF_READ_AHEAD_AREA(b) / 8)

/** If the number of pending reads exceeds buf_pool->curr_size divided by the
number below, read-ahead is not done: this is to prevent flooding the buffer
pool with i/o-fixed buffer blocks */
#define BUF_READ_AHEAD_PEND_LIMIT	2

/********************************************************************//**
Unfixes the page, unlatches the page,
removes it from page_hash and removes it from the LRU list. */
static
void
buf_read_page_handle_error(
/*=======================*/
	buf_page_t*	bpage)	/*!< in: pointer to the block */
{
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
	const bool	uncompressed = (buf_page_get_state(bpage)
					== BUF_BLOCK_FILE_PAGE);
	const ulint	fold = buf_page_address_fold(bpage->space,
						     bpage->offset);
	prio_rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, fold);

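	/* buf_LRU_free_one_page() below expects the LRU list mutex, the
	page_hash x-lock and the block mutex to be held, so acquire them
	here in that order. */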
	mutex_enter(&buf_pool->LRU_list_mutex);
	rw_lock_x_lock(hash_lock);
	mutex_enter(buf_page_get_mutex(bpage));

	/* First unfix and release lock on the bpage */
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
	ut_ad(bpage->buf_fix_count == 0);

	/* Set BUF_IO_NONE before we remove the block from LRU list */
	buf_page_set_io_fix(bpage, BUF_IO_NONE);

	if (uncompressed) {
		rw_lock_x_unlock_gen(
			&((buf_block_t*) bpage)->lock,
			BUF_IO_READ);
	}

	/* remove the block from LRU list */
	buf_LRU_free_one_page(bpage);

	mutex_exit(&buf_pool->LRU_list_mutex);

	ut_ad(buf_pool->n_pend_reads > 0);
	os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1);
}

/********************************************************************//**
Low-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped */
UNIV_INTERN
ulint
buf_read_page_low(
/*==============*/
	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
			trying to read from a non-existent tablespace, or a
			tablespace which is just now being dropped */
	bool	sync,	/*!< in: true if synchronous aio is desired */
	ulint	mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
			at read-ahead functions) */
	ulint	space,	/*!< in: space id */
	ulint	zip_size,/*!< in: compressed page size, or 0 */
	ibool	unzip,	/*!< in: TRUE=request uncompressed page */
	ib_int64_t tablespace_version, /*!< in: if the space memory object has
			this timestamp different from what we are giving here,
			treat the tablespace as dropped; this is a timestamp we
			use to stop dangling page reads from a tablespace
			which we have DISCARDed + IMPORTed back */
	ulint	offset,	/*!< in: page number */
	trx_t*	trx,	/*!< in: requesting transaction, or NULL */
	bool	should_buffer)	/*!< in: whether to buffer an aio request.
			AIO read ahead uses this. If you plan to
			use this parameter, make sure you remember
			to call os_aio_dispatch_read_array_submit()
			when you're ready to commit all your requests.*/
{
	buf_page_t*	bpage;
	ulint		wake_later;
	ibool		ignore_nonexistent_pages;

	*err = DB_SUCCESS;

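	/* The mode argument may carry OS_AIO_SIMULATED_WAKE_LATER and
	BUF_READ_IGNORE_NONEXISTENT_PAGES ORed into it; strip them off here
	so that only the plain read mode is passed on to
	buf_page_init_for_read(). */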
	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
	mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;

	ignore_nonexistent_pages = mode & BUF_READ_IGNORE_NONEXISTENT_PAGES;
	mode &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;

	if (space == TRX_SYS_SPACE && buf_dblwr_page_inside(offset)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Warning: trying to read"
			" doublewrite buffer page %lu\n",
			(ulong) offset);

		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* Trx sys header is so low in the latching order that we play
		safe and do not leave the i/o-completion to an asynchronous
		i/o-thread. Ibuf bitmap pages must always be read with
		synchronous i/o, to make sure they do not get involved in
		thread deadlocks. */

		sync = true;
	}

	/* The following call will also check if the tablespace does not exist
	or is being dropped; if we succeed in initing the page in the buffer
	pool for read, then DISCARD cannot proceed until the read has
	completed */
	bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
				       tablespace_version, offset);
	if (bpage == NULL) {
		/* bugfix: http://bugs.mysql.com/bug.php?id=43948 */
		if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) {
			/* hashed log recs must be treated here */
			recv_addr_t*	recv_addr;

			mutex_enter(&(recv_sys->mutex));

			if (recv_sys->apply_log_recs == FALSE) {
				mutex_exit(&(recv_sys->mutex));
				goto not_to_recover;
			}

			/* recv_get_fil_addr_struct() */
			recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash,
					hash_calc_hash(ut_fold_ulint_pair(space, offset),
						recv_sys->addr_hash));
			while (recv_addr) {
				if ((recv_addr->space == space)
					&& (recv_addr->page_no == offset)) {
					break;
				}
				recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr);
			}

			if ((recv_addr == NULL)
			    || (recv_addr->state == RECV_BEING_PROCESSED)
			    || (recv_addr->state == RECV_PROCESSED)) {
				mutex_exit(&(recv_sys->mutex));
				goto not_to_recover;
			}

			fprintf(stderr, " (cannot find space: %lu)", space);
			recv_addr->state = RECV_PROCESSED;

			ut_a(recv_sys->n_addrs);
			recv_sys->n_addrs--;

			mutex_exit(&(recv_sys->mutex));
		}
not_to_recover:

		return(0);
	}

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Posting read request for page %lu, sync %s\n",
			(ulong) offset, sync ? "true" : "false");
	}
#endif

	ut_ad(buf_page_in_file(bpage));
	ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex));

	if (sync) {
		thd_wait_begin(NULL, THD_WAIT_DISKIO);
	}

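	/* Post the actual read request. For a compressed page only the
	compressed frame (bpage->zip.data) is read here; for an uncompressed
	page the full UNIV_PAGE_SIZE frame is read. */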
	if (zip_size) {
		*err = _fil_io(OS_FILE_READ | wake_later
			       | ignore_nonexistent_pages,
			       sync, space, zip_size, offset, 0, zip_size,
			       bpage->zip.data, bpage, trx, should_buffer);
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		*err = _fil_io(OS_FILE_READ | wake_later
			      | ignore_nonexistent_pages,
			      sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
			      ((buf_block_t*) bpage)->frame, bpage, trx,
			      should_buffer);
	}

	if (sync) {
		thd_wait_end(NULL);
	}

	if (*err != DB_SUCCESS) {
		if (ignore_nonexistent_pages || *err == DB_TABLESPACE_DELETED) {
			buf_read_page_handle_error(bpage);
			return(0);
		}
		SRV_CORRUPT_TABLE_CHECK(*err == DB_SUCCESS,
					bpage->is_corrupt = TRUE;);
	}

	if (sync) {
		/* The i/o is already completed when we arrive from
		_fil_io() */
		if (!buf_page_io_complete(bpage)) {
			return(0);
		}
	}

	return(1);
}

/********************************************************************//**
Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
page, not even the one at the position (space, offset), if the read-ahead
mechanism is not activated. NOTE 1: the calling thread may own latches on
pages: to avoid deadlocks this function must be written such that it cannot
end up waiting for these latches! NOTE 2: the calling thread must want
access to the page given: this rule is set to prevent unintended read-aheads
performed by ibuf routines, a situation which could result in a deadlock if
the OS does not support asynchronous i/o.
@return number of page read requests issued; NOTE that if we read ibuf
pages, it may happen that the page at the given page number does not
get read even if we return a positive value! */
UNIV_INTERN
ulint
buf_read_ahead_random(
/*==================*/
	ulint	space,		/*!< in: space id */
	ulint	zip_size,	/*!< in: compressed page size in bytes,
				or 0 */
	ulint	offset,		/*!< in: page number of a page which
				the current thread wants to access */
	ibool	inside_ibuf,	/*!< in: TRUE if we are inside ibuf
				routine */
	trx_t*	trx)		/*!< in: requesting transaction */
{
	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
	ib_int64_t	tablespace_version;
	ulint		recent_blocks	= 0;
	ulint		ibuf_mode;
	ulint		count;
	ulint		low, high;
	dberr_t		err;
	ulint		i;
	const ulint	buf_read_ahead_random_area
				= BUF_READ_AHEAD_AREA(buf_pool);

	if (!srv_random_read_ahead) {
		/* Disabled by user */
		return(0);
	}

	if (srv_startup_is_before_trx_rollback_phase) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	/* Remember the tablespace version before we ask the tablespace size
	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
	do not try to read outside the bounds of the tablespace! */

	tablespace_version = fil_space_get_version(space);

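	/* Compute the read-ahead area that contains 'offset': low is the
	first page number of the area and high is one past the last, capped
	to the current size of the tablespace. */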
	low  = (offset / buf_read_ahead_random_area)
		* buf_read_ahead_random_area;
	high = (offset / buf_read_ahead_random_area + 1)
		* buf_read_ahead_random_area;
	if (high > fil_space_get_size(space)) {

		high = fil_space_get_size(space);
	}

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {

		return(0);
	}

	/* Count how many blocks in the area have been recently accessed,
	that is, reside near the start of the LRU list. */

	for (i = low; i < high; i++) {

		prio_rw_lock_t*	hash_lock;

		const buf_page_t* bpage =
			buf_page_hash_get_s_locked(buf_pool, space, i,
						   &hash_lock);

		if (bpage
		    && buf_page_is_accessed(bpage)
		    && buf_page_peek_if_young(bpage)) {

			recent_blocks++;

			if (recent_blocks
			    >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {

				rw_lock_s_unlock(hash_lock);
				goto read_ahead;
			}
		}

		if (bpage) {
			rw_lock_s_unlock(hash_lock);
		}
	}

	/* Do nothing */
	return(0);

read_ahead:
	/* Read all the suitable blocks within the area */

	if (inside_ibuf) {
		ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
	} else {
		ibuf_mode = BUF_READ_ANY_PAGE;
	}

	count = 0;

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync aio
		mode: hence false as the sync parameter below */

		if (!ibuf_bitmap_page(zip_size, i)) {
			count += buf_read_page_low(
				&err, false,
				ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
				space, zip_size, FALSE,
				tablespace_version, i, trx, false);
			if (err == DB_TABLESPACE_DELETED) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: Warning: in random"
					" readahead trying to access\n"
					"InnoDB: tablespace %lu page %lu,\n"
					"InnoDB: but the tablespace does not"
					" exist or is just being dropped.\n",
					(ulong) space, (ulong) i);
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && (count > 0)) {
		fprintf(stderr,
			"Random read-ahead space %lu offset %lu pages %lu\n",
			(ulong) space, (ulong) offset,
			(ulong) count);
	}
#endif /* UNIV_DEBUG */

	/* Read ahead is considered one I/O operation for the purpose of
	LRU policy decision. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read_rnd += count;
	srv_stats.buf_pool_reads.add(count);
	return(count);
}

/********************************************************************//**
High-level function which reads a page synchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released when the read completes.
@return TRUE if page has been read in, FALSE in case of failure */
UNIV_INTERN
ibool
buf_read_page(
/*==========*/
	ulint	space,	/*!< in: space id */
	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
	ulint	offset,	/*!< in: page number */
	trx_t*	trx)	/*!< in: requesting transaction */
{
	ib_int64_t	tablespace_version;
	ulint		count;
	dberr_t		err;

	tablespace_version = fil_space_get_version(space);

	/* We do the i/o in the synchronous aio mode to save thread
	switches: hence true as the sync parameter */

	count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
				  zip_size, FALSE,
				  tablespace_version, offset, trx, false);
	srv_stats.buf_pool_reads.add(count);
	if (err == DB_TABLESPACE_DELETED) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: trying to access"
			" tablespace %lu page no. %lu,\n"
			"InnoDB: but the tablespace does not exist"
			" or is just being dropped.\n",
			(ulong) space, (ulong) offset);
	}

	/* Increment number of I/O operations used for LRU policy. */
	buf_LRU_stat_inc_io();

	return(count > 0);
}

/********************************************************************//**
High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@return TRUE if page has been read in, FALSE in case of failure */
UNIV_INTERN
ibool
buf_read_page_async(
/*================*/
	ulint	space,	/*!< in: space id */
	ulint	offset)	/*!< in: page number */
{
	ulint		zip_size;
	ib_int64_t	tablespace_version;
	ulint		count;
	dberr_t		err;

	zip_size = fil_space_get_zip_size(space);

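	/* fil_space_get_zip_size() returns ULINT_UNDEFINED when the
	tablespace does not exist; in that case there is nothing to read. */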
	if (zip_size == ULINT_UNDEFINED) {
		return(FALSE);
	}

	tablespace_version = fil_space_get_version(space);

	count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE
				  | OS_AIO_SIMULATED_WAKE_LATER
				  | BUF_READ_IGNORE_NONEXISTENT_PAGES,
				  space, zip_size, FALSE,
				  tablespace_version, offset, NULL, false);
	srv_stats.buf_pool_reads.add(count);

	/* We do not increment number of I/O operations used for LRU policy
	here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
	about evicting uncompressed version of compressed pages from the
	buffer pool. Since this function is called from buffer pool load
	these IOs are deliberate and are not part of normal workload we can
	ignore these in our heuristics. */

	return(count > 0);
}

/********************************************************************//**
Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
Does not read any page if the read-ahead mechanism is not activated. Note
that the algorithm looks at the 'natural' adjacent successor and
predecessor of the page, which on the leaf level of a B-tree are the next
and previous page in the chain of leaves. To know these, the page specified
in (space, offset) must already be present in the buf_pool. Thus, the
natural way to use this function is to call it when a page in the buf_pool
is accessed the first time, calling this function just after it has been
bufferfixed.
NOTE 1: as this function looks at the natural predecessor and successor
fields on the page, what happens if these are not initialized to any
sensible value? No problem, before applying read-ahead we check that the
area to read is within the span of the space, if not, read-ahead is not
applied. An uninitialized value may result in a useless read operation, but
only very improbably.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
NOTE 3: the calling thread must want access to the page given: this rule is
set to prevent unintended read-aheads performed by ibuf routines, a situation
which could result in a deadlock if the OS does not support asynchronous io.
@return	number of page read requests issued */
UNIV_INTERN
ulint
buf_read_ahead_linear(
/*==================*/
	ulint	space,		/*!< in: space id */
	ulint	zip_size,	/*!< in: compressed page size in bytes, or 0 */
	ulint	offset,		/*!< in: page number; see NOTE 3 above */
	ibool	inside_ibuf,	/*!< in: TRUE if we are inside ibuf routine */
	trx_t*	trx)		/*!< in: requesting transaction */
{
	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
	ib_int64_t	tablespace_version;
	buf_page_t*	bpage;
	buf_frame_t*	frame;
	buf_page_t*	pred_bpage	= NULL;
	unsigned	pred_bpage_is_accessed = 0;
	ulint		pred_offset;
	ulint		succ_offset;
	ulint		count;
	int		asc_or_desc;
	ulint		new_offset;
	ulint		fail_count;
	ulint		ibuf_mode;
	ulint		low, high;
	dberr_t		err;
	ulint		i;
	const ulint	buf_read_ahead_linear_area
		= BUF_READ_AHEAD_AREA(buf_pool);
	ulint		threshold;

	/* check if readahead is disabled */
	if (!srv_read_ahead_threshold) {
		return(0);
	}

	if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

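	/* Compute the linear read-ahead area that contains 'offset'; the
	read-ahead is only triggered from the first or the last page of
	the area. */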
	low  = (offset / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (offset / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((offset != low) && (offset != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	/* Remember the tablespace version before we ask the tablespace size
	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
	do not try to read outside the bounds of the tablespace! */

	tablespace_version = fil_space_get_version(space);

	if (high > fil_space_get_size(space)) {
		/* The area is not whole, return */

		return(0);
	}

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {

		return(0);
	}

	/* Check that almost all pages in the area have been accessed; if
	offset == low, the accesses must be in a descending order, otherwise,
	in an ascending order. */

	asc_or_desc = 1;

	if (offset == low) {
		asc_or_desc = -1;
	}

	/* How many out of order accessed pages can we ignore
	when working out the access pattern for linear readahead */
	threshold = ut_min((64 - srv_read_ahead_threshold),
			   BUF_READ_AHEAD_AREA(buf_pool));

	fail_count = 0;

	prio_rw_lock_t*	hash_lock;

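	/* Scan the whole area and count the pages whose first-access times
	are missing or not in the expected order; give up as soon as the
	failure count exceeds the threshold. */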
	for (i = low; i < high; i++) {

		bpage = buf_page_hash_get_s_locked(buf_pool, space, i,
						   &hash_lock);

		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
			/* Not accessed */
			fail_count++;

		} else if (pred_bpage) {
			/* Note that buf_page_is_accessed() returns
			the time of the first access.  If some blocks
			of the extent existed in the buffer pool at
			the time of a linear access pattern, the first
			access times may be nonmonotonic, even though
			the latest access times were linear.  The
			threshold (srv_read_ahead_threshold) should help
			a little against this. */
			int res = ut_ulint_cmp(
				buf_page_is_accessed(bpage),
				pred_bpage_is_accessed);
			/* Accesses not in the right order */
			if (res != 0 && res != asc_or_desc) {
				fail_count++;
			}
		}

		if (fail_count > threshold) {
			/* Too many failures: return */
			if (bpage) {
				rw_lock_s_unlock(hash_lock);
			}
			return(0);
		}

		if (bpage) {
			if (buf_page_is_accessed(bpage)) {
				pred_bpage = bpage;
				pred_bpage_is_accessed
					= buf_page_is_accessed(bpage);
			}

			rw_lock_s_unlock(hash_lock);
		}
	}

	/* If we got this far, we know that enough pages in the area have
	been accessed in the right order: linear read-ahead can be sensible */

	bpage = buf_page_hash_get_s_locked(buf_pool, space, offset, &hash_lock);

	if (bpage == NULL) {

		return(0);
	}

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		frame = bpage->zip.data;
		break;
	case BUF_BLOCK_FILE_PAGE:
		frame = ((buf_block_t*) bpage)->frame;
		break;
	default:
		ut_error;
		break;
	}

	/* Read the natural predecessor and successor page addresses from
	the page; NOTE that because the calling thread may have an x-latch
	on the page, we do not acquire an s-latch on the page, this is to
	prevent deadlocks. Even if we read values which are nonsense, the
	algorithm will work. */

	pred_offset = fil_page_get_prev(frame);
	succ_offset = fil_page_get_next(frame);

	rw_lock_s_unlock(hash_lock);

	if ((offset == low) && (succ_offset == offset + 1)) {

		/* This is ok, we can continue */
		new_offset = pred_offset;

	} else if ((offset == high - 1) && (pred_offset == offset - 1)) {

		/* This is ok, we can continue */
		new_offset = succ_offset;
	} else {
		/* Successor or predecessor not in the right order */

		return(0);
	}

	low  = (new_offset / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (new_offset / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((new_offset != low) && (new_offset != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (high > fil_space_get_size(space)) {
		/* The area is not whole, return */

		return(0);
	}

	/* If we got this far, read-ahead can be sensible: do it */

	ibuf_mode = inside_ibuf
		? BUF_READ_IBUF_PAGES_ONLY | OS_AIO_SIMULATED_WAKE_LATER
		: BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER;

	count = 0;

	/* Since Windows XP seems to schedule the i/o handler thread
	very eagerly, and consequently it does not wait for the
	full read batch to be posted, we use special heuristics here */

	os_aio_simulated_put_read_threads_to_sleep();

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync
		aio mode: hence false as the sync parameter below */

		if (!ibuf_bitmap_page(zip_size, i)) {
			count += buf_read_page_low(
				&err, false,
				ibuf_mode,
				space, zip_size, FALSE, tablespace_version,
				i, trx, true);
			if (err == DB_TABLESPACE_DELETED) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: Warning: in"
					" linear readahead trying to access\n"
					"InnoDB: tablespace %lu page %lu,\n"
					"InnoDB: but the tablespace does not"
					" exist or is just being dropped.\n",
					(ulong) space, (ulong) i);
			}
		}
	}
	os_aio_dispatch_read_array_submit();

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && (count > 0)) {
		fprintf(stderr,
			"LINEAR read-ahead space %lu offset %lu pages %lu\n",
			(ulong) space, (ulong) offset, (ulong) count);
	}
#endif /* UNIV_DEBUG */

	/* Read ahead is considered one I/O operation for the purpose of
	LRU policy decision. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read += count;
	return(count);
}

/********************************************************************//**
Issues read requests for pages which the ibuf module wants to read in, in
order to contract the insert buffer tree. Technically, this function is like
a read-ahead function. */
UNIV_INTERN
void
buf_read_ibuf_merge_pages(
/*======================*/
	bool		sync,		/*!< in: true if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	const ulint*	space_ids,	/*!< in: array of space ids */
	const ib_int64_t* space_versions,/*!< in: the spaces must have
					this version number
					(timestamp), otherwise we
					discard the read; we use this
					to cancel reads if DISCARD +
					IMPORT may have changed the
					tablespace size */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of elements
					in the arrays */
{
	ulint	i;

#ifdef UNIV_IBUF_DEBUG
	ut_a(n_stored < UNIV_PAGE_SIZE);
#endif

	for (i = 0; i < n_stored; i++) {
		dberr_t		err;
		buf_pool_t*	buf_pool;
		ulint		zip_size = fil_space_get_zip_size(space_ids[i]);

		buf_pool = buf_pool_get(space_ids[i], page_nos[i]);

		while (buf_pool->n_pend_reads
		       > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
			os_thread_sleep(500000);
		}

		if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {

			goto tablespace_deleted;
		}

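		/* Read in the page. Only the very last page in the array is
		read synchronously, and only if the caller asked for a
		synchronous read. */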
		buf_read_page_low(&err, sync && (i + 1 == n_stored),
				  BUF_READ_ANY_PAGE, space_ids[i],
				  zip_size, TRUE, space_versions[i],
				  page_nos[i], NULL, false);

		if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
tablespace_deleted:
			/* We have deleted or are deleting the single-table
			tablespace: remove the entries for that page */

			ibuf_merge_or_delete_for_page(NULL, space_ids[i],
						      page_nos[i],
						      zip_size, FALSE);
		}
	}

	os_aio_simulated_wake_handler_threads();

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Ibuf merge read-ahead space %lu pages %lu\n",
			(ulong) space_ids[0], (ulong) n_stored);
	}
#endif /* UNIV_DEBUG */
}

/********************************************************************//**
Issues read requests for pages which recovery wants to read in. */
UNIV_INTERN
void
buf_read_recv_pages(
/*================*/
	ibool		sync,		/*!< in: TRUE if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	ulint		space,		/*!< in: space id */
	ulint		zip_size,	/*!< in: compressed page size in
					bytes, or 0 */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of page numbers
					in the array */
{
	ib_int64_t	tablespace_version;
	ulint		count;
	dberr_t		err;
	ulint		i;

	zip_size = fil_space_get_zip_size(space);

	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		/* The log records for these pages must be handled here too,
		for the same reason as in
		http://bugs.mysql.com/bug.php?id=43948 above */

		if (recv_recovery_is_on()) {
			recv_addr_t*	recv_addr;

			mutex_enter(&(recv_sys->mutex));

			if (recv_sys->apply_log_recs == FALSE) {
				mutex_exit(&(recv_sys->mutex));
				goto not_to_recover;
			}

			for (i = 0; i < n_stored; i++) {
				/* recv_get_fil_addr_struct() */
				recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash,
						hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
							recv_sys->addr_hash));
				while (recv_addr) {
					if ((recv_addr->space == space)
						&& (recv_addr->page_no == page_nos[i])) {
						break;
					}
					recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr);
				}

				if ((recv_addr == NULL)
				    || (recv_addr->state == RECV_BEING_PROCESSED)
				    || (recv_addr->state == RECV_PROCESSED)) {
					continue;
				}

				recv_addr->state = RECV_PROCESSED;

				ut_a(recv_sys->n_addrs);
				recv_sys->n_addrs--;
			}

			mutex_exit(&(recv_sys->mutex));

			fprintf(stderr, " (cannot find space: %lu)", space);
		}
not_to_recover:

		return;
	}

	tablespace_version = fil_space_get_version(space);

	for (i = 0; i < n_stored; i++) {
		buf_pool_t*	buf_pool;

		count = 0;

		os_aio_print_debug = FALSE;
		buf_pool = buf_pool_get(space, page_nos[i]);
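		/* Wait until the number of pending reads in this buffer
		pool instance falls below half of recv_n_pool_free_frames;
		an error is printed if the wait exceeds 10 seconds. */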
		while (buf_pool->n_pend_reads
		       >= recv_n_pool_free_frames / 2) {

			os_aio_simulated_wake_handler_threads();
			os_thread_sleep(10000);

			count++;

			if (count > 1000) {
				fprintf(stderr,
					"InnoDB: Error: InnoDB has waited for"
					" 10 seconds for pending\n"
					"InnoDB: reads to the buffer pool to"
					" be finished.\n"
					"InnoDB: Number of pending reads %lu,"
					" pending pread calls %lu\n",
					(ulong) buf_pool->n_pend_reads,
					(ulong) os_file_n_pending_preads);

				os_aio_print_debug = TRUE;
			}
		}

		os_aio_print_debug = FALSE;

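		/* The last page in the array is read synchronously if the
		caller requested it, so that the highest address page has
		been read in when this function returns; all other pages
		are posted as asynchronous reads. */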
		if ((i + 1 == n_stored) && sync) {
			buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
					  zip_size, TRUE, tablespace_version,
					  page_nos[i], NULL, false);
		} else {
			buf_read_page_low(&err, false, BUF_READ_ANY_PAGE
					  | OS_AIO_SIMULATED_WAKE_LATER,
					  space, zip_size, TRUE,
					  tablespace_version, page_nos[i],
					  NULL, false);
		}
	}

	os_aio_simulated_wake_handler_threads();

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Recovery applies read-ahead pages %lu\n",
			(ulong) n_stored);
	}
#endif /* UNIV_DEBUG */
}