1 /*****************************************************************************
2
3 Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /**************************************************//**
28 @file buf/buf0rea.cc
29 The database buffer read
30
31 Created 11/5/1995 Heikki Tuuri
32 *******************************************************/
33
34 #include "buf0rea.h"
35
36 #include "fil0fil.h"
37 #include "mtr0mtr.h"
38
39 #include "buf0buf.h"
40 #include "buf0flu.h"
41 #include "buf0lru.h"
42 #include "buf0dblwr.h"
43 #include "ibuf0ibuf.h"
44 #include "log0recv.h"
45 #include "trx0sys.h"
46 #include "os0file.h"
47 #include "srv0start.h"
48 #include "srv0srv.h"
49 #include "mysql/plugin.h"
50 #include "mysql/service_thd_wait.h"
51
/** Number of recently accessed pages of a read-ahead area that must be
found in buf_pool before a random read-ahead of that area is started */
#define BUF_READ_AHEAD_RANDOM_THRESHOLD(b)	\
				(BUF_READ_AHEAD_AREA(b) / 8 + 5)

/** Read-ahead is not done while the number of pending reads of a buffer
pool instance exceeds buf_pool->curr_size divided by this number: this is
to prevent flooding the buffer pool with i/o-fixed buffer blocks */
#define BUF_READ_AHEAD_PEND_LIMIT	2
61
62 /********************************************************************//**
63 Unfixes the pages, unlatches the page,
64 removes it from page_hash and removes it from LRU. */
65 static
66 void
buf_read_page_handle_error(buf_page_t * bpage)67 buf_read_page_handle_error(
68 /*=======================*/
69 buf_page_t* bpage) /*!< in: pointer to the block */
70 {
71 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
72 const bool uncompressed = (buf_page_get_state(bpage)
73 == BUF_BLOCK_FILE_PAGE);
74 const ulint fold = buf_page_address_fold(bpage->space,
75 bpage->offset);
76 prio_rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
77
78 mutex_enter(&buf_pool->LRU_list_mutex);
79 rw_lock_x_lock(hash_lock);
80 mutex_enter(buf_page_get_mutex(bpage));
81
82 /* First unfix and release lock on the bpage */
83 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
84 ut_ad(bpage->buf_fix_count == 0);
85
86 /* Set BUF_IO_NONE before we remove the block from LRU list */
87 buf_page_set_io_fix(bpage, BUF_IO_NONE);
88
89 if (uncompressed) {
90 rw_lock_x_unlock_gen(
91 &((buf_block_t*) bpage)->lock,
92 BUF_IO_READ);
93 }
94
95 /* remove the block from LRU list */
96 buf_LRU_free_one_page(bpage);
97
98 mutex_exit(&buf_pool->LRU_list_mutex);
99
100 ut_ad(buf_pool->n_pend_reads > 0);
101 os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1);
102 }
103
104 /********************************************************************//**
105 Low-level function which reads a page asynchronously from a file to the
106 buffer buf_pool if it is not already there, in which case does nothing.
107 Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
108 flag is cleared and the x-lock released by an i/o-handler thread.
109 @return 1 if a read request was queued, 0 if the page already resided
110 in buf_pool, or if the page is in the doublewrite buffer blocks in
111 which case it is never read into the pool, or if the tablespace does
112 not exist or is being dropped
113 @return 1 if read request is issued. 0 if it is not */
114 UNIV_INTERN
115 ulint
buf_read_page_low(dberr_t * err,bool sync,ulint mode,ulint space,ulint zip_size,ibool unzip,ib_int64_t tablespace_version,ulint offset,trx_t * trx,bool should_buffer)116 buf_read_page_low(
117 /*==============*/
118 dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
119 trying to read from a non-existent tablespace, or a
120 tablespace which is just now being dropped */
121 bool sync, /*!< in: true if synchronous aio is desired */
122 ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
123 ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
124 at read-ahead functions) */
125 ulint space, /*!< in: space id */
126 ulint zip_size,/*!< in: compressed page size, or 0 */
127 ibool unzip, /*!< in: TRUE=request uncompressed page */
128 ib_int64_t tablespace_version, /*!< in: if the space memory object has
129 this timestamp different from what we are giving here,
130 treat the tablespace as dropped; this is a timestamp we
131 use to stop dangling page reads from a tablespace
132 which we have DISCARDed + IMPORTed back */
133 ulint offset, /*!< in: page number */
134 trx_t* trx,
135 bool should_buffer) /*!< in: whether to buffer an aio request.
136 AIO read ahead uses this. If you plan to
137 use this parameter, make sure you remember
138 to call os_aio_dispatch_read_array_submit()
139 when you're ready to commit all your requests.*/
140 {
141 buf_page_t* bpage;
142 ulint wake_later;
143 ibool ignore_nonexistent_pages;
144
145 *err = DB_SUCCESS;
146
147 wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
148 mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
149
150 ignore_nonexistent_pages = mode & BUF_READ_IGNORE_NONEXISTENT_PAGES;
151 mode &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;
152
153 if (space == TRX_SYS_SPACE && buf_dblwr_page_inside(offset)) {
154 ut_print_timestamp(stderr);
155 fprintf(stderr,
156 " InnoDB: Warning: trying to read"
157 " doublewrite buffer page %lu\n",
158 (ulong) offset);
159
160 return(0);
161 }
162
163 if (ibuf_bitmap_page(zip_size, offset)
164 || trx_sys_hdr_page(space, offset)) {
165
166 /* Trx sys header is so low in the latching order that we play
167 safe and do not leave the i/o-completion to an asynchronous
168 i/o-thread. Ibuf bitmap pages must always be read with
169 syncronous i/o, to make sure they do not get involved in
170 thread deadlocks. */
171
172 sync = true;
173 }
174
175 /* The following call will also check if the tablespace does not exist
176 or is being dropped; if we succeed in initing the page in the buffer
177 pool for read, then DISCARD cannot proceed until the read has
178 completed */
179 bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
180 tablespace_version, offset);
181 if (bpage == NULL) {
182 /* bugfix: http://bugs.mysql.com/bug.php?id=43948 */
183 if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) {
184 /* hashed log recs must be treated here */
185 recv_addr_t* recv_addr;
186
187 mutex_enter(&(recv_sys->mutex));
188
189 if (recv_sys->apply_log_recs == FALSE) {
190 mutex_exit(&(recv_sys->mutex));
191 goto not_to_recover;
192 }
193
194 /* recv_get_fil_addr_struct() */
195 recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash,
196 hash_calc_hash(ut_fold_ulint_pair(space, offset),
197 recv_sys->addr_hash));
198 while (recv_addr) {
199 if ((recv_addr->space == space)
200 && (recv_addr->page_no == offset)) {
201 break;
202 }
203 recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr);
204 }
205
206 if ((recv_addr == NULL)
207 || (recv_addr->state == RECV_BEING_PROCESSED)
208 || (recv_addr->state == RECV_PROCESSED)) {
209 mutex_exit(&(recv_sys->mutex));
210 goto not_to_recover;
211 }
212
213 fprintf(stderr, " (cannot find space: %lu)", space);
214 recv_addr->state = RECV_PROCESSED;
215
216 ut_a(recv_sys->n_addrs);
217 recv_sys->n_addrs--;
218
219 mutex_exit(&(recv_sys->mutex));
220 }
221 not_to_recover:
222
223 return(0);
224 }
225
226 #ifdef UNIV_DEBUG
227 if (buf_debug_prints) {
228 fprintf(stderr,
229 "Posting read request for page %lu, sync %s\n",
230 (ulong) offset, sync ? "true" : "false");
231 }
232 #endif
233
234 ut_ad(buf_page_in_file(bpage));
235 ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex));
236
237 if (sync) {
238 thd_wait_begin(NULL, THD_WAIT_DISKIO);
239 }
240
241 if (zip_size) {
242 *err = _fil_io(OS_FILE_READ | wake_later
243 | ignore_nonexistent_pages,
244 sync, space, zip_size, offset, 0, zip_size,
245 bpage->zip.data, bpage, trx, should_buffer);
246 } else {
247 ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
248
249 *err = _fil_io(OS_FILE_READ | wake_later
250 | ignore_nonexistent_pages,
251 sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
252 ((buf_block_t*) bpage)->frame, bpage, trx,
253 should_buffer);
254 }
255
256 if (sync) {
257 thd_wait_end(NULL);
258 }
259
260 if (*err != DB_SUCCESS) {
261 if (ignore_nonexistent_pages || *err == DB_TABLESPACE_DELETED) {
262 buf_read_page_handle_error(bpage);
263 return(0);
264 }
265 SRV_CORRUPT_TABLE_CHECK(*err == DB_SUCCESS,
266 bpage->is_corrupt = TRUE;);
267 }
268
269 if (sync) {
270 /* The i/o is already completed when we arrive from
271 fil_read */
272 if (!buf_page_io_complete(bpage)) {
273 return(0);
274 }
275 }
276
277 return(1);
278 }
279
280 /********************************************************************//**
281 Applies a random read-ahead in buf_pool if there are at least a threshold
282 value of accessed pages from the random read-ahead area. Does not read any
283 page, not even the one at the position (space, offset), if the read-ahead
284 mechanism is not activated. NOTE 1: the calling thread may own latches on
285 pages: to avoid deadlocks this function must be written such that it cannot
286 end up waiting for these latches! NOTE 2: the calling thread must want
287 access to the page given: this rule is set to prevent unintended read-aheads
288 performed by ibuf routines, a situation which could result in a deadlock if
289 the OS does not support asynchronous i/o.
290 @return number of page read requests issued; NOTE that if we read ibuf
291 pages, it may happen that the page at the given page number does not
292 get read even if we return a positive value!
293 @return number of page read requests issued */
294 UNIV_INTERN
295 ulint
buf_read_ahead_random(ulint space,ulint zip_size,ulint offset,ibool inside_ibuf,trx_t * trx)296 buf_read_ahead_random(
297 /*==================*/
298 ulint space, /*!< in: space id */
299 ulint zip_size, /*!< in: compressed page size in bytes,
300 or 0 */
301 ulint offset, /*!< in: page number of a page which
302 the current thread wants to access */
303 ibool inside_ibuf, /*!< in: TRUE if we are inside ibuf
304 routine */
305 trx_t* trx)
306 {
307 buf_pool_t* buf_pool = buf_pool_get(space, offset);
308 ib_int64_t tablespace_version;
309 ulint recent_blocks = 0;
310 ulint ibuf_mode;
311 ulint count;
312 ulint low, high;
313 dberr_t err;
314 ulint i;
315 const ulint buf_read_ahead_random_area
316 = BUF_READ_AHEAD_AREA(buf_pool);
317
318 if (!srv_random_read_ahead) {
319 /* Disabled by user */
320 return(0);
321 }
322
323 if (srv_startup_is_before_trx_rollback_phase) {
324 /* No read-ahead to avoid thread deadlocks */
325 return(0);
326 }
327
328 if (ibuf_bitmap_page(zip_size, offset)
329 || trx_sys_hdr_page(space, offset)) {
330
331 /* If it is an ibuf bitmap page or trx sys hdr, we do
332 no read-ahead, as that could break the ibuf page access
333 order */
334
335 return(0);
336 }
337
338 /* Remember the tablespace version before we ask te tablespace size
339 below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
340 do not try to read outside the bounds of the tablespace! */
341
342 tablespace_version = fil_space_get_version(space);
343
344 low = (offset / buf_read_ahead_random_area)
345 * buf_read_ahead_random_area;
346 high = (offset / buf_read_ahead_random_area + 1)
347 * buf_read_ahead_random_area;
348 if (high > fil_space_get_size(space)) {
349
350 high = fil_space_get_size(space);
351 }
352
353 if (buf_pool->n_pend_reads
354 > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
355
356 return(0);
357 }
358
359 /* Count how many blocks in the area have been recently accessed,
360 that is, reside near the start of the LRU list. */
361
362 for (i = low; i < high; i++) {
363
364 prio_rw_lock_t* hash_lock;
365
366 const buf_page_t* bpage =
367 buf_page_hash_get_s_locked(buf_pool, space, i,
368 &hash_lock);
369
370 if (bpage
371 && buf_page_is_accessed(bpage)
372 && buf_page_peek_if_young(bpage)) {
373
374 recent_blocks++;
375
376 if (recent_blocks
377 >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {
378
379 rw_lock_s_unlock(hash_lock);
380 goto read_ahead;
381 }
382 }
383
384 if (bpage) {
385 rw_lock_s_unlock(hash_lock);
386 }
387 }
388
389 /* Do nothing */
390 return(0);
391
392 read_ahead:
393 /* Read all the suitable blocks within the area */
394
395 if (inside_ibuf) {
396 ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
397 } else {
398 ibuf_mode = BUF_READ_ANY_PAGE;
399 }
400
401 count = 0;
402
403 for (i = low; i < high; i++) {
404 /* It is only sensible to do read-ahead in the non-sync aio
405 mode: hence FALSE as the first parameter */
406
407 if (!ibuf_bitmap_page(zip_size, i)) {
408 count += buf_read_page_low(
409 &err, false,
410 ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
411 space, zip_size, FALSE,
412 tablespace_version, i, trx, false);
413 if (err == DB_TABLESPACE_DELETED) {
414 ut_print_timestamp(stderr);
415 fprintf(stderr,
416 " InnoDB: Warning: in random"
417 " readahead trying to access\n"
418 "InnoDB: tablespace %lu page %lu,\n"
419 "InnoDB: but the tablespace does not"
420 " exist or is just being dropped.\n",
421 (ulong) space, (ulong) i);
422 }
423 }
424 }
425
426 /* In simulated aio we wake the aio handler threads only after
427 queuing all aio requests, in native aio the following call does
428 nothing: */
429
430 os_aio_simulated_wake_handler_threads();
431
432 #ifdef UNIV_DEBUG
433 if (buf_debug_prints && (count > 0)) {
434 fprintf(stderr,
435 "Random read-ahead space %lu offset %lu pages %lu\n",
436 (ulong) space, (ulong) offset,
437 (ulong) count);
438 }
439 #endif /* UNIV_DEBUG */
440
441 /* Read ahead is considered one I/O operation for the purpose of
442 LRU policy decision. */
443 buf_LRU_stat_inc_io();
444
445 buf_pool->stat.n_ra_pages_read_rnd += count;
446 srv_stats.buf_pool_reads.add(count);
447 return(count);
448 }
449
450 /********************************************************************//**
451 High-level function which reads a page asynchronously from a file to the
452 buffer buf_pool if it is not already there. Sets the io_fix flag and sets
453 an exclusive lock on the buffer frame. The flag is cleared and the x-lock
454 released by the i/o-handler thread.
455 @return TRUE if page has been read in, FALSE in case of failure */
456 UNIV_INTERN
457 ibool
buf_read_page(ulint space,ulint zip_size,ulint offset,trx_t * trx)458 buf_read_page(
459 /*==========*/
460 ulint space, /*!< in: space id */
461 ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
462 ulint offset, /*!< in: page number */
463 trx_t* trx)
464 {
465 ib_int64_t tablespace_version;
466 ulint count;
467 dberr_t err;
468
469 tablespace_version = fil_space_get_version(space);
470
471 /* We do the i/o in the synchronous aio mode to save thread
472 switches: hence TRUE */
473
474 count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
475 zip_size, FALSE,
476 tablespace_version, offset, trx, false);
477 srv_stats.buf_pool_reads.add(count);
478 if (err == DB_TABLESPACE_DELETED) {
479 ut_print_timestamp(stderr);
480 fprintf(stderr,
481 " InnoDB: Error: trying to access"
482 " tablespace %lu page no. %lu,\n"
483 "InnoDB: but the tablespace does not exist"
484 " or is just being dropped.\n",
485 (ulong) space, (ulong) offset);
486 }
487
488 /* Increment number of I/O operations used for LRU policy. */
489 buf_LRU_stat_inc_io();
490
491 return(count > 0);
492 }
493
494 /********************************************************************//**
495 High-level function which reads a page asynchronously from a file to the
496 buffer buf_pool if it is not already there. Sets the io_fix flag and sets
497 an exclusive lock on the buffer frame. The flag is cleared and the x-lock
498 released by the i/o-handler thread.
499 @return TRUE if page has been read in, FALSE in case of failure */
500 UNIV_INTERN
501 ibool
buf_read_page_async(ulint space,ulint offset)502 buf_read_page_async(
503 /*================*/
504 ulint space, /*!< in: space id */
505 ulint offset) /*!< in: page number */
506 {
507 ulint zip_size;
508 ib_int64_t tablespace_version;
509 ulint count;
510 dberr_t err;
511
512 zip_size = fil_space_get_zip_size(space);
513
514 if (zip_size == ULINT_UNDEFINED) {
515 return(FALSE);
516 }
517
518 tablespace_version = fil_space_get_version(space);
519
520 count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE
521 | OS_AIO_SIMULATED_WAKE_LATER
522 | BUF_READ_IGNORE_NONEXISTENT_PAGES,
523 space, zip_size, FALSE,
524 tablespace_version, offset, NULL, false);
525 srv_stats.buf_pool_reads.add(count);
526
527 /* We do not increment number of I/O operations used for LRU policy
528 here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
529 about evicting uncompressed version of compressed pages from the
530 buffer pool. Since this function is called from buffer pool load
531 these IOs are deliberate and are not part of normal workload we can
532 ignore these in our heuristics. */
533
534 return(count > 0);
535 }
536
537 /********************************************************************//**
538 Applies linear read-ahead if in the buf_pool the page is a border page of
539 a linear read-ahead area and all the pages in the area have been accessed.
540 Does not read any page if the read-ahead mechanism is not activated. Note
541 that the algorithm looks at the 'natural' adjacent successor and
542 predecessor of the page, which on the leaf level of a B-tree are the next
543 and previous page in the chain of leaves. To know these, the page specified
544 in (space, offset) must already be present in the buf_pool. Thus, the
545 natural way to use this function is to call it when a page in the buf_pool
546 is accessed the first time, calling this function just after it has been
547 bufferfixed.
548 NOTE 1: as this function looks at the natural predecessor and successor
549 fields on the page, what happens, if these are not initialized to any
550 sensible value? No problem, before applying read-ahead we check that the
551 area to read is within the span of the space, if not, read-ahead is not
552 applied. An uninitialized value may result in a useless read operation, but
553 only very improbably.
554 NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
555 function must be written such that it cannot end up waiting for these
556 latches!
557 NOTE 3: the calling thread must want access to the page given: this rule is
558 set to prevent unintended read-aheads performed by ibuf routines, a situation
559 which could result in a deadlock if the OS does not support asynchronous io.
560 @return number of page read requests issued */
561 UNIV_INTERN
562 ulint
buf_read_ahead_linear(ulint space,ulint zip_size,ulint offset,ibool inside_ibuf,trx_t * trx)563 buf_read_ahead_linear(
564 /*==================*/
565 ulint space, /*!< in: space id */
566 ulint zip_size, /*!< in: compressed page size in bytes, or 0 */
567 ulint offset, /*!< in: page number; see NOTE 3 above */
568 ibool inside_ibuf, /*!< in: TRUE if we are inside ibuf routine */
569 trx_t* trx)
570 {
571 buf_pool_t* buf_pool = buf_pool_get(space, offset);
572 ib_int64_t tablespace_version;
573 buf_page_t* bpage;
574 buf_frame_t* frame;
575 buf_page_t* pred_bpage = NULL;
576 unsigned pred_bpage_is_accessed = 0;
577 ulint pred_offset;
578 ulint succ_offset;
579 ulint count;
580 int asc_or_desc;
581 ulint new_offset;
582 ulint fail_count;
583 ulint ibuf_mode;
584 ulint low, high;
585 dberr_t err;
586 ulint i;
587 const ulint buf_read_ahead_linear_area
588 = BUF_READ_AHEAD_AREA(buf_pool);
589 ulint threshold;
590
591 /* check if readahead is disabled */
592 if (!srv_read_ahead_threshold) {
593 return(0);
594 }
595
596 if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
597 /* No read-ahead to avoid thread deadlocks */
598 return(0);
599 }
600
601 low = (offset / buf_read_ahead_linear_area)
602 * buf_read_ahead_linear_area;
603 high = (offset / buf_read_ahead_linear_area + 1)
604 * buf_read_ahead_linear_area;
605
606 if ((offset != low) && (offset != high - 1)) {
607 /* This is not a border page of the area: return */
608
609 return(0);
610 }
611
612 if (ibuf_bitmap_page(zip_size, offset)
613 || trx_sys_hdr_page(space, offset)) {
614
615 /* If it is an ibuf bitmap page or trx sys hdr, we do
616 no read-ahead, as that could break the ibuf page access
617 order */
618
619 return(0);
620 }
621
622 /* Remember the tablespace version before we ask te tablespace size
623 below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
624 do not try to read outside the bounds of the tablespace! */
625
626 tablespace_version = fil_space_get_version(space);
627
628 if (high > fil_space_get_size(space)) {
629 /* The area is not whole, return */
630
631 return(0);
632 }
633
634 if (buf_pool->n_pend_reads
635 > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
636
637 return(0);
638 }
639
640 /* Check that almost all pages in the area have been accessed; if
641 offset == low, the accesses must be in a descending order, otherwise,
642 in an ascending order. */
643
644 asc_or_desc = 1;
645
646 if (offset == low) {
647 asc_or_desc = -1;
648 }
649
650 /* How many out of order accessed pages can we ignore
651 when working out the access pattern for linear readahead */
652 threshold = ut_min((64 - srv_read_ahead_threshold),
653 BUF_READ_AHEAD_AREA(buf_pool));
654
655 fail_count = 0;
656
657 prio_rw_lock_t* hash_lock;
658
659 for (i = low; i < high; i++) {
660
661 bpage = buf_page_hash_get_s_locked(buf_pool, space, i,
662 &hash_lock);
663
664 if (bpage == NULL || !buf_page_is_accessed(bpage)) {
665 /* Not accessed */
666 fail_count++;
667
668 } else if (pred_bpage) {
669 /* Note that buf_page_is_accessed() returns
670 the time of the first access. If some blocks
671 of the extent existed in the buffer pool at
672 the time of a linear access pattern, the first
673 access times may be nonmonotonic, even though
674 the latest access times were linear. The
675 threshold (srv_read_ahead_factor) should help
676 a little against this. */
677 int res = ut_ulint_cmp(
678 buf_page_is_accessed(bpage),
679 pred_bpage_is_accessed);
680 /* Accesses not in the right order */
681 if (res != 0 && res != asc_or_desc) {
682 fail_count++;
683 }
684 }
685
686 if (fail_count > threshold) {
687 /* Too many failures: return */
688 if (bpage) {
689 rw_lock_s_unlock(hash_lock);
690 }
691 return(0);
692 }
693
694 if (bpage) {
695 if (buf_page_is_accessed(bpage)) {
696 pred_bpage = bpage;
697 pred_bpage_is_accessed
698 = buf_page_is_accessed(bpage);
699 }
700
701 rw_lock_s_unlock(hash_lock);
702 }
703 }
704
705 /* If we got this far, we know that enough pages in the area have
706 been accessed in the right order: linear read-ahead can be sensible */
707
708 bpage = buf_page_hash_get_s_locked(buf_pool, space, offset, &hash_lock);
709
710 if (bpage == NULL) {
711
712 return(0);
713 }
714
715 switch (buf_page_get_state(bpage)) {
716 case BUF_BLOCK_ZIP_PAGE:
717 frame = bpage->zip.data;
718 break;
719 case BUF_BLOCK_FILE_PAGE:
720 frame = ((buf_block_t*) bpage)->frame;
721 break;
722 default:
723 ut_error;
724 break;
725 }
726
727 /* Read the natural predecessor and successor page addresses from
728 the page; NOTE that because the calling thread may have an x-latch
729 on the page, we do not acquire an s-latch on the page, this is to
730 prevent deadlocks. Even if we read values which are nonsense, the
731 algorithm will work. */
732
733 pred_offset = fil_page_get_prev(frame);
734 succ_offset = fil_page_get_next(frame);
735
736 rw_lock_s_unlock(hash_lock);
737
738 if ((offset == low) && (succ_offset == offset + 1)) {
739
740 /* This is ok, we can continue */
741 new_offset = pred_offset;
742
743 } else if ((offset == high - 1) && (pred_offset == offset - 1)) {
744
745 /* This is ok, we can continue */
746 new_offset = succ_offset;
747 } else {
748 /* Successor or predecessor not in the right order */
749
750 return(0);
751 }
752
753 low = (new_offset / buf_read_ahead_linear_area)
754 * buf_read_ahead_linear_area;
755 high = (new_offset / buf_read_ahead_linear_area + 1)
756 * buf_read_ahead_linear_area;
757
758 if ((new_offset != low) && (new_offset != high - 1)) {
759 /* This is not a border page of the area: return */
760
761 return(0);
762 }
763
764 if (high > fil_space_get_size(space)) {
765 /* The area is not whole, return */
766
767 return(0);
768 }
769
770 /* If we got this far, read-ahead can be sensible: do it */
771
772 ibuf_mode = inside_ibuf
773 ? BUF_READ_IBUF_PAGES_ONLY | OS_AIO_SIMULATED_WAKE_LATER
774 : BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER;
775
776 count = 0;
777
778 /* Since Windows XP seems to schedule the i/o handler thread
779 very eagerly, and consequently it does not wait for the
780 full read batch to be posted, we use special heuristics here */
781
782 os_aio_simulated_put_read_threads_to_sleep();
783
784 for (i = low; i < high; i++) {
785 /* It is only sensible to do read-ahead in the non-sync
786 aio mode: hence FALSE as the first parameter */
787
788 if (!ibuf_bitmap_page(zip_size, i)) {
789 count += buf_read_page_low(
790 &err, false,
791 ibuf_mode,
792 space, zip_size, FALSE, tablespace_version,
793 i, trx, true);
794 if (err == DB_TABLESPACE_DELETED) {
795 ut_print_timestamp(stderr);
796 fprintf(stderr,
797 " InnoDB: Warning: in"
798 " linear readahead trying to access\n"
799 "InnoDB: tablespace %lu page %lu,\n"
800 "InnoDB: but the tablespace does not"
801 " exist or is just being dropped.\n",
802 (ulong) space, (ulong) i);
803 }
804 }
805 }
806 os_aio_dispatch_read_array_submit();
807
808 /* In simulated aio we wake the aio handler threads only after
809 queuing all aio requests, in native aio the following call does
810 nothing: */
811
812 os_aio_simulated_wake_handler_threads();
813
814 #ifdef UNIV_DEBUG
815 if (buf_debug_prints && (count > 0)) {
816 fprintf(stderr,
817 "LINEAR read-ahead space %lu offset %lu pages %lu\n",
818 (ulong) space, (ulong) offset, (ulong) count);
819 }
820 #endif /* UNIV_DEBUG */
821
822 /* Read ahead is considered one I/O operation for the purpose of
823 LRU policy decision. */
824 buf_LRU_stat_inc_io();
825
826 buf_pool->stat.n_ra_pages_read += count;
827 return(count);
828 }
829
830 /********************************************************************//**
831 Issues read requests for pages which the ibuf module wants to read in, in
832 order to contract the insert buffer tree. Technically, this function is like
833 a read-ahead function. */
834 UNIV_INTERN
835 void
buf_read_ibuf_merge_pages(bool sync,const ulint * space_ids,const ib_int64_t * space_versions,const ulint * page_nos,ulint n_stored)836 buf_read_ibuf_merge_pages(
837 /*======================*/
838 bool sync, /*!< in: true if the caller
839 wants this function to wait
840 for the highest address page
841 to get read in, before this
842 function returns */
843 const ulint* space_ids, /*!< in: array of space ids */
844 const ib_int64_t* space_versions,/*!< in: the spaces must have
845 this version number
846 (timestamp), otherwise we
847 discard the read; we use this
848 to cancel reads if DISCARD +
849 IMPORT may have changed the
850 tablespace size */
851 const ulint* page_nos, /*!< in: array of page numbers
852 to read, with the highest page
853 number the last in the
854 array */
855 ulint n_stored) /*!< in: number of elements
856 in the arrays */
857 {
858 ulint i;
859
860 #ifdef UNIV_IBUF_DEBUG
861 ut_a(n_stored < UNIV_PAGE_SIZE);
862 #endif
863
864 for (i = 0; i < n_stored; i++) {
865 dberr_t err;
866 buf_pool_t* buf_pool;
867 ulint zip_size = fil_space_get_zip_size(space_ids[i]);
868
869 buf_pool = buf_pool_get(space_ids[i], page_nos[i]);
870
871 while (buf_pool->n_pend_reads
872 > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
873 os_thread_sleep(500000);
874 }
875
876 if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
877
878 goto tablespace_deleted;
879 }
880
881 buf_read_page_low(&err, sync && (i + 1 == n_stored),
882 BUF_READ_ANY_PAGE, space_ids[i],
883 zip_size, TRUE, space_versions[i],
884 page_nos[i], NULL, false);
885
886 if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
887 tablespace_deleted:
888 /* We have deleted or are deleting the single-table
889 tablespace: remove the entries for that page */
890
891 ibuf_merge_or_delete_for_page(NULL, space_ids[i],
892 page_nos[i],
893 zip_size, FALSE);
894 }
895 }
896
897 os_aio_simulated_wake_handler_threads();
898
899 #ifdef UNIV_DEBUG
900 if (buf_debug_prints) {
901 fprintf(stderr,
902 "Ibuf merge read-ahead space %lu pages %lu\n",
903 (ulong) space_ids[0], (ulong) n_stored);
904 }
905 #endif /* UNIV_DEBUG */
906 }
907
908 /********************************************************************//**
909 Issues read requests for pages which recovery wants to read in. */
910 UNIV_INTERN
911 void
buf_read_recv_pages(ibool sync,ulint space,ulint zip_size,const ulint * page_nos,ulint n_stored)912 buf_read_recv_pages(
913 /*================*/
914 ibool sync, /*!< in: TRUE if the caller
915 wants this function to wait
916 for the highest address page
917 to get read in, before this
918 function returns */
919 ulint space, /*!< in: space id */
920 ulint zip_size, /*!< in: compressed page size in
921 bytes, or 0 */
922 const ulint* page_nos, /*!< in: array of page numbers
923 to read, with the highest page
924 number the last in the
925 array */
926 ulint n_stored) /*!< in: number of page numbers
927 in the array */
928 {
929 ib_int64_t tablespace_version;
930 ulint count;
931 dberr_t err;
932 ulint i;
933
934 zip_size = fil_space_get_zip_size(space);
935
936 if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
937 /* It is a single table tablespace and the .ibd file is
938 missing: do nothing */
939
940 /* the log records should be treated here same reason
941 for http://bugs.mysql.com/bug.php?id=43948 */
942
943 if (recv_recovery_is_on()) {
944 recv_addr_t* recv_addr;
945
946 mutex_enter(&(recv_sys->mutex));
947
948 if (recv_sys->apply_log_recs == FALSE) {
949 mutex_exit(&(recv_sys->mutex));
950 goto not_to_recover;
951 }
952
953 for (i = 0; i < n_stored; i++) {
954 /* recv_get_fil_addr_struct() */
955 recv_addr = (recv_addr_t*)HASH_GET_FIRST(recv_sys->addr_hash,
956 hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
957 recv_sys->addr_hash));
958 while (recv_addr) {
959 if ((recv_addr->space == space)
960 && (recv_addr->page_no == page_nos[i])) {
961 break;
962 }
963 recv_addr = (recv_addr_t*)HASH_GET_NEXT(addr_hash, recv_addr);
964 }
965
966 if ((recv_addr == NULL)
967 || (recv_addr->state == RECV_BEING_PROCESSED)
968 || (recv_addr->state == RECV_PROCESSED)) {
969 continue;
970 }
971
972 recv_addr->state = RECV_PROCESSED;
973
974 ut_a(recv_sys->n_addrs);
975 recv_sys->n_addrs--;
976 }
977
978 mutex_exit(&(recv_sys->mutex));
979
980 fprintf(stderr, " (cannot find space: %lu)", space);
981 }
982 not_to_recover:
983
984 return;
985 }
986
987 tablespace_version = fil_space_get_version(space);
988
989 for (i = 0; i < n_stored; i++) {
990 buf_pool_t* buf_pool;
991
992 count = 0;
993
994 os_aio_print_debug = FALSE;
995 buf_pool = buf_pool_get(space, page_nos[i]);
996 while (buf_pool->n_pend_reads
997 >= recv_n_pool_free_frames / 2) {
998
999 os_aio_simulated_wake_handler_threads();
1000 os_thread_sleep(10000);
1001
1002 count++;
1003
1004 if (count > 1000) {
1005 fprintf(stderr,
1006 "InnoDB: Error: InnoDB has waited for"
1007 " 10 seconds for pending\n"
1008 "InnoDB: reads to the buffer pool to"
1009 " be finished.\n"
1010 "InnoDB: Number of pending reads %lu,"
1011 " pending pread calls %lu\n",
1012 (ulong) buf_pool->n_pend_reads,
1013 (ulong) os_file_n_pending_preads);
1014
1015 os_aio_print_debug = TRUE;
1016 }
1017 }
1018
1019 os_aio_print_debug = FALSE;
1020
1021 if ((i + 1 == n_stored) && sync) {
1022 buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
1023 zip_size, TRUE, tablespace_version,
1024 page_nos[i], NULL, false);
1025 } else {
1026 buf_read_page_low(&err, false, BUF_READ_ANY_PAGE
1027 | OS_AIO_SIMULATED_WAKE_LATER,
1028 space, zip_size, TRUE,
1029 tablespace_version, page_nos[i],
1030 NULL, false);
1031 }
1032 }
1033
1034 os_aio_simulated_wake_handler_threads();
1035
1036 #ifdef UNIV_DEBUG
1037 if (buf_debug_prints) {
1038 fprintf(stderr,
1039 "Recovery applies read-ahead pages %lu\n",
1040 (ulong) n_stored);
1041 }
1042 #endif /* UNIV_DEBUG */
1043 }
1044