1 /***********************************************************************
2
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Percona Inc.
5
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation. The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License, version 2.0, for more details.
28
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32
33 ***********************************************************************/
34
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41
42 #include "os0file.h"
43
44 #ifdef UNIV_NONINL
45 #include "os0file.ic"
46 #endif
47
48 #include "ut0mem.h"
49 #include "srv0srv.h"
50 #include "srv0start.h"
51 #include "fil0fil.h"
52 #include "buf0buf.h"
53 #include "srv0mon.h"
54 #ifndef UNIV_HOTBACKUP
55 # include "os0sync.h"
56 # include "os0thread.h"
57 #else /* !UNIV_HOTBACKUP */
58 # ifdef __WIN__
59 /* Add includes for the _stat() call to compile on Windows */
60 # include <sys/types.h>
61 # include <sys/stat.h>
62 # include <errno.h>
63 # endif /* __WIN__ */
64 #endif /* !UNIV_HOTBACKUP */
65
66 #if defined(LINUX_NATIVE_AIO)
67 #include <libaio.h>
68 #endif
69
70 /** Insert buffer segment id */
71 static const ulint IO_IBUF_SEGMENT = 0;
72
73 /** Log segment id */
74 static const ulint IO_LOG_SEGMENT = 1;
75
76 /* This specifies the file permissions InnoDB uses when it creates files in
77 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
78 my_umask */
79
80 #ifndef __WIN__
81 /** Umask for creating files */
82 UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
83 #else
84 /** Umask for creating files */
85 UNIV_INTERN ulint os_innodb_umask = 0;
86 #endif /* __WIN__ */
87
88 #ifndef UNIV_HOTBACKUP
89 /* We use these mutexes to protect lseek + file i/o operation, if the
90 OS does not provide an atomic pread or pwrite, or similar */
91 #define OS_FILE_N_SEEK_MUTEXES 16
92 UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
93
94 /* In simulated aio, merge at most this many consecutive i/os */
95 #define OS_AIO_MERGE_N_CONSECUTIVE 64
96
97 #ifdef WITH_INNODB_DISALLOW_WRITES
98 #define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
99 #else
100 #define WAIT_ALLOW_WRITES() do { } while (0)
101 #endif /* WITH_INNODB_DISALLOW_WRITES */
102
103 /**********************************************************************
104
105 InnoDB AIO Implementation:
106 =========================
107
We support native AIO on Windows and Linux. On all other platforms
we simulate AIO with dedicated io-threads that service the IO requests.
110
111 Simulated AIO:
112 ==============
113
114 In platforms where we 'simulate' AIO following is a rough explanation
115 of the high level design.
116 There are four io-threads (for ibuf, log, read, write).
117 All synchronous IO requests are serviced by the calling thread using
118 os_file_write/os_file_read. The Asynchronous requests are queued up
119 in an array (there are four such arrays) by the calling thread.
120 Later these requests are picked up by the io-thread and are serviced
121 synchronously.
122
123 Windows native AIO:
124 ==================
125
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
131 There are innodb_file_io_threads helper threads. These threads work
132 on the four arrays mentioned above in Simulated AIO. No thread is
133 required for the sync array.
134 If a synchronous IO request is made, it is first queued in the sync
135 array. Then the calling thread itself waits on the request, thus
136 making the call synchronous.
137 If an AIO request is made the calling thread not only queues it in the
138 array but also submits the requests. The helper thread then collects
139 the completed IO request and calls completion routine on it.
140
141 Linux native AIO:
142 =================
143
144 If we have libaio installed on the system and innodb_use_native_aio
145 is set to TRUE we follow the code path of native AIO, otherwise we
146 do simulated AIO.
147 There are innodb_file_io_threads helper threads. These threads work
148 on the four arrays mentioned above in Simulated AIO.
149 If a synchronous IO request is made, it is handled by calling
150 os_file_write/os_file_read.
151 If an AIO request is made the calling thread not only queues it in the
152 array but also submits the requests. The helper thread then collects
153 the completed IO request and calls completion routine on it.
154
155 **********************************************************************/
156
157 /** Flag: enable debug printout for asynchronous i/o */
158 UNIV_INTERN ibool os_aio_print_debug = FALSE;
159
160 #ifdef UNIV_PFS_IO
161 /* Keys to register InnoDB I/O with performance schema */
162 UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
163 UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
164 UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
165 #endif /* UNIV_PFS_IO */
166
167 /** The asynchronous i/o array slot structure */
168 struct os_aio_slot_t{
169 ibool is_read; /*!< TRUE if a read operation */
170 ulint pos; /*!< index of the slot in the aio
171 array */
172 ibool reserved; /*!< TRUE if this slot is reserved */
173 time_t reservation_time;/*!< time when reserved */
174 ulint len; /*!< length of the block to read or
175 write */
176 byte* buf; /*!< buffer used in i/o */
177 ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
178 os_offset_t offset; /*!< file offset in bytes */
179 pfs_os_file_t file; /*!< file where to read or write */
180 const char* name; /*!< file name or path */
181 ibool io_already_done;/*!< used only in simulated aio:
182 TRUE if the physical i/o already
183 made and only the slot message
184 needs to be passed to the caller
185 of os_aio_simulated_handle */
186 fil_node_t* message1; /*!< message which is given by the */
187 void* message2; /*!< the requester of an aio operation
188 and which can be used to identify
189 which pending aio operation was
190 completed */
191 #ifdef WIN_ASYNC_IO
192 HANDLE handle; /*!< handle object we need in the
193 OVERLAPPED struct */
194 OVERLAPPED control; /*!< Windows control block for the
195 aio request */
196 #elif defined(LINUX_NATIVE_AIO)
197 struct iocb control; /* Linux control block for aio */
198 int n_bytes; /* bytes written/read. */
199 int ret; /* AIO return code */
200 #endif /* WIN_ASYNC_IO */
201 };
202
203 /** The asynchronous i/o array structure */
204 struct os_aio_array_t{
205 os_ib_mutex_t mutex; /*!< the mutex protecting the aio array */
206 os_event_t not_full;
207 /*!< The event which is set to the
208 signaled state when there is space in
209 the aio outside the ibuf segment */
210 os_event_t is_empty;
211 /*!< The event which is set to the
212 signaled state when there are no
213 pending i/os in this array */
214 ulint n_slots;/*!< Total number of slots in the aio
215 array. This must be divisible by
216 n_threads. */
217 ulint n_segments;
218 /*!< Number of segments in the aio
219 array of pending aio requests. A
220 thread can wait separately for any one
221 of the segments. */
222 ulint cur_seg;/*!< We reserve IO requests in round
223 robin fashion to different segments.
224 This points to the segment that is to
225 be used to service next IO request. */
226 ulint n_reserved;
227 /*!< Number of reserved slots in the
228 aio array outside the ibuf segment */
229 os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
230 #ifdef __WIN__
231 HANDLE* handles;
232 /*!< Pointer to an array of OS native
233 event handles where we copied the
234 handles from slots, in the same
235 order. This can be used in
236 WaitForMultipleObjects; used only in
237 Windows */
238 #endif /* __WIN__ */
239
240 #if defined(LINUX_NATIVE_AIO)
241 io_context_t* aio_ctx;
242 /* completion queue for IO. There is
243 one such queue per segment. Each thread
244 will work on one ctx exclusively. */
245 struct io_event* aio_events;
246 /* The array to collect completed IOs.
247 There is one such event for each
248 possible pending IO. The size of the
249 array is equal to n_slots. */
250 #endif /* LINUX_NATIV_AIO */
251 };
252
253 #if defined(LINUX_NATIVE_AIO)
254 /** timeout for each io_getevents() call = 500ms. */
255 #define OS_AIO_REAP_TIMEOUT (500000000UL)
256
257 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
258 #define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
259
260 /** number of attempts before giving up on io_setup(). */
261 #define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
262 #endif
263
264 /** Array of events used in simulated aio */
265 static os_event_t* os_aio_segment_wait_events = NULL;
266
267 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
268 are NULL when the module has not yet been initialized. @{ */
269 static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
270 static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
271 static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
272 static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
273 static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
274 /* @} */
275
276 /** Number of asynchronous I/O segments. Set by os_aio_init(). */
277 static ulint os_aio_n_segments = ULINT_UNDEFINED;
278
279 /** If the following is TRUE, read i/o handler threads try to
280 wait until a batch of new read requests have been posted */
281 static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
282 #endif /* !UNIV_HOTBACKUP */
283
284 UNIV_INTERN ulint os_n_file_reads = 0;
285 UNIV_INTERN ulint os_bytes_read_since_printout = 0;
286 UNIV_INTERN ulint os_n_file_writes = 0;
287 UNIV_INTERN ulint os_n_fsyncs = 0;
288 UNIV_INTERN ulint os_n_file_reads_old = 0;
289 UNIV_INTERN ulint os_n_file_writes_old = 0;
290 UNIV_INTERN ulint os_n_fsyncs_old = 0;
291 UNIV_INTERN time_t os_last_printout;
292
293 UNIV_INTERN ibool os_has_said_disk_full = FALSE;
294
295 #if !defined(UNIV_HOTBACKUP) \
296 && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8)
297 /** The mutex protecting the following counts of pending I/O operations */
298 static os_ib_mutex_t os_file_count_mutex;
299 #endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */
300
301 /** Number of pending os_file_pread() operations */
302 UNIV_INTERN ulint os_file_n_pending_preads = 0;
303 /** Number of pending os_file_pwrite() operations */
304 UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
305 /** Number of pending write operations */
306 UNIV_INTERN ulint os_n_pending_writes = 0;
307 /** Number of pending read operations */
308 UNIV_INTERN ulint os_n_pending_reads = 0;
309
310 #ifdef UNIV_DEBUG
311 # ifndef UNIV_HOTBACKUP
312 /**********************************************************************//**
313 Validates the consistency the aio system some of the time.
314 @return TRUE if ok or the check was skipped */
315 UNIV_INTERN
316 ibool
os_aio_validate_skip(void)317 os_aio_validate_skip(void)
318 /*======================*/
319 {
320 /** Try os_aio_validate() every this many times */
321 # define OS_AIO_VALIDATE_SKIP 13
322
323 /** The os_aio_validate() call skip counter.
324 Use a signed type because of the race condition below. */
325 static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
326
327 /* There is a race condition below, but it does not matter,
328 because this call is only for heuristic purposes. We want to
329 reduce the call frequency of the costly os_aio_validate()
330 check in debug builds. */
331 if (--os_aio_validate_count > 0) {
332 return(TRUE);
333 }
334
335 os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
336 return(os_aio_validate());
337 }
338 # endif /* !UNIV_HOTBACKUP */
339 #endif /* UNIV_DEBUG */
340
341 #ifdef __WIN__
342 /***********************************************************************//**
343 Gets the operating system version. Currently works only on Windows.
344 @return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
345 OS_WIN7. */
346 UNIV_INTERN
347 ulint
os_get_os_version(void)348 os_get_os_version(void)
349 /*===================*/
350 {
351 OSVERSIONINFO os_info;
352
353 os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
354
355 ut_a(GetVersionEx(&os_info));
356
357 if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
358 return(OS_WIN31);
359 } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
360 return(OS_WIN95);
361 } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
362 switch (os_info.dwMajorVersion) {
363 case 3:
364 case 4:
365 return(OS_WINNT);
366 case 5:
367 return (os_info.dwMinorVersion == 0)
368 ? OS_WIN2000 : OS_WINXP;
369 case 6:
370 return (os_info.dwMinorVersion == 0)
371 ? OS_WINVISTA : OS_WIN7;
372 default:
373 return(OS_WIN7);
374 }
375 } else {
376 ut_error;
377 return(0);
378 }
379 }
380 #endif /* __WIN__ */
381
382 /***********************************************************************//**
383 Retrieves the last error number if an error occurs in a file io function.
384 The number should be retrieved before any other OS calls (because they may
385 overwrite the error number). If the number is not known to this program,
386 the OS error number + 100 is returned.
387 @return error number, or OS error number + 100 */
388 static
389 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)390 os_file_get_last_error_low(
391 /*=======================*/
392 bool report_all_errors, /*!< in: TRUE if we want an error
393 message printed of all errors */
394 bool on_error_silent) /*!< in: TRUE then don't print any
395 diagnostic to the log */
396 {
397 #ifdef __WIN__
398
399 ulint err = (ulint) GetLastError();
400 if (err == ERROR_SUCCESS) {
401 return(0);
402 }
403
404 if (report_all_errors
405 || (!on_error_silent
406 && err != ERROR_DISK_FULL
407 && err != ERROR_FILE_EXISTS)) {
408
409 ut_print_timestamp(stderr);
410 fprintf(stderr,
411 " InnoDB: Operating system error number %lu"
412 " in a file operation.\n", (ulong) err);
413
414 if (err == ERROR_PATH_NOT_FOUND) {
415 fprintf(stderr,
416 "InnoDB: The error means the system"
417 " cannot find the path specified.\n");
418
419 if (srv_is_being_started) {
420 fprintf(stderr,
421 "InnoDB: If you are installing InnoDB,"
422 " remember that you must create\n"
423 "InnoDB: directories yourself, InnoDB"
424 " does not create them.\n");
425 }
426 } else if (err == ERROR_ACCESS_DENIED) {
427 fprintf(stderr,
428 "InnoDB: The error means mysqld does not have"
429 " the access rights to\n"
430 "InnoDB: the directory. It may also be"
431 " you have created a subdirectory\n"
432 "InnoDB: of the same name as a data file.\n");
433 } else if (err == ERROR_SHARING_VIOLATION
434 || err == ERROR_LOCK_VIOLATION) {
435 fprintf(stderr,
436 "InnoDB: The error means that another program"
437 " is using InnoDB's files.\n"
438 "InnoDB: This might be a backup or antivirus"
439 " software or another instance\n"
440 "InnoDB: of MySQL."
441 " Please close it to get rid of this error.\n");
442 } else if (err == ERROR_WORKING_SET_QUOTA
443 || err == ERROR_NO_SYSTEM_RESOURCES) {
444 fprintf(stderr,
445 "InnoDB: The error means that there are no"
446 " sufficient system resources or quota to"
447 " complete the operation.\n");
448 } else if (err == ERROR_OPERATION_ABORTED) {
449 fprintf(stderr,
450 "InnoDB: The error means that the I/O"
451 " operation has been aborted\n"
452 "InnoDB: because of either a thread exit"
453 " or an application request.\n"
454 "InnoDB: Retry attempt is made.\n");
455 } else {
456 fprintf(stderr,
457 "InnoDB: Some operating system error numbers"
458 " are described at\n"
459 "InnoDB: "
460 REFMAN
461 "operating-system-error-codes.html\n");
462 }
463 }
464
465 fflush(stderr);
466
467 if (err == ERROR_FILE_NOT_FOUND) {
468 return(OS_FILE_NOT_FOUND);
469 } else if (err == ERROR_DISK_FULL) {
470 return(OS_FILE_DISK_FULL);
471 } else if (err == ERROR_FILE_EXISTS) {
472 return(OS_FILE_ALREADY_EXISTS);
473 } else if (err == ERROR_SHARING_VIOLATION
474 || err == ERROR_LOCK_VIOLATION) {
475 return(OS_FILE_SHARING_VIOLATION);
476 } else if (err == ERROR_WORKING_SET_QUOTA
477 || err == ERROR_NO_SYSTEM_RESOURCES) {
478 return(OS_FILE_INSUFFICIENT_RESOURCE);
479 } else if (err == ERROR_OPERATION_ABORTED) {
480 return(OS_FILE_OPERATION_ABORTED);
481 } else if (err == ERROR_ACCESS_DENIED) {
482 return(OS_FILE_ACCESS_VIOLATION);
483 } else {
484 return(OS_FILE_ERROR_MAX + err);
485 }
486 #else
487 int err = errno;
488 if (err == 0) {
489 return(0);
490 }
491
492 if (report_all_errors
493 || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
494
495 ut_print_timestamp(stderr);
496 fprintf(stderr,
497 " InnoDB: Operating system error number %d"
498 " in a file operation.\n", err);
499
500 if (err == ENOENT) {
501 fprintf(stderr,
502 "InnoDB: The error means the system"
503 " cannot find the path specified.\n");
504
505 if (srv_is_being_started) {
506 fprintf(stderr,
507 "InnoDB: If you are installing InnoDB,"
508 " remember that you must create\n"
509 "InnoDB: directories yourself, InnoDB"
510 " does not create them.\n");
511 }
512 } else if (err == EACCES) {
513 fprintf(stderr,
514 "InnoDB: The error means mysqld does not have"
515 " the access rights to\n"
516 "InnoDB: the directory.\n");
517 } else {
518 if (strerror(err) != NULL) {
519 fprintf(stderr,
520 "InnoDB: Error number %d"
521 " means '%s'.\n",
522 err, strerror(err));
523 }
524
525
526 fprintf(stderr,
527 "InnoDB: Some operating system"
528 " error numbers are described at\n"
529 "InnoDB: "
530 REFMAN
531 "operating-system-error-codes.html\n");
532 }
533 }
534
535 fflush(stderr);
536
537 switch (err) {
538 case ENOSPC:
539 return(OS_FILE_DISK_FULL);
540 case ENOENT:
541 return(OS_FILE_NOT_FOUND);
542 case EEXIST:
543 return(OS_FILE_ALREADY_EXISTS);
544 case EXDEV:
545 case ENOTDIR:
546 case EISDIR:
547 return(OS_FILE_PATH_ERROR);
548 case EAGAIN:
549 if (srv_use_native_aio) {
550 return(OS_FILE_AIO_RESOURCES_RESERVED);
551 }
552 break;
553 case EINTR:
554 if (srv_use_native_aio) {
555 return(OS_FILE_AIO_INTERRUPTED);
556 }
557 break;
558 case EACCES:
559 return(OS_FILE_ACCESS_VIOLATION);
560 }
561 return(OS_FILE_ERROR_MAX + err);
562 #endif
563 }
564
565 /***********************************************************************//**
566 Retrieves the last error number if an error occurs in a file io function.
567 The number should be retrieved before any other OS calls (because they may
568 overwrite the error number). If the number is not known to this program,
569 the OS error number + 100 is returned.
570 @return error number, or OS error number + 100 */
571 UNIV_INTERN
572 ulint
os_file_get_last_error(bool report_all_errors)573 os_file_get_last_error(
574 /*===================*/
575 bool report_all_errors) /*!< in: TRUE if we want an error
576 message printed of all errors */
577 {
578 return(os_file_get_last_error_low(report_all_errors, false));
579 }
580
581 /****************************************************************//**
582 Does error handling when a file operation fails.
583 Conditionally exits (calling exit(3)) based on should_exit value and the
584 error type, if should_exit is TRUE then on_error_silent is ignored.
585 @return TRUE if we should retry the operation */
586 static
587 ibool
os_file_handle_error_cond_exit(const char * name,const char * operation,ibool should_exit,ibool on_error_silent)588 os_file_handle_error_cond_exit(
589 /*===========================*/
590 const char* name, /*!< in: name of a file or NULL */
591 const char* operation, /*!< in: operation */
592 ibool should_exit, /*!< in: call exit(3) if unknown error
593 and this parameter is TRUE */
594 ibool on_error_silent)/*!< in: if TRUE then don't print
595 any message to the log iff it is
596 an unknown non-fatal error */
597 {
598 ulint err;
599
600 err = os_file_get_last_error_low(false, on_error_silent);
601
602 switch (err) {
603 case OS_FILE_DISK_FULL:
604 /* We only print a warning about disk full once */
605
606 if (os_has_said_disk_full) {
607
608 return(FALSE);
609 }
610
611 /* Disk full error is reported irrespective of the
612 on_error_silent setting. */
613
614 if (name) {
615 ut_print_timestamp(stderr);
616 fprintf(stderr,
617 " InnoDB: Encountered a problem with"
618 " file %s\n", name);
619 }
620
621 ut_print_timestamp(stderr);
622 fprintf(stderr,
623 " InnoDB: Disk is full. Try to clean the disk"
624 " to free space.\n");
625
626 os_has_said_disk_full = TRUE;
627
628 fflush(stderr);
629
630 return(FALSE);
631
632 case OS_FILE_AIO_RESOURCES_RESERVED:
633 case OS_FILE_AIO_INTERRUPTED:
634
635 return(TRUE);
636
637 case OS_FILE_PATH_ERROR:
638 case OS_FILE_ALREADY_EXISTS:
639 case OS_FILE_ACCESS_VIOLATION:
640
641 return(FALSE);
642
643 case OS_FILE_SHARING_VIOLATION:
644
645 os_thread_sleep(10000000); /* 10 sec */
646 return(TRUE);
647
648 case OS_FILE_OPERATION_ABORTED:
649 case OS_FILE_INSUFFICIENT_RESOURCE:
650
651 os_thread_sleep(100000); /* 100 ms */
652 return(TRUE);
653
654 default:
655
656 /* If it is an operation that can crash on error then it
657 is better to ignore on_error_silent and print an error message
658 to the log. */
659
660 if (should_exit || !on_error_silent) {
661 ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
662 "error " ULINTPF ".%s", name ? name : "(unknown)",
663 operation, err, should_exit
664 ? " Cannot continue operation" : "");
665 }
666
667 if (should_exit) {
668 exit(1);
669 }
670 }
671
672 return(FALSE);
673 }
674
675 /****************************************************************//**
676 Does error handling when a file operation fails.
677 @return TRUE if we should retry the operation */
678 static
679 ibool
os_file_handle_error(const char * name,const char * operation)680 os_file_handle_error(
681 /*=================*/
682 const char* name, /*!< in: name of a file or NULL */
683 const char* operation) /*!< in: operation */
684 {
685 /* exit in case of unknown error */
686 return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
687 }
688
689 /****************************************************************//**
690 Does error handling when a file operation fails.
691 @return TRUE if we should retry the operation */
692 static
693 ibool
os_file_handle_error_no_exit(const char * name,const char * operation,ibool on_error_silent)694 os_file_handle_error_no_exit(
695 /*=========================*/
696 const char* name, /*!< in: name of a file or NULL */
697 const char* operation, /*!< in: operation */
698 ibool on_error_silent)/*!< in: if TRUE then don't print
699 any message to the log. */
700 {
701 /* don't exit in case of unknown error */
702 return(os_file_handle_error_cond_exit(
703 name, operation, FALSE, on_error_silent));
704 }
705
706 #undef USE_FILE_LOCK
707 #define USE_FILE_LOCK
708 #if defined(UNIV_HOTBACKUP) || defined(__WIN__)
709 /* InnoDB Hot Backup does not lock the data files.
710 * On Windows, mandatory locking is used.
711 */
712 # undef USE_FILE_LOCK
713 #endif
714 #ifdef USE_FILE_LOCK
715 /****************************************************************//**
716 Obtain an exclusive lock on a file.
717 @return 0 on success */
718 static
719 int
os_file_lock(int fd,const char * name)720 os_file_lock(
721 /*=========*/
722 int fd, /*!< in: file descriptor */
723 const char* name) /*!< in: file name */
724 {
725 struct flock lk;
726
727 ut_ad(!srv_read_only_mode);
728
729 lk.l_type = F_WRLCK;
730 lk.l_whence = SEEK_SET;
731 lk.l_start = lk.l_len = 0;
732
733 if (fcntl(fd, F_SETLK, &lk) == -1) {
734
735 ib_logf(IB_LOG_LEVEL_ERROR,
736 "Unable to lock %s, error: %d", name, errno);
737
738 if (errno == EAGAIN || errno == EACCES) {
739 ib_logf(IB_LOG_LEVEL_INFO,
740 "Check that you do not already have "
741 "another mysqld process using the "
742 "same InnoDB data or log files.");
743 }
744
745 return(-1);
746 }
747
748 return(0);
749 }
750 #endif /* USE_FILE_LOCK */
751
752 #ifndef UNIV_HOTBACKUP
753 /****************************************************************//**
754 Creates the seek mutexes used in positioned reads and writes. */
755 UNIV_INTERN
756 void
os_io_init_simple(void)757 os_io_init_simple(void)
758 /*===================*/
759 {
760 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
761 os_file_count_mutex = os_mutex_create();
762 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
763
764 for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
765 os_file_seek_mutexes[i] = os_mutex_create();
766 }
767 }
768
769 /** Create a temporary file. This function is like tmpfile(3), but
770 the temporary file is created in the given parameter path. If the path
771 is null then it will create the file in the mysql server configuration
772 parameter (--tmpdir).
773 @param[in] path location for creating temporary file
774 @return temporary file handle, or NULL on error */
775 UNIV_INTERN
776 FILE*
os_file_create_tmpfile(const char * path)777 os_file_create_tmpfile(
778 const char* path)
779 {
780 FILE* file = NULL;
781 WAIT_ALLOW_WRITES();
782 int fd = innobase_mysql_tmpfile(path);
783
784 ut_ad(!srv_read_only_mode);
785
786 if (fd >= 0) {
787 file = fdopen(fd, "w+b");
788 }
789
790 if (!file) {
791 ut_print_timestamp(stderr);
792 fprintf(stderr,
793 " InnoDB: Error: unable to create temporary file;"
794 " errno: %d\n", errno);
795 if (fd >= 0) {
796 close(fd);
797 }
798 }
799
800 return(file);
801 }
802 #endif /* !UNIV_HOTBACKUP */
803
804 /***********************************************************************//**
805 The os_file_opendir() function opens a directory stream corresponding to the
806 directory named by the dirname argument. The directory stream is positioned
807 at the first entry. In both Unix and Windows we automatically skip the '.'
808 and '..' items at the start of the directory listing.
809 @return directory stream, NULL if error */
810 UNIV_INTERN
811 os_file_dir_t
os_file_opendir(const char * dirname,ibool error_is_fatal)812 os_file_opendir(
813 /*============*/
814 const char* dirname, /*!< in: directory name; it must not
815 contain a trailing '\' or '/' */
816 ibool error_is_fatal) /*!< in: TRUE if we should treat an
817 error as a fatal error; if we try to
818 open symlinks then we do not wish a
819 fatal error if it happens not to be
820 a directory */
821 {
822 os_file_dir_t dir;
823 #ifdef __WIN__
824 LPWIN32_FIND_DATA lpFindFileData;
825 char path[OS_FILE_MAX_PATH + 3];
826
827 ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
828
829 strcpy(path, dirname);
830 strcpy(path + strlen(path), "\\*");
831
832 /* Note that in Windows opening the 'directory stream' also retrieves
833 the first entry in the directory. Since it is '.', that is no problem,
834 as we will skip over the '.' and '..' entries anyway. */
835
836 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
837 ut_malloc(sizeof(WIN32_FIND_DATA)));
838
839 dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
840
841 ut_free(lpFindFileData);
842
843 if (dir == INVALID_HANDLE_VALUE) {
844
845 if (error_is_fatal) {
846 os_file_handle_error(dirname, "opendir");
847 }
848
849 return(NULL);
850 }
851
852 return(dir);
853 #else
854 dir = opendir(dirname);
855
856 if (dir == NULL && error_is_fatal) {
857 os_file_handle_error(dirname, "opendir");
858 }
859
860 return(dir);
861 #endif /* __WIN__ */
862 }
863
864 /***********************************************************************//**
865 Closes a directory stream.
866 @return 0 if success, -1 if failure */
867 UNIV_INTERN
868 int
os_file_closedir(os_file_dir_t dir)869 os_file_closedir(
870 /*=============*/
871 os_file_dir_t dir) /*!< in: directory stream */
872 {
873 #ifdef __WIN__
874 BOOL ret;
875
876 ret = FindClose(dir);
877
878 if (!ret) {
879 os_file_handle_error_no_exit(NULL, "closedir", FALSE);
880
881 return(-1);
882 }
883
884 return(0);
885 #else
886 int ret;
887
888 ret = closedir(dir);
889
890 if (ret) {
891 os_file_handle_error_no_exit(NULL, "closedir", FALSE);
892 }
893
894 return(ret);
895 #endif /* __WIN__ */
896 }
897
898 /***********************************************************************//**
899 This function returns information of the next file in the directory. We jump
900 over the '.' and '..' entries in the directory.
901 @return 0 if ok, -1 if error, 1 if at the end of the directory */
902 UNIV_INTERN
903 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)904 os_file_readdir_next_file(
905 /*======================*/
906 const char* dirname,/*!< in: directory name or path */
907 os_file_dir_t dir, /*!< in: directory stream */
908 os_file_stat_t* info) /*!< in/out: buffer where the info is returned */
909 {
910 #ifdef __WIN__
911 LPWIN32_FIND_DATA lpFindFileData;
912 BOOL ret;
913
914 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
915 ut_malloc(sizeof(WIN32_FIND_DATA)));
916 next_file:
917 ret = FindNextFile(dir, lpFindFileData);
918
919 if (ret) {
920 ut_a(strlen((char*) lpFindFileData->cFileName)
921 < OS_FILE_MAX_PATH);
922
923 if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
924 || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
925
926 goto next_file;
927 }
928
929 strcpy(info->name, (char*) lpFindFileData->cFileName);
930
931 info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
932 + (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
933 << 32);
934
935 if (lpFindFileData->dwFileAttributes
936 & FILE_ATTRIBUTE_REPARSE_POINT) {
937 /* TODO: test Windows symlinks */
938 /* TODO: MySQL has apparently its own symlink
939 implementation in Windows, dbname.sym can
940 redirect a database directory:
941 REFMAN "windows-symbolic-links.html" */
942 info->type = OS_FILE_TYPE_LINK;
943 } else if (lpFindFileData->dwFileAttributes
944 & FILE_ATTRIBUTE_DIRECTORY) {
945 info->type = OS_FILE_TYPE_DIR;
946 } else {
947 /* It is probably safest to assume that all other
948 file types are normal. Better to check them rather
949 than blindly skip them. */
950
951 info->type = OS_FILE_TYPE_FILE;
952 }
953 }
954
955 ut_free(lpFindFileData);
956
957 if (ret) {
958 return(0);
959 } else if (GetLastError() == ERROR_NO_MORE_FILES) {
960
961 return(1);
962 } else {
963 os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
964 return(-1);
965 }
966 #else
967 struct dirent* ent;
968 char* full_path;
969 int ret;
970 struct stat statinfo;
971 #ifdef HAVE_READDIR_R
972 char dirent_buf[sizeof(struct dirent)
973 + _POSIX_PATH_MAX + 100];
974 /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
975 the max file name len; but in most standards, the
976 length is NAME_MAX; we add 100 to be even safer */
977 #endif
978
979 next_file:
980
981 #ifdef HAVE_READDIR_R
982 ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
983
984 if (ret != 0
985 #ifdef UNIV_AIX
986 /* On AIX, only if we got non-NULL 'ent' (result) value and
987 a non-zero 'ret' (return) value, it indicates a failed
988 readdir_r() call. An NULL 'ent' with an non-zero 'ret'
989 would indicate the "end of the directory" is reached. */
990 && ent != NULL
991 #endif
992 ) {
993 fprintf(stderr,
994 "InnoDB: cannot read directory %s, error %lu\n",
995 dirname, (ulong) ret);
996
997 return(-1);
998 }
999
1000 if (ent == NULL) {
1001 /* End of directory */
1002
1003 return(1);
1004 }
1005
1006 ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
1007 #else
1008 ent = readdir(dir);
1009
1010 if (ent == NULL) {
1011
1012 return(1);
1013 }
1014 #endif
1015 ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
1016
1017 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
1018
1019 goto next_file;
1020 }
1021
1022 strcpy(info->name, ent->d_name);
1023
1024 full_path = static_cast<char*>(
1025 ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
1026
1027 sprintf(full_path, "%s/%s", dirname, ent->d_name);
1028
1029 ret = stat(full_path, &statinfo);
1030
1031 if (ret) {
1032
1033 if (errno == ENOENT) {
1034 /* readdir() returned a file that does not exist,
1035 it must have been deleted in the meantime. Do what
1036 would have happened if the file was deleted before
1037 readdir() - ignore and go to the next entry.
1038 If this is the last entry then info->name will still
1039 contain the name of the deleted file when this
1040 function returns, but this is not an issue since the
1041 caller shouldn't be looking at info when end of
1042 directory is returned. */
1043
1044 ut_free(full_path);
1045
1046 goto next_file;
1047 }
1048
1049 os_file_handle_error_no_exit(full_path, "stat", FALSE);
1050
1051 ut_free(full_path);
1052
1053 return(-1);
1054 }
1055
1056 info->size = (ib_int64_t) statinfo.st_size;
1057
1058 if (S_ISDIR(statinfo.st_mode)) {
1059 info->type = OS_FILE_TYPE_DIR;
1060 } else if (S_ISLNK(statinfo.st_mode)) {
1061 info->type = OS_FILE_TYPE_LINK;
1062 } else if (S_ISREG(statinfo.st_mode)) {
1063 info->type = OS_FILE_TYPE_FILE;
1064 } else {
1065 info->type = OS_FILE_TYPE_UNKNOWN;
1066 }
1067
1068 ut_free(full_path);
1069
1070 return(0);
1071 #endif
1072 }
1073
1074 /*****************************************************************//**
1075 This function attempts to create a directory named pathname. The new
1076 directory gets default permissions. On Unix the permissions are
1077 (0770 & ~umask). If the directory exists already, nothing is done and
1078 the call succeeds, unless the fail_if_exists arguments is true.
1079 If another error occurs, such as a permission error, this does not crash,
1080 but reports the error and returns FALSE.
1081 @return TRUE if call succeeds, FALSE on error */
1082 UNIV_INTERN
1083 ibool
os_file_create_directory(const char * pathname,ibool fail_if_exists)1084 os_file_create_directory(
1085 /*=====================*/
1086 const char* pathname, /*!< in: directory name as
1087 null-terminated string */
1088 ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory
1089 is treated as an error. */
1090 {
1091 #ifdef __WIN__
1092 BOOL rcode;
1093
1094 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
1095 if (!(rcode != 0
1096 || (GetLastError() == ERROR_ALREADY_EXISTS
1097 && !fail_if_exists))) {
1098
1099 os_file_handle_error_no_exit(
1100 pathname, "CreateDirectory", FALSE);
1101
1102 return(FALSE);
1103 }
1104
1105 return(TRUE);
1106 #else
1107 int rcode;
1108 WAIT_ALLOW_WRITES();
1109
1110 rcode = mkdir(pathname, 0770);
1111
1112 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
1113 /* failure */
1114 os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
1115
1116 return(FALSE);
1117 }
1118
1119 return (TRUE);
1120 #endif /* __WIN__ */
1121 }
1122
1123 /****************************************************************//**
1124 NOTE! Use the corresponding macro os_file_create_simple(), not directly
1125 this function!
1126 A simple function to open or create a file.
1127 @return own: handle to the file, not defined if error, error number
1128 can be retrieved with os_file_get_last_error */
1129 UNIV_INTERN
1130 os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,ibool * success)1131 os_file_create_simple_func(
1132 /*=======================*/
1133 const char* name, /*!< in: name of the file or path as a
1134 null-terminated string */
1135 ulint create_mode,/*!< in: create mode */
1136 ulint access_type,/*!< in: OS_FILE_READ_ONLY or
1137 OS_FILE_READ_WRITE */
1138 ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1139 {
1140 os_file_t file;
1141 ibool retry;
1142
1143 *success = FALSE;
1144 #ifdef __WIN__
1145 DWORD access;
1146 DWORD create_flag;
1147 DWORD attributes = 0;
1148
1149 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1150 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1151
1152 if (create_mode == OS_FILE_OPEN) {
1153
1154 create_flag = OPEN_EXISTING;
1155
1156 } else if (srv_read_only_mode) {
1157
1158 create_flag = OPEN_EXISTING;
1159
1160 } else if (create_mode == OS_FILE_CREATE) {
1161
1162 create_flag = CREATE_NEW;
1163
1164 } else if (create_mode == OS_FILE_CREATE_PATH) {
1165
1166 ut_a(!srv_read_only_mode);
1167
1168 /* Create subdirs along the path if needed */
1169 *success = os_file_create_subdirs_if_needed(name);
1170
1171 if (!*success) {
1172
1173 ib_logf(IB_LOG_LEVEL_ERROR,
1174 "Unable to create subdirectories '%s'",
1175 name);
1176
1177 return((os_file_t) -1);
1178 }
1179
1180 create_flag = CREATE_NEW;
1181 create_mode = OS_FILE_CREATE;
1182
1183 } else {
1184 ib_logf(IB_LOG_LEVEL_ERROR,
1185 "Unknown file create mode (%lu) for file '%s'",
1186 create_mode, name);
1187
1188 return((os_file_t) -1);
1189 }
1190
1191 if (access_type == OS_FILE_READ_ONLY) {
1192 access = GENERIC_READ;
1193 } else if (srv_read_only_mode) {
1194
1195 ib_logf(IB_LOG_LEVEL_INFO,
1196 "read only mode set. Unable to "
1197 "open file '%s' in RW mode, trying RO mode", name);
1198
1199 access = GENERIC_READ;
1200
1201 } else if (access_type == OS_FILE_READ_WRITE) {
1202 access = GENERIC_READ | GENERIC_WRITE;
1203 } else {
1204 ib_logf(IB_LOG_LEVEL_ERROR,
1205 "Unknown file access type (%lu) for file '%s'",
1206 access_type, name);
1207
1208 return((os_file_t) -1);
1209 }
1210
1211 do {
1212 /* Use default security attributes and no template file. */
1213
1214 file = CreateFile(
1215 (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
1216 create_flag, attributes, NULL);
1217
1218 if (file == INVALID_HANDLE_VALUE) {
1219
1220 *success = FALSE;
1221
1222 retry = os_file_handle_error(
1223 name, create_mode == OS_FILE_OPEN ?
1224 "open" : "create");
1225
1226 } else {
1227 *success = TRUE;
1228 retry = false;
1229 }
1230
1231 } while (retry);
1232
1233 #else /* __WIN__ */
1234 int create_flag;
1235 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
1236 WAIT_ALLOW_WRITES();
1237
1238 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1239 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1240
1241 if (create_mode == OS_FILE_OPEN) {
1242
1243 if (access_type == OS_FILE_READ_ONLY) {
1244 create_flag = O_RDONLY;
1245 } else if (srv_read_only_mode) {
1246 create_flag = O_RDONLY;
1247 } else {
1248 create_flag = O_RDWR;
1249 }
1250
1251 } else if (srv_read_only_mode) {
1252
1253 create_flag = O_RDONLY;
1254
1255 } else if (create_mode == OS_FILE_CREATE) {
1256
1257 create_flag = O_RDWR | O_CREAT | O_EXCL;
1258
1259 } else if (create_mode == OS_FILE_CREATE_PATH) {
1260
1261 /* Create subdirs along the path if needed */
1262
1263 *success = os_file_create_subdirs_if_needed(name);
1264
1265 if (!*success) {
1266
1267 ib_logf(IB_LOG_LEVEL_ERROR,
1268 "Unable to create subdirectories '%s'",
1269 name);
1270
1271 return((os_file_t) -1);
1272 }
1273
1274 create_flag = O_RDWR | O_CREAT | O_EXCL;
1275 create_mode = OS_FILE_CREATE;
1276 } else {
1277
1278 ib_logf(IB_LOG_LEVEL_ERROR,
1279 "Unknown file create mode (%lu) for file '%s'",
1280 create_mode, name);
1281
1282 return((os_file_t) -1);
1283 }
1284
1285 do {
1286 file = ::open(name, create_flag, os_innodb_umask);
1287
1288 if (file == -1) {
1289 *success = FALSE;
1290
1291 retry = os_file_handle_error(
1292 name,
1293 create_mode == OS_FILE_OPEN
1294 ? "open" : "create");
1295 } else {
1296 *success = TRUE;
1297 retry = false;
1298 }
1299
1300 } while (retry);
1301
1302 #ifdef USE_FILE_LOCK
1303 if (!srv_read_only_mode
1304 && *success
1305 && access_type == OS_FILE_READ_WRITE
1306 && os_file_lock(file, name)) {
1307
1308 *success = FALSE;
1309 close(file);
1310 file = -1;
1311 }
1312 #endif /* USE_FILE_LOCK */
1313
1314 #endif /* __WIN__ */
1315
1316 return(file);
1317 }
1318
1319 /****************************************************************//**
1320 NOTE! Use the corresponding macro
1321 os_file_create_simple_no_error_handling(), not directly this function!
1322 A simple function to open or create a file.
1323 @return own: handle to the file, not defined if error, error number
1324 can be retrieved with os_file_get_last_error */
1325 UNIV_INTERN
1326 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,ibool * success)1327 os_file_create_simple_no_error_handling_func(
1328 /*=========================================*/
1329 const char* name, /*!< in: name of the file or path as a
1330 null-terminated string */
1331 ulint create_mode,/*!< in: create mode */
1332 ulint access_type,/*!< in: OS_FILE_READ_ONLY,
1333 OS_FILE_READ_WRITE, or
1334 OS_FILE_READ_ALLOW_DELETE; the last option is
1335 used by a backup program reading the file */
1336 ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1337 {
1338 pfs_os_file_t file;
1339
1340 *success = FALSE;
1341 #ifdef __WIN__
1342 DWORD access;
1343 DWORD create_flag;
1344 DWORD attributes = 0;
1345 DWORD share_mode = FILE_SHARE_READ;
1346 ut_a(name);
1347
1348 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1349 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1350
1351 if (create_mode == OS_FILE_OPEN) {
1352 create_flag = OPEN_EXISTING;
1353 } else if (srv_read_only_mode) {
1354 create_flag = OPEN_EXISTING;
1355 } else if (create_mode == OS_FILE_CREATE) {
1356 create_flag = CREATE_NEW;
1357 } else {
1358
1359 ib_logf(IB_LOG_LEVEL_ERROR,
1360 "Unknown file create mode (%lu) for file '%s'",
1361 create_mode, name);
1362 file.m_file = (os_file_t)-1;
1363 return(file);
1364 }
1365
1366 if (access_type == OS_FILE_READ_ONLY) {
1367 access = GENERIC_READ;
1368 } else if (srv_read_only_mode) {
1369 access = GENERIC_READ;
1370 } else if (access_type == OS_FILE_READ_WRITE) {
1371 access = GENERIC_READ | GENERIC_WRITE;
1372 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1373
1374 ut_a(!srv_read_only_mode);
1375
1376 access = GENERIC_READ;
1377
1378 /*!< A backup program has to give mysqld the maximum
1379 freedom to do what it likes with the file */
1380
1381 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
1382 } else {
1383 ib_logf(IB_LOG_LEVEL_ERROR,
1384 "Unknown file access type (%lu) for file '%s'",
1385 access_type, name);
1386 file.m_file = (os_file_t)-1;
1387 return(file);
1388 }
1389
1390 file.m_file = CreateFile((LPCTSTR) name,
1391 access,
1392 share_mode,
1393 NULL, // Security attributes
1394 create_flag,
1395 attributes,
1396 NULL); // No template file
1397
1398 *success = (file.m_file != INVALID_HANDLE_VALUE);
1399 #else /* __WIN__ */
1400 int create_flag;
1401 const char* mode_str = NULL;
1402 ut_a(name);
1403 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
1404 WAIT_ALLOW_WRITES();
1405
1406 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1407 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1408
1409 if (create_mode == OS_FILE_OPEN) {
1410
1411 mode_str = "OPEN";
1412
1413 if (access_type == OS_FILE_READ_ONLY) {
1414
1415 create_flag = O_RDONLY;
1416
1417 } else if (srv_read_only_mode) {
1418
1419 create_flag = O_RDONLY;
1420
1421 } else {
1422
1423 ut_a(access_type == OS_FILE_READ_WRITE
1424 || access_type == OS_FILE_READ_ALLOW_DELETE);
1425
1426 create_flag = O_RDWR;
1427 }
1428
1429 } else if (srv_read_only_mode) {
1430
1431 mode_str = "OPEN";
1432
1433 create_flag = O_RDONLY;
1434
1435 } else if (create_mode == OS_FILE_CREATE) {
1436
1437 mode_str = "CREATE";
1438
1439 create_flag = O_RDWR | O_CREAT | O_EXCL;
1440
1441 } else {
1442 ib_logf(IB_LOG_LEVEL_ERROR,
1443 "Unknown file create mode (%lu) for file '%s'",
1444 create_mode, name);
1445 file.m_file = -1;
1446 return(file);
1447 }
1448
1449 file.m_file = ::open(name, create_flag, os_innodb_umask);
1450
1451 *success = file.m_file == -1 ? FALSE : TRUE;
1452
1453 /* This function is always called for data files, we should disable
1454 OS caching (O_DIRECT) here as we do in os_file_create_func(), so
1455 we open the same file in the same mode, see man page of open(2). */
1456 if (!srv_read_only_mode
1457 && *success
1458 && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
1459 || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
1460
1461 os_file_set_nocache(file.m_file, name, mode_str);
1462 }
1463
1464 #ifdef USE_FILE_LOCK
1465 if (!srv_read_only_mode
1466 && *success
1467 && access_type == OS_FILE_READ_WRITE
1468 && os_file_lock(file.m_file, name)) {
1469
1470 *success = FALSE;
1471 close(file.m_file);
1472 file.m_file = -1;
1473
1474 }
1475 #endif /* USE_FILE_LOCK */
1476
1477 #endif /* __WIN__ */
1478
1479 return(file);
1480 }
1481
1482 /****************************************************************//**
1483 Tries to disable OS caching on an opened file descriptor. */
1484 UNIV_INTERN
1485 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))1486 os_file_set_nocache(
1487 /*================*/
1488 int fd /*!< in: file descriptor to alter */
1489 MY_ATTRIBUTE((unused)),
1490 const char* file_name /*!< in: used in the diagnostic
1491 message */
1492 MY_ATTRIBUTE((unused)),
1493 const char* operation_name MY_ATTRIBUTE((unused)))
1494 /*!< in: "open" or "create"; used
1495 in the diagnostic message */
1496 {
1497 /* some versions of Solaris may not have DIRECTIO_ON */
1498 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1499 if (directio(fd, DIRECTIO_ON) == -1) {
1500 int errno_save = errno;
1501
1502 ib_logf(IB_LOG_LEVEL_ERROR,
1503 "Failed to set DIRECTIO_ON on file %s: %s: %s, "
1504 "continuing anyway.",
1505 file_name, operation_name, strerror(errno_save));
1506 }
1507 #elif defined(O_DIRECT)
1508 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1509 int errno_save = errno;
1510 static bool warning_message_printed = false;
1511 if (errno_save == EINVAL) {
1512 if (!warning_message_printed) {
1513 warning_message_printed = true;
1514 # ifdef UNIV_LINUX
1515 ib_logf(IB_LOG_LEVEL_WARN,
1516 "Failed to set O_DIRECT on file "
1517 "%s: %s: %s, continuing anyway. "
1518 "O_DIRECT is known to result "
1519 "in 'Invalid argument' on Linux on "
1520 "tmpfs, see MySQL Bug#26662.",
1521 file_name, operation_name,
1522 strerror(errno_save));
1523 # else /* UNIV_LINUX */
1524 goto short_warning;
1525 # endif /* UNIV_LINUX */
1526 }
1527 } else {
1528 # ifndef UNIV_LINUX
1529 short_warning:
1530 # endif
1531 ib_logf(IB_LOG_LEVEL_WARN,
1532 "Failed to set O_DIRECT on file %s: %s: %s, "
1533 "continuing anyway.",
1534 file_name, operation_name, strerror(errno_save));
1535 }
1536 }
1537 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
1538 }
1539
1540 /****************************************************************//**
1541 NOTE! Use the corresponding macro os_file_create(), not directly
1542 this function!
1543 Opens an existing file or creates a new.
1544 @return own: handle to the file, not defined if error, error number
1545 can be retrieved with os_file_get_last_error */
1546 UNIV_INTERN
1547 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,ibool * success)1548 os_file_create_func(
1549 /*================*/
1550 const char* name, /*!< in: name of the file or path as a
1551 null-terminated string */
1552 ulint create_mode,/*!< in: create mode */
1553 ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
1554 non-buffered i/o is desired,
1555 OS_FILE_NORMAL, if any normal file;
1556 NOTE that it also depends on type, os_aio_..
1557 and srv_.. variables whether we really use
1558 async i/o or unbuffered i/o: look in the
1559 function source code for the exact rules */
1560 ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
1561 ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1562 {
1563 pfs_os_file_t file;
1564 ibool retry;
1565 ibool on_error_no_exit;
1566 ibool on_error_silent;
1567 #ifdef __WIN__
1568 DBUG_EXECUTE_IF(
1569 "ib_create_table_fail_disk_full",
1570 *success = FALSE;
1571 SetLastError(ERROR_DISK_FULL);
1572 file.m_file = (os_file_t)-1;
1573 return(file);
1574 );
1575 #else /* __WIN__ */
1576 DBUG_EXECUTE_IF(
1577 "ib_create_table_fail_disk_full",
1578 *success = FALSE;
1579 errno = ENOSPC;
1580 file.m_file = -1;
1581 return(file);
1582 );
1583 #endif /* __WIN__ */
1584
1585 #ifdef __WIN__
1586 DWORD create_flag;
1587 DWORD share_mode = FILE_SHARE_READ;
1588
1589 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
1590 ? TRUE : FALSE;
1591
1592 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
1593 ? TRUE : FALSE;
1594
1595 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
1596 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
1597
1598 if (create_mode == OS_FILE_OPEN_RAW) {
1599
1600 ut_a(!srv_read_only_mode);
1601
1602 create_flag = OPEN_EXISTING;
1603
1604 /* On Windows Physical devices require admin privileges and
1605 have to have the write-share mode set. See the remarks
1606 section for the CreateFile() function documentation in MSDN. */
1607
1608 share_mode |= FILE_SHARE_WRITE;
1609
1610 } else if (create_mode == OS_FILE_OPEN
1611 || create_mode == OS_FILE_OPEN_RETRY) {
1612
1613 create_flag = OPEN_EXISTING;
1614
1615 } else if (srv_read_only_mode) {
1616
1617 create_flag = OPEN_EXISTING;
1618
1619 } else if (create_mode == OS_FILE_CREATE) {
1620
1621 create_flag = CREATE_NEW;
1622
1623 } else if (create_mode == OS_FILE_OVERWRITE) {
1624
1625 create_flag = CREATE_ALWAYS;
1626
1627 } else {
1628 ib_logf(IB_LOG_LEVEL_ERROR,
1629 "Unknown file create mode (%lu) for file '%s'",
1630 create_mode, name);
1631
1632 file.m_file = (os_file_t)-1;
1633 return(file);
1634 }
1635
1636 DWORD attributes = 0;
1637
1638 #ifdef UNIV_HOTBACKUP
1639 attributes |= FILE_FLAG_NO_BUFFERING;
1640 #else
1641 if (purpose == OS_FILE_AIO) {
1642
1643 #ifdef WIN_ASYNC_IO
1644 /* If specified, use asynchronous (overlapped) io and no
1645 buffering of writes in the OS */
1646
1647 if (srv_use_native_aio) {
1648 attributes |= FILE_FLAG_OVERLAPPED;
1649 }
1650 #endif /* WIN_ASYNC_IO */
1651
1652 } else if (purpose == OS_FILE_NORMAL) {
1653 /* Use default setting. */
1654 } else {
1655 ib_logf(IB_LOG_LEVEL_ERROR,
1656 "Unknown purpose flag (%lu) while opening file '%s'",
1657 purpose, name);
1658 file.m_file = (os_file_t)-1;
1659 return(file);
1660 }
1661
1662 #ifdef UNIV_NON_BUFFERED_IO
1663 // TODO: Create a bug, this looks wrong. The flush log
1664 // parameter is dynamic.
1665 if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1666
1667 /* Do not use unbuffered i/o for the log files because
1668 value 2 denotes that we do not flush the log at every
1669 commit, but only once per second */
1670
1671 } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
1672
1673 attributes |= FILE_FLAG_NO_BUFFERING;
1674 }
1675 #endif /* UNIV_NON_BUFFERED_IO */
1676
1677 #endif /* UNIV_HOTBACKUP */
1678 DWORD access = GENERIC_READ;
1679
1680 if (!srv_read_only_mode) {
1681 access |= GENERIC_WRITE;
1682 }
1683
1684 do {
1685 /* Use default security attributes and no template file. */
1686 file.m_file = CreateFile(
1687 (LPCTSTR) name, access, share_mode, NULL,
1688 create_flag, attributes, NULL);
1689
1690 if (file.m_file == INVALID_HANDLE_VALUE) {
1691 const char* operation;
1692
1693 operation = (create_mode == OS_FILE_CREATE
1694 && !srv_read_only_mode)
1695 ? "create" : "open";
1696
1697 *success = FALSE;
1698
1699 if (on_error_no_exit) {
1700 retry = os_file_handle_error_no_exit(
1701 name, operation, on_error_silent);
1702 } else {
1703 retry = os_file_handle_error(name, operation);
1704 }
1705 } else {
1706 *success = TRUE;
1707 retry = FALSE;
1708 }
1709
1710 } while (retry);
1711
1712 #else /* __WIN__ */
1713 int create_flag;
1714 const char* mode_str = NULL;
1715 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
1716 WAIT_ALLOW_WRITES();
1717
1718 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
1719 ? TRUE : FALSE;
1720 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
1721 ? TRUE : FALSE;
1722
1723 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
1724 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
1725
1726 if (create_mode == OS_FILE_OPEN
1727 || create_mode == OS_FILE_OPEN_RAW
1728 || create_mode == OS_FILE_OPEN_RETRY) {
1729
1730 mode_str = "OPEN";
1731
1732 create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;
1733
1734 } else if (srv_read_only_mode) {
1735
1736 mode_str = "OPEN";
1737
1738 create_flag = O_RDONLY;
1739
1740 } else if (create_mode == OS_FILE_CREATE) {
1741
1742 mode_str = "CREATE";
1743 create_flag = O_RDWR | O_CREAT | O_EXCL;
1744
1745 } else if (create_mode == OS_FILE_OVERWRITE) {
1746
1747 mode_str = "OVERWRITE";
1748 create_flag = O_RDWR | O_CREAT | O_TRUNC;
1749
1750 } else {
1751 ib_logf(IB_LOG_LEVEL_ERROR,
1752 "Unknown file create mode (%lu) for file '%s'",
1753 create_mode, name);
1754
1755 file.m_file = -1;
1756 return(file);
1757 }
1758
1759 ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
1760 ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
1761
1762 #ifdef O_SYNC
1763 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
1764 O_SYNC because the datasync options seemed to corrupt files in 2001
1765 in both Linux and Solaris */
1766
1767 if (!srv_read_only_mode
1768 && type == OS_LOG_FILE
1769 && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1770
1771 create_flag |= O_SYNC;
1772 }
1773 #endif /* O_SYNC */
1774
1775 do {
1776 file.m_file = ::open(name, create_flag, os_innodb_umask);
1777
1778 if (file.m_file == -1) {
1779 const char* operation;
1780
1781 operation = (create_mode == OS_FILE_CREATE
1782 && !srv_read_only_mode)
1783 ? "create" : "open";
1784
1785 *success = FALSE;
1786
1787 if (on_error_no_exit) {
1788 retry = os_file_handle_error_no_exit(
1789 name, operation, on_error_silent);
1790 } else {
1791 retry = os_file_handle_error(name, operation);
1792 }
1793 } else {
1794 *success = TRUE;
1795 retry = false;
1796 }
1797
1798 } while (retry);
1799
1800 /* We disable OS caching (O_DIRECT) only on data files */
1801
1802 if (!srv_read_only_mode
1803 && *success
1804 && type != OS_LOG_FILE
1805 && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
1806 || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
1807
1808 os_file_set_nocache(file.m_file, name, mode_str);
1809 }
1810
1811 #ifdef USE_FILE_LOCK
1812 if (!srv_read_only_mode
1813 && *success
1814 && create_mode != OS_FILE_OPEN_RAW
1815 && os_file_lock(file.m_file, name)) {
1816
1817 if (create_mode == OS_FILE_OPEN_RETRY) {
1818
1819 ut_a(!srv_read_only_mode);
1820
1821 ib_logf(IB_LOG_LEVEL_INFO,
1822 "Retrying to lock the first data file");
1823
1824 for (int i = 0; i < 100; i++) {
1825 os_thread_sleep(1000000);
1826
1827 if (!os_file_lock(file.m_file, name)) {
1828 *success = TRUE;
1829 return(file);
1830 }
1831 }
1832
1833 ib_logf(IB_LOG_LEVEL_INFO,
1834 "Unable to open the first data file");
1835 }
1836
1837 *success = FALSE;
1838 close(file.m_file);
1839 file.m_file = -1;
1840 }
1841 #endif /* USE_FILE_LOCK */
1842
1843 #endif /* __WIN__ */
1844
1845 return(file);
1846 }
1847
1848 /***********************************************************************//**
1849 Deletes a file if it exists. The file has to be closed before calling this.
1850 @return TRUE if success */
1851 UNIV_INTERN
1852 bool
os_file_delete_if_exists_func(const char * name)1853 os_file_delete_if_exists_func(
1854 /*==========================*/
1855 const char* name) /*!< in: file path as a null-terminated
1856 string */
1857 {
1858 #ifdef __WIN__
1859 bool ret;
1860 ulint count = 0;
1861 loop:
1862 /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
1863 it */
1864
1865 ret = DeleteFile((LPCTSTR) name);
1866
1867 if (ret) {
1868 return(true);
1869 }
1870
1871 DWORD lasterr = GetLastError();
1872 if (lasterr == ERROR_FILE_NOT_FOUND
1873 || lasterr == ERROR_PATH_NOT_FOUND) {
1874 /* the file does not exist, this not an error */
1875
1876 return(true);
1877 }
1878
1879 count++;
1880
1881 if (count > 100 && 0 == (count % 10)) {
1882 os_file_get_last_error(true); /* print error information */
1883
1884 ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name);
1885 }
1886
1887 os_thread_sleep(500000); /* sleep for 0.5 second */
1888
1889 if (count > 2000) {
1890
1891 return(false);
1892 }
1893
1894 goto loop;
1895 #else
1896 int ret;
1897 WAIT_ALLOW_WRITES();
1898
1899 ret = unlink(name);
1900
1901 if (ret != 0 && errno != ENOENT) {
1902 os_file_handle_error_no_exit(name, "delete", FALSE);
1903
1904 return(false);
1905 }
1906
1907 return(true);
1908 #endif /* __WIN__ */
1909 }
1910
1911 /***********************************************************************//**
1912 Deletes a file. The file has to be closed before calling this.
1913 @return TRUE if success */
1914 UNIV_INTERN
1915 bool
os_file_delete_func(const char * name)1916 os_file_delete_func(
1917 /*================*/
1918 const char* name) /*!< in: file path as a null-terminated
1919 string */
1920 {
1921 #ifdef __WIN__
1922 BOOL ret;
1923 ulint count = 0;
1924 loop:
1925 /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
1926 it */
1927
1928 ret = DeleteFile((LPCTSTR) name);
1929
1930 if (ret) {
1931 return(true);
1932 }
1933
1934 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1935 /* If the file does not exist, we classify this as a 'mild'
1936 error and return */
1937
1938 return(false);
1939 }
1940
1941 count++;
1942
1943 if (count > 100 && 0 == (count % 10)) {
1944 os_file_get_last_error(true); /* print error information */
1945
1946 fprintf(stderr,
1947 "InnoDB: Warning: cannot delete file %s\n"
1948 "InnoDB: Are you running mysqlbackup"
1949 " to back up the file?\n", name);
1950 }
1951
1952 os_thread_sleep(1000000); /* sleep for a second */
1953
1954 if (count > 2000) {
1955
1956 return(false);
1957 }
1958
1959 goto loop;
1960 #else
1961 int ret;
1962 WAIT_ALLOW_WRITES();
1963
1964 ret = unlink(name);
1965
1966 if (ret != 0) {
1967 os_file_handle_error_no_exit(name, "delete", FALSE);
1968
1969 return(false);
1970 }
1971
1972 return(true);
1973 #endif
1974 }
1975
1976 /***********************************************************************//**
1977 NOTE! Use the corresponding macro os_file_rename(), not directly this function!
1978 Renames a file (can also move it to another directory). It is safest that the
1979 file is closed before calling this function.
1980 @return TRUE if success */
1981 UNIV_INTERN
1982 ibool
os_file_rename_func(const char * oldpath,const char * newpath)1983 os_file_rename_func(
1984 /*================*/
1985 const char* oldpath,/*!< in: old file path as a null-terminated
1986 string */
1987 const char* newpath)/*!< in: new file path */
1988 {
1989 #ifdef UNIV_DEBUG
1990 os_file_type_t type;
1991 ibool exists;
1992
1993 /* New path must not exist. */
1994 ut_ad(os_file_status(newpath, &exists, &type));
1995 ut_ad(!exists);
1996
1997 /* Old path must exist. */
1998 ut_ad(os_file_status(oldpath, &exists, &type));
1999 ut_ad(exists);
2000 #endif /* UNIV_DEBUG */
2001
2002 #ifdef __WIN__
2003 BOOL ret;
2004
2005 ret = MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath);
2006
2007 if (ret) {
2008 return(TRUE);
2009 }
2010
2011 os_file_handle_error_no_exit(oldpath, "rename", FALSE);
2012
2013 return(FALSE);
2014 #else
2015 int ret;
2016 WAIT_ALLOW_WRITES();
2017
2018 ret = rename(oldpath, newpath);
2019
2020 if (ret != 0) {
2021 os_file_handle_error_no_exit(oldpath, "rename", FALSE);
2022
2023 return(FALSE);
2024 }
2025
2026 return(TRUE);
2027 #endif /* __WIN__ */
2028 }
2029
2030 /***********************************************************************//**
2031 NOTE! Use the corresponding macro os_file_close(), not directly this function!
2032 Closes a file handle. In case of error, error number can be retrieved with
2033 os_file_get_last_error.
2034 @return TRUE if success */
2035 UNIV_INTERN
2036 ibool
os_file_close_func(os_file_t file)2037 os_file_close_func(
2038 /*===============*/
2039 os_file_t file) /*!< in, own: handle to a file */
2040 {
2041 #ifdef __WIN__
2042 BOOL ret;
2043
2044 ut_a(file);
2045
2046 ret = CloseHandle(file);
2047
2048 if (ret) {
2049 return(TRUE);
2050 }
2051
2052 os_file_handle_error(NULL, "close");
2053
2054 return(FALSE);
2055 #else
2056 int ret;
2057
2058 ret = close(file);
2059
2060 if (ret == -1) {
2061 os_file_handle_error(NULL, "close");
2062
2063 return(FALSE);
2064 }
2065
2066 return(TRUE);
2067 #endif /* __WIN__ */
2068 }
2069
#ifdef UNIV_HOTBACKUP
/***********************************************************************//**
Closes a file handle without invoking any error handler; a failure is
reported through the return value only.
@return TRUE if success */
UNIV_INTERN
ibool
os_file_close_no_error_handling(
/*============================*/
	os_file_t	file)	/*!< in, own: handle to a file */
{
#ifdef __WIN__
	ut_a(file);

	if (!CloseHandle(file)) {

		return(FALSE);
	}

	return(TRUE);
#else
	if (close(file) != -1) {

		return(TRUE);
	}

	return(FALSE);
#endif /* __WIN__ */
}
#endif /* UNIV_HOTBACKUP */
2106
2107 /***********************************************************************//**
2108 Gets a file size.
2109 @return file size, or (os_offset_t) -1 on failure */
2110 UNIV_INTERN
2111 os_offset_t
os_file_get_size(pfs_os_file_t file)2112 os_file_get_size(
2113 /*=============*/
2114 pfs_os_file_t file) /*!< in: handle to a file */
2115 {
2116 #ifdef __WIN__
2117 os_offset_t offset;
2118 DWORD high;
2119 DWORD low;
2120
2121 low = GetFileSize(file.m_file, &high);
2122
2123 if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
2124 return((os_offset_t) -1);
2125 }
2126
2127 offset = (os_offset_t) low | ((os_offset_t) high << 32);
2128
2129 return(offset);
2130 #else
2131 return((os_offset_t) lseek(file.m_file, 0, SEEK_END));
2132
2133 #endif /* __WIN__ */
2134 }
2135
2136 /***********************************************************************//**
2137 Write the specified number of zeros to a newly created file.
2138 @return TRUE if success */
2139 UNIV_INTERN
2140 ibool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size)2141 os_file_set_size(
2142 /*=============*/
2143 const char* name, /*!< in: name of the file or path as a
2144 null-terminated string */
2145 pfs_os_file_t file, /*!< in: handle to a file */
2146 os_offset_t size) /*!< in: file size */
2147 {
2148 os_offset_t current_size;
2149 ibool ret;
2150 byte* buf;
2151 byte* buf2;
2152 ulint buf_size;
2153
2154 current_size = 0;
2155
2156 /* Write up to 1 megabyte at a time. */
2157 buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
2158 * UNIV_PAGE_SIZE;
2159 buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
2160
2161 /* Align the buffer for possible raw i/o */
2162 buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
2163
2164 /* Write buffer full of zeros */
2165 memset(buf, 0, buf_size);
2166
2167 if (size >= (os_offset_t) 100 << 20) {
2168
2169 fprintf(stderr, "InnoDB: Progress in MB:");
2170 }
2171
2172 while (current_size < size) {
2173 ulint n_bytes;
2174
2175 if (size - current_size < (os_offset_t) buf_size) {
2176 n_bytes = (ulint) (size - current_size);
2177 } else {
2178 n_bytes = buf_size;
2179 }
2180
2181 ret = os_file_write(name, file, buf, current_size, n_bytes);
2182 if (!ret) {
2183 ut_free(buf2);
2184 goto error_handling;
2185 }
2186
2187 /* Print about progress for each 100 MB written */
2188 if ((current_size + n_bytes) / (100 << 20)
2189 != current_size / (100 << 20)) {
2190
2191 fprintf(stderr, " %lu00",
2192 (ulong) ((current_size + n_bytes)
2193 / (100 << 20)));
2194 }
2195
2196 current_size += n_bytes;
2197 }
2198
2199 if (size >= (os_offset_t) 100 << 20) {
2200
2201 fprintf(stderr, "\n");
2202 }
2203
2204 ut_free(buf2);
2205
2206 ret = os_file_flush(file);
2207
2208 if (ret) {
2209 return(TRUE);
2210 }
2211
2212 error_handling:
2213 return(FALSE);
2214 }
2215
2216 /***********************************************************************//**
2217 Truncates a file at its current position.
2218 @return TRUE if success */
2219 UNIV_INTERN
2220 ibool
os_file_set_eof(FILE * file)2221 os_file_set_eof(
2222 /*============*/
2223 FILE* file) /*!< in: file to be truncated */
2224 {
2225 #ifdef __WIN__
2226 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
2227 return(SetEndOfFile(h));
2228 #else /* __WIN__ */
2229 WAIT_ALLOW_WRITES();
2230 return(!ftruncate(fileno(file), ftell(file)));
2231 #endif /* __WIN__ */
2232 }
2233
2234 #ifndef __WIN__
2235 /***********************************************************************//**
2236 Wrapper to fsync(2) that retries the call on some errors.
2237 Returns the value 0 if successful; otherwise the value -1 is returned and
2238 the global variable errno is set to indicate the error.
2239 @return 0 if success, -1 otherwise */
2240
2241 static
2242 int
os_file_fsync(os_file_t file)2243 os_file_fsync(
2244 /*==========*/
2245 os_file_t file) /*!< in: handle to a file */
2246 {
2247 int ret;
2248 int failures;
2249 ibool retry;
2250
2251 failures = 0;
2252
2253 do {
2254 ret = fsync(file);
2255
2256 os_n_fsyncs++;
2257
2258 if (ret == -1 && errno == ENOLCK) {
2259
2260 if (failures % 100 == 0) {
2261
2262 ut_print_timestamp(stderr);
2263 fprintf(stderr,
2264 " InnoDB: fsync(): "
2265 "No locks available; retrying\n");
2266 }
2267
2268 os_thread_sleep(200000 /* 0.2 sec */);
2269
2270 failures++;
2271
2272 retry = TRUE;
2273 } else {
2274
2275 retry = FALSE;
2276 }
2277 } while (retry);
2278
2279 return(ret);
2280 }
2281 #endif /* !__WIN__ */
2282
2283 /***********************************************************************//**
2284 NOTE! Use the corresponding macro os_file_flush(), not directly this function!
2285 Flushes the write buffers of a given file to the disk.
2286 @return TRUE if success */
2287 UNIV_INTERN
2288 ibool
os_file_flush_func(os_file_t file)2289 os_file_flush_func(
2290 /*===============*/
2291 os_file_t file) /*!< in, own: handle to a file */
2292 {
2293 #ifdef __WIN__
2294 BOOL ret;
2295
2296 ut_a(file);
2297
2298 os_n_fsyncs++;
2299
2300 ret = FlushFileBuffers(file);
2301
2302 if (ret) {
2303 return(TRUE);
2304 }
2305
2306 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
2307 actually a raw device, we choose to ignore that error if we are using
2308 raw disks */
2309
2310 if (srv_start_raw_disk_in_use && GetLastError()
2311 == ERROR_INVALID_FUNCTION) {
2312 return(TRUE);
2313 }
2314
2315 os_file_handle_error(NULL, "flush");
2316
2317 /* It is a fatal error if a file flush does not succeed, because then
2318 the database can get corrupt on disk */
2319 ut_error;
2320
2321 return(FALSE);
2322 #else
2323 int ret;
2324 WAIT_ALLOW_WRITES();
2325
2326 #if defined(HAVE_DARWIN_THREADS)
2327 # ifndef F_FULLFSYNC
2328 /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2329 # define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2330 # elif F_FULLFSYNC != 51
2331 # error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2332 # endif
2333 /* Apple has disabled fsync() for internal disk drives in OS X. That
2334 caused corruption for a user when he tested a power outage. Let us in
2335 OS X use a nonstandard flush method recommended by an Apple
2336 engineer. */
2337
2338 if (!srv_have_fullfsync) {
2339 /* If we are not on an operating system that supports this,
2340 then fall back to a plain fsync. */
2341
2342 ret = os_file_fsync(file);
2343 } else {
2344 ret = fcntl(file, F_FULLFSYNC, NULL);
2345
2346 if (ret) {
2347 /* If we are not on a file system that supports this,
2348 then fall back to a plain fsync. */
2349 ret = os_file_fsync(file);
2350 }
2351 }
2352 #else
2353 ret = os_file_fsync(file);
2354 #endif
2355
2356 if (ret == 0) {
2357 return(TRUE);
2358 }
2359
2360 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2361 we choose to ignore that error if we are using raw disks */
2362
2363 if (srv_start_raw_disk_in_use && errno == EINVAL) {
2364
2365 return(TRUE);
2366 }
2367
2368 ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
2369
2370 os_file_handle_error(NULL, "flush");
2371
2372 /* It is a fatal error if a file flush does not succeed, because then
2373 the database can get corrupt on disk */
2374 ut_error;
2375
2376 return(FALSE);
2377 #endif
2378 }
2379
2380 #ifndef __WIN__
2381 /*******************************************************************//**
2382 Does a synchronous read operation in Posix.
2383 @return number of bytes read, -1 if error */
2384 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2385 ssize_t
os_file_pread(os_file_t file,void * buf,ulint n,os_offset_t offset)2386 os_file_pread(
2387 /*==========*/
2388 os_file_t file, /*!< in: handle to a file */
2389 void* buf, /*!< in: buffer where to read */
2390 ulint n, /*!< in: number of bytes to read */
2391 os_offset_t offset) /*!< in: file offset from where to read */
2392 {
2393 off_t offs;
2394 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2395 ssize_t n_bytes;
2396 #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2397
2398 ut_ad(n);
2399
2400 /* If off_t is > 4 bytes in size, then we assume we can pass a
2401 64-bit address */
2402 offs = (off_t) offset;
2403
2404 if (sizeof(off_t) <= 4) {
2405 if (offset != (os_offset_t) offs) {
2406 ib_logf(IB_LOG_LEVEL_ERROR,
2407 "File read at offset > 4 GB");
2408 }
2409 }
2410
2411 os_n_file_reads++;
2412
2413 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2414 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2415 (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2416 (void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1);
2417 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2418 #else
2419 os_mutex_enter(os_file_count_mutex);
2420 os_file_n_pending_preads++;
2421 os_n_pending_reads++;
2422 MONITOR_INC(MONITOR_OS_PENDING_READS);
2423 os_mutex_exit(os_file_count_mutex);
2424 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2425
2426 n_bytes = pread(file, buf, n, offs);
2427
2428 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2429 (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2430 (void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1);
2431 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
2432 #else
2433 os_mutex_enter(os_file_count_mutex);
2434 os_file_n_pending_preads--;
2435 os_n_pending_reads--;
2436 MONITOR_DEC(MONITOR_OS_PENDING_READS);
2437 os_mutex_exit(os_file_count_mutex);
2438 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD == 8 */
2439
2440 return(n_bytes);
2441 #else
2442 {
2443 off_t ret_offset;
2444 ssize_t ret;
2445 #ifndef UNIV_HOTBACKUP
2446 ulint i;
2447 #endif /* !UNIV_HOTBACKUP */
2448
2449 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2450 (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2451 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2452 #else
2453 os_mutex_enter(os_file_count_mutex);
2454 os_n_pending_reads++;
2455 MONITOR_INC(MONITOR_OS_PENDING_READS);
2456 os_mutex_exit(os_file_count_mutex);
2457 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2458 #ifndef UNIV_HOTBACKUP
2459 /* Protect the seek / read operation with a mutex */
2460 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2461
2462 os_mutex_enter(os_file_seek_mutexes[i]);
2463 #endif /* !UNIV_HOTBACKUP */
2464
2465 ret_offset = lseek(file, offs, SEEK_SET);
2466
2467 if (ret_offset < 0) {
2468 ret = -1;
2469 } else {
2470 ret = read(file, buf, (ssize_t) n);
2471 }
2472
2473 #ifndef UNIV_HOTBACKUP
2474 os_mutex_exit(os_file_seek_mutexes[i]);
2475 #endif /* !UNIV_HOTBACKUP */
2476
2477 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2478 (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2479 MONITOR_ATOIC_DEC(MONITOR_OS_PENDING_READS);
2480 #else
2481 os_mutex_enter(os_file_count_mutex);
2482 os_n_pending_reads--;
2483 MONITOR_DEC(MONITOR_OS_PENDING_READS);
2484 os_mutex_exit(os_file_count_mutex);
2485 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
2486
2487 return(ret);
2488 }
2489 #endif
2490 }
2491
2492 /*******************************************************************//**
2493 Does a synchronous write operation in Posix.
2494 @return number of bytes written, -1 if error */
2495 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2496 ssize_t
os_file_pwrite(os_file_t file,const void * buf,ulint n,os_offset_t offset)2497 os_file_pwrite(
2498 /*===========*/
2499 os_file_t file, /*!< in: handle to a file */
2500 const void* buf, /*!< in: buffer from where to write */
2501 ulint n, /*!< in: number of bytes to write */
2502 os_offset_t offset) /*!< in: file offset where to write */
2503 {
2504 ssize_t ret;
2505 off_t offs;
2506
2507 ut_ad(n);
2508 ut_ad(!srv_read_only_mode);
2509
2510 /* If off_t is > 4 bytes in size, then we assume we can pass a
2511 64-bit address */
2512 offs = (off_t) offset;
2513
2514 if (sizeof(off_t) <= 4) {
2515 if (offset != (os_offset_t) offs) {
2516 ib_logf(IB_LOG_LEVEL_ERROR,
2517 "File write at offset > 4 GB.");
2518 }
2519 }
2520
2521 os_n_file_writes++;
2522
2523 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2524 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
2525 os_mutex_enter(os_file_count_mutex);
2526 os_file_n_pending_pwrites++;
2527 os_n_pending_writes++;
2528 MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2529 os_mutex_exit(os_file_count_mutex);
2530 #else
2531 (void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
2532 (void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1);
2533 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
2534 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */
2535
2536 ret = pwrite(file, buf, (ssize_t) n, offs);
2537
2538 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
2539 os_mutex_enter(os_file_count_mutex);
2540 os_file_n_pending_pwrites--;
2541 os_n_pending_writes--;
2542 MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2543 os_mutex_exit(os_file_count_mutex);
2544 #else
2545 (void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
2546 (void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1);
2547 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
2548 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */
2549
2550 return(ret);
2551 #else
2552 {
2553 off_t ret_offset;
2554 # ifndef UNIV_HOTBACKUP
2555 ulint i;
2556 # endif /* !UNIV_HOTBACKUP */
2557
2558 os_mutex_enter(os_file_count_mutex);
2559 os_n_pending_writes++;
2560 MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2561 os_mutex_exit(os_file_count_mutex);
2562
2563 # ifndef UNIV_HOTBACKUP
2564 /* Protect the seek / write operation with a mutex */
2565 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2566
2567 os_mutex_enter(os_file_seek_mutexes[i]);
2568 # endif /* UNIV_HOTBACKUP */
2569
2570 ret_offset = lseek(file, offs, SEEK_SET);
2571
2572 if (ret_offset < 0) {
2573 ret = -1;
2574
2575 goto func_exit;
2576 }
2577
2578 ret = write(file, buf, (ssize_t) n);
2579
2580 func_exit:
2581 # ifndef UNIV_HOTBACKUP
2582 os_mutex_exit(os_file_seek_mutexes[i]);
2583 # endif /* !UNIV_HOTBACKUP */
2584
2585 os_mutex_enter(os_file_count_mutex);
2586 os_n_pending_writes--;
2587 MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2588 os_mutex_exit(os_file_count_mutex);
2589
2590 return(ret);
2591 }
2592 #endif /* !UNIV_HOTBACKUP */
2593 }
2594 #endif
2595
/*******************************************************************//**
NOTE! Use the corresponding macro os_file_read(), not directly this
function!
Requests a synchronous positioned read operation.
On Windows the read is done with SetFilePointer() followed by ReadFile(),
with one of os_file_seek_mutexes held around the pair (unless
UNIV_HOTBACKUP is defined); elsewhere it is delegated to os_file_pread().
When fewer than n bytes could be read, os_file_handle_error() is asked
whether to retry. If it does not ask for a retry, a fatal error message
is printed to stderr and ut_error is executed.
@return TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
os_file_read_func(
/*==============*/
	os_file_t	file,	/*!< in: handle to a file */
	void*		buf,	/*!< in: buffer where to read */
	os_offset_t	offset,	/*!< in: file offset where to read */
	ulint		n)	/*!< in: number of bytes to read */
{
#ifdef __WIN__
	BOOL		ret;
	DWORD		len;
	DWORD		ret2;
	DWORD		low;
	DWORD		high;
	ibool		retry;
#ifndef UNIV_HOTBACKUP
	ulint		i;
#endif /* !UNIV_HOTBACKUP */

	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
	no more than 32 bits. */
	ut_a((n & 0xFFFFFFFFUL) == n);

	/* Counted once per request; retries jump to try_again below and
	do not pass through here again. */
	os_n_file_reads++;
	os_bytes_read_since_printout += n;

try_again:
	ut_ad(file);
	ut_ad(buf);
	ut_ad(n > 0);

	/* Split the 64-bit offset into the two halves SetFilePointer()
	takes. */
	low = (DWORD) offset & 0xFFFFFFFF;
	high = (DWORD) (offset >> 32);

	os_mutex_enter(os_file_count_mutex);
	os_n_pending_reads++;
	MONITOR_INC(MONITOR_OS_PENDING_READS);
	os_mutex_exit(os_file_count_mutex);

#ifndef UNIV_HOTBACKUP
	/* Protect the seek / read operation with a mutex */
	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;

	os_mutex_enter(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

	ret2 = SetFilePointer(
		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);

	/* 0xFFFFFFFF counts as a failure only when GetLastError() reports
	an error too. */
	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {

		/* Undo the mutex and the pending-read accounting before
		going to the shared error handling. */
#ifndef UNIV_HOTBACKUP
		os_mutex_exit(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

		os_mutex_enter(os_file_count_mutex);
		os_n_pending_reads--;
		MONITOR_DEC(MONITOR_OS_PENDING_READS);
		os_mutex_exit(os_file_count_mutex);

		goto error_handling;
	}

	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);

#ifndef UNIV_HOTBACKUP
	os_mutex_exit(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

	os_mutex_enter(os_file_count_mutex);
	os_n_pending_reads--;
	MONITOR_DEC(MONITOR_OS_PENDING_READS);
	os_mutex_exit(os_file_count_mutex);

	/* Only a complete read counts as success. */
	if (ret && len == n) {
		return(TRUE);
	}
#else /* __WIN__ */
	ibool	retry;
	ssize_t	ret;

	os_bytes_read_since_printout += n;

try_again:
	ret = os_file_pread(file, buf, n, offset);

	if ((ulint) ret == n) {
		return(TRUE);
	} else if (ret == -1) {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Error in system call pread(). The operating"
			" system error number is %lu.",(ulint) errno);
	} else {
		/* Partial read occurred */
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Tried to read " ULINTPF " bytes at offset "
			UINT64PF ". Was only able to read %ld.",
			n, offset, (lint) ret);
	}
#endif /* __WIN__ */
	/* Shared failure path of both platforms. The label itself is only
	needed on Windows, where the seek failure jumps here. */
#ifdef __WIN__
error_handling:
#endif
	retry = os_file_handle_error(NULL, "read");

	if (retry) {
		goto try_again;
	}

	fprintf(stderr,
		"InnoDB: Fatal error: cannot read from file."
		" OS error number %lu.\n",
#ifdef __WIN__
		(ulong) GetLastError()
#else
		(ulong) errno
#endif /* __WIN__ */
		);
	fflush(stderr);

	/* NOTE(review): ut_error presumably does not return, which would
	make the return below unreachable -- confirm against its
	definition. */
	ut_error;

	return(FALSE);
}
2726
/*******************************************************************//**
NOTE! Use the corresponding macro os_file_read_no_error_handling(),
not directly this function!
Requests a synchronous positioned read operation. This function does not do
any error handling. In case of error it returns FALSE.
The read itself is performed exactly as in os_file_read_func(); the
difference is in the failure path, which calls
os_file_handle_error_no_exit() and, unless that asks for a retry, simply
returns FALSE instead of printing a fatal error and executing ut_error.
@return TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
os_file_read_no_error_handling_func(
/*================================*/
	os_file_t	file,	/*!< in: handle to a file */
	void*		buf,	/*!< in: buffer where to read */
	os_offset_t	offset,	/*!< in: file offset where to read */
	ulint		n)	/*!< in: number of bytes to read */
{
#ifdef __WIN__
	BOOL		ret;
	DWORD		len;
	DWORD		ret2;
	DWORD		low;
	DWORD		high;
	ibool		retry;
#ifndef UNIV_HOTBACKUP
	ulint		i;
#endif /* !UNIV_HOTBACKUP */

	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
	no more than 32 bits. */
	ut_a((n & 0xFFFFFFFFUL) == n);

	/* Counted once per request; retries jump to try_again below and
	do not pass through here again. */
	os_n_file_reads++;
	os_bytes_read_since_printout += n;

try_again:
	ut_ad(file);
	ut_ad(buf);
	ut_ad(n > 0);

	/* Split the 64-bit offset into the two halves SetFilePointer()
	takes. */
	low = (DWORD) offset & 0xFFFFFFFF;
	high = (DWORD) (offset >> 32);

	os_mutex_enter(os_file_count_mutex);
	os_n_pending_reads++;
	MONITOR_INC(MONITOR_OS_PENDING_READS);
	os_mutex_exit(os_file_count_mutex);

#ifndef UNIV_HOTBACKUP
	/* Protect the seek / read operation with a mutex */
	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;

	os_mutex_enter(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

	ret2 = SetFilePointer(
		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);

	/* 0xFFFFFFFF counts as a failure only when GetLastError() reports
	an error too. */
	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {

		/* Undo the mutex and the pending-read accounting before
		going to the shared error handling. */
#ifndef UNIV_HOTBACKUP
		os_mutex_exit(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

		os_mutex_enter(os_file_count_mutex);
		os_n_pending_reads--;
		MONITOR_DEC(MONITOR_OS_PENDING_READS);
		os_mutex_exit(os_file_count_mutex);

		goto error_handling;
	}

	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);

#ifndef UNIV_HOTBACKUP
	os_mutex_exit(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

	os_mutex_enter(os_file_count_mutex);
	os_n_pending_reads--;
	MONITOR_DEC(MONITOR_OS_PENDING_READS);
	os_mutex_exit(os_file_count_mutex);

	/* Only a complete read counts as success. */
	if (ret && len == n) {
		return(TRUE);
	}
#else /* __WIN__ */
	ibool	retry;
	ssize_t	ret;

	os_bytes_read_since_printout += n;

try_again:
	ret = os_file_pread(file, buf, n, offset);

	if ((ulint) ret == n) {
		return(TRUE);
	} else if (ret == -1) {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Error in system call pread(). The operating"
			" system error number is %lu.",(ulint) errno);
	} else {
		/* Partial read occurred */
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Tried to read " ULINTPF " bytes at offset "
			UINT64PF ". Was only able to read %ld.",
			n, offset, (lint) ret);
	}
#endif /* __WIN__ */
	/* Shared failure path of both platforms. The label itself is only
	needed on Windows, where the seek failure jumps here. */
#ifdef __WIN__
error_handling:
#endif
	retry = os_file_handle_error_no_exit(NULL, "read", FALSE);

	if (retry) {
		goto try_again;
	}

	return(FALSE);
}
2845
2846 /*******************************************************************//**
2847 Rewind file to its start, read at most size - 1 bytes from it to str, and
2848 NUL-terminate str. All errors are silently ignored. This function is
2849 mostly meant to be used with temporary files. */
2850 UNIV_INTERN
2851 void
os_file_read_string(FILE * file,char * str,ulint size)2852 os_file_read_string(
2853 /*================*/
2854 FILE* file, /*!< in: file to read from */
2855 char* str, /*!< in: buffer where to read */
2856 ulint size) /*!< in: size of buffer */
2857 {
2858 size_t flen;
2859
2860 if (size == 0) {
2861 return;
2862 }
2863
2864 rewind(file);
2865 flen = fread(str, 1, size - 1, file);
2866 str[flen] = '\0';
2867 }
2868
2869 /*******************************************************************//**
2870 NOTE! Use the corresponding macro os_file_write(), not directly
2871 this function!
2872 Requests a synchronous write operation.
2873 @return TRUE if request was successful, FALSE if fail */
2874 UNIV_INTERN
2875 ibool
os_file_write_func(const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)2876 os_file_write_func(
2877 /*===============*/
2878 const char* name, /*!< in: name of the file or path as a
2879 null-terminated string */
2880 os_file_t file, /*!< in: handle to a file */
2881 const void* buf, /*!< in: buffer from which to write */
2882 os_offset_t offset, /*!< in: file offset where to write */
2883 ulint n) /*!< in: number of bytes to write */
2884 {
2885 ut_ad(!srv_read_only_mode);
2886
2887 #ifdef __WIN__
2888 BOOL ret;
2889 DWORD len;
2890 DWORD ret2;
2891 DWORD low;
2892 DWORD high;
2893 ulint n_retries = 0;
2894 ulint err;
2895 #ifndef UNIV_HOTBACKUP
2896 ulint i;
2897 #endif /* !UNIV_HOTBACKUP */
2898
2899 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2900 no more than 32 bits. */
2901 ut_a((n & 0xFFFFFFFFUL) == n);
2902
2903 os_n_file_writes++;
2904
2905 ut_ad(file);
2906 ut_ad(buf);
2907 ut_ad(n > 0);
2908 retry:
2909 low = (DWORD) offset & 0xFFFFFFFF;
2910 high = (DWORD) (offset >> 32);
2911
2912 os_mutex_enter(os_file_count_mutex);
2913 os_n_pending_writes++;
2914 MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2915 os_mutex_exit(os_file_count_mutex);
2916
2917 #ifndef UNIV_HOTBACKUP
2918 /* Protect the seek / write operation with a mutex */
2919 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2920
2921 os_mutex_enter(os_file_seek_mutexes[i]);
2922 #endif /* !UNIV_HOTBACKUP */
2923
2924 ret2 = SetFilePointer(
2925 file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2926
2927 if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2928
2929 #ifndef UNIV_HOTBACKUP
2930 os_mutex_exit(os_file_seek_mutexes[i]);
2931 #endif /* !UNIV_HOTBACKUP */
2932
2933 os_mutex_enter(os_file_count_mutex);
2934 os_n_pending_writes--;
2935 MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2936 os_mutex_exit(os_file_count_mutex);
2937
2938 ut_print_timestamp(stderr);
2939
2940 fprintf(stderr,
2941 " InnoDB: Error: File pointer positioning to"
2942 " file %s failed at\n"
2943 "InnoDB: offset %llu. Operating system"
2944 " error number %lu.\n"
2945 "InnoDB: Some operating system error numbers"
2946 " are described at\n"
2947 "InnoDB: "
2948 REFMAN "operating-system-error-codes.html\n",
2949 name, offset, (ulong) GetLastError());
2950
2951 return(FALSE);
2952 }
2953
2954 ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2955
2956 #ifndef UNIV_HOTBACKUP
2957 os_mutex_exit(os_file_seek_mutexes[i]);
2958 #endif /* !UNIV_HOTBACKUP */
2959
2960 os_mutex_enter(os_file_count_mutex);
2961 os_n_pending_writes--;
2962 MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2963 os_mutex_exit(os_file_count_mutex);
2964
2965 if (ret && len == n) {
2966
2967 return(TRUE);
2968 }
2969
2970 /* If some background file system backup tool is running, then, at
2971 least in Windows 2000, we may get here a specific error. Let us
2972 retry the operation 100 times, with 1 second waits. */
2973
2974 if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2975
2976 os_thread_sleep(1000000);
2977
2978 n_retries++;
2979
2980 goto retry;
2981 }
2982
2983 if (!os_has_said_disk_full) {
2984
2985 err = (ulint) GetLastError();
2986
2987 ut_print_timestamp(stderr);
2988
2989 fprintf(stderr,
2990 " InnoDB: Error: Write to file %s failed"
2991 " at offset %llu.\n"
2992 "InnoDB: %lu bytes should have been written,"
2993 " only %lu were written.\n"
2994 "InnoDB: Operating system error number %lu.\n"
2995 "InnoDB: Check that your OS and file system"
2996 " support files of this size.\n"
2997 "InnoDB: Check also that the disk is not full"
2998 " or a disk quota exceeded.\n",
2999 name, offset,
3000 (ulong) n, (ulong) len, (ulong) err);
3001
3002 if (strerror((int) err) != NULL) {
3003 fprintf(stderr,
3004 "InnoDB: Error number %lu means '%s'.\n",
3005 (ulong) err, strerror((int) err));
3006 }
3007
3008 fprintf(stderr,
3009 "InnoDB: Some operating system error numbers"
3010 " are described at\n"
3011 "InnoDB: "
3012 REFMAN "operating-system-error-codes.html\n");
3013
3014 os_has_said_disk_full = TRUE;
3015 }
3016
3017 return(FALSE);
3018 #else
3019 ssize_t ret;
3020 WAIT_ALLOW_WRITES();
3021
3022 ret = os_file_pwrite(file, buf, n, offset);
3023
3024 if ((ulint) ret == n) {
3025
3026 return(TRUE);
3027 }
3028
3029 if (!os_has_said_disk_full) {
3030
3031 ut_print_timestamp(stderr);
3032
3033 if(ret == -1) {
3034 ib_logf(IB_LOG_LEVEL_ERROR,
3035 "Failure of system call pwrite(). Operating"
3036 " system error number is %lu.",
3037 (ulint) errno);
3038 } else {
3039 fprintf(stderr,
3040 " InnoDB: Error: Write to file %s failed"
3041 " at offset " UINT64PF ".\n"
3042 "InnoDB: %lu bytes should have been written,"
3043 " only %ld were written.\n"
3044 "InnoDB: Operating system error number %lu.\n"
3045 "InnoDB: Check that your OS and file system"
3046 " support files of this size.\n"
3047 "InnoDB: Check also that the disk is not full"
3048 " or a disk quota exceeded.\n",
3049 name, offset, n, (lint) ret,
3050 (ulint) errno);
3051 }
3052
3053 if (strerror(errno) != NULL) {
3054 fprintf(stderr,
3055 "InnoDB: Error number %d means '%s'.\n",
3056 errno, strerror(errno));
3057 }
3058
3059 fprintf(stderr,
3060 "InnoDB: Some operating system error numbers"
3061 " are described at\n"
3062 "InnoDB: "
3063 REFMAN "operating-system-error-codes.html\n");
3064
3065 os_has_said_disk_full = TRUE;
3066 }
3067
3068 return(FALSE);
3069 #endif
3070 }
3071
3072 /*******************************************************************//**
3073 Check the existence and type of the given file.
3074 @return TRUE if call succeeded */
3075 UNIV_INTERN
3076 ibool
os_file_status(const char * path,ibool * exists,os_file_type_t * type)3077 os_file_status(
3078 /*===========*/
3079 const char* path, /*!< in: pathname of the file */
3080 ibool* exists, /*!< out: TRUE if file exists */
3081 os_file_type_t* type) /*!< out: type of the file (if it exists) */
3082 {
3083 #ifdef __WIN__
3084 int ret;
3085 struct _stat64 statinfo;
3086
3087 ret = _stat64(path, &statinfo);
3088 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3089 /* file does not exist */
3090 *exists = FALSE;
3091 return(TRUE);
3092 } else if (ret) {
3093 /* file exists, but stat call failed */
3094
3095 os_file_handle_error_no_exit(path, "stat", FALSE);
3096
3097 return(FALSE);
3098 }
3099
3100 if (_S_IFDIR & statinfo.st_mode) {
3101 *type = OS_FILE_TYPE_DIR;
3102 } else if (_S_IFREG & statinfo.st_mode) {
3103 *type = OS_FILE_TYPE_FILE;
3104 } else {
3105 *type = OS_FILE_TYPE_UNKNOWN;
3106 }
3107
3108 *exists = TRUE;
3109
3110 return(TRUE);
3111 #else
3112 int ret;
3113 struct stat statinfo;
3114
3115 ret = stat(path, &statinfo);
3116 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3117 /* file does not exist */
3118 *exists = FALSE;
3119 return(TRUE);
3120 } else if (ret) {
3121 /* file exists, but stat call failed */
3122
3123 os_file_handle_error_no_exit(path, "stat", FALSE);
3124
3125 return(FALSE);
3126 }
3127
3128 if (S_ISDIR(statinfo.st_mode)) {
3129 *type = OS_FILE_TYPE_DIR;
3130 } else if (S_ISLNK(statinfo.st_mode)) {
3131 *type = OS_FILE_TYPE_LINK;
3132 } else if (S_ISREG(statinfo.st_mode)) {
3133 *type = OS_FILE_TYPE_FILE;
3134 } else {
3135 *type = OS_FILE_TYPE_UNKNOWN;
3136 }
3137
3138 *exists = TRUE;
3139
3140 return(TRUE);
3141 #endif
3142 }
3143
3144 /*******************************************************************//**
3145 This function returns information about the specified file
3146 @return DB_SUCCESS if all OK */
3147 UNIV_INTERN
3148 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm)3149 os_file_get_status(
3150 /*===============*/
3151 const char* path, /*!< in: pathname of the file */
3152 os_file_stat_t* stat_info, /*!< information of a file in a
3153 directory */
3154 bool check_rw_perm) /*!< in: for testing whether the
3155 file can be opened in RW mode */
3156 {
3157 int ret;
3158
3159 #ifdef __WIN__
3160 struct _stat64 statinfo;
3161
3162 ret = _stat64(path, &statinfo);
3163
3164 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3165 /* file does not exist */
3166
3167 return(DB_NOT_FOUND);
3168
3169 } else if (ret) {
3170 /* file exists, but stat call failed */
3171
3172 os_file_handle_error_no_exit(path, "stat", FALSE);
3173
3174 return(DB_FAIL);
3175
3176 } else if (_S_IFDIR & statinfo.st_mode) {
3177 stat_info->type = OS_FILE_TYPE_DIR;
3178 } else if (_S_IFREG & statinfo.st_mode) {
3179
3180 DWORD access = GENERIC_READ;
3181
3182 if (!srv_read_only_mode) {
3183 access |= GENERIC_WRITE;
3184 }
3185
3186 stat_info->type = OS_FILE_TYPE_FILE;
3187
3188 /* Check if we can open it in read-only mode. */
3189
3190 if (check_rw_perm) {
3191 HANDLE fh;
3192
3193 fh = CreateFile(
3194 (LPCTSTR) path, // File to open
3195 access,
3196 0, // No sharing
3197 NULL, // Default security
3198 OPEN_EXISTING, // Existing file only
3199 FILE_ATTRIBUTE_NORMAL, // Normal file
3200 NULL); // No attr. template
3201
3202 if (fh == INVALID_HANDLE_VALUE) {
3203 stat_info->rw_perm = false;
3204 } else {
3205 stat_info->rw_perm = true;
3206 CloseHandle(fh);
3207 }
3208 }
3209 } else {
3210 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3211 }
3212 #else
3213 struct stat statinfo;
3214
3215 ret = stat(path, &statinfo);
3216
3217 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3218 /* file does not exist */
3219
3220 return(DB_NOT_FOUND);
3221
3222 } else if (ret) {
3223 /* file exists, but stat call failed */
3224
3225 os_file_handle_error_no_exit(path, "stat", FALSE);
3226
3227 return(DB_FAIL);
3228
3229 }
3230
3231 switch (statinfo.st_mode & S_IFMT) {
3232 case S_IFDIR:
3233 stat_info->type = OS_FILE_TYPE_DIR;
3234 break;
3235 case S_IFLNK:
3236 stat_info->type = OS_FILE_TYPE_LINK;
3237 break;
3238 case S_IFBLK:
3239 /* Handle block device as regular file. */
3240 case S_IFCHR:
3241 /* Handle character device as regular file. */
3242 case S_IFREG:
3243 stat_info->type = OS_FILE_TYPE_FILE;
3244 break;
3245 default:
3246 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3247 }
3248
3249
3250 if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) {
3251
3252 int fh;
3253 int access;
3254
3255 access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
3256
3257 fh = ::open(path, access, os_innodb_umask);
3258
3259 if (fh == -1) {
3260 stat_info->rw_perm = false;
3261 } else {
3262 stat_info->rw_perm = true;
3263 close(fh);
3264 }
3265 }
3266
3267 #endif /* _WIN_ */
3268
3269 stat_info->ctime = statinfo.st_ctime;
3270 stat_info->atime = statinfo.st_atime;
3271 stat_info->mtime = statinfo.st_mtime;
3272 stat_info->size = statinfo.st_size;
3273
3274 return(DB_SUCCESS);
3275 }
3276
/* Path name separator character: a backslash when __WIN__ is defined,
a forward slash otherwise. */
#ifdef __WIN__
# define OS_FILE_PATH_SEPARATOR '\\'
#else
# define OS_FILE_PATH_SEPARATOR '/'
#endif
3283
3284 /****************************************************************//**
3285 This function returns a new path name after replacing the basename
3286 in an old path with a new basename. The old_path is a full path
3287 name including the extension. The tablename is in the normal
3288 form "databasename/tablename". The new base name is found after
3289 the forward slash. Both input strings are null terminated.
3290
3291 This function allocates memory to be returned. It is the callers
3292 responsibility to free the return value after it is no longer needed.
3293
3294 @return own: new full pathname */
3295 UNIV_INTERN
3296 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)3297 os_file_make_new_pathname(
3298 /*======================*/
3299 const char* old_path, /*!< in: pathname */
3300 const char* tablename) /*!< in: contains new base name */
3301 {
3302 ulint dir_len;
3303 char* last_slash;
3304 char* base_name;
3305 char* new_path;
3306 ulint new_path_len;
3307
3308 /* Split the tablename into its database and table name components.
3309 They are separated by a '/'. */
3310 last_slash = strrchr((char*) tablename, '/');
3311 base_name = last_slash ? last_slash + 1 : (char*) tablename;
3312
3313 /* Find the offset of the last slash. We will strip off the
3314 old basename.ibd which starts after that slash. */
3315 last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR);
3316 dir_len = last_slash ? last_slash - old_path : strlen(old_path);
3317
3318 /* allocate a new path and move the old directory path to it. */
3319 new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
3320 new_path = static_cast<char*>(mem_alloc(new_path_len));
3321 memcpy(new_path, old_path, dir_len);
3322
3323 ut_snprintf(new_path + dir_len,
3324 new_path_len - dir_len,
3325 "%c%s.ibd",
3326 OS_FILE_PATH_SEPARATOR,
3327 base_name);
3328
3329 return(new_path);
3330 }
3331
3332 /****************************************************************//**
3333 This function returns a remote path name by combining a data directory
3334 path provided in a DATA DIRECTORY clause with the tablename which is
3335 in the form 'database/tablename'. It strips the file basename (which
3336 is the tablename) found after the last directory in the path provided.
3337 The full filepath created will include the database name as a directory
3338 under the path provided. The filename is the tablename with the '.ibd'
3339 extension. All input and output strings are null-terminated.
3340
3341 This function allocates memory to be returned. It is the callers
3342 responsibility to free the return value after it is no longer needed.
3343
3344 @return own: A full pathname; data_dir_path/databasename/tablename.ibd */
3345 UNIV_INTERN
3346 char*
os_file_make_remote_pathname(const char * data_dir_path,const char * tablename,const char * extention)3347 os_file_make_remote_pathname(
3348 /*=========================*/
3349 const char* data_dir_path, /*!< in: pathname */
3350 const char* tablename, /*!< in: tablename */
3351 const char* extention) /*!< in: file extention; ibd,cfg */
3352 {
3353 ulint data_dir_len;
3354 char* last_slash;
3355 char* new_path;
3356 ulint new_path_len;
3357
3358 ut_ad(extention && strlen(extention) == 3);
3359
3360 /* Find the offset of the last slash. We will strip off the
3361 old basename or tablename which starts after that slash. */
3362 last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3363 data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path);
3364
3365 /* allocate a new path and move the old directory path to it. */
3366 new_path_len = data_dir_len + strlen(tablename)
3367 + sizeof "/." + strlen(extention);
3368 new_path = static_cast<char*>(mem_alloc(new_path_len));
3369 memcpy(new_path, data_dir_path, data_dir_len);
3370 ut_snprintf(new_path + data_dir_len,
3371 new_path_len - data_dir_len,
3372 "%c%s.%s",
3373 OS_FILE_PATH_SEPARATOR,
3374 tablename,
3375 extention);
3376
3377 srv_normalize_path_for_win(new_path);
3378
3379 return(new_path);
3380 }
3381
3382 /****************************************************************//**
3383 This function reduces a null-terminated full remote path name into
3384 the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
3385 the 'databasename/tablename.ibd' found at the end of the path with just
3386 'tablename'.
3387
3388 Since the result is always smaller than the path sent in, no new memory
3389 is allocated. The caller should allocate memory for the path sent in.
3390 This function manipulates that path in place.
3391
3392 If the path format is not as expected, just return. The result is used
3393 to inform a SHOW CREATE TABLE command. */
3394 UNIV_INTERN
3395 void
os_file_make_data_dir_path(char * data_dir_path)3396 os_file_make_data_dir_path(
3397 /*========================*/
3398 char* data_dir_path) /*!< in/out: full path/data_dir_path */
3399 {
3400 char* ptr;
3401 char* tablename;
3402 ulint tablename_len;
3403
3404 /* Replace the period before the extension with a null byte. */
3405 ptr = strrchr((char*) data_dir_path, '.');
3406 if (!ptr) {
3407 return;
3408 }
3409 ptr[0] = '\0';
3410
3411 /* The tablename starts after the last slash. */
3412 ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3413 if (!ptr) {
3414 return;
3415 }
3416 ptr[0] = '\0';
3417 tablename = ptr + 1;
3418
3419 /* The databasename starts after the next to last slash. */
3420 ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3421 if (!ptr) {
3422 return;
3423 }
3424 tablename_len = ut_strlen(tablename);
3425
3426 ut_memmove(++ptr, tablename, tablename_len);
3427
3428 ptr[tablename_len] = '\0';
3429 }
3430
3431 /****************************************************************//**
3432 The function os_file_dirname returns a directory component of a
3433 null-terminated pathname string. In the usual case, dirname returns
3434 the string up to, but not including, the final '/', and basename
3435 is the component following the final '/'. Trailing '/' characters
3436 are not counted as part of the pathname.
3437
3438 If path does not contain a slash, dirname returns the string ".".
3439
3440 Concatenating the string returned by dirname, a "/", and the basename
3441 yields a complete pathname.
3442
3443 The return value is a copy of the directory component of the pathname.
3444 The copy is allocated from heap. It is the caller responsibility
3445 to free it after it is no longer needed.
3446
3447 The following list of examples (taken from SUSv2) shows the strings
3448 returned by dirname and basename for different paths:
3449
3450 path dirname basename
3451 "/usr/lib" "/usr" "lib"
3452 "/usr/" "/" "usr"
3453 "usr" "." "usr"
3454 "/" "/" "/"
3455 "." "." "."
3456 ".." "." ".."
3457
3458 @return own: directory component of the pathname */
3459 UNIV_INTERN
3460 char*
os_file_dirname(const char * path)3461 os_file_dirname(
3462 /*============*/
3463 const char* path) /*!< in: pathname */
3464 {
3465 /* Find the offset of the last slash */
3466 const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
3467 if (!last_slash) {
3468 /* No slash in the path, return "." */
3469
3470 return(mem_strdup("."));
3471 }
3472
3473 /* Ok, there is a slash */
3474
3475 if (last_slash == path) {
3476 /* last slash is the first char of the path */
3477
3478 return(mem_strdup("/"));
3479 }
3480
3481 /* Non-trivial directory component */
3482
3483 return(mem_strdupl(path, last_slash - path));
3484 }
3485
3486 /****************************************************************//**
3487 Creates all missing subdirectories along the given path.
3488 @return TRUE if call succeeded FALSE otherwise */
3489 UNIV_INTERN
3490 ibool
os_file_create_subdirs_if_needed(const char * path)3491 os_file_create_subdirs_if_needed(
3492 /*=============================*/
3493 const char* path) /*!< in: path name */
3494 {
3495 if (srv_read_only_mode) {
3496
3497 ib_logf(IB_LOG_LEVEL_ERROR,
3498 "read only mode set. Can't create subdirectories '%s'",
3499 path);
3500
3501 return(FALSE);
3502
3503 }
3504
3505 char* subdir = os_file_dirname(path);
3506
3507 if (strlen(subdir) == 1
3508 && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
3509 /* subdir is root or cwd, nothing to do */
3510 mem_free(subdir);
3511
3512 return(TRUE);
3513 }
3514
3515 /* Test if subdir exists */
3516 os_file_type_t type;
3517 ibool subdir_exists;
3518 ibool success = os_file_status(subdir, &subdir_exists, &type);
3519
3520 if (success && !subdir_exists) {
3521
3522 /* subdir does not exist, create it */
3523 success = os_file_create_subdirs_if_needed(subdir);
3524
3525 if (!success) {
3526 mem_free(subdir);
3527
3528 return(FALSE);
3529 }
3530
3531 success = os_file_create_directory(subdir, FALSE);
3532 }
3533
3534 mem_free(subdir);
3535
3536 return(success);
3537 }
3538
3539 #ifndef UNIV_HOTBACKUP
3540 /****************************************************************//**
3541 Returns a pointer to the nth slot in the aio array.
3542 @return pointer to slot */
3543 static
3544 os_aio_slot_t*
os_aio_array_get_nth_slot(os_aio_array_t * array,ulint index)3545 os_aio_array_get_nth_slot(
3546 /*======================*/
3547 os_aio_array_t* array, /*!< in: aio array */
3548 ulint index) /*!< in: index of the slot */
3549 {
3550 ut_a(index < array->n_slots);
3551
3552 return(&array->slots[index]);
3553 }
3554
3555 #if defined(LINUX_NATIVE_AIO)
3556 /******************************************************************//**
3557 Creates an io_context for native linux AIO.
3558 @return TRUE on success. */
3559 static
3560 ibool
os_aio_linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)3561 os_aio_linux_create_io_ctx(
3562 /*=======================*/
3563 ulint max_events, /*!< in: number of events. */
3564 io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
3565 {
3566 int ret;
3567 ulint retries = 0;
3568
3569 retry:
3570 memset(io_ctx, 0x0, sizeof(*io_ctx));
3571
3572 /* Initialize the io_ctx. Tell it how many pending
3573 IO requests this context will handle. */
3574
3575 ret = io_setup(max_events, io_ctx);
3576 if (ret == 0) {
3577 #if defined(UNIV_AIO_DEBUG)
3578 fprintf(stderr,
3579 "InnoDB: Linux native AIO:"
3580 " initialized io_ctx for segment\n");
3581 #endif
3582 /* Success. Return now. */
3583 return(TRUE);
3584 }
3585
3586 /* If we hit EAGAIN we'll make a few attempts before failing. */
3587
3588 switch (ret) {
3589 case -EAGAIN:
3590 if (retries == 0) {
3591 /* First time around. */
3592 ut_print_timestamp(stderr);
3593 fprintf(stderr,
3594 " InnoDB: Warning: io_setup() failed"
3595 " with EAGAIN. Will make %d attempts"
3596 " before giving up.\n",
3597 OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3598 }
3599
3600 if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3601 ++retries;
3602 fprintf(stderr,
3603 "InnoDB: Warning: io_setup() attempt"
3604 " %lu failed.\n",
3605 retries);
3606 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3607 goto retry;
3608 }
3609
3610 /* Have tried enough. Better call it a day. */
3611 ut_print_timestamp(stderr);
3612 fprintf(stderr,
3613 " InnoDB: Error: io_setup() failed"
3614 " with EAGAIN after %d attempts.\n",
3615 OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3616 break;
3617
3618 case -ENOSYS:
3619 ut_print_timestamp(stderr);
3620 fprintf(stderr,
3621 " InnoDB: Error: Linux Native AIO interface"
3622 " is not supported on this platform. Please"
3623 " check your OS documentation and install"
3624 " appropriate binary of InnoDB.\n");
3625
3626 break;
3627
3628 default:
3629 ut_print_timestamp(stderr);
3630 fprintf(stderr,
3631 " InnoDB: Error: Linux Native AIO setup"
3632 " returned following error[%d]\n", -ret);
3633 break;
3634 }
3635
3636 fprintf(stderr,
3637 "InnoDB: You can disable Linux Native AIO by"
3638 " setting innodb_use_native_aio = 0 in my.cnf\n");
3639 return(FALSE);
3640 }
3641
3642 /******************************************************************//**
3643 Checks if the system supports native linux aio. On some kernel
3644 versions where native aio is supported it won't work on tmpfs. In such
3645 cases we can't use native aio as it is not possible to mix simulated
3646 and native aio.
3647 @return: TRUE if supported, FALSE otherwise. */
3648 static
3649 ibool
os_aio_native_aio_supported(void)3650 os_aio_native_aio_supported(void)
3651 /*=============================*/
3652 {
3653 int fd;
3654 io_context_t io_ctx;
3655 char name[1000];
3656
3657 if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
3658 /* The platform does not support native aio. */
3659 return(FALSE);
3660 } else if (!srv_read_only_mode) {
3661 /* Now check if tmpdir supports native aio ops. */
3662 fd = innobase_mysql_tmpfile(NULL);
3663
3664 if (fd < 0) {
3665 ib_logf(IB_LOG_LEVEL_WARN,
3666 "Unable to create temp file to check "
3667 "native AIO support.");
3668
3669 return(FALSE);
3670 }
3671 } else {
3672
3673 srv_normalize_path_for_win(srv_log_group_home_dir);
3674
3675 ulint dirnamelen = strlen(srv_log_group_home_dir);
3676 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
3677 memcpy(name, srv_log_group_home_dir, dirnamelen);
3678
3679 /* Add a path separator if needed. */
3680 if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
3681 name[dirnamelen++] = SRV_PATH_SEPARATOR;
3682 }
3683
3684 strcpy(name + dirnamelen, "ib_logfile0");
3685
3686 fd = ::open(name, O_RDONLY);
3687
3688 if (fd == -1) {
3689
3690 ib_logf(IB_LOG_LEVEL_WARN,
3691 "Unable to open \"%s\" to check "
3692 "native AIO read support.", name);
3693
3694 return(FALSE);
3695 }
3696 }
3697
3698 struct io_event io_event;
3699
3700 memset(&io_event, 0x0, sizeof(io_event));
3701
3702 byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
3703 byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
3704
3705 struct iocb iocb;
3706
3707 /* Suppress valgrind warning. */
3708 memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
3709 memset(&iocb, 0x0, sizeof(iocb));
3710
3711 struct iocb* p_iocb = &iocb;
3712
3713 if (!srv_read_only_mode) {
3714 io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
3715 } else {
3716 ut_a(UNIV_PAGE_SIZE >= 512);
3717 io_prep_pread(p_iocb, fd, ptr, 512, 0);
3718 }
3719
3720 int err = io_submit(io_ctx, 1, &p_iocb);
3721
3722 if (err >= 1) {
3723 /* Now collect the submitted IO request. */
3724 err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
3725 }
3726
3727 ut_free(buf);
3728 close(fd);
3729
3730 switch (err) {
3731 case 1:
3732 return(TRUE);
3733
3734 case -EINVAL:
3735 case -ENOSYS:
3736 ib_logf(IB_LOG_LEVEL_ERROR,
3737 "Linux Native AIO not supported. You can either "
3738 "move %s to a file system that supports native "
3739 "AIO or you can set innodb_use_native_aio to "
3740 "FALSE to avoid this message.",
3741 srv_read_only_mode ? name : "tmpdir");
3742
3743 /* fall through. */
3744 default:
3745 ib_logf(IB_LOG_LEVEL_ERROR,
3746 "Linux Native AIO check on %s returned error[%d]",
3747 srv_read_only_mode ? name : "tmpdir", -err);
3748 }
3749
3750 return(FALSE);
3751 }
3752 #endif /* LINUX_NATIVE_AIO */
3753
3754 /******************************************************************//**
3755 Creates an aio wait array. Note that we return NULL in case of failure.
3756 We don't care about freeing memory here because we assume that a
3757 failure will result in server refusing to start up.
3758 @return own: aio array, NULL on failure */
3759 static
3760 os_aio_array_t*
os_aio_array_create(ulint n,ulint n_segments)3761 os_aio_array_create(
3762 /*================*/
3763 ulint n, /*!< in: maximum number of pending aio
3764 operations allowed; n must be
3765 divisible by n_segments */
3766 ulint n_segments) /*!< in: number of segments in the aio array */
3767 {
3768 os_aio_array_t* array;
3769 #ifdef WIN_ASYNC_IO
3770 OVERLAPPED* over;
3771 #elif defined(LINUX_NATIVE_AIO)
3772 struct io_event* io_event = NULL;
3773 #endif /* WIN_ASYNC_IO */
3774 ut_a(n > 0);
3775 ut_a(n_segments > 0);
3776
3777 array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
3778 memset(array, 0x0, sizeof(*array));
3779
3780 array->mutex = os_mutex_create();
3781 array->not_full = os_event_create();
3782 array->is_empty = os_event_create();
3783
3784 os_event_set(array->is_empty);
3785
3786 array->n_slots = n;
3787 array->n_segments = n_segments;
3788
3789 array->slots = static_cast<os_aio_slot_t*>(
3790 ut_malloc(n * sizeof(*array->slots)));
3791
3792 memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots)));
3793 #ifdef __WIN__
3794 array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
3795 #endif /* __WIN__ */
3796
3797 #if defined(LINUX_NATIVE_AIO)
3798 array->aio_ctx = NULL;
3799 array->aio_events = NULL;
3800
3801 /* If we are not using native aio interface then skip this
3802 part of initialization. */
3803 if (!srv_use_native_aio) {
3804 goto skip_native_aio;
3805 }
3806
3807 /* Initialize the io_context array. One io_context
3808 per segment in the array. */
3809
3810 array->aio_ctx = static_cast<io_context**>(
3811 ut_malloc(n_segments * sizeof(*array->aio_ctx)));
3812
3813 for (ulint i = 0; i < n_segments; ++i) {
3814 if (!os_aio_linux_create_io_ctx(n/n_segments,
3815 &array->aio_ctx[i])) {
3816 /* If something bad happened during aio setup
3817 we should call it a day and return right away.
3818 We don't care about any leaks because a failure
3819 to initialize the io subsystem means that the
3820 server (or atleast the innodb storage engine)
3821 is not going to startup. */
3822 return(NULL);
3823 }
3824 }
3825
3826 /* Initialize the event array. One event per slot. */
3827 io_event = static_cast<struct io_event*>(
3828 ut_malloc(n * sizeof(*io_event)));
3829
3830 memset(io_event, 0x0, sizeof(*io_event) * n);
3831 array->aio_events = io_event;
3832
3833 skip_native_aio:
3834 #endif /* LINUX_NATIVE_AIO */
3835 for (ulint i = 0; i < n; i++) {
3836 os_aio_slot_t* slot;
3837
3838 slot = os_aio_array_get_nth_slot(array, i);
3839
3840 slot->pos = i;
3841 slot->reserved = FALSE;
3842 #ifdef WIN_ASYNC_IO
3843 slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
3844
3845 over = &slot->control;
3846
3847 over->hEvent = slot->handle;
3848
3849 array->handles[i] = over->hEvent;
3850
3851 #elif defined(LINUX_NATIVE_AIO)
3852 memset(&slot->control, 0x0, sizeof(slot->control));
3853 slot->n_bytes = 0;
3854 slot->ret = 0;
3855 #endif /* WIN_ASYNC_IO */
3856 }
3857
3858 return(array);
3859 }
3860
3861 /************************************************************************//**
3862 Frees an aio wait array. */
3863 static
3864 void
os_aio_array_free(os_aio_array_t * & array)3865 os_aio_array_free(
3866 /*==============*/
3867 os_aio_array_t*& array) /*!< in, own: array to free */
3868 {
3869 #ifdef WIN_ASYNC_IO
3870 ulint i;
3871
3872 for (i = 0; i < array->n_slots; i++) {
3873 os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3874 CloseHandle(slot->handle);
3875 }
3876 #endif /* WIN_ASYNC_IO */
3877
3878 #ifdef __WIN__
3879 ut_free(array->handles);
3880 #endif /* __WIN__ */
3881 os_mutex_free(array->mutex);
3882 os_event_free(array->not_full);
3883 os_event_free(array->is_empty);
3884
3885 #if defined(LINUX_NATIVE_AIO)
3886 if (srv_use_native_aio) {
3887 ut_free(array->aio_events);
3888 ut_free(array->aio_ctx);
3889 }
3890 #endif /* LINUX_NATIVE_AIO */
3891
3892 ut_free(array->slots);
3893 ut_free(array);
3894
3895 array = 0;
3896 }
3897
3898 /***********************************************************************
3899 Initializes the asynchronous io system. Creates one array each for ibuf
3900 and log i/o. Also creates one array each for read and write where each
3901 array is divided logically into n_read_segs and n_write_segs
3902 respectively. The caller must create an i/o handler thread for each
3903 segment in these arrays. This function also creates the sync array.
3904 No i/o handler thread needs to be created for that */
3905 UNIV_INTERN
3906 ibool
os_aio_init(ulint n_per_seg,ulint n_read_segs,ulint n_write_segs,ulint n_slots_sync)3907 os_aio_init(
3908 /*========*/
3909 ulint n_per_seg, /*<! in: maximum number of pending aio
3910 operations allowed per segment */
3911 ulint n_read_segs, /*<! in: number of reader threads */
3912 ulint n_write_segs, /*<! in: number of writer threads */
3913 ulint n_slots_sync) /*<! in: number of slots in the sync aio
3914 array */
3915 {
3916 os_io_init_simple();
3917
3918 #if defined(LINUX_NATIVE_AIO)
3919 /* Check if native aio is supported on this system and tmpfs */
3920 if (srv_use_native_aio && !os_aio_native_aio_supported()) {
3921
3922 ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
3923
3924 srv_use_native_aio = FALSE;
3925 }
3926 #endif /* LINUX_NATIVE_AIO */
3927
3928 srv_reset_io_thread_op_info();
3929
3930 os_aio_read_array = os_aio_array_create(
3931 n_read_segs * n_per_seg, n_read_segs);
3932
3933 if (os_aio_read_array == NULL) {
3934 return(FALSE);
3935 }
3936
3937 ulint start = (srv_read_only_mode) ? 0 : 2;
3938 ulint n_segs = n_read_segs + start;
3939
3940 /* 0 is the ibuf segment and 1 is the insert buffer segment. */
3941 for (ulint i = start; i < n_segs; ++i) {
3942 ut_a(i < SRV_MAX_N_IO_THREADS);
3943 srv_io_thread_function[i] = "read thread";
3944 }
3945
3946 ulint n_segments = n_read_segs;
3947
3948 if (!srv_read_only_mode) {
3949
3950 os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3951
3952 if (os_aio_log_array == NULL) {
3953 return(FALSE);
3954 }
3955
3956 ++n_segments;
3957
3958 srv_io_thread_function[1] = "log thread";
3959
3960 os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3961
3962 if (os_aio_ibuf_array == NULL) {
3963 return(FALSE);
3964 }
3965
3966 ++n_segments;
3967
3968 srv_io_thread_function[0] = "insert buffer thread";
3969
3970 os_aio_write_array = os_aio_array_create(
3971 n_write_segs * n_per_seg, n_write_segs);
3972
3973 if (os_aio_write_array == NULL) {
3974 return(FALSE);
3975 }
3976
3977 n_segments += n_write_segs;
3978
3979 for (ulint i = start + n_read_segs; i < n_segments; ++i) {
3980 ut_a(i < SRV_MAX_N_IO_THREADS);
3981 srv_io_thread_function[i] = "write thread";
3982 }
3983
3984 ut_ad(n_segments >= 4);
3985 } else {
3986 ut_ad(n_segments > 0);
3987 }
3988
3989 os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3990
3991 if (os_aio_sync_array == NULL) {
3992 return(FALSE);
3993 }
3994
3995 os_aio_n_segments = n_segments;
3996
3997 os_aio_validate();
3998
3999 os_aio_segment_wait_events = static_cast<os_event_t*>(
4000 ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
4001
4002 for (ulint i = 0; i < n_segments; ++i) {
4003 os_aio_segment_wait_events[i] = os_event_create();
4004 }
4005
4006 os_last_printout = ut_time();
4007
4008 return(TRUE);
4009
4010 }
4011
4012 /***********************************************************************
4013 Frees the asynchronous io system. */
4014 UNIV_INTERN
4015 void
os_aio_free(void)4016 os_aio_free(void)
4017 /*=============*/
4018 {
4019 if (os_aio_ibuf_array != 0) {
4020 os_aio_array_free(os_aio_ibuf_array);
4021 }
4022
4023 if (os_aio_log_array != 0) {
4024 os_aio_array_free(os_aio_log_array);
4025 }
4026
4027 if (os_aio_write_array != 0) {
4028 os_aio_array_free(os_aio_write_array);
4029 }
4030
4031 if (os_aio_sync_array != 0) {
4032 os_aio_array_free(os_aio_sync_array);
4033 }
4034
4035 os_aio_array_free(os_aio_read_array);
4036
4037 for (ulint i = 0; i < os_aio_n_segments; i++) {
4038 os_event_free(os_aio_segment_wait_events[i]);
4039 }
4040
4041 ut_free(os_aio_segment_wait_events);
4042 os_aio_segment_wait_events = 0;
4043 os_aio_n_segments = 0;
4044 }
4045
4046 #ifdef WIN_ASYNC_IO
4047 /************************************************************************//**
4048 Wakes up all async i/o threads in the array in Windows async i/o at
4049 shutdown. */
4050 static
4051 void
os_aio_array_wake_win_aio_at_shutdown(os_aio_array_t * array)4052 os_aio_array_wake_win_aio_at_shutdown(
4053 /*==================================*/
4054 os_aio_array_t* array) /*!< in: aio array */
4055 {
4056 ulint i;
4057
4058 for (i = 0; i < array->n_slots; i++) {
4059
4060 SetEvent((array->slots + i)->handle);
4061 }
4062 }
4063 #endif
4064
4065 /************************************************************************//**
4066 Wakes up all async i/o threads so that they know to exit themselves in
4067 shutdown. */
4068 UNIV_INTERN
4069 void
os_aio_wake_all_threads_at_shutdown(void)4070 os_aio_wake_all_threads_at_shutdown(void)
4071 /*=====================================*/
4072 {
4073 #ifdef WIN_ASYNC_IO
4074 /* This code wakes up all ai/o threads in Windows native aio */
4075 os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
4076 if (os_aio_write_array != 0) {
4077 os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
4078 }
4079
4080 if (os_aio_ibuf_array != 0) {
4081 os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
4082 }
4083
4084 if (os_aio_log_array != 0) {
4085 os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
4086 }
4087
4088 #elif defined(LINUX_NATIVE_AIO)
4089
4090 /* When using native AIO interface the io helper threads
4091 wait on io_getevents with a timeout value of 500ms. At
4092 each wake up these threads check the server status.
4093 No need to do anything to wake them up. */
4094
4095 if (srv_use_native_aio) {
4096 return;
4097 }
4098
4099 /* Fall through to simulated AIO handler wakeup if we are
4100 not using native AIO. */
4101 #endif /* !WIN_ASYNC_AIO */
4102
4103 /* This loop wakes up all simulated ai/o threads */
4104
4105 for (ulint i = 0; i < os_aio_n_segments; i++) {
4106
4107 os_event_set(os_aio_segment_wait_events[i]);
4108 }
4109 }
4110
4111 /************************************************************************//**
4112 Waits until there are no pending writes in os_aio_write_array. There can
4113 be other, synchronous, pending writes. */
4114 UNIV_INTERN
4115 void
os_aio_wait_until_no_pending_writes(void)4116 os_aio_wait_until_no_pending_writes(void)
4117 /*=====================================*/
4118 {
4119 ut_ad(!srv_read_only_mode);
4120 os_event_wait(os_aio_write_array->is_empty);
4121 }
4122
4123 /**********************************************************************//**
4124 Calculates segment number for a slot.
4125 @return segment number (which is the number used by, for example,
4126 i/o-handler threads) */
4127 static
4128 ulint
os_aio_get_segment_no_from_slot(os_aio_array_t * array,os_aio_slot_t * slot)4129 os_aio_get_segment_no_from_slot(
4130 /*============================*/
4131 os_aio_array_t* array, /*!< in: aio wait array */
4132 os_aio_slot_t* slot) /*!< in: slot in this array */
4133 {
4134 ulint segment;
4135 ulint seg_len;
4136
4137 if (array == os_aio_ibuf_array) {
4138 ut_ad(!srv_read_only_mode);
4139
4140 segment = IO_IBUF_SEGMENT;
4141
4142 } else if (array == os_aio_log_array) {
4143 ut_ad(!srv_read_only_mode);
4144
4145 segment = IO_LOG_SEGMENT;
4146
4147 } else if (array == os_aio_read_array) {
4148 seg_len = os_aio_read_array->n_slots
4149 / os_aio_read_array->n_segments;
4150
4151 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
4152 } else {
4153 ut_ad(!srv_read_only_mode);
4154 ut_a(array == os_aio_write_array);
4155
4156 seg_len = os_aio_write_array->n_slots
4157 / os_aio_write_array->n_segments;
4158
4159 segment = os_aio_read_array->n_segments + 2
4160 + slot->pos / seg_len;
4161 }
4162
4163 return(segment);
4164 }
4165
4166 /**********************************************************************//**
4167 Calculates local segment number and aio array from global segment number.
4168 @return local segment number within the aio array */
4169 static
4170 ulint
os_aio_get_array_and_local_segment(os_aio_array_t ** array,ulint global_segment)4171 os_aio_get_array_and_local_segment(
4172 /*===============================*/
4173 os_aio_array_t** array, /*!< out: aio wait array */
4174 ulint global_segment)/*!< in: global segment number */
4175 {
4176 ulint segment;
4177
4178 ut_a(global_segment < os_aio_n_segments);
4179
4180 if (srv_read_only_mode) {
4181 *array = os_aio_read_array;
4182
4183 return(global_segment);
4184 } else if (global_segment == IO_IBUF_SEGMENT) {
4185 *array = os_aio_ibuf_array;
4186 segment = 0;
4187
4188 } else if (global_segment == IO_LOG_SEGMENT) {
4189 *array = os_aio_log_array;
4190 segment = 0;
4191
4192 } else if (global_segment < os_aio_read_array->n_segments + 2) {
4193 *array = os_aio_read_array;
4194
4195 segment = global_segment - 2;
4196 } else {
4197 *array = os_aio_write_array;
4198
4199 segment = global_segment - (os_aio_read_array->n_segments + 2);
4200 }
4201
4202 return(segment);
4203 }
4204
/*******************************************************************//**
Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled (waking the simulated aio handler threads
first when native aio is not in use) and then retries.

Once a free slot is found, under the array mutex, it is marked reserved,
the array's reservation counter and its is_empty / not_full events are
updated, and the request parameters are stored in the slot. Depending on
the platform the OVERLAPPED structure (Windows) or the iocb (Linux native
AIO, only when srv_use_native_aio is set) of the slot is prepared as well.
@return pointer to slot */
static
os_aio_slot_t*
os_aio_array_reserve_slot(
/*======================*/
	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
	os_aio_array_t*	array,	/*!< in: aio array */
	fil_node_t*	message1,/*!< in: message to be passed along with
				the aio operation */
	void*		message2,/*!< in: message to be passed along with
				the aio operation */
	pfs_os_file_t	file,	/*!< in: file handle */
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	void*		buf,	/*!< in: buffer where to read or from which
				to write */
	os_offset_t	offset,	/*!< in: file offset */
	ulint		len)	/*!< in: length of the block to read or write */
{
	os_aio_slot_t*	slot = NULL;
#ifdef WIN_ASYNC_IO
	OVERLAPPED*	control;

#elif defined(LINUX_NATIVE_AIO)

	struct iocb*	iocb;
	off_t		aio_offset;

#endif /* WIN_ASYNC_IO */
	ulint		i;
	ulint		counter;
	ulint		slots_per_seg;
	ulint		local_seg;

#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits. */
	ut_a((len & 0xFFFFFFFFUL) == len);
#endif /* WIN_ASYNC_IO */

	/* No need of a mutex. Only reading constant fields */
	slots_per_seg = array->n_slots / array->n_segments;

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO. The extra shift by 6 makes runs of 64
	consecutive (1 << UNIV_PAGE_SIZE_SHIFT)-byte units share a
	segment. */
	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
		% array->n_segments;

loop:
	os_mutex_enter(array->mutex);

	if (array->n_reserved == array->n_slots) {
		/* The array is full: release the mutex before waiting. */
		os_mutex_exit(array->mutex);

		if (!srv_use_native_aio) {
			/* If the handler threads are suspended, wake them
			so that we get more slots */

			os_aio_simulated_wake_handler_threads();
		}

		os_event_wait(array->not_full);

		goto loop;
	}

	/* We start our search for an available slot from our preferred
	local segment and do a full scan of the array. We are
	guaranteed to find a slot in full scan, because the mutex is
	held and n_reserved < n_slots was checked above. */
	for (i = local_seg * slots_per_seg, counter = 0;
	     counter < array->n_slots;
	     i++, counter++) {

		/* Wrap around at the end of the array. */
		i %= array->n_slots;

		slot = os_aio_array_get_nth_slot(array, i);

		if (slot->reserved == FALSE) {
			goto found;
		}
	}

	/* We MUST always be able to get hold of a free slot. */
	ut_error;

found:
	ut_a(slot->reserved == FALSE);
	array->n_reserved++;

	if (array->n_reserved == 1) {
		/* First reservation: the array is no longer empty. */
		os_event_reset(array->is_empty);
	}

	if (array->n_reserved == array->n_slots) {
		/* Last free slot taken: later requests must wait. */
		os_event_reset(array->not_full);
	}

	slot->reserved = TRUE;
	slot->reservation_time = ut_time();
	slot->message1 = message1;
	slot->message2 = message2;
	slot->file     = file;
	slot->name     = name;
	slot->len      = len;
	slot->type     = type;
	slot->buf      = static_cast<byte*>(buf);
	slot->offset   = offset;
	slot->io_already_done = FALSE;

#ifdef WIN_ASYNC_IO
	/* Split the 64-bit file offset into the two 32-bit halves of
	the OVERLAPPED structure and clear the slot's event. */
	control = &slot->control;
	control->Offset = (DWORD) offset & 0xFFFFFFFF;
	control->OffsetHigh = (DWORD) (offset >> 32);
	ResetEvent(slot->handle);

#elif defined(LINUX_NATIVE_AIO)

	/* If we are not using native AIO skip this part. */
	if (!srv_use_native_aio) {
		goto skip_native_aio;
	}

	/* Check if we are dealing with 64 bit arch.
	If not then make sure that offset fits in 32 bits. */
	aio_offset = (off_t) offset;

	ut_a(sizeof(aio_offset) >= sizeof(offset)
	     || ((os_offset_t) aio_offset) == offset);

	iocb = &slot->control;

	if (type == OS_FILE_READ) {
		io_prep_pread(iocb, file.m_file, buf, len, aio_offset);
	} else {
		ut_a(type == OS_FILE_WRITE);
		io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset);
	}

	/* Let the iocb point back to its slot. */
	iocb->data = (void*) slot;
	slot->n_bytes = 0;
	slot->ret = 0;

skip_native_aio:
#endif /* LINUX_NATIVE_AIO */
	os_mutex_exit(array->mutex);

	return(slot);
}
4355
4356 /*******************************************************************//**
4357 Frees a slot in the aio array. */
4358 static
4359 void
os_aio_array_free_slot(os_aio_array_t * array,os_aio_slot_t * slot)4360 os_aio_array_free_slot(
4361 /*===================*/
4362 os_aio_array_t* array, /*!< in: aio array */
4363 os_aio_slot_t* slot) /*!< in: pointer to slot */
4364 {
4365 os_mutex_enter(array->mutex);
4366
4367 ut_ad(slot->reserved);
4368
4369 slot->reserved = FALSE;
4370
4371 array->n_reserved--;
4372
4373 if (array->n_reserved == array->n_slots - 1) {
4374 os_event_set(array->not_full);
4375 }
4376
4377 if (array->n_reserved == 0) {
4378 os_event_set(array->is_empty);
4379 }
4380
4381 #ifdef WIN_ASYNC_IO
4382
4383 ResetEvent(slot->handle);
4384
4385 #elif defined(LINUX_NATIVE_AIO)
4386
4387 if (srv_use_native_aio) {
4388 memset(&slot->control, 0x0, sizeof(slot->control));
4389 slot->n_bytes = 0;
4390 slot->ret = 0;
4391 /*fprintf(stderr, "Freed up Linux native slot.\n");*/
4392 } else {
4393 /* These fields should not be used if we are not
4394 using native AIO. */
4395 ut_ad(slot->n_bytes == 0);
4396 ut_ad(slot->ret == 0);
4397 }
4398
4399 #endif
4400 os_mutex_exit(array->mutex);
4401 }
4402
4403 /**********************************************************************//**
4404 Wakes up a simulated aio i/o-handler thread if it has something to do. */
4405 static
4406 void
os_aio_simulated_wake_handler_thread(ulint global_segment)4407 os_aio_simulated_wake_handler_thread(
4408 /*=================================*/
4409 ulint global_segment) /*!< in: the number of the segment in the aio
4410 arrays */
4411 {
4412 os_aio_array_t* array;
4413 ulint segment;
4414
4415 ut_ad(!srv_use_native_aio);
4416
4417 segment = os_aio_get_array_and_local_segment(&array, global_segment);
4418
4419 ulint n = array->n_slots / array->n_segments;
4420
4421 segment *= n;
4422
4423 /* Look through n slots after the segment * n'th slot */
4424
4425 os_mutex_enter(array->mutex);
4426
4427 for (ulint i = 0; i < n; ++i) {
4428 const os_aio_slot_t* slot;
4429
4430 slot = os_aio_array_get_nth_slot(array, segment + i);
4431
4432 if (slot->reserved) {
4433
4434 /* Found an i/o request */
4435
4436 os_mutex_exit(array->mutex);
4437
4438 os_event_t event;
4439
4440 event = os_aio_segment_wait_events[global_segment];
4441
4442 os_event_set(event);
4443
4444 return;
4445 }
4446 }
4447
4448 os_mutex_exit(array->mutex);
4449 }
4450
4451 /**********************************************************************//**
4452 Wakes up simulated aio i/o-handler threads if they have something to do. */
4453 UNIV_INTERN
4454 void
os_aio_simulated_wake_handler_threads(void)4455 os_aio_simulated_wake_handler_threads(void)
4456 /*=======================================*/
4457 {
4458 if (srv_use_native_aio) {
4459 /* We do not use simulated aio: do nothing */
4460
4461 return;
4462 }
4463
4464 os_aio_recommend_sleep_for_read_threads = FALSE;
4465
4466 for (ulint i = 0; i < os_aio_n_segments; i++) {
4467 os_aio_simulated_wake_handler_thread(i);
4468 }
4469 }
4470
4471 /**********************************************************************//**
4472 This function can be called if one wants to post a batch of reads and
4473 prefers an i/o-handler thread to handle them all at once later. You must
4474 call os_aio_simulated_wake_handler_threads later to ensure the threads
4475 are not left sleeping! */
4476 UNIV_INTERN
4477 void
os_aio_simulated_put_read_threads_to_sleep(void)4478 os_aio_simulated_put_read_threads_to_sleep(void)
4479 /*============================================*/
4480 {
4481
4482 /* The idea of putting background IO threads to sleep is only for
4483 Windows when using simulated AIO. Windows XP seems to schedule
4484 background threads too eagerly to allow for coalescing during
4485 readahead requests. */
4486 #ifdef __WIN__
4487 os_aio_array_t* array;
4488
4489 if (srv_use_native_aio) {
4490 /* We do not use simulated aio: do nothing */
4491
4492 return;
4493 }
4494
4495 os_aio_recommend_sleep_for_read_threads = TRUE;
4496
4497 for (ulint i = 0; i < os_aio_n_segments; i++) {
4498 os_aio_get_array_and_local_segment(&array, i);
4499
4500 if (array == os_aio_read_array) {
4501
4502 os_event_reset(os_aio_segment_wait_events[i]);
4503 }
4504 }
4505 #endif /* __WIN__ */
4506 }
4507
#if defined(LINUX_NATIVE_AIO)
/*******************************************************************//**
Submits the i/o request stored in an already reserved slot to the kernel
with io_submit(). On failure errno is set to the negated return value of
io_submit().
@return TRUE if exactly one request was queued, FALSE otherwise */
static
ibool
os_aio_linux_dispatch(
/*==================*/
	os_aio_array_t*	array,	/*!< in: io request array. */
	os_aio_slot_t*	slot)	/*!< in: an already reserved slot. */
{
	ut_ad(slot != NULL);
	ut_ad(array);

	ut_a(slot->reserved);

	/* The iocb lives inside the slot itself; the io context is picked
	by scaling the slot position down to a segment number. */

	struct iocb*	request = &slot->control;
	const ulint	ctx_no = (slot->pos * array->n_segments)
		/ array->n_slots;

	const int	n_queued = io_submit(
		array->aio_ctx[ctx_no], 1, &request);

#if defined(UNIV_AIO_DEBUG)
	fprintf(stderr,
		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', n_queued, slot,
		array->aio_ctx[ctx_no], (ulong) ctx_no);
#endif

	/* io_submit returns the number of successfully queued requests
	or -errno; anything but 1 is a failure here. */
	if (UNIV_UNLIKELY(n_queued != 1)) {
		errno = -n_queued;
		return(FALSE);
	}

	return(TRUE);
}
#endif /* LINUX_NATIVE_AIO */
4554
4555
4556 /*******************************************************************//**
4557 NOTE! Use the corresponding macro os_aio(), not directly this function!
4558 Requests an asynchronous i/o operation.
4559 @return TRUE if request was queued successfully, FALSE if fail */
4560 UNIV_INTERN
4561 ibool
os_aio_func(ulint type,ulint mode,const char * name,pfs_os_file_t file,void * buf,os_offset_t offset,ulint n,fil_node_t * message1,void * message2)4562 os_aio_func(
4563 /*========*/
4564 ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
4565 ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
4566 to OS_AIO_SIMULATED_WAKE_LATER: the
4567 last flag advises this function not to wake
4568 i/o-handler threads, but the caller will
4569 do the waking explicitly later, in this
4570 way the caller can post several requests in
4571 a batch; NOTE that the batch must not be
4572 so big that it exhausts the slots in aio
4573 arrays! NOTE that a simulated batch
4574 may introduce hidden chances of deadlocks,
4575 because i/os are not actually handled until
4576 all have been posted: use with great
4577 caution! */
4578 const char* name, /*!< in: name of the file or path as a
4579 null-terminated string */
4580 pfs_os_file_t file, /*!< in: handle to a file */
4581 void* buf, /*!< in: buffer where to read or from which
4582 to write */
4583 os_offset_t offset, /*!< in: file offset where to read or write */
4584 ulint n, /*!< in: number of bytes to read or write */
4585 fil_node_t* message1,/*!< in: message for the aio handler
4586 (can be used to identify a completed
4587 aio operation); ignored if mode is
4588 OS_AIO_SYNC */
4589 void* message2)/*!< in: message for the aio handler
4590 (can be used to identify a completed
4591 aio operation); ignored if mode is
4592 OS_AIO_SYNC */
4593 {
4594 os_aio_array_t* array;
4595 os_aio_slot_t* slot;
4596 #ifdef WIN_ASYNC_IO
4597 ibool retval;
4598 BOOL ret = TRUE;
4599 DWORD len = (DWORD) n;
4600 struct fil_node_t* dummy_mess1;
4601 void* dummy_mess2;
4602 ulint dummy_type;
4603 #endif /* WIN_ASYNC_IO */
4604 ulint wake_later;
4605 ut_ad(file.m_file);
4606 ut_ad(buf);
4607 ut_ad(n > 0);
4608 ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
4609 ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
4610 ut_ad(os_aio_validate_skip());
4611 #ifdef WIN_ASYNC_IO
4612 ut_ad((n & 0xFFFFFFFFUL) == n);
4613 #endif
4614
4615 wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
4616 mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
4617
4618 if (mode == OS_AIO_SYNC
4619 #ifdef WIN_ASYNC_IO
4620 && !srv_use_native_aio
4621 #endif /* WIN_ASYNC_IO */
4622 ) {
4623 /* This is actually an ordinary synchronous read or write:
4624 no need to use an i/o-handler thread. NOTE that if we use
4625 Windows async i/o, Windows does not allow us to use
4626 ordinary synchronous os_file_read etc. on the same file,
4627 therefore we have built a special mechanism for synchronous
4628 wait in the Windows case.
4629 Also note that the Performance Schema instrumentation has
4630 been performed by current os_aio_func()'s wrapper function
4631 pfs_os_aio_func(). So we would no longer need to call
4632 Performance Schema instrumented os_file_read() and
4633 os_file_write(). Instead, we should use os_file_read_func()
4634 and os_file_write_func() */
4635
4636 if (type == OS_FILE_READ) {
4637 return(os_file_read_func(file.m_file, buf, offset, n));
4638 }
4639 ut_ad(!srv_read_only_mode);
4640 ut_a(type == OS_FILE_WRITE);
4641 return(os_file_write_func(name, file.m_file, buf, offset, n));
4642 }
4643
4644 try_again:
4645 switch (mode) {
4646 case OS_AIO_NORMAL:
4647 if (type == OS_FILE_READ) {
4648 array = os_aio_read_array;
4649 } else {
4650 ut_ad(!srv_read_only_mode);
4651 array = os_aio_write_array;
4652 }
4653 break;
4654 case OS_AIO_IBUF:
4655 ut_ad(type == OS_FILE_READ);
4656 /* Reduce probability of deadlock bugs in connection with ibuf:
4657 do not let the ibuf i/o handler sleep */
4658
4659 wake_later = FALSE;
4660
4661 if (srv_read_only_mode) {
4662 array = os_aio_read_array;
4663 } else {
4664 array = os_aio_ibuf_array;
4665 }
4666 break;
4667 case OS_AIO_LOG:
4668 if (srv_read_only_mode) {
4669 array = os_aio_read_array;
4670 } else {
4671 array = os_aio_log_array;
4672 }
4673 break;
4674 case OS_AIO_SYNC:
4675 array = os_aio_sync_array;
4676 #if defined(LINUX_NATIVE_AIO)
4677 /* In Linux native AIO we don't use sync IO array. */
4678 ut_a(!srv_use_native_aio);
4679 #endif /* LINUX_NATIVE_AIO */
4680 break;
4681 default:
4682 ut_error;
4683 array = NULL; /* Eliminate compiler warning */
4684 }
4685
4686 slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
4687 name, buf, offset, n);
4688 if (type == OS_FILE_READ) {
4689 if (srv_use_native_aio) {
4690 os_n_file_reads++;
4691 os_bytes_read_since_printout += n;
4692 #ifdef WIN_ASYNC_IO
4693 ret = ReadFile(file.m_file, buf, (DWORD) n, &len,
4694 &(slot->control));
4695 #elif defined(LINUX_NATIVE_AIO)
4696 if (!os_aio_linux_dispatch(array, slot)) {
4697 goto err_exit;
4698 }
4699 #endif /* WIN_ASYNC_IO */
4700 } else {
4701 if (!wake_later) {
4702 os_aio_simulated_wake_handler_thread(
4703 os_aio_get_segment_no_from_slot(
4704 array, slot));
4705 }
4706 }
4707 } else if (type == OS_FILE_WRITE) {
4708 ut_ad(!srv_read_only_mode);
4709 if (srv_use_native_aio) {
4710 os_n_file_writes++;
4711 #ifdef WIN_ASYNC_IO
4712 ret = WriteFile(file.m_file, buf, (DWORD) n, &len,
4713 &(slot->control));
4714 #elif defined(LINUX_NATIVE_AIO)
4715 if (!os_aio_linux_dispatch(array, slot)) {
4716 goto err_exit;
4717 }
4718 #endif /* WIN_ASYNC_IO */
4719 } else {
4720 if (!wake_later) {
4721 os_aio_simulated_wake_handler_thread(
4722 os_aio_get_segment_no_from_slot(
4723 array, slot));
4724 }
4725 }
4726 } else {
4727 ut_error;
4728 }
4729
4730 #ifdef WIN_ASYNC_IO
4731 if (srv_use_native_aio) {
4732 if ((ret && len == n)
4733 || (!ret && GetLastError() == ERROR_IO_PENDING)) {
4734 /* aio was queued successfully! */
4735
4736 if (mode == OS_AIO_SYNC) {
4737 /* We want a synchronous i/o operation on a
4738 file where we also use async i/o: in Windows
4739 we must use the same wait mechanism as for
4740 async i/o */
4741
4742 retval = os_aio_windows_handle(
4743 ULINT_UNDEFINED, slot->pos,
4744 &dummy_mess1, &dummy_mess2,
4745 &dummy_type);
4746
4747 return(retval);
4748 }
4749
4750 return(TRUE);
4751 }
4752
4753 goto err_exit;
4754 }
4755 #endif /* WIN_ASYNC_IO */
4756 /* aio was queued successfully! */
4757 return(TRUE);
4758
4759 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4760 err_exit:
4761 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
4762 os_aio_array_free_slot(array, slot);
4763
4764 if (os_file_handle_error(
4765 name,type == OS_FILE_READ ? "aio read" : "aio write")) {
4766
4767 goto try_again;
4768 }
4769
4770 return(FALSE);
4771 }
4772
#ifdef WIN_ASYNC_IO
/**********************************************************************//**
This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. If the operation failed and os_file_handle_error() asks for a retry,
the request is re-issued and waited for here before returning.
NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@return TRUE if the aio operation succeeded */
UNIV_INTERN
ibool
os_aio_windows_handle(
/*==================*/
	ulint	segment,	/*!< in: the number of the segment in the aio
				arrays to wait for; segment 0 is the ibuf
				i/o thread, segment 1 the log i/o thread,
				then follow the non-ibuf read threads, and as
				the last are the non-ibuf write threads; if
				this is ULINT_UNDEFINED, then it means that
				sync aio is used, and this parameter is
				ignored */
	ulint	pos,		/*!< this parameter is used only in sync aio:
				wait for the aio slot at this position */
	fil_node_t**message1,	/*!< out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2,	/*!< out: second message passed with the
				aio request, see message1 */
	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ; not
				written on the shutdown early-return path */
{
	ulint		orig_seg	= segment;
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	ulint		n;
	ulint		i;
	ibool		ret_val;
	BOOL		ret;
	DWORD		len;
	BOOL		retry		= FALSE;

	if (segment == ULINT_UNDEFINED) {
		segment = 0;
		array = os_aio_sync_array;
	} else {
		segment = os_aio_get_array_and_local_segment(&array, segment);
	}

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());
	ut_ad(segment < array->n_segments);

	/* Number of slots per segment */
	n = array->n_slots / array->n_segments;

	if (array == os_aio_sync_array) {

		/* Sync aio: wait for the one slot the caller named. */
		WaitForSingleObject(
			os_aio_array_get_nth_slot(array, pos)->handle,
			INFINITE);

		i = pos;

	} else {
		if (orig_seg != ULINT_UNDEFINED) {
			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
		}

		/* Wait until any one of the n handles of this segment
		is signalled. */
		i = WaitForMultipleObjects(
			(DWORD) n, array->handles + segment * n,
			FALSE, INFINITE);
	}

	os_mutex_enter(array->mutex);

	/* At shutdown with nothing reserved there is no request to
	report: return with NULL messages. This check must precede the
	index assertion below. */
	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && array->n_reserved == 0) {
		*message1 = NULL;
		*message2 = NULL;
		os_mutex_exit(array->mutex);
		return(TRUE);
	}

	/* A valid index lies in WAIT_OBJECT_0 .. WAIT_OBJECT_0 + n - 1.
	The upper bound is exclusive: i == n would address the first slot
	of the following segment. */
	ut_a(i >= WAIT_OBJECT_0 && i < WAIT_OBJECT_0 + n);

	slot = os_aio_array_get_nth_slot(array, i + segment * n);

	ut_a(slot->reserved);

	if (orig_seg != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(
			orig_seg, "get windows aio return value");
	}
	ret = GetOverlappedResult(slot->file.m_file, &(slot->control), &len, TRUE);

	*message1 = slot->message1;
	*message2 = slot->message2;

	*type = slot->type;

	if (ret && len == slot->len) {

		ret_val = TRUE;
	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		retry = TRUE;
	} else {

		ret_val = FALSE;
	}

	os_mutex_exit(array->mutex);

	if (retry) {
		/* Retry the failed read/write operation and wait for it
		right here. No need to hold array->mutex. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		struct PSI_file_locker* locker = NULL;
		PSI_file_locker_state state;
		register_pfs_file_io_begin(&state, locker, slot->file, slot->len,
					   (slot->type == OS_FILE_WRITE)
						? PSI_FILE_WRITE
						: PSI_FILE_READ,
					    __FILE__, __LINE__);
#endif

		/* The length must fit in the DWORD passed below. */
		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		switch (slot->type) {
		case OS_FILE_WRITE:
			ret = WriteFile(slot->file.m_file, slot->buf,
					(DWORD) slot->len, &len,
					&(slot->control));
			break;
		case OS_FILE_READ:
			ret = ReadFile(slot->file.m_file, slot->buf,
				       (DWORD) slot->len, &len,
				       &(slot->control));
			break;
		default:
			ut_error;
		}

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, len);
#endif

		if (!ret && GetLastError() == ERROR_IO_PENDING) {
			/* aio was queued successfully!
			We want a synchronous i/o operation on a
			file where we also use async i/o: in Windows
			we must use the same wait mechanism as for
			async i/o */
			ret = GetOverlappedResult(slot->file.m_file,
						  &(slot->control),
						  &len, TRUE);
		}

		ret_val = ret && len == slot->len;
	}

	os_aio_array_free_slot(array, slot);

	return(ret_val);
}
#endif
4944
4945 #if defined(LINUX_NATIVE_AIO)
4946 /******************************************************************//**
4947 This function is only used in Linux native asynchronous i/o. This is
4948 called from within the io-thread. If there are no completed IO requests
4949 in the slot array, the thread calls this function to collect more
4950 requests from the kernel.
4951 The io-thread waits on io_getevents(), which is a blocking call, with
4952 a timeout value. Unless the system is very heavy loaded, keeping the
4953 io-thread very busy, the io-thread will spend most of its time waiting
4954 in this function.
4955 The io-thread also exits in this function. It checks server status at
4956 each wakeup and that is why we use timed wait in io_getevents(). */
4957 static
4958 void
os_aio_linux_collect(os_aio_array_t * array,ulint segment,ulint seg_size)4959 os_aio_linux_collect(
4960 /*=================*/
4961 os_aio_array_t* array, /*!< in/out: slot array. */
4962 ulint segment, /*!< in: local segment no. */
4963 ulint seg_size) /*!< in: segment size. */
4964 {
4965 int i;
4966 int ret;
4967 ulint start_pos;
4968 ulint end_pos;
4969 struct timespec timeout;
4970 struct io_event* events;
4971 struct io_context* io_ctx;
4972
4973 /* sanity checks. */
4974 ut_ad(array != NULL);
4975 ut_ad(seg_size > 0);
4976 ut_ad(segment < array->n_segments);
4977
4978 /* Which part of event array we are going to work on. */
4979 events = &array->aio_events[segment * seg_size];
4980
4981 /* Which io_context we are going to use. */
4982 io_ctx = array->aio_ctx[segment];
4983
4984 /* Starting point of the segment we will be working on. */
4985 start_pos = segment * seg_size;
4986
4987 /* End point. */
4988 end_pos = start_pos + seg_size;
4989
4990 retry:
4991
4992 /* Initialize the events. The timeout value is arbitrary.
4993 We probably need to experiment with it a little. */
4994 memset(events, 0, sizeof(*events) * seg_size);
4995 timeout.tv_sec = 0;
4996 timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4997
4998 ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4999
5000 if (ret > 0) {
5001 for (i = 0; i < ret; i++) {
5002 os_aio_slot_t* slot;
5003 struct iocb* control;
5004
5005 control = (struct iocb*) events[i].obj;
5006 ut_a(control != NULL);
5007
5008 slot = (os_aio_slot_t*) control->data;
5009
5010 /* Some sanity checks. */
5011 ut_a(slot != NULL);
5012 ut_a(slot->reserved);
5013
5014 #if defined(UNIV_AIO_DEBUG)
5015 fprintf(stderr,
5016 "io_getevents[%c]: slot[%p] ctx[%p]"
5017 " seg[%lu]\n",
5018 (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
5019 slot, io_ctx, segment);
5020 #endif
5021
5022 /* We are not scribbling previous segment. */
5023 ut_a(slot->pos >= start_pos);
5024
5025 /* We have not overstepped to next segment. */
5026 ut_a(slot->pos < end_pos);
5027
5028 /* Mark this request as completed. The error handling
5029 will be done in the calling function. */
5030 os_mutex_enter(array->mutex);
5031 slot->n_bytes = events[i].res;
5032 slot->ret = events[i].res2;
5033 slot->io_already_done = TRUE;
5034 os_mutex_exit(array->mutex);
5035 }
5036 return;
5037 }
5038
5039 if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
5040 return;
5041 }
5042
5043 /* This error handling is for any error in collecting the
5044 IO requests. The errors, if any, for any particular IO
5045 request are simply passed on to the calling routine. */
5046
5047 switch (ret) {
5048 case -EAGAIN:
5049 /* Not enough resources! Try again. */
5050 case -EINTR:
5051 /* Interrupted! I have tested the behaviour in case of an
5052 interrupt. If we have some completed IOs available then
5053 the return code will be the number of IOs. We get EINTR only
5054 if there are no completed IOs and we have been interrupted. */
5055 case 0:
5056 /* No pending request! Go back and check again. */
5057 goto retry;
5058 }
5059
5060 /* All other errors should cause a trap for now. */
5061 ut_print_timestamp(stderr);
5062 fprintf(stderr,
5063 " InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
5064 ret);
5065 ut_error;
5066 }
5067
5068 /**********************************************************************//**
5069 This function is only used in Linux native asynchronous i/o.
5070 Waits for an aio operation to complete. This function is used to wait for
5071 the completed requests. The aio array of pending requests is divided
5072 into segments. The thread specifies which segment or slot it wants to wait
5073 for. NOTE: this function will also take care of freeing the aio slot,
5074 therefore no other thread is allowed to do the freeing!
5075 @return TRUE if the IO was successful */
5076 UNIV_INTERN
5077 ibool
os_aio_linux_handle(ulint global_seg,fil_node_t ** message1,void ** message2,ulint * type)5078 os_aio_linux_handle(
5079 /*================*/
5080 ulint global_seg, /*!< in: segment number in the aio array
5081 to wait for; segment 0 is the ibuf
5082 i/o thread, segment 1 is log i/o thread,
5083 then follow the non-ibuf read threads,
5084 and the last are the non-ibuf write
5085 threads. */
5086 fil_node_t**message1, /*!< out: the messages passed with the */
5087 void** message2, /*!< aio request; note that in case the
5088 aio operation failed, these output
5089 parameters are valid and can be used to
5090 restart the operation. */
5091 ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
5092 {
5093 ulint segment;
5094 os_aio_array_t* array;
5095 os_aio_slot_t* slot;
5096 ulint n;
5097 ulint i;
5098 ibool ret = FALSE;
5099
5100 /* Should never be doing Sync IO here. */
5101 ut_a(global_seg != ULINT_UNDEFINED);
5102
5103 /* Find the array and the local segment. */
5104 segment = os_aio_get_array_and_local_segment(&array, global_seg);
5105 n = array->n_slots / array->n_segments;
5106
5107 /* Loop until we have found a completed request. */
5108 for (;;) {
5109 ibool any_reserved = FALSE;
5110 os_mutex_enter(array->mutex);
5111 for (i = 0; i < n; ++i) {
5112 slot = os_aio_array_get_nth_slot(
5113 array, i + segment * n);
5114 if (!slot->reserved) {
5115 continue;
5116 } else if (slot->io_already_done) {
5117 /* Something for us to work on. */
5118 goto found;
5119 } else {
5120 any_reserved = TRUE;
5121 }
5122 }
5123
5124 os_mutex_exit(array->mutex);
5125
5126 /* There is no completed request.
5127 If there is no pending request at all,
5128 and the system is being shut down, exit. */
5129 if (UNIV_UNLIKELY
5130 (!any_reserved
5131 && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
5132 *message1 = NULL;
5133 *message2 = NULL;
5134 return(TRUE);
5135 }
5136
5137 /* Wait for some request. Note that we return
5138 from wait iff we have found a request. */
5139
5140 srv_set_io_thread_op_info(global_seg,
5141 "waiting for completed aio requests");
5142 os_aio_linux_collect(array, segment, n);
5143 }
5144
5145 found:
5146 /* Note that it may be that there are more then one completed
5147 IO requests. We process them one at a time. We may have a case
5148 here to improve the performance slightly by dealing with all
5149 requests in one sweep. */
5150 srv_set_io_thread_op_info(global_seg,
5151 "processing completed aio requests");
5152
5153 /* Ensure that we are scribbling only our segment. */
5154 ut_a(i < n);
5155
5156 ut_ad(slot != NULL);
5157 ut_ad(slot->reserved);
5158 ut_ad(slot->io_already_done);
5159
5160 *message1 = slot->message1;
5161 *message2 = slot->message2;
5162
5163 *type = slot->type;
5164
5165 if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
5166
5167 ret = TRUE;
5168 } else {
5169 errno = -slot->ret;
5170
5171 /* os_file_handle_error does tell us if we should retry
5172 this IO. As it stands now, we don't do this retry when
5173 reaping requests from a different context than
5174 the dispatcher. This non-retry logic is the same for
5175 windows and linux native AIO.
5176 We should probably look into this to transparently
5177 re-submit the IO. */
5178 os_file_handle_error(slot->name, "Linux aio");
5179
5180 ret = FALSE;
5181 }
5182
5183 os_mutex_exit(array->mutex);
5184
5185 os_aio_array_free_slot(array, slot);
5186
5187 return(ret);
5188 }
5189 #endif /* LINUX_NATIVE_AIO */
5190
5191 /**********************************************************************//**
5192 Does simulated aio. This function should be called by an i/o-handler
5193 thread.
5194 @return TRUE if the aio operation succeeded */
5195 UNIV_INTERN
5196 ibool
os_aio_simulated_handle(ulint global_segment,fil_node_t ** message1,void ** message2,ulint * type)5197 os_aio_simulated_handle(
5198 /*====================*/
5199 ulint global_segment, /*!< in: the number of the segment in the aio
5200 arrays to wait for; segment 0 is the ibuf
5201 i/o thread, segment 1 the log i/o thread,
5202 then follow the non-ibuf read threads, and as
5203 the last are the non-ibuf write threads */
5204 fil_node_t**message1, /*!< out: the messages passed with the aio
5205 request; note that also in the case where
5206 the aio operation failed, these output
5207 parameters are valid and can be used to
5208 restart the operation, for example */
5209 void** message2,
5210 ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
5211 {
5212 os_aio_array_t* array;
5213 ulint segment;
5214 os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
5215 ulint n_consecutive;
5216 ulint total_len;
5217 ulint offs;
5218 os_offset_t lowest_offset;
5219 ulint biggest_age;
5220 ulint age;
5221 byte* combined_buf;
5222 byte* combined_buf2;
5223 ibool ret;
5224 ibool any_reserved;
5225 ulint n;
5226 os_aio_slot_t* aio_slot;
5227
5228 /* Fix compiler warning */
5229 *consecutive_ios = NULL;
5230
5231 segment = os_aio_get_array_and_local_segment(&array, global_segment);
5232
5233 restart:
5234 /* NOTE! We only access constant fields in os_aio_array. Therefore
5235 we do not have to acquire the protecting mutex yet */
5236
5237 srv_set_io_thread_op_info(global_segment,
5238 "looking for i/o requests (a)");
5239 ut_ad(os_aio_validate_skip());
5240 ut_ad(segment < array->n_segments);
5241
5242 n = array->n_slots / array->n_segments;
5243
5244 /* Look through n slots after the segment * n'th slot */
5245
5246 if (array == os_aio_read_array
5247 && os_aio_recommend_sleep_for_read_threads) {
5248
5249 /* Give other threads chance to add several i/os to the array
5250 at once. */
5251
5252 goto recommended_sleep;
5253 }
5254
5255 srv_set_io_thread_op_info(global_segment,
5256 "looking for i/o requests (b)");
5257
5258 /* Check if there is a slot for which the i/o has already been
5259 done */
5260 any_reserved = FALSE;
5261
5262 os_mutex_enter(array->mutex);
5263
5264 for (ulint i = 0; i < n; i++) {
5265 os_aio_slot_t* slot;
5266
5267 slot = os_aio_array_get_nth_slot(array, i + segment * n);
5268
5269 if (!slot->reserved) {
5270 continue;
5271 } else if (slot->io_already_done) {
5272
5273 if (os_aio_print_debug) {
5274 fprintf(stderr,
5275 "InnoDB: i/o for slot %lu"
5276 " already done, returning\n",
5277 (ulong) i);
5278 }
5279
5280 aio_slot = slot;
5281 ret = TRUE;
5282 goto slot_io_done;
5283 } else {
5284 any_reserved = TRUE;
5285 }
5286 }
5287
5288 /* There is no completed request.
5289 If there is no pending request at all,
5290 and the system is being shut down, exit. */
5291 if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
5292 os_mutex_exit(array->mutex);
5293 *message1 = NULL;
5294 *message2 = NULL;
5295 return(TRUE);
5296 }
5297
5298 n_consecutive = 0;
5299
5300 /* If there are at least 2 seconds old requests, then pick the oldest
5301 one to prevent starvation. If several requests have the same age,
5302 then pick the one at the lowest offset. */
5303
5304 biggest_age = 0;
5305 lowest_offset = IB_UINT64_MAX;
5306
5307 for (ulint i = 0; i < n; i++) {
5308 os_aio_slot_t* slot;
5309
5310 slot = os_aio_array_get_nth_slot(array, i + segment * n);
5311
5312 if (slot->reserved) {
5313
5314 age = (ulint) difftime(
5315 ut_time(), slot->reservation_time);
5316
5317 if ((age >= 2 && age > biggest_age)
5318 || (age >= 2 && age == biggest_age
5319 && slot->offset < lowest_offset)) {
5320
5321 /* Found an i/o request */
5322 consecutive_ios[0] = slot;
5323
5324 n_consecutive = 1;
5325
5326 biggest_age = age;
5327 lowest_offset = slot->offset;
5328 }
5329 }
5330 }
5331
5332 if (n_consecutive == 0) {
5333 /* There were no old requests. Look for an i/o request at the
5334 lowest offset in the array (we ignore the high 32 bits of the
5335 offset in these heuristics) */
5336
5337 lowest_offset = IB_UINT64_MAX;
5338
5339 for (ulint i = 0; i < n; i++) {
5340 os_aio_slot_t* slot;
5341
5342 slot = os_aio_array_get_nth_slot(
5343 array, i + segment * n);
5344
5345 if (slot->reserved && slot->offset < lowest_offset) {
5346
5347 /* Found an i/o request */
5348 consecutive_ios[0] = slot;
5349
5350 n_consecutive = 1;
5351
5352 lowest_offset = slot->offset;
5353 }
5354 }
5355 }
5356
5357 if (n_consecutive == 0) {
5358
5359 /* No i/o requested at the moment */
5360
5361 goto wait_for_io;
5362 }
5363
5364 /* if n_consecutive != 0, then we have assigned
5365 something valid to consecutive_ios[0] */
5366 ut_ad(n_consecutive != 0);
5367 ut_ad(consecutive_ios[0] != NULL);
5368
5369 aio_slot = consecutive_ios[0];
5370
5371 /* Check if there are several consecutive blocks to read or write */
5372
5373 consecutive_loop:
5374 for (ulint i = 0; i < n; i++) {
5375 os_aio_slot_t* slot;
5376
5377 slot = os_aio_array_get_nth_slot(array, i + segment * n);
5378 if (slot->reserved
5379 && slot != aio_slot
5380 && slot->offset == aio_slot->offset + aio_slot->len
5381 && slot->type == aio_slot->type
5382 && slot->file.m_file == aio_slot->file.m_file) {
5383
5384 /* Found a consecutive i/o request */
5385
5386 consecutive_ios[n_consecutive] = slot;
5387 n_consecutive++;
5388
5389 aio_slot = slot;
5390
5391 if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
5392
5393 goto consecutive_loop;
5394 } else {
5395 break;
5396 }
5397 }
5398 }
5399
5400 srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
5401
5402 /* We have now collected n_consecutive i/o requests in the array;
5403 allocate a single buffer which can hold all data, and perform the
5404 i/o */
5405
5406 total_len = 0;
5407 aio_slot = consecutive_ios[0];
5408
5409 for (ulint i = 0; i < n_consecutive; i++) {
5410 total_len += consecutive_ios[i]->len;
5411 }
5412
5413 if (n_consecutive == 1) {
5414 /* We can use the buffer of the i/o request */
5415 combined_buf = aio_slot->buf;
5416 combined_buf2 = NULL;
5417 } else {
5418 combined_buf2 = static_cast<byte*>(
5419 ut_malloc(total_len + UNIV_PAGE_SIZE));
5420
5421 ut_a(combined_buf2);
5422
5423 combined_buf = static_cast<byte*>(
5424 ut_align(combined_buf2, UNIV_PAGE_SIZE));
5425 }
5426
5427 /* We release the array mutex for the time of the i/o: NOTE that
5428 this assumes that there is just one i/o-handler thread serving
5429 a single segment of slots! */
5430
5431 os_mutex_exit(array->mutex);
5432
5433 if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
5434 /* Copy the buffers to the combined buffer */
5435 offs = 0;
5436
5437 for (ulint i = 0; i < n_consecutive; i++) {
5438
5439 ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
5440 consecutive_ios[i]->len);
5441
5442 offs += consecutive_ios[i]->len;
5443 }
5444 }
5445
5446 srv_set_io_thread_op_info(global_segment, "doing file i/o");
5447
5448 /* Do the i/o with ordinary, synchronous i/o functions: */
5449 if (aio_slot->type == OS_FILE_WRITE) {
5450 ut_ad(!srv_read_only_mode);
5451 ret = os_file_write(
5452 aio_slot->name, aio_slot->file, combined_buf,
5453 aio_slot->offset, total_len);
5454 } else {
5455 ret = os_file_read(
5456 aio_slot->file, combined_buf,
5457 aio_slot->offset, total_len);
5458 }
5459
5460 ut_a(ret);
5461 srv_set_io_thread_op_info(global_segment, "file i/o done");
5462
5463 if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
5464 /* Copy the combined buffer to individual buffers */
5465 offs = 0;
5466
5467 for (ulint i = 0; i < n_consecutive; i++) {
5468
5469 ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
5470 consecutive_ios[i]->len);
5471 offs += consecutive_ios[i]->len;
5472 }
5473 }
5474
5475 if (combined_buf2) {
5476 ut_free(combined_buf2);
5477 }
5478
5479 os_mutex_enter(array->mutex);
5480
5481 /* Mark the i/os done in slots */
5482
5483 for (ulint i = 0; i < n_consecutive; i++) {
5484 consecutive_ios[i]->io_already_done = TRUE;
5485 }
5486
5487 /* We return the messages for the first slot now, and if there were
5488 several slots, the messages will be returned with subsequent calls
5489 of this function */
5490
5491 slot_io_done:
5492
5493 ut_a(aio_slot->reserved);
5494
5495 *message1 = aio_slot->message1;
5496 *message2 = aio_slot->message2;
5497
5498 *type = aio_slot->type;
5499
5500 os_mutex_exit(array->mutex);
5501
5502 os_aio_array_free_slot(array, aio_slot);
5503
5504 return(ret);
5505
5506 wait_for_io:
5507 srv_set_io_thread_op_info(global_segment, "resetting wait event");
5508
5509 /* We wait here until there again can be i/os in the segment
5510 of this thread */
5511
5512 os_event_reset(os_aio_segment_wait_events[global_segment]);
5513
5514 os_mutex_exit(array->mutex);
5515
5516 recommended_sleep:
5517 srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
5518
5519 os_event_wait(os_aio_segment_wait_events[global_segment]);
5520
5521 goto restart;
5522 }
5523
5524 /**********************************************************************//**
5525 Validates the consistency of an aio array.
5526 @return true if ok */
5527 static
5528 bool
os_aio_array_validate(os_aio_array_t * array)5529 os_aio_array_validate(
5530 /*==================*/
5531 os_aio_array_t* array) /*!< in: aio wait array */
5532 {
5533 ulint i;
5534 ulint n_reserved = 0;
5535
5536 os_mutex_enter(array->mutex);
5537
5538 ut_a(array->n_slots > 0);
5539 ut_a(array->n_segments > 0);
5540
5541 for (i = 0; i < array->n_slots; i++) {
5542 os_aio_slot_t* slot;
5543
5544 slot = os_aio_array_get_nth_slot(array, i);
5545
5546 if (slot->reserved) {
5547 n_reserved++;
5548 ut_a(slot->len > 0);
5549 }
5550 }
5551
5552 ut_a(array->n_reserved == n_reserved);
5553
5554 os_mutex_exit(array->mutex);
5555
5556 return(true);
5557 }
5558
5559 /**********************************************************************//**
5560 Validates the consistency the aio system.
5561 @return TRUE if ok */
5562 UNIV_INTERN
5563 ibool
os_aio_validate(void)5564 os_aio_validate(void)
5565 /*=================*/
5566 {
5567 os_aio_array_validate(os_aio_read_array);
5568
5569 if (os_aio_write_array != 0) {
5570 os_aio_array_validate(os_aio_write_array);
5571 }
5572
5573 if (os_aio_ibuf_array != 0) {
5574 os_aio_array_validate(os_aio_ibuf_array);
5575 }
5576
5577 if (os_aio_log_array != 0) {
5578 os_aio_array_validate(os_aio_log_array);
5579 }
5580
5581 if (os_aio_sync_array != 0) {
5582 os_aio_array_validate(os_aio_sync_array);
5583 }
5584
5585 return(TRUE);
5586 }
5587
5588 /**********************************************************************//**
5589 Prints pending IO requests per segment of an aio array.
5590 We probably don't need per segment statistics but they can help us
5591 during development phase to see if the IO requests are being
5592 distributed as expected. */
5593 static
5594 void
os_aio_print_segment_info(FILE * file,ulint * n_seg,os_aio_array_t * array)5595 os_aio_print_segment_info(
5596 /*======================*/
5597 FILE* file, /*!< in: file where to print */
5598 ulint* n_seg, /*!< in: pending IO array */
5599 os_aio_array_t* array) /*!< in: array to process */
5600 {
5601 ulint i;
5602
5603 ut_ad(array);
5604 ut_ad(n_seg);
5605 ut_ad(array->n_segments > 0);
5606
5607 if (array->n_segments == 1) {
5608 return;
5609 }
5610
5611 fprintf(file, " [");
5612 for (i = 0; i < array->n_segments; i++) {
5613 if (i != 0) {
5614 fprintf(file, ", ");
5615 }
5616
5617 fprintf(file, "%lu", n_seg[i]);
5618 }
5619 fprintf(file, "] ");
5620 }
5621
5622 /**********************************************************************//**
5623 Prints info about the aio array. */
5624 UNIV_INTERN
5625 void
os_aio_print_array(FILE * file,os_aio_array_t * array)5626 os_aio_print_array(
5627 /*==============*/
5628 FILE* file, /*!< in: file where to print */
5629 os_aio_array_t* array) /*!< in: aio array to print */
5630 {
5631 ulint n_reserved = 0;
5632 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
5633
5634 os_mutex_enter(array->mutex);
5635
5636 ut_a(array->n_slots > 0);
5637 ut_a(array->n_segments > 0);
5638
5639 memset(n_res_seg, 0x0, sizeof(n_res_seg));
5640
5641 for (ulint i = 0; i < array->n_slots; ++i) {
5642 os_aio_slot_t* slot;
5643 ulint seg_no;
5644
5645 slot = os_aio_array_get_nth_slot(array, i);
5646
5647 seg_no = (i * array->n_segments) / array->n_slots;
5648
5649 if (slot->reserved) {
5650 ++n_reserved;
5651 ++n_res_seg[seg_no];
5652
5653 ut_a(slot->len > 0);
5654 }
5655 }
5656
5657 ut_a(array->n_reserved == n_reserved);
5658
5659 fprintf(file, " %lu", (ulong) n_reserved);
5660
5661 os_aio_print_segment_info(file, n_res_seg, array);
5662
5663 os_mutex_exit(array->mutex);
5664 }
5665
5666 /**********************************************************************//**
5667 Prints info of the aio arrays. */
5668 UNIV_INTERN
5669 void
os_aio_print(FILE * file)5670 os_aio_print(
5671 /*=========*/
5672 FILE* file) /*!< in: file where to print */
5673 {
5674 time_t current_time;
5675 double time_elapsed;
5676 double avg_bytes_read;
5677
5678 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
5679 fprintf(file, "I/O thread %lu state: %s (%s)",
5680 (ulong) i,
5681 srv_io_thread_op_info[i],
5682 srv_io_thread_function[i]);
5683
5684 #ifndef __WIN__
5685 if (os_aio_segment_wait_events[i]->is_set) {
5686 fprintf(file, " ev set");
5687 }
5688 #endif /* __WIN__ */
5689
5690 fprintf(file, "\n");
5691 }
5692
5693 fputs("Pending normal aio reads:", file);
5694
5695 os_aio_print_array(file, os_aio_read_array);
5696
5697 if (os_aio_write_array != 0) {
5698 fputs(", aio writes:", file);
5699 os_aio_print_array(file, os_aio_write_array);
5700 }
5701
5702 if (os_aio_ibuf_array != 0) {
5703 fputs(",\n ibuf aio reads:", file);
5704 os_aio_print_array(file, os_aio_ibuf_array);
5705 }
5706
5707 if (os_aio_log_array != 0) {
5708 fputs(", log i/o's:", file);
5709 os_aio_print_array(file, os_aio_log_array);
5710 }
5711
5712 if (os_aio_sync_array != 0) {
5713 fputs(", sync i/o's:", file);
5714 os_aio_print_array(file, os_aio_sync_array);
5715 }
5716
5717 putc('\n', file);
5718 current_time = ut_time();
5719 time_elapsed = 0.001 + difftime(current_time, os_last_printout);
5720
5721 fprintf(file,
5722 "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
5723 "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
5724 (ulong) fil_n_pending_log_flushes,
5725 (ulong) fil_n_pending_tablespace_flushes,
5726 (ulong) os_n_file_reads,
5727 (ulong) os_n_file_writes,
5728 (ulong) os_n_fsyncs);
5729
5730 if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
5731 fprintf(file,
5732 "%lu pending preads, %lu pending pwrites\n",
5733 (ulong) os_file_n_pending_preads,
5734 (ulong) os_file_n_pending_pwrites);
5735 }
5736
5737 if (os_n_file_reads == os_n_file_reads_old) {
5738 avg_bytes_read = 0.0;
5739 } else {
5740 avg_bytes_read = (double) os_bytes_read_since_printout
5741 / (os_n_file_reads - os_n_file_reads_old);
5742 }
5743
5744 fprintf(file,
5745 "%.2f reads/s, %lu avg bytes/read,"
5746 " %.2f writes/s, %.2f fsyncs/s\n",
5747 (os_n_file_reads - os_n_file_reads_old)
5748 / time_elapsed,
5749 (ulong) avg_bytes_read,
5750 (os_n_file_writes - os_n_file_writes_old)
5751 / time_elapsed,
5752 (os_n_fsyncs - os_n_fsyncs_old)
5753 / time_elapsed);
5754
5755 os_n_file_reads_old = os_n_file_reads;
5756 os_n_file_writes_old = os_n_file_writes;
5757 os_n_fsyncs_old = os_n_fsyncs;
5758 os_bytes_read_since_printout = 0;
5759
5760 os_last_printout = current_time;
5761 }
5762
5763 /**********************************************************************//**
5764 Refreshes the statistics used to print per-second averages. */
5765 UNIV_INTERN
5766 void
os_aio_refresh_stats(void)5767 os_aio_refresh_stats(void)
5768 /*======================*/
5769 {
5770 os_n_file_reads_old = os_n_file_reads;
5771 os_n_file_writes_old = os_n_file_writes;
5772 os_n_fsyncs_old = os_n_fsyncs;
5773 os_bytes_read_since_printout = 0;
5774
5775 os_last_printout = time(NULL);
5776 }
5777
5778 #ifdef UNIV_DEBUG
5779 /**********************************************************************//**
5780 Checks that all slots in the system have been freed, that is, there are
5781 no pending io operations.
5782 @return TRUE if all free */
5783 UNIV_INTERN
5784 ibool
os_aio_all_slots_free(void)5785 os_aio_all_slots_free(void)
5786 /*=======================*/
5787 {
5788 os_aio_array_t* array;
5789 ulint n_res = 0;
5790
5791 array = os_aio_read_array;
5792
5793 os_mutex_enter(array->mutex);
5794
5795 n_res += array->n_reserved;
5796
5797 os_mutex_exit(array->mutex);
5798
5799 if (!srv_read_only_mode) {
5800 ut_a(os_aio_write_array == 0);
5801
5802 array = os_aio_write_array;
5803
5804 os_mutex_enter(array->mutex);
5805
5806 n_res += array->n_reserved;
5807
5808 os_mutex_exit(array->mutex);
5809
5810 ut_a(os_aio_ibuf_array == 0);
5811
5812 array = os_aio_ibuf_array;
5813
5814 os_mutex_enter(array->mutex);
5815
5816 n_res += array->n_reserved;
5817
5818 os_mutex_exit(array->mutex);
5819 }
5820
5821 ut_a(os_aio_log_array == 0);
5822
5823 array = os_aio_log_array;
5824
5825 os_mutex_enter(array->mutex);
5826
5827 n_res += array->n_reserved;
5828
5829 os_mutex_exit(array->mutex);
5830
5831 array = os_aio_sync_array;
5832
5833 os_mutex_enter(array->mutex);
5834
5835 n_res += array->n_reserved;
5836
5837 os_mutex_exit(array->mutex);
5838
5839 if (n_res == 0) {
5840
5841 return(TRUE);
5842 }
5843
5844 return(FALSE);
5845 }
5846 #endif /* UNIV_DEBUG */
5847
5848 #endif /* !UNIV_HOTBACKUP */
5849