1 /***********************************************************************
2
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Percona Inc.
5
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation. The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License, version 2.0, for more details.
28
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32
33 ***********************************************************************/
34
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41
42 #include "os0file.h"
43
44 #ifdef UNIV_NONINL
45 #include "os0file.ic"
46 #endif
47
48 #include "ut0mem.h"
49 #include "srv0srv.h"
50 #include "srv0start.h"
51 #include "fil0fil.h"
52 #include "buf0buf.h"
53 #include "srv0mon.h"
54 #ifndef UNIV_HOTBACKUP
55 # include "os0sync.h"
56 # include "os0thread.h"
57 #else /* !UNIV_HOTBACKUP */
58 # ifdef __WIN__
59 /* Add includes for the _stat() call to compile on Windows */
60 # include <sys/types.h>
61 # include <sys/stat.h>
62 # include <errno.h>
63 # endif /* __WIN__ */
64 #endif /* !UNIV_HOTBACKUP */
65
66 #if defined(LINUX_NATIVE_AIO)
67 #include <libaio.h>
68 #endif
69
70 /** Insert buffer segment id */
71 static const ulint IO_IBUF_SEGMENT = 0;
72
73 /** Log segment id */
74 static const ulint IO_LOG_SEGMENT = 1;
75
76 /* This specifies the file permissions InnoDB uses when it creates files in
77 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
78 my_umask */
79
80 #ifndef __WIN__
81 /** Umask for creating files */
82 UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
83 #else
84 /** Umask for creating files */
85 UNIV_INTERN ulint os_innodb_umask = 0;
86 #endif /* __WIN__ */
87
88 #ifndef UNIV_HOTBACKUP
89 /* We use these mutexes to protect lseek + file i/o operation, if the
90 OS does not provide an atomic pread or pwrite, or similar */
91 #define OS_FILE_N_SEEK_MUTEXES 16
92 UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
93
94 /* In simulated aio, merge at most this many consecutive i/os */
95 #define OS_AIO_MERGE_N_CONSECUTIVE 64
96
97 /**********************************************************************
98
99 InnoDB AIO Implementation:
100 =========================
101
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO by special io-threads servicing the IO-requests.
104
105 Simulated AIO:
106 ==============
107
108 In platforms where we 'simulate' AIO following is a rough explanation
109 of the high level design.
110 There are four io-threads (for ibuf, log, read, write).
111 All synchronous IO requests are serviced by the calling thread using
112 os_file_write/os_file_read. The Asynchronous requests are queued up
113 in an array (there are four such arrays) by the calling thread.
114 Later these requests are picked up by the io-thread and are serviced
115 synchronously.
116
117 Windows native AIO:
118 ==================
119
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
125 There are innodb_file_io_threads helper threads. These threads work
126 on the four arrays mentioned above in Simulated AIO. No thread is
127 required for the sync array.
128 If a synchronous IO request is made, it is first queued in the sync
129 array. Then the calling thread itself waits on the request, thus
130 making the call synchronous.
131 If an AIO request is made the calling thread not only queues it in the
132 array but also submits the requests. The helper thread then collects
133 the completed IO request and calls completion routine on it.
134
135 Linux native AIO:
136 =================
137
138 If we have libaio installed on the system and innodb_use_native_aio
139 is set to TRUE we follow the code path of native AIO, otherwise we
140 do simulated AIO.
141 There are innodb_file_io_threads helper threads. These threads work
142 on the four arrays mentioned above in Simulated AIO.
143 If a synchronous IO request is made, it is handled by calling
144 os_file_write/os_file_read.
145 If an AIO request is made the calling thread not only queues it in the
146 array but also submits the requests. The helper thread then collects
147 the completed IO request and calls completion routine on it.
148
149 **********************************************************************/
150
151 /** Flag: enable debug printout for asynchronous i/o */
152 UNIV_INTERN ibool os_aio_print_debug = FALSE;
153
154 #ifdef UNIV_PFS_IO
155 /* Keys to register InnoDB I/O with performance schema */
156 UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
157 UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
158 UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
159 #endif /* UNIV_PFS_IO */
160
161 /** The asynchronous i/o array slot structure */
162 struct os_aio_slot_t{
163 ibool is_read; /*!< TRUE if a read operation */
164 ulint pos; /*!< index of the slot in the aio
165 array */
166 ibool reserved; /*!< TRUE if this slot is reserved */
167 time_t reservation_time;/*!< time when reserved */
168 ulint len; /*!< length of the block to read or
169 write */
170 byte* buf; /*!< buffer used in i/o */
171 ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
172 os_offset_t offset; /*!< file offset in bytes */
173 pfs_os_file_t file; /*!< file where to read or write */
174 const char* name; /*!< file name or path */
175 ibool io_already_done;/*!< used only in simulated aio:
176 TRUE if the physical i/o already
177 made and only the slot message
178 needs to be passed to the caller
179 of os_aio_simulated_handle */
180 fil_node_t* message1; /*!< message which is given by the */
181 void* message2; /*!< the requester of an aio operation
182 and which can be used to identify
183 which pending aio operation was
184 completed */
185 #ifdef WIN_ASYNC_IO
186 HANDLE handle; /*!< handle object we need in the
187 OVERLAPPED struct */
188 OVERLAPPED control; /*!< Windows control block for the
189 aio request */
190 #elif defined(LINUX_NATIVE_AIO)
191 struct iocb control; /* Linux control block for aio */
192 int n_bytes; /* bytes written/read. */
193 int ret; /* AIO return code */
194 #endif /* WIN_ASYNC_IO */
195 };
196
197 /** The asynchronous i/o array structure */
198 struct os_aio_array_t{
199 os_ib_mutex_t mutex; /*!< the mutex protecting the aio array */
200 os_event_t not_full;
201 /*!< The event which is set to the
202 signaled state when there is space in
203 the aio outside the ibuf segment */
204 os_event_t is_empty;
205 /*!< The event which is set to the
206 signaled state when there are no
207 pending i/os in this array */
208 ulint n_slots;/*!< Total number of slots in the aio
209 array. This must be divisible by
210 n_threads. */
211 ulint n_segments;
212 /*!< Number of segments in the aio
213 array of pending aio requests. A
214 thread can wait separately for any one
215 of the segments. */
216 ulint cur_seg;/*!< We reserve IO requests in round
217 robin fashion to different segments.
218 This points to the segment that is to
219 be used to service next IO request. */
220 ulint n_reserved;
221 /*!< Number of reserved slots in the
222 aio array outside the ibuf segment */
223 os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
224 #ifdef __WIN__
225 HANDLE* handles;
226 /*!< Pointer to an array of OS native
227 event handles where we copied the
228 handles from slots, in the same
229 order. This can be used in
230 WaitForMultipleObjects; used only in
231 Windows */
232 #endif /* __WIN__ */
233
234 #if defined(LINUX_NATIVE_AIO)
235 io_context_t* aio_ctx;
236 /* completion queue for IO. There is
237 one such queue per segment. Each thread
238 will work on one ctx exclusively. */
239 struct io_event* aio_events;
240 /* The array to collect completed IOs.
241 There is one such event for each
242 possible pending IO. The size of the
243 array is equal to n_slots. */
244 #endif /* LINUX_NATIV_AIO */
245 };
246
#if defined(LINUX_NATIVE_AIO)
/** Timeout for each io_getevents() call = 500ms
(NOTE(review): the value reads as nanoseconds - confirm at the use site). */
#define OS_AIO_REAP_TIMEOUT		(500000000UL)

/** Time to sleep, in microseconds, if io_setup() returns EAGAIN. */
#define OS_AIO_IO_SETUP_RETRY_SLEEP	(500000UL)

/** Number of io_setup() attempts made before giving up. */
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS	5
#endif /* LINUX_NATIVE_AIO */
257
258 /** Array of events used in simulated aio */
259 static os_event_t* os_aio_segment_wait_events = NULL;
260
261 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
262 are NULL when the module has not yet been initialized. @{ */
263 static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
264 static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
265 static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
266 static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
267 static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
268 /* @} */
269
270 /** Number of asynchronous I/O segments. Set by os_aio_init(). */
271 static ulint os_aio_n_segments = ULINT_UNDEFINED;
272
273 /** If the following is TRUE, read i/o handler threads try to
274 wait until a batch of new read requests have been posted */
275 static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
276 #endif /* !UNIV_HOTBACKUP */
277
278 UNIV_INTERN ulint os_n_file_reads = 0;
279 UNIV_INTERN ulint os_bytes_read_since_printout = 0;
280 UNIV_INTERN ulint os_n_file_writes = 0;
281 UNIV_INTERN ulint os_n_fsyncs = 0;
282 UNIV_INTERN ulint os_n_file_reads_old = 0;
283 UNIV_INTERN ulint os_n_file_writes_old = 0;
284 UNIV_INTERN ulint os_n_fsyncs_old = 0;
285 UNIV_INTERN time_t os_last_printout;
286
287 UNIV_INTERN ibool os_has_said_disk_full = FALSE;
288
289 #if !defined(UNIV_HOTBACKUP) \
290 && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8)
291 /** The mutex protecting the following counts of pending I/O operations */
292 static os_ib_mutex_t os_file_count_mutex;
293 #endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */
294
295 /** Number of pending os_file_pread() operations */
296 UNIV_INTERN ulint os_file_n_pending_preads = 0;
297 /** Number of pending os_file_pwrite() operations */
298 UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
299 /** Number of pending write operations */
300 UNIV_INTERN ulint os_n_pending_writes = 0;
301 /** Number of pending read operations */
302 UNIV_INTERN ulint os_n_pending_reads = 0;
303
#ifdef UNIV_DEBUG
# ifndef UNIV_HOTBACKUP
/**********************************************************************//**
Validates the consistency of the aio system some of the time: only every
OS_AIO_VALIDATE_SKIP-th call actually runs os_aio_validate(), the other
calls return immediately.
@return TRUE if ok or the check was skipped */
UNIV_INTERN
ibool
os_aio_validate_skip(void)
/*======================*/
{
/** Try os_aio_validate() every this many times */
# define OS_AIO_VALIDATE_SKIP	13

	/** Calls left before the next real os_aio_validate() run.
	Kept signed because the unsynchronized decrement below can race
	and push the value below zero. */
	static int	calls_until_check = OS_AIO_VALIDATE_SKIP;

	/* The decrement is racy, which is harmless: the counter only
	throttles the costly os_aio_validate() check in debug builds. */
	const int	remaining = --calls_until_check;

	if (remaining <= 0) {
		calls_until_check = OS_AIO_VALIDATE_SKIP;

		return(os_aio_validate());
	}

	return(TRUE);
}
# endif /* !UNIV_HOTBACKUP */
#endif /* UNIV_DEBUG */
334
#ifdef __WIN__
/***********************************************************************//**
Gets the operating system version. Currently works only on Windows.
The platform id reported by GetVersionEx() selects the family; for the NT
family the major/minor version numbers select the release. NT major
versions not listed explicitly are reported as OS_WIN7.
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
OS_WIN7. */
UNIV_INTERN
ulint
os_get_os_version(void)
/*===================*/
{
	OSVERSIONINFO	version;

	version.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);

	ut_a(GetVersionEx(&version));

	switch (version.dwPlatformId) {
	case VER_PLATFORM_WIN32s:
		return(OS_WIN31);
	case VER_PLATFORM_WIN32_WINDOWS:
		return(OS_WIN95);
	case VER_PLATFORM_WIN32_NT:
		/* Resolved below from the version numbers. */
		break;
	default:
		ut_error;
		return(0);
	}

	const bool	minor_is_zero = (version.dwMinorVersion == 0);

	if (version.dwMajorVersion == 3 || version.dwMajorVersion == 4) {
		return(OS_WINNT);
	}

	if (version.dwMajorVersion == 5) {
		return(minor_is_zero ? OS_WIN2000 : OS_WINXP);
	}

	if (version.dwMajorVersion == 6) {
		return(minor_is_zero ? OS_WINVISTA : OS_WIN7);
	}

	return(OS_WIN7);
}
#endif /* __WIN__ */
375
376 /***********************************************************************//**
377 Retrieves the last error number if an error occurs in a file io function.
378 The number should be retrieved before any other OS calls (because they may
379 overwrite the error number). If the number is not known to this program,
380 the OS error number + 100 is returned.
381 @return error number, or OS error number + 100 */
382 static
383 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)384 os_file_get_last_error_low(
385 /*=======================*/
386 bool report_all_errors, /*!< in: TRUE if we want an error
387 message printed of all errors */
388 bool on_error_silent) /*!< in: TRUE then don't print any
389 diagnostic to the log */
390 {
391 #ifdef __WIN__
392
393 ulint err = (ulint) GetLastError();
394 if (err == ERROR_SUCCESS) {
395 return(0);
396 }
397
398 if (report_all_errors
399 || (!on_error_silent
400 && err != ERROR_DISK_FULL
401 && err != ERROR_FILE_EXISTS)) {
402
403 ut_print_timestamp(stderr);
404 fprintf(stderr,
405 " InnoDB: Operating system error number %lu"
406 " in a file operation.\n", (ulong) err);
407
408 if (err == ERROR_PATH_NOT_FOUND) {
409 fprintf(stderr,
410 "InnoDB: The error means the system"
411 " cannot find the path specified.\n");
412
413 if (srv_is_being_started) {
414 fprintf(stderr,
415 "InnoDB: If you are installing InnoDB,"
416 " remember that you must create\n"
417 "InnoDB: directories yourself, InnoDB"
418 " does not create them.\n");
419 }
420 } else if (err == ERROR_ACCESS_DENIED) {
421 fprintf(stderr,
422 "InnoDB: The error means mysqld does not have"
423 " the access rights to\n"
424 "InnoDB: the directory. It may also be"
425 " you have created a subdirectory\n"
426 "InnoDB: of the same name as a data file.\n");
427 } else if (err == ERROR_SHARING_VIOLATION
428 || err == ERROR_LOCK_VIOLATION) {
429 fprintf(stderr,
430 "InnoDB: The error means that another program"
431 " is using InnoDB's files.\n"
432 "InnoDB: This might be a backup or antivirus"
433 " software or another instance\n"
434 "InnoDB: of MySQL."
435 " Please close it to get rid of this error.\n");
436 } else if (err == ERROR_WORKING_SET_QUOTA
437 || err == ERROR_NO_SYSTEM_RESOURCES) {
438 fprintf(stderr,
439 "InnoDB: The error means that there are no"
440 " sufficient system resources or quota to"
441 " complete the operation.\n");
442 } else if (err == ERROR_OPERATION_ABORTED) {
443 fprintf(stderr,
444 "InnoDB: The error means that the I/O"
445 " operation has been aborted\n"
446 "InnoDB: because of either a thread exit"
447 " or an application request.\n"
448 "InnoDB: Retry attempt is made.\n");
449 } else {
450 fprintf(stderr,
451 "InnoDB: Some operating system error numbers"
452 " are described at\n"
453 "InnoDB: "
454 REFMAN
455 "operating-system-error-codes.html\n");
456 }
457 }
458
459 fflush(stderr);
460
461 if (err == ERROR_FILE_NOT_FOUND) {
462 return(OS_FILE_NOT_FOUND);
463 } else if (err == ERROR_DISK_FULL) {
464 return(OS_FILE_DISK_FULL);
465 } else if (err == ERROR_FILE_EXISTS) {
466 return(OS_FILE_ALREADY_EXISTS);
467 } else if (err == ERROR_SHARING_VIOLATION
468 || err == ERROR_LOCK_VIOLATION) {
469 return(OS_FILE_SHARING_VIOLATION);
470 } else if (err == ERROR_WORKING_SET_QUOTA
471 || err == ERROR_NO_SYSTEM_RESOURCES) {
472 return(OS_FILE_INSUFFICIENT_RESOURCE);
473 } else if (err == ERROR_OPERATION_ABORTED) {
474 return(OS_FILE_OPERATION_ABORTED);
475 } else if (err == ERROR_ACCESS_DENIED) {
476 return(OS_FILE_ACCESS_VIOLATION);
477 } else {
478 return(OS_FILE_ERROR_MAX + err);
479 }
480 #else
481 int err = errno;
482 if (err == 0) {
483 return(0);
484 }
485
486 if (report_all_errors
487 || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
488
489 ut_print_timestamp(stderr);
490 fprintf(stderr,
491 " InnoDB: Operating system error number %d"
492 " in a file operation.\n", err);
493
494 if (err == ENOENT) {
495 fprintf(stderr,
496 "InnoDB: The error means the system"
497 " cannot find the path specified.\n");
498
499 if (srv_is_being_started) {
500 fprintf(stderr,
501 "InnoDB: If you are installing InnoDB,"
502 " remember that you must create\n"
503 "InnoDB: directories yourself, InnoDB"
504 " does not create them.\n");
505 }
506 } else if (err == EACCES) {
507 fprintf(stderr,
508 "InnoDB: The error means mysqld does not have"
509 " the access rights to\n"
510 "InnoDB: the directory.\n");
511 } else {
512 if (strerror(err) != NULL) {
513 fprintf(stderr,
514 "InnoDB: Error number %d"
515 " means '%s'.\n",
516 err, strerror(err));
517 }
518
519
520 fprintf(stderr,
521 "InnoDB: Some operating system"
522 " error numbers are described at\n"
523 "InnoDB: "
524 REFMAN
525 "operating-system-error-codes.html\n");
526 }
527 }
528
529 fflush(stderr);
530
531 switch (err) {
532 case ENOSPC:
533 return(OS_FILE_DISK_FULL);
534 case ENOENT:
535 return(OS_FILE_NOT_FOUND);
536 case EEXIST:
537 return(OS_FILE_ALREADY_EXISTS);
538 case EXDEV:
539 case ENOTDIR:
540 case EISDIR:
541 return(OS_FILE_PATH_ERROR);
542 case EAGAIN:
543 if (srv_use_native_aio) {
544 return(OS_FILE_AIO_RESOURCES_RESERVED);
545 }
546 break;
547 case EINTR:
548 if (srv_use_native_aio) {
549 return(OS_FILE_AIO_INTERRUPTED);
550 }
551 break;
552 case EACCES:
553 return(OS_FILE_ACCESS_VIOLATION);
554 }
555 return(OS_FILE_ERROR_MAX + err);
556 #endif
557 }
558
559 /***********************************************************************//**
560 Retrieves the last error number if an error occurs in a file io function.
561 The number should be retrieved before any other OS calls (because they may
562 overwrite the error number). If the number is not known to this program,
563 the OS error number + 100 is returned.
564 @return error number, or OS error number + 100 */
565 UNIV_INTERN
566 ulint
os_file_get_last_error(bool report_all_errors)567 os_file_get_last_error(
568 /*===================*/
569 bool report_all_errors) /*!< in: TRUE if we want an error
570 message printed of all errors */
571 {
572 return(os_file_get_last_error_low(report_all_errors, false));
573 }
574
575 /****************************************************************//**
576 Does error handling when a file operation fails.
577 Conditionally exits (calling exit(3)) based on should_exit value and the
578 error type, if should_exit is TRUE then on_error_silent is ignored.
579 @return TRUE if we should retry the operation */
580 static
581 ibool
os_file_handle_error_cond_exit(const char * name,const char * operation,ibool should_exit,ibool on_error_silent)582 os_file_handle_error_cond_exit(
583 /*===========================*/
584 const char* name, /*!< in: name of a file or NULL */
585 const char* operation, /*!< in: operation */
586 ibool should_exit, /*!< in: call exit(3) if unknown error
587 and this parameter is TRUE */
588 ibool on_error_silent)/*!< in: if TRUE then don't print
589 any message to the log iff it is
590 an unknown non-fatal error */
591 {
592 ulint err;
593
594 err = os_file_get_last_error_low(false, on_error_silent);
595
596 switch (err) {
597 case OS_FILE_DISK_FULL:
598 /* We only print a warning about disk full once */
599
600 if (os_has_said_disk_full) {
601
602 return(FALSE);
603 }
604
605 /* Disk full error is reported irrespective of the
606 on_error_silent setting. */
607
608 if (name) {
609 ut_print_timestamp(stderr);
610 fprintf(stderr,
611 " InnoDB: Encountered a problem with"
612 " file %s\n", name);
613 }
614
615 ut_print_timestamp(stderr);
616 fprintf(stderr,
617 " InnoDB: Disk is full. Try to clean the disk"
618 " to free space.\n");
619
620 os_has_said_disk_full = TRUE;
621
622 fflush(stderr);
623
624 return(FALSE);
625
626 case OS_FILE_AIO_RESOURCES_RESERVED:
627 case OS_FILE_AIO_INTERRUPTED:
628
629 return(TRUE);
630
631 case OS_FILE_PATH_ERROR:
632 case OS_FILE_ALREADY_EXISTS:
633 case OS_FILE_ACCESS_VIOLATION:
634
635 return(FALSE);
636
637 case OS_FILE_SHARING_VIOLATION:
638
639 os_thread_sleep(10000000); /* 10 sec */
640 return(TRUE);
641
642 case OS_FILE_OPERATION_ABORTED:
643 case OS_FILE_INSUFFICIENT_RESOURCE:
644
645 os_thread_sleep(100000); /* 100 ms */
646 return(TRUE);
647
648 default:
649
650 /* If it is an operation that can crash on error then it
651 is better to ignore on_error_silent and print an error message
652 to the log. */
653
654 if (should_exit || !on_error_silent) {
655 ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
656 "error " ULINTPF ".%s", name ? name : "(unknown)",
657 operation, err, should_exit
658 ? " Cannot continue operation" : "");
659 }
660
661 if (should_exit) {
662 exit(1);
663 }
664 }
665
666 return(FALSE);
667 }
668
669 /****************************************************************//**
670 Does error handling when a file operation fails.
671 @return TRUE if we should retry the operation */
672 static
673 ibool
os_file_handle_error(const char * name,const char * operation)674 os_file_handle_error(
675 /*=================*/
676 const char* name, /*!< in: name of a file or NULL */
677 const char* operation) /*!< in: operation */
678 {
679 /* exit in case of unknown error */
680 return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
681 }
682
683 /****************************************************************//**
684 Does error handling when a file operation fails.
685 @return TRUE if we should retry the operation */
686 static
687 ibool
os_file_handle_error_no_exit(const char * name,const char * operation,ibool on_error_silent)688 os_file_handle_error_no_exit(
689 /*=========================*/
690 const char* name, /*!< in: name of a file or NULL */
691 const char* operation, /*!< in: operation */
692 ibool on_error_silent)/*!< in: if TRUE then don't print
693 any message to the log. */
694 {
695 /* don't exit in case of unknown error */
696 return(os_file_handle_error_cond_exit(
697 name, operation, FALSE, on_error_silent));
698 }
699
700 #undef USE_FILE_LOCK
701 #define USE_FILE_LOCK
702 #if defined(UNIV_HOTBACKUP) || defined(__WIN__)
703 /* InnoDB Hot Backup does not lock the data files.
704 * On Windows, mandatory locking is used.
705 */
706 # undef USE_FILE_LOCK
707 #endif
708 #ifdef USE_FILE_LOCK
709 /****************************************************************//**
710 Obtain an exclusive lock on a file.
711 @return 0 on success */
712 static
713 int
os_file_lock(int fd,const char * name)714 os_file_lock(
715 /*=========*/
716 int fd, /*!< in: file descriptor */
717 const char* name) /*!< in: file name */
718 {
719 struct flock lk;
720
721 ut_ad(!srv_read_only_mode);
722
723 lk.l_type = F_WRLCK;
724 lk.l_whence = SEEK_SET;
725 lk.l_start = lk.l_len = 0;
726
727 if (fcntl(fd, F_SETLK, &lk) == -1) {
728
729 ib_logf(IB_LOG_LEVEL_ERROR,
730 "Unable to lock %s, error: %d", name, errno);
731
732 if (errno == EAGAIN || errno == EACCES) {
733 ib_logf(IB_LOG_LEVEL_INFO,
734 "Check that you do not already have "
735 "another mysqld process using the "
736 "same InnoDB data or log files.");
737 }
738
739 return(-1);
740 }
741
742 return(0);
743 }
744 #endif /* USE_FILE_LOCK */
745
746 #ifndef UNIV_HOTBACKUP
747 /****************************************************************//**
748 Creates the seek mutexes used in positioned reads and writes. */
749 UNIV_INTERN
750 void
os_io_init_simple(void)751 os_io_init_simple(void)
752 /*===================*/
753 {
754 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
755 os_file_count_mutex = os_mutex_create();
756 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
757
758 for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
759 os_file_seek_mutexes[i] = os_mutex_create();
760 }
761 }
762
763 /** Create a temporary file. This function is like tmpfile(3), but
764 the temporary file is created in the given parameter path. If the path
765 is null then it will create the file in the mysql server configuration
766 parameter (--tmpdir).
767 @param[in] path location for creating temporary file
768 @return temporary file handle, or NULL on error */
769 UNIV_INTERN
770 FILE*
os_file_create_tmpfile(const char * path)771 os_file_create_tmpfile(
772 const char* path)
773 {
774 FILE* file = NULL;
775 int fd = innobase_mysql_tmpfile(path);
776
777 ut_ad(!srv_read_only_mode);
778
779 if (fd >= 0) {
780 file = fdopen(fd, "w+b");
781 }
782
783 if (!file) {
784 ut_print_timestamp(stderr);
785 fprintf(stderr,
786 " InnoDB: Error: unable to create temporary file;"
787 " errno: %d\n", errno);
788 if (fd >= 0) {
789 close(fd);
790 }
791 }
792
793 return(file);
794 }
795 #endif /* !UNIV_HOTBACKUP */
796
797 /***********************************************************************//**
798 The os_file_opendir() function opens a directory stream corresponding to the
799 directory named by the dirname argument. The directory stream is positioned
800 at the first entry. In both Unix and Windows we automatically skip the '.'
801 and '..' items at the start of the directory listing.
802 @return directory stream, NULL if error */
803 UNIV_INTERN
804 os_file_dir_t
os_file_opendir(const char * dirname,ibool error_is_fatal)805 os_file_opendir(
806 /*============*/
807 const char* dirname, /*!< in: directory name; it must not
808 contain a trailing '\' or '/' */
809 ibool error_is_fatal) /*!< in: TRUE if we should treat an
810 error as a fatal error; if we try to
811 open symlinks then we do not wish a
812 fatal error if it happens not to be
813 a directory */
814 {
815 os_file_dir_t dir;
816 #ifdef __WIN__
817 LPWIN32_FIND_DATA lpFindFileData;
818 char path[OS_FILE_MAX_PATH + 3];
819
820 ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
821
822 strcpy(path, dirname);
823 strcpy(path + strlen(path), "\\*");
824
825 /* Note that in Windows opening the 'directory stream' also retrieves
826 the first entry in the directory. Since it is '.', that is no problem,
827 as we will skip over the '.' and '..' entries anyway. */
828
829 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
830 ut_malloc(sizeof(WIN32_FIND_DATA)));
831
832 dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
833
834 ut_free(lpFindFileData);
835
836 if (dir == INVALID_HANDLE_VALUE) {
837
838 if (error_is_fatal) {
839 os_file_handle_error(dirname, "opendir");
840 }
841
842 return(NULL);
843 }
844
845 return(dir);
846 #else
847 dir = opendir(dirname);
848
849 if (dir == NULL && error_is_fatal) {
850 os_file_handle_error(dirname, "opendir");
851 }
852
853 return(dir);
854 #endif /* __WIN__ */
855 }
856
857 /***********************************************************************//**
858 Closes a directory stream.
859 @return 0 if success, -1 if failure */
860 UNIV_INTERN
861 int
os_file_closedir(os_file_dir_t dir)862 os_file_closedir(
863 /*=============*/
864 os_file_dir_t dir) /*!< in: directory stream */
865 {
866 #ifdef __WIN__
867 BOOL ret;
868
869 ret = FindClose(dir);
870
871 if (!ret) {
872 os_file_handle_error_no_exit(NULL, "closedir", FALSE);
873
874 return(-1);
875 }
876
877 return(0);
878 #else
879 int ret;
880
881 ret = closedir(dir);
882
883 if (ret) {
884 os_file_handle_error_no_exit(NULL, "closedir", FALSE);
885 }
886
887 return(ret);
888 #endif /* __WIN__ */
889 }
890
891 /***********************************************************************//**
892 This function returns information of the next file in the directory. We jump
893 over the '.' and '..' entries in the directory.
894 @return 0 if ok, -1 if error, 1 if at the end of the directory */
895 UNIV_INTERN
896 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)897 os_file_readdir_next_file(
898 /*======================*/
899 const char* dirname,/*!< in: directory name or path */
900 os_file_dir_t dir, /*!< in: directory stream */
901 os_file_stat_t* info) /*!< in/out: buffer where the info is returned */
902 {
903 #ifdef __WIN__
904 LPWIN32_FIND_DATA lpFindFileData;
905 BOOL ret;
906
907 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
908 ut_malloc(sizeof(WIN32_FIND_DATA)));
909 next_file:
910 ret = FindNextFile(dir, lpFindFileData);
911
912 if (ret) {
913 ut_a(strlen((char*) lpFindFileData->cFileName)
914 < OS_FILE_MAX_PATH);
915
916 if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
917 || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
918
919 goto next_file;
920 }
921
922 strcpy(info->name, (char*) lpFindFileData->cFileName);
923
924 info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
925 + (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
926 << 32);
927
928 if (lpFindFileData->dwFileAttributes
929 & FILE_ATTRIBUTE_REPARSE_POINT) {
930 /* TODO: test Windows symlinks */
931 /* TODO: MySQL has apparently its own symlink
932 implementation in Windows, dbname.sym can
933 redirect a database directory:
934 REFMAN "windows-symbolic-links.html" */
935 info->type = OS_FILE_TYPE_LINK;
936 } else if (lpFindFileData->dwFileAttributes
937 & FILE_ATTRIBUTE_DIRECTORY) {
938 info->type = OS_FILE_TYPE_DIR;
939 } else {
940 /* It is probably safest to assume that all other
941 file types are normal. Better to check them rather
942 than blindly skip them. */
943
944 info->type = OS_FILE_TYPE_FILE;
945 }
946 }
947
948 ut_free(lpFindFileData);
949
950 if (ret) {
951 return(0);
952 } else if (GetLastError() == ERROR_NO_MORE_FILES) {
953
954 return(1);
955 } else {
956 os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
957 return(-1);
958 }
959 #else
960 struct dirent* ent;
961 char* full_path;
962 int ret;
963 struct stat statinfo;
964 #ifdef HAVE_READDIR_R
965 char dirent_buf[sizeof(struct dirent)
966 + _POSIX_PATH_MAX + 100];
967 /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
968 the max file name len; but in most standards, the
969 length is NAME_MAX; we add 100 to be even safer */
970 #endif
971
972 next_file:
973
974 #ifdef HAVE_READDIR_R
975 ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
976
977 if (ret != 0
978 #ifdef UNIV_AIX
979 /* On AIX, only if we got non-NULL 'ent' (result) value and
980 a non-zero 'ret' (return) value, it indicates a failed
981 readdir_r() call. An NULL 'ent' with an non-zero 'ret'
982 would indicate the "end of the directory" is reached. */
983 && ent != NULL
984 #endif
985 ) {
986 fprintf(stderr,
987 "InnoDB: cannot read directory %s, error %lu\n",
988 dirname, (ulong) ret);
989
990 return(-1);
991 }
992
993 if (ent == NULL) {
994 /* End of directory */
995
996 return(1);
997 }
998
999 ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
1000 #else
1001 ent = readdir(dir);
1002
1003 if (ent == NULL) {
1004
1005 return(1);
1006 }
1007 #endif
1008 ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
1009
1010 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
1011
1012 goto next_file;
1013 }
1014
1015 strcpy(info->name, ent->d_name);
1016
1017 full_path = static_cast<char*>(
1018 ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
1019
1020 sprintf(full_path, "%s/%s", dirname, ent->d_name);
1021
1022 ret = stat(full_path, &statinfo);
1023
1024 if (ret) {
1025
1026 if (errno == ENOENT) {
1027 /* readdir() returned a file that does not exist,
1028 it must have been deleted in the meantime. Do what
1029 would have happened if the file was deleted before
1030 readdir() - ignore and go to the next entry.
1031 If this is the last entry then info->name will still
1032 contain the name of the deleted file when this
1033 function returns, but this is not an issue since the
1034 caller shouldn't be looking at info when end of
1035 directory is returned. */
1036
1037 ut_free(full_path);
1038
1039 goto next_file;
1040 }
1041
1042 os_file_handle_error_no_exit(full_path, "stat", FALSE);
1043
1044 ut_free(full_path);
1045
1046 return(-1);
1047 }
1048
1049 info->size = (ib_int64_t) statinfo.st_size;
1050
1051 if (S_ISDIR(statinfo.st_mode)) {
1052 info->type = OS_FILE_TYPE_DIR;
1053 } else if (S_ISLNK(statinfo.st_mode)) {
1054 info->type = OS_FILE_TYPE_LINK;
1055 } else if (S_ISREG(statinfo.st_mode)) {
1056 info->type = OS_FILE_TYPE_FILE;
1057 } else {
1058 info->type = OS_FILE_TYPE_UNKNOWN;
1059 }
1060
1061 ut_free(full_path);
1062
1063 return(0);
1064 #endif
1065 }
1066
1067 /*****************************************************************//**
1068 This function attempts to create a directory named pathname. The new
1069 directory gets default permissions. On Unix the permissions are
1070 (0770 & ~umask). If the directory exists already, nothing is done and
1071 the call succeeds, unless the fail_if_exists arguments is true.
1072 If another error occurs, such as a permission error, this does not crash,
1073 but reports the error and returns FALSE.
1074 @return TRUE if call succeeds, FALSE on error */
1075 UNIV_INTERN
1076 ibool
os_file_create_directory(const char * pathname,ibool fail_if_exists)1077 os_file_create_directory(
1078 /*=====================*/
1079 const char* pathname, /*!< in: directory name as
1080 null-terminated string */
1081 ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory
1082 is treated as an error. */
1083 {
1084 #ifdef __WIN__
1085 BOOL rcode;
1086
1087 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
1088 if (!(rcode != 0
1089 || (GetLastError() == ERROR_ALREADY_EXISTS
1090 && !fail_if_exists))) {
1091
1092 os_file_handle_error_no_exit(
1093 pathname, "CreateDirectory", FALSE);
1094
1095 return(FALSE);
1096 }
1097
1098 return(TRUE);
1099 #else
1100 int rcode;
1101
1102 rcode = mkdir(pathname, 0770);
1103
1104 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
1105 /* failure */
1106 os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
1107
1108 return(FALSE);
1109 }
1110
1111 return (TRUE);
1112 #endif /* __WIN__ */
1113 }
1114
1115 /****************************************************************//**
1116 NOTE! Use the corresponding macro os_file_create_simple(), not directly
1117 this function!
1118 A simple function to open or create a file.
1119 @return own: handle to the file, not defined if error, error number
1120 can be retrieved with os_file_get_last_error */
1121 UNIV_INTERN
1122 os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,ibool * success)1123 os_file_create_simple_func(
1124 /*=======================*/
1125 const char* name, /*!< in: name of the file or path as a
1126 null-terminated string */
1127 ulint create_mode,/*!< in: create mode */
1128 ulint access_type,/*!< in: OS_FILE_READ_ONLY or
1129 OS_FILE_READ_WRITE */
1130 ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1131 {
1132 os_file_t file;
1133 ibool retry;
1134
1135 *success = FALSE;
1136 #ifdef __WIN__
1137 DWORD access;
1138 DWORD create_flag;
1139 DWORD attributes = 0;
1140
1141 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1142 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1143
1144 if (create_mode == OS_FILE_OPEN) {
1145
1146 create_flag = OPEN_EXISTING;
1147
1148 } else if (srv_read_only_mode) {
1149
1150 create_flag = OPEN_EXISTING;
1151
1152 } else if (create_mode == OS_FILE_CREATE) {
1153
1154 create_flag = CREATE_NEW;
1155
1156 } else if (create_mode == OS_FILE_CREATE_PATH) {
1157
1158 ut_a(!srv_read_only_mode);
1159
1160 /* Create subdirs along the path if needed */
1161 *success = os_file_create_subdirs_if_needed(name);
1162
1163 if (!*success) {
1164
1165 ib_logf(IB_LOG_LEVEL_ERROR,
1166 "Unable to create subdirectories '%s'",
1167 name);
1168
1169 return((os_file_t) -1);
1170 }
1171
1172 create_flag = CREATE_NEW;
1173 create_mode = OS_FILE_CREATE;
1174
1175 } else {
1176 ib_logf(IB_LOG_LEVEL_ERROR,
1177 "Unknown file create mode (%lu) for file '%s'",
1178 create_mode, name);
1179
1180 return((os_file_t) -1);
1181 }
1182
1183 if (access_type == OS_FILE_READ_ONLY) {
1184 access = GENERIC_READ;
1185 } else if (srv_read_only_mode) {
1186
1187 ib_logf(IB_LOG_LEVEL_INFO,
1188 "read only mode set. Unable to "
1189 "open file '%s' in RW mode, trying RO mode", name);
1190
1191 access = GENERIC_READ;
1192
1193 } else if (access_type == OS_FILE_READ_WRITE) {
1194 access = GENERIC_READ | GENERIC_WRITE;
1195 } else {
1196 ib_logf(IB_LOG_LEVEL_ERROR,
1197 "Unknown file access type (%lu) for file '%s'",
1198 access_type, name);
1199
1200 return((os_file_t) -1);
1201 }
1202
1203 do {
1204 /* Use default security attributes and no template file. */
1205
1206 file = CreateFile(
1207 (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
1208 create_flag, attributes, NULL);
1209
1210 if (file == INVALID_HANDLE_VALUE) {
1211
1212 *success = FALSE;
1213
1214 retry = os_file_handle_error(
1215 name, create_mode == OS_FILE_OPEN ?
1216 "open" : "create");
1217
1218 } else {
1219 *success = TRUE;
1220 retry = false;
1221 }
1222
1223 } while (retry);
1224
1225 #else /* __WIN__ */
1226 int create_flag;
1227
1228 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1229 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1230
1231 if (create_mode == OS_FILE_OPEN) {
1232
1233 if (access_type == OS_FILE_READ_ONLY) {
1234 create_flag = O_RDONLY;
1235 } else if (srv_read_only_mode) {
1236 create_flag = O_RDONLY;
1237 } else {
1238 create_flag = O_RDWR;
1239 }
1240
1241 } else if (srv_read_only_mode) {
1242
1243 create_flag = O_RDONLY;
1244
1245 } else if (create_mode == OS_FILE_CREATE) {
1246
1247 create_flag = O_RDWR | O_CREAT | O_EXCL;
1248
1249 } else if (create_mode == OS_FILE_CREATE_PATH) {
1250
1251 /* Create subdirs along the path if needed */
1252
1253 *success = os_file_create_subdirs_if_needed(name);
1254
1255 if (!*success) {
1256
1257 ib_logf(IB_LOG_LEVEL_ERROR,
1258 "Unable to create subdirectories '%s'",
1259 name);
1260
1261 return((os_file_t) -1);
1262 }
1263
1264 create_flag = O_RDWR | O_CREAT | O_EXCL;
1265 create_mode = OS_FILE_CREATE;
1266 } else {
1267
1268 ib_logf(IB_LOG_LEVEL_ERROR,
1269 "Unknown file create mode (%lu) for file '%s'",
1270 create_mode, name);
1271
1272 return((os_file_t) -1);
1273 }
1274
1275 do {
1276 file = ::open(name, create_flag, os_innodb_umask);
1277
1278 if (file == -1) {
1279 *success = FALSE;
1280
1281 retry = os_file_handle_error(
1282 name,
1283 create_mode == OS_FILE_OPEN
1284 ? "open" : "create");
1285 } else {
1286 *success = TRUE;
1287 retry = false;
1288 }
1289
1290 } while (retry);
1291
1292 #ifdef USE_FILE_LOCK
1293 if (!srv_read_only_mode
1294 && *success
1295 && access_type == OS_FILE_READ_WRITE
1296 && os_file_lock(file, name)) {
1297
1298 *success = FALSE;
1299 close(file);
1300 file = -1;
1301 }
1302 #endif /* USE_FILE_LOCK */
1303
1304 #endif /* __WIN__ */
1305
1306 return(file);
1307 }
1308
1309 /****************************************************************//**
1310 NOTE! Use the corresponding macro
1311 os_file_create_simple_no_error_handling(), not directly this function!
1312 A simple function to open or create a file.
1313 @return own: handle to the file, not defined if error, error number
1314 can be retrieved with os_file_get_last_error */
1315 UNIV_INTERN
1316 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,ibool * success)1317 os_file_create_simple_no_error_handling_func(
1318 /*=========================================*/
1319 const char* name, /*!< in: name of the file or path as a
1320 null-terminated string */
1321 ulint create_mode,/*!< in: create mode */
1322 ulint access_type,/*!< in: OS_FILE_READ_ONLY,
1323 OS_FILE_READ_WRITE, or
1324 OS_FILE_READ_ALLOW_DELETE; the last option is
1325 used by a backup program reading the file */
1326 ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1327 {
1328 pfs_os_file_t file;
1329
1330 *success = FALSE;
1331 #ifdef __WIN__
1332 DWORD access;
1333 DWORD create_flag;
1334 DWORD attributes = 0;
1335 DWORD share_mode = FILE_SHARE_READ;
1336 ut_a(name);
1337
1338 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1339 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1340
1341 if (create_mode == OS_FILE_OPEN) {
1342 create_flag = OPEN_EXISTING;
1343 } else if (srv_read_only_mode) {
1344 create_flag = OPEN_EXISTING;
1345 } else if (create_mode == OS_FILE_CREATE) {
1346 create_flag = CREATE_NEW;
1347 } else {
1348
1349 ib_logf(IB_LOG_LEVEL_ERROR,
1350 "Unknown file create mode (%lu) for file '%s'",
1351 create_mode, name);
1352 file.m_file = (os_file_t)-1;
1353 return(file);
1354 }
1355
1356 if (access_type == OS_FILE_READ_ONLY) {
1357 access = GENERIC_READ;
1358 } else if (srv_read_only_mode) {
1359 access = GENERIC_READ;
1360 } else if (access_type == OS_FILE_READ_WRITE) {
1361 access = GENERIC_READ | GENERIC_WRITE;
1362 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1363
1364 ut_a(!srv_read_only_mode);
1365
1366 access = GENERIC_READ;
1367
1368 /*!< A backup program has to give mysqld the maximum
1369 freedom to do what it likes with the file */
1370
1371 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
1372 } else {
1373 ib_logf(IB_LOG_LEVEL_ERROR,
1374 "Unknown file access type (%lu) for file '%s'",
1375 access_type, name);
1376 file.m_file = (os_file_t)-1;
1377 return(file);
1378 }
1379
1380 file.m_file = CreateFile((LPCTSTR) name,
1381 access,
1382 share_mode,
1383 NULL, // Security attributes
1384 create_flag,
1385 attributes,
1386 NULL); // No template file
1387
1388 *success = (file.m_file != INVALID_HANDLE_VALUE);
1389 #else /* __WIN__ */
1390 int create_flag;
1391 const char* mode_str = NULL;
1392 ut_a(name);
1393
1394 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1395 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1396
1397 if (create_mode == OS_FILE_OPEN) {
1398
1399 mode_str = "OPEN";
1400
1401 if (access_type == OS_FILE_READ_ONLY) {
1402
1403 create_flag = O_RDONLY;
1404
1405 } else if (srv_read_only_mode) {
1406
1407 create_flag = O_RDONLY;
1408
1409 } else {
1410
1411 ut_a(access_type == OS_FILE_READ_WRITE
1412 || access_type == OS_FILE_READ_ALLOW_DELETE);
1413
1414 create_flag = O_RDWR;
1415 }
1416
1417 } else if (srv_read_only_mode) {
1418
1419 mode_str = "OPEN";
1420
1421 create_flag = O_RDONLY;
1422
1423 } else if (create_mode == OS_FILE_CREATE) {
1424
1425 mode_str = "CREATE";
1426
1427 create_flag = O_RDWR | O_CREAT | O_EXCL;
1428
1429 } else {
1430 ib_logf(IB_LOG_LEVEL_ERROR,
1431 "Unknown file create mode (%lu) for file '%s'",
1432 create_mode, name);
1433 file.m_file = -1;
1434 return(file);
1435 }
1436
1437 file.m_file = ::open(name, create_flag, os_innodb_umask);
1438
1439 *success = file.m_file == -1 ? FALSE : TRUE;
1440
1441 /* This function is always called for data files, we should disable
1442 OS caching (O_DIRECT) here as we do in os_file_create_func(), so
1443 we open the same file in the same mode, see man page of open(2). */
1444 if (!srv_read_only_mode
1445 && *success
1446 && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
1447 || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
1448
1449 os_file_set_nocache(file.m_file, name, mode_str);
1450 }
1451
1452 #ifdef USE_FILE_LOCK
1453 if (!srv_read_only_mode
1454 && *success
1455 && access_type == OS_FILE_READ_WRITE
1456 && os_file_lock(file.m_file, name)) {
1457
1458 *success = FALSE;
1459 close(file.m_file);
1460 file.m_file = -1;
1461
1462 }
1463 #endif /* USE_FILE_LOCK */
1464
1465 #endif /* __WIN__ */
1466
1467 return(file);
1468 }
1469
1470 /****************************************************************//**
1471 Tries to disable OS caching on an opened file descriptor. */
1472 UNIV_INTERN
1473 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))1474 os_file_set_nocache(
1475 /*================*/
1476 int fd /*!< in: file descriptor to alter */
1477 MY_ATTRIBUTE((unused)),
1478 const char* file_name /*!< in: used in the diagnostic
1479 message */
1480 MY_ATTRIBUTE((unused)),
1481 const char* operation_name MY_ATTRIBUTE((unused)))
1482 /*!< in: "open" or "create"; used
1483 in the diagnostic message */
1484 {
1485 /* some versions of Solaris may not have DIRECTIO_ON */
1486 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1487 if (directio(fd, DIRECTIO_ON) == -1) {
1488 int errno_save = errno;
1489
1490 ib_logf(IB_LOG_LEVEL_ERROR,
1491 "Failed to set DIRECTIO_ON on file %s: %s: %s, "
1492 "continuing anyway.",
1493 file_name, operation_name, strerror(errno_save));
1494 }
1495 #elif defined(O_DIRECT)
1496 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1497 int errno_save = errno;
1498 static bool warning_message_printed = false;
1499 if (errno_save == EINVAL) {
1500 if (!warning_message_printed) {
1501 warning_message_printed = true;
1502 # ifdef UNIV_LINUX
1503 ib_logf(IB_LOG_LEVEL_WARN,
1504 "Failed to set O_DIRECT on file "
1505 "%s: %s: %s, continuing anyway. "
1506 "O_DIRECT is known to result "
1507 "in 'Invalid argument' on Linux on "
1508 "tmpfs, see MySQL Bug#26662.",
1509 file_name, operation_name,
1510 strerror(errno_save));
1511 # else /* UNIV_LINUX */
1512 goto short_warning;
1513 # endif /* UNIV_LINUX */
1514 }
1515 } else {
1516 # ifndef UNIV_LINUX
1517 short_warning:
1518 # endif
1519 ib_logf(IB_LOG_LEVEL_WARN,
1520 "Failed to set O_DIRECT on file %s: %s: %s, "
1521 "continuing anyway.",
1522 file_name, operation_name, strerror(errno_save));
1523 }
1524 }
1525 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
1526 }
1527
/****************************************************************//**
NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new one.

The OS_FILE_ON_ERROR_NO_EXIT and OS_FILE_ON_ERROR_SILENT bits are stripped
from create_mode before it is interpreted; they select which error handler
is called when the open fails (os_file_handle_error_no_exit() or
os_file_handle_error()). The open is repeated for as long as that handler
asks for a retry. When srv_read_only_mode is set the file is opened
without write access, and every mode outside the OS_FILE_OPEN* family
degrades to opening an existing file.
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
pfs_os_file_t
os_file_create_func(
/*================*/
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	ulint		create_mode,/*!< in: create mode, optionally OR-ed
				with OS_FILE_ON_ERROR_NO_EXIT and/or
				OS_FILE_ON_ERROR_SILENT */
	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
				non-buffered i/o is desired,
				OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use
				async i/o or unbuffered i/o: look in the
				function source code for the exact rules */
	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
{
	pfs_os_file_t	file;
	ibool		retry;
	ibool		on_error_no_exit;
	ibool		on_error_silent;

	/* Debug hook keyed on "ib_create_table_fail_disk_full": the
	statements below make the call fail with a disk-full error code
	without touching the file system. NOTE(review): presumably only
	active in debug builds with that keyword enabled - confirm against
	the DBUG_EXECUTE_IF definition. */
#ifdef __WIN__
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = FALSE;
		SetLastError(ERROR_DISK_FULL);
		file.m_file = (os_file_t)-1;
		return(file);
	);
#else /* __WIN__ */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = FALSE;
		errno = ENOSPC;
		file.m_file = -1;
		return(file);
	);
#endif /* __WIN__ */

#ifdef __WIN__
	DWORD		create_flag;
	DWORD		share_mode = FILE_SHARE_READ;

	/* Remember the error-handling modifiers, then strip them so that
	create_mode can be compared against the plain mode constants. */
	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? TRUE : FALSE;

	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? TRUE : FALSE;

	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
	create_mode &= ~OS_FILE_ON_ERROR_SILENT;

	if (create_mode == OS_FILE_OPEN_RAW) {

		ut_a(!srv_read_only_mode);

		create_flag = OPEN_EXISTING;

		/* On Windows Physical devices require admin privileges and
		have to have the write-share mode set. See the remarks
		section for the CreateFile() function documentation in MSDN. */

		share_mode |= FILE_SHARE_WRITE;

	} else if (create_mode == OS_FILE_OPEN
		   || create_mode == OS_FILE_OPEN_RETRY) {

		create_flag = OPEN_EXISTING;

	} else if (srv_read_only_mode) {

		/* Nothing is created in read-only mode. */
		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		create_flag = CREATE_ALWAYS;

	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file create mode (%lu) for file '%s'",
			create_mode, name);

		file.m_file = (os_file_t)-1;
		return(file);
	}

	DWORD		attributes = 0;

#ifdef UNIV_HOTBACKUP
	attributes |= FILE_FLAG_NO_BUFFERING;
#else
	if (purpose == OS_FILE_AIO) {

#ifdef WIN_ASYNC_IO
		/* If specified, use asynchronous (overlapped) io and no
		buffering of writes in the OS */

		if (srv_use_native_aio) {
			attributes |= FILE_FLAG_OVERLAPPED;
		}
#endif /* WIN_ASYNC_IO */

	} else if (purpose == OS_FILE_NORMAL) {
		/* Use default setting. */
	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown purpose flag (%lu) while opening file '%s'",
			purpose, name);
		file.m_file = (os_file_t)-1;
		return(file);
	}

#ifdef UNIV_NON_BUFFERED_IO
	// TODO: Create a bug, this looks wrong. The flush log
	// parameter is dynamic.
	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {

		/* Do not use unbuffered i/o for the log files because
		value 2 denotes that we do not flush the log at every
		commit, but only once per second */

	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {

		attributes |= FILE_FLAG_NO_BUFFERING;
	}
#endif /* UNIV_NON_BUFFERED_IO */

#endif /* UNIV_HOTBACKUP */
	DWORD	access = GENERIC_READ;

	if (!srv_read_only_mode) {
		access |= GENERIC_WRITE;
	}

	/* Keep trying for as long as the error handler asks for a retry. */
	do {
		/* Use default security attributes and no template file. */
		file.m_file = CreateFile(
			(LPCTSTR) name, access, share_mode, NULL,
			create_flag, attributes, NULL);

		if (file.m_file == INVALID_HANDLE_VALUE) {
			const char*	operation;

			/* Label handed to the error handler. */
			operation = (create_mode == OS_FILE_CREATE
				     && !srv_read_only_mode)
				? "create" : "open";

			*success = FALSE;

			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {
			*success = TRUE;
			retry = FALSE;
		}

	} while (retry);

#else /* __WIN__ */
	int		create_flag;
	const char*	mode_str	= NULL;

	/* Remember the error-handling modifiers, then strip them so that
	create_mode can be compared against the plain mode constants. */
	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? TRUE : FALSE;
	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? TRUE : FALSE;

	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
	create_mode &= ~OS_FILE_ON_ERROR_SILENT;

	/* mode_str is only used as a label in the diagnostic printed by
	os_file_set_nocache(). */
	if (create_mode == OS_FILE_OPEN
	    || create_mode == OS_FILE_OPEN_RAW
	    || create_mode == OS_FILE_OPEN_RETRY) {

		mode_str = "OPEN";

		create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;

	} else if (srv_read_only_mode) {

		/* Nothing is created in read-only mode. */
		mode_str = "OPEN";

		create_flag = O_RDONLY;

	} else if (create_mode == OS_FILE_CREATE) {

		mode_str = "CREATE";
		create_flag = O_RDWR | O_CREAT | O_EXCL;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		mode_str = "OVERWRITE";
		create_flag = O_RDWR | O_CREAT | O_TRUNC;

	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file create mode (%lu) for file '%s'",
			create_mode, name);

		file.m_file = -1;
		return(file);
	}

	ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);

#ifdef O_SYNC
	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
	O_SYNC because the datasync options seemed to corrupt files in 2001
	in both Linux and Solaris */

	if (!srv_read_only_mode
	    && type == OS_LOG_FILE
	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {

		create_flag |= O_SYNC;
	}
#endif /* O_SYNC */

	/* Keep trying for as long as the error handler asks for a retry. */
	do {
		file.m_file = ::open(name, create_flag, os_innodb_umask);

		if (file.m_file == -1) {
			const char*	operation;

			/* Label handed to the error handler. */
			operation = (create_mode == OS_FILE_CREATE
				     && !srv_read_only_mode)
				? "create" : "open";

			*success = FALSE;

			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {
			*success = TRUE;
			retry = false;
		}

	} while (retry);

	/* We disable OS caching (O_DIRECT) only on data files */

	if (!srv_read_only_mode
	    && *success
	    && type != OS_LOG_FILE
	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {

		os_file_set_nocache(file.m_file, name, mode_str);
	}

#ifdef USE_FILE_LOCK
	/* Every successfully opened file except a raw device is locked,
	unless the server is read-only. A non-zero return from
	os_file_lock() is treated as failure to lock. */
	if (!srv_read_only_mode
	    && *success
	    && create_mode != OS_FILE_OPEN_RAW
	    && os_file_lock(file.m_file, name)) {

		if (create_mode == OS_FILE_OPEN_RETRY) {

			ut_a(!srv_read_only_mode);

			ib_logf(IB_LOG_LEVEL_INFO,
				"Retrying to lock the first data file");

			/* Up to 100 further attempts, with a
			os_thread_sleep(1000000) pause before each. */
			for (int i = 0; i < 100; i++) {
				os_thread_sleep(1000000);

				if (!os_file_lock(file.m_file, name)) {
					*success = TRUE;
					return(file);
				}
			}

			ib_logf(IB_LOG_LEVEL_INFO,
				"Unable to open the first data file");
		}

		/* The lock could not be taken: give the descriptor back
		and report the open as failed. */
		*success = FALSE;
		close(file.m_file);
		file.m_file = -1;
	}
#endif /* USE_FILE_LOCK */

#endif /* __WIN__ */

	return(file);
}
1832
1833 /***********************************************************************//**
1834 Deletes a file if it exists. The file has to be closed before calling this.
1835 @return TRUE if success */
1836 UNIV_INTERN
1837 bool
os_file_delete_if_exists_func(const char * name)1838 os_file_delete_if_exists_func(
1839 /*==========================*/
1840 const char* name) /*!< in: file path as a null-terminated
1841 string */
1842 {
1843 #ifdef __WIN__
1844 bool ret;
1845 ulint count = 0;
1846 loop:
1847 /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
1848 it */
1849
1850 ret = DeleteFile((LPCTSTR) name);
1851
1852 if (ret) {
1853 return(true);
1854 }
1855
1856 DWORD lasterr = GetLastError();
1857 if (lasterr == ERROR_FILE_NOT_FOUND
1858 || lasterr == ERROR_PATH_NOT_FOUND) {
1859 /* the file does not exist, this not an error */
1860
1861 return(true);
1862 }
1863
1864 count++;
1865
1866 if (count > 100 && 0 == (count % 10)) {
1867 os_file_get_last_error(true); /* print error information */
1868
1869 ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name);
1870 }
1871
1872 os_thread_sleep(500000); /* sleep for 0.5 second */
1873
1874 if (count > 2000) {
1875
1876 return(false);
1877 }
1878
1879 goto loop;
1880 #else
1881 int ret;
1882
1883 ret = unlink(name);
1884
1885 if (ret != 0 && errno != ENOENT) {
1886 os_file_handle_error_no_exit(name, "delete", FALSE);
1887
1888 return(false);
1889 }
1890
1891 return(true);
1892 #endif /* __WIN__ */
1893 }
1894
1895 /***********************************************************************//**
1896 Deletes a file. The file has to be closed before calling this.
1897 @return TRUE if success */
1898 UNIV_INTERN
1899 bool
os_file_delete_func(const char * name)1900 os_file_delete_func(
1901 /*================*/
1902 const char* name) /*!< in: file path as a null-terminated
1903 string */
1904 {
1905 #ifdef __WIN__
1906 BOOL ret;
1907 ulint count = 0;
1908 loop:
1909 /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
1910 it */
1911
1912 ret = DeleteFile((LPCTSTR) name);
1913
1914 if (ret) {
1915 return(true);
1916 }
1917
1918 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1919 /* If the file does not exist, we classify this as a 'mild'
1920 error and return */
1921
1922 return(false);
1923 }
1924
1925 count++;
1926
1927 if (count > 100 && 0 == (count % 10)) {
1928 os_file_get_last_error(true); /* print error information */
1929
1930 fprintf(stderr,
1931 "InnoDB: Warning: cannot delete file %s\n"
1932 "InnoDB: Are you running mysqlbackup"
1933 " to back up the file?\n", name);
1934 }
1935
1936 os_thread_sleep(1000000); /* sleep for a second */
1937
1938 if (count > 2000) {
1939
1940 return(false);
1941 }
1942
1943 goto loop;
1944 #else
1945 int ret;
1946
1947 ret = unlink(name);
1948
1949 if (ret != 0) {
1950 os_file_handle_error_no_exit(name, "delete", FALSE);
1951
1952 return(false);
1953 }
1954
1955 return(true);
1956 #endif
1957 }
1958
1959 /***********************************************************************//**
1960 NOTE! Use the corresponding macro os_file_rename(), not directly this function!
1961 Renames a file (can also move it to another directory). It is safest that the
1962 file is closed before calling this function.
1963 @return TRUE if success */
1964 UNIV_INTERN
1965 ibool
os_file_rename_func(const char * oldpath,const char * newpath)1966 os_file_rename_func(
1967 /*================*/
1968 const char* oldpath,/*!< in: old file path as a null-terminated
1969 string */
1970 const char* newpath)/*!< in: new file path */
1971 {
1972 #ifdef UNIV_DEBUG
1973 os_file_type_t type;
1974 ibool exists;
1975
1976 /* New path must not exist. */
1977 ut_ad(os_file_status(newpath, &exists, &type));
1978 ut_ad(!exists);
1979
1980 /* Old path must exist. */
1981 ut_ad(os_file_status(oldpath, &exists, &type));
1982 ut_ad(exists);
1983 #endif /* UNIV_DEBUG */
1984
1985 #ifdef __WIN__
1986 BOOL ret;
1987
1988 ret = MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath);
1989
1990 if (ret) {
1991 return(TRUE);
1992 }
1993
1994 os_file_handle_error_no_exit(oldpath, "rename", FALSE);
1995
1996 return(FALSE);
1997 #else
1998 int ret;
1999
2000 ret = rename(oldpath, newpath);
2001
2002 if (ret != 0) {
2003 os_file_handle_error_no_exit(oldpath, "rename", FALSE);
2004
2005 return(FALSE);
2006 }
2007
2008 return(TRUE);
2009 #endif /* __WIN__ */
2010 }
2011
2012 /***********************************************************************//**
2013 NOTE! Use the corresponding macro os_file_close(), not directly this function!
2014 Closes a file handle. In case of error, error number can be retrieved with
2015 os_file_get_last_error.
2016 @return TRUE if success */
2017 UNIV_INTERN
2018 ibool
os_file_close_func(os_file_t file)2019 os_file_close_func(
2020 /*===============*/
2021 os_file_t file) /*!< in, own: handle to a file */
2022 {
2023 #ifdef __WIN__
2024 BOOL ret;
2025
2026 ut_a(file);
2027
2028 ret = CloseHandle(file);
2029
2030 if (ret) {
2031 return(TRUE);
2032 }
2033
2034 os_file_handle_error(NULL, "close");
2035
2036 return(FALSE);
2037 #else
2038 int ret;
2039
2040 ret = close(file);
2041
2042 if (ret == -1) {
2043 os_file_handle_error(NULL, "close");
2044
2045 return(FALSE);
2046 }
2047
2048 return(TRUE);
2049 #endif /* __WIN__ */
2050 }
2051
#ifdef UNIV_HOTBACKUP
/***********************************************************************//**
Closes a file handle. Unlike os_file_close_func(), a failure is only
reflected in the return value; no error handler is called.
@return TRUE if success */
UNIV_INTERN
ibool
os_file_close_no_error_handling(
/*============================*/
	os_file_t	file)	/*!< in, own: handle to a file */
{
#ifdef __WIN__
	ut_a(file);

	const bool	closed = (CloseHandle(file) != 0);
#else
	const bool	closed = (close(file) != -1);
#endif /* __WIN__ */

	return(closed ? TRUE : FALSE);
}
#endif /* UNIV_HOTBACKUP */
2088
2089 /***********************************************************************//**
2090 Gets a file size.
2091 @return file size, or (os_offset_t) -1 on failure */
2092 UNIV_INTERN
2093 os_offset_t
os_file_get_size(pfs_os_file_t file)2094 os_file_get_size(
2095 /*=============*/
2096 pfs_os_file_t file) /*!< in: handle to a file */
2097 {
2098 #ifdef __WIN__
2099 os_offset_t offset;
2100 DWORD high;
2101 DWORD low;
2102
2103 low = GetFileSize(file.m_file, &high);
2104
2105 if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
2106 return((os_offset_t) -1);
2107 }
2108
2109 offset = (os_offset_t) low | ((os_offset_t) high << 32);
2110
2111 return(offset);
2112 #else
2113 return((os_offset_t) lseek(file.m_file, 0, SEEK_END));
2114
2115 #endif /* __WIN__ */
2116 }
2117
2118 /***********************************************************************//**
2119 Write the specified number of zeros to a newly created file.
2120 @return TRUE if success */
2121 UNIV_INTERN
2122 ibool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size)2123 os_file_set_size(
2124 /*=============*/
2125 const char* name, /*!< in: name of the file or path as a
2126 null-terminated string */
2127 pfs_os_file_t file, /*!< in: handle to a file */
2128 os_offset_t size) /*!< in: file size */
2129 {
2130 os_offset_t current_size;
2131 ibool ret;
2132 byte* buf;
2133 byte* buf2;
2134 ulint buf_size;
2135
2136 current_size = 0;
2137
2138 /* Write up to 1 megabyte at a time. */
2139 buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
2140 * UNIV_PAGE_SIZE;
2141 buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
2142
2143 /* Align the buffer for possible raw i/o */
2144 buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
2145
2146 /* Write buffer full of zeros */
2147 memset(buf, 0, buf_size);
2148
2149 if (size >= (os_offset_t) 100 << 20) {
2150
2151 fprintf(stderr, "InnoDB: Progress in MB:");
2152 }
2153
2154 while (current_size < size) {
2155 ulint n_bytes;
2156
2157 if (size - current_size < (os_offset_t) buf_size) {
2158 n_bytes = (ulint) (size - current_size);
2159 } else {
2160 n_bytes = buf_size;
2161 }
2162
2163 ret = os_file_write(name, file, buf, current_size, n_bytes);
2164 if (!ret) {
2165 ut_free(buf2);
2166 goto error_handling;
2167 }
2168
2169 /* Print about progress for each 100 MB written */
2170 if ((current_size + n_bytes) / (100 << 20)
2171 != current_size / (100 << 20)) {
2172
2173 fprintf(stderr, " %lu00",
2174 (ulong) ((current_size + n_bytes)
2175 / (100 << 20)));
2176 }
2177
2178 current_size += n_bytes;
2179 }
2180
2181 if (size >= (os_offset_t) 100 << 20) {
2182
2183 fprintf(stderr, "\n");
2184 }
2185
2186 ut_free(buf2);
2187
2188 ret = os_file_flush(file);
2189
2190 if (ret) {
2191 return(TRUE);
2192 }
2193
2194 error_handling:
2195 return(FALSE);
2196 }
2197
2198 /***********************************************************************//**
2199 Truncates a file at its current position.
2200 @return TRUE if success */
2201 UNIV_INTERN
2202 ibool
os_file_set_eof(FILE * file)2203 os_file_set_eof(
2204 /*============*/
2205 FILE* file) /*!< in: file to be truncated */
2206 {
2207 #ifdef __WIN__
2208 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
2209 return(SetEndOfFile(h));
2210 #else /* __WIN__ */
2211 return(!ftruncate(fileno(file), ftell(file)));
2212 #endif /* __WIN__ */
2213 }
2214
2215 #ifndef __WIN__
2216 /***********************************************************************//**
2217 Wrapper to fsync(2) that retries the call on some errors.
2218 Returns the value 0 if successful; otherwise the value -1 is returned and
2219 the global variable errno is set to indicate the error.
2220 @return 0 if success, -1 otherwise */
2221
2222 static
2223 int
os_file_fsync(os_file_t file)2224 os_file_fsync(
2225 /*==========*/
2226 os_file_t file) /*!< in: handle to a file */
2227 {
2228 int ret;
2229 int failures;
2230 ibool retry;
2231
2232 failures = 0;
2233
2234 do {
2235 ret = fsync(file);
2236
2237 os_n_fsyncs++;
2238
2239 if (ret == -1 && errno == ENOLCK) {
2240
2241 if (failures % 100 == 0) {
2242
2243 ut_print_timestamp(stderr);
2244 fprintf(stderr,
2245 " InnoDB: fsync(): "
2246 "No locks available; retrying\n");
2247 }
2248
2249 os_thread_sleep(200000 /* 0.2 sec */);
2250
2251 failures++;
2252
2253 retry = TRUE;
2254 } else {
2255
2256 retry = FALSE;
2257 }
2258 } while (retry);
2259
2260 return(ret);
2261 }
2262 #endif /* !__WIN__ */
2263
2264 /***********************************************************************//**
2265 NOTE! Use the corresponding macro os_file_flush(), not directly this function!
2266 Flushes the write buffers of a given file to the disk.
2267 @return TRUE if success */
2268 UNIV_INTERN
2269 ibool
os_file_flush_func(os_file_t file)2270 os_file_flush_func(
2271 /*===============*/
2272 os_file_t file) /*!< in, own: handle to a file */
2273 {
2274 #ifdef __WIN__
2275 BOOL ret;
2276
2277 ut_a(file);
2278
2279 os_n_fsyncs++;
2280
2281 ret = FlushFileBuffers(file);
2282
2283 if (ret) {
2284 return(TRUE);
2285 }
2286
2287 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
2288 actually a raw device, we choose to ignore that error if we are using
2289 raw disks */
2290
2291 if (srv_start_raw_disk_in_use && GetLastError()
2292 == ERROR_INVALID_FUNCTION) {
2293 return(TRUE);
2294 }
2295
2296 os_file_handle_error(NULL, "flush");
2297
2298 /* It is a fatal error if a file flush does not succeed, because then
2299 the database can get corrupt on disk */
2300 ut_error;
2301
2302 return(FALSE);
2303 #else
2304 int ret;
2305
2306 #if defined(HAVE_DARWIN_THREADS)
2307 # ifndef F_FULLFSYNC
2308 /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2309 # define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2310 # elif F_FULLFSYNC != 51
2311 # error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2312 # endif
2313 /* Apple has disabled fsync() for internal disk drives in OS X. That
2314 caused corruption for a user when he tested a power outage. Let us in
2315 OS X use a nonstandard flush method recommended by an Apple
2316 engineer. */
2317
2318 if (!srv_have_fullfsync) {
2319 /* If we are not on an operating system that supports this,
2320 then fall back to a plain fsync. */
2321
2322 ret = os_file_fsync(file);
2323 } else {
2324 ret = fcntl(file, F_FULLFSYNC, NULL);
2325
2326 if (ret) {
2327 /* If we are not on a file system that supports this,
2328 then fall back to a plain fsync. */
2329 ret = os_file_fsync(file);
2330 }
2331 }
2332 #else
2333 ret = os_file_fsync(file);
2334 #endif
2335
2336 if (ret == 0) {
2337 return(TRUE);
2338 }
2339
2340 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2341 we choose to ignore that error if we are using raw disks */
2342
2343 if (srv_start_raw_disk_in_use && errno == EINVAL) {
2344
2345 return(TRUE);
2346 }
2347
2348 ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
2349
2350 os_file_handle_error(NULL, "flush");
2351
2352 /* It is a fatal error if a file flush does not succeed, because then
2353 the database can get corrupt on disk */
2354 ut_error;
2355
2356 return(FALSE);
2357 #endif
2358 }
2359
2360 #ifndef __WIN__
2361 /*******************************************************************//**
2362 Does a synchronous read operation in Posix.
2363 @return number of bytes read, -1 if error */
2364 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2365 ssize_t
os_file_pread(os_file_t file,void * buf,ulint n,os_offset_t offset)2366 os_file_pread(
2367 /*==========*/
2368 os_file_t file, /*!< in: handle to a file */
2369 void* buf, /*!< in: buffer where to read */
2370 ulint n, /*!< in: number of bytes to read */
2371 os_offset_t offset) /*!< in: file offset from where to read */
2372 {
2373 off_t offs;
2374 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2375 ssize_t n_bytes;
2376 #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2377
2378 ut_ad(n);
2379
2380 /* If off_t is > 4 bytes in size, then we assume we can pass a
2381 64-bit address */
2382 offs = (off_t) offset;
2383
2384 if (sizeof(off_t) <= 4) {
2385 if (offset != (os_offset_t) offs) {
2386 ib_logf(IB_LOG_LEVEL_ERROR,
2387 "File read at offset > 4 GB");
2388 }
2389 }
2390
2391 os_n_file_reads++;
2392
2393 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2394 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2395 (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2396 (void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1);
2397 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2398 #else
2399 os_mutex_enter(os_file_count_mutex);
2400 os_file_n_pending_preads++;
2401 os_n_pending_reads++;
2402 MONITOR_INC(MONITOR_OS_PENDING_READS);
2403 os_mutex_exit(os_file_count_mutex);
2404 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2405
2406 n_bytes = pread(file, buf, n, offs);
2407
2408 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2409 (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2410 (void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1);
2411 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
2412 #else
2413 os_mutex_enter(os_file_count_mutex);
2414 os_file_n_pending_preads--;
2415 os_n_pending_reads--;
2416 MONITOR_DEC(MONITOR_OS_PENDING_READS);
2417 os_mutex_exit(os_file_count_mutex);
2418 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD == 8 */
2419
2420 return(n_bytes);
2421 #else
2422 {
2423 off_t ret_offset;
2424 ssize_t ret;
2425 #ifndef UNIV_HOTBACKUP
2426 ulint i;
2427 #endif /* !UNIV_HOTBACKUP */
2428
2429 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2430 (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2431 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2432 #else
2433 os_mutex_enter(os_file_count_mutex);
2434 os_n_pending_reads++;
2435 MONITOR_INC(MONITOR_OS_PENDING_READS);
2436 os_mutex_exit(os_file_count_mutex);
2437 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2438 #ifndef UNIV_HOTBACKUP
2439 /* Protect the seek / read operation with a mutex */
2440 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2441
2442 os_mutex_enter(os_file_seek_mutexes[i]);
2443 #endif /* !UNIV_HOTBACKUP */
2444
2445 ret_offset = lseek(file, offs, SEEK_SET);
2446
2447 if (ret_offset < 0) {
2448 ret = -1;
2449 } else {
2450 ret = read(file, buf, (ssize_t) n);
2451 }
2452
2453 #ifndef UNIV_HOTBACKUP
2454 os_mutex_exit(os_file_seek_mutexes[i]);
2455 #endif /* !UNIV_HOTBACKUP */
2456
2457 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2458 (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2459 MONITOR_ATOIC_DEC(MONITOR_OS_PENDING_READS);
2460 #else
2461 os_mutex_enter(os_file_count_mutex);
2462 os_n_pending_reads--;
2463 MONITOR_DEC(MONITOR_OS_PENDING_READS);
2464 os_mutex_exit(os_file_count_mutex);
2465 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
2466
2467 return(ret);
2468 }
2469 #endif
2470 }
2471
2472 /*******************************************************************//**
2473 Does a synchronous write operation in Posix.
2474 @return number of bytes written, -1 if error */
2475 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2476 ssize_t
os_file_pwrite(os_file_t file,const void * buf,ulint n,os_offset_t offset)2477 os_file_pwrite(
2478 /*===========*/
2479 os_file_t file, /*!< in: handle to a file */
2480 const void* buf, /*!< in: buffer from where to write */
2481 ulint n, /*!< in: number of bytes to write */
2482 os_offset_t offset) /*!< in: file offset where to write */
2483 {
2484 ssize_t ret;
2485 off_t offs;
2486
2487 ut_ad(n);
2488 ut_ad(!srv_read_only_mode);
2489
2490 /* If off_t is > 4 bytes in size, then we assume we can pass a
2491 64-bit address */
2492 offs = (off_t) offset;
2493
2494 if (sizeof(off_t) <= 4) {
2495 if (offset != (os_offset_t) offs) {
2496 ib_logf(IB_LOG_LEVEL_ERROR,
2497 "File write at offset > 4 GB.");
2498 }
2499 }
2500
2501 os_n_file_writes++;
2502
2503 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2504 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
2505 os_mutex_enter(os_file_count_mutex);
2506 os_file_n_pending_pwrites++;
2507 os_n_pending_writes++;
2508 MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2509 os_mutex_exit(os_file_count_mutex);
2510 #else
2511 (void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
2512 (void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1);
2513 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
2514 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */
2515
2516 ret = pwrite(file, buf, (ssize_t) n, offs);
2517
2518 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
2519 os_mutex_enter(os_file_count_mutex);
2520 os_file_n_pending_pwrites--;
2521 os_n_pending_writes--;
2522 MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2523 os_mutex_exit(os_file_count_mutex);
2524 #else
2525 (void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
2526 (void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1);
2527 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
2528 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */
2529
2530 return(ret);
2531 #else
2532 {
2533 off_t ret_offset;
2534 # ifndef UNIV_HOTBACKUP
2535 ulint i;
2536 # endif /* !UNIV_HOTBACKUP */
2537
2538 os_mutex_enter(os_file_count_mutex);
2539 os_n_pending_writes++;
2540 MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2541 os_mutex_exit(os_file_count_mutex);
2542
2543 # ifndef UNIV_HOTBACKUP
2544 /* Protect the seek / write operation with a mutex */
2545 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2546
2547 os_mutex_enter(os_file_seek_mutexes[i]);
2548 # endif /* UNIV_HOTBACKUP */
2549
2550 ret_offset = lseek(file, offs, SEEK_SET);
2551
2552 if (ret_offset < 0) {
2553 ret = -1;
2554
2555 goto func_exit;
2556 }
2557
2558 ret = write(file, buf, (ssize_t) n);
2559
2560 func_exit:
2561 # ifndef UNIV_HOTBACKUP
2562 os_mutex_exit(os_file_seek_mutexes[i]);
2563 # endif /* !UNIV_HOTBACKUP */
2564
2565 os_mutex_enter(os_file_count_mutex);
2566 os_n_pending_writes--;
2567 MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2568 os_mutex_exit(os_file_count_mutex);
2569
2570 return(ret);
2571 }
2572 #endif /* !UNIV_HOTBACKUP */
2573 }
2574 #endif
2575
2576 /*******************************************************************//**
2577 NOTE! Use the corresponding macro os_file_read(), not directly this
2578 function!
2579 Requests a synchronous positioned read operation.
2580 @return TRUE if request was successful, FALSE if fail */
2581 UNIV_INTERN
2582 ibool
os_file_read_func(os_file_t file,void * buf,os_offset_t offset,ulint n)2583 os_file_read_func(
2584 /*==============*/
2585 os_file_t file, /*!< in: handle to a file */
2586 void* buf, /*!< in: buffer where to read */
2587 os_offset_t offset, /*!< in: file offset where to read */
2588 ulint n) /*!< in: number of bytes to read */
2589 {
2590 #ifdef __WIN__
2591 BOOL ret;
2592 DWORD len;
2593 DWORD ret2;
2594 DWORD low;
2595 DWORD high;
2596 ibool retry;
2597 #ifndef UNIV_HOTBACKUP
2598 ulint i;
2599 #endif /* !UNIV_HOTBACKUP */
2600
2601 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2602 no more than 32 bits. */
2603 ut_a((n & 0xFFFFFFFFUL) == n);
2604
2605 os_n_file_reads++;
2606 os_bytes_read_since_printout += n;
2607
2608 try_again:
2609 ut_ad(file);
2610 ut_ad(buf);
2611 ut_ad(n > 0);
2612
2613 low = (DWORD) offset & 0xFFFFFFFF;
2614 high = (DWORD) (offset >> 32);
2615
2616 os_mutex_enter(os_file_count_mutex);
2617 os_n_pending_reads++;
2618 MONITOR_INC(MONITOR_OS_PENDING_READS);
2619 os_mutex_exit(os_file_count_mutex);
2620
2621 #ifndef UNIV_HOTBACKUP
2622 /* Protect the seek / read operation with a mutex */
2623 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2624
2625 os_mutex_enter(os_file_seek_mutexes[i]);
2626 #endif /* !UNIV_HOTBACKUP */
2627
2628 ret2 = SetFilePointer(
2629 file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2630
2631 if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2632
2633 #ifndef UNIV_HOTBACKUP
2634 os_mutex_exit(os_file_seek_mutexes[i]);
2635 #endif /* !UNIV_HOTBACKUP */
2636
2637 os_mutex_enter(os_file_count_mutex);
2638 os_n_pending_reads--;
2639 MONITOR_DEC(MONITOR_OS_PENDING_READS);
2640 os_mutex_exit(os_file_count_mutex);
2641
2642 goto error_handling;
2643 }
2644
2645 ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2646
2647 #ifndef UNIV_HOTBACKUP
2648 os_mutex_exit(os_file_seek_mutexes[i]);
2649 #endif /* !UNIV_HOTBACKUP */
2650
2651 os_mutex_enter(os_file_count_mutex);
2652 os_n_pending_reads--;
2653 MONITOR_DEC(MONITOR_OS_PENDING_READS);
2654 os_mutex_exit(os_file_count_mutex);
2655
2656 if (ret && len == n) {
2657 return(TRUE);
2658 }
2659 #else /* __WIN__ */
2660 ibool retry;
2661 ssize_t ret;
2662
2663 os_bytes_read_since_printout += n;
2664
2665 try_again:
2666 ret = os_file_pread(file, buf, n, offset);
2667
2668 if ((ulint) ret == n) {
2669 return(TRUE);
2670 } else if (ret == -1) {
2671 ib_logf(IB_LOG_LEVEL_ERROR,
2672 "Error in system call pread(). The operating"
2673 " system error number is %lu.",(ulint) errno);
2674 } else {
2675 /* Partial read occured */
2676 ib_logf(IB_LOG_LEVEL_ERROR,
2677 "Tried to read " ULINTPF " bytes at offset "
2678 UINT64PF ". Was only able to read %ld.",
2679 n, offset, (lint) ret);
2680 }
2681 #endif /* __WIN__ */
2682 #ifdef __WIN__
2683 error_handling:
2684 #endif
2685 retry = os_file_handle_error(NULL, "read");
2686
2687 if (retry) {
2688 goto try_again;
2689 }
2690
2691 fprintf(stderr,
2692 "InnoDB: Fatal error: cannot read from file."
2693 " OS error number %lu.\n",
2694 #ifdef __WIN__
2695 (ulong) GetLastError()
2696 #else
2697 (ulong) errno
2698 #endif /* __WIN__ */
2699 );
2700 fflush(stderr);
2701
2702 ut_error;
2703
2704 return(FALSE);
2705 }
2706
2707 /*******************************************************************//**
2708 NOTE! Use the corresponding macro os_file_read_no_error_handling(),
2709 not directly this function!
2710 Requests a synchronous positioned read operation. This function does not do
2711 any error handling. In case of error it returns FALSE.
2712 @return TRUE if request was successful, FALSE if fail */
2713 UNIV_INTERN
2714 ibool
os_file_read_no_error_handling_func(os_file_t file,void * buf,os_offset_t offset,ulint n)2715 os_file_read_no_error_handling_func(
2716 /*================================*/
2717 os_file_t file, /*!< in: handle to a file */
2718 void* buf, /*!< in: buffer where to read */
2719 os_offset_t offset, /*!< in: file offset where to read */
2720 ulint n) /*!< in: number of bytes to read */
2721 {
2722 #ifdef __WIN__
2723 BOOL ret;
2724 DWORD len;
2725 DWORD ret2;
2726 DWORD low;
2727 DWORD high;
2728 ibool retry;
2729 #ifndef UNIV_HOTBACKUP
2730 ulint i;
2731 #endif /* !UNIV_HOTBACKUP */
2732
2733 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2734 no more than 32 bits. */
2735 ut_a((n & 0xFFFFFFFFUL) == n);
2736
2737 os_n_file_reads++;
2738 os_bytes_read_since_printout += n;
2739
2740 try_again:
2741 ut_ad(file);
2742 ut_ad(buf);
2743 ut_ad(n > 0);
2744
2745 low = (DWORD) offset & 0xFFFFFFFF;
2746 high = (DWORD) (offset >> 32);
2747
2748 os_mutex_enter(os_file_count_mutex);
2749 os_n_pending_reads++;
2750 MONITOR_INC(MONITOR_OS_PENDING_READS);
2751 os_mutex_exit(os_file_count_mutex);
2752
2753 #ifndef UNIV_HOTBACKUP
2754 /* Protect the seek / read operation with a mutex */
2755 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2756
2757 os_mutex_enter(os_file_seek_mutexes[i]);
2758 #endif /* !UNIV_HOTBACKUP */
2759
2760 ret2 = SetFilePointer(
2761 file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2762
2763 if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2764
2765 #ifndef UNIV_HOTBACKUP
2766 os_mutex_exit(os_file_seek_mutexes[i]);
2767 #endif /* !UNIV_HOTBACKUP */
2768
2769 os_mutex_enter(os_file_count_mutex);
2770 os_n_pending_reads--;
2771 MONITOR_DEC(MONITOR_OS_PENDING_READS);
2772 os_mutex_exit(os_file_count_mutex);
2773
2774 goto error_handling;
2775 }
2776
2777 ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2778
2779 #ifndef UNIV_HOTBACKUP
2780 os_mutex_exit(os_file_seek_mutexes[i]);
2781 #endif /* !UNIV_HOTBACKUP */
2782
2783 os_mutex_enter(os_file_count_mutex);
2784 os_n_pending_reads--;
2785 MONITOR_DEC(MONITOR_OS_PENDING_READS);
2786 os_mutex_exit(os_file_count_mutex);
2787
2788 if (ret && len == n) {
2789 return(TRUE);
2790 }
2791 #else /* __WIN__ */
2792 ibool retry;
2793 ssize_t ret;
2794
2795 os_bytes_read_since_printout += n;
2796
2797 try_again:
2798 ret = os_file_pread(file, buf, n, offset);
2799
2800 if ((ulint) ret == n) {
2801 return(TRUE);
2802 } else if (ret == -1) {
2803 ib_logf(IB_LOG_LEVEL_ERROR,
2804 "Error in system call pread(). The operating"
2805 " system error number is %lu.",(ulint) errno);
2806 } else {
2807 /* Partial read occured */
2808 ib_logf(IB_LOG_LEVEL_ERROR,
2809 "Tried to read " ULINTPF " bytes at offset "
2810 UINT64PF ". Was only able to read %ld.",
2811 n, offset, (lint) ret);
2812 }
2813 #endif /* __WIN__ */
2814 #ifdef __WIN__
2815 error_handling:
2816 #endif
2817 retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
2818
2819 if (retry) {
2820 goto try_again;
2821 }
2822
2823 return(FALSE);
2824 }
2825
2826 /*******************************************************************//**
2827 Rewind file to its start, read at most size - 1 bytes from it to str, and
2828 NUL-terminate str. All errors are silently ignored. This function is
2829 mostly meant to be used with temporary files. */
2830 UNIV_INTERN
2831 void
os_file_read_string(FILE * file,char * str,ulint size)2832 os_file_read_string(
2833 /*================*/
2834 FILE* file, /*!< in: file to read from */
2835 char* str, /*!< in: buffer where to read */
2836 ulint size) /*!< in: size of buffer */
2837 {
2838 size_t flen;
2839
2840 if (size == 0) {
2841 return;
2842 }
2843
2844 rewind(file);
2845 flen = fread(str, 1, size - 1, file);
2846 str[flen] = '\0';
2847 }
2848
2849 /*******************************************************************//**
2850 NOTE! Use the corresponding macro os_file_write(), not directly
2851 this function!
2852 Requests a synchronous write operation.
2853 @return TRUE if request was successful, FALSE if fail */
2854 UNIV_INTERN
2855 ibool
os_file_write_func(const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)2856 os_file_write_func(
2857 /*===============*/
2858 const char* name, /*!< in: name of the file or path as a
2859 null-terminated string */
2860 os_file_t file, /*!< in: handle to a file */
2861 const void* buf, /*!< in: buffer from which to write */
2862 os_offset_t offset, /*!< in: file offset where to write */
2863 ulint n) /*!< in: number of bytes to write */
2864 {
2865 ut_ad(!srv_read_only_mode);
2866
2867 #ifdef __WIN__
2868 BOOL ret;
2869 DWORD len;
2870 DWORD ret2;
2871 DWORD low;
2872 DWORD high;
2873 ulint n_retries = 0;
2874 ulint err;
2875 #ifndef UNIV_HOTBACKUP
2876 ulint i;
2877 #endif /* !UNIV_HOTBACKUP */
2878
2879 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2880 no more than 32 bits. */
2881 ut_a((n & 0xFFFFFFFFUL) == n);
2882
2883 os_n_file_writes++;
2884
2885 ut_ad(file);
2886 ut_ad(buf);
2887 ut_ad(n > 0);
2888 retry:
2889 low = (DWORD) offset & 0xFFFFFFFF;
2890 high = (DWORD) (offset >> 32);
2891
2892 os_mutex_enter(os_file_count_mutex);
2893 os_n_pending_writes++;
2894 MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2895 os_mutex_exit(os_file_count_mutex);
2896
2897 #ifndef UNIV_HOTBACKUP
2898 /* Protect the seek / write operation with a mutex */
2899 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2900
2901 os_mutex_enter(os_file_seek_mutexes[i]);
2902 #endif /* !UNIV_HOTBACKUP */
2903
2904 ret2 = SetFilePointer(
2905 file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2906
2907 if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2908
2909 #ifndef UNIV_HOTBACKUP
2910 os_mutex_exit(os_file_seek_mutexes[i]);
2911 #endif /* !UNIV_HOTBACKUP */
2912
2913 os_mutex_enter(os_file_count_mutex);
2914 os_n_pending_writes--;
2915 MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2916 os_mutex_exit(os_file_count_mutex);
2917
2918 ut_print_timestamp(stderr);
2919
2920 fprintf(stderr,
2921 " InnoDB: Error: File pointer positioning to"
2922 " file %s failed at\n"
2923 "InnoDB: offset %llu. Operating system"
2924 " error number %lu.\n"
2925 "InnoDB: Some operating system error numbers"
2926 " are described at\n"
2927 "InnoDB: "
2928 REFMAN "operating-system-error-codes.html\n",
2929 name, offset, (ulong) GetLastError());
2930
2931 return(FALSE);
2932 }
2933
2934 ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2935
2936 #ifndef UNIV_HOTBACKUP
2937 os_mutex_exit(os_file_seek_mutexes[i]);
2938 #endif /* !UNIV_HOTBACKUP */
2939
2940 os_mutex_enter(os_file_count_mutex);
2941 os_n_pending_writes--;
2942 MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2943 os_mutex_exit(os_file_count_mutex);
2944
2945 if (ret && len == n) {
2946
2947 return(TRUE);
2948 }
2949
2950 /* If some background file system backup tool is running, then, at
2951 least in Windows 2000, we may get here a specific error. Let us
2952 retry the operation 100 times, with 1 second waits. */
2953
2954 if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2955
2956 os_thread_sleep(1000000);
2957
2958 n_retries++;
2959
2960 goto retry;
2961 }
2962
2963 if (!os_has_said_disk_full) {
2964
2965 err = (ulint) GetLastError();
2966
2967 ut_print_timestamp(stderr);
2968
2969 fprintf(stderr,
2970 " InnoDB: Error: Write to file %s failed"
2971 " at offset %llu.\n"
2972 "InnoDB: %lu bytes should have been written,"
2973 " only %lu were written.\n"
2974 "InnoDB: Operating system error number %lu.\n"
2975 "InnoDB: Check that your OS and file system"
2976 " support files of this size.\n"
2977 "InnoDB: Check also that the disk is not full"
2978 " or a disk quota exceeded.\n",
2979 name, offset,
2980 (ulong) n, (ulong) len, (ulong) err);
2981
2982 if (strerror((int) err) != NULL) {
2983 fprintf(stderr,
2984 "InnoDB: Error number %lu means '%s'.\n",
2985 (ulong) err, strerror((int) err));
2986 }
2987
2988 fprintf(stderr,
2989 "InnoDB: Some operating system error numbers"
2990 " are described at\n"
2991 "InnoDB: "
2992 REFMAN "operating-system-error-codes.html\n");
2993
2994 os_has_said_disk_full = TRUE;
2995 }
2996
2997 return(FALSE);
2998 #else
2999 ssize_t ret;
3000
3001 ret = os_file_pwrite(file, buf, n, offset);
3002
3003 if ((ulint) ret == n) {
3004
3005 return(TRUE);
3006 }
3007
3008 if (!os_has_said_disk_full) {
3009
3010 ut_print_timestamp(stderr);
3011
3012 if(ret == -1) {
3013 ib_logf(IB_LOG_LEVEL_ERROR,
3014 "Failure of system call pwrite(). Operating"
3015 " system error number is %lu.",
3016 (ulint) errno);
3017 } else {
3018 fprintf(stderr,
3019 " InnoDB: Error: Write to file %s failed"
3020 " at offset " UINT64PF ".\n"
3021 "InnoDB: %lu bytes should have been written,"
3022 " only %ld were written.\n"
3023 "InnoDB: Operating system error number %lu.\n"
3024 "InnoDB: Check that your OS and file system"
3025 " support files of this size.\n"
3026 "InnoDB: Check also that the disk is not full"
3027 " or a disk quota exceeded.\n",
3028 name, offset, n, (lint) ret,
3029 (ulint) errno);
3030 }
3031
3032 if (strerror(errno) != NULL) {
3033 fprintf(stderr,
3034 "InnoDB: Error number %d means '%s'.\n",
3035 errno, strerror(errno));
3036 }
3037
3038 fprintf(stderr,
3039 "InnoDB: Some operating system error numbers"
3040 " are described at\n"
3041 "InnoDB: "
3042 REFMAN "operating-system-error-codes.html\n");
3043
3044 os_has_said_disk_full = TRUE;
3045 }
3046
3047 return(FALSE);
3048 #endif
3049 }
3050
3051 /*******************************************************************//**
3052 Check the existence and type of the given file.
3053 @return TRUE if call succeeded */
3054 UNIV_INTERN
3055 ibool
os_file_status(const char * path,ibool * exists,os_file_type_t * type)3056 os_file_status(
3057 /*===========*/
3058 const char* path, /*!< in: pathname of the file */
3059 ibool* exists, /*!< out: TRUE if file exists */
3060 os_file_type_t* type) /*!< out: type of the file (if it exists) */
3061 {
3062 #ifdef __WIN__
3063 int ret;
3064 struct _stat64 statinfo;
3065
3066 ret = _stat64(path, &statinfo);
3067 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3068 /* file does not exist */
3069 *exists = FALSE;
3070 return(TRUE);
3071 } else if (ret) {
3072 /* file exists, but stat call failed */
3073
3074 os_file_handle_error_no_exit(path, "stat", FALSE);
3075
3076 return(FALSE);
3077 }
3078
3079 if (_S_IFDIR & statinfo.st_mode) {
3080 *type = OS_FILE_TYPE_DIR;
3081 } else if (_S_IFREG & statinfo.st_mode) {
3082 *type = OS_FILE_TYPE_FILE;
3083 } else {
3084 *type = OS_FILE_TYPE_UNKNOWN;
3085 }
3086
3087 *exists = TRUE;
3088
3089 return(TRUE);
3090 #else
3091 int ret;
3092 struct stat statinfo;
3093
3094 ret = stat(path, &statinfo);
3095 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3096 /* file does not exist */
3097 *exists = FALSE;
3098 return(TRUE);
3099 } else if (ret) {
3100 /* file exists, but stat call failed */
3101
3102 os_file_handle_error_no_exit(path, "stat", FALSE);
3103
3104 return(FALSE);
3105 }
3106
3107 if (S_ISDIR(statinfo.st_mode)) {
3108 *type = OS_FILE_TYPE_DIR;
3109 } else if (S_ISLNK(statinfo.st_mode)) {
3110 *type = OS_FILE_TYPE_LINK;
3111 } else if (S_ISREG(statinfo.st_mode)) {
3112 *type = OS_FILE_TYPE_FILE;
3113 } else {
3114 *type = OS_FILE_TYPE_UNKNOWN;
3115 }
3116
3117 *exists = TRUE;
3118
3119 return(TRUE);
3120 #endif
3121 }
3122
3123 /*******************************************************************//**
3124 This function returns information about the specified file
3125 @return DB_SUCCESS if all OK */
3126 UNIV_INTERN
3127 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm)3128 os_file_get_status(
3129 /*===============*/
3130 const char* path, /*!< in: pathname of the file */
3131 os_file_stat_t* stat_info, /*!< information of a file in a
3132 directory */
3133 bool check_rw_perm) /*!< in: for testing whether the
3134 file can be opened in RW mode */
3135 {
3136 int ret;
3137
3138 #ifdef __WIN__
3139 struct _stat64 statinfo;
3140
3141 ret = _stat64(path, &statinfo);
3142
3143 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3144 /* file does not exist */
3145
3146 return(DB_NOT_FOUND);
3147
3148 } else if (ret) {
3149 /* file exists, but stat call failed */
3150
3151 os_file_handle_error_no_exit(path, "stat", FALSE);
3152
3153 return(DB_FAIL);
3154
3155 } else if (_S_IFDIR & statinfo.st_mode) {
3156 stat_info->type = OS_FILE_TYPE_DIR;
3157 } else if (_S_IFREG & statinfo.st_mode) {
3158
3159 DWORD access = GENERIC_READ;
3160
3161 if (!srv_read_only_mode) {
3162 access |= GENERIC_WRITE;
3163 }
3164
3165 stat_info->type = OS_FILE_TYPE_FILE;
3166
3167 /* Check if we can open it in read-only mode. */
3168
3169 if (check_rw_perm) {
3170 HANDLE fh;
3171
3172 fh = CreateFile(
3173 (LPCTSTR) path, // File to open
3174 access,
3175 0, // No sharing
3176 NULL, // Default security
3177 OPEN_EXISTING, // Existing file only
3178 FILE_ATTRIBUTE_NORMAL, // Normal file
3179 NULL); // No attr. template
3180
3181 if (fh == INVALID_HANDLE_VALUE) {
3182 stat_info->rw_perm = false;
3183 } else {
3184 stat_info->rw_perm = true;
3185 CloseHandle(fh);
3186 }
3187 }
3188 } else {
3189 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3190 }
3191 #else
3192 struct stat statinfo;
3193
3194 ret = stat(path, &statinfo);
3195
3196 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3197 /* file does not exist */
3198
3199 return(DB_NOT_FOUND);
3200
3201 } else if (ret) {
3202 /* file exists, but stat call failed */
3203
3204 os_file_handle_error_no_exit(path, "stat", FALSE);
3205
3206 return(DB_FAIL);
3207
3208 }
3209
3210 switch (statinfo.st_mode & S_IFMT) {
3211 case S_IFDIR:
3212 stat_info->type = OS_FILE_TYPE_DIR;
3213 break;
3214 case S_IFLNK:
3215 stat_info->type = OS_FILE_TYPE_LINK;
3216 break;
3217 case S_IFBLK:
3218 /* Handle block device as regular file. */
3219 case S_IFCHR:
3220 /* Handle character device as regular file. */
3221 case S_IFREG:
3222 stat_info->type = OS_FILE_TYPE_FILE;
3223 break;
3224 default:
3225 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3226 }
3227
3228
3229 if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) {
3230
3231 int fh;
3232 int access;
3233
3234 access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
3235
3236 fh = ::open(path, access, os_innodb_umask);
3237
3238 if (fh == -1) {
3239 stat_info->rw_perm = false;
3240 } else {
3241 stat_info->rw_perm = true;
3242 close(fh);
3243 }
3244 }
3245
3246 #endif /* _WIN_ */
3247
3248 stat_info->ctime = statinfo.st_ctime;
3249 stat_info->atime = statinfo.st_atime;
3250 stat_info->mtime = statinfo.st_mtime;
3251 stat_info->size = statinfo.st_size;
3252
3253 return(DB_SUCCESS);
3254 }
3255
/* Character that separates the directory components of a path name:
a backslash when __WIN__ is defined, a forward slash otherwise */
#if defined(__WIN__)
# define OS_FILE_PATH_SEPARATOR	'\\'
#else /* __WIN__ */
# define OS_FILE_PATH_SEPARATOR	'/'
#endif /* __WIN__ */
3262
3263 /****************************************************************//**
3264 This function returns a new path name after replacing the basename
3265 in an old path with a new basename. The old_path is a full path
3266 name including the extension. The tablename is in the normal
3267 form "databasename/tablename". The new base name is found after
3268 the forward slash. Both input strings are null terminated.
3269
3270 This function allocates memory to be returned. It is the callers
3271 responsibility to free the return value after it is no longer needed.
3272
3273 @return own: new full pathname */
3274 UNIV_INTERN
3275 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)3276 os_file_make_new_pathname(
3277 /*======================*/
3278 const char* old_path, /*!< in: pathname */
3279 const char* tablename) /*!< in: contains new base name */
3280 {
3281 ulint dir_len;
3282 char* last_slash;
3283 char* base_name;
3284 char* new_path;
3285 ulint new_path_len;
3286
3287 /* Split the tablename into its database and table name components.
3288 They are separated by a '/'. */
3289 last_slash = strrchr((char*) tablename, '/');
3290 base_name = last_slash ? last_slash + 1 : (char*) tablename;
3291
3292 /* Find the offset of the last slash. We will strip off the
3293 old basename.ibd which starts after that slash. */
3294 last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR);
3295 dir_len = last_slash ? last_slash - old_path : strlen(old_path);
3296
3297 /* allocate a new path and move the old directory path to it. */
3298 new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
3299 new_path = static_cast<char*>(mem_alloc(new_path_len));
3300 memcpy(new_path, old_path, dir_len);
3301
3302 ut_snprintf(new_path + dir_len,
3303 new_path_len - dir_len,
3304 "%c%s.ibd",
3305 OS_FILE_PATH_SEPARATOR,
3306 base_name);
3307
3308 return(new_path);
3309 }
3310
3311 /****************************************************************//**
3312 This function returns a remote path name by combining a data directory
3313 path provided in a DATA DIRECTORY clause with the tablename which is
3314 in the form 'database/tablename'. It strips the file basename (which
3315 is the tablename) found after the last directory in the path provided.
3316 The full filepath created will include the database name as a directory
3317 under the path provided. The filename is the tablename with the '.ibd'
3318 extension. All input and output strings are null-terminated.
3319
3320 This function allocates memory to be returned. It is the callers
3321 responsibility to free the return value after it is no longer needed.
3322
3323 @return own: A full pathname; data_dir_path/databasename/tablename.ibd */
3324 UNIV_INTERN
3325 char*
os_file_make_remote_pathname(const char * data_dir_path,const char * tablename,const char * extention)3326 os_file_make_remote_pathname(
3327 /*=========================*/
3328 const char* data_dir_path, /*!< in: pathname */
3329 const char* tablename, /*!< in: tablename */
3330 const char* extention) /*!< in: file extention; ibd,cfg */
3331 {
3332 ulint data_dir_len;
3333 char* last_slash;
3334 char* new_path;
3335 ulint new_path_len;
3336
3337 ut_ad(extention && strlen(extention) == 3);
3338
3339 /* Find the offset of the last slash. We will strip off the
3340 old basename or tablename which starts after that slash. */
3341 last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3342 data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path);
3343
3344 /* allocate a new path and move the old directory path to it. */
3345 new_path_len = data_dir_len + strlen(tablename)
3346 + sizeof "/." + strlen(extention);
3347 new_path = static_cast<char*>(mem_alloc(new_path_len));
3348 memcpy(new_path, data_dir_path, data_dir_len);
3349 ut_snprintf(new_path + data_dir_len,
3350 new_path_len - data_dir_len,
3351 "%c%s.%s",
3352 OS_FILE_PATH_SEPARATOR,
3353 tablename,
3354 extention);
3355
3356 srv_normalize_path_for_win(new_path);
3357
3358 return(new_path);
3359 }
3360
3361 /****************************************************************//**
3362 This function reduces a null-terminated full remote path name into
3363 the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
3364 the 'databasename/tablename.ibd' found at the end of the path with just
3365 'tablename'.
3366
3367 Since the result is always smaller than the path sent in, no new memory
3368 is allocated. The caller should allocate memory for the path sent in.
3369 This function manipulates that path in place.
3370
3371 If the path format is not as expected, just return. The result is used
3372 to inform a SHOW CREATE TABLE command. */
3373 UNIV_INTERN
3374 void
os_file_make_data_dir_path(char * data_dir_path)3375 os_file_make_data_dir_path(
3376 /*========================*/
3377 char* data_dir_path) /*!< in/out: full path/data_dir_path */
3378 {
3379 char* ptr;
3380 char* tablename;
3381 ulint tablename_len;
3382
3383 /* Replace the period before the extension with a null byte. */
3384 ptr = strrchr((char*) data_dir_path, '.');
3385 if (!ptr) {
3386 return;
3387 }
3388 ptr[0] = '\0';
3389
3390 /* The tablename starts after the last slash. */
3391 ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3392 if (!ptr) {
3393 return;
3394 }
3395 ptr[0] = '\0';
3396 tablename = ptr + 1;
3397
3398 /* The databasename starts after the next to last slash. */
3399 ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3400 if (!ptr) {
3401 return;
3402 }
3403 tablename_len = ut_strlen(tablename);
3404
3405 ut_memmove(++ptr, tablename, tablename_len);
3406
3407 ptr[tablename_len] = '\0';
3408 }
3409
3410 /****************************************************************//**
3411 The function os_file_dirname returns a directory component of a
3412 null-terminated pathname string. In the usual case, dirname returns
3413 the string up to, but not including, the final '/', and basename
3414 is the component following the final '/'. Trailing '/' characters
3415 are not counted as part of the pathname.
3416
3417 If path does not contain a slash, dirname returns the string ".".
3418
3419 Concatenating the string returned by dirname, a "/", and the basename
3420 yields a complete pathname.
3421
3422 The return value is a copy of the directory component of the pathname.
3423 The copy is allocated from heap. It is the caller responsibility
3424 to free it after it is no longer needed.
3425
3426 The following list of examples (taken from SUSv2) shows the strings
3427 returned by dirname and basename for different paths:
3428
3429 path dirname basename
3430 "/usr/lib" "/usr" "lib"
3431 "/usr/" "/" "usr"
3432 "usr" "." "usr"
3433 "/" "/" "/"
3434 "." "." "."
3435 ".." "." ".."
3436
3437 @return own: directory component of the pathname */
3438 UNIV_INTERN
3439 char*
os_file_dirname(const char * path)3440 os_file_dirname(
3441 /*============*/
3442 const char* path) /*!< in: pathname */
3443 {
3444 /* Find the offset of the last slash */
3445 const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
3446 if (!last_slash) {
3447 /* No slash in the path, return "." */
3448
3449 return(mem_strdup("."));
3450 }
3451
3452 /* Ok, there is a slash */
3453
3454 if (last_slash == path) {
3455 /* last slash is the first char of the path */
3456
3457 return(mem_strdup("/"));
3458 }
3459
3460 /* Non-trivial directory component */
3461
3462 return(mem_strdupl(path, last_slash - path));
3463 }
3464
3465 /****************************************************************//**
3466 Creates all missing subdirectories along the given path.
3467 @return TRUE if call succeeded FALSE otherwise */
3468 UNIV_INTERN
3469 ibool
os_file_create_subdirs_if_needed(const char * path)3470 os_file_create_subdirs_if_needed(
3471 /*=============================*/
3472 const char* path) /*!< in: path name */
3473 {
3474 if (srv_read_only_mode) {
3475
3476 ib_logf(IB_LOG_LEVEL_ERROR,
3477 "read only mode set. Can't create subdirectories '%s'",
3478 path);
3479
3480 return(FALSE);
3481
3482 }
3483
3484 char* subdir = os_file_dirname(path);
3485
3486 if (strlen(subdir) == 1
3487 && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
3488 /* subdir is root or cwd, nothing to do */
3489 mem_free(subdir);
3490
3491 return(TRUE);
3492 }
3493
3494 /* Test if subdir exists */
3495 os_file_type_t type;
3496 ibool subdir_exists;
3497 ibool success = os_file_status(subdir, &subdir_exists, &type);
3498
3499 if (success && !subdir_exists) {
3500
3501 /* subdir does not exist, create it */
3502 success = os_file_create_subdirs_if_needed(subdir);
3503
3504 if (!success) {
3505 mem_free(subdir);
3506
3507 return(FALSE);
3508 }
3509
3510 success = os_file_create_directory(subdir, FALSE);
3511 }
3512
3513 mem_free(subdir);
3514
3515 return(success);
3516 }
3517
3518 #ifndef UNIV_HOTBACKUP
3519 /****************************************************************//**
3520 Returns a pointer to the nth slot in the aio array.
3521 @return pointer to slot */
3522 static
3523 os_aio_slot_t*
os_aio_array_get_nth_slot(os_aio_array_t * array,ulint index)3524 os_aio_array_get_nth_slot(
3525 /*======================*/
3526 os_aio_array_t* array, /*!< in: aio array */
3527 ulint index) /*!< in: index of the slot */
3528 {
3529 ut_a(index < array->n_slots);
3530
3531 return(&array->slots[index]);
3532 }
3533
3534 #if defined(LINUX_NATIVE_AIO)
3535 /******************************************************************//**
3536 Creates an io_context for native linux AIO.
3537 @return TRUE on success. */
3538 static
3539 ibool
os_aio_linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)3540 os_aio_linux_create_io_ctx(
3541 /*=======================*/
3542 ulint max_events, /*!< in: number of events. */
3543 io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
3544 {
3545 int ret;
3546 ulint retries = 0;
3547
3548 retry:
3549 memset(io_ctx, 0x0, sizeof(*io_ctx));
3550
3551 /* Initialize the io_ctx. Tell it how many pending
3552 IO requests this context will handle. */
3553
3554 ret = io_setup(max_events, io_ctx);
3555 if (ret == 0) {
3556 #if defined(UNIV_AIO_DEBUG)
3557 fprintf(stderr,
3558 "InnoDB: Linux native AIO:"
3559 " initialized io_ctx for segment\n");
3560 #endif
3561 /* Success. Return now. */
3562 return(TRUE);
3563 }
3564
3565 /* If we hit EAGAIN we'll make a few attempts before failing. */
3566
3567 switch (ret) {
3568 case -EAGAIN:
3569 if (retries == 0) {
3570 /* First time around. */
3571 ut_print_timestamp(stderr);
3572 fprintf(stderr,
3573 " InnoDB: Warning: io_setup() failed"
3574 " with EAGAIN. Will make %d attempts"
3575 " before giving up.\n",
3576 OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3577 }
3578
3579 if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3580 ++retries;
3581 fprintf(stderr,
3582 "InnoDB: Warning: io_setup() attempt"
3583 " %lu failed.\n",
3584 retries);
3585 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3586 goto retry;
3587 }
3588
3589 /* Have tried enough. Better call it a day. */
3590 ut_print_timestamp(stderr);
3591 fprintf(stderr,
3592 " InnoDB: Error: io_setup() failed"
3593 " with EAGAIN after %d attempts.\n",
3594 OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3595 break;
3596
3597 case -ENOSYS:
3598 ut_print_timestamp(stderr);
3599 fprintf(stderr,
3600 " InnoDB: Error: Linux Native AIO interface"
3601 " is not supported on this platform. Please"
3602 " check your OS documentation and install"
3603 " appropriate binary of InnoDB.\n");
3604
3605 break;
3606
3607 default:
3608 ut_print_timestamp(stderr);
3609 fprintf(stderr,
3610 " InnoDB: Error: Linux Native AIO setup"
3611 " returned following error[%d]\n", -ret);
3612 break;
3613 }
3614
3615 fprintf(stderr,
3616 "InnoDB: You can disable Linux Native AIO by"
3617 " setting innodb_use_native_aio = 0 in my.cnf\n");
3618 return(FALSE);
3619 }
3620
3621 /******************************************************************//**
3622 Checks if the system supports native linux aio. On some kernel
3623 versions where native aio is supported it won't work on tmpfs. In such
3624 cases we can't use native aio as it is not possible to mix simulated
3625 and native aio.
3626 @return: TRUE if supported, FALSE otherwise. */
3627 static
3628 ibool
os_aio_native_aio_supported(void)3629 os_aio_native_aio_supported(void)
3630 /*=============================*/
3631 {
3632 int fd;
3633 io_context_t io_ctx;
3634 char name[1000];
3635
3636 if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
3637 /* The platform does not support native aio. */
3638 return(FALSE);
3639 } else if (!srv_read_only_mode) {
3640 /* Now check if tmpdir supports native aio ops. */
3641 fd = innobase_mysql_tmpfile(NULL);
3642
3643 if (fd < 0) {
3644 ib_logf(IB_LOG_LEVEL_WARN,
3645 "Unable to create temp file to check "
3646 "native AIO support.");
3647
3648 return(FALSE);
3649 }
3650 } else {
3651
3652 srv_normalize_path_for_win(srv_log_group_home_dir);
3653
3654 ulint dirnamelen = strlen(srv_log_group_home_dir);
3655 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
3656 memcpy(name, srv_log_group_home_dir, dirnamelen);
3657
3658 /* Add a path separator if needed. */
3659 if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
3660 name[dirnamelen++] = SRV_PATH_SEPARATOR;
3661 }
3662
3663 strcpy(name + dirnamelen, "ib_logfile0");
3664
3665 fd = ::open(name, O_RDONLY);
3666
3667 if (fd == -1) {
3668
3669 ib_logf(IB_LOG_LEVEL_WARN,
3670 "Unable to open \"%s\" to check "
3671 "native AIO read support.", name);
3672
3673 return(FALSE);
3674 }
3675 }
3676
3677 struct io_event io_event;
3678
3679 memset(&io_event, 0x0, sizeof(io_event));
3680
3681 byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
3682 byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
3683
3684 struct iocb iocb;
3685
3686 /* Suppress valgrind warning. */
3687 memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
3688 memset(&iocb, 0x0, sizeof(iocb));
3689
3690 struct iocb* p_iocb = &iocb;
3691
3692 if (!srv_read_only_mode) {
3693 io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
3694 } else {
3695 ut_a(UNIV_PAGE_SIZE >= 512);
3696 io_prep_pread(p_iocb, fd, ptr, 512, 0);
3697 }
3698
3699 int err = io_submit(io_ctx, 1, &p_iocb);
3700
3701 if (err >= 1) {
3702 /* Now collect the submitted IO request. */
3703 err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
3704 }
3705
3706 ut_free(buf);
3707 close(fd);
3708
3709 switch (err) {
3710 case 1:
3711 return(TRUE);
3712
3713 case -EINVAL:
3714 case -ENOSYS:
3715 ib_logf(IB_LOG_LEVEL_ERROR,
3716 "Linux Native AIO not supported. You can either "
3717 "move %s to a file system that supports native "
3718 "AIO or you can set innodb_use_native_aio to "
3719 "FALSE to avoid this message.",
3720 srv_read_only_mode ? name : "tmpdir");
3721
3722 /* fall through. */
3723 default:
3724 ib_logf(IB_LOG_LEVEL_ERROR,
3725 "Linux Native AIO check on %s returned error[%d]",
3726 srv_read_only_mode ? name : "tmpdir", -err);
3727 }
3728
3729 return(FALSE);
3730 }
3731 #endif /* LINUX_NATIVE_AIO */
3732
3733 /******************************************************************//**
3734 Creates an aio wait array. Note that we return NULL in case of failure.
3735 We don't care about freeing memory here because we assume that a
3736 failure will result in server refusing to start up.
3737 @return own: aio array, NULL on failure */
3738 static
3739 os_aio_array_t*
os_aio_array_create(ulint n,ulint n_segments)3740 os_aio_array_create(
3741 /*================*/
3742 ulint n, /*!< in: maximum number of pending aio
3743 operations allowed; n must be
3744 divisible by n_segments */
3745 ulint n_segments) /*!< in: number of segments in the aio array */
3746 {
3747 os_aio_array_t* array;
3748 #ifdef WIN_ASYNC_IO
3749 OVERLAPPED* over;
3750 #elif defined(LINUX_NATIVE_AIO)
3751 struct io_event* io_event = NULL;
3752 #endif /* WIN_ASYNC_IO */
3753 ut_a(n > 0);
3754 ut_a(n_segments > 0);
3755
3756 array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
3757 memset(array, 0x0, sizeof(*array));
3758
3759 array->mutex = os_mutex_create();
3760 array->not_full = os_event_create();
3761 array->is_empty = os_event_create();
3762
3763 os_event_set(array->is_empty);
3764
3765 array->n_slots = n;
3766 array->n_segments = n_segments;
3767
3768 array->slots = static_cast<os_aio_slot_t*>(
3769 ut_malloc(n * sizeof(*array->slots)));
3770
3771 memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots)));
3772 #ifdef __WIN__
3773 array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
3774 #endif /* __WIN__ */
3775
3776 #if defined(LINUX_NATIVE_AIO)
3777 array->aio_ctx = NULL;
3778 array->aio_events = NULL;
3779
3780 /* If we are not using native aio interface then skip this
3781 part of initialization. */
3782 if (!srv_use_native_aio) {
3783 goto skip_native_aio;
3784 }
3785
3786 /* Initialize the io_context array. One io_context
3787 per segment in the array. */
3788
3789 array->aio_ctx = static_cast<io_context**>(
3790 ut_malloc(n_segments * sizeof(*array->aio_ctx)));
3791
3792 for (ulint i = 0; i < n_segments; ++i) {
3793 if (!os_aio_linux_create_io_ctx(n/n_segments,
3794 &array->aio_ctx[i])) {
3795 /* If something bad happened during aio setup
3796 we should call it a day and return right away.
3797 We don't care about any leaks because a failure
3798 to initialize the io subsystem means that the
3799 server (or atleast the innodb storage engine)
3800 is not going to startup. */
3801 return(NULL);
3802 }
3803 }
3804
3805 /* Initialize the event array. One event per slot. */
3806 io_event = static_cast<struct io_event*>(
3807 ut_malloc(n * sizeof(*io_event)));
3808
3809 memset(io_event, 0x0, sizeof(*io_event) * n);
3810 array->aio_events = io_event;
3811
3812 skip_native_aio:
3813 #endif /* LINUX_NATIVE_AIO */
3814 for (ulint i = 0; i < n; i++) {
3815 os_aio_slot_t* slot;
3816
3817 slot = os_aio_array_get_nth_slot(array, i);
3818
3819 slot->pos = i;
3820 slot->reserved = FALSE;
3821 #ifdef WIN_ASYNC_IO
3822 slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
3823
3824 over = &slot->control;
3825
3826 over->hEvent = slot->handle;
3827
3828 array->handles[i] = over->hEvent;
3829
3830 #elif defined(LINUX_NATIVE_AIO)
3831 memset(&slot->control, 0x0, sizeof(slot->control));
3832 slot->n_bytes = 0;
3833 slot->ret = 0;
3834 #endif /* WIN_ASYNC_IO */
3835 }
3836
3837 return(array);
3838 }
3839
3840 /************************************************************************//**
3841 Frees an aio wait array. */
3842 static
3843 void
os_aio_array_free(os_aio_array_t * & array)3844 os_aio_array_free(
3845 /*==============*/
3846 os_aio_array_t*& array) /*!< in, own: array to free */
3847 {
3848 #ifdef WIN_ASYNC_IO
3849 ulint i;
3850
3851 for (i = 0; i < array->n_slots; i++) {
3852 os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3853 CloseHandle(slot->handle);
3854 }
3855 #endif /* WIN_ASYNC_IO */
3856
3857 #ifdef __WIN__
3858 ut_free(array->handles);
3859 #endif /* __WIN__ */
3860 os_mutex_free(array->mutex);
3861 os_event_free(array->not_full);
3862 os_event_free(array->is_empty);
3863
3864 #if defined(LINUX_NATIVE_AIO)
3865 if (srv_use_native_aio) {
3866 ut_free(array->aio_events);
3867 ut_free(array->aio_ctx);
3868 }
3869 #endif /* LINUX_NATIVE_AIO */
3870
3871 ut_free(array->slots);
3872 ut_free(array);
3873
3874 array = 0;
3875 }
3876
3877 /***********************************************************************
3878 Initializes the asynchronous io system. Creates one array each for ibuf
3879 and log i/o. Also creates one array each for read and write where each
3880 array is divided logically into n_read_segs and n_write_segs
3881 respectively. The caller must create an i/o handler thread for each
3882 segment in these arrays. This function also creates the sync array.
3883 No i/o handler thread needs to be created for that */
3884 UNIV_INTERN
3885 ibool
os_aio_init(ulint n_per_seg,ulint n_read_segs,ulint n_write_segs,ulint n_slots_sync)3886 os_aio_init(
3887 /*========*/
3888 ulint n_per_seg, /*<! in: maximum number of pending aio
3889 operations allowed per segment */
3890 ulint n_read_segs, /*<! in: number of reader threads */
3891 ulint n_write_segs, /*<! in: number of writer threads */
3892 ulint n_slots_sync) /*<! in: number of slots in the sync aio
3893 array */
3894 {
3895 os_io_init_simple();
3896
3897 #if defined(LINUX_NATIVE_AIO)
3898 /* Check if native aio is supported on this system and tmpfs */
3899 if (srv_use_native_aio && !os_aio_native_aio_supported()) {
3900
3901 ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
3902
3903 srv_use_native_aio = FALSE;
3904 }
3905 #endif /* LINUX_NATIVE_AIO */
3906
3907 srv_reset_io_thread_op_info();
3908
3909 os_aio_read_array = os_aio_array_create(
3910 n_read_segs * n_per_seg, n_read_segs);
3911
3912 if (os_aio_read_array == NULL) {
3913 return(FALSE);
3914 }
3915
3916 ulint start = (srv_read_only_mode) ? 0 : 2;
3917 ulint n_segs = n_read_segs + start;
3918
3919 /* 0 is the ibuf segment and 1 is the insert buffer segment. */
3920 for (ulint i = start; i < n_segs; ++i) {
3921 ut_a(i < SRV_MAX_N_IO_THREADS);
3922 srv_io_thread_function[i] = "read thread";
3923 }
3924
3925 ulint n_segments = n_read_segs;
3926
3927 if (!srv_read_only_mode) {
3928
3929 os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3930
3931 if (os_aio_log_array == NULL) {
3932 return(FALSE);
3933 }
3934
3935 ++n_segments;
3936
3937 srv_io_thread_function[1] = "log thread";
3938
3939 os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3940
3941 if (os_aio_ibuf_array == NULL) {
3942 return(FALSE);
3943 }
3944
3945 ++n_segments;
3946
3947 srv_io_thread_function[0] = "insert buffer thread";
3948
3949 os_aio_write_array = os_aio_array_create(
3950 n_write_segs * n_per_seg, n_write_segs);
3951
3952 if (os_aio_write_array == NULL) {
3953 return(FALSE);
3954 }
3955
3956 n_segments += n_write_segs;
3957
3958 for (ulint i = start + n_read_segs; i < n_segments; ++i) {
3959 ut_a(i < SRV_MAX_N_IO_THREADS);
3960 srv_io_thread_function[i] = "write thread";
3961 }
3962
3963 ut_ad(n_segments >= 4);
3964 } else {
3965 ut_ad(n_segments > 0);
3966 }
3967
3968 os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3969
3970 if (os_aio_sync_array == NULL) {
3971 return(FALSE);
3972 }
3973
3974 os_aio_n_segments = n_segments;
3975
3976 os_aio_validate();
3977
3978 os_aio_segment_wait_events = static_cast<os_event_t*>(
3979 ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
3980
3981 for (ulint i = 0; i < n_segments; ++i) {
3982 os_aio_segment_wait_events[i] = os_event_create();
3983 }
3984
3985 os_last_printout = ut_time();
3986
3987 return(TRUE);
3988
3989 }
3990
3991 /***********************************************************************
3992 Frees the asynchronous io system. */
3993 UNIV_INTERN
3994 void
os_aio_free(void)3995 os_aio_free(void)
3996 /*=============*/
3997 {
3998 if (os_aio_ibuf_array != 0) {
3999 os_aio_array_free(os_aio_ibuf_array);
4000 }
4001
4002 if (os_aio_log_array != 0) {
4003 os_aio_array_free(os_aio_log_array);
4004 }
4005
4006 if (os_aio_write_array != 0) {
4007 os_aio_array_free(os_aio_write_array);
4008 }
4009
4010 if (os_aio_sync_array != 0) {
4011 os_aio_array_free(os_aio_sync_array);
4012 }
4013
4014 os_aio_array_free(os_aio_read_array);
4015
4016 for (ulint i = 0; i < os_aio_n_segments; i++) {
4017 os_event_free(os_aio_segment_wait_events[i]);
4018 }
4019
4020 ut_free(os_aio_segment_wait_events);
4021 os_aio_segment_wait_events = 0;
4022 os_aio_n_segments = 0;
4023 }
4024
4025 #ifdef WIN_ASYNC_IO
4026 /************************************************************************//**
4027 Wakes up all async i/o threads in the array in Windows async i/o at
4028 shutdown. */
4029 static
4030 void
os_aio_array_wake_win_aio_at_shutdown(os_aio_array_t * array)4031 os_aio_array_wake_win_aio_at_shutdown(
4032 /*==================================*/
4033 os_aio_array_t* array) /*!< in: aio array */
4034 {
4035 ulint i;
4036
4037 for (i = 0; i < array->n_slots; i++) {
4038
4039 SetEvent((array->slots + i)->handle);
4040 }
4041 }
4042 #endif
4043
4044 /************************************************************************//**
4045 Wakes up all async i/o threads so that they know to exit themselves in
4046 shutdown. */
4047 UNIV_INTERN
4048 void
os_aio_wake_all_threads_at_shutdown(void)4049 os_aio_wake_all_threads_at_shutdown(void)
4050 /*=====================================*/
4051 {
4052 #ifdef WIN_ASYNC_IO
4053 /* This code wakes up all ai/o threads in Windows native aio */
4054 os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
4055 if (os_aio_write_array != 0) {
4056 os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
4057 }
4058
4059 if (os_aio_ibuf_array != 0) {
4060 os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
4061 }
4062
4063 if (os_aio_log_array != 0) {
4064 os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
4065 }
4066
4067 #elif defined(LINUX_NATIVE_AIO)
4068
4069 /* When using native AIO interface the io helper threads
4070 wait on io_getevents with a timeout value of 500ms. At
4071 each wake up these threads check the server status.
4072 No need to do anything to wake them up. */
4073
4074 if (srv_use_native_aio) {
4075 return;
4076 }
4077
4078 /* Fall through to simulated AIO handler wakeup if we are
4079 not using native AIO. */
4080 #endif /* !WIN_ASYNC_AIO */
4081
4082 /* This loop wakes up all simulated ai/o threads */
4083
4084 for (ulint i = 0; i < os_aio_n_segments; i++) {
4085
4086 os_event_set(os_aio_segment_wait_events[i]);
4087 }
4088 }
4089
4090 /************************************************************************//**
4091 Waits until there are no pending writes in os_aio_write_array. There can
4092 be other, synchronous, pending writes. */
4093 UNIV_INTERN
4094 void
os_aio_wait_until_no_pending_writes(void)4095 os_aio_wait_until_no_pending_writes(void)
4096 /*=====================================*/
4097 {
4098 ut_ad(!srv_read_only_mode);
4099 os_event_wait(os_aio_write_array->is_empty);
4100 }
4101
4102 /**********************************************************************//**
4103 Calculates segment number for a slot.
4104 @return segment number (which is the number used by, for example,
4105 i/o-handler threads) */
4106 static
4107 ulint
os_aio_get_segment_no_from_slot(os_aio_array_t * array,os_aio_slot_t * slot)4108 os_aio_get_segment_no_from_slot(
4109 /*============================*/
4110 os_aio_array_t* array, /*!< in: aio wait array */
4111 os_aio_slot_t* slot) /*!< in: slot in this array */
4112 {
4113 ulint segment;
4114 ulint seg_len;
4115
4116 if (array == os_aio_ibuf_array) {
4117 ut_ad(!srv_read_only_mode);
4118
4119 segment = IO_IBUF_SEGMENT;
4120
4121 } else if (array == os_aio_log_array) {
4122 ut_ad(!srv_read_only_mode);
4123
4124 segment = IO_LOG_SEGMENT;
4125
4126 } else if (array == os_aio_read_array) {
4127 seg_len = os_aio_read_array->n_slots
4128 / os_aio_read_array->n_segments;
4129
4130 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
4131 } else {
4132 ut_ad(!srv_read_only_mode);
4133 ut_a(array == os_aio_write_array);
4134
4135 seg_len = os_aio_write_array->n_slots
4136 / os_aio_write_array->n_segments;
4137
4138 segment = os_aio_read_array->n_segments + 2
4139 + slot->pos / seg_len;
4140 }
4141
4142 return(segment);
4143 }
4144
4145 /**********************************************************************//**
4146 Calculates local segment number and aio array from global segment number.
4147 @return local segment number within the aio array */
4148 static
4149 ulint
os_aio_get_array_and_local_segment(os_aio_array_t ** array,ulint global_segment)4150 os_aio_get_array_and_local_segment(
4151 /*===============================*/
4152 os_aio_array_t** array, /*!< out: aio wait array */
4153 ulint global_segment)/*!< in: global segment number */
4154 {
4155 ulint segment;
4156
4157 ut_a(global_segment < os_aio_n_segments);
4158
4159 if (srv_read_only_mode) {
4160 *array = os_aio_read_array;
4161
4162 return(global_segment);
4163 } else if (global_segment == IO_IBUF_SEGMENT) {
4164 *array = os_aio_ibuf_array;
4165 segment = 0;
4166
4167 } else if (global_segment == IO_LOG_SEGMENT) {
4168 *array = os_aio_log_array;
4169 segment = 0;
4170
4171 } else if (global_segment < os_aio_read_array->n_segments + 2) {
4172 *array = os_aio_read_array;
4173
4174 segment = global_segment - 2;
4175 } else {
4176 *array = os_aio_write_array;
4177
4178 segment = global_segment - (os_aio_read_array->n_segments + 2);
4179 }
4180
4181 return(segment);
4182 }
4183
4184 /*******************************************************************//**
4185 Requests for a slot in the aio array. If no slot is available, waits until
4186 not_full-event becomes signaled.
4187 @return pointer to slot */
4188 static
4189 os_aio_slot_t*
os_aio_array_reserve_slot(ulint type,os_aio_array_t * array,fil_node_t * message1,void * message2,pfs_os_file_t file,const char * name,void * buf,os_offset_t offset,ulint len)4190 os_aio_array_reserve_slot(
4191 /*======================*/
4192 ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
4193 os_aio_array_t* array, /*!< in: aio array */
4194 fil_node_t* message1,/*!< in: message to be passed along with
4195 the aio operation */
4196 void* message2,/*!< in: message to be passed along with
4197 the aio operation */
4198 pfs_os_file_t file, /*!< in: file handle */
4199 const char* name, /*!< in: name of the file or path as a
4200 null-terminated string */
4201 void* buf, /*!< in: buffer where to read or from which
4202 to write */
4203 os_offset_t offset, /*!< in: file offset */
4204 ulint len) /*!< in: length of the block to read or write */
4205 {
4206 os_aio_slot_t* slot = NULL;
4207 #ifdef WIN_ASYNC_IO
4208 OVERLAPPED* control;
4209
4210 #elif defined(LINUX_NATIVE_AIO)
4211
4212 struct iocb* iocb;
4213 off_t aio_offset;
4214
4215 #endif /* WIN_ASYNC_IO */
4216 ulint i;
4217 ulint counter;
4218 ulint slots_per_seg;
4219 ulint local_seg;
4220
4221 #ifdef WIN_ASYNC_IO
4222 ut_a((len & 0xFFFFFFFFUL) == len);
4223 #endif /* WIN_ASYNC_IO */
4224
4225 /* No need of a mutex. Only reading constant fields */
4226 slots_per_seg = array->n_slots / array->n_segments;
4227
4228 /* We attempt to keep adjacent blocks in the same local
4229 segment. This can help in merging IO requests when we are
4230 doing simulated AIO */
4231 local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
4232 % array->n_segments;
4233
4234 loop:
4235 os_mutex_enter(array->mutex);
4236
4237 if (array->n_reserved == array->n_slots) {
4238 os_mutex_exit(array->mutex);
4239
4240 if (!srv_use_native_aio) {
4241 /* If the handler threads are suspended, wake them
4242 so that we get more slots */
4243
4244 os_aio_simulated_wake_handler_threads();
4245 }
4246
4247 os_event_wait(array->not_full);
4248
4249 goto loop;
4250 }
4251
4252 /* We start our search for an available slot from our preferred
4253 local segment and do a full scan of the array. We are
4254 guaranteed to find a slot in full scan. */
4255 for (i = local_seg * slots_per_seg, counter = 0;
4256 counter < array->n_slots;
4257 i++, counter++) {
4258
4259 i %= array->n_slots;
4260
4261 slot = os_aio_array_get_nth_slot(array, i);
4262
4263 if (slot->reserved == FALSE) {
4264 goto found;
4265 }
4266 }
4267
4268 /* We MUST always be able to get hold of a reserved slot. */
4269 ut_error;
4270
4271 found:
4272 ut_a(slot->reserved == FALSE);
4273 array->n_reserved++;
4274
4275 if (array->n_reserved == 1) {
4276 os_event_reset(array->is_empty);
4277 }
4278
4279 if (array->n_reserved == array->n_slots) {
4280 os_event_reset(array->not_full);
4281 }
4282
4283 slot->reserved = TRUE;
4284 slot->reservation_time = ut_time();
4285 slot->message1 = message1;
4286 slot->message2 = message2;
4287 slot->file = file;
4288 slot->name = name;
4289 slot->len = len;
4290 slot->type = type;
4291 slot->buf = static_cast<byte*>(buf);
4292 slot->offset = offset;
4293 slot->io_already_done = FALSE;
4294
4295 #ifdef WIN_ASYNC_IO
4296 control = &slot->control;
4297 control->Offset = (DWORD) offset & 0xFFFFFFFF;
4298 control->OffsetHigh = (DWORD) (offset >> 32);
4299 ResetEvent(slot->handle);
4300
4301 #elif defined(LINUX_NATIVE_AIO)
4302
4303 /* If we are not using native AIO skip this part. */
4304 if (!srv_use_native_aio) {
4305 goto skip_native_aio;
4306 }
4307
4308 /* Check if we are dealing with 64 bit arch.
4309 If not then make sure that offset fits in 32 bits. */
4310 aio_offset = (off_t) offset;
4311
4312 ut_a(sizeof(aio_offset) >= sizeof(offset)
4313 || ((os_offset_t) aio_offset) == offset);
4314
4315 iocb = &slot->control;
4316
4317 if (type == OS_FILE_READ) {
4318 io_prep_pread(iocb, file.m_file, buf, len, aio_offset);
4319 } else {
4320 ut_a(type == OS_FILE_WRITE);
4321 io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset);
4322 }
4323
4324 iocb->data = (void*) slot;
4325 slot->n_bytes = 0;
4326 slot->ret = 0;
4327
4328 skip_native_aio:
4329 #endif /* LINUX_NATIVE_AIO */
4330 os_mutex_exit(array->mutex);
4331
4332 return(slot);
4333 }
4334
4335 /*******************************************************************//**
4336 Frees a slot in the aio array. */
4337 static
4338 void
os_aio_array_free_slot(os_aio_array_t * array,os_aio_slot_t * slot)4339 os_aio_array_free_slot(
4340 /*===================*/
4341 os_aio_array_t* array, /*!< in: aio array */
4342 os_aio_slot_t* slot) /*!< in: pointer to slot */
4343 {
4344 os_mutex_enter(array->mutex);
4345
4346 ut_ad(slot->reserved);
4347
4348 slot->reserved = FALSE;
4349
4350 array->n_reserved--;
4351
4352 if (array->n_reserved == array->n_slots - 1) {
4353 os_event_set(array->not_full);
4354 }
4355
4356 if (array->n_reserved == 0) {
4357 os_event_set(array->is_empty);
4358 }
4359
4360 #ifdef WIN_ASYNC_IO
4361
4362 ResetEvent(slot->handle);
4363
4364 #elif defined(LINUX_NATIVE_AIO)
4365
4366 if (srv_use_native_aio) {
4367 memset(&slot->control, 0x0, sizeof(slot->control));
4368 slot->n_bytes = 0;
4369 slot->ret = 0;
4370 /*fprintf(stderr, "Freed up Linux native slot.\n");*/
4371 } else {
4372 /* These fields should not be used if we are not
4373 using native AIO. */
4374 ut_ad(slot->n_bytes == 0);
4375 ut_ad(slot->ret == 0);
4376 }
4377
4378 #endif
4379 os_mutex_exit(array->mutex);
4380 }
4381
4382 /**********************************************************************//**
4383 Wakes up a simulated aio i/o-handler thread if it has something to do. */
4384 static
4385 void
os_aio_simulated_wake_handler_thread(ulint global_segment)4386 os_aio_simulated_wake_handler_thread(
4387 /*=================================*/
4388 ulint global_segment) /*!< in: the number of the segment in the aio
4389 arrays */
4390 {
4391 os_aio_array_t* array;
4392 ulint segment;
4393
4394 ut_ad(!srv_use_native_aio);
4395
4396 segment = os_aio_get_array_and_local_segment(&array, global_segment);
4397
4398 ulint n = array->n_slots / array->n_segments;
4399
4400 segment *= n;
4401
4402 /* Look through n slots after the segment * n'th slot */
4403
4404 os_mutex_enter(array->mutex);
4405
4406 for (ulint i = 0; i < n; ++i) {
4407 const os_aio_slot_t* slot;
4408
4409 slot = os_aio_array_get_nth_slot(array, segment + i);
4410
4411 if (slot->reserved) {
4412
4413 /* Found an i/o request */
4414
4415 os_mutex_exit(array->mutex);
4416
4417 os_event_t event;
4418
4419 event = os_aio_segment_wait_events[global_segment];
4420
4421 os_event_set(event);
4422
4423 return;
4424 }
4425 }
4426
4427 os_mutex_exit(array->mutex);
4428 }
4429
4430 /**********************************************************************//**
4431 Wakes up simulated aio i/o-handler threads if they have something to do. */
4432 UNIV_INTERN
4433 void
os_aio_simulated_wake_handler_threads(void)4434 os_aio_simulated_wake_handler_threads(void)
4435 /*=======================================*/
4436 {
4437 if (srv_use_native_aio) {
4438 /* We do not use simulated aio: do nothing */
4439
4440 return;
4441 }
4442
4443 os_aio_recommend_sleep_for_read_threads = FALSE;
4444
4445 for (ulint i = 0; i < os_aio_n_segments; i++) {
4446 os_aio_simulated_wake_handler_thread(i);
4447 }
4448 }
4449
4450 /**********************************************************************//**
4451 This function can be called if one wants to post a batch of reads and
4452 prefers an i/o-handler thread to handle them all at once later. You must
4453 call os_aio_simulated_wake_handler_threads later to ensure the threads
4454 are not left sleeping! */
4455 UNIV_INTERN
4456 void
os_aio_simulated_put_read_threads_to_sleep(void)4457 os_aio_simulated_put_read_threads_to_sleep(void)
4458 /*============================================*/
4459 {
4460
4461 /* The idea of putting background IO threads to sleep is only for
4462 Windows when using simulated AIO. Windows XP seems to schedule
4463 background threads too eagerly to allow for coalescing during
4464 readahead requests. */
4465 #ifdef __WIN__
4466 os_aio_array_t* array;
4467
4468 if (srv_use_native_aio) {
4469 /* We do not use simulated aio: do nothing */
4470
4471 return;
4472 }
4473
4474 os_aio_recommend_sleep_for_read_threads = TRUE;
4475
4476 for (ulint i = 0; i < os_aio_n_segments; i++) {
4477 os_aio_get_array_and_local_segment(&array, i);
4478
4479 if (array == os_aio_read_array) {
4480
4481 os_event_reset(os_aio_segment_wait_events[i]);
4482 }
4483 }
4484 #endif /* __WIN__ */
4485 }
4486
4487 #if defined(LINUX_NATIVE_AIO)
4488 /*******************************************************************//**
4489 Dispatch an AIO request to the kernel.
4490 @return TRUE on success. */
4491 static
4492 ibool
os_aio_linux_dispatch(os_aio_array_t * array,os_aio_slot_t * slot)4493 os_aio_linux_dispatch(
4494 /*==================*/
4495 os_aio_array_t* array, /*!< in: io request array. */
4496 os_aio_slot_t* slot) /*!< in: an already reserved slot. */
4497 {
4498 int ret;
4499 ulint io_ctx_index;
4500 struct iocb* iocb;
4501
4502 ut_ad(slot != NULL);
4503 ut_ad(array);
4504
4505 ut_a(slot->reserved);
4506
4507 /* Find out what we are going to work with.
4508 The iocb struct is directly in the slot.
4509 The io_context is one per segment. */
4510
4511 iocb = &slot->control;
4512 io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
4513
4514 ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
4515
4516 #if defined(UNIV_AIO_DEBUG)
4517 fprintf(stderr,
4518 "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
4519 (slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
4520 array->aio_ctx[io_ctx_index], (ulong) io_ctx_index);
4521 #endif
4522
4523 /* io_submit returns number of successfully
4524 queued requests or -errno. */
4525 if (UNIV_UNLIKELY(ret != 1)) {
4526 errno = -ret;
4527 return(FALSE);
4528 }
4529
4530 return(TRUE);
4531 }
4532 #endif /* LINUX_NATIVE_AIO */
4533
4534
/*******************************************************************//**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.

Outline of what the function does:
- OS_AIO_SYNC requests (except when Windows native aio is in use) are
  executed right away with os_file_read_func() / os_file_write_func() and
  the result of that call is returned;
- otherwise a slot is reserved in the aio array selected by mode and type,
  and the request is either submitted to the OS (native aio) or left in the
  slot for a simulated aio handler thread, which is woken up unless
  OS_AIO_SIMULATED_WAKE_LATER was given;
- if native submission fails, the slot is freed and os_file_handle_error()
  decides whether the whole request is retried.
@return TRUE if request was queued successfully, FALSE if fail */
UNIV_INTERN
ibool
os_aio_func(
/*========*/
	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
				to OS_AIO_SIMULATED_WAKE_LATER: the
				last flag advises this function not to wake
				i/o-handler threads, but the caller will
				do the waking explicitly later, in this
				way the caller can post several requests in
				a batch; NOTE that the batch must not be
				so big that it exhausts the slots in aio
				arrays! NOTE that a simulated batch
				may introduce hidden chances of deadlocks,
				because i/os are not actually handled until
				all have been posted: use with great
				caution! */
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	pfs_os_file_t	file,	/*!< in: handle to a file */
	void*		buf,	/*!< in: buffer where to read or from which
				to write */
	os_offset_t	offset,	/*!< in: file offset where to read or write */
	ulint		n,	/*!< in: number of bytes to read or write */
	fil_node_t*	message1,/*!< in: message for the aio handler
				(can be used to identify a completed
				aio operation); ignored if mode is
				OS_AIO_SYNC */
	void*		message2)/*!< in: message for the aio handler
				(can be used to identify a completed
				aio operation); ignored if mode is
				OS_AIO_SYNC */
{
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
#ifdef WIN_ASYNC_IO
	ibool		retval;
	BOOL		ret = TRUE;
	DWORD		len = (DWORD) n;
	struct fil_node_t* dummy_mess1;
	void*		dummy_mess2;
	ulint		dummy_type;
#endif /* WIN_ASYNC_IO */
	ulint		wake_later;
	ut_ad(file.m_file);
	ut_ad(buf);
	ut_ad(n > 0);
	ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
	ut_ad(os_aio_validate_skip());
#ifdef WIN_ASYNC_IO
	/* The length is passed to ReadFile()/WriteFile() as a DWORD, so it
	must fit in 32 bits. */
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif

	/* Remember the wake-later request and strip the flag, so that mode
	holds a plain OS_AIO_* value from here on. */
	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
	mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);

	if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
	    && !srv_use_native_aio
#endif /* WIN_ASYNC_IO */
	    ) {
		/* This is actually an ordinary synchronous read or write:
		no need to use an i/o-handler thread. NOTE that if we use
		Windows async i/o, Windows does not allow us to use
		ordinary synchronous os_file_read etc. on the same file,
		therefore we have built a special mechanism for synchronous
		wait in the Windows case.
		Also note that the Performance Schema instrumentation has
		been performed by current os_aio_func()'s wrapper function
		pfs_os_aio_func(). So we would no longer need to call
		Performance Schema instrumented os_file_read() and
		os_file_write(). Instead, we should use os_file_read_func()
		and os_file_write_func() */

		if (type == OS_FILE_READ) {
			return(os_file_read_func(file.m_file, buf, offset, n));
		}
		ut_ad(!srv_read_only_mode);
		ut_a(type == OS_FILE_WRITE);
		return(os_file_write_func(name, file.m_file, buf, offset, n));
	}

	/* A failed native submission jumps back here (via err_exit) when
	os_file_handle_error() asks for a retry. */
try_again:
	/* Pick the aio array that serves this mode; in read-only mode the
	ibuf and log requests are routed to the read array. */
	switch (mode) {
	case OS_AIO_NORMAL:
		if (type == OS_FILE_READ) {
			array = os_aio_read_array;
		} else {
			ut_ad(!srv_read_only_mode);
			array = os_aio_write_array;
		}
		break;
	case OS_AIO_IBUF:
		ut_ad(type == OS_FILE_READ);
		/* Reduce probability of deadlock bugs in connection with ibuf:
		do not let the ibuf i/o handler sleep */

		wake_later = FALSE;

		if (srv_read_only_mode) {
			array = os_aio_read_array;
		} else {
			array = os_aio_ibuf_array;
		}
		break;
	case OS_AIO_LOG:
		if (srv_read_only_mode) {
			array = os_aio_read_array;
		} else {
			array = os_aio_log_array;
		}
		break;
	case OS_AIO_SYNC:
		/* Only reachable with WIN_ASYNC_IO and native aio enabled:
		every other OS_AIO_SYNC request returned above. */
		array = os_aio_sync_array;
#if defined(LINUX_NATIVE_AIO)
		/* In Linux native AIO we don't use sync IO array. */
		ut_a(!srv_use_native_aio);
#endif /* LINUX_NATIVE_AIO */
		break;
	default:
		ut_error;
		array = NULL; /* Eliminate compiler warning */
	}

	/* Record the request in a slot of the chosen array. */
	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
					 name, buf, offset, n);
	if (type == OS_FILE_READ) {
		if (srv_use_native_aio) {
			/* Native aio: update the read counters and submit
			the request to the OS. */
			os_n_file_reads++;
			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(file.m_file, buf, (DWORD) n, &len,
				       &(slot->control));
#elif defined(LINUX_NATIVE_AIO)
			if (!os_aio_linux_dispatch(array, slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else {
			/* Simulated aio: the request stays in the slot;
			wake the handler thread of the slot's segment unless
			the caller will do the waking later. */
			if (!wake_later) {
				os_aio_simulated_wake_handler_thread(
					os_aio_get_segment_no_from_slot(
						array, slot));
			}
		}
	} else if (type == OS_FILE_WRITE) {
		ut_ad(!srv_read_only_mode);
		if (srv_use_native_aio) {
			/* Native aio: update the write counter and submit
			the request to the OS. */
			os_n_file_writes++;
#ifdef WIN_ASYNC_IO
			ret = WriteFile(file.m_file, buf, (DWORD) n, &len,
					&(slot->control));
#elif defined(LINUX_NATIVE_AIO)
			if (!os_aio_linux_dispatch(array, slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else {
			/* Simulated aio: same handling as for reads. */
			if (!wake_later) {
				os_aio_simulated_wake_handler_thread(
					os_aio_get_segment_no_from_slot(
						array, slot));
			}
		}
	} else {
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (srv_use_native_aio) {
		/* The request counts as queued if it either completed
		immediately with the full length or is still pending. */
		if ((ret && len == n)
		    || (!ret && GetLastError() == ERROR_IO_PENDING)) {
			/* aio was queued successfully! */

			if (mode == OS_AIO_SYNC) {
				/* We want a synchronous i/o operation on a
				file where we also use async i/o: in Windows
				we must use the same wait mechanism as for
				async i/o */

				retval = os_aio_windows_handle(
					ULINT_UNDEFINED, slot->pos,
					&dummy_mess1, &dummy_mess2,
					&dummy_type);

				return(retval);
			}

			return(TRUE);
		}

		goto err_exit;
	}
#endif /* WIN_ASYNC_IO */
	/* aio was queued successfully! */
	return(TRUE);

#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
	/* Native submission failed: give the slot back, then let
	os_file_handle_error() decide whether to start over. */
	os_aio_array_free_slot(array, slot);

	if (os_file_handle_error(
		name,type == OS_FILE_READ ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(FALSE);
}
4751
4752 #ifdef WIN_ASYNC_IO
4753 /**********************************************************************//**
4754 This function is only used in Windows asynchronous i/o.
4755 Waits for an aio operation to complete. This function is used to wait the
4756 for completed requests. The aio array of pending requests is divided
4757 into segments. The thread specifies which segment or slot it wants to wait
4758 for. NOTE: this function will also take care of freeing the aio slot,
4759 therefore no other thread is allowed to do the freeing!
4760 @return TRUE if the aio operation succeeded */
4761 UNIV_INTERN
4762 ibool
os_aio_windows_handle(ulint segment,ulint pos,fil_node_t ** message1,void ** message2,ulint * type)4763 os_aio_windows_handle(
4764 /*==================*/
4765 ulint segment, /*!< in: the number of the segment in the aio
4766 arrays to wait for; segment 0 is the ibuf
4767 i/o thread, segment 1 the log i/o thread,
4768 then follow the non-ibuf read threads, and as
4769 the last are the non-ibuf write threads; if
4770 this is ULINT_UNDEFINED, then it means that
4771 sync aio is used, and this parameter is
4772 ignored */
4773 ulint pos, /*!< this parameter is used only in sync aio:
4774 wait for the aio slot at this position */
4775 fil_node_t**message1, /*!< out: the messages passed with the aio
4776 request; note that also in the case where
4777 the aio operation failed, these output
4778 parameters are valid and can be used to
4779 restart the operation, for example */
4780 void** message2,
4781 ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
4782 {
4783 ulint orig_seg = segment;
4784 os_aio_array_t* array;
4785 os_aio_slot_t* slot;
4786 ulint n;
4787 ulint i;
4788 ibool ret_val;
4789 BOOL ret;
4790 DWORD len;
4791 BOOL retry = FALSE;
4792
4793 if (segment == ULINT_UNDEFINED) {
4794 segment = 0;
4795 array = os_aio_sync_array;
4796 } else {
4797 segment = os_aio_get_array_and_local_segment(&array, segment);
4798 }
4799
4800 /* NOTE! We only access constant fields in os_aio_array. Therefore
4801 we do not have to acquire the protecting mutex yet */
4802
4803 ut_ad(os_aio_validate_skip());
4804 ut_ad(segment < array->n_segments);
4805
4806 n = array->n_slots / array->n_segments;
4807
4808 if (array == os_aio_sync_array) {
4809
4810 WaitForSingleObject(
4811 os_aio_array_get_nth_slot(array, pos)->handle,
4812 INFINITE);
4813
4814 i = pos;
4815
4816 } else {
4817 if (orig_seg != ULINT_UNDEFINED) {
4818 srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
4819 }
4820
4821 i = WaitForMultipleObjects(
4822 (DWORD) n, array->handles + segment * n,
4823 FALSE, INFINITE);
4824 }
4825
4826 os_mutex_enter(array->mutex);
4827
4828 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
4829 && array->n_reserved == 0) {
4830 *message1 = NULL;
4831 *message2 = NULL;
4832 os_mutex_exit(array->mutex);
4833 return(TRUE);
4834 }
4835
4836 ut_a(i >= WAIT_OBJECT_0 && i <= WAIT_OBJECT_0 + n);
4837
4838 slot = os_aio_array_get_nth_slot(array, i + segment * n);
4839
4840 ut_a(slot->reserved);
4841
4842 if (orig_seg != ULINT_UNDEFINED) {
4843 srv_set_io_thread_op_info(
4844 orig_seg, "get windows aio return value");
4845 }
4846 ret = GetOverlappedResult(slot->file.m_file, &(slot->control), &len, TRUE);
4847
4848 *message1 = slot->message1;
4849 *message2 = slot->message2;
4850
4851 *type = slot->type;
4852
4853 if (ret && len == slot->len) {
4854
4855 ret_val = TRUE;
4856 } else if (os_file_handle_error(slot->name, "Windows aio")) {
4857
4858 retry = TRUE;
4859 } else {
4860
4861 ret_val = FALSE;
4862 }
4863
4864 os_mutex_exit(array->mutex);
4865
4866 if (retry) {
4867 /* retry failed read/write operation synchronously.
4868 No need to hold array->mutex. */
4869
4870 #ifdef UNIV_PFS_IO
4871 /* This read/write does not go through os_file_read
4872 and os_file_write APIs, need to register with
4873 performance schema explicitly here. */
4874 struct PSI_file_locker* locker = NULL;
4875 PSI_file_locker_state state;
4876 register_pfs_file_io_begin(&state, locker, slot->file, slot->len,
4877 (slot->type == OS_FILE_WRITE)
4878 ? PSI_FILE_WRITE
4879 : PSI_FILE_READ,
4880 __FILE__, __LINE__);
4881 #endif
4882
4883 ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
4884
4885 switch (slot->type) {
4886 case OS_FILE_WRITE:
4887 ret = WriteFile(slot->file.m_file, slot->buf,
4888 (DWORD) slot->len, &len,
4889 &(slot->control));
4890 break;
4891 case OS_FILE_READ:
4892 ret = ReadFile(slot->file.m_file, slot->buf,
4893 (DWORD) slot->len, &len,
4894 &(slot->control));
4895 break;
4896 default:
4897 ut_error;
4898 }
4899
4900 #ifdef UNIV_PFS_IO
4901 register_pfs_file_io_end(locker, len);
4902 #endif
4903
4904 if (!ret && GetLastError() == ERROR_IO_PENDING) {
4905 /* aio was queued successfully!
4906 We want a synchronous i/o operation on a
4907 file where we also use async i/o: in Windows
4908 we must use the same wait mechanism as for
4909 async i/o */
4910 ret = GetOverlappedResult(slot->file.m_file,
4911 &(slot->control),
4912 &len, TRUE);
4913 }
4914
4915 ret_val = ret && len == slot->len;
4916 }
4917
4918 os_aio_array_free_slot(array, slot);
4919
4920 return(ret_val);
4921 }
4922 #endif
4923
4924 #if defined(LINUX_NATIVE_AIO)
4925 /******************************************************************//**
4926 This function is only used in Linux native asynchronous i/o. This is
4927 called from within the io-thread. If there are no completed IO requests
4928 in the slot array, the thread calls this function to collect more
4929 requests from the kernel.
4930 The io-thread waits on io_getevents(), which is a blocking call, with
4931 a timeout value. Unless the system is very heavy loaded, keeping the
4932 io-thread very busy, the io-thread will spend most of its time waiting
4933 in this function.
4934 The io-thread also exits in this function. It checks server status at
4935 each wakeup and that is why we use timed wait in io_getevents(). */
4936 static
4937 void
os_aio_linux_collect(os_aio_array_t * array,ulint segment,ulint seg_size)4938 os_aio_linux_collect(
4939 /*=================*/
4940 os_aio_array_t* array, /*!< in/out: slot array. */
4941 ulint segment, /*!< in: local segment no. */
4942 ulint seg_size) /*!< in: segment size. */
4943 {
4944 int i;
4945 int ret;
4946 ulint start_pos;
4947 ulint end_pos;
4948 struct timespec timeout;
4949 struct io_event* events;
4950 struct io_context* io_ctx;
4951
4952 /* sanity checks. */
4953 ut_ad(array != NULL);
4954 ut_ad(seg_size > 0);
4955 ut_ad(segment < array->n_segments);
4956
4957 /* Which part of event array we are going to work on. */
4958 events = &array->aio_events[segment * seg_size];
4959
4960 /* Which io_context we are going to use. */
4961 io_ctx = array->aio_ctx[segment];
4962
4963 /* Starting point of the segment we will be working on. */
4964 start_pos = segment * seg_size;
4965
4966 /* End point. */
4967 end_pos = start_pos + seg_size;
4968
4969 retry:
4970
4971 /* Initialize the events. The timeout value is arbitrary.
4972 We probably need to experiment with it a little. */
4973 memset(events, 0, sizeof(*events) * seg_size);
4974 timeout.tv_sec = 0;
4975 timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4976
4977 ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4978
4979 if (ret > 0) {
4980 for (i = 0; i < ret; i++) {
4981 os_aio_slot_t* slot;
4982 struct iocb* control;
4983
4984 control = (struct iocb*) events[i].obj;
4985 ut_a(control != NULL);
4986
4987 slot = (os_aio_slot_t*) control->data;
4988
4989 /* Some sanity checks. */
4990 ut_a(slot != NULL);
4991 ut_a(slot->reserved);
4992
4993 #if defined(UNIV_AIO_DEBUG)
4994 fprintf(stderr,
4995 "io_getevents[%c]: slot[%p] ctx[%p]"
4996 " seg[%lu]\n",
4997 (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4998 slot, io_ctx, segment);
4999 #endif
5000
5001 /* We are not scribbling previous segment. */
5002 ut_a(slot->pos >= start_pos);
5003
5004 /* We have not overstepped to next segment. */
5005 ut_a(slot->pos < end_pos);
5006
5007 /* Mark this request as completed. The error handling
5008 will be done in the calling function. */
5009 os_mutex_enter(array->mutex);
5010 slot->n_bytes = events[i].res;
5011 slot->ret = events[i].res2;
5012 slot->io_already_done = TRUE;
5013 os_mutex_exit(array->mutex);
5014 }
5015 return;
5016 }
5017
5018 if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
5019 return;
5020 }
5021
5022 /* This error handling is for any error in collecting the
5023 IO requests. The errors, if any, for any particular IO
5024 request are simply passed on to the calling routine. */
5025
5026 switch (ret) {
5027 case -EAGAIN:
5028 /* Not enough resources! Try again. */
5029 case -EINTR:
5030 /* Interrupted! I have tested the behaviour in case of an
5031 interrupt. If we have some completed IOs available then
5032 the return code will be the number of IOs. We get EINTR only
5033 if there are no completed IOs and we have been interrupted. */
5034 case 0:
5035 /* No pending request! Go back and check again. */
5036 goto retry;
5037 }
5038
5039 /* All other errors should cause a trap for now. */
5040 ut_print_timestamp(stderr);
5041 fprintf(stderr,
5042 " InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
5043 ret);
5044 ut_error;
5045 }
5046
5047 /**********************************************************************//**
5048 This function is only used in Linux native asynchronous i/o.
5049 Waits for an aio operation to complete. This function is used to wait for
5050 the completed requests. The aio array of pending requests is divided
5051 into segments. The thread specifies which segment or slot it wants to wait
5052 for. NOTE: this function will also take care of freeing the aio slot,
5053 therefore no other thread is allowed to do the freeing!
5054 @return TRUE if the IO was successful */
5055 UNIV_INTERN
5056 ibool
os_aio_linux_handle(ulint global_seg,fil_node_t ** message1,void ** message2,ulint * type)5057 os_aio_linux_handle(
5058 /*================*/
5059 ulint global_seg, /*!< in: segment number in the aio array
5060 to wait for; segment 0 is the ibuf
5061 i/o thread, segment 1 is log i/o thread,
5062 then follow the non-ibuf read threads,
5063 and the last are the non-ibuf write
5064 threads. */
5065 fil_node_t**message1, /*!< out: the messages passed with the */
5066 void** message2, /*!< aio request; note that in case the
5067 aio operation failed, these output
5068 parameters are valid and can be used to
5069 restart the operation. */
5070 ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
5071 {
5072 ulint segment;
5073 os_aio_array_t* array;
5074 os_aio_slot_t* slot;
5075 ulint n;
5076 ulint i;
5077 ibool ret = FALSE;
5078
5079 /* Should never be doing Sync IO here. */
5080 ut_a(global_seg != ULINT_UNDEFINED);
5081
5082 /* Find the array and the local segment. */
5083 segment = os_aio_get_array_and_local_segment(&array, global_seg);
5084 n = array->n_slots / array->n_segments;
5085
5086 /* Loop until we have found a completed request. */
5087 for (;;) {
5088 ibool any_reserved = FALSE;
5089 os_mutex_enter(array->mutex);
5090 for (i = 0; i < n; ++i) {
5091 slot = os_aio_array_get_nth_slot(
5092 array, i + segment * n);
5093 if (!slot->reserved) {
5094 continue;
5095 } else if (slot->io_already_done) {
5096 /* Something for us to work on. */
5097 goto found;
5098 } else {
5099 any_reserved = TRUE;
5100 }
5101 }
5102
5103 os_mutex_exit(array->mutex);
5104
5105 /* There is no completed request.
5106 If there is no pending request at all,
5107 and the system is being shut down, exit. */
5108 if (UNIV_UNLIKELY
5109 (!any_reserved
5110 && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
5111 *message1 = NULL;
5112 *message2 = NULL;
5113 return(TRUE);
5114 }
5115
5116 /* Wait for some request. Note that we return
5117 from wait iff we have found a request. */
5118
5119 srv_set_io_thread_op_info(global_seg,
5120 "waiting for completed aio requests");
5121 os_aio_linux_collect(array, segment, n);
5122 }
5123
5124 found:
5125 /* Note that it may be that there are more then one completed
5126 IO requests. We process them one at a time. We may have a case
5127 here to improve the performance slightly by dealing with all
5128 requests in one sweep. */
5129 srv_set_io_thread_op_info(global_seg,
5130 "processing completed aio requests");
5131
5132 /* Ensure that we are scribbling only our segment. */
5133 ut_a(i < n);
5134
5135 ut_ad(slot != NULL);
5136 ut_ad(slot->reserved);
5137 ut_ad(slot->io_already_done);
5138
5139 *message1 = slot->message1;
5140 *message2 = slot->message2;
5141
5142 *type = slot->type;
5143
5144 if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
5145
5146 ret = TRUE;
5147 } else {
5148 errno = -slot->ret;
5149
5150 /* os_file_handle_error does tell us if we should retry
5151 this IO. As it stands now, we don't do this retry when
5152 reaping requests from a different context than
5153 the dispatcher. This non-retry logic is the same for
5154 windows and linux native AIO.
5155 We should probably look into this to transparently
5156 re-submit the IO. */
5157 os_file_handle_error(slot->name, "Linux aio");
5158
5159 ret = FALSE;
5160 }
5161
5162 os_mutex_exit(array->mutex);
5163
5164 os_aio_array_free_slot(array, slot);
5165
5166 return(ret);
5167 }
5168 #endif /* LINUX_NATIVE_AIO */
5169
5170 /**********************************************************************//**
5171 Does simulated aio. This function should be called by an i/o-handler
5172 thread.
5173 @return TRUE if the aio operation succeeded */
5174 UNIV_INTERN
5175 ibool
os_aio_simulated_handle(ulint global_segment,fil_node_t ** message1,void ** message2,ulint * type)5176 os_aio_simulated_handle(
5177 /*====================*/
5178 ulint global_segment, /*!< in: the number of the segment in the aio
5179 arrays to wait for; segment 0 is the ibuf
5180 i/o thread, segment 1 the log i/o thread,
5181 then follow the non-ibuf read threads, and as
5182 the last are the non-ibuf write threads */
5183 fil_node_t**message1, /*!< out: the messages passed with the aio
5184 request; note that also in the case where
5185 the aio operation failed, these output
5186 parameters are valid and can be used to
5187 restart the operation, for example */
5188 void** message2,
5189 ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
5190 {
5191 os_aio_array_t* array;
5192 ulint segment;
5193 os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
5194 ulint n_consecutive;
5195 ulint total_len;
5196 ulint offs;
5197 os_offset_t lowest_offset;
5198 ulint biggest_age;
5199 ulint age;
5200 byte* combined_buf;
5201 byte* combined_buf2;
5202 ibool ret;
5203 ibool any_reserved;
5204 ulint n;
5205 os_aio_slot_t* aio_slot;
5206
5207 /* Fix compiler warning */
5208 *consecutive_ios = NULL;
5209
5210 segment = os_aio_get_array_and_local_segment(&array, global_segment);
5211
5212 restart:
5213 /* NOTE! We only access constant fields in os_aio_array. Therefore
5214 we do not have to acquire the protecting mutex yet */
5215
5216 srv_set_io_thread_op_info(global_segment,
5217 "looking for i/o requests (a)");
5218 ut_ad(os_aio_validate_skip());
5219 ut_ad(segment < array->n_segments);
5220
5221 n = array->n_slots / array->n_segments;
5222
5223 /* Look through n slots after the segment * n'th slot */
5224
5225 if (array == os_aio_read_array
5226 && os_aio_recommend_sleep_for_read_threads) {
5227
5228 /* Give other threads chance to add several i/os to the array
5229 at once. */
5230
5231 goto recommended_sleep;
5232 }
5233
5234 srv_set_io_thread_op_info(global_segment,
5235 "looking for i/o requests (b)");
5236
5237 /* Check if there is a slot for which the i/o has already been
5238 done */
5239 any_reserved = FALSE;
5240
5241 os_mutex_enter(array->mutex);
5242
5243 for (ulint i = 0; i < n; i++) {
5244 os_aio_slot_t* slot;
5245
5246 slot = os_aio_array_get_nth_slot(array, i + segment * n);
5247
5248 if (!slot->reserved) {
5249 continue;
5250 } else if (slot->io_already_done) {
5251
5252 if (os_aio_print_debug) {
5253 fprintf(stderr,
5254 "InnoDB: i/o for slot %lu"
5255 " already done, returning\n",
5256 (ulong) i);
5257 }
5258
5259 aio_slot = slot;
5260 ret = TRUE;
5261 goto slot_io_done;
5262 } else {
5263 any_reserved = TRUE;
5264 }
5265 }
5266
5267 /* There is no completed request.
5268 If there is no pending request at all,
5269 and the system is being shut down, exit. */
5270 if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
5271 os_mutex_exit(array->mutex);
5272 *message1 = NULL;
5273 *message2 = NULL;
5274 return(TRUE);
5275 }
5276
5277 n_consecutive = 0;
5278
5279 /* If there are at least 2 seconds old requests, then pick the oldest
5280 one to prevent starvation. If several requests have the same age,
5281 then pick the one at the lowest offset. */
5282
5283 biggest_age = 0;
5284 lowest_offset = IB_UINT64_MAX;
5285
5286 for (ulint i = 0; i < n; i++) {
5287 os_aio_slot_t* slot;
5288
5289 slot = os_aio_array_get_nth_slot(array, i + segment * n);
5290
5291 if (slot->reserved) {
5292
5293 age = (ulint) difftime(
5294 ut_time(), slot->reservation_time);
5295
5296 if ((age >= 2 && age > biggest_age)
5297 || (age >= 2 && age == biggest_age
5298 && slot->offset < lowest_offset)) {
5299
5300 /* Found an i/o request */
5301 consecutive_ios[0] = slot;
5302
5303 n_consecutive = 1;
5304
5305 biggest_age = age;
5306 lowest_offset = slot->offset;
5307 }
5308 }
5309 }
5310
5311 if (n_consecutive == 0) {
5312 /* There were no old requests. Look for an i/o request at the
5313 lowest offset in the array (we ignore the high 32 bits of the
5314 offset in these heuristics) */
5315
5316 lowest_offset = IB_UINT64_MAX;
5317
5318 for (ulint i = 0; i < n; i++) {
5319 os_aio_slot_t* slot;
5320
5321 slot = os_aio_array_get_nth_slot(
5322 array, i + segment * n);
5323
5324 if (slot->reserved && slot->offset < lowest_offset) {
5325
5326 /* Found an i/o request */
5327 consecutive_ios[0] = slot;
5328
5329 n_consecutive = 1;
5330
5331 lowest_offset = slot->offset;
5332 }
5333 }
5334 }
5335
5336 if (n_consecutive == 0) {
5337
5338 /* No i/o requested at the moment */
5339
5340 goto wait_for_io;
5341 }
5342
5343 /* if n_consecutive != 0, then we have assigned
5344 something valid to consecutive_ios[0] */
5345 ut_ad(n_consecutive != 0);
5346 ut_ad(consecutive_ios[0] != NULL);
5347
5348 aio_slot = consecutive_ios[0];
5349
5350 /* Check if there are several consecutive blocks to read or write */
5351
5352 consecutive_loop:
5353 for (ulint i = 0; i < n; i++) {
5354 os_aio_slot_t* slot;
5355
5356 slot = os_aio_array_get_nth_slot(array, i + segment * n);
5357 if (slot->reserved
5358 && slot != aio_slot
5359 && slot->offset == aio_slot->offset + aio_slot->len
5360 && slot->type == aio_slot->type
5361 && slot->file.m_file == aio_slot->file.m_file) {
5362
5363 /* Found a consecutive i/o request */
5364
5365 consecutive_ios[n_consecutive] = slot;
5366 n_consecutive++;
5367
5368 aio_slot = slot;
5369
5370 if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
5371
5372 goto consecutive_loop;
5373 } else {
5374 break;
5375 }
5376 }
5377 }
5378
5379 srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
5380
5381 /* We have now collected n_consecutive i/o requests in the array;
5382 allocate a single buffer which can hold all data, and perform the
5383 i/o */
5384
5385 total_len = 0;
5386 aio_slot = consecutive_ios[0];
5387
5388 for (ulint i = 0; i < n_consecutive; i++) {
5389 total_len += consecutive_ios[i]->len;
5390 }
5391
5392 if (n_consecutive == 1) {
5393 /* We can use the buffer of the i/o request */
5394 combined_buf = aio_slot->buf;
5395 combined_buf2 = NULL;
5396 } else {
5397 combined_buf2 = static_cast<byte*>(
5398 ut_malloc(total_len + UNIV_PAGE_SIZE));
5399
5400 ut_a(combined_buf2);
5401
5402 combined_buf = static_cast<byte*>(
5403 ut_align(combined_buf2, UNIV_PAGE_SIZE));
5404 }
5405
5406 /* We release the array mutex for the time of the i/o: NOTE that
5407 this assumes that there is just one i/o-handler thread serving
5408 a single segment of slots! */
5409
5410 os_mutex_exit(array->mutex);
5411
5412 if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
5413 /* Copy the buffers to the combined buffer */
5414 offs = 0;
5415
5416 for (ulint i = 0; i < n_consecutive; i++) {
5417
5418 ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
5419 consecutive_ios[i]->len);
5420
5421 offs += consecutive_ios[i]->len;
5422 }
5423 }
5424
5425 srv_set_io_thread_op_info(global_segment, "doing file i/o");
5426
5427 /* Do the i/o with ordinary, synchronous i/o functions: */
5428 if (aio_slot->type == OS_FILE_WRITE) {
5429 ut_ad(!srv_read_only_mode);
5430 ret = os_file_write(
5431 aio_slot->name, aio_slot->file, combined_buf,
5432 aio_slot->offset, total_len);
5433 } else {
5434 ret = os_file_read(
5435 aio_slot->file, combined_buf,
5436 aio_slot->offset, total_len);
5437 }
5438
5439 ut_a(ret);
5440 srv_set_io_thread_op_info(global_segment, "file i/o done");
5441
5442 if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
5443 /* Copy the combined buffer to individual buffers */
5444 offs = 0;
5445
5446 for (ulint i = 0; i < n_consecutive; i++) {
5447
5448 ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
5449 consecutive_ios[i]->len);
5450 offs += consecutive_ios[i]->len;
5451 }
5452 }
5453
5454 if (combined_buf2) {
5455 ut_free(combined_buf2);
5456 }
5457
5458 os_mutex_enter(array->mutex);
5459
5460 /* Mark the i/os done in slots */
5461
5462 for (ulint i = 0; i < n_consecutive; i++) {
5463 consecutive_ios[i]->io_already_done = TRUE;
5464 }
5465
5466 /* We return the messages for the first slot now, and if there were
5467 several slots, the messages will be returned with subsequent calls
5468 of this function */
5469
5470 slot_io_done:
5471
5472 ut_a(aio_slot->reserved);
5473
5474 *message1 = aio_slot->message1;
5475 *message2 = aio_slot->message2;
5476
5477 *type = aio_slot->type;
5478
5479 os_mutex_exit(array->mutex);
5480
5481 os_aio_array_free_slot(array, aio_slot);
5482
5483 return(ret);
5484
5485 wait_for_io:
5486 srv_set_io_thread_op_info(global_segment, "resetting wait event");
5487
5488 /* We wait here until there again can be i/os in the segment
5489 of this thread */
5490
5491 os_event_reset(os_aio_segment_wait_events[global_segment]);
5492
5493 os_mutex_exit(array->mutex);
5494
5495 recommended_sleep:
5496 srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
5497
5498 os_event_wait(os_aio_segment_wait_events[global_segment]);
5499
5500 goto restart;
5501 }
5502
5503 /**********************************************************************//**
5504 Validates the consistency of an aio array.
5505 @return true if ok */
5506 static
5507 bool
os_aio_array_validate(os_aio_array_t * array)5508 os_aio_array_validate(
5509 /*==================*/
5510 os_aio_array_t* array) /*!< in: aio wait array */
5511 {
5512 ulint i;
5513 ulint n_reserved = 0;
5514
5515 os_mutex_enter(array->mutex);
5516
5517 ut_a(array->n_slots > 0);
5518 ut_a(array->n_segments > 0);
5519
5520 for (i = 0; i < array->n_slots; i++) {
5521 os_aio_slot_t* slot;
5522
5523 slot = os_aio_array_get_nth_slot(array, i);
5524
5525 if (slot->reserved) {
5526 n_reserved++;
5527 ut_a(slot->len > 0);
5528 }
5529 }
5530
5531 ut_a(array->n_reserved == n_reserved);
5532
5533 os_mutex_exit(array->mutex);
5534
5535 return(true);
5536 }
5537
5538 /**********************************************************************//**
5539 Validates the consistency the aio system.
5540 @return TRUE if ok */
5541 UNIV_INTERN
5542 ibool
os_aio_validate(void)5543 os_aio_validate(void)
5544 /*=================*/
5545 {
5546 os_aio_array_validate(os_aio_read_array);
5547
5548 if (os_aio_write_array != 0) {
5549 os_aio_array_validate(os_aio_write_array);
5550 }
5551
5552 if (os_aio_ibuf_array != 0) {
5553 os_aio_array_validate(os_aio_ibuf_array);
5554 }
5555
5556 if (os_aio_log_array != 0) {
5557 os_aio_array_validate(os_aio_log_array);
5558 }
5559
5560 if (os_aio_sync_array != 0) {
5561 os_aio_array_validate(os_aio_sync_array);
5562 }
5563
5564 return(TRUE);
5565 }
5566
5567 /**********************************************************************//**
5568 Prints pending IO requests per segment of an aio array.
5569 We probably don't need per segment statistics but they can help us
5570 during development phase to see if the IO requests are being
5571 distributed as expected. */
5572 static
5573 void
os_aio_print_segment_info(FILE * file,ulint * n_seg,os_aio_array_t * array)5574 os_aio_print_segment_info(
5575 /*======================*/
5576 FILE* file, /*!< in: file where to print */
5577 ulint* n_seg, /*!< in: pending IO array */
5578 os_aio_array_t* array) /*!< in: array to process */
5579 {
5580 ulint i;
5581
5582 ut_ad(array);
5583 ut_ad(n_seg);
5584 ut_ad(array->n_segments > 0);
5585
5586 if (array->n_segments == 1) {
5587 return;
5588 }
5589
5590 fprintf(file, " [");
5591 for (i = 0; i < array->n_segments; i++) {
5592 if (i != 0) {
5593 fprintf(file, ", ");
5594 }
5595
5596 fprintf(file, "%lu", n_seg[i]);
5597 }
5598 fprintf(file, "] ");
5599 }
5600
5601 /**********************************************************************//**
5602 Prints info about the aio array. */
5603 UNIV_INTERN
5604 void
os_aio_print_array(FILE * file,os_aio_array_t * array)5605 os_aio_print_array(
5606 /*==============*/
5607 FILE* file, /*!< in: file where to print */
5608 os_aio_array_t* array) /*!< in: aio array to print */
5609 {
5610 ulint n_reserved = 0;
5611 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
5612
5613 os_mutex_enter(array->mutex);
5614
5615 ut_a(array->n_slots > 0);
5616 ut_a(array->n_segments > 0);
5617
5618 memset(n_res_seg, 0x0, sizeof(n_res_seg));
5619
5620 for (ulint i = 0; i < array->n_slots; ++i) {
5621 os_aio_slot_t* slot;
5622 ulint seg_no;
5623
5624 slot = os_aio_array_get_nth_slot(array, i);
5625
5626 seg_no = (i * array->n_segments) / array->n_slots;
5627
5628 if (slot->reserved) {
5629 ++n_reserved;
5630 ++n_res_seg[seg_no];
5631
5632 ut_a(slot->len > 0);
5633 }
5634 }
5635
5636 ut_a(array->n_reserved == n_reserved);
5637
5638 fprintf(file, " %lu", (ulong) n_reserved);
5639
5640 os_aio_print_segment_info(file, n_res_seg, array);
5641
5642 os_mutex_exit(array->mutex);
5643 }
5644
5645 /**********************************************************************//**
5646 Prints info of the aio arrays. */
5647 UNIV_INTERN
5648 void
os_aio_print(FILE * file)5649 os_aio_print(
5650 /*=========*/
5651 FILE* file) /*!< in: file where to print */
5652 {
5653 time_t current_time;
5654 double time_elapsed;
5655 double avg_bytes_read;
5656
5657 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
5658 fprintf(file, "I/O thread %lu state: %s (%s)",
5659 (ulong) i,
5660 srv_io_thread_op_info[i],
5661 srv_io_thread_function[i]);
5662
5663 #ifndef __WIN__
5664 if (os_aio_segment_wait_events[i]->is_set) {
5665 fprintf(file, " ev set");
5666 }
5667 #endif /* __WIN__ */
5668
5669 fprintf(file, "\n");
5670 }
5671
5672 fputs("Pending normal aio reads:", file);
5673
5674 os_aio_print_array(file, os_aio_read_array);
5675
5676 if (os_aio_write_array != 0) {
5677 fputs(", aio writes:", file);
5678 os_aio_print_array(file, os_aio_write_array);
5679 }
5680
5681 if (os_aio_ibuf_array != 0) {
5682 fputs(",\n ibuf aio reads:", file);
5683 os_aio_print_array(file, os_aio_ibuf_array);
5684 }
5685
5686 if (os_aio_log_array != 0) {
5687 fputs(", log i/o's:", file);
5688 os_aio_print_array(file, os_aio_log_array);
5689 }
5690
5691 if (os_aio_sync_array != 0) {
5692 fputs(", sync i/o's:", file);
5693 os_aio_print_array(file, os_aio_sync_array);
5694 }
5695
5696 putc('\n', file);
5697 current_time = ut_time();
5698 time_elapsed = 0.001 + difftime(current_time, os_last_printout);
5699
5700 fprintf(file,
5701 "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
5702 "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
5703 (ulong) fil_n_pending_log_flushes,
5704 (ulong) fil_n_pending_tablespace_flushes,
5705 (ulong) os_n_file_reads,
5706 (ulong) os_n_file_writes,
5707 (ulong) os_n_fsyncs);
5708
5709 if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
5710 fprintf(file,
5711 "%lu pending preads, %lu pending pwrites\n",
5712 (ulong) os_file_n_pending_preads,
5713 (ulong) os_file_n_pending_pwrites);
5714 }
5715
5716 if (os_n_file_reads == os_n_file_reads_old) {
5717 avg_bytes_read = 0.0;
5718 } else {
5719 avg_bytes_read = (double) os_bytes_read_since_printout
5720 / (os_n_file_reads - os_n_file_reads_old);
5721 }
5722
5723 fprintf(file,
5724 "%.2f reads/s, %lu avg bytes/read,"
5725 " %.2f writes/s, %.2f fsyncs/s\n",
5726 (os_n_file_reads - os_n_file_reads_old)
5727 / time_elapsed,
5728 (ulong) avg_bytes_read,
5729 (os_n_file_writes - os_n_file_writes_old)
5730 / time_elapsed,
5731 (os_n_fsyncs - os_n_fsyncs_old)
5732 / time_elapsed);
5733
5734 os_n_file_reads_old = os_n_file_reads;
5735 os_n_file_writes_old = os_n_file_writes;
5736 os_n_fsyncs_old = os_n_fsyncs;
5737 os_bytes_read_since_printout = 0;
5738
5739 os_last_printout = current_time;
5740 }
5741
5742 /**********************************************************************//**
5743 Refreshes the statistics used to print per-second averages. */
5744 UNIV_INTERN
5745 void
os_aio_refresh_stats(void)5746 os_aio_refresh_stats(void)
5747 /*======================*/
5748 {
5749 os_n_file_reads_old = os_n_file_reads;
5750 os_n_file_writes_old = os_n_file_writes;
5751 os_n_fsyncs_old = os_n_fsyncs;
5752 os_bytes_read_since_printout = 0;
5753
5754 os_last_printout = time(NULL);
5755 }
5756
5757 #ifdef UNIV_DEBUG
5758 /**********************************************************************//**
5759 Checks that all slots in the system have been freed, that is, there are
5760 no pending io operations.
5761 @return TRUE if all free */
5762 UNIV_INTERN
5763 ibool
os_aio_all_slots_free(void)5764 os_aio_all_slots_free(void)
5765 /*=======================*/
5766 {
5767 os_aio_array_t* array;
5768 ulint n_res = 0;
5769
5770 array = os_aio_read_array;
5771
5772 os_mutex_enter(array->mutex);
5773
5774 n_res += array->n_reserved;
5775
5776 os_mutex_exit(array->mutex);
5777
5778 if (!srv_read_only_mode) {
5779 ut_a(os_aio_write_array == 0);
5780
5781 array = os_aio_write_array;
5782
5783 os_mutex_enter(array->mutex);
5784
5785 n_res += array->n_reserved;
5786
5787 os_mutex_exit(array->mutex);
5788
5789 ut_a(os_aio_ibuf_array == 0);
5790
5791 array = os_aio_ibuf_array;
5792
5793 os_mutex_enter(array->mutex);
5794
5795 n_res += array->n_reserved;
5796
5797 os_mutex_exit(array->mutex);
5798 }
5799
5800 ut_a(os_aio_log_array == 0);
5801
5802 array = os_aio_log_array;
5803
5804 os_mutex_enter(array->mutex);
5805
5806 n_res += array->n_reserved;
5807
5808 os_mutex_exit(array->mutex);
5809
5810 array = os_aio_sync_array;
5811
5812 os_mutex_enter(array->mutex);
5813
5814 n_res += array->n_reserved;
5815
5816 os_mutex_exit(array->mutex);
5817
5818 if (n_res == 0) {
5819
5820 return(TRUE);
5821 }
5822
5823 return(FALSE);
5824 }
5825 #endif /* UNIV_DEBUG */
5826
5827 #endif /* !UNIV_HOTBACKUP */
5828