1 /***********************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 ***********************************************************************/
34 
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38 
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41 
42 #include "os0file.h"
43 
44 #ifdef UNIV_NONINL
45 #include "os0file.ic"
46 #endif
47 
48 #include "ut0mem.h"
49 #include "srv0srv.h"
50 #include "srv0start.h"
51 #include "fil0fil.h"
52 #include "buf0buf.h"
53 #include "srv0mon.h"
54 #ifndef UNIV_HOTBACKUP
55 # include "os0sync.h"
56 # include "os0thread.h"
57 #else /* !UNIV_HOTBACKUP */
58 # ifdef __WIN__
59 /* Add includes for the _stat() call to compile on Windows */
60 #  include <sys/types.h>
61 #  include <sys/stat.h>
62 #  include <errno.h>
63 # endif /* __WIN__ */
64 #endif /* !UNIV_HOTBACKUP */
65 
66 #if defined(LINUX_NATIVE_AIO)
67 #include <libaio.h>
68 #endif
69 
/** Insert buffer segment id (segment number 0) */
static const ulint IO_IBUF_SEGMENT = 0;

/** Log segment id (segment number 1) */
static const ulint IO_LOG_SEGMENT = 1;

/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask. The initializers below are only the compiled-in defaults. */

#ifndef __WIN__
/** Umask for creating files; the default grants read and write permission
to the owner and to the group */
UNIV_INTERN ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
#else
/** Umask for creating files; defaults to 0 in Windows builds */
UNIV_INTERN ulint	os_innodb_umask	= 0;
#endif /* __WIN__ */
87 
88 #ifndef UNIV_HOTBACKUP
/* We use these mutexes to protect an lseek + file i/o operation pair, if
the OS does not provide an atomic pread or pwrite, or similar. The mutexes
are created in os_io_init_simple(). */
#define OS_FILE_N_SEEK_MUTEXES	16
UNIV_INTERN os_ib_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];

/* In simulated aio, merge at most this many consecutive i/os into a
single request */
#define OS_AIO_MERGE_N_CONSECUTIVE	64
96 
97 /**********************************************************************
98 
99 InnoDB AIO Implementation:
100 =========================
101 
We support native AIO on Windows and Linux. On the rest of the platforms
we simulate AIO with special io-threads that service the IO requests.
104 
105 Simulated AIO:
106 ==============
107 
108 In platforms where we 'simulate' AIO following is a rough explanation
109 of the high level design.
110 There are four io-threads (for ibuf, log, read, write).
111 All synchronous IO requests are serviced by the calling thread using
112 os_file_write/os_file_read. The Asynchronous requests are queued up
113 in an array (there are four such arrays) by the calling thread.
114 Later these requests are picked up by the io-thread and are serviced
115 synchronously.
116 
117 Windows native AIO:
118 ==================
119 
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file is
opened for AIO no synchronous IO can be done on it. Therefore we have an
extra fifth array to queue up synchronous IO requests.
125 There are innodb_file_io_threads helper threads. These threads work
126 on the four arrays mentioned above in Simulated AIO. No thread is
127 required for the sync array.
128 If a synchronous IO request is made, it is first queued in the sync
129 array. Then the calling thread itself waits on the request, thus
130 making the call synchronous.
131 If an AIO request is made the calling thread not only queues it in the
132 array but also submits the requests. The helper thread then collects
133 the completed IO request and calls completion routine on it.
134 
135 Linux native AIO:
136 =================
137 
138 If we have libaio installed on the system and innodb_use_native_aio
139 is set to TRUE we follow the code path of native AIO, otherwise we
140 do simulated AIO.
141 There are innodb_file_io_threads helper threads. These threads work
142 on the four arrays mentioned above in Simulated AIO.
143 If a synchronous IO request is made, it is handled by calling
144 os_file_write/os_file_read.
145 If an AIO request is made the calling thread not only queues it in the
146 array but also submits the requests. The helper thread then collects
147 the completed IO request and calls completion routine on it.
148 
149 **********************************************************************/
150 
/** Flag: enable debug printout for asynchronous i/o; off (FALSE) by
default */
UNIV_INTERN ibool	os_aio_print_debug	= FALSE;

#ifdef UNIV_PFS_IO
/* Keys to register InnoDB I/O with performance schema; one key each for
data files, log files and temporary files (per the key names) */
UNIV_INTERN mysql_pfs_key_t  innodb_file_data_key;
UNIV_INTERN mysql_pfs_key_t  innodb_file_log_key;
UNIV_INTERN mysql_pfs_key_t  innodb_file_temp_key;
#endif /* UNIV_PFS_IO */
160 
/** The asynchronous i/o array slot structure. One slot describes a single
i/o request: the file, offset, buffer and length of the transfer, plus the
messages that the requester attached to it. */
struct os_aio_slot_t{
	ibool		is_read;	/*!< TRUE if a read operation */
	ulint		pos;		/*!< index of the slot in the aio
					array */
	ibool		reserved;	/*!< TRUE if this slot is reserved */
	time_t		reservation_time;/*!< time when reserved */
	ulint		len;		/*!< length of the block to read or
					write */
	byte*		buf;		/*!< buffer used in i/o */
	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
	os_offset_t	offset;		/*!< file offset in bytes */
	pfs_os_file_t	file;		/*!< file where to read or write */
	const char*	name;		/*!< file name or path */
	ibool		io_already_done;/*!< used only in simulated aio:
					TRUE if the physical i/o already
					made and only the slot message
					needs to be passed to the caller
					of os_aio_simulated_handle */
	fil_node_t*	message1;	/*!< first message given by the
					requester of an aio operation */
	void*		message2;	/*!< second message given by the
					requester of an aio operation; the
					messages can be used to identify
					which pending aio operation was
					completed */
#ifdef WIN_ASYNC_IO
	HANDLE		handle;		/*!< handle object we need in the
					OVERLAPPED struct */
	OVERLAPPED	control;	/*!< Windows control block for the
					aio request */
#elif defined(LINUX_NATIVE_AIO)
	struct iocb	control;	/*!< Linux control block for aio */
	int		n_bytes;	/*!< bytes written/read. */
	int		ret;		/*!< AIO return code */
#endif /* WIN_ASYNC_IO */
};
196 
/** The asynchronous i/o array structure: a mutex-protected set of
os_aio_slot_t slots, divided into segments */
struct os_aio_array_t{
	os_ib_mutex_t	mutex;	/*!< the mutex protecting the aio array */
	os_event_t	not_full;
				/*!< The event which is set to the
				signaled state when there is space in
				the aio outside the ibuf segment */
	os_event_t	is_empty;
				/*!< The event which is set to the
				signaled state when there are no
				pending i/os in this array */
	ulint		n_slots;/*!< Total number of slots in the aio
				array.  This must be divisible by
				n_threads.
				NOTE(review): this struct has no
				n_threads member; presumably
				n_segments is meant -- confirm. */
	ulint		n_segments;
				/*!< Number of segments in the aio
				array of pending aio requests. A
				thread can wait separately for any one
				of the segments. */
	ulint		cur_seg;/*!< We reserve IO requests in round
				robin fashion to different segments.
				This points to the segment that is to
				be used to service next IO request. */
	ulint		n_reserved;
				/*!< Number of reserved slots in the
				aio array outside the ibuf segment */
	os_aio_slot_t*	slots;	/*!< Pointer to the slots in the array */
#ifdef __WIN__
	HANDLE*		handles;
				/*!< Pointer to an array of OS native
				event handles where we copied the
				handles from slots, in the same
				order. This can be used in
				WaitForMultipleObjects; used only in
				Windows */
#endif /* __WIN__ */

#if defined(LINUX_NATIVE_AIO)
	io_context_t*		aio_ctx;
				/*!< completion queue for IO. There is
				one such queue per segment. Each thread
				will work on one ctx exclusively. */
	struct io_event*	aio_events;
				/*!< The array to collect completed IOs.
				There is one such event for each
				possible pending IO. The size of the
				array is equal to n_slots. */
#endif /* LINUX_NATIVE_AIO */
};
246 
#if defined(LINUX_NATIVE_AIO)
/** timeout for each io_getevents() call = 500ms.
NOTE(review): the value 500000000 equals 500 ms only when it is read as
nanoseconds -- confirm the unit at the place where the macro is used. */
#define OS_AIO_REAP_TIMEOUT	(500000000UL)

/** time to sleep, in microseconds if io_setup() returns EAGAIN
(500000 microseconds = 0.5 seconds). */
#define OS_AIO_IO_SETUP_RETRY_SLEEP	(500000UL)

/** number of attempts before giving up on io_setup(). */
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS	5
#endif
257 
/** Array of events used in simulated aio; NULL until it is allocated */
static os_event_t*	os_aio_segment_wait_events = NULL;

/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
are NULL when the module has not yet been initialized. @{ */
static os_aio_array_t*	os_aio_read_array	= NULL;	/*!< Reads */
static os_aio_array_t*	os_aio_write_array	= NULL;	/*!< Writes */
static os_aio_array_t*	os_aio_ibuf_array	= NULL;	/*!< Insert buffer */
static os_aio_array_t*	os_aio_log_array	= NULL;	/*!< Redo log */
static os_aio_array_t*	os_aio_sync_array	= NULL;	/*!< Synchronous I/O */
/* @} */

/** Number of asynchronous I/O segments.  Set by os_aio_init(); holds
ULINT_UNDEFINED until then. */
static ulint	os_aio_n_segments	= ULINT_UNDEFINED;

/** If the following is TRUE, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
276 #endif /* !UNIV_HOTBACKUP */
277 
/* File i/o statistics. NOTE(review): the code that updates and prints these
counters is not visible in this part of the file; judging by the names they
count file reads, bytes read since the last printout, file writes and
fsyncs, and the *_old variables presumably keep the values seen at the
previous printout (os_last_printout) -- confirm against the users. */
UNIV_INTERN ulint	os_n_file_reads		= 0;
UNIV_INTERN ulint	os_bytes_read_since_printout = 0;
UNIV_INTERN ulint	os_n_file_writes	= 0;
UNIV_INTERN ulint	os_n_fsyncs		= 0;
UNIV_INTERN ulint	os_n_file_reads_old	= 0;
UNIV_INTERN ulint	os_n_file_writes_old	= 0;
UNIV_INTERN ulint	os_n_fsyncs_old		= 0;
UNIV_INTERN time_t	os_last_printout;

/** TRUE once the "disk is full" warning has been printed; the file error
handler uses it to print that warning only once */
UNIV_INTERN ibool	os_has_said_disk_full	= FALSE;

#if !defined(UNIV_HOTBACKUP)	\
    && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8)
/** The mutex protecting the following counts of pending I/O operations;
it exists only in builds without atomic builtins or with a word size
below 8 */
static os_ib_mutex_t	os_file_count_mutex;
#endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */

/** Number of pending os_file_pread() operations */
UNIV_INTERN ulint	os_file_n_pending_preads  = 0;
/** Number of pending os_file_pwrite() operations */
UNIV_INTERN ulint	os_file_n_pending_pwrites = 0;
/** Number of pending write operations */
UNIV_INTERN ulint	os_n_pending_writes = 0;
/** Number of pending read operations */
UNIV_INTERN ulint	os_n_pending_reads = 0;
303 
304 #ifdef UNIV_DEBUG
305 # ifndef UNIV_HOTBACKUP
306 /**********************************************************************//**
307 Validates the consistency the aio system some of the time.
308 @return	TRUE if ok or the check was skipped */
309 UNIV_INTERN
310 ibool
os_aio_validate_skip(void)311 os_aio_validate_skip(void)
312 /*======================*/
313 {
314 /** Try os_aio_validate() every this many times */
315 # define OS_AIO_VALIDATE_SKIP	13
316 
317 	/** The os_aio_validate() call skip counter.
318 	Use a signed type because of the race condition below. */
319 	static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
320 
321 	/* There is a race condition below, but it does not matter,
322 	because this call is only for heuristic purposes. We want to
323 	reduce the call frequency of the costly os_aio_validate()
324 	check in debug builds. */
325 	if (--os_aio_validate_count > 0) {
326 		return(TRUE);
327 	}
328 
329 	os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
330 	return(os_aio_validate());
331 }
332 # endif /* !UNIV_HOTBACKUP */
333 #endif /* UNIV_DEBUG */
334 
335 #ifdef __WIN__
336 /***********************************************************************//**
337 Gets the operating system version. Currently works only on Windows.
338 @return	OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
339 OS_WIN7. */
340 UNIV_INTERN
341 ulint
os_get_os_version(void)342 os_get_os_version(void)
343 /*===================*/
344 {
345 	OSVERSIONINFO	os_info;
346 
347 	os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
348 
349 	ut_a(GetVersionEx(&os_info));
350 
351 	if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
352 		return(OS_WIN31);
353 	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
354 		return(OS_WIN95);
355 	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
356 		switch (os_info.dwMajorVersion) {
357 		case 3:
358 		case 4:
359 			return(OS_WINNT);
360 		case 5:
361 			return (os_info.dwMinorVersion == 0)
362 				? OS_WIN2000 : OS_WINXP;
363 		case 6:
364 			return (os_info.dwMinorVersion == 0)
365 				? OS_WINVISTA : OS_WIN7;
366 		default:
367 			return(OS_WIN7);
368 		}
369 	} else {
370 		ut_error;
371 		return(0);
372 	}
373 }
374 #endif /* __WIN__ */
375 
376 /***********************************************************************//**
377 Retrieves the last error number if an error occurs in a file io function.
378 The number should be retrieved before any other OS calls (because they may
379 overwrite the error number). If the number is not known to this program,
380 the OS error number + 100 is returned.
381 @return	error number, or OS error number + 100 */
382 static
383 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)384 os_file_get_last_error_low(
385 /*=======================*/
386 	bool	report_all_errors,	/*!< in: TRUE if we want an error
387 					message printed of all errors */
388 	bool	on_error_silent)	/*!< in: TRUE then don't print any
389 					diagnostic to the log */
390 {
391 #ifdef __WIN__
392 
393 	ulint	err = (ulint) GetLastError();
394 	if (err == ERROR_SUCCESS) {
395 		return(0);
396 	}
397 
398 	if (report_all_errors
399 	    || (!on_error_silent
400 		&& err != ERROR_DISK_FULL
401 		&& err != ERROR_FILE_EXISTS)) {
402 
403 		ut_print_timestamp(stderr);
404 		fprintf(stderr,
405 			"  InnoDB: Operating system error number %lu"
406 			" in a file operation.\n", (ulong) err);
407 
408 		if (err == ERROR_PATH_NOT_FOUND) {
409 			fprintf(stderr,
410 				"InnoDB: The error means the system"
411 				" cannot find the path specified.\n");
412 
413 			if (srv_is_being_started) {
414 				fprintf(stderr,
415 					"InnoDB: If you are installing InnoDB,"
416 					" remember that you must create\n"
417 					"InnoDB: directories yourself, InnoDB"
418 					" does not create them.\n");
419 			}
420 		} else if (err == ERROR_ACCESS_DENIED) {
421 			fprintf(stderr,
422 				"InnoDB: The error means mysqld does not have"
423 				" the access rights to\n"
424 				"InnoDB: the directory. It may also be"
425 				" you have created a subdirectory\n"
426 				"InnoDB: of the same name as a data file.\n");
427 		} else if (err == ERROR_SHARING_VIOLATION
428 			   || err == ERROR_LOCK_VIOLATION) {
429 			fprintf(stderr,
430 				"InnoDB: The error means that another program"
431 				" is using InnoDB's files.\n"
432 				"InnoDB: This might be a backup or antivirus"
433 				" software or another instance\n"
434 				"InnoDB: of MySQL."
435 				" Please close it to get rid of this error.\n");
436 		} else if (err == ERROR_WORKING_SET_QUOTA
437 			   || err == ERROR_NO_SYSTEM_RESOURCES) {
438 			fprintf(stderr,
439 				"InnoDB: The error means that there are no"
440 				" sufficient system resources or quota to"
441 				" complete the operation.\n");
442 		} else if (err == ERROR_OPERATION_ABORTED) {
443 			fprintf(stderr,
444 				"InnoDB: The error means that the I/O"
445 				" operation has been aborted\n"
446 				"InnoDB: because of either a thread exit"
447 				" or an application request.\n"
448 				"InnoDB: Retry attempt is made.\n");
449 		} else {
450 			fprintf(stderr,
451 				"InnoDB: Some operating system error numbers"
452 				" are described at\n"
453 				"InnoDB: "
454 				REFMAN
455 				"operating-system-error-codes.html\n");
456 		}
457 	}
458 
459 	fflush(stderr);
460 
461 	if (err == ERROR_FILE_NOT_FOUND) {
462 		return(OS_FILE_NOT_FOUND);
463 	} else if (err == ERROR_DISK_FULL) {
464 		return(OS_FILE_DISK_FULL);
465 	} else if (err == ERROR_FILE_EXISTS) {
466 		return(OS_FILE_ALREADY_EXISTS);
467 	} else if (err == ERROR_SHARING_VIOLATION
468 		   || err == ERROR_LOCK_VIOLATION) {
469 		return(OS_FILE_SHARING_VIOLATION);
470 	} else if (err == ERROR_WORKING_SET_QUOTA
471 		   || err == ERROR_NO_SYSTEM_RESOURCES) {
472 		return(OS_FILE_INSUFFICIENT_RESOURCE);
473 	} else if (err == ERROR_OPERATION_ABORTED) {
474 		return(OS_FILE_OPERATION_ABORTED);
475 	} else if (err == ERROR_ACCESS_DENIED) {
476 		return(OS_FILE_ACCESS_VIOLATION);
477 	} else {
478 		return(OS_FILE_ERROR_MAX + err);
479 	}
480 #else
481 	int err = errno;
482 	if (err == 0) {
483 		return(0);
484 	}
485 
486 	if (report_all_errors
487 	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
488 
489 		ut_print_timestamp(stderr);
490 		fprintf(stderr,
491 			"  InnoDB: Operating system error number %d"
492 			" in a file operation.\n", err);
493 
494 		if (err == ENOENT) {
495 			fprintf(stderr,
496 				"InnoDB: The error means the system"
497 				" cannot find the path specified.\n");
498 
499 			if (srv_is_being_started) {
500 				fprintf(stderr,
501 					"InnoDB: If you are installing InnoDB,"
502 					" remember that you must create\n"
503 					"InnoDB: directories yourself, InnoDB"
504 					" does not create them.\n");
505 			}
506 		} else if (err == EACCES) {
507 			fprintf(stderr,
508 				"InnoDB: The error means mysqld does not have"
509 				" the access rights to\n"
510 				"InnoDB: the directory.\n");
511 		} else {
512 			if (strerror(err) != NULL) {
513 				fprintf(stderr,
514 					"InnoDB: Error number %d"
515 					" means '%s'.\n",
516 					err, strerror(err));
517 			}
518 
519 
520 			fprintf(stderr,
521 				"InnoDB: Some operating system"
522 				" error numbers are described at\n"
523 				"InnoDB: "
524 				REFMAN
525 				"operating-system-error-codes.html\n");
526 		}
527 	}
528 
529 	fflush(stderr);
530 
531 	switch (err) {
532 	case ENOSPC:
533 		return(OS_FILE_DISK_FULL);
534 	case ENOENT:
535 		return(OS_FILE_NOT_FOUND);
536 	case EEXIST:
537 		return(OS_FILE_ALREADY_EXISTS);
538 	case EXDEV:
539 	case ENOTDIR:
540 	case EISDIR:
541 		return(OS_FILE_PATH_ERROR);
542 	case EAGAIN:
543 		if (srv_use_native_aio) {
544 			return(OS_FILE_AIO_RESOURCES_RESERVED);
545 		}
546 		break;
547 	case EINTR:
548 		if (srv_use_native_aio) {
549 			return(OS_FILE_AIO_INTERRUPTED);
550 		}
551 		break;
552 	case EACCES:
553 		return(OS_FILE_ACCESS_VIOLATION);
554 	}
555 	return(OS_FILE_ERROR_MAX + err);
556 #endif
557 }
558 
559 /***********************************************************************//**
560 Retrieves the last error number if an error occurs in a file io function.
561 The number should be retrieved before any other OS calls (because they may
562 overwrite the error number). If the number is not known to this program,
563 the OS error number + 100 is returned.
564 @return	error number, or OS error number + 100 */
565 UNIV_INTERN
566 ulint
os_file_get_last_error(bool report_all_errors)567 os_file_get_last_error(
568 /*===================*/
569 	bool	report_all_errors)	/*!< in: TRUE if we want an error
570 					message printed of all errors */
571 {
572 	return(os_file_get_last_error_low(report_all_errors, false));
573 }
574 
575 /****************************************************************//**
576 Does error handling when a file operation fails.
577 Conditionally exits (calling exit(3)) based on should_exit value and the
578 error type, if should_exit is TRUE then on_error_silent is ignored.
579 @return	TRUE if we should retry the operation */
580 static
581 ibool
os_file_handle_error_cond_exit(const char * name,const char * operation,ibool should_exit,ibool on_error_silent)582 os_file_handle_error_cond_exit(
583 /*===========================*/
584 	const char*	name,		/*!< in: name of a file or NULL */
585 	const char*	operation,	/*!< in: operation */
586 	ibool		should_exit,	/*!< in: call exit(3) if unknown error
587 					and this parameter is TRUE */
588 	ibool		on_error_silent)/*!< in: if TRUE then don't print
589 					any message to the log iff it is
590 					an unknown non-fatal error */
591 {
592 	ulint	err;
593 
594 	err = os_file_get_last_error_low(false, on_error_silent);
595 
596 	switch (err) {
597 	case OS_FILE_DISK_FULL:
598 		/* We only print a warning about disk full once */
599 
600 		if (os_has_said_disk_full) {
601 
602 			return(FALSE);
603 		}
604 
605 		/* Disk full error is reported irrespective of the
606 		on_error_silent setting. */
607 
608 		if (name) {
609 			ut_print_timestamp(stderr);
610 			fprintf(stderr,
611 				"  InnoDB: Encountered a problem with"
612 				" file %s\n", name);
613 		}
614 
615 		ut_print_timestamp(stderr);
616 		fprintf(stderr,
617 			"  InnoDB: Disk is full. Try to clean the disk"
618 			" to free space.\n");
619 
620 		os_has_said_disk_full = TRUE;
621 
622 		fflush(stderr);
623 
624 		return(FALSE);
625 
626 	case OS_FILE_AIO_RESOURCES_RESERVED:
627 	case OS_FILE_AIO_INTERRUPTED:
628 
629 		return(TRUE);
630 
631 	case OS_FILE_PATH_ERROR:
632 	case OS_FILE_ALREADY_EXISTS:
633 	case OS_FILE_ACCESS_VIOLATION:
634 
635 		return(FALSE);
636 
637 	case OS_FILE_SHARING_VIOLATION:
638 
639 		os_thread_sleep(10000000);  /* 10 sec */
640 		return(TRUE);
641 
642 	case OS_FILE_OPERATION_ABORTED:
643 	case OS_FILE_INSUFFICIENT_RESOURCE:
644 
645 		os_thread_sleep(100000);	/* 100 ms */
646 		return(TRUE);
647 
648 	default:
649 
650 		/* If it is an operation that can crash on error then it
651 		is better to ignore on_error_silent and print an error message
652 		to the log. */
653 
654 		if (should_exit || !on_error_silent) {
655 			ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
656 				"error " ULINTPF ".%s", name ? name : "(unknown)",
657 				operation, err, should_exit
658 				? " Cannot continue operation" : "");
659 		}
660 
661 		if (should_exit) {
662 			exit(1);
663 		}
664 	}
665 
666 	return(FALSE);
667 }
668 
669 /****************************************************************//**
670 Does error handling when a file operation fails.
671 @return	TRUE if we should retry the operation */
672 static
673 ibool
os_file_handle_error(const char * name,const char * operation)674 os_file_handle_error(
675 /*=================*/
676 	const char*	name,		/*!< in: name of a file or NULL */
677 	const char*	operation)	/*!< in: operation */
678 {
679 	/* exit in case of unknown error */
680 	return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
681 }
682 
683 /****************************************************************//**
684 Does error handling when a file operation fails.
685 @return	TRUE if we should retry the operation */
686 static
687 ibool
os_file_handle_error_no_exit(const char * name,const char * operation,ibool on_error_silent)688 os_file_handle_error_no_exit(
689 /*=========================*/
690 	const char*	name,		/*!< in: name of a file or NULL */
691 	const char*	operation,	/*!< in: operation */
692 	ibool		on_error_silent)/*!< in: if TRUE then don't print
693 					any message to the log. */
694 {
695 	/* don't exit in case of unknown error */
696 	return(os_file_handle_error_cond_exit(
697 			name, operation, FALSE, on_error_silent));
698 }
699 
700 #undef USE_FILE_LOCK
701 #define USE_FILE_LOCK
702 #if defined(UNIV_HOTBACKUP) || defined(__WIN__)
703 /* InnoDB Hot Backup does not lock the data files.
704  * On Windows, mandatory locking is used.
705  */
706 # undef USE_FILE_LOCK
707 #endif
708 #ifdef USE_FILE_LOCK
709 /****************************************************************//**
710 Obtain an exclusive lock on a file.
711 @return	0 on success */
712 static
713 int
os_file_lock(int fd,const char * name)714 os_file_lock(
715 /*=========*/
716 	int		fd,	/*!< in: file descriptor */
717 	const char*	name)	/*!< in: file name */
718 {
719 	struct flock lk;
720 
721 	ut_ad(!srv_read_only_mode);
722 
723 	lk.l_type = F_WRLCK;
724 	lk.l_whence = SEEK_SET;
725 	lk.l_start = lk.l_len = 0;
726 
727 	if (fcntl(fd, F_SETLK, &lk) == -1) {
728 
729 		ib_logf(IB_LOG_LEVEL_ERROR,
730 			"Unable to lock %s, error: %d", name, errno);
731 
732 		if (errno == EAGAIN || errno == EACCES) {
733 			ib_logf(IB_LOG_LEVEL_INFO,
734 				"Check that you do not already have "
735 				"another mysqld process using the "
736 				"same InnoDB data or log files.");
737 		}
738 
739 		return(-1);
740 	}
741 
742 	return(0);
743 }
744 #endif /* USE_FILE_LOCK */
745 
746 #ifndef UNIV_HOTBACKUP
747 /****************************************************************//**
748 Creates the seek mutexes used in positioned reads and writes. */
749 UNIV_INTERN
750 void
os_io_init_simple(void)751 os_io_init_simple(void)
752 /*===================*/
753 {
754 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
755 	os_file_count_mutex = os_mutex_create();
756 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
757 
758 	for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
759 		os_file_seek_mutexes[i] = os_mutex_create();
760 	}
761 }
762 
763 /** Create a temporary file. This function is like tmpfile(3), but
764 the temporary file is created in the given parameter path. If the path
765 is null then it will create the file in the mysql server configuration
766 parameter (--tmpdir).
767 @param[in]	path	location for creating temporary file
768 @return temporary file handle, or NULL on error */
769 UNIV_INTERN
770 FILE*
os_file_create_tmpfile(const char * path)771 os_file_create_tmpfile(
772 	const char*	path)
773 {
774 	FILE*	file	= NULL;
775 	int	fd	= innobase_mysql_tmpfile(path);
776 
777 	ut_ad(!srv_read_only_mode);
778 
779 	if (fd >= 0) {
780 		file = fdopen(fd, "w+b");
781 	}
782 
783 	if (!file) {
784 		ut_print_timestamp(stderr);
785 		fprintf(stderr,
786 			"  InnoDB: Error: unable to create temporary file;"
787 			" errno: %d\n", errno);
788 		if (fd >= 0) {
789 			close(fd);
790 		}
791 	}
792 
793 	return(file);
794 }
795 #endif /* !UNIV_HOTBACKUP */
796 
797 /***********************************************************************//**
798 The os_file_opendir() function opens a directory stream corresponding to the
799 directory named by the dirname argument. The directory stream is positioned
800 at the first entry. In both Unix and Windows we automatically skip the '.'
801 and '..' items at the start of the directory listing.
802 @return	directory stream, NULL if error */
803 UNIV_INTERN
804 os_file_dir_t
os_file_opendir(const char * dirname,ibool error_is_fatal)805 os_file_opendir(
806 /*============*/
807 	const char*	dirname,	/*!< in: directory name; it must not
808 					contain a trailing '\' or '/' */
809 	ibool		error_is_fatal)	/*!< in: TRUE if we should treat an
810 					error as a fatal error; if we try to
811 					open symlinks then we do not wish a
812 					fatal error if it happens not to be
813 					a directory */
814 {
815 	os_file_dir_t		dir;
816 #ifdef __WIN__
817 	LPWIN32_FIND_DATA	lpFindFileData;
818 	char			path[OS_FILE_MAX_PATH + 3];
819 
820 	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
821 
822 	strcpy(path, dirname);
823 	strcpy(path + strlen(path), "\\*");
824 
825 	/* Note that in Windows opening the 'directory stream' also retrieves
826 	the first entry in the directory. Since it is '.', that is no problem,
827 	as we will skip over the '.' and '..' entries anyway. */
828 
829 	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
830 		ut_malloc(sizeof(WIN32_FIND_DATA)));
831 
832 	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
833 
834 	ut_free(lpFindFileData);
835 
836 	if (dir == INVALID_HANDLE_VALUE) {
837 
838 		if (error_is_fatal) {
839 			os_file_handle_error(dirname, "opendir");
840 		}
841 
842 		return(NULL);
843 	}
844 
845 	return(dir);
846 #else
847 	dir = opendir(dirname);
848 
849 	if (dir == NULL && error_is_fatal) {
850 		os_file_handle_error(dirname, "opendir");
851 	}
852 
853 	return(dir);
854 #endif /* __WIN__ */
855 }
856 
857 /***********************************************************************//**
858 Closes a directory stream.
859 @return	0 if success, -1 if failure */
860 UNIV_INTERN
861 int
os_file_closedir(os_file_dir_t dir)862 os_file_closedir(
863 /*=============*/
864 	os_file_dir_t	dir)	/*!< in: directory stream */
865 {
866 #ifdef __WIN__
867 	BOOL		ret;
868 
869 	ret = FindClose(dir);
870 
871 	if (!ret) {
872 		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
873 
874 		return(-1);
875 	}
876 
877 	return(0);
878 #else
879 	int	ret;
880 
881 	ret = closedir(dir);
882 
883 	if (ret) {
884 		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
885 	}
886 
887 	return(ret);
888 #endif /* __WIN__ */
889 }
890 
891 /***********************************************************************//**
892 This function returns information of the next file in the directory. We jump
893 over the '.' and '..' entries in the directory.
894 @return	0 if ok, -1 if error, 1 if at the end of the directory */
895 UNIV_INTERN
896 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)897 os_file_readdir_next_file(
898 /*======================*/
899 	const char*	dirname,/*!< in: directory name or path */
900 	os_file_dir_t	dir,	/*!< in: directory stream */
901 	os_file_stat_t*	info)	/*!< in/out: buffer where the info is returned */
902 {
903 #ifdef __WIN__
904 	LPWIN32_FIND_DATA	lpFindFileData;
905 	BOOL			ret;
906 
907 	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
908 		ut_malloc(sizeof(WIN32_FIND_DATA)));
909 next_file:
910 	ret = FindNextFile(dir, lpFindFileData);
911 
912 	if (ret) {
913 		ut_a(strlen((char*) lpFindFileData->cFileName)
914 		     < OS_FILE_MAX_PATH);
915 
916 		if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
917 		    || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
918 
919 			goto next_file;
920 		}
921 
922 		strcpy(info->name, (char*) lpFindFileData->cFileName);
923 
924 		info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
925 			+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
926 			   << 32);
927 
928 		if (lpFindFileData->dwFileAttributes
929 		    & FILE_ATTRIBUTE_REPARSE_POINT) {
930 			/* TODO: test Windows symlinks */
931 			/* TODO: MySQL has apparently its own symlink
932 			implementation in Windows, dbname.sym can
933 			redirect a database directory:
934 			REFMAN "windows-symbolic-links.html" */
935 			info->type = OS_FILE_TYPE_LINK;
936 		} else if (lpFindFileData->dwFileAttributes
937 			   & FILE_ATTRIBUTE_DIRECTORY) {
938 			info->type = OS_FILE_TYPE_DIR;
939 		} else {
940 			/* It is probably safest to assume that all other
941 			file types are normal. Better to check them rather
942 			than blindly skip them. */
943 
944 			info->type = OS_FILE_TYPE_FILE;
945 		}
946 	}
947 
948 	ut_free(lpFindFileData);
949 
950 	if (ret) {
951 		return(0);
952 	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
953 
954 		return(1);
955 	} else {
956 		os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
957 		return(-1);
958 	}
959 #else
960 	struct dirent*	ent;
961 	char*		full_path;
962 	int		ret;
963 	struct stat	statinfo;
964 #ifdef HAVE_READDIR_R
965 	char		dirent_buf[sizeof(struct dirent)
966 				   + _POSIX_PATH_MAX + 100];
967 	/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
968 	the max file name len; but in most standards, the
969 	length is NAME_MAX; we add 100 to be even safer */
970 #endif
971 
972 next_file:
973 
974 #ifdef HAVE_READDIR_R
975 	ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
976 
977 	if (ret != 0
978 #ifdef UNIV_AIX
979 	    /* On AIX, only if we got non-NULL 'ent' (result) value and
980 	    a non-zero 'ret' (return) value, it indicates a failed
981 	    readdir_r() call. An NULL 'ent' with an non-zero 'ret'
982 	    would indicate the "end of the directory" is reached. */
983 	    && ent != NULL
984 #endif
985 	   ) {
986 		fprintf(stderr,
987 			"InnoDB: cannot read directory %s, error %lu\n",
988 			dirname, (ulong) ret);
989 
990 		return(-1);
991 	}
992 
993 	if (ent == NULL) {
994 		/* End of directory */
995 
996 		return(1);
997 	}
998 
999 	ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
1000 #else
1001 	ent = readdir(dir);
1002 
1003 	if (ent == NULL) {
1004 
1005 		return(1);
1006 	}
1007 #endif
1008 	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
1009 
1010 	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
1011 
1012 		goto next_file;
1013 	}
1014 
1015 	strcpy(info->name, ent->d_name);
1016 
1017 	full_path = static_cast<char*>(
1018 		ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
1019 
1020 	sprintf(full_path, "%s/%s", dirname, ent->d_name);
1021 
1022 	ret = stat(full_path, &statinfo);
1023 
1024 	if (ret) {
1025 
1026 		if (errno == ENOENT) {
1027 			/* readdir() returned a file that does not exist,
1028 			it must have been deleted in the meantime. Do what
1029 			would have happened if the file was deleted before
1030 			readdir() - ignore and go to the next entry.
1031 			If this is the last entry then info->name will still
1032 			contain the name of the deleted file when this
1033 			function returns, but this is not an issue since the
1034 			caller shouldn't be looking at info when end of
1035 			directory is returned. */
1036 
1037 			ut_free(full_path);
1038 
1039 			goto next_file;
1040 		}
1041 
1042 		os_file_handle_error_no_exit(full_path, "stat", FALSE);
1043 
1044 		ut_free(full_path);
1045 
1046 		return(-1);
1047 	}
1048 
1049 	info->size = (ib_int64_t) statinfo.st_size;
1050 
1051 	if (S_ISDIR(statinfo.st_mode)) {
1052 		info->type = OS_FILE_TYPE_DIR;
1053 	} else if (S_ISLNK(statinfo.st_mode)) {
1054 		info->type = OS_FILE_TYPE_LINK;
1055 	} else if (S_ISREG(statinfo.st_mode)) {
1056 		info->type = OS_FILE_TYPE_FILE;
1057 	} else {
1058 		info->type = OS_FILE_TYPE_UNKNOWN;
1059 	}
1060 
1061 	ut_free(full_path);
1062 
1063 	return(0);
1064 #endif
1065 }
1066 
1067 /*****************************************************************//**
1068 This function attempts to create a directory named pathname. The new
1069 directory gets default permissions. On Unix the permissions are
1070 (0770 & ~umask). If the directory exists already, nothing is done and
1071 the call succeeds, unless the fail_if_exists arguments is true.
1072 If another error occurs, such as a permission error, this does not crash,
1073 but reports the error and returns FALSE.
1074 @return	TRUE if call succeeds, FALSE on error */
1075 UNIV_INTERN
1076 ibool
os_file_create_directory(const char * pathname,ibool fail_if_exists)1077 os_file_create_directory(
1078 /*=====================*/
1079 	const char*	pathname,	/*!< in: directory name as
1080 					null-terminated string */
1081 	ibool		fail_if_exists)	/*!< in: if TRUE, pre-existing directory
1082 					is treated as an error. */
1083 {
1084 #ifdef __WIN__
1085 	BOOL	rcode;
1086 
1087 	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
1088 	if (!(rcode != 0
1089 	      || (GetLastError() == ERROR_ALREADY_EXISTS
1090 		  && !fail_if_exists))) {
1091 
1092 		os_file_handle_error_no_exit(
1093 			pathname, "CreateDirectory", FALSE);
1094 
1095 		return(FALSE);
1096 	}
1097 
1098 	return(TRUE);
1099 #else
1100 	int	rcode;
1101 
1102 	rcode = mkdir(pathname, 0770);
1103 
1104 	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
1105 		/* failure */
1106 		os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
1107 
1108 		return(FALSE);
1109 	}
1110 
1111 	return (TRUE);
1112 #endif /* __WIN__ */
1113 }
1114 
1115 /****************************************************************//**
1116 NOTE! Use the corresponding macro os_file_create_simple(), not directly
1117 this function!
1118 A simple function to open or create a file.
1119 @return own: handle to the file, not defined if error, error number
1120 can be retrieved with os_file_get_last_error */
1121 UNIV_INTERN
1122 os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,ibool * success)1123 os_file_create_simple_func(
1124 /*=======================*/
1125 	const char*	name,	/*!< in: name of the file or path as a
1126 				null-terminated string */
1127 	ulint		create_mode,/*!< in: create mode */
1128 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
1129 				OS_FILE_READ_WRITE */
1130 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
1131 {
1132 	os_file_t	file;
1133 	ibool		retry;
1134 
1135 	*success = FALSE;
1136 #ifdef __WIN__
1137 	DWORD		access;
1138 	DWORD		create_flag;
1139 	DWORD		attributes	= 0;
1140 
1141 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1142 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1143 
1144 	if (create_mode == OS_FILE_OPEN) {
1145 
1146 		create_flag = OPEN_EXISTING;
1147 
1148 	} else if (srv_read_only_mode) {
1149 
1150 		create_flag = OPEN_EXISTING;
1151 
1152 	} else if (create_mode == OS_FILE_CREATE) {
1153 
1154 		create_flag = CREATE_NEW;
1155 
1156 	} else if (create_mode == OS_FILE_CREATE_PATH) {
1157 
1158 		ut_a(!srv_read_only_mode);
1159 
1160 		/* Create subdirs along the path if needed  */
1161 		*success = os_file_create_subdirs_if_needed(name);
1162 
1163 		if (!*success) {
1164 
1165 			ib_logf(IB_LOG_LEVEL_ERROR,
1166 				"Unable to create subdirectories '%s'",
1167 				name);
1168 
1169 			return((os_file_t) -1);
1170 		}
1171 
1172 		create_flag = CREATE_NEW;
1173 		create_mode = OS_FILE_CREATE;
1174 
1175 	} else {
1176 		ib_logf(IB_LOG_LEVEL_ERROR,
1177 			"Unknown file create mode (%lu) for file '%s'",
1178 			create_mode, name);
1179 
1180 		return((os_file_t) -1);
1181 	}
1182 
1183 	if (access_type == OS_FILE_READ_ONLY) {
1184 		access = GENERIC_READ;
1185 	} else if (srv_read_only_mode) {
1186 
1187 		ib_logf(IB_LOG_LEVEL_INFO,
1188 			"read only mode set. Unable to "
1189 			"open file '%s' in RW mode, trying RO mode", name);
1190 
1191 		access = GENERIC_READ;
1192 
1193 	} else if (access_type == OS_FILE_READ_WRITE) {
1194 		access = GENERIC_READ | GENERIC_WRITE;
1195 	} else {
1196 		ib_logf(IB_LOG_LEVEL_ERROR,
1197 			"Unknown file access type (%lu) for file '%s'",
1198 			access_type, name);
1199 
1200 		return((os_file_t) -1);
1201 	}
1202 
1203 	do {
1204 		/* Use default security attributes and no template file. */
1205 
1206 		file = CreateFile(
1207 			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
1208 			create_flag, attributes, NULL);
1209 
1210 		if (file == INVALID_HANDLE_VALUE) {
1211 
1212 			*success = FALSE;
1213 
1214 			retry = os_file_handle_error(
1215 				name, create_mode == OS_FILE_OPEN ?
1216 				"open" : "create");
1217 
1218 		} else {
1219 			*success = TRUE;
1220 			retry = false;
1221 		}
1222 
1223 	} while (retry);
1224 
1225 #else /* __WIN__ */
1226 	int		create_flag;
1227 
1228 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1229 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1230 
1231 	if (create_mode == OS_FILE_OPEN) {
1232 
1233 		if (access_type == OS_FILE_READ_ONLY) {
1234 			create_flag = O_RDONLY;
1235 		} else if (srv_read_only_mode) {
1236 			create_flag = O_RDONLY;
1237 		} else {
1238 			create_flag = O_RDWR;
1239 		}
1240 
1241 	} else if (srv_read_only_mode) {
1242 
1243 		create_flag = O_RDONLY;
1244 
1245 	} else if (create_mode == OS_FILE_CREATE) {
1246 
1247 		create_flag = O_RDWR | O_CREAT | O_EXCL;
1248 
1249 	} else if (create_mode == OS_FILE_CREATE_PATH) {
1250 
1251 		/* Create subdirs along the path if needed  */
1252 
1253 		*success = os_file_create_subdirs_if_needed(name);
1254 
1255 		if (!*success) {
1256 
1257 			ib_logf(IB_LOG_LEVEL_ERROR,
1258 				"Unable to create subdirectories '%s'",
1259 				name);
1260 
1261 			return((os_file_t) -1);
1262 		}
1263 
1264 		create_flag = O_RDWR | O_CREAT | O_EXCL;
1265 		create_mode = OS_FILE_CREATE;
1266 	} else {
1267 
1268 		ib_logf(IB_LOG_LEVEL_ERROR,
1269 			"Unknown file create mode (%lu) for file '%s'",
1270 			create_mode, name);
1271 
1272 		return((os_file_t) -1);
1273 	}
1274 
1275 	do {
1276 		file = ::open(name, create_flag, os_innodb_umask);
1277 
1278 		if (file == -1) {
1279 			*success = FALSE;
1280 
1281 			retry = os_file_handle_error(
1282 				name,
1283 				create_mode == OS_FILE_OPEN
1284 				?  "open" : "create");
1285 		} else {
1286 			*success = TRUE;
1287 			retry = false;
1288 		}
1289 
1290 	} while (retry);
1291 
1292 #ifdef USE_FILE_LOCK
1293 	if (!srv_read_only_mode
1294 	    && *success
1295 	    && access_type == OS_FILE_READ_WRITE
1296 	    && os_file_lock(file, name)) {
1297 
1298 		*success = FALSE;
1299 		close(file);
1300 		file = -1;
1301 	}
1302 #endif /* USE_FILE_LOCK */
1303 
1304 #endif /* __WIN__ */
1305 
1306 	return(file);
1307 }
1308 
1309 /****************************************************************//**
1310 NOTE! Use the corresponding macro
1311 os_file_create_simple_no_error_handling(), not directly this function!
1312 A simple function to open or create a file.
1313 @return own: handle to the file, not defined if error, error number
1314 can be retrieved with os_file_get_last_error */
1315 UNIV_INTERN
1316 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,ibool * success)1317 os_file_create_simple_no_error_handling_func(
1318 /*=========================================*/
1319 	const char*	name,	/*!< in: name of the file or path as a
1320 				null-terminated string */
1321 	ulint		create_mode,/*!< in: create mode */
1322 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
1323 				OS_FILE_READ_WRITE, or
1324 				OS_FILE_READ_ALLOW_DELETE; the last option is
1325 				used by a backup program reading the file */
1326 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
1327 {
1328 	pfs_os_file_t	file;
1329 
1330 	*success = FALSE;
1331 #ifdef __WIN__
1332 	DWORD		access;
1333 	DWORD		create_flag;
1334 	DWORD		attributes	= 0;
1335 	DWORD		share_mode	= FILE_SHARE_READ;
1336 	ut_a(name);
1337 
1338 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1339 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1340 
1341 	if (create_mode == OS_FILE_OPEN) {
1342 		create_flag = OPEN_EXISTING;
1343 	} else if (srv_read_only_mode) {
1344 		create_flag = OPEN_EXISTING;
1345 	} else if (create_mode == OS_FILE_CREATE) {
1346 		create_flag = CREATE_NEW;
1347 	} else {
1348 
1349 		ib_logf(IB_LOG_LEVEL_ERROR,
1350 			"Unknown file create mode (%lu) for file '%s'",
1351 			create_mode, name);
1352 		file.m_file = (os_file_t)-1;
1353 		return(file);
1354 	}
1355 
1356 	if (access_type == OS_FILE_READ_ONLY) {
1357 		access = GENERIC_READ;
1358 	} else if (srv_read_only_mode) {
1359 		access = GENERIC_READ;
1360 	} else if (access_type == OS_FILE_READ_WRITE) {
1361 		access = GENERIC_READ | GENERIC_WRITE;
1362 	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1363 
1364 		ut_a(!srv_read_only_mode);
1365 
1366 		access = GENERIC_READ;
1367 
1368 		/*!< A backup program has to give mysqld the maximum
1369 		freedom to do what it likes with the file */
1370 
1371 		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
1372 	} else {
1373 		ib_logf(IB_LOG_LEVEL_ERROR,
1374 			"Unknown file access type (%lu) for file '%s'",
1375 			access_type, name);
1376 		file.m_file = (os_file_t)-1;
1377 		return(file);
1378 	}
1379 
1380 	file.m_file = CreateFile((LPCTSTR) name,
1381 			  access,
1382 			  share_mode,
1383 			  NULL,			// Security attributes
1384 			  create_flag,
1385 			  attributes,
1386 			  NULL);		// No template file
1387 
1388 	*success = (file.m_file != INVALID_HANDLE_VALUE);
1389 #else /* __WIN__ */
1390 	int		create_flag;
1391 	const char*	mode_str	= NULL;
1392 	ut_a(name);
1393 
1394 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1395 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1396 
1397 	if (create_mode == OS_FILE_OPEN) {
1398 
1399 		mode_str = "OPEN";
1400 
1401 		if (access_type == OS_FILE_READ_ONLY) {
1402 
1403 			create_flag = O_RDONLY;
1404 
1405 		} else if (srv_read_only_mode) {
1406 
1407 			create_flag = O_RDONLY;
1408 
1409 		} else {
1410 
1411 			ut_a(access_type == OS_FILE_READ_WRITE
1412 			     || access_type == OS_FILE_READ_ALLOW_DELETE);
1413 
1414 			create_flag = O_RDWR;
1415 		}
1416 
1417 	} else if (srv_read_only_mode) {
1418 
1419 		mode_str = "OPEN";
1420 
1421 		create_flag = O_RDONLY;
1422 
1423 	} else if (create_mode == OS_FILE_CREATE) {
1424 
1425 		mode_str = "CREATE";
1426 
1427 		create_flag = O_RDWR | O_CREAT | O_EXCL;
1428 
1429 	} else {
1430 		ib_logf(IB_LOG_LEVEL_ERROR,
1431 			"Unknown file create mode (%lu) for file '%s'",
1432 			create_mode, name);
1433 		file.m_file = -1;
1434 		return(file);
1435 	}
1436 
1437 	file.m_file = ::open(name, create_flag, os_innodb_umask);
1438 
1439 	*success = file.m_file == -1 ? FALSE : TRUE;
1440 
1441 	/* This function is always called for data files, we should disable
1442 	OS caching (O_DIRECT) here as we do in os_file_create_func(), so
1443 	we open the same file in the same mode, see man page of open(2). */
1444 	if (!srv_read_only_mode
1445 	    && *success
1446 	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
1447 		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
1448 
1449 		os_file_set_nocache(file.m_file, name, mode_str);
1450 	}
1451 
1452 #ifdef USE_FILE_LOCK
1453 	if (!srv_read_only_mode
1454 	    && *success
1455 	    && access_type == OS_FILE_READ_WRITE
1456 	    && os_file_lock(file.m_file, name)) {
1457 
1458 		*success = FALSE;
1459 		close(file.m_file);
1460 		file.m_file = -1;
1461 
1462 	}
1463 #endif /* USE_FILE_LOCK */
1464 
1465 #endif /* __WIN__ */
1466 
1467 	return(file);
1468 }
1469 
1470 /****************************************************************//**
1471 Tries to disable OS caching on an opened file descriptor. */
1472 UNIV_INTERN
1473 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))1474 os_file_set_nocache(
1475 /*================*/
1476 	int		fd		/*!< in: file descriptor to alter */
1477 					MY_ATTRIBUTE((unused)),
1478 	const char*	file_name	/*!< in: used in the diagnostic
1479 					message */
1480 					MY_ATTRIBUTE((unused)),
1481 	const char*	operation_name MY_ATTRIBUTE((unused)))
1482 					/*!< in: "open" or "create"; used
1483 					in the diagnostic message */
1484 {
1485 	/* some versions of Solaris may not have DIRECTIO_ON */
1486 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1487 	if (directio(fd, DIRECTIO_ON) == -1) {
1488 		int	errno_save = errno;
1489 
1490 		ib_logf(IB_LOG_LEVEL_ERROR,
1491 			"Failed to set DIRECTIO_ON on file %s: %s: %s, "
1492 			"continuing anyway.",
1493 			file_name, operation_name, strerror(errno_save));
1494 	}
1495 #elif defined(O_DIRECT)
1496 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1497 		int		errno_save = errno;
1498 		static bool	warning_message_printed = false;
1499 		if (errno_save == EINVAL) {
1500 			if (!warning_message_printed) {
1501 				warning_message_printed = true;
1502 # ifdef UNIV_LINUX
1503 				ib_logf(IB_LOG_LEVEL_WARN,
1504 					"Failed to set O_DIRECT on file "
1505 					"%s: %s: %s, continuing anyway. "
1506 					"O_DIRECT is known to result "
1507 					"in 'Invalid argument' on Linux on "
1508 					"tmpfs, see MySQL Bug#26662.",
1509 					file_name, operation_name,
1510 					strerror(errno_save));
1511 # else /* UNIV_LINUX */
1512 				goto short_warning;
1513 # endif /* UNIV_LINUX */
1514 			}
1515 		} else {
1516 # ifndef UNIV_LINUX
1517 short_warning:
1518 # endif
1519 			ib_logf(IB_LOG_LEVEL_WARN,
1520 				"Failed to set O_DIRECT on file %s: %s: %s, "
1521 				"continuing anyway.",
1522 				file_name, operation_name, strerror(errno_save));
1523 		}
1524 	}
1525 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
1526 }
1527 
/****************************************************************//**
NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
The flags OS_FILE_ON_ERROR_NO_EXIT and OS_FILE_ON_ERROR_SILENT may be
OR-ed into create_mode; they are stripped before the mode is interpreted
and only select how a failed open is reported.
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
pfs_os_file_t
os_file_create_func(
/*================*/
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	ulint		create_mode,/*!< in: create mode */
	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
				non-buffered i/o is desired,
				OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use
				async i/o or unbuffered i/o: look in the
				function source code for the exact rules */
	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
{
	pfs_os_file_t	file;
	ibool		retry;
	ibool		on_error_no_exit;
	ibool		on_error_silent;
	/* Debug-only fault injection: pretend the disk is full. */
#ifdef __WIN__
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = FALSE;
		SetLastError(ERROR_DISK_FULL);
		file.m_file = (os_file_t)-1;
		return(file);
	);
#else /* __WIN__ */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = FALSE;
		errno = ENOSPC;
		file.m_file = -1;
		return(file);
	);
#endif /* __WIN__ */

#ifdef __WIN__
	DWORD		create_flag;
	DWORD		share_mode	= FILE_SHARE_READ;

	/* Remember the error-handling flags, then strip them so that
	create_mode can be compared against the plain mode values. */
	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? TRUE : FALSE;

	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? TRUE : FALSE;

	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
	create_mode &= ~OS_FILE_ON_ERROR_SILENT;

	if (create_mode == OS_FILE_OPEN_RAW) {

		ut_a(!srv_read_only_mode);

		create_flag = OPEN_EXISTING;

		/* On Windows Physical devices require admin privileges and
		have to have the write-share mode set. See the remarks
		section for the CreateFile() function documentation in MSDN. */

		share_mode |= FILE_SHARE_WRITE;

	} else if (create_mode == OS_FILE_OPEN
		   || create_mode == OS_FILE_OPEN_RETRY) {

		create_flag = OPEN_EXISTING;

	} else if (srv_read_only_mode) {

		/* In read-only mode nothing is ever created. */
		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		create_flag = CREATE_ALWAYS;

	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file create mode (%lu) for file '%s'",
			create_mode, name);

		file.m_file = (os_file_t)-1;
		return(file);
	}

	DWORD		attributes = 0;

#ifdef UNIV_HOTBACKUP
	attributes |= FILE_FLAG_NO_BUFFERING;
#else
	if (purpose == OS_FILE_AIO) {

#ifdef WIN_ASYNC_IO
		/* If specified, use asynchronous (overlapped) io and no
		buffering of writes in the OS */

		if (srv_use_native_aio) {
			attributes |= FILE_FLAG_OVERLAPPED;
		}
#endif /* WIN_ASYNC_IO */

	} else if (purpose == OS_FILE_NORMAL) {
		/* Use default setting. */
	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown purpose flag (%lu) while opening file '%s'",
			purpose, name);
		file.m_file = (os_file_t)-1;
		return(file);
	}

#ifdef UNIV_NON_BUFFERED_IO
	// TODO: Create a bug, this looks wrong. The flush log
	// parameter is dynamic.
	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {

		/* Do not use unbuffered i/o for the log files because
		value 2 denotes that we do not flush the log at every
		commit, but only once per second */

	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {

		attributes |= FILE_FLAG_NO_BUFFERING;
	}
#endif /* UNIV_NON_BUFFERED_IO */

#endif /* UNIV_HOTBACKUP */
	DWORD	access = GENERIC_READ;

	if (!srv_read_only_mode) {
		access |= GENERIC_WRITE;
	}

	/* Keep trying for as long as the error handler asks for a retry. */
	do {
		/* Use default security attributes and no template file. */
		file.m_file = CreateFile(
			(LPCTSTR) name, access, share_mode, NULL,
			create_flag, attributes, NULL);

		if (file.m_file == INVALID_HANDLE_VALUE) {
			const char*	operation;

			operation = (create_mode == OS_FILE_CREATE
				     && !srv_read_only_mode)
				? "create" : "open";

			*success = FALSE;

			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {
			*success = TRUE;
			retry = FALSE;
		}

	} while (retry);

#else /* __WIN__ */
	int		create_flag;
	const char*	mode_str	= NULL;
	/* Remember the error-handling flags, then strip them so that
	create_mode can be compared against the plain mode values. */
	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? TRUE : FALSE;
	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? TRUE : FALSE;

	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
	create_mode &= ~OS_FILE_ON_ERROR_SILENT;

	/* Translate the create mode into open(2) flags; mode_str is only
	passed on to os_file_set_nocache() for its diagnostic message. */
	if (create_mode == OS_FILE_OPEN
	    || create_mode == OS_FILE_OPEN_RAW
	    || create_mode == OS_FILE_OPEN_RETRY) {

		mode_str = "OPEN";

		create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;

	} else if (srv_read_only_mode) {

		/* In read-only mode nothing is ever created. */
		mode_str = "OPEN";

		create_flag = O_RDONLY;

	} else if (create_mode == OS_FILE_CREATE) {

		mode_str = "CREATE";
		create_flag = O_RDWR | O_CREAT | O_EXCL;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		mode_str = "OVERWRITE";
		create_flag = O_RDWR | O_CREAT | O_TRUNC;

	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file create mode (%lu) for file '%s'",
			create_mode, name);

		file.m_file = -1;
		return(file);
	}

	ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);

#ifdef O_SYNC
	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
	O_SYNC because the datasync options seemed to corrupt files in 2001
	in both Linux and Solaris */

	if (!srv_read_only_mode
	    && type == OS_LOG_FILE
	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {

		create_flag |= O_SYNC;
	}
#endif /* O_SYNC */

	/* Keep trying for as long as the error handler asks for a retry. */
	do {
		file.m_file = ::open(name, create_flag, os_innodb_umask);

		if (file.m_file == -1) {
			const char*	operation;

			operation = (create_mode == OS_FILE_CREATE
				     && !srv_read_only_mode)
				? "create" : "open";

			*success = FALSE;

			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {
			*success = TRUE;
			retry = false;
		}

	} while (retry);

	/* We disable OS caching (O_DIRECT) only on data files */

	if (!srv_read_only_mode
	    && *success
	    && type != OS_LOG_FILE
	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {

		os_file_set_nocache(file.m_file, name, mode_str);
	}

#ifdef USE_FILE_LOCK
	/* Every successfully opened file except a raw device must be
	lockable; a nonzero os_file_lock() result means the lock failed. */
	if (!srv_read_only_mode
	    && *success
	    && create_mode != OS_FILE_OPEN_RAW
	    && os_file_lock(file.m_file, name)) {

		if (create_mode == OS_FILE_OPEN_RETRY) {

			ut_a(!srv_read_only_mode);

			ib_logf(IB_LOG_LEVEL_INFO,
				"Retrying to lock the first data file");

			/* Up to 100 further attempts, one second apart. */
			for (int i = 0; i < 100; i++) {
				os_thread_sleep(1000000);

				if (!os_file_lock(file.m_file, name)) {
					*success = TRUE;
					return(file);
				}
			}

			ib_logf(IB_LOG_LEVEL_INFO,
				"Unable to open the first data file");
		}

		/* The lock could not be taken: report the open as failed. */
		*success = FALSE;
		close(file.m_file);
		file.m_file = -1;
	}
#endif /* USE_FILE_LOCK */

#endif /* __WIN__ */

	return(file);
}
1832 
1833 /***********************************************************************//**
1834 Deletes a file if it exists. The file has to be closed before calling this.
1835 @return	TRUE if success */
1836 UNIV_INTERN
1837 bool
os_file_delete_if_exists_func(const char * name)1838 os_file_delete_if_exists_func(
1839 /*==========================*/
1840 	const char*	name)	/*!< in: file path as a null-terminated
1841 				string */
1842 {
1843 #ifdef __WIN__
1844 	bool	ret;
1845 	ulint	count	= 0;
1846 loop:
1847 	/* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
1848 	it */
1849 
1850 	ret = DeleteFile((LPCTSTR) name);
1851 
1852 	if (ret) {
1853 		return(true);
1854 	}
1855 
1856 	DWORD lasterr = GetLastError();
1857 	if (lasterr == ERROR_FILE_NOT_FOUND
1858 	    || lasterr == ERROR_PATH_NOT_FOUND) {
1859 		/* the file does not exist, this not an error */
1860 
1861 		return(true);
1862 	}
1863 
1864 	count++;
1865 
1866 	if (count > 100 && 0 == (count % 10)) {
1867 		os_file_get_last_error(true); /* print error information */
1868 
1869 		ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name);
1870 	}
1871 
1872 	os_thread_sleep(500000);	/* sleep for 0.5 second */
1873 
1874 	if (count > 2000) {
1875 
1876 		return(false);
1877 	}
1878 
1879 	goto loop;
1880 #else
1881 	int	ret;
1882 
1883 	ret = unlink(name);
1884 
1885 	if (ret != 0 && errno != ENOENT) {
1886 		os_file_handle_error_no_exit(name, "delete", FALSE);
1887 
1888 		return(false);
1889 	}
1890 
1891 	return(true);
1892 #endif /* __WIN__ */
1893 }
1894 
1895 /***********************************************************************//**
1896 Deletes a file. The file has to be closed before calling this.
1897 @return	TRUE if success */
1898 UNIV_INTERN
1899 bool
os_file_delete_func(const char * name)1900 os_file_delete_func(
1901 /*================*/
1902 	const char*	name)	/*!< in: file path as a null-terminated
1903 				string */
1904 {
1905 #ifdef __WIN__
1906 	BOOL	ret;
1907 	ulint	count	= 0;
1908 loop:
1909 	/* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
1910 	it */
1911 
1912 	ret = DeleteFile((LPCTSTR) name);
1913 
1914 	if (ret) {
1915 		return(true);
1916 	}
1917 
1918 	if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1919 		/* If the file does not exist, we classify this as a 'mild'
1920 		error and return */
1921 
1922 		return(false);
1923 	}
1924 
1925 	count++;
1926 
1927 	if (count > 100 && 0 == (count % 10)) {
1928 		os_file_get_last_error(true); /* print error information */
1929 
1930 		fprintf(stderr,
1931 			"InnoDB: Warning: cannot delete file %s\n"
1932 			"InnoDB: Are you running mysqlbackup"
1933 			" to back up the file?\n", name);
1934 	}
1935 
1936 	os_thread_sleep(1000000);	/* sleep for a second */
1937 
1938 	if (count > 2000) {
1939 
1940 		return(false);
1941 	}
1942 
1943 	goto loop;
1944 #else
1945 	int	ret;
1946 
1947 	ret = unlink(name);
1948 
1949 	if (ret != 0) {
1950 		os_file_handle_error_no_exit(name, "delete", FALSE);
1951 
1952 		return(false);
1953 	}
1954 
1955 	return(true);
1956 #endif
1957 }
1958 
1959 /***********************************************************************//**
1960 NOTE! Use the corresponding macro os_file_rename(), not directly this function!
1961 Renames a file (can also move it to another directory). It is safest that the
1962 file is closed before calling this function.
1963 @return	TRUE if success */
1964 UNIV_INTERN
1965 ibool
os_file_rename_func(const char * oldpath,const char * newpath)1966 os_file_rename_func(
1967 /*================*/
1968 	const char*	oldpath,/*!< in: old file path as a null-terminated
1969 				string */
1970 	const char*	newpath)/*!< in: new file path */
1971 {
1972 #ifdef UNIV_DEBUG
1973 	os_file_type_t	type;
1974 	ibool		exists;
1975 
1976 	/* New path must not exist. */
1977 	ut_ad(os_file_status(newpath, &exists, &type));
1978 	ut_ad(!exists);
1979 
1980 	/* Old path must exist. */
1981 	ut_ad(os_file_status(oldpath, &exists, &type));
1982 	ut_ad(exists);
1983 #endif /* UNIV_DEBUG */
1984 
1985 #ifdef __WIN__
1986 	BOOL	ret;
1987 
1988 	ret = MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath);
1989 
1990 	if (ret) {
1991 		return(TRUE);
1992 	}
1993 
1994 	os_file_handle_error_no_exit(oldpath, "rename", FALSE);
1995 
1996 	return(FALSE);
1997 #else
1998 	int	ret;
1999 
2000 	ret = rename(oldpath, newpath);
2001 
2002 	if (ret != 0) {
2003 		os_file_handle_error_no_exit(oldpath, "rename", FALSE);
2004 
2005 		return(FALSE);
2006 	}
2007 
2008 	return(TRUE);
2009 #endif /* __WIN__ */
2010 }
2011 
2012 /***********************************************************************//**
2013 NOTE! Use the corresponding macro os_file_close(), not directly this function!
2014 Closes a file handle. In case of error, error number can be retrieved with
2015 os_file_get_last_error.
2016 @return	TRUE if success */
2017 UNIV_INTERN
2018 ibool
os_file_close_func(os_file_t file)2019 os_file_close_func(
2020 /*===============*/
2021 	os_file_t	file)	/*!< in, own: handle to a file */
2022 {
2023 #ifdef __WIN__
2024 	BOOL	ret;
2025 
2026 	ut_a(file);
2027 
2028 	ret = CloseHandle(file);
2029 
2030 	if (ret) {
2031 		return(TRUE);
2032 	}
2033 
2034 	os_file_handle_error(NULL, "close");
2035 
2036 	return(FALSE);
2037 #else
2038 	int	ret;
2039 
2040 	ret = close(file);
2041 
2042 	if (ret == -1) {
2043 		os_file_handle_error(NULL, "close");
2044 
2045 		return(FALSE);
2046 	}
2047 
2048 	return(TRUE);
2049 #endif /* __WIN__ */
2050 }
2051 
#ifdef UNIV_HOTBACKUP
/***********************************************************************//**
Closes a file handle. Unlike os_file_close_func(), a failure is only
signalled through the return value; no error handler is invoked.
@return	TRUE if success */
UNIV_INTERN
ibool
os_file_close_no_error_handling(
/*============================*/
	os_file_t	file)	/*!< in, own: handle to a file */
{
#ifdef __WIN__
	ut_a(file);

	/* CloseHandle() returns nonzero on success. */
	return(CloseHandle(file) ? TRUE : FALSE);
#else
	/* close(2) returns -1 on failure. */
	return(close(file) == -1 ? FALSE : TRUE);
#endif /* __WIN__ */
}
#endif /* UNIV_HOTBACKUP */
2088 
2089 /***********************************************************************//**
2090 Gets a file size.
2091 @return	file size, or (os_offset_t) -1 on failure */
2092 UNIV_INTERN
2093 os_offset_t
os_file_get_size(pfs_os_file_t file)2094 os_file_get_size(
2095 /*=============*/
2096 	pfs_os_file_t	file)	/*!< in: handle to a file */
2097 {
2098 #ifdef __WIN__
2099 	os_offset_t	offset;
2100 	DWORD		high;
2101 	DWORD		low;
2102 
2103 	low = GetFileSize(file.m_file, &high);
2104 
2105 	if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
2106 		return((os_offset_t) -1);
2107 	}
2108 
2109 	offset = (os_offset_t) low | ((os_offset_t) high << 32);
2110 
2111 	return(offset);
2112 #else
2113 	return((os_offset_t) lseek(file.m_file, 0, SEEK_END));
2114 
2115 #endif /* __WIN__ */
2116 }
2117 
2118 /***********************************************************************//**
2119 Write the specified number of zeros to a newly created file.
2120 @return	TRUE if success */
2121 UNIV_INTERN
2122 ibool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size)2123 os_file_set_size(
2124 /*=============*/
2125 	const char*	name,	/*!< in: name of the file or path as a
2126 				null-terminated string */
2127 	pfs_os_file_t	file,	/*!< in: handle to a file */
2128 	os_offset_t	size)	/*!< in: file size */
2129 {
2130 	os_offset_t	current_size;
2131 	ibool		ret;
2132 	byte*		buf;
2133 	byte*		buf2;
2134 	ulint		buf_size;
2135 
2136 	current_size = 0;
2137 
2138 	/* Write up to 1 megabyte at a time. */
2139 	buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
2140 		* UNIV_PAGE_SIZE;
2141 	buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
2142 
2143 	/* Align the buffer for possible raw i/o */
2144 	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
2145 
2146 	/* Write buffer full of zeros */
2147 	memset(buf, 0, buf_size);
2148 
2149 	if (size >= (os_offset_t) 100 << 20) {
2150 
2151 		fprintf(stderr, "InnoDB: Progress in MB:");
2152 	}
2153 
2154 	while (current_size < size) {
2155 		ulint	n_bytes;
2156 
2157 		if (size - current_size < (os_offset_t) buf_size) {
2158 			n_bytes = (ulint) (size - current_size);
2159 		} else {
2160 			n_bytes = buf_size;
2161 		}
2162 
2163 		ret = os_file_write(name, file, buf, current_size, n_bytes);
2164 		if (!ret) {
2165 			ut_free(buf2);
2166 			goto error_handling;
2167 		}
2168 
2169 		/* Print about progress for each 100 MB written */
2170 		if ((current_size + n_bytes) / (100 << 20)
2171 		    != current_size / (100 << 20)) {
2172 
2173 			fprintf(stderr, " %lu00",
2174 				(ulong) ((current_size + n_bytes)
2175 					 / (100 << 20)));
2176 		}
2177 
2178 		current_size += n_bytes;
2179 	}
2180 
2181 	if (size >= (os_offset_t) 100 << 20) {
2182 
2183 		fprintf(stderr, "\n");
2184 	}
2185 
2186 	ut_free(buf2);
2187 
2188 	ret = os_file_flush(file);
2189 
2190 	if (ret) {
2191 		return(TRUE);
2192 	}
2193 
2194 error_handling:
2195 	return(FALSE);
2196 }
2197 
2198 /***********************************************************************//**
2199 Truncates a file at its current position.
2200 @return	TRUE if success */
2201 UNIV_INTERN
2202 ibool
os_file_set_eof(FILE * file)2203 os_file_set_eof(
2204 /*============*/
2205 	FILE*		file)	/*!< in: file to be truncated */
2206 {
2207 #ifdef __WIN__
2208 	HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
2209 	return(SetEndOfFile(h));
2210 #else /* __WIN__ */
2211 	return(!ftruncate(fileno(file), ftell(file)));
2212 #endif /* __WIN__ */
2213 }
2214 
2215 #ifndef __WIN__
2216 /***********************************************************************//**
2217 Wrapper to fsync(2) that retries the call on some errors.
2218 Returns the value 0 if successful; otherwise the value -1 is returned and
2219 the global variable errno is set to indicate the error.
2220 @return	0 if success, -1 otherwise */
2221 
2222 static
2223 int
os_file_fsync(os_file_t file)2224 os_file_fsync(
2225 /*==========*/
2226 	os_file_t	file)	/*!< in: handle to a file */
2227 {
2228 	int	ret;
2229 	int	failures;
2230 	ibool	retry;
2231 
2232 	failures = 0;
2233 
2234 	do {
2235 		ret = fsync(file);
2236 
2237 		os_n_fsyncs++;
2238 
2239 		if (ret == -1 && errno == ENOLCK) {
2240 
2241 			if (failures % 100 == 0) {
2242 
2243 				ut_print_timestamp(stderr);
2244 				fprintf(stderr,
2245 					" InnoDB: fsync(): "
2246 					"No locks available; retrying\n");
2247 			}
2248 
2249 			os_thread_sleep(200000 /* 0.2 sec */);
2250 
2251 			failures++;
2252 
2253 			retry = TRUE;
2254 		} else {
2255 
2256 			retry = FALSE;
2257 		}
2258 	} while (retry);
2259 
2260 	return(ret);
2261 }
2262 #endif /* !__WIN__ */
2263 
2264 /***********************************************************************//**
2265 NOTE! Use the corresponding macro os_file_flush(), not directly this function!
2266 Flushes the write buffers of a given file to the disk.
2267 @return	TRUE if success */
2268 UNIV_INTERN
2269 ibool
os_file_flush_func(os_file_t file)2270 os_file_flush_func(
2271 /*===============*/
2272 	os_file_t	file)	/*!< in, own: handle to a file */
2273 {
2274 #ifdef __WIN__
2275 	BOOL	ret;
2276 
2277 	ut_a(file);
2278 
2279 	os_n_fsyncs++;
2280 
2281 	ret = FlushFileBuffers(file);
2282 
2283 	if (ret) {
2284 		return(TRUE);
2285 	}
2286 
2287 	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
2288 	actually a raw device, we choose to ignore that error if we are using
2289 	raw disks */
2290 
2291 	if (srv_start_raw_disk_in_use && GetLastError()
2292 	    == ERROR_INVALID_FUNCTION) {
2293 		return(TRUE);
2294 	}
2295 
2296 	os_file_handle_error(NULL, "flush");
2297 
2298 	/* It is a fatal error if a file flush does not succeed, because then
2299 	the database can get corrupt on disk */
2300 	ut_error;
2301 
2302 	return(FALSE);
2303 #else
2304 	int	ret;
2305 
2306 #if defined(HAVE_DARWIN_THREADS)
2307 # ifndef F_FULLFSYNC
2308 	/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2309 #  define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2310 # elif F_FULLFSYNC != 51
2311 #  error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2312 # endif
2313 	/* Apple has disabled fsync() for internal disk drives in OS X. That
2314 	caused corruption for a user when he tested a power outage. Let us in
2315 	OS X use a nonstandard flush method recommended by an Apple
2316 	engineer. */
2317 
2318 	if (!srv_have_fullfsync) {
2319 		/* If we are not on an operating system that supports this,
2320 		then fall back to a plain fsync. */
2321 
2322 		ret = os_file_fsync(file);
2323 	} else {
2324 		ret = fcntl(file, F_FULLFSYNC, NULL);
2325 
2326 		if (ret) {
2327 			/* If we are not on a file system that supports this,
2328 			then fall back to a plain fsync. */
2329 			ret = os_file_fsync(file);
2330 		}
2331 	}
2332 #else
2333 	ret = os_file_fsync(file);
2334 #endif
2335 
2336 	if (ret == 0) {
2337 		return(TRUE);
2338 	}
2339 
2340 	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
2341 	we choose to ignore that error if we are using raw disks */
2342 
2343 	if (srv_start_raw_disk_in_use && errno == EINVAL) {
2344 
2345 		return(TRUE);
2346 	}
2347 
2348 	ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
2349 
2350 	os_file_handle_error(NULL, "flush");
2351 
2352 	/* It is a fatal error if a file flush does not succeed, because then
2353 	the database can get corrupt on disk */
2354 	ut_error;
2355 
2356 	return(FALSE);
2357 #endif
2358 }
2359 
2360 #ifndef __WIN__
2361 /*******************************************************************//**
2362 Does a synchronous read operation in Posix.
2363 @return	number of bytes read, -1 if error */
2364 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2365 ssize_t
os_file_pread(os_file_t file,void * buf,ulint n,os_offset_t offset)2366 os_file_pread(
2367 /*==========*/
2368 	os_file_t	file,	/*!< in: handle to a file */
2369 	void*		buf,	/*!< in: buffer where to read */
2370 	ulint		n,	/*!< in: number of bytes to read */
2371 	os_offset_t	offset)	/*!< in: file offset from where to read */
2372 {
2373 	off_t	offs;
2374 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2375 	ssize_t	n_bytes;
2376 #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2377 
2378 	ut_ad(n);
2379 
2380 	/* If off_t is > 4 bytes in size, then we assume we can pass a
2381 	64-bit address */
2382 	offs = (off_t) offset;
2383 
2384 	if (sizeof(off_t) <= 4) {
2385 		if (offset != (os_offset_t) offs) {
2386 			ib_logf(IB_LOG_LEVEL_ERROR,
2387 				"File read at offset > 4 GB");
2388 		}
2389 	}
2390 
2391 	os_n_file_reads++;
2392 
2393 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2394 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2395 	(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2396 	(void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1);
2397 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2398 #else
2399 	os_mutex_enter(os_file_count_mutex);
2400 	os_file_n_pending_preads++;
2401 	os_n_pending_reads++;
2402 	MONITOR_INC(MONITOR_OS_PENDING_READS);
2403 	os_mutex_exit(os_file_count_mutex);
2404 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2405 
2406 	n_bytes = pread(file, buf, n, offs);
2407 
2408 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2409 	(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2410 	(void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1);
2411 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
2412 #else
2413 	os_mutex_enter(os_file_count_mutex);
2414 	os_file_n_pending_preads--;
2415 	os_n_pending_reads--;
2416 	MONITOR_DEC(MONITOR_OS_PENDING_READS);
2417 	os_mutex_exit(os_file_count_mutex);
2418 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD == 8 */
2419 
2420 	return(n_bytes);
2421 #else
2422 	{
2423 		off_t	ret_offset;
2424 		ssize_t	ret;
2425 #ifndef UNIV_HOTBACKUP
2426 		ulint	i;
2427 #endif /* !UNIV_HOTBACKUP */
2428 
2429 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2430 		(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2431 		MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2432 #else
2433 		os_mutex_enter(os_file_count_mutex);
2434 		os_n_pending_reads++;
2435 		MONITOR_INC(MONITOR_OS_PENDING_READS);
2436 		os_mutex_exit(os_file_count_mutex);
2437 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2438 #ifndef UNIV_HOTBACKUP
2439 		/* Protect the seek / read operation with a mutex */
2440 		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2441 
2442 		os_mutex_enter(os_file_seek_mutexes[i]);
2443 #endif /* !UNIV_HOTBACKUP */
2444 
2445 		ret_offset = lseek(file, offs, SEEK_SET);
2446 
2447 		if (ret_offset < 0) {
2448 			ret = -1;
2449 		} else {
2450 			ret = read(file, buf, (ssize_t) n);
2451 		}
2452 
2453 #ifndef UNIV_HOTBACKUP
2454 		os_mutex_exit(os_file_seek_mutexes[i]);
2455 #endif /* !UNIV_HOTBACKUP */
2456 
2457 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2458 		(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2459 		MONITOR_ATOIC_DEC(MONITOR_OS_PENDING_READS);
2460 #else
2461 		os_mutex_enter(os_file_count_mutex);
2462 		os_n_pending_reads--;
2463 		MONITOR_DEC(MONITOR_OS_PENDING_READS);
2464 		os_mutex_exit(os_file_count_mutex);
2465 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
2466 
2467 		return(ret);
2468 	}
2469 #endif
2470 }
2471 
2472 /*******************************************************************//**
2473 Does a synchronous write operation in Posix.
2474 @return	number of bytes written, -1 if error */
2475 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2476 ssize_t
os_file_pwrite(os_file_t file,const void * buf,ulint n,os_offset_t offset)2477 os_file_pwrite(
2478 /*===========*/
2479 	os_file_t	file,	/*!< in: handle to a file */
2480 	const void*	buf,	/*!< in: buffer from where to write */
2481 	ulint		n,	/*!< in: number of bytes to write */
2482 	os_offset_t	offset)	/*!< in: file offset where to write */
2483 {
2484 	ssize_t	ret;
2485 	off_t	offs;
2486 
2487 	ut_ad(n);
2488 	ut_ad(!srv_read_only_mode);
2489 
2490 	/* If off_t is > 4 bytes in size, then we assume we can pass a
2491 	64-bit address */
2492 	offs = (off_t) offset;
2493 
2494 	if (sizeof(off_t) <= 4) {
2495 		if (offset != (os_offset_t) offs) {
2496 			ib_logf(IB_LOG_LEVEL_ERROR,
2497 				"File write at offset > 4 GB.");
2498 		}
2499 	}
2500 
2501 	os_n_file_writes++;
2502 
2503 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2504 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
2505 	os_mutex_enter(os_file_count_mutex);
2506 	os_file_n_pending_pwrites++;
2507 	os_n_pending_writes++;
2508 	MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2509 	os_mutex_exit(os_file_count_mutex);
2510 #else
2511 	(void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
2512 	(void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1);
2513 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
2514 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */
2515 
2516 	ret = pwrite(file, buf, (ssize_t) n, offs);
2517 
2518 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
2519 	os_mutex_enter(os_file_count_mutex);
2520 	os_file_n_pending_pwrites--;
2521 	os_n_pending_writes--;
2522 	MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2523 	os_mutex_exit(os_file_count_mutex);
2524 #else
2525 	(void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
2526 	(void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1);
2527 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
2528 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */
2529 
2530 	return(ret);
2531 #else
2532 	{
2533 		off_t	ret_offset;
2534 # ifndef UNIV_HOTBACKUP
2535 		ulint	i;
2536 # endif /* !UNIV_HOTBACKUP */
2537 
2538 		os_mutex_enter(os_file_count_mutex);
2539 		os_n_pending_writes++;
2540 		MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2541 		os_mutex_exit(os_file_count_mutex);
2542 
2543 # ifndef UNIV_HOTBACKUP
2544 		/* Protect the seek / write operation with a mutex */
2545 		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2546 
2547 		os_mutex_enter(os_file_seek_mutexes[i]);
2548 # endif /* UNIV_HOTBACKUP */
2549 
2550 		ret_offset = lseek(file, offs, SEEK_SET);
2551 
2552 		if (ret_offset < 0) {
2553 			ret = -1;
2554 
2555 			goto func_exit;
2556 		}
2557 
2558 		ret = write(file, buf, (ssize_t) n);
2559 
2560 func_exit:
2561 # ifndef UNIV_HOTBACKUP
2562 		os_mutex_exit(os_file_seek_mutexes[i]);
2563 # endif /* !UNIV_HOTBACKUP */
2564 
2565 		os_mutex_enter(os_file_count_mutex);
2566 		os_n_pending_writes--;
2567 		MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2568 		os_mutex_exit(os_file_count_mutex);
2569 
2570 		return(ret);
2571 	}
2572 #endif /* !UNIV_HOTBACKUP */
2573 }
2574 #endif
2575 
2576 /*******************************************************************//**
2577 NOTE! Use the corresponding macro os_file_read(), not directly this
2578 function!
2579 Requests a synchronous positioned read operation.
2580 @return	TRUE if request was successful, FALSE if fail */
2581 UNIV_INTERN
2582 ibool
os_file_read_func(os_file_t file,void * buf,os_offset_t offset,ulint n)2583 os_file_read_func(
2584 /*==============*/
2585 	os_file_t	file,	/*!< in: handle to a file */
2586 	void*		buf,	/*!< in: buffer where to read */
2587 	os_offset_t	offset,	/*!< in: file offset where to read */
2588 	ulint		n)	/*!< in: number of bytes to read */
2589 {
2590 #ifdef __WIN__
2591 	BOOL		ret;
2592 	DWORD		len;
2593 	DWORD		ret2;
2594 	DWORD		low;
2595 	DWORD		high;
2596 	ibool		retry;
2597 #ifndef UNIV_HOTBACKUP
2598 	ulint		i;
2599 #endif /* !UNIV_HOTBACKUP */
2600 
2601 	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2602 	no more than 32 bits. */
2603 	ut_a((n & 0xFFFFFFFFUL) == n);
2604 
2605 	os_n_file_reads++;
2606 	os_bytes_read_since_printout += n;
2607 
2608 try_again:
2609 	ut_ad(file);
2610 	ut_ad(buf);
2611 	ut_ad(n > 0);
2612 
2613 	low = (DWORD) offset & 0xFFFFFFFF;
2614 	high = (DWORD) (offset >> 32);
2615 
2616 	os_mutex_enter(os_file_count_mutex);
2617 	os_n_pending_reads++;
2618 	MONITOR_INC(MONITOR_OS_PENDING_READS);
2619 	os_mutex_exit(os_file_count_mutex);
2620 
2621 #ifndef UNIV_HOTBACKUP
2622 	/* Protect the seek / read operation with a mutex */
2623 	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2624 
2625 	os_mutex_enter(os_file_seek_mutexes[i]);
2626 #endif /* !UNIV_HOTBACKUP */
2627 
2628 	ret2 = SetFilePointer(
2629 		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2630 
2631 	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2632 
2633 #ifndef UNIV_HOTBACKUP
2634 		os_mutex_exit(os_file_seek_mutexes[i]);
2635 #endif /* !UNIV_HOTBACKUP */
2636 
2637 		os_mutex_enter(os_file_count_mutex);
2638 		os_n_pending_reads--;
2639 		MONITOR_DEC(MONITOR_OS_PENDING_READS);
2640 		os_mutex_exit(os_file_count_mutex);
2641 
2642 		goto error_handling;
2643 	}
2644 
2645 	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2646 
2647 #ifndef UNIV_HOTBACKUP
2648 	os_mutex_exit(os_file_seek_mutexes[i]);
2649 #endif /* !UNIV_HOTBACKUP */
2650 
2651 	os_mutex_enter(os_file_count_mutex);
2652 	os_n_pending_reads--;
2653 	MONITOR_DEC(MONITOR_OS_PENDING_READS);
2654 	os_mutex_exit(os_file_count_mutex);
2655 
2656 	if (ret && len == n) {
2657 		return(TRUE);
2658 	}
2659 #else /* __WIN__ */
2660 	ibool	retry;
2661 	ssize_t	ret;
2662 
2663 	os_bytes_read_since_printout += n;
2664 
2665 try_again:
2666 	ret = os_file_pread(file, buf, n, offset);
2667 
2668 	if ((ulint) ret == n) {
2669 		return(TRUE);
2670 	} else if (ret == -1) {
2671                 ib_logf(IB_LOG_LEVEL_ERROR,
2672 			"Error in system call pread(). The operating"
2673 			" system error number is %lu.",(ulint) errno);
2674         } else {
2675 		/* Partial read occured */
2676 		ib_logf(IB_LOG_LEVEL_ERROR,
2677 			"Tried to read " ULINTPF " bytes at offset "
2678 			UINT64PF ". Was only able to read %ld.",
2679 			n, offset, (lint) ret);
2680 	}
2681 #endif /* __WIN__ */
2682 #ifdef __WIN__
2683 error_handling:
2684 #endif
2685 	retry = os_file_handle_error(NULL, "read");
2686 
2687 	if (retry) {
2688 		goto try_again;
2689 	}
2690 
2691 	fprintf(stderr,
2692 		"InnoDB: Fatal error: cannot read from file."
2693 		" OS error number %lu.\n",
2694 #ifdef __WIN__
2695 		(ulong) GetLastError()
2696 #else
2697 		(ulong) errno
2698 #endif /* __WIN__ */
2699 		);
2700 	fflush(stderr);
2701 
2702 	ut_error;
2703 
2704 	return(FALSE);
2705 }
2706 
2707 /*******************************************************************//**
2708 NOTE! Use the corresponding macro os_file_read_no_error_handling(),
2709 not directly this function!
2710 Requests a synchronous positioned read operation. This function does not do
2711 any error handling. In case of error it returns FALSE.
2712 @return	TRUE if request was successful, FALSE if fail */
2713 UNIV_INTERN
2714 ibool
os_file_read_no_error_handling_func(os_file_t file,void * buf,os_offset_t offset,ulint n)2715 os_file_read_no_error_handling_func(
2716 /*================================*/
2717 	os_file_t	file,	/*!< in: handle to a file */
2718 	void*		buf,	/*!< in: buffer where to read */
2719 	os_offset_t	offset,	/*!< in: file offset where to read */
2720 	ulint		n)	/*!< in: number of bytes to read */
2721 {
2722 #ifdef __WIN__
2723 	BOOL		ret;
2724 	DWORD		len;
2725 	DWORD		ret2;
2726 	DWORD		low;
2727 	DWORD		high;
2728 	ibool		retry;
2729 #ifndef UNIV_HOTBACKUP
2730 	ulint		i;
2731 #endif /* !UNIV_HOTBACKUP */
2732 
2733 	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2734 	no more than 32 bits. */
2735 	ut_a((n & 0xFFFFFFFFUL) == n);
2736 
2737 	os_n_file_reads++;
2738 	os_bytes_read_since_printout += n;
2739 
2740 try_again:
2741 	ut_ad(file);
2742 	ut_ad(buf);
2743 	ut_ad(n > 0);
2744 
2745 	low = (DWORD) offset & 0xFFFFFFFF;
2746 	high = (DWORD) (offset >> 32);
2747 
2748 	os_mutex_enter(os_file_count_mutex);
2749 	os_n_pending_reads++;
2750 	MONITOR_INC(MONITOR_OS_PENDING_READS);
2751 	os_mutex_exit(os_file_count_mutex);
2752 
2753 #ifndef UNIV_HOTBACKUP
2754 	/* Protect the seek / read operation with a mutex */
2755 	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2756 
2757 	os_mutex_enter(os_file_seek_mutexes[i]);
2758 #endif /* !UNIV_HOTBACKUP */
2759 
2760 	ret2 = SetFilePointer(
2761 		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2762 
2763 	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2764 
2765 #ifndef UNIV_HOTBACKUP
2766 		os_mutex_exit(os_file_seek_mutexes[i]);
2767 #endif /* !UNIV_HOTBACKUP */
2768 
2769 		os_mutex_enter(os_file_count_mutex);
2770 		os_n_pending_reads--;
2771 		MONITOR_DEC(MONITOR_OS_PENDING_READS);
2772 		os_mutex_exit(os_file_count_mutex);
2773 
2774 		goto error_handling;
2775 	}
2776 
2777 	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2778 
2779 #ifndef UNIV_HOTBACKUP
2780 	os_mutex_exit(os_file_seek_mutexes[i]);
2781 #endif /* !UNIV_HOTBACKUP */
2782 
2783 	os_mutex_enter(os_file_count_mutex);
2784 	os_n_pending_reads--;
2785 	MONITOR_DEC(MONITOR_OS_PENDING_READS);
2786 	os_mutex_exit(os_file_count_mutex);
2787 
2788 	if (ret && len == n) {
2789 		return(TRUE);
2790 	}
2791 #else /* __WIN__ */
2792 	ibool	retry;
2793 	ssize_t	ret;
2794 
2795 	os_bytes_read_since_printout += n;
2796 
2797 try_again:
2798 	ret = os_file_pread(file, buf, n, offset);
2799 
2800 	if ((ulint) ret == n) {
2801 		return(TRUE);
2802 	} else if (ret == -1) {
2803                 ib_logf(IB_LOG_LEVEL_ERROR,
2804 			"Error in system call pread(). The operating"
2805 			" system error number is %lu.",(ulint) errno);
2806         } else {
2807 		/* Partial read occured */
2808 		ib_logf(IB_LOG_LEVEL_ERROR,
2809 			"Tried to read " ULINTPF " bytes at offset "
2810 			UINT64PF ". Was only able to read %ld.",
2811 			n, offset, (lint) ret);
2812 	}
2813 #endif /* __WIN__ */
2814 #ifdef __WIN__
2815 error_handling:
2816 #endif
2817 	retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
2818 
2819 	if (retry) {
2820 		goto try_again;
2821 	}
2822 
2823 	return(FALSE);
2824 }
2825 
2826 /*******************************************************************//**
2827 Rewind file to its start, read at most size - 1 bytes from it to str, and
2828 NUL-terminate str. All errors are silently ignored. This function is
2829 mostly meant to be used with temporary files. */
2830 UNIV_INTERN
2831 void
os_file_read_string(FILE * file,char * str,ulint size)2832 os_file_read_string(
2833 /*================*/
2834 	FILE*	file,	/*!< in: file to read from */
2835 	char*	str,	/*!< in: buffer where to read */
2836 	ulint	size)	/*!< in: size of buffer */
2837 {
2838 	size_t	flen;
2839 
2840 	if (size == 0) {
2841 		return;
2842 	}
2843 
2844 	rewind(file);
2845 	flen = fread(str, 1, size - 1, file);
2846 	str[flen] = '\0';
2847 }
2848 
2849 /*******************************************************************//**
2850 NOTE! Use the corresponding macro os_file_write(), not directly
2851 this function!
2852 Requests a synchronous write operation.
2853 @return	TRUE if request was successful, FALSE if fail */
2854 UNIV_INTERN
2855 ibool
os_file_write_func(const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)2856 os_file_write_func(
2857 /*===============*/
2858 	const char*	name,	/*!< in: name of the file or path as a
2859 				null-terminated string */
2860 	os_file_t	file,	/*!< in: handle to a file */
2861 	const void*	buf,	/*!< in: buffer from which to write */
2862 	os_offset_t	offset,	/*!< in: file offset where to write */
2863 	ulint		n)	/*!< in: number of bytes to write */
2864 {
2865 	ut_ad(!srv_read_only_mode);
2866 
2867 #ifdef __WIN__
2868 	BOOL		ret;
2869 	DWORD		len;
2870 	DWORD		ret2;
2871 	DWORD		low;
2872 	DWORD		high;
2873 	ulint		n_retries	= 0;
2874 	ulint		err;
2875 #ifndef UNIV_HOTBACKUP
2876 	ulint		i;
2877 #endif /* !UNIV_HOTBACKUP */
2878 
2879 	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2880 	no more than 32 bits. */
2881 	ut_a((n & 0xFFFFFFFFUL) == n);
2882 
2883 	os_n_file_writes++;
2884 
2885 	ut_ad(file);
2886 	ut_ad(buf);
2887 	ut_ad(n > 0);
2888 retry:
2889 	low = (DWORD) offset & 0xFFFFFFFF;
2890 	high = (DWORD) (offset >> 32);
2891 
2892 	os_mutex_enter(os_file_count_mutex);
2893 	os_n_pending_writes++;
2894 	MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2895 	os_mutex_exit(os_file_count_mutex);
2896 
2897 #ifndef UNIV_HOTBACKUP
2898 	/* Protect the seek / write operation with a mutex */
2899 	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2900 
2901 	os_mutex_enter(os_file_seek_mutexes[i]);
2902 #endif /* !UNIV_HOTBACKUP */
2903 
2904 	ret2 = SetFilePointer(
2905 		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2906 
2907 	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2908 
2909 #ifndef UNIV_HOTBACKUP
2910 		os_mutex_exit(os_file_seek_mutexes[i]);
2911 #endif /* !UNIV_HOTBACKUP */
2912 
2913 		os_mutex_enter(os_file_count_mutex);
2914 		os_n_pending_writes--;
2915 		MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2916 		os_mutex_exit(os_file_count_mutex);
2917 
2918 		ut_print_timestamp(stderr);
2919 
2920 		fprintf(stderr,
2921 			" InnoDB: Error: File pointer positioning to"
2922 			" file %s failed at\n"
2923 			"InnoDB: offset %llu. Operating system"
2924 			" error number %lu.\n"
2925 			"InnoDB: Some operating system error numbers"
2926 			" are described at\n"
2927 			"InnoDB: "
2928 			REFMAN "operating-system-error-codes.html\n",
2929 			name, offset, (ulong) GetLastError());
2930 
2931 		return(FALSE);
2932 	}
2933 
2934 	ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2935 
2936 #ifndef UNIV_HOTBACKUP
2937 	os_mutex_exit(os_file_seek_mutexes[i]);
2938 #endif /* !UNIV_HOTBACKUP */
2939 
2940 	os_mutex_enter(os_file_count_mutex);
2941 	os_n_pending_writes--;
2942 	MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2943 	os_mutex_exit(os_file_count_mutex);
2944 
2945 	if (ret && len == n) {
2946 
2947 		return(TRUE);
2948 	}
2949 
2950 	/* If some background file system backup tool is running, then, at
2951 	least in Windows 2000, we may get here a specific error. Let us
2952 	retry the operation 100 times, with 1 second waits. */
2953 
2954 	if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2955 
2956 		os_thread_sleep(1000000);
2957 
2958 		n_retries++;
2959 
2960 		goto retry;
2961 	}
2962 
2963 	if (!os_has_said_disk_full) {
2964 
2965 		err = (ulint) GetLastError();
2966 
2967 		ut_print_timestamp(stderr);
2968 
2969 		fprintf(stderr,
2970 			" InnoDB: Error: Write to file %s failed"
2971 			" at offset %llu.\n"
2972 			"InnoDB: %lu bytes should have been written,"
2973 			" only %lu were written.\n"
2974 			"InnoDB: Operating system error number %lu.\n"
2975 			"InnoDB: Check that your OS and file system"
2976 			" support files of this size.\n"
2977 			"InnoDB: Check also that the disk is not full"
2978 			" or a disk quota exceeded.\n",
2979 			name, offset,
2980 			(ulong) n, (ulong) len, (ulong) err);
2981 
2982 		if (strerror((int) err) != NULL) {
2983 			fprintf(stderr,
2984 				"InnoDB: Error number %lu means '%s'.\n",
2985 				(ulong) err, strerror((int) err));
2986 		}
2987 
2988 		fprintf(stderr,
2989 			"InnoDB: Some operating system error numbers"
2990 			" are described at\n"
2991 			"InnoDB: "
2992 			REFMAN "operating-system-error-codes.html\n");
2993 
2994 		os_has_said_disk_full = TRUE;
2995 	}
2996 
2997 	return(FALSE);
2998 #else
2999 	ssize_t	ret;
3000 
3001 	ret = os_file_pwrite(file, buf, n, offset);
3002 
3003 	if ((ulint) ret == n) {
3004 
3005 		return(TRUE);
3006 	}
3007 
3008 	if (!os_has_said_disk_full) {
3009 
3010 		ut_print_timestamp(stderr);
3011 
3012 		if(ret == -1) {
3013 			ib_logf(IB_LOG_LEVEL_ERROR,
3014 				"Failure of system call pwrite(). Operating"
3015 				" system error number is %lu.",
3016 				(ulint) errno);
3017 		} else {
3018 			fprintf(stderr,
3019 				" InnoDB: Error: Write to file %s failed"
3020 				" at offset " UINT64PF ".\n"
3021 				"InnoDB: %lu bytes should have been written,"
3022 				" only %ld were written.\n"
3023 				"InnoDB: Operating system error number %lu.\n"
3024 				"InnoDB: Check that your OS and file system"
3025 				" support files of this size.\n"
3026 				"InnoDB: Check also that the disk is not full"
3027 				" or a disk quota exceeded.\n",
3028 				name, offset, n, (lint) ret,
3029 				(ulint) errno);
3030 		}
3031 
3032 		if (strerror(errno) != NULL) {
3033 			fprintf(stderr,
3034 				"InnoDB: Error number %d means '%s'.\n",
3035 				errno, strerror(errno));
3036 		}
3037 
3038 		fprintf(stderr,
3039 			"InnoDB: Some operating system error numbers"
3040 			" are described at\n"
3041 			"InnoDB: "
3042 			REFMAN "operating-system-error-codes.html\n");
3043 
3044 		os_has_said_disk_full = TRUE;
3045 	}
3046 
3047 	return(FALSE);
3048 #endif
3049 }
3050 
3051 /*******************************************************************//**
3052 Check the existence and type of the given file.
3053 @return	TRUE if call succeeded */
3054 UNIV_INTERN
3055 ibool
os_file_status(const char * path,ibool * exists,os_file_type_t * type)3056 os_file_status(
3057 /*===========*/
3058 	const char*	path,	/*!< in: pathname of the file */
3059 	ibool*		exists,	/*!< out: TRUE if file exists */
3060 	os_file_type_t* type)	/*!< out: type of the file (if it exists) */
3061 {
3062 #ifdef __WIN__
3063 	int		ret;
3064 	struct _stat64	statinfo;
3065 
3066 	ret = _stat64(path, &statinfo);
3067 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3068 		/* file does not exist */
3069 		*exists = FALSE;
3070 		return(TRUE);
3071 	} else if (ret) {
3072 		/* file exists, but stat call failed */
3073 
3074 		os_file_handle_error_no_exit(path, "stat", FALSE);
3075 
3076 		return(FALSE);
3077 	}
3078 
3079 	if (_S_IFDIR & statinfo.st_mode) {
3080 		*type = OS_FILE_TYPE_DIR;
3081 	} else if (_S_IFREG & statinfo.st_mode) {
3082 		*type = OS_FILE_TYPE_FILE;
3083 	} else {
3084 		*type = OS_FILE_TYPE_UNKNOWN;
3085 	}
3086 
3087 	*exists = TRUE;
3088 
3089 	return(TRUE);
3090 #else
3091 	int		ret;
3092 	struct stat	statinfo;
3093 
3094 	ret = stat(path, &statinfo);
3095 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3096 		/* file does not exist */
3097 		*exists = FALSE;
3098 		return(TRUE);
3099 	} else if (ret) {
3100 		/* file exists, but stat call failed */
3101 
3102 		os_file_handle_error_no_exit(path, "stat", FALSE);
3103 
3104 		return(FALSE);
3105 	}
3106 
3107 	if (S_ISDIR(statinfo.st_mode)) {
3108 		*type = OS_FILE_TYPE_DIR;
3109 	} else if (S_ISLNK(statinfo.st_mode)) {
3110 		*type = OS_FILE_TYPE_LINK;
3111 	} else if (S_ISREG(statinfo.st_mode)) {
3112 		*type = OS_FILE_TYPE_FILE;
3113 	} else {
3114 		*type = OS_FILE_TYPE_UNKNOWN;
3115 	}
3116 
3117 	*exists = TRUE;
3118 
3119 	return(TRUE);
3120 #endif
3121 }
3122 
3123 /*******************************************************************//**
3124 This function returns information about the specified file
3125 @return	DB_SUCCESS if all OK */
3126 UNIV_INTERN
3127 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm)3128 os_file_get_status(
3129 /*===============*/
3130 	const char*	path,		/*!< in:	pathname of the file */
3131 	os_file_stat_t* stat_info,	/*!< information of a file in a
3132 					directory */
3133 	bool		check_rw_perm)	/*!< in: for testing whether the
3134 					file can be opened in RW mode */
3135 {
3136 	int		ret;
3137 
3138 #ifdef __WIN__
3139 	struct _stat64	statinfo;
3140 
3141 	ret = _stat64(path, &statinfo);
3142 
3143 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3144 		/* file does not exist */
3145 
3146 		return(DB_NOT_FOUND);
3147 
3148 	} else if (ret) {
3149 		/* file exists, but stat call failed */
3150 
3151 		os_file_handle_error_no_exit(path, "stat", FALSE);
3152 
3153 		return(DB_FAIL);
3154 
3155 	} else if (_S_IFDIR & statinfo.st_mode) {
3156 		stat_info->type = OS_FILE_TYPE_DIR;
3157 	} else if (_S_IFREG & statinfo.st_mode) {
3158 
3159 		DWORD	access = GENERIC_READ;
3160 
3161 		if (!srv_read_only_mode) {
3162 			access |= GENERIC_WRITE;
3163 		}
3164 
3165 		stat_info->type = OS_FILE_TYPE_FILE;
3166 
3167 		/* Check if we can open it in read-only mode. */
3168 
3169 		if (check_rw_perm) {
3170 			HANDLE	fh;
3171 
3172 			fh = CreateFile(
3173 				(LPCTSTR) path,		// File to open
3174 				access,
3175 				0,			// No sharing
3176 				NULL,			// Default security
3177 				OPEN_EXISTING,		// Existing file only
3178 				FILE_ATTRIBUTE_NORMAL,	// Normal file
3179 				NULL);			// No attr. template
3180 
3181 			if (fh == INVALID_HANDLE_VALUE) {
3182 				stat_info->rw_perm = false;
3183 			} else {
3184 				stat_info->rw_perm = true;
3185 				CloseHandle(fh);
3186 			}
3187 		}
3188 	} else {
3189 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
3190 	}
3191 #else
3192 	struct stat	statinfo;
3193 
3194 	ret = stat(path, &statinfo);
3195 
3196 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3197 		/* file does not exist */
3198 
3199 		return(DB_NOT_FOUND);
3200 
3201 	} else if (ret) {
3202 		/* file exists, but stat call failed */
3203 
3204 		os_file_handle_error_no_exit(path, "stat", FALSE);
3205 
3206 		return(DB_FAIL);
3207 
3208 	}
3209 
3210 	switch (statinfo.st_mode & S_IFMT) {
3211 	case S_IFDIR:
3212 		stat_info->type = OS_FILE_TYPE_DIR;
3213 		break;
3214 	case S_IFLNK:
3215 		stat_info->type = OS_FILE_TYPE_LINK;
3216 		break;
3217 	case S_IFBLK:
3218 		/* Handle block device as regular file. */
3219 	case S_IFCHR:
3220 		/* Handle character device as regular file. */
3221 	case S_IFREG:
3222 		stat_info->type = OS_FILE_TYPE_FILE;
3223 		break;
3224 	default:
3225 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
3226 	}
3227 
3228 
3229 	if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) {
3230 
3231 		int	fh;
3232 		int	access;
3233 
3234 		access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
3235 
3236 		fh = ::open(path, access, os_innodb_umask);
3237 
3238 		if (fh == -1) {
3239 			stat_info->rw_perm = false;
3240 		} else {
3241 			stat_info->rw_perm = true;
3242 			close(fh);
3243 		}
3244 	}
3245 
3246 #endif /* _WIN_ */
3247 
3248 	stat_info->ctime = statinfo.st_ctime;
3249 	stat_info->atime = statinfo.st_atime;
3250 	stat_info->mtime = statinfo.st_mtime;
3251 	stat_info->size  = statinfo.st_size;
3252 
3253 	return(DB_SUCCESS);
3254 }
3255 
/* Path name separator character used by the path helpers below:
a backslash when __WIN__ is defined, a forward slash otherwise. */
#ifdef __WIN__
#  define OS_FILE_PATH_SEPARATOR	'\\'
#else
#  define OS_FILE_PATH_SEPARATOR	'/'
#endif
3262 
3263 /****************************************************************//**
3264 This function returns a new path name after replacing the basename
3265 in an old path with a new basename.  The old_path is a full path
3266 name including the extension.  The tablename is in the normal
3267 form "databasename/tablename".  The new base name is found after
3268 the forward slash.  Both input strings are null terminated.
3269 
3270 This function allocates memory to be returned.  It is the callers
3271 responsibility to free the return value after it is no longer needed.
3272 
3273 @return	own: new full pathname */
3274 UNIV_INTERN
3275 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)3276 os_file_make_new_pathname(
3277 /*======================*/
3278 	const char*	old_path,	/*!< in: pathname */
3279 	const char*	tablename)	/*!< in: contains new base name */
3280 {
3281 	ulint		dir_len;
3282 	char*		last_slash;
3283 	char*		base_name;
3284 	char*		new_path;
3285 	ulint		new_path_len;
3286 
3287 	/* Split the tablename into its database and table name components.
3288 	They are separated by a '/'. */
3289 	last_slash = strrchr((char*) tablename, '/');
3290 	base_name = last_slash ? last_slash + 1 : (char*) tablename;
3291 
3292 	/* Find the offset of the last slash. We will strip off the
3293 	old basename.ibd which starts after that slash. */
3294 	last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR);
3295 	dir_len = last_slash ? last_slash - old_path : strlen(old_path);
3296 
3297 	/* allocate a new path and move the old directory path to it. */
3298 	new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
3299 	new_path = static_cast<char*>(mem_alloc(new_path_len));
3300 	memcpy(new_path, old_path, dir_len);
3301 
3302 	ut_snprintf(new_path + dir_len,
3303 		    new_path_len - dir_len,
3304 		    "%c%s.ibd",
3305 		    OS_FILE_PATH_SEPARATOR,
3306 		    base_name);
3307 
3308 	return(new_path);
3309 }
3310 
3311 /****************************************************************//**
3312 This function returns a remote path name by combining a data directory
3313 path provided in a DATA DIRECTORY clause with the tablename which is
3314 in the form 'database/tablename'.  It strips the file basename (which
3315 is the tablename) found after the last directory in the path provided.
3316 The full filepath created will include the database name as a directory
3317 under the path provided.  The filename is the tablename with the '.ibd'
3318 extension. All input and output strings are null-terminated.
3319 
3320 This function allocates memory to be returned.  It is the callers
3321 responsibility to free the return value after it is no longer needed.
3322 
3323 @return	own: A full pathname; data_dir_path/databasename/tablename.ibd */
3324 UNIV_INTERN
3325 char*
os_file_make_remote_pathname(const char * data_dir_path,const char * tablename,const char * extention)3326 os_file_make_remote_pathname(
3327 /*=========================*/
3328 	const char*	data_dir_path,	/*!< in: pathname */
3329 	const char*	tablename,	/*!< in: tablename */
3330 	const char*	extention)	/*!< in: file extention; ibd,cfg */
3331 {
3332 	ulint		data_dir_len;
3333 	char*		last_slash;
3334 	char*		new_path;
3335 	ulint		new_path_len;
3336 
3337 	ut_ad(extention && strlen(extention) == 3);
3338 
3339 	/* Find the offset of the last slash. We will strip off the
3340 	old basename or tablename which starts after that slash. */
3341 	last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3342 	data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path);
3343 
3344 	/* allocate a new path and move the old directory path to it. */
3345 	new_path_len = data_dir_len + strlen(tablename)
3346 		       + sizeof "/." + strlen(extention);
3347 	new_path = static_cast<char*>(mem_alloc(new_path_len));
3348 	memcpy(new_path, data_dir_path, data_dir_len);
3349 	ut_snprintf(new_path + data_dir_len,
3350 		    new_path_len - data_dir_len,
3351 		    "%c%s.%s",
3352 		    OS_FILE_PATH_SEPARATOR,
3353 		    tablename,
3354 		    extention);
3355 
3356 	srv_normalize_path_for_win(new_path);
3357 
3358 	return(new_path);
3359 }
3360 
3361 /****************************************************************//**
3362 This function reduces a null-terminated full remote path name into
3363 the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
3364 the 'databasename/tablename.ibd' found at the end of the path with just
3365 'tablename'.
3366 
3367 Since the result is always smaller than the path sent in, no new memory
3368 is allocated. The caller should allocate memory for the path sent in.
3369 This function manipulates that path in place.
3370 
3371 If the path format is not as expected, just return.  The result is used
3372 to inform a SHOW CREATE TABLE command. */
3373 UNIV_INTERN
3374 void
os_file_make_data_dir_path(char * data_dir_path)3375 os_file_make_data_dir_path(
3376 /*========================*/
3377 	char*	data_dir_path)	/*!< in/out: full path/data_dir_path */
3378 {
3379 	char*	ptr;
3380 	char*	tablename;
3381 	ulint	tablename_len;
3382 
3383 	/* Replace the period before the extension with a null byte. */
3384 	ptr = strrchr((char*) data_dir_path, '.');
3385 	if (!ptr) {
3386 		return;
3387 	}
3388 	ptr[0] = '\0';
3389 
3390 	/* The tablename starts after the last slash. */
3391 	ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3392 	if (!ptr) {
3393 		return;
3394 	}
3395 	ptr[0] = '\0';
3396 	tablename = ptr + 1;
3397 
3398 	/* The databasename starts after the next to last slash. */
3399 	ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3400 	if (!ptr) {
3401 		return;
3402 	}
3403 	tablename_len = ut_strlen(tablename);
3404 
3405 	ut_memmove(++ptr, tablename, tablename_len);
3406 
3407 	ptr[tablename_len] = '\0';
3408 }
3409 
3410 /****************************************************************//**
3411 The function os_file_dirname returns a directory component of a
3412 null-terminated pathname string. In the usual case, dirname returns
3413 the string up to, but not including, the final '/', and basename
3414 is the component following the final '/'. Trailing '/' characters
3415 are not counted as part of the pathname.
3416 
3417 If path does not contain a slash, dirname returns the string ".".
3418 
3419 Concatenating the string returned by dirname, a "/", and the basename
3420 yields a complete pathname.
3421 
3422 The return value is a copy of the directory component of the pathname.
3423 The copy is allocated from heap. It is the caller responsibility
3424 to free it after it is no longer needed.
3425 
3426 The following list of examples (taken from SUSv2) shows the strings
3427 returned by dirname and basename for different paths:
3428 
3429        path	      dirname	     basename
3430        "/usr/lib"     "/usr"	     "lib"
3431        "/usr/"	      "/"	     "usr"
3432        "usr"	      "."	     "usr"
3433        "/"	      "/"	     "/"
3434        "."	      "."	     "."
3435        ".."	      "."	     ".."
3436 
3437 @return	own: directory component of the pathname */
3438 UNIV_INTERN
3439 char*
os_file_dirname(const char * path)3440 os_file_dirname(
3441 /*============*/
3442 	const char*	path)	/*!< in: pathname */
3443 {
3444 	/* Find the offset of the last slash */
3445 	const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
3446 	if (!last_slash) {
3447 		/* No slash in the path, return "." */
3448 
3449 		return(mem_strdup("."));
3450 	}
3451 
3452 	/* Ok, there is a slash */
3453 
3454 	if (last_slash == path) {
3455 		/* last slash is the first char of the path */
3456 
3457 		return(mem_strdup("/"));
3458 	}
3459 
3460 	/* Non-trivial directory component */
3461 
3462 	return(mem_strdupl(path, last_slash - path));
3463 }
3464 
3465 /****************************************************************//**
3466 Creates all missing subdirectories along the given path.
3467 @return	TRUE if call succeeded FALSE otherwise */
3468 UNIV_INTERN
3469 ibool
os_file_create_subdirs_if_needed(const char * path)3470 os_file_create_subdirs_if_needed(
3471 /*=============================*/
3472 	const char*	path)	/*!< in: path name */
3473 {
3474 	if (srv_read_only_mode) {
3475 
3476 		ib_logf(IB_LOG_LEVEL_ERROR,
3477 			"read only mode set. Can't create subdirectories '%s'",
3478 			path);
3479 
3480 		return(FALSE);
3481 
3482 	}
3483 
3484 	char*	subdir = os_file_dirname(path);
3485 
3486 	if (strlen(subdir) == 1
3487 	    && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
3488 		/* subdir is root or cwd, nothing to do */
3489 		mem_free(subdir);
3490 
3491 		return(TRUE);
3492 	}
3493 
3494 	/* Test if subdir exists */
3495 	os_file_type_t	type;
3496 	ibool	subdir_exists;
3497 	ibool	success = os_file_status(subdir, &subdir_exists, &type);
3498 
3499 	if (success && !subdir_exists) {
3500 
3501 		/* subdir does not exist, create it */
3502 		success = os_file_create_subdirs_if_needed(subdir);
3503 
3504 		if (!success) {
3505 			mem_free(subdir);
3506 
3507 			return(FALSE);
3508 		}
3509 
3510 		success = os_file_create_directory(subdir, FALSE);
3511 	}
3512 
3513 	mem_free(subdir);
3514 
3515 	return(success);
3516 }
3517 
3518 #ifndef UNIV_HOTBACKUP
3519 /****************************************************************//**
3520 Returns a pointer to the nth slot in the aio array.
3521 @return	pointer to slot */
3522 static
3523 os_aio_slot_t*
os_aio_array_get_nth_slot(os_aio_array_t * array,ulint index)3524 os_aio_array_get_nth_slot(
3525 /*======================*/
3526 	os_aio_array_t*		array,	/*!< in: aio array */
3527 	ulint			index)	/*!< in: index of the slot */
3528 {
3529 	ut_a(index < array->n_slots);
3530 
3531 	return(&array->slots[index]);
3532 }
3533 
3534 #if defined(LINUX_NATIVE_AIO)
3535 /******************************************************************//**
3536 Creates an io_context for native linux AIO.
3537 @return	TRUE on success. */
3538 static
3539 ibool
os_aio_linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)3540 os_aio_linux_create_io_ctx(
3541 /*=======================*/
3542 	ulint		max_events,	/*!< in: number of events. */
3543 	io_context_t*	io_ctx)		/*!< out: io_ctx to initialize. */
3544 {
3545 	int	ret;
3546 	ulint	retries = 0;
3547 
3548 retry:
3549 	memset(io_ctx, 0x0, sizeof(*io_ctx));
3550 
3551 	/* Initialize the io_ctx. Tell it how many pending
3552 	IO requests this context will handle. */
3553 
3554 	ret = io_setup(max_events, io_ctx);
3555 	if (ret == 0) {
3556 #if defined(UNIV_AIO_DEBUG)
3557 		fprintf(stderr,
3558 			"InnoDB: Linux native AIO:"
3559 			" initialized io_ctx for segment\n");
3560 #endif
3561 		/* Success. Return now. */
3562 		return(TRUE);
3563 	}
3564 
3565 	/* If we hit EAGAIN we'll make a few attempts before failing. */
3566 
3567 	switch (ret) {
3568 	case -EAGAIN:
3569 		if (retries == 0) {
3570 			/* First time around. */
3571 			ut_print_timestamp(stderr);
3572 			fprintf(stderr,
3573 				" InnoDB: Warning: io_setup() failed"
3574 				" with EAGAIN. Will make %d attempts"
3575 				" before giving up.\n",
3576 				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3577 		}
3578 
3579 		if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3580 			++retries;
3581 			fprintf(stderr,
3582 				"InnoDB: Warning: io_setup() attempt"
3583 				" %lu failed.\n",
3584 				retries);
3585 			os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3586 			goto retry;
3587 		}
3588 
3589 		/* Have tried enough. Better call it a day. */
3590 		ut_print_timestamp(stderr);
3591 		fprintf(stderr,
3592 			" InnoDB: Error: io_setup() failed"
3593 			" with EAGAIN after %d attempts.\n",
3594 			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3595 		break;
3596 
3597 	case -ENOSYS:
3598 		ut_print_timestamp(stderr);
3599 		fprintf(stderr,
3600 			" InnoDB: Error: Linux Native AIO interface"
3601 			" is not supported on this platform. Please"
3602 			" check your OS documentation and install"
3603 			" appropriate binary of InnoDB.\n");
3604 
3605 		break;
3606 
3607 	default:
3608 		ut_print_timestamp(stderr);
3609 		fprintf(stderr,
3610 			" InnoDB: Error: Linux Native AIO setup"
3611 			" returned following error[%d]\n", -ret);
3612 		break;
3613 	}
3614 
3615 	fprintf(stderr,
3616 		"InnoDB: You can disable Linux Native AIO by"
3617 		" setting innodb_use_native_aio = 0 in my.cnf\n");
3618 	return(FALSE);
3619 }
3620 
3621 /******************************************************************//**
3622 Checks if the system supports native linux aio. On some kernel
3623 versions where native aio is supported it won't work on tmpfs. In such
3624 cases we can't use native aio as it is not possible to mix simulated
3625 and native aio.
3626 @return: TRUE if supported, FALSE otherwise. */
3627 static
3628 ibool
os_aio_native_aio_supported(void)3629 os_aio_native_aio_supported(void)
3630 /*=============================*/
3631 {
3632 	int			fd;
3633 	io_context_t		io_ctx;
3634 	char			name[1000];
3635 
3636 	if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
3637 		/* The platform does not support native aio. */
3638 		return(FALSE);
3639 	} else if (!srv_read_only_mode) {
3640 		/* Now check if tmpdir supports native aio ops. */
3641 		fd = innobase_mysql_tmpfile(NULL);
3642 
3643 		if (fd < 0) {
3644 			ib_logf(IB_LOG_LEVEL_WARN,
3645 				"Unable to create temp file to check "
3646 				"native AIO support.");
3647 
3648 			return(FALSE);
3649 		}
3650 	} else {
3651 
3652 		srv_normalize_path_for_win(srv_log_group_home_dir);
3653 
3654 		ulint	dirnamelen = strlen(srv_log_group_home_dir);
3655 		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
3656 		memcpy(name, srv_log_group_home_dir, dirnamelen);
3657 
3658 		/* Add a path separator if needed. */
3659 		if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
3660 			name[dirnamelen++] = SRV_PATH_SEPARATOR;
3661 		}
3662 
3663 		strcpy(name + dirnamelen, "ib_logfile0");
3664 
3665 		fd = ::open(name, O_RDONLY);
3666 
3667 		if (fd == -1) {
3668 
3669 			ib_logf(IB_LOG_LEVEL_WARN,
3670 				"Unable to open \"%s\" to check "
3671 				"native AIO read support.", name);
3672 
3673 			return(FALSE);
3674 		}
3675 	}
3676 
3677 	struct io_event	io_event;
3678 
3679 	memset(&io_event, 0x0, sizeof(io_event));
3680 
3681 	byte*	buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
3682 	byte*	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
3683 
3684 	struct iocb	iocb;
3685 
3686 	/* Suppress valgrind warning. */
3687 	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
3688 	memset(&iocb, 0x0, sizeof(iocb));
3689 
3690 	struct iocb*	p_iocb = &iocb;
3691 
3692 	if (!srv_read_only_mode) {
3693 		io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
3694 	} else {
3695 		ut_a(UNIV_PAGE_SIZE >= 512);
3696 		io_prep_pread(p_iocb, fd, ptr, 512, 0);
3697 	}
3698 
3699 	int	err = io_submit(io_ctx, 1, &p_iocb);
3700 
3701 	if (err >= 1) {
3702 		/* Now collect the submitted IO request. */
3703 		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
3704 	}
3705 
3706 	ut_free(buf);
3707 	close(fd);
3708 
3709 	switch (err) {
3710 	case 1:
3711 		return(TRUE);
3712 
3713 	case -EINVAL:
3714 	case -ENOSYS:
3715 		ib_logf(IB_LOG_LEVEL_ERROR,
3716 			"Linux Native AIO not supported. You can either "
3717 			"move %s to a file system that supports native "
3718 			"AIO or you can set innodb_use_native_aio to "
3719 			"FALSE to avoid this message.",
3720 			srv_read_only_mode ? name : "tmpdir");
3721 
3722 		/* fall through. */
3723 	default:
3724 		ib_logf(IB_LOG_LEVEL_ERROR,
3725 			"Linux Native AIO check on %s returned error[%d]",
3726 			srv_read_only_mode ? name : "tmpdir", -err);
3727 	}
3728 
3729 	return(FALSE);
3730 }
3731 #endif /* LINUX_NATIVE_AIO */
3732 
3733 /******************************************************************//**
3734 Creates an aio wait array. Note that we return NULL in case of failure.
3735 We don't care about freeing memory here because we assume that a
3736 failure will result in server refusing to start up.
3737 @return	own: aio array, NULL on failure */
3738 static
3739 os_aio_array_t*
os_aio_array_create(ulint n,ulint n_segments)3740 os_aio_array_create(
3741 /*================*/
3742 	ulint	n,		/*!< in: maximum number of pending aio
3743 				operations allowed; n must be
3744 				divisible by n_segments */
3745 	ulint	n_segments)	/*!< in: number of segments in the aio array */
3746 {
3747 	os_aio_array_t*	array;
3748 #ifdef WIN_ASYNC_IO
3749 	OVERLAPPED*	over;
3750 #elif defined(LINUX_NATIVE_AIO)
3751 	struct io_event*	io_event = NULL;
3752 #endif /* WIN_ASYNC_IO */
3753 	ut_a(n > 0);
3754 	ut_a(n_segments > 0);
3755 
3756 	array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
3757 	memset(array, 0x0, sizeof(*array));
3758 
3759 	array->mutex = os_mutex_create();
3760 	array->not_full = os_event_create();
3761 	array->is_empty = os_event_create();
3762 
3763 	os_event_set(array->is_empty);
3764 
3765 	array->n_slots = n;
3766 	array->n_segments = n_segments;
3767 
3768 	array->slots = static_cast<os_aio_slot_t*>(
3769 		ut_malloc(n * sizeof(*array->slots)));
3770 
3771 	memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots)));
3772 #ifdef __WIN__
3773 	array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
3774 #endif /* __WIN__ */
3775 
3776 #if defined(LINUX_NATIVE_AIO)
3777 	array->aio_ctx = NULL;
3778 	array->aio_events = NULL;
3779 
3780 	/* If we are not using native aio interface then skip this
3781 	part of initialization. */
3782 	if (!srv_use_native_aio) {
3783 		goto skip_native_aio;
3784 	}
3785 
3786 	/* Initialize the io_context array. One io_context
3787 	per segment in the array. */
3788 
3789 	array->aio_ctx = static_cast<io_context**>(
3790 		ut_malloc(n_segments * sizeof(*array->aio_ctx)));
3791 
3792 	for (ulint i = 0; i < n_segments; ++i) {
3793 		if (!os_aio_linux_create_io_ctx(n/n_segments,
3794 						&array->aio_ctx[i])) {
3795 			/* If something bad happened during aio setup
3796 			we should call it a day and return right away.
3797 			We don't care about any leaks because a failure
3798 			to initialize the io subsystem means that the
3799 			server (or atleast the innodb storage engine)
3800 			is not going to startup. */
3801 			return(NULL);
3802 		}
3803 	}
3804 
3805 	/* Initialize the event array. One event per slot. */
3806 	io_event = static_cast<struct io_event*>(
3807 		ut_malloc(n * sizeof(*io_event)));
3808 
3809 	memset(io_event, 0x0, sizeof(*io_event) * n);
3810 	array->aio_events = io_event;
3811 
3812 skip_native_aio:
3813 #endif /* LINUX_NATIVE_AIO */
3814 	for (ulint i = 0; i < n; i++) {
3815 		os_aio_slot_t*	slot;
3816 
3817 		slot = os_aio_array_get_nth_slot(array, i);
3818 
3819 		slot->pos = i;
3820 		slot->reserved = FALSE;
3821 #ifdef WIN_ASYNC_IO
3822 		slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
3823 
3824 		over = &slot->control;
3825 
3826 		over->hEvent = slot->handle;
3827 
3828 		array->handles[i] = over->hEvent;
3829 
3830 #elif defined(LINUX_NATIVE_AIO)
3831 		memset(&slot->control, 0x0, sizeof(slot->control));
3832 		slot->n_bytes = 0;
3833 		slot->ret = 0;
3834 #endif /* WIN_ASYNC_IO */
3835 	}
3836 
3837 	return(array);
3838 }
3839 
3840 /************************************************************************//**
3841 Frees an aio wait array. */
3842 static
3843 void
os_aio_array_free(os_aio_array_t * & array)3844 os_aio_array_free(
3845 /*==============*/
3846 	os_aio_array_t*& array)	/*!< in, own: array to free */
3847 {
3848 #ifdef WIN_ASYNC_IO
3849 	ulint	i;
3850 
3851 	for (i = 0; i < array->n_slots; i++) {
3852 		os_aio_slot_t*	slot = os_aio_array_get_nth_slot(array, i);
3853 		CloseHandle(slot->handle);
3854 	}
3855 #endif /* WIN_ASYNC_IO */
3856 
3857 #ifdef __WIN__
3858 	ut_free(array->handles);
3859 #endif /* __WIN__ */
3860 	os_mutex_free(array->mutex);
3861 	os_event_free(array->not_full);
3862 	os_event_free(array->is_empty);
3863 
3864 #if defined(LINUX_NATIVE_AIO)
3865 	if (srv_use_native_aio) {
3866 		ut_free(array->aio_events);
3867 		ut_free(array->aio_ctx);
3868 	}
3869 #endif /* LINUX_NATIVE_AIO */
3870 
3871 	ut_free(array->slots);
3872 	ut_free(array);
3873 
3874 	array = 0;
3875 }
3876 
3877 /***********************************************************************
3878 Initializes the asynchronous io system. Creates one array each for ibuf
3879 and log i/o. Also creates one array each for read and write where each
3880 array is divided logically into n_read_segs and n_write_segs
3881 respectively. The caller must create an i/o handler thread for each
3882 segment in these arrays. This function also creates the sync array.
3883 No i/o handler thread needs to be created for that */
3884 UNIV_INTERN
3885 ibool
os_aio_init(ulint n_per_seg,ulint n_read_segs,ulint n_write_segs,ulint n_slots_sync)3886 os_aio_init(
3887 /*========*/
3888 	ulint	n_per_seg,	/*<! in: maximum number of pending aio
3889 				operations allowed per segment */
3890 	ulint	n_read_segs,	/*<! in: number of reader threads */
3891 	ulint	n_write_segs,	/*<! in: number of writer threads */
3892 	ulint	n_slots_sync)	/*<! in: number of slots in the sync aio
3893 				array */
3894 {
3895 	os_io_init_simple();
3896 
3897 #if defined(LINUX_NATIVE_AIO)
3898 	/* Check if native aio is supported on this system and tmpfs */
3899 	if (srv_use_native_aio && !os_aio_native_aio_supported()) {
3900 
3901 		ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
3902 
3903 		srv_use_native_aio = FALSE;
3904 	}
3905 #endif /* LINUX_NATIVE_AIO */
3906 
3907 	srv_reset_io_thread_op_info();
3908 
3909 	os_aio_read_array = os_aio_array_create(
3910 		n_read_segs * n_per_seg, n_read_segs);
3911 
3912 	if (os_aio_read_array == NULL) {
3913 		return(FALSE);
3914 	}
3915 
3916 	ulint	start = (srv_read_only_mode) ? 0 : 2;
3917 	ulint	n_segs = n_read_segs + start;
3918 
3919 	/* 0 is the ibuf segment and 1 is the insert buffer segment. */
3920 	for (ulint i = start; i < n_segs; ++i) {
3921 		ut_a(i < SRV_MAX_N_IO_THREADS);
3922 		srv_io_thread_function[i] = "read thread";
3923 	}
3924 
3925 	ulint	n_segments = n_read_segs;
3926 
3927 	if (!srv_read_only_mode) {
3928 
3929 		os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3930 
3931 		if (os_aio_log_array == NULL) {
3932 			return(FALSE);
3933 		}
3934 
3935 		++n_segments;
3936 
3937 		srv_io_thread_function[1] = "log thread";
3938 
3939 		os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3940 
3941 		if (os_aio_ibuf_array == NULL) {
3942 			return(FALSE);
3943 		}
3944 
3945 		++n_segments;
3946 
3947 		srv_io_thread_function[0] = "insert buffer thread";
3948 
3949 		os_aio_write_array = os_aio_array_create(
3950 			n_write_segs * n_per_seg, n_write_segs);
3951 
3952 		if (os_aio_write_array == NULL) {
3953 			return(FALSE);
3954 		}
3955 
3956 		n_segments += n_write_segs;
3957 
3958 		for (ulint i = start + n_read_segs; i < n_segments; ++i) {
3959 			ut_a(i < SRV_MAX_N_IO_THREADS);
3960 			srv_io_thread_function[i] = "write thread";
3961 		}
3962 
3963 		ut_ad(n_segments >= 4);
3964 	} else {
3965 		ut_ad(n_segments > 0);
3966 	}
3967 
3968 	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3969 
3970 	if (os_aio_sync_array == NULL) {
3971 		return(FALSE);
3972 	}
3973 
3974 	os_aio_n_segments = n_segments;
3975 
3976 	os_aio_validate();
3977 
3978 	os_aio_segment_wait_events = static_cast<os_event_t*>(
3979 		ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
3980 
3981 	for (ulint i = 0; i < n_segments; ++i) {
3982 		os_aio_segment_wait_events[i] = os_event_create();
3983 	}
3984 
3985 	os_last_printout = ut_time();
3986 
3987 	return(TRUE);
3988 
3989 }
3990 
3991 /***********************************************************************
3992 Frees the asynchronous io system. */
3993 UNIV_INTERN
3994 void
os_aio_free(void)3995 os_aio_free(void)
3996 /*=============*/
3997 {
3998 	if (os_aio_ibuf_array != 0) {
3999 		os_aio_array_free(os_aio_ibuf_array);
4000 	}
4001 
4002 	if (os_aio_log_array != 0) {
4003 		os_aio_array_free(os_aio_log_array);
4004 	}
4005 
4006 	if (os_aio_write_array != 0) {
4007 		os_aio_array_free(os_aio_write_array);
4008 	}
4009 
4010 	if (os_aio_sync_array != 0) {
4011 		os_aio_array_free(os_aio_sync_array);
4012 	}
4013 
4014 	os_aio_array_free(os_aio_read_array);
4015 
4016 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4017 		os_event_free(os_aio_segment_wait_events[i]);
4018 	}
4019 
4020 	ut_free(os_aio_segment_wait_events);
4021 	os_aio_segment_wait_events = 0;
4022 	os_aio_n_segments = 0;
4023 }
4024 
4025 #ifdef WIN_ASYNC_IO
4026 /************************************************************************//**
4027 Wakes up all async i/o threads in the array in Windows async i/o at
4028 shutdown. */
4029 static
4030 void
os_aio_array_wake_win_aio_at_shutdown(os_aio_array_t * array)4031 os_aio_array_wake_win_aio_at_shutdown(
4032 /*==================================*/
4033 	os_aio_array_t*	array)	/*!< in: aio array */
4034 {
4035 	ulint	i;
4036 
4037 	for (i = 0; i < array->n_slots; i++) {
4038 
4039 		SetEvent((array->slots + i)->handle);
4040 	}
4041 }
4042 #endif
4043 
4044 /************************************************************************//**
4045 Wakes up all async i/o threads so that they know to exit themselves in
4046 shutdown. */
4047 UNIV_INTERN
4048 void
os_aio_wake_all_threads_at_shutdown(void)4049 os_aio_wake_all_threads_at_shutdown(void)
4050 /*=====================================*/
4051 {
4052 #ifdef WIN_ASYNC_IO
4053 	/* This code wakes up all ai/o threads in Windows native aio */
4054 	os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
4055 	if (os_aio_write_array != 0) {
4056 		os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
4057 	}
4058 
4059 	if (os_aio_ibuf_array != 0) {
4060 		os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
4061 	}
4062 
4063 	if (os_aio_log_array != 0) {
4064 		os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
4065 	}
4066 
4067 #elif defined(LINUX_NATIVE_AIO)
4068 
4069 	/* When using native AIO interface the io helper threads
4070 	wait on io_getevents with a timeout value of 500ms. At
4071 	each wake up these threads check the server status.
4072 	No need to do anything to wake them up. */
4073 
4074 	if (srv_use_native_aio) {
4075 		return;
4076 	}
4077 
4078 	/* Fall through to simulated AIO handler wakeup if we are
4079 	not using native AIO. */
4080 #endif /* !WIN_ASYNC_AIO */
4081 
4082 	/* This loop wakes up all simulated ai/o threads */
4083 
4084 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4085 
4086 		os_event_set(os_aio_segment_wait_events[i]);
4087 	}
4088 }
4089 
4090 /************************************************************************//**
4091 Waits until there are no pending writes in os_aio_write_array. There can
4092 be other, synchronous, pending writes. */
4093 UNIV_INTERN
4094 void
os_aio_wait_until_no_pending_writes(void)4095 os_aio_wait_until_no_pending_writes(void)
4096 /*=====================================*/
4097 {
4098 	ut_ad(!srv_read_only_mode);
4099 	os_event_wait(os_aio_write_array->is_empty);
4100 }
4101 
4102 /**********************************************************************//**
4103 Calculates segment number for a slot.
4104 @return segment number (which is the number used by, for example,
4105 i/o-handler threads) */
4106 static
4107 ulint
os_aio_get_segment_no_from_slot(os_aio_array_t * array,os_aio_slot_t * slot)4108 os_aio_get_segment_no_from_slot(
4109 /*============================*/
4110 	os_aio_array_t*	array,	/*!< in: aio wait array */
4111 	os_aio_slot_t*	slot)	/*!< in: slot in this array */
4112 {
4113 	ulint	segment;
4114 	ulint	seg_len;
4115 
4116 	if (array == os_aio_ibuf_array) {
4117 		ut_ad(!srv_read_only_mode);
4118 
4119 		segment = IO_IBUF_SEGMENT;
4120 
4121 	} else if (array == os_aio_log_array) {
4122 		ut_ad(!srv_read_only_mode);
4123 
4124 		segment = IO_LOG_SEGMENT;
4125 
4126 	} else if (array == os_aio_read_array) {
4127 		seg_len = os_aio_read_array->n_slots
4128 			/ os_aio_read_array->n_segments;
4129 
4130 		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
4131 	} else {
4132 		ut_ad(!srv_read_only_mode);
4133 		ut_a(array == os_aio_write_array);
4134 
4135 		seg_len = os_aio_write_array->n_slots
4136 			/ os_aio_write_array->n_segments;
4137 
4138 		segment = os_aio_read_array->n_segments + 2
4139 			+ slot->pos / seg_len;
4140 	}
4141 
4142 	return(segment);
4143 }
4144 
4145 /**********************************************************************//**
4146 Calculates local segment number and aio array from global segment number.
4147 @return	local segment number within the aio array */
4148 static
4149 ulint
os_aio_get_array_and_local_segment(os_aio_array_t ** array,ulint global_segment)4150 os_aio_get_array_and_local_segment(
4151 /*===============================*/
4152 	os_aio_array_t** array,		/*!< out: aio wait array */
4153 	ulint		 global_segment)/*!< in: global segment number */
4154 {
4155 	ulint		segment;
4156 
4157 	ut_a(global_segment < os_aio_n_segments);
4158 
4159 	if (srv_read_only_mode) {
4160 		*array = os_aio_read_array;
4161 
4162 		return(global_segment);
4163 	} else if (global_segment == IO_IBUF_SEGMENT) {
4164 		*array = os_aio_ibuf_array;
4165 		segment = 0;
4166 
4167 	} else if (global_segment == IO_LOG_SEGMENT) {
4168 		*array = os_aio_log_array;
4169 		segment = 0;
4170 
4171 	} else if (global_segment < os_aio_read_array->n_segments + 2) {
4172 		*array = os_aio_read_array;
4173 
4174 		segment = global_segment - 2;
4175 	} else {
4176 		*array = os_aio_write_array;
4177 
4178 		segment = global_segment - (os_aio_read_array->n_segments + 2);
4179 	}
4180 
4181 	return(segment);
4182 }
4183 
4184 /*******************************************************************//**
4185 Requests for a slot in the aio array. If no slot is available, waits until
4186 not_full-event becomes signaled.
4187 @return	pointer to slot */
4188 static
4189 os_aio_slot_t*
os_aio_array_reserve_slot(ulint type,os_aio_array_t * array,fil_node_t * message1,void * message2,pfs_os_file_t file,const char * name,void * buf,os_offset_t offset,ulint len)4190 os_aio_array_reserve_slot(
4191 /*======================*/
4192 	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
4193 	os_aio_array_t*	array,	/*!< in: aio array */
4194 	fil_node_t*	message1,/*!< in: message to be passed along with
4195 				the aio operation */
4196 	void*		message2,/*!< in: message to be passed along with
4197 				the aio operation */
4198 	pfs_os_file_t	file,	/*!< in: file handle */
4199 	const char*	name,	/*!< in: name of the file or path as a
4200 				null-terminated string */
4201 	void*		buf,	/*!< in: buffer where to read or from which
4202 				to write */
4203 	os_offset_t	offset,	/*!< in: file offset */
4204 	ulint		len)	/*!< in: length of the block to read or write */
4205 {
4206 	os_aio_slot_t*	slot = NULL;
4207 #ifdef WIN_ASYNC_IO
4208 	OVERLAPPED*	control;
4209 
4210 #elif defined(LINUX_NATIVE_AIO)
4211 
4212 	struct iocb*	iocb;
4213 	off_t		aio_offset;
4214 
4215 #endif /* WIN_ASYNC_IO */
4216 	ulint		i;
4217 	ulint		counter;
4218 	ulint		slots_per_seg;
4219 	ulint		local_seg;
4220 
4221 #ifdef WIN_ASYNC_IO
4222 	ut_a((len & 0xFFFFFFFFUL) == len);
4223 #endif /* WIN_ASYNC_IO */
4224 
4225 	/* No need of a mutex. Only reading constant fields */
4226 	slots_per_seg = array->n_slots / array->n_segments;
4227 
4228 	/* We attempt to keep adjacent blocks in the same local
4229 	segment. This can help in merging IO requests when we are
4230 	doing simulated AIO */
4231 	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
4232 		% array->n_segments;
4233 
4234 loop:
4235 	os_mutex_enter(array->mutex);
4236 
4237 	if (array->n_reserved == array->n_slots) {
4238 		os_mutex_exit(array->mutex);
4239 
4240 		if (!srv_use_native_aio) {
4241 			/* If the handler threads are suspended, wake them
4242 			so that we get more slots */
4243 
4244 			os_aio_simulated_wake_handler_threads();
4245 		}
4246 
4247 		os_event_wait(array->not_full);
4248 
4249 		goto loop;
4250 	}
4251 
4252 	/* We start our search for an available slot from our preferred
4253 	local segment and do a full scan of the array. We are
4254 	guaranteed to find a slot in full scan. */
4255 	for (i = local_seg * slots_per_seg, counter = 0;
4256 	     counter < array->n_slots;
4257 	     i++, counter++) {
4258 
4259 		i %= array->n_slots;
4260 
4261 		slot = os_aio_array_get_nth_slot(array, i);
4262 
4263 		if (slot->reserved == FALSE) {
4264 			goto found;
4265 		}
4266 	}
4267 
4268 	/* We MUST always be able to get hold of a reserved slot. */
4269 	ut_error;
4270 
4271 found:
4272 	ut_a(slot->reserved == FALSE);
4273 	array->n_reserved++;
4274 
4275 	if (array->n_reserved == 1) {
4276 		os_event_reset(array->is_empty);
4277 	}
4278 
4279 	if (array->n_reserved == array->n_slots) {
4280 		os_event_reset(array->not_full);
4281 	}
4282 
4283 	slot->reserved = TRUE;
4284 	slot->reservation_time = ut_time();
4285 	slot->message1 = message1;
4286 	slot->message2 = message2;
4287 	slot->file     = file;
4288 	slot->name     = name;
4289 	slot->len      = len;
4290 	slot->type     = type;
4291 	slot->buf      = static_cast<byte*>(buf);
4292 	slot->offset   = offset;
4293 	slot->io_already_done = FALSE;
4294 
4295 #ifdef WIN_ASYNC_IO
4296 	control = &slot->control;
4297 	control->Offset = (DWORD) offset & 0xFFFFFFFF;
4298 	control->OffsetHigh = (DWORD) (offset >> 32);
4299 	ResetEvent(slot->handle);
4300 
4301 #elif defined(LINUX_NATIVE_AIO)
4302 
4303 	/* If we are not using native AIO skip this part. */
4304 	if (!srv_use_native_aio) {
4305 		goto skip_native_aio;
4306 	}
4307 
4308 	/* Check if we are dealing with 64 bit arch.
4309 	If not then make sure that offset fits in 32 bits. */
4310 	aio_offset = (off_t) offset;
4311 
4312 	ut_a(sizeof(aio_offset) >= sizeof(offset)
4313 	     || ((os_offset_t) aio_offset) == offset);
4314 
4315 	iocb = &slot->control;
4316 
4317 	if (type == OS_FILE_READ) {
4318 		io_prep_pread(iocb, file.m_file, buf, len, aio_offset);
4319 	} else {
4320 		ut_a(type == OS_FILE_WRITE);
4321 		io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset);
4322 	}
4323 
4324 	iocb->data = (void*) slot;
4325 	slot->n_bytes = 0;
4326 	slot->ret = 0;
4327 
4328 skip_native_aio:
4329 #endif /* LINUX_NATIVE_AIO */
4330 	os_mutex_exit(array->mutex);
4331 
4332 	return(slot);
4333 }
4334 
4335 /*******************************************************************//**
4336 Frees a slot in the aio array. */
4337 static
4338 void
os_aio_array_free_slot(os_aio_array_t * array,os_aio_slot_t * slot)4339 os_aio_array_free_slot(
4340 /*===================*/
4341 	os_aio_array_t*	array,	/*!< in: aio array */
4342 	os_aio_slot_t*	slot)	/*!< in: pointer to slot */
4343 {
4344 	os_mutex_enter(array->mutex);
4345 
4346 	ut_ad(slot->reserved);
4347 
4348 	slot->reserved = FALSE;
4349 
4350 	array->n_reserved--;
4351 
4352 	if (array->n_reserved == array->n_slots - 1) {
4353 		os_event_set(array->not_full);
4354 	}
4355 
4356 	if (array->n_reserved == 0) {
4357 		os_event_set(array->is_empty);
4358 	}
4359 
4360 #ifdef WIN_ASYNC_IO
4361 
4362 	ResetEvent(slot->handle);
4363 
4364 #elif defined(LINUX_NATIVE_AIO)
4365 
4366 	if (srv_use_native_aio) {
4367 		memset(&slot->control, 0x0, sizeof(slot->control));
4368 		slot->n_bytes = 0;
4369 		slot->ret = 0;
4370 		/*fprintf(stderr, "Freed up Linux native slot.\n");*/
4371 	} else {
4372 		/* These fields should not be used if we are not
4373 		using native AIO. */
4374 		ut_ad(slot->n_bytes == 0);
4375 		ut_ad(slot->ret == 0);
4376 	}
4377 
4378 #endif
4379 	os_mutex_exit(array->mutex);
4380 }
4381 
4382 /**********************************************************************//**
4383 Wakes up a simulated aio i/o-handler thread if it has something to do. */
4384 static
4385 void
os_aio_simulated_wake_handler_thread(ulint global_segment)4386 os_aio_simulated_wake_handler_thread(
4387 /*=================================*/
4388 	ulint	global_segment)	/*!< in: the number of the segment in the aio
4389 				arrays */
4390 {
4391 	os_aio_array_t*	array;
4392 	ulint		segment;
4393 
4394 	ut_ad(!srv_use_native_aio);
4395 
4396 	segment = os_aio_get_array_and_local_segment(&array, global_segment);
4397 
4398 	ulint	n = array->n_slots / array->n_segments;
4399 
4400 	segment *= n;
4401 
4402 	/* Look through n slots after the segment * n'th slot */
4403 
4404 	os_mutex_enter(array->mutex);
4405 
4406 	for (ulint i = 0; i < n; ++i) {
4407 		const os_aio_slot_t*	slot;
4408 
4409 		slot = os_aio_array_get_nth_slot(array, segment + i);
4410 
4411 		if (slot->reserved) {
4412 
4413 			/* Found an i/o request */
4414 
4415 			os_mutex_exit(array->mutex);
4416 
4417 			os_event_t	event;
4418 
4419 			event = os_aio_segment_wait_events[global_segment];
4420 
4421 			os_event_set(event);
4422 
4423 			return;
4424 		}
4425 	}
4426 
4427 	os_mutex_exit(array->mutex);
4428 }
4429 
4430 /**********************************************************************//**
4431 Wakes up simulated aio i/o-handler threads if they have something to do. */
4432 UNIV_INTERN
4433 void
os_aio_simulated_wake_handler_threads(void)4434 os_aio_simulated_wake_handler_threads(void)
4435 /*=======================================*/
4436 {
4437 	if (srv_use_native_aio) {
4438 		/* We do not use simulated aio: do nothing */
4439 
4440 		return;
4441 	}
4442 
4443 	os_aio_recommend_sleep_for_read_threads	= FALSE;
4444 
4445 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4446 		os_aio_simulated_wake_handler_thread(i);
4447 	}
4448 }
4449 
4450 /**********************************************************************//**
4451 This function can be called if one wants to post a batch of reads and
4452 prefers an i/o-handler thread to handle them all at once later. You must
4453 call os_aio_simulated_wake_handler_threads later to ensure the threads
4454 are not left sleeping! */
4455 UNIV_INTERN
4456 void
os_aio_simulated_put_read_threads_to_sleep(void)4457 os_aio_simulated_put_read_threads_to_sleep(void)
4458 /*============================================*/
4459 {
4460 
4461 /* The idea of putting background IO threads to sleep is only for
4462 Windows when using simulated AIO. Windows XP seems to schedule
4463 background threads too eagerly to allow for coalescing during
4464 readahead requests. */
4465 #ifdef __WIN__
4466 	os_aio_array_t*	array;
4467 
4468 	if (srv_use_native_aio) {
4469 		/* We do not use simulated aio: do nothing */
4470 
4471 		return;
4472 	}
4473 
4474 	os_aio_recommend_sleep_for_read_threads	= TRUE;
4475 
4476 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4477 		os_aio_get_array_and_local_segment(&array, i);
4478 
4479 		if (array == os_aio_read_array) {
4480 
4481 			os_event_reset(os_aio_segment_wait_events[i]);
4482 		}
4483 	}
4484 #endif /* __WIN__ */
4485 }
4486 
4487 #if defined(LINUX_NATIVE_AIO)
4488 /*******************************************************************//**
4489 Dispatch an AIO request to the kernel.
4490 @return	TRUE on success. */
4491 static
4492 ibool
os_aio_linux_dispatch(os_aio_array_t * array,os_aio_slot_t * slot)4493 os_aio_linux_dispatch(
4494 /*==================*/
4495 	os_aio_array_t*	array,	/*!< in: io request array. */
4496 	os_aio_slot_t*	slot)	/*!< in: an already reserved slot. */
4497 {
4498 	int		ret;
4499 	ulint		io_ctx_index;
4500 	struct iocb*	iocb;
4501 
4502 	ut_ad(slot != NULL);
4503 	ut_ad(array);
4504 
4505 	ut_a(slot->reserved);
4506 
4507 	/* Find out what we are going to work with.
4508 	The iocb struct is directly in the slot.
4509 	The io_context is one per segment. */
4510 
4511 	iocb = &slot->control;
4512 	io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
4513 
4514 	ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
4515 
4516 #if defined(UNIV_AIO_DEBUG)
4517 	fprintf(stderr,
4518 		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
4519 		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
4520 		array->aio_ctx[io_ctx_index], (ulong) io_ctx_index);
4521 #endif
4522 
4523 	/* io_submit returns number of successfully
4524 	queued requests or -errno. */
4525 	if (UNIV_UNLIKELY(ret != 1)) {
4526 		errno = -ret;
4527 		return(FALSE);
4528 	}
4529 
4530 	return(TRUE);
4531 }
4532 #endif /* LINUX_NATIVE_AIO */
4533 
4534 
4535 /*******************************************************************//**
4536 NOTE! Use the corresponding macro os_aio(), not directly this function!
4537 Requests an asynchronous i/o operation.
4538 @return	TRUE if request was queued successfully, FALSE if fail */
4539 UNIV_INTERN
4540 ibool
os_aio_func(ulint type,ulint mode,const char * name,pfs_os_file_t file,void * buf,os_offset_t offset,ulint n,fil_node_t * message1,void * message2)4541 os_aio_func(
4542 /*========*/
4543 	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
4544 	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
4545 				to OS_AIO_SIMULATED_WAKE_LATER: the
4546 				last flag advises this function not to wake
4547 				i/o-handler threads, but the caller will
4548 				do the waking explicitly later, in this
4549 				way the caller can post several requests in
4550 				a batch; NOTE that the batch must not be
4551 				so big that it exhausts the slots in aio
4552 				arrays! NOTE that a simulated batch
4553 				may introduce hidden chances of deadlocks,
4554 				because i/os are not actually handled until
4555 				all have been posted: use with great
4556 				caution! */
4557 	const char*	name,	/*!< in: name of the file or path as a
4558 				null-terminated string */
4559 	pfs_os_file_t	file,	/*!< in: handle to a file */
4560 	void*		buf,	/*!< in: buffer where to read or from which
4561 				to write */
4562 	os_offset_t	offset,	/*!< in: file offset where to read or write */
4563 	ulint		n,	/*!< in: number of bytes to read or write */
4564 	fil_node_t*	message1,/*!< in: message for the aio handler
4565 				(can be used to identify a completed
4566 				aio operation); ignored if mode is
4567 				OS_AIO_SYNC */
4568 	void*		message2)/*!< in: message for the aio handler
4569 				(can be used to identify a completed
4570 				aio operation); ignored if mode is
4571 				OS_AIO_SYNC */
4572 {
4573 	os_aio_array_t*	array;
4574 	os_aio_slot_t*	slot;
4575 #ifdef WIN_ASYNC_IO
4576 	ibool		retval;
4577 	BOOL		ret		= TRUE;
4578 	DWORD		len		= (DWORD) n;
4579 	struct fil_node_t* dummy_mess1;
4580 	void*		dummy_mess2;
4581 	ulint		dummy_type;
4582 #endif /* WIN_ASYNC_IO */
4583 	ulint		wake_later;
4584 	ut_ad(file.m_file);
4585 	ut_ad(buf);
4586 	ut_ad(n > 0);
4587 	ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
4588 	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
4589 	ut_ad(os_aio_validate_skip());
4590 #ifdef WIN_ASYNC_IO
4591 	ut_ad((n & 0xFFFFFFFFUL) == n);
4592 #endif
4593 
4594 	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
4595 	mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
4596 
4597 	if (mode == OS_AIO_SYNC
4598 #ifdef WIN_ASYNC_IO
4599 	    && !srv_use_native_aio
4600 #endif /* WIN_ASYNC_IO */
4601 	    ) {
4602 		/* This is actually an ordinary synchronous read or write:
4603 		no need to use an i/o-handler thread. NOTE that if we use
4604 		Windows async i/o, Windows does not allow us to use
4605 		ordinary synchronous os_file_read etc. on the same file,
4606 		therefore we have built a special mechanism for synchronous
4607 		wait in the Windows case.
4608 		Also note that the Performance Schema instrumentation has
4609 		been performed by current os_aio_func()'s wrapper function
4610 		pfs_os_aio_func(). So we would no longer need to call
4611 		Performance Schema instrumented os_file_read() and
4612 		os_file_write(). Instead, we should use os_file_read_func()
4613 		and os_file_write_func() */
4614 
4615 		if (type == OS_FILE_READ) {
4616 			return(os_file_read_func(file.m_file, buf, offset, n));
4617 		}
4618 		ut_ad(!srv_read_only_mode);
4619 		ut_a(type == OS_FILE_WRITE);
4620 		return(os_file_write_func(name, file.m_file, buf, offset, n));
4621 	}
4622 
4623 try_again:
4624 	switch (mode) {
4625 	case OS_AIO_NORMAL:
4626 		if (type == OS_FILE_READ) {
4627 			array = os_aio_read_array;
4628 		} else {
4629 			ut_ad(!srv_read_only_mode);
4630 			array = os_aio_write_array;
4631 		}
4632 		break;
4633 	case OS_AIO_IBUF:
4634 		ut_ad(type == OS_FILE_READ);
4635 		/* Reduce probability of deadlock bugs in connection with ibuf:
4636 		do not let the ibuf i/o handler sleep */
4637 
4638 		wake_later = FALSE;
4639 
4640 		if (srv_read_only_mode) {
4641 			array = os_aio_read_array;
4642 		} else {
4643 			array = os_aio_ibuf_array;
4644 		}
4645 		break;
4646 	case OS_AIO_LOG:
4647 		if (srv_read_only_mode) {
4648 			array = os_aio_read_array;
4649 		} else {
4650 			array = os_aio_log_array;
4651 		}
4652 		break;
4653 	case OS_AIO_SYNC:
4654 		array = os_aio_sync_array;
4655 #if defined(LINUX_NATIVE_AIO)
4656 		/* In Linux native AIO we don't use sync IO array. */
4657 		ut_a(!srv_use_native_aio);
4658 #endif /* LINUX_NATIVE_AIO */
4659 		break;
4660 	default:
4661 		ut_error;
4662 		array = NULL; /* Eliminate compiler warning */
4663 	}
4664 
4665 	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
4666 					 name, buf, offset, n);
4667 	if (type == OS_FILE_READ) {
4668 		if (srv_use_native_aio) {
4669 			os_n_file_reads++;
4670 			os_bytes_read_since_printout += n;
4671 #ifdef WIN_ASYNC_IO
4672 			ret = ReadFile(file.m_file, buf, (DWORD) n, &len,
4673 				       &(slot->control));
4674 #elif defined(LINUX_NATIVE_AIO)
4675 			if (!os_aio_linux_dispatch(array, slot)) {
4676 				goto err_exit;
4677 			}
4678 #endif /* WIN_ASYNC_IO */
4679 		} else {
4680 			if (!wake_later) {
4681 				os_aio_simulated_wake_handler_thread(
4682 					os_aio_get_segment_no_from_slot(
4683 						array, slot));
4684 			}
4685 		}
4686 	} else if (type == OS_FILE_WRITE) {
4687 		ut_ad(!srv_read_only_mode);
4688 		if (srv_use_native_aio) {
4689 			os_n_file_writes++;
4690 #ifdef WIN_ASYNC_IO
4691 			ret = WriteFile(file.m_file, buf, (DWORD) n, &len,
4692 					&(slot->control));
4693 #elif defined(LINUX_NATIVE_AIO)
4694 			if (!os_aio_linux_dispatch(array, slot)) {
4695 				goto err_exit;
4696 			}
4697 #endif /* WIN_ASYNC_IO */
4698 		} else {
4699 			if (!wake_later) {
4700 				os_aio_simulated_wake_handler_thread(
4701 					os_aio_get_segment_no_from_slot(
4702 						array, slot));
4703 			}
4704 		}
4705 	} else {
4706 		ut_error;
4707 	}
4708 
4709 #ifdef WIN_ASYNC_IO
4710 	if (srv_use_native_aio) {
4711 		if ((ret && len == n)
4712 		    || (!ret && GetLastError() == ERROR_IO_PENDING)) {
4713 			/* aio was queued successfully! */
4714 
4715 			if (mode == OS_AIO_SYNC) {
4716 				/* We want a synchronous i/o operation on a
4717 				file where we also use async i/o: in Windows
4718 				we must use the same wait mechanism as for
4719 				async i/o */
4720 
4721 				retval = os_aio_windows_handle(
4722 					ULINT_UNDEFINED, slot->pos,
4723 					&dummy_mess1, &dummy_mess2,
4724 					&dummy_type);
4725 
4726 				return(retval);
4727 			}
4728 
4729 			return(TRUE);
4730 		}
4731 
4732 		goto err_exit;
4733 	}
4734 #endif /* WIN_ASYNC_IO */
4735 	/* aio was queued successfully! */
4736 	return(TRUE);
4737 
4738 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4739 err_exit:
4740 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
4741 	os_aio_array_free_slot(array, slot);
4742 
4743 	if (os_file_handle_error(
4744 		name,type == OS_FILE_READ ? "aio read" : "aio write")) {
4745 
4746 		goto try_again;
4747 	}
4748 
4749 	return(FALSE);
4750 }
4751 
4752 #ifdef WIN_ASYNC_IO
4753 /**********************************************************************//**
4754 This function is only used in Windows asynchronous i/o.
4755 Waits for an aio operation to complete. This function is used to wait the
4756 for completed requests. The aio array of pending requests is divided
4757 into segments. The thread specifies which segment or slot it wants to wait
4758 for. NOTE: this function will also take care of freeing the aio slot,
4759 therefore no other thread is allowed to do the freeing!
4760 @return	TRUE if the aio operation succeeded */
4761 UNIV_INTERN
4762 ibool
os_aio_windows_handle(ulint segment,ulint pos,fil_node_t ** message1,void ** message2,ulint * type)4763 os_aio_windows_handle(
4764 /*==================*/
4765 	ulint	segment,	/*!< in: the number of the segment in the aio
4766 				arrays to wait for; segment 0 is the ibuf
4767 				i/o thread, segment 1 the log i/o thread,
4768 				then follow the non-ibuf read threads, and as
4769 				the last are the non-ibuf write threads; if
4770 				this is ULINT_UNDEFINED, then it means that
4771 				sync aio is used, and this parameter is
4772 				ignored */
4773 	ulint	pos,		/*!< this parameter is used only in sync aio:
4774 				wait for the aio slot at this position */
4775 	fil_node_t**message1,	/*!< out: the messages passed with the aio
4776 				request; note that also in the case where
4777 				the aio operation failed, these output
4778 				parameters are valid and can be used to
4779 				restart the operation, for example */
4780 	void**	message2,
4781 	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
4782 {
4783 	ulint		orig_seg	= segment;
4784 	os_aio_array_t*	array;
4785 	os_aio_slot_t*	slot;
4786 	ulint		n;
4787 	ulint		i;
4788 	ibool		ret_val;
4789 	BOOL		ret;
4790 	DWORD		len;
4791 	BOOL		retry		= FALSE;
4792 
4793 	if (segment == ULINT_UNDEFINED) {
4794 		segment = 0;
4795 		array = os_aio_sync_array;
4796 	} else {
4797 		segment = os_aio_get_array_and_local_segment(&array, segment);
4798 	}
4799 
4800 	/* NOTE! We only access constant fields in os_aio_array. Therefore
4801 	we do not have to acquire the protecting mutex yet */
4802 
4803 	ut_ad(os_aio_validate_skip());
4804 	ut_ad(segment < array->n_segments);
4805 
4806 	n = array->n_slots / array->n_segments;
4807 
4808 	if (array == os_aio_sync_array) {
4809 
4810 		WaitForSingleObject(
4811 			os_aio_array_get_nth_slot(array, pos)->handle,
4812 			INFINITE);
4813 
4814 		i = pos;
4815 
4816 	} else {
4817 		if (orig_seg != ULINT_UNDEFINED) {
4818 			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
4819 		}
4820 
4821 		i = WaitForMultipleObjects(
4822 			(DWORD) n, array->handles + segment * n,
4823 			FALSE, INFINITE);
4824 	}
4825 
4826 	os_mutex_enter(array->mutex);
4827 
4828 	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
4829 	    && array->n_reserved == 0) {
4830 		*message1 = NULL;
4831 		*message2 = NULL;
4832 		os_mutex_exit(array->mutex);
4833 		return(TRUE);
4834 	}
4835 
4836 	ut_a(i >= WAIT_OBJECT_0 && i <= WAIT_OBJECT_0 + n);
4837 
4838 	slot = os_aio_array_get_nth_slot(array, i + segment * n);
4839 
4840 	ut_a(slot->reserved);
4841 
4842 	if (orig_seg != ULINT_UNDEFINED) {
4843 		srv_set_io_thread_op_info(
4844 			orig_seg, "get windows aio return value");
4845 	}
4846 	ret = GetOverlappedResult(slot->file.m_file, &(slot->control), &len, TRUE);
4847 
4848 	*message1 = slot->message1;
4849 	*message2 = slot->message2;
4850 
4851 	*type = slot->type;
4852 
4853 	if (ret && len == slot->len) {
4854 
4855 		ret_val = TRUE;
4856 	} else if (os_file_handle_error(slot->name, "Windows aio")) {
4857 
4858 		retry = TRUE;
4859 	} else {
4860 
4861 		ret_val = FALSE;
4862 	}
4863 
4864 	os_mutex_exit(array->mutex);
4865 
4866 	if (retry) {
4867 		/* retry failed read/write operation synchronously.
4868 		No need to hold array->mutex. */
4869 
4870 #ifdef UNIV_PFS_IO
4871 		/* This read/write does not go through os_file_read
4872 		and os_file_write APIs, need to register with
4873 		performance schema explicitly here. */
4874 		struct PSI_file_locker* locker = NULL;
4875 		PSI_file_locker_state	state;
4876 		register_pfs_file_io_begin(&state, locker, slot->file, slot->len,
4877 					   (slot->type == OS_FILE_WRITE)
4878 						? PSI_FILE_WRITE
4879 						: PSI_FILE_READ,
4880 					    __FILE__, __LINE__);
4881 #endif
4882 
4883 		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
4884 
4885 		switch (slot->type) {
4886 		case OS_FILE_WRITE:
4887 			ret = WriteFile(slot->file.m_file, slot->buf,
4888 					(DWORD) slot->len, &len,
4889 					&(slot->control));
4890 			break;
4891 		case OS_FILE_READ:
4892 			ret = ReadFile(slot->file.m_file, slot->buf,
4893 				       (DWORD) slot->len, &len,
4894 				       &(slot->control));
4895 			break;
4896 		default:
4897 			ut_error;
4898 		}
4899 
4900 #ifdef UNIV_PFS_IO
4901 		register_pfs_file_io_end(locker, len);
4902 #endif
4903 
4904 		if (!ret && GetLastError() == ERROR_IO_PENDING) {
4905 			/* aio was queued successfully!
4906 			We want a synchronous i/o operation on a
4907 			file where we also use async i/o: in Windows
4908 			we must use the same wait mechanism as for
4909 			async i/o */
4910 			ret = GetOverlappedResult(slot->file.m_file,
4911 						  &(slot->control),
4912 						  &len, TRUE);
4913 		}
4914 
4915 		ret_val = ret && len == slot->len;
4916 	}
4917 
4918 	os_aio_array_free_slot(array, slot);
4919 
4920 	return(ret_val);
4921 }
4922 #endif
4923 
4924 #if defined(LINUX_NATIVE_AIO)
4925 /******************************************************************//**
4926 This function is only used in Linux native asynchronous i/o. This is
4927 called from within the io-thread. If there are no completed IO requests
4928 in the slot array, the thread calls this function to collect more
4929 requests from the kernel.
4930 The io-thread waits on io_getevents(), which is a blocking call, with
4931 a timeout value. Unless the system is very heavy loaded, keeping the
4932 io-thread very busy, the io-thread will spend most of its time waiting
4933 in this function.
4934 The io-thread also exits in this function. It checks server status at
4935 each wakeup and that is why we use timed wait in io_getevents(). */
4936 static
4937 void
os_aio_linux_collect(os_aio_array_t * array,ulint segment,ulint seg_size)4938 os_aio_linux_collect(
4939 /*=================*/
4940 	os_aio_array_t* array,		/*!< in/out: slot array. */
4941 	ulint		segment,	/*!< in: local segment no. */
4942 	ulint		seg_size)	/*!< in: segment size. */
4943 {
4944 	int			i;
4945 	int			ret;
4946 	ulint			start_pos;
4947 	ulint			end_pos;
4948 	struct timespec		timeout;
4949 	struct io_event*	events;
4950 	struct io_context*	io_ctx;
4951 
4952 	/* sanity checks. */
4953 	ut_ad(array != NULL);
4954 	ut_ad(seg_size > 0);
4955 	ut_ad(segment < array->n_segments);
4956 
4957 	/* Which part of event array we are going to work on. */
4958 	events = &array->aio_events[segment * seg_size];
4959 
4960 	/* Which io_context we are going to use. */
4961 	io_ctx = array->aio_ctx[segment];
4962 
4963 	/* Starting point of the segment we will be working on. */
4964 	start_pos = segment * seg_size;
4965 
4966 	/* End point. */
4967 	end_pos = start_pos + seg_size;
4968 
4969 retry:
4970 
4971 	/* Initialize the events. The timeout value is arbitrary.
4972 	We probably need to experiment with it a little. */
4973 	memset(events, 0, sizeof(*events) * seg_size);
4974 	timeout.tv_sec = 0;
4975 	timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4976 
4977 	ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4978 
4979 	if (ret > 0) {
4980 		for (i = 0; i < ret; i++) {
4981 			os_aio_slot_t*	slot;
4982 			struct iocb*	control;
4983 
4984 			control = (struct iocb*) events[i].obj;
4985 			ut_a(control != NULL);
4986 
4987 			slot = (os_aio_slot_t*) control->data;
4988 
4989 			/* Some sanity checks. */
4990 			ut_a(slot != NULL);
4991 			ut_a(slot->reserved);
4992 
4993 #if defined(UNIV_AIO_DEBUG)
4994 			fprintf(stderr,
4995 				"io_getevents[%c]: slot[%p] ctx[%p]"
4996 				" seg[%lu]\n",
4997 				(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4998 				slot, io_ctx, segment);
4999 #endif
5000 
5001 			/* We are not scribbling previous segment. */
5002 			ut_a(slot->pos >= start_pos);
5003 
5004 			/* We have not overstepped to next segment. */
5005 			ut_a(slot->pos < end_pos);
5006 
5007 			/* Mark this request as completed. The error handling
5008 			will be done in the calling function. */
5009 			os_mutex_enter(array->mutex);
5010 			slot->n_bytes = events[i].res;
5011 			slot->ret = events[i].res2;
5012 			slot->io_already_done = TRUE;
5013 			os_mutex_exit(array->mutex);
5014 		}
5015 		return;
5016 	}
5017 
5018 	if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
5019 		return;
5020 	}
5021 
5022 	/* This error handling is for any error in collecting the
5023 	IO requests. The errors, if any, for any particular IO
5024 	request are simply passed on to the calling routine. */
5025 
5026 	switch (ret) {
5027 	case -EAGAIN:
5028 		/* Not enough resources! Try again. */
5029 	case -EINTR:
5030 		/* Interrupted! I have tested the behaviour in case of an
5031 		interrupt. If we have some completed IOs available then
5032 		the return code will be the number of IOs. We get EINTR only
5033 		if there are no completed IOs and we have been interrupted. */
5034 	case 0:
5035 		/* No pending request! Go back and check again. */
5036 		goto retry;
5037 	}
5038 
5039 	/* All other errors should cause a trap for now. */
5040 	ut_print_timestamp(stderr);
5041 	fprintf(stderr,
5042 		" InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
5043 		ret);
5044 	ut_error;
5045 }
5046 
5047 /**********************************************************************//**
5048 This function is only used in Linux native asynchronous i/o.
5049 Waits for an aio operation to complete. This function is used to wait for
5050 the completed requests. The aio array of pending requests is divided
5051 into segments. The thread specifies which segment or slot it wants to wait
5052 for. NOTE: this function will also take care of freeing the aio slot,
5053 therefore no other thread is allowed to do the freeing!
5054 @return	TRUE if the IO was successful */
5055 UNIV_INTERN
5056 ibool
os_aio_linux_handle(ulint global_seg,fil_node_t ** message1,void ** message2,ulint * type)5057 os_aio_linux_handle(
5058 /*================*/
5059 	ulint	global_seg,	/*!< in: segment number in the aio array
5060 				to wait for; segment 0 is the ibuf
5061 				i/o thread, segment 1 is log i/o thread,
5062 				then follow the non-ibuf read threads,
5063 				and the last are the non-ibuf write
5064 				threads. */
5065 	fil_node_t**message1,	/*!< out: the messages passed with the */
5066 	void**	message2,	/*!< aio request; note that in case the
5067 				aio operation failed, these output
5068 				parameters are valid and can be used to
5069 				restart the operation. */
5070 	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
5071 {
5072 	ulint		segment;
5073 	os_aio_array_t*	array;
5074 	os_aio_slot_t*	slot;
5075 	ulint		n;
5076 	ulint		i;
5077 	ibool		ret = FALSE;
5078 
5079 	/* Should never be doing Sync IO here. */
5080 	ut_a(global_seg != ULINT_UNDEFINED);
5081 
5082 	/* Find the array and the local segment. */
5083 	segment = os_aio_get_array_and_local_segment(&array, global_seg);
5084 	n = array->n_slots / array->n_segments;
5085 
5086 	/* Loop until we have found a completed request. */
5087 	for (;;) {
5088 		ibool	any_reserved = FALSE;
5089 		os_mutex_enter(array->mutex);
5090 		for (i = 0; i < n; ++i) {
5091 			slot = os_aio_array_get_nth_slot(
5092 				array, i + segment * n);
5093 			if (!slot->reserved) {
5094 				continue;
5095 			} else if (slot->io_already_done) {
5096 				/* Something for us to work on. */
5097 				goto found;
5098 			} else {
5099 				any_reserved = TRUE;
5100 			}
5101 		}
5102 
5103 		os_mutex_exit(array->mutex);
5104 
5105 		/* There is no completed request.
5106 		If there is no pending request at all,
5107 		and the system is being shut down, exit. */
5108 		if (UNIV_UNLIKELY
5109 		    (!any_reserved
5110 		     && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
5111 			*message1 = NULL;
5112 			*message2 = NULL;
5113 			return(TRUE);
5114 		}
5115 
5116 		/* Wait for some request. Note that we return
5117 		from wait iff we have found a request. */
5118 
5119 		srv_set_io_thread_op_info(global_seg,
5120 			"waiting for completed aio requests");
5121 		os_aio_linux_collect(array, segment, n);
5122 	}
5123 
5124 found:
5125 	/* Note that it may be that there are more then one completed
5126 	IO requests. We process them one at a time. We may have a case
5127 	here to improve the performance slightly by dealing with all
5128 	requests in one sweep. */
5129 	srv_set_io_thread_op_info(global_seg,
5130 				"processing completed aio requests");
5131 
5132 	/* Ensure that we are scribbling only our segment. */
5133 	ut_a(i < n);
5134 
5135 	ut_ad(slot != NULL);
5136 	ut_ad(slot->reserved);
5137 	ut_ad(slot->io_already_done);
5138 
5139 	*message1 = slot->message1;
5140 	*message2 = slot->message2;
5141 
5142 	*type = slot->type;
5143 
5144 	if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
5145 
5146 		ret = TRUE;
5147 	} else {
5148 		errno = -slot->ret;
5149 
5150 		/* os_file_handle_error does tell us if we should retry
5151 		this IO. As it stands now, we don't do this retry when
5152 		reaping requests from a different context than
5153 		the dispatcher. This non-retry logic is the same for
5154 		windows and linux native AIO.
5155 		We should probably look into this to transparently
5156 		re-submit the IO. */
5157 		os_file_handle_error(slot->name, "Linux aio");
5158 
5159 		ret = FALSE;
5160 	}
5161 
5162 	os_mutex_exit(array->mutex);
5163 
5164 	os_aio_array_free_slot(array, slot);
5165 
5166 	return(ret);
5167 }
5168 #endif /* LINUX_NATIVE_AIO */
5169 
5170 /**********************************************************************//**
5171 Does simulated aio. This function should be called by an i/o-handler
5172 thread.
5173 @return	TRUE if the aio operation succeeded */
5174 UNIV_INTERN
5175 ibool
os_aio_simulated_handle(ulint global_segment,fil_node_t ** message1,void ** message2,ulint * type)5176 os_aio_simulated_handle(
5177 /*====================*/
5178 	ulint	global_segment,	/*!< in: the number of the segment in the aio
5179 				arrays to wait for; segment 0 is the ibuf
5180 				i/o thread, segment 1 the log i/o thread,
5181 				then follow the non-ibuf read threads, and as
5182 				the last are the non-ibuf write threads */
5183 	fil_node_t**message1,	/*!< out: the messages passed with the aio
5184 				request; note that also in the case where
5185 				the aio operation failed, these output
5186 				parameters are valid and can be used to
5187 				restart the operation, for example */
5188 	void**	message2,
5189 	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
5190 {
5191 	os_aio_array_t*	array;
5192 	ulint		segment;
5193 	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
5194 	ulint		n_consecutive;
5195 	ulint		total_len;
5196 	ulint		offs;
5197 	os_offset_t	lowest_offset;
5198 	ulint		biggest_age;
5199 	ulint		age;
5200 	byte*		combined_buf;
5201 	byte*		combined_buf2;
5202 	ibool		ret;
5203 	ibool		any_reserved;
5204 	ulint		n;
5205 	os_aio_slot_t*	aio_slot;
5206 
5207 	/* Fix compiler warning */
5208 	*consecutive_ios = NULL;
5209 
5210 	segment = os_aio_get_array_and_local_segment(&array, global_segment);
5211 
5212 restart:
5213 	/* NOTE! We only access constant fields in os_aio_array. Therefore
5214 	we do not have to acquire the protecting mutex yet */
5215 
5216 	srv_set_io_thread_op_info(global_segment,
5217 				  "looking for i/o requests (a)");
5218 	ut_ad(os_aio_validate_skip());
5219 	ut_ad(segment < array->n_segments);
5220 
5221 	n = array->n_slots / array->n_segments;
5222 
5223 	/* Look through n slots after the segment * n'th slot */
5224 
5225 	if (array == os_aio_read_array
5226 	    && os_aio_recommend_sleep_for_read_threads) {
5227 
5228 		/* Give other threads chance to add several i/os to the array
5229 		at once. */
5230 
5231 		goto recommended_sleep;
5232 	}
5233 
5234 	srv_set_io_thread_op_info(global_segment,
5235 				  "looking for i/o requests (b)");
5236 
5237 	/* Check if there is a slot for which the i/o has already been
5238 	done */
5239 	any_reserved = FALSE;
5240 
5241 	os_mutex_enter(array->mutex);
5242 
5243 	for (ulint i = 0; i < n; i++) {
5244 		os_aio_slot_t*	slot;
5245 
5246 		slot = os_aio_array_get_nth_slot(array, i + segment * n);
5247 
5248 		if (!slot->reserved) {
5249 			continue;
5250 		} else if (slot->io_already_done) {
5251 
5252 			if (os_aio_print_debug) {
5253 				fprintf(stderr,
5254 					"InnoDB: i/o for slot %lu"
5255 					" already done, returning\n",
5256 					(ulong) i);
5257 			}
5258 
5259 			aio_slot = slot;
5260 			ret = TRUE;
5261 			goto slot_io_done;
5262 		} else {
5263 			any_reserved = TRUE;
5264 		}
5265 	}
5266 
5267 	/* There is no completed request.
5268 	If there is no pending request at all,
5269 	and the system is being shut down, exit. */
5270 	if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
5271 		os_mutex_exit(array->mutex);
5272 		*message1 = NULL;
5273 		*message2 = NULL;
5274 		return(TRUE);
5275 	}
5276 
5277 	n_consecutive = 0;
5278 
5279 	/* If there are at least 2 seconds old requests, then pick the oldest
5280 	one to prevent starvation. If several requests have the same age,
5281 	then pick the one at the lowest offset. */
5282 
5283 	biggest_age = 0;
5284 	lowest_offset = IB_UINT64_MAX;
5285 
5286 	for (ulint i = 0; i < n; i++) {
5287 		os_aio_slot_t*	slot;
5288 
5289 		slot = os_aio_array_get_nth_slot(array, i + segment * n);
5290 
5291 		if (slot->reserved) {
5292 
5293 			age = (ulint) difftime(
5294 				ut_time(), slot->reservation_time);
5295 
5296 			if ((age >= 2 && age > biggest_age)
5297 			    || (age >= 2 && age == biggest_age
5298 				&& slot->offset < lowest_offset)) {
5299 
5300 				/* Found an i/o request */
5301 				consecutive_ios[0] = slot;
5302 
5303 				n_consecutive = 1;
5304 
5305 				biggest_age = age;
5306 				lowest_offset = slot->offset;
5307 			}
5308 		}
5309 	}
5310 
5311 	if (n_consecutive == 0) {
5312 		/* There were no old requests. Look for an i/o request at the
5313 		lowest offset in the array (we ignore the high 32 bits of the
5314 		offset in these heuristics) */
5315 
5316 		lowest_offset = IB_UINT64_MAX;
5317 
5318 		for (ulint i = 0; i < n; i++) {
5319 			os_aio_slot_t*	slot;
5320 
5321 			slot = os_aio_array_get_nth_slot(
5322 				array, i + segment * n);
5323 
5324 			if (slot->reserved && slot->offset < lowest_offset) {
5325 
5326 				/* Found an i/o request */
5327 				consecutive_ios[0] = slot;
5328 
5329 				n_consecutive = 1;
5330 
5331 				lowest_offset = slot->offset;
5332 			}
5333 		}
5334 	}
5335 
5336 	if (n_consecutive == 0) {
5337 
5338 		/* No i/o requested at the moment */
5339 
5340 		goto wait_for_io;
5341 	}
5342 
5343 	/* if n_consecutive != 0, then we have assigned
5344 	something valid to consecutive_ios[0] */
5345 	ut_ad(n_consecutive != 0);
5346 	ut_ad(consecutive_ios[0] != NULL);
5347 
5348 	aio_slot = consecutive_ios[0];
5349 
5350 	/* Check if there are several consecutive blocks to read or write */
5351 
5352 consecutive_loop:
5353 	for (ulint i = 0; i < n; i++) {
5354 		os_aio_slot_t*	slot;
5355 
5356 		slot = os_aio_array_get_nth_slot(array, i + segment * n);
5357 		if (slot->reserved
5358 		    && slot != aio_slot
5359 		    && slot->offset == aio_slot->offset + aio_slot->len
5360 		    && slot->type == aio_slot->type
5361 		    && slot->file.m_file == aio_slot->file.m_file) {
5362 
5363 			/* Found a consecutive i/o request */
5364 
5365 			consecutive_ios[n_consecutive] = slot;
5366 			n_consecutive++;
5367 
5368 			aio_slot = slot;
5369 
5370 			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
5371 
5372 				goto consecutive_loop;
5373 			} else {
5374 				break;
5375 			}
5376 		}
5377 	}
5378 
5379 	srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
5380 
5381 	/* We have now collected n_consecutive i/o requests in the array;
5382 	allocate a single buffer which can hold all data, and perform the
5383 	i/o */
5384 
5385 	total_len = 0;
5386 	aio_slot = consecutive_ios[0];
5387 
5388 	for (ulint i = 0; i < n_consecutive; i++) {
5389 		total_len += consecutive_ios[i]->len;
5390 	}
5391 
5392 	if (n_consecutive == 1) {
5393 		/* We can use the buffer of the i/o request */
5394 		combined_buf = aio_slot->buf;
5395 		combined_buf2 = NULL;
5396 	} else {
5397 		combined_buf2 = static_cast<byte*>(
5398 			ut_malloc(total_len + UNIV_PAGE_SIZE));
5399 
5400 		ut_a(combined_buf2);
5401 
5402 		combined_buf = static_cast<byte*>(
5403 			ut_align(combined_buf2, UNIV_PAGE_SIZE));
5404 	}
5405 
5406 	/* We release the array mutex for the time of the i/o: NOTE that
5407 	this assumes that there is just one i/o-handler thread serving
5408 	a single segment of slots! */
5409 
5410 	os_mutex_exit(array->mutex);
5411 
5412 	if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
5413 		/* Copy the buffers to the combined buffer */
5414 		offs = 0;
5415 
5416 		for (ulint i = 0; i < n_consecutive; i++) {
5417 
5418 			ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
5419 				  consecutive_ios[i]->len);
5420 
5421 			offs += consecutive_ios[i]->len;
5422 		}
5423 	}
5424 
5425 	srv_set_io_thread_op_info(global_segment, "doing file i/o");
5426 
5427 	/* Do the i/o with ordinary, synchronous i/o functions: */
5428 	if (aio_slot->type == OS_FILE_WRITE) {
5429 		ut_ad(!srv_read_only_mode);
5430 		ret = os_file_write(
5431 			aio_slot->name, aio_slot->file, combined_buf,
5432 			aio_slot->offset, total_len);
5433 	} else {
5434 		ret = os_file_read(
5435 			aio_slot->file, combined_buf,
5436 			aio_slot->offset, total_len);
5437 	}
5438 
5439 	ut_a(ret);
5440 	srv_set_io_thread_op_info(global_segment, "file i/o done");
5441 
5442 	if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
5443 		/* Copy the combined buffer to individual buffers */
5444 		offs = 0;
5445 
5446 		for (ulint i = 0; i < n_consecutive; i++) {
5447 
5448 			ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
5449 				  consecutive_ios[i]->len);
5450 			offs += consecutive_ios[i]->len;
5451 		}
5452 	}
5453 
5454 	if (combined_buf2) {
5455 		ut_free(combined_buf2);
5456 	}
5457 
5458 	os_mutex_enter(array->mutex);
5459 
5460 	/* Mark the i/os done in slots */
5461 
5462 	for (ulint i = 0; i < n_consecutive; i++) {
5463 		consecutive_ios[i]->io_already_done = TRUE;
5464 	}
5465 
5466 	/* We return the messages for the first slot now, and if there were
5467 	several slots, the messages will be returned with subsequent calls
5468 	of this function */
5469 
5470 slot_io_done:
5471 
5472 	ut_a(aio_slot->reserved);
5473 
5474 	*message1 = aio_slot->message1;
5475 	*message2 = aio_slot->message2;
5476 
5477 	*type = aio_slot->type;
5478 
5479 	os_mutex_exit(array->mutex);
5480 
5481 	os_aio_array_free_slot(array, aio_slot);
5482 
5483 	return(ret);
5484 
5485 wait_for_io:
5486 	srv_set_io_thread_op_info(global_segment, "resetting wait event");
5487 
5488 	/* We wait here until there again can be i/os in the segment
5489 	of this thread */
5490 
5491 	os_event_reset(os_aio_segment_wait_events[global_segment]);
5492 
5493 	os_mutex_exit(array->mutex);
5494 
5495 recommended_sleep:
5496 	srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
5497 
5498 	os_event_wait(os_aio_segment_wait_events[global_segment]);
5499 
5500 	goto restart;
5501 }
5502 
5503 /**********************************************************************//**
5504 Validates the consistency of an aio array.
5505 @return	true if ok */
5506 static
5507 bool
os_aio_array_validate(os_aio_array_t * array)5508 os_aio_array_validate(
5509 /*==================*/
5510 	os_aio_array_t*	array)	/*!< in: aio wait array */
5511 {
5512 	ulint		i;
5513 	ulint		n_reserved	= 0;
5514 
5515 	os_mutex_enter(array->mutex);
5516 
5517 	ut_a(array->n_slots > 0);
5518 	ut_a(array->n_segments > 0);
5519 
5520 	for (i = 0; i < array->n_slots; i++) {
5521 		os_aio_slot_t*	slot;
5522 
5523 		slot = os_aio_array_get_nth_slot(array, i);
5524 
5525 		if (slot->reserved) {
5526 			n_reserved++;
5527 			ut_a(slot->len > 0);
5528 		}
5529 	}
5530 
5531 	ut_a(array->n_reserved == n_reserved);
5532 
5533 	os_mutex_exit(array->mutex);
5534 
5535 	return(true);
5536 }
5537 
5538 /**********************************************************************//**
5539 Validates the consistency the aio system.
5540 @return	TRUE if ok */
5541 UNIV_INTERN
5542 ibool
os_aio_validate(void)5543 os_aio_validate(void)
5544 /*=================*/
5545 {
5546 	os_aio_array_validate(os_aio_read_array);
5547 
5548 	if (os_aio_write_array != 0) {
5549 		os_aio_array_validate(os_aio_write_array);
5550 	}
5551 
5552 	if (os_aio_ibuf_array != 0) {
5553 		os_aio_array_validate(os_aio_ibuf_array);
5554 	}
5555 
5556 	if (os_aio_log_array != 0) {
5557 		os_aio_array_validate(os_aio_log_array);
5558 	}
5559 
5560 	if (os_aio_sync_array != 0) {
5561 		os_aio_array_validate(os_aio_sync_array);
5562 	}
5563 
5564 	return(TRUE);
5565 }
5566 
5567 /**********************************************************************//**
5568 Prints pending IO requests per segment of an aio array.
5569 We probably don't need per segment statistics but they can help us
5570 during development phase to see if the IO requests are being
5571 distributed as expected. */
5572 static
5573 void
os_aio_print_segment_info(FILE * file,ulint * n_seg,os_aio_array_t * array)5574 os_aio_print_segment_info(
5575 /*======================*/
5576 	FILE*		file,	/*!< in: file where to print */
5577 	ulint*		n_seg,	/*!< in: pending IO array */
5578 	os_aio_array_t*	array)	/*!< in: array to process */
5579 {
5580 	ulint	i;
5581 
5582 	ut_ad(array);
5583 	ut_ad(n_seg);
5584 	ut_ad(array->n_segments > 0);
5585 
5586 	if (array->n_segments == 1) {
5587 		return;
5588 	}
5589 
5590 	fprintf(file, " [");
5591 	for (i = 0; i < array->n_segments; i++) {
5592 		if (i != 0) {
5593 			fprintf(file, ", ");
5594 		}
5595 
5596 		fprintf(file, "%lu", n_seg[i]);
5597 	}
5598 	fprintf(file, "] ");
5599 }
5600 
5601 /**********************************************************************//**
5602 Prints info about the aio array. */
5603 UNIV_INTERN
5604 void
os_aio_print_array(FILE * file,os_aio_array_t * array)5605 os_aio_print_array(
5606 /*==============*/
5607 	FILE*		file,	/*!< in: file where to print */
5608 	os_aio_array_t*	array)	/*!< in: aio array to print */
5609 {
5610 	ulint			n_reserved = 0;
5611 	ulint			n_res_seg[SRV_MAX_N_IO_THREADS];
5612 
5613 	os_mutex_enter(array->mutex);
5614 
5615 	ut_a(array->n_slots > 0);
5616 	ut_a(array->n_segments > 0);
5617 
5618 	memset(n_res_seg, 0x0, sizeof(n_res_seg));
5619 
5620 	for (ulint i = 0; i < array->n_slots; ++i) {
5621 		os_aio_slot_t*	slot;
5622 		ulint		seg_no;
5623 
5624 		slot = os_aio_array_get_nth_slot(array, i);
5625 
5626 		seg_no = (i * array->n_segments) / array->n_slots;
5627 
5628 		if (slot->reserved) {
5629 			++n_reserved;
5630 			++n_res_seg[seg_no];
5631 
5632 			ut_a(slot->len > 0);
5633 		}
5634 	}
5635 
5636 	ut_a(array->n_reserved == n_reserved);
5637 
5638 	fprintf(file, " %lu", (ulong) n_reserved);
5639 
5640 	os_aio_print_segment_info(file, n_res_seg, array);
5641 
5642 	os_mutex_exit(array->mutex);
5643 }
5644 
5645 /**********************************************************************//**
5646 Prints info of the aio arrays. */
5647 UNIV_INTERN
5648 void
os_aio_print(FILE * file)5649 os_aio_print(
5650 /*=========*/
5651 	FILE*	file)	/*!< in: file where to print */
5652 {
5653 	time_t		current_time;
5654 	double		time_elapsed;
5655 	double		avg_bytes_read;
5656 
5657 	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
5658 		fprintf(file, "I/O thread %lu state: %s (%s)",
5659 			(ulong) i,
5660 			srv_io_thread_op_info[i],
5661 			srv_io_thread_function[i]);
5662 
5663 #ifndef __WIN__
5664 		if (os_aio_segment_wait_events[i]->is_set) {
5665 			fprintf(file, " ev set");
5666 		}
5667 #endif /* __WIN__ */
5668 
5669 		fprintf(file, "\n");
5670 	}
5671 
5672 	fputs("Pending normal aio reads:", file);
5673 
5674 	os_aio_print_array(file, os_aio_read_array);
5675 
5676 	if (os_aio_write_array != 0) {
5677 		fputs(", aio writes:", file);
5678 		os_aio_print_array(file, os_aio_write_array);
5679 	}
5680 
5681 	if (os_aio_ibuf_array != 0) {
5682 		fputs(",\n ibuf aio reads:", file);
5683 		os_aio_print_array(file, os_aio_ibuf_array);
5684 	}
5685 
5686 	if (os_aio_log_array != 0) {
5687 		fputs(", log i/o's:", file);
5688 		os_aio_print_array(file, os_aio_log_array);
5689 	}
5690 
5691 	if (os_aio_sync_array != 0) {
5692 		fputs(", sync i/o's:", file);
5693 		os_aio_print_array(file, os_aio_sync_array);
5694 	}
5695 
5696 	putc('\n', file);
5697 	current_time = ut_time();
5698 	time_elapsed = 0.001 + difftime(current_time, os_last_printout);
5699 
5700 	fprintf(file,
5701 		"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
5702 		"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
5703 		(ulong) fil_n_pending_log_flushes,
5704 		(ulong) fil_n_pending_tablespace_flushes,
5705 		(ulong) os_n_file_reads,
5706 		(ulong) os_n_file_writes,
5707 		(ulong) os_n_fsyncs);
5708 
5709 	if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
5710 		fprintf(file,
5711 			"%lu pending preads, %lu pending pwrites\n",
5712 			(ulong) os_file_n_pending_preads,
5713 			(ulong) os_file_n_pending_pwrites);
5714 	}
5715 
5716 	if (os_n_file_reads == os_n_file_reads_old) {
5717 		avg_bytes_read = 0.0;
5718 	} else {
5719 		avg_bytes_read = (double) os_bytes_read_since_printout
5720 			/ (os_n_file_reads - os_n_file_reads_old);
5721 	}
5722 
5723 	fprintf(file,
5724 		"%.2f reads/s, %lu avg bytes/read,"
5725 		" %.2f writes/s, %.2f fsyncs/s\n",
5726 		(os_n_file_reads - os_n_file_reads_old)
5727 		/ time_elapsed,
5728 		(ulong) avg_bytes_read,
5729 		(os_n_file_writes - os_n_file_writes_old)
5730 		/ time_elapsed,
5731 		(os_n_fsyncs - os_n_fsyncs_old)
5732 		/ time_elapsed);
5733 
5734 	os_n_file_reads_old = os_n_file_reads;
5735 	os_n_file_writes_old = os_n_file_writes;
5736 	os_n_fsyncs_old = os_n_fsyncs;
5737 	os_bytes_read_since_printout = 0;
5738 
5739 	os_last_printout = current_time;
5740 }
5741 
5742 /**********************************************************************//**
5743 Refreshes the statistics used to print per-second averages. */
5744 UNIV_INTERN
5745 void
os_aio_refresh_stats(void)5746 os_aio_refresh_stats(void)
5747 /*======================*/
5748 {
5749 	os_n_file_reads_old = os_n_file_reads;
5750 	os_n_file_writes_old = os_n_file_writes;
5751 	os_n_fsyncs_old = os_n_fsyncs;
5752 	os_bytes_read_since_printout = 0;
5753 
5754 	os_last_printout = time(NULL);
5755 }
5756 
5757 #ifdef UNIV_DEBUG
5758 /**********************************************************************//**
5759 Checks that all slots in the system have been freed, that is, there are
5760 no pending io operations.
5761 @return	TRUE if all free */
5762 UNIV_INTERN
5763 ibool
os_aio_all_slots_free(void)5764 os_aio_all_slots_free(void)
5765 /*=======================*/
5766 {
5767 	os_aio_array_t*	array;
5768 	ulint		n_res	= 0;
5769 
5770 	array = os_aio_read_array;
5771 
5772 	os_mutex_enter(array->mutex);
5773 
5774 	n_res += array->n_reserved;
5775 
5776 	os_mutex_exit(array->mutex);
5777 
5778 	if (!srv_read_only_mode) {
5779 		ut_a(os_aio_write_array == 0);
5780 
5781 		array = os_aio_write_array;
5782 
5783 		os_mutex_enter(array->mutex);
5784 
5785 		n_res += array->n_reserved;
5786 
5787 		os_mutex_exit(array->mutex);
5788 
5789 		ut_a(os_aio_ibuf_array == 0);
5790 
5791 		array = os_aio_ibuf_array;
5792 
5793 		os_mutex_enter(array->mutex);
5794 
5795 		n_res += array->n_reserved;
5796 
5797 		os_mutex_exit(array->mutex);
5798 	}
5799 
5800 	ut_a(os_aio_log_array == 0);
5801 
5802 	array = os_aio_log_array;
5803 
5804 	os_mutex_enter(array->mutex);
5805 
5806 	n_res += array->n_reserved;
5807 
5808 	os_mutex_exit(array->mutex);
5809 
5810 	array = os_aio_sync_array;
5811 
5812 	os_mutex_enter(array->mutex);
5813 
5814 	n_res += array->n_reserved;
5815 
5816 	os_mutex_exit(array->mutex);
5817 
5818 	if (n_res == 0) {
5819 
5820 		return(TRUE);
5821 	}
5822 
5823 	return(FALSE);
5824 }
5825 #endif /* UNIV_DEBUG */
5826 
5827 #endif /* !UNIV_HOTBACKUP */
5828