1 /***********************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 ***********************************************************************/
34 
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38 
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41 
42 #include "os0file.h"
43 
44 #ifdef UNIV_NONINL
45 #include "os0file.ic"
46 #endif
47 
48 #include "ut0mem.h"
49 #include "srv0srv.h"
50 #include "srv0start.h"
51 #include "fil0fil.h"
52 #include "buf0buf.h"
53 #include "srv0mon.h"
54 #ifndef UNIV_HOTBACKUP
55 # include "os0sync.h"
56 # include "os0thread.h"
57 #else /* !UNIV_HOTBACKUP */
58 # ifdef __WIN__
59 /* Add includes for the _stat() call to compile on Windows */
60 #  include <sys/types.h>
61 #  include <sys/stat.h>
62 #  include <errno.h>
63 # endif /* __WIN__ */
64 #endif /* !UNIV_HOTBACKUP */
65 
66 #if defined(LINUX_NATIVE_AIO)
67 #include <libaio.h>
68 #endif
69 
70 /** Insert buffer segment id */
71 static const ulint IO_IBUF_SEGMENT = 0;
72 
73 /** Log segment id */
74 static const ulint IO_LOG_SEGMENT = 1;
75 
76 /* This specifies the file permissions InnoDB uses when it creates files in
77 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
78 my_umask */
79 
80 #ifndef __WIN__
81 /** Umask for creating files */
82 UNIV_INTERN ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
83 #else
84 /** Umask for creating files */
85 UNIV_INTERN ulint	os_innodb_umask	= 0;
86 #endif /* __WIN__ */
87 
88 #ifndef UNIV_HOTBACKUP
89 /* We use these mutexes to protect lseek + file i/o operation, if the
90 OS does not provide an atomic pread or pwrite, or similar */
91 #define OS_FILE_N_SEEK_MUTEXES	16
92 UNIV_INTERN os_ib_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
93 
94 /* In simulated aio, merge at most this many consecutive i/os */
95 #define OS_AIO_MERGE_N_CONSECUTIVE	64
96 
97 #ifdef WITH_INNODB_DISALLOW_WRITES
98 #define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
99 #else
100 #define WAIT_ALLOW_WRITES() do { } while (0)
101 #endif /* WITH_INNODB_DISALLOW_WRITES */
102 
103 /**********************************************************************
104 
105 InnoDB AIO Implementation:
106 =========================
107 
We support native AIO on Windows and Linux. On the rest of the platforms
we simulate AIO with special io-threads that service the IO requests.
110 
111 Simulated AIO:
112 ==============
113 
On platforms where we 'simulate' AIO, the following is a rough explanation
of the high-level design.
116 There are four io-threads (for ibuf, log, read, write).
117 All synchronous IO requests are serviced by the calling thread using
118 os_file_write/os_file_read. The Asynchronous requests are queued up
119 in an array (there are four such arrays) by the calling thread.
120 Later these requests are picked up by the io-thread and are serviced
121 synchronously.
122 
123 Windows native AIO:
124 ==================
125 
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
131 There are innodb_file_io_threads helper threads. These threads work
132 on the four arrays mentioned above in Simulated AIO. No thread is
133 required for the sync array.
134 If a synchronous IO request is made, it is first queued in the sync
135 array. Then the calling thread itself waits on the request, thus
136 making the call synchronous.
137 If an AIO request is made the calling thread not only queues it in the
138 array but also submits the requests. The helper thread then collects
139 the completed IO request and calls completion routine on it.
140 
141 Linux native AIO:
142 =================
143 
144 If we have libaio installed on the system and innodb_use_native_aio
145 is set to TRUE we follow the code path of native AIO, otherwise we
146 do simulated AIO.
147 There are innodb_file_io_threads helper threads. These threads work
148 on the four arrays mentioned above in Simulated AIO.
149 If a synchronous IO request is made, it is handled by calling
150 os_file_write/os_file_read.
151 If an AIO request is made the calling thread not only queues it in the
152 array but also submits the requests. The helper thread then collects
153 the completed IO request and calls completion routine on it.
154 
155 **********************************************************************/
156 
157 /** Flag: enable debug printout for asynchronous i/o */
158 UNIV_INTERN ibool	os_aio_print_debug	= FALSE;
159 
160 #ifdef UNIV_PFS_IO
161 /* Keys to register InnoDB I/O with performance schema */
162 UNIV_INTERN mysql_pfs_key_t  innodb_file_data_key;
163 UNIV_INTERN mysql_pfs_key_t  innodb_file_log_key;
164 UNIV_INTERN mysql_pfs_key_t  innodb_file_temp_key;
165 #endif /* UNIV_PFS_IO */
166 
167 /** The asynchronous i/o array slot structure */
168 struct os_aio_slot_t{
169 	ibool		is_read;	/*!< TRUE if a read operation */
170 	ulint		pos;		/*!< index of the slot in the aio
171 					array */
172 	ibool		reserved;	/*!< TRUE if this slot is reserved */
173 	time_t		reservation_time;/*!< time when reserved */
174 	ulint		len;		/*!< length of the block to read or
175 					write */
176 	byte*		buf;		/*!< buffer used in i/o */
177 	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
178 	os_offset_t	offset;		/*!< file offset in bytes */
179 	pfs_os_file_t	file;		/*!< file where to read or write */
180 	const char*	name;		/*!< file name or path */
181 	ibool		io_already_done;/*!< used only in simulated aio:
182 					TRUE if the physical i/o already
183 					made and only the slot message
184 					needs to be passed to the caller
185 					of os_aio_simulated_handle */
186 	fil_node_t*	message1;	/*!< message which is given by the */
187 	void*		message2;	/*!< the requester of an aio operation
188 					and which can be used to identify
189 					which pending aio operation was
190 					completed */
191 #ifdef WIN_ASYNC_IO
192 	HANDLE		handle;		/*!< handle object we need in the
193 					OVERLAPPED struct */
194 	OVERLAPPED	control;	/*!< Windows control block for the
195 					aio request */
196 #elif defined(LINUX_NATIVE_AIO)
197 	struct iocb	control;	/* Linux control block for aio */
198 	int		n_bytes;	/* bytes written/read. */
199 	int		ret;		/* AIO return code */
200 #endif /* WIN_ASYNC_IO */
201 };
202 
203 /** The asynchronous i/o array structure */
204 struct os_aio_array_t{
205 	os_ib_mutex_t	mutex;	/*!< the mutex protecting the aio array */
206 	os_event_t	not_full;
207 				/*!< The event which is set to the
208 				signaled state when there is space in
209 				the aio outside the ibuf segment */
210 	os_event_t	is_empty;
211 				/*!< The event which is set to the
212 				signaled state when there are no
213 				pending i/os in this array */
214 	ulint		n_slots;/*!< Total number of slots in the aio
215 				array.  This must be divisible by
216 				n_threads. */
217 	ulint		n_segments;
218 				/*!< Number of segments in the aio
219 				array of pending aio requests. A
220 				thread can wait separately for any one
221 				of the segments. */
222 	ulint		cur_seg;/*!< We reserve IO requests in round
223 				robin fashion to different segments.
224 				This points to the segment that is to
225 				be used to service next IO request. */
226 	ulint		n_reserved;
227 				/*!< Number of reserved slots in the
228 				aio array outside the ibuf segment */
229 	os_aio_slot_t*	slots;	/*!< Pointer to the slots in the array */
230 #ifdef __WIN__
231 	HANDLE*		handles;
232 				/*!< Pointer to an array of OS native
233 				event handles where we copied the
234 				handles from slots, in the same
235 				order. This can be used in
236 				WaitForMultipleObjects; used only in
237 				Windows */
238 #endif /* __WIN__ */
239 
240 #if defined(LINUX_NATIVE_AIO)
241 	io_context_t*		aio_ctx;
242 				/* completion queue for IO. There is
243 				one such queue per segment. Each thread
244 				will work on one ctx exclusively. */
245 	struct io_event*	aio_events;
246 				/* The array to collect completed IOs.
247 				There is one such event for each
248 				possible pending IO. The size of the
249 				array is equal to n_slots. */
250 #endif /* LINUX_NATIV_AIO */
251 };
252 
#ifdef LINUX_NATIVE_AIO
/** Timeout for each io_getevents() call = 500ms */
#define OS_AIO_REAP_TIMEOUT	(500000000UL)

/** Time to sleep, in microseconds, if io_setup() returns EAGAIN */
#define OS_AIO_IO_SETUP_RETRY_SLEEP	(500000UL)

/** Number of io_setup() attempts made before giving up */
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS	5
#endif /* LINUX_NATIVE_AIO */
263 
264 /** Array of events used in simulated aio */
265 static os_event_t*	os_aio_segment_wait_events = NULL;
266 
267 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
268 are NULL when the module has not yet been initialized. @{ */
269 static os_aio_array_t*	os_aio_read_array	= NULL;	/*!< Reads */
270 static os_aio_array_t*	os_aio_write_array	= NULL;	/*!< Writes */
271 static os_aio_array_t*	os_aio_ibuf_array	= NULL;	/*!< Insert buffer */
272 static os_aio_array_t*	os_aio_log_array	= NULL;	/*!< Redo log */
273 static os_aio_array_t*	os_aio_sync_array	= NULL;	/*!< Synchronous I/O */
274 /* @} */
275 
276 /** Number of asynchronous I/O segments.  Set by os_aio_init(). */
277 static ulint	os_aio_n_segments	= ULINT_UNDEFINED;
278 
279 /** If the following is TRUE, read i/o handler threads try to
280 wait until a batch of new read requests have been posted */
281 static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
282 #endif /* !UNIV_HOTBACKUP */
283 
284 UNIV_INTERN ulint	os_n_file_reads		= 0;
285 UNIV_INTERN ulint	os_bytes_read_since_printout = 0;
286 UNIV_INTERN ulint	os_n_file_writes	= 0;
287 UNIV_INTERN ulint	os_n_fsyncs		= 0;
288 UNIV_INTERN ulint	os_n_file_reads_old	= 0;
289 UNIV_INTERN ulint	os_n_file_writes_old	= 0;
290 UNIV_INTERN ulint	os_n_fsyncs_old		= 0;
291 UNIV_INTERN time_t	os_last_printout;
292 
293 UNIV_INTERN ibool	os_has_said_disk_full	= FALSE;
294 
295 #if !defined(UNIV_HOTBACKUP)	\
296     && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8)
297 /** The mutex protecting the following counts of pending I/O operations */
298 static os_ib_mutex_t	os_file_count_mutex;
299 #endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */
300 
301 /** Number of pending os_file_pread() operations */
302 UNIV_INTERN ulint	os_file_n_pending_preads  = 0;
303 /** Number of pending os_file_pwrite() operations */
304 UNIV_INTERN ulint	os_file_n_pending_pwrites = 0;
305 /** Number of pending write operations */
306 UNIV_INTERN ulint	os_n_pending_writes = 0;
307 /** Number of pending read operations */
308 UNIV_INTERN ulint	os_n_pending_reads = 0;
309 
310 #ifdef UNIV_DEBUG
311 # ifndef UNIV_HOTBACKUP
312 /**********************************************************************//**
313 Validates the consistency the aio system some of the time.
314 @return	TRUE if ok or the check was skipped */
315 UNIV_INTERN
316 ibool
os_aio_validate_skip(void)317 os_aio_validate_skip(void)
318 /*======================*/
319 {
320 /** Try os_aio_validate() every this many times */
321 # define OS_AIO_VALIDATE_SKIP	13
322 
323 	/** The os_aio_validate() call skip counter.
324 	Use a signed type because of the race condition below. */
325 	static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
326 
327 	/* There is a race condition below, but it does not matter,
328 	because this call is only for heuristic purposes. We want to
329 	reduce the call frequency of the costly os_aio_validate()
330 	check in debug builds. */
331 	if (--os_aio_validate_count > 0) {
332 		return(TRUE);
333 	}
334 
335 	os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
336 	return(os_aio_validate());
337 }
338 # endif /* !UNIV_HOTBACKUP */
339 #endif /* UNIV_DEBUG */
340 
341 #ifdef __WIN__
342 /***********************************************************************//**
343 Gets the operating system version. Currently works only on Windows.
344 @return	OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
345 OS_WIN7. */
346 UNIV_INTERN
347 ulint
os_get_os_version(void)348 os_get_os_version(void)
349 /*===================*/
350 {
351 	OSVERSIONINFO	os_info;
352 
353 	os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
354 
355 	ut_a(GetVersionEx(&os_info));
356 
357 	if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
358 		return(OS_WIN31);
359 	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
360 		return(OS_WIN95);
361 	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
362 		switch (os_info.dwMajorVersion) {
363 		case 3:
364 		case 4:
365 			return(OS_WINNT);
366 		case 5:
367 			return (os_info.dwMinorVersion == 0)
368 				? OS_WIN2000 : OS_WINXP;
369 		case 6:
370 			return (os_info.dwMinorVersion == 0)
371 				? OS_WINVISTA : OS_WIN7;
372 		default:
373 			return(OS_WIN7);
374 		}
375 	} else {
376 		ut_error;
377 		return(0);
378 	}
379 }
380 #endif /* __WIN__ */
381 
382 /***********************************************************************//**
383 Retrieves the last error number if an error occurs in a file io function.
384 The number should be retrieved before any other OS calls (because they may
385 overwrite the error number). If the number is not known to this program,
386 the OS error number + 100 is returned.
387 @return	error number, or OS error number + 100 */
388 static
389 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)390 os_file_get_last_error_low(
391 /*=======================*/
392 	bool	report_all_errors,	/*!< in: TRUE if we want an error
393 					message printed of all errors */
394 	bool	on_error_silent)	/*!< in: TRUE then don't print any
395 					diagnostic to the log */
396 {
397 #ifdef __WIN__
398 
399 	ulint	err = (ulint) GetLastError();
400 	if (err == ERROR_SUCCESS) {
401 		return(0);
402 	}
403 
404 	if (report_all_errors
405 	    || (!on_error_silent
406 		&& err != ERROR_DISK_FULL
407 		&& err != ERROR_FILE_EXISTS)) {
408 
409 		ut_print_timestamp(stderr);
410 		fprintf(stderr,
411 			"  InnoDB: Operating system error number %lu"
412 			" in a file operation.\n", (ulong) err);
413 
414 		if (err == ERROR_PATH_NOT_FOUND) {
415 			fprintf(stderr,
416 				"InnoDB: The error means the system"
417 				" cannot find the path specified.\n");
418 
419 			if (srv_is_being_started) {
420 				fprintf(stderr,
421 					"InnoDB: If you are installing InnoDB,"
422 					" remember that you must create\n"
423 					"InnoDB: directories yourself, InnoDB"
424 					" does not create them.\n");
425 			}
426 		} else if (err == ERROR_ACCESS_DENIED) {
427 			fprintf(stderr,
428 				"InnoDB: The error means mysqld does not have"
429 				" the access rights to\n"
430 				"InnoDB: the directory. It may also be"
431 				" you have created a subdirectory\n"
432 				"InnoDB: of the same name as a data file.\n");
433 		} else if (err == ERROR_SHARING_VIOLATION
434 			   || err == ERROR_LOCK_VIOLATION) {
435 			fprintf(stderr,
436 				"InnoDB: The error means that another program"
437 				" is using InnoDB's files.\n"
438 				"InnoDB: This might be a backup or antivirus"
439 				" software or another instance\n"
440 				"InnoDB: of MySQL."
441 				" Please close it to get rid of this error.\n");
442 		} else if (err == ERROR_WORKING_SET_QUOTA
443 			   || err == ERROR_NO_SYSTEM_RESOURCES) {
444 			fprintf(stderr,
445 				"InnoDB: The error means that there are no"
446 				" sufficient system resources or quota to"
447 				" complete the operation.\n");
448 		} else if (err == ERROR_OPERATION_ABORTED) {
449 			fprintf(stderr,
450 				"InnoDB: The error means that the I/O"
451 				" operation has been aborted\n"
452 				"InnoDB: because of either a thread exit"
453 				" or an application request.\n"
454 				"InnoDB: Retry attempt is made.\n");
455 		} else {
456 			fprintf(stderr,
457 				"InnoDB: Some operating system error numbers"
458 				" are described at\n"
459 				"InnoDB: "
460 				REFMAN
461 				"operating-system-error-codes.html\n");
462 		}
463 	}
464 
465 	fflush(stderr);
466 
467 	if (err == ERROR_FILE_NOT_FOUND) {
468 		return(OS_FILE_NOT_FOUND);
469 	} else if (err == ERROR_DISK_FULL) {
470 		return(OS_FILE_DISK_FULL);
471 	} else if (err == ERROR_FILE_EXISTS) {
472 		return(OS_FILE_ALREADY_EXISTS);
473 	} else if (err == ERROR_SHARING_VIOLATION
474 		   || err == ERROR_LOCK_VIOLATION) {
475 		return(OS_FILE_SHARING_VIOLATION);
476 	} else if (err == ERROR_WORKING_SET_QUOTA
477 		   || err == ERROR_NO_SYSTEM_RESOURCES) {
478 		return(OS_FILE_INSUFFICIENT_RESOURCE);
479 	} else if (err == ERROR_OPERATION_ABORTED) {
480 		return(OS_FILE_OPERATION_ABORTED);
481 	} else if (err == ERROR_ACCESS_DENIED) {
482 		return(OS_FILE_ACCESS_VIOLATION);
483 	} else {
484 		return(OS_FILE_ERROR_MAX + err);
485 	}
486 #else
487 	int err = errno;
488 	if (err == 0) {
489 		return(0);
490 	}
491 
492 	if (report_all_errors
493 	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
494 
495 		ut_print_timestamp(stderr);
496 		fprintf(stderr,
497 			"  InnoDB: Operating system error number %d"
498 			" in a file operation.\n", err);
499 
500 		if (err == ENOENT) {
501 			fprintf(stderr,
502 				"InnoDB: The error means the system"
503 				" cannot find the path specified.\n");
504 
505 			if (srv_is_being_started) {
506 				fprintf(stderr,
507 					"InnoDB: If you are installing InnoDB,"
508 					" remember that you must create\n"
509 					"InnoDB: directories yourself, InnoDB"
510 					" does not create them.\n");
511 			}
512 		} else if (err == EACCES) {
513 			fprintf(stderr,
514 				"InnoDB: The error means mysqld does not have"
515 				" the access rights to\n"
516 				"InnoDB: the directory.\n");
517 		} else {
518 			if (strerror(err) != NULL) {
519 				fprintf(stderr,
520 					"InnoDB: Error number %d"
521 					" means '%s'.\n",
522 					err, strerror(err));
523 			}
524 
525 
526 			fprintf(stderr,
527 				"InnoDB: Some operating system"
528 				" error numbers are described at\n"
529 				"InnoDB: "
530 				REFMAN
531 				"operating-system-error-codes.html\n");
532 		}
533 	}
534 
535 	fflush(stderr);
536 
537 	switch (err) {
538 	case ENOSPC:
539 		return(OS_FILE_DISK_FULL);
540 	case ENOENT:
541 		return(OS_FILE_NOT_FOUND);
542 	case EEXIST:
543 		return(OS_FILE_ALREADY_EXISTS);
544 	case EXDEV:
545 	case ENOTDIR:
546 	case EISDIR:
547 		return(OS_FILE_PATH_ERROR);
548 	case EAGAIN:
549 		if (srv_use_native_aio) {
550 			return(OS_FILE_AIO_RESOURCES_RESERVED);
551 		}
552 		break;
553 	case EINTR:
554 		if (srv_use_native_aio) {
555 			return(OS_FILE_AIO_INTERRUPTED);
556 		}
557 		break;
558 	case EACCES:
559 		return(OS_FILE_ACCESS_VIOLATION);
560 	}
561 	return(OS_FILE_ERROR_MAX + err);
562 #endif
563 }
564 
565 /***********************************************************************//**
566 Retrieves the last error number if an error occurs in a file io function.
567 The number should be retrieved before any other OS calls (because they may
568 overwrite the error number). If the number is not known to this program,
569 the OS error number + 100 is returned.
570 @return	error number, or OS error number + 100 */
571 UNIV_INTERN
572 ulint
os_file_get_last_error(bool report_all_errors)573 os_file_get_last_error(
574 /*===================*/
575 	bool	report_all_errors)	/*!< in: TRUE if we want an error
576 					message printed of all errors */
577 {
578 	return(os_file_get_last_error_low(report_all_errors, false));
579 }
580 
581 /****************************************************************//**
582 Does error handling when a file operation fails.
583 Conditionally exits (calling exit(3)) based on should_exit value and the
584 error type, if should_exit is TRUE then on_error_silent is ignored.
585 @return	TRUE if we should retry the operation */
586 static
587 ibool
os_file_handle_error_cond_exit(const char * name,const char * operation,ibool should_exit,ibool on_error_silent)588 os_file_handle_error_cond_exit(
589 /*===========================*/
590 	const char*	name,		/*!< in: name of a file or NULL */
591 	const char*	operation,	/*!< in: operation */
592 	ibool		should_exit,	/*!< in: call exit(3) if unknown error
593 					and this parameter is TRUE */
594 	ibool		on_error_silent)/*!< in: if TRUE then don't print
595 					any message to the log iff it is
596 					an unknown non-fatal error */
597 {
598 	ulint	err;
599 
600 	err = os_file_get_last_error_low(false, on_error_silent);
601 
602 	switch (err) {
603 	case OS_FILE_DISK_FULL:
604 		/* We only print a warning about disk full once */
605 
606 		if (os_has_said_disk_full) {
607 
608 			return(FALSE);
609 		}
610 
611 		/* Disk full error is reported irrespective of the
612 		on_error_silent setting. */
613 
614 		if (name) {
615 			ut_print_timestamp(stderr);
616 			fprintf(stderr,
617 				"  InnoDB: Encountered a problem with"
618 				" file %s\n", name);
619 		}
620 
621 		ut_print_timestamp(stderr);
622 		fprintf(stderr,
623 			"  InnoDB: Disk is full. Try to clean the disk"
624 			" to free space.\n");
625 
626 		os_has_said_disk_full = TRUE;
627 
628 		fflush(stderr);
629 
630 		return(FALSE);
631 
632 	case OS_FILE_AIO_RESOURCES_RESERVED:
633 	case OS_FILE_AIO_INTERRUPTED:
634 
635 		return(TRUE);
636 
637 	case OS_FILE_PATH_ERROR:
638 	case OS_FILE_ALREADY_EXISTS:
639 	case OS_FILE_ACCESS_VIOLATION:
640 
641 		return(FALSE);
642 
643 	case OS_FILE_SHARING_VIOLATION:
644 
645 		os_thread_sleep(10000000);  /* 10 sec */
646 		return(TRUE);
647 
648 	case OS_FILE_OPERATION_ABORTED:
649 	case OS_FILE_INSUFFICIENT_RESOURCE:
650 
651 		os_thread_sleep(100000);	/* 100 ms */
652 		return(TRUE);
653 
654 	default:
655 
656 		/* If it is an operation that can crash on error then it
657 		is better to ignore on_error_silent and print an error message
658 		to the log. */
659 
660 		if (should_exit || !on_error_silent) {
661 			ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
662 				"error " ULINTPF ".%s", name ? name : "(unknown)",
663 				operation, err, should_exit
664 				? " Cannot continue operation" : "");
665 		}
666 
667 		if (should_exit) {
668 			exit(1);
669 		}
670 	}
671 
672 	return(FALSE);
673 }
674 
675 /****************************************************************//**
676 Does error handling when a file operation fails.
677 @return	TRUE if we should retry the operation */
678 static
679 ibool
os_file_handle_error(const char * name,const char * operation)680 os_file_handle_error(
681 /*=================*/
682 	const char*	name,		/*!< in: name of a file or NULL */
683 	const char*	operation)	/*!< in: operation */
684 {
685 	/* exit in case of unknown error */
686 	return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
687 }
688 
689 /****************************************************************//**
690 Does error handling when a file operation fails.
691 @return	TRUE if we should retry the operation */
692 static
693 ibool
os_file_handle_error_no_exit(const char * name,const char * operation,ibool on_error_silent)694 os_file_handle_error_no_exit(
695 /*=========================*/
696 	const char*	name,		/*!< in: name of a file or NULL */
697 	const char*	operation,	/*!< in: operation */
698 	ibool		on_error_silent)/*!< in: if TRUE then don't print
699 					any message to the log. */
700 {
701 	/* don't exit in case of unknown error */
702 	return(os_file_handle_error_cond_exit(
703 			name, operation, FALSE, on_error_silent));
704 }
705 
706 #undef USE_FILE_LOCK
707 #define USE_FILE_LOCK
708 #if defined(UNIV_HOTBACKUP) || defined(__WIN__)
709 /* InnoDB Hot Backup does not lock the data files.
710  * On Windows, mandatory locking is used.
711  */
712 # undef USE_FILE_LOCK
713 #endif
714 #ifdef USE_FILE_LOCK
715 /****************************************************************//**
716 Obtain an exclusive lock on a file.
717 @return	0 on success */
718 static
719 int
os_file_lock(int fd,const char * name)720 os_file_lock(
721 /*=========*/
722 	int		fd,	/*!< in: file descriptor */
723 	const char*	name)	/*!< in: file name */
724 {
725 	struct flock lk;
726 
727 	ut_ad(!srv_read_only_mode);
728 
729 	lk.l_type = F_WRLCK;
730 	lk.l_whence = SEEK_SET;
731 	lk.l_start = lk.l_len = 0;
732 
733 	if (fcntl(fd, F_SETLK, &lk) == -1) {
734 
735 		ib_logf(IB_LOG_LEVEL_ERROR,
736 			"Unable to lock %s, error: %d", name, errno);
737 
738 		if (errno == EAGAIN || errno == EACCES) {
739 			ib_logf(IB_LOG_LEVEL_INFO,
740 				"Check that you do not already have "
741 				"another mysqld process using the "
742 				"same InnoDB data or log files.");
743 		}
744 
745 		return(-1);
746 	}
747 
748 	return(0);
749 }
750 #endif /* USE_FILE_LOCK */
751 
752 #ifndef UNIV_HOTBACKUP
753 /****************************************************************//**
754 Creates the seek mutexes used in positioned reads and writes. */
755 UNIV_INTERN
756 void
os_io_init_simple(void)757 os_io_init_simple(void)
758 /*===================*/
759 {
760 #if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
761 	os_file_count_mutex = os_mutex_create();
762 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
763 
764 	for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
765 		os_file_seek_mutexes[i] = os_mutex_create();
766 	}
767 }
768 
769 /** Create a temporary file. This function is like tmpfile(3), but
770 the temporary file is created in the given parameter path. If the path
771 is null then it will create the file in the mysql server configuration
772 parameter (--tmpdir).
773 @param[in]	path	location for creating temporary file
774 @return temporary file handle, or NULL on error */
775 UNIV_INTERN
776 FILE*
os_file_create_tmpfile(const char * path)777 os_file_create_tmpfile(
778 	const char*	path)
779 {
780 	FILE*	file	= NULL;
781 	WAIT_ALLOW_WRITES();
782 	int	fd	= innobase_mysql_tmpfile(path);
783 
784 	ut_ad(!srv_read_only_mode);
785 
786 	if (fd >= 0) {
787 		file = fdopen(fd, "w+b");
788 	}
789 
790 	if (!file) {
791 		ut_print_timestamp(stderr);
792 		fprintf(stderr,
793 			"  InnoDB: Error: unable to create temporary file;"
794 			" errno: %d\n", errno);
795 		if (fd >= 0) {
796 			close(fd);
797 		}
798 	}
799 
800 	return(file);
801 }
802 #endif /* !UNIV_HOTBACKUP */
803 
804 /***********************************************************************//**
805 The os_file_opendir() function opens a directory stream corresponding to the
806 directory named by the dirname argument. The directory stream is positioned
807 at the first entry. In both Unix and Windows we automatically skip the '.'
808 and '..' items at the start of the directory listing.
809 @return	directory stream, NULL if error */
810 UNIV_INTERN
811 os_file_dir_t
os_file_opendir(const char * dirname,ibool error_is_fatal)812 os_file_opendir(
813 /*============*/
814 	const char*	dirname,	/*!< in: directory name; it must not
815 					contain a trailing '\' or '/' */
816 	ibool		error_is_fatal)	/*!< in: TRUE if we should treat an
817 					error as a fatal error; if we try to
818 					open symlinks then we do not wish a
819 					fatal error if it happens not to be
820 					a directory */
821 {
822 	os_file_dir_t		dir;
823 #ifdef __WIN__
824 	LPWIN32_FIND_DATA	lpFindFileData;
825 	char			path[OS_FILE_MAX_PATH + 3];
826 
827 	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
828 
829 	strcpy(path, dirname);
830 	strcpy(path + strlen(path), "\\*");
831 
832 	/* Note that in Windows opening the 'directory stream' also retrieves
833 	the first entry in the directory. Since it is '.', that is no problem,
834 	as we will skip over the '.' and '..' entries anyway. */
835 
836 	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
837 		ut_malloc(sizeof(WIN32_FIND_DATA)));
838 
839 	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
840 
841 	ut_free(lpFindFileData);
842 
843 	if (dir == INVALID_HANDLE_VALUE) {
844 
845 		if (error_is_fatal) {
846 			os_file_handle_error(dirname, "opendir");
847 		}
848 
849 		return(NULL);
850 	}
851 
852 	return(dir);
853 #else
854 	dir = opendir(dirname);
855 
856 	if (dir == NULL && error_is_fatal) {
857 		os_file_handle_error(dirname, "opendir");
858 	}
859 
860 	return(dir);
861 #endif /* __WIN__ */
862 }
863 
864 /***********************************************************************//**
865 Closes a directory stream.
866 @return	0 if success, -1 if failure */
867 UNIV_INTERN
868 int
os_file_closedir(os_file_dir_t dir)869 os_file_closedir(
870 /*=============*/
871 	os_file_dir_t	dir)	/*!< in: directory stream */
872 {
873 #ifdef __WIN__
874 	BOOL		ret;
875 
876 	ret = FindClose(dir);
877 
878 	if (!ret) {
879 		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
880 
881 		return(-1);
882 	}
883 
884 	return(0);
885 #else
886 	int	ret;
887 
888 	ret = closedir(dir);
889 
890 	if (ret) {
891 		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
892 	}
893 
894 	return(ret);
895 #endif /* __WIN__ */
896 }
897 
898 /***********************************************************************//**
899 This function returns information of the next file in the directory. We jump
900 over the '.' and '..' entries in the directory.
901 @return	0 if ok, -1 if error, 1 if at the end of the directory */
902 UNIV_INTERN
903 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)904 os_file_readdir_next_file(
905 /*======================*/
906 	const char*	dirname,/*!< in: directory name or path */
907 	os_file_dir_t	dir,	/*!< in: directory stream */
908 	os_file_stat_t*	info)	/*!< in/out: buffer where the info is returned */
909 {
910 #ifdef __WIN__
911 	LPWIN32_FIND_DATA	lpFindFileData;
912 	BOOL			ret;
913 
914 	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
915 		ut_malloc(sizeof(WIN32_FIND_DATA)));
916 next_file:
917 	ret = FindNextFile(dir, lpFindFileData);
918 
919 	if (ret) {
920 		ut_a(strlen((char*) lpFindFileData->cFileName)
921 		     < OS_FILE_MAX_PATH);
922 
923 		if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
924 		    || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
925 
926 			goto next_file;
927 		}
928 
929 		strcpy(info->name, (char*) lpFindFileData->cFileName);
930 
931 		info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
932 			+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
933 			   << 32);
934 
935 		if (lpFindFileData->dwFileAttributes
936 		    & FILE_ATTRIBUTE_REPARSE_POINT) {
937 			/* TODO: test Windows symlinks */
938 			/* TODO: MySQL has apparently its own symlink
939 			implementation in Windows, dbname.sym can
940 			redirect a database directory:
941 			REFMAN "windows-symbolic-links.html" */
942 			info->type = OS_FILE_TYPE_LINK;
943 		} else if (lpFindFileData->dwFileAttributes
944 			   & FILE_ATTRIBUTE_DIRECTORY) {
945 			info->type = OS_FILE_TYPE_DIR;
946 		} else {
947 			/* It is probably safest to assume that all other
948 			file types are normal. Better to check them rather
949 			than blindly skip them. */
950 
951 			info->type = OS_FILE_TYPE_FILE;
952 		}
953 	}
954 
955 	ut_free(lpFindFileData);
956 
957 	if (ret) {
958 		return(0);
959 	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
960 
961 		return(1);
962 	} else {
963 		os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
964 		return(-1);
965 	}
966 #else
967 	struct dirent*	ent;
968 	char*		full_path;
969 	int		ret;
970 	struct stat	statinfo;
971 #ifdef HAVE_READDIR_R
972 	char		dirent_buf[sizeof(struct dirent)
973 				   + _POSIX_PATH_MAX + 100];
974 	/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
975 	the max file name len; but in most standards, the
976 	length is NAME_MAX; we add 100 to be even safer */
977 #endif
978 
979 next_file:
980 
981 #ifdef HAVE_READDIR_R
982 	ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
983 
984 	if (ret != 0
985 #ifdef UNIV_AIX
986 	    /* On AIX, only if we got non-NULL 'ent' (result) value and
987 	    a non-zero 'ret' (return) value, it indicates a failed
988 	    readdir_r() call. An NULL 'ent' with an non-zero 'ret'
989 	    would indicate the "end of the directory" is reached. */
990 	    && ent != NULL
991 #endif
992 	   ) {
993 		fprintf(stderr,
994 			"InnoDB: cannot read directory %s, error %lu\n",
995 			dirname, (ulong) ret);
996 
997 		return(-1);
998 	}
999 
1000 	if (ent == NULL) {
1001 		/* End of directory */
1002 
1003 		return(1);
1004 	}
1005 
1006 	ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
1007 #else
1008 	ent = readdir(dir);
1009 
1010 	if (ent == NULL) {
1011 
1012 		return(1);
1013 	}
1014 #endif
1015 	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
1016 
1017 	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
1018 
1019 		goto next_file;
1020 	}
1021 
1022 	strcpy(info->name, ent->d_name);
1023 
1024 	full_path = static_cast<char*>(
1025 		ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
1026 
1027 	sprintf(full_path, "%s/%s", dirname, ent->d_name);
1028 
1029 	ret = stat(full_path, &statinfo);
1030 
1031 	if (ret) {
1032 
1033 		if (errno == ENOENT) {
1034 			/* readdir() returned a file that does not exist,
1035 			it must have been deleted in the meantime. Do what
1036 			would have happened if the file was deleted before
1037 			readdir() - ignore and go to the next entry.
1038 			If this is the last entry then info->name will still
1039 			contain the name of the deleted file when this
1040 			function returns, but this is not an issue since the
1041 			caller shouldn't be looking at info when end of
1042 			directory is returned. */
1043 
1044 			ut_free(full_path);
1045 
1046 			goto next_file;
1047 		}
1048 
1049 		os_file_handle_error_no_exit(full_path, "stat", FALSE);
1050 
1051 		ut_free(full_path);
1052 
1053 		return(-1);
1054 	}
1055 
1056 	info->size = (ib_int64_t) statinfo.st_size;
1057 
1058 	if (S_ISDIR(statinfo.st_mode)) {
1059 		info->type = OS_FILE_TYPE_DIR;
1060 	} else if (S_ISLNK(statinfo.st_mode)) {
1061 		info->type = OS_FILE_TYPE_LINK;
1062 	} else if (S_ISREG(statinfo.st_mode)) {
1063 		info->type = OS_FILE_TYPE_FILE;
1064 	} else {
1065 		info->type = OS_FILE_TYPE_UNKNOWN;
1066 	}
1067 
1068 	ut_free(full_path);
1069 
1070 	return(0);
1071 #endif
1072 }
1073 
1074 /*****************************************************************//**
1075 This function attempts to create a directory named pathname. The new
1076 directory gets default permissions. On Unix the permissions are
1077 (0770 & ~umask). If the directory exists already, nothing is done and
1078 the call succeeds, unless the fail_if_exists arguments is true.
1079 If another error occurs, such as a permission error, this does not crash,
1080 but reports the error and returns FALSE.
1081 @return	TRUE if call succeeds, FALSE on error */
1082 UNIV_INTERN
1083 ibool
os_file_create_directory(const char * pathname,ibool fail_if_exists)1084 os_file_create_directory(
1085 /*=====================*/
1086 	const char*	pathname,	/*!< in: directory name as
1087 					null-terminated string */
1088 	ibool		fail_if_exists)	/*!< in: if TRUE, pre-existing directory
1089 					is treated as an error. */
1090 {
1091 #ifdef __WIN__
1092 	BOOL	rcode;
1093 
1094 	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
1095 	if (!(rcode != 0
1096 	      || (GetLastError() == ERROR_ALREADY_EXISTS
1097 		  && !fail_if_exists))) {
1098 
1099 		os_file_handle_error_no_exit(
1100 			pathname, "CreateDirectory", FALSE);
1101 
1102 		return(FALSE);
1103 	}
1104 
1105 	return(TRUE);
1106 #else
1107 	int	rcode;
1108 	WAIT_ALLOW_WRITES();
1109 
1110 	rcode = mkdir(pathname, 0770);
1111 
1112 	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
1113 		/* failure */
1114 		os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
1115 
1116 		return(FALSE);
1117 	}
1118 
1119 	return (TRUE);
1120 #endif /* __WIN__ */
1121 }
1122 
1123 /****************************************************************//**
1124 NOTE! Use the corresponding macro os_file_create_simple(), not directly
1125 this function!
1126 A simple function to open or create a file.
1127 @return own: handle to the file, not defined if error, error number
1128 can be retrieved with os_file_get_last_error */
1129 UNIV_INTERN
1130 os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,ibool * success)1131 os_file_create_simple_func(
1132 /*=======================*/
1133 	const char*	name,	/*!< in: name of the file or path as a
1134 				null-terminated string */
1135 	ulint		create_mode,/*!< in: create mode */
1136 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
1137 				OS_FILE_READ_WRITE */
1138 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
1139 {
1140 	os_file_t	file;
1141 	ibool		retry;
1142 
1143 	*success = FALSE;
1144 #ifdef __WIN__
1145 	DWORD		access;
1146 	DWORD		create_flag;
1147 	DWORD		attributes	= 0;
1148 
1149 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1150 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1151 
1152 	if (create_mode == OS_FILE_OPEN) {
1153 
1154 		create_flag = OPEN_EXISTING;
1155 
1156 	} else if (srv_read_only_mode) {
1157 
1158 		create_flag = OPEN_EXISTING;
1159 
1160 	} else if (create_mode == OS_FILE_CREATE) {
1161 
1162 		create_flag = CREATE_NEW;
1163 
1164 	} else if (create_mode == OS_FILE_CREATE_PATH) {
1165 
1166 		ut_a(!srv_read_only_mode);
1167 
1168 		/* Create subdirs along the path if needed  */
1169 		*success = os_file_create_subdirs_if_needed(name);
1170 
1171 		if (!*success) {
1172 
1173 			ib_logf(IB_LOG_LEVEL_ERROR,
1174 				"Unable to create subdirectories '%s'",
1175 				name);
1176 
1177 			return((os_file_t) -1);
1178 		}
1179 
1180 		create_flag = CREATE_NEW;
1181 		create_mode = OS_FILE_CREATE;
1182 
1183 	} else {
1184 		ib_logf(IB_LOG_LEVEL_ERROR,
1185 			"Unknown file create mode (%lu) for file '%s'",
1186 			create_mode, name);
1187 
1188 		return((os_file_t) -1);
1189 	}
1190 
1191 	if (access_type == OS_FILE_READ_ONLY) {
1192 		access = GENERIC_READ;
1193 	} else if (srv_read_only_mode) {
1194 
1195 		ib_logf(IB_LOG_LEVEL_INFO,
1196 			"read only mode set. Unable to "
1197 			"open file '%s' in RW mode, trying RO mode", name);
1198 
1199 		access = GENERIC_READ;
1200 
1201 	} else if (access_type == OS_FILE_READ_WRITE) {
1202 		access = GENERIC_READ | GENERIC_WRITE;
1203 	} else {
1204 		ib_logf(IB_LOG_LEVEL_ERROR,
1205 			"Unknown file access type (%lu) for file '%s'",
1206 			access_type, name);
1207 
1208 		return((os_file_t) -1);
1209 	}
1210 
1211 	do {
1212 		/* Use default security attributes and no template file. */
1213 
1214 		file = CreateFile(
1215 			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
1216 			create_flag, attributes, NULL);
1217 
1218 		if (file == INVALID_HANDLE_VALUE) {
1219 
1220 			*success = FALSE;
1221 
1222 			retry = os_file_handle_error(
1223 				name, create_mode == OS_FILE_OPEN ?
1224 				"open" : "create");
1225 
1226 		} else {
1227 			*success = TRUE;
1228 			retry = false;
1229 		}
1230 
1231 	} while (retry);
1232 
1233 #else /* __WIN__ */
1234 	int		create_flag;
1235 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
1236 		WAIT_ALLOW_WRITES();
1237 
1238 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1239 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1240 
1241 	if (create_mode == OS_FILE_OPEN) {
1242 
1243 		if (access_type == OS_FILE_READ_ONLY) {
1244 			create_flag = O_RDONLY;
1245 		} else if (srv_read_only_mode) {
1246 			create_flag = O_RDONLY;
1247 		} else {
1248 			create_flag = O_RDWR;
1249 		}
1250 
1251 	} else if (srv_read_only_mode) {
1252 
1253 		create_flag = O_RDONLY;
1254 
1255 	} else if (create_mode == OS_FILE_CREATE) {
1256 
1257 		create_flag = O_RDWR | O_CREAT | O_EXCL;
1258 
1259 	} else if (create_mode == OS_FILE_CREATE_PATH) {
1260 
1261 		/* Create subdirs along the path if needed  */
1262 
1263 		*success = os_file_create_subdirs_if_needed(name);
1264 
1265 		if (!*success) {
1266 
1267 			ib_logf(IB_LOG_LEVEL_ERROR,
1268 				"Unable to create subdirectories '%s'",
1269 				name);
1270 
1271 			return((os_file_t) -1);
1272 		}
1273 
1274 		create_flag = O_RDWR | O_CREAT | O_EXCL;
1275 		create_mode = OS_FILE_CREATE;
1276 	} else {
1277 
1278 		ib_logf(IB_LOG_LEVEL_ERROR,
1279 			"Unknown file create mode (%lu) for file '%s'",
1280 			create_mode, name);
1281 
1282 		return((os_file_t) -1);
1283 	}
1284 
1285 	do {
1286 		file = ::open(name, create_flag, os_innodb_umask);
1287 
1288 		if (file == -1) {
1289 			*success = FALSE;
1290 
1291 			retry = os_file_handle_error(
1292 				name,
1293 				create_mode == OS_FILE_OPEN
1294 				?  "open" : "create");
1295 		} else {
1296 			*success = TRUE;
1297 			retry = false;
1298 		}
1299 
1300 	} while (retry);
1301 
1302 #ifdef USE_FILE_LOCK
1303 	if (!srv_read_only_mode
1304 	    && *success
1305 	    && access_type == OS_FILE_READ_WRITE
1306 	    && os_file_lock(file, name)) {
1307 
1308 		*success = FALSE;
1309 		close(file);
1310 		file = -1;
1311 	}
1312 #endif /* USE_FILE_LOCK */
1313 
1314 #endif /* __WIN__ */
1315 
1316 	return(file);
1317 }
1318 
1319 /****************************************************************//**
1320 NOTE! Use the corresponding macro
1321 os_file_create_simple_no_error_handling(), not directly this function!
1322 A simple function to open or create a file.
1323 @return own: handle to the file, not defined if error, error number
1324 can be retrieved with os_file_get_last_error */
1325 UNIV_INTERN
1326 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,ibool * success)1327 os_file_create_simple_no_error_handling_func(
1328 /*=========================================*/
1329 	const char*	name,	/*!< in: name of the file or path as a
1330 				null-terminated string */
1331 	ulint		create_mode,/*!< in: create mode */
1332 	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
1333 				OS_FILE_READ_WRITE, or
1334 				OS_FILE_READ_ALLOW_DELETE; the last option is
1335 				used by a backup program reading the file */
1336 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
1337 {
1338 	pfs_os_file_t	file;
1339 
1340 	*success = FALSE;
1341 #ifdef __WIN__
1342 	DWORD		access;
1343 	DWORD		create_flag;
1344 	DWORD		attributes	= 0;
1345 	DWORD		share_mode	= FILE_SHARE_READ;
1346 	ut_a(name);
1347 
1348 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1349 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1350 
1351 	if (create_mode == OS_FILE_OPEN) {
1352 		create_flag = OPEN_EXISTING;
1353 	} else if (srv_read_only_mode) {
1354 		create_flag = OPEN_EXISTING;
1355 	} else if (create_mode == OS_FILE_CREATE) {
1356 		create_flag = CREATE_NEW;
1357 	} else {
1358 
1359 		ib_logf(IB_LOG_LEVEL_ERROR,
1360 			"Unknown file create mode (%lu) for file '%s'",
1361 			create_mode, name);
1362 		file.m_file = (os_file_t)-1;
1363 		return(file);
1364 	}
1365 
1366 	if (access_type == OS_FILE_READ_ONLY) {
1367 		access = GENERIC_READ;
1368 	} else if (srv_read_only_mode) {
1369 		access = GENERIC_READ;
1370 	} else if (access_type == OS_FILE_READ_WRITE) {
1371 		access = GENERIC_READ | GENERIC_WRITE;
1372 	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1373 
1374 		ut_a(!srv_read_only_mode);
1375 
1376 		access = GENERIC_READ;
1377 
1378 		/*!< A backup program has to give mysqld the maximum
1379 		freedom to do what it likes with the file */
1380 
1381 		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
1382 	} else {
1383 		ib_logf(IB_LOG_LEVEL_ERROR,
1384 			"Unknown file access type (%lu) for file '%s'",
1385 			access_type, name);
1386 		file.m_file = (os_file_t)-1;
1387 		return(file);
1388 	}
1389 
1390 	file.m_file = CreateFile((LPCTSTR) name,
1391 			  access,
1392 			  share_mode,
1393 			  NULL,			// Security attributes
1394 			  create_flag,
1395 			  attributes,
1396 			  NULL);		// No template file
1397 
1398 	*success = (file.m_file != INVALID_HANDLE_VALUE);
1399 #else /* __WIN__ */
1400 	int		create_flag;
1401 	const char*	mode_str	= NULL;
1402 	ut_a(name);
1403 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
1404 		WAIT_ALLOW_WRITES();
1405 
1406 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
1407 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
1408 
1409 	if (create_mode == OS_FILE_OPEN) {
1410 
1411 		mode_str = "OPEN";
1412 
1413 		if (access_type == OS_FILE_READ_ONLY) {
1414 
1415 			create_flag = O_RDONLY;
1416 
1417 		} else if (srv_read_only_mode) {
1418 
1419 			create_flag = O_RDONLY;
1420 
1421 		} else {
1422 
1423 			ut_a(access_type == OS_FILE_READ_WRITE
1424 			     || access_type == OS_FILE_READ_ALLOW_DELETE);
1425 
1426 			create_flag = O_RDWR;
1427 		}
1428 
1429 	} else if (srv_read_only_mode) {
1430 
1431 		mode_str = "OPEN";
1432 
1433 		create_flag = O_RDONLY;
1434 
1435 	} else if (create_mode == OS_FILE_CREATE) {
1436 
1437 		mode_str = "CREATE";
1438 
1439 		create_flag = O_RDWR | O_CREAT | O_EXCL;
1440 
1441 	} else {
1442 		ib_logf(IB_LOG_LEVEL_ERROR,
1443 			"Unknown file create mode (%lu) for file '%s'",
1444 			create_mode, name);
1445 		file.m_file = -1;
1446 		return(file);
1447 	}
1448 
1449 	file.m_file = ::open(name, create_flag, os_innodb_umask);
1450 
1451 	*success = file.m_file == -1 ? FALSE : TRUE;
1452 
1453 	/* This function is always called for data files, we should disable
1454 	OS caching (O_DIRECT) here as we do in os_file_create_func(), so
1455 	we open the same file in the same mode, see man page of open(2). */
1456 	if (!srv_read_only_mode
1457 	    && *success
1458 	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
1459 		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
1460 
1461 		os_file_set_nocache(file.m_file, name, mode_str);
1462 	}
1463 
1464 #ifdef USE_FILE_LOCK
1465 	if (!srv_read_only_mode
1466 	    && *success
1467 	    && access_type == OS_FILE_READ_WRITE
1468 	    && os_file_lock(file.m_file, name)) {
1469 
1470 		*success = FALSE;
1471 		close(file.m_file);
1472 		file.m_file = -1;
1473 
1474 	}
1475 #endif /* USE_FILE_LOCK */
1476 
1477 #endif /* __WIN__ */
1478 
1479 	return(file);
1480 }
1481 
1482 /****************************************************************//**
1483 Tries to disable OS caching on an opened file descriptor. */
1484 UNIV_INTERN
1485 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))1486 os_file_set_nocache(
1487 /*================*/
1488 	int		fd		/*!< in: file descriptor to alter */
1489 					MY_ATTRIBUTE((unused)),
1490 	const char*	file_name	/*!< in: used in the diagnostic
1491 					message */
1492 					MY_ATTRIBUTE((unused)),
1493 	const char*	operation_name MY_ATTRIBUTE((unused)))
1494 					/*!< in: "open" or "create"; used
1495 					in the diagnostic message */
1496 {
1497 	/* some versions of Solaris may not have DIRECTIO_ON */
1498 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1499 	if (directio(fd, DIRECTIO_ON) == -1) {
1500 		int	errno_save = errno;
1501 
1502 		ib_logf(IB_LOG_LEVEL_ERROR,
1503 			"Failed to set DIRECTIO_ON on file %s: %s: %s, "
1504 			"continuing anyway.",
1505 			file_name, operation_name, strerror(errno_save));
1506 	}
1507 #elif defined(O_DIRECT)
1508 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1509 		int		errno_save = errno;
1510 		static bool	warning_message_printed = false;
1511 		if (errno_save == EINVAL) {
1512 			if (!warning_message_printed) {
1513 				warning_message_printed = true;
1514 # ifdef UNIV_LINUX
1515 				ib_logf(IB_LOG_LEVEL_WARN,
1516 					"Failed to set O_DIRECT on file "
1517 					"%s: %s: %s, continuing anyway. "
1518 					"O_DIRECT is known to result "
1519 					"in 'Invalid argument' on Linux on "
1520 					"tmpfs, see MySQL Bug#26662.",
1521 					file_name, operation_name,
1522 					strerror(errno_save));
1523 # else /* UNIV_LINUX */
1524 				goto short_warning;
1525 # endif /* UNIV_LINUX */
1526 			}
1527 		} else {
1528 # ifndef UNIV_LINUX
1529 short_warning:
1530 # endif
1531 			ib_logf(IB_LOG_LEVEL_WARN,
1532 				"Failed to set O_DIRECT on file %s: %s: %s, "
1533 				"continuing anyway.",
1534 				file_name, operation_name, strerror(errno_save));
1535 		}
1536 	}
1537 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
1538 }
1539 
1540 /****************************************************************//**
1541 NOTE! Use the corresponding macro os_file_create(), not directly
1542 this function!
1543 Opens an existing file or creates a new.
1544 @return own: handle to the file, not defined if error, error number
1545 can be retrieved with os_file_get_last_error */
1546 UNIV_INTERN
1547 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,ibool * success)1548 os_file_create_func(
1549 /*================*/
1550 	const char*	name,	/*!< in: name of the file or path as a
1551 				null-terminated string */
1552 	ulint		create_mode,/*!< in: create mode */
1553 	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
1554 				non-buffered i/o is desired,
1555 				OS_FILE_NORMAL, if any normal file;
1556 				NOTE that it also depends on type, os_aio_..
1557 				and srv_.. variables whether we really use
1558 				async i/o or unbuffered i/o: look in the
1559 				function source code for the exact rules */
1560 	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
1561 	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
1562 {
1563 	pfs_os_file_t	file;
1564 	ibool		retry;
1565 	ibool		on_error_no_exit;
1566 	ibool		on_error_silent;
1567 #ifdef __WIN__
1568 	DBUG_EXECUTE_IF(
1569 		"ib_create_table_fail_disk_full",
1570 		*success = FALSE;
1571 		SetLastError(ERROR_DISK_FULL);
1572 		file.m_file = (os_file_t)-1;
1573 		return(file);
1574 	);
1575 #else /* __WIN__ */
1576 	DBUG_EXECUTE_IF(
1577 		"ib_create_table_fail_disk_full",
1578 		*success = FALSE;
1579 		errno = ENOSPC;
1580 		file.m_file = -1;
1581 		return(file);
1582 	);
1583 #endif /* __WIN__ */
1584 
1585 #ifdef __WIN__
1586 	DWORD		create_flag;
1587 	DWORD		share_mode	= FILE_SHARE_READ;
1588 
1589 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
1590 		? TRUE : FALSE;
1591 
1592 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
1593 		? TRUE : FALSE;
1594 
1595 	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
1596 	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
1597 
1598 	if (create_mode == OS_FILE_OPEN_RAW) {
1599 
1600 		ut_a(!srv_read_only_mode);
1601 
1602 		create_flag = OPEN_EXISTING;
1603 
1604 		/* On Windows Physical devices require admin privileges and
1605 		have to have the write-share mode set. See the remarks
1606 		section for the CreateFile() function documentation in MSDN. */
1607 
1608 		share_mode |= FILE_SHARE_WRITE;
1609 
1610 	} else if (create_mode == OS_FILE_OPEN
1611 		   || create_mode == OS_FILE_OPEN_RETRY) {
1612 
1613 		create_flag = OPEN_EXISTING;
1614 
1615 	} else if (srv_read_only_mode) {
1616 
1617 		create_flag = OPEN_EXISTING;
1618 
1619 	} else if (create_mode == OS_FILE_CREATE) {
1620 
1621 		create_flag = CREATE_NEW;
1622 
1623 	} else if (create_mode == OS_FILE_OVERWRITE) {
1624 
1625 		create_flag = CREATE_ALWAYS;
1626 
1627 	} else {
1628 		ib_logf(IB_LOG_LEVEL_ERROR,
1629 			"Unknown file create mode (%lu) for file '%s'",
1630 			create_mode, name);
1631 
1632 		file.m_file = (os_file_t)-1;
1633 		return(file);
1634 	}
1635 
1636 	DWORD		attributes = 0;
1637 
1638 #ifdef UNIV_HOTBACKUP
1639 	attributes |= FILE_FLAG_NO_BUFFERING;
1640 #else
1641 	if (purpose == OS_FILE_AIO) {
1642 
1643 #ifdef WIN_ASYNC_IO
1644 		/* If specified, use asynchronous (overlapped) io and no
1645 		buffering of writes in the OS */
1646 
1647 		if (srv_use_native_aio) {
1648 			attributes |= FILE_FLAG_OVERLAPPED;
1649 		}
1650 #endif /* WIN_ASYNC_IO */
1651 
1652 	} else if (purpose == OS_FILE_NORMAL) {
1653 		/* Use default setting. */
1654 	} else {
1655 		ib_logf(IB_LOG_LEVEL_ERROR,
1656 			"Unknown purpose flag (%lu) while opening file '%s'",
1657 			purpose, name);
1658 		file.m_file = (os_file_t)-1;
1659 		return(file);
1660 	}
1661 
1662 #ifdef UNIV_NON_BUFFERED_IO
1663 	// TODO: Create a bug, this looks wrong. The flush log
1664 	// parameter is dynamic.
1665 	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1666 
1667 		/* Do not use unbuffered i/o for the log files because
1668 		value 2 denotes that we do not flush the log at every
1669 		commit, but only once per second */
1670 
1671 	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
1672 
1673 		attributes |= FILE_FLAG_NO_BUFFERING;
1674 	}
1675 #endif /* UNIV_NON_BUFFERED_IO */
1676 
1677 #endif /* UNIV_HOTBACKUP */
1678 	DWORD	access = GENERIC_READ;
1679 
1680 	if (!srv_read_only_mode) {
1681 		access |= GENERIC_WRITE;
1682 	}
1683 
1684 	do {
1685 		/* Use default security attributes and no template file. */
1686 		file.m_file = CreateFile(
1687 			(LPCTSTR) name, access, share_mode, NULL,
1688 			create_flag, attributes, NULL);
1689 
1690 		if (file.m_file == INVALID_HANDLE_VALUE) {
1691 			const char*	operation;
1692 
1693 			operation = (create_mode == OS_FILE_CREATE
1694 				     && !srv_read_only_mode)
1695 				? "create" : "open";
1696 
1697 			*success = FALSE;
1698 
1699 			if (on_error_no_exit) {
1700 				retry = os_file_handle_error_no_exit(
1701 					name, operation, on_error_silent);
1702 			} else {
1703 				retry = os_file_handle_error(name, operation);
1704 			}
1705 		} else {
1706 			*success = TRUE;
1707 			retry = FALSE;
1708 		}
1709 
1710 	} while (retry);
1711 
1712 #else /* __WIN__ */
1713 	int		create_flag;
1714 	const char*	mode_str	= NULL;
1715 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
1716 		WAIT_ALLOW_WRITES();
1717 
1718 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
1719 		? TRUE : FALSE;
1720 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
1721 		? TRUE : FALSE;
1722 
1723 	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
1724 	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
1725 
1726 	if (create_mode == OS_FILE_OPEN
1727 	    || create_mode == OS_FILE_OPEN_RAW
1728 	    || create_mode == OS_FILE_OPEN_RETRY) {
1729 
1730 		mode_str = "OPEN";
1731 
1732 		create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;
1733 
1734 	} else if (srv_read_only_mode) {
1735 
1736 		mode_str = "OPEN";
1737 
1738 		create_flag = O_RDONLY;
1739 
1740 	} else if (create_mode == OS_FILE_CREATE) {
1741 
1742 		mode_str = "CREATE";
1743 		create_flag = O_RDWR | O_CREAT | O_EXCL;
1744 
1745 	} else if (create_mode == OS_FILE_OVERWRITE) {
1746 
1747 		mode_str = "OVERWRITE";
1748 		create_flag = O_RDWR | O_CREAT | O_TRUNC;
1749 
1750 	} else {
1751 		ib_logf(IB_LOG_LEVEL_ERROR,
1752 			"Unknown file create mode (%lu) for file '%s'",
1753 			create_mode, name);
1754 
1755 		file.m_file = -1;
1756 		return(file);
1757 	}
1758 
1759 	ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
1760 	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
1761 
1762 #ifdef O_SYNC
1763 	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
1764 	O_SYNC because the datasync options seemed to corrupt files in 2001
1765 	in both Linux and Solaris */
1766 
1767 	if (!srv_read_only_mode
1768 	    && type == OS_LOG_FILE
1769 	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1770 
1771 		create_flag |= O_SYNC;
1772 	}
1773 #endif /* O_SYNC */
1774 
1775 	do {
1776 		file.m_file = ::open(name, create_flag, os_innodb_umask);
1777 
1778 		if (file.m_file == -1) {
1779 			const char*	operation;
1780 
1781 			operation = (create_mode == OS_FILE_CREATE
1782 				     && !srv_read_only_mode)
1783 				? "create" : "open";
1784 
1785 			*success = FALSE;
1786 
1787 			if (on_error_no_exit) {
1788 				retry = os_file_handle_error_no_exit(
1789 					name, operation, on_error_silent);
1790 			} else {
1791 				retry = os_file_handle_error(name, operation);
1792 			}
1793 		} else {
1794 			*success = TRUE;
1795 			retry = false;
1796 		}
1797 
1798 	} while (retry);
1799 
1800 	/* We disable OS caching (O_DIRECT) only on data files */
1801 
1802 	if (!srv_read_only_mode
1803 	    && *success
1804 	    && type != OS_LOG_FILE
1805 	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
1806 		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
1807 
1808 		os_file_set_nocache(file.m_file, name, mode_str);
1809 	}
1810 
1811 #ifdef USE_FILE_LOCK
1812 	if (!srv_read_only_mode
1813 	    && *success
1814 	    && create_mode != OS_FILE_OPEN_RAW
1815 	    && os_file_lock(file.m_file, name)) {
1816 
1817 		if (create_mode == OS_FILE_OPEN_RETRY) {
1818 
1819 			ut_a(!srv_read_only_mode);
1820 
1821 			ib_logf(IB_LOG_LEVEL_INFO,
1822 				"Retrying to lock the first data file");
1823 
1824 			for (int i = 0; i < 100; i++) {
1825 				os_thread_sleep(1000000);
1826 
1827 				if (!os_file_lock(file.m_file, name)) {
1828 					*success = TRUE;
1829 					return(file);
1830 				}
1831 			}
1832 
1833 			ib_logf(IB_LOG_LEVEL_INFO,
1834 				"Unable to open the first data file");
1835 		}
1836 
1837 		*success = FALSE;
1838 		close(file.m_file);
1839 		file.m_file = -1;
1840 	}
1841 #endif /* USE_FILE_LOCK */
1842 
1843 #endif /* __WIN__ */
1844 
1845 	return(file);
1846 }
1847 
1848 /***********************************************************************//**
1849 Deletes a file if it exists. The file has to be closed before calling this.
1850 @return	TRUE if success */
1851 UNIV_INTERN
1852 bool
os_file_delete_if_exists_func(const char * name)1853 os_file_delete_if_exists_func(
1854 /*==========================*/
1855 	const char*	name)	/*!< in: file path as a null-terminated
1856 				string */
1857 {
1858 #ifdef __WIN__
1859 	bool	ret;
1860 	ulint	count	= 0;
1861 loop:
1862 	/* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
1863 	it */
1864 
1865 	ret = DeleteFile((LPCTSTR) name);
1866 
1867 	if (ret) {
1868 		return(true);
1869 	}
1870 
1871 	DWORD lasterr = GetLastError();
1872 	if (lasterr == ERROR_FILE_NOT_FOUND
1873 	    || lasterr == ERROR_PATH_NOT_FOUND) {
1874 		/* the file does not exist, this not an error */
1875 
1876 		return(true);
1877 	}
1878 
1879 	count++;
1880 
1881 	if (count > 100 && 0 == (count % 10)) {
1882 		os_file_get_last_error(true); /* print error information */
1883 
1884 		ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name);
1885 	}
1886 
1887 	os_thread_sleep(500000);	/* sleep for 0.5 second */
1888 
1889 	if (count > 2000) {
1890 
1891 		return(false);
1892 	}
1893 
1894 	goto loop;
1895 #else
1896 	int	ret;
1897 	WAIT_ALLOW_WRITES();
1898 
1899 	ret = unlink(name);
1900 
1901 	if (ret != 0 && errno != ENOENT) {
1902 		os_file_handle_error_no_exit(name, "delete", FALSE);
1903 
1904 		return(false);
1905 	}
1906 
1907 	return(true);
1908 #endif /* __WIN__ */
1909 }
1910 
1911 /***********************************************************************//**
1912 Deletes a file. The file has to be closed before calling this.
1913 @return	TRUE if success */
1914 UNIV_INTERN
1915 bool
os_file_delete_func(const char * name)1916 os_file_delete_func(
1917 /*================*/
1918 	const char*	name)	/*!< in: file path as a null-terminated
1919 				string */
1920 {
1921 #ifdef __WIN__
1922 	BOOL	ret;
1923 	ulint	count	= 0;
1924 loop:
1925 	/* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
1926 	it */
1927 
1928 	ret = DeleteFile((LPCTSTR) name);
1929 
1930 	if (ret) {
1931 		return(true);
1932 	}
1933 
1934 	if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1935 		/* If the file does not exist, we classify this as a 'mild'
1936 		error and return */
1937 
1938 		return(false);
1939 	}
1940 
1941 	count++;
1942 
1943 	if (count > 100 && 0 == (count % 10)) {
1944 		os_file_get_last_error(true); /* print error information */
1945 
1946 		fprintf(stderr,
1947 			"InnoDB: Warning: cannot delete file %s\n"
1948 			"InnoDB: Are you running mysqlbackup"
1949 			" to back up the file?\n", name);
1950 	}
1951 
1952 	os_thread_sleep(1000000);	/* sleep for a second */
1953 
1954 	if (count > 2000) {
1955 
1956 		return(false);
1957 	}
1958 
1959 	goto loop;
1960 #else
1961 	int	ret;
1962 	WAIT_ALLOW_WRITES();
1963 
1964 	ret = unlink(name);
1965 
1966 	if (ret != 0) {
1967 		os_file_handle_error_no_exit(name, "delete", FALSE);
1968 
1969 		return(false);
1970 	}
1971 
1972 	return(true);
1973 #endif
1974 }
1975 
1976 /***********************************************************************//**
1977 NOTE! Use the corresponding macro os_file_rename(), not directly this function!
1978 Renames a file (can also move it to another directory). It is safest that the
1979 file is closed before calling this function.
1980 @return	TRUE if success */
1981 UNIV_INTERN
1982 ibool
os_file_rename_func(const char * oldpath,const char * newpath)1983 os_file_rename_func(
1984 /*================*/
1985 	const char*	oldpath,/*!< in: old file path as a null-terminated
1986 				string */
1987 	const char*	newpath)/*!< in: new file path */
1988 {
1989 #ifdef UNIV_DEBUG
1990 	os_file_type_t	type;
1991 	ibool		exists;
1992 
1993 	/* New path must not exist. */
1994 	ut_ad(os_file_status(newpath, &exists, &type));
1995 	ut_ad(!exists);
1996 
1997 	/* Old path must exist. */
1998 	ut_ad(os_file_status(oldpath, &exists, &type));
1999 	ut_ad(exists);
2000 #endif /* UNIV_DEBUG */
2001 
2002 #ifdef __WIN__
2003 	BOOL	ret;
2004 
2005 	ret = MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath);
2006 
2007 	if (ret) {
2008 		return(TRUE);
2009 	}
2010 
2011 	os_file_handle_error_no_exit(oldpath, "rename", FALSE);
2012 
2013 	return(FALSE);
2014 #else
2015 	int	ret;
2016 	WAIT_ALLOW_WRITES();
2017 
2018 	ret = rename(oldpath, newpath);
2019 
2020 	if (ret != 0) {
2021 		os_file_handle_error_no_exit(oldpath, "rename", FALSE);
2022 
2023 		return(FALSE);
2024 	}
2025 
2026 	return(TRUE);
2027 #endif /* __WIN__ */
2028 }
2029 
2030 /***********************************************************************//**
2031 NOTE! Use the corresponding macro os_file_close(), not directly this function!
2032 Closes a file handle. In case of error, error number can be retrieved with
2033 os_file_get_last_error.
2034 @return	TRUE if success */
2035 UNIV_INTERN
2036 ibool
os_file_close_func(os_file_t file)2037 os_file_close_func(
2038 /*===============*/
2039 	os_file_t	file)	/*!< in, own: handle to a file */
2040 {
2041 #ifdef __WIN__
2042 	BOOL	ret;
2043 
2044 	ut_a(file);
2045 
2046 	ret = CloseHandle(file);
2047 
2048 	if (ret) {
2049 		return(TRUE);
2050 	}
2051 
2052 	os_file_handle_error(NULL, "close");
2053 
2054 	return(FALSE);
2055 #else
2056 	int	ret;
2057 
2058 	ret = close(file);
2059 
2060 	if (ret == -1) {
2061 		os_file_handle_error(NULL, "close");
2062 
2063 		return(FALSE);
2064 	}
2065 
2066 	return(TRUE);
2067 #endif /* __WIN__ */
2068 }
2069 
#ifdef UNIV_HOTBACKUP
/***********************************************************************//**
Closes a file handle without reporting anything when the close fails; the
caller only learns about a failure through the return value.
@return	TRUE if the handle was closed, FALSE otherwise */
UNIV_INTERN
ibool
os_file_close_no_error_handling(
/*============================*/
	os_file_t	file)	/*!< in, own: handle to a file */
{
#ifdef __WIN__
	ut_a(file);

	return(CloseHandle(file) ? TRUE : FALSE);
#else
	/* close() signals failure with -1. */
	return(close(file) == -1 ? FALSE : TRUE);
#endif /* __WIN__ */
}
#endif /* UNIV_HOTBACKUP */
2106 
2107 /***********************************************************************//**
2108 Gets a file size.
2109 @return	file size, or (os_offset_t) -1 on failure */
2110 UNIV_INTERN
2111 os_offset_t
os_file_get_size(pfs_os_file_t file)2112 os_file_get_size(
2113 /*=============*/
2114 	pfs_os_file_t	file)	/*!< in: handle to a file */
2115 {
2116 #ifdef __WIN__
2117 	os_offset_t	offset;
2118 	DWORD		high;
2119 	DWORD		low;
2120 
2121 	low = GetFileSize(file.m_file, &high);
2122 
2123 	if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
2124 		return((os_offset_t) -1);
2125 	}
2126 
2127 	offset = (os_offset_t) low | ((os_offset_t) high << 32);
2128 
2129 	return(offset);
2130 #else
2131 	return((os_offset_t) lseek(file.m_file, 0, SEEK_END));
2132 
2133 #endif /* __WIN__ */
2134 }
2135 
2136 /***********************************************************************//**
2137 Write the specified number of zeros to a newly created file.
2138 @return	TRUE if success */
2139 UNIV_INTERN
2140 ibool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size)2141 os_file_set_size(
2142 /*=============*/
2143 	const char*	name,	/*!< in: name of the file or path as a
2144 				null-terminated string */
2145 	pfs_os_file_t	file,	/*!< in: handle to a file */
2146 	os_offset_t	size)	/*!< in: file size */
2147 {
2148 	os_offset_t	current_size;
2149 	ibool		ret;
2150 	byte*		buf;
2151 	byte*		buf2;
2152 	ulint		buf_size;
2153 
2154 	current_size = 0;
2155 
2156 	/* Write up to 1 megabyte at a time. */
2157 	buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
2158 		* UNIV_PAGE_SIZE;
2159 	buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
2160 
2161 	/* Align the buffer for possible raw i/o */
2162 	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
2163 
2164 	/* Write buffer full of zeros */
2165 	memset(buf, 0, buf_size);
2166 
2167 	if (size >= (os_offset_t) 100 << 20) {
2168 
2169 		fprintf(stderr, "InnoDB: Progress in MB:");
2170 	}
2171 
2172 	while (current_size < size) {
2173 		ulint	n_bytes;
2174 
2175 		if (size - current_size < (os_offset_t) buf_size) {
2176 			n_bytes = (ulint) (size - current_size);
2177 		} else {
2178 			n_bytes = buf_size;
2179 		}
2180 
2181 		ret = os_file_write(name, file, buf, current_size, n_bytes);
2182 		if (!ret) {
2183 			ut_free(buf2);
2184 			goto error_handling;
2185 		}
2186 
2187 		/* Print about progress for each 100 MB written */
2188 		if ((current_size + n_bytes) / (100 << 20)
2189 		    != current_size / (100 << 20)) {
2190 
2191 			fprintf(stderr, " %lu00",
2192 				(ulong) ((current_size + n_bytes)
2193 					 / (100 << 20)));
2194 		}
2195 
2196 		current_size += n_bytes;
2197 	}
2198 
2199 	if (size >= (os_offset_t) 100 << 20) {
2200 
2201 		fprintf(stderr, "\n");
2202 	}
2203 
2204 	ut_free(buf2);
2205 
2206 	ret = os_file_flush(file);
2207 
2208 	if (ret) {
2209 		return(TRUE);
2210 	}
2211 
2212 error_handling:
2213 	return(FALSE);
2214 }
2215 
2216 /***********************************************************************//**
2217 Truncates a file at its current position.
2218 @return	TRUE if success */
2219 UNIV_INTERN
2220 ibool
os_file_set_eof(FILE * file)2221 os_file_set_eof(
2222 /*============*/
2223 	FILE*		file)	/*!< in: file to be truncated */
2224 {
2225 #ifdef __WIN__
2226 	HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
2227 	return(SetEndOfFile(h));
2228 #else /* __WIN__ */
2229 	WAIT_ALLOW_WRITES();
2230 	return(!ftruncate(fileno(file), ftell(file)));
2231 #endif /* __WIN__ */
2232 }
2233 
2234 #ifndef __WIN__
2235 /***********************************************************************//**
2236 Wrapper to fsync(2) that retries the call on some errors.
2237 Returns the value 0 if successful; otherwise the value -1 is returned and
2238 the global variable errno is set to indicate the error.
2239 @return	0 if success, -1 otherwise */
2240 
2241 static
2242 int
os_file_fsync(os_file_t file)2243 os_file_fsync(
2244 /*==========*/
2245 	os_file_t	file)	/*!< in: handle to a file */
2246 {
2247 	int	ret;
2248 	int	failures;
2249 	ibool	retry;
2250 
2251 	failures = 0;
2252 
2253 	do {
2254 		ret = fsync(file);
2255 
2256 		os_n_fsyncs++;
2257 
2258 		if (ret == -1 && errno == ENOLCK) {
2259 
2260 			if (failures % 100 == 0) {
2261 
2262 				ut_print_timestamp(stderr);
2263 				fprintf(stderr,
2264 					" InnoDB: fsync(): "
2265 					"No locks available; retrying\n");
2266 			}
2267 
2268 			os_thread_sleep(200000 /* 0.2 sec */);
2269 
2270 			failures++;
2271 
2272 			retry = TRUE;
2273 		} else {
2274 
2275 			retry = FALSE;
2276 		}
2277 	} while (retry);
2278 
2279 	return(ret);
2280 }
2281 #endif /* !__WIN__ */
2282 
2283 /***********************************************************************//**
2284 NOTE! Use the corresponding macro os_file_flush(), not directly this function!
2285 Flushes the write buffers of a given file to the disk.
2286 @return	TRUE if success */
2287 UNIV_INTERN
2288 ibool
os_file_flush_func(os_file_t file)2289 os_file_flush_func(
2290 /*===============*/
2291 	os_file_t	file)	/*!< in, own: handle to a file */
2292 {
2293 #ifdef __WIN__
2294 	BOOL	ret;
2295 
2296 	ut_a(file);
2297 
2298 	os_n_fsyncs++;
2299 
2300 	ret = FlushFileBuffers(file);
2301 
2302 	if (ret) {
2303 		return(TRUE);
2304 	}
2305 
2306 	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
2307 	actually a raw device, we choose to ignore that error if we are using
2308 	raw disks */
2309 
2310 	if (srv_start_raw_disk_in_use && GetLastError()
2311 	    == ERROR_INVALID_FUNCTION) {
2312 		return(TRUE);
2313 	}
2314 
2315 	os_file_handle_error(NULL, "flush");
2316 
2317 	/* It is a fatal error if a file flush does not succeed, because then
2318 	the database can get corrupt on disk */
2319 	ut_error;
2320 
2321 	return(FALSE);
2322 #else
2323 	int	ret;
2324 	WAIT_ALLOW_WRITES();
2325 
2326 #if defined(HAVE_DARWIN_THREADS)
2327 # ifndef F_FULLFSYNC
2328 	/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2329 #  define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2330 # elif F_FULLFSYNC != 51
2331 #  error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2332 # endif
2333 	/* Apple has disabled fsync() for internal disk drives in OS X. That
2334 	caused corruption for a user when he tested a power outage. Let us in
2335 	OS X use a nonstandard flush method recommended by an Apple
2336 	engineer. */
2337 
2338 	if (!srv_have_fullfsync) {
2339 		/* If we are not on an operating system that supports this,
2340 		then fall back to a plain fsync. */
2341 
2342 		ret = os_file_fsync(file);
2343 	} else {
2344 		ret = fcntl(file, F_FULLFSYNC, NULL);
2345 
2346 		if (ret) {
2347 			/* If we are not on a file system that supports this,
2348 			then fall back to a plain fsync. */
2349 			ret = os_file_fsync(file);
2350 		}
2351 	}
2352 #else
2353 	ret = os_file_fsync(file);
2354 #endif
2355 
2356 	if (ret == 0) {
2357 		return(TRUE);
2358 	}
2359 
2360 	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
2361 	we choose to ignore that error if we are using raw disks */
2362 
2363 	if (srv_start_raw_disk_in_use && errno == EINVAL) {
2364 
2365 		return(TRUE);
2366 	}
2367 
2368 	ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
2369 
2370 	os_file_handle_error(NULL, "flush");
2371 
2372 	/* It is a fatal error if a file flush does not succeed, because then
2373 	the database can get corrupt on disk */
2374 	ut_error;
2375 
2376 	return(FALSE);
2377 #endif
2378 }
2379 
2380 #ifndef __WIN__
2381 /*******************************************************************//**
2382 Does a synchronous read operation in Posix.
2383 @return	number of bytes read, -1 if error */
2384 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2385 ssize_t
os_file_pread(os_file_t file,void * buf,ulint n,os_offset_t offset)2386 os_file_pread(
2387 /*==========*/
2388 	os_file_t	file,	/*!< in: handle to a file */
2389 	void*		buf,	/*!< in: buffer where to read */
2390 	ulint		n,	/*!< in: number of bytes to read */
2391 	os_offset_t	offset)	/*!< in: file offset from where to read */
2392 {
2393 	off_t	offs;
2394 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2395 	ssize_t	n_bytes;
2396 #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2397 
2398 	ut_ad(n);
2399 
2400 	/* If off_t is > 4 bytes in size, then we assume we can pass a
2401 	64-bit address */
2402 	offs = (off_t) offset;
2403 
2404 	if (sizeof(off_t) <= 4) {
2405 		if (offset != (os_offset_t) offs) {
2406 			ib_logf(IB_LOG_LEVEL_ERROR,
2407 				"File read at offset > 4 GB");
2408 		}
2409 	}
2410 
2411 	os_n_file_reads++;
2412 
2413 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2414 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2415 	(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2416 	(void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1);
2417 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2418 #else
2419 	os_mutex_enter(os_file_count_mutex);
2420 	os_file_n_pending_preads++;
2421 	os_n_pending_reads++;
2422 	MONITOR_INC(MONITOR_OS_PENDING_READS);
2423 	os_mutex_exit(os_file_count_mutex);
2424 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2425 
2426 	n_bytes = pread(file, buf, n, offs);
2427 
2428 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2429 	(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2430 	(void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1);
2431 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
2432 #else
2433 	os_mutex_enter(os_file_count_mutex);
2434 	os_file_n_pending_preads--;
2435 	os_n_pending_reads--;
2436 	MONITOR_DEC(MONITOR_OS_PENDING_READS);
2437 	os_mutex_exit(os_file_count_mutex);
2438 #endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD == 8 */
2439 
2440 	return(n_bytes);
2441 #else
2442 	{
2443 		off_t	ret_offset;
2444 		ssize_t	ret;
2445 #ifndef UNIV_HOTBACKUP
2446 		ulint	i;
2447 #endif /* !UNIV_HOTBACKUP */
2448 
2449 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2450 		(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
2451 		MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
2452 #else
2453 		os_mutex_enter(os_file_count_mutex);
2454 		os_n_pending_reads++;
2455 		MONITOR_INC(MONITOR_OS_PENDING_READS);
2456 		os_mutex_exit(os_file_count_mutex);
2457 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
2458 #ifndef UNIV_HOTBACKUP
2459 		/* Protect the seek / read operation with a mutex */
2460 		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2461 
2462 		os_mutex_enter(os_file_seek_mutexes[i]);
2463 #endif /* !UNIV_HOTBACKUP */
2464 
2465 		ret_offset = lseek(file, offs, SEEK_SET);
2466 
2467 		if (ret_offset < 0) {
2468 			ret = -1;
2469 		} else {
2470 			ret = read(file, buf, (ssize_t) n);
2471 		}
2472 
2473 #ifndef UNIV_HOTBACKUP
2474 		os_mutex_exit(os_file_seek_mutexes[i]);
2475 #endif /* !UNIV_HOTBACKUP */
2476 
2477 #if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
2478 		(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
2479 		MONITOR_ATOIC_DEC(MONITOR_OS_PENDING_READS);
2480 #else
2481 		os_mutex_enter(os_file_count_mutex);
2482 		os_n_pending_reads--;
2483 		MONITOR_DEC(MONITOR_OS_PENDING_READS);
2484 		os_mutex_exit(os_file_count_mutex);
2485 #endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
2486 
2487 		return(ret);
2488 	}
2489 #endif
2490 }
2491 
/*******************************************************************//**
Does a synchronous write operation in Posix. Uses pwrite() when HAVE_PWRITE
is defined and HAVE_BROKEN_PREAD is not; otherwise falls back to lseek()
followed by write(), serialized by one of the os_file_seek_mutexes (unless
UNIV_HOTBACKUP is defined). The pending-write counters are incremented for
the duration of the call.
NOTE(review): when off_t is at most 4 bytes and the offset does not fit, an
error is logged but the write still proceeds with the truncated offset.
@return	number of bytes written, -1 if error */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
ssize_t
os_file_pwrite(
/*===========*/
	os_file_t	file,	/*!< in: handle to a file */
	const void*	buf,	/*!< in: buffer from where to write */
	ulint		n,	/*!< in: number of bytes to write */
	os_offset_t	offset)	/*!< in: file offset where to write */
{
	ssize_t	ret;
	off_t	offs;

	ut_ad(n);
	ut_ad(!srv_read_only_mode);

	/* If off_t is > 4 bytes in size, then we assume we can pass a
	64-bit address */
	offs = (off_t) offset;

	if (sizeof(off_t) <= 4) {
		if (offset != (os_offset_t) offs) {
			ib_logf(IB_LOG_LEVEL_ERROR,
				"File write at offset > 4 GB.");
		}
	}

	os_n_file_writes++;

#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
	/* Count the write as pending: under os_file_count_mutex when
	atomics are unavailable or the word size is below 8, with atomic
	increments otherwise. */
#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
	os_mutex_enter(os_file_count_mutex);
	os_file_n_pending_pwrites++;
	os_n_pending_writes++;
	MONITOR_INC(MONITOR_OS_PENDING_WRITES);
	os_mutex_exit(os_file_count_mutex);
#else
	(void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
	(void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1);
	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */

	ret = pwrite(file, buf, (ssize_t) n, offs);

	/* The write is no longer pending; undo the accounting above. */
#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
	os_mutex_enter(os_file_count_mutex);
	os_file_n_pending_pwrites--;
	os_n_pending_writes--;
	MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
	os_mutex_exit(os_file_count_mutex);
#else
	(void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
	(void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1);
	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */

	return(ret);
#else
	{
		off_t	ret_offset;
# ifndef UNIV_HOTBACKUP
		ulint	i;
# endif /* !UNIV_HOTBACKUP */

		/* This fallback always does its pending-write accounting
		under os_file_count_mutex. */
		os_mutex_enter(os_file_count_mutex);
		os_n_pending_writes++;
		MONITOR_INC(MONITOR_OS_PENDING_WRITES);
		os_mutex_exit(os_file_count_mutex);

# ifndef UNIV_HOTBACKUP
		/* Protect the seek / write operation with a mutex */
		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;

		os_mutex_enter(os_file_seek_mutexes[i]);
# endif /* !UNIV_HOTBACKUP */

		ret_offset = lseek(file, offs, SEEK_SET);

		if (ret_offset < 0) {
			/* Positioning failed: report an error without
			attempting the write. */
			ret = -1;

			goto func_exit;
		}

		ret = write(file, buf, (ssize_t) n);

func_exit:
# ifndef UNIV_HOTBACKUP
		os_mutex_exit(os_file_seek_mutexes[i]);
# endif /* !UNIV_HOTBACKUP */

		os_mutex_enter(os_file_count_mutex);
		os_n_pending_writes--;
		MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
		os_mutex_exit(os_file_count_mutex);

		return(ret);
	}
#endif /* HAVE_PWRITE && !HAVE_BROKEN_PREAD */
}
2594 #endif
2595 
/*******************************************************************//**
NOTE! Use the corresponding macro os_file_read(), not directly this
function!
Requests a synchronous positioned read operation. The request succeeds only
when exactly n bytes were read. On any other outcome os_file_handle_error()
is consulted: when it asks for a retry the whole read is attempted again,
otherwise a fatal error message is printed to stderr and ut_error is
raised.
@return	TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
os_file_read_func(
/*==============*/
	os_file_t	file,	/*!< in: handle to a file */
	void*		buf,	/*!< in: buffer where to read */
	os_offset_t	offset,	/*!< in: file offset where to read */
	ulint		n)	/*!< in: number of bytes to read */
{
#ifdef __WIN__
	BOOL		ret;
	DWORD		len;
	DWORD		ret2;
	DWORD		low;
	DWORD		high;
	ibool		retry;
#ifndef UNIV_HOTBACKUP
	ulint		i;
#endif /* !UNIV_HOTBACKUP */

	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
	no more than 32 bits. */
	ut_a((n & 0xFFFFFFFFUL) == n);

	/* These statistics are updated once per request; the retry label
	below is placed after them. */
	os_n_file_reads++;
	os_bytes_read_since_printout += n;

try_again:
	ut_ad(file);
	ut_ad(buf);
	ut_ad(n > 0);

	/* Split the 64-bit offset into the two halves that
	SetFilePointer() takes. */
	low = (DWORD) offset & 0xFFFFFFFF;
	high = (DWORD) (offset >> 32);

	os_mutex_enter(os_file_count_mutex);
	os_n_pending_reads++;
	MONITOR_INC(MONITOR_OS_PENDING_READS);
	os_mutex_exit(os_file_count_mutex);

#ifndef UNIV_HOTBACKUP
	/* Protect the seek / read operation with a mutex */
	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;

	os_mutex_enter(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

	ret2 = SetFilePointer(
		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);

	/* 0xFFFFFFFF can also be a valid low word of the new position, so
	the last error is checked as well. */
	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {

#ifndef UNIV_HOTBACKUP
		os_mutex_exit(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

		os_mutex_enter(os_file_count_mutex);
		os_n_pending_reads--;
		MONITOR_DEC(MONITOR_OS_PENDING_READS);
		os_mutex_exit(os_file_count_mutex);

		goto error_handling;
	}

	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);

#ifndef UNIV_HOTBACKUP
	os_mutex_exit(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

	os_mutex_enter(os_file_count_mutex);
	os_n_pending_reads--;
	MONITOR_DEC(MONITOR_OS_PENDING_READS);
	os_mutex_exit(os_file_count_mutex);

	if (ret && len == n) {
		return(TRUE);
	}
#else /* __WIN__ */
	ibool	retry;
	ssize_t	ret;

	os_bytes_read_since_printout += n;

try_again:
	ret = os_file_pread(file, buf, n, offset);

	if ((ulint) ret == n) {
		return(TRUE);
	} else if (ret == -1) {
                ib_logf(IB_LOG_LEVEL_ERROR,
			"Error in system call pread(). The operating"
			" system error number is %lu.",(ulint) errno);
        } else {
		/* Partial read occurred */
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Tried to read " ULINTPF " bytes at offset "
			UINT64PF ". Was only able to read %ld.",
			n, offset, (lint) ret);
	}
#endif /* __WIN__ */
#ifdef __WIN__
error_handling:
#endif
	/* Both platforms end up here after a failed or short read. */
	retry = os_file_handle_error(NULL, "read");

	if (retry) {
		goto try_again;
	}

	fprintf(stderr,
		"InnoDB: Fatal error: cannot read from file."
		" OS error number %lu.\n",
#ifdef __WIN__
		(ulong) GetLastError()
#else
		(ulong) errno
#endif /* __WIN__ */
		);
	fflush(stderr);

	ut_error;

	return(FALSE);
}
2726 
/*******************************************************************//**
NOTE! Use the corresponding macro os_file_read_no_error_handling(),
not directly this function!
Requests a synchronous positioned read operation. Unlike os_file_read_func()
this function does not raise a fatal error when the read fails: the failure
is passed to os_file_handle_error_no_exit(), the read is attempted again
when that function asks for a retry, and otherwise FALSE is returned. On
non-Windows platforms a failed or partial read is also written to the error
log.
@return	TRUE if request was successful, FALSE if fail */
UNIV_INTERN
ibool
os_file_read_no_error_handling_func(
/*================================*/
	os_file_t	file,	/*!< in: handle to a file */
	void*		buf,	/*!< in: buffer where to read */
	os_offset_t	offset,	/*!< in: file offset where to read */
	ulint		n)	/*!< in: number of bytes to read */
{
#ifdef __WIN__
	BOOL		ret;
	DWORD		len;
	DWORD		ret2;
	DWORD		low;
	DWORD		high;
	ibool		retry;
#ifndef UNIV_HOTBACKUP
	ulint		i;
#endif /* !UNIV_HOTBACKUP */

	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
	no more than 32 bits. */
	ut_a((n & 0xFFFFFFFFUL) == n);

	/* These statistics are updated once per request; the retry label
	below is placed after them. */
	os_n_file_reads++;
	os_bytes_read_since_printout += n;

try_again:
	ut_ad(file);
	ut_ad(buf);
	ut_ad(n > 0);

	/* Split the 64-bit offset into the two halves that
	SetFilePointer() takes. */
	low = (DWORD) offset & 0xFFFFFFFF;
	high = (DWORD) (offset >> 32);

	os_mutex_enter(os_file_count_mutex);
	os_n_pending_reads++;
	MONITOR_INC(MONITOR_OS_PENDING_READS);
	os_mutex_exit(os_file_count_mutex);

#ifndef UNIV_HOTBACKUP
	/* Protect the seek / read operation with a mutex */
	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;

	os_mutex_enter(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

	ret2 = SetFilePointer(
		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);

	/* 0xFFFFFFFF can also be a valid low word of the new position, so
	the last error is checked as well. */
	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {

#ifndef UNIV_HOTBACKUP
		os_mutex_exit(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

		os_mutex_enter(os_file_count_mutex);
		os_n_pending_reads--;
		MONITOR_DEC(MONITOR_OS_PENDING_READS);
		os_mutex_exit(os_file_count_mutex);

		goto error_handling;
	}

	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);

#ifndef UNIV_HOTBACKUP
	os_mutex_exit(os_file_seek_mutexes[i]);
#endif /* !UNIV_HOTBACKUP */

	os_mutex_enter(os_file_count_mutex);
	os_n_pending_reads--;
	MONITOR_DEC(MONITOR_OS_PENDING_READS);
	os_mutex_exit(os_file_count_mutex);

	if (ret && len == n) {
		return(TRUE);
	}
#else /* __WIN__ */
	ibool	retry;
	ssize_t	ret;

	os_bytes_read_since_printout += n;

try_again:
	ret = os_file_pread(file, buf, n, offset);

	if ((ulint) ret == n) {
		return(TRUE);
	} else if (ret == -1) {
                ib_logf(IB_LOG_LEVEL_ERROR,
			"Error in system call pread(). The operating"
			" system error number is %lu.",(ulint) errno);
        } else {
		/* Partial read occurred */
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Tried to read " ULINTPF " bytes at offset "
			UINT64PF ". Was only able to read %ld.",
			n, offset, (lint) ret);
	}
#endif /* __WIN__ */
#ifdef __WIN__
error_handling:
#endif
	/* Both platforms end up here after a failed or short read. */
	retry = os_file_handle_error_no_exit(NULL, "read", FALSE);

	if (retry) {
		goto try_again;
	}

	return(FALSE);
}
2845 
2846 /*******************************************************************//**
2847 Rewind file to its start, read at most size - 1 bytes from it to str, and
2848 NUL-terminate str. All errors are silently ignored. This function is
2849 mostly meant to be used with temporary files. */
2850 UNIV_INTERN
2851 void
os_file_read_string(FILE * file,char * str,ulint size)2852 os_file_read_string(
2853 /*================*/
2854 	FILE*	file,	/*!< in: file to read from */
2855 	char*	str,	/*!< in: buffer where to read */
2856 	ulint	size)	/*!< in: size of buffer */
2857 {
2858 	size_t	flen;
2859 
2860 	if (size == 0) {
2861 		return;
2862 	}
2863 
2864 	rewind(file);
2865 	flen = fread(str, 1, size - 1, file);
2866 	str[flen] = '\0';
2867 }
2868 
2869 /*******************************************************************//**
2870 NOTE! Use the corresponding macro os_file_write(), not directly
2871 this function!
2872 Requests a synchronous write operation.
2873 @return	TRUE if request was successful, FALSE if fail */
2874 UNIV_INTERN
2875 ibool
os_file_write_func(const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)2876 os_file_write_func(
2877 /*===============*/
2878 	const char*	name,	/*!< in: name of the file or path as a
2879 				null-terminated string */
2880 	os_file_t	file,	/*!< in: handle to a file */
2881 	const void*	buf,	/*!< in: buffer from which to write */
2882 	os_offset_t	offset,	/*!< in: file offset where to write */
2883 	ulint		n)	/*!< in: number of bytes to write */
2884 {
2885 	ut_ad(!srv_read_only_mode);
2886 
2887 #ifdef __WIN__
2888 	BOOL		ret;
2889 	DWORD		len;
2890 	DWORD		ret2;
2891 	DWORD		low;
2892 	DWORD		high;
2893 	ulint		n_retries	= 0;
2894 	ulint		err;
2895 #ifndef UNIV_HOTBACKUP
2896 	ulint		i;
2897 #endif /* !UNIV_HOTBACKUP */
2898 
2899 	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2900 	no more than 32 bits. */
2901 	ut_a((n & 0xFFFFFFFFUL) == n);
2902 
2903 	os_n_file_writes++;
2904 
2905 	ut_ad(file);
2906 	ut_ad(buf);
2907 	ut_ad(n > 0);
2908 retry:
2909 	low = (DWORD) offset & 0xFFFFFFFF;
2910 	high = (DWORD) (offset >> 32);
2911 
2912 	os_mutex_enter(os_file_count_mutex);
2913 	os_n_pending_writes++;
2914 	MONITOR_INC(MONITOR_OS_PENDING_WRITES);
2915 	os_mutex_exit(os_file_count_mutex);
2916 
2917 #ifndef UNIV_HOTBACKUP
2918 	/* Protect the seek / write operation with a mutex */
2919 	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2920 
2921 	os_mutex_enter(os_file_seek_mutexes[i]);
2922 #endif /* !UNIV_HOTBACKUP */
2923 
2924 	ret2 = SetFilePointer(
2925 		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
2926 
2927 	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2928 
2929 #ifndef UNIV_HOTBACKUP
2930 		os_mutex_exit(os_file_seek_mutexes[i]);
2931 #endif /* !UNIV_HOTBACKUP */
2932 
2933 		os_mutex_enter(os_file_count_mutex);
2934 		os_n_pending_writes--;
2935 		MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2936 		os_mutex_exit(os_file_count_mutex);
2937 
2938 		ut_print_timestamp(stderr);
2939 
2940 		fprintf(stderr,
2941 			" InnoDB: Error: File pointer positioning to"
2942 			" file %s failed at\n"
2943 			"InnoDB: offset %llu. Operating system"
2944 			" error number %lu.\n"
2945 			"InnoDB: Some operating system error numbers"
2946 			" are described at\n"
2947 			"InnoDB: "
2948 			REFMAN "operating-system-error-codes.html\n",
2949 			name, offset, (ulong) GetLastError());
2950 
2951 		return(FALSE);
2952 	}
2953 
2954 	ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2955 
2956 #ifndef UNIV_HOTBACKUP
2957 	os_mutex_exit(os_file_seek_mutexes[i]);
2958 #endif /* !UNIV_HOTBACKUP */
2959 
2960 	os_mutex_enter(os_file_count_mutex);
2961 	os_n_pending_writes--;
2962 	MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
2963 	os_mutex_exit(os_file_count_mutex);
2964 
2965 	if (ret && len == n) {
2966 
2967 		return(TRUE);
2968 	}
2969 
2970 	/* If some background file system backup tool is running, then, at
2971 	least in Windows 2000, we may get here a specific error. Let us
2972 	retry the operation 100 times, with 1 second waits. */
2973 
2974 	if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2975 
2976 		os_thread_sleep(1000000);
2977 
2978 		n_retries++;
2979 
2980 		goto retry;
2981 	}
2982 
2983 	if (!os_has_said_disk_full) {
2984 
2985 		err = (ulint) GetLastError();
2986 
2987 		ut_print_timestamp(stderr);
2988 
2989 		fprintf(stderr,
2990 			" InnoDB: Error: Write to file %s failed"
2991 			" at offset %llu.\n"
2992 			"InnoDB: %lu bytes should have been written,"
2993 			" only %lu were written.\n"
2994 			"InnoDB: Operating system error number %lu.\n"
2995 			"InnoDB: Check that your OS and file system"
2996 			" support files of this size.\n"
2997 			"InnoDB: Check also that the disk is not full"
2998 			" or a disk quota exceeded.\n",
2999 			name, offset,
3000 			(ulong) n, (ulong) len, (ulong) err);
3001 
3002 		if (strerror((int) err) != NULL) {
3003 			fprintf(stderr,
3004 				"InnoDB: Error number %lu means '%s'.\n",
3005 				(ulong) err, strerror((int) err));
3006 		}
3007 
3008 		fprintf(stderr,
3009 			"InnoDB: Some operating system error numbers"
3010 			" are described at\n"
3011 			"InnoDB: "
3012 			REFMAN "operating-system-error-codes.html\n");
3013 
3014 		os_has_said_disk_full = TRUE;
3015 	}
3016 
3017 	return(FALSE);
3018 #else
3019 	ssize_t	ret;
3020 	WAIT_ALLOW_WRITES();
3021 
3022 	ret = os_file_pwrite(file, buf, n, offset);
3023 
3024 	if ((ulint) ret == n) {
3025 
3026 		return(TRUE);
3027 	}
3028 
3029 	if (!os_has_said_disk_full) {
3030 
3031 		ut_print_timestamp(stderr);
3032 
3033 		if(ret == -1) {
3034 			ib_logf(IB_LOG_LEVEL_ERROR,
3035 				"Failure of system call pwrite(). Operating"
3036 				" system error number is %lu.",
3037 				(ulint) errno);
3038 		} else {
3039 			fprintf(stderr,
3040 				" InnoDB: Error: Write to file %s failed"
3041 				" at offset " UINT64PF ".\n"
3042 				"InnoDB: %lu bytes should have been written,"
3043 				" only %ld were written.\n"
3044 				"InnoDB: Operating system error number %lu.\n"
3045 				"InnoDB: Check that your OS and file system"
3046 				" support files of this size.\n"
3047 				"InnoDB: Check also that the disk is not full"
3048 				" or a disk quota exceeded.\n",
3049 				name, offset, n, (lint) ret,
3050 				(ulint) errno);
3051 		}
3052 
3053 		if (strerror(errno) != NULL) {
3054 			fprintf(stderr,
3055 				"InnoDB: Error number %d means '%s'.\n",
3056 				errno, strerror(errno));
3057 		}
3058 
3059 		fprintf(stderr,
3060 			"InnoDB: Some operating system error numbers"
3061 			" are described at\n"
3062 			"InnoDB: "
3063 			REFMAN "operating-system-error-codes.html\n");
3064 
3065 		os_has_said_disk_full = TRUE;
3066 	}
3067 
3068 	return(FALSE);
3069 #endif
3070 }
3071 
3072 /*******************************************************************//**
3073 Check the existence and type of the given file.
3074 @return	TRUE if call succeeded */
3075 UNIV_INTERN
3076 ibool
os_file_status(const char * path,ibool * exists,os_file_type_t * type)3077 os_file_status(
3078 /*===========*/
3079 	const char*	path,	/*!< in: pathname of the file */
3080 	ibool*		exists,	/*!< out: TRUE if file exists */
3081 	os_file_type_t* type)	/*!< out: type of the file (if it exists) */
3082 {
3083 #ifdef __WIN__
3084 	int		ret;
3085 	struct _stat64	statinfo;
3086 
3087 	ret = _stat64(path, &statinfo);
3088 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3089 		/* file does not exist */
3090 		*exists = FALSE;
3091 		return(TRUE);
3092 	} else if (ret) {
3093 		/* file exists, but stat call failed */
3094 
3095 		os_file_handle_error_no_exit(path, "stat", FALSE);
3096 
3097 		return(FALSE);
3098 	}
3099 
3100 	if (_S_IFDIR & statinfo.st_mode) {
3101 		*type = OS_FILE_TYPE_DIR;
3102 	} else if (_S_IFREG & statinfo.st_mode) {
3103 		*type = OS_FILE_TYPE_FILE;
3104 	} else {
3105 		*type = OS_FILE_TYPE_UNKNOWN;
3106 	}
3107 
3108 	*exists = TRUE;
3109 
3110 	return(TRUE);
3111 #else
3112 	int		ret;
3113 	struct stat	statinfo;
3114 
3115 	ret = stat(path, &statinfo);
3116 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3117 		/* file does not exist */
3118 		*exists = FALSE;
3119 		return(TRUE);
3120 	} else if (ret) {
3121 		/* file exists, but stat call failed */
3122 
3123 		os_file_handle_error_no_exit(path, "stat", FALSE);
3124 
3125 		return(FALSE);
3126 	}
3127 
3128 	if (S_ISDIR(statinfo.st_mode)) {
3129 		*type = OS_FILE_TYPE_DIR;
3130 	} else if (S_ISLNK(statinfo.st_mode)) {
3131 		*type = OS_FILE_TYPE_LINK;
3132 	} else if (S_ISREG(statinfo.st_mode)) {
3133 		*type = OS_FILE_TYPE_FILE;
3134 	} else {
3135 		*type = OS_FILE_TYPE_UNKNOWN;
3136 	}
3137 
3138 	*exists = TRUE;
3139 
3140 	return(TRUE);
3141 #endif
3142 }
3143 
3144 /*******************************************************************//**
3145 This function returns information about the specified file
3146 @return	DB_SUCCESS if all OK */
3147 UNIV_INTERN
3148 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm)3149 os_file_get_status(
3150 /*===============*/
3151 	const char*	path,		/*!< in:	pathname of the file */
3152 	os_file_stat_t* stat_info,	/*!< information of a file in a
3153 					directory */
3154 	bool		check_rw_perm)	/*!< in: for testing whether the
3155 					file can be opened in RW mode */
3156 {
3157 	int		ret;
3158 
3159 #ifdef __WIN__
3160 	struct _stat64	statinfo;
3161 
3162 	ret = _stat64(path, &statinfo);
3163 
3164 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3165 		/* file does not exist */
3166 
3167 		return(DB_NOT_FOUND);
3168 
3169 	} else if (ret) {
3170 		/* file exists, but stat call failed */
3171 
3172 		os_file_handle_error_no_exit(path, "stat", FALSE);
3173 
3174 		return(DB_FAIL);
3175 
3176 	} else if (_S_IFDIR & statinfo.st_mode) {
3177 		stat_info->type = OS_FILE_TYPE_DIR;
3178 	} else if (_S_IFREG & statinfo.st_mode) {
3179 
3180 		DWORD	access = GENERIC_READ;
3181 
3182 		if (!srv_read_only_mode) {
3183 			access |= GENERIC_WRITE;
3184 		}
3185 
3186 		stat_info->type = OS_FILE_TYPE_FILE;
3187 
3188 		/* Check if we can open it in read-only mode. */
3189 
3190 		if (check_rw_perm) {
3191 			HANDLE	fh;
3192 
3193 			fh = CreateFile(
3194 				(LPCTSTR) path,		// File to open
3195 				access,
3196 				0,			// No sharing
3197 				NULL,			// Default security
3198 				OPEN_EXISTING,		// Existing file only
3199 				FILE_ATTRIBUTE_NORMAL,	// Normal file
3200 				NULL);			// No attr. template
3201 
3202 			if (fh == INVALID_HANDLE_VALUE) {
3203 				stat_info->rw_perm = false;
3204 			} else {
3205 				stat_info->rw_perm = true;
3206 				CloseHandle(fh);
3207 			}
3208 		}
3209 	} else {
3210 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
3211 	}
3212 #else
3213 	struct stat	statinfo;
3214 
3215 	ret = stat(path, &statinfo);
3216 
3217 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3218 		/* file does not exist */
3219 
3220 		return(DB_NOT_FOUND);
3221 
3222 	} else if (ret) {
3223 		/* file exists, but stat call failed */
3224 
3225 		os_file_handle_error_no_exit(path, "stat", FALSE);
3226 
3227 		return(DB_FAIL);
3228 
3229 	}
3230 
3231 	switch (statinfo.st_mode & S_IFMT) {
3232 	case S_IFDIR:
3233 		stat_info->type = OS_FILE_TYPE_DIR;
3234 		break;
3235 	case S_IFLNK:
3236 		stat_info->type = OS_FILE_TYPE_LINK;
3237 		break;
3238 	case S_IFBLK:
3239 		/* Handle block device as regular file. */
3240 	case S_IFCHR:
3241 		/* Handle character device as regular file. */
3242 	case S_IFREG:
3243 		stat_info->type = OS_FILE_TYPE_FILE;
3244 		break;
3245 	default:
3246 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
3247 	}
3248 
3249 
3250 	if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) {
3251 
3252 		int	fh;
3253 		int	access;
3254 
3255 		access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
3256 
3257 		fh = ::open(path, access, os_innodb_umask);
3258 
3259 		if (fh == -1) {
3260 			stat_info->rw_perm = false;
3261 		} else {
3262 			stat_info->rw_perm = true;
3263 			close(fh);
3264 		}
3265 	}
3266 
3267 #endif /* _WIN_ */
3268 
3269 	stat_info->ctime = statinfo.st_ctime;
3270 	stat_info->atime = statinfo.st_atime;
3271 	stat_info->mtime = statinfo.st_mtime;
3272 	stat_info->size  = statinfo.st_size;
3273 
3274 	return(DB_SUCCESS);
3275 }
3276 
/* Path name separator character: a backslash when __WIN__ is defined,
a forward slash otherwise. */
#ifdef __WIN__
#  define OS_FILE_PATH_SEPARATOR	'\\'
#else
#  define OS_FILE_PATH_SEPARATOR	'/'
#endif
3283 
3284 /****************************************************************//**
3285 This function returns a new path name after replacing the basename
3286 in an old path with a new basename.  The old_path is a full path
3287 name including the extension.  The tablename is in the normal
3288 form "databasename/tablename".  The new base name is found after
3289 the forward slash.  Both input strings are null terminated.
3290 
3291 This function allocates memory to be returned.  It is the callers
3292 responsibility to free the return value after it is no longer needed.
3293 
3294 @return	own: new full pathname */
3295 UNIV_INTERN
3296 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)3297 os_file_make_new_pathname(
3298 /*======================*/
3299 	const char*	old_path,	/*!< in: pathname */
3300 	const char*	tablename)	/*!< in: contains new base name */
3301 {
3302 	ulint		dir_len;
3303 	char*		last_slash;
3304 	char*		base_name;
3305 	char*		new_path;
3306 	ulint		new_path_len;
3307 
3308 	/* Split the tablename into its database and table name components.
3309 	They are separated by a '/'. */
3310 	last_slash = strrchr((char*) tablename, '/');
3311 	base_name = last_slash ? last_slash + 1 : (char*) tablename;
3312 
3313 	/* Find the offset of the last slash. We will strip off the
3314 	old basename.ibd which starts after that slash. */
3315 	last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR);
3316 	dir_len = last_slash ? last_slash - old_path : strlen(old_path);
3317 
3318 	/* allocate a new path and move the old directory path to it. */
3319 	new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
3320 	new_path = static_cast<char*>(mem_alloc(new_path_len));
3321 	memcpy(new_path, old_path, dir_len);
3322 
3323 	ut_snprintf(new_path + dir_len,
3324 		    new_path_len - dir_len,
3325 		    "%c%s.ibd",
3326 		    OS_FILE_PATH_SEPARATOR,
3327 		    base_name);
3328 
3329 	return(new_path);
3330 }
3331 
3332 /****************************************************************//**
3333 This function returns a remote path name by combining a data directory
3334 path provided in a DATA DIRECTORY clause with the tablename which is
3335 in the form 'database/tablename'.  It strips the file basename (which
3336 is the tablename) found after the last directory in the path provided.
3337 The full filepath created will include the database name as a directory
3338 under the path provided.  The filename is the tablename with the '.ibd'
3339 extension. All input and output strings are null-terminated.
3340 
3341 This function allocates memory to be returned.  It is the callers
3342 responsibility to free the return value after it is no longer needed.
3343 
3344 @return	own: A full pathname; data_dir_path/databasename/tablename.ibd */
3345 UNIV_INTERN
3346 char*
os_file_make_remote_pathname(const char * data_dir_path,const char * tablename,const char * extention)3347 os_file_make_remote_pathname(
3348 /*=========================*/
3349 	const char*	data_dir_path,	/*!< in: pathname */
3350 	const char*	tablename,	/*!< in: tablename */
3351 	const char*	extention)	/*!< in: file extention; ibd,cfg */
3352 {
3353 	ulint		data_dir_len;
3354 	char*		last_slash;
3355 	char*		new_path;
3356 	ulint		new_path_len;
3357 
3358 	ut_ad(extention && strlen(extention) == 3);
3359 
3360 	/* Find the offset of the last slash. We will strip off the
3361 	old basename or tablename which starts after that slash. */
3362 	last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3363 	data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path);
3364 
3365 	/* allocate a new path and move the old directory path to it. */
3366 	new_path_len = data_dir_len + strlen(tablename)
3367 		       + sizeof "/." + strlen(extention);
3368 	new_path = static_cast<char*>(mem_alloc(new_path_len));
3369 	memcpy(new_path, data_dir_path, data_dir_len);
3370 	ut_snprintf(new_path + data_dir_len,
3371 		    new_path_len - data_dir_len,
3372 		    "%c%s.%s",
3373 		    OS_FILE_PATH_SEPARATOR,
3374 		    tablename,
3375 		    extention);
3376 
3377 	srv_normalize_path_for_win(new_path);
3378 
3379 	return(new_path);
3380 }
3381 
3382 /****************************************************************//**
3383 This function reduces a null-terminated full remote path name into
3384 the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
3385 the 'databasename/tablename.ibd' found at the end of the path with just
3386 'tablename'.
3387 
3388 Since the result is always smaller than the path sent in, no new memory
3389 is allocated. The caller should allocate memory for the path sent in.
3390 This function manipulates that path in place.
3391 
3392 If the path format is not as expected, just return.  The result is used
3393 to inform a SHOW CREATE TABLE command. */
3394 UNIV_INTERN
3395 void
os_file_make_data_dir_path(char * data_dir_path)3396 os_file_make_data_dir_path(
3397 /*========================*/
3398 	char*	data_dir_path)	/*!< in/out: full path/data_dir_path */
3399 {
3400 	char*	ptr;
3401 	char*	tablename;
3402 	ulint	tablename_len;
3403 
3404 	/* Replace the period before the extension with a null byte. */
3405 	ptr = strrchr((char*) data_dir_path, '.');
3406 	if (!ptr) {
3407 		return;
3408 	}
3409 	ptr[0] = '\0';
3410 
3411 	/* The tablename starts after the last slash. */
3412 	ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3413 	if (!ptr) {
3414 		return;
3415 	}
3416 	ptr[0] = '\0';
3417 	tablename = ptr + 1;
3418 
3419 	/* The databasename starts after the next to last slash. */
3420 	ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
3421 	if (!ptr) {
3422 		return;
3423 	}
3424 	tablename_len = ut_strlen(tablename);
3425 
3426 	ut_memmove(++ptr, tablename, tablename_len);
3427 
3428 	ptr[tablename_len] = '\0';
3429 }
3430 
3431 /****************************************************************//**
3432 The function os_file_dirname returns a directory component of a
3433 null-terminated pathname string. In the usual case, dirname returns
3434 the string up to, but not including, the final '/', and basename
3435 is the component following the final '/'. Trailing '/' characters
3436 are not counted as part of the pathname.
3437 
3438 If path does not contain a slash, dirname returns the string ".".
3439 
3440 Concatenating the string returned by dirname, a "/", and the basename
3441 yields a complete pathname.
3442 
3443 The return value is a copy of the directory component of the pathname.
3444 The copy is allocated from heap. It is the caller responsibility
3445 to free it after it is no longer needed.
3446 
3447 The following list of examples (taken from SUSv2) shows the strings
3448 returned by dirname and basename for different paths:
3449 
3450        path	      dirname	     basename
3451        "/usr/lib"     "/usr"	     "lib"
3452        "/usr/"	      "/"	     "usr"
3453        "usr"	      "."	     "usr"
3454        "/"	      "/"	     "/"
3455        "."	      "."	     "."
3456        ".."	      "."	     ".."
3457 
3458 @return	own: directory component of the pathname */
3459 UNIV_INTERN
3460 char*
os_file_dirname(const char * path)3461 os_file_dirname(
3462 /*============*/
3463 	const char*	path)	/*!< in: pathname */
3464 {
3465 	/* Find the offset of the last slash */
3466 	const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
3467 	if (!last_slash) {
3468 		/* No slash in the path, return "." */
3469 
3470 		return(mem_strdup("."));
3471 	}
3472 
3473 	/* Ok, there is a slash */
3474 
3475 	if (last_slash == path) {
3476 		/* last slash is the first char of the path */
3477 
3478 		return(mem_strdup("/"));
3479 	}
3480 
3481 	/* Non-trivial directory component */
3482 
3483 	return(mem_strdupl(path, last_slash - path));
3484 }
3485 
3486 /****************************************************************//**
3487 Creates all missing subdirectories along the given path.
3488 @return	TRUE if call succeeded FALSE otherwise */
3489 UNIV_INTERN
3490 ibool
os_file_create_subdirs_if_needed(const char * path)3491 os_file_create_subdirs_if_needed(
3492 /*=============================*/
3493 	const char*	path)	/*!< in: path name */
3494 {
3495 	if (srv_read_only_mode) {
3496 
3497 		ib_logf(IB_LOG_LEVEL_ERROR,
3498 			"read only mode set. Can't create subdirectories '%s'",
3499 			path);
3500 
3501 		return(FALSE);
3502 
3503 	}
3504 
3505 	char*	subdir = os_file_dirname(path);
3506 
3507 	if (strlen(subdir) == 1
3508 	    && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
3509 		/* subdir is root or cwd, nothing to do */
3510 		mem_free(subdir);
3511 
3512 		return(TRUE);
3513 	}
3514 
3515 	/* Test if subdir exists */
3516 	os_file_type_t	type;
3517 	ibool	subdir_exists;
3518 	ibool	success = os_file_status(subdir, &subdir_exists, &type);
3519 
3520 	if (success && !subdir_exists) {
3521 
3522 		/* subdir does not exist, create it */
3523 		success = os_file_create_subdirs_if_needed(subdir);
3524 
3525 		if (!success) {
3526 			mem_free(subdir);
3527 
3528 			return(FALSE);
3529 		}
3530 
3531 		success = os_file_create_directory(subdir, FALSE);
3532 	}
3533 
3534 	mem_free(subdir);
3535 
3536 	return(success);
3537 }
3538 
3539 #ifndef UNIV_HOTBACKUP
3540 /****************************************************************//**
3541 Returns a pointer to the nth slot in the aio array.
3542 @return	pointer to slot */
3543 static
3544 os_aio_slot_t*
os_aio_array_get_nth_slot(os_aio_array_t * array,ulint index)3545 os_aio_array_get_nth_slot(
3546 /*======================*/
3547 	os_aio_array_t*		array,	/*!< in: aio array */
3548 	ulint			index)	/*!< in: index of the slot */
3549 {
3550 	ut_a(index < array->n_slots);
3551 
3552 	return(&array->slots[index]);
3553 }
3554 
3555 #if defined(LINUX_NATIVE_AIO)
3556 /******************************************************************//**
3557 Creates an io_context for native linux AIO.
3558 @return	TRUE on success. */
3559 static
3560 ibool
os_aio_linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)3561 os_aio_linux_create_io_ctx(
3562 /*=======================*/
3563 	ulint		max_events,	/*!< in: number of events. */
3564 	io_context_t*	io_ctx)		/*!< out: io_ctx to initialize. */
3565 {
3566 	int	ret;
3567 	ulint	retries = 0;
3568 
3569 retry:
3570 	memset(io_ctx, 0x0, sizeof(*io_ctx));
3571 
3572 	/* Initialize the io_ctx. Tell it how many pending
3573 	IO requests this context will handle. */
3574 
3575 	ret = io_setup(max_events, io_ctx);
3576 	if (ret == 0) {
3577 #if defined(UNIV_AIO_DEBUG)
3578 		fprintf(stderr,
3579 			"InnoDB: Linux native AIO:"
3580 			" initialized io_ctx for segment\n");
3581 #endif
3582 		/* Success. Return now. */
3583 		return(TRUE);
3584 	}
3585 
3586 	/* If we hit EAGAIN we'll make a few attempts before failing. */
3587 
3588 	switch (ret) {
3589 	case -EAGAIN:
3590 		if (retries == 0) {
3591 			/* First time around. */
3592 			ut_print_timestamp(stderr);
3593 			fprintf(stderr,
3594 				" InnoDB: Warning: io_setup() failed"
3595 				" with EAGAIN. Will make %d attempts"
3596 				" before giving up.\n",
3597 				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3598 		}
3599 
3600 		if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3601 			++retries;
3602 			fprintf(stderr,
3603 				"InnoDB: Warning: io_setup() attempt"
3604 				" %lu failed.\n",
3605 				retries);
3606 			os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3607 			goto retry;
3608 		}
3609 
3610 		/* Have tried enough. Better call it a day. */
3611 		ut_print_timestamp(stderr);
3612 		fprintf(stderr,
3613 			" InnoDB: Error: io_setup() failed"
3614 			" with EAGAIN after %d attempts.\n",
3615 			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3616 		break;
3617 
3618 	case -ENOSYS:
3619 		ut_print_timestamp(stderr);
3620 		fprintf(stderr,
3621 			" InnoDB: Error: Linux Native AIO interface"
3622 			" is not supported on this platform. Please"
3623 			" check your OS documentation and install"
3624 			" appropriate binary of InnoDB.\n");
3625 
3626 		break;
3627 
3628 	default:
3629 		ut_print_timestamp(stderr);
3630 		fprintf(stderr,
3631 			" InnoDB: Error: Linux Native AIO setup"
3632 			" returned following error[%d]\n", -ret);
3633 		break;
3634 	}
3635 
3636 	fprintf(stderr,
3637 		"InnoDB: You can disable Linux Native AIO by"
3638 		" setting innodb_use_native_aio = 0 in my.cnf\n");
3639 	return(FALSE);
3640 }
3641 
3642 /******************************************************************//**
3643 Checks if the system supports native linux aio. On some kernel
3644 versions where native aio is supported it won't work on tmpfs. In such
3645 cases we can't use native aio as it is not possible to mix simulated
3646 and native aio.
3647 @return: TRUE if supported, FALSE otherwise. */
3648 static
3649 ibool
os_aio_native_aio_supported(void)3650 os_aio_native_aio_supported(void)
3651 /*=============================*/
3652 {
3653 	int			fd;
3654 	io_context_t		io_ctx;
3655 	char			name[1000];
3656 
3657 	if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
3658 		/* The platform does not support native aio. */
3659 		return(FALSE);
3660 	} else if (!srv_read_only_mode) {
3661 		/* Now check if tmpdir supports native aio ops. */
3662 		fd = innobase_mysql_tmpfile(NULL);
3663 
3664 		if (fd < 0) {
3665 			ib_logf(IB_LOG_LEVEL_WARN,
3666 				"Unable to create temp file to check "
3667 				"native AIO support.");
3668 
3669 			return(FALSE);
3670 		}
3671 	} else {
3672 
3673 		srv_normalize_path_for_win(srv_log_group_home_dir);
3674 
3675 		ulint	dirnamelen = strlen(srv_log_group_home_dir);
3676 		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
3677 		memcpy(name, srv_log_group_home_dir, dirnamelen);
3678 
3679 		/* Add a path separator if needed. */
3680 		if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
3681 			name[dirnamelen++] = SRV_PATH_SEPARATOR;
3682 		}
3683 
3684 		strcpy(name + dirnamelen, "ib_logfile0");
3685 
3686 		fd = ::open(name, O_RDONLY);
3687 
3688 		if (fd == -1) {
3689 
3690 			ib_logf(IB_LOG_LEVEL_WARN,
3691 				"Unable to open \"%s\" to check "
3692 				"native AIO read support.", name);
3693 
3694 			return(FALSE);
3695 		}
3696 	}
3697 
3698 	struct io_event	io_event;
3699 
3700 	memset(&io_event, 0x0, sizeof(io_event));
3701 
3702 	byte*	buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
3703 	byte*	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
3704 
3705 	struct iocb	iocb;
3706 
3707 	/* Suppress valgrind warning. */
3708 	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
3709 	memset(&iocb, 0x0, sizeof(iocb));
3710 
3711 	struct iocb*	p_iocb = &iocb;
3712 
3713 	if (!srv_read_only_mode) {
3714 		io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
3715 	} else {
3716 		ut_a(UNIV_PAGE_SIZE >= 512);
3717 		io_prep_pread(p_iocb, fd, ptr, 512, 0);
3718 	}
3719 
3720 	int	err = io_submit(io_ctx, 1, &p_iocb);
3721 
3722 	if (err >= 1) {
3723 		/* Now collect the submitted IO request. */
3724 		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
3725 	}
3726 
3727 	ut_free(buf);
3728 	close(fd);
3729 
3730 	switch (err) {
3731 	case 1:
3732 		return(TRUE);
3733 
3734 	case -EINVAL:
3735 	case -ENOSYS:
3736 		ib_logf(IB_LOG_LEVEL_ERROR,
3737 			"Linux Native AIO not supported. You can either "
3738 			"move %s to a file system that supports native "
3739 			"AIO or you can set innodb_use_native_aio to "
3740 			"FALSE to avoid this message.",
3741 			srv_read_only_mode ? name : "tmpdir");
3742 
3743 		/* fall through. */
3744 	default:
3745 		ib_logf(IB_LOG_LEVEL_ERROR,
3746 			"Linux Native AIO check on %s returned error[%d]",
3747 			srv_read_only_mode ? name : "tmpdir", -err);
3748 	}
3749 
3750 	return(FALSE);
3751 }
3752 #endif /* LINUX_NATIVE_AIO */
3753 
3754 /******************************************************************//**
3755 Creates an aio wait array. Note that we return NULL in case of failure.
3756 We don't care about freeing memory here because we assume that a
3757 failure will result in server refusing to start up.
3758 @return	own: aio array, NULL on failure */
3759 static
3760 os_aio_array_t*
os_aio_array_create(ulint n,ulint n_segments)3761 os_aio_array_create(
3762 /*================*/
3763 	ulint	n,		/*!< in: maximum number of pending aio
3764 				operations allowed; n must be
3765 				divisible by n_segments */
3766 	ulint	n_segments)	/*!< in: number of segments in the aio array */
3767 {
3768 	os_aio_array_t*	array;
3769 #ifdef WIN_ASYNC_IO
3770 	OVERLAPPED*	over;
3771 #elif defined(LINUX_NATIVE_AIO)
3772 	struct io_event*	io_event = NULL;
3773 #endif /* WIN_ASYNC_IO */
3774 	ut_a(n > 0);
3775 	ut_a(n_segments > 0);
3776 
3777 	array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
3778 	memset(array, 0x0, sizeof(*array));
3779 
3780 	array->mutex = os_mutex_create();
3781 	array->not_full = os_event_create();
3782 	array->is_empty = os_event_create();
3783 
3784 	os_event_set(array->is_empty);
3785 
3786 	array->n_slots = n;
3787 	array->n_segments = n_segments;
3788 
3789 	array->slots = static_cast<os_aio_slot_t*>(
3790 		ut_malloc(n * sizeof(*array->slots)));
3791 
3792 	memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots)));
3793 #ifdef __WIN__
3794 	array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
3795 #endif /* __WIN__ */
3796 
3797 #if defined(LINUX_NATIVE_AIO)
3798 	array->aio_ctx = NULL;
3799 	array->aio_events = NULL;
3800 
3801 	/* If we are not using native aio interface then skip this
3802 	part of initialization. */
3803 	if (!srv_use_native_aio) {
3804 		goto skip_native_aio;
3805 	}
3806 
3807 	/* Initialize the io_context array. One io_context
3808 	per segment in the array. */
3809 
3810 	array->aio_ctx = static_cast<io_context**>(
3811 		ut_malloc(n_segments * sizeof(*array->aio_ctx)));
3812 
3813 	for (ulint i = 0; i < n_segments; ++i) {
3814 		if (!os_aio_linux_create_io_ctx(n/n_segments,
3815 						&array->aio_ctx[i])) {
3816 			/* If something bad happened during aio setup
3817 			we should call it a day and return right away.
3818 			We don't care about any leaks because a failure
3819 			to initialize the io subsystem means that the
3820 			server (or atleast the innodb storage engine)
3821 			is not going to startup. */
3822 			return(NULL);
3823 		}
3824 	}
3825 
3826 	/* Initialize the event array. One event per slot. */
3827 	io_event = static_cast<struct io_event*>(
3828 		ut_malloc(n * sizeof(*io_event)));
3829 
3830 	memset(io_event, 0x0, sizeof(*io_event) * n);
3831 	array->aio_events = io_event;
3832 
3833 skip_native_aio:
3834 #endif /* LINUX_NATIVE_AIO */
3835 	for (ulint i = 0; i < n; i++) {
3836 		os_aio_slot_t*	slot;
3837 
3838 		slot = os_aio_array_get_nth_slot(array, i);
3839 
3840 		slot->pos = i;
3841 		slot->reserved = FALSE;
3842 #ifdef WIN_ASYNC_IO
3843 		slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
3844 
3845 		over = &slot->control;
3846 
3847 		over->hEvent = slot->handle;
3848 
3849 		array->handles[i] = over->hEvent;
3850 
3851 #elif defined(LINUX_NATIVE_AIO)
3852 		memset(&slot->control, 0x0, sizeof(slot->control));
3853 		slot->n_bytes = 0;
3854 		slot->ret = 0;
3855 #endif /* WIN_ASYNC_IO */
3856 	}
3857 
3858 	return(array);
3859 }
3860 
3861 /************************************************************************//**
3862 Frees an aio wait array. */
3863 static
3864 void
os_aio_array_free(os_aio_array_t * & array)3865 os_aio_array_free(
3866 /*==============*/
3867 	os_aio_array_t*& array)	/*!< in, own: array to free */
3868 {
3869 #ifdef WIN_ASYNC_IO
3870 	ulint	i;
3871 
3872 	for (i = 0; i < array->n_slots; i++) {
3873 		os_aio_slot_t*	slot = os_aio_array_get_nth_slot(array, i);
3874 		CloseHandle(slot->handle);
3875 	}
3876 #endif /* WIN_ASYNC_IO */
3877 
3878 #ifdef __WIN__
3879 	ut_free(array->handles);
3880 #endif /* __WIN__ */
3881 	os_mutex_free(array->mutex);
3882 	os_event_free(array->not_full);
3883 	os_event_free(array->is_empty);
3884 
3885 #if defined(LINUX_NATIVE_AIO)
3886 	if (srv_use_native_aio) {
3887 		ut_free(array->aio_events);
3888 		ut_free(array->aio_ctx);
3889 	}
3890 #endif /* LINUX_NATIVE_AIO */
3891 
3892 	ut_free(array->slots);
3893 	ut_free(array);
3894 
3895 	array = 0;
3896 }
3897 
3898 /***********************************************************************
3899 Initializes the asynchronous io system. Creates one array each for ibuf
3900 and log i/o. Also creates one array each for read and write where each
3901 array is divided logically into n_read_segs and n_write_segs
3902 respectively. The caller must create an i/o handler thread for each
3903 segment in these arrays. This function also creates the sync array.
3904 No i/o handler thread needs to be created for that */
3905 UNIV_INTERN
3906 ibool
os_aio_init(ulint n_per_seg,ulint n_read_segs,ulint n_write_segs,ulint n_slots_sync)3907 os_aio_init(
3908 /*========*/
3909 	ulint	n_per_seg,	/*<! in: maximum number of pending aio
3910 				operations allowed per segment */
3911 	ulint	n_read_segs,	/*<! in: number of reader threads */
3912 	ulint	n_write_segs,	/*<! in: number of writer threads */
3913 	ulint	n_slots_sync)	/*<! in: number of slots in the sync aio
3914 				array */
3915 {
3916 	os_io_init_simple();
3917 
3918 #if defined(LINUX_NATIVE_AIO)
3919 	/* Check if native aio is supported on this system and tmpfs */
3920 	if (srv_use_native_aio && !os_aio_native_aio_supported()) {
3921 
3922 		ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
3923 
3924 		srv_use_native_aio = FALSE;
3925 	}
3926 #endif /* LINUX_NATIVE_AIO */
3927 
3928 	srv_reset_io_thread_op_info();
3929 
3930 	os_aio_read_array = os_aio_array_create(
3931 		n_read_segs * n_per_seg, n_read_segs);
3932 
3933 	if (os_aio_read_array == NULL) {
3934 		return(FALSE);
3935 	}
3936 
3937 	ulint	start = (srv_read_only_mode) ? 0 : 2;
3938 	ulint	n_segs = n_read_segs + start;
3939 
3940 	/* 0 is the ibuf segment and 1 is the insert buffer segment. */
3941 	for (ulint i = start; i < n_segs; ++i) {
3942 		ut_a(i < SRV_MAX_N_IO_THREADS);
3943 		srv_io_thread_function[i] = "read thread";
3944 	}
3945 
3946 	ulint	n_segments = n_read_segs;
3947 
3948 	if (!srv_read_only_mode) {
3949 
3950 		os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3951 
3952 		if (os_aio_log_array == NULL) {
3953 			return(FALSE);
3954 		}
3955 
3956 		++n_segments;
3957 
3958 		srv_io_thread_function[1] = "log thread";
3959 
3960 		os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3961 
3962 		if (os_aio_ibuf_array == NULL) {
3963 			return(FALSE);
3964 		}
3965 
3966 		++n_segments;
3967 
3968 		srv_io_thread_function[0] = "insert buffer thread";
3969 
3970 		os_aio_write_array = os_aio_array_create(
3971 			n_write_segs * n_per_seg, n_write_segs);
3972 
3973 		if (os_aio_write_array == NULL) {
3974 			return(FALSE);
3975 		}
3976 
3977 		n_segments += n_write_segs;
3978 
3979 		for (ulint i = start + n_read_segs; i < n_segments; ++i) {
3980 			ut_a(i < SRV_MAX_N_IO_THREADS);
3981 			srv_io_thread_function[i] = "write thread";
3982 		}
3983 
3984 		ut_ad(n_segments >= 4);
3985 	} else {
3986 		ut_ad(n_segments > 0);
3987 	}
3988 
3989 	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3990 
3991 	if (os_aio_sync_array == NULL) {
3992 		return(FALSE);
3993 	}
3994 
3995 	os_aio_n_segments = n_segments;
3996 
3997 	os_aio_validate();
3998 
3999 	os_aio_segment_wait_events = static_cast<os_event_t*>(
4000 		ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
4001 
4002 	for (ulint i = 0; i < n_segments; ++i) {
4003 		os_aio_segment_wait_events[i] = os_event_create();
4004 	}
4005 
4006 	os_last_printout = ut_time();
4007 
4008 	return(TRUE);
4009 
4010 }
4011 
4012 /***********************************************************************
4013 Frees the asynchronous io system. */
4014 UNIV_INTERN
4015 void
os_aio_free(void)4016 os_aio_free(void)
4017 /*=============*/
4018 {
4019 	if (os_aio_ibuf_array != 0) {
4020 		os_aio_array_free(os_aio_ibuf_array);
4021 	}
4022 
4023 	if (os_aio_log_array != 0) {
4024 		os_aio_array_free(os_aio_log_array);
4025 	}
4026 
4027 	if (os_aio_write_array != 0) {
4028 		os_aio_array_free(os_aio_write_array);
4029 	}
4030 
4031 	if (os_aio_sync_array != 0) {
4032 		os_aio_array_free(os_aio_sync_array);
4033 	}
4034 
4035 	os_aio_array_free(os_aio_read_array);
4036 
4037 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4038 		os_event_free(os_aio_segment_wait_events[i]);
4039 	}
4040 
4041 	ut_free(os_aio_segment_wait_events);
4042 	os_aio_segment_wait_events = 0;
4043 	os_aio_n_segments = 0;
4044 }
4045 
4046 #ifdef WIN_ASYNC_IO
4047 /************************************************************************//**
4048 Wakes up all async i/o threads in the array in Windows async i/o at
4049 shutdown. */
4050 static
4051 void
os_aio_array_wake_win_aio_at_shutdown(os_aio_array_t * array)4052 os_aio_array_wake_win_aio_at_shutdown(
4053 /*==================================*/
4054 	os_aio_array_t*	array)	/*!< in: aio array */
4055 {
4056 	ulint	i;
4057 
4058 	for (i = 0; i < array->n_slots; i++) {
4059 
4060 		SetEvent((array->slots + i)->handle);
4061 	}
4062 }
4063 #endif
4064 
4065 /************************************************************************//**
4066 Wakes up all async i/o threads so that they know to exit themselves in
4067 shutdown. */
4068 UNIV_INTERN
4069 void
os_aio_wake_all_threads_at_shutdown(void)4070 os_aio_wake_all_threads_at_shutdown(void)
4071 /*=====================================*/
4072 {
4073 #ifdef WIN_ASYNC_IO
4074 	/* This code wakes up all ai/o threads in Windows native aio */
4075 	os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
4076 	if (os_aio_write_array != 0) {
4077 		os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
4078 	}
4079 
4080 	if (os_aio_ibuf_array != 0) {
4081 		os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
4082 	}
4083 
4084 	if (os_aio_log_array != 0) {
4085 		os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
4086 	}
4087 
4088 #elif defined(LINUX_NATIVE_AIO)
4089 
4090 	/* When using native AIO interface the io helper threads
4091 	wait on io_getevents with a timeout value of 500ms. At
4092 	each wake up these threads check the server status.
4093 	No need to do anything to wake them up. */
4094 
4095 	if (srv_use_native_aio) {
4096 		return;
4097 	}
4098 
4099 	/* Fall through to simulated AIO handler wakeup if we are
4100 	not using native AIO. */
4101 #endif /* !WIN_ASYNC_AIO */
4102 
4103 	/* This loop wakes up all simulated ai/o threads */
4104 
4105 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4106 
4107 		os_event_set(os_aio_segment_wait_events[i]);
4108 	}
4109 }
4110 
4111 /************************************************************************//**
4112 Waits until there are no pending writes in os_aio_write_array. There can
4113 be other, synchronous, pending writes. */
4114 UNIV_INTERN
4115 void
os_aio_wait_until_no_pending_writes(void)4116 os_aio_wait_until_no_pending_writes(void)
4117 /*=====================================*/
4118 {
4119 	ut_ad(!srv_read_only_mode);
4120 	os_event_wait(os_aio_write_array->is_empty);
4121 }
4122 
4123 /**********************************************************************//**
4124 Calculates segment number for a slot.
4125 @return segment number (which is the number used by, for example,
4126 i/o-handler threads) */
4127 static
4128 ulint
os_aio_get_segment_no_from_slot(os_aio_array_t * array,os_aio_slot_t * slot)4129 os_aio_get_segment_no_from_slot(
4130 /*============================*/
4131 	os_aio_array_t*	array,	/*!< in: aio wait array */
4132 	os_aio_slot_t*	slot)	/*!< in: slot in this array */
4133 {
4134 	ulint	segment;
4135 	ulint	seg_len;
4136 
4137 	if (array == os_aio_ibuf_array) {
4138 		ut_ad(!srv_read_only_mode);
4139 
4140 		segment = IO_IBUF_SEGMENT;
4141 
4142 	} else if (array == os_aio_log_array) {
4143 		ut_ad(!srv_read_only_mode);
4144 
4145 		segment = IO_LOG_SEGMENT;
4146 
4147 	} else if (array == os_aio_read_array) {
4148 		seg_len = os_aio_read_array->n_slots
4149 			/ os_aio_read_array->n_segments;
4150 
4151 		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
4152 	} else {
4153 		ut_ad(!srv_read_only_mode);
4154 		ut_a(array == os_aio_write_array);
4155 
4156 		seg_len = os_aio_write_array->n_slots
4157 			/ os_aio_write_array->n_segments;
4158 
4159 		segment = os_aio_read_array->n_segments + 2
4160 			+ slot->pos / seg_len;
4161 	}
4162 
4163 	return(segment);
4164 }
4165 
4166 /**********************************************************************//**
4167 Calculates local segment number and aio array from global segment number.
4168 @return	local segment number within the aio array */
4169 static
4170 ulint
os_aio_get_array_and_local_segment(os_aio_array_t ** array,ulint global_segment)4171 os_aio_get_array_and_local_segment(
4172 /*===============================*/
4173 	os_aio_array_t** array,		/*!< out: aio wait array */
4174 	ulint		 global_segment)/*!< in: global segment number */
4175 {
4176 	ulint		segment;
4177 
4178 	ut_a(global_segment < os_aio_n_segments);
4179 
4180 	if (srv_read_only_mode) {
4181 		*array = os_aio_read_array;
4182 
4183 		return(global_segment);
4184 	} else if (global_segment == IO_IBUF_SEGMENT) {
4185 		*array = os_aio_ibuf_array;
4186 		segment = 0;
4187 
4188 	} else if (global_segment == IO_LOG_SEGMENT) {
4189 		*array = os_aio_log_array;
4190 		segment = 0;
4191 
4192 	} else if (global_segment < os_aio_read_array->n_segments + 2) {
4193 		*array = os_aio_read_array;
4194 
4195 		segment = global_segment - 2;
4196 	} else {
4197 		*array = os_aio_write_array;
4198 
4199 		segment = global_segment - (os_aio_read_array->n_segments + 2);
4200 	}
4201 
4202 	return(segment);
4203 }
4204 
4205 /*******************************************************************//**
4206 Requests for a slot in the aio array. If no slot is available, waits until
4207 not_full-event becomes signaled.
4208 @return	pointer to slot */
4209 static
4210 os_aio_slot_t*
os_aio_array_reserve_slot(ulint type,os_aio_array_t * array,fil_node_t * message1,void * message2,pfs_os_file_t file,const char * name,void * buf,os_offset_t offset,ulint len)4211 os_aio_array_reserve_slot(
4212 /*======================*/
4213 	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
4214 	os_aio_array_t*	array,	/*!< in: aio array */
4215 	fil_node_t*	message1,/*!< in: message to be passed along with
4216 				the aio operation */
4217 	void*		message2,/*!< in: message to be passed along with
4218 				the aio operation */
4219 	pfs_os_file_t	file,	/*!< in: file handle */
4220 	const char*	name,	/*!< in: name of the file or path as a
4221 				null-terminated string */
4222 	void*		buf,	/*!< in: buffer where to read or from which
4223 				to write */
4224 	os_offset_t	offset,	/*!< in: file offset */
4225 	ulint		len)	/*!< in: length of the block to read or write */
4226 {
4227 	os_aio_slot_t*	slot = NULL;
4228 #ifdef WIN_ASYNC_IO
4229 	OVERLAPPED*	control;
4230 
4231 #elif defined(LINUX_NATIVE_AIO)
4232 
4233 	struct iocb*	iocb;
4234 	off_t		aio_offset;
4235 
4236 #endif /* WIN_ASYNC_IO */
4237 	ulint		i;
4238 	ulint		counter;
4239 	ulint		slots_per_seg;
4240 	ulint		local_seg;
4241 
4242 #ifdef WIN_ASYNC_IO
4243 	ut_a((len & 0xFFFFFFFFUL) == len);
4244 #endif /* WIN_ASYNC_IO */
4245 
4246 	/* No need of a mutex. Only reading constant fields */
4247 	slots_per_seg = array->n_slots / array->n_segments;
4248 
4249 	/* We attempt to keep adjacent blocks in the same local
4250 	segment. This can help in merging IO requests when we are
4251 	doing simulated AIO */
4252 	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
4253 		% array->n_segments;
4254 
4255 loop:
4256 	os_mutex_enter(array->mutex);
4257 
4258 	if (array->n_reserved == array->n_slots) {
4259 		os_mutex_exit(array->mutex);
4260 
4261 		if (!srv_use_native_aio) {
4262 			/* If the handler threads are suspended, wake them
4263 			so that we get more slots */
4264 
4265 			os_aio_simulated_wake_handler_threads();
4266 		}
4267 
4268 		os_event_wait(array->not_full);
4269 
4270 		goto loop;
4271 	}
4272 
4273 	/* We start our search for an available slot from our preferred
4274 	local segment and do a full scan of the array. We are
4275 	guaranteed to find a slot in full scan. */
4276 	for (i = local_seg * slots_per_seg, counter = 0;
4277 	     counter < array->n_slots;
4278 	     i++, counter++) {
4279 
4280 		i %= array->n_slots;
4281 
4282 		slot = os_aio_array_get_nth_slot(array, i);
4283 
4284 		if (slot->reserved == FALSE) {
4285 			goto found;
4286 		}
4287 	}
4288 
4289 	/* We MUST always be able to get hold of a reserved slot. */
4290 	ut_error;
4291 
4292 found:
4293 	ut_a(slot->reserved == FALSE);
4294 	array->n_reserved++;
4295 
4296 	if (array->n_reserved == 1) {
4297 		os_event_reset(array->is_empty);
4298 	}
4299 
4300 	if (array->n_reserved == array->n_slots) {
4301 		os_event_reset(array->not_full);
4302 	}
4303 
4304 	slot->reserved = TRUE;
4305 	slot->reservation_time = ut_time();
4306 	slot->message1 = message1;
4307 	slot->message2 = message2;
4308 	slot->file     = file;
4309 	slot->name     = name;
4310 	slot->len      = len;
4311 	slot->type     = type;
4312 	slot->buf      = static_cast<byte*>(buf);
4313 	slot->offset   = offset;
4314 	slot->io_already_done = FALSE;
4315 
4316 #ifdef WIN_ASYNC_IO
4317 	control = &slot->control;
4318 	control->Offset = (DWORD) offset & 0xFFFFFFFF;
4319 	control->OffsetHigh = (DWORD) (offset >> 32);
4320 	ResetEvent(slot->handle);
4321 
4322 #elif defined(LINUX_NATIVE_AIO)
4323 
4324 	/* If we are not using native AIO skip this part. */
4325 	if (!srv_use_native_aio) {
4326 		goto skip_native_aio;
4327 	}
4328 
4329 	/* Check if we are dealing with 64 bit arch.
4330 	If not then make sure that offset fits in 32 bits. */
4331 	aio_offset = (off_t) offset;
4332 
4333 	ut_a(sizeof(aio_offset) >= sizeof(offset)
4334 	     || ((os_offset_t) aio_offset) == offset);
4335 
4336 	iocb = &slot->control;
4337 
4338 	if (type == OS_FILE_READ) {
4339 		io_prep_pread(iocb, file.m_file, buf, len, aio_offset);
4340 	} else {
4341 		ut_a(type == OS_FILE_WRITE);
4342 		io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset);
4343 	}
4344 
4345 	iocb->data = (void*) slot;
4346 	slot->n_bytes = 0;
4347 	slot->ret = 0;
4348 
4349 skip_native_aio:
4350 #endif /* LINUX_NATIVE_AIO */
4351 	os_mutex_exit(array->mutex);
4352 
4353 	return(slot);
4354 }
4355 
4356 /*******************************************************************//**
4357 Frees a slot in the aio array. */
4358 static
4359 void
os_aio_array_free_slot(os_aio_array_t * array,os_aio_slot_t * slot)4360 os_aio_array_free_slot(
4361 /*===================*/
4362 	os_aio_array_t*	array,	/*!< in: aio array */
4363 	os_aio_slot_t*	slot)	/*!< in: pointer to slot */
4364 {
4365 	os_mutex_enter(array->mutex);
4366 
4367 	ut_ad(slot->reserved);
4368 
4369 	slot->reserved = FALSE;
4370 
4371 	array->n_reserved--;
4372 
4373 	if (array->n_reserved == array->n_slots - 1) {
4374 		os_event_set(array->not_full);
4375 	}
4376 
4377 	if (array->n_reserved == 0) {
4378 		os_event_set(array->is_empty);
4379 	}
4380 
4381 #ifdef WIN_ASYNC_IO
4382 
4383 	ResetEvent(slot->handle);
4384 
4385 #elif defined(LINUX_NATIVE_AIO)
4386 
4387 	if (srv_use_native_aio) {
4388 		memset(&slot->control, 0x0, sizeof(slot->control));
4389 		slot->n_bytes = 0;
4390 		slot->ret = 0;
4391 		/*fprintf(stderr, "Freed up Linux native slot.\n");*/
4392 	} else {
4393 		/* These fields should not be used if we are not
4394 		using native AIO. */
4395 		ut_ad(slot->n_bytes == 0);
4396 		ut_ad(slot->ret == 0);
4397 	}
4398 
4399 #endif
4400 	os_mutex_exit(array->mutex);
4401 }
4402 
4403 /**********************************************************************//**
4404 Wakes up a simulated aio i/o-handler thread if it has something to do. */
4405 static
4406 void
os_aio_simulated_wake_handler_thread(ulint global_segment)4407 os_aio_simulated_wake_handler_thread(
4408 /*=================================*/
4409 	ulint	global_segment)	/*!< in: the number of the segment in the aio
4410 				arrays */
4411 {
4412 	os_aio_array_t*	array;
4413 	ulint		segment;
4414 
4415 	ut_ad(!srv_use_native_aio);
4416 
4417 	segment = os_aio_get_array_and_local_segment(&array, global_segment);
4418 
4419 	ulint	n = array->n_slots / array->n_segments;
4420 
4421 	segment *= n;
4422 
4423 	/* Look through n slots after the segment * n'th slot */
4424 
4425 	os_mutex_enter(array->mutex);
4426 
4427 	for (ulint i = 0; i < n; ++i) {
4428 		const os_aio_slot_t*	slot;
4429 
4430 		slot = os_aio_array_get_nth_slot(array, segment + i);
4431 
4432 		if (slot->reserved) {
4433 
4434 			/* Found an i/o request */
4435 
4436 			os_mutex_exit(array->mutex);
4437 
4438 			os_event_t	event;
4439 
4440 			event = os_aio_segment_wait_events[global_segment];
4441 
4442 			os_event_set(event);
4443 
4444 			return;
4445 		}
4446 	}
4447 
4448 	os_mutex_exit(array->mutex);
4449 }
4450 
4451 /**********************************************************************//**
4452 Wakes up simulated aio i/o-handler threads if they have something to do. */
4453 UNIV_INTERN
4454 void
os_aio_simulated_wake_handler_threads(void)4455 os_aio_simulated_wake_handler_threads(void)
4456 /*=======================================*/
4457 {
4458 	if (srv_use_native_aio) {
4459 		/* We do not use simulated aio: do nothing */
4460 
4461 		return;
4462 	}
4463 
4464 	os_aio_recommend_sleep_for_read_threads	= FALSE;
4465 
4466 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4467 		os_aio_simulated_wake_handler_thread(i);
4468 	}
4469 }
4470 
4471 /**********************************************************************//**
4472 This function can be called if one wants to post a batch of reads and
4473 prefers an i/o-handler thread to handle them all at once later. You must
4474 call os_aio_simulated_wake_handler_threads later to ensure the threads
4475 are not left sleeping! */
4476 UNIV_INTERN
4477 void
os_aio_simulated_put_read_threads_to_sleep(void)4478 os_aio_simulated_put_read_threads_to_sleep(void)
4479 /*============================================*/
4480 {
4481 
4482 /* The idea of putting background IO threads to sleep is only for
4483 Windows when using simulated AIO. Windows XP seems to schedule
4484 background threads too eagerly to allow for coalescing during
4485 readahead requests. */
4486 #ifdef __WIN__
4487 	os_aio_array_t*	array;
4488 
4489 	if (srv_use_native_aio) {
4490 		/* We do not use simulated aio: do nothing */
4491 
4492 		return;
4493 	}
4494 
4495 	os_aio_recommend_sleep_for_read_threads	= TRUE;
4496 
4497 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4498 		os_aio_get_array_and_local_segment(&array, i);
4499 
4500 		if (array == os_aio_read_array) {
4501 
4502 			os_event_reset(os_aio_segment_wait_events[i]);
4503 		}
4504 	}
4505 #endif /* __WIN__ */
4506 }
4507 
#if defined(LINUX_NATIVE_AIO)
/*******************************************************************//**
Submits the control block of an already reserved slot to the kernel with
io_submit(). On failure errno is set to the negated return value of
io_submit().
@return	TRUE on success. */
static
ibool
os_aio_linux_dispatch(
/*==================*/
	os_aio_array_t*	array,	/*!< in: io request array. */
	os_aio_slot_t*	slot)	/*!< in: an already reserved slot. */
{
	ut_ad(slot != NULL);
	ut_ad(array);

	ut_a(slot->reserved);

	/* The iocb is embedded in the slot; there is one io_context
	per segment, chosen by the position of the slot. */

	struct iocb*	iocb = &slot->control;
	const ulint	io_ctx_index
		= (slot->pos * array->n_segments) / array->n_slots;

	const int	ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);

#if defined(UNIV_AIO_DEBUG)
	fprintf(stderr,
		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
		array->aio_ctx[io_ctx_index], (ulong) io_ctx_index);
#endif

	/* io_submit returns the number of requests that were queued
	successfully, or -errno. Exactly one request was submitted. */
	const ibool	submitted = (ret == 1);

	if (UNIV_UNLIKELY(!submitted)) {
		errno = -ret;
	}

	return(submitted);
}
#endif /* LINUX_NATIVE_AIO */
4554 
4555 
4556 /*******************************************************************//**
4557 NOTE! Use the corresponding macro os_aio(), not directly this function!
4558 Requests an asynchronous i/o operation.
4559 @return	TRUE if request was queued successfully, FALSE if fail */
4560 UNIV_INTERN
4561 ibool
os_aio_func(ulint type,ulint mode,const char * name,pfs_os_file_t file,void * buf,os_offset_t offset,ulint n,fil_node_t * message1,void * message2)4562 os_aio_func(
4563 /*========*/
4564 	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
4565 	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
4566 				to OS_AIO_SIMULATED_WAKE_LATER: the
4567 				last flag advises this function not to wake
4568 				i/o-handler threads, but the caller will
4569 				do the waking explicitly later, in this
4570 				way the caller can post several requests in
4571 				a batch; NOTE that the batch must not be
4572 				so big that it exhausts the slots in aio
4573 				arrays! NOTE that a simulated batch
4574 				may introduce hidden chances of deadlocks,
4575 				because i/os are not actually handled until
4576 				all have been posted: use with great
4577 				caution! */
4578 	const char*	name,	/*!< in: name of the file or path as a
4579 				null-terminated string */
4580 	pfs_os_file_t	file,	/*!< in: handle to a file */
4581 	void*		buf,	/*!< in: buffer where to read or from which
4582 				to write */
4583 	os_offset_t	offset,	/*!< in: file offset where to read or write */
4584 	ulint		n,	/*!< in: number of bytes to read or write */
4585 	fil_node_t*	message1,/*!< in: message for the aio handler
4586 				(can be used to identify a completed
4587 				aio operation); ignored if mode is
4588 				OS_AIO_SYNC */
4589 	void*		message2)/*!< in: message for the aio handler
4590 				(can be used to identify a completed
4591 				aio operation); ignored if mode is
4592 				OS_AIO_SYNC */
4593 {
4594 	os_aio_array_t*	array;
4595 	os_aio_slot_t*	slot;
4596 #ifdef WIN_ASYNC_IO
4597 	ibool		retval;
4598 	BOOL		ret		= TRUE;
4599 	DWORD		len		= (DWORD) n;
4600 	struct fil_node_t* dummy_mess1;
4601 	void*		dummy_mess2;
4602 	ulint		dummy_type;
4603 #endif /* WIN_ASYNC_IO */
4604 	ulint		wake_later;
4605 	ut_ad(file.m_file);
4606 	ut_ad(buf);
4607 	ut_ad(n > 0);
4608 	ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
4609 	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
4610 	ut_ad(os_aio_validate_skip());
4611 #ifdef WIN_ASYNC_IO
4612 	ut_ad((n & 0xFFFFFFFFUL) == n);
4613 #endif
4614 
4615 	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
4616 	mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
4617 
4618 	if (mode == OS_AIO_SYNC
4619 #ifdef WIN_ASYNC_IO
4620 	    && !srv_use_native_aio
4621 #endif /* WIN_ASYNC_IO */
4622 	    ) {
4623 		/* This is actually an ordinary synchronous read or write:
4624 		no need to use an i/o-handler thread. NOTE that if we use
4625 		Windows async i/o, Windows does not allow us to use
4626 		ordinary synchronous os_file_read etc. on the same file,
4627 		therefore we have built a special mechanism for synchronous
4628 		wait in the Windows case.
4629 		Also note that the Performance Schema instrumentation has
4630 		been performed by current os_aio_func()'s wrapper function
4631 		pfs_os_aio_func(). So we would no longer need to call
4632 		Performance Schema instrumented os_file_read() and
4633 		os_file_write(). Instead, we should use os_file_read_func()
4634 		and os_file_write_func() */
4635 
4636 		if (type == OS_FILE_READ) {
4637 			return(os_file_read_func(file.m_file, buf, offset, n));
4638 		}
4639 		ut_ad(!srv_read_only_mode);
4640 		ut_a(type == OS_FILE_WRITE);
4641 		return(os_file_write_func(name, file.m_file, buf, offset, n));
4642 	}
4643 
4644 try_again:
4645 	switch (mode) {
4646 	case OS_AIO_NORMAL:
4647 		if (type == OS_FILE_READ) {
4648 			array = os_aio_read_array;
4649 		} else {
4650 			ut_ad(!srv_read_only_mode);
4651 			array = os_aio_write_array;
4652 		}
4653 		break;
4654 	case OS_AIO_IBUF:
4655 		ut_ad(type == OS_FILE_READ);
4656 		/* Reduce probability of deadlock bugs in connection with ibuf:
4657 		do not let the ibuf i/o handler sleep */
4658 
4659 		wake_later = FALSE;
4660 
4661 		if (srv_read_only_mode) {
4662 			array = os_aio_read_array;
4663 		} else {
4664 			array = os_aio_ibuf_array;
4665 		}
4666 		break;
4667 	case OS_AIO_LOG:
4668 		if (srv_read_only_mode) {
4669 			array = os_aio_read_array;
4670 		} else {
4671 			array = os_aio_log_array;
4672 		}
4673 		break;
4674 	case OS_AIO_SYNC:
4675 		array = os_aio_sync_array;
4676 #if defined(LINUX_NATIVE_AIO)
4677 		/* In Linux native AIO we don't use sync IO array. */
4678 		ut_a(!srv_use_native_aio);
4679 #endif /* LINUX_NATIVE_AIO */
4680 		break;
4681 	default:
4682 		ut_error;
4683 		array = NULL; /* Eliminate compiler warning */
4684 	}
4685 
4686 	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
4687 					 name, buf, offset, n);
4688 	if (type == OS_FILE_READ) {
4689 		if (srv_use_native_aio) {
4690 			os_n_file_reads++;
4691 			os_bytes_read_since_printout += n;
4692 #ifdef WIN_ASYNC_IO
4693 			ret = ReadFile(file.m_file, buf, (DWORD) n, &len,
4694 				       &(slot->control));
4695 #elif defined(LINUX_NATIVE_AIO)
4696 			if (!os_aio_linux_dispatch(array, slot)) {
4697 				goto err_exit;
4698 			}
4699 #endif /* WIN_ASYNC_IO */
4700 		} else {
4701 			if (!wake_later) {
4702 				os_aio_simulated_wake_handler_thread(
4703 					os_aio_get_segment_no_from_slot(
4704 						array, slot));
4705 			}
4706 		}
4707 	} else if (type == OS_FILE_WRITE) {
4708 		ut_ad(!srv_read_only_mode);
4709 		if (srv_use_native_aio) {
4710 			os_n_file_writes++;
4711 #ifdef WIN_ASYNC_IO
4712 			ret = WriteFile(file.m_file, buf, (DWORD) n, &len,
4713 					&(slot->control));
4714 #elif defined(LINUX_NATIVE_AIO)
4715 			if (!os_aio_linux_dispatch(array, slot)) {
4716 				goto err_exit;
4717 			}
4718 #endif /* WIN_ASYNC_IO */
4719 		} else {
4720 			if (!wake_later) {
4721 				os_aio_simulated_wake_handler_thread(
4722 					os_aio_get_segment_no_from_slot(
4723 						array, slot));
4724 			}
4725 		}
4726 	} else {
4727 		ut_error;
4728 	}
4729 
4730 #ifdef WIN_ASYNC_IO
4731 	if (srv_use_native_aio) {
4732 		if ((ret && len == n)
4733 		    || (!ret && GetLastError() == ERROR_IO_PENDING)) {
4734 			/* aio was queued successfully! */
4735 
4736 			if (mode == OS_AIO_SYNC) {
4737 				/* We want a synchronous i/o operation on a
4738 				file where we also use async i/o: in Windows
4739 				we must use the same wait mechanism as for
4740 				async i/o */
4741 
4742 				retval = os_aio_windows_handle(
4743 					ULINT_UNDEFINED, slot->pos,
4744 					&dummy_mess1, &dummy_mess2,
4745 					&dummy_type);
4746 
4747 				return(retval);
4748 			}
4749 
4750 			return(TRUE);
4751 		}
4752 
4753 		goto err_exit;
4754 	}
4755 #endif /* WIN_ASYNC_IO */
4756 	/* aio was queued successfully! */
4757 	return(TRUE);
4758 
4759 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4760 err_exit:
4761 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
4762 	os_aio_array_free_slot(array, slot);
4763 
4764 	if (os_file_handle_error(
4765 		name,type == OS_FILE_READ ? "aio read" : "aio write")) {
4766 
4767 		goto try_again;
4768 	}
4769 
4770 	return(FALSE);
4771 }
4772 
#ifdef WIN_ASYNC_IO
/**********************************************************************//**
This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
If the server is shutting down (SRV_SHUTDOWN_EXIT_THREADS) and the array
has no reserved slots, the function returns TRUE with both messages set
to NULL; *type is not written on that path.
@return	TRUE if the aio operation succeeded */
UNIV_INTERN
ibool
os_aio_windows_handle(
/*==================*/
	ulint	segment,	/*!< in: the number of the segment in the aio
				arrays to wait for; segment 0 is the ibuf
				i/o thread, segment 1 the log i/o thread,
				then follow the non-ibuf read threads, and as
				the last are the non-ibuf write threads; if
				this is ULINT_UNDEFINED, then it means that
				sync aio is used, and this parameter is
				ignored */
	ulint	pos,		/*!< this parameter is used only in sync aio:
				wait for the aio slot at this position */
	fil_node_t**message1,	/*!< out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2,
	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
{
	ulint		orig_seg	= segment;
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	ulint		n;
	ulint		i;
	ibool		ret_val		= FALSE;
	BOOL		ret;
	DWORD		len;
	BOOL		retry		= FALSE;

	if (segment == ULINT_UNDEFINED) {
		segment = 0;
		array = os_aio_sync_array;
	} else {
		segment = os_aio_get_array_and_local_segment(&array, segment);
	}

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());
	ut_ad(segment < array->n_segments);

	/* Number of slots (and of wait handles) in one segment. */
	n = array->n_slots / array->n_segments;

	if (array == os_aio_sync_array) {

		WaitForSingleObject(
			os_aio_array_get_nth_slot(array, pos)->handle,
			INFINITE);

		i = pos;

	} else {
		if (orig_seg != ULINT_UNDEFINED) {
			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
		}

		i = WaitForMultipleObjects(
			(DWORD) n, array->handles + segment * n,
			FALSE, INFINITE);
	}

	os_mutex_enter(array->mutex);

	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && array->n_reserved == 0) {
		*message1 = NULL;
		*message2 = NULL;
		os_mutex_exit(array->mutex);
		return(TRUE);
	}

	/* A successful wait on n handles yields an index in the range
	[WAIT_OBJECT_0, WAIT_OBJECT_0 + n - 1]. The upper bound must be
	exclusive: accepting i == n would let us pick a slot that lies
	outside the segment we waited on. */
	ut_a(i >= WAIT_OBJECT_0 && i < WAIT_OBJECT_0 + n);

	slot = os_aio_array_get_nth_slot(array, i + segment * n);

	ut_a(slot->reserved);

	if (orig_seg != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(
			orig_seg, "get windows aio return value");
	}
	ret = GetOverlappedResult(slot->file.m_file, &(slot->control), &len, TRUE);

	*message1 = slot->message1;
	*message2 = slot->message2;

	*type = slot->type;

	if (ret && len == slot->len) {

		ret_val = TRUE;
	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		/* The error handler asks for the operation to be retried;
		ret_val is decided by the retry below. */
		retry = TRUE;
	} else {

		ret_val = FALSE;
	}

	os_mutex_exit(array->mutex);

	if (retry) {
		/* retry failed read/write operation synchronously.
		No need to hold array->mutex. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		struct PSI_file_locker* locker = NULL;
		PSI_file_locker_state	state;
		register_pfs_file_io_begin(&state, locker, slot->file, slot->len,
					   (slot->type == OS_FILE_WRITE)
						? PSI_FILE_WRITE
						: PSI_FILE_READ,
					    __FILE__, __LINE__);
#endif

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		switch (slot->type) {
		case OS_FILE_WRITE:
			ret = WriteFile(slot->file.m_file, slot->buf,
					(DWORD) slot->len, &len,
					&(slot->control));
			break;
		case OS_FILE_READ:
			ret = ReadFile(slot->file.m_file, slot->buf,
				       (DWORD) slot->len, &len,
				       &(slot->control));
			break;
		default:
			ut_error;
		}

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, len);
#endif

		if (!ret && GetLastError() == ERROR_IO_PENDING) {
			/* aio was queued successfully!
			We want a synchronous i/o operation on a
			file where we also use async i/o: in Windows
			we must use the same wait mechanism as for
			async i/o */
			ret = GetOverlappedResult(slot->file.m_file,
						  &(slot->control),
						  &len, TRUE);
		}

		ret_val = ret && len == slot->len;
	}

	os_aio_array_free_slot(array, slot);

	return(ret_val);
}
#endif
4944 
4945 #if defined(LINUX_NATIVE_AIO)
4946 /******************************************************************//**
4947 This function is only used in Linux native asynchronous i/o. This is
4948 called from within the io-thread. If there are no completed IO requests
4949 in the slot array, the thread calls this function to collect more
4950 requests from the kernel.
4951 The io-thread waits on io_getevents(), which is a blocking call, with
4952 a timeout value. Unless the system is very heavy loaded, keeping the
4953 io-thread very busy, the io-thread will spend most of its time waiting
4954 in this function.
4955 The io-thread also exits in this function. It checks server status at
4956 each wakeup and that is why we use timed wait in io_getevents(). */
4957 static
4958 void
os_aio_linux_collect(os_aio_array_t * array,ulint segment,ulint seg_size)4959 os_aio_linux_collect(
4960 /*=================*/
4961 	os_aio_array_t* array,		/*!< in/out: slot array. */
4962 	ulint		segment,	/*!< in: local segment no. */
4963 	ulint		seg_size)	/*!< in: segment size. */
4964 {
4965 	int			i;
4966 	int			ret;
4967 	ulint			start_pos;
4968 	ulint			end_pos;
4969 	struct timespec		timeout;
4970 	struct io_event*	events;
4971 	struct io_context*	io_ctx;
4972 
4973 	/* sanity checks. */
4974 	ut_ad(array != NULL);
4975 	ut_ad(seg_size > 0);
4976 	ut_ad(segment < array->n_segments);
4977 
4978 	/* Which part of event array we are going to work on. */
4979 	events = &array->aio_events[segment * seg_size];
4980 
4981 	/* Which io_context we are going to use. */
4982 	io_ctx = array->aio_ctx[segment];
4983 
4984 	/* Starting point of the segment we will be working on. */
4985 	start_pos = segment * seg_size;
4986 
4987 	/* End point. */
4988 	end_pos = start_pos + seg_size;
4989 
4990 retry:
4991 
4992 	/* Initialize the events. The timeout value is arbitrary.
4993 	We probably need to experiment with it a little. */
4994 	memset(events, 0, sizeof(*events) * seg_size);
4995 	timeout.tv_sec = 0;
4996 	timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4997 
4998 	ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4999 
5000 	if (ret > 0) {
5001 		for (i = 0; i < ret; i++) {
5002 			os_aio_slot_t*	slot;
5003 			struct iocb*	control;
5004 
5005 			control = (struct iocb*) events[i].obj;
5006 			ut_a(control != NULL);
5007 
5008 			slot = (os_aio_slot_t*) control->data;
5009 
5010 			/* Some sanity checks. */
5011 			ut_a(slot != NULL);
5012 			ut_a(slot->reserved);
5013 
5014 #if defined(UNIV_AIO_DEBUG)
5015 			fprintf(stderr,
5016 				"io_getevents[%c]: slot[%p] ctx[%p]"
5017 				" seg[%lu]\n",
5018 				(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
5019 				slot, io_ctx, segment);
5020 #endif
5021 
5022 			/* We are not scribbling previous segment. */
5023 			ut_a(slot->pos >= start_pos);
5024 
5025 			/* We have not overstepped to next segment. */
5026 			ut_a(slot->pos < end_pos);
5027 
5028 			/* Mark this request as completed. The error handling
5029 			will be done in the calling function. */
5030 			os_mutex_enter(array->mutex);
5031 			slot->n_bytes = events[i].res;
5032 			slot->ret = events[i].res2;
5033 			slot->io_already_done = TRUE;
5034 			os_mutex_exit(array->mutex);
5035 		}
5036 		return;
5037 	}
5038 
5039 	if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
5040 		return;
5041 	}
5042 
5043 	/* This error handling is for any error in collecting the
5044 	IO requests. The errors, if any, for any particular IO
5045 	request are simply passed on to the calling routine. */
5046 
5047 	switch (ret) {
5048 	case -EAGAIN:
5049 		/* Not enough resources! Try again. */
5050 	case -EINTR:
5051 		/* Interrupted! I have tested the behaviour in case of an
5052 		interrupt. If we have some completed IOs available then
5053 		the return code will be the number of IOs. We get EINTR only
5054 		if there are no completed IOs and we have been interrupted. */
5055 	case 0:
5056 		/* No pending request! Go back and check again. */
5057 		goto retry;
5058 	}
5059 
5060 	/* All other errors should cause a trap for now. */
5061 	ut_print_timestamp(stderr);
5062 	fprintf(stderr,
5063 		" InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
5064 		ret);
5065 	ut_error;
5066 }
5067 
5068 /**********************************************************************//**
5069 This function is only used in Linux native asynchronous i/o.
5070 Waits for an aio operation to complete. This function is used to wait for
5071 the completed requests. The aio array of pending requests is divided
5072 into segments. The thread specifies which segment or slot it wants to wait
5073 for. NOTE: this function will also take care of freeing the aio slot,
5074 therefore no other thread is allowed to do the freeing!
5075 @return	TRUE if the IO was successful */
5076 UNIV_INTERN
5077 ibool
os_aio_linux_handle(ulint global_seg,fil_node_t ** message1,void ** message2,ulint * type)5078 os_aio_linux_handle(
5079 /*================*/
5080 	ulint	global_seg,	/*!< in: segment number in the aio array
5081 				to wait for; segment 0 is the ibuf
5082 				i/o thread, segment 1 is log i/o thread,
5083 				then follow the non-ibuf read threads,
5084 				and the last are the non-ibuf write
5085 				threads. */
5086 	fil_node_t**message1,	/*!< out: the messages passed with the */
5087 	void**	message2,	/*!< aio request; note that in case the
5088 				aio operation failed, these output
5089 				parameters are valid and can be used to
5090 				restart the operation. */
5091 	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
5092 {
5093 	ulint		segment;
5094 	os_aio_array_t*	array;
5095 	os_aio_slot_t*	slot;
5096 	ulint		n;
5097 	ulint		i;
5098 	ibool		ret = FALSE;
5099 
5100 	/* Should never be doing Sync IO here. */
5101 	ut_a(global_seg != ULINT_UNDEFINED);
5102 
5103 	/* Find the array and the local segment. */
5104 	segment = os_aio_get_array_and_local_segment(&array, global_seg);
5105 	n = array->n_slots / array->n_segments;
5106 
5107 	/* Loop until we have found a completed request. */
5108 	for (;;) {
5109 		ibool	any_reserved = FALSE;
5110 		os_mutex_enter(array->mutex);
5111 		for (i = 0; i < n; ++i) {
5112 			slot = os_aio_array_get_nth_slot(
5113 				array, i + segment * n);
5114 			if (!slot->reserved) {
5115 				continue;
5116 			} else if (slot->io_already_done) {
5117 				/* Something for us to work on. */
5118 				goto found;
5119 			} else {
5120 				any_reserved = TRUE;
5121 			}
5122 		}
5123 
5124 		os_mutex_exit(array->mutex);
5125 
5126 		/* There is no completed request.
5127 		If there is no pending request at all,
5128 		and the system is being shut down, exit. */
5129 		if (UNIV_UNLIKELY
5130 		    (!any_reserved
5131 		     && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
5132 			*message1 = NULL;
5133 			*message2 = NULL;
5134 			return(TRUE);
5135 		}
5136 
5137 		/* Wait for some request. Note that we return
5138 		from wait iff we have found a request. */
5139 
5140 		srv_set_io_thread_op_info(global_seg,
5141 			"waiting for completed aio requests");
5142 		os_aio_linux_collect(array, segment, n);
5143 	}
5144 
5145 found:
5146 	/* Note that it may be that there are more then one completed
5147 	IO requests. We process them one at a time. We may have a case
5148 	here to improve the performance slightly by dealing with all
5149 	requests in one sweep. */
5150 	srv_set_io_thread_op_info(global_seg,
5151 				"processing completed aio requests");
5152 
5153 	/* Ensure that we are scribbling only our segment. */
5154 	ut_a(i < n);
5155 
5156 	ut_ad(slot != NULL);
5157 	ut_ad(slot->reserved);
5158 	ut_ad(slot->io_already_done);
5159 
5160 	*message1 = slot->message1;
5161 	*message2 = slot->message2;
5162 
5163 	*type = slot->type;
5164 
5165 	if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
5166 
5167 		ret = TRUE;
5168 	} else {
5169 		errno = -slot->ret;
5170 
5171 		/* os_file_handle_error does tell us if we should retry
5172 		this IO. As it stands now, we don't do this retry when
5173 		reaping requests from a different context than
5174 		the dispatcher. This non-retry logic is the same for
5175 		windows and linux native AIO.
5176 		We should probably look into this to transparently
5177 		re-submit the IO. */
5178 		os_file_handle_error(slot->name, "Linux aio");
5179 
5180 		ret = FALSE;
5181 	}
5182 
5183 	os_mutex_exit(array->mutex);
5184 
5185 	os_aio_array_free_slot(array, slot);
5186 
5187 	return(ret);
5188 }
5189 #endif /* LINUX_NATIVE_AIO */
5190 
/**********************************************************************//**
Does simulated aio. This function should be called by an i/o-handler
thread.

Each call hands back the messages of exactly one request stored in the
caller's segment of the aio array and frees that slot:
- if a slot of the segment is already marked io_already_done (its i/o was
  performed as part of a merged batch in an earlier call), that slot is
  returned without doing any i/o;
- otherwise one request is chosen (the oldest one if it has waited at least
  2 seconds, else the one at the lowest offset), up to
  OS_AIO_MERGE_N_CONSECUTIVE requests adjacent to it are merged, the i/o is
  done synchronously, and the first request of the batch is returned;
- if the segment has no reserved slot, the thread waits on the segment's
  event and retries, unless srv_shutdown_state is
  SRV_SHUTDOWN_EXIT_THREADS, in which case TRUE is returned with
  *message1 and *message2 set to NULL (and *type left untouched).
@return	TRUE if the aio operation succeeded */
UNIV_INTERN
ibool
os_aio_simulated_handle(
/*====================*/
	ulint	global_segment,	/*!< in: the number of the segment in the aio
				arrays to wait for; segment 0 is the ibuf
				i/o thread, segment 1 the log i/o thread,
				then follow the non-ibuf read threads, and as
				the last are the non-ibuf write threads */
	fil_node_t**message1,	/*!< out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2,	/*!< out: second message passed with the aio
				request; see message1 */
	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
{
	os_aio_array_t*	array;
	ulint		segment;
	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
	ulint		n_consecutive;
	ulint		total_len;
	ulint		offs;
	os_offset_t	lowest_offset;
	ulint		biggest_age;
	ulint		age;
	byte*		combined_buf;
	byte*		combined_buf2;
	ibool		ret;
	ibool		any_reserved;
	ulint		n;
	os_aio_slot_t*	aio_slot;

	/* Fix compiler warning: give the first element a defined value */
	*consecutive_ios = NULL;

	/* Translate the global segment number into the owning array and
	the segment number local to that array. */
	segment = os_aio_get_array_and_local_segment(&array, global_segment);

restart:
	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	srv_set_io_thread_op_info(global_segment,
				  "looking for i/o requests (a)");
	ut_ad(os_aio_validate_skip());
	ut_ad(segment < array->n_segments);

	/* Number of slots that belong to one segment */
	n = array->n_slots / array->n_segments;

	/* Look through n slots after the segment * n'th slot */

	if (array == os_aio_read_array
	    && os_aio_recommend_sleep_for_read_threads) {

		/* Give other threads chance to add several i/os to the array
		at once. The array mutex is not held on this path. */

		goto recommended_sleep;
	}

	srv_set_io_thread_op_info(global_segment,
				  "looking for i/o requests (b)");

	/* Check if there is a slot for which the i/o has already been
	done */
	any_reserved = FALSE;

	os_mutex_enter(array->mutex);

	for (ulint i = 0; i < n; i++) {
		os_aio_slot_t*	slot;

		slot = os_aio_array_get_nth_slot(array, i + segment * n);

		if (!slot->reserved) {
			continue;
		} else if (slot->io_already_done) {

			if (os_aio_print_debug) {
				fprintf(stderr,
					"InnoDB: i/o for slot %lu"
					" already done, returning\n",
					(ulong) i);
			}

			/* The i/o for this slot was completed by an earlier
			call; just report it. The mutex is still held when
			jumping to slot_io_done. */
			aio_slot = slot;
			ret = TRUE;
			goto slot_io_done;
		} else {
			any_reserved = TRUE;
		}
	}

	/* There is no completed request.
	If there is no pending request at all,
	and the system is being shut down, exit. */
	if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
		os_mutex_exit(array->mutex);
		*message1 = NULL;
		*message2 = NULL;
		return(TRUE);
	}

	n_consecutive = 0;

	/* If there are at least 2 seconds old requests, then pick the oldest
	one to prevent starvation. If several requests have the same age,
	then pick the one at the lowest offset. */

	biggest_age = 0;
	lowest_offset = IB_UINT64_MAX;

	for (ulint i = 0; i < n; i++) {
		os_aio_slot_t*	slot;

		slot = os_aio_array_get_nth_slot(array, i + segment * n);

		if (slot->reserved) {

			/* Whole seconds elapsed since the slot was
			reserved */
			age = (ulint) difftime(
				ut_time(), slot->reservation_time);

			if ((age >= 2 && age > biggest_age)
			    || (age >= 2 && age == biggest_age
				&& slot->offset < lowest_offset)) {

				/* Found an i/o request */
				consecutive_ios[0] = slot;

				n_consecutive = 1;

				biggest_age = age;
				lowest_offset = slot->offset;
			}
		}
	}

	if (n_consecutive == 0) {
		/* There were no old requests. Look for the reserved i/o
		request at the lowest offset in the segment. */

		lowest_offset = IB_UINT64_MAX;

		for (ulint i = 0; i < n; i++) {
			os_aio_slot_t*	slot;

			slot = os_aio_array_get_nth_slot(
				array, i + segment * n);

			if (slot->reserved && slot->offset < lowest_offset) {

				/* Found an i/o request */
				consecutive_ios[0] = slot;

				n_consecutive = 1;

				lowest_offset = slot->offset;
			}
		}
	}

	if (n_consecutive == 0) {

		/* No i/o requested at the moment; the array mutex is
		still held and is released under wait_for_io */

		goto wait_for_io;
	}

	/* if n_consecutive != 0, then we have assigned
	something valid to consecutive_ios[0] */
	ut_ad(n_consecutive != 0);
	ut_ad(consecutive_ios[0] != NULL);

	aio_slot = consecutive_ios[0];

	/* Check if there are several consecutive blocks to read or write.
	In this loop aio_slot is the current tail of the batch: we look for
	a request of the same type on the same file that starts exactly
	where the tail ends. */

consecutive_loop:
	for (ulint i = 0; i < n; i++) {
		os_aio_slot_t*	slot;

		slot = os_aio_array_get_nth_slot(array, i + segment * n);
		if (slot->reserved
		    && slot != aio_slot
		    && slot->offset == aio_slot->offset + aio_slot->len
		    && slot->type == aio_slot->type
		    && slot->file.m_file == aio_slot->file.m_file) {

			/* Found a consecutive i/o request */

			consecutive_ios[n_consecutive] = slot;
			n_consecutive++;

			aio_slot = slot;

			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {

				/* Rescan the segment from its first slot
				for a successor of the new tail */
				goto consecutive_loop;
			} else {
				/* The batch is full */
				break;
			}
		}
	}

	srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");

	/* We have now collected n_consecutive i/o requests in the array;
	allocate a single buffer which can hold all data, and perform the
	i/o */

	total_len = 0;
	/* From here on aio_slot is the first request of the batch; its
	offset, file, name and type are used for the combined i/o. */
	aio_slot = consecutive_ios[0];

	for (ulint i = 0; i < n_consecutive; i++) {
		total_len += consecutive_ios[i]->len;
	}

	if (n_consecutive == 1) {
		/* We can use the buffer of the i/o request */
		combined_buf = aio_slot->buf;
		combined_buf2 = NULL;
	} else {
		/* combined_buf2 is the raw allocation (freed below);
		combined_buf is the aligned pointer used for the i/o. The
		extra UNIV_PAGE_SIZE bytes leave room for the alignment
		(assuming ut_align() rounds the pointer up). */
		combined_buf2 = static_cast<byte*>(
			ut_malloc(total_len + UNIV_PAGE_SIZE));

		ut_a(combined_buf2);

		combined_buf = static_cast<byte*>(
			ut_align(combined_buf2, UNIV_PAGE_SIZE));
	}

	/* We release the array mutex for the time of the i/o: NOTE that
	this assumes that there is just one i/o-handler thread serving
	a single segment of slots! */

	os_mutex_exit(array->mutex);

	if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
		/* Copy the buffers to the combined buffer */
		offs = 0;

		for (ulint i = 0; i < n_consecutive; i++) {

			ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
				  consecutive_ios[i]->len);

			offs += consecutive_ios[i]->len;
		}
	}

	srv_set_io_thread_op_info(global_segment, "doing file i/o");

	/* Do the i/o with ordinary, synchronous i/o functions: */
	if (aio_slot->type == OS_FILE_WRITE) {
		ut_ad(!srv_read_only_mode);
		ret = os_file_write(
			aio_slot->name, aio_slot->file, combined_buf,
			aio_slot->offset, total_len);
	} else {
		ret = os_file_read(
			aio_slot->file, combined_buf,
			aio_slot->offset, total_len);
	}

	/* A failed read or write is asserted on here rather than being
	reported to the caller. */
	ut_a(ret);
	srv_set_io_thread_op_info(global_segment, "file i/o done");

	if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
		/* Copy the combined buffer to individual buffers */
		offs = 0;

		for (ulint i = 0; i < n_consecutive; i++) {

			ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
				  consecutive_ios[i]->len);
			offs += consecutive_ios[i]->len;
		}
	}

	if (combined_buf2) {
		ut_free(combined_buf2);
	}

	os_mutex_enter(array->mutex);

	/* Mark the i/os done in slots */

	for (ulint i = 0; i < n_consecutive; i++) {
		consecutive_ios[i]->io_already_done = TRUE;
	}

	/* We return the messages for the first slot now, and if there were
	several slots, the messages will be returned with subsequent calls
	of this function */

slot_io_done:
	/* Reached with the array mutex held and ret set, either directly
	from the first scan or after performing the i/o above. */

	ut_a(aio_slot->reserved);

	*message1 = aio_slot->message1;
	*message2 = aio_slot->message2;

	*type = aio_slot->type;

	os_mutex_exit(array->mutex);

	os_aio_array_free_slot(array, aio_slot);

	return(ret);

wait_for_io:
	srv_set_io_thread_op_info(global_segment, "resetting wait event");

	/* We wait here until there again can be i/os in the segment
	of this thread. The event is reset while the array mutex is still
	held, and the mutex is released only afterwards. */

	os_event_reset(os_aio_segment_wait_events[global_segment]);

	os_mutex_exit(array->mutex);

recommended_sleep:
	/* The array mutex is not held here on either path. */
	srv_set_io_thread_op_info(global_segment, "waiting for i/o request");

	os_event_wait(os_aio_segment_wait_events[global_segment]);

	goto restart;
}
5523 
5524 /**********************************************************************//**
5525 Validates the consistency of an aio array.
5526 @return	true if ok */
5527 static
5528 bool
os_aio_array_validate(os_aio_array_t * array)5529 os_aio_array_validate(
5530 /*==================*/
5531 	os_aio_array_t*	array)	/*!< in: aio wait array */
5532 {
5533 	ulint		i;
5534 	ulint		n_reserved	= 0;
5535 
5536 	os_mutex_enter(array->mutex);
5537 
5538 	ut_a(array->n_slots > 0);
5539 	ut_a(array->n_segments > 0);
5540 
5541 	for (i = 0; i < array->n_slots; i++) {
5542 		os_aio_slot_t*	slot;
5543 
5544 		slot = os_aio_array_get_nth_slot(array, i);
5545 
5546 		if (slot->reserved) {
5547 			n_reserved++;
5548 			ut_a(slot->len > 0);
5549 		}
5550 	}
5551 
5552 	ut_a(array->n_reserved == n_reserved);
5553 
5554 	os_mutex_exit(array->mutex);
5555 
5556 	return(true);
5557 }
5558 
5559 /**********************************************************************//**
5560 Validates the consistency the aio system.
5561 @return	TRUE if ok */
5562 UNIV_INTERN
5563 ibool
os_aio_validate(void)5564 os_aio_validate(void)
5565 /*=================*/
5566 {
5567 	os_aio_array_validate(os_aio_read_array);
5568 
5569 	if (os_aio_write_array != 0) {
5570 		os_aio_array_validate(os_aio_write_array);
5571 	}
5572 
5573 	if (os_aio_ibuf_array != 0) {
5574 		os_aio_array_validate(os_aio_ibuf_array);
5575 	}
5576 
5577 	if (os_aio_log_array != 0) {
5578 		os_aio_array_validate(os_aio_log_array);
5579 	}
5580 
5581 	if (os_aio_sync_array != 0) {
5582 		os_aio_array_validate(os_aio_sync_array);
5583 	}
5584 
5585 	return(TRUE);
5586 }
5587 
5588 /**********************************************************************//**
5589 Prints pending IO requests per segment of an aio array.
5590 We probably don't need per segment statistics but they can help us
5591 during development phase to see if the IO requests are being
5592 distributed as expected. */
5593 static
5594 void
os_aio_print_segment_info(FILE * file,ulint * n_seg,os_aio_array_t * array)5595 os_aio_print_segment_info(
5596 /*======================*/
5597 	FILE*		file,	/*!< in: file where to print */
5598 	ulint*		n_seg,	/*!< in: pending IO array */
5599 	os_aio_array_t*	array)	/*!< in: array to process */
5600 {
5601 	ulint	i;
5602 
5603 	ut_ad(array);
5604 	ut_ad(n_seg);
5605 	ut_ad(array->n_segments > 0);
5606 
5607 	if (array->n_segments == 1) {
5608 		return;
5609 	}
5610 
5611 	fprintf(file, " [");
5612 	for (i = 0; i < array->n_segments; i++) {
5613 		if (i != 0) {
5614 			fprintf(file, ", ");
5615 		}
5616 
5617 		fprintf(file, "%lu", n_seg[i]);
5618 	}
5619 	fprintf(file, "] ");
5620 }
5621 
5622 /**********************************************************************//**
5623 Prints info about the aio array. */
5624 UNIV_INTERN
5625 void
os_aio_print_array(FILE * file,os_aio_array_t * array)5626 os_aio_print_array(
5627 /*==============*/
5628 	FILE*		file,	/*!< in: file where to print */
5629 	os_aio_array_t*	array)	/*!< in: aio array to print */
5630 {
5631 	ulint			n_reserved = 0;
5632 	ulint			n_res_seg[SRV_MAX_N_IO_THREADS];
5633 
5634 	os_mutex_enter(array->mutex);
5635 
5636 	ut_a(array->n_slots > 0);
5637 	ut_a(array->n_segments > 0);
5638 
5639 	memset(n_res_seg, 0x0, sizeof(n_res_seg));
5640 
5641 	for (ulint i = 0; i < array->n_slots; ++i) {
5642 		os_aio_slot_t*	slot;
5643 		ulint		seg_no;
5644 
5645 		slot = os_aio_array_get_nth_slot(array, i);
5646 
5647 		seg_no = (i * array->n_segments) / array->n_slots;
5648 
5649 		if (slot->reserved) {
5650 			++n_reserved;
5651 			++n_res_seg[seg_no];
5652 
5653 			ut_a(slot->len > 0);
5654 		}
5655 	}
5656 
5657 	ut_a(array->n_reserved == n_reserved);
5658 
5659 	fprintf(file, " %lu", (ulong) n_reserved);
5660 
5661 	os_aio_print_segment_info(file, n_res_seg, array);
5662 
5663 	os_mutex_exit(array->mutex);
5664 }
5665 
5666 /**********************************************************************//**
5667 Prints info of the aio arrays. */
5668 UNIV_INTERN
5669 void
os_aio_print(FILE * file)5670 os_aio_print(
5671 /*=========*/
5672 	FILE*	file)	/*!< in: file where to print */
5673 {
5674 	time_t		current_time;
5675 	double		time_elapsed;
5676 	double		avg_bytes_read;
5677 
5678 	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
5679 		fprintf(file, "I/O thread %lu state: %s (%s)",
5680 			(ulong) i,
5681 			srv_io_thread_op_info[i],
5682 			srv_io_thread_function[i]);
5683 
5684 #ifndef __WIN__
5685 		if (os_aio_segment_wait_events[i]->is_set) {
5686 			fprintf(file, " ev set");
5687 		}
5688 #endif /* __WIN__ */
5689 
5690 		fprintf(file, "\n");
5691 	}
5692 
5693 	fputs("Pending normal aio reads:", file);
5694 
5695 	os_aio_print_array(file, os_aio_read_array);
5696 
5697 	if (os_aio_write_array != 0) {
5698 		fputs(", aio writes:", file);
5699 		os_aio_print_array(file, os_aio_write_array);
5700 	}
5701 
5702 	if (os_aio_ibuf_array != 0) {
5703 		fputs(",\n ibuf aio reads:", file);
5704 		os_aio_print_array(file, os_aio_ibuf_array);
5705 	}
5706 
5707 	if (os_aio_log_array != 0) {
5708 		fputs(", log i/o's:", file);
5709 		os_aio_print_array(file, os_aio_log_array);
5710 	}
5711 
5712 	if (os_aio_sync_array != 0) {
5713 		fputs(", sync i/o's:", file);
5714 		os_aio_print_array(file, os_aio_sync_array);
5715 	}
5716 
5717 	putc('\n', file);
5718 	current_time = ut_time();
5719 	time_elapsed = 0.001 + difftime(current_time, os_last_printout);
5720 
5721 	fprintf(file,
5722 		"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
5723 		"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
5724 		(ulong) fil_n_pending_log_flushes,
5725 		(ulong) fil_n_pending_tablespace_flushes,
5726 		(ulong) os_n_file_reads,
5727 		(ulong) os_n_file_writes,
5728 		(ulong) os_n_fsyncs);
5729 
5730 	if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
5731 		fprintf(file,
5732 			"%lu pending preads, %lu pending pwrites\n",
5733 			(ulong) os_file_n_pending_preads,
5734 			(ulong) os_file_n_pending_pwrites);
5735 	}
5736 
5737 	if (os_n_file_reads == os_n_file_reads_old) {
5738 		avg_bytes_read = 0.0;
5739 	} else {
5740 		avg_bytes_read = (double) os_bytes_read_since_printout
5741 			/ (os_n_file_reads - os_n_file_reads_old);
5742 	}
5743 
5744 	fprintf(file,
5745 		"%.2f reads/s, %lu avg bytes/read,"
5746 		" %.2f writes/s, %.2f fsyncs/s\n",
5747 		(os_n_file_reads - os_n_file_reads_old)
5748 		/ time_elapsed,
5749 		(ulong) avg_bytes_read,
5750 		(os_n_file_writes - os_n_file_writes_old)
5751 		/ time_elapsed,
5752 		(os_n_fsyncs - os_n_fsyncs_old)
5753 		/ time_elapsed);
5754 
5755 	os_n_file_reads_old = os_n_file_reads;
5756 	os_n_file_writes_old = os_n_file_writes;
5757 	os_n_fsyncs_old = os_n_fsyncs;
5758 	os_bytes_read_since_printout = 0;
5759 
5760 	os_last_printout = current_time;
5761 }
5762 
5763 /**********************************************************************//**
5764 Refreshes the statistics used to print per-second averages. */
5765 UNIV_INTERN
5766 void
os_aio_refresh_stats(void)5767 os_aio_refresh_stats(void)
5768 /*======================*/
5769 {
5770 	os_n_file_reads_old = os_n_file_reads;
5771 	os_n_file_writes_old = os_n_file_writes;
5772 	os_n_fsyncs_old = os_n_fsyncs;
5773 	os_bytes_read_since_printout = 0;
5774 
5775 	os_last_printout = time(NULL);
5776 }
5777 
5778 #ifdef UNIV_DEBUG
5779 /**********************************************************************//**
5780 Checks that all slots in the system have been freed, that is, there are
5781 no pending io operations.
5782 @return	TRUE if all free */
5783 UNIV_INTERN
5784 ibool
os_aio_all_slots_free(void)5785 os_aio_all_slots_free(void)
5786 /*=======================*/
5787 {
5788 	os_aio_array_t*	array;
5789 	ulint		n_res	= 0;
5790 
5791 	array = os_aio_read_array;
5792 
5793 	os_mutex_enter(array->mutex);
5794 
5795 	n_res += array->n_reserved;
5796 
5797 	os_mutex_exit(array->mutex);
5798 
5799 	if (!srv_read_only_mode) {
5800 		ut_a(os_aio_write_array == 0);
5801 
5802 		array = os_aio_write_array;
5803 
5804 		os_mutex_enter(array->mutex);
5805 
5806 		n_res += array->n_reserved;
5807 
5808 		os_mutex_exit(array->mutex);
5809 
5810 		ut_a(os_aio_ibuf_array == 0);
5811 
5812 		array = os_aio_ibuf_array;
5813 
5814 		os_mutex_enter(array->mutex);
5815 
5816 		n_res += array->n_reserved;
5817 
5818 		os_mutex_exit(array->mutex);
5819 	}
5820 
5821 	ut_a(os_aio_log_array == 0);
5822 
5823 	array = os_aio_log_array;
5824 
5825 	os_mutex_enter(array->mutex);
5826 
5827 	n_res += array->n_reserved;
5828 
5829 	os_mutex_exit(array->mutex);
5830 
5831 	array = os_aio_sync_array;
5832 
5833 	os_mutex_enter(array->mutex);
5834 
5835 	n_res += array->n_reserved;
5836 
5837 	os_mutex_exit(array->mutex);
5838 
5839 	if (n_res == 0) {
5840 
5841 		return(TRUE);
5842 	}
5843 
5844 	return(FALSE);
5845 }
5846 #endif /* UNIV_DEBUG */
5847 
5848 #endif /* !UNIV_HOTBACKUP */
5849