1 /***********************************************************************
2 
3 Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Percona Inc.
5 Copyright (c) 2013, 2021, MariaDB Corporation.
6 
7 Portions of this file contain modifications contributed and copyrighted
8 by Percona Inc.. Those modifications are
9 gratefully acknowledged and are described briefly in the InnoDB
10 documentation. The contributions by Percona Inc. are incorporated with
11 their permission, and subject to the conditions contained in the file
12 COPYING.Percona.
13 
14 This program is free software; you can redistribute it and/or modify it
15 under the terms of the GNU General Public License as published by the
16 Free Software Foundation; version 2 of the License.
17 
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
21 Public License for more details.
22 
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
26 
27 ***********************************************************************/
28 
29 /**************************************************//**
30 @file os/os0file.cc
31 The interface to the operating system file i/o primitives
32 
33 Created 10/21/1995 Heikki Tuuri
34 *******************************************************/
35 
36 #ifndef UNIV_INNOCHECKSUM
37 #include "os0file.h"
38 #include "sql_const.h"
39 
40 #ifdef UNIV_LINUX
41 # include <sys/types.h>
42 # include <sys/stat.h>
43 #endif
44 
45 #include "srv0srv.h"
46 #include "srv0start.h"
47 #include "fil0fil.h"
48 #include "fsp0fsp.h"
49 #ifdef HAVE_LINUX_UNISTD_H
50 #include "unistd.h"
51 #endif
52 #include "os0event.h"
53 #include "os0thread.h"
54 
55 #include <vector>
56 
57 #ifdef LINUX_NATIVE_AIO
58 #include <libaio.h>
59 #endif /* LINUX_NATIVE_AIO */
60 
61 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
62 # include <fcntl.h>
63 # include <linux/falloc.h>
64 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
65 
66 #if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
67 # include <sys/ioctl.h>
68 # ifndef DFS_IOCTL_ATOMIC_WRITE_SET
69 #  define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
70 # endif
71 #endif
72 
73 #ifdef _WIN32
74 #include <winioctl.h>
75 #else
76 // my_test_if_atomic_write()
77 #include <my_sys.h>
78 #endif
79 
80 
81 /** Insert buffer segment id */
82 static const ulint IO_IBUF_SEGMENT = 0;
83 
84 /** Log segment id */
85 static const ulint IO_LOG_SEGMENT = 1;
86 
87 /** Number of retries for partial I/O's */
88 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
89 
90 /* This specifies the file permissions InnoDB uses when it creates files in
91 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
92 my_umask */
93 
#ifndef _WIN32
/** Umask for creating files; the default grants read and write
permission to the owning user and group. */
static ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
#else
/** Umask for creating files */
static ulint	os_innodb_umask	= 0;
/* NOTE(review): presumably the I/O completion ports used for data file
and log file AIO; neither is referenced in this part of the file, so
confirm against the code that creates them. */
static HANDLE	data_completion_port;
static HANDLE	log_completion_port;

/* Starts out as the FLS_OUT_OF_INDEXES sentinel.
NOTE(review): presumably the fiber-local-storage index behind
win_get_syncio_event() - confirm. */
static DWORD	fls_sync_io  = FLS_OUT_OF_INDEXES;
/** Completion key that AIO::wake_at_shutdown() posts to the completion
port of every AIO array. */
#define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1
#endif /* _WIN32 */
106 
107 /** In simulated aio, merge at most this many consecutive i/os */
108 static const ulint	OS_AIO_MERGE_N_CONSECUTIVE = 64;
109 
110 /** Flag indicating if the page_cleaner is in active state. */
111 extern bool buf_page_cleaner_is_active;
112 
113 #ifdef WITH_INNODB_DISALLOW_WRITES
114 #define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
115 #else
116 #define WAIT_ALLOW_WRITES() do { } while (0)
117 #endif /* WITH_INNODB_DISALLOW_WRITES */
118 
119 /**********************************************************************
120 
121 InnoDB AIO Implementation:
122 =========================
123 
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO by special IO-threads servicing the IO-requests.
126 
127 Simulated AIO:
128 ==============
129 
130 On platforms where we 'simulate' AIO, the following is a rough explanation
131 of the high level design.
132 There are four io-threads (for ibuf, log, read, write).
133 All synchronous IO requests are serviced by the calling thread using
134 os_file_write/os_file_read. The Asynchronous requests are queued up
135 in an array (there are four such arrays) by the calling thread.
136 Later these requests are picked up by the IO-thread and are serviced
137 synchronously.
138 
139 Windows native AIO:
140 ==================
141 
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO interface
is used. On Windows, one of the limitations is that if a file is opened
for AIO no synchronous IO can be done on it. Therefore we have an
extra fifth array to queue up synchronous IO requests.
147 There are innodb_file_io_threads helper threads. These threads work
148 on the four arrays mentioned above in Simulated AIO. No thread is
149 required for the sync array.
150 If a synchronous IO request is made, it is first queued in the sync
151 array. Then the calling thread itself waits on the request, thus
152 making the call synchronous.
153 If an AIO request is made the calling thread not only queues it in the
154 array but also submits the requests. The helper thread then collects
155 the completed IO request and calls completion routine on it.
156 
157 Linux native AIO:
158 =================
159 
160 If we have libaio installed on the system and innodb_use_native_aio
161 is set to true we follow the code path of native AIO, otherwise we
162 do simulated AIO.
163 There are innodb_file_io_threads helper threads. These threads work
164 on the four arrays mentioned above in Simulated AIO.
165 If a synchronous IO request is made, it is handled by calling
166 os_file_write/os_file_read.
167 If an AIO request is made the calling thread not only queues it in the
168 array but also submits the requests. The helper thread then collects
169 the completed IO request and calls completion routine on it.
170 
171 **********************************************************************/
172 
173 
174 #ifdef UNIV_PFS_IO
175 /* Keys to register InnoDB I/O with performance schema */
176 mysql_pfs_key_t  innodb_data_file_key;
177 mysql_pfs_key_t  innodb_log_file_key;
178 mysql_pfs_key_t  innodb_temp_file_key;
179 #endif /* UNIV_PFS_IO */
180 
181 class AIO;
182 
/** The asynchronous I/O context: one request slot of an AIO array.
The platform specific members near the end are selected at compile time
by WIN_ASYNC_IO and LINUX_NATIVE_AIO; every variant provides len and
n_bytes, although with different types. */
struct Slot {

#ifdef WIN_ASYNC_IO
	/** Windows control block for the aio request
	must be at the very start of Slot, so we can
	cast Slot* to OVERLAPPED*
	*/
	OVERLAPPED		control;
#endif

	/** index of the slot in the aio array */
	uint16_t		pos;

	/** true if this slot is reserved */
	bool			is_reserved;

	/** time when reserved */
	time_t			reservation_time;

	/** buffer used in i/o */
	byte*			buf;

	/** Buffer pointer used for actual IO. We advance this
	when partial IO is required and not buf */
	byte*			ptr;

	/** the IO request context (OS_FILE_READ or OS_FILE_WRITE) */
	IORequest		type;

	/** file offset in bytes */
	os_offset_t		offset;

	/** file where to read or write */
	pfs_os_file_t		file;

	/** file name or path */
	const char*		name;

	/** used only in simulated aio: true if the physical i/o
	already made and only the slot message needs to be passed
	to the caller of os_aio_simulated_handle */
	bool			io_already_done;

	/** file block size */
	ulint			file_block_size;

	/** The file node for which the IO is requested. */
	fil_node_t*		m1;

	/** the requester of an aio operation and which can be used
	to identify which pending aio operation was completed */
	void*			m2;

	/** AIO completion status */
	dberr_t			err;

#ifdef WIN_ASYNC_IO

	/** bytes written/read */
	DWORD			n_bytes;

	/** length of the block to read or write */
	DWORD			len;

	/** aio array containing this slot */
	AIO				*array;
#elif defined(LINUX_NATIVE_AIO)
	/** Linux control block for aio */
	struct iocb		control;

	/** AIO return code */
	int			ret;

	/** bytes written/read. */
	ssize_t			n_bytes;

	/** length of the block to read or write */
	ulint			len;
#else
	/** length of the block to read or write */
	ulint			len;

	/** bytes written/read. */
	ulint			n_bytes;
#endif /* WIN_ASYNC_IO */

	/** Length of the block before it was compressed.
	AIOHandler::post_io_processing() compares the total number of
	transferred bytes against this value. */
	uint32			original_len;

};
274 
/** The asynchronous i/o array structure.
Each instance owns a vector of Slot objects that is logically divided
into m_n_segments segments, guarded by m_mutex. The static members hold
the per-purpose arrays (ibuf, log, reads, writes, sync). */
class AIO {
public:
	/** Constructor
	@param[in]	id		Latch ID
	@param[in]	n_slots		Number of slots to configure
	@param[in]	segments	Number of segments to configure */
	AIO(latch_id_t id, ulint n_slots, ulint segments);

	/** Destructor */
	~AIO();

	/** Initialize the instance
	@return DB_SUCCESS or error code */
	dberr_t init();

	/** Requests for a slot in the aio array. If no slot is available, waits
	until not_full-event becomes signaled.

	@param[in]	type	IO context
	@param[in,out]	m1	message to be passed along with the AIO
				operation
	@param[in,out]	m2	message to be passed along with the AIO
				operation
	@param[in]	file	file handle
	@param[in]	name	name of the file or path as a null-terminated
				string
	@param[in,out]	buf	buffer where to read or from which to write
	@param[in]	offset	file offset, where to read from or start writing
	@param[in]	len	length of the block to read or write
	@return pointer to slot */
	Slot* reserve_slot(
		const IORequest&	type,
		fil_node_t*		m1,
		void*			m2,
		pfs_os_file_t		file,
		const char*		name,
		void*			buf,
		os_offset_t		offset,
		ulint			len)
		MY_ATTRIBUTE((warn_unused_result));

	/** @return number of reserved slots */
	ulint pending_io_count() const;

	/** Returns a pointer to the nth slot in the aio array.
	Asserts (ut_a) that the index is within bounds.
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	const Slot* at(ulint i) const
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Non const version of at().
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	Slot* at(ulint i)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Frees a slot in the AIO array, assumes caller owns the mutex.
	@param[in,out]	slot	Slot to release */
	void release(Slot* slot);

	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
	@param[in,out]	slot	Slot to release */
	void release_with_mutex(Slot* slot);

	/** Prints info about the aio array.
	@param[in,out]	file	Where to print */
	void print(FILE* file);

	/** @return the number of slots per segment, that is the total
	number of slots divided by the number of segments */
	ulint slots_per_segment() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_slots.size() / m_n_segments);
	}

	/** @return accessor for n_segments */
	ulint get_n_segments() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_n_segments);
	}

#ifdef UNIV_DEBUG
	/** @return true if the thread owns the mutex */
	bool is_mutex_owned() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(mutex_own(&m_mutex));
	}
#endif /* UNIV_DEBUG */

	/** Acquire the mutex */
	void acquire() const
	{
		mutex_enter(&m_mutex);
	}

	/** Release the mutex. Not to be confused with the overload
	release(Slot*), which frees a slot. */
	void release() const
	{
		mutex_exit(&m_mutex);
	}

	/** Write out the state to the file/stream
	@param[in, out]	file	File to write to */
	void to_file(FILE* file) const;

#ifdef LINUX_NATIVE_AIO
	/** Dispatch an AIO request to the kernel.
	@param[in,out]	slot	an already reserved slot
	@return true on success. */
	bool linux_dispatch(Slot* slot)
		MY_ATTRIBUTE((warn_unused_result));

	/** Accessor for an AIO event
	@param[in]	index	Index into the array
	@return the event at the index */
	io_event* io_events(ulint index)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(index < m_events.size());

		return(&m_events[index]);
	}

	/** Accessor for the AIO context.
	The segment bound is only checked in debug builds (ut_ad).
	@param[in]	segment	Segment for which to get the context
	@return the AIO context for the segment */
	io_context_t io_ctx(ulint segment)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(segment < get_n_segments());

		return(m_aio_ctx[segment]);
	}

	/** Creates an io_context_t for native linux AIO.
	@param[in]	max_events	number of events
	@param[out]	io_ctx		io_ctx to initialize.
	@return true on success. */
	static bool linux_create_io_ctx(unsigned max_events, io_context_t& io_ctx)
		MY_ATTRIBUTE((warn_unused_result));

	/** Checks if the system supports native linux aio. On some kernel
	versions where native aio is supported it won't work on tmpfs. In such
	cases we can't use native aio as it is not possible to mix simulated
	and native aio.
	@return true if supported, false otherwise. */
	static bool is_linux_native_aio_supported()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

#ifdef WIN_ASYNC_IO
	/** Completion port of this array; wake_at_shutdown() posts the
	shutdown key to it. */
	HANDLE m_completion_port;
	/** Wake up all AIO threads in Windows native aio by posting
	IOCP_SHUTDOWN_KEY to the completion port of every existing
	read, write, log and ibuf array (the sync array is not included). */
	static void wake_at_shutdown() {
		AIO *all_arrays[] = {s_reads, s_writes, s_log, s_ibuf };
		for (size_t i = 0; i < array_elements(all_arrays); i++) {
			AIO *a = all_arrays[i];
			if (a) {
				PostQueuedCompletionStatus(a->m_completion_port, 0,
					IOCP_SHUTDOWN_KEY, 0);
			}
		}
	}
#endif /* WIN_ASYNC_IO */

#ifdef _WIN32
	/** This function can be called if one wants to post a batch of reads
	and prefers an I/O - handler thread to handle them all at once later. You
	must call os_aio_simulated_wake_handler_threads later to ensure the
	threads are not left sleeping! */
	static void simulated_put_read_threads_to_sleep();
#endif /* _WIN32 */

	/** Create an instance using new(std::nothrow)
	@param[in]	id		Latch ID
	@param[in]	n_slots		The number of AIO request slots
	@param[in]	segments	The number of segments
	@return a new AIO instance */
	static AIO* create(
		latch_id_t	id,
		ulint		n_slots,
		ulint		segments)
		MY_ATTRIBUTE((warn_unused_result));

	/** Initializes the asynchronous io system. Creates one array each
	for ibuf and log I/O. Also creates one array each for read and write
	where each array is divided logically into n_readers and n_writers
	respectively. The caller must create an i/o handler thread for each
	segment in these arrays. This function also creates the sync array.
	No I/O handler thread needs to be created for that
	@param[in]	n_per_seg	maximum number of pending aio
					operations allowed per segment
	@param[in]	n_readers	number of reader threads
	@param[in]	n_writers	number of writer threads
	@param[in]	n_slots_sync	number of slots in the sync aio array
	@return true if AIO sub-system was started successfully */
	static bool start(
		ulint		n_per_seg,
		ulint		n_readers,
		ulint		n_writers,
		ulint		n_slots_sync)
		MY_ATTRIBUTE((warn_unused_result));

	/** Free the AIO arrays */
	static void shutdown();

	/** Print all the AIO segments
	@param[in,out]	file		Where to print */
	static void print_all(FILE* file);

	/** Calculates local segment number and aio array from global
	segment number.
	@param[out]	array		AIO wait array
	@param[in]	segment		global segment number
	@return local segment number within the aio array */
	static ulint get_array_and_local_segment(
		AIO**		array,
		ulint		segment)
		MY_ATTRIBUTE((warn_unused_result));

	/** Select the IO slot array
	@param[in,out]	type		Type of IO, READ or WRITE
	@param[in]	read_only	true if running in read-only mode
	@param[in]	mode		IO mode
	@return slot array or NULL if invalid mode specified */
	static AIO* select_slot_array(
		IORequest&		type,
		bool			read_only,
		ulint			mode)
		MY_ATTRIBUTE((warn_unused_result));

	/** Calculates segment number for a slot.
	@param[in]	array		AIO wait array
	@param[in]	slot		slot in this array
	@return segment number (which is the number used by, for example,
		I/O handler threads) */
	static ulint get_segment_no_from_slot(
		const AIO*	array,
		const Slot*	slot)
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays */
	static void wake_simulated_handler_thread(ulint global_segment);

	/** Check if it is a read request
	@param[in]	aio		The AIO instance to check
	@return true if the AIO instance is for reading. */
	static bool is_read(const AIO* aio)
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(s_reads == aio);
	}

	/** Wait on an event until no pending writes.
	Dereferences s_writes without a NULL check, so the write array
	must exist when this is called. */
	static void wait_until_no_pending_writes()
	{
		os_event_wait(AIO::s_writes->m_is_empty);
	}

	/** Print to file
	@param[in]	file		File to write to */
	static void print_to_file(FILE* file);

	/** Check for pending IO. Gets the count and also validates the
	data structures.
	@return count of pending IO requests */
	static ulint total_pending_io_count();

private:
	/** Initialise the slots
	@return DB_SUCCESS or error code */
	dberr_t init_slots()
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do for a local segment in the AIO array.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays
	@param[in]	segment		the local segment in the AIO array */
	void wake_simulated_handler_thread(ulint global_segment, ulint segment);

	/** Prints pending IO requests per segment of an aio array.
	We probably don't need per segment statistics but they can help us
	during development phase to see if the IO requests are being
	distributed as expected.
	@param[in,out]	file		file where to print
	@param[in]	segments	pending IO array */
	void print_segment_info(
		FILE*		file,
		const ulint*	segments);

#ifdef LINUX_NATIVE_AIO
	/** Initialise the Linux native AIO data structures
	@return DB_SUCCESS or error code */
	dberr_t init_linux_native_aio()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

private:
	typedef std::vector<Slot> Slots;

	/** the mutex protecting the aio array; mutable so that the const
	members acquire() and release() can lock and unlock it */
	mutable SysMutex	m_mutex;

	/** The slots of the array.
	Number of elements must be divisible by the number of segments
	(see slots_per_segment()). */
	Slots			m_slots;

	/** Number of segments in the aio array of pending aio requests.
	A thread can wait separately for any one of the segments. */
	ulint			m_n_segments;

	/** The event which is set to the signaled state when
	there is space in the aio outside the ibuf segment;
	os_event_set() and os_event_reset() are protected by AIO::m_mutex */
	os_event_t		m_not_full;

	/** The event which is set to the signaled state when
	there are no pending i/os in this array;
	os_event_set() and os_event_reset() are protected by AIO::m_mutex */
	os_event_t		m_is_empty;

	/** Number of reserved slots in the AIO array outside
	the ibuf segment */
	ulint			m_n_reserved;


#if defined(LINUX_NATIVE_AIO)
	typedef std::vector<io_event> IOEvents;

	/** completion queue for IO. There is one such queue per
	segment. Each thread will work on one ctx exclusively. */
	std::vector<io_context_t>		m_aio_ctx;

	/** The array to collect completed IOs. There is one such
	event for each possible pending IO. The size of the array
	is equal to m_slots.size(). */
	IOEvents		m_events;
#endif /* LINUX_NATIVE_AIO */

	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
	sync AIO. These are NULL when the module has not yet been
	initialized. */

	/** Insert buffer */
	static AIO*		s_ibuf;

	/** Redo log */
	static AIO*		s_log;

	/** Reads */
	static AIO*		s_reads;

	/** Writes */
	static AIO*		s_writes;

	/** Synchronous I/O */
	static AIO*		s_sync;
};
648 
/** Definitions of the static AIO array pointers declared in class AIO.
Having static storage duration and no initializer, they start out as
NULL. */
AIO*	AIO::s_reads;
AIO*	AIO::s_writes;
AIO*	AIO::s_ibuf;
AIO*	AIO::s_log;
AIO*	AIO::s_sync;
655 
#if defined(LINUX_NATIVE_AIO)
/** Timeout for each io_getevents() call: 500ms, expressed in
nanoseconds. */
static const ulint	OS_AIO_REAP_TIMEOUT = 500000000UL;

/** Time to sleep, in microseconds (i.e. 0.5 seconds), if io_setup()
returns EAGAIN. */
static const ulint	OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;

/** number of attempts before giving up on io_setup(). */
static const int	OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
#endif /* LINUX_NATIVE_AIO */
666 
/** Array of events used in simulated AIO */
static os_event_t*	os_aio_segment_wait_events;

/** Number of asynchronous I/O segments.  Set by os_aio_init(). */
static ulint		os_aio_n_segments = ULINT_UNDEFINED;

/** If the following is true, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static bool		os_aio_recommend_sleep_for_read_threads;

/* I/O statistics. Only os_n_file_reads is an Atomic_counter; the other
counters are plain ulint.
NOTE(review): the *_old variables, os_bytes_read_since_printout and
os_last_printout presumably hold the state recorded at the previous
statistics printout - confirm against the printing code, which is not
in this part of the file. */
Atomic_counter<ulint> os_n_file_reads;
static ulint	os_bytes_read_since_printout;
ulint	os_n_file_writes;
ulint	os_n_fsyncs;
static ulint	os_n_file_reads_old;
static ulint	os_n_file_writes_old;
static ulint	os_n_fsyncs_old;

static time_t	os_last_printout;
/** Set by os_file_handle_rename_error() when it reports a full disk,
so that it prints that message only while the flag is still false. */
bool	os_has_said_disk_full;

/** Default Zip compression level */
extern uint page_zip_level;
690 
691 /** Validates the consistency of the aio system.
692 @return true if ok */
693 static
694 bool
695 os_aio_validate();
696 
697 /** Handle errors for file operations.
698 @param[in]	name		name of a file or NULL
699 @param[in]	operation	operation
700 @param[in]	should_abort	whether to abort on an unknown error
701 @param[in]	on_error_silent	whether to suppress reports of non-fatal errors
702 @return true if we should retry the operation */
703 static MY_ATTRIBUTE((warn_unused_result))
704 bool
705 os_file_handle_error_cond_exit(
706 	const char*	name,
707 	const char*	operation,
708 	bool		should_abort,
709 	bool		on_error_silent);
710 
711 /** Does error handling when a file operation fails.
712 @param[in]	name		name of a file or NULL
713 @param[in]	operation	operation name that failed
714 @return true if we should retry the operation */
715 static
716 bool
os_file_handle_error(const char * name,const char * operation)717 os_file_handle_error(
718 	const char*	name,
719 	const char*	operation)
720 {
721 	/* Exit in case of unknown error */
722 	return(os_file_handle_error_cond_exit(name, operation, true, false));
723 }
724 
725 /** Does error handling when a file operation fails.
726 @param[in]	name		name of a file or NULL
727 @param[in]	operation	operation name that failed
728 @param[in]	on_error_silent	if true then don't print any message to the log.
729 @return true if we should retry the operation */
730 static
731 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)732 os_file_handle_error_no_exit(
733 	const char*	name,
734 	const char*	operation,
735 	bool		on_error_silent)
736 {
737 	/* Don't exit in case of unknown error */
738 	return(os_file_handle_error_cond_exit(
739 			name, operation, false, on_error_silent));
740 }
741 
742 /** Handle RENAME error.
743 @param name	old name of the file
744 @param new_name	new name of the file */
os_file_handle_rename_error(const char * name,const char * new_name)745 static void os_file_handle_rename_error(const char* name, const char* new_name)
746 {
747 	if (os_file_get_last_error(true) != OS_FILE_DISK_FULL) {
748 		ib::error() << "Cannot rename file '" << name << "' to '"
749 			<< new_name << "'";
750 	} else if (!os_has_said_disk_full) {
751 		os_has_said_disk_full = true;
752 		/* Disk full error is reported irrespective of the
753 		on_error_silent setting. */
754 		ib::error() << "Full disk prevents renaming file '"
755 			<< name << "' to '" << new_name << "'";
756 	}
757 }
758 
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[in]	type	IO context (NOTE(review): passed by pointer, and
			os_aio_windows_handler() documents its counterpart
			as an output - confirm the direction)
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
780 
781 #ifdef _WIN32
782 static HANDLE win_get_syncio_event();
783 
784 /**
785  Wrapper around Windows DeviceIoControl() function.
786 
787  Works synchronously, also in case for handle opened
788  for async access (i.e with FILE_FLAG_OVERLAPPED).
789 
790  Accepts the same parameters as DeviceIoControl(),except
791  last parameter (OVERLAPPED).
792 */
793 static
794 BOOL
os_win32_device_io_control(HANDLE handle,DWORD code,LPVOID inbuf,DWORD inbuf_size,LPVOID outbuf,DWORD outbuf_size,LPDWORD bytes_returned)795 os_win32_device_io_control(
796 	HANDLE handle,
797 	DWORD code,
798 	LPVOID inbuf,
799 	DWORD inbuf_size,
800 	LPVOID outbuf,
801 	DWORD outbuf_size,
802 	LPDWORD bytes_returned
803 )
804 {
805 	OVERLAPPED overlapped = { 0 };
806 	overlapped.hEvent = win_get_syncio_event();
807 	BOOL result = DeviceIoControl(handle, code, inbuf, inbuf_size, outbuf,
808 		outbuf_size,  NULL, &overlapped);
809 
810 	if (result || (GetLastError() == ERROR_IO_PENDING)) {
811 		/* Wait for async io to complete */
812 		result = GetOverlappedResult(handle, &overlapped, bytes_returned, TRUE);
813 	}
814 
815 	return result;
816 }
817 
818 #endif
819 
820 #ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
852 #endif /* WIN_ASYNC_IO */
853 
/** Generic AIO Handler methods. Currently handles IO post processing. */
class AIOHandler {
public:
	/** Do any post processing after a read/write
	@param[in]	slot	the reserved slot whose IO has been performed
	@return DB_SUCCESS if the total number of bytes transferred for the
	slot equals slot->original_len, else DB_FAIL */
	static dberr_t post_io_processing(Slot* slot);
};
861 
862 /** Helper class for doing synchronous file IO. Currently, the objective
863 is to hide the OS specific code, so that the higher level functions aren't
864 peppered with #ifdef. Makes the code flow difficult to follow.  */
865 class SyncFileIO {
866 public:
867 	/** Constructor
868 	@param[in]	fh	File handle
869 	@param[in,out]	buf	Buffer to read/write
870 	@param[in]	n	Number of bytes to read/write
871 	@param[in]	offset	Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)872 	SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
873 		:
874 		m_fh(fh),
875 		m_buf(buf),
876 		m_n(static_cast<ssize_t>(n)),
877 		m_offset(offset)
878 	{
879 		ut_ad(m_n > 0);
880 	}
881 
882 	/** Destructor */
~SyncFileIO()883 	~SyncFileIO()
884 	{
885 		/* No op */
886 	}
887 
888 	/** Do the read/write
889 	@param[in]	request	The IO context and type
890 	@return the number of bytes read/written or negative value on error */
891 	ssize_t execute(const IORequest& request);
892 
893 	/** Do the read/write
894 	@param[in,out]	slot	The IO slot, it has the IO context
895 	@return the number of bytes read/written or negative value on error */
896 	static ssize_t execute(Slot* slot);
897 
898 	/** Move the read/write offset up to where the partial IO succeeded.
899 	@param[in]	n_bytes	The number of bytes to advance */
advance(ssize_t n_bytes)900 	void advance(ssize_t n_bytes)
901 	{
902 		m_offset += n_bytes;
903 
904 		ut_ad(m_n >= n_bytes);
905 
906 		m_n -=  n_bytes;
907 
908 		m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
909 	}
910 
911 private:
912 	/** Open file handle */
913 	os_file_t		m_fh;
914 
915 	/** Buffer to read/write */
916 	void*			m_buf;
917 
918 	/** Number of bytes to read/write */
919 	ssize_t			m_n;
920 
921 	/** Offset from where to read/write */
922 	os_offset_t		m_offset;
923 };
924 
925 /** Do any post processing after a read/write
926 @return DB_SUCCESS or error code. */
927 dberr_t
post_io_processing(Slot * slot)928 AIOHandler::post_io_processing(Slot* slot)
929 {
930 	ut_ad(slot->is_reserved);
931 
932 	/* Total bytes read so far */
933 	ulint	n_bytes = ulint(slot->ptr - slot->buf) + slot->n_bytes;
934 
935 	return(n_bytes == slot->original_len ? DB_SUCCESS : DB_FAIL);
936 }
937 
938 /** Count the number of free slots
939 @return number of reserved slots */
940 ulint
pending_io_count() const941 AIO::pending_io_count() const
942 {
943 	acquire();
944 
945 #ifdef UNIV_DEBUG
946 	ut_a(m_n_segments > 0);
947 	ut_a(!m_slots.empty());
948 
949 	ulint	count = 0;
950 
951 	for (ulint i = 0; i < m_slots.size(); ++i) {
952 
953 		const Slot&	slot = m_slots[i];
954 
955 		if (slot.is_reserved) {
956 			++count;
957 			ut_a(slot.len > 0);
958 		}
959 	}
960 
961 	ut_a(m_n_reserved == count);
962 #endif /* UNIV_DEBUG */
963 
964 	ulint	reserved = m_n_reserved;
965 
966 	release();
967 
968 	return(reserved);
969 }
970 
971 #ifdef UNIV_DEBUG
972 /** Validates the consistency the aio system some of the time.
973 @return true if ok or the check was skipped */
974 static
975 bool
os_aio_validate_skip()976 os_aio_validate_skip()
977 {
978 /** Try os_aio_validate() every this many times */
979 # define OS_AIO_VALIDATE_SKIP	13
980 
981 	static Atomic_counter<uint32_t> os_aio_validate_count;
982 	return (os_aio_validate_count++ % OS_AIO_VALIDATE_SKIP) || os_aio_validate();
983 }
984 #endif /* UNIV_DEBUG */
985 
986 #undef USE_FILE_LOCK
987 #ifndef _WIN32
988 /* On Windows, mandatory locking is used */
989 # define USE_FILE_LOCK
990 #endif
991 #ifdef USE_FILE_LOCK
992 /** Obtain an exclusive lock on a file.
993 @param[in]	fd		file descriptor
994 @param[in]	name		file name
995 @return 0 on success */
996 static
997 int
os_file_lock(int fd,const char * name)998 os_file_lock(
999 	int		fd,
1000 	const char*	name)
1001 {
1002 	if (my_disable_locking) {
1003 		return 0;
1004 	}
1005 
1006 	struct flock lk;
1007 
1008 	lk.l_type = F_WRLCK;
1009 	lk.l_whence = SEEK_SET;
1010 	lk.l_start = lk.l_len = 0;
1011 
1012 	if (fcntl(fd, F_SETLK, &lk) == -1) {
1013 
1014 		ib::error()
1015 			<< "Unable to lock " << name
1016 			<< " error: " << errno;
1017 
1018 		if (errno == EAGAIN || errno == EACCES) {
1019 
1020 			ib::info()
1021 				<< "Check that you do not already have"
1022 				" another mysqld process using the"
1023 				" same InnoDB data or log files.";
1024 		}
1025 
1026 		return(-1);
1027 	}
1028 
1029 	return(0);
1030 }
1031 #endif /* USE_FILE_LOCK */
1032 
1033 /** Calculates local segment number and aio array from global segment number.
1034 @param[out]	array		aio wait array
1035 @param[in]	segment		global segment number
1036 @return local segment number within the aio array */
1037 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1038 AIO::get_array_and_local_segment(
1039 	AIO**		array,
1040 	ulint		segment)
1041 {
1042 	ulint		local_segment;
1043 	ulint		n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1044 
1045 	ut_a(segment < os_aio_n_segments);
1046 
1047 	if (!srv_read_only_mode && segment < n_extra_segs) {
1048 
1049 		/* We don't support ibuf/log IO during read only mode. */
1050 
1051 		if (segment == IO_IBUF_SEGMENT) {
1052 
1053 			*array = s_ibuf;
1054 
1055 		} else if (segment == IO_LOG_SEGMENT) {
1056 
1057 			*array = s_log;
1058 
1059 		} else {
1060 			*array = NULL;
1061 		}
1062 
1063 		local_segment = 0;
1064 
1065 	} else if (segment < s_reads->m_n_segments + n_extra_segs) {
1066 
1067 		*array = s_reads;
1068 		local_segment = segment - n_extra_segs;
1069 
1070 	} else {
1071 		*array = s_writes;
1072 
1073 		local_segment = segment
1074 			      - (s_reads->m_n_segments + n_extra_segs);
1075 	}
1076 
1077 	return(local_segment);
1078 }
1079 
/** Frees a slot in the aio array. Assumes caller owns the mutex.
Marks the slot as not reserved, decrements the reserved counter and
signals the m_not_full / m_is_empty events when the counter reaches the
corresponding threshold.
@param[in,out]	slot		Slot to release */
void
AIO::release(Slot* slot)
{
	ut_ad(is_mutex_owned());

	ut_ad(slot->is_reserved);

	slot->is_reserved = false;

	--m_n_reserved;

	/* The array has just gone from completely reserved to having
	exactly one free slot. */
	if (m_n_reserved == m_slots.size() - 1) {
		os_event_set(m_not_full);
	}

	/* No reserved slots remain. */
	if (m_n_reserved == 0) {
		os_event_set(m_is_empty);
	}

#if defined(LINUX_NATIVE_AIO)

	if (srv_use_native_aio) {
		/* Clear the native AIO control block and result fields
		of the slot. */
		memset(&slot->control, 0x0, sizeof(slot->control));
		slot->ret = 0;
		slot->n_bytes = 0;
	} else {
		/* These fields should not be used if we are not
		using native AIO. */
		ut_ad(slot->n_bytes == 0);
		ut_ad(slot->ret == 0);
	}

#endif /* LINUX_NATIVE_AIO */
}
1116 
1117 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1118 @param[in,out]	slot		Slot to release */
1119 void
release_with_mutex(Slot * slot)1120 AIO::release_with_mutex(Slot* slot)
1121 {
1122 	acquire();
1123 
1124 	release(slot);
1125 
1126 	release();
1127 }
1128 
/** Create a temporary file. This function is like tmpfile(3), but
the temporary file is created in the directory given by the mysql
server configuration parameter (--tmpdir).
The low-level handle obtained from innobase_mysql_tmpfile() is wrapped
in a stdio stream opened in "w+b" mode.
@return temporary file handle, or NULL on error */
FILE*
os_file_create_tmpfile()
{
	FILE*	file	= NULL;
	WAIT_ALLOW_WRITES();
	os_file_t	fd	= innobase_mysql_tmpfile(NULL);

	if (fd != OS_FILE_CLOSED) {
#ifdef _WIN32
		/* Convert the native handle to a C runtime descriptor
		before it can be given to fdopen().
		NOTE(review): if _open_osfhandle() fails, fd does not appear
		to be closed here - confirm whether that leaks the handle. */
		int crt_fd = _open_osfhandle((intptr_t)HANDLE(fd), 0);
		if (crt_fd != -1) {
			file = fdopen(crt_fd, "w+b");
			if (!file) {
				close(crt_fd);
			}
		}
#else
		file = fdopen(fd, "w+b");
		if (!file) {
			/* The stream could not be created, so the
			descriptor is closed here. */
			close(fd);
		}
#endif
	}

	if (file == NULL) {

		/* NOTE(review): errno is read after close() may have run,
		so it might not be the value set by the failing call. */
		ib::error()
			<< "Unable to create temporary file; errno: "
			<< errno;
	}

	return(file);
}
1166 
1167 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1168 NUL-terminate str. All errors are silently ignored. This function is
1169 mostly meant to be used with temporary files.
1170 @param[in,out]	file		File to read from
1171 @param[in,out]	str		Buffer where to read
1172 @param[in]	size		Size of buffer */
1173 void
os_file_read_string(FILE * file,char * str,ulint size)1174 os_file_read_string(
1175 	FILE*		file,
1176 	char*		str,
1177 	ulint		size)
1178 {
1179 	if (size != 0) {
1180 		rewind(file);
1181 
1182 		size_t	flen = fread(str, 1, size - 1, file);
1183 
1184 		str[flen] = '\0';
1185 	}
1186 }
1187 
1188 /** This function returns a new path name after replacing the basename
1189 in an old path with a new basename.  The old_path is a full path
1190 name including the extension.  The tablename is in the normal
1191 form "databasename/tablename".  The new base name is found after
1192 the forward slash.  Both input strings are null terminated.
1193 
1194 This function allocates memory to be returned.  It is the callers
1195 responsibility to free the return value after it is no longer needed.
1196 
1197 @param[in]	old_path		Pathname
1198 @param[in]	tablename		Contains new base name
1199 @return own: new full pathname */
1200 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1201 os_file_make_new_pathname(
1202 	const char*	old_path,
1203 	const char*	tablename)
1204 {
1205 	ulint		dir_len;
1206 	char*		last_slash;
1207 	char*		base_name;
1208 	char*		new_path;
1209 	ulint		new_path_len;
1210 
1211 	/* Split the tablename into its database and table name components.
1212 	They are separated by a '/'. */
1213 	last_slash = strrchr((char*) tablename, '/');
1214 	base_name = last_slash ? last_slash + 1 : (char*) tablename;
1215 
1216 	/* Find the offset of the last slash. We will strip off the
1217 	old basename.ibd which starts after that slash. */
1218 	last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1219 	dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path);
1220 
1221 	/* allocate a new path and move the old directory path to it. */
1222 	new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1223 	new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1224 	memcpy(new_path, old_path, dir_len);
1225 
1226 	snprintf(new_path + dir_len, new_path_len - dir_len,
1227 		 "%c%s.ibd", OS_PATH_SEPARATOR, base_name);
1228 
1229 	return(new_path);
1230 }
1231 
1232 /** This function reduces a null-terminated full remote path name into
1233 the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
1234 the 'databasename/tablename.ibd' found at the end of the path with just
1235 'tablename'.
1236 
1237 Since the result is always smaller than the path sent in, no new memory
1238 is allocated. The caller should allocate memory for the path sent in.
1239 This function manipulates that path in place.
1240 
1241 If the path format is not as expected, just return.  The result is used
1242 to inform a SHOW CREATE TABLE command.
1243 @param[in,out]	data_dir_path		Full path/data_dir_path */
1244 void
os_file_make_data_dir_path(char * data_dir_path)1245 os_file_make_data_dir_path(
1246 	char*	data_dir_path)
1247 {
1248 	/* Replace the period before the extension with a null byte. */
1249 	char*	ptr = strrchr((char*) data_dir_path, '.');
1250 
1251 	if (ptr == NULL) {
1252 		return;
1253 	}
1254 
1255 	ptr[0] = '\0';
1256 
1257 	/* The tablename starts after the last slash. */
1258 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1259 
1260 	if (ptr == NULL) {
1261 		return;
1262 	}
1263 
1264 	ptr[0] = '\0';
1265 
1266 	char*	tablename = ptr + 1;
1267 
1268 	/* The databasename starts after the next to last slash. */
1269 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1270 
1271 	if (ptr == NULL) {
1272 		return;
1273 	}
1274 
1275 	ulint	tablename_len = ut_strlen(tablename);
1276 
1277 	ut_memmove(++ptr, tablename, tablename_len);
1278 
1279 	ptr[tablename_len] = '\0';
1280 }
1281 
1282 /** Check if the path refers to the root of a drive using a pointer
1283 to the last directory separator that the caller has fixed.
1284 @param[in]	path	path name
1285 @param[in]	path	last directory separator in the path
1286 @return true if this path is a drive root, false if not */
1287 UNIV_INLINE
1288 bool
os_file_is_root(const char * path,const char * last_slash)1289 os_file_is_root(
1290 	const char*	path,
1291 	const char*	last_slash)
1292 {
1293 	return(
1294 #ifdef _WIN32
1295 	       (last_slash == path + 2 && path[1] == ':') ||
1296 #endif /* _WIN32 */
1297 	       last_slash == path);
1298 }
1299 
/** Return the parent directory component of a null-terminated path.
Return a new buffer containing the string up to, but not including,
the final component of the path.
The path returned will not contain a trailing separator.
Do not return a root path, return NULL instead.
The final component trimmed off may be a filename or a directory name.
If the final component is the only component of the path, return NULL.
It is the caller's responsibility to free the returned string after it
is no longer needed.
Only OS_PATH_SEPARATOR is recognised as a separator.
@param[in]	path		Path name
@return own: parent directory of the path */
static
char*
os_file_get_parent_dir(
	const char*	path)
{
	bool	has_trailing_slash = false;

	/* Find the offset of the last slash */
	const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);

	if (!last_slash) {
		/* No slash in the path, return NULL */
		return(NULL);
	}

	/* Ok, there is a slash. Is there anything after it? */
	if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
		has_trailing_slash = true;
	}

	/* Reduce repetitive slashes: move to the first separator of a
	run of consecutive separators. */
	while (last_slash > path
		&& last_slash[-1] == OS_PATH_SEPARATOR) {
		last_slash--;
	}

	/* Check for the root of a drive. */
	if (os_file_is_root(path, last_slash)) {
		return(NULL);
	}

	/* If a trailing slash prevented the first strrchr() from trimming
	the last component of the path, trim that component now. */
	if (has_trailing_slash) {
		/* Back up to the previous slash. */
		last_slash--;
		while (last_slash > path
		       && last_slash[0] != OS_PATH_SEPARATOR) {
			last_slash--;
		}

		/* Reduce repetitive slashes. */
		while (last_slash > path
			&& last_slash[-1] == OS_PATH_SEPARATOR) {
			last_slash--;
		}
	}

	/* Check for the root of a drive. This also covers the case where
	backing up reached the start of the path without finding another
	separator, i.e. the path had a single component. */
	if (os_file_is_root(path, last_slash)) {
		return(NULL);
	}

	if (last_slash - path < 0) {
		/* Sanity check, it prevents gcc from trying to handle this case which
		 * results in warnings for some optimized builds */
		return (NULL);
	}

	/* Non-trivial directory component */

	return(mem_strdupl(path, ulint(last_slash - path)));
}
1374 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1375 
1376 /* Test the function os_file_get_parent_dir. */
1377 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1378 test_os_file_get_parent_dir(
1379 	const char*	child_dir,
1380 	const char*	expected_dir)
1381 {
1382 	char* child = mem_strdup(child_dir);
1383 	char* expected = expected_dir == NULL ? NULL
1384 			 : mem_strdup(expected_dir);
1385 
1386 	/* os_file_get_parent_dir() assumes that separators are
1387 	converted to OS_PATH_SEPARATOR. */
1388 	os_normalize_path(child);
1389 	os_normalize_path(expected);
1390 
1391 	char* parent = os_file_get_parent_dir(child);
1392 
1393 	bool unexpected = (expected == NULL
1394 			  ? (parent != NULL)
1395 			  : (0 != strcmp(parent, expected)));
1396 	if (unexpected) {
1397 		ib::fatal() << "os_file_get_parent_dir('" << child
1398 			<< "') returned '" << parent
1399 			<< "', instead of '" << expected << "'.";
1400 	}
1401 	ut_free(parent);
1402 	ut_free(child);
1403 	ut_free(expected);
1404 }
1405 
1406 /* Test the function os_file_get_parent_dir. */
1407 void
unit_test_os_file_get_parent_dir()1408 unit_test_os_file_get_parent_dir()
1409 {
1410 	test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1411 	test_os_file_get_parent_dir("/usr/", NULL);
1412 	test_os_file_get_parent_dir("//usr//", NULL);
1413 	test_os_file_get_parent_dir("usr", NULL);
1414 	test_os_file_get_parent_dir("usr//", NULL);
1415 	test_os_file_get_parent_dir("/", NULL);
1416 	test_os_file_get_parent_dir("//", NULL);
1417 	test_os_file_get_parent_dir(".", NULL);
1418 	test_os_file_get_parent_dir("..", NULL);
1419 # ifdef _WIN32
1420 	test_os_file_get_parent_dir("D:", NULL);
1421 	test_os_file_get_parent_dir("D:/", NULL);
1422 	test_os_file_get_parent_dir("D:\\", NULL);
1423 	test_os_file_get_parent_dir("D:/data", NULL);
1424 	test_os_file_get_parent_dir("D:/data/", NULL);
1425 	test_os_file_get_parent_dir("D:\\data\\", NULL);
1426 	test_os_file_get_parent_dir("D:///data/////", NULL);
1427 	test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
1428 	test_os_file_get_parent_dir("D:/data//a", "D:/data");
1429 	test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
1430 	test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
1431 	test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
1432 #endif  /* _WIN32 */
1433 }
1434 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
1435 
1436 
1437 /** Creates all missing subdirectories along the given path.
1438 @param[in]	path		Path name
1439 @return DB_SUCCESS if OK, otherwise error code. */
1440 dberr_t
os_file_create_subdirs_if_needed(const char * path)1441 os_file_create_subdirs_if_needed(
1442 	const char*	path)
1443 {
1444 	if (srv_read_only_mode) {
1445 
1446 		ib::error()
1447 			<< "read only mode set. Can't create "
1448 			<< "subdirectories '" << path << "'";
1449 
1450 		return(DB_READ_ONLY);
1451 
1452 	}
1453 
1454 	char*	subdir = os_file_get_parent_dir(path);
1455 
1456 	if (subdir == NULL) {
1457 		/* subdir is root or cwd, nothing to do */
1458 		return(DB_SUCCESS);
1459 	}
1460 
1461 	/* Test if subdir exists */
1462 	os_file_type_t	type;
1463 	bool	subdir_exists;
1464 	bool	success = os_file_status(subdir, &subdir_exists, &type);
1465 
1466 	if (success && !subdir_exists) {
1467 
1468 		/* Subdir does not exist, create it */
1469 		dberr_t	err = os_file_create_subdirs_if_needed(subdir);
1470 
1471 		if (err != DB_SUCCESS) {
1472 
1473 			ut_free(subdir);
1474 
1475 			return(err);
1476 		}
1477 
1478 		success = os_file_create_directory(subdir, false);
1479 	}
1480 
1481 	ut_free(subdir);
1482 
1483 	return(success ? DB_SUCCESS : DB_ERROR);
1484 }
1485 
1486 #ifndef _WIN32
1487 
1488 /** Do the read/write
1489 @param[in]	request	The IO context and type
1490 @return the number of bytes read/written or negative value on error */
1491 ssize_t
execute(const IORequest & request)1492 SyncFileIO::execute(const IORequest& request)
1493 {
1494 	ssize_t	n_bytes;
1495 
1496 	if (request.is_read()) {
1497 		n_bytes = pread(m_fh, m_buf, m_n, m_offset);
1498 	} else {
1499 		ut_ad(request.is_write());
1500 		n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
1501 	}
1502 
1503 	return(n_bytes);
1504 }
1505 /** Free storage space associated with a section of the file.
1506 @param[in]	fh		Open file handle
1507 @param[in]	off		Starting offset (SEEK_SET)
1508 @param[in]	len		Size of the hole
1509 @return DB_SUCCESS or error code */
1510 static
1511 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)1512 os_file_punch_hole_posix(
1513 	os_file_t	fh,
1514 	os_offset_t	off,
1515 	os_offset_t	len)
1516 {
1517 
1518 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
1519 	const int	mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
1520 
1521 	int		ret = fallocate(fh, mode, off, len);
1522 
1523 	if (ret == 0) {
1524 		return(DB_SUCCESS);
1525 	}
1526 
1527 	if (errno == ENOTSUP) {
1528 		return(DB_IO_NO_PUNCH_HOLE);
1529 	}
1530 
1531 	ib::warn()
1532 		<< "fallocate("
1533 		<<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
1534 		<< off << ", " << len << ") returned errno: "
1535 		<<  errno;
1536 
1537 	return(DB_IO_ERROR);
1538 
1539 #elif defined(UNIV_SOLARIS)
1540 
1541 	// Use F_FREESP
1542 
1543 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
1544 
1545 	return(DB_IO_NO_PUNCH_HOLE);
1546 }
1547 
1548 #if defined(LINUX_NATIVE_AIO)
1549 
/** Linux native AIO handler.
Bound to one global segment; on construction it resolves the AIO array
and the local segment within it that the global segment maps to. */
class LinuxAIOHandler {
public:
	/**
	@param[in] global_segment	The global segment*/
	LinuxAIOHandler(ulint global_segment)
		:
		m_global_segment(global_segment)
	{
		/* Should never be doing Sync IO here. */
		ut_a(m_global_segment != ULINT_UNDEFINED);

		/* Find the array and the local segment. */

		m_segment = AIO::get_array_and_local_segment(
			&m_array, m_global_segment);

		/* m_array must have been resolved above before it is
		dereferenced here. */
		m_n_slots = m_array->slots_per_segment();
	}

	/** Destructor */
	~LinuxAIOHandler()
	{
		// No op
	}

	/**
	Process a Linux AIO request
	@param[out]	m1		the messages passed with the
	@param[out]	m2		AIO request; note that in case the
					AIO operation failed, these output
					parameters are valid and can be used to
					restart the operation.
	@param[out]	request		IO context
	@return DB_SUCCESS or error code */
	dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);

private:
	/** Resubmit an IO request that was only partially successful
	@param[in,out]	slot		Request to resubmit
	@return DB_SUCCESS or DB_IO_PARTIAL_FAILED if the IO resubmit
	request failed */
	dberr_t	resubmit(Slot* slot);

	/** Check if the AIO succeeded
	@param[in,out]	slot		The slot to check
	@return DB_SUCCESS, DB_FAIL if the operation should be retried or
		DB_IO_ERROR on all other errors */
	dberr_t	check_state(Slot* slot);

	/** @return true if a shutdown was detected */
	bool is_shutdown() const
	{
		return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
		       && !buf_page_cleaner_is_active);
	}

	/** If no slot was found then the m_array->m_mutex will be released.
	@param[out]	n_pending	The number of pending IOs
	@return NULL or a slot that has completed IO */
	Slot* find_completed_slot(ulint* n_pending);

	/** This is called from within the IO-thread. If there are no completed
	IO requests in the slot array, the thread calls this function to
	collect more requests from the Linux kernel.
	The IO-thread waits on io_getevents(), which is a blocking call, with
	a timeout value. Unless the system is very heavily loaded, keeping the
	IO-thread very busy, the IO-thread will spend most of its time waiting
	in this function.
	The IO-thread also exits in this function. It checks server status at
	each wakeup and that is why we use timed wait in io_getevents(). */
	void collect();

private:
	/** Slot array */
	AIO*			m_array;

	/** Number of slots in the local segment */
	ulint			m_n_slots;

	/** The local segment to check */
	ulint			m_segment;

	/** The global segment */
	ulint			m_global_segment;
};
1635 
/** Resubmit an IO request that was only partially successful.
The slot is advanced past the bytes already transferred and a new
request for the remainder is prepared and passed to io_submit().
@param[in,out]	slot		Request to resubmit
@return DB_SUCCESS or DB_IO_PARTIAL_FAILED if the IO resubmit request
failed (errno is then set from the io_submit() return value) */
dberr_t
LinuxAIOHandler::resubmit(Slot* slot)
{
#ifdef UNIV_DEBUG
	/* Bytes already read/written out */
	ulint	n_bytes = slot->ptr - slot->buf;

	ut_ad(m_array->is_mutex_owned());

	ut_ad(n_bytes < slot->original_len);
	ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
	/* Partial read or write scenario */
	ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
#endif /* UNIV_DEBUG */

	/* Step over the part that the last operation completed. */
	slot->len -= slot->n_bytes;
	slot->ptr += slot->n_bytes;
	slot->offset += slot->n_bytes;

	/* Resetting the bytes read/written */
	slot->n_bytes = 0;
	slot->io_already_done = false;

	compile_time_assert(sizeof(off_t) >= sizeof(os_offset_t));

	struct iocb*	iocb = &slot->control;

	if (slot->type.is_read()) {

		io_prep_pread(
			iocb,
			slot->file,
			slot->ptr,
			slot->len,
			slot->offset);
	} else {

		ut_a(slot->type.is_write());

		io_prep_pwrite(
			iocb,
			slot->file,
			slot->ptr,
			slot->len,
			slot->offset);
	}

	/* Store the slot in the control block so that it can be
	recovered from the completion event. */
	iocb->data = slot;

	ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
	     == 0);

	/* Resubmit an I/O request */
	int	ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
	ut_a(ret != -EINVAL);

	/* A negative return value is a negated errno. */
	if (ret < 0)  {
		errno = -ret;
	}

	return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
}
1701 
1702 /** Check if the AIO succeeded
1703 @param[in,out]	slot		The slot to check
1704 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
1705 	DB_IO_ERROR on all other errors */
1706 dberr_t
check_state(Slot * slot)1707 LinuxAIOHandler::check_state(Slot* slot)
1708 {
1709 	ut_ad(m_array->is_mutex_owned());
1710 
1711 	/* Note that it may be that there is more then one completed
1712 	IO requests. We process them one at a time. We may have a case
1713 	here to improve the performance slightly by dealing with all
1714 	requests in one sweep. */
1715 
1716 	srv_set_io_thread_op_info(
1717 		m_global_segment, "processing completed aio requests");
1718 
1719 	ut_ad(slot->io_already_done);
1720 
1721 	dberr_t	err = DB_SUCCESS;
1722 
1723 	if (slot->ret == 0) {
1724 
1725 		err = AIOHandler::post_io_processing(slot);
1726 
1727 	} else {
1728 		errno = -slot->ret;
1729 
1730 		/* os_file_handle_error does tell us if we should retry
1731 		this IO. As it stands now, we don't do this retry when
1732 		reaping requests from a different context than
1733 		the dispatcher. This non-retry logic is the same for
1734 		Windows and Linux native AIO.
1735 		We should probably look into this to transparently
1736 		re-submit the IO. */
1737 		os_file_handle_error(slot->name, "Linux aio");
1738 
1739 		err = DB_IO_ERROR;
1740 	}
1741 
1742 	return(err);
1743 }
1744 
1745 /** If no slot was found then the m_array->m_mutex will be released.
1746 @param[out]	n_pending		The number of pending IOs
1747 @return NULL or a slot that has completed IO */
1748 Slot*
find_completed_slot(ulint * n_pending)1749 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
1750 {
1751 	ulint	offset = m_n_slots * m_segment;
1752 
1753 	*n_pending = 0;
1754 
1755 	m_array->acquire();
1756 
1757 	Slot*	slot = m_array->at(offset);
1758 
1759 	for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
1760 
1761 		if (slot->is_reserved) {
1762 
1763 			++*n_pending;
1764 
1765 			if (slot->io_already_done) {
1766 
1767 				/* Something for us to work on.
1768 				Note: We don't release the mutex. */
1769 				return(slot);
1770 			}
1771 		}
1772 	}
1773 
1774 	m_array->release();
1775 
1776 	return(NULL);
1777 }
1778 
/** This function is only used in Linux native asynchronous i/o. This is
called from within the io-thread. If there are no completed IO requests
in the slot array, the thread calls this function to collect more
requests from the kernel.
The io-thread waits on io_getevents(), which is a blocking call, with
a timeout value. Unless the system is very heavily loaded, keeping the
io-thread very busy, the io-thread will spend most of its time waiting
in this function.
The io-thread also exits in this function. It checks server status at
each wakeup and that is why we use timed wait in io_getevents(). */
void
LinuxAIOHandler::collect()
{
	ut_ad(m_n_slots > 0);
	ut_ad(m_array != NULL);
	ut_ad(m_segment < m_array->get_n_segments());

	/* Which io_context_t we are going to use. */
	io_context_t	io_ctx = m_array->io_ctx(m_segment);

	/* Starting point of the m_segment we will be working on. */
	ulint	start_pos = m_segment * m_n_slots;

	/* End point. */
	ulint	end_pos = start_pos + m_n_slots;

	for (;;) {
		struct io_event*	events;

		/* Which part of event array we are going to work on. */
		events = m_array->io_events(m_segment * m_n_slots);

		/* Initialize the events. */
		memset(events, 0, sizeof(*events) * m_n_slots);

		/* The timeout value is arbitrary. We probably need
		to experiment with it a little. */
		struct timespec		timeout;

		timeout.tv_sec = 0;
		timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;

		int	ret;

		/* Ask for at least 1 and at most m_n_slots events. */
		ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
		ut_a(ret != -EINVAL);
		ut_ad(ret != -EFAULT);

		/* Not entered when ret is zero or a negative error code. */
		for (int i = 0; i < ret; ++i) {

			struct iocb*	iocb;

			iocb = reinterpret_cast<struct iocb*>(events[i].obj);
			ut_a(iocb != NULL);

			/* The slot pointer was stored in the control block
			when the request was prepared. */
			Slot*	slot = reinterpret_cast<Slot*>(iocb->data);

			/* Some sanity checks. */
			ut_a(slot != NULL);
			ut_a(slot->is_reserved);

			/* We are not scribbling previous segment. */
			ut_a(slot->pos >= start_pos);

			/* We have not overstepped to next segment. */
			ut_a(slot->pos < end_pos);

			/* Deallocate unused blocks from file system.
			This is never done to page 0 or to log files.
			Note that this runs before the array mutex is
			acquired below. */
			if (slot->offset > 0
			    && !slot->type.is_log()
			    && slot->type.is_write()
			    && slot->type.punch_hole()) {

				slot->err = slot->type.punch_hole(
					slot->file,
					slot->offset, slot->len);
			} else {
				slot->err = DB_SUCCESS;
			}

			/* Mark this request as completed. The error handling
			will be done in the calling function. */
			m_array->acquire();

			/* events[i].res2 should always be ZERO */
			ut_ad(events[i].res2 == 0);
			slot->io_already_done = true;

			/*Even though events[i].res is an unsigned number
			in libaio, it is used to return a negative value
			(negated errno value) to indicate error and a positive
			value to indicate number of bytes read or written. */

			if (events[i].res > slot->len) {
				/* failure */
				slot->n_bytes = 0;
				slot->ret = events[i].res;
			} else {
				/* success */
				slot->n_bytes = events[i].res;
				slot->ret = 0;
			}
			m_array->release();
		}

		/* Stop once threads are asked to exit, the page cleaner is
		no longer active, or at least one event was collected. */
		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
		    || !buf_page_cleaner_is_active
		    || ret > 0) {

			break;
		}

		/* This error handling is for any error in collecting the
		IO requests. The errors, if any, for any particular IO
		request are simply passed on to the calling routine. */

		switch (ret) {
		case -EAGAIN:
			/* Not enough resources! Try again. */
			/* fall through */

		case -EINTR:
			/* Interrupted! The behaviour in case of an interrupt.
			If we have some completed IOs available then the
			return code will be the number of IOs. We get EINTR
			only if there are no completed IOs and we have been
			interrupted. */
			/* fall through */

		case 0:
			/* No pending request! Go back and check again. */

			continue;
		}

		/* All other errors should cause a trap for now. */
		ib::fatal()
			<< "Unexpected ret_code[" << ret
			<< "] from io_getevents()!";

		break;
	}
}
1921 
/** Process a Linux AIO request.
Loops until a completed slot of this segment has been handled, waiting in
collect() when none is available. A request that was only partially
transferred is resubmitted for the remaining bytes. The slot is released
before returning.
@param[out]	m1		the messages passed with the
@param[out]	m2		AIO request; note that in case the
				AIO operation failed, these output
				parameters are valid and can be used to
				restart the operation.
@param[out]	request		IO context
@return DB_SUCCESS or error code */
dberr_t
LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
{
	dberr_t		err = DB_SUCCESS;
	Slot*		slot;

	/* Loop until we have found a completed request. */
	for (;;) {

		ulint	n_pending;

		/* On success this returns with the array mutex held. */
		slot = find_completed_slot(&n_pending);

		if (slot != NULL) {

			ut_ad(m_array->is_mutex_owned());

			err = check_state(slot);

			/* DB_FAIL is not a hard error, we should retry */
			if (err != DB_FAIL) {
				/* Leave the loop with the mutex still held. */
				break;
			}

			/* Partial IO, resubmit request for
			remaining bytes to read/write */
			err = resubmit(slot);

			if (err != DB_SUCCESS) {
				/* Leave the loop with the mutex still held. */
				break;
			}

			/* The request is in flight again: drop the mutex
			and look for a completed slot once more. */
			m_array->release();

		} else if (is_shutdown() && n_pending == 0) {

			/* There is no completed request. If there is
			no pending request at all, and the system is
			being shut down, exit. */

			*m1 = NULL;
			*m2 = NULL;

			return(DB_SUCCESS);

		} else {

			/* Wait for some request. Note that we return
			from wait if we have found a request. */

			srv_set_io_thread_op_info(
				m_global_segment,
				"waiting for completed aio requests");

			collect();
		}
	}

	if (err == DB_IO_PARTIAL_FAILED) {
		/* Aborting in case of submit failure */
		ib::fatal()
			<< "Native Linux AIO interface. "
			"io_submit() call failed when "
			"resubmitting a partial I/O "
			"request on the file " << slot->name
			<< ".";
	}

	/* Copy the results out before the slot is released. */
	*m1 = slot->m1;
	*m2 = slot->m2;

	*request = slot->type;

	/* Free the slot (the mutex is held), then release the mutex. */
	m_array->release(slot);

	m_array->release();

	return(err);
}
2009 
2010 /** This function is only used in Linux native asynchronous i/o.
2011 Waits for an aio operation to complete. This function is used to wait for
2012 the completed requests. The aio array of pending requests is divided
2013 into segments. The thread specifies which segment or slot it wants to wait
2014 for. NOTE: this function will also take care of freeing the aio slot,
2015 therefore no other thread is allowed to do the freeing!
2016 
2017 @param[in]	global_seg	segment number in the aio array
2018 				to wait for; segment 0 is the ibuf
2019 				i/o thread, segment 1 is log i/o thread,
2020 				then follow the non-ibuf read threads,
2021 				and the last are the non-ibuf write
2022 				threads.
2023 @param[out]	m1		the messages passed with the
2024 @param[out]	m2			AIO request; note that in case the
2025 				AIO operation failed, these output
2026 				parameters are valid and can be used to
2027 				restart the operation.
2028 @param[out]xi	 request	IO context
2029 @return DB_SUCCESS if the IO was successful */
2030 static
2031 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2032 os_aio_linux_handler(
2033 	ulint		global_segment,
2034 	fil_node_t**	m1,
2035 	void**		m2,
2036 	IORequest*	request)
2037 {
2038 	return LinuxAIOHandler(global_segment).poll(m1, m2, request);
2039 }
2040 
2041 /** Dispatch an AIO request to the kernel.
2042 @param[in,out]	slot		an already reserved slot
2043 @return true on success. */
2044 bool
linux_dispatch(Slot * slot)2045 AIO::linux_dispatch(Slot* slot)
2046 {
2047 	ut_a(slot->is_reserved);
2048 	ut_ad(slot->type.validate());
2049 
2050 	/* Find out what we are going to work with.
2051 	The iocb struct is directly in the slot.
2052 	The io_context_t is one per segment. */
2053 
2054 	ulint		io_ctx_index;
2055 	struct iocb*	iocb = &slot->control;
2056 
2057 	io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2058 
2059 	ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
2060 	     == 0);
2061 
2062 	int	ret = io_submit(io_ctx(io_ctx_index), 1, &iocb);
2063 	ut_a(ret != -EINVAL);
2064 
2065 	/* io_submit() returns number of successfully queued requests
2066 	or -errno. */
2067 
2068 	if (ret != 1) {
2069 		errno = -ret;
2070 	}
2071 
2072 	return(ret == 1);
2073 }
2074 
2075 /** Creates an io_context_t for native linux AIO.
2076 @param[in]	max_events	number of events
2077 @param[out]	io_ctx		io_ctx to initialize.
2078 @return true on success. */
2079 bool
linux_create_io_ctx(unsigned max_events,io_context_t & io_ctx)2080 AIO::linux_create_io_ctx(
2081 	unsigned	max_events,
2082 	io_context_t&	io_ctx)
2083 {
2084 	ssize_t		n_retries = 0;
2085 
2086 	for (;;) {
2087 
2088 		memset(&io_ctx, 0x0, sizeof(io_ctx));
2089 
2090 		/* Initialize the io_ctx. Tell it how many pending
2091 		IO requests this context will handle. */
2092 
2093 		int	ret = io_setup(max_events, &io_ctx);
2094 		ut_a(ret != -EINVAL);
2095 
2096 		if (ret == 0) {
2097 			/* Success. Return now. */
2098 			return(true);
2099 		}
2100 
2101 		/* If we hit EAGAIN we'll make a few attempts before failing. */
2102 
2103 		switch (ret) {
2104 		case -EAGAIN:
2105 			if (n_retries == 0) {
2106 				/* First time around. */
2107 				ib::warn()
2108 					<< "io_setup() failed with EAGAIN."
2109 					" Will make "
2110 					<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2111 					<< " attempts before giving up.";
2112 			}
2113 
2114 			if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2115 
2116 				++n_retries;
2117 
2118 				ib::warn()
2119 					<< "io_setup() attempt "
2120 					<< n_retries << ".";
2121 
2122 				os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2123 
2124 				continue;
2125 			}
2126 
2127 			/* Have tried enough. Better call it a day. */
2128 			ib::warn()
2129 				<< "io_setup() failed with EAGAIN after "
2130 				<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2131 				<< " attempts.";
2132 			break;
2133 
2134 		case -ENOSYS:
2135 			ib::warn()
2136 				<< "Linux Native AIO interface"
2137 				" is not supported on this platform. Please"
2138 				" check your OS documentation and install"
2139 				" appropriate binary of InnoDB.";
2140 
2141 			break;
2142 
2143 		default:
2144 			ib::warn()
2145 				<< "Linux Native AIO setup"
2146 				<< " returned following error["
2147 				<< ret << "]";
2148 			break;
2149 		}
2150 
2151 		ib::info()
2152 			<< "You can disable Linux Native AIO by"
2153 			" setting innodb_use_native_aio = 0 in my.cnf";
2154 
2155 		break;
2156 	}
2157 
2158 	return(false);
2159 }
2160 
2161 /** Checks if the system supports native linux aio. On some kernel
2162 versions where native aio is supported it won't work on tmpfs. In such
2163 cases we can't use native aio as it is not possible to mix simulated
2164 and native aio.
2165 @return: true if supported, false otherwise. */
2166 bool
is_linux_native_aio_supported()2167 AIO::is_linux_native_aio_supported()
2168 {
2169 	int		fd;
2170 	io_context_t	io_ctx;
2171 	char		name[1000];
2172 
2173 	if (!linux_create_io_ctx(1, io_ctx)) {
2174 
2175 		/* The platform does not support native aio. */
2176 
2177 		return(false);
2178 
2179 	} else if (!srv_read_only_mode) {
2180 
2181 		/* Now check if tmpdir supports native aio ops. */
2182 		fd = innobase_mysql_tmpfile(NULL);
2183 
2184 		if (fd < 0) {
2185 			ib::warn()
2186 				<< "Unable to create temp file to check"
2187 				" native AIO support.";
2188 
2189 			int ret = io_destroy(io_ctx);
2190 			ut_a(ret != -EINVAL);
2191 			ut_ad(ret != -EFAULT);
2192 
2193 			return(false);
2194 		}
2195 	} else {
2196 
2197 		os_normalize_path(srv_log_group_home_dir);
2198 
2199 		ulint	dirnamelen = strlen(srv_log_group_home_dir);
2200 
2201 		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2202 
2203 		memcpy(name, srv_log_group_home_dir, dirnamelen);
2204 
2205 		/* Add a path separator if needed. */
2206 		if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2207 
2208 			name[dirnamelen++] = OS_PATH_SEPARATOR;
2209 		}
2210 
2211 		strcpy(name + dirnamelen, "ib_logfile0");
2212 
2213 		fd = open(name, O_RDONLY | O_CLOEXEC);
2214 
2215 		if (fd == -1) {
2216 
2217 			ib::warn()
2218 				<< "Unable to open"
2219 				<< " \"" << name << "\" to check native"
2220 				<< " AIO read support.";
2221 
2222 			int ret = io_destroy(io_ctx);
2223 			ut_a(ret != EINVAL);
2224 			ut_ad(ret != EFAULT);
2225 
2226 			return(false);
2227 		}
2228 	}
2229 
2230 	struct io_event	io_event;
2231 
2232 	memset(&io_event, 0x0, sizeof(io_event));
2233 
2234 	byte*	buf = static_cast<byte*>(ut_malloc_nokey(srv_page_size * 2));
2235 	byte*	ptr = static_cast<byte*>(ut_align(buf, srv_page_size));
2236 
2237 	struct iocb	iocb;
2238 
2239 	/* Suppress valgrind warning. */
2240 	memset(buf, 0x00, srv_page_size * 2);
2241 	memset(&iocb, 0x0, sizeof(iocb));
2242 
2243 	struct iocb*	p_iocb = &iocb;
2244 
2245 	if (!srv_read_only_mode) {
2246 
2247 		io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
2248 
2249 	} else {
2250 		ut_a(srv_page_size >= 4096);
2251 		io_prep_pread(p_iocb, fd, ptr, srv_page_size, 0);
2252 	}
2253 
2254 	ut_a(reinterpret_cast<size_t>(p_iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
2255 	     == 0);
2256 	int	err = io_submit(io_ctx, 1, &p_iocb);
2257 	ut_a(err != -EINVAL);
2258 
2259 	if (err >= 1) {
2260 		/* Now collect the submitted IO request. */
2261 		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2262 		ut_a(err != -EINVAL);
2263 	}
2264 
2265 	ut_free(buf);
2266 	close(fd);
2267 
2268 	switch (err) {
2269 	case 1:
2270 		{
2271 			int ret = io_destroy(io_ctx);
2272 			ut_a(ret != -EINVAL);
2273 			ut_ad(ret != -EFAULT);
2274 
2275 			return(true);
2276 		}
2277 
2278 	case -EINVAL:
2279 	case -ENOSYS:
2280 		ib::error()
2281 			<< "Linux Native AIO not supported. You can either"
2282 			" move "
2283 			<< (srv_read_only_mode ? name : "tmpdir")
2284 			<< " to a file system that supports native"
2285 			" AIO or you can set innodb_use_native_aio to"
2286 			" FALSE to avoid this message.";
2287 
2288 		/* fall through. */
2289 	default:
2290 		ib::error()
2291 			<< "Linux Native AIO check on "
2292 			<< (srv_read_only_mode ? name : "tmpdir")
2293 			<< "returned error[" << -err << "]";
2294 	}
2295 
2296 	int ret = io_destroy(io_ctx);
2297 	ut_a(ret != -EINVAL);
2298 	ut_ad(ret != -EFAULT);
2299 
2300 	return(false);
2301 }
2302 
2303 #endif /* LINUX_NATIVE_AIO */
2304 
2305 /** Retrieves the last error number if an error occurs in a file io function.
2306 The number should be retrieved before any other OS calls (because they may
2307 overwrite the error number). If the number is not known to this program,
2308 the OS error number + OS_FILE_ERROR_MAX is returned.
2309 @param[in]	report_all_errors	true if we want an error message
2310 					printed of all errors
2311 @param[in]	on_error_silent		true then don't print any diagnostic
2312 					to the log
2313 @return error number, or OS error number + OS_FILE_ERROR_MAX */
2314 static
2315 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2316 os_file_get_last_error_low(
2317 	bool	report_all_errors,
2318 	bool	on_error_silent)
2319 {
2320 	int	err = errno;
2321 
2322 	if (err == 0) {
2323 		return(0);
2324 	}
2325 
2326 	if (report_all_errors
2327 	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
2328 
2329 		ib::error()
2330 			<< "Operating system error number "
2331 			<< err
2332 			<< " in a file operation.";
2333 
2334 		if (err == ENOENT) {
2335 
2336 			ib::error()
2337 				<< "The error means the system"
2338 				" cannot find the path specified.";
2339 
2340 			if (srv_is_being_started) {
2341 
2342 				ib::error()
2343 					<< "If you are installing InnoDB,"
2344 					" remember that you must create"
2345 					" directories yourself, InnoDB"
2346 					" does not create them.";
2347 			}
2348 		} else if (err == EACCES) {
2349 
2350 			ib::error()
2351 				<< "The error means mysqld does not have"
2352 				" the access rights to the directory.";
2353 
2354 		} else {
2355 			if (strerror(err) != NULL) {
2356 
2357 				ib::error()
2358 					<< "Error number " << err << " means '"
2359 					<< strerror(err) << "'";
2360 			}
2361 
2362 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
2363 		}
2364 	}
2365 
2366 	switch (err) {
2367 	case ENOSPC:
2368 		return(OS_FILE_DISK_FULL);
2369 	case ENOENT:
2370 		return(OS_FILE_NOT_FOUND);
2371 	case EEXIST:
2372 		return(OS_FILE_ALREADY_EXISTS);
2373 	case EXDEV:
2374 	case ENOTDIR:
2375 	case EISDIR:
2376 		return(OS_FILE_PATH_ERROR);
2377 	case EAGAIN:
2378 		if (srv_use_native_aio) {
2379 			return(OS_FILE_AIO_RESOURCES_RESERVED);
2380 		}
2381 		break;
2382 	case EINTR:
2383 		if (srv_use_native_aio) {
2384 			return(OS_FILE_AIO_INTERRUPTED);
2385 		}
2386 		break;
2387 	case EACCES:
2388 		return(OS_FILE_ACCESS_VIOLATION);
2389 	}
2390 	return(OS_FILE_ERROR_MAX + err);
2391 }
2392 
2393 /** Wrapper to fsync(2) that retries the call on some errors.
2394 Returns the value 0 if successful; otherwise the value -1 is returned and
2395 the global variable errno is set to indicate the error.
2396 @param[in]	file		open file handle
2397 @return 0 if success, -1 otherwise */
2398 static
2399 int
os_file_fsync_posix(os_file_t file)2400 os_file_fsync_posix(
2401 	os_file_t	file)
2402 {
2403 	ulint		failures = 0;
2404 
2405 	for (;;) {
2406 
2407 		++os_n_fsyncs;
2408 
2409 		int	ret = fsync(file);
2410 
2411 		if (ret == 0) {
2412 			return(ret);
2413 		}
2414 
2415 		switch(errno) {
2416 		case ENOLCK:
2417 
2418 			++failures;
2419 			ut_a(failures < 1000);
2420 
2421 			if (!(failures % 100)) {
2422 
2423 				ib::warn()
2424 					<< "fsync(): "
2425 					<< "No locks available; retrying";
2426 			}
2427 
2428 			/* 0.2 sec */
2429 			os_thread_sleep(200000);
2430 			break;
2431 
2432 		case EINTR:
2433 
2434 			++failures;
2435 			ut_a(failures < 2000);
2436 			break;
2437 
2438 		default:
2439 			ib::fatal() << "fsync() returned " << errno;
2440 		}
2441 	}
2442 }
2443 
2444 /** Check the existence and type of the given file.
2445 @param[in]	path		path name of file
2446 @param[out]	exists		true if the file exists
2447 @param[out]	type		Type of the file, if it exists
2448 @return true if call succeeded */
2449 static
2450 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)2451 os_file_status_posix(
2452 	const char*	path,
2453 	bool*		exists,
2454 	os_file_type_t* type)
2455 {
2456 	struct stat	statinfo;
2457 
2458 	int	ret = stat(path, &statinfo);
2459 
2460 	*exists = !ret;
2461 
2462 	if (!ret) {
2463 		/* file exists, everything OK */
2464 
2465 	} else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
2466 		/* file does not exist */
2467 		return(true);
2468 
2469 	} else {
2470 		/* file exists, but stat call failed */
2471 		os_file_handle_error_no_exit(path, "stat", false);
2472 		return(false);
2473 	}
2474 
2475 	if (S_ISDIR(statinfo.st_mode)) {
2476 		*type = OS_FILE_TYPE_DIR;
2477 
2478 	} else if (S_ISLNK(statinfo.st_mode)) {
2479 		*type = OS_FILE_TYPE_LINK;
2480 
2481 	} else if (S_ISREG(statinfo.st_mode)) {
2482 		*type = OS_FILE_TYPE_FILE;
2483 	} else {
2484 		*type = OS_FILE_TYPE_UNKNOWN;
2485 	}
2486 
2487 	return(true);
2488 }
2489 
2490 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
2491 function!
2492 Flushes the write buffers of a given file to the disk.
2493 @param[in]	file		handle to a file
2494 @return true if success */
2495 bool
os_file_flush_func(os_file_t file)2496 os_file_flush_func(
2497 	os_file_t	file)
2498 {
2499 	int	ret;
2500 
2501 	WAIT_ALLOW_WRITES();
2502 	ret = os_file_fsync_posix(file);
2503 
2504 	if (ret == 0) {
2505 		return(true);
2506 	}
2507 
2508 	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
2509 	we choose to ignore that error if we are using raw disks */
2510 
2511 	if (srv_start_raw_disk_in_use && errno == EINVAL) {
2512 
2513 		return(true);
2514 	}
2515 
2516 	ib::error() << "The OS said file flush did not succeed";
2517 
2518 	os_file_handle_error(NULL, "flush");
2519 
2520 	/* It is a fatal error if a file flush does not succeed, because then
2521 	the database can get corrupt on disk */
2522 	ut_error;
2523 
2524 	return(false);
2525 }
2526 
2527 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
2528 this function!
2529 A simple function to open or create a file.
2530 @param[in]	name		name of the file or path as a null-terminated
2531 				string
2532 @param[in]	create_mode	create mode
2533 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
2534 @param[in]	read_only	if true, read only checks are enforced
2535 @param[out]	success		true if succeed, false if error
2536 @return handle to the file, not defined if error, error number
2537 	can be retrieved with os_file_get_last_error */
2538 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)2539 os_file_create_simple_func(
2540 	const char*	name,
2541 	ulint		create_mode,
2542 	ulint		access_type,
2543 	bool		read_only,
2544 	bool*		success)
2545 {
2546 	pfs_os_file_t	file;
2547 
2548 	*success = false;
2549 
2550 	int		create_flag;
2551 	const char*	mode_str	= NULL;
2552 
2553 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
2554 		WAIT_ALLOW_WRITES();
2555 	}
2556 
2557 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
2558 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
2559 
2560 	if (create_mode == OS_FILE_OPEN) {
2561 		mode_str = "OPEN";
2562 
2563 		if (access_type == OS_FILE_READ_ONLY) {
2564 
2565 			create_flag = O_RDONLY;
2566 
2567 		} else if (read_only) {
2568 
2569 			create_flag = O_RDONLY;
2570 
2571 		} else {
2572 			create_flag = O_RDWR;
2573 		}
2574 
2575 	} else if (read_only) {
2576 
2577 		mode_str = "OPEN";
2578 		create_flag = O_RDONLY;
2579 
2580 	} else if (create_mode == OS_FILE_CREATE) {
2581 
2582 		mode_str = "CREATE";
2583 		create_flag = O_RDWR | O_CREAT | O_EXCL;
2584 
2585 	} else if (create_mode == OS_FILE_CREATE_PATH) {
2586 
2587 		mode_str = "CREATE PATH";
2588 		/* Create subdirs along the path if needed. */
2589 
2590 		*success = os_file_create_subdirs_if_needed(name);
2591 
2592 		if (!*success) {
2593 
2594 			ib::error()
2595 				<< "Unable to create subdirectories '"
2596 				<< name << "'";
2597 
2598 			return(OS_FILE_CLOSED);
2599 		}
2600 
2601 		create_flag = O_RDWR | O_CREAT | O_EXCL;
2602 		create_mode = OS_FILE_CREATE;
2603 	} else {
2604 
2605 		ib::error()
2606 			<< "Unknown file create mode ("
2607 			<< create_mode
2608 			<< " for file '" << name << "'";
2609 
2610 		return(OS_FILE_CLOSED);
2611 	}
2612 
2613 	bool	retry;
2614 
2615 	do {
2616 		file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
2617 
2618 		if (file == -1) {
2619 			*success = false;
2620 			retry = os_file_handle_error(
2621 				name,
2622 				create_mode == OS_FILE_OPEN
2623 				? "open" : "create");
2624 		} else {
2625 			*success = true;
2626 			retry = false;
2627 		}
2628 
2629 	} while (retry);
2630 
2631 	/* This function is always called for data files, we should disable
2632 	OS caching (O_DIRECT) here as we do in os_file_create_func(), so
2633 	we open the same file in the same mode, see man page of open(2). */
2634        if (!srv_read_only_mode
2635 	   && *success
2636 	   && (srv_file_flush_method == SRV_O_DIRECT
2637 	       || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
2638 
2639 	       os_file_set_nocache(file, name, mode_str);
2640 	}
2641 
2642 #ifdef USE_FILE_LOCK
2643 	if (!read_only
2644 	    && *success
2645 	    && (access_type == OS_FILE_READ_WRITE)
2646 	    && os_file_lock(file, name)) {
2647 
2648 		*success = false;
2649 		close(file);
2650 		file = -1;
2651 	}
2652 #endif /* USE_FILE_LOCK */
2653 
2654 	return(file);
2655 }
2656 
2657 /** This function attempts to create a directory named pathname. The new
2658 directory gets default permissions. On Unix the permissions are
2659 (0770 & ~umask). If the directory exists already, nothing is done and
2660 the call succeeds, unless the fail_if_exists arguments is true.
2661 If another error occurs, such as a permission error, this does not crash,
2662 but reports the error and returns false.
2663 @param[in]	pathname	directory name as null-terminated string
2664 @param[in]	fail_if_exists	if true, pre-existing directory is treated as
2665 				an error.
2666 @return true if call succeeds, false on error */
2667 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)2668 os_file_create_directory(
2669 	const char*	pathname,
2670 	bool		fail_if_exists)
2671 {
2672 	int	rcode;
2673 
2674 	WAIT_ALLOW_WRITES();
2675 	rcode = mkdir(pathname, 0770);
2676 
2677 	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
2678 		/* failure */
2679 		os_file_handle_error_no_exit(pathname, "mkdir", false);
2680 
2681 		return(false);
2682 	}
2683 
2684 	return(true);
2685 }
2686 
2687 /** NOTE! Use the corresponding macro os_file_create(), not directly
2688 this function!
2689 Opens an existing file or creates a new.
2690 @param[in]	name		name of the file or path as a null-terminated
2691 				string
2692 @param[in]	create_mode	create mode
2693 @param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
2694 				is desired, OS_FILE_NORMAL, if any normal file;
2695 				NOTE that it also depends on type, os_aio_..
2696 				and srv_.. variables whether we really use async
2697 				I/O or unbuffered I/O: look in the function
2698 				source code for the exact rules
2699 @param[in]	type		OS_DATA_FILE or OS_LOG_FILE
2700 @param[in]	read_only	true, if read only checks should be enforcedm
2701 @param[in]	success		true if succeeded
2702 @return handle to the file, not defined if error, error number
2703 	can be retrieved with os_file_get_last_error */
2704 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)2705 os_file_create_func(
2706 	const char*	name,
2707 	ulint		create_mode,
2708 	ulint		purpose,
2709 	ulint		type,
2710 	bool		read_only,
2711 	bool*		success)
2712 {
2713 	bool		on_error_no_exit;
2714 	bool		on_error_silent;
2715 
2716 	*success = false;
2717 
2718 	DBUG_EXECUTE_IF(
2719 		"ib_create_table_fail_disk_full",
2720 		*success = false;
2721 		errno = ENOSPC;
2722 		return(OS_FILE_CLOSED);
2723 	);
2724 
2725 	int		create_flag;
2726 	const char*	mode_str	= NULL;
2727 
2728 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
2729 		? true : false;
2730 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
2731 		? true : false;
2732 
2733 	create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
2734 			       | OS_FILE_ON_ERROR_SILENT));
2735 
2736 	if (create_mode == OS_FILE_OPEN
2737 	    || create_mode == OS_FILE_OPEN_RAW
2738 	    || create_mode == OS_FILE_OPEN_RETRY) {
2739 
2740 		mode_str = "OPEN";
2741 
2742 		create_flag = read_only ? O_RDONLY : O_RDWR;
2743 
2744 	} else if (read_only) {
2745 
2746 		mode_str = "OPEN";
2747 
2748 		create_flag = O_RDONLY;
2749 
2750 	} else if (create_mode == OS_FILE_CREATE) {
2751 
2752 		mode_str = "CREATE";
2753 		create_flag = O_RDWR | O_CREAT | O_EXCL;
2754 
2755 	} else if (create_mode == OS_FILE_OVERWRITE) {
2756 
2757 		mode_str = "OVERWRITE";
2758 		create_flag = O_RDWR | O_CREAT | O_TRUNC;
2759 
2760 	} else {
2761 		ib::error()
2762 			<< "Unknown file create mode (" << create_mode << ")"
2763 			<< " for file '" << name << "'";
2764 
2765 		return(OS_FILE_CLOSED);
2766 	}
2767 
2768 	ut_a(type == OS_LOG_FILE
2769 	     || type == OS_DATA_FILE
2770 	     || type == OS_DATA_FILE_NO_O_DIRECT);
2771 
2772 	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
2773 
2774 #ifdef O_SYNC
2775 	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
2776 	O_SYNC because the datasync options seemed to corrupt files in 2001
2777 	in both Linux and Solaris */
2778 
2779 	if (!read_only
2780 	    && type == OS_LOG_FILE
2781 	    && srv_file_flush_method == SRV_O_DSYNC) {
2782 
2783 		create_flag |= O_SYNC;
2784 	}
2785 #endif /* O_SYNC */
2786 
2787 	os_file_t	file;
2788 	bool		retry;
2789 
2790 	do {
2791 		file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
2792 
2793 		if (file == -1) {
2794 			const char*	operation;
2795 
2796 			operation = (create_mode == OS_FILE_CREATE
2797 				     && !read_only) ? "create" : "open";
2798 
2799 			*success = false;
2800 
2801 			if (on_error_no_exit) {
2802 				retry = os_file_handle_error_no_exit(
2803 					name, operation, on_error_silent);
2804 			} else {
2805 				retry = os_file_handle_error(name, operation);
2806 			}
2807 		} else {
2808 			*success = true;
2809 			retry = false;
2810 		}
2811 
2812 	} while (retry);
2813 
2814 	/* We disable OS caching (O_DIRECT) only on data files */
2815 	if (!read_only
2816 	    && *success
2817 	    && (type != OS_LOG_FILE
2818 		&& type != OS_DATA_FILE_NO_O_DIRECT)
2819 	    && (srv_file_flush_method == SRV_O_DIRECT
2820 		|| srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
2821 
2822 	       os_file_set_nocache(file, name, mode_str);
2823 	}
2824 
2825 #ifdef USE_FILE_LOCK
2826 	if (!read_only
2827 	    && *success
2828 	    && create_mode != OS_FILE_OPEN_RAW
2829 	    && os_file_lock(file, name)) {
2830 
2831 		if (create_mode == OS_FILE_OPEN_RETRY) {
2832 
2833 			ib::info()
2834 				<< "Retrying to lock the first data file";
2835 
2836 			for (int i = 0; i < 100; i++) {
2837 				os_thread_sleep(1000000);
2838 
2839 				if (!os_file_lock(file, name)) {
2840 					*success = true;
2841 					return(file);
2842 				}
2843 			}
2844 
2845 			ib::info()
2846 				<< "Unable to open the first data file";
2847 		}
2848 
2849 		*success = false;
2850 		close(file);
2851 		file = -1;
2852 	}
2853 #endif /* USE_FILE_LOCK */
2854 
2855 	return(file);
2856 }
2857 
2858 /** NOTE! Use the corresponding macro
2859 os_file_create_simple_no_error_handling(), not directly this function!
2860 A simple function to open or create a file.
2861 @param[in]	name		name of the file or path as a null-terminated
2862 				string
2863 @param[in]	create_mode	create mode
2864 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
2865 				OS_FILE_READ_ALLOW_DELETE; the last option
2866 				is used by a backup program reading the file
2867 @param[in]	read_only	if true read only mode checks are enforced
2868 @param[out]	success		true if succeeded
2869 @return own: handle to the file, not defined if error, error number
2870 	can be retrieved with os_file_get_last_error */
2871 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)2872 os_file_create_simple_no_error_handling_func(
2873 	const char*	name,
2874 	ulint		create_mode,
2875 	ulint		access_type,
2876 	bool		read_only,
2877 	bool*		success)
2878 {
2879 	os_file_t	file;
2880 	int		create_flag;
2881 
2882 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
2883 		WAIT_ALLOW_WRITES();
2884 	}
2885 
2886 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
2887 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
2888 
2889 	*success = false;
2890 
2891 	if (create_mode == OS_FILE_OPEN) {
2892 
2893 		if (access_type == OS_FILE_READ_ONLY) {
2894 
2895 			create_flag = O_RDONLY;
2896 
2897 		} else if (read_only) {
2898 
2899 			create_flag = O_RDONLY;
2900 
2901 		} else {
2902 
2903 			ut_a(access_type == OS_FILE_READ_WRITE
2904 			     || access_type == OS_FILE_READ_ALLOW_DELETE);
2905 
2906 			create_flag = O_RDWR;
2907 		}
2908 
2909 	} else if (read_only) {
2910 
2911 		create_flag = O_RDONLY;
2912 
2913 	} else if (create_mode == OS_FILE_CREATE) {
2914 
2915 		create_flag = O_RDWR | O_CREAT | O_EXCL;
2916 
2917 	} else {
2918 
2919 		ib::error()
2920 			<< "Unknown file create mode "
2921 			<< create_mode << " for file '" << name << "'";
2922 
2923 		return(OS_FILE_CLOSED);
2924 	}
2925 
2926 	file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
2927 
2928 	*success = (file != -1);
2929 
2930 #ifdef USE_FILE_LOCK
2931 	if (!read_only
2932 	    && *success
2933 	    && access_type == OS_FILE_READ_WRITE
2934 	    && os_file_lock(file, name)) {
2935 
2936 		*success = false;
2937 		close(file);
2938 		file = -1;
2939 
2940 	}
2941 #endif /* USE_FILE_LOCK */
2942 
2943 	return(file);
2944 }
2945 
2946 /** Deletes a file if it exists. The file has to be closed before calling this.
2947 @param[in]	name		file path as a null-terminated string
2948 @param[out]	exist		indicate if file pre-exist
2949 @return true if success */
2950 bool
os_file_delete_if_exists_func(const char * name,bool * exist)2951 os_file_delete_if_exists_func(
2952 	const char*	name,
2953 	bool*		exist)
2954 {
2955 	if (exist != NULL) {
2956 		*exist = true;
2957 	}
2958 
2959 	int	ret;
2960 	WAIT_ALLOW_WRITES();
2961 
2962 	ret = unlink(name);
2963 
2964 	if (ret != 0 && errno == ENOENT) {
2965 		if (exist != NULL) {
2966 			*exist = false;
2967 		}
2968 	} else if (ret != 0 && errno != ENOENT) {
2969 		os_file_handle_error_no_exit(name, "delete", false);
2970 
2971 		return(false);
2972 	}
2973 
2974 	return(true);
2975 }
2976 
2977 /** Deletes a file. The file has to be closed before calling this.
2978 @param[in]	name		file path as a null-terminated string
2979 @return true if success */
2980 bool
os_file_delete_func(const char * name)2981 os_file_delete_func(
2982 	const char*	name)
2983 {
2984 	int	ret;
2985 	WAIT_ALLOW_WRITES();
2986 
2987 	ret = unlink(name);
2988 
2989 	if (ret != 0) {
2990 		os_file_handle_error_no_exit(name, "delete", FALSE);
2991 
2992 		return(false);
2993 	}
2994 
2995 	return(true);
2996 }
2997 
2998 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
2999 function!
3000 Renames a file (can also move it to another directory). It is safest that the
3001 file is closed before calling this function.
3002 @param[in]	oldpath		old file path as a null-terminated string
3003 @param[in]	newpath		new file path
3004 @return true if success */
3005 bool
os_file_rename_func(const char * oldpath,const char * newpath)3006 os_file_rename_func(
3007 	const char*	oldpath,
3008 	const char*	newpath)
3009 {
3010 #ifdef UNIV_DEBUG
3011 	os_file_type_t	type;
3012 	bool		exists;
3013 
3014 	/* New path must not exist. */
3015 	ut_ad(os_file_status(newpath, &exists, &type));
3016 	ut_ad(!exists);
3017 
3018 	/* Old path must exist. */
3019 	ut_ad(os_file_status(oldpath, &exists, &type));
3020 	ut_ad(exists);
3021 #endif /* UNIV_DEBUG */
3022 
3023 	int	ret;
3024 	WAIT_ALLOW_WRITES();
3025 
3026 	ret = rename(oldpath, newpath);
3027 
3028 	if (ret != 0) {
3029 		os_file_handle_rename_error(oldpath, newpath);
3030 
3031 		return(false);
3032 	}
3033 
3034 	return(true);
3035 }
3036 
3037 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3038 function!
3039 Closes a file handle. In case of error, error number can be retrieved with
3040 os_file_get_last_error.
3041 @param[in]	file		Handle to close
3042 @return true if success */
3043 bool
os_file_close_func(os_file_t file)3044 os_file_close_func(
3045 	os_file_t	file)
3046 {
3047 	int	ret = close(file);
3048 
3049 	if (ret == -1) {
3050 		os_file_handle_error(NULL, "close");
3051 
3052 		return(false);
3053 	}
3054 
3055 	return(true);
3056 }
3057 
3058 /** Gets a file size.
3059 @param[in]	file		handle to an open file
3060 @return file size, or (os_offset_t) -1 on failure */
3061 os_offset_t
os_file_get_size(os_file_t file)3062 os_file_get_size(os_file_t file)
3063 {
3064 	struct stat statbuf;
3065 	return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size;
3066 }
3067 
3068 /** Gets a file size.
3069 @param[in]	filename	Full path to the filename to check
3070 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3071 	errno */
3072 os_file_size_t
os_file_get_size(const char * filename)3073 os_file_get_size(
3074 	const char*	filename)
3075 {
3076 	struct stat	s;
3077 	os_file_size_t	file_size;
3078 
3079 	int	ret = stat(filename, &s);
3080 
3081 	if (ret == 0) {
3082 		file_size.m_total_size = s.st_size;
3083 		/* st_blocks is in 512 byte sized blocks */
3084 		file_size.m_alloc_size = s.st_blocks * 512;
3085 	} else {
3086 		file_size.m_total_size = ~0U;
3087 		file_size.m_alloc_size = (os_offset_t) errno;
3088 	}
3089 
3090 	return(file_size);
3091 }
3092 
3093 /** This function returns information about the specified file
3094 @param[in]	path		pathname of the file
3095 @param[out]	stat_info	information of a file in a directory
3096 @param[in,out]	statinfo	information of a file in a directory
3097 @param[in]	check_rw_perm	for testing whether the file can be opened
3098 				in RW mode
3099 @param[in]	read_only	if true read only mode checks are enforced
3100 @return DB_SUCCESS if all OK */
3101 static
3102 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3103 os_file_get_status_posix(
3104 	const char*	path,
3105 	os_file_stat_t* stat_info,
3106 	struct stat*	statinfo,
3107 	bool		check_rw_perm,
3108 	bool		read_only)
3109 {
3110 	int	ret = stat(path, statinfo);
3111 
3112 	if (ret && (errno == ENOENT || errno == ENOTDIR
3113 		    || errno == ENAMETOOLONG)) {
3114 		/* file does not exist */
3115 
3116 		return(DB_NOT_FOUND);
3117 
3118 	} else if (ret) {
3119 		/* file exists, but stat call failed */
3120 
3121 		os_file_handle_error_no_exit(path, "stat", false);
3122 
3123 		return(DB_FAIL);
3124 	}
3125 
3126 	switch (statinfo->st_mode & S_IFMT) {
3127 	case S_IFDIR:
3128 		stat_info->type = OS_FILE_TYPE_DIR;
3129 		break;
3130 	case S_IFLNK:
3131 		stat_info->type = OS_FILE_TYPE_LINK;
3132 		break;
3133 	case S_IFBLK:
3134 		/* Handle block device as regular file. */
3135 	case S_IFCHR:
3136 		/* Handle character device as regular file. */
3137 	case S_IFREG:
3138 		stat_info->type = OS_FILE_TYPE_FILE;
3139 		break;
3140 	default:
3141 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
3142 	}
3143 
3144 	stat_info->size = statinfo->st_size;
3145 	stat_info->block_size = statinfo->st_blksize;
3146 	stat_info->alloc_size = statinfo->st_blocks * 512;
3147 
3148 	if (check_rw_perm
3149 	    && (stat_info->type == OS_FILE_TYPE_FILE
3150 		|| stat_info->type == OS_FILE_TYPE_BLOCK)) {
3151 
3152 		stat_info->rw_perm = !access(path, read_only
3153 					     ? R_OK : R_OK | W_OK);
3154 	}
3155 
3156 	return(DB_SUCCESS);
3157 }
3158 
3159 /** Truncates a file to a specified size in bytes.
3160 Do nothing if the size to preserve is greater or equal to the current
3161 size of the file.
3162 @param[in]	pathname	file path
3163 @param[in]	file		file to be truncated
3164 @param[in]	size		size to preserve in bytes
3165 @return true if success */
3166 static
3167 bool
os_file_truncate_posix(const char * pathname,os_file_t file,os_offset_t size)3168 os_file_truncate_posix(
3169 	const char*	pathname,
3170 	os_file_t	file,
3171 	os_offset_t	size)
3172 {
3173 	int	res = ftruncate(file, size);
3174 
3175 	if (res == -1) {
3176 
3177 		bool	retry;
3178 
3179 		retry = os_file_handle_error_no_exit(
3180 			pathname, "truncate", false);
3181 
3182 		if (retry) {
3183 			ib::warn()
3184 				<< "Truncate failed for '"
3185 				<< pathname << "'";
3186 		}
3187 	}
3188 
3189 	return(res == 0);
3190 }
3191 
3192 /** Truncates a file at its current position.
3193 @return true if success */
3194 bool
os_file_set_eof(FILE * file)3195 os_file_set_eof(
3196 	FILE*		file)	/*!< in: file to be truncated */
3197 {
3198 	WAIT_ALLOW_WRITES();
3199 	return(!ftruncate(fileno(file), ftell(file)));
3200 }
3201 
3202 #else /* !_WIN32 */
3203 
3204 #include <WinIoCtl.h>
3205 
3206 /*
3207 Windows : Handling synchronous IO on files opened asynchronously.
3208 
3209 If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to
3210 a completion port, then every IO on this file would normally be enqueued to the
completion port. Sometimes however we would like to do a synchronous IO. This is
possible if we initialize overlapped.hEvent with a valid event and set its
lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info)
3214 
3215 We'll create this special event once for each thread and store in thread local
3216 storage.
3217 */
3218 
3219 
win_free_syncio_event(void * data)3220 static void __stdcall win_free_syncio_event(void *data) {
3221 	if (data) {
3222 		CloseHandle((HANDLE)data);
3223 	}
3224 }
3225 
3226 
3227 /*
3228 Retrieve per-thread event for doing synchronous io on asyncronously opened files
3229 */
win_get_syncio_event()3230 static HANDLE win_get_syncio_event()
3231 {
3232 	HANDLE h;
3233 
3234 	h = (HANDLE)FlsGetValue(fls_sync_io);
3235 	if (h) {
3236 		return h;
3237 	}
3238 	h = CreateEventA(NULL, FALSE, FALSE, NULL);
3239 	ut_a(h);
3240 	/* Set low-order bit to keeps I/O completion from being queued */
3241 	h = (HANDLE)((uintptr_t)h | 1);
3242 	FlsSetValue(fls_sync_io, h);
3243 	return h;
3244 }
3245 
3246 
3247 /** Do the read/write
3248 @param[in]	request	The IO context and type
3249 @return the number of bytes read/written or negative value on error */
3250 ssize_t
execute(const IORequest & request)3251 SyncFileIO::execute(const IORequest& request)
3252 {
3253 	OVERLAPPED	seek;
3254 
3255 	memset(&seek, 0x0, sizeof(seek));
3256 
3257 	seek.hEvent = win_get_syncio_event();
3258 	seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
3259 	seek.OffsetHigh = (DWORD) (m_offset >> 32);
3260 
3261 	BOOL	ret;
3262 	DWORD	n_bytes;
3263 
3264 	if (request.is_read()) {
3265 		ret = ReadFile(m_fh, m_buf,
3266 			static_cast<DWORD>(m_n), NULL, &seek);
3267 
3268 	} else {
3269 		ut_ad(request.is_write());
3270 		ret = WriteFile(m_fh, m_buf,
3271 			static_cast<DWORD>(m_n), NULL, &seek);
3272 	}
3273 	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3274 		/* Wait for async io to complete */
3275 		ret = GetOverlappedResult(m_fh, &seek, &n_bytes, TRUE);
3276 	}
3277 
3278 	return(ret ? static_cast<ssize_t>(n_bytes) : -1);
3279 }
3280 
3281 /** Do the read/write
3282 @param[in,out]	slot	The IO slot, it has the IO context
3283 @return the number of bytes read/written or negative value on error */
3284 ssize_t
execute(Slot * slot)3285 SyncFileIO::execute(Slot* slot)
3286 {
3287 	BOOL	ret;
3288 	slot->control.hEvent = win_get_syncio_event();
3289 	if (slot->type.is_read()) {
3290 
3291 		ret = ReadFile(
3292 			slot->file, slot->ptr, slot->len,
3293 			NULL, &slot->control);
3294 
3295 	} else {
3296 		ut_ad(slot->type.is_write());
3297 
3298 		ret = WriteFile(
3299 			slot->file, slot->ptr, slot->len,
3300 			NULL, &slot->control);
3301 
3302 	}
3303 	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3304 		/* Wait for async io to complete */
3305 		ret = GetOverlappedResult(slot->file, &slot->control, &slot->n_bytes, TRUE);
3306 	}
3307 
3308 	return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
3309 }
3310 
3311 /* Startup/shutdown */
3312 
3313 struct WinIoInit
3314 {
WinIoInitWinIoInit3315 	WinIoInit() {
3316 		fls_sync_io = FlsAlloc(win_free_syncio_event);
3317 		ut_a(fls_sync_io != FLS_OUT_OF_INDEXES);
3318 	}
3319 
~WinIoInitWinIoInit3320 	~WinIoInit() {
3321 		FlsFree(fls_sync_io);
3322 	}
3323 };
3324 
3325 /* Ensures proper initialization and shutdown */
3326 static WinIoInit win_io_init;
3327 
3328 
3329 /** Free storage space associated with a section of the file.
3330 @param[in]	fh		Open file handle
3331 @param[in]	off		Starting offset (SEEK_SET)
3332 @param[in]	len		Size of the hole
3333 @return 0 on success or errno */
3334 static
3335 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)3336 os_file_punch_hole_win32(
3337 	os_file_t	fh,
3338 	os_offset_t	off,
3339 	os_offset_t	len)
3340 {
3341 	FILE_ZERO_DATA_INFORMATION	punch;
3342 
3343 	punch.FileOffset.QuadPart = off;
3344 	punch.BeyondFinalZero.QuadPart = off + len;
3345 
3346 	/* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
3347 	therefore we pass a dummy parameter. */
3348 	DWORD	temp;
3349 	BOOL	success = os_win32_device_io_control(
3350 		fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
3351 		NULL, 0, &temp);
3352 
3353 	return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE);
3354 }
3355 
3356 /** Check the existence and type of the given file.
3357 @param[in]	path		path name of file
3358 @param[out]	exists		true if the file exists
3359 @param[out]	type		Type of the file, if it exists
3360 @return true if call succeeded */
3361 static
3362 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)3363 os_file_status_win32(
3364 	const char*	path,
3365 	bool*		exists,
3366 	os_file_type_t* type)
3367 {
3368 	int		ret;
3369 	struct _stat64	statinfo;
3370 
3371 	ret = _stat64(path, &statinfo);
3372 
3373 	*exists = !ret;
3374 
3375 	if (!ret) {
3376 		/* file exists, everything OK */
3377 
3378 	} else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
3379 		/* file does not exist */
3380 		return(true);
3381 
3382 	} else {
3383 		/* file exists, but stat call failed */
3384 		os_file_handle_error_no_exit(path, "stat", false);
3385 		return(false);
3386 	}
3387 
3388 	if (_S_IFDIR & statinfo.st_mode) {
3389 		*type = OS_FILE_TYPE_DIR;
3390 
3391 	} else if (_S_IFREG & statinfo.st_mode) {
3392 		*type = OS_FILE_TYPE_FILE;
3393 
3394 	} else {
3395 		*type = OS_FILE_TYPE_UNKNOWN;
3396 	}
3397 
3398 	return(true);
3399 }
3400 
3401 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3402 function!
3403 Flushes the write buffers of a given file to the disk.
3404 @param[in]	file		handle to a file
3405 @return true if success */
3406 bool
os_file_flush_func(os_file_t file)3407 os_file_flush_func(
3408 	os_file_t	file)
3409 {
3410 	++os_n_fsyncs;
3411 
3412 	BOOL	ret = FlushFileBuffers(file);
3413 
3414 	if (ret) {
3415 		return(true);
3416 	}
3417 
3418 	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
3419 	actually a raw device, we choose to ignore that error if we are using
3420 	raw disks */
3421 
3422 	if (srv_start_raw_disk_in_use && GetLastError()
3423 	    == ERROR_INVALID_FUNCTION) {
3424 		return(true);
3425 	}
3426 
3427 	os_file_handle_error(NULL, "flush");
3428 
3429 	/* It is a fatal error if a file flush does not succeed, because then
3430 	the database can get corrupt on disk */
3431 	ut_error;
3432 
3433 	return(false);
3434 }
3435 
3436 /** Retrieves the last error number if an error occurs in a file io function.
3437 The number should be retrieved before any other OS calls (because they may
3438 overwrite the error number). If the number is not known to this program,
3439 the OS error number + 100 is returned.
3440 @param[in]	report_all_errors	true if we want an error message printed
3441 					of all errors
3442 @param[in]	on_error_silent		true then don't print any diagnostic
3443 					to the log
3444 @return error number, or OS error number + 100 */
3445 static
3446 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3447 os_file_get_last_error_low(
3448 	bool	report_all_errors,
3449 	bool	on_error_silent)
3450 {
3451 	ulint	err = (ulint) GetLastError();
3452 
3453 	if (err == ERROR_SUCCESS) {
3454 		return(0);
3455 	}
3456 
3457 	if (report_all_errors
3458 	    || (!on_error_silent
3459 		&& err != ERROR_DISK_FULL
3460 		&& err != ERROR_FILE_EXISTS)) {
3461 
3462 		ib::error()
3463 			<< "Operating system error number " << err
3464 			<< " in a file operation.";
3465 
3466 		if (err == ERROR_PATH_NOT_FOUND) {
3467 			ib::error()
3468 				<< "The error means the system"
3469 				" cannot find the path specified.";
3470 
3471 			if (srv_is_being_started) {
3472 				ib::error()
3473 					<< "If you are installing InnoDB,"
3474 					" remember that you must create"
3475 					" directories yourself, InnoDB"
3476 					" does not create them.";
3477 			}
3478 
3479 		} else if (err == ERROR_ACCESS_DENIED) {
3480 
3481 			ib::error()
3482 				<< "The error means mysqld does not have"
3483 				" the access rights to"
3484 				" the directory. It may also be"
3485 				" you have created a subdirectory"
3486 				" of the same name as a data file.";
3487 
3488 		} else if (err == ERROR_SHARING_VIOLATION
3489 			   || err == ERROR_LOCK_VIOLATION) {
3490 
3491 			ib::error()
3492 				<< "The error means that another program"
3493 				" is using InnoDB's files."
3494 				" This might be a backup or antivirus"
3495 				" software or another instance"
3496 				" of MySQL."
3497 				" Please close it to get rid of this error.";
3498 
3499 		} else if (err == ERROR_WORKING_SET_QUOTA
3500 			   || err == ERROR_NO_SYSTEM_RESOURCES) {
3501 
3502 			ib::error()
3503 				<< "The error means that there are no"
3504 				" sufficient system resources or quota to"
3505 				" complete the operation.";
3506 
3507 		} else if (err == ERROR_OPERATION_ABORTED) {
3508 
3509 			ib::error()
3510 				<< "The error means that the I/O"
3511 				" operation has been aborted"
3512 				" because of either a thread exit"
3513 				" or an application request."
3514 				" Retry attempt is made.";
3515 		} else {
3516 
3517 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3518 		}
3519 	}
3520 
3521 	if (err == ERROR_FILE_NOT_FOUND) {
3522 		return(OS_FILE_NOT_FOUND);
3523 	} else if (err == ERROR_DISK_FULL) {
3524 		return(OS_FILE_DISK_FULL);
3525 	} else if (err == ERROR_FILE_EXISTS) {
3526 		return(OS_FILE_ALREADY_EXISTS);
3527 	} else if (err == ERROR_SHARING_VIOLATION
3528 		   || err == ERROR_LOCK_VIOLATION) {
3529 		return(OS_FILE_SHARING_VIOLATION);
3530 	} else if (err == ERROR_WORKING_SET_QUOTA
3531 		   || err == ERROR_NO_SYSTEM_RESOURCES) {
3532 		return(OS_FILE_INSUFFICIENT_RESOURCE);
3533 	} else if (err == ERROR_OPERATION_ABORTED) {
3534 		return(OS_FILE_OPERATION_ABORTED);
3535 	} else if (err == ERROR_ACCESS_DENIED) {
3536 		return(OS_FILE_ACCESS_VIOLATION);
3537 	}
3538 
3539 	return(OS_FILE_ERROR_MAX + err);
3540 }
3541 
3542 
3543 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3544 this function!
3545 A simple function to open or create a file.
3546 @param[in]	name		name of the file or path as a null-terminated
3547 				string
3548 @param[in]	create_mode	create mode
3549 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3550 @param[in]	read_only	if true read only mode checks are enforced
3551 @param[out]	success		true if succeed, false if error
3552 @return handle to the file, not defined if error, error number
3553 	can be retrieved with os_file_get_last_error */
3554 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3555 os_file_create_simple_func(
3556 	const char*	name,
3557 	ulint		create_mode,
3558 	ulint		access_type,
3559 	bool		read_only,
3560 	bool*		success)
3561 {
3562 	os_file_t	file;
3563 
3564 	*success = false;
3565 
3566 	DWORD		access;
3567 	DWORD		create_flag;
3568 	DWORD		attributes = 0;
3569 
3570 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3571 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3572 	ut_ad(srv_operation == SRV_OPERATION_NORMAL);
3573 
3574 	if (create_mode == OS_FILE_OPEN) {
3575 
3576 		create_flag = OPEN_EXISTING;
3577 
3578 	} else if (read_only) {
3579 
3580 		create_flag = OPEN_EXISTING;
3581 
3582 	} else if (create_mode == OS_FILE_CREATE) {
3583 
3584 		create_flag = CREATE_NEW;
3585 
3586 	} else if (create_mode == OS_FILE_CREATE_PATH) {
3587 
3588 		/* Create subdirs along the path if needed. */
3589 		*success = os_file_create_subdirs_if_needed(name);
3590 
3591 		if (!*success) {
3592 
3593 			ib::error()
3594 				<< "Unable to create subdirectories '"
3595 				<< name << "'";
3596 
3597 			return(OS_FILE_CLOSED);
3598 		}
3599 
3600 		create_flag = CREATE_NEW;
3601 		create_mode = OS_FILE_CREATE;
3602 
3603 	} else {
3604 
3605 		ib::error()
3606 			<< "Unknown file create mode ("
3607 			<< create_mode << ") for file '"
3608 			<< name << "'";
3609 
3610 		return(OS_FILE_CLOSED);
3611 	}
3612 
3613 	if (access_type == OS_FILE_READ_ONLY) {
3614 
3615 		access = GENERIC_READ;
3616 
3617 	} else if (read_only) {
3618 
3619 		ib::info()
3620 			<< "Read only mode set. Unable to"
3621 			" open file '" << name << "' in RW mode, "
3622 			<< "trying RO mode";
3623 
3624 		access = GENERIC_READ;
3625 
3626 	} else if (access_type == OS_FILE_READ_WRITE) {
3627 
3628 		access = GENERIC_READ | GENERIC_WRITE;
3629 
3630 	} else {
3631 
3632 		ib::error()
3633 			<< "Unknown file access type (" << access_type << ") "
3634 			"for file '" << name << "'";
3635 
3636 		return(OS_FILE_CLOSED);
3637 	}
3638 
3639 	bool	retry;
3640 
3641 	do {
3642 		/* Use default security attributes and no template file. */
3643 
3644 		file = CreateFile(
3645 			(LPCTSTR) name, access,
3646 			FILE_SHARE_READ | FILE_SHARE_DELETE, NULL,
3647 			create_flag, attributes, NULL);
3648 
3649 		if (file == INVALID_HANDLE_VALUE) {
3650 
3651 			*success = false;
3652 
3653 			retry = os_file_handle_error(
3654 				name, create_mode == OS_FILE_OPEN ?
3655 				"open" : "create");
3656 
3657 		} else {
3658 
3659 			retry = false;
3660 
3661 			*success = true;
3662 		}
3663 
3664 	} while (retry);
3665 
3666 	return(file);
3667 }
3668 
3669 /** This function attempts to create a directory named pathname. The new
3670 directory gets default permissions. On Unix the permissions are
3671 (0770 & ~umask). If the directory exists already, nothing is done and
3672 the call succeeds, unless the fail_if_exists arguments is true.
3673 If another error occurs, such as a permission error, this does not crash,
3674 but reports the error and returns false.
3675 @param[in]	pathname	directory name as null-terminated string
3676 @param[in]	fail_if_exists	if true, pre-existing directory is treated
3677 				as an error.
3678 @return true if call succeeds, false on error */
3679 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3680 os_file_create_directory(
3681 	const char*	pathname,
3682 	bool		fail_if_exists)
3683 {
3684 	BOOL	rcode;
3685 
3686 	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
3687 	if (!(rcode != 0
3688 	      || (GetLastError() == ERROR_ALREADY_EXISTS
3689 		  && !fail_if_exists))) {
3690 
3691 		os_file_handle_error_no_exit(
3692 			pathname, "CreateDirectory", false);
3693 
3694 		return(false);
3695 	}
3696 
3697 	return(true);
3698 }
3699 
3700 /** Check that IO of specific size is possible for the file
3701 opened with FILE_FLAG_NO_BUFFERING.
3702 
3703 The requirement is that IO is multiple of the disk sector size.
3704 
3705 @param[in]	file      file handle
3706 @param[in]	io_size   expected io size
3707 @return true - unbuffered io of requested size is possible, false otherwise.
3708 
3709 @note: this function only works correctly with Windows 8 or later,
3710 (GetFileInformationByHandleEx with FileStorageInfo is only supported there).
3711 It will return true on earlier Windows version.
3712  */
unbuffered_io_possible(HANDLE file,size_t io_size)3713 static bool unbuffered_io_possible(HANDLE file, size_t io_size)
3714 {
3715 	FILE_STORAGE_INFO info;
3716 	if (GetFileInformationByHandleEx(
3717 		file, FileStorageInfo, &info, sizeof(info))) {
3718 			ULONG sector_size = info.LogicalBytesPerSector;
3719 			if (sector_size)
3720 				return io_size % sector_size == 0;
3721 	}
3722 	return true;
3723 }
3724 
3725 
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode; may be combined with the flags
				OS_FILE_ON_ERROR_NO_EXIT and
				OS_FILE_ON_ERROR_SILENT
@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
				is desired, OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use async
				I/O or unbuffered I/O: look in the function
				source code for the exact rules
@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
@param[in]	read_only	if true, the file is opened without write
				access, and the create modes fall back to
				opening an existing file
@param[out]	success		true if succeeded
@return handle to the file, not defined if error, error number
	can be retrieved with os_file_get_last_error */
pfs_os_file_t
os_file_create_func(
	const char*	name,
	ulint		create_mode,
	ulint		purpose,
	ulint		type,
	bool		read_only,
	bool*		success)
{
	os_file_t	file;
	bool		retry;
	bool		on_error_no_exit;
	bool		on_error_silent;

	*success = false;

	/* Debug-only fault injection: pretend that the disk is full. */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = false;
		SetLastError(ERROR_DISK_FULL);
		return(OS_FILE_CLOSED);
	);

	DWORD		create_flag;
	/* Writers are additionally allowed to share the file when not in
	normal server operation. */
	DWORD		share_mode = srv_operation != SRV_OPERATION_NORMAL
		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
		: FILE_SHARE_READ | FILE_SHARE_DELETE;

	/* NOTE(review): this comparison is made before the
	OS_FILE_ON_ERROR_... flags are stripped from create_mode below, so
	an open mode combined with such a flag also waits here. */
	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
		WAIT_ALLOW_WRITES();
	}

	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? true : false;

	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? true : false;

	/* Strip the error-handling flags; only the pure mode remains. */
	create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);

	if (create_mode == OS_FILE_OPEN_RAW) {

		ut_a(!read_only);

		create_flag = OPEN_EXISTING;

		/* On Windows Physical devices require admin privileges and
		have to have the write-share mode set. See the remarks
		section for the CreateFile() function documentation in MSDN. */

		share_mode |= FILE_SHARE_WRITE;

	} else if (create_mode == OS_FILE_OPEN
		   || create_mode == OS_FILE_OPEN_RETRY) {

		create_flag = OPEN_EXISTING;

	} else if (read_only) {

		/* In read-only mode the remaining (creating) modes only
		open an existing file. */
		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		create_flag = CREATE_ALWAYS;

	} else {
		ib::error()
			<< "Unknown file create mode (" << create_mode << ") "
			<< " for file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	DWORD		attributes = 0;

	if (purpose == OS_FILE_AIO) {

#ifdef WIN_ASYNC_IO
		/* If specified, use asynchronous (overlapped) io and no
		buffering of writes in the OS */

		if (srv_use_native_aio) {
			attributes |= FILE_FLAG_OVERLAPPED;
		}
#endif /* WIN_ASYNC_IO */

	} else if (purpose == OS_FILE_NORMAL) {

		/* Use default setting. */

	} else {

		ib::error()
			<< "Unknown purpose flag (" << purpose << ") "
			<< "while opening file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	if (type == OS_LOG_FILE) {
		/* There is no reason to use buffered writes to logs. */
		attributes |= FILE_FLAG_NO_BUFFERING;
	}

	/* Adjust the buffering and write-through attributes according to
	the configured flush method. */
	switch (srv_file_flush_method)
	{
	case SRV_O_DSYNC:
		if (type == OS_LOG_FILE) {
			/* Map O_SYNC to FILE_WRITE_THROUGH */
			attributes |= FILE_FLAG_WRITE_THROUGH;
		}
		break;

	case SRV_O_DIRECT_NO_FSYNC:
	case SRV_O_DIRECT:
		if (type == OS_DATA_FILE) {
			attributes |= FILE_FLAG_NO_BUFFERING;
		}
		break;

	case SRV_ALL_O_DIRECT_FSYNC:
		/* Traditional Windows behavior, no buffering for any files. */
		if (type != OS_DATA_FILE_NO_O_DIRECT) {
			attributes |= FILE_FLAG_NO_BUFFERING;
		}
		break;

	case SRV_FSYNC:
	case SRV_LITTLESYNC:
		break;

	case SRV_NOSYNC:
		/* Let Windows cache manager handle all writes. */
		attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
		break;

	default:
		ut_a(false); /* unknown flush mode. */
	}


	// TODO: Create a bug, this looks wrong. The flush log
	// parameter is dynamic.
	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
		/* Do not use unbuffered i/o for the log files because
		value 2 denotes that we do not flush the log at every
		commit, but only once per second */
		attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
	}


	DWORD	access = GENERIC_READ;

	if (!read_only) {
		access |= GENERIC_WRITE;
	}

	/* Keep trying to open the file until it succeeds or the error
	handler declines a retry. */
	for (;;) {
		const  char *operation;

		/* Use default security attributes and no template file. */
		file = CreateFile(
			name, access, share_mode, NULL,
			create_flag, attributes, NULL);

		/* If FILE_FLAG_NO_BUFFERING was set, check whether it can work
		at all for the expected IO sizes. If it cannot, reopen the file
		without the unbuffered flag. OPEN_ALWAYS is used for the reopen
		because the file exists by now. */
		if ((file != INVALID_HANDLE_VALUE)
			&& (attributes & FILE_FLAG_NO_BUFFERING)
			&& (type == OS_LOG_FILE)
			&& !unbuffered_io_possible(file, OS_FILE_LOG_BLOCK_SIZE)) {
				ut_a(CloseHandle(file));
				attributes &= ~FILE_FLAG_NO_BUFFERING;
				create_flag = OPEN_ALWAYS;
				continue;
		}

		*success = (file != INVALID_HANDLE_VALUE);
		if (*success) {
			break;
		}

		operation = (create_mode == OS_FILE_CREATE && !read_only) ?
			"create" : "open";

		if (on_error_no_exit) {
			retry = os_file_handle_error_no_exit(
				name, operation, on_error_silent);
		}
		else {
			retry = os_file_handle_error(name, operation);
		}

		if (!retry) {
			break;
		}
	}

	if (*success && srv_use_native_aio &&  (attributes & FILE_FLAG_OVERLAPPED)) {
		/* Bind the file handle to completion port. Completion port
		might not be created yet, in some stages of backup, but
		must always be there for the server.*/
		HANDLE port = (type == OS_LOG_FILE) ?
			log_completion_port : data_completion_port;
		ut_a(port || srv_operation != SRV_OPERATION_NORMAL);
		if (port) {
			ut_a(CreateIoCompletionPort(file, port, 0, 0));
		}
	}

	return(file);
}
3958 
3959 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
3960 not directly this function!
3961 A simple function to open or create a file.
3962 @param[in]	name		name of the file or path as a null-terminated
3963 				string
3964 @param[in]	create_mode	create mode
3965 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3966 				OS_FILE_READ_ALLOW_DELETE; the last option is
3967 				used by a backup program reading the file
3968 @param[out]	success		true if succeeded
3969 @return own: handle to the file, not defined if error, error number
3970 	can be retrieved with os_file_get_last_error */
3971 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3972 os_file_create_simple_no_error_handling_func(
3973 	const char*	name,
3974 	ulint		create_mode,
3975 	ulint		access_type,
3976 	bool		read_only,
3977 	bool*		success)
3978 {
3979 	os_file_t	file;
3980 
3981 	*success = false;
3982 
3983 	DWORD		access;
3984 	DWORD		create_flag;
3985 	DWORD		attributes	= 0;
3986 	DWORD		share_mode = srv_operation != SRV_OPERATION_NORMAL
3987 		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
3988 		: FILE_SHARE_READ | FILE_SHARE_DELETE;
3989 
3990 	ut_a(name);
3991 
3992 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3993 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3994 
3995 	if (create_mode == OS_FILE_OPEN) {
3996 
3997 		create_flag = OPEN_EXISTING;
3998 
3999 	} else if (read_only) {
4000 
4001 		create_flag = OPEN_EXISTING;
4002 
4003 	} else if (create_mode == OS_FILE_CREATE) {
4004 
4005 		create_flag = CREATE_NEW;
4006 
4007 	} else {
4008 
4009 		ib::error()
4010 			<< "Unknown file create mode (" << create_mode << ") "
4011 			<< " for file '" << name << "'";
4012 
4013 		return(OS_FILE_CLOSED);
4014 	}
4015 
4016 	if (access_type == OS_FILE_READ_ONLY) {
4017 
4018 		access = GENERIC_READ;
4019 
4020 	} else if (read_only) {
4021 
4022 		access = GENERIC_READ;
4023 
4024 	} else if (access_type == OS_FILE_READ_WRITE) {
4025 
4026 		access = GENERIC_READ | GENERIC_WRITE;
4027 
4028 	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4029 
4030 		ut_a(!read_only);
4031 
4032 		access = GENERIC_READ;
4033 
4034 		/*!< A backup program has to give mysqld the maximum
4035 		freedom to do what it likes with the file */
4036 
4037 		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
4038 			| FILE_SHARE_READ;
4039 	} else {
4040 
4041 		ib::error()
4042 			<< "Unknown file access type (" << access_type << ") "
4043 			<< "for file '" << name << "'";
4044 
4045 		return(OS_FILE_CLOSED);
4046 	}
4047 
4048 	file = CreateFile((LPCTSTR) name,
4049 			  access,
4050 			  share_mode,
4051 			  NULL,			// Security attributes
4052 			  create_flag,
4053 			  attributes,
4054 			  NULL);		// No template file
4055 
4056 	*success = (file != INVALID_HANDLE_VALUE);
4057 
4058 	return(file);
4059 }
4060 
4061 /** Deletes a file if it exists. The file has to be closed before calling this.
4062 @param[in]	name		file path as a null-terminated string
4063 @param[out]	exist		indicate if file pre-exist
4064 @return true if success */
4065 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4066 os_file_delete_if_exists_func(
4067 	const char*	name,
4068 	bool*		exist)
4069 {
4070 	ulint	count	= 0;
4071 
4072 	if (exist != NULL) {
4073 		*exist = true;
4074 	}
4075 
4076 	for (;;) {
4077 		/* In Windows, deleting an .ibd file may fail if
4078 		the file is being accessed by an external program,
4079 		such as a backup tool. */
4080 
4081 		bool	ret = DeleteFile((LPCTSTR) name);
4082 
4083 		if (ret) {
4084 			return(true);
4085 		}
4086 
4087 		DWORD	lasterr = GetLastError();
4088 
4089 		if (lasterr == ERROR_FILE_NOT_FOUND
4090 		    || lasterr == ERROR_PATH_NOT_FOUND) {
4091 
4092 			/* the file does not exist, this not an error */
4093 			if (exist != NULL) {
4094 				*exist = false;
4095 			}
4096 
4097 			return(true);
4098 		}
4099 
4100 		++count;
4101 
4102 		if (count > 100 && 0 == (count % 10)) {
4103 
4104 			/* Print error information */
4105 			os_file_get_last_error(true);
4106 
4107 			ib::warn() << "Delete of file '" << name << "' failed.";
4108 		}
4109 
4110 		/* Sleep for a second */
4111 		os_thread_sleep(1000000);
4112 
4113 		if (count > 2000) {
4114 
4115 			return(false);
4116 		}
4117 	}
4118 }
4119 
4120 /** Deletes a file. The file has to be closed before calling this.
4121 @param[in]	name		File path as NUL terminated string
4122 @return true if success */
4123 bool
os_file_delete_func(const char * name)4124 os_file_delete_func(
4125 	const char*	name)
4126 {
4127 	ulint	count	= 0;
4128 
4129 	for (;;) {
4130 		/* In Windows, deleting an .ibd file may fail if
4131 		the file is being accessed by an external program,
4132 		such as a backup tool. */
4133 
4134 		BOOL	ret = DeleteFile((LPCTSTR) name);
4135 
4136 		if (ret) {
4137 			return(true);
4138 		}
4139 
4140 		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
4141 			/* If the file does not exist, we classify this as
4142 			a 'mild' error and return */
4143 
4144 			return(false);
4145 		}
4146 
4147 		++count;
4148 
4149 		if (count > 100 && 0 == (count % 10)) {
4150 
4151 			/* print error information */
4152 			os_file_get_last_error(true);
4153 
4154 			ib::warn()
4155 				<< "Cannot delete file '" << name << "'. Is "
4156 				<< "another program accessing it?";
4157 		}
4158 
4159 		/* sleep for a second */
4160 		os_thread_sleep(1000000);
4161 
4162 		if (count > 2000) {
4163 
4164 			return(false);
4165 		}
4166 	}
4167 
4168 	ut_error;
4169 	return(false);
4170 }
4171 
4172 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
4173 function!
4174 Renames a file (can also move it to another directory). It is safest that the
4175 file is closed before calling this function.
4176 @param[in]	oldpath		old file path as a null-terminated string
4177 @param[in]	newpath		new file path
4178 @return true if success */
4179 bool
os_file_rename_func(const char * oldpath,const char * newpath)4180 os_file_rename_func(
4181 	const char*	oldpath,
4182 	const char*	newpath)
4183 {
4184 #ifdef UNIV_DEBUG
4185 	os_file_type_t	type;
4186 	bool		exists;
4187 
4188 	/* New path must not exist. */
4189 	ut_ad(os_file_status(newpath, &exists, &type));
4190 	ut_ad(!exists);
4191 
4192 	/* Old path must exist. */
4193 	ut_ad(os_file_status(oldpath, &exists, &type));
4194 	ut_ad(exists);
4195 #endif /* UNIV_DEBUG */
4196 
4197 	if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
4198 		return(true);
4199 	}
4200 
4201 	os_file_handle_rename_error(oldpath, newpath);
4202 	return(false);
4203 }
4204 
4205 /** NOTE! Use the corresponding macro os_file_close(), not directly
4206 this function!
4207 Closes a file handle. In case of error, error number can be retrieved with
4208 os_file_get_last_error.
4209 @param[in,own]	file		Handle to a file
4210 @return true if success */
4211 bool
os_file_close_func(os_file_t file)4212 os_file_close_func(
4213 	os_file_t	file)
4214 {
4215 	ut_a(file);
4216 
4217 	if (CloseHandle(file)) {
4218 		return(true);
4219 	}
4220 
4221 	os_file_handle_error(NULL, "close");
4222 
4223 	return(false);
4224 }
4225 
4226 /** Gets a file size.
4227 @param[in]	file		Handle to a file
4228 @return file size, or (os_offset_t) -1 on failure */
4229 os_offset_t
os_file_get_size(os_file_t file)4230 os_file_get_size(
4231 	os_file_t	file)
4232 {
4233 	DWORD		high;
4234 	DWORD		low = GetFileSize(file, &high);
4235 
4236 	if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
4237 		return((os_offset_t) -1);
4238 	}
4239 
4240 	return(os_offset_t(low | (os_offset_t(high) << 32)));
4241 }
4242 
4243 /** Gets a file size.
4244 @param[in]	filename	Full path to the filename to check
4245 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
4246 	errno */
4247 os_file_size_t
os_file_get_size(const char * filename)4248 os_file_get_size(
4249 	const char*	filename)
4250 {
4251 	struct __stat64	s;
4252 	os_file_size_t	file_size;
4253 
4254 	int		ret = _stat64(filename, &s);
4255 
4256 	if (ret == 0) {
4257 
4258 		file_size.m_total_size = s.st_size;
4259 
4260 		DWORD	low_size;
4261 		DWORD	high_size;
4262 
4263 		low_size = GetCompressedFileSize(filename, &high_size);
4264 
4265 		if (low_size != INVALID_FILE_SIZE) {
4266 
4267 			file_size.m_alloc_size = high_size;
4268 			file_size.m_alloc_size <<= 32;
4269 			file_size.m_alloc_size |= low_size;
4270 
4271 		} else {
4272 			ib::error()
4273 				<< "GetCompressedFileSize("
4274 				<< filename << ", ..) failed.";
4275 
4276 			file_size.m_alloc_size = (os_offset_t) -1;
4277 		}
4278 	} else {
4279 		file_size.m_total_size = ~0;
4280 		file_size.m_alloc_size = (os_offset_t) ret;
4281 	}
4282 
4283 	return(file_size);
4284 }
4285 
4286 /** This function returns information about the specified file
4287 @param[in]	path		pathname of the file
4288 @param[out]	stat_info	information of a file in a directory
4289 @param[in,out]	statinfo	information of a file in a directory
4290 @param[in]	check_rw_perm	for testing whether the file can be opened
4291 				in RW mode
4292 @param[in]	read_only	true if the file is opened in read-only mode
4293 @return DB_SUCCESS if all OK */
4294 static
4295 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)4296 os_file_get_status_win32(
4297 	const char*	path,
4298 	os_file_stat_t* stat_info,
4299 	struct _stat64*	statinfo,
4300 	bool		check_rw_perm,
4301 	bool		read_only)
4302 {
4303 	int	ret = _stat64(path, statinfo);
4304 
4305 	if (ret && (errno == ENOENT || errno == ENOTDIR
4306 		    || errno == ENAMETOOLONG)) {
4307 		/* file does not exist */
4308 
4309 		return(DB_NOT_FOUND);
4310 
4311 	} else if (ret) {
4312 		/* file exists, but stat call failed */
4313 
4314 		os_file_handle_error_no_exit(path, "STAT", false);
4315 
4316 		return(DB_FAIL);
4317 
4318 	} else if (_S_IFDIR & statinfo->st_mode) {
4319 
4320 		stat_info->type = OS_FILE_TYPE_DIR;
4321 
4322 	} else if (_S_IFREG & statinfo->st_mode) {
4323 
4324 		DWORD	access = GENERIC_READ;
4325 
4326 		if (!read_only) {
4327 			access |= GENERIC_WRITE;
4328 		}
4329 
4330 		stat_info->type = OS_FILE_TYPE_FILE;
4331 
4332 		/* Check if we can open it in read-only mode. */
4333 
4334 		if (check_rw_perm) {
4335 			HANDLE	fh;
4336 
4337 			fh = CreateFile(
4338 				(LPCTSTR) path,		// File to open
4339 				access,
4340 				FILE_SHARE_READ | FILE_SHARE_WRITE
4341 				| FILE_SHARE_DELETE,	// Full sharing
4342 				NULL,			// Default security
4343 				OPEN_EXISTING,		// Existing file only
4344 				FILE_ATTRIBUTE_NORMAL,	// Normal file
4345 				NULL);			// No attr. template
4346 
4347 			if (fh == INVALID_HANDLE_VALUE) {
4348 				stat_info->rw_perm = false;
4349 			} else {
4350 				stat_info->rw_perm = true;
4351 				CloseHandle(fh);
4352 			}
4353 		}
4354 		stat_info->block_size = 0;
4355 
4356 		/* What follows, is calculation of FS block size, which is not important
4357 		(it is just shown in I_S innodb tables). The error to calculate it will be ignored.*/
4358 		char	volname[MAX_PATH];
4359 		BOOL	result = GetVolumePathName(path, volname, MAX_PATH);
4360 		static	bool warned_once = false;
4361 		if (!result) {
4362 			if (!warned_once) {
4363 				ib::warn()
4364 					<< "os_file_get_status_win32: "
4365 					<< "Failed to get the volume path name for: "
4366 					<< path
4367 					<< "- OS error number " << GetLastError();
4368 				warned_once = true;
4369 			}
4370 			return(DB_SUCCESS);
4371 		}
4372 
4373 		DWORD	sectorsPerCluster;
4374 		DWORD	bytesPerSector;
4375 		DWORD	numberOfFreeClusters;
4376 		DWORD	totalNumberOfClusters;
4377 
4378 		result = GetDiskFreeSpace(
4379 			(LPCSTR) volname,
4380 			&sectorsPerCluster,
4381 			&bytesPerSector,
4382 			&numberOfFreeClusters,
4383 			&totalNumberOfClusters);
4384 
4385 		if (!result) {
4386 			if (!warned_once) {
4387 				ib::warn()
4388 					<< "GetDiskFreeSpace(" << volname << ",...) "
4389 					<< "failed "
4390 					<< "- OS error number " << GetLastError();
4391 				warned_once = true;
4392 			}
4393 			return(DB_SUCCESS);
4394 		}
4395 		stat_info->block_size = bytesPerSector * sectorsPerCluster;
4396 	} else {
4397 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
4398 	}
4399 
4400 	return(DB_SUCCESS);
4401 }
4402 
4403 /**
4404 Sets a sparse flag on Windows file.
4405 @param[in]	file  file handle
4406 @return true on success, false on error
4407 */
4408 #include <versionhelpers.h>
os_file_set_sparse_win32(os_file_t file,bool is_sparse)4409 bool os_file_set_sparse_win32(os_file_t file, bool is_sparse)
4410 {
4411 	if (!is_sparse && !IsWindows8OrGreater()) {
4412 		/* Cannot  unset sparse flag on older Windows.
4413 		Until Windows8 it is documented to produce unpredictable results,
4414 		if there are unallocated ranges in file.*/
4415 		return false;
4416 	}
4417 	DWORD temp;
4418 	FILE_SET_SPARSE_BUFFER sparse_buffer;
4419 	sparse_buffer.SetSparse = is_sparse;
4420 	return os_win32_device_io_control(file,
4421 		FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp);
4422 }
4423 
4424 
4425 /**
4426 Change file size on Windows.
4427 
4428 If file is extended, the bytes between old and new EOF
4429 are zeros.
4430 
4431 If file is sparse, "virtual" block is added at the end of
4432 allocated area.
4433 
4434 If file is normal, file system allocates storage.
4435 
4436 @param[in]	pathname	file path
4437 @param[in]	file		file handle
4438 @param[in]	size		size to preserve in bytes
4439 @return true if success */
4440 bool
os_file_change_size_win32(const char * pathname,os_file_t file,os_offset_t size)4441 os_file_change_size_win32(
4442 	const char*	pathname,
4443 	os_file_t	file,
4444 	os_offset_t	size)
4445 {
4446 	LARGE_INTEGER	length;
4447 
4448 	length.QuadPart = size;
4449 
4450 	BOOL	success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);
4451 
4452 	if (!success) {
4453 		os_file_handle_error_no_exit(
4454 			pathname, "SetFilePointerEx", false);
4455 	} else {
4456 		success = SetEndOfFile(file);
4457 		if (!success) {
4458 			os_file_handle_error_no_exit(
4459 				pathname, "SetEndOfFile", false);
4460 		}
4461 	}
4462 	return(success);
4463 }
4464 
4465 /** Truncates a file at its current position.
4466 @param[in]	file		Handle to be truncated
4467 @return true if success */
4468 bool
os_file_set_eof(FILE * file)4469 os_file_set_eof(
4470 	FILE*		file)
4471 {
4472 	HANDLE	h = (HANDLE) _get_osfhandle(fileno(file));
4473 
4474 	return(SetEndOfFile(h));
4475 }
4476 
4477 /** This function can be called if one wants to post a batch of reads and
4478 prefers an i/o-handler thread to handle them all at once later. You must
4479 call os_aio_simulated_wake_handler_threads later to ensure the threads
4480 are not left sleeping! */
4481 void
os_aio_simulated_put_read_threads_to_sleep()4482 os_aio_simulated_put_read_threads_to_sleep()
4483 {
4484 	AIO::simulated_put_read_threads_to_sleep();
4485 }
4486 
4487 /** This function can be called if one wants to post a batch of reads and
4488 prefers an i/o-handler thread to handle them all at once later. You must
4489 call os_aio_simulated_wake_handler_threads later to ensure the threads
4490 are not left sleeping! */
4491 void
simulated_put_read_threads_to_sleep()4492 AIO::simulated_put_read_threads_to_sleep()
4493 {
4494 	/* The idea of putting background IO threads to sleep is only for
4495 	Windows when using simulated AIO. Windows XP seems to schedule
4496 	background threads too eagerly to allow for coalescing during
4497 	readahead requests. */
4498 
4499 	if (srv_use_native_aio) {
4500 		/* We do not use simulated AIO: do nothing */
4501 
4502 		return;
4503 	}
4504 
4505 	os_aio_recommend_sleep_for_read_threads	= true;
4506 
4507 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4508 		AIO*	array;
4509 
4510 		get_array_and_local_segment(&array, i);
4511 
4512 		if (array == s_reads) {
4513 
4514 			os_event_reset(os_aio_segment_wait_events[i]);
4515 		}
4516 	}
4517 }
4518 
4519 #endif /* !_WIN32*/
4520 
4521 /** Does a syncronous read or write depending upon the type specified
4522 In case of partial reads/writes the function tries
4523 NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
4524 @param[in]	type,		IO flags
4525 @param[in]	file		handle to an open file
4526 @param[out]	buf		buffer where to read
4527 @param[in]	offset		file offset from the start where to read
4528 @param[in]	n		number of bytes to read, starting from offset
4529 @param[out]	err		DB_SUCCESS or error code
4530 @return number of bytes read/written, -1 if error */
4531 static MY_ATTRIBUTE((warn_unused_result))
4532 ssize_t
os_file_io(const IORequest & in_type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)4533 os_file_io(
4534 	const IORequest&in_type,
4535 	os_file_t	file,
4536 	void*		buf,
4537 	ulint		n,
4538 	os_offset_t	offset,
4539 	dberr_t*	err)
4540 {
4541 	ssize_t		original_n = ssize_t(n);
4542 	IORequest	type = in_type;
4543 	ssize_t		bytes_returned = 0;
4544 
4545 	SyncFileIO	sync_file_io(file, buf, n, offset);
4546 
4547 	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
4548 
4549 		ssize_t	n_bytes = sync_file_io.execute(type);
4550 
4551 		/* Check for a hard error. Not much we can do now. */
4552 		if (n_bytes < 0) {
4553 
4554 			break;
4555 
4556 		} else if (n_bytes + bytes_returned == ssize_t(n)) {
4557 
4558 			bytes_returned += n_bytes;
4559 
4560 			if (offset > 0
4561 			    && !type.is_log()
4562 			    && type.is_write()
4563 			    && type.punch_hole()) {
4564 				*err = type.punch_hole(file, offset, n);
4565 
4566 			} else {
4567 				*err = DB_SUCCESS;
4568 			}
4569 
4570 			return(original_n);
4571 		}
4572 
4573 		/* Handle partial read/write. */
4574 
4575 		ut_ad(ulint(n_bytes + bytes_returned) < n);
4576 
4577 		bytes_returned += n_bytes;
4578 
4579 		if (!type.is_partial_io_warning_disabled()) {
4580 
4581 			const char*	op = type.is_read()
4582 				? "read" : "written";
4583 
4584 			ib::warn()
4585 				<< n
4586 				<< " bytes should have been " << op << ". Only "
4587 				<< bytes_returned
4588 				<< " bytes " << op << ". Retrying"
4589 				<< " for the remaining bytes.";
4590 		}
4591 
4592 		/* Advance the offset and buffer by n_bytes */
4593 		sync_file_io.advance(n_bytes);
4594 	}
4595 
4596 	*err = DB_IO_ERROR;
4597 
4598 	if (!type.is_partial_io_warning_disabled()) {
4599 		ib::warn()
4600 			<< "Retry attempts for "
4601 			<< (type.is_read() ? "reading" : "writing")
4602 			<< " partial data failed.";
4603 	}
4604 
4605 	return(bytes_returned);
4606 }
4607 
4608 /** Does a synchronous write operation in Posix.
4609 @param[in]	type		IO context
4610 @param[in]	file		handle to an open file
4611 @param[out]	buf		buffer from which to write
4612 @param[in]	n		number of bytes to read, starting from offset
4613 @param[in]	offset		file offset from the start where to read
4614 @param[out]	err		DB_SUCCESS or error code
4615 @return number of bytes written, -1 if error */
4616 static MY_ATTRIBUTE((warn_unused_result))
4617 ssize_t
os_file_pwrite(const IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)4618 os_file_pwrite(
4619 	const IORequest&	type,
4620 	os_file_t		file,
4621 	const byte*		buf,
4622 	ulint			n,
4623 	os_offset_t		offset,
4624 	dberr_t*		err)
4625 {
4626 	ut_ad(type.validate());
4627 	ut_ad(type.is_write());
4628 
4629 	++os_n_file_writes;
4630 
4631 	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
4632 	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
4633 	ssize_t	n_bytes = os_file_io(type, file, const_cast<byte*>(buf),
4634 				     n, offset, err);
4635 	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
4636 
4637 	return(n_bytes);
4638 }
4639 
4640 /** NOTE! Use the corresponding macro os_file_write(), not directly
4641 Requests a synchronous write operation.
4642 @param[in]	type		IO flags
4643 @param[in]	file		handle to an open file
4644 @param[out]	buf		buffer from which to write
4645 @param[in]	offset		file offset from the start where to read
4646 @param[in]	n		number of bytes to read, starting from offset
4647 @return error code
4648 @retval	DB_SUCCESS	if the operation succeeded */
4649 dberr_t
os_file_write_func(const IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)4650 os_file_write_func(
4651 	const IORequest&	type,
4652 	const char*		name,
4653 	os_file_t		file,
4654 	const void*		buf,
4655 	os_offset_t		offset,
4656 	ulint			n)
4657 {
4658 	dberr_t		err;
4659 
4660 	ut_ad(type.validate());
4661 	ut_ad(n > 0);
4662 
4663 	WAIT_ALLOW_WRITES();
4664 
4665 	ssize_t	n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err);
4666 
4667 	if ((ulint) n_bytes != n && !os_has_said_disk_full) {
4668 
4669 		ib::error()
4670 			<< "Write to file " << name << " failed at offset "
4671 			<< offset << ", " << n
4672 			<< " bytes should have been written,"
4673 			" only " << n_bytes << " were written."
4674 			" Operating system error number " << IF_WIN(GetLastError(),errno) << "."
4675 			" Check that your OS and file system"
4676 			" support files of this size."
4677 			" Check also that the disk is not full"
4678 			" or a disk quota exceeded.";
4679 #ifndef _WIN32
4680 		if (strerror(errno) != NULL) {
4681 
4682 			ib::error()
4683 				<< "Error number " << errno
4684 				<< " means '" << strerror(errno) << "'";
4685 		}
4686 
4687 		ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4688 #endif
4689 		os_has_said_disk_full = true;
4690 	}
4691 
4692 	return(err);
4693 }
4694 
4695 /** Does a synchronous read operation in Posix.
4696 @param[in]	type		IO flags
4697 @param[in]	file		handle to an open file
4698 @param[out]	buf		buffer where to read
4699 @param[in]	offset		file offset from the start where to read
4700 @param[in]	n		number of bytes to read, starting from offset
4701 @param[out]	err		DB_SUCCESS or error code
4702 @return number of bytes read, -1 if error */
4703 static MY_ATTRIBUTE((warn_unused_result))
4704 ssize_t
os_file_pread(const IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)4705 os_file_pread(
4706 	const IORequest&	type,
4707 	os_file_t		file,
4708 	void*			buf,
4709 	ulint			n,
4710 	os_offset_t		offset,
4711 	dberr_t*		err)
4712 {
4713 	ut_ad(type.is_read());
4714 
4715 	++os_n_file_reads;
4716 
4717 	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
4718 	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
4719 	ssize_t	n_bytes = os_file_io(type, file, buf, n, offset, err);
4720 	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
4721 
4722 	return(n_bytes);
4723 }
4724 
4725 /** Requests a synchronous positioned read operation.
4726 @return DB_SUCCESS if request was successful, false if fail
4727 @param[in]	type		IO flags
4728 @param[in]	file		handle to an open file
4729 @param[out]	buf		buffer where to read
4730 @param[in]	offset		file offset from the start where to read
4731 @param[in]	n		number of bytes to read, starting from offset
4732 @param[out]	o		number of bytes actually read
4733 @param[in]	exit_on_err	if true then exit on error
4734 @return DB_SUCCESS or error code */
4735 static MY_ATTRIBUTE((warn_unused_result))
4736 dberr_t
os_file_read_page(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err)4737 os_file_read_page(
4738 	const IORequest&	type,
4739 	os_file_t		file,
4740 	void*			buf,
4741 	os_offset_t		offset,
4742 	ulint			n,
4743 	ulint*			o,
4744 	bool			exit_on_err)
4745 {
4746 	dberr_t		err;
4747 
4748 	os_bytes_read_since_printout += n;
4749 
4750 	ut_ad(type.validate());
4751 	ut_ad(n > 0);
4752 
4753 	ssize_t	n_bytes = os_file_pread(type, file, buf, n, offset, &err);
4754 
4755 	if (o) {
4756 		*o = n_bytes;
4757 	}
4758 
4759 	if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) {
4760 		return err;
4761 	}
4762 
4763 	ib::error() << "Tried to read " << n << " bytes at offset "
4764 		    << offset << ", but was only able to read " << n_bytes;
4765 
4766 	if (!os_file_handle_error_cond_exit(
4767 		    NULL, "read", exit_on_err, false)) {
4768 		ib::fatal()
4769 			<< "Cannot read from file. OS error number "
4770 			<< errno << ".";
4771 	}
4772 
4773 	if (err == DB_SUCCESS) {
4774 		err = DB_IO_ERROR;
4775 	}
4776 
4777 	return err;
4778 }
4779 
4780 /** Retrieves the last error number if an error occurs in a file io function.
4781 The number should be retrieved before any other OS calls (because they may
4782 overwrite the error number). If the number is not known to this program,
4783 the OS error number + 100 is returned.
4784 @param[in]	report_all_errors	true if we want an error printed
4785 					for all errors
4786 @return error number, or OS error number + 100 */
4787 ulint
os_file_get_last_error(bool report_all_errors)4788 os_file_get_last_error(
4789 	bool	report_all_errors)
4790 {
4791 	return(os_file_get_last_error_low(report_all_errors, false));
4792 }
4793 
4794 /** Handle errors for file operations.
4795 @param[in]	name		name of a file or NULL
4796 @param[in]	operation	operation
4797 @param[in]	should_abort	whether to abort on an unknown error
4798 @param[in]	on_error_silent	whether to suppress reports of non-fatal errors
4799 @return true if we should retry the operation */
4800 static MY_ATTRIBUTE((warn_unused_result))
4801 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_abort,bool on_error_silent)4802 os_file_handle_error_cond_exit(
4803 	const char*	name,
4804 	const char*	operation,
4805 	bool		should_abort,
4806 	bool		on_error_silent)
4807 {
4808 	ulint	err;
4809 
4810 	err = os_file_get_last_error_low(false, on_error_silent);
4811 
4812 	switch (err) {
4813 	case OS_FILE_DISK_FULL:
4814 		/* We only print a warning about disk full once */
4815 
4816 		if (os_has_said_disk_full) {
4817 
4818 			return(false);
4819 		}
4820 
4821 		/* Disk full error is reported irrespective of the
4822 		on_error_silent setting. */
4823 
4824 		if (name) {
4825 
4826 			ib::error()
4827 				<< "Encountered a problem with file '"
4828 				<< name << "'";
4829 		}
4830 
4831 		ib::error()
4832 			<< "Disk is full. Try to clean the disk to free space.";
4833 
4834 		os_has_said_disk_full = true;
4835 
4836 		return(false);
4837 
4838 	case OS_FILE_AIO_RESOURCES_RESERVED:
4839 	case OS_FILE_AIO_INTERRUPTED:
4840 
4841 		return(true);
4842 
4843 	case OS_FILE_PATH_ERROR:
4844 	case OS_FILE_ALREADY_EXISTS:
4845 	case OS_FILE_ACCESS_VIOLATION:
4846 
4847 		return(false);
4848 
4849 	case OS_FILE_SHARING_VIOLATION:
4850 
4851 		os_thread_sleep(10000000);	/* 10 sec */
4852 		return(true);
4853 
4854 	case OS_FILE_OPERATION_ABORTED:
4855 	case OS_FILE_INSUFFICIENT_RESOURCE:
4856 
4857 		os_thread_sleep(100000);	/* 100 ms */
4858 		return(true);
4859 
4860 	default:
4861 
4862 		/* If it is an operation that can crash on error then it
4863 		is better to ignore on_error_silent and print an error message
4864 		to the log. */
4865 
4866 		if (should_abort || !on_error_silent) {
4867 			ib::error() << "File "
4868 				<< (name != NULL ? name : "(unknown)")
4869 				<< ": '" << operation << "'"
4870 				" returned OS error " << err << "."
4871 				<< (should_abort
4872 				    ? " Cannot continue operation" : "");
4873 		}
4874 
4875 		if (should_abort) {
4876 			abort();
4877 		}
4878 	}
4879 
4880 	return(false);
4881 }
4882 
4883 #ifndef _WIN32
4884 /** Tries to disable OS caching on an opened file descriptor.
4885 @param[in]	fd		file descriptor to alter
4886 @param[in]	file_name	file name, used in the diagnostic message
4887 @param[in]	name		"open" or "create"; used in the diagnostic
4888 				message */
4889 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))4890 os_file_set_nocache(
4891 	int	fd		MY_ATTRIBUTE((unused)),
4892 	const char*	file_name	MY_ATTRIBUTE((unused)),
4893 	const char*	operation_name	MY_ATTRIBUTE((unused)))
4894 {
4895 	/* some versions of Solaris may not have DIRECTIO_ON */
4896 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
4897 	if (directio(fd, DIRECTIO_ON) == -1) {
4898 		int	errno_save = errno;
4899 
4900 		ib::error()
4901 			<< "Failed to set DIRECTIO_ON on file "
4902 			<< file_name << "; " << operation_name << ": "
4903 			<< strerror(errno_save) << ","
4904 			" continuing anyway.";
4905 	}
4906 #elif defined(O_DIRECT)
4907 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
4908 		int		errno_save = errno;
4909 		static bool	warning_message_printed = false;
4910 		if (errno_save == EINVAL) {
4911 			if (!warning_message_printed) {
4912 				warning_message_printed = true;
4913 # ifdef UNIV_LINUX
4914 				ib::warn()
4915 					<< "Failed to set O_DIRECT on file"
4916 					<< file_name << "; " << operation_name
4917 					<< ": " << strerror(errno_save) << ", "
4918 					"continuing anyway. O_DIRECT is "
4919 					"known to result in 'Invalid argument' "
4920 					"on Linux on tmpfs, "
4921 					"see MySQL Bug#26662.";
4922 # else /* UNIV_LINUX */
4923 				goto short_warning;
4924 # endif /* UNIV_LINUX */
4925 			}
4926 		} else {
4927 # ifndef UNIV_LINUX
4928 short_warning:
4929 # endif
4930 			ib::warn()
4931 				<< "Failed to set O_DIRECT on file "
4932 				<< file_name << "; " << operation_name
4933 				<< " : " << strerror(errno_save)
4934 				<< ", continuing anyway.";
4935 		}
4936 	}
4937 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
4938 }
4939 
4940 #endif /* _WIN32 */
4941 
4942 /** Check if the file system supports sparse files.
4943 @param fh	file handle
4944 @return true if the file system supports sparse files */
os_is_sparse_file_supported(os_file_t fh)4945 IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh)
4946 {
4947 	/* In this debugging mode, we act as if punch hole is supported,
4948 	then we skip any calls to actually punch a hole.  In this way,
4949 	Transparent Page Compression is still being tested. */
4950 	DBUG_EXECUTE_IF("ignore_punch_hole",
4951 		return(true);
4952 	);
4953 
4954 #ifdef _WIN32
4955 	FILE_ATTRIBUTE_TAG_INFO info;
4956 	if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
4957 		&info, (DWORD)sizeof(info))) {
4958 		if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) {
4959 			return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
4960 		}
4961 	}
4962 	return false;
4963 #else
4964 	/* We don't know the FS block size, use the sector size. The FS
4965 	will do the magic. */
4966 	return DB_SUCCESS == os_file_punch_hole_posix(fh, 0, srv_page_size);
4967 #endif /* _WIN32 */
4968 }
4969 
4970 /** Extend a file.
4971 
4972 On Windows, extending a file allocates blocks for the file,
4973 unless the file is sparse.
4974 
4975 On Unix, we will extend the file with ftruncate(), if
4976 file needs to be sparse. Otherwise posix_fallocate() is used
4977 when available, and if not, binary zeroes are added to the end
4978 of file.
4979 
4980 @param[in]	name	file name
4981 @param[in]	file	file handle
4982 @param[in]	size	desired file size
4983 @param[in]	sparse	whether to create a sparse file (no preallocating)
4984 @return	whether the operation succeeded */
4985 bool
os_file_set_size(const char * name,os_file_t file,os_offset_t size,bool is_sparse)4986 os_file_set_size(
4987 	const char*	name,
4988 	os_file_t	file,
4989 	os_offset_t	size,
4990 	bool	is_sparse)
4991 {
4992 	ut_ad(!(size & 4095));
4993 
4994 #ifdef _WIN32
4995 	/* On Windows, changing file size works well and as expected for both
4996 	sparse and normal files.
4997 
4998 	However, 10.2 up until 10.2.9 made every file sparse in innodb,
4999 	causing NTFS fragmentation issues(MDEV-13941). We try to undo
5000 	the damage, and unsparse the file.*/
5001 
5002 	if (!is_sparse && os_is_sparse_file_supported(file)) {
5003 		if (!os_file_set_sparse_win32(file, false))
5004 			/* Unsparsing file failed. Fallback to writing binary
5005 			zeros, to avoid even higher fragmentation.*/
5006 			goto fallback;
5007 	}
5008 
5009 	return os_file_change_size_win32(name, file, size);
5010 
5011 fallback:
5012 #else
5013 	struct stat statbuf;
5014 
5015 	if (is_sparse) {
5016 		bool success = !ftruncate(file, size);
5017 		if (!success) {
5018 			ib::error() << "ftruncate of file " << name << " to "
5019 				    << size << " bytes failed with error "
5020 				    << errno;
5021 		}
5022 		return(success);
5023 	}
5024 
5025 # ifdef HAVE_POSIX_FALLOCATE
5026 	int err;
5027 	do {
5028 		if (fstat(file, &statbuf)) {
5029 			err = errno;
5030 		} else {
5031 			os_offset_t current_size = statbuf.st_size;
5032 			if (current_size >= size) {
5033 				return true;
5034 			}
5035 			current_size &= ~4095ULL;
5036 			err = posix_fallocate(file, current_size,
5037 					      size - current_size);
5038 		}
5039 	} while (err == EINTR
5040 		 && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
5041 
5042 	switch (err) {
5043 	case 0:
5044 		return true;
5045 	default:
5046 		ib::error() << "preallocating "
5047 			    << size << " bytes for file " << name
5048 			    << " failed with error " << err;
5049 		/* fall through */
5050 	case EINTR:
5051 		errno = err;
5052 		return false;
5053 	case EINVAL:
5054 	case EOPNOTSUPP:
5055 		/* fall back to the code below */
5056 		break;
5057 	}
5058 # endif /* HAVE_POSIX_ALLOCATE */
5059 #endif /* _WIN32*/
5060 
5061 #ifdef _WIN32
5062 	os_offset_t	current_size = os_file_get_size(file);
5063 	FILE_STORAGE_INFO info;
5064 	if (GetFileInformationByHandleEx(file, FileStorageInfo, &info,
5065 					 sizeof info)) {
5066 		if (info.LogicalBytesPerSector) {
5067 			current_size &= ~os_offset_t(info.LogicalBytesPerSector
5068 						     - 1);
5069 		}
5070 	}
5071 #else
5072 	if (fstat(file, &statbuf)) {
5073 		return false;
5074 	}
5075 	os_offset_t current_size = statbuf.st_size & ~4095ULL;
5076 #endif
5077 	if (current_size >= size) {
5078 		return true;
5079 	}
5080 
5081 	/* Write up to 1 megabyte at a time. */
5082 	ulint	buf_size = ut_min(ulint(64),
5083 				  ulint(size >> srv_page_size_shift))
5084 		<< srv_page_size_shift;
5085 
5086 	/* Align the buffer for possible raw i/o */
5087 	byte*	buf2;
5088 
5089 	buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + srv_page_size));
5090 
5091 	byte*	buf = static_cast<byte*>(ut_align(buf2, srv_page_size));
5092 
5093 	/* Write buffer full of zeros */
5094 	memset(buf, 0, buf_size);
5095 
5096 	while (current_size < size
5097 	       && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
5098 		ulint	n_bytes;
5099 
5100 		if (size - current_size < (os_offset_t) buf_size) {
5101 			n_bytes = (ulint) (size - current_size);
5102 		} else {
5103 			n_bytes = buf_size;
5104 		}
5105 
5106 		dberr_t		err;
5107 		IORequest	request(IORequest::WRITE);
5108 
5109 		err = os_file_write(
5110 			request, name, file, buf, current_size, n_bytes);
5111 
5112 		if (err != DB_SUCCESS) {
5113 			break;
5114 		}
5115 
5116 		current_size += n_bytes;
5117 	}
5118 
5119 	ut_free(buf2);
5120 
5121 	return(current_size >= size && os_file_flush(file));
5122 }
5123 
5124 /** Truncate a file to a specified size in bytes.
5125 @param[in]	pathname	file path
5126 @param[in]	file		file to be truncated
5127 @param[in]	size		size preserved in bytes
5128 @param[in]	allow_shrink	whether to allow the file to become smaller
5129 @return true if success */
5130 bool
os_file_truncate(const char * pathname,os_file_t file,os_offset_t size,bool allow_shrink)5131 os_file_truncate(
5132 	const char*	pathname,
5133 	os_file_t	file,
5134 	os_offset_t	size,
5135 	bool		allow_shrink)
5136 {
5137 	if (!allow_shrink) {
5138 		/* Do nothing if the size preserved is larger than or
5139 		equal to the current size of file */
5140 		os_offset_t	size_bytes = os_file_get_size(file);
5141 
5142 		if (size >= size_bytes) {
5143 			return(true);
5144 		}
5145 	}
5146 
5147 #ifdef _WIN32
5148 	return(os_file_change_size_win32(pathname, file, size));
5149 #else /* _WIN32 */
5150 	return(os_file_truncate_posix(pathname, file, size));
5151 #endif /* _WIN32 */
5152 }
5153 
5154 /** NOTE! Use the corresponding macro os_file_read(), not directly this
5155 function!
5156 Requests a synchronous positioned read operation.
5157 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5158 @param[in]	type		IO flags
5159 @param[in]	file		handle to an open file
5160 @param[out]	buf		buffer where to read
5161 @param[in]	offset		file offset from the start where to read
5162 @param[in]	n		number of bytes to read, starting from offset
5163 @return error code
5164 @retval	DB_SUCCESS	if the operation succeeded */
5165 dberr_t
os_file_read_func(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)5166 os_file_read_func(
5167 	const IORequest&	type,
5168 	os_file_t		file,
5169 	void*			buf,
5170 	os_offset_t		offset,
5171 	ulint			n)
5172 {
5173 	return(os_file_read_page(type, file, buf, offset, n, NULL, true));
5174 }
5175 
5176 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
5177 not directly this function!
5178 Requests a synchronous positioned read operation.
5179 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5180 @param[in]	type		IO flags
5181 @param[in]	file		handle to an open file
5182 @param[out]	buf		buffer where to read
5183 @param[in]	offset		file offset from the start where to read
5184 @param[in]	n		number of bytes to read, starting from offset
5185 @param[out]	o		number of bytes actually read
5186 @return DB_SUCCESS or error code */
5187 dberr_t
os_file_read_no_error_handling_func(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)5188 os_file_read_no_error_handling_func(
5189 	const IORequest&	type,
5190 	os_file_t		file,
5191 	void*			buf,
5192 	os_offset_t		offset,
5193 	ulint			n,
5194 	ulint*			o)
5195 {
5196 	return(os_file_read_page(type, file, buf, offset, n, o, false));
5197 }
5198 
5199 /** Check the existence and type of the given file.
5200 @param[in]	path		path name of file
5201 @param[out]	exists		true if the file exists
5202 @param[out]	type		Type of the file, if it exists
5203 @return true if call succeeded */
5204 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)5205 os_file_status(
5206 	const char*	path,
5207 	bool*		exists,
5208 	os_file_type_t* type)
5209 {
5210 #ifdef _WIN32
5211 	return(os_file_status_win32(path, exists, type));
5212 #else
5213 	return(os_file_status_posix(path, exists, type));
5214 #endif /* _WIN32 */
5215 }
5216 
5217 /** Free storage space associated with a section of the file.
5218 @param[in]	fh		Open file handle
5219 @param[in]	off		Starting offset (SEEK_SET)
5220 @param[in]	len		Size of the hole
5221 @return DB_SUCCESS or error code */
5222 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)5223 os_file_punch_hole(
5224 	os_file_t	fh,
5225 	os_offset_t	off,
5226 	os_offset_t	len)
5227 {
5228 #ifdef _WIN32
5229 	return os_file_punch_hole_win32(fh, off, len);
5230 #else
5231 	return os_file_punch_hole_posix(fh, off, len);
5232 #endif /* _WIN32 */
5233 }
5234 
should_punch_hole() const5235 inline bool IORequest::should_punch_hole() const
5236 {
5237 	return m_fil_node && m_fil_node->space->punch_hole;
5238 }
5239 
5240 /** Free storage space associated with a section of the file.
5241 @param[in]	fh		Open file handle
5242 @param[in]	off		Starting offset (SEEK_SET)
5243 @param[in]	len		Size of the hole
5244 @return DB_SUCCESS or error code */
5245 dberr_t
punch_hole(os_file_t fh,os_offset_t off,ulint len)5246 IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
5247 {
5248 	/* In this debugging mode, we act as if punch hole is supported,
5249 	and then skip any calls to actually punch a hole here.
5250 	In this way, Transparent Page Compression is still being tested. */
5251 	DBUG_EXECUTE_IF("ignore_punch_hole",
5252 		return(DB_SUCCESS);
5253 	);
5254 
5255 	ulint trim_len = get_trim_length(len);
5256 
5257 	if (trim_len == 0) {
5258 		return(DB_SUCCESS);
5259 	}
5260 
5261 	off += len;
5262 
5263 	/* Check does file system support punching holes for this
5264 	tablespace. */
5265 	if (!should_punch_hole()) {
5266 		return DB_IO_NO_PUNCH_HOLE;
5267 	}
5268 
5269 	dberr_t err = os_file_punch_hole(fh, off, trim_len);
5270 
5271 	if (err == DB_SUCCESS) {
5272 		srv_stats.page_compressed_trim_op.inc();
5273 	} else {
5274 		/* If punch hole is not supported,
5275 		set space so that it is not used. */
5276 		if (err == DB_IO_NO_PUNCH_HOLE) {
5277 			if (m_fil_node) {
5278 				m_fil_node->space->punch_hole = false;
5279 			}
5280 			err = DB_SUCCESS;
5281 		}
5282 	}
5283 
5284 	return (err);
5285 }
5286 
5287 /** This function returns information about the specified file
5288 @param[in]	path		pathname of the file
5289 @param[out]	stat_info	information of a file in a directory
5290 @param[in]	check_rw_perm	for testing whether the file can be opened
5291 				in RW mode
5292 @param[in]	read_only	true if file is opened in read-only mode
5293 @return DB_SUCCESS if all OK */
5294 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)5295 os_file_get_status(
5296 	const char*	path,
5297 	os_file_stat_t* stat_info,
5298 	bool		check_rw_perm,
5299 	bool		read_only)
5300 {
5301 	dberr_t	ret;
5302 
5303 #ifdef _WIN32
5304 	struct _stat64	info;
5305 
5306 	ret = os_file_get_status_win32(
5307 		path, stat_info, &info, check_rw_perm, read_only);
5308 
5309 #else
5310 	struct stat	info;
5311 
5312 	ret = os_file_get_status_posix(
5313 		path, stat_info, &info, check_rw_perm, read_only);
5314 
5315 #endif /* _WIN32 */
5316 
5317 	if (ret == DB_SUCCESS) {
5318 		stat_info->ctime = info.st_ctime;
5319 		stat_info->atime = info.st_atime;
5320 		stat_info->mtime = info.st_mtime;
5321 		stat_info->size  = info.st_size;
5322 	}
5323 
5324 	return(ret);
5325 }
5326 
5327 /**
5328 Waits for an AIO operation to complete. This function is used to wait the
5329 for completed requests. The aio array of pending requests is divided
5330 into segments. The thread specifies which segment or slot it wants to wait
5331 for. NOTE: this function will also take care of freeing the aio slot,
5332 therefore no other thread is allowed to do the freeing!
5333 @param[in]	segment		The number of the segment in the aio arrays to
5334 				wait for; segment 0 is the ibuf I/O thread,
5335 				segment 1 the log I/O thread, then follow the
5336 				non-ibuf read threads, and as the last are the
5337 				non-ibuf write threads; if this is
5338 				ULINT_UNDEFINED, then it means that sync AIO
5339 				is used, and this parameter is ignored
5340 @param[out]	m1		the messages passed with the AIO request; note
5341 				that also in the case where the AIO operation
5342 				failed, these output parameters are valid and
5343 				can be used to restart the operation,
5344 				for example
5345 @param[out]	m2		callback message
5346 @param[out]	type		OS_FILE_WRITE or ..._READ
5347 @return DB_SUCCESS or error code */
5348 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)5349 os_aio_handler(
5350 	ulint		segment,
5351 	fil_node_t**	m1,
5352 	void**		m2,
5353 	IORequest*	request)
5354 {
5355 	dberr_t	err;
5356 
5357 	if (srv_use_native_aio) {
5358 		srv_set_io_thread_op_info(segment, "native aio handle");
5359 
5360 #ifdef WIN_ASYNC_IO
5361 
5362 		err = os_aio_windows_handler(segment, 0, m1, m2, request);
5363 
5364 #elif defined(LINUX_NATIVE_AIO)
5365 
5366 		err = os_aio_linux_handler(segment, m1, m2, request);
5367 
5368 #else
5369 		ut_error;
5370 
5371 		err = DB_ERROR; /* Eliminate compiler warning */
5372 
5373 #endif /* WIN_ASYNC_IO */
5374 
5375 	} else {
5376 		srv_set_io_thread_op_info(segment, "simulated aio handle");
5377 
5378 		err = os_aio_simulated_handler(segment, m1, m2, request);
5379 	}
5380 
5381 	return(err);
5382 }
5383 
5384 #ifdef WIN_ASYNC_IO
new_completion_port()5385 static HANDLE new_completion_port()
5386 {
5387 	HANDLE h = CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0);
5388 	ut_a(h);
5389 	return h;
5390 }
5391 #endif
5392 
5393 /** Constructor
5394 @param[in]	id		The latch ID
5395 @param[in]	n		Number of AIO slots
5396 @param[in]	segments	Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)5397 AIO::AIO(
5398 	latch_id_t	id,
5399 	ulint		n,
5400 	ulint		segments)
5401 	:
5402 	m_slots(n),
5403 	m_n_segments(segments),
5404 	m_n_reserved()
5405 # ifdef LINUX_NATIVE_AIO
5406 	,m_events(m_slots.size())
5407 # endif /* LINUX_NATIVE_AIO */
5408 #ifdef WIN_ASYNC_IO
5409 	,m_completion_port(new_completion_port())
5410 #endif
5411 {
5412 	ut_a(n > 0);
5413 	ut_a(m_n_segments > 0);
5414 
5415 	mutex_create(id, &m_mutex);
5416 
5417 	m_not_full = os_event_create("aio_not_full");
5418 	m_is_empty = os_event_create("aio_is_empty");
5419 
5420 	memset((void*)&m_slots[0], 0x0, sizeof(m_slots[0]) * m_slots.size());
5421 #ifdef LINUX_NATIVE_AIO
5422 	memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
5423 #endif /* LINUX_NATIVE_AIO */
5424 
5425 	os_event_set(m_is_empty);
5426 }
5427 
5428 /** Initialise the slots */
5429 dberr_t
init_slots()5430 AIO::init_slots()
5431 {
5432 	for (ulint i = 0; i < m_slots.size(); ++i) {
5433 		Slot&	slot = m_slots[i];
5434 
5435 		slot.pos = static_cast<uint16_t>(i);
5436 
5437 		slot.is_reserved = false;
5438 
5439 #ifdef WIN_ASYNC_IO
5440 
5441 		slot.array = this;
5442 
5443 #elif defined(LINUX_NATIVE_AIO)
5444 
5445 		slot.ret = 0;
5446 
5447 		slot.n_bytes = 0;
5448 
5449 		memset(&slot.control, 0x0, sizeof(slot.control));
5450 
5451 #endif /* WIN_ASYNC_IO */
5452 	}
5453 
5454 	return(DB_SUCCESS);
5455 }
5456 
5457 #ifdef LINUX_NATIVE_AIO
5458 /** Initialise the Linux Native AIO interface */
5459 dberr_t
init_linux_native_aio()5460 AIO::init_linux_native_aio()
5461 {
5462 
5463 	/* Initialize the io_context_t array. One io_context_t
5464 	per segment in the array. */
5465 	m_aio_ctx.resize(get_n_segments());
5466 
5467 	ulint		max_events = slots_per_segment();
5468 
5469 	for (std::vector<io_context_t>::iterator it = m_aio_ctx.begin(),
5470 						 end = m_aio_ctx.end();
5471 	     it != end; ++it) {
5472 
5473 		if (!linux_create_io_ctx(max_events, *it)) {
5474 			/* If something bad happened during aio setup
5475 			we disable linux native aio.
5476 			This frequently happens when running the test suite
5477 			with many threads on a system with low fs.aio-max-nr!
5478 			*/
5479 
5480 			ib::warn()
5481 				<< "Warning: Linux Native AIO disabled "
5482 				<< "because _linux_create_io_ctx() "
5483 				<< "failed. To get rid of this warning you can "
5484 				<< "try increasing system "
5485 				<< "fs.aio-max-nr to 1048576 or larger or "
5486 				<< "setting innodb_use_native_aio = 0 in my.cnf";
5487 
5488 			for (std::vector<io_context_t>::iterator it2
5489 			     = m_aio_ctx.begin();
5490 			     it2 != it; ++it2) {
5491 				int ret = io_destroy(*it2);
5492 				ut_a(ret != -EINVAL);
5493 			}
5494 
5495 			m_aio_ctx.clear();
5496 			srv_use_native_aio = FALSE;
5497 			return(DB_SUCCESS);
5498 		}
5499 	}
5500 
5501 	return(DB_SUCCESS);
5502 }
5503 #endif /* LINUX_NATIVE_AIO */
5504 
5505 /** Initialise the array */
5506 dberr_t
init()5507 AIO::init()
5508 {
5509 	ut_a(!m_slots.empty());
5510 
5511 
5512 	if (srv_use_native_aio) {
5513 #ifdef LINUX_NATIVE_AIO
5514 		dberr_t	err = init_linux_native_aio();
5515 
5516 		if (err != DB_SUCCESS) {
5517 			return(err);
5518 		}
5519 
5520 #endif /* LINUX_NATIVE_AIO */
5521 	}
5522 
5523 	return(init_slots());
5524 }
5525 
5526 /** Creates an aio wait array. Note that we return NULL in case of failure.
5527 We don't care about freeing memory here because we assume that a
5528 failure will result in server refusing to start up.
5529 @param[in]	id		Latch ID
5530 @param[in]	n		maximum number of pending AIO operations
5531 				allowed; n must be divisible by m_n_segments
5532 @param[in]	n_segments	number of segments in the AIO array
5533 @return own: AIO array, NULL on failure */
5534 AIO*
create(latch_id_t id,ulint n,ulint n_segments)5535 AIO::create(
5536 	latch_id_t	id,
5537 	ulint		n,
5538 	ulint		n_segments)
5539 {
5540 	if ((n % n_segments)) {
5541 
5542 		ib::error()
5543 			<< "Maximum number of AIO operations must be "
5544 			<< "divisible by number of segments";
5545 
5546 		return(NULL);
5547 	}
5548 
5549 	AIO*	array = UT_NEW_NOKEY(AIO(id, n, n_segments));
5550 
5551 	if (array != NULL && array->init() != DB_SUCCESS) {
5552 
5553 		UT_DELETE(array);
5554 
5555 		array = NULL;
5556 	}
5557 
5558 	return(array);
5559 }
5560 
5561 /** AIO destructor */
~AIO()5562 AIO::~AIO()
5563 {
5564 	mutex_destroy(&m_mutex);
5565 
5566 	os_event_destroy(m_not_full);
5567 	os_event_destroy(m_is_empty);
5568 
5569 #if defined(LINUX_NATIVE_AIO)
5570 	if (srv_use_native_aio) {
5571 		for (ulint i = 0; i < m_aio_ctx.size(); i++) {
5572 			int ret = io_destroy(m_aio_ctx[i]);
5573 			ut_a(ret != -EINVAL);
5574 		}
5575 	}
5576 #endif /* LINUX_NATIVE_AIO */
5577 #if defined(WIN_ASYNC_IO)
5578 	CloseHandle(m_completion_port);
5579 #endif
5580 }
5581 
5582 /** Initializes the asynchronous io system. Creates one array each for ibuf
5583 and log i/o. Also creates one array each for read and write where each
5584 array is divided logically into n_readers and n_writers
5585 respectively. The caller must create an i/o handler thread for each
5586 segment in these arrays. This function also creates the sync array.
5587 No i/o handler thread needs to be created for that
5588 @param[in]	n_per_seg	maximum number of pending aio
5589 				operations allowed per segment
5590 @param[in]	n_readers	number of reader threads
5591 @param[in]	n_writers	number of writer threads
5592 @param[in]	n_slots_sync	number of slots in the sync aio array
5593 @return true if the AIO sub-system was started successfully */
5594 bool
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)5595 AIO::start(
5596 	ulint		n_per_seg,
5597 	ulint		n_readers,
5598 	ulint		n_writers,
5599 	ulint		n_slots_sync)
5600 {
5601 #if defined(LINUX_NATIVE_AIO)
5602 	/* Check if native aio is supported on this system and tmpfs */
5603 	if (srv_use_native_aio && !is_linux_native_aio_supported()) {
5604 
5605 		ib::warn() << "Linux Native AIO disabled.";
5606 
5607 		srv_use_native_aio = FALSE;
5608 	}
5609 #endif /* LINUX_NATIVE_AIO */
5610 
5611 	srv_reset_io_thread_op_info();
5612 
5613 	s_reads = create(
5614 		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
5615 
5616 	if (s_reads == NULL) {
5617 		return(false);
5618 	}
5619 
5620 	ulint	start = srv_read_only_mode ? 0 : 2;
5621 	ulint	n_segs = n_readers + start;
5622 
5623 	/* 0 is the ibuf segment and 1 is the redo log segment. */
5624 	for (ulint i = start; i < n_segs; ++i) {
5625 		ut_a(i < SRV_MAX_N_IO_THREADS);
5626 		srv_io_thread_function[i] = "read thread";
5627 	}
5628 
5629 	ulint	n_segments = n_readers;
5630 
5631 	if (!srv_read_only_mode) {
5632 
5633 		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
5634 
5635 		if (s_ibuf == NULL) {
5636 			return(false);
5637 		}
5638 
5639 		++n_segments;
5640 
5641 		srv_io_thread_function[0] = "insert buffer thread";
5642 
5643 		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
5644 
5645 		if (s_log == NULL) {
5646 			return(false);
5647 		}
5648 
5649 		++n_segments;
5650 
5651 		srv_io_thread_function[1] = "log thread";
5652 
5653 	} else {
5654 		s_ibuf = s_log = NULL;
5655 	}
5656 
5657 	s_writes = create(
5658 		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
5659 
5660 	if (s_writes == NULL) {
5661 		return(false);
5662 	}
5663 
5664 #ifdef WIN_ASYNC_IO
5665 	data_completion_port = s_writes->m_completion_port;
5666 	log_completion_port =
5667 		s_log ? s_log->m_completion_port : data_completion_port;
5668 #endif
5669 
5670 	n_segments += n_writers;
5671 
5672 	for (ulint i = start + n_readers; i < n_segments; ++i) {
5673 		ut_a(i < SRV_MAX_N_IO_THREADS);
5674 		srv_io_thread_function[i] = "write thread";
5675 	}
5676 
5677 	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
5678 
5679 	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
5680 
5681 	if (s_sync == NULL) {
5682 
5683 		return(false);
5684 	}
5685 
5686 	os_aio_n_segments = n_segments;
5687 
5688 	os_aio_validate();
5689 
5690 	os_last_printout = time(NULL);
5691 
5692 	if (srv_use_native_aio) {
5693 		return(true);
5694 	}
5695 
5696 	os_aio_segment_wait_events = static_cast<os_event_t*>(
5697 		ut_zalloc_nokey(
5698 			n_segments * sizeof *os_aio_segment_wait_events));
5699 
5700 	if (os_aio_segment_wait_events == NULL) {
5701 
5702 		return(false);
5703 	}
5704 
5705 	for (ulint i = 0; i < n_segments; ++i) {
5706 		os_aio_segment_wait_events[i] = os_event_create(0);
5707 	}
5708 
5709 	return(true);
5710 }
5711 
5712 /** Free the AIO arrays */
5713 void
shutdown()5714 AIO::shutdown()
5715 {
5716 	UT_DELETE(s_ibuf);
5717 	s_ibuf = NULL;
5718 
5719 	UT_DELETE(s_log);
5720 	s_log = NULL;
5721 
5722 	UT_DELETE(s_writes);
5723 	s_writes = NULL;
5724 
5725 	UT_DELETE(s_sync);
5726 	s_sync = NULL;
5727 
5728 	UT_DELETE(s_reads);
5729 	s_reads = NULL;
5730 }
5731 
5732 /** Initializes the asynchronous io system. Creates one array each for ibuf
5733 and log i/o. Also creates one array each for read and write where each
5734 array is divided logically into n_readers and n_writers
5735 respectively. The caller must create an i/o handler thread for each
5736 segment in these arrays. This function also creates the sync array.
5737 No i/o handler thread needs to be created for that
5738 @param[in]	n_readers	number of reader threads
5739 @param[in]	n_writers	number of writer threads
5740 @param[in]	n_slots_sync	number of slots in the sync aio array */
5741 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)5742 os_aio_init(
5743 	ulint		n_readers,
5744 	ulint		n_writers,
5745 	ulint		n_slots_sync)
5746 {
5747 	/* Maximum number of pending aio operations allowed per segment */
5748 	ulint		limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
5749 
5750 	return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
5751 }
5752 
5753 /** Frees the asynchronous io system. */
5754 void
os_aio_free()5755 os_aio_free()
5756 {
5757 	AIO::shutdown();
5758 
5759 	ut_ad(!os_aio_segment_wait_events || !srv_use_native_aio);
5760 	ut_ad(srv_use_native_aio || os_aio_segment_wait_events
5761 	      || !srv_was_started);
5762 
5763 	if (!srv_use_native_aio && os_aio_segment_wait_events) {
5764 		for (ulint i = 0; i < os_aio_n_segments; i++) {
5765 			os_event_destroy(os_aio_segment_wait_events[i]);
5766 		}
5767 
5768 		ut_free(os_aio_segment_wait_events);
5769 		os_aio_segment_wait_events = 0;
5770 	}
5771 	os_aio_n_segments = 0;
5772 }
5773 
5774 /** Wakes up all async i/o threads so that they know to exit themselves in
5775 shutdown. */
5776 void
os_aio_wake_all_threads_at_shutdown()5777 os_aio_wake_all_threads_at_shutdown()
5778 {
5779 #ifdef WIN_ASYNC_IO
5780 	AIO::wake_at_shutdown();
5781 #elif defined(LINUX_NATIVE_AIO)
5782 	/* When using native AIO interface the io helper threads
5783 	wait on io_getevents with a timeout value of 500ms. At
5784 	each wake up these threads check the server status.
5785 	No need to do anything to wake them up. */
5786 #endif /* !WIN_ASYNC_AIO */
5787 
5788 	if (srv_use_native_aio) {
5789 		return;
5790 	}
5791 
5792 	/* This loop wakes up all simulated ai/o threads */
5793 
5794 	for (ulint i = 0; i < os_aio_n_segments; ++i) {
5795 
5796 		os_event_set(os_aio_segment_wait_events[i]);
5797 	}
5798 }
5799 
5800 /** Waits until there are no pending writes in AIO::s_writes. There can
5801 be other, synchronous, pending writes. */
5802 void
os_aio_wait_until_no_pending_writes()5803 os_aio_wait_until_no_pending_writes()
5804 {
5805 	AIO::wait_until_no_pending_writes();
5806 }
5807 
5808 /** Calculates segment number for a slot.
5809 @param[in]	array		AIO wait array
5810 @param[in]	slot		slot in this array
5811 @return segment number (which is the number used by, for example,
5812 	I/O-handler threads) */
5813 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)5814 AIO::get_segment_no_from_slot(
5815 	const AIO*	array,
5816 	const Slot*	slot)
5817 {
5818 	ulint	segment;
5819 	ulint	seg_len;
5820 
5821 	if (array == s_ibuf) {
5822 		ut_ad(!srv_read_only_mode);
5823 
5824 		segment = IO_IBUF_SEGMENT;
5825 
5826 	} else if (array == s_log) {
5827 		ut_ad(!srv_read_only_mode);
5828 
5829 		segment = IO_LOG_SEGMENT;
5830 
5831 	} else if (array == s_reads) {
5832 		seg_len = s_reads->slots_per_segment();
5833 
5834 		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
5835 	} else {
5836 		ut_a(array == s_writes);
5837 
5838 		seg_len = s_writes->slots_per_segment();
5839 
5840 		segment = s_reads->m_n_segments
5841 			+ (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
5842 	}
5843 
5844 	return(segment);
5845 }
5846 
5847 /** Requests for a slot in the aio array. If no slot is available, waits until
5848 not_full-event becomes signaled.
5849 
5850 @param[in]	type		IO context
5851 @param[in,out]	m1		message to be passed along with the AIO
5852 				operation
5853 @param[in,out]	m2		message to be passed along with the AIO
5854 				operation
5855 @param[in]	file		file handle
5856 @param[in]	name		name of the file or path as a NUL-terminated
5857 				string
5858 @param[in,out]	buf		buffer where to read or from which to write
5859 @param[in]	offset		file offset, where to read from or start writing
5860 @param[in]	len		length of the block to read or write
5861 @return pointer to slot */
5862 Slot*
reserve_slot(const IORequest & type,fil_node_t * m1,void * m2,pfs_os_file_t file,const char * name,void * buf,os_offset_t offset,ulint len)5863 AIO::reserve_slot(
5864 	const IORequest&	type,
5865 	fil_node_t*		m1,
5866 	void*			m2,
5867 	pfs_os_file_t		file,
5868 	const char*		name,
5869 	void*			buf,
5870 	os_offset_t		offset,
5871 	ulint			len)
5872 {
5873 	ut_ad(reinterpret_cast<size_t>(buf) % OS_FILE_LOG_BLOCK_SIZE == 0);
5874 	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
5875 	ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
5876 
5877 #ifdef WIN_ASYNC_IO
5878 	ut_a((len & 0xFFFFFFFFUL) == len);
5879 #endif /* WIN_ASYNC_IO */
5880 
5881 	/* No need of a mutex. Only reading constant fields */
5882 	ulint		slots_per_seg;
5883 
5884 	ut_ad(type.validate());
5885 
5886 	slots_per_seg = slots_per_segment();
5887 
5888 	/* We attempt to keep adjacent blocks in the same local
5889 	segment. This can help in merging IO requests when we are
5890 	doing simulated AIO */
5891 	ulint		local_seg;
5892 
5893 	local_seg = (offset >> (srv_page_size_shift + 6)) % m_n_segments;
5894 
5895 	for (;;) {
5896 
5897 		acquire();
5898 
5899 		if (m_n_reserved != m_slots.size()) {
5900 			break;
5901 		}
5902 
5903 		release();
5904 
5905 		if (!srv_use_native_aio) {
5906 			/* If the handler threads are suspended,
5907 			wake them so that we get more slots */
5908 
5909 			os_aio_simulated_wake_handler_threads();
5910 		}
5911 
5912 		os_event_wait(m_not_full);
5913 	}
5914 
5915 	ulint	counter = 0;
5916 	Slot*	slot = NULL;
5917 
5918 	/* We start our search for an available slot from our preferred
5919 	local segment and do a full scan of the array. We are
5920 	guaranteed to find a slot in full scan. */
5921 	for (ulint i = local_seg * slots_per_seg;
5922 	     counter < m_slots.size();
5923 	     ++i, ++counter) {
5924 
5925 		i %= m_slots.size();
5926 
5927 		slot = at(i);
5928 
5929 		if (slot->is_reserved == false) {
5930 			break;
5931 		}
5932 	}
5933 
5934 	/* We MUST always be able to get hold of a reserved slot. */
5935 	ut_a(counter < m_slots.size());
5936 
5937 	ut_a(slot->is_reserved == false);
5938 
5939 	++m_n_reserved;
5940 
5941 	if (m_n_reserved == 1) {
5942 		os_event_reset(m_is_empty);
5943 	}
5944 
5945 	if (m_n_reserved == m_slots.size()) {
5946 		os_event_reset(m_not_full);
5947 	}
5948 
5949 	slot->is_reserved = true;
5950 	slot->reservation_time = time(NULL);
5951 	slot->m1       = m1;
5952 	slot->m2       = m2;
5953 	slot->file     = file;
5954 	slot->name     = name;
5955 #ifdef _WIN32
5956 	slot->len      = static_cast<DWORD>(len);
5957 #else
5958 	slot->len      = len;
5959 #endif /* _WIN32 */
5960 	slot->type     = type;
5961 	slot->buf      = static_cast<byte*>(buf);
5962 	slot->ptr      = slot->buf;
5963 	slot->offset   = offset;
5964 	slot->err      = DB_SUCCESS;
5965 	slot->original_len = static_cast<uint32>(len);
5966 	slot->io_already_done = false;
5967 	slot->buf      = static_cast<byte*>(buf);
5968 
5969 #ifdef WIN_ASYNC_IO
5970 	{
5971 		OVERLAPPED*	control;
5972 
5973 		control = &slot->control;
5974 		control->Offset = (DWORD) offset & 0xFFFFFFFF;
5975 		control->OffsetHigh = (DWORD) (offset >> 32);
5976 	}
5977 #elif defined(LINUX_NATIVE_AIO)
5978 
5979 	/* If we are not using native AIO skip this part. */
5980 	if (srv_use_native_aio) {
5981 
5982 		off_t		aio_offset;
5983 
5984 		/* Check if we are dealing with 64 bit arch.
5985 		If not then make sure that offset fits in 32 bits. */
5986 		aio_offset = (off_t) offset;
5987 
5988 		ut_a(sizeof(aio_offset) >= sizeof(offset)
5989 		     || ((os_offset_t) aio_offset) == offset);
5990 
5991 		struct iocb*	iocb = &slot->control;
5992 
5993 		if (type.is_read()) {
5994 
5995 			io_prep_pread(
5996 				iocb, file, slot->ptr, slot->len, aio_offset);
5997 		} else {
5998 			ut_ad(type.is_write());
5999 
6000 			io_prep_pwrite(
6001 				iocb, file, slot->ptr, slot->len, aio_offset);
6002 		}
6003 
6004 		iocb->data = slot;
6005 
6006 		slot->n_bytes = 0;
6007 		slot->ret = 0;
6008 	}
6009 #endif /* LINUX_NATIVE_AIO */
6010 
6011 	release();
6012 
6013 	return(slot);
6014 }
6015 
6016 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
6017 @param[in]	global_segment	The number of the segment in the AIO arrays */
6018 void
wake_simulated_handler_thread(ulint global_segment)6019 AIO::wake_simulated_handler_thread(ulint global_segment)
6020 {
6021 	ut_ad(!srv_use_native_aio);
6022 
6023 	AIO*	array;
6024 	ulint	segment = get_array_and_local_segment(&array, global_segment);
6025 
6026 	array->wake_simulated_handler_thread(global_segment, segment);
6027 }
6028 
6029 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
6030 for a local segment in the AIO array.
6031 @param[in]	global_segment	The number of the segment in the AIO arrays
6032 @param[in]	segment		The local segment in the AIO array */
6033 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)6034 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
6035 {
6036 	ut_ad(!srv_use_native_aio);
6037 
6038 	ulint	n = slots_per_segment();
6039 	ulint	offset = segment * n;
6040 
6041 	/* Look through n slots after the segment * n'th slot */
6042 
6043 	acquire();
6044 
6045 	const Slot*	slot = at(offset);
6046 
6047 	for (ulint i = 0; i < n; ++i, ++slot) {
6048 
6049 		if (slot->is_reserved) {
6050 
6051 			/* Found an i/o request */
6052 
6053 			release();
6054 
6055 			os_event_t	event;
6056 
6057 			event = os_aio_segment_wait_events[global_segment];
6058 
6059 			os_event_set(event);
6060 
6061 			return;
6062 		}
6063 	}
6064 
6065 	release();
6066 }
6067 
6068 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
6069 void
os_aio_simulated_wake_handler_threads()6070 os_aio_simulated_wake_handler_threads()
6071 {
6072 	if (srv_use_native_aio) {
6073 		/* We do not use simulated aio: do nothing */
6074 
6075 		return;
6076 	}
6077 
6078 	os_aio_recommend_sleep_for_read_threads	= false;
6079 
6080 	for (ulint i = 0; i < os_aio_n_segments; i++) {
6081 		AIO::wake_simulated_handler_thread(i);
6082 	}
6083 }
6084 
6085 /** Select the IO slot array
6086 @param[in,out]	type		Type of IO, READ or WRITE
6087 @param[in]	read_only	true if running in read-only mode
6088 @param[in]	mode		IO mode
6089 @return slot array or NULL if invalid mode specified */
6090 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)6091 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
6092 {
6093 	AIO*	array;
6094 
6095 	ut_ad(type.validate());
6096 
6097 	switch (mode) {
6098 	case OS_AIO_NORMAL:
6099 
6100 		array = type.is_read() ? AIO::s_reads : AIO::s_writes;
6101 		break;
6102 
6103 	case OS_AIO_IBUF:
6104 		ut_ad(type.is_read());
6105 
6106 		/* Reduce probability of deadlock bugs in connection with ibuf:
6107 		do not let the ibuf i/o handler sleep */
6108 
6109 		type.clear_do_not_wake();
6110 
6111 		array = read_only ? AIO::s_reads : AIO::s_ibuf;
6112 		break;
6113 
6114 	case OS_AIO_LOG:
6115 
6116 		array = read_only ? AIO::s_reads : AIO::s_log;
6117 		break;
6118 
6119 	case OS_AIO_SYNC:
6120 
6121 		array = AIO::s_sync;
6122 #if defined(LINUX_NATIVE_AIO)
6123 		/* In Linux native AIO we don't use sync IO array. */
6124 		ut_a(!srv_use_native_aio);
6125 #endif /* LINUX_NATIVE_AIO */
6126 		break;
6127 
6128 	default:
6129 		ut_error;
6130 		array = NULL; /* Eliminate compiler warning */
6131 	}
6132 
6133 	return(array);
6134 }
6135 
6136 #ifdef WIN_ASYNC_IO
6137 /** This function is only used in Windows asynchronous i/o.
6138 Waits for an aio operation to complete. This function is used to wait the
6139 for completed requests. The aio array of pending requests is divided
6140 into segments. The thread specifies which segment or slot it wants to wait
6141 for. NOTE: this function will also take care of freeing the aio slot,
6142 therefore no other thread is allowed to do the freeing!
6143 @param[in]	segment		The number of the segment in the aio arrays to
6144 				wait for; segment 0 is the ibuf I/O thread,
6145 				segment 1 the log I/O thread, then follow the
6146 				non-ibuf read threads, and as the last are the
6147 				non-ibuf write threads; if this is
6148 				ULINT_UNDEFINED, then it means that sync AIO
6149 				is used, and this parameter is ignored
6150 @param[in]	pos		this parameter is used only in sync AIO:
6151 				wait for the aio slot at this position
6152 @param[out]	m1		the messages passed with the AIO request; note
6153 				that also in the case where the AIO operation
6154 				failed, these output parameters are valid and
6155 				can be used to restart the operation,
6156 				for example
6157 @param[out]	m2		callback message
6158 @param[out]	type		OS_FILE_WRITE or ..._READ
6159 @return DB_SUCCESS or error code */
6160 
6161 
6162 
6163 static
6164 dberr_t
os_aio_windows_handler(ulint segment,ulint pos,fil_node_t ** m1,void ** m2,IORequest * type)6165 os_aio_windows_handler(
6166 	ulint		segment,
6167 	ulint		pos,
6168 	fil_node_t**	m1,
6169 	void**		m2,
6170 	IORequest*	type)
6171 {
6172 	Slot*		slot= 0;
6173 	dberr_t		err;
6174 
6175 	BOOL		ret;
6176 	ULONG_PTR	key;
6177 
6178 	ut_a(segment != ULINT_UNDEFINED);
6179 
6180 	/* NOTE! We only access constant fields in os_aio_array. Therefore
6181 	we do not have to acquire the protecting mutex yet */
6182 
6183 	ut_ad(os_aio_validate_skip());
6184 	AIO *my_array;
6185 	AIO::get_array_and_local_segment(&my_array, segment);
6186 
6187 	HANDLE port = my_array->m_completion_port;
6188 	ut_ad(port);
6189 	for (;;) {
6190 		DWORD len;
6191 		ret = GetQueuedCompletionStatus(port, &len, &key,
6192 		(OVERLAPPED **)&slot, INFINITE);
6193 
6194 		/* If shutdown key was received, repost the shutdown message and exit */
6195 		if (ret && key == IOCP_SHUTDOWN_KEY) {
6196 			PostQueuedCompletionStatus(port, 0, key, NULL);
6197 			*m1 = NULL;
6198 			*m2 = NULL;
6199 			return (DB_SUCCESS);
6200 		}
6201 
6202 		ut_a(slot);
6203 
6204 		if (!ret) {
6205 			/* IO failed */
6206 			break;
6207 		}
6208 
6209 		slot->n_bytes= len;
6210 		ut_a(slot->array);
6211 		HANDLE slot_port = slot->array->m_completion_port;
6212 		if (slot_port != port) {
6213 			/* there are no redirections between data and log */
6214 			ut_ad(port == data_completion_port);
6215 			ut_ad(slot_port != log_completion_port);
6216 
6217 			/*
6218 			Redirect completions  to the dedicated completion port
6219 			and threads.
6220 
6221 			"Write array" threads receive write,read and ibuf
6222 			notifications, read and ibuf completions are redirected.
6223 
6224 			Forwarding IO completion this way costs a context switch,
6225 			and this seems tolerable  since asynchronous reads are by
6226 			far less frequent.
6227 			*/
6228 			ut_a(PostQueuedCompletionStatus(slot_port,
6229 				len, key, &slot->control));
6230 		}
6231 		else {
6232 			break;
6233 		}
6234 	}
6235 
6236 	ut_a(slot->is_reserved);
6237 
6238 	*m1 = slot->m1;
6239 	*m2 = slot->m2;
6240 
6241 	*type = slot->type;
6242 
6243 	bool retry = false;
6244 
6245 	if (ret && slot->n_bytes == slot->len) {
6246 
6247 		err = DB_SUCCESS;
6248 
6249 	} else if (os_file_handle_error(slot->name, "Windows aio")) {
6250 
6251 		retry = true;
6252 
6253 	} else {
6254 
6255 		err = DB_IO_ERROR;
6256 	}
6257 
6258 
6259 	if (retry) {
6260 		/* Retry failed read/write operation synchronously. */
6261 
6262 #ifdef UNIV_PFS_IO
6263 		/* This read/write does not go through os_file_read
6264 		and os_file_write APIs, need to register with
6265 		performance schema explicitly here. */
6266 		PSI_file_locker_state	state;
6267 		struct PSI_file_locker* locker = NULL;
6268 
6269 		register_pfs_file_io_begin(
6270 			&state, locker, slot->file, slot->len,
6271 			slot->type.is_write()
6272 			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
6273 #endif /* UNIV_PFS_IO */
6274 
6275 		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
6276 
6277 		ssize_t	n_bytes = SyncFileIO::execute(slot);
6278 
6279 #ifdef UNIV_PFS_IO
6280 		register_pfs_file_io_end(locker, slot->len);
6281 #endif /* UNIV_PFS_IO */
6282 
6283 		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
6284 	}
6285 
6286 	if (err == DB_SUCCESS) {
6287 		err = AIOHandler::post_io_processing(slot);
6288 	}
6289 
6290 	slot->array->release_with_mutex(slot);
6291 
6292 	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
6293 		&& !buf_page_cleaner_is_active
6294 		&& os_aio_all_slots_free()) {
6295 			/* Last IO, wakeup other io  threads */
6296 			AIO::wake_at_shutdown();
6297 	}
6298 	return(err);
6299 }
6300 #endif /* WIN_ASYNC_IO */
6301 
/**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.

If mode is OS_AIO_SYNC, the request is executed right away with
os_file_read_func() or os_file_write_func() and their result is returned.
Otherwise a slot is reserved in the array chosen by
AIO::select_slot_array(). When srv_use_native_aio is set the request is
then handed to the platform AIO interface; when it is not set the slot is
only reserved and, if type.is_wake() holds, the simulated handler thread
serving the slot's segment is woken up.
@param[in,out]	type		IO request context
@param[in]	mode		IO mode
@param[in]	name		Name of the file or path as NUL terminated
				string
@param[in]	file		Open file handle
@param[in,out]	buf		buffer to read into, or to write from
@param[in]	offset		file offset where to read or write
@param[in]	n		number of bytes to read or write
@param[in]	read_only	if true read only mode checks are enforced
@param[in,out]	m1		Message for the AIO handler, (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in,out]	m2		message for the AIO handler (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC

@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
	IORequest&	type,
	ulint		mode,
	const char*	name,
	pfs_os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	bool		read_only,
	fil_node_t*	m1,
	void*		m2)
{
#ifdef WIN_ASYNC_IO
	/* Stays TRUE when no ReadFile()/WriteFile() call is issued below,
	so that the request is then reported as queued. */
	BOOL		ret = TRUE;
#endif /* WIN_ASYNC_IO */

	/* The length and the offset are asserted to be non-zero
	respectively aligned to OS_FILE_LOG_BLOCK_SIZE. */
	ut_ad(n > 0);
	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad(os_aio_validate_skip());

#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits on this code path. */
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */

	/* Debug injection point: force the synchronous path. */
	DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
			mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);

	if (mode == OS_AIO_SYNC) {
		/* Synchronous requests never reserve a slot; m1 and m2
		are not used. */
		if (type.is_read()) {
			return(os_file_read_func(type, file, buf, offset, n));
		}

		ut_ad(type.is_write());

		return(os_file_write_func(type, name, file, buf, offset, n));
	}

try_again:
	/* Re-entered from err_exit when os_file_handle_error() asks for
	a retry; a fresh slot is reserved on every attempt. */

	AIO*	array;

	array = AIO::select_slot_array(type, read_only, mode);

	Slot*	slot;

	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);

	if (type.is_read()) {


		if (srv_use_native_aio) {

			/* Statistics are updated before the request is
			dispatched. */
			++os_n_file_reads;

			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(
				file, slot->ptr, slot->len,
				NULL, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else if (type.is_wake()) {
			/* Simulated AIO: wake up the handler thread that
			serves the segment of the reserved slot. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else if (type.is_write()) {

		if (srv_use_native_aio) {
			++os_n_file_writes;

#ifdef WIN_ASYNC_IO
			ret = WriteFile(
				file, slot->ptr, slot->len,
				NULL, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */

		} else if (type.is_wake()) {
			/* Simulated AIO: wake up the handler thread that
			serves the segment of the reserved slot. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else {
		/* The request is neither a read nor a write. */
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
		/* aio completed or was queued successfully! */
		return(DB_SUCCESS);
	}

	goto err_exit;

#endif /* WIN_ASYNC_IO */

	/* AIO request was queued successfully! */
	return(DB_SUCCESS);

#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */

	/* Dispatching failed: give the slot back before deciding
	whether to retry. */
	array->release_with_mutex(slot);

	if (os_file_handle_error(
		name, type.is_read() ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(DB_IO_ERROR);
}
6442 
6443 /** Simulated AIO handler for reaping IO requests */
6444 class SimulatedAIOHandler {
6445 
6446 public:
6447 
6448 	/** Constructor
6449 	@param[in,out]	array	The AIO array
6450 	@param[in]	segment	Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)6451 	SimulatedAIOHandler(AIO* array, ulint segment)
6452 		:
6453 		m_oldest(),
6454 		m_n_elems(),
6455 		m_lowest_offset(IB_UINT64_MAX),
6456 		m_array(array),
6457 		m_n_slots(),
6458 		m_segment(segment),
6459 		m_ptr(),
6460 		m_buf()
6461 	{
6462 		ut_ad(m_segment < 100);
6463 
6464 		m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
6465 	}
6466 
6467 	/** Destructor */
~SimulatedAIOHandler()6468 	~SimulatedAIOHandler()
6469 	{
6470 		if (m_ptr != NULL) {
6471 			ut_free(m_ptr);
6472 		}
6473 	}
6474 
6475 	/** Reset the state of the handler
6476 	@param[in]	n_slots	Number of pending AIO operations supported */
init(ulint n_slots)6477 	void init(ulint n_slots)
6478 	{
6479 		m_oldest = 0;
6480 		m_n_elems = 0;
6481 		m_n_slots = n_slots;
6482 		m_lowest_offset = IB_UINT64_MAX;
6483 
6484 		if (m_ptr != NULL) {
6485 			ut_free(m_ptr);
6486 			m_ptr = m_buf = NULL;
6487 		}
6488 
6489 		m_slots[0] = NULL;
6490 	}
6491 
6492 	/** Check if there is a slot for which the i/o has already been done
6493 	@param[out]	n_reserved	Number of reserved slots
6494 	@return the first completed slot that is found. */
check_completed(ulint * n_reserved)6495 	Slot* check_completed(ulint* n_reserved)
6496 	{
6497 		ulint	offset = m_segment * m_n_slots;
6498 
6499 		*n_reserved = 0;
6500 
6501 		Slot*	slot;
6502 
6503 		slot = m_array->at(offset);
6504 
6505 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6506 
6507 			if (slot->is_reserved) {
6508 
6509 				if (slot->io_already_done) {
6510 
6511 					ut_a(slot->is_reserved);
6512 
6513 					return(slot);
6514 				}
6515 
6516 				++*n_reserved;
6517 			}
6518 		}
6519 
6520 		return(NULL);
6521 	}
6522 
6523 	/** If there are at least 2 seconds old requests, then pick the
6524 	oldest one to prevent starvation.  If several requests have the
6525 	same age, then pick the one at the lowest offset.
6526 	@return true if request was selected */
select()6527 	bool select()
6528 	{
6529 		if (!select_oldest()) {
6530 
6531 			return(select_lowest_offset());
6532 		}
6533 
6534 		return(true);
6535 	}
6536 
6537 	/** Check if there are several consecutive blocks
6538 	to read or write. Merge them if found. */
merge()6539 	void merge()
6540 	{
6541 		/* if m_n_elems != 0, then we have assigned
6542 		something valid to consecutive_ios[0] */
6543 		ut_ad(m_n_elems != 0);
6544 		ut_ad(first_slot() != NULL);
6545 
6546 		Slot*	slot = first_slot();
6547 
6548 		while (!merge_adjacent(slot)) {
6549 			/* No op */
6550 		}
6551 	}
6552 
6553 	/** We have now collected n_consecutive I/O requests
6554 	in the array; allocate a single buffer which can hold
6555 	all data, and perform the I/O
6556 	@return the length of the buffer */
allocate_buffer()6557 	ulint allocate_buffer()
6558 		MY_ATTRIBUTE((warn_unused_result))
6559 	{
6560 		ulint	len;
6561 		Slot*	slot = first_slot();
6562 
6563 		ut_ad(m_ptr == NULL);
6564 
6565 		if (slot->type.is_read() && m_n_elems > 1) {
6566 
6567 			len = 0;
6568 
6569 			for (ulint i = 0; i < m_n_elems; ++i) {
6570 				len += m_slots[i]->len;
6571 			}
6572 
6573 			m_ptr = static_cast<byte*>(
6574 				ut_malloc_nokey(len + srv_page_size));
6575 
6576 			m_buf = static_cast<byte*>(
6577 				ut_align(m_ptr, srv_page_size));
6578 
6579 		} else {
6580 			len = first_slot()->len;
6581 			m_buf = first_slot()->buf;
6582 		}
6583 
6584 		return(len);
6585 	}
6586 
6587 	/** We have to compress the individual pages and punch
6588 	holes in them on a page by page basis when writing to
6589 	tables that can be compresed at the IO level.
6590 	@param[in]	len		Value returned by allocate_buffer */
copy_to_buffer(ulint len)6591 	void copy_to_buffer(ulint len)
6592 	{
6593 		Slot*	slot = first_slot();
6594 
6595 		if (len > slot->len && slot->type.is_write()) {
6596 
6597 			byte*	ptr = m_buf;
6598 
6599 			ut_ad(ptr != slot->buf);
6600 
6601 			/* Copy the buffers to the combined buffer */
6602 			for (ulint i = 0; i < m_n_elems; ++i) {
6603 
6604 				slot = m_slots[i];
6605 
6606 				memmove(ptr, slot->buf, slot->len);
6607 
6608 				ptr += slot->len;
6609 			}
6610 		}
6611 	}
6612 
6613 	/** Do the I/O with ordinary, synchronous i/o functions:
6614 	@param[in]	len		Length of buffer for IO */
io()6615 	void io()
6616 	{
6617 		if (first_slot()->type.is_write()) {
6618 
6619 			for (ulint i = 0; i < m_n_elems; ++i) {
6620 				write(m_slots[i]);
6621 			}
6622 
6623 		} else {
6624 
6625 			for (ulint i = 0; i < m_n_elems; ++i) {
6626 				read(m_slots[i]);
6627 			}
6628 		}
6629 	}
6630 
6631 	/** Mark the i/os done in slots */
done()6632 	void done()
6633 	{
6634 		for (ulint i = 0; i < m_n_elems; ++i) {
6635 			m_slots[i]->io_already_done = true;
6636 		}
6637 	}
6638 
6639 	/** @return the first slot in the consecutive array */
first_slot()6640 	Slot* first_slot()
6641 		MY_ATTRIBUTE((warn_unused_result))
6642 	{
6643 		ut_a(m_n_elems > 0);
6644 
6645 		return(m_slots[0]);
6646 	}
6647 
6648 	/** Wait for I/O requests
6649 	@param[in]	global_segment	The global segment
6650 	@param[in,out]	event		Wait on event if no active requests
6651 	@return the number of slots */
6652 	ulint check_pending(
6653 		ulint		global_segment,
6654 		os_event_t	event)
6655 		MY_ATTRIBUTE((warn_unused_result));
6656 private:
6657 
6658 	/** Do the file read
6659 	@param[in,out]	slot		Slot that has the IO context */
read(Slot * slot)6660 	void read(Slot* slot)
6661 	{
6662 		dberr_t	err = os_file_read(
6663 			slot->type,
6664 			slot->file,
6665 			slot->ptr,
6666 			slot->offset,
6667 			slot->len);
6668 
6669 		ut_a(err == DB_SUCCESS);
6670 	}
6671 
6672 	/** Do the file read
6673 	@param[in,out]	slot		Slot that has the IO context */
write(Slot * slot)6674 	void write(Slot* slot)
6675 	{
6676 		dberr_t	err = os_file_write(
6677 			slot->type,
6678 			slot->name,
6679 			slot->file,
6680 			slot->ptr,
6681 			slot->offset,
6682 			slot->len);
6683 
6684 		ut_a(err == DB_SUCCESS);
6685 	}
6686 
6687 	/** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const6688 	bool adjacent(const Slot* s1, const Slot* s2) const
6689 	{
6690 		return(s1 != s2
6691 		       && s1->file == s2->file
6692 		       && s2->offset == s1->offset + s1->len
6693 		       && s1->type == s2->type);
6694 	}
6695 
6696 	/** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)6697 	bool merge_adjacent(Slot*& current)
6698 	{
6699 		Slot*	slot;
6700 		ulint	offset = m_segment * m_n_slots;
6701 
6702 		slot = m_array->at(offset);
6703 
6704 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6705 
6706 			if (slot->is_reserved && adjacent(current, slot)) {
6707 
6708 				current = slot;
6709 
6710 				/* Found a consecutive i/o request */
6711 
6712 				m_slots[m_n_elems] = slot;
6713 
6714 				++m_n_elems;
6715 
6716 				return(m_n_elems >= m_slots.capacity());
6717 			}
6718 		}
6719 
6720 		return(true);
6721 	}
6722 
6723 	/** There were no old requests. Look for an I/O request at the lowest
6724 	offset in the array (we ignore the high 32 bits of the offset in these
6725 	heuristics) */
select_lowest_offset()6726 	bool select_lowest_offset()
6727 	{
6728 		ut_ad(m_n_elems == 0);
6729 
6730 		ulint	offset = m_segment * m_n_slots;
6731 
6732 		m_lowest_offset = IB_UINT64_MAX;
6733 
6734 		for (ulint i = 0; i < m_n_slots; ++i) {
6735 			Slot*	slot;
6736 
6737 			slot = m_array->at(i + offset);
6738 
6739 			if (slot->is_reserved
6740 			    && slot->offset < m_lowest_offset) {
6741 
6742 				/* Found an i/o request */
6743 				m_slots[0] = slot;
6744 
6745 				m_n_elems = 1;
6746 
6747 				m_lowest_offset = slot->offset;
6748 			}
6749 		}
6750 
6751 		return(m_n_elems > 0);
6752 	}
6753 
6754 	/** Select the slot if it is older than the current oldest slot.
6755 	@param[in]	slot		The slot to check */
select_if_older(Slot * slot)6756 	void select_if_older(Slot* slot)
6757 	{
6758 		ulint	age;
6759 
6760 		age = (ulint) difftime(time(NULL), slot->reservation_time);
6761 
6762 		if ((age >= 2 && age > m_oldest)
6763 		    || (age >= 2
6764 			&& age == m_oldest
6765 			&& slot->offset < m_lowest_offset)) {
6766 
6767 			/* Found an i/o request */
6768 			m_slots[0] = slot;
6769 
6770 			m_n_elems = 1;
6771 
6772 			m_oldest = age;
6773 
6774 			m_lowest_offset = slot->offset;
6775 		}
6776 	}
6777 
6778 	/** Select th oldest slot in the array
6779 	@return true if oldest slot found */
select_oldest()6780 	bool select_oldest()
6781 	{
6782 		ut_ad(m_n_elems == 0);
6783 
6784 		Slot*	slot;
6785 		ulint	offset = m_n_slots * m_segment;
6786 
6787 		slot = m_array->at(offset);
6788 
6789 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6790 
6791 			if (slot->is_reserved) {
6792 				select_if_older(slot);
6793 			}
6794 		}
6795 
6796 		return(m_n_elems > 0);
6797 	}
6798 
6799 	typedef std::vector<Slot*> slots_t;
6800 
6801 private:
6802 	ulint		m_oldest;
6803 	ulint		m_n_elems;
6804 	os_offset_t	m_lowest_offset;
6805 
6806 	AIO*		m_array;
6807 	ulint		m_n_slots;
6808 	ulint		m_segment;
6809 
6810 	slots_t		m_slots;
6811 
6812 	byte*		m_ptr;
6813 	byte*		m_buf;
6814 };
6815 
6816 /** Wait for I/O requests
6817 @return the number of slots */
6818 ulint
check_pending(ulint global_segment,os_event_t event)6819 SimulatedAIOHandler::check_pending(
6820 	ulint		global_segment,
6821 	os_event_t	event)
6822 {
6823 	/* NOTE! We only access constant fields in os_aio_array.
6824 	Therefore we do not have to acquire the protecting mutex yet */
6825 
6826 	ut_ad(os_aio_validate_skip());
6827 
6828 	ut_ad(m_segment < m_array->get_n_segments());
6829 
6830 	/* Look through n slots after the segment * n'th slot */
6831 
6832 	if (AIO::is_read(m_array)
6833 	    && os_aio_recommend_sleep_for_read_threads) {
6834 
6835 		/* Give other threads chance to add several
6836 		I/Os to the array at once. */
6837 
6838 		srv_set_io_thread_op_info(
6839 			global_segment, "waiting for i/o request");
6840 
6841 		os_event_wait(event);
6842 
6843 		return(0);
6844 	}
6845 
6846 	return(m_array->slots_per_segment());
6847 }
6848 
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

Loops until either a request of the segment is found whose I/O was already
done, or pending requests can be selected; in the latter case the selected
requests are executed synchronously while the array mutex is released.
One slot is then reported to the caller and released.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[out]	type	IO context of the reported request; not assigned
			when returning because of shutdown
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		/* A return value of 0 means that check_pending() waited
		on the event; start over. */
		ulint	n_slots = handler.check_pending(global_segment, event);

		if (n_slots == 0) {
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		/* The array mutex is held from here until it is released
		on one of the paths below; both 'break' statements leave
		the loop with the mutex still held. */
		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* The I/O of this request was done by an earlier
			call; it only needs to be reported. */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* A pending request was selected; slot is NULL
			here and the I/O is performed after the loop. */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. The event is reset while the array
		mutex is still held. */

		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* If slot != NULL, a request that has already completed its I/O
	was found. If slot == NULL, requests were selected and their I/O
	has to be performed now. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		array->acquire();

		/* Flag every collected request as done, so that later
		calls find them with check_completed(). */
		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	/* Free the slot first, then give up the array mutex. */
	array->release(slot);

	array->release();

	return(DB_SUCCESS);
}
6999 
7000 /** Get the total number of pending IOs
7001 @return the total number of pending IOs */
7002 ulint
total_pending_io_count()7003 AIO::total_pending_io_count()
7004 {
7005 	ulint	count = s_reads->pending_io_count();
7006 
7007 	if (s_writes != NULL) {
7008 		count += s_writes->pending_io_count();
7009 	}
7010 
7011 	if (s_ibuf != NULL) {
7012 		count += s_ibuf->pending_io_count();
7013 	}
7014 
7015 	if (s_log != NULL) {
7016 		count += s_log->pending_io_count();
7017 	}
7018 
7019 	if (s_sync != NULL) {
7020 		count += s_sync->pending_io_count();
7021 	}
7022 
7023 	return(count);
7024 }
7025 
7026 /** Validates the consistency the aio system.
7027 @return true if ok */
7028 static
7029 bool
os_aio_validate()7030 os_aio_validate()
7031 {
7032 	/* The methods countds and validates, we ignore the count. */
7033 	AIO::total_pending_io_count();
7034 
7035 	return(true);
7036 }
7037 
7038 /** Prints pending IO requests per segment of an aio array.
7039 We probably don't need per segment statistics but they can help us
7040 during development phase to see if the IO requests are being
7041 distributed as expected.
7042 @param[in,out]	file		File where to print
7043 @param[in]	segments	Pending IO array */
7044 void
print_segment_info(FILE * file,const ulint * segments)7045 AIO::print_segment_info(
7046 	FILE*		file,
7047 	const ulint*	segments)
7048 {
7049 	ut_ad(m_n_segments > 0);
7050 
7051 	if (m_n_segments > 1) {
7052 
7053 		fprintf(file, " [");
7054 
7055 		for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
7056 
7057 			if (i != 0) {
7058 				fprintf(file, ", ");
7059 			}
7060 
7061 			fprintf(file, ULINTPF, *segments);
7062 		}
7063 
7064 		fprintf(file, "] ");
7065 	}
7066 }
7067 
7068 /** Prints info about the aio array.
7069 @param[in,out]	file		Where to print */
7070 void
print(FILE * file)7071 AIO::print(FILE* file)
7072 {
7073 	ulint	count = 0;
7074 	ulint	n_res_seg[SRV_MAX_N_IO_THREADS];
7075 
7076 	mutex_enter(&m_mutex);
7077 
7078 	ut_a(!m_slots.empty());
7079 	ut_a(m_n_segments > 0);
7080 
7081 	memset(n_res_seg, 0x0, sizeof(n_res_seg));
7082 
7083 	for (ulint i = 0; i < m_slots.size(); ++i) {
7084 		Slot&	slot = m_slots[i];
7085 		ulint	segment = (i * m_n_segments) / m_slots.size();
7086 
7087 		if (slot.is_reserved) {
7088 
7089 			++count;
7090 
7091 			++n_res_seg[segment];
7092 
7093 			ut_a(slot.len > 0);
7094 		}
7095 	}
7096 
7097 	ut_a(m_n_reserved == count);
7098 
7099 	print_segment_info(file, n_res_seg);
7100 
7101 	mutex_exit(&m_mutex);
7102 }
7103 
7104 /** Print all the AIO segments
7105 @param[in,out]	file		Where to print */
7106 void
print_all(FILE * file)7107 AIO::print_all(FILE* file)
7108 {
7109 	s_reads->print(file);
7110 
7111 	if (s_writes != NULL) {
7112 		fputs(", aio writes:", file);
7113 		s_writes->print(file);
7114 	}
7115 
7116 	if (s_ibuf != NULL) {
7117 		fputs(",\n ibuf aio reads:", file);
7118 		s_ibuf->print(file);
7119 	}
7120 
7121 	if (s_log != NULL) {
7122 		fputs(", log i/o's:", file);
7123 		s_log->print(file);
7124 	}
7125 
7126 	if (s_sync != NULL) {
7127 		fputs(", sync i/o's:", file);
7128 		s_sync->print(file);
7129 	}
7130 }
7131 
/** Prints info of the aio arrays: the state of every i/o thread, the
pending requests of each AIO array, the pending flushes, the totals of
OS file reads, writes and fsyncs, and the per-second rates since the
previous printout. The counters used for the rates are reset at the end.
@param[in,out]	file		file where to print */
void
os_aio_print(FILE*	file)
{
	time_t		current_time;
	double		time_elapsed;
	double		avg_bytes_read;

	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
		fprintf(file, "I/O thread " ULINTPF " state: %s (%s)",
			i,
			srv_io_thread_op_info[i],
			srv_io_thread_function[i]);

#ifndef _WIN32
		/* With simulated AIO, also show whether the wait event
		of the thread is currently set. */
		if (!srv_use_native_aio
		    && os_event_is_set(os_aio_segment_wait_events[i])) {
			fprintf(file, " ev set");
		}
#endif /* _WIN32 */

		fprintf(file, "\n");
	}

	/* This is the heading of the read array, which
	AIO::print_all() prints first. */
	fputs("Pending normal aio reads:", file);

	AIO::print_all(file);

	putc('\n', file);
	current_time = time(NULL);
	/* The small constant keeps the divisor of the rates below
	from being zero. */
	time_elapsed = 0.001 + difftime(current_time, os_last_printout);

	fprintf(file,
		"Pending flushes (fsync) log: " ULINTPF
		"; buffer pool: " ULINTPF "\n"
		ULINTPF " OS file reads, "
		ULINTPF " OS file writes, "
		ULINTPF " OS fsyncs\n",
		fil_n_pending_log_flushes,
		fil_n_pending_tablespace_flushes,
		ulint{os_n_file_reads},
		os_n_file_writes,
		os_n_fsyncs);

	const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
	const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));

	/* This line is omitted when nothing is pending. */
	if (n_reads != 0 || n_writes != 0) {
		fprintf(file,
			ULINTPF " pending reads, " ULINTPF " pending writes\n",
			n_reads, n_writes);
	}

	/* Avoid a division by zero when no read was counted since
	the previous printout. */
	if (os_n_file_reads == os_n_file_reads_old) {
		avg_bytes_read = 0.0;
	} else {
		avg_bytes_read = (double) os_bytes_read_since_printout
			/ (os_n_file_reads - os_n_file_reads_old);
	}

	fprintf(file,
		"%.2f reads/s, " ULINTPF " avg bytes/read,"
		" %.2f writes/s, %.2f fsyncs/s\n",
		(os_n_file_reads - os_n_file_reads_old)
		/ time_elapsed,
		(ulint) avg_bytes_read,
		(os_n_file_writes - os_n_file_writes_old)
		/ time_elapsed,
		(os_n_fsyncs - os_n_fsyncs_old)
		/ time_elapsed);

	/* Start a new measuring interval. */
	os_n_file_reads_old = os_n_file_reads;
	os_n_file_writes_old = os_n_file_writes;
	os_n_fsyncs_old = os_n_fsyncs;
	os_bytes_read_since_printout = 0;

	os_last_printout = current_time;
}
7211 
7212 /** Refreshes the statistics used to print per-second averages. */
7213 void
os_aio_refresh_stats()7214 os_aio_refresh_stats()
7215 {
7216 	os_n_fsyncs_old = os_n_fsyncs;
7217 
7218 	os_bytes_read_since_printout = 0;
7219 
7220 	os_n_file_reads_old = os_n_file_reads;
7221 
7222 	os_n_file_writes_old = os_n_file_writes;
7223 
7224 	os_n_fsyncs_old = os_n_fsyncs;
7225 
7226 	os_bytes_read_since_printout = 0;
7227 
7228 	os_last_printout = time(NULL);
7229 }
7230 
7231 /** Checks that all slots in the system have been freed, that is, there are
7232 no pending io operations.
7233 @return true if all free */
7234 bool
os_aio_all_slots_free()7235 os_aio_all_slots_free()
7236 {
7237 	return(AIO::total_pending_io_count() == 0);
7238 }
7239 
7240 #ifdef UNIV_DEBUG
7241 /** Prints all pending IO for the array
7242 @param[in]	file	file where to print
7243 @param[in]	array	array to process */
7244 void
to_file(FILE * file) const7245 AIO::to_file(FILE* file) const
7246 {
7247 	acquire();
7248 
7249 	fprintf(file, " " ULINTPF "\n", m_n_reserved);
7250 
7251 	for (ulint i = 0; i < m_slots.size(); ++i) {
7252 
7253 		const Slot&	slot = m_slots[i];
7254 
7255 		if (slot.is_reserved) {
7256 
7257 			fprintf(file,
7258 				"%s IO for %s (offset=" UINT64PF
7259 				", size=%lu)\n",
7260 				slot.type.is_read() ? "read" : "write",
7261 				slot.name, slot.offset, (unsigned long)(slot.len));
7262 		}
7263 	}
7264 
7265 	release();
7266 }
7267 
7268 /** Print pending IOs for all arrays */
7269 void
print_to_file(FILE * file)7270 AIO::print_to_file(FILE* file)
7271 {
7272 	fprintf(file, "Pending normal aio reads:");
7273 
7274 	s_reads->to_file(file);
7275 
7276 	if (s_writes != NULL) {
7277 		fprintf(file, "Pending normal aio writes:");
7278 		s_writes->to_file(file);
7279 	}
7280 
7281 	if (s_ibuf != NULL) {
7282 		fprintf(file, "Pending ibuf aio reads:");
7283 		s_ibuf->to_file(file);
7284 	}
7285 
7286 	if (s_log != NULL) {
7287 		fprintf(file, "Pending log i/o's:");
7288 		s_log->to_file(file);
7289 	}
7290 
7291 	if (s_sync != NULL) {
7292 		fprintf(file, "Pending sync i/o's:");
7293 		s_sync->to_file(file);
7294 	}
7295 }
7296 
7297 /** Prints all pending IO
7298 @param[in]	file		File where to print */
7299 void
os_aio_print_pending_io(FILE * file)7300 os_aio_print_pending_io(
7301 	FILE*	file)
7302 {
7303 	AIO::print_to_file(file);
7304 }
7305 
7306 #endif /* UNIV_DEBUG */
7307 
7308 /**
7309 Set the file create umask
7310 @param[in]	umask		The umask to use for file creation. */
7311 void
os_file_set_umask(ulint umask)7312 os_file_set_umask(ulint umask)
7313 {
7314 	os_innodb_umask = umask;
7315 }
7316 
7317 #ifdef _WIN32
7318 
7319 /* Checks whether physical drive is on SSD.*/
is_drive_on_ssd(DWORD nr)7320 static bool is_drive_on_ssd(DWORD nr)
7321 {
7322   char physical_drive_path[32];
7323   snprintf(physical_drive_path, sizeof(physical_drive_path),
7324            "\\\\.\\PhysicalDrive%lu", nr);
7325 
7326   HANDLE h= CreateFile(physical_drive_path, 0,
7327                  FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
7328                  nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
7329   if (h == INVALID_HANDLE_VALUE)
7330     return false;
7331 
7332   DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty;
7333   STORAGE_PROPERTY_QUERY storage_query{};
7334   storage_query.PropertyId= StorageDeviceSeekPenaltyProperty;
7335   storage_query.QueryType= PropertyStandardQuery;
7336 
7337   bool on_ssd= false;
7338   DWORD bytes_written;
7339   if (DeviceIoControl(h, IOCTL_STORAGE_QUERY_PROPERTY, &storage_query,
7340                       sizeof storage_query, &seek_penalty, sizeof seek_penalty,
7341                       &bytes_written, nullptr))
7342   {
7343     on_ssd= seek_penalty.IncursSeekPenalty;
7344   }
7345   else
7346   {
7347     on_ssd= false;
7348   }
7349   CloseHandle(h);
7350   return on_ssd;
7351 }
7352 
/*
  Checks whether volume is on SSD, by checking all physical drives
  in that volume.

  Returns true only if is_drive_on_ssd() holds for every disk extent of
  the volume. Returns false if the volume name cannot be determined, if
  the volume cannot be opened, or if its disk extents cannot be queried.
*/
static bool is_volume_on_ssd(const char *volume_mount_point)
{
  char volume_name[MAX_PATH];

  if (!GetVolumeNameForVolumeMountPoint(volume_mount_point, volume_name,
                                        array_elements(volume_name)))
  {
    /* This can fail, e.g if file is on network share */
    return false;
  }

  /* Chomp last backslash, this is needed to open volume.*/
  size_t length= strlen(volume_name);
  if (length && volume_name[length - 1] == '\\')
    volume_name[length - 1]= 0;

  /* Open volume handle */
  HANDLE volume_handle= CreateFile(
      volume_name, 0, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
      nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);

  if (volume_handle == INVALID_HANDLE_VALUE)
    return false;

  /*
   Enumerate all volume extents, check whether all of them are on SSD
  */

  /* Anticipate common case where there is only one extent.*/
  VOLUME_DISK_EXTENTS single_extent;

  /* But also have a place to manage allocated data.*/
  std::unique_ptr<BYTE[]> lifetime;

  DWORD bytes_written;
  /* Stays nullptr if no query succeeds. */
  VOLUME_DISK_EXTENTS *extents= nullptr;
  if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
                      nullptr, 0, &single_extent, sizeof(single_extent),
                      &bytes_written, nullptr))
  {
    /* Worked on the first try. Use the preallocated buffer.*/
    extents= &single_extent;
  }
  else
  {
    /*
      Retry with a bigger buffer for as long as the query fails with
      ERROR_MORE_DATA. The buffer is sized from the extent count that
      the previous query stored; the count is read before the previous
      buffer is replaced.
    */
    VOLUME_DISK_EXTENTS *last_query= &single_extent;
    while (GetLastError() == ERROR_MORE_DATA)
    {
      DWORD extentCount= last_query->NumberOfDiskExtents;
      DWORD allocatedSize=
          FIELD_OFFSET(VOLUME_DISK_EXTENTS, Extents[extentCount]);
      lifetime.reset(new BYTE[allocatedSize]);
      last_query= (VOLUME_DISK_EXTENTS *) lifetime.get();
      if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
                          nullptr, 0, last_query, allocatedSize,
                          &bytes_written, nullptr))
      {
        extents= last_query;
        break;
      }
    }
  }
  /* The extents were copied to a buffer; the handle is not needed
  for checking the drives. */
  CloseHandle(volume_handle);
  if (!extents)
    return false;

  /* A single drive that is not an SSD makes the whole volume non-SSD. */
  for (DWORD i= 0; i < extents->NumberOfDiskExtents; i++)
    if (!is_drive_on_ssd(extents->Extents[i].DiskNumber))
      return false;

  return true;
}
7429 
7430 #include <unordered_map>
/*
  Checks whether a file is on SSD, by checking the volume that the file
  resides on. The result for each volume path is cached, so that
  is_volume_on_ssd() is normally called once per volume.

  Returns false if the volume path of the file cannot be determined.
*/
static bool is_file_on_ssd(char *file_path)
{
  /* Cache of volume_path => volume_info, protected by rwlock.*/
  static std::unordered_map<std::string, bool> cache;
  static SRWLOCK lock= SRWLOCK_INIT;

  /* Determine the volume; report "not SSD" if that fails. */
  char volume_path[MAX_PATH];
  if (!GetVolumePathName(file_path, volume_path, array_elements(volume_path)))
    return false;

  /* Try cached volume info first.*/
  std::string volume_path_str(volume_path);
  bool found;
  bool result;
  /* Lookups only need the lock in shared mode. */
  AcquireSRWLockShared(&lock);
  auto e= cache.find(volume_path_str);
  if ((found= e != cache.end()))
    result= e->second;
  ReleaseSRWLockShared(&lock);

  if (found)
    return result;

  /* Cache miss. The volume is queried without holding the lock. */
  result= is_volume_on_ssd(volume_path);

  /* Update cache */
  AcquireSRWLockExclusive(&lock);
  cache[volume_path_str]= result;
  ReleaseSRWLockExclusive(&lock);
  return result;
}
7463 
7464 #endif
7465 
/** Determine some file metadata when creating or reading the file.
Sets on_ssd, block_size (when it can be determined) and
space->atomic_write_supported; on Windows, may also clear
space->punch_hole.
@param	file	the file that is being created, or OS_FILE_CLOSED
		to use the handle of this node
@param	statbuf	(not on Windows) the result of stat() on the file,
		or NULL to have fstat() called here */
void fil_node_t::find_metadata(os_file_t file
#ifndef _WIN32
			       , struct stat* statbuf
#endif
			       )
{
	if (file == OS_FILE_CLOSED) {
		file = handle;
		ut_ad(is_open());
	}

#ifdef _WIN32 /* FIXME: make this unconditional */
	/* Keep hole punching enabled only if the file supports it. */
	if (space->punch_hole) {
		space->punch_hole = os_is_sparse_file_supported(file);
	}
#endif

	/*
	For the temporary tablespace and during the
	non-redo-logged adjustments in
	IMPORT TABLESPACE, we do not care about
	the atomicity of writes.

	Atomic writes is supported if the file can be used
	with atomic_writes (not log file), O_DIRECT is
	used (tested in ha_innodb.cc) and the file is on a
	device and file system that supports atomic writes
	for the given block size.
	*/
	space->atomic_write_supported = space->purpose == FIL_TYPE_TEMPORARY
		|| space->purpose == FIL_TYPE_IMPORT;
#ifdef _WIN32
	on_ssd = is_file_on_ssd(name);
	FILE_STORAGE_INFO info;
	if (GetFileInformationByHandleEx(
		file, FileStorageInfo, &info, sizeof(info))) {
		block_size = info.PhysicalBytesPerSectorForAtomicity;
	} else {
		/* The storage information is not available;
		fall back to 512 bytes. */
		block_size = 512;
	}
#else
	struct stat sbuf;
	/* Use the caller's stat result if there is one; otherwise
	try fstat(). If that fails, statbuf stays NULL and block_size
	is not assigned here. */
	if (!statbuf && !fstat(file, &sbuf)) {
		statbuf = &sbuf;
	}
	if (statbuf) {
		block_size = statbuf->st_blksize;
	}
	/* At this point atomic_write_supported only holds for the
	temporary tablespace and for IMPORT TABLESPACE (see above). */
	on_ssd = space->atomic_write_supported
# ifdef UNIV_LINUX
		|| (statbuf && fil_system.is_ssd(statbuf->st_dev))
# endif
		;
#endif
	if (!space->atomic_write_supported) {
		space->atomic_write_supported = atomic_write
			&& srv_use_atomic_writes
#ifndef _WIN32
			&& my_test_if_atomic_write(file,
						   space->physical_size())
#else
			/* On Windows, all single sector writes are atomic,
			as per WriteFile() documentation on MSDN.
			We also require SSD for atomic writes, even though
			technically it is not necessary - the reason is that
			on hard disks, we still want the benefit from
			(non-atomic) neighbor page flushing in the buffer
			pool code. */
			&& srv_page_size == block_size
			&& on_ssd
#endif
			;
	}
}
7542 
7543 /** Read the first page of a data file.
7544 @param[in]	first	whether this is the very first read
7545 @return	whether the page was found valid */
read_page0(bool first)7546 bool fil_node_t::read_page0(bool first)
7547 {
7548 	ut_ad(mutex_own(&fil_system.mutex));
7549 	ut_a(space->purpose != FIL_TYPE_LOG);
7550 	const ulint psize = space->physical_size();
7551 #ifndef _WIN32
7552 	struct stat statbuf;
7553 	if (fstat(handle, &statbuf)) {
7554 		return false;
7555 	}
7556 	os_offset_t size_bytes = statbuf.st_size;
7557 #else
7558 	os_offset_t size_bytes = os_file_get_size(handle);
7559 	ut_a(size_bytes != (os_offset_t) -1);
7560 #endif
7561 	const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
7562 
7563 	if (size_bytes < min_size) {
7564 		ib::error() << "The size of the file " << name
7565 			    << " is only " << size_bytes
7566 			    << " bytes, should be at least " << min_size;
7567 		return false;
7568 	}
7569 
7570 	byte* buf2 = static_cast<byte*>(ut_malloc_nokey(2 * psize));
7571 
7572 	/* Align the memory for file i/o if we might have O_DIRECT set */
7573 	byte* page = static_cast<byte*>(ut_align(buf2, psize));
7574 	IORequest request(IORequest::READ);
7575 	if (os_file_read(request, handle, page, 0, psize) != DB_SUCCESS) {
7576 		ib::error() << "Unable to read first page of file " << name;
7577 		ut_free(buf2);
7578 		return false;
7579 	}
7580 	const ulint space_id = fsp_header_get_space_id(page);
7581 	ulint flags = fsp_header_get_flags(page);
7582 	const ulint size = fsp_header_get_field(page, FSP_SIZE);
7583 	const ulint free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT);
7584 	const ulint free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
7585 					    + page);
7586 	if (!fil_space_t::is_valid_flags(flags, space->id)) {
7587 		ulint cflags = fsp_flags_convert_from_101(flags);
7588 		if (cflags == ULINT_UNDEFINED) {
7589 invalid:
7590 			ib::error()
7591 				<< "Expected tablespace flags "
7592 				<< ib::hex(space->flags)
7593 				<< " but found " << ib::hex(flags)
7594 				<< " in the file " << name;
7595 			ut_free(buf2);
7596 			return false;
7597 		}
7598 
7599 		ulint cf = cflags & ~FSP_FLAGS_MEM_MASK;
7600 		ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK;
7601 
7602 		if (!fil_space_t::is_flags_equal(cf, sf)
7603 		    && !fil_space_t::is_flags_equal(sf, cf)) {
7604 			goto invalid;
7605 		}
7606 
7607 		flags = cflags;
7608 	}
7609 
7610 	ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
7611 
7612 	/* Try to read crypt_data from page 0 if it is not yet read. */
7613 	if (!space->crypt_data) {
7614 		space->crypt_data = fil_space_read_crypt_data(
7615 			fil_space_t::zip_size(flags), page);
7616 	}
7617 	ut_free(buf2);
7618 
7619 	if (UNIV_UNLIKELY(space_id != space->id)) {
7620 		ib::error() << "Expected tablespace id " << space->id
7621 			<< " but found " << space_id
7622 			<< " in the file " << name;
7623 		return false;
7624 	}
7625 
7626 	if (first) {
7627 		ut_ad(space->id != TRX_SYS_SPACE);
7628 #ifdef UNIV_LINUX
7629 		find_metadata(handle, &statbuf);
7630 #else
7631 		find_metadata();
7632 #endif
7633 
7634 		/* Truncate the size to a multiple of extent size. */
7635 		ulint	mask = psize * FSP_EXTENT_SIZE - 1;
7636 
7637 		if (size_bytes <= mask) {
7638 			/* .ibd files start smaller than an
7639 			extent size. Do not truncate valid data. */
7640 		} else {
7641 			size_bytes &= ~os_offset_t(mask);
7642 		}
7643 
7644 		space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags;
7645 
7646 		this->size = ulint(size_bytes / psize);
7647 		space->committed_size = space->size += this->size;
7648 	} else if (space->id != TRX_SYS_SPACE || space->size_in_header) {
7649 		/* If this is not the first-time open, do nothing.
7650 		For the system tablespace, we always get invoked as
7651 		first=false, so we detect the true first-time-open based
7652 		on size_in_header and proceed to initialize the data. */
7653 		return true;
7654 	} else {
7655 		/* Initialize the size of predefined tablespaces
7656 		to FSP_SIZE. */
7657 		space->committed_size = size;
7658 	}
7659 
7660 	ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
7661 	ut_ad(space->free_len == 0 || space->free_len == free_len);
7662 	space->size_in_header = size;
7663 	space->free_limit = free_limit;
7664 	space->free_len = free_len;
7665 	return true;
7666 }
7667 
7668 #else
7669 #include "univ.i"
7670 #endif /* !UNIV_INNOCHECKSUM */
7671 
7672 /** Normalizes a directory path for the current OS:
7673 On Windows, we convert '/' to '\', else we convert '\' to '/'.
7674 @param[in,out] str A null-terminated directory and file path */
7675 void
os_normalize_path(char * str)7676 os_normalize_path(
7677 	char*	str)
7678 {
7679 	if (str != NULL) {
7680 		for (; *str; str++) {
7681 			if (*str == OS_PATH_SEPARATOR_ALT) {
7682 				*str = OS_PATH_SEPARATOR;
7683 			}
7684 		}
7685 	}
7686 }
7687