1 /***********************************************************************
2 
3 Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Percona Inc.
5 Copyright (c) 2013, 2021, MariaDB Corporation.
6 
7 Portions of this file contain modifications contributed and copyrighted
8 by Percona Inc.. Those modifications are
9 gratefully acknowledged and are described briefly in the InnoDB
10 documentation. The contributions by Percona Inc. are incorporated with
11 their permission, and subject to the conditions contained in the file
12 COPYING.Percona.
13 
14 This program is free software; you can redistribute it and/or modify it
15 under the terms of the GNU General Public License as published by the
16 Free Software Foundation; version 2 of the License.
17 
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
21 Public License for more details.
22 
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
26 
27 ***********************************************************************/
28 
29 /**************************************************//**
30 @file os/os0file.cc
31 The interface to the operating system file i/o primitives
32 
33 Created 10/21/1995 Heikki Tuuri
34 *******************************************************/
35 
36 #ifndef UNIV_INNOCHECKSUM
37 #include "os0file.h"
38 #include "sql_const.h"
39 
40 #ifdef UNIV_LINUX
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #endif
44 
45 #include "srv0srv.h"
46 #include "srv0start.h"
47 #include "fil0fil.h"
48 #include "srv0srv.h"
49 #ifdef HAVE_LINUX_UNISTD_H
50 #include "unistd.h"
51 #endif
52 #include "os0event.h"
53 #include "os0thread.h"
54 
55 #include <vector>
56 
57 #ifdef LINUX_NATIVE_AIO
58 #include <libaio.h>
59 #endif /* LINUX_NATIVE_AIO */
60 
61 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
62 # include <fcntl.h>
63 # include <linux/falloc.h>
64 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
65 
66 #if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
67 # include <sys/ioctl.h>
68 # ifndef DFS_IOCTL_ATOMIC_WRITE_SET
69 #  define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
70 # endif
71 #endif
72 
73 #if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H)
74 #include <sys/statvfs.h>
75 #endif
76 
77 #if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H)
78 #include <linux/falloc.h>
79 #endif
80 
81 #ifdef _WIN32
82 #include <winioctl.h>
83 #endif
84 
85 /** Insert buffer segment id */
86 static const ulint IO_IBUF_SEGMENT = 0;
87 
88 /** Log segment id */
89 static const ulint IO_LOG_SEGMENT = 1;
90 
91 /** Number of retries for partial I/O's */
92 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
93 
94 /* This specifies the file permissions InnoDB uses when it creates files in
95 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
96 my_umask */
97 
98 #ifndef _WIN32
99 /** Umask for creating files */
100 static ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
101 #else
102 /** Umask for creating files */
103 static ulint	os_innodb_umask	= 0;
104 static HANDLE	data_completion_port;
105 static HANDLE	log_completion_port;
106 
107 static DWORD	fls_sync_io  = FLS_OUT_OF_INDEXES;
108 #define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1
109 #endif /* _WIN32 */
110 
111 /** In simulated aio, merge at most this many consecutive i/os */
112 static const ulint	OS_AIO_MERGE_N_CONSECUTIVE = 64;
113 
114 /** Flag indicating if the page_cleaner is in active state. */
115 extern bool buf_page_cleaner_is_active;
116 
117 #ifdef WITH_INNODB_DISALLOW_WRITES
118 #define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
119 #else
120 #define WAIT_ALLOW_WRITES() do { } while (0)
121 #endif /* WITH_INNODB_DISALLOW_WRITES */
122 
123 /**********************************************************************
124 
125 InnoDB AIO Implementation:
126 =========================
127 
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO by special IO-threads servicing the IO-requests.
130 
131 Simulated AIO:
132 ==============
133 
134 On platforms where we 'simulate' AIO, the following is a rough explanation
135 of the high level design.
136 There are four io-threads (for ibuf, log, read, write).
137 All synchronous IO requests are serviced by the calling thread using
138 os_file_write/os_file_read. The Asynchronous requests are queued up
139 in an array (there are four such arrays) by the calling thread.
140 Later these requests are picked up by the IO-thread and are serviced
141 synchronously.
142 
143 Windows native AIO:
144 ==================
145 
If srv_use_native_aio is not set then Windows follows the same
code as simulated AIO. If the flag is set then native AIO interface
is used. On Windows, one of the limitations is that if a file is opened
for AIO no synchronous IO can be done on it. Therefore we have an
extra fifth array to queue up synchronous IO requests.
151 There are innodb_file_io_threads helper threads. These threads work
152 on the four arrays mentioned above in Simulated AIO. No thread is
153 required for the sync array.
154 If a synchronous IO request is made, it is first queued in the sync
155 array. Then the calling thread itself waits on the request, thus
156 making the call synchronous.
157 If an AIO request is made the calling thread not only queues it in the
158 array but also submits the requests. The helper thread then collects
159 the completed IO request and calls completion routine on it.
160 
161 Linux native AIO:
162 =================
163 
164 If we have libaio installed on the system and innodb_use_native_aio
165 is set to true we follow the code path of native AIO, otherwise we
166 do simulated AIO.
167 There are innodb_file_io_threads helper threads. These threads work
168 on the four arrays mentioned above in Simulated AIO.
169 If a synchronous IO request is made, it is handled by calling
170 os_file_write/os_file_read.
171 If an AIO request is made the calling thread not only queues it in the
172 array but also submits the requests. The helper thread then collects
173 the completed IO request and calls completion routine on it.
174 
175 **********************************************************************/
176 
177 
178 #ifdef UNIV_PFS_IO
179 /* Keys to register InnoDB I/O with performance schema */
180 mysql_pfs_key_t  innodb_data_file_key;
181 mysql_pfs_key_t  innodb_log_file_key;
182 mysql_pfs_key_t  innodb_temp_file_key;
183 #endif /* UNIV_PFS_IO */
184 
185 class AIO;
186 
/** The asynchronous I/O context: the bookkeeping for one request
held in an AIO array (see class AIO, which stores a vector of these). */
struct Slot {

#ifdef WIN_ASYNC_IO
	/** Windows control block for the aio request.
	Must be at the very start of Slot, so that a Slot* can be
	cast to OVERLAPPED*. Do not move this member.
	*/
	OVERLAPPED		control;
#endif

	/** index of the slot in the aio array */
	uint16_t		pos;

	/** true if this slot is reserved */
	bool			is_reserved;

	/** time when reserved */
	time_t			reservation_time;

	/** buffer used in i/o */
	byte*			buf;

	/** Buffer pointer used for the actual IO. When partial IO is
	required, this pointer is advanced and buf is left untouched. */
	byte*			ptr;

	/** the IO request: OS_FILE_READ or OS_FILE_WRITE */
	IORequest		type;

	/** file offset in bytes */
	os_offset_t		offset;

	/** file where to read or write */
	pfs_os_file_t		file;

	/** file name or path */
	const char*		name;

	/** used only in simulated aio: true if the physical i/o
	already made and only the slot message needs to be passed
	to the caller of os_aio_simulated_handle */
	bool			io_already_done;

	/** file block size */
	ulint			file_block_size;

	/** The file node for which the IO is requested. */
	fil_node_t*		m1;

	/** the requester of an aio operation and which can be used
	to identify which pending aio operation was completed */
	void*			m2;

	/** AIO completion status */
	dberr_t			err;

	/* The remaining request-length/result members differ in type
	and order depending on the AIO flavour compiled in. */
#ifdef WIN_ASYNC_IO

	/** bytes written/read */
	DWORD			n_bytes;

	/** length of the block to read or write */
	DWORD			len;

	/** aio array containing this slot */
	AIO				*array;
#elif defined(LINUX_NATIVE_AIO)
	/** Linux control block for aio */
	struct iocb		control;

	/** AIO return code */
	int			ret;

	/** bytes written/read. */
	ssize_t			n_bytes;

	/** length of the block to read or write */
	ulint			len;
#else
	/** length of the block to read or write */
	ulint			len;

	/** bytes written/read. */
	ulint			n_bytes;
#endif /* WIN_ASYNC_IO */

	/** Length of the block before it was compressed */
	uint32			original_len;

};
278 
279 /** The asynchronous i/o array structure */
280 class AIO {
281 public:
282 	/** Constructor
283 	@param[in]	id		Latch ID
284 	@param[in]	n_slots		Number of slots to configure
285 	@param[in]	segments	Number of segments to configure */
286 	AIO(latch_id_t id, ulint n_slots, ulint segments);
287 
288 	/** Destructor */
289 	~AIO();
290 
291 	/** Initialize the instance
292 	@return DB_SUCCESS or error code */
293 	dberr_t init();
294 
295 	/** Requests for a slot in the aio array. If no slot is available, waits
296 	until not_full-event becomes signaled.
297 
298 	@param[in]	type	IO context
299 	@param[in,out]	m1	message to be passed along with the AIO
300 				operation
301 	@param[in,out]	m2	message to be passed along with the AIO
302 				operation
303 	@param[in]	file	file handle
304 	@param[in]	name	name of the file or path as a null-terminated
305 				string
306 	@param[in,out]	buf	buffer where to read or from which to write
307 	@param[in]	offset	file offset, where to read from or start writing
308 	@param[in]	len	length of the block to read or write
309 	@return pointer to slot */
310 	Slot* reserve_slot(
311 		const IORequest&	type,
312 		fil_node_t*		m1,
313 		void*			m2,
314 		pfs_os_file_t		file,
315 		const char*		name,
316 		void*			buf,
317 		os_offset_t		offset,
318 		ulint			len)
319 		MY_ATTRIBUTE((warn_unused_result));
320 
321 	/** @return number of reserved slots */
322 	ulint pending_io_count() const;
323 
324 	/** Returns a pointer to the nth slot in the aio array.
325 	@param[in]	index	Index of the slot in the array
326 	@return pointer to slot */
at(ulint i) const327 	const Slot* at(ulint i) const
328 		MY_ATTRIBUTE((warn_unused_result))
329 	{
330 		ut_a(i < m_slots.size());
331 
332 		return(&m_slots[i]);
333 	}
334 
335 	/** Non const version */
at(ulint i)336 	Slot* at(ulint i)
337 		MY_ATTRIBUTE((warn_unused_result))
338 	{
339 		ut_a(i < m_slots.size());
340 
341 		return(&m_slots[i]);
342 	}
343 
344 	/** Frees a slot in the AIO array, assumes caller owns the mutex.
345 	@param[in,out]	slot	Slot to release */
346 	void release(Slot* slot);
347 
348 	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
349 	@param[in,out]	slot	Slot to release */
350 	void release_with_mutex(Slot* slot);
351 
352 	/** Prints info about the aio array.
353 	@param[in,out]	file	Where to print */
354 	void print(FILE* file);
355 
356 	/** @return the number of slots per segment */
slots_per_segment() const357 	ulint slots_per_segment() const
358 		MY_ATTRIBUTE((warn_unused_result))
359 	{
360 		return(m_slots.size() / m_n_segments);
361 	}
362 
363 	/** @return accessor for n_segments */
get_n_segments() const364 	ulint get_n_segments() const
365 		MY_ATTRIBUTE((warn_unused_result))
366 	{
367 		return(m_n_segments);
368 	}
369 
370 #ifdef UNIV_DEBUG
371 	/** @return true if the thread owns the mutex */
is_mutex_owned() const372 	bool is_mutex_owned() const
373 		MY_ATTRIBUTE((warn_unused_result))
374 	{
375 		return(mutex_own(&m_mutex));
376 	}
377 #endif /* UNIV_DEBUG */
378 
379 	/** Acquire the mutex */
acquire() const380 	void acquire() const
381 	{
382 		mutex_enter(&m_mutex);
383 	}
384 
385 	/** Release the mutex */
release() const386 	void release() const
387 	{
388 		mutex_exit(&m_mutex);
389 	}
390 
391 	/** Write out the state to the file/stream
392 	@param[in, out]	file	File to write to */
393 	void to_file(FILE* file) const;
394 
395 #ifdef LINUX_NATIVE_AIO
396 	/** Dispatch an AIO request to the kernel.
397 	@param[in,out]	slot	an already reserved slot
398 	@return true on success. */
399 	bool linux_dispatch(Slot* slot)
400 		MY_ATTRIBUTE((warn_unused_result));
401 
402 	/** Accessor for an AIO event
403 	@param[in]	index	Index into the array
404 	@return the event at the index */
io_events(ulint index)405 	io_event* io_events(ulint index)
406 		MY_ATTRIBUTE((warn_unused_result))
407 	{
408 		ut_a(index < m_events.size());
409 
410 		return(&m_events[index]);
411 	}
412 
413 	/** Accessor for the AIO context
414 	@param[in]	segment	Segment for which to get the context
415 	@return the AIO context for the segment */
io_ctx(ulint segment)416 	io_context_t io_ctx(ulint segment)
417 		MY_ATTRIBUTE((warn_unused_result))
418 	{
419 		ut_ad(segment < get_n_segments());
420 
421 		return(m_aio_ctx[segment]);
422 	}
423 
424 	/** Creates an io_context_t for native linux AIO.
425 	@param[in]	max_events	number of events
426 	@param[out]	io_ctx		io_ctx to initialize.
427 	@return true on success. */
428 	static bool linux_create_io_ctx(unsigned max_events, io_context_t& io_ctx)
429 		MY_ATTRIBUTE((warn_unused_result));
430 
431 	/** Checks if the system supports native linux aio. On some kernel
432 	versions where native aio is supported it won't work on tmpfs. In such
433 	cases we can't use native aio as it is not possible to mix simulated
434 	and native aio.
435 	@return true if supported, false otherwise. */
436 	static bool is_linux_native_aio_supported()
437 		MY_ATTRIBUTE((warn_unused_result));
438 #endif /* LINUX_NATIVE_AIO */
439 
440 #ifdef WIN_ASYNC_IO
441 	HANDLE m_completion_port;
442 	/** Wake up all AIO threads in Windows native aio */
wake_at_shutdown()443 	static void wake_at_shutdown() {
444 		AIO *all_arrays[] = {s_reads, s_writes, s_log, s_ibuf };
445 		for (size_t i = 0; i < array_elements(all_arrays); i++) {
446 			AIO *a = all_arrays[i];
447 			if (a) {
448 				PostQueuedCompletionStatus(a->m_completion_port, 0,
449 					IOCP_SHUTDOWN_KEY, 0);
450 			}
451 		}
452 	}
453 #endif /* WIN_ASYNC_IO */
454 
455 #ifdef _WIN32
456 	/** This function can be called if one wants to post a batch of reads
457 	and prefers an I/O - handler thread to handle them all at once later.You
458 	must call os_aio_simulated_wake_handler_threads later to ensure the
459 	threads are not left sleeping! */
460 	static void simulated_put_read_threads_to_sleep();
461 #endif /* _WIN32 */
462 
463 	/** Create an instance using new(std::nothrow)
464 	@param[in]	id		Latch ID
465 	@param[in]	n_slots		The number of AIO request slots
466 	@param[in]	segments	The number of segments
467 	@return a new AIO instance */
468 	static AIO* create(
469 		latch_id_t	id,
470 		ulint		n_slots,
471 		ulint		segments)
472 		MY_ATTRIBUTE((warn_unused_result));
473 
474 	/** Initializes the asynchronous io system. Creates one array each
475 	for ibuf and log I/O. Also creates one array each for read and write
476 	where each array is divided logically into n_readers and n_writers
477 	respectively. The caller must create an i/o handler thread for each
478 	segment in these arrays. This function also creates the sync array.
479 	No I/O handler thread needs to be created for that
480 	@param[in]	n_per_seg	maximum number of pending aio
481 					operations allowed per segment
482 	@param[in]	n_readers	number of reader threads
483 	@param[in]	n_writers	number of writer threads
484 	@param[in]	n_slots_sync	number of slots in the sync aio array
485 	@return true if AIO sub-system was started successfully */
486 	static bool start(
487 		ulint		n_per_seg,
488 		ulint		n_readers,
489 		ulint		n_writers,
490 		ulint		n_slots_sync)
491 		MY_ATTRIBUTE((warn_unused_result));
492 
493 	/** Free the AIO arrays */
494 	static void shutdown();
495 
496 	/** Print all the AIO segments
497 	@param[in,out]	file		Where to print */
498 	static void print_all(FILE* file);
499 
500 	/** Calculates local segment number and aio array from global
501 	segment number.
502 	@param[out]	array		AIO wait array
503 	@param[in]	segment		global segment number
504 	@return local segment number within the aio array */
505 	static ulint get_array_and_local_segment(
506 		AIO**		array,
507 		ulint		segment)
508 		MY_ATTRIBUTE((warn_unused_result));
509 
510 	/** Select the IO slot array
511 	@param[in,out]	type		Type of IO, READ or WRITE
512 	@param[in]	read_only	true if running in read-only mode
513 	@param[in]	mode		IO mode
514 	@return slot array or NULL if invalid mode specified */
515 	static AIO* select_slot_array(
516 		IORequest&		type,
517 		bool			read_only,
518 		ulint			mode)
519 		MY_ATTRIBUTE((warn_unused_result));
520 
521 	/** Calculates segment number for a slot.
522 	@param[in]	array		AIO wait array
523 	@param[in]	slot		slot in this array
524 	@return segment number (which is the number used by, for example,
525 		I/O handler threads) */
526 	static ulint get_segment_no_from_slot(
527 		const AIO*	array,
528 		const Slot*	slot)
529 		MY_ATTRIBUTE((warn_unused_result));
530 
531 	/** Wakes up a simulated AIO I/O-handler thread if it has something
532 	to do.
533 	@param[in]	global_segment	the number of the segment in the
534 					AIO arrays */
535 	static void wake_simulated_handler_thread(ulint global_segment);
536 
537 	/** Check if it is a read request
538 	@param[in]	aio		The AIO instance to check
539 	@return true if the AIO instance is for reading. */
is_read(const AIO * aio)540 	static bool is_read(const AIO* aio)
541 		MY_ATTRIBUTE((warn_unused_result))
542 	{
543 		return(s_reads == aio);
544 	}
545 
546 	/** Wait on an event until no pending writes */
wait_until_no_pending_writes()547 	static void wait_until_no_pending_writes()
548 	{
549 		os_event_wait(AIO::s_writes->m_is_empty);
550 	}
551 
552 	/** Print to file
553 	@param[in]	file		File to write to */
554 	static void print_to_file(FILE* file);
555 
556 	/** Check for pending IO. Gets the count and also validates the
557 	data structures.
558 	@return count of pending IO requests */
559 	static ulint total_pending_io_count();
560 
561 private:
562 	/** Initialise the slots
563 	@return DB_SUCCESS or error code */
564 	dberr_t init_slots()
565 		MY_ATTRIBUTE((warn_unused_result));
566 
567 	/** Wakes up a simulated AIO I/O-handler thread if it has something
568 	to do for a local segment in the AIO array.
569 	@param[in]	global_segment	the number of the segment in the
570 					AIO arrays
571 	@param[in]	segment		the local segment in the AIO array */
572 	void wake_simulated_handler_thread(ulint global_segment, ulint segment);
573 
574 	/** Prints pending IO requests per segment of an aio array.
575 	We probably don't need per segment statistics but they can help us
576 	during development phase to see if the IO requests are being
577 	distributed as expected.
578 	@param[in,out]	file		file where to print
579 	@param[in]	segments	pending IO array */
580 	void print_segment_info(
581 		FILE*		file,
582 		const ulint*	segments);
583 
584 #ifdef LINUX_NATIVE_AIO
585 	/** Initialise the Linux native AIO data structures
586 	@return DB_SUCCESS or error code */
587 	dberr_t init_linux_native_aio()
588 		MY_ATTRIBUTE((warn_unused_result));
589 #endif /* LINUX_NATIVE_AIO */
590 
591 private:
592 	typedef std::vector<Slot> Slots;
593 
594 	/** the mutex protecting the aio array */
595 	mutable SysMutex	m_mutex;
596 
597 	/** Pointer to the slots in the array.
598 	Number of elements must be divisible by n_threads. */
599 	Slots			m_slots;
600 
601 	/** Number of segments in the aio array of pending aio requests.
602 	A thread can wait separately for any one of the segments. */
603 	ulint			m_n_segments;
604 
605 	/** The event which is set to the signaled state when
606 	there is space in the aio outside the ibuf segment;
607 	os_event_set() and os_event_reset() are protected by AIO::m_mutex */
608 	os_event_t		m_not_full;
609 
610 	/** The event which is set to the signaled state when
611 	there are no pending i/os in this array;
612 	os_event_set() and os_event_reset() are protected by AIO::m_mutex */
613 	os_event_t		m_is_empty;
614 
615 	/** Number of reserved slots in the AIO array outside
616 	the ibuf segment */
617 	ulint			m_n_reserved;
618 
619 
620 #if defined(LINUX_NATIVE_AIO)
621 	typedef std::vector<io_event> IOEvents;
622 
623 	/** completion queue for IO. There is one such queue per
624 	segment. Each thread will work on one ctx exclusively. */
625 	std::vector<io_context_t>		m_aio_ctx;
626 
627 	/** The array to collect completed IOs. There is one such
628 	event for each possible pending IO. The size of the array
629 	is equal to m_slots.size(). */
630 	IOEvents		m_events;
631 #endif /* LINUX_NATIV_AIO */
632 
633 	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
634 	sync AIO. These are NULL when the module has not yet been
635 	initialized. */
636 
637 	/** Insert buffer */
638 	static AIO*		s_ibuf;
639 
640 	/** Redo log */
641 	static AIO*		s_log;
642 
643 	/** Reads */
644 	static AIO*		s_reads;
645 
646 	/** Writes */
647 	static AIO*		s_writes;
648 
649 	/** Synchronous I/O */
650 	static AIO*		s_sync;
651 };
652 
653 /** Static declarations */
654 AIO*	AIO::s_reads;
655 AIO*	AIO::s_writes;
656 AIO*	AIO::s_ibuf;
657 AIO*	AIO::s_log;
658 AIO*	AIO::s_sync;
659 
660 #if defined(LINUX_NATIVE_AIO)
661 /** timeout for each io_getevents() call = 500ms. */
662 static const ulint	OS_AIO_REAP_TIMEOUT = 500000000UL;
663 
664 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
665 static const ulint	OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
666 
667 /** number of attempts before giving up on io_setup(). */
668 static const int	OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
669 #endif /* LINUX_NATIVE_AIO */
670 
671 /** Array of events used in simulated AIO */
672 static os_event_t*	os_aio_segment_wait_events;
673 
674 /** Number of asynchronous I/O segments.  Set by os_aio_init(). */
675 static ulint		os_aio_n_segments = ULINT_UNDEFINED;
676 
677 /** If the following is true, read i/o handler threads try to
678 wait until a batch of new read requests have been posted */
679 static bool		os_aio_recommend_sleep_for_read_threads;
680 
681 ulint	os_n_file_reads;
682 static ulint	os_bytes_read_since_printout;
683 ulint	os_n_file_writes;
684 ulint	os_n_fsyncs;
685 static ulint	os_n_file_reads_old;
686 static ulint	os_n_file_writes_old;
687 static ulint	os_n_fsyncs_old;
688 
689 static time_t	os_last_printout;
690 bool	os_has_said_disk_full;
691 
692 /** Default Zip compression level */
693 extern uint page_zip_level;
694 
695 /** Validates the consistency of the aio system.
696 @return true if ok */
697 static
698 bool
699 os_aio_validate();
700 
701 /** Handle errors for file operations.
702 @param[in]	name		name of a file or NULL
703 @param[in]	operation	operation
704 @param[in]	should_abort	whether to abort on an unknown error
705 @param[in]	on_error_silent	whether to suppress reports of non-fatal errors
706 @return true if we should retry the operation */
707 static MY_ATTRIBUTE((warn_unused_result))
708 bool
709 os_file_handle_error_cond_exit(
710 	const char*	name,
711 	const char*	operation,
712 	bool		should_abort,
713 	bool		on_error_silent);
714 
715 /** Does error handling when a file operation fails.
716 @param[in]	name		name of a file or NULL
717 @param[in]	operation	operation name that failed
718 @return true if we should retry the operation */
719 static
720 bool
os_file_handle_error(const char * name,const char * operation)721 os_file_handle_error(
722 	const char*	name,
723 	const char*	operation)
724 {
725 	/* Exit in case of unknown error */
726 	return(os_file_handle_error_cond_exit(name, operation, true, false));
727 }
728 
729 /** Does error handling when a file operation fails.
730 @param[in]	name		name of a file or NULL
731 @param[in]	operation	operation name that failed
732 @param[in]	on_error_silent	if true then don't print any message to the log.
733 @return true if we should retry the operation */
734 static
735 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)736 os_file_handle_error_no_exit(
737 	const char*	name,
738 	const char*	operation,
739 	bool		on_error_silent)
740 {
741 	/* Don't exit in case of unknown error */
742 	return(os_file_handle_error_cond_exit(
743 			name, operation, false, on_error_silent));
744 }
745 
746 /** Handle RENAME error.
747 @param name	old name of the file
748 @param new_name	new name of the file */
os_file_handle_rename_error(const char * name,const char * new_name)749 static void os_file_handle_rename_error(const char* name, const char* new_name)
750 {
751 	if (os_file_get_last_error(true) != OS_FILE_DISK_FULL) {
752 		ib::error() << "Cannot rename file '" << name << "' to '"
753 			<< new_name << "'";
754 	} else if (!os_has_said_disk_full) {
755 		os_has_said_disk_full = true;
756 		/* Disk full error is reported irrespective of the
757 		on_error_silent setting. */
758 		ib::error() << "Full disk prevents renaming file '"
759 			<< name << "' to '" << new_name << "'";
760 	}
761 }
762 
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[out]	type	IO context
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
784 
785 #ifdef _WIN32
786 static HANDLE win_get_syncio_event();
787 
788 /**
789  Wrapper around Windows DeviceIoControl() function.
790 
791  Works synchronously, also in case for handle opened
792  for async access (i.e with FILE_FLAG_OVERLAPPED).
793 
794  Accepts the same parameters as DeviceIoControl(),except
795  last parameter (OVERLAPPED).
796 */
797 static
798 BOOL
os_win32_device_io_control(HANDLE handle,DWORD code,LPVOID inbuf,DWORD inbuf_size,LPVOID outbuf,DWORD outbuf_size,LPDWORD bytes_returned)799 os_win32_device_io_control(
800 	HANDLE handle,
801 	DWORD code,
802 	LPVOID inbuf,
803 	DWORD inbuf_size,
804 	LPVOID outbuf,
805 	DWORD outbuf_size,
806 	LPDWORD bytes_returned
807 )
808 {
809 	OVERLAPPED overlapped = { 0 };
810 	overlapped.hEvent = win_get_syncio_event();
811 	BOOL result = DeviceIoControl(handle, code, inbuf, inbuf_size, outbuf,
812 		outbuf_size,  NULL, &overlapped);
813 
814 	if (result || (GetLastError() == ERROR_IO_PENDING)) {
815 		/* Wait for async io to complete */
816 		result = GetOverlappedResult(handle, &overlapped, bytes_returned, TRUE);
817 	}
818 
819 	return result;
820 }
821 
822 #endif
823 
824 /***********************************************************************//**
825 Try to get number of bytes per sector from file system.
826 @return	file block size */
827 UNIV_INTERN
828 ulint
os_file_get_block_size(os_file_t file,const char * name)829 os_file_get_block_size(
830 /*===================*/
831 	os_file_t	file,	/*!< in: handle to a file */
832 	const char*	name)	/*!< in: file name */
833 {
834 	ulint		fblock_size = 512;
835 
836 #if defined(UNIV_LINUX)
837 	struct stat local_stat;
838 	int		err;
839 
840 	err = fstat((int)file, &local_stat);
841 
842 	if (err != 0) {
843 		os_file_handle_error_no_exit(name, "fstat()", FALSE);
844 	} else {
845 		fblock_size = local_stat.st_blksize;
846 	}
847 #endif /* UNIV_LINUX */
848 #ifdef _WIN32
849 
850 	fblock_size = 0;
851 	BOOL result = false;
852 	size_t len = 0;
853 	// Open volume for this file, find out it "physical bytes per sector"
854 
855 	HANDLE volume_handle = INVALID_HANDLE_VALUE;
856 	char volume[MAX_PATH + 4]="\\\\.\\"; // Special prefix required for volume names.
857 	if (!GetVolumePathName(name , volume + 4, MAX_PATH)) {
858 		os_file_handle_error_no_exit(name,
859 			"GetVolumePathName()", FALSE);
860 		goto end;
861 	}
862 
863 	len = strlen(volume);
864 	if (volume[len - 1] == '\\') {
865 		// Trim trailing backslash from volume name.
866 		volume[len - 1] = 0;
867 	}
868 
869 	volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES,
870 		FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
871 		0, OPEN_EXISTING, 0, 0);
872 
873 	if (volume_handle == INVALID_HANDLE_VALUE) {
874 		if (GetLastError() != ERROR_ACCESS_DENIED) {
875 			os_file_handle_error_no_exit(volume,
876 				"CreateFile()", FALSE);
877 		}
878 		goto end;
879 	}
880 
881 	DWORD tmp;
882 	STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment;
883 
884 	STORAGE_PROPERTY_QUERY storage_query;
885 	memset(&storage_query, 0, sizeof(storage_query));
886 	storage_query.PropertyId = StorageAccessAlignmentProperty;
887 	storage_query.QueryType  = PropertyStandardQuery;
888 
889 	result = os_win32_device_io_control(volume_handle,
890 		IOCTL_STORAGE_QUERY_PROPERTY,
891 		&storage_query,
892 		sizeof(storage_query),
893 		&disk_alignment,
894 		sizeof(disk_alignment),
895 		&tmp);
896 
897 	if (!result) {
898 		DWORD err = GetLastError();
899 		if (err != ERROR_INVALID_FUNCTION && err != ERROR_NOT_SUPPORTED) {
900 				os_file_handle_error_no_exit(volume,
901 					"DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)", FALSE);
902 		}
903 		goto end;
904 	}
905 
906 	fblock_size = disk_alignment.BytesPerPhysicalSector;
907 
908 end:
909 	if (volume_handle != INVALID_HANDLE_VALUE) {
910 		CloseHandle(volume_handle);
911 	}
912 #endif /* _WIN32 */
913 
914 	/* Currently we support file block size up to 4Kb */
915 	if (fblock_size > 4096 || fblock_size < 512) {
916 		if (fblock_size < 512) {
917 			fblock_size = 512;
918 		} else {
919 			fblock_size = 4096;
920 		}
921 	}
922 
923 	return fblock_size;
924 }
925 
926 #ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
958 #endif /* WIN_ASYNC_IO */
959 
/** Generic AIO Handler methods. Currently handles IO post processing.
The class only groups a static helper; it declares no data members. */
class AIOHandler {
public:
	/** Do any post processing after a read/write
	@param[in]	slot	The IO slot whose request is post-processed
	@return DB_SUCCESS or error code. */
	static dberr_t post_io_processing(Slot* slot);
};
967 
968 /** Helper class for doing synchronous file IO. Currently, the objective
969 is to hide the OS specific code, so that the higher level functions aren't
970 peppered with #ifdef. Makes the code flow difficult to follow.  */
971 class SyncFileIO {
972 public:
973 	/** Constructor
974 	@param[in]	fh	File handle
975 	@param[in,out]	buf	Buffer to read/write
976 	@param[in]	n	Number of bytes to read/write
977 	@param[in]	offset	Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)978 	SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
979 		:
980 		m_fh(fh),
981 		m_buf(buf),
982 		m_n(static_cast<ssize_t>(n)),
983 		m_offset(offset)
984 	{
985 		ut_ad(m_n > 0);
986 	}
987 
988 	/** Destructor */
~SyncFileIO()989 	~SyncFileIO()
990 	{
991 		/* No op */
992 	}
993 
994 	/** Do the read/write
995 	@param[in]	request	The IO context and type
996 	@return the number of bytes read/written or negative value on error */
997 	ssize_t execute(const IORequest& request);
998 
999 	/** Do the read/write
1000 	@param[in,out]	slot	The IO slot, it has the IO context
1001 	@return the number of bytes read/written or negative value on error */
1002 	static ssize_t execute(Slot* slot);
1003 
1004 	/** Move the read/write offset up to where the partial IO succeeded.
1005 	@param[in]	n_bytes	The number of bytes to advance */
advance(ssize_t n_bytes)1006 	void advance(ssize_t n_bytes)
1007 	{
1008 		m_offset += n_bytes;
1009 
1010 		ut_ad(m_n >= n_bytes);
1011 
1012 		m_n -=  n_bytes;
1013 
1014 		m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1015 	}
1016 
1017 private:
1018 	/** Open file handle */
1019 	os_file_t		m_fh;
1020 
1021 	/** Buffer to read/write */
1022 	void*			m_buf;
1023 
1024 	/** Number of bytes to read/write */
1025 	ssize_t			m_n;
1026 
1027 	/** Offset from where to read/write */
1028 	os_offset_t		m_offset;
1029 };
1030 
1031 /** Do any post processing after a read/write
1032 @return DB_SUCCESS or error code. */
1033 dberr_t
post_io_processing(Slot * slot)1034 AIOHandler::post_io_processing(Slot* slot)
1035 {
1036 	ut_ad(slot->is_reserved);
1037 
1038 	/* Total bytes read so far */
1039 	ulint	n_bytes = ulint(slot->ptr - slot->buf) + slot->n_bytes;
1040 
1041 	return(n_bytes == slot->original_len ? DB_SUCCESS : DB_FAIL);
1042 }
1043 
1044 /** Count the number of free slots
1045 @return number of reserved slots */
1046 ulint
pending_io_count() const1047 AIO::pending_io_count() const
1048 {
1049 	acquire();
1050 
1051 #ifdef UNIV_DEBUG
1052 	ut_a(m_n_segments > 0);
1053 	ut_a(!m_slots.empty());
1054 
1055 	ulint	count = 0;
1056 
1057 	for (ulint i = 0; i < m_slots.size(); ++i) {
1058 
1059 		const Slot&	slot = m_slots[i];
1060 
1061 		if (slot.is_reserved) {
1062 			++count;
1063 			ut_a(slot.len > 0);
1064 		}
1065 	}
1066 
1067 	ut_a(m_n_reserved == count);
1068 #endif /* UNIV_DEBUG */
1069 
1070 	ulint	reserved = m_n_reserved;
1071 
1072 	release();
1073 
1074 	return(reserved);
1075 }
1076 
1077 #ifdef UNIV_DEBUG
1078 /** Validates the consistency the aio system some of the time.
1079 @return true if ok or the check was skipped */
1080 static
1081 bool
os_aio_validate_skip()1082 os_aio_validate_skip()
1083 {
1084 /** Try os_aio_validate() every this many times */
1085 # define OS_AIO_VALIDATE_SKIP	13
1086 
1087 	static int os_aio_validate_count;
1088 
1089 	if (my_atomic_add32_explicit(&os_aio_validate_count, -1,
1090 				     MY_MEMORY_ORDER_RELAXED)
1091 	    % OS_AIO_VALIDATE_SKIP) {
1092 		return true;
1093 	}
1094 
1095 	return(os_aio_validate());
1096 }
1097 #endif /* UNIV_DEBUG */
1098 
1099 #undef USE_FILE_LOCK
1100 #ifndef _WIN32
1101 /* On Windows, mandatory locking is used */
1102 # define USE_FILE_LOCK
1103 #endif
1104 #ifdef USE_FILE_LOCK
1105 /** Obtain an exclusive lock on a file.
1106 @param[in]	fd		file descriptor
1107 @param[in]	name		file name
1108 @return 0 on success */
1109 static
1110 int
os_file_lock(int fd,const char * name)1111 os_file_lock(
1112 	int		fd,
1113 	const char*	name)
1114 {
1115 	if (my_disable_locking) {
1116 		return 0;
1117 	}
1118 
1119 	struct flock lk;
1120 
1121 	lk.l_type = F_WRLCK;
1122 	lk.l_whence = SEEK_SET;
1123 	lk.l_start = lk.l_len = 0;
1124 
1125 	if (fcntl(fd, F_SETLK, &lk) == -1) {
1126 
1127 		ib::error()
1128 			<< "Unable to lock " << name
1129 			<< " error: " << errno;
1130 
1131 		if (errno == EAGAIN || errno == EACCES) {
1132 
1133 			ib::info()
1134 				<< "Check that you do not already have"
1135 				" another mysqld process using the"
1136 				" same InnoDB data or log files.";
1137 		}
1138 
1139 		return(-1);
1140 	}
1141 
1142 	return(0);
1143 }
1144 #endif /* USE_FILE_LOCK */
1145 
1146 /** Calculates local segment number and aio array from global segment number.
1147 @param[out]	array		aio wait array
1148 @param[in]	segment		global segment number
1149 @return local segment number within the aio array */
1150 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1151 AIO::get_array_and_local_segment(
1152 	AIO**		array,
1153 	ulint		segment)
1154 {
1155 	ulint		local_segment;
1156 	ulint		n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1157 
1158 	ut_a(segment < os_aio_n_segments);
1159 
1160 	if (!srv_read_only_mode && segment < n_extra_segs) {
1161 
1162 		/* We don't support ibuf/log IO during read only mode. */
1163 
1164 		if (segment == IO_IBUF_SEGMENT) {
1165 
1166 			*array = s_ibuf;
1167 
1168 		} else if (segment == IO_LOG_SEGMENT) {
1169 
1170 			*array = s_log;
1171 
1172 		} else {
1173 			*array = NULL;
1174 		}
1175 
1176 		local_segment = 0;
1177 
1178 	} else if (segment < s_reads->m_n_segments + n_extra_segs) {
1179 
1180 		*array = s_reads;
1181 		local_segment = segment - n_extra_segs;
1182 
1183 	} else {
1184 		*array = s_writes;
1185 
1186 		local_segment = segment
1187 			      - (s_reads->m_n_segments + n_extra_segs);
1188 	}
1189 
1190 	return(local_segment);
1191 }
1192 
1193 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1194 @param[in,out]	slot		Slot to release */
1195 void
release(Slot * slot)1196 AIO::release(Slot* slot)
1197 {
1198 	ut_ad(is_mutex_owned());
1199 
1200 	ut_ad(slot->is_reserved);
1201 
1202 	slot->is_reserved = false;
1203 
1204 	--m_n_reserved;
1205 
1206 	if (m_n_reserved == m_slots.size() - 1) {
1207 		os_event_set(m_not_full);
1208 	}
1209 
1210 	if (m_n_reserved == 0) {
1211 		os_event_set(m_is_empty);
1212 	}
1213 
1214 #if defined(LINUX_NATIVE_AIO)
1215 
1216 	if (srv_use_native_aio) {
1217 		memset(&slot->control, 0x0, sizeof(slot->control));
1218 		slot->ret = 0;
1219 		slot->n_bytes = 0;
1220 	} else {
1221 		/* These fields should not be used if we are not
1222 		using native AIO. */
1223 		ut_ad(slot->n_bytes == 0);
1224 		ut_ad(slot->ret == 0);
1225 	}
1226 
1227 #endif /* WIN_ASYNC_IO */
1228 }
1229 
1230 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1231 @param[in,out]	slot		Slot to release */
1232 void
release_with_mutex(Slot * slot)1233 AIO::release_with_mutex(Slot* slot)
1234 {
1235 	acquire();
1236 
1237 	release(slot);
1238 
1239 	release();
1240 }
1241 
1242 /** Create a temporary file. This function is like tmpfile(3), but
1243 the temporary file is created in the in the mysql server configuration
1244 parameter (--tmpdir).
1245 @return temporary file handle, or NULL on error */
1246 FILE*
os_file_create_tmpfile()1247 os_file_create_tmpfile()
1248 {
1249 	FILE*	file	= NULL;
1250 	WAIT_ALLOW_WRITES();
1251 	os_file_t	fd	= innobase_mysql_tmpfile(NULL);
1252 
1253 	if (fd != OS_FILE_CLOSED) {
1254 #ifdef _WIN32
1255 		int crt_fd = _open_osfhandle((intptr_t)HANDLE(fd), 0);
1256 		if (crt_fd != -1) {
1257 			file = fdopen(crt_fd, "w+b");
1258 			if (!file) {
1259 				close(crt_fd);
1260 			}
1261 		}
1262 #else
1263 		file = fdopen(fd, "w+b");
1264 		if (!file) {
1265 			close(fd);
1266 		}
1267 #endif
1268 	}
1269 
1270 	if (file == NULL) {
1271 
1272 		ib::error()
1273 			<< "Unable to create temporary file; errno: "
1274 			<< errno;
1275 	}
1276 
1277 	return(file);
1278 }
1279 
1280 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1281 NUL-terminate str. All errors are silently ignored. This function is
1282 mostly meant to be used with temporary files.
1283 @param[in,out]	file		File to read from
1284 @param[in,out]	str		Buffer where to read
1285 @param[in]	size		Size of buffer */
1286 void
os_file_read_string(FILE * file,char * str,ulint size)1287 os_file_read_string(
1288 	FILE*		file,
1289 	char*		str,
1290 	ulint		size)
1291 {
1292 	if (size != 0) {
1293 		rewind(file);
1294 
1295 		size_t	flen = fread(str, 1, size - 1, file);
1296 
1297 		str[flen] = '\0';
1298 	}
1299 }
1300 
1301 /** This function returns a new path name after replacing the basename
1302 in an old path with a new basename.  The old_path is a full path
1303 name including the extension.  The tablename is in the normal
1304 form "databasename/tablename".  The new base name is found after
1305 the forward slash.  Both input strings are null terminated.
1306 
1307 This function allocates memory to be returned.  It is the callers
1308 responsibility to free the return value after it is no longer needed.
1309 
1310 @param[in]	old_path		Pathname
1311 @param[in]	tablename		Contains new base name
1312 @return own: new full pathname */
1313 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1314 os_file_make_new_pathname(
1315 	const char*	old_path,
1316 	const char*	tablename)
1317 {
1318 	ulint		dir_len;
1319 	char*		last_slash;
1320 	char*		base_name;
1321 	char*		new_path;
1322 	ulint		new_path_len;
1323 
1324 	/* Split the tablename into its database and table name components.
1325 	They are separated by a '/'. */
1326 	last_slash = strrchr((char*) tablename, '/');
1327 	base_name = last_slash ? last_slash + 1 : (char*) tablename;
1328 
1329 	/* Find the offset of the last slash. We will strip off the
1330 	old basename.ibd which starts after that slash. */
1331 	last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1332 	dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path);
1333 
1334 	/* allocate a new path and move the old directory path to it. */
1335 	new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1336 	new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1337 	memcpy(new_path, old_path, dir_len);
1338 
1339 	snprintf(new_path + dir_len, new_path_len - dir_len,
1340 		 "%c%s.ibd", OS_PATH_SEPARATOR, base_name);
1341 
1342 	return(new_path);
1343 }
1344 
1345 /** This function reduces a null-terminated full remote path name into
1346 the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
1347 the 'databasename/tablename.ibd' found at the end of the path with just
1348 'tablename'.
1349 
1350 Since the result is always smaller than the path sent in, no new memory
1351 is allocated. The caller should allocate memory for the path sent in.
1352 This function manipulates that path in place.
1353 
1354 If the path format is not as expected, just return.  The result is used
1355 to inform a SHOW CREATE TABLE command.
1356 @param[in,out]	data_dir_path		Full path/data_dir_path */
1357 void
os_file_make_data_dir_path(char * data_dir_path)1358 os_file_make_data_dir_path(
1359 	char*	data_dir_path)
1360 {
1361 	/* Replace the period before the extension with a null byte. */
1362 	char*	ptr = strrchr((char*) data_dir_path, '.');
1363 
1364 	if (ptr == NULL) {
1365 		return;
1366 	}
1367 
1368 	ptr[0] = '\0';
1369 
1370 	/* The tablename starts after the last slash. */
1371 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1372 
1373 	if (ptr == NULL) {
1374 		return;
1375 	}
1376 
1377 	ptr[0] = '\0';
1378 
1379 	char*	tablename = ptr + 1;
1380 
1381 	/* The databasename starts after the next to last slash. */
1382 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1383 
1384 	if (ptr == NULL) {
1385 		return;
1386 	}
1387 
1388 	ulint	tablename_len = ut_strlen(tablename);
1389 
1390 	ut_memmove(++ptr, tablename, tablename_len);
1391 
1392 	ptr[tablename_len] = '\0';
1393 }
1394 
1395 /** Check if the path refers to the root of a drive using a pointer
1396 to the last directory separator that the caller has fixed.
1397 @param[in]	path	path name
1398 @param[in]	path	last directory separator in the path
1399 @return true if this path is a drive root, false if not */
1400 UNIV_INLINE
1401 bool
os_file_is_root(const char * path,const char * last_slash)1402 os_file_is_root(
1403 	const char*	path,
1404 	const char*	last_slash)
1405 {
1406 	return(
1407 #ifdef _WIN32
1408 	       (last_slash == path + 2 && path[1] == ':') ||
1409 #endif /* _WIN32 */
1410 	       last_slash == path);
1411 }
1412 
1413 /** Return the parent directory component of a null-terminated path.
1414 Return a new buffer containing the string up to, but not including,
1415 the final component of the path.
1416 The path returned will not contain a trailing separator.
1417 Do not return a root path, return NULL instead.
1418 The final component trimmed off may be a filename or a directory name.
1419 If the final component is the only component of the path, return NULL.
1420 It is the caller's responsibility to free the returned string after it
1421 is no longer needed.
1422 @param[in]	path		Path name
1423 @return own: parent directory of the path */
1424 static
1425 char*
os_file_get_parent_dir(const char * path)1426 os_file_get_parent_dir(
1427 	const char*	path)
1428 {
1429 	bool	has_trailing_slash = false;
1430 
1431 	/* Find the offset of the last slash */
1432 	const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
1433 
1434 	if (!last_slash) {
1435 		/* No slash in the path, return NULL */
1436 		return(NULL);
1437 	}
1438 
1439 	/* Ok, there is a slash. Is there anything after it? */
1440 	if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1441 		has_trailing_slash = true;
1442 	}
1443 
1444 	/* Reduce repetative slashes. */
1445 	while (last_slash > path
1446 		&& last_slash[-1] == OS_PATH_SEPARATOR) {
1447 		last_slash--;
1448 	}
1449 
1450 	/* Check for the root of a drive. */
1451 	if (os_file_is_root(path, last_slash)) {
1452 		return(NULL);
1453 	}
1454 
1455 	/* If a trailing slash prevented the first strrchr() from trimming
1456 	the last component of the path, trim that component now. */
1457 	if (has_trailing_slash) {
1458 		/* Back up to the previous slash. */
1459 		last_slash--;
1460 		while (last_slash > path
1461 		       && last_slash[0] != OS_PATH_SEPARATOR) {
1462 			last_slash--;
1463 		}
1464 
1465 		/* Reduce repetative slashes. */
1466 		while (last_slash > path
1467 			&& last_slash[-1] == OS_PATH_SEPARATOR) {
1468 			last_slash--;
1469 		}
1470 	}
1471 
1472 	/* Check for the root of a drive. */
1473 	if (os_file_is_root(path, last_slash)) {
1474 		return(NULL);
1475 	}
1476 
1477 	if (last_slash - path < 0) {
1478 		/* Sanity check, it prevents gcc from trying to handle this case which
1479 		 * results in warnings for some optimized builds */
1480 		return (NULL);
1481 	}
1482 
1483 	/* Non-trivial directory component */
1484 
1485 	return(mem_strdupl(path, ulint(last_slash - path)));
1486 }
1487 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1488 
1489 /* Test the function os_file_get_parent_dir. */
1490 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1491 test_os_file_get_parent_dir(
1492 	const char*	child_dir,
1493 	const char*	expected_dir)
1494 {
1495 	char* child = mem_strdup(child_dir);
1496 	char* expected = expected_dir == NULL ? NULL
1497 			 : mem_strdup(expected_dir);
1498 
1499 	/* os_file_get_parent_dir() assumes that separators are
1500 	converted to OS_PATH_SEPARATOR. */
1501 	os_normalize_path(child);
1502 	os_normalize_path(expected);
1503 
1504 	char* parent = os_file_get_parent_dir(child);
1505 
1506 	bool unexpected = (expected == NULL
1507 			  ? (parent != NULL)
1508 			  : (0 != strcmp(parent, expected)));
1509 	if (unexpected) {
1510 		ib::fatal() << "os_file_get_parent_dir('" << child
1511 			<< "') returned '" << parent
1512 			<< "', instead of '" << expected << "'.";
1513 	}
1514 	ut_free(parent);
1515 	ut_free(child);
1516 	ut_free(expected);
1517 }
1518 
1519 /* Test the function os_file_get_parent_dir. */
1520 void
unit_test_os_file_get_parent_dir()1521 unit_test_os_file_get_parent_dir()
1522 {
1523 	test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1524 	test_os_file_get_parent_dir("/usr/", NULL);
1525 	test_os_file_get_parent_dir("//usr//", NULL);
1526 	test_os_file_get_parent_dir("usr", NULL);
1527 	test_os_file_get_parent_dir("usr//", NULL);
1528 	test_os_file_get_parent_dir("/", NULL);
1529 	test_os_file_get_parent_dir("//", NULL);
1530 	test_os_file_get_parent_dir(".", NULL);
1531 	test_os_file_get_parent_dir("..", NULL);
1532 # ifdef _WIN32
1533 	test_os_file_get_parent_dir("D:", NULL);
1534 	test_os_file_get_parent_dir("D:/", NULL);
1535 	test_os_file_get_parent_dir("D:\\", NULL);
1536 	test_os_file_get_parent_dir("D:/data", NULL);
1537 	test_os_file_get_parent_dir("D:/data/", NULL);
1538 	test_os_file_get_parent_dir("D:\\data\\", NULL);
1539 	test_os_file_get_parent_dir("D:///data/////", NULL);
1540 	test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
1541 	test_os_file_get_parent_dir("D:/data//a", "D:/data");
1542 	test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
1543 	test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
1544 	test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
1545 #endif  /* _WIN32 */
1546 }
1547 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
1548 
1549 
1550 /** Creates all missing subdirectories along the given path.
1551 @param[in]	path		Path name
1552 @return DB_SUCCESS if OK, otherwise error code. */
1553 dberr_t
os_file_create_subdirs_if_needed(const char * path)1554 os_file_create_subdirs_if_needed(
1555 	const char*	path)
1556 {
1557 	if (srv_read_only_mode) {
1558 
1559 		ib::error()
1560 			<< "read only mode set. Can't create "
1561 			<< "subdirectories '" << path << "'";
1562 
1563 		return(DB_READ_ONLY);
1564 
1565 	}
1566 
1567 	char*	subdir = os_file_get_parent_dir(path);
1568 
1569 	if (subdir == NULL) {
1570 		/* subdir is root or cwd, nothing to do */
1571 		return(DB_SUCCESS);
1572 	}
1573 
1574 	/* Test if subdir exists */
1575 	os_file_type_t	type;
1576 	bool	subdir_exists;
1577 	bool	success = os_file_status(subdir, &subdir_exists, &type);
1578 
1579 	if (success && !subdir_exists) {
1580 
1581 		/* Subdir does not exist, create it */
1582 		dberr_t	err = os_file_create_subdirs_if_needed(subdir);
1583 
1584 		if (err != DB_SUCCESS) {
1585 
1586 			ut_free(subdir);
1587 
1588 			return(err);
1589 		}
1590 
1591 		success = os_file_create_directory(subdir, false);
1592 	}
1593 
1594 	ut_free(subdir);
1595 
1596 	return(success ? DB_SUCCESS : DB_ERROR);
1597 }
1598 
1599 #ifndef _WIN32
1600 
1601 /** Do the read/write
1602 @param[in]	request	The IO context and type
1603 @return the number of bytes read/written or negative value on error */
1604 ssize_t
execute(const IORequest & request)1605 SyncFileIO::execute(const IORequest& request)
1606 {
1607 	ssize_t	n_bytes;
1608 
1609 	if (request.is_read()) {
1610 		n_bytes = pread(m_fh, m_buf, m_n, m_offset);
1611 	} else {
1612 		ut_ad(request.is_write());
1613 		n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
1614 	}
1615 
1616 	return(n_bytes);
1617 }
1618 /** Free storage space associated with a section of the file.
1619 @param[in]	fh		Open file handle
1620 @param[in]	off		Starting offset (SEEK_SET)
1621 @param[in]	len		Size of the hole
1622 @return DB_SUCCESS or error code */
1623 static
1624 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)1625 os_file_punch_hole_posix(
1626 	os_file_t	fh,
1627 	os_offset_t	off,
1628 	os_offset_t	len)
1629 {
1630 
1631 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
1632 	const int	mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
1633 
1634 	int		ret = fallocate(fh, mode, off, len);
1635 
1636 	if (ret == 0) {
1637 		return(DB_SUCCESS);
1638 	}
1639 
1640 	if (errno == ENOTSUP) {
1641 		return(DB_IO_NO_PUNCH_HOLE);
1642 	}
1643 
1644 	ib::warn()
1645 		<< "fallocate("
1646 		<<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
1647 		<< off << ", " << len << ") returned errno: "
1648 		<<  errno;
1649 
1650 	return(DB_IO_ERROR);
1651 
1652 #elif defined(UNIV_SOLARIS)
1653 
1654 	// Use F_FREESP
1655 
1656 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
1657 
1658 	return(DB_IO_NO_PUNCH_HOLE);
1659 }
1660 
1661 #if defined(LINUX_NATIVE_AIO)
1662 
1663 /** Linux native AIO handler */
1664 class LinuxAIOHandler {
1665 public:
1666 	/**
1667 	@param[in] global_segment	The global segment*/
LinuxAIOHandler(ulint global_segment)1668 	LinuxAIOHandler(ulint global_segment)
1669 		:
1670 		m_global_segment(global_segment)
1671 	{
1672 		/* Should never be doing Sync IO here. */
1673 		ut_a(m_global_segment != ULINT_UNDEFINED);
1674 
1675 		/* Find the array and the local segment. */
1676 
1677 		m_segment = AIO::get_array_and_local_segment(
1678 			&m_array, m_global_segment);
1679 
1680 		m_n_slots = m_array->slots_per_segment();
1681 	}
1682 
1683 	/** Destructor */
~LinuxAIOHandler()1684 	~LinuxAIOHandler()
1685 	{
1686 		// No op
1687 	}
1688 
1689 	/**
1690 	Process a Linux AIO request
1691 	@param[out]	m1		the messages passed with the
1692 	@param[out]	m2		AIO request; note that in case the
1693 					AIO operation failed, these output
1694 					parameters are valid and can be used to
1695 					restart the operation.
1696 	@param[out]	request		IO context
1697 	@return DB_SUCCESS or error code */
1698 	dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
1699 
1700 private:
1701 	/** Resubmit an IO request that was only partially successful
1702 	@param[in,out]	slot		Request to resubmit
1703 	@return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
1704 	dberr_t	resubmit(Slot* slot);
1705 
1706 	/** Check if the AIO succeeded
1707 	@param[in,out]	slot		The slot to check
1708 	@return DB_SUCCESS, DB_FAIL if the operation should be retried or
1709 		DB_IO_ERROR on all other errors */
1710 	dberr_t	check_state(Slot* slot);
1711 
1712 	/** @return true if a shutdown was detected */
is_shutdown() const1713 	bool is_shutdown() const
1714 	{
1715 		return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
1716 		       && !buf_page_cleaner_is_active);
1717 	}
1718 
1719 	/** If no slot was found then the m_array->m_mutex will be released.
1720 	@param[out]	n_pending	The number of pending IOs
1721 	@return NULL or a slot that has completed IO */
1722 	Slot* find_completed_slot(ulint* n_pending);
1723 
1724 	/** This is called from within the IO-thread. If there are no completed
1725 	IO requests in the slot array, the thread calls this function to
1726 	collect more requests from the Linux kernel.
1727 	The IO-thread waits on io_getevents(), which is a blocking call, with
1728 	a timeout value. Unless the system is very heavy loaded, keeping the
1729 	IO-thread very busy, the io-thread will spend most of its time waiting
1730 	in this function.
1731 	The IO-thread also exits in this function. It checks server status at
1732 	each wakeup and that is why we use timed wait in io_getevents(). */
1733 	void collect();
1734 
1735 private:
1736 	/** Slot array */
1737 	AIO*			m_array;
1738 
1739 	/** Number of slots inthe local segment */
1740 	ulint			m_n_slots;
1741 
1742 	/** The local segment to check */
1743 	ulint			m_segment;
1744 
1745 	/** The global segment */
1746 	ulint			m_global_segment;
1747 };
1748 
1749 /** Resubmit an IO request that was only partially successful
1750 @param[in,out]	slot		Request to resubmit
1751 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
1752 dberr_t
resubmit(Slot * slot)1753 LinuxAIOHandler::resubmit(Slot* slot)
1754 {
1755 #ifdef UNIV_DEBUG
1756 	/* Bytes already read/written out */
1757 	ulint	n_bytes = slot->ptr - slot->buf;
1758 
1759 	ut_ad(m_array->is_mutex_owned());
1760 
1761 	ut_ad(n_bytes < slot->original_len);
1762 	ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
1763 	/* Partial read or write scenario */
1764 	ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
1765 #endif /* UNIV_DEBUG */
1766 
1767 	slot->len -= slot->n_bytes;
1768 	slot->ptr += slot->n_bytes;
1769 	slot->offset += slot->n_bytes;
1770 
1771 	/* Resetting the bytes read/written */
1772 	slot->n_bytes = 0;
1773 	slot->io_already_done = false;
1774 
1775 	compile_time_assert(sizeof(off_t) >= sizeof(os_offset_t));
1776 
1777 	struct iocb*	iocb = &slot->control;
1778 
1779 	if (slot->type.is_read()) {
1780 
1781 		io_prep_pread(
1782 			iocb,
1783 			slot->file,
1784 			slot->ptr,
1785 			slot->len,
1786 			slot->offset);
1787 	} else {
1788 
1789 		ut_a(slot->type.is_write());
1790 
1791 		io_prep_pwrite(
1792 			iocb,
1793 			slot->file,
1794 			slot->ptr,
1795 			slot->len,
1796 			slot->offset);
1797 	}
1798 
1799 	iocb->data = slot;
1800 
1801 	ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
1802 	     == 0);
1803 
1804 	/* Resubmit an I/O request */
1805 	int	ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
1806 	ut_a(ret != -EINVAL);
1807 
1808 	if (ret < 0)  {
1809 		errno = -ret;
1810 	}
1811 
1812 	return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
1813 }
1814 
1815 /** Check if the AIO succeeded
1816 @param[in,out]	slot		The slot to check
1817 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
1818 	DB_IO_ERROR on all other errors */
1819 dberr_t
check_state(Slot * slot)1820 LinuxAIOHandler::check_state(Slot* slot)
1821 {
1822 	ut_ad(m_array->is_mutex_owned());
1823 
1824 	/* Note that it may be that there is more then one completed
1825 	IO requests. We process them one at a time. We may have a case
1826 	here to improve the performance slightly by dealing with all
1827 	requests in one sweep. */
1828 
1829 	srv_set_io_thread_op_info(
1830 		m_global_segment, "processing completed aio requests");
1831 
1832 	ut_ad(slot->io_already_done);
1833 
1834 	dberr_t	err = DB_SUCCESS;
1835 
1836 	if (slot->ret == 0) {
1837 
1838 		err = AIOHandler::post_io_processing(slot);
1839 
1840 	} else {
1841 		errno = -slot->ret;
1842 
1843 		/* os_file_handle_error does tell us if we should retry
1844 		this IO. As it stands now, we don't do this retry when
1845 		reaping requests from a different context than
1846 		the dispatcher. This non-retry logic is the same for
1847 		Windows and Linux native AIO.
1848 		We should probably look into this to transparently
1849 		re-submit the IO. */
1850 		os_file_handle_error(slot->name, "Linux aio");
1851 
1852 		err = DB_IO_ERROR;
1853 	}
1854 
1855 	return(err);
1856 }
1857 
1858 /** If no slot was found then the m_array->m_mutex will be released.
1859 @param[out]	n_pending		The number of pending IOs
1860 @return NULL or a slot that has completed IO */
1861 Slot*
find_completed_slot(ulint * n_pending)1862 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
1863 {
1864 	ulint	offset = m_n_slots * m_segment;
1865 
1866 	*n_pending = 0;
1867 
1868 	m_array->acquire();
1869 
1870 	Slot*	slot = m_array->at(offset);
1871 
1872 	for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
1873 
1874 		if (slot->is_reserved) {
1875 
1876 			++*n_pending;
1877 
1878 			if (slot->io_already_done) {
1879 
1880 				/* Something for us to work on.
1881 				Note: We don't release the mutex. */
1882 				return(slot);
1883 			}
1884 		}
1885 	}
1886 
1887 	m_array->release();
1888 
1889 	return(NULL);
1890 }
1891 
1892 /** This function is only used in Linux native asynchronous i/o. This is
1893 called from within the io-thread. If there are no completed IO requests
1894 in the slot array, the thread calls this function to collect more
1895 requests from the kernel.
1896 The io-thread waits on io_getevents(), which is a blocking call, with
1897 a timeout value. Unless the system is very heavy loaded, keeping the
1898 io-thread very busy, the io-thread will spend most of its time waiting
1899 in this function.
1900 The io-thread also exits in this function. It checks server status at
1901 each wakeup and that is why we use timed wait in io_getevents(). */
1902 void
collect()1903 LinuxAIOHandler::collect()
1904 {
1905 	ut_ad(m_n_slots > 0);
1906 	ut_ad(m_array != NULL);
1907 	ut_ad(m_segment < m_array->get_n_segments());
1908 
1909 	/* Which io_context_t we are going to use. */
1910 	io_context_t	io_ctx = m_array->io_ctx(m_segment);
1911 
1912 	/* Starting point of the m_segment we will be working on. */
1913 	ulint	start_pos = m_segment * m_n_slots;
1914 
1915 	/* End point. */
1916 	ulint	end_pos = start_pos + m_n_slots;
1917 
1918 	for (;;) {
1919 		struct io_event*	events;
1920 
1921 		/* Which part of event array we are going to work on. */
1922 		events = m_array->io_events(m_segment * m_n_slots);
1923 
1924 		/* Initialize the events. */
1925 		memset(events, 0, sizeof(*events) * m_n_slots);
1926 
1927 		/* The timeout value is arbitrary. We probably need
1928 		to experiment with it a little. */
1929 		struct timespec		timeout;
1930 
1931 		timeout.tv_sec = 0;
1932 		timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
1933 
1934 		int	ret;
1935 
1936 		ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
1937 		ut_a(ret != -EINVAL);
1938 		ut_ad(ret != -EFAULT);
1939 
1940 		for (int i = 0; i < ret; ++i) {
1941 
1942 			struct iocb*	iocb;
1943 
1944 			iocb = reinterpret_cast<struct iocb*>(events[i].obj);
1945 			ut_a(iocb != NULL);
1946 
1947 			Slot*	slot = reinterpret_cast<Slot*>(iocb->data);
1948 
1949 			/* Some sanity checks. */
1950 			ut_a(slot != NULL);
1951 			ut_a(slot->is_reserved);
1952 
1953 			/* We are not scribbling previous segment. */
1954 			ut_a(slot->pos >= start_pos);
1955 
1956 			/* We have not overstepped to next segment. */
1957 			ut_a(slot->pos < end_pos);
1958 
1959 			/* Deallocate unused blocks from file system.
1960 			This is newer done to page 0 or to log files.*/
1961 			if (slot->offset > 0
1962 			    && !slot->type.is_log()
1963 			    && slot->type.is_write()
1964 			    && slot->type.punch_hole()) {
1965 
1966 				slot->err = slot->type.punch_hole(
1967 					slot->file,
1968 					slot->offset, slot->len);
1969 			} else {
1970 				slot->err = DB_SUCCESS;
1971 			}
1972 
1973 			/* Mark this request as completed. The error handling
1974 			will be done in the calling function. */
1975 			m_array->acquire();
1976 
1977 			/* events[i].res2 should always be ZERO */
1978 			ut_ad(events[i].res2 == 0);
1979 			slot->io_already_done = true;
1980 
1981 			/*Even though events[i].res is an unsigned number
1982 			in libaio, it is used to return a negative value
1983 			(negated errno value) to indicate error and a positive
1984 			value to indicate number of bytes read or written. */
1985 
1986 			if (events[i].res > slot->len) {
1987 				/* failure */
1988 				slot->n_bytes = 0;
1989 				slot->ret = events[i].res;
1990 			} else {
1991 				/* success */
1992 				slot->n_bytes = events[i].res;
1993 				slot->ret = 0;
1994 			}
1995 			m_array->release();
1996 		}
1997 
1998 		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
1999 		    || !buf_page_cleaner_is_active
2000 		    || ret > 0) {
2001 
2002 			break;
2003 		}
2004 
2005 		/* This error handling is for any error in collecting the
2006 		IO requests. The errors, if any, for any particular IO
2007 		request are simply passed on to the calling routine. */
2008 
2009 		switch (ret) {
2010 		case -EAGAIN:
2011 			/* Not enough resources! Try again. */
2012 
2013 		case -EINTR:
2014 			/* Interrupted! The behaviour in case of an interrupt.
2015 			If we have some completed IOs available then the
2016 			return code will be the number of IOs. We get EINTR
2017 			only if there are no completed IOs and we have been
2018 			interrupted. */
2019 
2020 		case 0:
2021 			/* No pending request! Go back and check again. */
2022 
2023 			continue;
2024 		}
2025 
2026 		/* All other errors should cause a trap for now. */
2027 		ib::fatal()
2028 			<< "Unexpected ret_code[" << ret
2029 			<< "] from io_getevents()!";
2030 
2031 		break;
2032 	}
2033 }
2034 
2035 /** Process a Linux AIO request
2036 @param[out]	m1		the messages passed with the
2037 @param[out]	m2		AIO request; note that in case the
2038 				AIO operation failed, these output
2039 				parameters are valid and can be used to
2040 				restart the operation.
2041 @param[out]	request		IO context
2042 @return DB_SUCCESS or error code */
2043 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)2044 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
2045 {
2046 	dberr_t		err = DB_SUCCESS;
2047 	Slot*		slot;
2048 
2049 	/* Loop until we have found a completed request. */
2050 	for (;;) {
2051 
2052 		ulint	n_pending;
2053 
2054 		slot = find_completed_slot(&n_pending);
2055 
2056 		if (slot != NULL) {
2057 
2058 			ut_ad(m_array->is_mutex_owned());
2059 
2060 			err = check_state(slot);
2061 
2062 			/* DB_FAIL is not a hard error, we should retry */
2063 			if (err != DB_FAIL) {
2064 				break;
2065 			}
2066 
2067 			/* Partial IO, resubmit request for
2068 			remaining bytes to read/write */
2069 			err = resubmit(slot);
2070 
2071 			if (err != DB_SUCCESS) {
2072 				break;
2073 			}
2074 
2075 			m_array->release();
2076 
2077 		} else if (is_shutdown() && n_pending == 0) {
2078 
2079 			/* There is no completed request. If there is
2080 			no pending request at all, and the system is
2081 			being shut down, exit. */
2082 
2083 			*m1 = NULL;
2084 			*m2 = NULL;
2085 
2086 			return(DB_SUCCESS);
2087 
2088 		} else {
2089 
2090 			/* Wait for some request. Note that we return
2091 			from wait if we have found a request. */
2092 
2093 			srv_set_io_thread_op_info(
2094 				m_global_segment,
2095 				"waiting for completed aio requests");
2096 
2097 			collect();
2098 		}
2099 	}
2100 
2101 	if (err == DB_IO_PARTIAL_FAILED) {
2102 		/* Aborting in case of submit failure */
2103 		ib::fatal()
2104 			<< "Native Linux AIO interface. "
2105 			"io_submit() call failed when "
2106 			"resubmitting a partial I/O "
2107 			"request on the file " << slot->name
2108 			<< ".";
2109 	}
2110 
2111 	*m1 = slot->m1;
2112 	*m2 = slot->m2;
2113 
2114 	*request = slot->type;
2115 
2116 	m_array->release(slot);
2117 
2118 	m_array->release();
2119 
2120 	return(err);
2121 }
2122 
2123 /** This function is only used in Linux native asynchronous i/o.
2124 Waits for an aio operation to complete. This function is used to wait for
2125 the completed requests. The aio array of pending requests is divided
2126 into segments. The thread specifies which segment or slot it wants to wait
2127 for. NOTE: this function will also take care of freeing the aio slot,
2128 therefore no other thread is allowed to do the freeing!
2129 
2130 @param[in]	global_seg	segment number in the aio array
2131 				to wait for; segment 0 is the ibuf
2132 				i/o thread, segment 1 is log i/o thread,
2133 				then follow the non-ibuf read threads,
2134 				and the last are the non-ibuf write
2135 				threads.
2136 @param[out]	m1		the messages passed with the
2137 @param[out]	m2			AIO request; note that in case the
2138 				AIO operation failed, these output
2139 				parameters are valid and can be used to
2140 				restart the operation.
2141 @param[out]xi	 request	IO context
2142 @return DB_SUCCESS if the IO was successful */
2143 static
2144 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2145 os_aio_linux_handler(
2146 	ulint		global_segment,
2147 	fil_node_t**	m1,
2148 	void**		m2,
2149 	IORequest*	request)
2150 {
2151 	return LinuxAIOHandler(global_segment).poll(m1, m2, request);
2152 }
2153 
2154 /** Dispatch an AIO request to the kernel.
2155 @param[in,out]	slot		an already reserved slot
2156 @return true on success. */
2157 bool
linux_dispatch(Slot * slot)2158 AIO::linux_dispatch(Slot* slot)
2159 {
2160 	ut_a(slot->is_reserved);
2161 	ut_ad(slot->type.validate());
2162 
2163 	/* Find out what we are going to work with.
2164 	The iocb struct is directly in the slot.
2165 	The io_context_t is one per segment. */
2166 
2167 	ulint		io_ctx_index;
2168 	struct iocb*	iocb = &slot->control;
2169 
2170 	io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2171 
2172 	ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
2173 	     == 0);
2174 
2175 	int	ret = io_submit(io_ctx(io_ctx_index), 1, &iocb);
2176 	ut_a(ret != -EINVAL);
2177 
2178 	/* io_submit() returns number of successfully queued requests
2179 	or -errno. */
2180 
2181 	if (ret != 1) {
2182 		errno = -ret;
2183 	}
2184 
2185 	return(ret == 1);
2186 }
2187 
2188 /** Creates an io_context_t for native linux AIO.
2189 @param[in]	max_events	number of events
2190 @param[out]	io_ctx		io_ctx to initialize.
2191 @return true on success. */
2192 bool
linux_create_io_ctx(unsigned max_events,io_context_t & io_ctx)2193 AIO::linux_create_io_ctx(
2194 	unsigned	max_events,
2195 	io_context_t&	io_ctx)
2196 {
2197 	ssize_t		n_retries = 0;
2198 
2199 	for (;;) {
2200 
2201 		memset(&io_ctx, 0x0, sizeof(io_ctx));
2202 
2203 		/* Initialize the io_ctx. Tell it how many pending
2204 		IO requests this context will handle. */
2205 
2206 		int	ret = io_setup(max_events, &io_ctx);
2207 		ut_a(ret != -EINVAL);
2208 
2209 		if (ret == 0) {
2210 			/* Success. Return now. */
2211 			return(true);
2212 		}
2213 
2214 		/* If we hit EAGAIN we'll make a few attempts before failing. */
2215 
2216 		switch (ret) {
2217 		case -EAGAIN:
2218 			if (n_retries == 0) {
2219 				/* First time around. */
2220 				ib::warn()
2221 					<< "io_setup() failed with EAGAIN."
2222 					" Will make "
2223 					<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2224 					<< " attempts before giving up.";
2225 			}
2226 
2227 			if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2228 
2229 				++n_retries;
2230 
2231 				ib::warn()
2232 					<< "io_setup() attempt "
2233 					<< n_retries << ".";
2234 
2235 				os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2236 
2237 				continue;
2238 			}
2239 
2240 			/* Have tried enough. Better call it a day. */
2241 			ib::warn()
2242 				<< "io_setup() failed with EAGAIN after "
2243 				<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2244 				<< " attempts.";
2245 			break;
2246 
2247 		case -ENOSYS:
2248 			ib::warn()
2249 				<< "Linux Native AIO interface"
2250 				" is not supported on this platform. Please"
2251 				" check your OS documentation and install"
2252 				" appropriate binary of InnoDB.";
2253 
2254 			break;
2255 
2256 		default:
2257 			ib::warn()
2258 				<< "Linux Native AIO setup"
2259 				<< " returned following error["
2260 				<< ret << "]";
2261 			break;
2262 		}
2263 
2264 		ib::info()
2265 			<< "You can disable Linux Native AIO by"
2266 			" setting innodb_use_native_aio = 0 in my.cnf";
2267 
2268 		break;
2269 	}
2270 
2271 	return(false);
2272 }
2273 
2274 /** Checks if the system supports native linux aio. On some kernel
2275 versions where native aio is supported it won't work on tmpfs. In such
2276 cases we can't use native aio as it is not possible to mix simulated
2277 and native aio.
2278 @return: true if supported, false otherwise. */
2279 bool
is_linux_native_aio_supported()2280 AIO::is_linux_native_aio_supported()
2281 {
2282 	int		fd;
2283 	io_context_t	io_ctx;
2284 	char		name[1000];
2285 
2286 	if (!linux_create_io_ctx(1, io_ctx)) {
2287 
2288 		/* The platform does not support native aio. */
2289 
2290 		return(false);
2291 
2292 	} else if (!srv_read_only_mode) {
2293 
2294 		/* Now check if tmpdir supports native aio ops. */
2295 		fd = innobase_mysql_tmpfile(NULL);
2296 
2297 		if (fd < 0) {
2298 			ib::warn()
2299 				<< "Unable to create temp file to check"
2300 				" native AIO support.";
2301 
2302 			int ret = io_destroy(io_ctx);
2303 			ut_a(ret != -EINVAL);
2304 			ut_ad(ret != -EFAULT);
2305 
2306 			return(false);
2307 		}
2308 	} else {
2309 
2310 		os_normalize_path(srv_log_group_home_dir);
2311 
2312 		ulint	dirnamelen = strlen(srv_log_group_home_dir);
2313 
2314 		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2315 
2316 		memcpy(name, srv_log_group_home_dir, dirnamelen);
2317 
2318 		/* Add a path separator if needed. */
2319 		if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2320 
2321 			name[dirnamelen++] = OS_PATH_SEPARATOR;
2322 		}
2323 
2324 		strcpy(name + dirnamelen, "ib_logfile0");
2325 
2326 		fd = open(name, O_RDONLY | O_CLOEXEC);
2327 
2328 		if (fd == -1) {
2329 
2330 			ib::warn()
2331 				<< "Unable to open"
2332 				<< " \"" << name << "\" to check native"
2333 				<< " AIO read support.";
2334 
2335 			int ret = io_destroy(io_ctx);
2336 			ut_a(ret != EINVAL);
2337 			ut_ad(ret != EFAULT);
2338 
2339 			return(false);
2340 		}
2341 	}
2342 
2343 	struct io_event	io_event;
2344 
2345 	memset(&io_event, 0x0, sizeof(io_event));
2346 
2347 	byte*	buf = static_cast<byte*>(ut_malloc_nokey(srv_page_size * 2));
2348 	byte*	ptr = static_cast<byte*>(ut_align(buf, srv_page_size));
2349 
2350 	struct iocb	iocb;
2351 
2352 	/* Suppress valgrind warning. */
2353 	memset(buf, 0x00, srv_page_size * 2);
2354 	memset(&iocb, 0x0, sizeof(iocb));
2355 
2356 	struct iocb*	p_iocb = &iocb;
2357 
2358 	if (!srv_read_only_mode) {
2359 
2360 		io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
2361 
2362 	} else {
2363 		ut_a(srv_page_size >= 4096);
2364 		io_prep_pread(p_iocb, fd, ptr, srv_page_size, 0);
2365 	}
2366 
2367 	ut_a(reinterpret_cast<size_t>(p_iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
2368 	     == 0);
2369 	int	err = io_submit(io_ctx, 1, &p_iocb);
2370 	ut_a(err != -EINVAL);
2371 
2372 	if (err >= 1) {
2373 		/* Now collect the submitted IO request. */
2374 		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2375 		ut_a(err != -EINVAL);
2376 	}
2377 
2378 	ut_free(buf);
2379 	close(fd);
2380 
2381 	switch (err) {
2382 	case 1:
2383 		{
2384 			int ret = io_destroy(io_ctx);
2385 			ut_a(ret != -EINVAL);
2386 			ut_ad(ret != -EFAULT);
2387 
2388 			return(true);
2389 		}
2390 
2391 	case -EINVAL:
2392 	case -ENOSYS:
2393 		ib::error()
2394 			<< "Linux Native AIO not supported. You can either"
2395 			" move "
2396 			<< (srv_read_only_mode ? name : "tmpdir")
2397 			<< " to a file system that supports native"
2398 			" AIO or you can set innodb_use_native_aio to"
2399 			" FALSE to avoid this message.";
2400 
2401 		/* fall through. */
2402 	default:
2403 		ib::error()
2404 			<< "Linux Native AIO check on "
2405 			<< (srv_read_only_mode ? name : "tmpdir")
2406 			<< "returned error[" << -err << "]";
2407 	}
2408 
2409 	int ret = io_destroy(io_ctx);
2410 	ut_a(ret != -EINVAL);
2411 	ut_ad(ret != -EFAULT);
2412 
2413 	return(false);
2414 }
2415 
2416 #endif /* LINUX_NATIVE_AIO */
2417 
2418 /** Retrieves the last error number if an error occurs in a file io function.
2419 The number should be retrieved before any other OS calls (because they may
2420 overwrite the error number). If the number is not known to this program,
2421 the OS error number + OS_FILE_ERROR_MAX is returned.
2422 @param[in]	report_all_errors	true if we want an error message
2423 					printed of all errors
2424 @param[in]	on_error_silent		true then don't print any diagnostic
2425 					to the log
2426 @return error number, or OS error number + OS_FILE_ERROR_MAX */
2427 static
2428 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2429 os_file_get_last_error_low(
2430 	bool	report_all_errors,
2431 	bool	on_error_silent)
2432 {
2433 	int	err = errno;
2434 
2435 	if (err == 0) {
2436 		return(0);
2437 	}
2438 
2439 	if (report_all_errors
2440 	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
2441 
2442 		ib::error()
2443 			<< "Operating system error number "
2444 			<< err
2445 			<< " in a file operation.";
2446 
2447 		if (err == ENOENT) {
2448 
2449 			ib::error()
2450 				<< "The error means the system"
2451 				" cannot find the path specified.";
2452 
2453 			if (srv_is_being_started) {
2454 
2455 				ib::error()
2456 					<< "If you are installing InnoDB,"
2457 					" remember that you must create"
2458 					" directories yourself, InnoDB"
2459 					" does not create them.";
2460 			}
2461 		} else if (err == EACCES) {
2462 
2463 			ib::error()
2464 				<< "The error means mysqld does not have"
2465 				" the access rights to the directory.";
2466 
2467 		} else {
2468 			if (strerror(err) != NULL) {
2469 
2470 				ib::error()
2471 					<< "Error number " << err << " means '"
2472 					<< strerror(err) << "'";
2473 			}
2474 
2475 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
2476 		}
2477 	}
2478 
2479 	switch (err) {
2480 	case ENOSPC:
2481 		return(OS_FILE_DISK_FULL);
2482 	case ENOENT:
2483 		return(OS_FILE_NOT_FOUND);
2484 	case EEXIST:
2485 		return(OS_FILE_ALREADY_EXISTS);
2486 	case EXDEV:
2487 	case ENOTDIR:
2488 	case EISDIR:
2489 		return(OS_FILE_PATH_ERROR);
2490 	case EAGAIN:
2491 		if (srv_use_native_aio) {
2492 			return(OS_FILE_AIO_RESOURCES_RESERVED);
2493 		}
2494 		break;
2495 	case EINTR:
2496 		if (srv_use_native_aio) {
2497 			return(OS_FILE_AIO_INTERRUPTED);
2498 		}
2499 		break;
2500 	case EACCES:
2501 		return(OS_FILE_ACCESS_VIOLATION);
2502 	}
2503 	return(OS_FILE_ERROR_MAX + err);
2504 }
2505 
2506 /** Wrapper to fsync(2) that retries the call on some errors.
2507 Returns the value 0 if successful; otherwise the value -1 is returned and
2508 the global variable errno is set to indicate the error.
2509 @param[in]	file		open file handle
2510 @return 0 if success, -1 otherwise */
2511 static
2512 int
os_file_fsync_posix(os_file_t file)2513 os_file_fsync_posix(
2514 	os_file_t	file)
2515 {
2516 	ulint		failures = 0;
2517 
2518 	for (;;) {
2519 
2520 		++os_n_fsyncs;
2521 
2522 		int	ret = fsync(file);
2523 
2524 		if (ret == 0) {
2525 			return(ret);
2526 		}
2527 
2528 		switch(errno) {
2529 		case ENOLCK:
2530 
2531 			++failures;
2532 			ut_a(failures < 1000);
2533 
2534 			if (!(failures % 100)) {
2535 
2536 				ib::warn()
2537 					<< "fsync(): "
2538 					<< "No locks available; retrying";
2539 			}
2540 
2541 			/* 0.2 sec */
2542 			os_thread_sleep(200000);
2543 			break;
2544 
2545 		case EINTR:
2546 
2547 			++failures;
2548 			ut_a(failures < 2000);
2549 			break;
2550 
2551 		default:
2552 			ib::fatal() << "fsync() returned " << errno;
2553 		}
2554 	}
2555 }
2556 
2557 /** Check the existence and type of the given file.
2558 @param[in]	path		path name of file
2559 @param[out]	exists		true if the file exists
2560 @param[out]	type		Type of the file, if it exists
2561 @return true if call succeeded */
2562 static
2563 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)2564 os_file_status_posix(
2565 	const char*	path,
2566 	bool*		exists,
2567 	os_file_type_t* type)
2568 {
2569 	struct stat	statinfo;
2570 
2571 	int	ret = stat(path, &statinfo);
2572 
2573 	*exists = !ret;
2574 
2575 	if (!ret) {
2576 		/* file exists, everything OK */
2577 
2578 	} else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
2579 		/* file does not exist */
2580 		return(true);
2581 
2582 	} else {
2583 		/* file exists, but stat call failed */
2584 		os_file_handle_error_no_exit(path, "stat", false);
2585 		return(false);
2586 	}
2587 
2588 	if (S_ISDIR(statinfo.st_mode)) {
2589 		*type = OS_FILE_TYPE_DIR;
2590 
2591 	} else if (S_ISLNK(statinfo.st_mode)) {
2592 		*type = OS_FILE_TYPE_LINK;
2593 
2594 	} else if (S_ISREG(statinfo.st_mode)) {
2595 		*type = OS_FILE_TYPE_FILE;
2596 	} else {
2597 		*type = OS_FILE_TYPE_UNKNOWN;
2598 	}
2599 
2600 	return(true);
2601 }
2602 
2603 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
2604 function!
2605 Flushes the write buffers of a given file to the disk.
2606 @param[in]	file		handle to a file
2607 @return true if success */
2608 bool
os_file_flush_func(os_file_t file)2609 os_file_flush_func(
2610 	os_file_t	file)
2611 {
2612 	int	ret;
2613 
2614 	WAIT_ALLOW_WRITES();
2615 	ret = os_file_fsync_posix(file);
2616 
2617 	if (ret == 0) {
2618 		return(true);
2619 	}
2620 
2621 	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
2622 	we choose to ignore that error if we are using raw disks */
2623 
2624 	if (srv_start_raw_disk_in_use && errno == EINVAL) {
2625 
2626 		return(true);
2627 	}
2628 
2629 	ib::error() << "The OS said file flush did not succeed";
2630 
2631 	os_file_handle_error(NULL, "flush");
2632 
2633 	/* It is a fatal error if a file flush does not succeed, because then
2634 	the database can get corrupt on disk */
2635 	ut_error;
2636 
2637 	return(false);
2638 }
2639 
2640 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
2641 this function!
2642 A simple function to open or create a file.
2643 @param[in]	name		name of the file or path as a null-terminated
2644 				string
2645 @param[in]	create_mode	create mode
2646 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
2647 @param[in]	read_only	if true, read only checks are enforced
2648 @param[out]	success		true if succeed, false if error
2649 @return handle to the file, not defined if error, error number
2650 	can be retrieved with os_file_get_last_error */
2651 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)2652 os_file_create_simple_func(
2653 	const char*	name,
2654 	ulint		create_mode,
2655 	ulint		access_type,
2656 	bool		read_only,
2657 	bool*		success)
2658 {
2659 	pfs_os_file_t	file;
2660 
2661 	*success = false;
2662 
2663 	int		create_flag;
2664 	const char*	mode_str	= NULL;
2665 
2666 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
2667 		WAIT_ALLOW_WRITES();
2668 	}
2669 
2670 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
2671 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
2672 
2673 	if (create_mode == OS_FILE_OPEN) {
2674 		mode_str = "OPEN";
2675 
2676 		if (access_type == OS_FILE_READ_ONLY) {
2677 
2678 			create_flag = O_RDONLY;
2679 
2680 		} else if (read_only) {
2681 
2682 			create_flag = O_RDONLY;
2683 
2684 		} else {
2685 			create_flag = O_RDWR;
2686 		}
2687 
2688 	} else if (read_only) {
2689 
2690 		mode_str = "OPEN";
2691 		create_flag = O_RDONLY;
2692 
2693 	} else if (create_mode == OS_FILE_CREATE) {
2694 
2695 		mode_str = "CREATE";
2696 		create_flag = O_RDWR | O_CREAT | O_EXCL;
2697 
2698 	} else if (create_mode == OS_FILE_CREATE_PATH) {
2699 
2700 		mode_str = "CREATE PATH";
2701 		/* Create subdirs along the path if needed. */
2702 
2703 		*success = os_file_create_subdirs_if_needed(name);
2704 
2705 		if (!*success) {
2706 
2707 			ib::error()
2708 				<< "Unable to create subdirectories '"
2709 				<< name << "'";
2710 
2711 			return(OS_FILE_CLOSED);
2712 		}
2713 
2714 		create_flag = O_RDWR | O_CREAT | O_EXCL;
2715 		create_mode = OS_FILE_CREATE;
2716 	} else {
2717 
2718 		ib::error()
2719 			<< "Unknown file create mode ("
2720 			<< create_mode
2721 			<< " for file '" << name << "'";
2722 
2723 		return(OS_FILE_CLOSED);
2724 	}
2725 
2726 	bool	retry;
2727 
2728 	do {
2729 		file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
2730 
2731 		if (file == -1) {
2732 			*success = false;
2733 			retry = os_file_handle_error(
2734 				name,
2735 				create_mode == OS_FILE_OPEN
2736 				? "open" : "create");
2737 		} else {
2738 			*success = true;
2739 			retry = false;
2740 		}
2741 
2742 	} while (retry);
2743 
2744 	/* This function is always called for data files, we should disable
2745 	OS caching (O_DIRECT) here as we do in os_file_create_func(), so
2746 	we open the same file in the same mode, see man page of open(2). */
2747        if (!srv_read_only_mode
2748 	   && *success
2749 	   && (srv_file_flush_method == SRV_O_DIRECT
2750 	       || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
2751 
2752 	       os_file_set_nocache(file, name, mode_str);
2753 	}
2754 
2755 #ifdef USE_FILE_LOCK
2756 	if (!read_only
2757 	    && *success
2758 	    && (access_type == OS_FILE_READ_WRITE)
2759 	    && os_file_lock(file, name)) {
2760 
2761 		*success = false;
2762 		close(file);
2763 		file = -1;
2764 	}
2765 #endif /* USE_FILE_LOCK */
2766 
2767 	return(file);
2768 }
2769 
2770 /** This function attempts to create a directory named pathname. The new
2771 directory gets default permissions. On Unix the permissions are
2772 (0770 & ~umask). If the directory exists already, nothing is done and
2773 the call succeeds, unless the fail_if_exists arguments is true.
2774 If another error occurs, such as a permission error, this does not crash,
2775 but reports the error and returns false.
2776 @param[in]	pathname	directory name as null-terminated string
2777 @param[in]	fail_if_exists	if true, pre-existing directory is treated as
2778 				an error.
2779 @return true if call succeeds, false on error */
2780 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)2781 os_file_create_directory(
2782 	const char*	pathname,
2783 	bool		fail_if_exists)
2784 {
2785 	int	rcode;
2786 
2787 	WAIT_ALLOW_WRITES();
2788 	rcode = mkdir(pathname, 0770);
2789 
2790 	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
2791 		/* failure */
2792 		os_file_handle_error_no_exit(pathname, "mkdir", false);
2793 
2794 		return(false);
2795 	}
2796 
2797 	return(true);
2798 }
2799 
2800 /**
2801 The os_file_opendir() function opens a directory stream corresponding to the
2802 directory named by the dirname argument. The directory stream is positioned
2803 at the first entry. In both Unix and Windows we automatically skip the '.'
2804 and '..' items at the start of the directory listing.
2805 @param[in]	dirname		directory name; it must not contain a trailing
2806 				'\' or '/'
2807 @param[in]	is_fatal	true if we should treat an error as a fatal
2808 				error; if we try to open symlinks then we do
2809 				not wish a fatal error if it happens not to be
2810 				a directory
2811 @return directory stream, NULL if error */
2812 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)2813 os_file_opendir(
2814 	const char*	dirname,
2815 	bool		error_is_fatal)
2816 {
2817 	os_file_dir_t		dir;
2818 	dir = opendir(dirname);
2819 
2820 	if (dir == NULL && error_is_fatal) {
2821 		os_file_handle_error(dirname, "opendir");
2822 	}
2823 
2824 	return(dir);
2825 }
2826 
2827 /** Closes a directory stream.
2828 @param[in]	dir		directory stream
2829 @return 0 if success, -1 if failure */
2830 int
os_file_closedir(os_file_dir_t dir)2831 os_file_closedir(
2832 	os_file_dir_t	dir)
2833 {
2834 	int	ret = closedir(dir);
2835 
2836 	if (ret != 0) {
2837 		os_file_handle_error_no_exit(NULL, "closedir", false);
2838 	}
2839 
2840 	return(ret);
2841 }
2842 
2843 /** This function returns information of the next file in the directory. We jump
2844 over the '.' and '..' entries in the directory.
2845 @param[in]	dirname		directory name or path
2846 @param[in]	dir		directory stream
2847 @param[out]	info		buffer where the info is returned
2848 @return 0 if ok, -1 if error, 1 if at the end of the directory */
2849 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)2850 os_file_readdir_next_file(
2851 	const char*	dirname,
2852 	os_file_dir_t	dir,
2853 	os_file_stat_t*	info)
2854 {
2855 	struct dirent*	ent;
2856 	char*		full_path;
2857 	int		ret;
2858 	struct stat	statinfo;
2859 
2860 next_file:
2861 
2862 	ent = readdir(dir);
2863 
2864 	if (ent == NULL) {
2865 
2866 		return(1);
2867 	}
2868 
2869 	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
2870 
2871 	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
2872 
2873 		goto next_file;
2874 	}
2875 
2876 	strcpy(info->name, ent->d_name);
2877 
2878 	full_path = static_cast<char*>(
2879 		ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
2880 
2881 	sprintf(full_path, "%s/%s", dirname, ent->d_name);
2882 
2883 	ret = stat(full_path, &statinfo);
2884 
2885 	if (ret) {
2886 
2887 		if (errno == ENOENT) {
2888 			/* readdir() returned a file that does not exist,
2889 			it must have been deleted in the meantime. Do what
2890 			would have happened if the file was deleted before
2891 			readdir() - ignore and go to the next entry.
2892 			If this is the last entry then info->name will still
2893 			contain the name of the deleted file when this
2894 			function returns, but this is not an issue since the
2895 			caller shouldn't be looking at info when end of
2896 			directory is returned. */
2897 
2898 			ut_free(full_path);
2899 
2900 			goto next_file;
2901 		}
2902 
2903 		os_file_handle_error_no_exit(full_path, "stat", false);
2904 
2905 		ut_free(full_path);
2906 
2907 		return(-1);
2908 	}
2909 
2910 	info->size = statinfo.st_size;
2911 
2912 	if (S_ISDIR(statinfo.st_mode)) {
2913 		info->type = OS_FILE_TYPE_DIR;
2914 	} else if (S_ISLNK(statinfo.st_mode)) {
2915 		info->type = OS_FILE_TYPE_LINK;
2916 	} else if (S_ISREG(statinfo.st_mode)) {
2917 		info->type = OS_FILE_TYPE_FILE;
2918 	} else {
2919 		info->type = OS_FILE_TYPE_UNKNOWN;
2920 	}
2921 
2922 	ut_free(full_path);
2923 
2924 	return(0);
2925 }
2926 
2927 /** NOTE! Use the corresponding macro os_file_create(), not directly
2928 this function!
2929 Opens an existing file or creates a new.
2930 @param[in]	name		name of the file or path as a null-terminated
2931 				string
2932 @param[in]	create_mode	create mode
2933 @param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
2934 				is desired, OS_FILE_NORMAL, if any normal file;
2935 				NOTE that it also depends on type, os_aio_..
2936 				and srv_.. variables whether we really use async
2937 				I/O or unbuffered I/O: look in the function
2938 				source code for the exact rules
2939 @param[in]	type		OS_DATA_FILE or OS_LOG_FILE
2940 @param[in]	read_only	true, if read only checks should be enforcedm
2941 @param[in]	success		true if succeeded
2942 @return handle to the file, not defined if error, error number
2943 	can be retrieved with os_file_get_last_error */
2944 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)2945 os_file_create_func(
2946 	const char*	name,
2947 	ulint		create_mode,
2948 	ulint		purpose,
2949 	ulint		type,
2950 	bool		read_only,
2951 	bool*		success)
2952 {
2953 	bool		on_error_no_exit;
2954 	bool		on_error_silent;
2955 
2956 	*success = false;
2957 
2958 	DBUG_EXECUTE_IF(
2959 		"ib_create_table_fail_disk_full",
2960 		*success = false;
2961 		errno = ENOSPC;
2962 		return(OS_FILE_CLOSED);
2963 	);
2964 
2965 	int		create_flag;
2966 	const char*	mode_str	= NULL;
2967 
2968 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
2969 		? true : false;
2970 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
2971 		? true : false;
2972 
2973 	create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
2974 			       | OS_FILE_ON_ERROR_SILENT));
2975 
2976 	if (create_mode == OS_FILE_OPEN
2977 	    || create_mode == OS_FILE_OPEN_RAW
2978 	    || create_mode == OS_FILE_OPEN_RETRY) {
2979 
2980 		mode_str = "OPEN";
2981 
2982 		create_flag = read_only ? O_RDONLY : O_RDWR;
2983 
2984 	} else if (read_only) {
2985 
2986 		mode_str = "OPEN";
2987 
2988 		create_flag = O_RDONLY;
2989 
2990 	} else if (create_mode == OS_FILE_CREATE) {
2991 
2992 		mode_str = "CREATE";
2993 		create_flag = O_RDWR | O_CREAT | O_EXCL;
2994 
2995 	} else if (create_mode == OS_FILE_OVERWRITE) {
2996 
2997 		mode_str = "OVERWRITE";
2998 		create_flag = O_RDWR | O_CREAT | O_TRUNC;
2999 
3000 	} else {
3001 		ib::error()
3002 			<< "Unknown file create mode (" << create_mode << ")"
3003 			<< " for file '" << name << "'";
3004 
3005 		return(OS_FILE_CLOSED);
3006 	}
3007 
3008 	ut_a(type == OS_LOG_FILE
3009 	     || type == OS_DATA_FILE
3010 	     || type == OS_DATA_FILE_NO_O_DIRECT);
3011 
3012 	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
3013 
3014 #ifdef O_SYNC
3015 	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
3016 	O_SYNC because the datasync options seemed to corrupt files in 2001
3017 	in both Linux and Solaris */
3018 
3019 	if (!read_only
3020 	    && type == OS_LOG_FILE
3021 	    && srv_file_flush_method == SRV_O_DSYNC) {
3022 
3023 		create_flag |= O_SYNC;
3024 	}
3025 #endif /* O_SYNC */
3026 
3027 	os_file_t	file;
3028 	bool		retry;
3029 
3030 	do {
3031 		file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
3032 
3033 		if (file == -1) {
3034 			const char*	operation;
3035 
3036 			operation = (create_mode == OS_FILE_CREATE
3037 				     && !read_only) ? "create" : "open";
3038 
3039 			*success = false;
3040 
3041 			if (on_error_no_exit) {
3042 				retry = os_file_handle_error_no_exit(
3043 					name, operation, on_error_silent);
3044 			} else {
3045 				retry = os_file_handle_error(name, operation);
3046 			}
3047 		} else {
3048 			*success = true;
3049 			retry = false;
3050 		}
3051 
3052 	} while (retry);
3053 
3054 	/* We disable OS caching (O_DIRECT) only on data files */
3055 	if (!read_only
3056 	    && *success
3057 	    && (type != OS_LOG_FILE
3058 		&& type != OS_DATA_FILE_NO_O_DIRECT)
3059 	    && (srv_file_flush_method == SRV_O_DIRECT
3060 		|| srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
3061 
3062 	       os_file_set_nocache(file, name, mode_str);
3063 	}
3064 
3065 #ifdef USE_FILE_LOCK
3066 	if (!read_only
3067 	    && *success
3068 	    && create_mode != OS_FILE_OPEN_RAW
3069 	    && os_file_lock(file, name)) {
3070 
3071 		if (create_mode == OS_FILE_OPEN_RETRY) {
3072 
3073 			ib::info()
3074 				<< "Retrying to lock the first data file";
3075 
3076 			for (int i = 0; i < 100; i++) {
3077 				os_thread_sleep(1000000);
3078 
3079 				if (!os_file_lock(file, name)) {
3080 					*success = true;
3081 					return(file);
3082 				}
3083 			}
3084 
3085 			ib::info()
3086 				<< "Unable to open the first data file";
3087 		}
3088 
3089 		*success = false;
3090 		close(file);
3091 		file = -1;
3092 	}
3093 #endif /* USE_FILE_LOCK */
3094 
3095 	return(file);
3096 }
3097 
3098 /** NOTE! Use the corresponding macro
3099 os_file_create_simple_no_error_handling(), not directly this function!
3100 A simple function to open or create a file.
3101 @param[in]	name		name of the file or path as a null-terminated
3102 				string
3103 @param[in]	create_mode	create mode
3104 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3105 				OS_FILE_READ_ALLOW_DELETE; the last option
3106 				is used by a backup program reading the file
3107 @param[in]	read_only	if true read only mode checks are enforced
3108 @param[out]	success		true if succeeded
3109 @return own: handle to the file, not defined if error, error number
3110 	can be retrieved with os_file_get_last_error */
3111 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3112 os_file_create_simple_no_error_handling_func(
3113 	const char*	name,
3114 	ulint		create_mode,
3115 	ulint		access_type,
3116 	bool		read_only,
3117 	bool*		success)
3118 {
3119 	os_file_t	file;
3120 	int		create_flag;
3121 
3122 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
3123 		WAIT_ALLOW_WRITES();
3124 	}
3125 
3126 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3127 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3128 
3129 	*success = false;
3130 
3131 	if (create_mode == OS_FILE_OPEN) {
3132 
3133 		if (access_type == OS_FILE_READ_ONLY) {
3134 
3135 			create_flag = O_RDONLY;
3136 
3137 		} else if (read_only) {
3138 
3139 			create_flag = O_RDONLY;
3140 
3141 		} else {
3142 
3143 			ut_a(access_type == OS_FILE_READ_WRITE
3144 			     || access_type == OS_FILE_READ_ALLOW_DELETE);
3145 
3146 			create_flag = O_RDWR;
3147 		}
3148 
3149 	} else if (read_only) {
3150 
3151 		create_flag = O_RDONLY;
3152 
3153 	} else if (create_mode == OS_FILE_CREATE) {
3154 
3155 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3156 
3157 	} else {
3158 
3159 		ib::error()
3160 			<< "Unknown file create mode "
3161 			<< create_mode << " for file '" << name << "'";
3162 
3163 		return(OS_FILE_CLOSED);
3164 	}
3165 
3166 	file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
3167 
3168 	*success = (file != -1);
3169 
3170 #ifdef USE_FILE_LOCK
3171 	if (!read_only
3172 	    && *success
3173 	    && access_type == OS_FILE_READ_WRITE
3174 	    && os_file_lock(file, name)) {
3175 
3176 		*success = false;
3177 		close(file);
3178 		file = -1;
3179 
3180 	}
3181 #endif /* USE_FILE_LOCK */
3182 
3183 	return(file);
3184 }
3185 
3186 /** Deletes a file if it exists. The file has to be closed before calling this.
3187 @param[in]	name		file path as a null-terminated string
3188 @param[out]	exist		indicate if file pre-exist
3189 @return true if success */
3190 bool
os_file_delete_if_exists_func(const char * name,bool * exist)3191 os_file_delete_if_exists_func(
3192 	const char*	name,
3193 	bool*		exist)
3194 {
3195 	if (exist != NULL) {
3196 		*exist = true;
3197 	}
3198 
3199 	int	ret;
3200 	WAIT_ALLOW_WRITES();
3201 
3202 	ret = unlink(name);
3203 
3204 	if (ret != 0 && errno == ENOENT) {
3205 		if (exist != NULL) {
3206 			*exist = false;
3207 		}
3208 	} else if (ret != 0 && errno != ENOENT) {
3209 		os_file_handle_error_no_exit(name, "delete", false);
3210 
3211 		return(false);
3212 	}
3213 
3214 	return(true);
3215 }
3216 
3217 /** Deletes a file. The file has to be closed before calling this.
3218 @param[in]	name		file path as a null-terminated string
3219 @return true if success */
3220 bool
os_file_delete_func(const char * name)3221 os_file_delete_func(
3222 	const char*	name)
3223 {
3224 	int	ret;
3225 	WAIT_ALLOW_WRITES();
3226 
3227 	ret = unlink(name);
3228 
3229 	if (ret != 0) {
3230 		os_file_handle_error_no_exit(name, "delete", FALSE);
3231 
3232 		return(false);
3233 	}
3234 
3235 	return(true);
3236 }
3237 
3238 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3239 function!
3240 Renames a file (can also move it to another directory). It is safest that the
3241 file is closed before calling this function.
3242 @param[in]	oldpath		old file path as a null-terminated string
3243 @param[in]	newpath		new file path
3244 @return true if success */
3245 bool
os_file_rename_func(const char * oldpath,const char * newpath)3246 os_file_rename_func(
3247 	const char*	oldpath,
3248 	const char*	newpath)
3249 {
3250 #ifdef UNIV_DEBUG
3251 	os_file_type_t	type;
3252 	bool		exists;
3253 
3254 	/* New path must not exist. */
3255 	ut_ad(os_file_status(newpath, &exists, &type));
3256 	ut_ad(!exists);
3257 
3258 	/* Old path must exist. */
3259 	ut_ad(os_file_status(oldpath, &exists, &type));
3260 	ut_ad(exists);
3261 #endif /* UNIV_DEBUG */
3262 
3263 	int	ret;
3264 	WAIT_ALLOW_WRITES();
3265 
3266 	ret = rename(oldpath, newpath);
3267 
3268 	if (ret != 0) {
3269 		os_file_handle_rename_error(oldpath, newpath);
3270 
3271 		return(false);
3272 	}
3273 
3274 	return(true);
3275 }
3276 
3277 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3278 function!
3279 Closes a file handle. In case of error, error number can be retrieved with
3280 os_file_get_last_error.
3281 @param[in]	file		Handle to close
3282 @return true if success */
3283 bool
os_file_close_func(os_file_t file)3284 os_file_close_func(
3285 	os_file_t	file)
3286 {
3287 	int	ret = close(file);
3288 
3289 	if (ret == -1) {
3290 		os_file_handle_error(NULL, "close");
3291 
3292 		return(false);
3293 	}
3294 
3295 	return(true);
3296 }
3297 
3298 /** Gets a file size.
3299 @param[in]	file		handle to an open file
3300 @return file size, or (os_offset_t) -1 on failure */
3301 os_offset_t
os_file_get_size(os_file_t file)3302 os_file_get_size(os_file_t file)
3303 {
3304 	struct stat statbuf;
3305 	return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size;
3306 }
3307 
3308 /** Gets a file size.
3309 @param[in]	filename	Full path to the filename to check
3310 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3311 	errno */
3312 os_file_size_t
os_file_get_size(const char * filename)3313 os_file_get_size(
3314 	const char*	filename)
3315 {
3316 	struct stat	s;
3317 	os_file_size_t	file_size;
3318 
3319 	int	ret = stat(filename, &s);
3320 
3321 	if (ret == 0) {
3322 		file_size.m_total_size = s.st_size;
3323 		/* st_blocks is in 512 byte sized blocks */
3324 		file_size.m_alloc_size = s.st_blocks * 512;
3325 	} else {
3326 		file_size.m_total_size = ~0U;
3327 		file_size.m_alloc_size = (os_offset_t) errno;
3328 	}
3329 
3330 	return(file_size);
3331 }
3332 
3333 /** This function returns information about the specified file
3334 @param[in]	path		pathname of the file
3335 @param[out]	stat_info	information of a file in a directory
3336 @param[in,out]	statinfo	information of a file in a directory
3337 @param[in]	check_rw_perm	for testing whether the file can be opened
3338 				in RW mode
3339 @param[in]	read_only	if true read only mode checks are enforced
3340 @return DB_SUCCESS if all OK */
3341 static
3342 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3343 os_file_get_status_posix(
3344 	const char*	path,
3345 	os_file_stat_t* stat_info,
3346 	struct stat*	statinfo,
3347 	bool		check_rw_perm,
3348 	bool		read_only)
3349 {
3350 	int	ret = stat(path, statinfo);
3351 
3352 	if (ret && (errno == ENOENT || errno == ENOTDIR
3353 		    || errno == ENAMETOOLONG)) {
3354 		/* file does not exist */
3355 
3356 		return(DB_NOT_FOUND);
3357 
3358 	} else if (ret) {
3359 		/* file exists, but stat call failed */
3360 
3361 		os_file_handle_error_no_exit(path, "stat", false);
3362 
3363 		return(DB_FAIL);
3364 	}
3365 
3366 	switch (statinfo->st_mode & S_IFMT) {
3367 	case S_IFDIR:
3368 		stat_info->type = OS_FILE_TYPE_DIR;
3369 		break;
3370 	case S_IFLNK:
3371 		stat_info->type = OS_FILE_TYPE_LINK;
3372 		break;
3373 	case S_IFBLK:
3374 		/* Handle block device as regular file. */
3375 	case S_IFCHR:
3376 		/* Handle character device as regular file. */
3377 	case S_IFREG:
3378 		stat_info->type = OS_FILE_TYPE_FILE;
3379 		break;
3380 	default:
3381 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
3382 	}
3383 
3384 	stat_info->size = statinfo->st_size;
3385 	stat_info->block_size = statinfo->st_blksize;
3386 	stat_info->alloc_size = statinfo->st_blocks * 512;
3387 
3388 	if (check_rw_perm
3389 	    && (stat_info->type == OS_FILE_TYPE_FILE
3390 		|| stat_info->type == OS_FILE_TYPE_BLOCK)) {
3391 
3392 		stat_info->rw_perm = !access(path, read_only
3393 					     ? R_OK : R_OK | W_OK);
3394 	}
3395 
3396 	return(DB_SUCCESS);
3397 }
3398 
3399 /** Truncates a file to a specified size in bytes.
3400 Do nothing if the size to preserve is greater or equal to the current
3401 size of the file.
3402 @param[in]	pathname	file path
3403 @param[in]	file		file to be truncated
3404 @param[in]	size		size to preserve in bytes
3405 @return true if success */
3406 static
3407 bool
os_file_truncate_posix(const char * pathname,os_file_t file,os_offset_t size)3408 os_file_truncate_posix(
3409 	const char*	pathname,
3410 	os_file_t	file,
3411 	os_offset_t	size)
3412 {
3413 	int	res = ftruncate(file, size);
3414 
3415 	if (res == -1) {
3416 
3417 		bool	retry;
3418 
3419 		retry = os_file_handle_error_no_exit(
3420 			pathname, "truncate", false);
3421 
3422 		if (retry) {
3423 			ib::warn()
3424 				<< "Truncate failed for '"
3425 				<< pathname << "'";
3426 		}
3427 	}
3428 
3429 	return(res == 0);
3430 }
3431 
3432 /** Truncates a file at its current position.
3433 @return true if success */
3434 bool
os_file_set_eof(FILE * file)3435 os_file_set_eof(
3436 	FILE*		file)	/*!< in: file to be truncated */
3437 {
3438 	WAIT_ALLOW_WRITES();
3439 	return(!ftruncate(fileno(file), ftell(file)));
3440 }
3441 
3442 #else /* !_WIN32 */
3443 
3444 #include <WinIoCtl.h>
3445 
/*
Windows: Handling synchronous IO on files opened asynchronously.

If a file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound
to a completion port, then every IO on this file would normally be enqueued to
the completion port. Sometimes, however, we would like to do a synchronous IO.
This is possible if we initialize overlapped.hEvent with a valid event and set
its lowest order bit to 1 (see the MSDN ReadFile and WriteFile descriptions for
more info).

We'll create this special event once for each thread and store it in thread
local storage.
*/
3458 
3459 
win_free_syncio_event(void * data)3460 static void __stdcall win_free_syncio_event(void *data) {
3461 	if (data) {
3462 		CloseHandle((HANDLE)data);
3463 	}
3464 }
3465 
3466 
3467 /*
3468 Retrieve per-thread event for doing synchronous io on asyncronously opened files
3469 */
win_get_syncio_event()3470 static HANDLE win_get_syncio_event()
3471 {
3472 	HANDLE h;
3473 
3474 	h = (HANDLE)FlsGetValue(fls_sync_io);
3475 	if (h) {
3476 		return h;
3477 	}
3478 	h = CreateEventA(NULL, FALSE, FALSE, NULL);
3479 	ut_a(h);
3480 	/* Set low-order bit to keeps I/O completion from being queued */
3481 	h = (HANDLE)((uintptr_t)h | 1);
3482 	FlsSetValue(fls_sync_io, h);
3483 	return h;
3484 }
3485 
3486 
3487 /** Do the read/write
3488 @param[in]	request	The IO context and type
3489 @return the number of bytes read/written or negative value on error */
3490 ssize_t
execute(const IORequest & request)3491 SyncFileIO::execute(const IORequest& request)
3492 {
3493 	OVERLAPPED	seek;
3494 
3495 	memset(&seek, 0x0, sizeof(seek));
3496 
3497 	seek.hEvent = win_get_syncio_event();
3498 	seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
3499 	seek.OffsetHigh = (DWORD) (m_offset >> 32);
3500 
3501 	BOOL	ret;
3502 	DWORD	n_bytes;
3503 
3504 	if (request.is_read()) {
3505 		ret = ReadFile(m_fh, m_buf,
3506 			static_cast<DWORD>(m_n), NULL, &seek);
3507 
3508 	} else {
3509 		ut_ad(request.is_write());
3510 		ret = WriteFile(m_fh, m_buf,
3511 			static_cast<DWORD>(m_n), NULL, &seek);
3512 	}
3513 	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3514 		/* Wait for async io to complete */
3515 		ret = GetOverlappedResult(m_fh, &seek, &n_bytes, TRUE);
3516 	}
3517 
3518 	return(ret ? static_cast<ssize_t>(n_bytes) : -1);
3519 }
3520 
3521 /** Do the read/write
3522 @param[in,out]	slot	The IO slot, it has the IO context
3523 @return the number of bytes read/written or negative value on error */
3524 ssize_t
execute(Slot * slot)3525 SyncFileIO::execute(Slot* slot)
3526 {
3527 	BOOL	ret;
3528 	slot->control.hEvent = win_get_syncio_event();
3529 	if (slot->type.is_read()) {
3530 
3531 		ret = ReadFile(
3532 			slot->file, slot->ptr, slot->len,
3533 			NULL, &slot->control);
3534 
3535 	} else {
3536 		ut_ad(slot->type.is_write());
3537 
3538 		ret = WriteFile(
3539 			slot->file, slot->ptr, slot->len,
3540 			NULL, &slot->control);
3541 
3542 	}
3543 	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3544 		/* Wait for async io to complete */
3545 		ret = GetOverlappedResult(slot->file, &slot->control, &slot->n_bytes, TRUE);
3546 	}
3547 
3548 	return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
3549 }
3550 
3551 /* Startup/shutdown */
3552 
3553 struct WinIoInit
3554 {
WinIoInitWinIoInit3555 	WinIoInit() {
3556 		fls_sync_io = FlsAlloc(win_free_syncio_event);
3557 		ut_a(fls_sync_io != FLS_OUT_OF_INDEXES);
3558 	}
3559 
~WinIoInitWinIoInit3560 	~WinIoInit() {
3561 		FlsFree(fls_sync_io);
3562 	}
3563 };
3564 
3565 /* Ensures proper initialization and shutdown */
3566 static WinIoInit win_io_init;
3567 
3568 
3569 /** Free storage space associated with a section of the file.
3570 @param[in]	fh		Open file handle
3571 @param[in]	page_size	Tablespace page size
3572 @param[in]	block_size	File system block size
3573 @param[in]	off		Starting offset (SEEK_SET)
3574 @param[in]	len		Size of the hole
3575 @return 0 on success or errno */
3576 static
3577 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)3578 os_file_punch_hole_win32(
3579 	os_file_t	fh,
3580 	os_offset_t	off,
3581 	os_offset_t	len)
3582 {
3583 	FILE_ZERO_DATA_INFORMATION	punch;
3584 
3585 	punch.FileOffset.QuadPart = off;
3586 	punch.BeyondFinalZero.QuadPart = off + len;
3587 
3588 	/* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
3589 	therefore we pass a dummy parameter. */
3590 	DWORD	temp;
3591 	BOOL	success = os_win32_device_io_control(
3592 		fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
3593 		NULL, 0, &temp);
3594 
3595 	return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE);
3596 }
3597 
3598 /** Check the existence and type of the given file.
3599 @param[in]	path		path name of file
3600 @param[out]	exists		true if the file exists
3601 @param[out]	type		Type of the file, if it exists
3602 @return true if call succeeded */
3603 static
3604 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)3605 os_file_status_win32(
3606 	const char*	path,
3607 	bool*		exists,
3608 	os_file_type_t* type)
3609 {
3610 	int		ret;
3611 	struct _stat64	statinfo;
3612 
3613 	ret = _stat64(path, &statinfo);
3614 
3615 	*exists = !ret;
3616 
3617 	if (!ret) {
3618 		/* file exists, everything OK */
3619 
3620 	} else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
3621 		/* file does not exist */
3622 		return(true);
3623 
3624 	} else {
3625 		/* file exists, but stat call failed */
3626 		os_file_handle_error_no_exit(path, "stat", false);
3627 		return(false);
3628 	}
3629 
3630 	if (_S_IFDIR & statinfo.st_mode) {
3631 		*type = OS_FILE_TYPE_DIR;
3632 
3633 	} else if (_S_IFREG & statinfo.st_mode) {
3634 		*type = OS_FILE_TYPE_FILE;
3635 
3636 	} else {
3637 		*type = OS_FILE_TYPE_UNKNOWN;
3638 	}
3639 
3640 	return(true);
3641 }
3642 
3643 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3644 function!
3645 Flushes the write buffers of a given file to the disk.
3646 @param[in]	file		handle to a file
3647 @return true if success */
3648 bool
os_file_flush_func(os_file_t file)3649 os_file_flush_func(
3650 	os_file_t	file)
3651 {
3652 	++os_n_fsyncs;
3653 
3654 	BOOL	ret = FlushFileBuffers(file);
3655 
3656 	if (ret) {
3657 		return(true);
3658 	}
3659 
3660 	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
3661 	actually a raw device, we choose to ignore that error if we are using
3662 	raw disks */
3663 
3664 	if (srv_start_raw_disk_in_use && GetLastError()
3665 	    == ERROR_INVALID_FUNCTION) {
3666 		return(true);
3667 	}
3668 
3669 	os_file_handle_error(NULL, "flush");
3670 
3671 	/* It is a fatal error if a file flush does not succeed, because then
3672 	the database can get corrupt on disk */
3673 	ut_error;
3674 
3675 	return(false);
3676 }
3677 
3678 /** Retrieves the last error number if an error occurs in a file io function.
3679 The number should be retrieved before any other OS calls (because they may
3680 overwrite the error number). If the number is not known to this program,
3681 the OS error number + 100 is returned.
3682 @param[in]	report_all_errors	true if we want an error message printed
3683 					of all errors
3684 @param[in]	on_error_silent		true then don't print any diagnostic
3685 					to the log
3686 @return error number, or OS error number + 100 */
3687 static
3688 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3689 os_file_get_last_error_low(
3690 	bool	report_all_errors,
3691 	bool	on_error_silent)
3692 {
3693 	ulint	err = (ulint) GetLastError();
3694 
3695 	if (err == ERROR_SUCCESS) {
3696 		return(0);
3697 	}
3698 
3699 	if (report_all_errors
3700 	    || (!on_error_silent
3701 		&& err != ERROR_DISK_FULL
3702 		&& err != ERROR_FILE_EXISTS)) {
3703 
3704 		ib::error()
3705 			<< "Operating system error number " << err
3706 			<< " in a file operation.";
3707 
3708 		if (err == ERROR_PATH_NOT_FOUND) {
3709 			ib::error()
3710 				<< "The error means the system"
3711 				" cannot find the path specified.";
3712 
3713 			if (srv_is_being_started) {
3714 				ib::error()
3715 					<< "If you are installing InnoDB,"
3716 					" remember that you must create"
3717 					" directories yourself, InnoDB"
3718 					" does not create them.";
3719 			}
3720 
3721 		} else if (err == ERROR_ACCESS_DENIED) {
3722 
3723 			ib::error()
3724 				<< "The error means mysqld does not have"
3725 				" the access rights to"
3726 				" the directory. It may also be"
3727 				" you have created a subdirectory"
3728 				" of the same name as a data file.";
3729 
3730 		} else if (err == ERROR_SHARING_VIOLATION
3731 			   || err == ERROR_LOCK_VIOLATION) {
3732 
3733 			ib::error()
3734 				<< "The error means that another program"
3735 				" is using InnoDB's files."
3736 				" This might be a backup or antivirus"
3737 				" software or another instance"
3738 				" of MySQL."
3739 				" Please close it to get rid of this error.";
3740 
3741 		} else if (err == ERROR_WORKING_SET_QUOTA
3742 			   || err == ERROR_NO_SYSTEM_RESOURCES) {
3743 
3744 			ib::error()
3745 				<< "The error means that there are no"
3746 				" sufficient system resources or quota to"
3747 				" complete the operation.";
3748 
3749 		} else if (err == ERROR_OPERATION_ABORTED) {
3750 
3751 			ib::error()
3752 				<< "The error means that the I/O"
3753 				" operation has been aborted"
3754 				" because of either a thread exit"
3755 				" or an application request."
3756 				" Retry attempt is made.";
3757 		} else {
3758 
3759 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3760 		}
3761 	}
3762 
3763 	if (err == ERROR_FILE_NOT_FOUND) {
3764 		return(OS_FILE_NOT_FOUND);
3765 	} else if (err == ERROR_DISK_FULL) {
3766 		return(OS_FILE_DISK_FULL);
3767 	} else if (err == ERROR_FILE_EXISTS) {
3768 		return(OS_FILE_ALREADY_EXISTS);
3769 	} else if (err == ERROR_SHARING_VIOLATION
3770 		   || err == ERROR_LOCK_VIOLATION) {
3771 		return(OS_FILE_SHARING_VIOLATION);
3772 	} else if (err == ERROR_WORKING_SET_QUOTA
3773 		   || err == ERROR_NO_SYSTEM_RESOURCES) {
3774 		return(OS_FILE_INSUFFICIENT_RESOURCE);
3775 	} else if (err == ERROR_OPERATION_ABORTED) {
3776 		return(OS_FILE_OPERATION_ABORTED);
3777 	} else if (err == ERROR_ACCESS_DENIED) {
3778 		return(OS_FILE_ACCESS_VIOLATION);
3779 	}
3780 
3781 	return(OS_FILE_ERROR_MAX + err);
3782 }
3783 
3784 
3785 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3786 this function!
3787 A simple function to open or create a file.
3788 @param[in]	name		name of the file or path as a null-terminated
3789 				string
3790 @param[in]	create_mode	create mode
3791 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3792 @param[in]	read_only	if true read only mode checks are enforced
3793 @param[out]	success		true if succeed, false if error
3794 @return handle to the file, not defined if error, error number
3795 	can be retrieved with os_file_get_last_error */
3796 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3797 os_file_create_simple_func(
3798 	const char*	name,
3799 	ulint		create_mode,
3800 	ulint		access_type,
3801 	bool		read_only,
3802 	bool*		success)
3803 {
3804 	os_file_t	file;
3805 
3806 	*success = false;
3807 
3808 	DWORD		access;
3809 	DWORD		create_flag;
3810 	DWORD		attributes = 0;
3811 
3812 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3813 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3814 	ut_ad(srv_operation == SRV_OPERATION_NORMAL);
3815 
3816 	if (create_mode == OS_FILE_OPEN) {
3817 
3818 		create_flag = OPEN_EXISTING;
3819 
3820 	} else if (read_only) {
3821 
3822 		create_flag = OPEN_EXISTING;
3823 
3824 	} else if (create_mode == OS_FILE_CREATE) {
3825 
3826 		create_flag = CREATE_NEW;
3827 
3828 	} else if (create_mode == OS_FILE_CREATE_PATH) {
3829 
3830 		/* Create subdirs along the path if needed. */
3831 		*success = os_file_create_subdirs_if_needed(name);
3832 
3833 		if (!*success) {
3834 
3835 			ib::error()
3836 				<< "Unable to create subdirectories '"
3837 				<< name << "'";
3838 
3839 			return(OS_FILE_CLOSED);
3840 		}
3841 
3842 		create_flag = CREATE_NEW;
3843 		create_mode = OS_FILE_CREATE;
3844 
3845 	} else {
3846 
3847 		ib::error()
3848 			<< "Unknown file create mode ("
3849 			<< create_mode << ") for file '"
3850 			<< name << "'";
3851 
3852 		return(OS_FILE_CLOSED);
3853 	}
3854 
3855 	if (access_type == OS_FILE_READ_ONLY) {
3856 
3857 		access = GENERIC_READ;
3858 
3859 	} else if (read_only) {
3860 
3861 		ib::info()
3862 			<< "Read only mode set. Unable to"
3863 			" open file '" << name << "' in RW mode, "
3864 			<< "trying RO mode";
3865 
3866 		access = GENERIC_READ;
3867 
3868 	} else if (access_type == OS_FILE_READ_WRITE) {
3869 
3870 		access = GENERIC_READ | GENERIC_WRITE;
3871 
3872 	} else {
3873 
3874 		ib::error()
3875 			<< "Unknown file access type (" << access_type << ") "
3876 			"for file '" << name << "'";
3877 
3878 		return(OS_FILE_CLOSED);
3879 	}
3880 
3881 	bool	retry;
3882 
3883 	do {
3884 		/* Use default security attributes and no template file. */
3885 
3886 		file = CreateFile(
3887 			(LPCTSTR) name, access,
3888 			FILE_SHARE_READ | FILE_SHARE_DELETE, NULL,
3889 			create_flag, attributes, NULL);
3890 
3891 		if (file == INVALID_HANDLE_VALUE) {
3892 
3893 			*success = false;
3894 
3895 			retry = os_file_handle_error(
3896 				name, create_mode == OS_FILE_OPEN ?
3897 				"open" : "create");
3898 
3899 		} else {
3900 
3901 			retry = false;
3902 
3903 			*success = true;
3904 		}
3905 
3906 	} while (retry);
3907 
3908 	return(file);
3909 }
3910 
3911 /** This function attempts to create a directory named pathname. The new
3912 directory gets default permissions. On Unix the permissions are
3913 (0770 & ~umask). If the directory exists already, nothing is done and
3914 the call succeeds, unless the fail_if_exists arguments is true.
3915 If another error occurs, such as a permission error, this does not crash,
3916 but reports the error and returns false.
3917 @param[in]	pathname	directory name as null-terminated string
3918 @param[in]	fail_if_exists	if true, pre-existing directory is treated
3919 				as an error.
3920 @return true if call succeeds, false on error */
3921 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3922 os_file_create_directory(
3923 	const char*	pathname,
3924 	bool		fail_if_exists)
3925 {
3926 	BOOL	rcode;
3927 
3928 	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
3929 	if (!(rcode != 0
3930 	      || (GetLastError() == ERROR_ALREADY_EXISTS
3931 		  && !fail_if_exists))) {
3932 
3933 		os_file_handle_error_no_exit(
3934 			pathname, "CreateDirectory", false);
3935 
3936 		return(false);
3937 	}
3938 
3939 	return(true);
3940 }
3941 
3942 /** The os_file_opendir() function opens a directory stream corresponding to the
3943 directory named by the dirname argument. The directory stream is positioned
3944 at the first entry. In both Unix and Windows we automatically skip the '.'
3945 and '..' items at the start of the directory listing.
3946 @param[in]	dirname		directory name; it must not contain a trailing
3947 				'\' or '/'
3948 @param[in]	is_fatal	true if we should treat an error as a fatal
3949 				error; if we try to open symlinks then we do
3950 				not wish a fatal error if it happens not to
3951 				be a directory
3952 @return directory stream, NULL if error */
3953 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3954 os_file_opendir(
3955 	const char*	dirname,
3956 	bool		error_is_fatal)
3957 {
3958 	os_file_dir_t		dir;
3959 	LPWIN32_FIND_DATA	lpFindFileData;
3960 	char			path[OS_FILE_MAX_PATH + 3];
3961 
3962 	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
3963 
3964 	strcpy(path, dirname);
3965 	strcpy(path + strlen(path), "\\*");
3966 
3967 	/* Note that in Windows opening the 'directory stream' also retrieves
3968 	the first entry in the directory. Since it is '.', that is no problem,
3969 	as we will skip over the '.' and '..' entries anyway. */
3970 
3971 	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
3972 		ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
3973 
3974 	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
3975 
3976 	ut_free(lpFindFileData);
3977 
3978 	if (dir == INVALID_HANDLE_VALUE) {
3979 
3980 		if (error_is_fatal) {
3981 			os_file_handle_error(dirname, "opendir");
3982 		}
3983 
3984 		return(NULL);
3985 	}
3986 
3987 	return(dir);
3988 }
3989 
3990 /** Closes a directory stream.
3991 @param[in]	dir	directory stream
3992 @return 0 if success, -1 if failure */
3993 int
os_file_closedir(os_file_dir_t dir)3994 os_file_closedir(
3995 	os_file_dir_t	dir)
3996 {
3997 	BOOL		ret;
3998 
3999 	ret = FindClose(dir);
4000 
4001 	if (!ret) {
4002 		os_file_handle_error_no_exit(NULL, "closedir", false);
4003 
4004 		return(-1);
4005 	}
4006 
4007 	return(0);
4008 }
4009 
4010 /** This function returns information of the next file in the directory. We
4011 jump over the '.' and '..' entries in the directory.
4012 @param[in]	dirname		directory name or path
4013 @param[in]	dir		directory stream
4014 @param[out]	info		buffer where the info is returned
4015 @return 0 if ok, -1 if error, 1 if at the end of the directory */
4016 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4017 os_file_readdir_next_file(
4018 	const char*	dirname,
4019 	os_file_dir_t	dir,
4020 	os_file_stat_t*	info)
4021 {
4022 	BOOL		ret;
4023 	int		status;
4024 	WIN32_FIND_DATA	find_data;
4025 
4026 next_file:
4027 
4028 	ret = FindNextFile(dir, &find_data);
4029 
4030 	if (ret > 0) {
4031 
4032 		const char* name;
4033 
4034 		name = static_cast<const char*>(find_data.cFileName);
4035 
4036 		ut_a(strlen(name) < OS_FILE_MAX_PATH);
4037 
4038 		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
4039 
4040 			goto next_file;
4041 		}
4042 
4043 		strcpy(info->name, name);
4044 
4045 		info->size = find_data.nFileSizeHigh;
4046 		info->size <<= 32;
4047 		info->size |= find_data.nFileSizeLow;
4048 
4049 		if (find_data.dwFileAttributes
4050 		    & FILE_ATTRIBUTE_REPARSE_POINT) {
4051 
4052 			/* TODO: test Windows symlinks */
4053 			/* TODO: MySQL has apparently its own symlink
4054 			implementation in Windows, dbname.sym can
4055 			redirect a database directory:
4056 			REFMAN "windows-symbolic-links.html" */
4057 
4058 			info->type = OS_FILE_TYPE_LINK;
4059 
4060 		} else if (find_data.dwFileAttributes
4061 			   & FILE_ATTRIBUTE_DIRECTORY) {
4062 
4063 			info->type = OS_FILE_TYPE_DIR;
4064 
4065 		} else {
4066 
4067 			/* It is probably safest to assume that all other
4068 			file types are normal. Better to check them rather
4069 			than blindly skip them. */
4070 
4071 			info->type = OS_FILE_TYPE_FILE;
4072 		}
4073 
4074 		status = 0;
4075 
4076 	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
4077 
4078 		status = 1;
4079 
4080 	} else {
4081 
4082 		os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
4083 
4084 		status = -1;
4085 	}
4086 
4087 	return(status);
4088 }
4089 
4090 /** Check that IO of specific size is possible for the file
4091 opened with FILE_FLAG_NO_BUFFERING.
4092 
4093 The requirement is that IO is multiple of the disk sector size.
4094 
4095 @param[in]	file      file handle
4096 @param[in]	io_size   expected io size
4097 @return true - unbuffered io of requested size is possible, false otherwise.
4098 
4099 @note: this function only works correctly with Windows 8 or later,
4100 (GetFileInformationByHandleEx with FileStorageInfo is only supported there).
4101 It will return true on earlier Windows version.
4102  */
unbuffered_io_possible(HANDLE file,size_t io_size)4103 static bool unbuffered_io_possible(HANDLE file, size_t io_size)
4104 {
4105 	FILE_STORAGE_INFO info;
4106 	if (GetFileInformationByHandleEx(
4107 		file, FileStorageInfo, &info, sizeof(info))) {
4108 			ULONG sector_size = info.LogicalBytesPerSector;
4109 			if (sector_size)
4110 				return io_size % sector_size == 0;
4111 	}
4112 	return true;
4113 }
4114 
4115 
4116 /** NOTE! Use the corresponding macro os_file_create(), not directly
4117 this function!
4118 Opens an existing file or creates a new.
4119 @param[in]	name		name of the file or path as a null-terminated
4120 				string
4121 @param[in]	create_mode	create mode
4122 @param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
4123 				is desired, OS_FILE_NORMAL, if any normal file;
4124 				NOTE that it also depends on type, os_aio_..
4125 				and srv_.. variables whether we really use async
4126 				I/O or unbuffered I/O: look in the function
4127 				source code for the exact rules
4128 @param[in]	type		OS_DATA_FILE or OS_LOG_FILE
4129 @param[in]	success		true if succeeded
4130 @return handle to the file, not defined if error, error number
4131 	can be retrieved with os_file_get_last_error */
4132 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)4133 os_file_create_func(
4134 	const char*	name,
4135 	ulint		create_mode,
4136 	ulint		purpose,
4137 	ulint		type,
4138 	bool		read_only,
4139 	bool*		success)
4140 {
4141 	os_file_t	file;
4142 	bool		retry;
4143 	bool		on_error_no_exit;
4144 	bool		on_error_silent;
4145 
4146 	*success = false;
4147 
4148 	DBUG_EXECUTE_IF(
4149 		"ib_create_table_fail_disk_full",
4150 		*success = false;
4151 		SetLastError(ERROR_DISK_FULL);
4152 		return(OS_FILE_CLOSED);
4153 	);
4154 
4155 	DWORD		create_flag;
4156 	DWORD		share_mode = srv_operation != SRV_OPERATION_NORMAL
4157 		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
4158 		: FILE_SHARE_READ | FILE_SHARE_DELETE;
4159 
4160 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
4161 		WAIT_ALLOW_WRITES();
4162 	}
4163 
4164 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
4165 		? true : false;
4166 
4167 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
4168 		? true : false;
4169 
4170 	create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);
4171 
4172 	if (create_mode == OS_FILE_OPEN_RAW) {
4173 
4174 		ut_a(!read_only);
4175 
4176 		create_flag = OPEN_EXISTING;
4177 
4178 		/* On Windows Physical devices require admin privileges and
4179 		have to have the write-share mode set. See the remarks
4180 		section for the CreateFile() function documentation in MSDN. */
4181 
4182 		share_mode |= FILE_SHARE_WRITE;
4183 
4184 	} else if (create_mode == OS_FILE_OPEN
4185 		   || create_mode == OS_FILE_OPEN_RETRY) {
4186 
4187 		create_flag = OPEN_EXISTING;
4188 
4189 	} else if (read_only) {
4190 
4191 		create_flag = OPEN_EXISTING;
4192 
4193 	} else if (create_mode == OS_FILE_CREATE) {
4194 
4195 		create_flag = CREATE_NEW;
4196 
4197 	} else if (create_mode == OS_FILE_OVERWRITE) {
4198 
4199 		create_flag = CREATE_ALWAYS;
4200 
4201 	} else {
4202 		ib::error()
4203 			<< "Unknown file create mode (" << create_mode << ") "
4204 			<< " for file '" << name << "'";
4205 
4206 		return(OS_FILE_CLOSED);
4207 	}
4208 
4209 	DWORD		attributes = 0;
4210 
4211 	if (purpose == OS_FILE_AIO) {
4212 
4213 #ifdef WIN_ASYNC_IO
4214 		/* If specified, use asynchronous (overlapped) io and no
4215 		buffering of writes in the OS */
4216 
4217 		if (srv_use_native_aio) {
4218 			attributes |= FILE_FLAG_OVERLAPPED;
4219 		}
4220 #endif /* WIN_ASYNC_IO */
4221 
4222 	} else if (purpose == OS_FILE_NORMAL) {
4223 
4224 		/* Use default setting. */
4225 
4226 	} else {
4227 
4228 		ib::error()
4229 			<< "Unknown purpose flag (" << purpose << ") "
4230 			<< "while opening file '" << name << "'";
4231 
4232 		return(OS_FILE_CLOSED);
4233 	}
4234 
4235 	if (type == OS_LOG_FILE) {
4236 		/* There is not reason to use buffered write to logs.*/
4237 		attributes |= FILE_FLAG_NO_BUFFERING;
4238 	}
4239 
4240 	switch (srv_file_flush_method)
4241 	{
4242 	case SRV_O_DSYNC:
4243 		if (type == OS_LOG_FILE) {
4244 			/* Map O_SYNC to FILE_WRITE_THROUGH */
4245 			attributes |= FILE_FLAG_WRITE_THROUGH;
4246 		}
4247 		break;
4248 
4249 	case SRV_O_DIRECT_NO_FSYNC:
4250 	case SRV_O_DIRECT:
4251 		if (type == OS_DATA_FILE) {
4252 			attributes |= FILE_FLAG_NO_BUFFERING;
4253 		}
4254 		break;
4255 
4256 	case SRV_ALL_O_DIRECT_FSYNC:
4257 		/*Traditional Windows behavior, no buffering for any files.*/
4258 		if (type != OS_DATA_FILE_NO_O_DIRECT) {
4259 			attributes |= FILE_FLAG_NO_BUFFERING;
4260 		}
4261 		break;
4262 
4263 	case SRV_FSYNC:
4264 	case SRV_LITTLESYNC:
4265 		break;
4266 
4267 	case SRV_NOSYNC:
4268 		/* Let Windows cache manager handle all writes.*/
4269 		attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
4270 		break;
4271 
4272 	default:
4273 		ut_a(false); /* unknown flush mode.*/
4274 	}
4275 
4276 
4277 	// TODO: Create a bug, this looks wrong. The flush log
4278 	// parameter is dynamic.
4279 	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
4280 		/* Do not use unbuffered i/o for the log files because
4281 		value 2 denotes that we do not flush the log at every
4282 		commit, but only once per second */
4283 		attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
4284 	}
4285 
4286 
4287 	DWORD	access = GENERIC_READ;
4288 
4289 	if (!read_only) {
4290 		access |= GENERIC_WRITE;
4291 	}
4292 
4293 	for (;;) {
4294 		const  char *operation;
4295 
4296 		/* Use default security attributes and no template file. */
4297 		file = CreateFile(
4298 			name, access, share_mode, NULL,
4299 			create_flag, attributes, NULL);
4300 
4301 		/* If FILE_FLAG_NO_BUFFERING was set, check if this can work at all,
4302 		for expected IO sizes. Reopen without the unbuffered flag, if it is won't work*/
4303 		if ((file != INVALID_HANDLE_VALUE)
4304 			&& (attributes & FILE_FLAG_NO_BUFFERING)
4305 			&& (type == OS_LOG_FILE)
4306 			&& !unbuffered_io_possible(file, OS_FILE_LOG_BLOCK_SIZE)) {
4307 				ut_a(CloseHandle(file));
4308 				attributes &= ~FILE_FLAG_NO_BUFFERING;
4309 				create_flag = OPEN_ALWAYS;
4310 				continue;
4311 		}
4312 
4313 		*success = (file != INVALID_HANDLE_VALUE);
4314 		if (*success) {
4315 			break;
4316 		}
4317 
4318 		operation = (create_mode == OS_FILE_CREATE && !read_only) ?
4319 			"create" : "open";
4320 
4321 		if (on_error_no_exit) {
4322 			retry = os_file_handle_error_no_exit(
4323 				name, operation, on_error_silent);
4324 		}
4325 		else {
4326 			retry = os_file_handle_error(name, operation);
4327 		}
4328 
4329 		if (!retry) {
4330 			break;
4331 		}
4332 	}
4333 
4334 	if (*success && srv_use_native_aio &&  (attributes & FILE_FLAG_OVERLAPPED)) {
4335 		/* Bind the file handle to completion port. Completion port
4336 		might not be created yet, in some stages of backup, but
4337 		must always be there for the server.*/
4338 		HANDLE port = (type == OS_LOG_FILE) ?
4339 			log_completion_port : data_completion_port;
4340 		ut_a(port || srv_operation != SRV_OPERATION_NORMAL);
4341 		if (port) {
4342 			ut_a(CreateIoCompletionPort(file, port, 0, 0));
4343 		}
4344 	}
4345 
4346 	return(file);
4347 }
4348 
4349 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4350 not directly this function!
4351 A simple function to open or create a file.
4352 @param[in]	name		name of the file or path as a null-terminated
4353 				string
4354 @param[in]	create_mode	create mode
4355 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4356 				OS_FILE_READ_ALLOW_DELETE; the last option is
4357 				used by a backup program reading the file
4358 @param[out]	success		true if succeeded
4359 @return own: handle to the file, not defined if error, error number
4360 	can be retrieved with os_file_get_last_error */
4361 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4362 os_file_create_simple_no_error_handling_func(
4363 	const char*	name,
4364 	ulint		create_mode,
4365 	ulint		access_type,
4366 	bool		read_only,
4367 	bool*		success)
4368 {
4369 	os_file_t	file;
4370 
4371 	*success = false;
4372 
4373 	DWORD		access;
4374 	DWORD		create_flag;
4375 	DWORD		attributes	= 0;
4376 	DWORD		share_mode = srv_operation != SRV_OPERATION_NORMAL
4377 		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
4378 		: FILE_SHARE_READ | FILE_SHARE_DELETE;
4379 
4380 	ut_a(name);
4381 
4382 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4383 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4384 
4385 	if (create_mode == OS_FILE_OPEN) {
4386 
4387 		create_flag = OPEN_EXISTING;
4388 
4389 	} else if (read_only) {
4390 
4391 		create_flag = OPEN_EXISTING;
4392 
4393 	} else if (create_mode == OS_FILE_CREATE) {
4394 
4395 		create_flag = CREATE_NEW;
4396 
4397 	} else {
4398 
4399 		ib::error()
4400 			<< "Unknown file create mode (" << create_mode << ") "
4401 			<< " for file '" << name << "'";
4402 
4403 		return(OS_FILE_CLOSED);
4404 	}
4405 
4406 	if (access_type == OS_FILE_READ_ONLY) {
4407 
4408 		access = GENERIC_READ;
4409 
4410 	} else if (read_only) {
4411 
4412 		access = GENERIC_READ;
4413 
4414 	} else if (access_type == OS_FILE_READ_WRITE) {
4415 
4416 		access = GENERIC_READ | GENERIC_WRITE;
4417 
4418 	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4419 
4420 		ut_a(!read_only);
4421 
4422 		access = GENERIC_READ;
4423 
4424 		/*!< A backup program has to give mysqld the maximum
4425 		freedom to do what it likes with the file */
4426 
4427 		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
4428 			| FILE_SHARE_READ;
4429 	} else {
4430 
4431 		ib::error()
4432 			<< "Unknown file access type (" << access_type << ") "
4433 			<< "for file '" << name << "'";
4434 
4435 		return(OS_FILE_CLOSED);
4436 	}
4437 
4438 	file = CreateFile((LPCTSTR) name,
4439 			  access,
4440 			  share_mode,
4441 			  NULL,			// Security attributes
4442 			  create_flag,
4443 			  attributes,
4444 			  NULL);		// No template file
4445 
4446 	*success = (file != INVALID_HANDLE_VALUE);
4447 
4448 	return(file);
4449 }
4450 
4451 /** Deletes a file if it exists. The file has to be closed before calling this.
4452 @param[in]	name		file path as a null-terminated string
4453 @param[out]	exist		indicate if file pre-exist
4454 @return true if success */
4455 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4456 os_file_delete_if_exists_func(
4457 	const char*	name,
4458 	bool*		exist)
4459 {
4460 	ulint	count	= 0;
4461 
4462 	if (exist != NULL) {
4463 		*exist = true;
4464 	}
4465 
4466 	for (;;) {
4467 		/* In Windows, deleting an .ibd file may fail if
4468 		the file is being accessed by an external program,
4469 		such as a backup tool. */
4470 
4471 		bool	ret = DeleteFile((LPCTSTR) name);
4472 
4473 		if (ret) {
4474 			return(true);
4475 		}
4476 
4477 		DWORD	lasterr = GetLastError();
4478 
4479 		if (lasterr == ERROR_FILE_NOT_FOUND
4480 		    || lasterr == ERROR_PATH_NOT_FOUND) {
4481 
4482 			/* the file does not exist, this not an error */
4483 			if (exist != NULL) {
4484 				*exist = false;
4485 			}
4486 
4487 			return(true);
4488 		}
4489 
4490 		++count;
4491 
4492 		if (count > 100 && 0 == (count % 10)) {
4493 
4494 			/* Print error information */
4495 			os_file_get_last_error(true);
4496 
4497 			ib::warn() << "Delete of file '" << name << "' failed.";
4498 		}
4499 
4500 		/* Sleep for a second */
4501 		os_thread_sleep(1000000);
4502 
4503 		if (count > 2000) {
4504 
4505 			return(false);
4506 		}
4507 	}
4508 }
4509 
4510 /** Deletes a file. The file has to be closed before calling this.
4511 @param[in]	name		File path as NUL terminated string
4512 @return true if success */
4513 bool
os_file_delete_func(const char * name)4514 os_file_delete_func(
4515 	const char*	name)
4516 {
4517 	ulint	count	= 0;
4518 
4519 	for (;;) {
4520 		/* In Windows, deleting an .ibd file may fail if
4521 		the file is being accessed by an external program,
4522 		such as a backup tool. */
4523 
4524 		BOOL	ret = DeleteFile((LPCTSTR) name);
4525 
4526 		if (ret) {
4527 			return(true);
4528 		}
4529 
4530 		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
4531 			/* If the file does not exist, we classify this as
4532 			a 'mild' error and return */
4533 
4534 			return(false);
4535 		}
4536 
4537 		++count;
4538 
4539 		if (count > 100 && 0 == (count % 10)) {
4540 
4541 			/* print error information */
4542 			os_file_get_last_error(true);
4543 
4544 			ib::warn()
4545 				<< "Cannot delete file '" << name << "'. Is "
4546 				<< "another program accessing it?";
4547 		}
4548 
4549 		/* sleep for a second */
4550 		os_thread_sleep(1000000);
4551 
4552 		if (count > 2000) {
4553 
4554 			return(false);
4555 		}
4556 	}
4557 
4558 	ut_error;
4559 	return(false);
4560 }
4561 
4562 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
4563 function!
4564 Renames a file (can also move it to another directory). It is safest that the
4565 file is closed before calling this function.
4566 @param[in]	oldpath		old file path as a null-terminated string
4567 @param[in]	newpath		new file path
4568 @return true if success */
4569 bool
os_file_rename_func(const char * oldpath,const char * newpath)4570 os_file_rename_func(
4571 	const char*	oldpath,
4572 	const char*	newpath)
4573 {
4574 #ifdef UNIV_DEBUG
4575 	os_file_type_t	type;
4576 	bool		exists;
4577 
4578 	/* New path must not exist. */
4579 	ut_ad(os_file_status(newpath, &exists, &type));
4580 	ut_ad(!exists);
4581 
4582 	/* Old path must exist. */
4583 	ut_ad(os_file_status(oldpath, &exists, &type));
4584 	ut_ad(exists);
4585 #endif /* UNIV_DEBUG */
4586 
4587 	if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
4588 		return(true);
4589 	}
4590 
4591 	os_file_handle_rename_error(oldpath, newpath);
4592 	return(false);
4593 }
4594 
4595 /** NOTE! Use the corresponding macro os_file_close(), not directly
4596 this function!
4597 Closes a file handle. In case of error, error number can be retrieved with
4598 os_file_get_last_error.
4599 @param[in,own]	file		Handle to a file
4600 @return true if success */
4601 bool
os_file_close_func(os_file_t file)4602 os_file_close_func(
4603 	os_file_t	file)
4604 {
4605 	ut_a(file);
4606 
4607 	if (CloseHandle(file)) {
4608 		return(true);
4609 	}
4610 
4611 	os_file_handle_error(NULL, "close");
4612 
4613 	return(false);
4614 }
4615 
4616 /** Gets a file size.
4617 @param[in]	file		Handle to a file
4618 @return file size, or (os_offset_t) -1 on failure */
4619 os_offset_t
os_file_get_size(os_file_t file)4620 os_file_get_size(
4621 	os_file_t	file)
4622 {
4623 	DWORD		high;
4624 	DWORD		low = GetFileSize(file, &high);
4625 
4626 	if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
4627 		return((os_offset_t) -1);
4628 	}
4629 
4630 	return(os_offset_t(low | (os_offset_t(high) << 32)));
4631 }
4632 
4633 /** Gets a file size.
4634 @param[in]	filename	Full path to the filename to check
4635 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
4636 	errno */
4637 os_file_size_t
os_file_get_size(const char * filename)4638 os_file_get_size(
4639 	const char*	filename)
4640 {
4641 	struct __stat64	s;
4642 	os_file_size_t	file_size;
4643 
4644 	int		ret = _stat64(filename, &s);
4645 
4646 	if (ret == 0) {
4647 
4648 		file_size.m_total_size = s.st_size;
4649 
4650 		DWORD	low_size;
4651 		DWORD	high_size;
4652 
4653 		low_size = GetCompressedFileSize(filename, &high_size);
4654 
4655 		if (low_size != INVALID_FILE_SIZE) {
4656 
4657 			file_size.m_alloc_size = high_size;
4658 			file_size.m_alloc_size <<= 32;
4659 			file_size.m_alloc_size |= low_size;
4660 
4661 		} else {
4662 			ib::error()
4663 				<< "GetCompressedFileSize("
4664 				<< filename << ", ..) failed.";
4665 
4666 			file_size.m_alloc_size = (os_offset_t) -1;
4667 		}
4668 	} else {
4669 		file_size.m_total_size = ~0;
4670 		file_size.m_alloc_size = (os_offset_t) ret;
4671 	}
4672 
4673 	return(file_size);
4674 }
4675 
4676 /** This function returns information about the specified file
4677 @param[in]	path		pathname of the file
4678 @param[out]	stat_info	information of a file in a directory
4679 @param[in,out]	statinfo	information of a file in a directory
4680 @param[in]	check_rw_perm	for testing whether the file can be opened
4681 				in RW mode
4682 @param[in]	read_only	true if the file is opened in read-only mode
4683 @return DB_SUCCESS if all OK */
4684 static
4685 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)4686 os_file_get_status_win32(
4687 	const char*	path,
4688 	os_file_stat_t* stat_info,
4689 	struct _stat64*	statinfo,
4690 	bool		check_rw_perm,
4691 	bool		read_only)
4692 {
4693 	int	ret = _stat64(path, statinfo);
4694 
4695 	if (ret && (errno == ENOENT || errno == ENOTDIR
4696 		    || errno == ENAMETOOLONG)) {
4697 		/* file does not exist */
4698 
4699 		return(DB_NOT_FOUND);
4700 
4701 	} else if (ret) {
4702 		/* file exists, but stat call failed */
4703 
4704 		os_file_handle_error_no_exit(path, "STAT", false);
4705 
4706 		return(DB_FAIL);
4707 
4708 	} else if (_S_IFDIR & statinfo->st_mode) {
4709 
4710 		stat_info->type = OS_FILE_TYPE_DIR;
4711 
4712 	} else if (_S_IFREG & statinfo->st_mode) {
4713 
4714 		DWORD	access = GENERIC_READ;
4715 
4716 		if (!read_only) {
4717 			access |= GENERIC_WRITE;
4718 		}
4719 
4720 		stat_info->type = OS_FILE_TYPE_FILE;
4721 
4722 		/* Check if we can open it in read-only mode. */
4723 
4724 		if (check_rw_perm) {
4725 			HANDLE	fh;
4726 
4727 			fh = CreateFile(
4728 				(LPCTSTR) path,		// File to open
4729 				access,
4730 				FILE_SHARE_READ | FILE_SHARE_WRITE
4731 				| FILE_SHARE_DELETE,	// Full sharing
4732 				NULL,			// Default security
4733 				OPEN_EXISTING,		// Existing file only
4734 				FILE_ATTRIBUTE_NORMAL,	// Normal file
4735 				NULL);			// No attr. template
4736 
4737 			if (fh == INVALID_HANDLE_VALUE) {
4738 				stat_info->rw_perm = false;
4739 			} else {
4740 				stat_info->rw_perm = true;
4741 				CloseHandle(fh);
4742 			}
4743 		}
4744 		stat_info->block_size = 0;
4745 
4746 		/* What follows, is calculation of FS block size, which is not important
4747 		(it is just shown in I_S innodb tables). The error to calculate it will be ignored.*/
4748 		char	volname[MAX_PATH];
4749 		BOOL	result = GetVolumePathName(path, volname, MAX_PATH);
4750 		static	bool warned_once = false;
4751 		if (!result) {
4752 			if (!warned_once) {
4753 				ib::warn()
4754 					<< "os_file_get_status_win32: "
4755 					<< "Failed to get the volume path name for: "
4756 					<< path
4757 					<< "- OS error number " << GetLastError();
4758 				warned_once = true;
4759 			}
4760 			return(DB_SUCCESS);
4761 		}
4762 
4763 		DWORD	sectorsPerCluster;
4764 		DWORD	bytesPerSector;
4765 		DWORD	numberOfFreeClusters;
4766 		DWORD	totalNumberOfClusters;
4767 
4768 		result = GetDiskFreeSpace(
4769 			(LPCSTR) volname,
4770 			&sectorsPerCluster,
4771 			&bytesPerSector,
4772 			&numberOfFreeClusters,
4773 			&totalNumberOfClusters);
4774 
4775 		if (!result) {
4776 			if (!warned_once) {
4777 				ib::warn()
4778 					<< "GetDiskFreeSpace(" << volname << ",...) "
4779 					<< "failed "
4780 					<< "- OS error number " << GetLastError();
4781 				warned_once = true;
4782 			}
4783 			return(DB_SUCCESS);
4784 		}
4785 		stat_info->block_size = bytesPerSector * sectorsPerCluster;
4786 	} else {
4787 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
4788 	}
4789 
4790 	return(DB_SUCCESS);
4791 }
4792 
4793 /**
4794 Sets a sparse flag on Windows file.
4795 @param[in]	file  file handle
4796 @return true on success, false on error
4797 */
4798 #include <versionhelpers.h>
os_file_set_sparse_win32(os_file_t file,bool is_sparse)4799 bool os_file_set_sparse_win32(os_file_t file, bool is_sparse)
4800 {
4801 	if (!is_sparse && !IsWindows8OrGreater()) {
4802 		/* Cannot  unset sparse flag on older Windows.
4803 		Until Windows8 it is documented to produce unpredictable results,
4804 		if there are unallocated ranges in file.*/
4805 		return false;
4806 	}
4807 	DWORD temp;
4808 	FILE_SET_SPARSE_BUFFER sparse_buffer;
4809 	sparse_buffer.SetSparse = is_sparse;
4810 	return os_win32_device_io_control(file,
4811 		FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp);
4812 }
4813 
4814 
4815 /**
4816 Change file size on Windows.
4817 
4818 If file is extended, the bytes between old and new EOF
4819 are zeros.
4820 
4821 If file is sparse, "virtual" block is added at the end of
4822 allocated area.
4823 
4824 If file is normal, file system allocates storage.
4825 
4826 @param[in]	pathname	file path
4827 @param[in]	file		file handle
4828 @param[in]	size		size to preserve in bytes
4829 @return true if success */
4830 bool
os_file_change_size_win32(const char * pathname,os_file_t file,os_offset_t size)4831 os_file_change_size_win32(
4832 	const char*	pathname,
4833 	os_file_t	file,
4834 	os_offset_t	size)
4835 {
4836 	LARGE_INTEGER	length;
4837 
4838 	length.QuadPart = size;
4839 
4840 	BOOL	success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);
4841 
4842 	if (!success) {
4843 		os_file_handle_error_no_exit(
4844 			pathname, "SetFilePointerEx", false);
4845 	} else {
4846 		success = SetEndOfFile(file);
4847 		if (!success) {
4848 			os_file_handle_error_no_exit(
4849 				pathname, "SetEndOfFile", false);
4850 		}
4851 	}
4852 	return(success);
4853 }
4854 
4855 /** Truncates a file at its current position.
4856 @param[in]	file		Handle to be truncated
4857 @return true if success */
4858 bool
os_file_set_eof(FILE * file)4859 os_file_set_eof(
4860 	FILE*		file)
4861 {
4862 	HANDLE	h = (HANDLE) _get_osfhandle(fileno(file));
4863 
4864 	return(SetEndOfFile(h));
4865 }
4866 
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
This free function only forwards to
AIO::simulated_put_read_threads_to_sleep(). */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	AIO::simulated_put_read_threads_to_sleep();
}
4876 
4877 /** This function can be called if one wants to post a batch of reads and
4878 prefers an i/o-handler thread to handle them all at once later. You must
4879 call os_aio_simulated_wake_handler_threads later to ensure the threads
4880 are not left sleeping! */
4881 void
simulated_put_read_threads_to_sleep()4882 AIO::simulated_put_read_threads_to_sleep()
4883 {
4884 	/* The idea of putting background IO threads to sleep is only for
4885 	Windows when using simulated AIO. Windows XP seems to schedule
4886 	background threads too eagerly to allow for coalescing during
4887 	readahead requests. */
4888 
4889 	if (srv_use_native_aio) {
4890 		/* We do not use simulated AIO: do nothing */
4891 
4892 		return;
4893 	}
4894 
4895 	os_aio_recommend_sleep_for_read_threads	= true;
4896 
4897 	for (ulint i = 0; i < os_aio_n_segments; i++) {
4898 		AIO*	array;
4899 
4900 		get_array_and_local_segment(&array, i);
4901 
4902 		if (array == s_reads) {
4903 
4904 			os_event_reset(os_aio_segment_wait_events[i]);
4905 		}
4906 	}
4907 }
4908 
4909 #endif /* !_WIN32*/
4910 
4911 /** Does a syncronous read or write depending upon the type specified
4912 In case of partial reads/writes the function tries
4913 NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
4914 @param[in]	type,		IO flags
4915 @param[in]	file		handle to an open file
4916 @param[out]	buf		buffer where to read
4917 @param[in]	offset		file offset from the start where to read
4918 @param[in]	n		number of bytes to read, starting from offset
4919 @param[out]	err		DB_SUCCESS or error code
4920 @return number of bytes read/written, -1 if error */
4921 static MY_ATTRIBUTE((warn_unused_result))
4922 ssize_t
os_file_io(const IORequest & in_type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)4923 os_file_io(
4924 	const IORequest&in_type,
4925 	os_file_t	file,
4926 	void*		buf,
4927 	ulint		n,
4928 	os_offset_t	offset,
4929 	dberr_t*	err)
4930 {
4931 	ssize_t		original_n = ssize_t(n);
4932 	IORequest	type = in_type;
4933 	ssize_t		bytes_returned = 0;
4934 
4935 	SyncFileIO	sync_file_io(file, buf, n, offset);
4936 
4937 	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
4938 
4939 		ssize_t	n_bytes = sync_file_io.execute(type);
4940 
4941 		/* Check for a hard error. Not much we can do now. */
4942 		if (n_bytes < 0) {
4943 
4944 			break;
4945 
4946 		} else if (n_bytes + bytes_returned == ssize_t(n)) {
4947 
4948 			bytes_returned += n_bytes;
4949 
4950 			if (offset > 0
4951 			    && !type.is_log()
4952 			    && type.is_write()
4953 			    && type.punch_hole()) {
4954 				*err = type.punch_hole(file, offset, n);
4955 
4956 			} else {
4957 				*err = DB_SUCCESS;
4958 			}
4959 
4960 			return(original_n);
4961 		}
4962 
4963 		/* Handle partial read/write. */
4964 
4965 		ut_ad(ulint(n_bytes + bytes_returned) < n);
4966 
4967 		bytes_returned += n_bytes;
4968 
4969 		if (!type.is_partial_io_warning_disabled()) {
4970 
4971 			const char*	op = type.is_read()
4972 				? "read" : "written";
4973 
4974 			ib::warn()
4975 				<< n
4976 				<< " bytes should have been " << op << ". Only "
4977 				<< bytes_returned
4978 				<< " bytes " << op << ". Retrying"
4979 				<< " for the remaining bytes.";
4980 		}
4981 
4982 		/* Advance the offset and buffer by n_bytes */
4983 		sync_file_io.advance(n_bytes);
4984 	}
4985 
4986 	*err = DB_IO_ERROR;
4987 
4988 	if (!type.is_partial_io_warning_disabled()) {
4989 		ib::warn()
4990 			<< "Retry attempts for "
4991 			<< (type.is_read() ? "reading" : "writing")
4992 			<< " partial data failed.";
4993 	}
4994 
4995 	return(bytes_returned);
4996 }
4997 
4998 /** Does a synchronous write operation in Posix.
4999 @param[in]	type		IO context
5000 @param[in]	file		handle to an open file
5001 @param[out]	buf		buffer from which to write
5002 @param[in]	n		number of bytes to read, starting from offset
5003 @param[in]	offset		file offset from the start where to read
5004 @param[out]	err		DB_SUCCESS or error code
5005 @return number of bytes written, -1 if error */
5006 static MY_ATTRIBUTE((warn_unused_result))
5007 ssize_t
os_file_pwrite(const IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5008 os_file_pwrite(
5009 	const IORequest&	type,
5010 	os_file_t		file,
5011 	const byte*		buf,
5012 	ulint			n,
5013 	os_offset_t		offset,
5014 	dberr_t*		err)
5015 {
5016 	ut_ad(type.validate());
5017 	ut_ad(type.is_write());
5018 
5019 	++os_n_file_writes;
5020 
5021 	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
5022 	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
5023 	ssize_t	n_bytes = os_file_io(type, file, const_cast<byte*>(buf),
5024 				     n, offset, err);
5025 	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
5026 
5027 	return(n_bytes);
5028 }
5029 
5030 /** NOTE! Use the corresponding macro os_file_write(), not directly
5031 Requests a synchronous write operation.
5032 @param[in]	type		IO flags
5033 @param[in]	file		handle to an open file
5034 @param[out]	buf		buffer from which to write
5035 @param[in]	offset		file offset from the start where to read
5036 @param[in]	n		number of bytes to read, starting from offset
5037 @return error code
5038 @retval	DB_SUCCESS	if the operation succeeded */
5039 dberr_t
os_file_write_func(const IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)5040 os_file_write_func(
5041 	const IORequest&	type,
5042 	const char*		name,
5043 	os_file_t		file,
5044 	const void*		buf,
5045 	os_offset_t		offset,
5046 	ulint			n)
5047 {
5048 	dberr_t		err;
5049 
5050 	ut_ad(type.validate());
5051 	ut_ad(n > 0);
5052 
5053 	WAIT_ALLOW_WRITES();
5054 
5055 	ssize_t	n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err);
5056 
5057 	if ((ulint) n_bytes != n && !os_has_said_disk_full) {
5058 
5059 		ib::error()
5060 			<< "Write to file " << name << " failed at offset "
5061 			<< offset << ", " << n
5062 			<< " bytes should have been written,"
5063 			" only " << n_bytes << " were written."
5064 			" Operating system error number " << IF_WIN(GetLastError(),errno) << "."
5065 			" Check that your OS and file system"
5066 			" support files of this size."
5067 			" Check also that the disk is not full"
5068 			" or a disk quota exceeded.";
5069 #ifndef _WIN32
5070 		if (strerror(errno) != NULL) {
5071 
5072 			ib::error()
5073 				<< "Error number " << errno
5074 				<< " means '" << strerror(errno) << "'";
5075 		}
5076 
5077 		ib::info() << OPERATING_SYSTEM_ERROR_MSG;
5078 #endif
5079 		os_has_said_disk_full = true;
5080 	}
5081 
5082 	return(err);
5083 }
5084 
5085 /** Does a synchronous read operation in Posix.
5086 @param[in]	type		IO flags
5087 @param[in]	file		handle to an open file
5088 @param[out]	buf		buffer where to read
5089 @param[in]	offset		file offset from the start where to read
5090 @param[in]	n		number of bytes to read, starting from offset
5091 @param[out]	err		DB_SUCCESS or error code
5092 @return number of bytes read, -1 if error */
5093 static MY_ATTRIBUTE((warn_unused_result))
5094 ssize_t
os_file_pread(const IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5095 os_file_pread(
5096 	const IORequest&	type,
5097 	os_file_t		file,
5098 	void*			buf,
5099 	ulint			n,
5100 	os_offset_t		offset,
5101 	dberr_t*		err)
5102 {
5103 	ut_ad(type.is_read());
5104 
5105 	++os_n_file_reads;
5106 
5107 	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
5108 	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
5109 	ssize_t	n_bytes = os_file_io(type, file, buf, n, offset, err);
5110 	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
5111 
5112 	return(n_bytes);
5113 }
5114 
5115 /** Requests a synchronous positioned read operation.
5116 @return DB_SUCCESS if request was successful, false if fail
5117 @param[in]	type		IO flags
5118 @param[in]	file		handle to an open file
5119 @param[out]	buf		buffer where to read
5120 @param[in]	offset		file offset from the start where to read
5121 @param[in]	n		number of bytes to read, starting from offset
5122 @param[out]	o		number of bytes actually read
5123 @param[in]	exit_on_err	if true then exit on error
5124 @return DB_SUCCESS or error code */
5125 static MY_ATTRIBUTE((warn_unused_result))
5126 dberr_t
os_file_read_page(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err)5127 os_file_read_page(
5128 	const IORequest&	type,
5129 	os_file_t		file,
5130 	void*			buf,
5131 	os_offset_t		offset,
5132 	ulint			n,
5133 	ulint*			o,
5134 	bool			exit_on_err)
5135 {
5136 	dberr_t		err;
5137 
5138 	os_bytes_read_since_printout += n;
5139 
5140 	ut_ad(type.validate());
5141 	ut_ad(n > 0);
5142 
5143 	ssize_t	n_bytes = os_file_pread(type, file, buf, n, offset, &err);
5144 
5145 	if (o) {
5146 		*o = n_bytes;
5147 	}
5148 
5149 	if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) {
5150 		return err;
5151 	}
5152 
5153 	ib::error() << "Tried to read " << n << " bytes at offset "
5154 		    << offset << ", but was only able to read " << n_bytes;
5155 
5156 	if (!os_file_handle_error_cond_exit(
5157 		    NULL, "read", exit_on_err, false)) {
5158 		ib::fatal()
5159 			<< "Cannot read from file. OS error number "
5160 			<< errno << ".";
5161 	}
5162 
5163 	if (err == DB_SUCCESS) {
5164 		err = DB_IO_ERROR;
5165 	}
5166 
5167 	return err;
5168 }
5169 
/** Retrieves the last error number if an error occurs in a file io function.
The number should be retrieved before any other OS calls (because they may
overwrite the error number). If the number is not known to this program,
the OS error number + 100 is returned.
This is a thin wrapper that forwards to os_file_get_last_error_low() with
false as the second argument.
@param[in]	report_all_errors	true if we want an error printed
					for all errors
@return error number, or OS error number + 100 */
ulint
os_file_get_last_error(
	bool	report_all_errors)
{
	/* NOTE(review): the second argument is presumably the
	"on_error_silent" switch of the low-level function - confirm. */
	return(os_file_get_last_error_low(report_all_errors, false));
}
5183 
5184 /** Handle errors for file operations.
5185 @param[in]	name		name of a file or NULL
5186 @param[in]	operation	operation
5187 @param[in]	should_abort	whether to abort on an unknown error
5188 @param[in]	on_error_silent	whether to suppress reports of non-fatal errors
5189 @return true if we should retry the operation */
5190 static MY_ATTRIBUTE((warn_unused_result))
5191 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_abort,bool on_error_silent)5192 os_file_handle_error_cond_exit(
5193 	const char*	name,
5194 	const char*	operation,
5195 	bool		should_abort,
5196 	bool		on_error_silent)
5197 {
5198 	ulint	err;
5199 
5200 	err = os_file_get_last_error_low(false, on_error_silent);
5201 
5202 	switch (err) {
5203 	case OS_FILE_DISK_FULL:
5204 		/* We only print a warning about disk full once */
5205 
5206 		if (os_has_said_disk_full) {
5207 
5208 			return(false);
5209 		}
5210 
5211 		/* Disk full error is reported irrespective of the
5212 		on_error_silent setting. */
5213 
5214 		if (name) {
5215 
5216 			ib::error()
5217 				<< "Encountered a problem with file '"
5218 				<< name << "'";
5219 		}
5220 
5221 		ib::error()
5222 			<< "Disk is full. Try to clean the disk to free space.";
5223 
5224 		os_has_said_disk_full = true;
5225 
5226 		return(false);
5227 
5228 	case OS_FILE_AIO_RESOURCES_RESERVED:
5229 	case OS_FILE_AIO_INTERRUPTED:
5230 
5231 		return(true);
5232 
5233 	case OS_FILE_PATH_ERROR:
5234 	case OS_FILE_ALREADY_EXISTS:
5235 	case OS_FILE_ACCESS_VIOLATION:
5236 
5237 		return(false);
5238 
5239 	case OS_FILE_SHARING_VIOLATION:
5240 
5241 		os_thread_sleep(10000000);	/* 10 sec */
5242 		return(true);
5243 
5244 	case OS_FILE_OPERATION_ABORTED:
5245 	case OS_FILE_INSUFFICIENT_RESOURCE:
5246 
5247 		os_thread_sleep(100000);	/* 100 ms */
5248 		return(true);
5249 
5250 	default:
5251 
5252 		/* If it is an operation that can crash on error then it
5253 		is better to ignore on_error_silent and print an error message
5254 		to the log. */
5255 
5256 		if (should_abort || !on_error_silent) {
5257 			ib::error() << "File "
5258 				<< (name != NULL ? name : "(unknown)")
5259 				<< ": '" << operation << "'"
5260 				" returned OS error " << err << "."
5261 				<< (should_abort
5262 				    ? " Cannot continue operation" : "");
5263 		}
5264 
5265 		if (should_abort) {
5266 			abort();
5267 		}
5268 	}
5269 
5270 	return(false);
5271 }
5272 
5273 #ifndef _WIN32
5274 /** Tries to disable OS caching on an opened file descriptor.
5275 @param[in]	fd		file descriptor to alter
5276 @param[in]	file_name	file name, used in the diagnostic message
5277 @param[in]	name		"open" or "create"; used in the diagnostic
5278 				message */
5279 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5280 os_file_set_nocache(
5281 	int	fd		MY_ATTRIBUTE((unused)),
5282 	const char*	file_name	MY_ATTRIBUTE((unused)),
5283 	const char*	operation_name	MY_ATTRIBUTE((unused)))
5284 {
5285 	/* some versions of Solaris may not have DIRECTIO_ON */
5286 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5287 	if (directio(fd, DIRECTIO_ON) == -1) {
5288 		int	errno_save = errno;
5289 
5290 		ib::error()
5291 			<< "Failed to set DIRECTIO_ON on file "
5292 			<< file_name << "; " << operation_name << ": "
5293 			<< strerror(errno_save) << ","
5294 			" continuing anyway.";
5295 	}
5296 #elif defined(O_DIRECT)
5297 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5298 		int		errno_save = errno;
5299 		static bool	warning_message_printed = false;
5300 		if (errno_save == EINVAL) {
5301 			if (!warning_message_printed) {
5302 				warning_message_printed = true;
5303 # ifdef UNIV_LINUX
5304 				ib::warn()
5305 					<< "Failed to set O_DIRECT on file"
5306 					<< file_name << "; " << operation_name
5307 					<< ": " << strerror(errno_save) << ", "
5308 					"continuing anyway. O_DIRECT is "
5309 					"known to result in 'Invalid argument' "
5310 					"on Linux on tmpfs, "
5311 					"see MySQL Bug#26662.";
5312 # else /* UNIV_LINUX */
5313 				goto short_warning;
5314 # endif /* UNIV_LINUX */
5315 			}
5316 		} else {
5317 # ifndef UNIV_LINUX
5318 short_warning:
5319 # endif
5320 			ib::warn()
5321 				<< "Failed to set O_DIRECT on file "
5322 				<< file_name << "; " << operation_name
5323 				<< " : " << strerror(errno_save)
5324 				<< ", continuing anyway.";
5325 		}
5326 	}
5327 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5328 }
5329 
5330 #endif /* _WIN32 */
5331 
/** Extend a file.

On Windows, extending a file allocates blocks for the file,
unless the file is sparse.

On Unix, we will extend the file with ftruncate(), if
file needs to be sparse. Otherwise posix_fallocate() is used
when available, and if not, binary zeroes are added to the end
of file.

The zero-filling loop and the posix_fallocate() retry loop both stop
once srv_shutdown_state has advanced past SRV_SHUTDOWN_INITIATED; in
that case the function reports failure.

@param[in]	name		file name
@param[in]	file		file handle
@param[in]	size		desired file size; a debug assertion
				requires a multiple of 4096
@param[in]	is_sparse	whether to create a sparse file
				(no preallocating)
@return	whether the operation succeeded */
bool
os_file_set_size(
	const char*	name,
	os_file_t	file,
	os_offset_t	size,
	bool	is_sparse)
{
	ut_ad(!(size & 4095));

#ifdef _WIN32
	/* On Windows, changing file size works well and as expected for both
	sparse and normal files.

	However, 10.2 up until 10.2.9 made every file sparse in innodb,
	causing NTFS fragmentation issues(MDEV-13941). We try to undo
	the damage, and unsparse the file.*/

	if (!is_sparse && os_is_sparse_file_supported(file)) {
		if (!os_file_set_sparse_win32(file, false))
			/* Unsparsing file failed. Fallback to writing binary
			zeros, to avoid even higher fragmentation.*/
			goto fallback;
	}

	return os_file_change_size_win32(name, file, size);

fallback:
#else
	struct stat statbuf;

	if (is_sparse) {
		/* A sparse file is only resized; nothing is preallocated. */
		bool success = !ftruncate(file, size);
		if (!success) {
			ib::error() << "ftruncate of file " << name << " to "
				    << size << " bytes failed with error "
				    << errno;
		}
		return(success);
	}

# ifdef HAVE_POSIX_FALLOCATE
	int err;
	/* Retry while the call is interrupted, unless shutdown has
	progressed beyond SRV_SHUTDOWN_INITIATED. */
	do {
		if (fstat(file, &statbuf)) {
			err = errno;
		} else {
			os_offset_t current_size = statbuf.st_size;
			if (current_size >= size) {
				/* Already large enough: nothing to do. */
				return true;
			}
			/* Start the allocation at a 4096-byte boundary. */
			current_size &= ~4095ULL;
			err = posix_fallocate(file, current_size,
					      size - current_size);
		}
	} while (err == EINTR
		 && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);

	switch (err) {
	case 0:
		return true;
	default:
		ib::error() << "preallocating "
			    << size << " bytes for file " << name
			    << " failed with error " << err;
		/* fall through */
	case EINTR:
		/* EINTR itself is not logged; either way the error code
		is left in errno for the caller. */
		errno = err;
		return false;
	case EINVAL:
	case EOPNOTSUPP:
		/* fall back to the code below */
		break;
	}
# endif /* HAVE_POSIX_FALLOCATE */
#endif /* _WIN32*/

	/* Fallback: extend the file by explicitly writing zero bytes. */
#ifdef _WIN32
	os_offset_t	current_size = os_file_get_size(file);
	FILE_STORAGE_INFO info;
	if (GetFileInformationByHandleEx(file, FileStorageInfo, &info,
					 sizeof info)) {
		if (info.LogicalBytesPerSector) {
			/* Round the starting offset down to a sector
			boundary. */
			current_size &= ~os_offset_t(info.LogicalBytesPerSector
						     - 1);
		}
	}
#else
	if (fstat(file, &statbuf)) {
		/* Note: this failure is not logged. */
		return false;
	}
	os_offset_t current_size = statbuf.st_size & ~4095ULL;
#endif
	if (current_size >= size) {
		return true;
	}

	/* Write up to 64 pages at a time (1 megabyte when the page size
	is 16 KiB). */
	ulint	buf_size = ut_min(ulint(64),
				  ulint(size >> srv_page_size_shift))
		<< srv_page_size_shift;

	/* Align the buffer for possible raw i/o */
	byte*	buf2;

	/* Over-allocate by one page so that an aligned start address
	always fits inside the allocation. */
	buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + srv_page_size));

	byte*	buf = static_cast<byte*>(ut_align(buf2, srv_page_size));

	/* Write buffer full of zeros */
	memset(buf, 0, buf_size);

	while (current_size < size
	       && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
		ulint	n_bytes;

		/* The final chunk may be shorter than the buffer. */
		if (size - current_size < (os_offset_t) buf_size) {
			n_bytes = (ulint) (size - current_size);
		} else {
			n_bytes = buf_size;
		}

		dberr_t		err;
		IORequest	request(IORequest::WRITE);

		err = os_file_write(
			request, name, file, buf, current_size, n_bytes);

		if (err != DB_SUCCESS) {
			break;
		}

		current_size += n_bytes;
	}

	/* Free the original (unaligned) pointer, not the aligned one. */
	ut_free(buf2);

	/* Success requires both reaching the target size and a
	successful flush. */
	return(current_size >= size && os_file_flush(file));
}
5485 
5486 /** Truncate a file to a specified size in bytes.
5487 @param[in]	pathname	file path
5488 @param[in]	file		file to be truncated
5489 @param[in]	size		size preserved in bytes
5490 @param[in]	allow_shrink	whether to allow the file to become smaller
5491 @return true if success */
5492 bool
os_file_truncate(const char * pathname,os_file_t file,os_offset_t size,bool allow_shrink)5493 os_file_truncate(
5494 	const char*	pathname,
5495 	os_file_t	file,
5496 	os_offset_t	size,
5497 	bool		allow_shrink)
5498 {
5499 	if (!allow_shrink) {
5500 		/* Do nothing if the size preserved is larger than or
5501 		equal to the current size of file */
5502 		os_offset_t	size_bytes = os_file_get_size(file);
5503 
5504 		if (size >= size_bytes) {
5505 			return(true);
5506 		}
5507 	}
5508 
5509 #ifdef _WIN32
5510 	return(os_file_change_size_win32(pathname, file, size));
5511 #else /* _WIN32 */
5512 	return(os_file_truncate_posix(pathname, file, size));
5513 #endif /* _WIN32 */
5514 }
5515 
5516 /** NOTE! Use the corresponding macro os_file_read(), not directly this
5517 function!
5518 Requests a synchronous positioned read operation.
5519 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5520 @param[in]	type		IO flags
5521 @param[in]	file		handle to an open file
5522 @param[out]	buf		buffer where to read
5523 @param[in]	offset		file offset from the start where to read
5524 @param[in]	n		number of bytes to read, starting from offset
5525 @return error code
5526 @retval	DB_SUCCESS	if the operation succeeded */
5527 dberr_t
os_file_read_func(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)5528 os_file_read_func(
5529 	const IORequest&	type,
5530 	os_file_t		file,
5531 	void*			buf,
5532 	os_offset_t		offset,
5533 	ulint			n)
5534 {
5535 	return(os_file_read_page(type, file, buf, offset, n, NULL, true));
5536 }
5537 
5538 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
5539 not directly this function!
5540 Requests a synchronous positioned read operation.
5541 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5542 @param[in]	type		IO flags
5543 @param[in]	file		handle to an open file
5544 @param[out]	buf		buffer where to read
5545 @param[in]	offset		file offset from the start where to read
5546 @param[in]	n		number of bytes to read, starting from offset
5547 @param[out]	o		number of bytes actually read
5548 @return DB_SUCCESS or error code */
5549 dberr_t
os_file_read_no_error_handling_func(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)5550 os_file_read_no_error_handling_func(
5551 	const IORequest&	type,
5552 	os_file_t		file,
5553 	void*			buf,
5554 	os_offset_t		offset,
5555 	ulint			n,
5556 	ulint*			o)
5557 {
5558 	return(os_file_read_page(type, file, buf, offset, n, o, false));
5559 }
5560 
5561 /** Check the existence and type of the given file.
5562 @param[in]	path		path name of file
5563 @param[out]	exists		true if the file exists
5564 @param[out]	type		Type of the file, if it exists
5565 @return true if call succeeded */
5566 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)5567 os_file_status(
5568 	const char*	path,
5569 	bool*		exists,
5570 	os_file_type_t* type)
5571 {
5572 #ifdef _WIN32
5573 	return(os_file_status_win32(path, exists, type));
5574 #else
5575 	return(os_file_status_posix(path, exists, type));
5576 #endif /* _WIN32 */
5577 }
5578 
5579 /** Free storage space associated with a section of the file.
5580 @param[in]	fh		Open file handle
5581 @param[in]	off		Starting offset (SEEK_SET)
5582 @param[in]	len		Size of the hole
5583 @return DB_SUCCESS or error code */
5584 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)5585 os_file_punch_hole(
5586 	os_file_t	fh,
5587 	os_offset_t	off,
5588 	os_offset_t	len)
5589 {
5590 	dberr_t err;
5591 
5592 #ifdef _WIN32
5593 	err = os_file_punch_hole_win32(fh, off, len);
5594 #else
5595 	err = os_file_punch_hole_posix(fh, off, len);
5596 #endif /* _WIN32 */
5597 
5598 	return (err);
5599 }
5600 
5601 /** Free storage space associated with a section of the file.
5602 @param[in]	fh		Open file handle
5603 @param[in]	off		Starting offset (SEEK_SET)
5604 @param[in]	len		Size of the hole
5605 @return DB_SUCCESS or error code */
5606 dberr_t
punch_hole(os_file_t fh,os_offset_t off,ulint len)5607 IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
5608 {
5609 	/* In this debugging mode, we act as if punch hole is supported,
5610 	and then skip any calls to actually punch a hole here.
5611 	In this way, Transparent Page Compression is still being tested. */
5612 	DBUG_EXECUTE_IF("ignore_punch_hole",
5613 		return(DB_SUCCESS);
5614 	);
5615 
5616 	ulint trim_len = get_trim_length(len);
5617 
5618 	if (trim_len == 0) {
5619 		return(DB_SUCCESS);
5620 	}
5621 
5622 	off += len;
5623 
5624 	/* Check does file system support punching holes for this
5625 	tablespace. */
5626 	if (!should_punch_hole()) {
5627 		return DB_IO_NO_PUNCH_HOLE;
5628 	}
5629 
5630 	dberr_t err = os_file_punch_hole(fh, off, trim_len);
5631 
5632 	if (err == DB_SUCCESS) {
5633 		srv_stats.page_compressed_trim_op.inc();
5634 	} else {
5635 		/* If punch hole is not supported,
5636 		set space so that it is not used. */
5637 		if (err == DB_IO_NO_PUNCH_HOLE) {
5638 			space_no_punch_hole();
5639 			err = DB_SUCCESS;
5640 		}
5641 	}
5642 
5643 	return (err);
5644 }
5645 
5646 /** Check if the file system supports sparse files.
5647 
5648 Warning: On POSIX systems we try and punch a hole from offset 0 to
5649 the system configured page size. This should only be called on an empty
5650 file.
5651 @param[in]	fh		File handle for the file - if opened
5652 @return true if the file system supports sparse files */
5653 bool
os_is_sparse_file_supported(os_file_t fh)5654 os_is_sparse_file_supported(os_file_t fh)
5655 {
5656 	/* In this debugging mode, we act as if punch hole is supported,
5657 	then we skip any calls to actually punch a hole.  In this way,
5658 	Transparent Page Compression is still being tested. */
5659 	DBUG_EXECUTE_IF("ignore_punch_hole",
5660 		return(true);
5661 	);
5662 
5663 #ifdef _WIN32
5664 	FILE_ATTRIBUTE_TAG_INFO info;
5665 	if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
5666 		&info, (DWORD)sizeof(info))) {
5667 		if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) {
5668 			return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
5669 		}
5670 	}
5671 	return false;
5672 #else
5673 	dberr_t	err;
5674 
5675 	/* We don't know the FS block size, use the sector size. The FS
5676 	will do the magic. */
5677 	err = os_file_punch_hole_posix(fh, 0, srv_page_size);
5678 
5679 	return(err == DB_SUCCESS);
5680 #endif /* _WIN32 */
5681 }
5682 
5683 /** This function returns information about the specified file
5684 @param[in]	path		pathname of the file
5685 @param[out]	stat_info	information of a file in a directory
5686 @param[in]	check_rw_perm	for testing whether the file can be opened
5687 				in RW mode
5688 @param[in]	read_only	true if file is opened in read-only mode
5689 @return DB_SUCCESS if all OK */
5690 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)5691 os_file_get_status(
5692 	const char*	path,
5693 	os_file_stat_t* stat_info,
5694 	bool		check_rw_perm,
5695 	bool		read_only)
5696 {
5697 	dberr_t	ret;
5698 
5699 #ifdef _WIN32
5700 	struct _stat64	info;
5701 
5702 	ret = os_file_get_status_win32(
5703 		path, stat_info, &info, check_rw_perm, read_only);
5704 
5705 #else
5706 	struct stat	info;
5707 
5708 	ret = os_file_get_status_posix(
5709 		path, stat_info, &info, check_rw_perm, read_only);
5710 
5711 #endif /* _WIN32 */
5712 
5713 	if (ret == DB_SUCCESS) {
5714 		stat_info->ctime = info.st_ctime;
5715 		stat_info->atime = info.st_atime;
5716 		stat_info->mtime = info.st_mtime;
5717 		stat_info->size  = info.st_size;
5718 	}
5719 
5720 	return(ret);
5721 }
5722 
5723 /**
5724 Waits for an AIO operation to complete. This function is used to wait the
5725 for completed requests. The aio array of pending requests is divided
5726 into segments. The thread specifies which segment or slot it wants to wait
5727 for. NOTE: this function will also take care of freeing the aio slot,
5728 therefore no other thread is allowed to do the freeing!
5729 @param[in]	segment		The number of the segment in the aio arrays to
5730 				wait for; segment 0 is the ibuf I/O thread,
5731 				segment 1 the log I/O thread, then follow the
5732 				non-ibuf read threads, and as the last are the
5733 				non-ibuf write threads; if this is
5734 				ULINT_UNDEFINED, then it means that sync AIO
5735 				is used, and this parameter is ignored
5736 @param[out]	m1		the messages passed with the AIO request; note
5737 				that also in the case where the AIO operation
5738 				failed, these output parameters are valid and
5739 				can be used to restart the operation,
5740 				for example
5741 @param[out]	m2		callback message
5742 @param[out]	type		OS_FILE_WRITE or ..._READ
5743 @return DB_SUCCESS or error code */
5744 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)5745 os_aio_handler(
5746 	ulint		segment,
5747 	fil_node_t**	m1,
5748 	void**		m2,
5749 	IORequest*	request)
5750 {
5751 	dberr_t	err;
5752 
5753 	if (srv_use_native_aio) {
5754 		srv_set_io_thread_op_info(segment, "native aio handle");
5755 
5756 #ifdef WIN_ASYNC_IO
5757 
5758 		err = os_aio_windows_handler(segment, 0, m1, m2, request);
5759 
5760 #elif defined(LINUX_NATIVE_AIO)
5761 
5762 		err = os_aio_linux_handler(segment, m1, m2, request);
5763 
5764 #else
5765 		ut_error;
5766 
5767 		err = DB_ERROR; /* Eliminate compiler warning */
5768 
5769 #endif /* WIN_ASYNC_IO */
5770 
5771 	} else {
5772 		srv_set_io_thread_op_info(segment, "simulated aio handle");
5773 
5774 		err = os_aio_simulated_handler(segment, m1, m2, request);
5775 	}
5776 
5777 	return(err);
5778 }
5779 
#ifdef WIN_ASYNC_IO
/** Create a new I/O completion port that is not yet associated with
any file handle. Failure to create the port is asserted against.
@return the handle of the new completion port */
static HANDLE new_completion_port()
{
	HANDLE h = CreateIoCompletionPort(
		INVALID_HANDLE_VALUE, NULL, 0, 0);

	ut_a(h);

	return(h);
}
#endif /* WIN_ASYNC_IO */
5788 
5789 /** Constructor
5790 @param[in]	id		The latch ID
5791 @param[in]	n		Number of AIO slots
5792 @param[in]	segments	Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)5793 AIO::AIO(
5794 	latch_id_t	id,
5795 	ulint		n,
5796 	ulint		segments)
5797 	:
5798 	m_slots(n),
5799 	m_n_segments(segments),
5800 	m_n_reserved()
5801 # ifdef LINUX_NATIVE_AIO
5802 	,m_events(m_slots.size())
5803 # endif /* LINUX_NATIVE_AIO */
5804 #ifdef WIN_ASYNC_IO
5805 	,m_completion_port(new_completion_port())
5806 #endif
5807 {
5808 	ut_a(n > 0);
5809 	ut_a(m_n_segments > 0);
5810 
5811 	mutex_create(id, &m_mutex);
5812 
5813 	m_not_full = os_event_create("aio_not_full");
5814 	m_is_empty = os_event_create("aio_is_empty");
5815 
5816 	memset((void*)&m_slots[0], 0x0, sizeof(m_slots[0]) * m_slots.size());
5817 #ifdef LINUX_NATIVE_AIO
5818 	memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
5819 #endif /* LINUX_NATIVE_AIO */
5820 
5821 	os_event_set(m_is_empty);
5822 }
5823 
5824 /** Initialise the slots */
5825 dberr_t
init_slots()5826 AIO::init_slots()
5827 {
5828 	for (ulint i = 0; i < m_slots.size(); ++i) {
5829 		Slot&	slot = m_slots[i];
5830 
5831 		slot.pos = static_cast<uint16_t>(i);
5832 
5833 		slot.is_reserved = false;
5834 
5835 #ifdef WIN_ASYNC_IO
5836 
5837 		slot.array = this;
5838 
5839 #elif defined(LINUX_NATIVE_AIO)
5840 
5841 		slot.ret = 0;
5842 
5843 		slot.n_bytes = 0;
5844 
5845 		memset(&slot.control, 0x0, sizeof(slot.control));
5846 
5847 #endif /* WIN_ASYNC_IO */
5848 	}
5849 
5850 	return(DB_SUCCESS);
5851 }
5852 
5853 #ifdef LINUX_NATIVE_AIO
5854 /** Initialise the Linux Native AIO interface */
5855 dberr_t
init_linux_native_aio()5856 AIO::init_linux_native_aio()
5857 {
5858 
5859 	/* Initialize the io_context_t array. One io_context_t
5860 	per segment in the array. */
5861 	m_aio_ctx.resize(get_n_segments());
5862 
5863 	ulint		max_events = slots_per_segment();
5864 
5865 	for (std::vector<io_context_t>::iterator it = m_aio_ctx.begin(),
5866 						 end = m_aio_ctx.end();
5867 	     it != end; ++it) {
5868 
5869 		if (!linux_create_io_ctx(max_events, *it)) {
5870 			/* If something bad happened during aio setup
5871 			we disable linux native aio.
5872 			This frequently happens when running the test suite
5873 			with many threads on a system with low fs.aio-max-nr!
5874 			*/
5875 
5876 			ib::warn()
5877 				<< "Warning: Linux Native AIO disabled "
5878 				<< "because _linux_create_io_ctx() "
5879 				<< "failed. To get rid of this warning you can "
5880 				<< "try increasing system "
5881 				<< "fs.aio-max-nr to 1048576 or larger or "
5882 				<< "setting innodb_use_native_aio = 0 in my.cnf";
5883 
5884 			for (std::vector<io_context_t>::iterator it2
5885 			     = m_aio_ctx.begin();
5886 			     it2 != it; ++it2) {
5887 				int ret = io_destroy(*it2);
5888 				ut_a(ret != -EINVAL);
5889 			}
5890 
5891 			m_aio_ctx.clear();
5892 			srv_use_native_aio = FALSE;
5893 			return(DB_SUCCESS);
5894 		}
5895 	}
5896 
5897 	return(DB_SUCCESS);
5898 }
5899 #endif /* LINUX_NATIVE_AIO */
5900 
5901 /** Initialise the array */
5902 dberr_t
init()5903 AIO::init()
5904 {
5905 	ut_a(!m_slots.empty());
5906 
5907 
5908 	if (srv_use_native_aio) {
5909 #ifdef LINUX_NATIVE_AIO
5910 		dberr_t	err = init_linux_native_aio();
5911 
5912 		if (err != DB_SUCCESS) {
5913 			return(err);
5914 		}
5915 
5916 #endif /* LINUX_NATIVE_AIO */
5917 	}
5918 
5919 	return(init_slots());
5920 }
5921 
5922 /** Creates an aio wait array. Note that we return NULL in case of failure.
5923 We don't care about freeing memory here because we assume that a
5924 failure will result in server refusing to start up.
5925 @param[in]	id		Latch ID
5926 @param[in]	n		maximum number of pending AIO operations
5927 				allowed; n must be divisible by m_n_segments
5928 @param[in]	n_segments	number of segments in the AIO array
5929 @return own: AIO array, NULL on failure */
5930 AIO*
create(latch_id_t id,ulint n,ulint n_segments)5931 AIO::create(
5932 	latch_id_t	id,
5933 	ulint		n,
5934 	ulint		n_segments)
5935 {
5936 	if ((n % n_segments)) {
5937 
5938 		ib::error()
5939 			<< "Maximum number of AIO operations must be "
5940 			<< "divisible by number of segments";
5941 
5942 		return(NULL);
5943 	}
5944 
5945 	AIO*	array = UT_NEW_NOKEY(AIO(id, n, n_segments));
5946 
5947 	if (array != NULL && array->init() != DB_SUCCESS) {
5948 
5949 		UT_DELETE(array);
5950 
5951 		array = NULL;
5952 	}
5953 
5954 	return(array);
5955 }
5956 
5957 /** AIO destructor */
~AIO()5958 AIO::~AIO()
5959 {
5960 	mutex_destroy(&m_mutex);
5961 
5962 	os_event_destroy(m_not_full);
5963 	os_event_destroy(m_is_empty);
5964 
5965 #if defined(LINUX_NATIVE_AIO)
5966 	if (srv_use_native_aio) {
5967 		for (ulint i = 0; i < m_aio_ctx.size(); i++) {
5968 			int ret = io_destroy(m_aio_ctx[i]);
5969 			ut_a(ret != -EINVAL);
5970 		}
5971 	}
5972 #endif /* LINUX_NATIVE_AIO */
5973 #if defined(WIN_ASYNC_IO)
5974 	CloseHandle(m_completion_port);
5975 #endif
5976 }
5977 
/** Initializes the asynchronous io system. Creates one array each for ibuf
and log i/o. Also creates one array each for read and write where each
array is divided logically into n_readers and n_writers
respectively. The caller must create an i/o handler thread for each
segment in these arrays. This function also creates the sync array.
No i/o handler thread needs to be created for that.

The ibuf and log arrays are not created in read-only mode. On failure
the function returns at once; arrays that were already created are not
freed here.
@param[in]	n_per_seg	maximum number of pending aio
				operations allowed per segment
@param[in]	n_readers	number of reader threads
@param[in]	n_writers	number of writer threads
@param[in]	n_slots_sync	number of slots in the sync aio array
@return true if the AIO sub-system was started successfully */
bool
AIO::start(
	ulint		n_per_seg,
	ulint		n_readers,
	ulint		n_writers,
	ulint		n_slots_sync)
{
#if defined(LINUX_NATIVE_AIO)
	/* Check if native aio is supported on this system and tmpfs */
	if (srv_use_native_aio && !is_linux_native_aio_supported()) {

		ib::warn() << "Linux Native AIO disabled.";

		srv_use_native_aio = FALSE;
	}
#endif /* LINUX_NATIVE_AIO */

	srv_reset_io_thread_op_info();

	/* One segment of n_per_seg slots per reader thread. */
	s_reads = create(
		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);

	if (s_reads == NULL) {
		return(false);
	}

	/* The read segments follow the ibuf and log segments, which
	only exist when not in read-only mode. */
	ulint	start = srv_read_only_mode ? 0 : 2;
	ulint	n_segs = n_readers + start;

	/* 0 is the ibuf segment and 1 is the redo log segment. */
	for (ulint i = start; i < n_segs; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "read thread";
	}

	/* Running count of the segments of all arrays created here,
	except the sync array. */
	ulint	n_segments = n_readers;

	if (!srv_read_only_mode) {

		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);

		if (s_ibuf == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[0] = "insert buffer thread";

		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);

		if (s_log == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[1] = "log thread";

	} else {
		s_ibuf = s_log = NULL;
	}

	/* One segment of n_per_seg slots per writer thread. */
	s_writes = create(
		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);

	if (s_writes == NULL) {
		return(false);
	}

#ifdef WIN_ASYNC_IO
	/* Without a log array, log i/o shares the data completion port. */
	data_completion_port = s_writes->m_completion_port;
	log_completion_port =
		s_log ? s_log->m_completion_port : data_completion_port;
#endif

	n_segments += n_writers;

	/* The write segments come after all read segments. */
	for (ulint i = start + n_readers; i < n_segments; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "write thread";
	}

	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));

	/* The sync array is a single segment and is not counted in
	n_segments. */
	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);

	if (s_sync == NULL) {

		return(false);
	}

	os_aio_n_segments = n_segments;

	os_aio_validate();

	os_last_printout = time(NULL);

	/* This is tested only now, because creating the arrays above
	may have switched native AIO off. With native AIO, no
	per-segment wait events are allocated. */
	if (srv_use_native_aio) {
		return(true);
	}

	/* Simulated AIO: one wait event per segment. */
	os_aio_segment_wait_events = static_cast<os_event_t*>(
		ut_zalloc_nokey(
			n_segments * sizeof *os_aio_segment_wait_events));

	if (os_aio_segment_wait_events == NULL) {

		return(false);
	}

	for (ulint i = 0; i < n_segments; ++i) {
		os_aio_segment_wait_events[i] = os_event_create(0);
	}

	return(true);
}
6107 
6108 /** Free the AIO arrays */
6109 void
shutdown()6110 AIO::shutdown()
6111 {
6112 	UT_DELETE(s_ibuf);
6113 	s_ibuf = NULL;
6114 
6115 	UT_DELETE(s_log);
6116 	s_log = NULL;
6117 
6118 	UT_DELETE(s_writes);
6119 	s_writes = NULL;
6120 
6121 	UT_DELETE(s_sync);
6122 	s_sync = NULL;
6123 
6124 	UT_DELETE(s_reads);
6125 	s_reads = NULL;
6126 }
6127 
6128 /** Initializes the asynchronous io system. Creates one array each for ibuf
6129 and log i/o. Also creates one array each for read and write where each
6130 array is divided logically into n_readers and n_writers
6131 respectively. The caller must create an i/o handler thread for each
6132 segment in these arrays. This function also creates the sync array.
6133 No i/o handler thread needs to be created for that
6134 @param[in]	n_readers	number of reader threads
6135 @param[in]	n_writers	number of writer threads
6136 @param[in]	n_slots_sync	number of slots in the sync aio array */
6137 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6138 os_aio_init(
6139 	ulint		n_readers,
6140 	ulint		n_writers,
6141 	ulint		n_slots_sync)
6142 {
6143 	/* Maximum number of pending aio operations allowed per segment */
6144 	ulint		limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6145 
6146 	return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
6147 }
6148 
6149 /** Frees the asynchronous io system. */
6150 void
os_aio_free()6151 os_aio_free()
6152 {
6153 	AIO::shutdown();
6154 
6155 	ut_ad(!os_aio_segment_wait_events || !srv_use_native_aio);
6156 	ut_ad(srv_use_native_aio || os_aio_segment_wait_events
6157 	      || !srv_was_started);
6158 
6159 	if (!srv_use_native_aio && os_aio_segment_wait_events) {
6160 		for (ulint i = 0; i < os_aio_n_segments; i++) {
6161 			os_event_destroy(os_aio_segment_wait_events[i]);
6162 		}
6163 
6164 		ut_free(os_aio_segment_wait_events);
6165 		os_aio_segment_wait_events = 0;
6166 	}
6167 	os_aio_n_segments = 0;
6168 }
6169 
6170 /** Wakes up all async i/o threads so that they know to exit themselves in
6171 shutdown. */
6172 void
os_aio_wake_all_threads_at_shutdown()6173 os_aio_wake_all_threads_at_shutdown()
6174 {
6175 #ifdef WIN_ASYNC_IO
6176 	AIO::wake_at_shutdown();
6177 #elif defined(LINUX_NATIVE_AIO)
6178 	/* When using native AIO interface the io helper threads
6179 	wait on io_getevents with a timeout value of 500ms. At
6180 	each wake up these threads check the server status.
6181 	No need to do anything to wake them up. */
6182 #endif /* !WIN_ASYNC_AIO */
6183 
6184 	if (srv_use_native_aio) {
6185 		return;
6186 	}
6187 
6188 	/* This loop wakes up all simulated ai/o threads */
6189 
6190 	for (ulint i = 0; i < os_aio_n_segments; ++i) {
6191 
6192 		os_event_set(os_aio_segment_wait_events[i]);
6193 	}
6194 }
6195 
/** Waits until there are no pending writes in AIO::s_writes. There can
be other, synchronous, pending writes.
This is a free-function entry point that only forwards to
AIO::wait_until_no_pending_writes(). */
void
os_aio_wait_until_no_pending_writes()
{
	AIO::wait_until_no_pending_writes();
}
6203 
6204 /** Calculates segment number for a slot.
6205 @param[in]	array		AIO wait array
6206 @param[in]	slot		slot in this array
6207 @return segment number (which is the number used by, for example,
6208 	I/O-handler threads) */
6209 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)6210 AIO::get_segment_no_from_slot(
6211 	const AIO*	array,
6212 	const Slot*	slot)
6213 {
6214 	ulint	segment;
6215 	ulint	seg_len;
6216 
6217 	if (array == s_ibuf) {
6218 		ut_ad(!srv_read_only_mode);
6219 
6220 		segment = IO_IBUF_SEGMENT;
6221 
6222 	} else if (array == s_log) {
6223 		ut_ad(!srv_read_only_mode);
6224 
6225 		segment = IO_LOG_SEGMENT;
6226 
6227 	} else if (array == s_reads) {
6228 		seg_len = s_reads->slots_per_segment();
6229 
6230 		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6231 	} else {
6232 		ut_a(array == s_writes);
6233 
6234 		seg_len = s_writes->slots_per_segment();
6235 
6236 		segment = s_reads->m_n_segments
6237 			+ (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6238 	}
6239 
6240 	return(segment);
6241 }
6242 
6243 /** Requests for a slot in the aio array. If no slot is available, waits until
6244 not_full-event becomes signaled.
6245 
6246 @param[in]	type		IO context
6247 @param[in,out]	m1		message to be passed along with the AIO
6248 				operation
6249 @param[in,out]	m2		message to be passed along with the AIO
6250 				operation
6251 @param[in]	file		file handle
6252 @param[in]	name		name of the file or path as a NUL-terminated
6253 				string
6254 @param[in,out]	buf		buffer where to read or from which to write
6255 @param[in]	offset		file offset, where to read from or start writing
6256 @param[in]	len		length of the block to read or write
6257 @return pointer to slot */
6258 Slot*
reserve_slot(const IORequest & type,fil_node_t * m1,void * m2,pfs_os_file_t file,const char * name,void * buf,os_offset_t offset,ulint len)6259 AIO::reserve_slot(
6260 	const IORequest&	type,
6261 	fil_node_t*		m1,
6262 	void*			m2,
6263 	pfs_os_file_t		file,
6264 	const char*		name,
6265 	void*			buf,
6266 	os_offset_t		offset,
6267 	ulint			len)
6268 {
6269 	ut_ad(reinterpret_cast<size_t>(buf) % OS_FILE_LOG_BLOCK_SIZE == 0);
6270 	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
6271 	ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
6272 
6273 #ifdef WIN_ASYNC_IO
6274 	ut_a((len & 0xFFFFFFFFUL) == len);
6275 #endif /* WIN_ASYNC_IO */
6276 
6277 	/* No need of a mutex. Only reading constant fields */
6278 	ulint		slots_per_seg;
6279 
6280 	ut_ad(type.validate());
6281 
6282 	slots_per_seg = slots_per_segment();
6283 
6284 	/* We attempt to keep adjacent blocks in the same local
6285 	segment. This can help in merging IO requests when we are
6286 	doing simulated AIO */
6287 	ulint		local_seg;
6288 
6289 	local_seg = (offset >> (srv_page_size_shift + 6)) % m_n_segments;
6290 
6291 	for (;;) {
6292 
6293 		acquire();
6294 
6295 		if (m_n_reserved != m_slots.size()) {
6296 			break;
6297 		}
6298 
6299 		release();
6300 
6301 		if (!srv_use_native_aio) {
6302 			/* If the handler threads are suspended,
6303 			wake them so that we get more slots */
6304 
6305 			os_aio_simulated_wake_handler_threads();
6306 		}
6307 
6308 		os_event_wait(m_not_full);
6309 	}
6310 
6311 	ulint	counter = 0;
6312 	Slot*	slot = NULL;
6313 
6314 	/* We start our search for an available slot from our preferred
6315 	local segment and do a full scan of the array. We are
6316 	guaranteed to find a slot in full scan. */
6317 	for (ulint i = local_seg * slots_per_seg;
6318 	     counter < m_slots.size();
6319 	     ++i, ++counter) {
6320 
6321 		i %= m_slots.size();
6322 
6323 		slot = at(i);
6324 
6325 		if (slot->is_reserved == false) {
6326 			break;
6327 		}
6328 	}
6329 
6330 	/* We MUST always be able to get hold of a reserved slot. */
6331 	ut_a(counter < m_slots.size());
6332 
6333 	ut_a(slot->is_reserved == false);
6334 
6335 	++m_n_reserved;
6336 
6337 	if (m_n_reserved == 1) {
6338 		os_event_reset(m_is_empty);
6339 	}
6340 
6341 	if (m_n_reserved == m_slots.size()) {
6342 		os_event_reset(m_not_full);
6343 	}
6344 
6345 	slot->is_reserved = true;
6346 	slot->reservation_time = time(NULL);
6347 	slot->m1       = m1;
6348 	slot->m2       = m2;
6349 	slot->file     = file;
6350 	slot->name     = name;
6351 #ifdef _WIN32
6352 	slot->len      = static_cast<DWORD>(len);
6353 #else
6354 	slot->len      = len;
6355 #endif /* _WIN32 */
6356 	slot->type     = type;
6357 	slot->buf      = static_cast<byte*>(buf);
6358 	slot->ptr      = slot->buf;
6359 	slot->offset   = offset;
6360 	slot->err      = DB_SUCCESS;
6361 	slot->original_len = static_cast<uint32>(len);
6362 	slot->io_already_done = false;
6363 	slot->buf      = static_cast<byte*>(buf);
6364 
6365 #ifdef WIN_ASYNC_IO
6366 	{
6367 		OVERLAPPED*	control;
6368 
6369 		control = &slot->control;
6370 		control->Offset = (DWORD) offset & 0xFFFFFFFF;
6371 		control->OffsetHigh = (DWORD) (offset >> 32);
6372 	}
6373 #elif defined(LINUX_NATIVE_AIO)
6374 
6375 	/* If we are not using native AIO skip this part. */
6376 	if (srv_use_native_aio) {
6377 
6378 		off_t		aio_offset;
6379 
6380 		/* Check if we are dealing with 64 bit arch.
6381 		If not then make sure that offset fits in 32 bits. */
6382 		aio_offset = (off_t) offset;
6383 
6384 		ut_a(sizeof(aio_offset) >= sizeof(offset)
6385 		     || ((os_offset_t) aio_offset) == offset);
6386 
6387 		struct iocb*	iocb = &slot->control;
6388 
6389 		if (type.is_read()) {
6390 
6391 			io_prep_pread(
6392 				iocb, file, slot->ptr, slot->len, aio_offset);
6393 		} else {
6394 			ut_ad(type.is_write());
6395 
6396 			io_prep_pwrite(
6397 				iocb, file, slot->ptr, slot->len, aio_offset);
6398 		}
6399 
6400 		iocb->data = slot;
6401 
6402 		slot->n_bytes = 0;
6403 		slot->ret = 0;
6404 	}
6405 #endif /* LINUX_NATIVE_AIO */
6406 
6407 	release();
6408 
6409 	return(slot);
6410 }
6411 
6412 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
6413 @param[in]	global_segment	The number of the segment in the AIO arrays */
6414 void
wake_simulated_handler_thread(ulint global_segment)6415 AIO::wake_simulated_handler_thread(ulint global_segment)
6416 {
6417 	ut_ad(!srv_use_native_aio);
6418 
6419 	AIO*	array;
6420 	ulint	segment = get_array_and_local_segment(&array, global_segment);
6421 
6422 	array->wake_simulated_handler_thread(global_segment, segment);
6423 }
6424 
6425 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
6426 for a local segment in the AIO array.
6427 @param[in]	global_segment	The number of the segment in the AIO arrays
6428 @param[in]	segment		The local segment in the AIO array */
6429 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)6430 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
6431 {
6432 	ut_ad(!srv_use_native_aio);
6433 
6434 	ulint	n = slots_per_segment();
6435 	ulint	offset = segment * n;
6436 
6437 	/* Look through n slots after the segment * n'th slot */
6438 
6439 	acquire();
6440 
6441 	const Slot*	slot = at(offset);
6442 
6443 	for (ulint i = 0; i < n; ++i, ++slot) {
6444 
6445 		if (slot->is_reserved) {
6446 
6447 			/* Found an i/o request */
6448 
6449 			release();
6450 
6451 			os_event_t	event;
6452 
6453 			event = os_aio_segment_wait_events[global_segment];
6454 
6455 			os_event_set(event);
6456 
6457 			return;
6458 		}
6459 	}
6460 
6461 	release();
6462 }
6463 
6464 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
6465 void
os_aio_simulated_wake_handler_threads()6466 os_aio_simulated_wake_handler_threads()
6467 {
6468 	if (srv_use_native_aio) {
6469 		/* We do not use simulated aio: do nothing */
6470 
6471 		return;
6472 	}
6473 
6474 	os_aio_recommend_sleep_for_read_threads	= false;
6475 
6476 	for (ulint i = 0; i < os_aio_n_segments; i++) {
6477 		AIO::wake_simulated_handler_thread(i);
6478 	}
6479 }
6480 
6481 /** Select the IO slot array
6482 @param[in,out]	type		Type of IO, READ or WRITE
6483 @param[in]	read_only	true if running in read-only mode
6484 @param[in]	mode		IO mode
6485 @return slot array or NULL if invalid mode specified */
6486 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)6487 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
6488 {
6489 	AIO*	array;
6490 
6491 	ut_ad(type.validate());
6492 
6493 	switch (mode) {
6494 	case OS_AIO_NORMAL:
6495 
6496 		array = type.is_read() ? AIO::s_reads : AIO::s_writes;
6497 		break;
6498 
6499 	case OS_AIO_IBUF:
6500 		ut_ad(type.is_read());
6501 
6502 		/* Reduce probability of deadlock bugs in connection with ibuf:
6503 		do not let the ibuf i/o handler sleep */
6504 
6505 		type.clear_do_not_wake();
6506 
6507 		array = read_only ? AIO::s_reads : AIO::s_ibuf;
6508 		break;
6509 
6510 	case OS_AIO_LOG:
6511 
6512 		array = read_only ? AIO::s_reads : AIO::s_log;
6513 		break;
6514 
6515 	case OS_AIO_SYNC:
6516 
6517 		array = AIO::s_sync;
6518 #if defined(LINUX_NATIVE_AIO)
6519 		/* In Linux native AIO we don't use sync IO array. */
6520 		ut_a(!srv_use_native_aio);
6521 #endif /* LINUX_NATIVE_AIO */
6522 		break;
6523 
6524 	default:
6525 		ut_error;
6526 		array = NULL; /* Eliminate compiler warning */
6527 	}
6528 
6529 	return(array);
6530 }
6531 
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Blocks on the I/O completion port of the AIO array that owns the given
segment until a completion packet arrives.

- If the packet carries IOCP_SHUTDOWN_KEY, the shutdown message is reposted
  to the same port, *m1 and *m2 are set to NULL and DB_SUCCESS is returned.
- If the completed slot belongs to an array with a different completion
  port, the completion is forwarded to that port and the wait continues.
- Otherwise the slot is processed: a failed or short transfer is retried
  synchronously when os_file_handle_error() asks for it, post-processing is
  run on success, and the slot is released.

NOTE: this function takes care of freeing the aio slot, therefore no other
thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; must not be ULINT_UNDEFINED
@param[in]	pos		not referenced by this implementation
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	ut_a(segment != ULINT_UNDEFINED);

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());

	AIO*	my_array;
	AIO::get_array_and_local_segment(&my_array, segment);

	HANDLE	port = my_array->m_completion_port;
	ut_ad(port);

	Slot*		slot = NULL;
	BOOL		ret;
	ULONG_PTR	key;

	for (;;) {
		DWORD	len;

		ret = GetQueuedCompletionStatus(
			port, &len, &key,
			(OVERLAPPED **)&slot, INFINITE);

		if (ret && key == IOCP_SHUTDOWN_KEY) {
			/* Shutdown requested: pass the message on to the
			next waiter on this port and exit. */
			PostQueuedCompletionStatus(port, 0, key, NULL);
			*m1 = NULL;
			*m2 = NULL;
			return(DB_SUCCESS);
		}

		ut_a(slot);

		if (!ret) {
			/* IO failed */
			break;
		}

		slot->n_bytes = len;
		ut_a(slot->array);

		HANDLE	slot_port = slot->array->m_completion_port;

		if (slot_port == port) {
			/* The completion belongs to this thread's array. */
			break;
		}

		/* there are no redirections between data and log */
		ut_ad(port == data_completion_port);
		ut_ad(slot_port != log_completion_port);

		/*
		Redirect completions to the dedicated completion port
		and threads.

		"Write array" threads receive write, read and ibuf
		notifications; read and ibuf completions are redirected.

		Forwarding IO completion this way costs a context switch,
		and this seems tolerable since asynchronous reads are by
		far less frequent.
		*/
		ut_a(PostQueuedCompletionStatus(
			     slot_port, len, key, &slot->control));
	}

	ut_a(slot->is_reserved);

	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	dberr_t	err;

	if (ret && slot->n_bytes == slot->len) {

		err = DB_SUCCESS;

	} else if (!os_file_handle_error(slot->name, "Windows aio")) {

		err = DB_IO_ERROR;

	} else {
		/* Retry failed read/write operation synchronously. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		PSI_file_locker_state	state;
		struct PSI_file_locker* locker = NULL;

		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	slot->array->release_with_mutex(slot);

	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && !buf_page_cleaner_is_active
	    && os_aio_all_slots_free()) {
		/* Last IO, wakeup other io threads */
		AIO::wake_at_shutdown();
	}

	return(err);
}
#endif /* WIN_ASYNC_IO */
6697 
6698 /**
6699 NOTE! Use the corresponding macro os_aio(), not directly this function!
6700 Requests an asynchronous i/o operation.
6701 @param[in,out]	type		IO request context
6702 @param[in]	mode		IO mode
6703 @param[in]	name		Name of the file or path as NUL terminated
6704 				string
6705 @param[in]	file		Open file handle
6706 @param[out]	buf		buffer where to read
6707 @param[in]	offset		file offset where to read
6708 @param[in]	n		number of bytes to read
6709 @param[in]	read_only	if true read only mode checks are enforced
6710 @param[in,out]	m1		Message for the AIO handler, (can be used to
6711 				identify a completed AIO operation); ignored
6712 				if mode is OS_AIO_SYNC
6713 @param[in,out]	m2		message for the AIO handler (can be used to
6714 				identify a completed AIO operation); ignored
6715 				if mode is OS_AIO_SYNC
6716 
6717 @return DB_SUCCESS or error code */
6718 dberr_t
os_aio_func(IORequest & type,ulint mode,const char * name,pfs_os_file_t file,void * buf,os_offset_t offset,ulint n,bool read_only,fil_node_t * m1,void * m2)6719 os_aio_func(
6720 	IORequest&	type,
6721 	ulint		mode,
6722 	const char*	name,
6723 	pfs_os_file_t	file,
6724 	void*		buf,
6725 	os_offset_t	offset,
6726 	ulint		n,
6727 	bool		read_only,
6728 	fil_node_t*	m1,
6729 	void*		m2)
6730 {
6731 #ifdef WIN_ASYNC_IO
6732 	BOOL		ret = TRUE;
6733 #endif /* WIN_ASYNC_IO */
6734 
6735 	ut_ad(n > 0);
6736 	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
6737 	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
6738 	ut_ad(os_aio_validate_skip());
6739 
6740 #ifdef WIN_ASYNC_IO
6741 	ut_ad((n & 0xFFFFFFFFUL) == n);
6742 #endif /* WIN_ASYNC_IO */
6743 
6744 	DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
6745 			mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
6746 
6747 	if (mode == OS_AIO_SYNC) {
6748 		if (type.is_read()) {
6749 			return(os_file_read_func(type, file, buf, offset, n));
6750 		}
6751 
6752 		ut_ad(type.is_write());
6753 
6754 		return(os_file_write_func(type, name, file, buf, offset, n));
6755 	}
6756 
6757 try_again:
6758 
6759 	AIO*	array;
6760 
6761 	array = AIO::select_slot_array(type, read_only, mode);
6762 
6763 	Slot*	slot;
6764 
6765 	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);
6766 
6767 	if (type.is_read()) {
6768 
6769 
6770 		if (srv_use_native_aio) {
6771 
6772 			++os_n_file_reads;
6773 
6774 			os_bytes_read_since_printout += n;
6775 #ifdef WIN_ASYNC_IO
6776 			ret = ReadFile(
6777 				file, slot->ptr, slot->len,
6778 				NULL, &slot->control);
6779 #elif defined(LINUX_NATIVE_AIO)
6780 			if (!array->linux_dispatch(slot)) {
6781 				goto err_exit;
6782 			}
6783 #endif /* WIN_ASYNC_IO */
6784 		} else if (type.is_wake()) {
6785 			AIO::wake_simulated_handler_thread(
6786 				AIO::get_segment_no_from_slot(array, slot));
6787 		}
6788 	} else if (type.is_write()) {
6789 
6790 		if (srv_use_native_aio) {
6791 			++os_n_file_writes;
6792 
6793 #ifdef WIN_ASYNC_IO
6794 			ret = WriteFile(
6795 				file, slot->ptr, slot->len,
6796 				NULL, &slot->control);
6797 #elif defined(LINUX_NATIVE_AIO)
6798 			if (!array->linux_dispatch(slot)) {
6799 				goto err_exit;
6800 			}
6801 #endif /* WIN_ASYNC_IO */
6802 
6803 		} else if (type.is_wake()) {
6804 			AIO::wake_simulated_handler_thread(
6805 				AIO::get_segment_no_from_slot(array, slot));
6806 		}
6807 	} else {
6808 		ut_error;
6809 	}
6810 
6811 #ifdef WIN_ASYNC_IO
6812 	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
6813 		/* aio completed or was queued successfully! */
6814 		return(DB_SUCCESS);
6815 	}
6816 
6817 	goto err_exit;
6818 
6819 #endif /* WIN_ASYNC_IO */
6820 
6821 	/* AIO request was queued successfully! */
6822 	return(DB_SUCCESS);
6823 
6824 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
6825 err_exit:
6826 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
6827 
6828 	array->release_with_mutex(slot);
6829 
6830 	if (os_file_handle_error(
6831 		name, type.is_read() ? "aio read" : "aio write")) {
6832 
6833 		goto try_again;
6834 	}
6835 
6836 	return(DB_IO_ERROR);
6837 }
6838 
6839 /** Simulated AIO handler for reaping IO requests */
6840 class SimulatedAIOHandler {
6841 
6842 public:
6843 
6844 	/** Constructor
6845 	@param[in,out]	array	The AIO array
6846 	@param[in]	segment	Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)6847 	SimulatedAIOHandler(AIO* array, ulint segment)
6848 		:
6849 		m_oldest(),
6850 		m_n_elems(),
6851 		m_lowest_offset(IB_UINT64_MAX),
6852 		m_array(array),
6853 		m_n_slots(),
6854 		m_segment(segment),
6855 		m_ptr(),
6856 		m_buf()
6857 	{
6858 		ut_ad(m_segment < 100);
6859 
6860 		m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
6861 	}
6862 
6863 	/** Destructor */
~SimulatedAIOHandler()6864 	~SimulatedAIOHandler()
6865 	{
6866 		if (m_ptr != NULL) {
6867 			ut_free(m_ptr);
6868 		}
6869 	}
6870 
6871 	/** Reset the state of the handler
6872 	@param[in]	n_slots	Number of pending AIO operations supported */
init(ulint n_slots)6873 	void init(ulint n_slots)
6874 	{
6875 		m_oldest = 0;
6876 		m_n_elems = 0;
6877 		m_n_slots = n_slots;
6878 		m_lowest_offset = IB_UINT64_MAX;
6879 
6880 		if (m_ptr != NULL) {
6881 			ut_free(m_ptr);
6882 			m_ptr = m_buf = NULL;
6883 		}
6884 
6885 		m_slots[0] = NULL;
6886 	}
6887 
6888 	/** Check if there is a slot for which the i/o has already been done
6889 	@param[out]	n_reserved	Number of reserved slots
6890 	@return the first completed slot that is found. */
check_completed(ulint * n_reserved)6891 	Slot* check_completed(ulint* n_reserved)
6892 	{
6893 		ulint	offset = m_segment * m_n_slots;
6894 
6895 		*n_reserved = 0;
6896 
6897 		Slot*	slot;
6898 
6899 		slot = m_array->at(offset);
6900 
6901 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6902 
6903 			if (slot->is_reserved) {
6904 
6905 				if (slot->io_already_done) {
6906 
6907 					ut_a(slot->is_reserved);
6908 
6909 					return(slot);
6910 				}
6911 
6912 				++*n_reserved;
6913 			}
6914 		}
6915 
6916 		return(NULL);
6917 	}
6918 
6919 	/** If there are at least 2 seconds old requests, then pick the
6920 	oldest one to prevent starvation.  If several requests have the
6921 	same age, then pick the one at the lowest offset.
6922 	@return true if request was selected */
select()6923 	bool select()
6924 	{
6925 		if (!select_oldest()) {
6926 
6927 			return(select_lowest_offset());
6928 		}
6929 
6930 		return(true);
6931 	}
6932 
6933 	/** Check if there are several consecutive blocks
6934 	to read or write. Merge them if found. */
merge()6935 	void merge()
6936 	{
6937 		/* if m_n_elems != 0, then we have assigned
6938 		something valid to consecutive_ios[0] */
6939 		ut_ad(m_n_elems != 0);
6940 		ut_ad(first_slot() != NULL);
6941 
6942 		Slot*	slot = first_slot();
6943 
6944 		while (!merge_adjacent(slot)) {
6945 			/* No op */
6946 		}
6947 	}
6948 
6949 	/** We have now collected n_consecutive I/O requests
6950 	in the array; allocate a single buffer which can hold
6951 	all data, and perform the I/O
6952 	@return the length of the buffer */
allocate_buffer()6953 	ulint allocate_buffer()
6954 		MY_ATTRIBUTE((warn_unused_result))
6955 	{
6956 		ulint	len;
6957 		Slot*	slot = first_slot();
6958 
6959 		ut_ad(m_ptr == NULL);
6960 
6961 		if (slot->type.is_read() && m_n_elems > 1) {
6962 
6963 			len = 0;
6964 
6965 			for (ulint i = 0; i < m_n_elems; ++i) {
6966 				len += m_slots[i]->len;
6967 			}
6968 
6969 			m_ptr = static_cast<byte*>(
6970 				ut_malloc_nokey(len + srv_page_size));
6971 
6972 			m_buf = static_cast<byte*>(
6973 				ut_align(m_ptr, srv_page_size));
6974 
6975 		} else {
6976 			len = first_slot()->len;
6977 			m_buf = first_slot()->buf;
6978 		}
6979 
6980 		return(len);
6981 	}
6982 
6983 	/** We have to compress the individual pages and punch
6984 	holes in them on a page by page basis when writing to
6985 	tables that can be compresed at the IO level.
6986 	@param[in]	len		Value returned by allocate_buffer */
copy_to_buffer(ulint len)6987 	void copy_to_buffer(ulint len)
6988 	{
6989 		Slot*	slot = first_slot();
6990 
6991 		if (len > slot->len && slot->type.is_write()) {
6992 
6993 			byte*	ptr = m_buf;
6994 
6995 			ut_ad(ptr != slot->buf);
6996 
6997 			/* Copy the buffers to the combined buffer */
6998 			for (ulint i = 0; i < m_n_elems; ++i) {
6999 
7000 				slot = m_slots[i];
7001 
7002 				memmove(ptr, slot->buf, slot->len);
7003 
7004 				ptr += slot->len;
7005 			}
7006 		}
7007 	}
7008 
7009 	/** Do the I/O with ordinary, synchronous i/o functions:
7010 	@param[in]	len		Length of buffer for IO */
io()7011 	void io()
7012 	{
7013 		if (first_slot()->type.is_write()) {
7014 
7015 			for (ulint i = 0; i < m_n_elems; ++i) {
7016 				write(m_slots[i]);
7017 			}
7018 
7019 		} else {
7020 
7021 			for (ulint i = 0; i < m_n_elems; ++i) {
7022 				read(m_slots[i]);
7023 			}
7024 		}
7025 	}
7026 
7027 	/** Mark the i/os done in slots */
done()7028 	void done()
7029 	{
7030 		for (ulint i = 0; i < m_n_elems; ++i) {
7031 			m_slots[i]->io_already_done = true;
7032 		}
7033 	}
7034 
7035 	/** @return the first slot in the consecutive array */
first_slot()7036 	Slot* first_slot()
7037 		MY_ATTRIBUTE((warn_unused_result))
7038 	{
7039 		ut_a(m_n_elems > 0);
7040 
7041 		return(m_slots[0]);
7042 	}
7043 
7044 	/** Wait for I/O requests
7045 	@param[in]	global_segment	The global segment
7046 	@param[in,out]	event		Wait on event if no active requests
7047 	@return the number of slots */
7048 	ulint check_pending(
7049 		ulint		global_segment,
7050 		os_event_t	event)
7051 		MY_ATTRIBUTE((warn_unused_result));
7052 private:
7053 
7054 	/** Do the file read
7055 	@param[in,out]	slot		Slot that has the IO context */
read(Slot * slot)7056 	void read(Slot* slot)
7057 	{
7058 		dberr_t	err = os_file_read(
7059 			slot->type,
7060 			slot->file,
7061 			slot->ptr,
7062 			slot->offset,
7063 			slot->len);
7064 
7065 		ut_a(err == DB_SUCCESS);
7066 	}
7067 
7068 	/** Do the file read
7069 	@param[in,out]	slot		Slot that has the IO context */
write(Slot * slot)7070 	void write(Slot* slot)
7071 	{
7072 		dberr_t	err = os_file_write(
7073 			slot->type,
7074 			slot->name,
7075 			slot->file,
7076 			slot->ptr,
7077 			slot->offset,
7078 			slot->len);
7079 
7080 		ut_a(err == DB_SUCCESS);
7081 	}
7082 
7083 	/** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7084 	bool adjacent(const Slot* s1, const Slot* s2) const
7085 	{
7086 		return(s1 != s2
7087 		       && s1->file == s2->file
7088 		       && s2->offset == s1->offset + s1->len
7089 		       && s1->type == s2->type);
7090 	}
7091 
7092 	/** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7093 	bool merge_adjacent(Slot*& current)
7094 	{
7095 		Slot*	slot;
7096 		ulint	offset = m_segment * m_n_slots;
7097 
7098 		slot = m_array->at(offset);
7099 
7100 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7101 
7102 			if (slot->is_reserved && adjacent(current, slot)) {
7103 
7104 				current = slot;
7105 
7106 				/* Found a consecutive i/o request */
7107 
7108 				m_slots[m_n_elems] = slot;
7109 
7110 				++m_n_elems;
7111 
7112 				return(m_n_elems >= m_slots.capacity());
7113 			}
7114 		}
7115 
7116 		return(true);
7117 	}
7118 
7119 	/** There were no old requests. Look for an I/O request at the lowest
7120 	offset in the array (we ignore the high 32 bits of the offset in these
7121 	heuristics) */
select_lowest_offset()7122 	bool select_lowest_offset()
7123 	{
7124 		ut_ad(m_n_elems == 0);
7125 
7126 		ulint	offset = m_segment * m_n_slots;
7127 
7128 		m_lowest_offset = IB_UINT64_MAX;
7129 
7130 		for (ulint i = 0; i < m_n_slots; ++i) {
7131 			Slot*	slot;
7132 
7133 			slot = m_array->at(i + offset);
7134 
7135 			if (slot->is_reserved
7136 			    && slot->offset < m_lowest_offset) {
7137 
7138 				/* Found an i/o request */
7139 				m_slots[0] = slot;
7140 
7141 				m_n_elems = 1;
7142 
7143 				m_lowest_offset = slot->offset;
7144 			}
7145 		}
7146 
7147 		return(m_n_elems > 0);
7148 	}
7149 
7150 	/** Select the slot if it is older than the current oldest slot.
7151 	@param[in]	slot		The slot to check */
select_if_older(Slot * slot)7152 	void select_if_older(Slot* slot)
7153 	{
7154 		ulint	age;
7155 
7156 		age = (ulint) difftime(time(NULL), slot->reservation_time);
7157 
7158 		if ((age >= 2 && age > m_oldest)
7159 		    || (age >= 2
7160 			&& age == m_oldest
7161 			&& slot->offset < m_lowest_offset)) {
7162 
7163 			/* Found an i/o request */
7164 			m_slots[0] = slot;
7165 
7166 			m_n_elems = 1;
7167 
7168 			m_oldest = age;
7169 
7170 			m_lowest_offset = slot->offset;
7171 		}
7172 	}
7173 
7174 	/** Select th oldest slot in the array
7175 	@return true if oldest slot found */
select_oldest()7176 	bool select_oldest()
7177 	{
7178 		ut_ad(m_n_elems == 0);
7179 
7180 		Slot*	slot;
7181 		ulint	offset = m_n_slots * m_segment;
7182 
7183 		slot = m_array->at(offset);
7184 
7185 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7186 
7187 			if (slot->is_reserved) {
7188 				select_if_older(slot);
7189 			}
7190 		}
7191 
7192 		return(m_n_elems > 0);
7193 	}
7194 
7195 	typedef std::vector<Slot*> slots_t;
7196 
7197 private:
7198 	ulint		m_oldest;
7199 	ulint		m_n_elems;
7200 	os_offset_t	m_lowest_offset;
7201 
7202 	AIO*		m_array;
7203 	ulint		m_n_slots;
7204 	ulint		m_segment;
7205 
7206 	slots_t		m_slots;
7207 
7208 	byte*		m_ptr;
7209 	byte*		m_buf;
7210 };
7211 
7212 /** Wait for I/O requests
7213 @return the number of slots */
7214 ulint
check_pending(ulint global_segment,os_event_t event)7215 SimulatedAIOHandler::check_pending(
7216 	ulint		global_segment,
7217 	os_event_t	event)
7218 {
7219 	/* NOTE! We only access constant fields in os_aio_array.
7220 	Therefore we do not have to acquire the protecting mutex yet */
7221 
7222 	ut_ad(os_aio_validate_skip());
7223 
7224 	ut_ad(m_segment < m_array->get_n_segments());
7225 
7226 	/* Look through n slots after the segment * n'th slot */
7227 
7228 	if (AIO::is_read(m_array)
7229 	    && os_aio_recommend_sleep_for_read_threads) {
7230 
7231 		/* Give other threads chance to add several
7232 		I/Os to the array at once. */
7233 
7234 		srv_set_io_thread_op_info(
7235 			global_segment, "waiting for i/o request");
7236 
7237 		os_event_wait(event);
7238 
7239 		return(0);
7240 	}
7241 
7242 	return(m_array->slots_per_segment());
7243 }
7244 
7245 /** Does simulated AIO. This function should be called by an i/o-handler
7246 thread.
7247 
7248 @param[in]	segment	The number of the segment in the aio arrays to wait
7249 			for; segment 0 is the ibuf i/o thread, segment 1 the
7250 			log i/o thread, then follow the non-ibuf read threads,
7251 			and as the last are the non-ibuf write threads
7252 @param[out]	m1	the messages passed with the AIO request; note that
7253 			also in the case where the AIO operation failed, these
7254 			output parameters are valid and can be used to restart
7255 			the operation, for example
7256 @param[out]	m2	Callback argument
7257 @param[in]	type	IO context
7258 @return DB_SUCCESS or error code */
7259 static
7260 dberr_t
os_aio_simulated_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * type)7261 os_aio_simulated_handler(
7262 	ulint		global_segment,
7263 	fil_node_t**	m1,
7264 	void**		m2,
7265 	IORequest*	type)
7266 {
7267 	Slot*		slot;
7268 	AIO*		array;
7269 	ulint		segment;
7270 	os_event_t	event = os_aio_segment_wait_events[global_segment];
7271 
7272 	segment = AIO::get_array_and_local_segment(&array, global_segment);
7273 
7274 	SimulatedAIOHandler	handler(array, segment);
7275 
7276 	for (;;) {
7277 
7278 		srv_set_io_thread_op_info(
7279 			global_segment, "looking for i/o requests (a)");
7280 
7281 		ulint	n_slots = handler.check_pending(global_segment, event);
7282 
7283 		if (n_slots == 0) {
7284 			continue;
7285 		}
7286 
7287 		handler.init(n_slots);
7288 
7289 		srv_set_io_thread_op_info(
7290 			global_segment, "looking for i/o requests (b)");
7291 
7292 		array->acquire();
7293 
7294 		ulint	n_reserved;
7295 
7296 		slot = handler.check_completed(&n_reserved);
7297 
7298 		if (slot != NULL) {
7299 
7300 			break;
7301 
7302 		} else if (n_reserved == 0
7303 			   && !buf_page_cleaner_is_active
7304 			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
7305 
7306 			/* There is no completed request. If there
7307 			are no pending request at all, and the system
7308 			is being shut down, exit. */
7309 
7310 			array->release();
7311 
7312 			*m1 = NULL;
7313 
7314 			*m2 = NULL;
7315 
7316 			return(DB_SUCCESS);
7317 
7318 		} else if (handler.select()) {
7319 
7320 			break;
7321 		}
7322 
7323 		/* No I/O requested at the moment */
7324 
7325 		srv_set_io_thread_op_info(
7326 			global_segment, "resetting wait event");
7327 
7328 		/* We wait here until tbere are more IO requests
7329 		for this segment. */
7330 
7331 		os_event_reset(event);
7332 
7333 		array->release();
7334 
7335 		srv_set_io_thread_op_info(
7336 			global_segment, "waiting for i/o request");
7337 
7338 		os_event_wait(event);
7339 	}
7340 
7341 	/** Found a slot that has already completed its IO */
7342 
7343 	if (slot == NULL) {
7344 		/* Merge adjacent requests */
7345 		handler.merge();
7346 
7347 		/* Check if there are several consecutive blocks
7348 		to read or write */
7349 
7350 		srv_set_io_thread_op_info(
7351 			global_segment, "consecutive i/o requests");
7352 
7353 		// Note: We don't support write combining for simulated AIO.
7354 		//ulint	total_len = handler.allocate_buffer();
7355 
7356 		/* We release the array mutex for the time of the I/O: NOTE that
7357 		this assumes that there is just one i/o-handler thread serving
7358 		a single segment of slots! */
7359 
7360 		array->release();
7361 
7362 		// Note: We don't support write combining for simulated AIO.
7363 		//handler.copy_to_buffer(total_len);
7364 
7365 		srv_set_io_thread_op_info(global_segment, "doing file i/o");
7366 
7367 		handler.io();
7368 
7369 		srv_set_io_thread_op_info(global_segment, "file i/o done");
7370 
7371 		array->acquire();
7372 
7373 		handler.done();
7374 
7375 		/* We return the messages for the first slot now, and if there
7376 		were several slots, the messages will be returned with
7377 		subsequent calls of this function */
7378 
7379 		slot = handler.first_slot();
7380 	}
7381 
7382 	ut_ad(slot->is_reserved);
7383 
7384 	*m1 = slot->m1;
7385 	*m2 = slot->m2;
7386 
7387 	*type = slot->type;
7388 
7389 	array->release(slot);
7390 
7391 	array->release();
7392 
7393 	return(DB_SUCCESS);
7394 }
7395 
7396 /** Get the total number of pending IOs
7397 @return the total number of pending IOs */
7398 ulint
total_pending_io_count()7399 AIO::total_pending_io_count()
7400 {
7401 	ulint	count = s_reads->pending_io_count();
7402 
7403 	if (s_writes != NULL) {
7404 		count += s_writes->pending_io_count();
7405 	}
7406 
7407 	if (s_ibuf != NULL) {
7408 		count += s_ibuf->pending_io_count();
7409 	}
7410 
7411 	if (s_log != NULL) {
7412 		count += s_log->pending_io_count();
7413 	}
7414 
7415 	if (s_sync != NULL) {
7416 		count += s_sync->pending_io_count();
7417 	}
7418 
7419 	return(count);
7420 }
7421 
7422 /** Validates the consistency the aio system.
7423 @return true if ok */
7424 static
7425 bool
os_aio_validate()7426 os_aio_validate()
7427 {
7428 	/* The methods countds and validates, we ignore the count. */
7429 	AIO::total_pending_io_count();
7430 
7431 	return(true);
7432 }
7433 
7434 /** Prints pending IO requests per segment of an aio array.
7435 We probably don't need per segment statistics but they can help us
7436 during development phase to see if the IO requests are being
7437 distributed as expected.
7438 @param[in,out]	file		File where to print
7439 @param[in]	segments	Pending IO array */
7440 void
print_segment_info(FILE * file,const ulint * segments)7441 AIO::print_segment_info(
7442 	FILE*		file,
7443 	const ulint*	segments)
7444 {
7445 	ut_ad(m_n_segments > 0);
7446 
7447 	if (m_n_segments > 1) {
7448 
7449 		fprintf(file, " [");
7450 
7451 		for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
7452 
7453 			if (i != 0) {
7454 				fprintf(file, ", ");
7455 			}
7456 
7457 			fprintf(file, ULINTPF, *segments);
7458 		}
7459 
7460 		fprintf(file, "] ");
7461 	}
7462 }
7463 
7464 /** Prints info about the aio array.
7465 @param[in,out]	file		Where to print */
7466 void
print(FILE * file)7467 AIO::print(FILE* file)
7468 {
7469 	ulint	count = 0;
7470 	ulint	n_res_seg[SRV_MAX_N_IO_THREADS];
7471 
7472 	mutex_enter(&m_mutex);
7473 
7474 	ut_a(!m_slots.empty());
7475 	ut_a(m_n_segments > 0);
7476 
7477 	memset(n_res_seg, 0x0, sizeof(n_res_seg));
7478 
7479 	for (ulint i = 0; i < m_slots.size(); ++i) {
7480 		Slot&	slot = m_slots[i];
7481 		ulint	segment = (i * m_n_segments) / m_slots.size();
7482 
7483 		if (slot.is_reserved) {
7484 
7485 			++count;
7486 
7487 			++n_res_seg[segment];
7488 
7489 			ut_a(slot.len > 0);
7490 		}
7491 	}
7492 
7493 	ut_a(m_n_reserved == count);
7494 
7495 	print_segment_info(file, n_res_seg);
7496 
7497 	mutex_exit(&m_mutex);
7498 }
7499 
7500 /** Print all the AIO segments
7501 @param[in,out]	file		Where to print */
7502 void
print_all(FILE * file)7503 AIO::print_all(FILE* file)
7504 {
7505 	s_reads->print(file);
7506 
7507 	if (s_writes != NULL) {
7508 		fputs(", aio writes:", file);
7509 		s_writes->print(file);
7510 	}
7511 
7512 	if (s_ibuf != NULL) {
7513 		fputs(",\n ibuf aio reads:", file);
7514 		s_ibuf->print(file);
7515 	}
7516 
7517 	if (s_log != NULL) {
7518 		fputs(", log i/o's:", file);
7519 		s_log->print(file);
7520 	}
7521 
7522 	if (s_sync != NULL) {
7523 		fputs(", sync i/o's:", file);
7524 		s_sync->print(file);
7525 	}
7526 }
7527 
7528 /** Prints info of the aio arrays.
7529 @param[in,out]	file		file where to print */
7530 void
os_aio_print(FILE * file)7531 os_aio_print(FILE*	file)
7532 {
7533 	time_t		current_time;
7534 	double		time_elapsed;
7535 	double		avg_bytes_read;
7536 
7537 	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
7538 		fprintf(file, "I/O thread " ULINTPF " state: %s (%s)",
7539 			i,
7540 			srv_io_thread_op_info[i],
7541 			srv_io_thread_function[i]);
7542 
7543 #ifndef _WIN32
7544 		if (!srv_use_native_aio
7545 		    && os_event_is_set(os_aio_segment_wait_events[i])) {
7546 			fprintf(file, " ev set");
7547 		}
7548 #endif /* _WIN32 */
7549 
7550 		fprintf(file, "\n");
7551 	}
7552 
7553 	fputs("Pending normal aio reads:", file);
7554 
7555 	AIO::print_all(file);
7556 
7557 	putc('\n', file);
7558 	current_time = time(NULL);
7559 	time_elapsed = 0.001 + difftime(current_time, os_last_printout);
7560 
7561 	fprintf(file,
7562 		"Pending flushes (fsync) log: " ULINTPF
7563 		"; buffer pool: " ULINTPF "\n"
7564 		ULINTPF " OS file reads, "
7565 		ULINTPF " OS file writes, "
7566 		ULINTPF " OS fsyncs\n",
7567 		fil_n_pending_log_flushes,
7568 		fil_n_pending_tablespace_flushes,
7569 		os_n_file_reads,
7570 		os_n_file_writes,
7571 		os_n_fsyncs);
7572 
7573 	const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
7574 	const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
7575 
7576 	if (n_reads != 0 || n_writes != 0) {
7577 		fprintf(file,
7578 			ULINTPF " pending reads, " ULINTPF " pending writes\n",
7579 			n_reads, n_writes);
7580 	}
7581 
7582 	if (os_n_file_reads == os_n_file_reads_old) {
7583 		avg_bytes_read = 0.0;
7584 	} else {
7585 		avg_bytes_read = (double) os_bytes_read_since_printout
7586 			/ (os_n_file_reads - os_n_file_reads_old);
7587 	}
7588 
7589 	fprintf(file,
7590 		"%.2f reads/s, " ULINTPF " avg bytes/read,"
7591 		" %.2f writes/s, %.2f fsyncs/s\n",
7592 		(os_n_file_reads - os_n_file_reads_old)
7593 		/ time_elapsed,
7594 		(ulint) avg_bytes_read,
7595 		(os_n_file_writes - os_n_file_writes_old)
7596 		/ time_elapsed,
7597 		(os_n_fsyncs - os_n_fsyncs_old)
7598 		/ time_elapsed);
7599 
7600 	os_n_file_reads_old = os_n_file_reads;
7601 	os_n_file_writes_old = os_n_file_writes;
7602 	os_n_fsyncs_old = os_n_fsyncs;
7603 	os_bytes_read_since_printout = 0;
7604 
7605 	os_last_printout = current_time;
7606 }
7607 
7608 /** Refreshes the statistics used to print per-second averages. */
7609 void
os_aio_refresh_stats()7610 os_aio_refresh_stats()
7611 {
7612 	os_n_fsyncs_old = os_n_fsyncs;
7613 
7614 	os_bytes_read_since_printout = 0;
7615 
7616 	os_n_file_reads_old = os_n_file_reads;
7617 
7618 	os_n_file_writes_old = os_n_file_writes;
7619 
7620 	os_n_fsyncs_old = os_n_fsyncs;
7621 
7622 	os_bytes_read_since_printout = 0;
7623 
7624 	os_last_printout = time(NULL);
7625 }
7626 
7627 /** Checks that all slots in the system have been freed, that is, there are
7628 no pending io operations.
7629 @return true if all free */
7630 bool
os_aio_all_slots_free()7631 os_aio_all_slots_free()
7632 {
7633 	return(AIO::total_pending_io_count() == 0);
7634 }
7635 
7636 #ifdef UNIV_DEBUG
7637 /** Prints all pending IO for the array
7638 @param[in]	file	file where to print
7639 @param[in]	array	array to process */
7640 void
to_file(FILE * file) const7641 AIO::to_file(FILE* file) const
7642 {
7643 	acquire();
7644 
7645 	fprintf(file, " " ULINTPF "\n", m_n_reserved);
7646 
7647 	for (ulint i = 0; i < m_slots.size(); ++i) {
7648 
7649 		const Slot&	slot = m_slots[i];
7650 
7651 		if (slot.is_reserved) {
7652 
7653 			fprintf(file,
7654 				"%s IO for %s (offset=" UINT64PF
7655 				", size=%lu)\n",
7656 				slot.type.is_read() ? "read" : "write",
7657 				slot.name, slot.offset, (unsigned long)(slot.len));
7658 		}
7659 	}
7660 
7661 	release();
7662 }
7663 
7664 /** Print pending IOs for all arrays */
7665 void
print_to_file(FILE * file)7666 AIO::print_to_file(FILE* file)
7667 {
7668 	fprintf(file, "Pending normal aio reads:");
7669 
7670 	s_reads->to_file(file);
7671 
7672 	if (s_writes != NULL) {
7673 		fprintf(file, "Pending normal aio writes:");
7674 		s_writes->to_file(file);
7675 	}
7676 
7677 	if (s_ibuf != NULL) {
7678 		fprintf(file, "Pending ibuf aio reads:");
7679 		s_ibuf->to_file(file);
7680 	}
7681 
7682 	if (s_log != NULL) {
7683 		fprintf(file, "Pending log i/o's:");
7684 		s_log->to_file(file);
7685 	}
7686 
7687 	if (s_sync != NULL) {
7688 		fprintf(file, "Pending sync i/o's:");
7689 		s_sync->to_file(file);
7690 	}
7691 }
7692 
7693 /** Prints all pending IO
7694 @param[in]	file		File where to print */
7695 void
os_aio_print_pending_io(FILE * file)7696 os_aio_print_pending_io(
7697 	FILE*	file)
7698 {
7699 	AIO::print_to_file(file);
7700 }
7701 
7702 #endif /* UNIV_DEBUG */
7703 
7704 /**
7705 Set the file create umask
7706 @param[in]	umask		The umask to use for file creation. */
7707 void
os_file_set_umask(ulint umask)7708 os_file_set_umask(ulint umask)
7709 {
7710 	os_innodb_umask = umask;
7711 }
7712 
7713 #else
7714 #include "univ.i"
7715 #endif /* !UNIV_INNOCHECKSUM */
7716 
7717 /** Normalizes a directory path for the current OS:
7718 On Windows, we convert '/' to '\', else we convert '\' to '/'.
7719 @param[in,out] str A null-terminated directory and file path */
7720 void
os_normalize_path(char * str)7721 os_normalize_path(
7722 	char*	str)
7723 {
7724 	if (str != NULL) {
7725 		for (; *str; str++) {
7726 			if (*str == OS_PATH_SEPARATOR_ALT) {
7727 				*str = OS_PATH_SEPARATOR;
7728 			}
7729 		}
7730 	}
7731 }
7732