1 /***********************************************************************
2 
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 ***********************************************************************/
34 
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38 
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41 
42 #ifndef UNIV_INNOCHECKSUM
43 
44 #include "ha_prototypes.h"
45 #include "sql_const.h"
46 
47 #include "os0file.h"
48 
49 #ifdef UNIV_NONINL
50 #include "os0file.ic"
51 #endif
52 
53 #include "page0page.h"
54 #include "srv0srv.h"
55 #include "srv0start.h"
56 #include "fil0fil.h"
57 #ifndef UNIV_HOTBACKUP
58 # include "os0event.h"
59 # include "os0thread.h"
60 #else /* !UNIV_HOTBACKUP */
61 # ifdef _WIN32
62 /* Add includes for the _stat() call to compile on Windows */
63 #  include <sys/types.h>
64 #  include <sys/stat.h>
65 #  include <errno.h>
66 # endif /* _WIN32 */
67 #endif /* !UNIV_HOTBACKUP */
68 
69 #include <vector>
70 #include <functional>
71 
72 #ifdef LINUX_NATIVE_AIO
73 #include <libaio.h>
74 #endif /* LINUX_NATIVE_AIO */
75 
76 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
77 # include <fcntl.h>
78 # include <linux/falloc.h>
79 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
80 
81 #include <lz4.h>
82 #include <zlib.h>
83 
84 #ifdef UNIV_DEBUG
85 /** Set when InnoDB has invoked exit(). */
86 bool	innodb_calling_exit;
87 #endif /* UNIV_DEBUG */
88 
89 #include <my_aes.h>
90 #include <my_rnd.h>
91 #include <mysqld.h>
92 #include <mysql/service_mysql_keyring.h>
93 
94 /** Insert buffer segment id */
95 static const ulint IO_IBUF_SEGMENT = 0;
96 
97 /** Log segment id */
98 static const ulint IO_LOG_SEGMENT = 1;
99 
100 /** Number of retries for partial I/O's */
101 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
102 
103 /** Blocks for doing IO, used in the transparent compression
104 and encryption code. */
105 struct Block {
106 	/** Default constructor */
BlockBlock107 	Block() : m_ptr(), m_in_use() { }
108 
109 	byte*		m_ptr;
110 
111 	byte		pad[CACHE_LINE_SIZE - sizeof(ulint)];
112 	lock_word_t	m_in_use;
113 };
114 
115 /** For storing the allocated blocks */
116 typedef std::vector<Block> Blocks;
117 
118 /** Block collection */
119 static Blocks*	block_cache;
120 
121 /** Number of blocks to allocate for sync read/writes */
122 static const size_t	MAX_BLOCKS = 128;
123 
124 /** Block buffer size */
125 #define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))
126 
127 /** Disk sector size of aligning write buffer for DIRECT_IO */
128 static ulint	os_io_ptr_align = UNIV_SECTOR_SIZE;
129 
130 /* This specifies the file permissions InnoDB uses when it creates files in
131 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
132 my_umask */
133 
134 #ifndef _WIN32
135 /** Umask for creating files */
136 static ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
137 #else
138 /** Umask for creating files */
139 static ulint	os_innodb_umask	= 0;
140 
141 /* On Windows when using native AIO the number of AIO requests
142 that a thread can handle at a given time is limited to 32
143 i.e.: SRV_N_PENDING_IOS_PER_THREAD */
144 #define SRV_N_PENDING_IOS_PER_THREAD    OS_AIO_N_PENDING_IOS_PER_THREAD
145 
146 #endif /* _WIN32 */
147 
148 #ifndef UNIV_HOTBACKUP
149 
150 /** In simulated aio, merge at most this many consecutive i/os */
151 static const ulint	OS_AIO_MERGE_N_CONSECUTIVE = 64;
152 
153 /** Flag indicating if the page_cleaner is in active state. */
154 extern bool buf_page_cleaner_is_active;
155 
156 #ifdef WITH_INNODB_DISALLOW_WRITES
157 #define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
158 #else
159 #define WAIT_ALLOW_WRITES() do { } while (0)
160 #endif /* WITH_INNODB_DISALLOW_WRITES */
161 
162 /**********************************************************************
163 
164 InnoDB AIO Implementation:
165 =========================
166 
167 We support native AIO for Windows and Linux. For rest of the platforms
168 we simulate AIO by special IO-threads servicing the IO-requests.
169 
170 Simulated AIO:
171 ==============
172 
173 On platforms where we 'simulate' AIO, the following is a rough explanation
174 of the high level design.
175 There are four io-threads (for ibuf, log, read, write).
176 All synchronous IO requests are serviced by the calling thread using
177 os_file_write/os_file_read. The Asynchronous requests are queued up
178 in an array (there are four such arrays) by the calling thread.
179 Later these requests are picked up by the IO-thread and are serviced
180 synchronously.
181 
182 Windows native AIO:
183 ==================
184 
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
190 There are innodb_file_io_threads helper threads. These threads work
191 on the four arrays mentioned above in Simulated AIO. No thread is
192 required for the sync array.
193 If a synchronous IO request is made, it is first queued in the sync
194 array. Then the calling thread itself waits on the request, thus
195 making the call synchronous.
196 If an AIO request is made the calling thread not only queues it in the
197 array but also submits the requests. The helper thread then collects
198 the completed IO request and calls completion routine on it.
199 
200 Linux native AIO:
201 =================
202 
203 If we have libaio installed on the system and innodb_use_native_aio
204 is set to true we follow the code path of native AIO, otherwise we
205 do simulated AIO.
206 There are innodb_file_io_threads helper threads. These threads work
207 on the four arrays mentioned above in Simulated AIO.
208 If a synchronous IO request is made, it is handled by calling
209 os_file_write/os_file_read.
210 If an AIO request is made the calling thread not only queues it in the
211 array but also submits the requests. The helper thread then collects
212 the completed IO request and calls completion routine on it.
213 
214 **********************************************************************/
215 
216 
217 #ifdef UNIV_PFS_IO
218 /* Keys to register InnoDB I/O with performance schema */
219 mysql_pfs_key_t  innodb_data_file_key;
220 mysql_pfs_key_t  innodb_log_file_key;
221 mysql_pfs_key_t  innodb_temp_file_key;
222 #endif /* UNIV_PFS_IO */
223 
224 /** The asynchronous I/O context */
225 struct Slot {
SlotSlot226 	Slot() { memset(this, 0, sizeof(*this)); }
227 
228 	/** index of the slot in the aio array */
229 	uint16_t		pos;
230 
231 	/** true if this slot is reserved */
232 	bool			is_reserved;
233 
234 	/** time when reserved */
235 	ib_time_monotonic_t	reservation_time;
236 
237 	/** buffer used in i/o */
238 	byte*			buf;
239 
240 	/** Buffer pointer used for actual IO. We advance this
241 	when partial IO is required and not buf */
242 	byte*			ptr;
243 
244 	/** OS_FILE_READ or OS_FILE_WRITE */
245 	IORequest		type;
246 
247 	/** file offset in bytes */
248 	os_offset_t		offset;
249 
250 	/** file where to read or write */
251 	pfs_os_file_t		file;
252 
253 	/** file name or path */
254 	const char*		name;
255 
256 	/** used only in simulated aio: true if the physical i/o
257 	already made and only the slot message needs to be passed
258 	to the caller of os_aio_simulated_handle */
259 	bool			io_already_done;
260 
261 	/** The file node for which the IO is requested. */
262 	fil_node_t*		m1;
263 
264 	/** the requester of an aio operation and which can be used
265 	to identify which pending aio operation was completed */
266 	void*			m2;
267 
268 	/** AIO completion status */
269 	dberr_t			err;
270 
271 #ifdef WIN_ASYNC_IO
272 	/** handle object we need in the OVERLAPPED struct */
273 	HANDLE			handle;
274 
275 	/** Windows control block for the aio request */
276 	OVERLAPPED		control;
277 
278 	/** bytes written/read */
279 	DWORD			n_bytes;
280 
281 	/** length of the block to read or write */
282 	DWORD			len;
283 
284 #elif defined(LINUX_NATIVE_AIO)
285 	/** Linux control block for aio */
286 	struct iocb		control;
287 
288 	/** AIO return code */
289 	int			ret;
290 
291 	/** bytes written/read. */
292 	ssize_t			n_bytes;
293 
294 	/** length of the block to read or write */
295 	ulint			len;
296 #else
297 	/** length of the block to read or write */
298 	ulint			len;
299 
300 	/** bytes written/read. */
301 	ulint			n_bytes;
302 #endif /* WIN_ASYNC_IO */
303 
304 	/** Length of the block before it was compressed */
305 	uint32			original_len;
306 
307 	/** Buffer block for compressed pages or encrypted pages */
308 	Block*			buf_block;
309 
310 	/** true, if we shouldn't punch a hole after writing the page */
311 	bool			skip_punch_hole;
312 };
313 
314 /** The asynchronous i/o array structure */
315 class AIO {
316 public:
317 	/** Constructor
318 	@param[in]	id		Latch ID
319 	@param[in]	n_slots		Number of slots to configure
320 	@param[in]	segments	Number of segments to configure */
321 	AIO(latch_id_t id, ulint n_slots, ulint segments);
322 
323 	/** Destructor */
324 	~AIO();
325 
326 	/** Initialize the instance
327 	@return DB_SUCCESS or error code */
328 	dberr_t init();
329 
330 	/** Requests for a slot in the aio array. If no slot is available, waits
331 	until not_full-event becomes signaled.
332 
333 	@param[in,out]	type	IO context
334 	@param[in,out]	m1	message to be passed along with the AIO
335 				operation
336 	@param[in,out]	m2	message to be passed along with the AIO
337 				operation
338 	@param[in]	file	file handle
339 	@param[in]	name	name of the file or path as a null-terminated
340 				string
341 	@param[in,out]	buf	buffer where to read or from which to write
342 	@param[in]	offset	file offset, where to read from or start writing
343 	@param[in]	len	length of the block to read or write
344 	@return pointer to slot */
345 	Slot* reserve_slot(
346 		IORequest&	type,
347 		fil_node_t*	m1,
348 		void*		m2,
349 		pfs_os_file_t	file,
350 		const char*	name,
351 		void*		buf,
352 		os_offset_t	offset,
353 		ulint		len)
354 		MY_ATTRIBUTE((warn_unused_result));
355 
356 	/** @return number of reserved slots */
357 	ulint pending_io_count() const;
358 
359 	/** Returns a pointer to the nth slot in the aio array.
360 	@param[in]	index	Index of the slot in the array
361 	@return pointer to slot */
at(ulint i) const362 	const Slot* at(ulint i) const
363 		MY_ATTRIBUTE((warn_unused_result))
364 	{
365 		ut_a(i < m_slots.size());
366 
367 		return(&m_slots[i]);
368 	}
369 
370 	/** Non const version */
at(ulint i)371 	Slot* at(ulint i)
372 		MY_ATTRIBUTE((warn_unused_result))
373 	{
374 		ut_a(i < m_slots.size());
375 
376 		return(&m_slots[i]);
377 	}
378 
379 	/** Frees a slot in the AIO array, assumes caller owns the mutex.
380 	@param[in,out]	slot	Slot to release */
381 	void release(Slot* slot);
382 
383 	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
384 	@param[in,out]	slot	Slot to release */
385 	void release_with_mutex(Slot* slot);
386 
387 	/** Prints info about the aio array.
388 	@param[in,out]	file	Where to print */
389 	void print(FILE* file);
390 
391 	/** @return the number of slots per segment */
slots_per_segment() const392 	ulint slots_per_segment() const
393 		MY_ATTRIBUTE((warn_unused_result))
394 	{
395 		return(m_slots.size() / m_n_segments);
396 	}
397 
398 	/** @return accessor for n_segments */
get_n_segments() const399 	ulint get_n_segments() const
400 		MY_ATTRIBUTE((warn_unused_result))
401 	{
402 		return(m_n_segments);
403 	}
404 
405 #ifdef UNIV_DEBUG
406 	/** @return true if the thread owns the mutex */
is_mutex_owned() const407 	bool is_mutex_owned() const
408 		MY_ATTRIBUTE((warn_unused_result))
409 	{
410 		return(mutex_own(&m_mutex));
411 	}
412 #endif /* UNIV_DEBUG */
413 
414 	/** Acquire the mutex */
acquire() const415 	void acquire() const
416 	{
417 		mutex_enter(&m_mutex);
418 	}
419 
420 	/** Release the mutex */
release() const421 	void release() const
422 	{
423 		mutex_exit(&m_mutex);
424 	}
425 
426 	/** Write out the state to the file/stream
427 	@param[in, out]	file	File to write to */
428 	void to_file(FILE* file) const;
429 
430 #ifdef LINUX_NATIVE_AIO
431 	/** Dispatch an AIO request to the kernel.
432 	@param[in,out]	slot	an already reserved slot
433 	@return true on success. */
434 	bool linux_dispatch(Slot* slot)
435 		MY_ATTRIBUTE((warn_unused_result));
436 
437 	/** Accessor for an AIO event
438 	@param[in]	index	Index into the array
439 	@return the event at the index */
io_events(ulint index)440 	io_event* io_events(ulint index)
441 		MY_ATTRIBUTE((warn_unused_result))
442 	{
443 		ut_a(index < m_events.size());
444 
445 		return(&m_events[index]);
446 	}
447 
448 	/** Accessor for the AIO context
449 	@param[in]	segment	Segment for which to get the context
450 	@return the AIO context for the segment */
io_ctx(ulint segment)451 	io_context* io_ctx(ulint segment)
452 		MY_ATTRIBUTE((warn_unused_result))
453 	{
454 		ut_ad(segment < get_n_segments());
455 
456 		return(m_aio_ctx[segment]);
457 	}
458 
459 	/** Creates an io_context for native linux AIO.
460 	@param[in]	max_events	number of events
461 	@param[out]	io_ctx		io_ctx to initialize.
462 	@return true on success. */
463 	static bool linux_create_io_ctx(ulint max_events, io_context_t* io_ctx)
464 		MY_ATTRIBUTE((warn_unused_result));
465 
466 	/** Checks if the system supports native linux aio. On some kernel
467 	versions where native aio is supported it won't work on tmpfs. In such
468 	cases we can't use native aio as it is not possible to mix simulated
469 	and native aio.
470 	@return true if supported, false otherwise. */
471 	static bool is_linux_native_aio_supported()
472 		MY_ATTRIBUTE((warn_unused_result));
473 #endif /* LINUX_NATIVE_AIO */
474 
475 #ifdef WIN_ASYNC_IO
476 	/** Wakes up all async i/o threads in the array in Windows async I/O at
477 	shutdown. */
signal()478 	void signal()
479 	{
480 		for (ulint i = 0; i < m_slots.size(); ++i) {
481 			SetEvent(m_slots[i].handle);
482 		}
483 	}
484 
485 	/** Wake up all AIO threads in Windows native aio */
wake_at_shutdown()486 	static void wake_at_shutdown()
487 	{
488 		s_reads->signal();
489 
490 		if (s_writes != NULL) {
491 			s_writes->signal();
492 		}
493 
494 		if (s_ibuf != NULL) {
495 			s_ibuf->signal();
496 		}
497 
498 		if (s_log != NULL) {
499 			s_log->signal();
500 		}
501 	}
502 #endif /* WIN_ASYNC_IO */
503 
504 #ifdef _WIN32
505 	/** This function can be called if one wants to post a batch of reads
506 	and prefers an I/O - handler thread to handle them all at once later.You
507 	must call os_aio_simulated_wake_handler_threads later to ensure the
508 	threads are not left sleeping! */
509 	static void simulated_put_read_threads_to_sleep();
510 
511 	/** The non asynchronous IO array.
512 	@return the synchronous AIO array instance. */
sync_array()513 	static AIO* sync_array()
514 		MY_ATTRIBUTE((warn_unused_result))
515 	{
516 		return(s_sync);
517 	}
518 
519 	/**
520 	Get the AIO handles for a segment.
521 	@param[in]	segment		The local segment.
522 	@return the handles for the segment. */
handles(ulint segment)523 	HANDLE* handles(ulint segment)
524 		MY_ATTRIBUTE((warn_unused_result))
525 	{
526 		ut_ad(segment < m_handles->size() / slots_per_segment());
527 
528 		return(&(*m_handles)[segment * slots_per_segment()]);
529 	}
530 
531 	/** @return true if no slots are reserved */
is_empty() const532 	bool is_empty() const
533 		MY_ATTRIBUTE((warn_unused_result))
534 	{
535 		ut_ad(is_mutex_owned());
536 		return(m_n_reserved == 0);
537 	}
538 #endif /* _WIN32 */
539 
540 	/** Create an instance using new(std::nothrow)
541 	@param[in]	id		Latch ID
542 	@param[in]	n_slots		The number of AIO request slots
543 	@param[in]	segments	The number of segments
544 	@return a new AIO instance */
545 	static AIO* create(
546 		latch_id_t	id,
547 		ulint		n_slots,
548 		ulint		segments)
549 		MY_ATTRIBUTE((warn_unused_result));
550 
551 	/** Initializes the asynchronous io system. Creates one array each
552 	for ibuf and log I/O. Also creates one array each for read and write
553 	where each array is divided logically into n_readers and n_writers
554 	respectively. The caller must create an i/o handler thread for each
555 	segment in these arrays. This function also creates the sync array.
556 	No I/O handler thread needs to be created for that
557 	@param[in]	n_per_seg	maximum number of pending aio
558 					operations allowed per segment
559 	@param[in]	n_readers	number of reader threads
560 	@param[in]	n_writers	number of writer threads
561 	@param[in]	n_slots_sync	number of slots in the sync aio array
562 	@return true if AIO sub-system was started successfully */
563 	static bool start(
564 		ulint		n_per_seg,
565 		ulint		n_readers,
566 		ulint		n_writers,
567 		ulint		n_slots_sync)
568 		MY_ATTRIBUTE((warn_unused_result));
569 
570 	/** Free the AIO arrays */
571 	static void shutdown();
572 
573 	/** Print all the AIO segments
574 	@param[in,out]	file		Where to print */
575 	static void print_all(FILE* file);
576 
577 	/** Calculates local segment number and aio array from global
578 	segment number.
579 	@param[out]	array		AIO wait array
580 	@param[in]	segment		global segment number
581 	@return local segment number within the aio array */
582 	static ulint get_array_and_local_segment(
583 		AIO**		array,
584 		ulint		segment)
585 		MY_ATTRIBUTE((warn_unused_result));
586 
587 	/** Select the IO slot array
588 	@param[in]	type		Type of IO, READ or WRITE
589 	@param[in]	read_only	true if running in read-only mode
590 	@param[in]	mode		IO mode
591 	@return slot array or NULL if invalid mode specified */
592 	static AIO* select_slot_array(
593 		IORequest&	type,
594 		bool		read_only,
595 		ulint		mode)
596 		MY_ATTRIBUTE((warn_unused_result));
597 
598 	/** Calculates segment number for a slot.
599 	@param[in]	array		AIO wait array
600 	@param[in]	slot		slot in this array
601 	@return segment number (which is the number used by, for example,
602 		I/O handler threads) */
603 	static ulint get_segment_no_from_slot(
604 		const AIO*	array,
605 		const Slot*	slot)
606 		MY_ATTRIBUTE((warn_unused_result));
607 
608 	/** Wakes up a simulated AIO I/O-handler thread if it has something
609 	to do.
610 	@param[in]	global_segment	the number of the segment in the
611 					AIO arrays */
612 	static void wake_simulated_handler_thread(ulint global_segment);
613 
614 	/** Check if it is a read request
615 	@param[in]	aio		The AIO instance to check
616 	@return true if the AIO instance is for reading. */
is_read(const AIO * aio)617 	static bool is_read(const AIO* aio)
618 		MY_ATTRIBUTE((warn_unused_result))
619 	{
620 		return(s_reads == aio);
621 	}
622 
623 	/** Wait on an event until no pending writes */
wait_until_no_pending_writes()624 	static void wait_until_no_pending_writes()
625 	{
626 		os_event_wait(AIO::s_writes->m_is_empty);
627 	}
628 
629 	/** Print to file
630 	@param[in]	file		File to write to */
631 	static void print_to_file(FILE* file);
632 
633 	/** Check for pending IO. Gets the count and also validates the
634 	data structures.
635 	@return count of pending IO requests */
636 	static ulint total_pending_io_count();
637 
638 private:
639 	/** Initialise the slots
640 	@return DB_SUCCESS or error code */
641 	dberr_t init_slots()
642 		MY_ATTRIBUTE((warn_unused_result));
643 
644 	/** Wakes up a simulated AIO I/O-handler thread if it has something
645 	to do for a local segment in the AIO array.
646 	@param[in]	global_segment	the number of the segment in the
647 					AIO arrays
648 	@param[in]	segment		the local segment in the AIO array */
649 	void wake_simulated_handler_thread(ulint global_segment, ulint segment);
650 
651 	/** Prints pending IO requests per segment of an aio array.
652 	We probably don't need per segment statistics but they can help us
653 	during development phase to see if the IO requests are being
654 	distributed as expected.
655 	@param[in,out]	file		file where to print
656 	@param[in]	segments	pending IO array */
657 	void print_segment_info(
658 		FILE*		file,
659 		const ulint*	segments);
660 
661 #ifdef LINUX_NATIVE_AIO
662 	/** Initialise the Linux native AIO data structures
663 	@return DB_SUCCESS or error code */
664 	dberr_t init_linux_native_aio()
665 		MY_ATTRIBUTE((warn_unused_result));
666 #endif /* LINUX_NATIVE_AIO */
667 
668 private:
669 	typedef std::vector<Slot> Slots;
670 
671 	/** the mutex protecting the aio array */
672 	mutable SysMutex	m_mutex;
673 
674 	/** Pointer to the slots in the array.
675 	Number of elements must be divisible by n_threads. */
676 	Slots			m_slots;
677 
678 	/** Number of segments in the aio array of pending aio requests.
679 	A thread can wait separately for any one of the segments. */
680 	ulint			m_n_segments;
681 
682 	/** The event which is set to the signaled state when
683 	there is space in the aio outside the ibuf segment */
684 	os_event_t		m_not_full;
685 
686 	/** The event which is set to the signaled state when
687 	there are no pending i/os in this array */
688 	os_event_t		m_is_empty;
689 
690 	/** Number of reserved slots in the AIO array outside
691 	the ibuf segment */
692 	ulint			m_n_reserved;
693 
694 #ifdef _WIN32
695 	typedef std::vector<HANDLE, ut_allocator<HANDLE> > Handles;
696 
697 	/** Pointer to an array of OS native event handles where
698 	we copied the handles from slots, in the same order. This
699 	can be used in WaitForMultipleObjects; used only in Windows */
700 	Handles*		m_handles;
701 #endif /* _WIN32 */
702 
703 #if defined(LINUX_NATIVE_AIO)
704 	typedef std::vector<io_event> IOEvents;
705 
706 	/** completion queue for IO. There is one such queue per
707 	segment. Each thread will work on one ctx exclusively. */
708 	io_context_t*		m_aio_ctx;
709 
710 	/** The array to collect completed IOs. There is one such
711 	event for each possible pending IO. The size of the array
712 	is equal to m_slots.size(). */
713 	IOEvents		m_events;
714 #endif /* LINUX_NATIV_AIO */
715 
716 	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
717 	sync AIO. These are NULL when the module has not yet been
718 	initialized. */
719 
720 	/** Insert buffer */
721 	static AIO*		s_ibuf;
722 
723 	/** Redo log */
724 	static AIO*		s_log;
725 
726 	/** Reads */
727 	static AIO*		s_reads;
728 
729 	/** Writes */
730 	static AIO*		s_writes;
731 
732 	/** Synchronous I/O */
733 	static AIO*		s_sync;
734 };
735 
736 /** Static declarations */
737 AIO*	AIO::s_reads;
738 AIO*	AIO::s_writes;
739 AIO*	AIO::s_ibuf;
740 AIO*	AIO::s_log;
741 AIO*	AIO::s_sync;
742 
743 #if defined(LINUX_NATIVE_AIO)
/** Timeout for each io_getevents() call, in nanoseconds
(500000000 ns = 500 ms). */
745 static const ulint	OS_AIO_REAP_TIMEOUT = 500000000UL;
746 
747 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
748 static const ulint	OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
749 
750 /** number of attempts before giving up on io_setup(). */
751 static const int	OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
752 #endif /* LINUX_NATIVE_AIO */
753 
754 /** Array of events used in simulated AIO */
755 static os_event_t*	os_aio_segment_wait_events = NULL;
756 
757 /** Number of asynchronous I/O segments.  Set by os_aio_init(). */
758 static ulint		os_aio_n_segments = ULINT_UNDEFINED;
759 
760 /** If the following is true, read i/o handler threads try to
761 wait until a batch of new read requests have been posted */
762 static bool		os_aio_recommend_sleep_for_read_threads = false;
763 #endif /* !UNIV_HOTBACKUP */
764 
765 ulint	os_n_file_reads		= 0;
766 ulint	os_bytes_read_since_printout = 0;
767 ulint	os_n_file_writes	= 0;
768 ulint	os_n_fsyncs		= 0;
769 ulint	os_n_file_reads_old	= 0;
770 ulint	os_n_file_writes_old	= 0;
771 ulint	os_n_fsyncs_old		= 0;
772 /** Number of pending write operations */
773 ulint	os_n_pending_writes = 0;
774 /** Number of pending read operations */
775 ulint	os_n_pending_reads = 0;
776 
777 ib_time_monotonic_t	os_last_printout;
778 bool	os_has_said_disk_full	= false;
779 
780 /** Default Zip compression level */
781 extern uint page_zip_level;
782 
783 #if DATA_TRX_ID_LEN > 6
784 #error "COMPRESSION_ALGORITHM will not fit"
785 #endif /* DATA_TRX_ID_LEN */
786 
787 /** Validates the consistency of the aio system.
788 @return true if ok */
789 static
790 bool
791 os_aio_validate();
792 
793 /** Does error handling when a file operation fails.
794 @param[in]	name		File name or NULL
795 @param[in]	operation	Name of operation e.g., "read", "write"
796 @return true if we should retry the operation */
797 static
798 bool
799 os_file_handle_error(
800 	const char*	name,
801 	const char*	operation);
802 
803 /** Free storage space associated with a section of the file.
804 @param[in]      fh              Open file handle
805 @param[in]      off             Starting offset (SEEK_SET)
806 @param[in]      len             Size of the hole
807 @return DB_SUCCESS or error code */
808 dberr_t
809 os_file_punch_hole(
810         os_file_t   fh,
811         os_offset_t     off,
812         os_offset_t     len);
813 
814 /**
815 Does error handling when a file operation fails.
816 @param[in]	name		File name or NULL
817 @param[in]	operation	Name of operation e.g., "read", "write"
818 @param[in]	silent	if true then don't print any message to the log.
819 @return true if we should retry the operation */
820 static
821 bool
822 os_file_handle_error_no_exit(
823 	const char*	name,
824 	const char*	operation,
825 	bool		silent);
826 
/** Decompress after a read and punch a hole in the file if it was a write
@param[in]	type		IO context
@param[in]	fh		Open file handle
@param[in,out]	buf		Buffer to transform
@param[in,out]	scratch		Scratch area for read decompression
@param[in]	src_len		Length of the buffer before compression
@param[in]	offset		File offset of the IO, in bytes
@param[in]	len		Compressed buffer length for a write, or the
				size of buf for a read
@return DB_SUCCESS or error code */
836 static
837 dberr_t
838 os_file_io_complete(
839 	const IORequest&type,
840 	os_file_t	fh,
841 	byte*		buf,
842 	byte*		scratch,
843 	ulint		src_len,
844 	os_offset_t	offset,
845 	ulint		len);
846 
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[in]	type	IO context
@return DB_SUCCESS or error code */
861 static
862 dberr_t
863 os_aio_simulated_handler(
864 	ulint		global_segment,
865 	fil_node_t**	m1,
866 	void**		m2,
867 	IORequest*	type);
868 
869 #ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
893 static
894 dberr_t
895 os_aio_windows_handler(
896 	ulint		segment,
897 	ulint		pos,
898 	fil_node_t**	m1,
899 	void**		m2,
900 	IORequest*	type);
901 #endif /* WIN_ASYNC_IO */
902 
903 /** Allocate a page for sync IO
904 @return pointer to page */
905 static
906 Block*
os_alloc_block()907 os_alloc_block()
908 {
909 	size_t		pos;
910 	Blocks&		blocks = *block_cache;
911 	size_t		i = static_cast<size_t>(my_timer_cycles());
912 	const size_t	size = blocks.size();
913 	ulint		retry = 0;
914 	Block*		block;
915 
916 	DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
917 
918 	for (;;) {
919 
920 		/* After go through the block cache for 3 times,
921 		allocate a new temporary block. */
922 		if (retry == MAX_BLOCKS * 3) {
923 			byte*	ptr;
924 
925 			ptr = static_cast<byte*>(
926 				ut_malloc_nokey(sizeof(*block)
927 						+ BUFFER_BLOCK_SIZE));
928 
929 			block = new (ptr) Block();
930 			block->m_ptr = static_cast<byte*>(
931 				ptr + sizeof(*block));
932 			block->m_in_use = 1;
933 
934 			break;
935 		}
936 
937 		pos = i++ % size;
938 
939 		if (TAS(&blocks[pos].m_in_use, 1) == 0) {
940 			block = &blocks[pos];
941 			break;
942 		}
943 
944 		os_thread_yield();
945 
946 		++retry;
947 	}
948 
949 	ut_a(block->m_in_use != 0);
950 
951 	return(block);
952 }
953 
954 /** Free a page after sync IO
955 @param[in,own]	block		The block to free/release */
956 static
957 void
os_free_block(Block * block)958 os_free_block(Block* block)
959 {
960 	ut_ad(block->m_in_use == 1);
961 
962 	TAS(&block->m_in_use, 0);
963 
964 	/* When this block is not in the block cache, and it's
965 	a temporary block, we need to free it directly. */
966 	if (std::less<Block*>()(block, &block_cache->front())
967 	    || std::greater<Block*>()(block, &block_cache->back())) {
968 		ut_free(block);
969 	}
970 }
971 
972 /** Generic AIO Handler methods. Currently handles IO post processing. */
973 class AIOHandler {
974 public:
975 	/** Do any post processing after a read/write
976 	@return DB_SUCCESS or error code. */
977 	static dberr_t post_io_processing(Slot* slot);
978 
979 	/** Decompress after a read and punch a hole in the file if
980 	it was a write */
io_complete(const Slot * slot)981 	static dberr_t io_complete(const Slot* slot)
982 	{
983 		ut_a(slot->offset > 0);
984 		ut_a(slot->type.is_read() || !slot->skip_punch_hole);
985 		return(os_file_io_complete(
986 				slot->type, slot->file.m_file, slot->buf,
987 				NULL, slot->original_len,
988 				slot->offset, slot->len));
989 	}
990 
991 private:
992 	/** Check whether the page was encrypted.
993 	@param[in]	slot		The slot that contains the IO request
994 	@return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)995 	static bool is_encrypted_page(const Slot* slot)
996 	{
997 		return(Encryption::is_encrypted_page(slot->buf));
998 	}
999 
1000 	/** Check whether the page was compressed.
1001 	@param[in]	slot		The slot that contains the IO request
1002 	@return true if it was a compressed page */
is_compressed_page(const Slot * slot)1003 	static bool is_compressed_page(const Slot* slot)
1004 	{
1005 		const byte*	src = slot->buf;
1006 
1007 		ulint	page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1008 
1009 		return(page_type == FIL_PAGE_COMPRESSED);
1010 	}
1011 
1012 	/** Get the compressed page size.
1013 	@param[in]	slot		The slot that contains the IO request
1014 	@return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1015 	static ulint compressed_page_size(const Slot* slot)
1016 	{
1017 		ut_ad(slot->type.is_read());
1018 		ut_ad(is_compressed_page(slot));
1019 
1020 		ulint		size;
1021 		const byte*	src = slot->buf;
1022 
1023 		size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1024 
1025 		return(size + FIL_PAGE_DATA);
1026 	}
1027 
1028 	/** Check if the page contents can be decompressed.
1029 	@param[in]	slot		The slot that contains the IO request
1030 	@return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1031 	static bool can_decompress(const Slot* slot)
1032 	{
1033 		ut_ad(slot->type.is_read());
1034 		ut_ad(is_compressed_page(slot));
1035 
1036 		ulint		version;
1037 		const byte*	src = slot->buf;
1038 
1039 		version = mach_read_from_1(src + FIL_PAGE_VERSION);
1040 
1041 		ut_a(Compression::is_valid_page_version(version));
1042 
1043 		/* Includes the page header size too */
1044 		ulint		size = compressed_page_size(slot);
1045 
1046 		return(size <= (slot->ptr - slot->buf) + (ulint) slot->n_bytes);
1047 	}
1048 
1049 	/** Check if we need to read some more data.
1050 	@param[in]	slot		The slot that contains the IO request
1051 	@param[in]	n_bytes		Total bytes read so far
1052 	@return DB_SUCCESS or error code */
1053 	static dberr_t check_read(Slot* slot, ulint n_bytes);
1054 };
1055 
1056 /** Helper class for doing synchronous file IO. Currently, the objective
1057 is to hide the OS specific code, so that the higher level functions aren't
1058 peppered with #ifdef. Makes the code flow difficult to follow.  */
1059 class SyncFileIO {
1060 public:
1061 	/** Constructor
1062 	@param[in]	fh	File handle
1063 	@param[in,out]	buf	Buffer to read/write
1064 	@param[in]	n	Number of bytes to read/write
1065 	@param[in]	offset	Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1066 	SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
1067 		:
1068 		m_fh(fh),
1069 		m_buf(buf),
1070 		m_n(static_cast<ssize_t>(n)),
1071 		m_offset(offset)
1072 	{
1073 		ut_ad(m_n > 0);
1074 	}
1075 
1076 	/** Destructor */
~SyncFileIO()1077 	~SyncFileIO()
1078 	{
1079 		/* No op */
1080 	}
1081 
1082 	/** Do the read/write
1083 	@param[in]	request	The IO context and type
1084 	@return the number of bytes read/written or negative value on error */
1085 	ssize_t execute(const IORequest& request);
1086 
1087 	/** Do the read/write
1088 	@param[in,out]	slot	The IO slot, it has the IO context
1089 	@return the number of bytes read/written or negative value on error */
1090 	static ssize_t execute(Slot* slot);
1091 
1092 	/** Move the read/write offset up to where the partial IO succeeded.
1093 	@param[in]	n_bytes	The number of bytes to advance */
advance(ssize_t n_bytes)1094 	void advance(ssize_t n_bytes)
1095 	{
1096 		m_offset += n_bytes;
1097 
1098 		ut_ad(m_n >= n_bytes);
1099 
1100 		m_n -=  n_bytes;
1101 
1102 		m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1103 	}
1104 
1105 private:
1106 	/** Open file handle */
1107 	os_file_t		m_fh;
1108 
1109 	/** Buffer to read/write */
1110 	void*			m_buf;
1111 
1112 	/** Number of bytes to read/write */
1113 	ssize_t			m_n;
1114 
1115 	/** Offset from where to read/write */
1116 	os_offset_t		m_offset;
1117 };
1118 
1119 /** If it is a compressed page return the compressed page data + footer size
1120 @param[in]	buf		Buffer to check, must include header + 10 bytes
1121 @return ULINT_UNDEFINED if the page is not a compressed page or length
1122 	of the compressed data (including footer) if it is a compressed page */
1123 ulint
os_file_compressed_page_size(const byte * buf)1124 os_file_compressed_page_size(const byte* buf)
1125 {
1126 	ulint	type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1127 
1128 	if (type == FIL_PAGE_COMPRESSED) {
1129 		ulint	version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1130 		ut_a(Compression::is_valid_page_version(version));
1131 		return(mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1132 	}
1133 
1134 	return(ULINT_UNDEFINED);
1135 }
1136 
1137 /** If it is a compressed page return the original page data + footer size
1138 @param[in] buf		Buffer to check, must include header + 10 bytes
1139 @return ULINT_UNDEFINED if the page is not a compressed page or length
1140 	of the original data + footer if it is a compressed page */
1141 ulint
os_file_original_page_size(const byte * buf)1142 os_file_original_page_size(const byte* buf)
1143 {
1144 	ulint	type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1145 
1146 	if (type == FIL_PAGE_COMPRESSED) {
1147 
1148 		ulint	version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1149 		ut_a(Compression::is_valid_page_version(version));
1150 
1151 		return(mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1152 	}
1153 
1154 	return(ULINT_UNDEFINED);
1155 }
1156 
1157 /** Check if we need to read some more data.
1158 @param[in]	slot		The slot that contains the IO request
1159 @param[in]	n_bytes		Total bytes read so far
1160 @return DB_SUCCESS or error code */
1161 dberr_t
check_read(Slot * slot,ulint n_bytes)1162 AIOHandler::check_read(Slot* slot, ulint n_bytes)
1163 {
1164 	dberr_t	err;
1165 
1166 	ut_ad(slot->type.is_read());
1167 	ut_ad(slot->original_len > slot->len);
1168 
1169 	if (is_compressed_page(slot)) {
1170 
1171 		if (can_decompress(slot)) {
1172 
1173 			ut_a(slot->offset > 0);
1174 
1175 			slot->len = slot->original_len;
1176 #ifdef _WIN32
1177 			slot->n_bytes = static_cast<DWORD>(n_bytes);
1178 #else
1179 			slot->n_bytes = static_cast<ulint>(n_bytes);
1180 #endif /* _WIN32 */
1181 
1182 			err = io_complete(slot);
1183 			ut_a(err == DB_SUCCESS);
1184 		} else {
1185 			/* Read the next block in */
1186 			ut_ad(compressed_page_size(slot) >= n_bytes);
1187 
1188 			err = DB_FAIL;
1189 		}
1190 	} else if (is_encrypted_page(slot)) {
1191 			ut_a(slot->offset > 0);
1192 
1193 			slot->len = slot->original_len;
1194 #ifdef _WIN32
1195 			slot->n_bytes = static_cast<DWORD>(n_bytes);
1196 #else
1197 			slot->n_bytes = static_cast<ulint>(n_bytes);
1198 #endif /* _WIN32 */
1199 
1200 			err = io_complete(slot);
1201 			ut_a(err == DB_SUCCESS);
1202 
1203 	} else {
1204 		err = DB_FAIL;
1205 	}
1206 
1207 	if (slot->buf_block != NULL) {
1208 		os_free_block(slot->buf_block);
1209 		slot->buf_block = NULL;
1210 	}
1211 
1212 	return(err);
1213 }
1214 
1215 /** Do any post processing after a read/write
1216 @return DB_SUCCESS or error code. */
1217 dberr_t
post_io_processing(Slot * slot)1218 AIOHandler::post_io_processing(Slot* slot)
1219 {
1220 	dberr_t	err;
1221 
1222 	ut_ad(slot->is_reserved);
1223 
1224 	/* Total bytes read so far */
1225 	ulint	n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1226 
1227 	/* Compressed writes can be smaller than the original length.
1228 	Therefore they can be processed without further IO. */
1229 	if (n_bytes == slot->original_len
1230 	    || (slot->type.is_write()
1231 		&& slot->type.is_compressed()
1232 		&& slot->len == static_cast<ulint>(slot->n_bytes))) {
1233 
1234 		if (!slot->type.is_log()
1235 		    && (is_compressed_page(slot)
1236 			|| is_encrypted_page(slot))) {
1237 
1238 			ut_a(slot->offset > 0);
1239 
1240 			if (slot->type.is_read()) {
1241 				slot->len = slot->original_len;
1242 			}
1243 
1244 			/* The punch hole has been done on collect() */
1245 
1246 			if (slot->type.is_read()) {
1247 				err = io_complete(slot);
1248 			} else {
1249 				err = DB_SUCCESS;
1250 			}
1251 
1252 			ut_ad(err == DB_SUCCESS
1253 			      || err == DB_UNSUPPORTED
1254 			      || err == DB_CORRUPTION
1255 			      || err == DB_IO_DECOMPRESS_FAIL);
1256 		} else {
1257 
1258 			err = DB_SUCCESS;
1259 		}
1260 
1261 		if (slot->buf_block != NULL) {
1262 			os_free_block(slot->buf_block);
1263 			slot->buf_block = NULL;
1264 		}
1265 
1266 	} else if ((ulint) slot->n_bytes == (ulint) slot->len) {
1267 
1268 		/* It *must* be a partial read. */
1269 		ut_ad(slot->len < slot->original_len);
1270 
1271 		/* Has to be a read request, if it is less than
1272 		the original length. */
1273 		ut_ad(slot->type.is_read());
1274 		err = check_read(slot, n_bytes);
1275 
1276 	} else {
1277 		err = DB_FAIL;
1278 	}
1279 
1280 	return(err);
1281 }
1282 
1283 /** Count the number of free slots
1284 @return number of reserved slots */
1285 ulint
pending_io_count() const1286 AIO::pending_io_count() const
1287 {
1288 	acquire();
1289 
1290 #ifdef UNIV_DEBUG
1291 	ut_a(m_n_segments > 0);
1292 	ut_a(!m_slots.empty());
1293 
1294 	ulint	count = 0;
1295 
1296 	for (ulint i = 0; i < m_slots.size(); ++i) {
1297 
1298 		const Slot&	slot = m_slots[i];
1299 
1300 		if (slot.is_reserved) {
1301 			++count;
1302 			ut_a(slot.len > 0);
1303 		}
1304 	}
1305 
1306 	ut_a(m_n_reserved == count);
1307 #endif /* UNIV_DEBUG */
1308 
1309 	ulint	reserved = m_n_reserved;
1310 
1311 	release();
1312 
1313 	return(reserved);
1314 }
1315 
1316 /** Compress a data page
1317 #param[in]	block_size	File system block size
1318 @param[in]	src		Source contents to compress
1319 @param[in]	src_len		Length in bytes of the source
1320 @param[out]	dst		Compressed page contents
1321 @param[out]	dst_len		Length in bytes of dst contents
1322 @return buffer data, dst_len will have the length of the data */
1323 static
1324 byte*
os_file_compress_page(Compression compression,ulint block_size,byte * src,ulint src_len,byte * dst,ulint * dst_len)1325 os_file_compress_page(
1326 	Compression	compression,
1327 	ulint		block_size,
1328 	byte*		src,
1329 	ulint		src_len,
1330 	byte*		dst,
1331 	ulint*		dst_len)
1332 {
1333 	ulint		len = 0;
1334 	ulint		compression_level = page_zip_level;
1335 	ulint		page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1336 
1337 	/* The page size must be a multiple of the OS punch hole size. */
1338 	ut_ad(!(src_len % block_size));
1339 
1340 	/* Shouldn't compress an already compressed page. */
1341 	ut_ad(page_type != FIL_PAGE_COMPRESSED);
1342 
1343 	/* The page must be at least twice as large as the file system
1344 	block size if we are to save any space. Ignore R-Tree pages for now,
1345 	they repurpose the same 8 bytes in the page header. No point in
1346 	compressing if the file system block size >= our page size. */
1347 
1348 	if (page_type == FIL_PAGE_RTREE
1349 	    || block_size == ULINT_UNDEFINED
1350 	    || compression.m_type == Compression::NONE
1351 	    || src_len < block_size * 2) {
1352 
1353 		*dst_len = src_len;
1354 
1355 		return(src);
1356 	}
1357 
1358 	/* Leave the header alone when compressing. */
1359 	ut_ad(block_size >= FIL_PAGE_DATA * 2);
1360 
1361 	ut_ad(src_len > FIL_PAGE_DATA + block_size);
1362 
1363 	/* Must compress to <= N-1 FS blocks. */
1364 	ulint		out_len = src_len - (FIL_PAGE_DATA + block_size);
1365 
1366 	/* This is the original data page size - the page header. */
1367 	ulint		content_len = src_len - FIL_PAGE_DATA;
1368 
1369 	ut_ad(out_len >= block_size - FIL_PAGE_DATA);
1370 	ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA));
1371 
1372 	/* Only compress the data + trailer, leave the header alone */
1373 
1374 	switch (compression.m_type) {
1375 	case Compression::NONE:
1376 		ut_error;
1377 
1378 	case Compression::ZLIB: {
1379 
1380 		uLongf	zlen = static_cast<uLongf>(out_len);
1381 
1382 		if (compress2(
1383 			dst + FIL_PAGE_DATA,
1384 			&zlen,
1385 			src + FIL_PAGE_DATA,
1386 			static_cast<uLong>(content_len),
1387 			static_cast<int>(compression_level)) != Z_OK) {
1388 
1389 			*dst_len = src_len;
1390 
1391 			return(src);
1392 		}
1393 
1394 		len = static_cast<ulint>(zlen);
1395 
1396 		break;
1397 	}
1398 
1399 	case Compression::LZ4:
1400 
1401 		len = LZ4_compress_default(
1402 			reinterpret_cast<char*>(src) + FIL_PAGE_DATA,
1403 			reinterpret_cast<char*>(dst) + FIL_PAGE_DATA,
1404 			static_cast<int>(content_len),
1405 			static_cast<int>(out_len));
1406 
1407 		ut_a(len <= src_len - FIL_PAGE_DATA);
1408 
1409 		if (len == 0  || len >= out_len) {
1410 
1411 			*dst_len = src_len;
1412 
1413 			return(src);
1414 		}
1415 
1416 		break;
1417 
1418 	default:
1419 		*dst_len = src_len;
1420 		return(src);
1421 	}
1422 
1423 	ut_a(len <= out_len);
1424 
1425 	ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
1426 		     src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)
1427 	      == 0);
1428 
1429 	/* Copy the header as is. */
1430 	memmove(dst, src, FIL_PAGE_DATA);
1431 
1432 	/* Add compression control information. Required for decompressing. */
1433 	mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
1434 
1435 	mach_write_to_1(dst + FIL_PAGE_VERSION, Compression::FIL_PAGE_VERSION_2);
1436 
1437 	mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);
1438 
1439 	mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
1440 
1441 	mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);
1442 
1443 	mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);
1444 
1445 	/* Round to the next full block size */
1446 
1447 	len += FIL_PAGE_DATA;
1448 
1449 	*dst_len = ut_calc_align(len, block_size);
1450 
1451 	ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA);
1452 
1453 	/* Clear out the unused portion of the page. */
1454 	if (len % block_size) {
1455 		memset(dst + len, 0x0, block_size - (len % block_size));
1456 	}
1457 
1458 	return(dst);
1459 }
1460 
1461 #ifdef UNIV_DEBUG
1462 # ifndef UNIV_HOTBACKUP
1463 /** Validates the consistency the aio system some of the time.
1464 @return true if ok or the check was skipped */
1465 bool
os_aio_validate_skip()1466 os_aio_validate_skip()
1467 {
1468 /** Try os_aio_validate() every this many times */
1469 # define OS_AIO_VALIDATE_SKIP	13
1470 
1471 	/** The os_aio_validate() call skip counter.
1472 	Use a signed type because of the race condition below. */
1473 	static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1474 
1475 	/* There is a race condition below, but it does not matter,
1476 	because this call is only for heuristic purposes. We want to
1477 	reduce the call frequency of the costly os_aio_validate()
1478 	check in debug builds. */
1479 	--os_aio_validate_count;
1480 
1481 	if (os_aio_validate_count > 0) {
1482 		return(true);
1483 	}
1484 
1485 	os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1486 	return(os_aio_validate());
1487 }
1488 # endif /* !UNIV_HOTBACKUP */
1489 #endif /* UNIV_DEBUG */
1490 
1491 #undef USE_FILE_LOCK
1492 #define USE_FILE_LOCK
1493 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1494 /* InnoDB Hot Backup does not lock the data files.
1495  * On Windows, mandatory locking is used.
1496  */
1497 # undef USE_FILE_LOCK
1498 #endif
1499 #ifdef USE_FILE_LOCK
1500 /** Obtain an exclusive lock on a file.
1501 @param[in]	fd		file descriptor
1502 @param[in]	name		file name
1503 @return 0 on success */
1504 static
1505 int
os_file_lock(int fd,const char * name)1506 os_file_lock(
1507 	int		fd,
1508 	const char*	name)
1509 {
1510 	struct flock lk;
1511 
1512 	lk.l_type = F_WRLCK;
1513 	lk.l_whence = SEEK_SET;
1514 	lk.l_start = lk.l_len = 0;
1515 
1516 	if (fcntl(fd, F_SETLK, &lk) == -1) {
1517 
1518 		ib::error()
1519 			<< "Unable to lock " << name
1520 			<< " error: " << errno;
1521 
1522 		if (errno == EAGAIN || errno == EACCES) {
1523 
1524 			ib::info()
1525 				<< "Check that you do not already have"
1526 				" another mysqld process using the"
1527 				" same InnoDB data or log files.";
1528 		}
1529 
1530 		return(-1);
1531 	}
1532 
1533 	return(0);
1534 }
1535 #endif /* USE_FILE_LOCK */
1536 
1537 #ifndef UNIV_HOTBACKUP
1538 
1539 /** Calculates local segment number and aio array from global segment number.
1540 @param[out]	array		aio wait array
1541 @param[in]	segment		global segment number
1542 @return local segment number within the aio array */
1543 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1544 AIO::get_array_and_local_segment(
1545 	AIO**		array,
1546 	ulint		segment)
1547 {
1548 	ulint		local_segment;
1549 	ulint		n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1550 
1551 	ut_a(segment < os_aio_n_segments);
1552 
1553 	if (!srv_read_only_mode && segment < n_extra_segs) {
1554 
1555 		/* We don't support ibuf/log IO during read only mode. */
1556 
1557 		if (segment == IO_IBUF_SEGMENT) {
1558 
1559 			*array = s_ibuf;
1560 
1561 		} else if (segment == IO_LOG_SEGMENT) {
1562 
1563 			*array = s_log;
1564 
1565 		} else {
1566 			*array = NULL;
1567 		}
1568 
1569 		local_segment = 0;
1570 
1571 	} else if (segment < s_reads->m_n_segments + n_extra_segs) {
1572 
1573 		*array = s_reads;
1574 		local_segment = segment - n_extra_segs;
1575 
1576 	} else {
1577 		*array = s_writes;
1578 
1579 		local_segment = segment
1580 			      - (s_reads->m_n_segments + n_extra_segs);
1581 	}
1582 
1583 	return(local_segment);
1584 }
1585 
1586 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1587 @param[in,out]	slot		Slot to release */
1588 void
release(Slot * slot)1589 AIO::release(Slot* slot)
1590 {
1591 	ut_ad(is_mutex_owned());
1592 
1593 	ut_ad(slot->is_reserved);
1594 
1595 	slot->is_reserved = false;
1596 
1597 	--m_n_reserved;
1598 
1599 	if (m_n_reserved == m_slots.size() - 1) {
1600 		os_event_set(m_not_full);
1601 	}
1602 
1603 	if (m_n_reserved == 0) {
1604 		os_event_set(m_is_empty);
1605 	}
1606 
1607 #ifdef WIN_ASYNC_IO
1608 
1609 	ResetEvent(slot->handle);
1610 
1611 #elif defined(LINUX_NATIVE_AIO)
1612 
1613 	if (srv_use_native_aio) {
1614 		memset(&slot->control, 0x0, sizeof(slot->control));
1615 		slot->ret = 0;
1616 		slot->n_bytes = 0;
1617 	} else {
1618 		/* These fields should not be used if we are not
1619 		using native AIO. */
1620 		ut_ad(slot->n_bytes == 0);
1621 		ut_ad(slot->ret == 0);
1622 	}
1623 
1624 #endif /* WIN_ASYNC_IO */
1625 }
1626 
/** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
Wraps release(Slot*) between acquire() and release() calls.
@param[in,out]	slot		Slot to release */
void
AIO::release_with_mutex(Slot* slot)
{
	acquire();

	/* release(Slot*) asserts in debug builds that the mutex is owned. */
	release(slot);

	release();
}
1638 
1639 /** Creates a temporary file.  This function is like tmpfile(3), but
1640 the temporary file is created in the given parameter path. If the path
1641 is NULL then it will create the file in the MySQL server configuration
1642 parameter (--tmpdir).
1643 @param[in]	path	location for creating temporary file
1644 @return temporary file handle, or NULL on error */
1645 FILE*
os_file_create_tmpfile(const char * path)1646 os_file_create_tmpfile(
1647 	const char*	path)
1648 {
1649 	FILE*	file	= NULL;
1650 	WAIT_ALLOW_WRITES();
1651 	int	fd	= innobase_mysql_tmpfile(path);
1652 
1653 	if (fd >= 0) {
1654 		file = fdopen(fd, "w+b");
1655 	}
1656 
1657 	if (file == NULL) {
1658 
1659 		ib::error()
1660 			<< "Unable to create temporary file; errno: "
1661 			<< errno;
1662 
1663 		if (fd >= 0) {
1664 			close(fd);
1665 		}
1666 	}
1667 
1668 	return(file);
1669 }
1670 
1671 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1672 NUL-terminate str. All errors are silently ignored. This function is
1673 mostly meant to be used with temporary files.
1674 @param[in,out]	file		File to read from
1675 @param[in,out]	str		Buffer where to read
1676 @param[in]	size		Size of buffer */
1677 void
os_file_read_string(FILE * file,char * str,ulint size)1678 os_file_read_string(
1679 	FILE*		file,
1680 	char*		str,
1681 	ulint		size)
1682 {
1683 	if (size != 0) {
1684 		rewind(file);
1685 
1686 		size_t	flen = fread(str, 1, size - 1, file);
1687 
1688 		str[flen] = '\0';
1689 	}
1690 }
1691 
1692 /** Decompress after a read and punch a hole in the file if it was a write
1693 @param[in]	type		IO context
1694 @param[in]	fh		Open file handle
1695 @param[in,out]	buf		Buffer to transform
1696 @param[in,out]	scratch		Scratch area for read decompression
1697 @param[in]	src_len		Length of the buffer before compression
1698 @param[in]	len		Used buffer length for write and output
1699 				buf len for read
1700 @return DB_SUCCESS or error code */
1701 static
1702 dberr_t
os_file_io_complete(const IORequest & type,os_file_t fh,byte * buf,byte * scratch,ulint src_len,os_offset_t offset,ulint len)1703 os_file_io_complete(
1704 	const IORequest&type,
1705 	os_file_t	fh,
1706 	byte*		buf,
1707 	byte*		scratch,
1708 	ulint		src_len,
1709 	os_offset_t	offset,
1710 	ulint		len)
1711 {
1712 	/* We never compress/decompress the first page */
1713 	ut_a(offset > 0);
1714 	ut_ad(type.validate());
1715 
1716 	if (!type.is_compression_enabled()) {
1717 
1718 		return(DB_SUCCESS);
1719 
1720 	} else if (type.is_read()) {
1721 		dberr_t		ret;
1722 		Encryption	encryption(type.encryption_algorithm());
1723 
1724 		ut_ad(!type.is_log());
1725 		ut_ad(!type.is_row_log());
1726 
1727 		ret = encryption.decrypt(type, buf, src_len, scratch, len);
1728 		if (ret == DB_SUCCESS) {
1729 			return(os_file_decompress_page(
1730 					type.is_dblwr_recover(),
1731 					buf, scratch, len));
1732 		} else {
1733 			return(ret);
1734 		}
1735 
1736 	} else if (type.punch_hole()) {
1737 
1738 		ut_ad(len <= src_len);
1739 		ut_ad(!type.is_log());
1740 		ut_ad(type.is_write());
1741 		ut_ad(type.is_compressed());
1742 
1743 		/* Nothing to do. */
1744 		if (len == src_len) {
1745 			return(DB_SUCCESS);
1746 		}
1747 
1748 #ifdef UNIV_DEBUG
1749 		const ulint	block_size = type.block_size();
1750 #endif /* UNIV_DEBUG */
1751 
1752 		/* We don't support multiple page sizes in the server
1753 		at the moment. */
1754 		ut_ad(src_len == srv_page_size);
1755 
1756 		/* Must be a multiple of the compression unit size. */
1757 		ut_ad((len % block_size) == 0);
1758 		ut_ad((offset % block_size) == 0);
1759 
1760 		ut_ad(len + block_size <= src_len);
1761 
1762 		offset += len;
1763 
1764 		return(os_file_punch_hole(fh, offset, src_len - len));
1765 	}
1766 
1767 	ut_ad(!type.is_log());
1768 
1769 	return(DB_SUCCESS);
1770 }
1771 
1772 #endif /* !UNIV_HOTBACKUP */
1773 
1774 /** This function returns a new path name after replacing the basename
1775 in an old path with a new basename.  The old_path is a full path
1776 name including the extension.  The tablename is in the normal
1777 form "databasename/tablename".  The new base name is found after
1778 the forward slash.  Both input strings are null terminated.
1779 
1780 This function allocates memory to be returned.  It is the callers
1781 responsibility to free the return value after it is no longer needed.
1782 
1783 @param[in]	old_path		Pathname
1784 @param[in]	tablename		Contains new base name
1785 @return own: new full pathname */
1786 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1787 os_file_make_new_pathname(
1788 	const char*	old_path,
1789 	const char*	tablename)
1790 {
1791 	ulint		dir_len;
1792 	char*		last_slash;
1793 	char*		base_name;
1794 	char*		new_path;
1795 	ulint		new_path_len;
1796 
1797 	/* Split the tablename into its database and table name components.
1798 	They are separated by a '/'. */
1799 	last_slash = strrchr((char*) tablename, '/');
1800 	base_name = last_slash ? last_slash + 1 : (char*) tablename;
1801 
1802 	/* Find the offset of the last slash. We will strip off the
1803 	old basename.ibd which starts after that slash. */
1804 	last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1805 	dir_len = last_slash ? last_slash - old_path : strlen(old_path);
1806 
1807 	/* allocate a new path and move the old directory path to it. */
1808 	new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1809 	new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1810 	memcpy(new_path, old_path, dir_len);
1811 
1812 	ut_snprintf(new_path + dir_len,
1813 		    new_path_len - dir_len,
1814 		    "%c%s.ibd",
1815 		    OS_PATH_SEPARATOR,
1816 		    base_name);
1817 
1818 	return(new_path);
1819 }
1820 
1821 /** This function reduces a null-terminated full remote path name into
1822 the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
1823 the 'databasename/tablename.ibd' found at the end of the path with just
1824 'tablename'.
1825 
1826 Since the result is always smaller than the path sent in, no new memory
1827 is allocated. The caller should allocate memory for the path sent in.
1828 This function manipulates that path in place.
1829 
1830 If the path format is not as expected, just return.  The result is used
1831 to inform a SHOW CREATE TABLE command.
1832 @param[in,out]	data_dir_path		Full path/data_dir_path */
1833 void
os_file_make_data_dir_path(char * data_dir_path)1834 os_file_make_data_dir_path(
1835 	char*	data_dir_path)
1836 {
1837 	/* Replace the period before the extension with a null byte. */
1838 	char*	ptr = strrchr((char*) data_dir_path, '.');
1839 
1840 	if (ptr == NULL) {
1841 		return;
1842 	}
1843 
1844 	ptr[0] = '\0';
1845 
1846 	/* The tablename starts after the last slash. */
1847 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1848 
1849 	if (ptr == NULL) {
1850 		return;
1851 	}
1852 
1853 	ptr[0] = '\0';
1854 
1855 	char*	tablename = ptr + 1;
1856 
1857 	/* The databasename starts after the next to last slash. */
1858 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1859 
1860 	if (ptr == NULL) {
1861 		return;
1862 	}
1863 
1864 	ulint	tablename_len = ut_strlen(tablename);
1865 
1866 	ut_memmove(++ptr, tablename, tablename_len);
1867 
1868 	ptr[tablename_len] = '\0';
1869 }
1870 
1871 /** Check if the path refers to the root of a drive using a pointer
1872 to the last directory separator that the caller has fixed.
1873 @param[in]	path	path name
1874 @param[in]	path	last directory separator in the path
1875 @return true if this path is a drive root, false if not */
1876 UNIV_INLINE
1877 bool
os_file_is_root(const char * path,const char * last_slash)1878 os_file_is_root(
1879 	const char*	path,
1880 	const char*	last_slash)
1881 {
1882 	return(
1883 #ifdef _WIN32
1884 	       (last_slash == path + 2 && path[1] == ':') ||
1885 #endif /* _WIN32 */
1886 	       last_slash == path);
1887 }
1888 
1889 /** Return the parent directory component of a null-terminated path.
1890 Return a new buffer containing the string up to, but not including,
1891 the final component of the path.
1892 The path returned will not contain a trailing separator.
1893 Do not return a root path, return NULL instead.
1894 The final component trimmed off may be a filename or a directory name.
1895 If the final component is the only component of the path, return NULL.
1896 It is the caller's responsibility to free the returned string after it
1897 is no longer needed.
1898 @param[in]	path		Path name
1899 @return own: parent directory of the path */
1900 static
1901 char*
os_file_get_parent_dir(const char * path)1902 os_file_get_parent_dir(
1903 	const char*	path)
1904 {
1905 	bool	has_trailing_slash = false;
1906 
1907 	/* Find the offset of the last slash */
1908 	const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
1909 
1910 	if (!last_slash) {
1911 		/* No slash in the path, return NULL */
1912 		return(NULL);
1913 	}
1914 
1915 	/* Ok, there is a slash. Is there anything after it? */
1916 	if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1917 		has_trailing_slash = true;
1918 	}
1919 
1920 	/* Reduce repetative slashes. */
1921 	while (last_slash > path
1922 		&& last_slash[-1] == OS_PATH_SEPARATOR) {
1923 		last_slash--;
1924 	}
1925 
1926 	/* Check for the root of a drive. */
1927 	if (os_file_is_root(path, last_slash)) {
1928 		return(NULL);
1929 	}
1930 
1931 	/* If a trailing slash prevented the first strrchr() from trimming
1932 	the last component of the path, trim that component now. */
1933 	if (has_trailing_slash) {
1934 		/* Back up to the previous slash. */
1935 		last_slash--;
1936 		while (last_slash > path
1937 		       && last_slash[0] != OS_PATH_SEPARATOR) {
1938 			last_slash--;
1939 		}
1940 
1941 		/* Reduce repetative slashes. */
1942 		while (last_slash > path
1943 			&& last_slash[-1] == OS_PATH_SEPARATOR) {
1944 			last_slash--;
1945 		}
1946 	}
1947 
1948 	/* Check for the root of a drive. */
1949 	if (os_file_is_root(path, last_slash)) {
1950 		return(NULL);
1951 	}
1952 
1953 	if (last_slash - path < 0) {
1954 		/* Sanity check, it prevents gcc from trying to handle this case which
1955 		 * results in warnings for some optimized builds */
1956 		return (NULL);
1957 	}
1958 
1959 	/* Non-trivial directory component */
1960 
1961 	return(mem_strdupl(path, last_slash - path));
1962 }
1963 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1964 
1965 /* Test the function os_file_get_parent_dir. */
1966 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1967 test_os_file_get_parent_dir(
1968 	const char*	child_dir,
1969 	const char*	expected_dir)
1970 {
1971 	char* child = mem_strdup(child_dir);
1972 	char* expected = expected_dir == NULL ? NULL
1973 			 : mem_strdup(expected_dir);
1974 
1975 	/* os_file_get_parent_dir() assumes that separators are
1976 	converted to OS_PATH_SEPARATOR. */
1977 	os_normalize_path(child);
1978 	os_normalize_path(expected);
1979 
1980 	char* parent = os_file_get_parent_dir(child);
1981 
1982 	bool unexpected = (expected == NULL
1983 			  ? (parent != NULL)
1984 			  : (0 != strcmp(parent, expected)));
1985 	if (unexpected) {
1986 		ib::fatal() << "os_file_get_parent_dir('" << child
1987 			<< "') returned '" << parent
1988 			<< "', instead of '" << expected << "'.";
1989 	}
1990 	ut_free(parent);
1991 	ut_free(child);
1992 	ut_free(expected);
1993 }
1994 
1995 /* Test the function os_file_get_parent_dir. */
1996 void
unit_test_os_file_get_parent_dir()1997 unit_test_os_file_get_parent_dir()
1998 {
1999 	test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
2000 	test_os_file_get_parent_dir("/usr/", NULL);
2001 	test_os_file_get_parent_dir("//usr//", NULL);
2002 	test_os_file_get_parent_dir("usr", NULL);
2003 	test_os_file_get_parent_dir("usr//", NULL);
2004 	test_os_file_get_parent_dir("/", NULL);
2005 	test_os_file_get_parent_dir("//", NULL);
2006 	test_os_file_get_parent_dir(".", NULL);
2007 	test_os_file_get_parent_dir("..", NULL);
2008 # ifdef _WIN32
2009 	test_os_file_get_parent_dir("D:", NULL);
2010 	test_os_file_get_parent_dir("D:/", NULL);
2011 	test_os_file_get_parent_dir("D:\\", NULL);
2012 	test_os_file_get_parent_dir("D:/data", NULL);
2013 	test_os_file_get_parent_dir("D:/data/", NULL);
2014 	test_os_file_get_parent_dir("D:\\data\\", NULL);
2015 	test_os_file_get_parent_dir("D:///data/////", NULL);
2016 	test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
2017 	test_os_file_get_parent_dir("D:/data//a", "D:/data");
2018 	test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
2019 	test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
2020 	test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
2021 #endif  /* _WIN32 */
2022 }
2023 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
2024 
2025 
2026 /** Creates all missing subdirectories along the given path.
2027 @param[in]	path		Path name
2028 @return DB_SUCCESS if OK, otherwise error code. */
2029 dberr_t
os_file_create_subdirs_if_needed(const char * path)2030 os_file_create_subdirs_if_needed(
2031 	const char*	path)
2032 {
2033 	if (srv_read_only_mode) {
2034 
2035 		ib::error()
2036 			<< "read only mode set. Can't create "
2037 			<< "subdirectories '" << path << "'";
2038 
2039 		return(DB_READ_ONLY);
2040 
2041 	}
2042 
2043 	char*	subdir = os_file_get_parent_dir(path);
2044 
2045 	if (subdir == NULL) {
2046 		/* subdir is root or cwd, nothing to do */
2047 		return(DB_SUCCESS);
2048 	}
2049 
2050 	/* Test if subdir exists */
2051 	os_file_type_t	type;
2052 	bool	subdir_exists;
2053 	bool	success = os_file_status(subdir, &subdir_exists, &type);
2054 
2055 	if (success && !subdir_exists) {
2056 
2057 		/* Subdir does not exist, create it */
2058 		dberr_t	err = os_file_create_subdirs_if_needed(subdir);
2059 
2060 		if (err != DB_SUCCESS) {
2061 
2062 			ut_free(subdir);
2063 
2064 			return(err);
2065 		}
2066 
2067 		success = os_file_create_directory(subdir, false);
2068 	}
2069 
2070 	ut_free(subdir);
2071 
2072 	return(success ? DB_SUCCESS : DB_ERROR);
2073 }
2074 
2075 /** Allocate the buffer for IO on a transparently compressed table.
2076 @param[in]	type		IO flags
2077 @param[out]	buf		buffer to read or write
2078 @param[in,out]	n		number of bytes to read/write, starting from
2079 				offset
2080 @return pointer to allocated page, compressed data is written to the offset
2081 	that is aligned on the disk sector size */
2082 static
2083 Block*
os_file_compress_page(IORequest & type,void * & buf,ulint * n)2084 os_file_compress_page(
2085 	IORequest&	type,
2086 	void*&		buf,
2087 	ulint*		n)
2088 {
2089 	ut_ad(!type.is_log());
2090 	ut_ad(type.is_write());
2091 	ut_ad(type.is_compressed());
2092 
2093 	ulint	n_alloc = *n * 2;
2094 
2095 	ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
2096 	ut_a(type.compression_algorithm().m_type != Compression::LZ4
2097 	     || static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
2098 
2099 	Block*  block = os_alloc_block();
2100 
2101 	ulint	old_compressed_len;
2102 	ulint	compressed_len = *n;
2103 
2104 	old_compressed_len = mach_read_from_2(
2105 		reinterpret_cast<byte*>(buf)
2106 		+ FIL_PAGE_COMPRESS_SIZE_V1);
2107 
2108 	if (old_compressed_len > 0) {
2109 		old_compressed_len = ut_calc_align(
2110 			old_compressed_len + FIL_PAGE_DATA,
2111 			type.block_size());
2112 	} else {
2113 		old_compressed_len = *n;
2114 	}
2115 
2116 	byte*	compressed_page;
2117 
2118 	compressed_page = static_cast<byte*>(
2119 		ut_align(block->m_ptr, os_io_ptr_align));
2120 
2121 	byte*	buf_ptr;
2122 
2123 	buf_ptr = os_file_compress_page(
2124 		type.compression_algorithm(),
2125 		type.block_size(),
2126 		reinterpret_cast<byte*>(buf),
2127 		*n,
2128 		compressed_page,
2129 		&compressed_len);
2130 
2131 	if (buf_ptr != buf) {
2132 		/* Set new compressed size to uncompressed page. */
2133 		memcpy(reinterpret_cast<byte*>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
2134 		       buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
2135 
2136 		buf = buf_ptr;
2137 		*n = compressed_len;
2138 
2139 		if (compressed_len >= old_compressed_len) {
2140 
2141 			ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
2142 
2143 			type.clear_punch_hole();
2144 		}
2145 	}
2146 
2147 	return(block);
2148 }
2149 
2150 /** Encrypt a page content when write it to disk.
2151 @param[in]	type		IO flags
2152 @param[out]	buf		buffer to read or write
2153 @param[in,out]	n		number of bytes to read/write, starting from
2154 				offset
2155 @return pointer to the encrypted page */
2156 static
2157 Block*
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)2158 os_file_encrypt_page(
2159 	const IORequest&	type,
2160 	void*&			buf,
2161 	ulint*			n)
2162 {
2163 
2164 	byte*		encrypted_page;
2165 	ulint		encrypted_len = *n;
2166 	byte*		buf_ptr;
2167 	Encryption	encryption(type.encryption_algorithm());
2168 
2169 	ut_ad(!type.is_log());
2170 	ut_ad(type.is_write());
2171 	ut_ad(type.is_encrypted());
2172 
2173 	Block*  block = os_alloc_block();
2174 
2175 	encrypted_page = static_cast<byte*>(
2176 		ut_align(block->m_ptr, os_io_ptr_align));
2177 
2178 	buf_ptr = encryption.encrypt(type,
2179 				     reinterpret_cast<byte*>(buf), *n,
2180 				     encrypted_page, &encrypted_len);
2181 
2182 	bool	encrypted = buf_ptr != buf;
2183 
2184 	if (encrypted) {
2185 
2186 		buf = buf_ptr;
2187 		*n = encrypted_len;
2188 	}
2189 
2190 	return(block);
2191 }
2192 
2193 #ifndef _WIN32
2194 
2195 /** Do the read/write
2196 @param[in]	request	The IO context and type
2197 @return the number of bytes read/written or negative value on error */
2198 ssize_t
execute(const IORequest & request)2199 SyncFileIO::execute(const IORequest& request)
2200 {
2201 	ssize_t	n_bytes;
2202 
2203 	if (request.is_read()) {
2204 		n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2205 	} else {
2206 		ut_ad(request.is_write());
2207 		n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2208 	}
2209 
2210 	return(n_bytes);
2211 }
2212 
2213 /** Free storage space associated with a section of the file.
2214 @param[in]	fh		Open file handle
2215 @param[in]	off		Starting offset (SEEK_SET)
2216 @param[in]	len		Size of the hole
2217 @return DB_SUCCESS or error code */
2218 static
2219 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2220 os_file_punch_hole_posix(
2221 	os_file_t	fh,
2222 	os_offset_t	off,
2223 	os_offset_t	len)
2224 {
2225 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2226 	const int	mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2227 
2228 	int             ret = fallocate(fh, mode, off, len);
2229 
2230 	if (ret == 0) {
2231 		return(DB_SUCCESS);
2232 	}
2233 
2234 	ut_a(ret == -1);
2235 
2236 	if (errno == ENOTSUP) {
2237 		return(DB_IO_NO_PUNCH_HOLE);
2238 	}
2239 
2240 	ib::warn()
2241 		<< "fallocate(" << fh
2242 		<<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2243 		<< off << ", " << len << ") returned errno: "
2244 		<<  errno;
2245 
2246 	return(DB_IO_ERROR);
2247 
2248 #elif defined(UNIV_SOLARIS)
2249 
2250 	// Use F_FREESP
2251 
2252 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2253 
2254 	return(DB_IO_NO_PUNCH_HOLE);
2255 }
2256 
2257 #if defined(LINUX_NATIVE_AIO)
2258 
/** Linux native AIO handler. An instance is bound to one global segment;
the constructor resolves it to the owning AIO array, the local segment
inside that array and the number of slots per segment. */
class LinuxAIOHandler {
public:
	/** Resolve the global segment to its array and local segment.
	@param[in] global_segment	The global segment; must not be
					ULINT_UNDEFINED */
	LinuxAIOHandler(ulint global_segment)
		:
		m_global_segment(global_segment)
	{
		/* Should never be doing Sync IO here. */
		ut_a(m_global_segment != ULINT_UNDEFINED);

		/* Find the array and the local segment. */

		m_segment = AIO::get_array_and_local_segment(
			&m_array, m_global_segment);

		m_n_slots = m_array->slots_per_segment();
	}

	/** Destructor */
	~LinuxAIOHandler()
	{
		// No op
	}

	/**
	Process a Linux AIO request
	@param[out]	m1		first message passed with the AIO
					request
	@param[out]	m2		second message passed with the AIO
					request; note that in case the
					AIO operation failed, these output
					parameters are valid and can be used to
					restart the operation.
	@param[out]	request		IO context
	@return DB_SUCCESS or error code */
	dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);

private:
	/** Resubmit an IO request that was only partially successful
	@param[in,out]	slot		Request to resubmit
	@return DB_SUCCESS or DB_IO_PARTIAL_FAILED if the IO resubmit
		request failed */
	dberr_t	resubmit(Slot* slot);

	/** Check if the AIO succeeded
	@param[in,out]	slot		The slot to check
	@return DB_SUCCESS, DB_FAIL if the operation should be retried or
		DB_IO_ERROR on all other errors */
	dberr_t	check_state(Slot* slot);

	/** @return true if srv_shutdown_state is SRV_SHUTDOWN_EXIT_THREADS
	and buf_page_cleaner_is_active is false */
	bool is_shutdown() const
	{
		return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
		       && !buf_page_cleaner_is_active);
	}

	/** Look for a reserved slot in the local segment whose IO has
	completed. If a slot is returned the m_array mutex is still held;
	if no slot was found the m_array mutex has been released.
	@param[out]	n_pending	The number of pending IOs
	@return NULL or a slot that has completed IO */
	Slot* find_completed_slot(ulint* n_pending);

	/** This is called from within the IO-thread. If there are no completed
	IO requests in the slot array, the thread calls this function to
	collect more requests from the Linux kernel.
	The IO-thread waits on io_getevents(), which is a blocking call, with
	a timeout value. Unless the system is very heavy loaded, keeping the
	IO-thread very busy, the io-thread will spend most of its time waiting
	in this function.
	The IO-thread also exits in this function. It checks server status at
	each wakeup and that is why we use timed wait in io_getevents(). */
	void collect();

private:
	/** Slot array */
	AIO*			m_array;

	/** Number of slots in the local segment */
	ulint			m_n_slots;

	/** The local segment to check */
	ulint			m_segment;

	/** The global segment */
	ulint			m_global_segment;
};
2344 
2345 /** Resubmit an IO request that was only partially successful
2346 @param[in,out]	slot		Request to resubmit
2347 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2348 dberr_t
resubmit(Slot * slot)2349 LinuxAIOHandler::resubmit(Slot* slot)
2350 {
2351 #ifdef UNIV_DEBUG
2352 	/* Bytes already read/written out */
2353 	ulint	n_bytes = slot->ptr - slot->buf;
2354 
2355 	ut_ad(m_array->is_mutex_owned());
2356 
2357 	ut_ad(n_bytes < slot->original_len);
2358 	ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2359 	/* Partial read or write scenario */
2360 	ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2361 #endif /* UNIV_DEBUG */
2362 
2363 	slot->len -= slot->n_bytes;
2364 	slot->ptr += slot->n_bytes;
2365 	slot->offset += slot->n_bytes;
2366 
2367 	/* Resetting the bytes read/written */
2368 	slot->n_bytes = 0;
2369 	slot->io_already_done = false;
2370 
2371 	/* make sure that slot->offset fits in off_t */
2372 	ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2373 
2374 	struct iocb*	iocb = &slot->control;
2375 	if (slot->type.is_read()) {
2376 		io_prep_pread(
2377 			iocb,
2378 			slot->file.m_file,
2379 			slot->ptr,
2380 			slot->len,
2381 			slot->offset);
2382 
2383 	} else {
2384 
2385 		ut_a(slot->type.is_write());
2386 
2387 		io_prep_pwrite(
2388 			iocb,
2389 			slot->file.m_file,
2390 			slot->ptr,
2391 			slot->len,
2392 			slot->offset);
2393 	}
2394 
2395 	iocb->data = slot;
2396 
2397 	/* Resubmit an I/O request */
2398 	int	ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2399 
2400 	if (ret < -1)  {
2401 		errno = -ret;
2402 	}
2403 
2404 	return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2405 }
2406 
2407 /** Check if the AIO succeeded
2408 @param[in,out]	slot		The slot to check
2409 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2410 	DB_IO_ERROR on all other errors */
2411 dberr_t
check_state(Slot * slot)2412 LinuxAIOHandler::check_state(Slot* slot)
2413 {
2414 	ut_ad(m_array->is_mutex_owned());
2415 
2416 	/* Note that it may be that there is more then one completed
2417 	IO requests. We process them one at a time. We may have a case
2418 	here to improve the performance slightly by dealing with all
2419 	requests in one sweep. */
2420 
2421 	srv_set_io_thread_op_info(
2422 		m_global_segment, "processing completed aio requests");
2423 
2424 	ut_ad(slot->io_already_done);
2425 
2426 	dberr_t	err;
2427 
2428 	if (slot->ret == 0) {
2429 
2430 		err = AIOHandler::post_io_processing(slot);
2431 
2432 	} else {
2433 		errno = -slot->ret;
2434 
2435 		/* os_file_handle_error does tell us if we should retry
2436 		this IO. As it stands now, we don't do this retry when
2437 		reaping requests from a different context than
2438 		the dispatcher. This non-retry logic is the same for
2439 		Windows and Linux native AIO.
2440 		We should probably look into this to transparently
2441 		re-submit the IO. */
2442 		os_file_handle_error(slot->name, "Linux aio");
2443 
2444 		err = DB_IO_ERROR;
2445 	}
2446 
2447 	return(err);
2448 }
2449 
2450 /** If no slot was found then the m_array->m_mutex will be released.
2451 @param[out]	n_pending		The number of pending IOs
2452 @return NULL or a slot that has completed IO */
2453 Slot*
find_completed_slot(ulint * n_pending)2454 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
2455 {
2456 	ulint	offset = m_n_slots * m_segment;
2457 
2458 	*n_pending = 0;
2459 
2460 	m_array->acquire();
2461 
2462 	Slot*	slot = m_array->at(offset);
2463 
2464 	for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2465 
2466 		if (slot->is_reserved) {
2467 
2468 			++*n_pending;
2469 
2470 			if (slot->io_already_done) {
2471 
2472 				/* Something for us to work on.
2473 				Note: We don't release the mutex. */
2474 				return(slot);
2475 			}
2476 		}
2477 	}
2478 
2479 	m_array->release();
2480 
2481 	return(NULL);
2482 }
2483 
2484 /** This function is only used in Linux native asynchronous i/o. This is
2485 called from within the io-thread. If there are no completed IO requests
2486 in the slot array, the thread calls this function to collect more
2487 requests from the kernel.
2488 The io-thread waits on io_getevents(), which is a blocking call, with
2489 a timeout value. Unless the system is very heavy loaded, keeping the
2490 io-thread very busy, the io-thread will spend most of its time waiting
2491 in this function.
2492 The io-thread also exits in this function. It checks server status at
2493 each wakeup and that is why we use timed wait in io_getevents(). */
2494 void
collect()2495 LinuxAIOHandler::collect()
2496 {
2497 	ut_ad(m_n_slots > 0);
2498 	ut_ad(m_array != NULL);
2499 	ut_ad(m_segment < m_array->get_n_segments());
2500 
2501 	/* Which io_context we are going to use. */
2502 	io_context*	io_ctx = m_array->io_ctx(m_segment);
2503 
2504 	/* Starting point of the m_segment we will be working on. */
2505 	ulint	start_pos = m_segment * m_n_slots;
2506 
2507 	/* End point. */
2508 	ulint	end_pos = start_pos + m_n_slots;
2509 
2510 	for (;;) {
2511 		struct io_event*	events;
2512 
2513 		/* Which part of event array we are going to work on. */
2514 		events = m_array->io_events(m_segment * m_n_slots);
2515 
2516 		/* Initialize the events. */
2517 		memset(events, 0, sizeof(*events) * m_n_slots);
2518 
2519 		/* The timeout value is arbitrary. We probably need
2520 		to experiment with it a little. */
2521 		struct timespec		timeout;
2522 
2523 		timeout.tv_sec = 0;
2524 		timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
2525 
2526 		int	ret;
2527 
2528 		ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
2529 
2530 		for (int i = 0; i < ret; ++i) {
2531 
2532 			struct iocb*	iocb;
2533 
2534 			iocb = reinterpret_cast<struct iocb*>(events[i].obj);
2535 			ut_a(iocb != NULL);
2536 
2537 			Slot*	slot = reinterpret_cast<Slot*>(iocb->data);
2538 
2539 			/* Some sanity checks. */
2540 			ut_a(slot != NULL);
2541 			ut_a(slot->is_reserved);
2542 
2543 			/* We are not scribbling previous segment. */
2544 			ut_a(slot->pos >= start_pos);
2545 
2546 			/* We have not overstepped to next segment. */
2547 			ut_a(slot->pos < end_pos);
2548 
2549 			/* We never compress/decompress the first page */
2550 
2551 			if (slot->offset > 0
2552 			    && !slot->skip_punch_hole
2553 			    && slot->type.is_compression_enabled()
2554 			    && !slot->type.is_log()
2555 			    && slot->type.is_write()
2556 			    && slot->type.is_compressed()
2557 			    && slot->type.punch_hole()) {
2558 
2559 				slot->err = AIOHandler::io_complete(slot);
2560 			} else {
2561 				slot->err = DB_SUCCESS;
2562 			}
2563 
2564 			/* Mark this request as completed. The error handling
2565 			will be done in the calling function. */
2566 			m_array->acquire();
2567 
2568 			/* events[i].res2 should always be ZERO */
2569 			ut_ad(events[i].res2 == 0);
2570 			slot->io_already_done = true;
2571 
2572 			/*Even though events[i].res is an unsigned number
2573 			in libaio, it is used to return a negative value
2574 			(negated errno value) to indicate error and a positive
2575 			value to indicate number of bytes read or written. */
2576 
2577 			if (events[i].res > slot->len) {
2578 				/* failure */
2579 				slot->n_bytes = 0;
2580 				slot->ret = events[i].res;
2581 			} else {
2582 				/* success */
2583 				slot->n_bytes = events[i].res;
2584 				slot->ret = 0;
2585 			}
2586 			m_array->release();
2587 		}
2588 
2589 		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2590 		    || !buf_page_cleaner_is_active
2591 		    || ret > 0) {
2592 
2593 			break;
2594 		}
2595 
2596 		/* This error handling is for any error in collecting the
2597 		IO requests. The errors, if any, for any particular IO
2598 		request are simply passed on to the calling routine. */
2599 
2600 		switch (ret) {
2601 		case -EAGAIN:
2602 			/* Not enough resources! Try again. */
2603 
2604 		case -EINTR:
2605 			/* Interrupted! The behaviour in case of an interrupt.
2606 			If we have some completed IOs available then the
2607 			return code will be the number of IOs. We get EINTR
2608 			only if there are no completed IOs and we have been
2609 			interrupted. */
2610 
2611 		case 0:
2612 			/* No pending request! Go back and check again. */
2613 
2614 			continue;
2615 		}
2616 
2617 		/* All other errors should cause a trap for now. */
2618 		ib::fatal()
2619 			<< "Unexpected ret_code[" << ret
2620 			<< "] from io_getevents()!";
2621 
2622 		break;
2623 	}
2624 }
2625 
2626 /** Process a Linux AIO request
2627 @param[out]	m1		the messages passed with the
2628 @param[out]	m2		AIO request; note that in case the
2629 				AIO operation failed, these output
2630 				parameters are valid and can be used to
2631 				restart the operation.
2632 @param[out]	request		IO context
2633 @return DB_SUCCESS or error code */
2634 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)2635 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
2636 {
2637 	dberr_t		err;
2638 	Slot*		slot;
2639 
2640 	/* Loop until we have found a completed request. */
2641 	for (;;) {
2642 
2643 		ulint	n_pending;
2644 
2645 		slot = find_completed_slot(&n_pending);
2646 
2647 		if (slot != NULL) {
2648 
2649 			ut_ad(m_array->is_mutex_owned());
2650 
2651 			err = check_state(slot);
2652 
2653 			/* DB_FAIL is not a hard error, we should retry */
2654 			if (err != DB_FAIL) {
2655 				break;
2656 			}
2657 
2658 			/* Partial IO, resubmit request for
2659 			remaining bytes to read/write */
2660 			err = resubmit(slot);
2661 
2662 			if (err != DB_SUCCESS) {
2663 				break;
2664 			}
2665 
2666 			m_array->release();
2667 
2668 		} else if (is_shutdown() && n_pending == 0) {
2669 
2670 			/* There is no completed request. If there is
2671 			no pending request at all, and the system is
2672 			being shut down, exit. */
2673 
2674 			*m1 = NULL;
2675 			*m2 = NULL;
2676 
2677 			return(DB_SUCCESS);
2678 
2679 		} else {
2680 
2681 			/* Wait for some request. Note that we return
2682 			from wait if we have found a request. */
2683 
2684 			srv_set_io_thread_op_info(
2685 				m_global_segment,
2686 				"waiting for completed aio requests");
2687 
2688 			collect();
2689 		}
2690 	}
2691 
2692 	if (err == DB_IO_PARTIAL_FAILED) {
2693 		/* Aborting in case of submit failure */
2694 		ib::fatal()
2695 			<< "Native Linux AIO interface. "
2696 			"io_submit() call failed when "
2697 			"resubmitting a partial I/O "
2698 			"request on the file " << slot->name
2699 			<< ".";
2700 	}
2701 
2702 	*m1 = slot->m1;
2703 	*m2 = slot->m2;
2704 
2705 	*request = slot->type;
2706 
2707 	m_array->release(slot);
2708 
2709 	m_array->release();
2710 
2711 	return(err);
2712 }
2713 
2714 /** This function is only used in Linux native asynchronous i/o.
2715 Waits for an aio operation to complete. This function is used to wait for
2716 the completed requests. The aio array of pending requests is divided
2717 into segments. The thread specifies which segment or slot it wants to wait
2718 for. NOTE: this function will also take care of freeing the aio slot,
2719 therefore no other thread is allowed to do the freeing!
2720 
2721 @param[in]	global_seg	segment number in the aio array
2722 				to wait for; segment 0 is the ibuf
2723 				i/o thread, segment 1 is log i/o thread,
2724 				then follow the non-ibuf read threads,
2725 				and the last are the non-ibuf write
2726 				threads.
2727 @param[out]	m1		the messages passed with the
2728 @param[out]	m2			AIO request; note that in case the
2729 				AIO operation failed, these output
2730 				parameters are valid and can be used to
2731 				restart the operation.
2732 @param[out]xi	 request	IO context
2733 @return DB_SUCCESS if the IO was successful */
2734 static
2735 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2736 os_aio_linux_handler(
2737 	ulint		global_segment,
2738 	fil_node_t**	m1,
2739 	void**		m2,
2740 	IORequest*	request)
2741 {
2742 	LinuxAIOHandler	handler(global_segment);
2743 
2744 	dberr_t	err = handler.poll(m1, m2, request);
2745 
2746 	if (err == DB_IO_NO_PUNCH_HOLE) {
2747 		fil_no_punch_hole(*m1);
2748 		err = DB_SUCCESS;
2749 	}
2750 
2751 	return(err);
2752 }
2753 
2754 /** Dispatch an AIO request to the kernel.
2755 @param[in,out]	slot		an already reserved slot
2756 @return true on success. */
2757 bool
linux_dispatch(Slot * slot)2758 AIO::linux_dispatch(Slot* slot)
2759 {
2760 	ut_a(slot->is_reserved);
2761 	ut_ad(slot->type.validate());
2762 
2763 	/* Find out what we are going to work with.
2764 	The iocb struct is directly in the slot.
2765 	The io_context is one per segment. */
2766 
2767 	ulint		io_ctx_index;
2768 	struct iocb*	iocb = &slot->control;
2769 
2770 	io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2771 
2772 	int	ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
2773 
2774 	/* io_submit() returns number of successfully queued requests
2775 	or -errno. */
2776 
2777 	if (ret != 1) {
2778 		errno = -ret;
2779 	}
2780 
2781 	return(ret == 1);
2782 }
2783 
2784 /** Creates an io_context for native linux AIO.
2785 @param[in]	max_events	number of events
2786 @param[out]	io_ctx		io_ctx to initialize.
2787 @return true on success. */
2788 bool
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)2789 AIO::linux_create_io_ctx(
2790 	ulint		max_events,
2791 	io_context_t*	io_ctx)
2792 {
2793 	ssize_t		n_retries = 0;
2794 
2795 	for (;;) {
2796 
2797 		memset(io_ctx, 0x0, sizeof(*io_ctx));
2798 
2799 		/* Initialize the io_ctx. Tell it how many pending
2800 		IO requests this context will handle. */
2801 
2802 		int	ret = io_setup(max_events, io_ctx);
2803 
2804 		if (ret == 0) {
2805 			/* Success. Return now. */
2806 			return(true);
2807 		}
2808 
2809 		/* If we hit EAGAIN we'll make a few attempts before failing. */
2810 
2811 		switch (ret) {
2812 		case -EAGAIN:
2813 			if (n_retries == 0) {
2814 				/* First time around. */
2815 				ib::warn()
2816 					<< "io_setup() failed with EAGAIN."
2817 					" Will make "
2818 					<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2819 					<< " attempts before giving up.";
2820 			}
2821 
2822 			if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2823 
2824 				++n_retries;
2825 
2826 				ib::warn()
2827 					<< "io_setup() attempt "
2828 					<< n_retries << ".";
2829 
2830 				os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2831 
2832 				continue;
2833 			}
2834 
2835 			/* Have tried enough. Better call it a day. */
2836 			ib::error()
2837 				<< "io_setup() failed with EAGAIN after "
2838 				<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2839 				<< " attempts.";
2840 			break;
2841 
2842 		case -ENOSYS:
2843 			ib::error()
2844 				<< "Linux Native AIO interface"
2845 				" is not supported on this platform. Please"
2846 				" check your OS documentation and install"
2847 				" appropriate binary of InnoDB.";
2848 
2849 			break;
2850 
2851 		default:
2852 			ib::error()
2853 				<< "Linux Native AIO setup"
2854 				<< " returned following error["
2855 				<< ret << "]";
2856 			break;
2857 		}
2858 
2859 		ib::info()
2860 			<< "You can disable Linux Native AIO by"
2861 			" setting innodb_use_native_aio = 0 in my.cnf";
2862 
2863 		break;
2864 	}
2865 
2866 	return(false);
2867 }
2868 
2869 /** Checks if the system supports native linux aio. On some kernel
2870 versions where native aio is supported it won't work on tmpfs. In such
2871 cases we can't use native aio as it is not possible to mix simulated
2872 and native aio.
2873 @return: true if supported, false otherwise. */
2874 bool
is_linux_native_aio_supported()2875 AIO::is_linux_native_aio_supported()
2876 {
2877 	int		fd;
2878 	io_context_t	io_ctx;
2879 	char		name[1000];
2880 
2881 	if (!linux_create_io_ctx(1, &io_ctx)) {
2882 
2883 		/* The platform does not support native aio. */
2884 
2885 		return(false);
2886 
2887 	} else if (!srv_read_only_mode) {
2888 
2889 		/* Now check if tmpdir supports native aio ops. */
2890 		fd = innobase_mysql_tmpfile(NULL);
2891 
2892 		if (fd < 0) {
2893 			ib::warn()
2894 				<< "Unable to create temp file to check"
2895 				" native AIO support.";
2896 
2897 			return(false);
2898 		}
2899 	} else {
2900 
2901 		os_normalize_path(srv_log_group_home_dir);
2902 
2903 		ulint	dirnamelen = strlen(srv_log_group_home_dir);
2904 
2905 		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2906 
2907 		memcpy(name, srv_log_group_home_dir, dirnamelen);
2908 
2909 		/* Add a path separator if needed. */
2910 		if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2911 
2912 			name[dirnamelen++] = OS_PATH_SEPARATOR;
2913 		}
2914 
2915 		strcpy(name + dirnamelen, "ib_logfile0");
2916 
2917 		fd = ::open(name, O_RDONLY);
2918 
2919 		if (fd == -1) {
2920 
2921 			ib::warn()
2922 				<< "Unable to open"
2923 				<< " \"" << name << "\" to check native"
2924 				<< " AIO read support.";
2925 
2926 			return(false);
2927 		}
2928 	}
2929 
2930 	struct io_event	io_event;
2931 
2932 	memset(&io_event, 0x0, sizeof(io_event));
2933 
2934 	byte*	buf = static_cast<byte*>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
2935 	byte*	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
2936 
2937 	struct iocb	iocb;
2938 
2939 	/* Suppress valgrind warning. */
2940 	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
2941 	memset(&iocb, 0x0, sizeof(iocb));
2942 
2943 	struct iocb*	p_iocb = &iocb;
2944 
2945 	if (!srv_read_only_mode) {
2946 
2947 		io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
2948 
2949 	} else {
2950 		ut_a(UNIV_PAGE_SIZE >= 512);
2951 		io_prep_pread(p_iocb, fd, ptr, 512, 0);
2952 	}
2953 
2954 	int	err = io_submit(io_ctx, 1, &p_iocb);
2955 
2956 	if (err >= 1) {
2957 		/* Now collect the submitted IO request. */
2958 		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2959 	}
2960 
2961 	ut_free(buf);
2962 	close(fd);
2963 
2964 	switch (err) {
2965 	case 1:
2966 		return(true);
2967 
2968 	case -EINVAL:
2969 	case -ENOSYS:
2970 		ib::error()
2971 			<< "Linux Native AIO not supported. You can either"
2972 			" move "
2973 			<< (srv_read_only_mode ? name : "tmpdir")
2974 			<< " to a file system that supports native"
2975 			" AIO or you can set innodb_use_native_aio to"
2976 			" FALSE to avoid this message.";
2977 
2978 		/* fall through. */
2979 	default:
2980 		ib::error()
2981 			<< "Linux Native AIO check on "
2982 			<< (srv_read_only_mode ? name : "tmpdir")
2983 			<< "returned error[" << -err << "]";
2984 	}
2985 
2986 	return(false);
2987 }
2988 
2989 #endif /* LINUX_NATIVE_AIO */
2990 
2991 /** Retrieves the last error number if an error occurs in a file io function.
2992 The number should be retrieved before any other OS calls (because they may
2993 overwrite the error number). If the number is not known to this program,
2994 the OS error number + 100 is returned.
2995 @param[in]	report_all_errors	true if we want an error message
2996 					printed of all errors
2997 @param[in]	on_error_silent		true then don't print any diagnostic
2998 					to the log
2999 @return error number, or OS error number + 100 */
3000 static
3001 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3002 os_file_get_last_error_low(
3003 	bool	report_all_errors,
3004 	bool	on_error_silent)
3005 {
3006 	int	err = errno;
3007 
3008 	if (err == 0) {
3009 		return(0);
3010 	}
3011 
3012 	if (report_all_errors
3013 	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
3014 
3015 		ib::error()
3016 			<< "Operating system error number "
3017 			<< err
3018 			<< " in a file operation.";
3019 
3020 		if (err == ENOENT) {
3021 
3022 			ib::error()
3023 				<< "The error means the system"
3024 				" cannot find the path specified.";
3025 
3026 			if (srv_is_being_started) {
3027 
3028 				ib::error()
3029 					<< "If you are installing InnoDB,"
3030 					" remember that you must create"
3031 					" directories yourself, InnoDB"
3032 					" does not create them.";
3033 			}
3034 		} else if (err == EACCES) {
3035 
3036 			ib::error()
3037 				<< "The error means mysqld does not have"
3038 				" the access rights to the directory.";
3039 
3040 		} else {
3041 			if (strerror(err) != NULL) {
3042 
3043 				ib::error()
3044 					<< "Error number " << err << " means '"
3045 					<< strerror(err) << "'";
3046 			}
3047 
3048 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3049 		}
3050 	}
3051 
3052 	switch (err) {
3053 	case ENOSPC:
3054 		return(OS_FILE_DISK_FULL);
3055 	case ENOENT:
3056 		return(OS_FILE_NOT_FOUND);
3057 	case EEXIST:
3058 		return(OS_FILE_ALREADY_EXISTS);
3059 	case EXDEV:
3060 	case ENOTDIR:
3061 	case EISDIR:
3062 		return(OS_FILE_PATH_ERROR);
3063 	case EAGAIN:
3064 		if (srv_use_native_aio) {
3065 			return(OS_FILE_AIO_RESOURCES_RESERVED);
3066 		}
3067 		break;
3068 	case EINTR:
3069 		if (srv_use_native_aio) {
3070 			return(OS_FILE_AIO_INTERRUPTED);
3071 		}
3072 		break;
3073 	case EACCES:
3074 		return(OS_FILE_ACCESS_VIOLATION);
3075 	}
3076 	return(OS_FILE_ERROR_MAX + err);
3077 }
3078 
3079 /** Wrapper to fsync(2) that retries the call on some errors.
3080 Returns the value 0 if successful; otherwise the value -1 is returned and
3081 the global variable errno is set to indicate the error.
3082 @param[in]	file		open file handle
3083 @return 0 if success, -1 otherwise */
3084 static
3085 int
os_file_fsync_posix(os_file_t file)3086 os_file_fsync_posix(
3087 	os_file_t	file)
3088 {
3089 	ulint		failures = 0;
3090 
3091 	for (;;) {
3092 
3093 		++os_n_fsyncs;
3094 
3095 		int	ret = fsync(file);
3096 
3097 		if (ret == 0) {
3098 			return(ret);
3099 		}
3100 
3101 		switch(errno) {
3102 		case ENOLCK:
3103 
3104 			++failures;
3105 			ut_a(failures < 1000);
3106 
3107 			if (!(failures % 100)) {
3108 
3109 				ib::warn()
3110 					<< "fsync(): "
3111 					<< "No locks available; retrying";
3112 			}
3113 
3114 			/* 0.2 sec */
3115 			os_thread_sleep(200000);
3116 			break;
3117 
3118 		case EIO:
3119 
3120                         ib::fatal()
3121 				<< "fsync() returned EIO, aborting.";
3122 			break;
3123 
3124 		case EINTR:
3125 
3126 			++failures;
3127 			ut_a(failures < 2000);
3128 			break;
3129 
3130 		default:
3131 			ut_error;
3132 			break;
3133 		}
3134 	}
3135 
3136 	ut_error;
3137 
3138 	return(-1);
3139 }
3140 
3141 /** Check the existence and type of the given file.
3142 @param[in]	path		path name of file
3143 @param[out]	exists		true if the file exists
3144 @param[out]	type		Type of the file, if it exists
3145 @return true if call succeeded */
3146 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)3147 os_file_status_posix(
3148 	const char*	path,
3149 	bool*		exists,
3150 	os_file_type_t* type)
3151 {
3152 	struct stat	statinfo;
3153 
3154 	int	ret = stat(path, &statinfo);
3155 
3156 	*exists = !ret;
3157 
3158 	if (!ret) {
3159 		/* file exists, everything OK */
3160 
3161 	} else if (errno == ENOENT || errno == ENOTDIR
3162 		   || errno == ENAMETOOLONG) {
3163 		/* file does not exist */
3164 		return(true);
3165 
3166 	} else {
3167 		/* file exists, but stat call failed */
3168 		os_file_handle_error_no_exit(path, "stat", false);
3169 		return(false);
3170 	}
3171 
3172 	if (S_ISDIR(statinfo.st_mode)) {
3173 		*type = OS_FILE_TYPE_DIR;
3174 
3175 	} else if (S_ISLNK(statinfo.st_mode)) {
3176 		*type = OS_FILE_TYPE_LINK;
3177 
3178 	} else if (S_ISREG(statinfo.st_mode)) {
3179 		*type = OS_FILE_TYPE_FILE;
3180 
3181 	} else {
3182 		*type = OS_FILE_TYPE_UNKNOWN;
3183 	}
3184 
3185 	return(true);
3186 }
3187 
3188 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3189 function!
3190 Flushes the write buffers of a given file to the disk.
3191 @param[in]	file		handle to a file
3192 @return true if success */
3193 bool
os_file_flush_func(os_file_t file)3194 os_file_flush_func(
3195 	os_file_t	file)
3196 {
3197 	int	ret;
3198 
3199 	ret = os_file_fsync_posix(file);
3200 
3201 	if (ret == 0) {
3202 		return(true);
3203 	}
3204 
3205 	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
3206 	we choose to ignore that error if we are using raw disks */
3207 
3208 	if (srv_start_raw_disk_in_use && errno == EINVAL) {
3209 
3210 		return(true);
3211 	}
3212 
3213 	ib::error() << "The OS said file flush did not succeed";
3214 
3215 	os_file_handle_error(NULL, "flush");
3216 
3217 	/* It is a fatal error if a file flush does not succeed, because then
3218 	the database can get corrupt on disk */
3219 	ut_error;
3220 
3221 	return(false);
3222 }
3223 
3224 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3225 this function!
3226 A simple function to open or create a file.
3227 @param[in]	name		name of the file or path as a null-terminated
3228 				string
3229 @param[in]	create_mode	create mode
3230 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3231 @param[in]	read_only	if true, read only checks are enforced
3232 @param[out]	success		true if succeed, false if error
3233 @return handle to the file, not defined if error, error number
3234 	can be retrieved with os_file_get_last_error */
3235 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3236 os_file_create_simple_func(
3237 	const char*	name,
3238 	ulint		create_mode,
3239 	ulint		access_type,
3240 	bool		read_only,
3241 	bool*		success)
3242 {
3243 	pfs_os_file_t	file;
3244 
3245 	*success = false;
3246 
3247 	int		create_flag;
3248 
3249 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3250 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3251 
3252 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
3253 		WAIT_ALLOW_WRITES();
3254 	if (create_mode == OS_FILE_OPEN) {
3255 
3256 		if (access_type == OS_FILE_READ_ONLY) {
3257 
3258 			create_flag = O_RDONLY;
3259 
3260 		} else if (read_only) {
3261 
3262 			create_flag = O_RDONLY;
3263 
3264 		} else {
3265 			create_flag = O_RDWR;
3266 		}
3267 
3268 	} else if (read_only) {
3269 
3270 		create_flag = O_RDONLY;
3271 
3272 	} else if (create_mode == OS_FILE_CREATE) {
3273 
3274 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3275 
3276 	} else if (create_mode == OS_FILE_CREATE_PATH) {
3277 
3278 		/* Create subdirs along the path if needed. */
3279 
3280 		*success = os_file_create_subdirs_if_needed(name);
3281 
3282 		if (!*success) {
3283 
3284 			ib::error()
3285 				<< "Unable to create subdirectories '"
3286 				<< name << "'";
3287 
3288 			file.m_file = OS_FILE_CLOSED;
3289 			return(file);
3290 		}
3291 
3292 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3293 		create_mode = OS_FILE_CREATE;
3294 	} else {
3295 
3296 		ib::error()
3297 			<< "Unknown file create mode ("
3298 			<< create_mode
3299 			<< " for file '" << name << "'";
3300 
3301 		file.m_file = OS_FILE_CLOSED;
3302 		return(file);
3303 	}
3304 
3305 	bool	retry;
3306 
3307 	do {
3308 		file.m_file = ::open(name, create_flag, os_innodb_umask);
3309 
3310 		if (file.m_file == -1) {
3311 			*success = false;
3312 
3313 			retry = os_file_handle_error(
3314 				name,
3315 				create_mode == OS_FILE_OPEN
3316 				? "open" : "create");
3317 		} else {
3318 			*success = true;
3319 			retry = false;
3320 		}
3321 
3322 	} while (retry);
3323 
3324 #ifdef USE_FILE_LOCK
3325 	if (!read_only
3326 	    && *success
3327 	    && access_type == OS_FILE_READ_WRITE
3328 	    && os_file_lock(file.m_file, name)) {
3329 
3330 		*success = false;
3331 		close(file.m_file);
3332 		file.m_file = -1;
3333 	}
3334 #endif /* USE_FILE_LOCK */
3335 
3336 	return(file);
3337 }
3338 
3339 /** This function attempts to create a directory named pathname. The new
3340 directory gets default permissions. On Unix the permissions are
3341 (0770 & ~umask). If the directory exists already, nothing is done and
3342 the call succeeds, unless the fail_if_exists arguments is true.
3343 If another error occurs, such as a permission error, this does not crash,
3344 but reports the error and returns false.
3345 @param[in]	pathname	directory name as null-terminated string
3346 @param[in]	fail_if_exists	if true, pre-existing directory is treated as
3347 				an error.
3348 @return true if call succeeds, false on error */
3349 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3350 os_file_create_directory(
3351 	const char*	pathname,
3352 	bool		fail_if_exists)
3353 {
3354 	WAIT_ALLOW_WRITES();
3355 	int	rcode = mkdir(pathname, 0770);
3356 
3357 	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3358 		/* failure */
3359 		os_file_handle_error_no_exit(pathname, "mkdir", false);
3360 
3361 		return(false);
3362 	}
3363 
3364 	return(true);
3365 }
3366 
3367 /**
3368 The os_file_opendir() function opens a directory stream corresponding to the
3369 directory named by the dirname argument. The directory stream is positioned
3370 at the first entry. In both Unix and Windows we automatically skip the '.'
3371 and '..' items at the start of the directory listing.
3372 @param[in]	dirname		directory name; it must not contain a trailing
3373 				'\' or '/'
3374 @param[in]	is_fatal	true if we should treat an error as a fatal
3375 				error; if we try to open symlinks then we do
3376 				not wish a fatal error if it happens not to be
3377 				a directory
3378 @return directory stream, NULL if error */
3379 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3380 os_file_opendir(
3381 	const char*	dirname,
3382 	bool		error_is_fatal)
3383 {
3384 	os_file_dir_t		dir;
3385 	dir = opendir(dirname);
3386 
3387 	if (dir == NULL && error_is_fatal) {
3388 		os_file_handle_error(dirname, "opendir");
3389 	}
3390 
3391 	return(dir);
3392 }
3393 
3394 /** Closes a directory stream.
3395 @param[in]	dir		directory stream
3396 @return 0 if success, -1 if failure */
3397 int
os_file_closedir(os_file_dir_t dir)3398 os_file_closedir(
3399 	os_file_dir_t	dir)
3400 {
3401 	int	ret = closedir(dir);
3402 
3403 	if (ret != 0) {
3404 		os_file_handle_error_no_exit(NULL, "closedir", false);
3405 	}
3406 
3407 	return(ret);
3408 }
3409 
3410 /** This function returns information of the next file in the directory. We jump
3411 over the '.' and '..' entries in the directory.
3412 @param[in]	dirname		directory name or path
3413 @param[in]	dir		directory stream
3414 @param[out]	info		buffer where the info is returned
3415 @return 0 if ok, -1 if error, 1 if at the end of the directory */
3416 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)3417 os_file_readdir_next_file(
3418 	const char*	dirname,
3419 	os_file_dir_t	dir,
3420 	os_file_stat_t*	info)
3421 {
3422 	struct dirent*	ent;
3423 	char*		full_path;
3424 	int		ret;
3425 	struct stat	statinfo;
3426 
3427 #ifdef HAVE_READDIR_R
3428 	char		dirent_buf[sizeof(struct dirent)
3429 				   + _POSIX_PATH_MAX + 100];
3430 	/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
3431 	the max file name len; but in most standards, the
3432 	length is NAME_MAX; we add 100 to be even safer */
3433 #endif /* HAVE_READDIR_R */
3434 
3435 next_file:
3436 
3437 #ifdef HAVE_READDIR_R
3438 	ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
3439 
3440 	if (ret != 0) {
3441 
3442 		ib::error()
3443 			<< "Cannot read directory " << dirname
3444 			<< " error: " << ret;
3445 
3446 		return(-1);
3447 	}
3448 
3449 	if (ent == NULL) {
3450 		/* End of directory */
3451 
3452 		return(1);
3453 	}
3454 
3455 	ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
3456 #else
3457 	ent = readdir(dir);
3458 
3459 	if (ent == NULL) {
3460 
3461 		return(1);
3462 	}
3463 #endif /* HAVE_READDIR_R */
3464 	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
3465 
3466 	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
3467 
3468 		goto next_file;
3469 	}
3470 
3471 	strcpy(info->name, ent->d_name);
3472 
3473 	full_path = static_cast<char*>(
3474 		ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
3475 
3476 	sprintf(full_path, "%s/%s", dirname, ent->d_name);
3477 
3478 	ret = stat(full_path, &statinfo);
3479 
3480 	if (ret) {
3481 
3482 		if (errno == ENOENT) {
3483 			/* readdir() returned a file that does not exist,
3484 			it must have been deleted in the meantime. Do what
3485 			would have happened if the file was deleted before
3486 			readdir() - ignore and go to the next entry.
3487 			If this is the last entry then info->name will still
3488 			contain the name of the deleted file when this
3489 			function returns, but this is not an issue since the
3490 			caller shouldn't be looking at info when end of
3491 			directory is returned. */
3492 
3493 			ut_free(full_path);
3494 
3495 			goto next_file;
3496 		}
3497 
3498 		os_file_handle_error_no_exit(full_path, "stat", false);
3499 
3500 		ut_free(full_path);
3501 
3502 		return(-1);
3503 	}
3504 
3505 	info->size = statinfo.st_size;
3506 
3507 	if (S_ISDIR(statinfo.st_mode)) {
3508 		info->type = OS_FILE_TYPE_DIR;
3509 	} else if (S_ISLNK(statinfo.st_mode)) {
3510 		info->type = OS_FILE_TYPE_LINK;
3511 	} else if (S_ISREG(statinfo.st_mode)) {
3512 		info->type = OS_FILE_TYPE_FILE;
3513 	} else {
3514 		info->type = OS_FILE_TYPE_UNKNOWN;
3515 	}
3516 
3517 	ut_free(full_path);
3518 
3519 	return(0);
3520 }
3521 
3522 /** NOTE! Use the corresponding macro os_file_create(), not directly
3523 this function!
3524 Opens an existing file or creates a new.
3525 @param[in]	name		name of the file or path as a null-terminated
3526 				string
3527 @param[in]	create_mode	create mode
3528 @param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
3529 				is desired, OS_FILE_NORMAL, if any normal file;
3530 				NOTE that it also depends on type, os_aio_..
3531 				and srv_.. variables whether we really use async
3532 				I/O or unbuffered I/O: look in the function
3533 				source code for the exact rules
3534 @param[in]	type		OS_DATA_FILE or OS_LOG_FILE
3535 @param[in]	read_only	true, if read only checks should be enforcedm
3536 @param[in]	success		true if succeeded
3537 @return handle to the file, not defined if error, error number
3538 	can be retrieved with os_file_get_last_error */
3539 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)3540 os_file_create_func(
3541 	const char*	name,
3542 	ulint		create_mode,
3543 	ulint		purpose,
3544 	ulint		type,
3545 	bool		read_only,
3546 	bool*		success)
3547 {
3548 	bool		on_error_no_exit;
3549 	bool		on_error_silent;
3550 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
3551 		WAIT_ALLOW_WRITES();
3552 	pfs_os_file_t	file;
3553 
3554 	*success = false;
3555 
3556 	DBUG_EXECUTE_IF(
3557 		"ib_create_table_fail_disk_full",
3558 		*success = false;
3559 		errno = ENOSPC;
3560 		file.m_file = OS_FILE_CLOSED;
3561 		return(file);
3562 	);
3563 
3564 	int		create_flag;
3565 	const char*	mode_str	= NULL;
3566 
3567 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
3568 		? true : false;
3569 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
3570 		? true : false;
3571 
3572 	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
3573 	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
3574 
3575 	if (create_mode == OS_FILE_OPEN
3576 	    || create_mode == OS_FILE_OPEN_RAW
3577 	    || create_mode == OS_FILE_OPEN_RETRY) {
3578 
3579 		mode_str = "OPEN";
3580 
3581 		create_flag = read_only ? O_RDONLY : O_RDWR;
3582 
3583 	} else if (read_only) {
3584 
3585 		mode_str = "OPEN";
3586 
3587 		create_flag = O_RDONLY;
3588 
3589 	} else if (create_mode == OS_FILE_CREATE) {
3590 
3591 		mode_str = "CREATE";
3592 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3593 
3594 	} else if (create_mode == OS_FILE_OVERWRITE) {
3595 
3596 		mode_str = "OVERWRITE";
3597 		create_flag = O_RDWR | O_CREAT | O_TRUNC;
3598 
3599 	} else {
3600 		ib::error()
3601 			<< "Unknown file create mode (" << create_mode << ")"
3602 			<< " for file '" << name << "'";
3603 
3604 		file.m_file = OS_FILE_CLOSED;
3605 		return(file);
3606 	}
3607 
3608 	ut_a(type == OS_LOG_FILE
3609 	     || type == OS_DATA_FILE
3610 	     || type == OS_DATA_TEMP_FILE);
3611 
3612 	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
3613 
3614 #ifdef O_SYNC
3615 	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
3616 	O_SYNC because the datasync options seemed to corrupt files in 2001
3617 	in both Linux and Solaris */
3618 
3619 	if (!read_only
3620 	    && type == OS_LOG_FILE
3621 	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
3622 
3623 		create_flag |= O_SYNC;
3624 	}
3625 #endif /* O_SYNC */
3626 
3627 	bool		retry;
3628 
3629 	do {
3630 		file.m_file = ::open(name, create_flag, os_innodb_umask);
3631 
3632 		if (file.m_file == -1) {
3633 			const char*	operation;
3634 
3635 			operation = (create_mode == OS_FILE_CREATE
3636 				     && !read_only) ? "create" : "open";
3637 
3638 			*success = false;
3639 
3640 			if (on_error_no_exit) {
3641 				retry = os_file_handle_error_no_exit(
3642 					name, operation, on_error_silent);
3643 			} else {
3644 				retry = os_file_handle_error(name, operation);
3645 			}
3646 		} else {
3647 			*success = true;
3648 			retry = false;
3649 		}
3650 
3651 	} while (retry);
3652 
3653 	/* We disable OS caching (O_DIRECT) only on data files */
3654 
3655 	if (!read_only
3656 	    && *success
3657 	    && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE)
3658 	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
3659 		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
3660 
3661 		os_file_set_nocache(file.m_file, name, mode_str);
3662 	}
3663 
3664 #ifdef USE_FILE_LOCK
3665 	if (!read_only
3666 	    && *success
3667 	    && create_mode != OS_FILE_OPEN_RAW
3668 	    && os_file_lock(file.m_file, name)) {
3669 
3670 		if (create_mode == OS_FILE_OPEN_RETRY) {
3671 
3672 			ib::info()
3673 				<< "Retrying to lock the first data file";
3674 
3675 			for (int i = 0; i < 100; i++) {
3676 				os_thread_sleep(1000000);
3677 
3678 				if (!os_file_lock(file.m_file, name)) {
3679 					*success = true;
3680 					return(file);
3681 				}
3682 			}
3683 
3684 			ib::info()
3685 				<< "Unable to open the first data file";
3686 		}
3687 
3688 		*success = false;
3689 		close(file.m_file);
3690 		file.m_file = -1;
3691 	}
3692 #endif /* USE_FILE_LOCK */
3693 
3694 	return(file);
3695 }
3696 
3697 /** NOTE! Use the corresponding macro
3698 os_file_create_simple_no_error_handling(), not directly this function!
3699 A simple function to open or create a file.
3700 @param[in]	name		name of the file or path as a null-terminated
3701 				string
3702 @param[in]	create_mode	create mode
3703 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3704 				OS_FILE_READ_ALLOW_DELETE; the last option
3705 				is used by a backup program reading the file
3706 @param[in]	read_only	if true read only mode checks are enforced
3707 @param[out]	success		true if succeeded
3708 @return own: handle to the file, not defined if error, error number
3709 	can be retrieved with os_file_get_last_error */
3710 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3711 os_file_create_simple_no_error_handling_func(
3712 	const char*	name,
3713 	ulint		create_mode,
3714 	ulint		access_type,
3715 	bool		read_only,
3716 	bool*		success)
3717 {
3718 	pfs_os_file_t	file;
3719 	int		create_flag;
3720 
3721 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3722 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3723 
3724 	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
3725 		WAIT_ALLOW_WRITES();
3726 	*success = false;
3727 
3728 	if (create_mode == OS_FILE_OPEN) {
3729 
3730 		if (access_type == OS_FILE_READ_ONLY) {
3731 
3732 			create_flag = O_RDONLY;
3733 
3734 		} else if (read_only) {
3735 
3736 			create_flag = O_RDONLY;
3737 
3738 		} else {
3739 
3740 			ut_a(access_type == OS_FILE_READ_WRITE
3741 			     || access_type == OS_FILE_READ_ALLOW_DELETE);
3742 
3743 			create_flag = O_RDWR;
3744 		}
3745 
3746 	} else if (read_only) {
3747 
3748 		create_flag = O_RDONLY;
3749 
3750 	} else if (create_mode == OS_FILE_CREATE) {
3751 
3752 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3753 
3754 	} else {
3755 
3756 		ib::error()
3757 			<< "Unknown file create mode "
3758 			<< create_mode << " for file '" << name << "'";
3759 		file.m_file = OS_FILE_CLOSED;
3760 		return(file);
3761 	}
3762 
3763 	file.m_file = ::open(name, create_flag, os_innodb_umask);
3764 
3765 	*success = (file.m_file != -1);
3766 
3767 #ifdef USE_FILE_LOCK
3768 	if (!read_only
3769 	    && *success
3770 	    && access_type == OS_FILE_READ_WRITE
3771 	    && os_file_lock(file.m_file, name)) {
3772 
3773 		*success = false;
3774 		close(file.m_file);
3775 		file.m_file = -1;
3776 
3777 	}
3778 #endif /* USE_FILE_LOCK */
3779 
3780 	return(file);
3781 }
3782 
3783 /** Deletes a file if it exists. The file has to be closed before calling this.
3784 @param[in]	name		file path as a null-terminated string
3785 @param[out]	exist		indicate if file pre-exist
3786 @return true if success */
3787 bool
os_file_delete_if_exists_func(const char * name,bool * exist)3788 os_file_delete_if_exists_func(
3789 	const char*	name,
3790 	bool*		exist)
3791 {
3792 	WAIT_ALLOW_WRITES();
3793 	if (exist != NULL) {
3794 		*exist = true;
3795 	}
3796 
3797 	int	ret = unlink(name);
3798 
3799 	if (ret != 0 && errno == ENOENT) {
3800 		if (exist != NULL) {
3801 			*exist = false;
3802 		}
3803 	} else if (ret != 0 && errno != ENOENT) {
3804 		os_file_handle_error_no_exit(name, "delete", false);
3805 
3806 		return(false);
3807 	}
3808 
3809 	return(true);
3810 }
3811 
3812 /** Deletes a file. The file has to be closed before calling this.
3813 @param[in]	name		file path as a null-terminated string
3814 @return true if success */
3815 bool
os_file_delete_func(const char * name)3816 os_file_delete_func(
3817 	const char*	name)
3818 {
3819 	WAIT_ALLOW_WRITES();
3820 	int	ret = unlink(name);
3821 
3822 	if (ret != 0) {
3823 		os_file_handle_error_no_exit(name, "delete", false);
3824 
3825 		return(false);
3826 	}
3827 
3828 	return(true);
3829 }
3830 
3831 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3832 function!
3833 Renames a file (can also move it to another directory). It is safest that the
3834 file is closed before calling this function.
3835 @param[in]	oldpath		old file path as a null-terminated string
3836 @param[in]	newpath		new file path
3837 @return true if success */
3838 bool
os_file_rename_func(const char * oldpath,const char * newpath)3839 os_file_rename_func(
3840 	const char*	oldpath,
3841 	const char*	newpath)
3842 {
3843 #ifdef UNIV_DEBUG
3844 	os_file_type_t	type;
3845 	bool		exists;
3846 
3847 	/* New path must not exist. */
3848 	ut_ad(os_file_status(newpath, &exists, &type));
3849 	ut_ad(!exists);
3850 
3851 	/* Old path must exist. */
3852 	ut_ad(os_file_status(oldpath, &exists, &type));
3853 	ut_ad(exists);
3854 #endif /* UNIV_DEBUG */
3855 	WAIT_ALLOW_WRITES();
3856 
3857 	int	ret = rename(oldpath, newpath);
3858 
3859 	if (ret != 0) {
3860 		os_file_handle_error_no_exit(oldpath, "rename", false);
3861 
3862 		return(false);
3863 	}
3864 
3865 	return(true);
3866 }
3867 
3868 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3869 function!
3870 Closes a file handle. In case of error, error number can be retrieved with
3871 os_file_get_last_error.
3872 @param[in]	file		Handle to close
3873 @return true if success */
3874 bool
os_file_close_func(os_file_t file)3875 os_file_close_func(
3876 	os_file_t	file)
3877 {
3878 	int	ret = close(file);
3879 
3880 	if (ret == -1) {
3881 		os_file_handle_error(NULL, "close");
3882 
3883 		return(false);
3884 	}
3885 
3886 	return(true);
3887 }
3888 
3889 /** Gets a file size.
3890 @param[in]	file		handle to an open file
3891 @return file size, or (os_offset_t) -1 on failure */
3892 os_offset_t
os_file_get_size(pfs_os_file_t file)3893 os_file_get_size(
3894 	pfs_os_file_t	file)
3895 {
3896 	/* Store current position */
3897 	os_offset_t	pos = lseek(file.m_file, 0, SEEK_CUR);
3898 	os_offset_t	file_size = lseek(file.m_file, 0, SEEK_END);
3899 
3900 	/* Restore current position as the function should not change it */
3901 	lseek(file.m_file, pos, SEEK_SET);
3902 
3903 	return(file_size);
3904 }
3905 
3906 /** Gets a file size.
3907 @param[in]	filename	Full path to the filename to check
3908 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3909 	errno */
3910 os_file_size_t
os_file_get_size(const char * filename)3911 os_file_get_size(
3912 	const char*	filename)
3913 {
3914 	struct stat	s;
3915 	os_file_size_t	file_size;
3916 
3917 	int	ret = stat(filename, &s);
3918 
3919 	if (ret == 0) {
3920 		file_size.m_total_size = s.st_size;
3921 		/* st_blocks is in 512 byte sized blocks */
3922 		file_size.m_alloc_size = s.st_blocks * 512;
3923 	} else {
3924 		file_size.m_total_size = ~0;
3925 		file_size.m_alloc_size = (os_offset_t) errno;
3926 	}
3927 
3928 	return(file_size);
3929 }
3930 
3931 /** This function returns information about the specified file
3932 @param[in]	path		pathname of the file
3933 @param[out]	stat_info	information of a file in a directory
3934 @param[in,out]	statinfo	information of a file in a directory
3935 @param[in]	check_rw_perm	for testing whether the file can be opened
3936 				in RW mode
3937 @param[in]	read_only	if true read only mode checks are enforced
3938 @return DB_SUCCESS if all OK */
3939 static
3940 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3941 os_file_get_status_posix(
3942 	const char*	path,
3943 	os_file_stat_t* stat_info,
3944 	struct stat*	statinfo,
3945 	bool		check_rw_perm,
3946 	bool		read_only)
3947 {
3948 	int	ret = stat(path, statinfo);
3949 
3950 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3951 		/* file does not exist */
3952 
3953 		return(DB_NOT_FOUND);
3954 
3955 	} else if (ret) {
3956 		/* file exists, but stat call failed */
3957 
3958 		os_file_handle_error_no_exit(path, "stat", false);
3959 
3960 		return(DB_FAIL);
3961 	}
3962 
3963 	switch (statinfo->st_mode & S_IFMT) {
3964 	case S_IFDIR:
3965 		stat_info->type = OS_FILE_TYPE_DIR;
3966 		break;
3967 	case S_IFLNK:
3968 		stat_info->type = OS_FILE_TYPE_LINK;
3969 		break;
3970 	case S_IFBLK:
3971 		/* Handle block device as regular file. */
3972 	case S_IFCHR:
3973 		/* Handle character device as regular file. */
3974 	case S_IFREG:
3975 		stat_info->type = OS_FILE_TYPE_FILE;
3976 		break;
3977 	default:
3978 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
3979 	}
3980 
3981 	stat_info->size = statinfo->st_size;
3982 	stat_info->block_size = statinfo->st_blksize;
3983 	stat_info->alloc_size = statinfo->st_blocks * 512;
3984 
3985 	if (check_rw_perm
3986 	    && (stat_info->type == OS_FILE_TYPE_FILE
3987 		|| stat_info->type == OS_FILE_TYPE_BLOCK)) {
3988 
3989 		int	access = !read_only ? O_RDWR : O_RDONLY;
3990 		int	fh = ::open(path, access, os_innodb_umask);
3991 
3992 		if (fh == -1) {
3993 			stat_info->rw_perm = false;
3994 		} else {
3995 			stat_info->rw_perm = true;
3996 			close(fh);
3997 		}
3998 	}
3999 
4000 	return(DB_SUCCESS);
4001 }
4002 
4003 /** Truncates a file to a specified size in bytes.
4004 Do nothing if the size to preserve is greater or equal to the current
4005 size of the file.
4006 @param[in]	pathname	file path
4007 @param[in]	file		file to be truncated
4008 @param[in]	size		size to preserve in bytes
4009 @return true if success */
4010 static
4011 bool
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)4012 os_file_truncate_posix(
4013 	const char*	pathname,
4014 	pfs_os_file_t	file,
4015 	os_offset_t	size)
4016 {
4017 	WAIT_ALLOW_WRITES();
4018 	int     res = ftruncate(file.m_file, size);
4019 	if (res == -1) {
4020 
4021 		bool	retry;
4022 
4023 		retry = os_file_handle_error_no_exit(
4024 			pathname, "truncate", false);
4025 
4026 		if (retry) {
4027 			ib::warn()
4028 				<< "Truncate failed for '"
4029 				<< pathname << "'";
4030 		}
4031 	}
4032 
4033 	return(res == 0);
4034 }
4035 
4036 /** Truncates a file at its current position.
4037 @return true if success */
4038 bool
os_file_set_eof(FILE * file)4039 os_file_set_eof(
4040 	FILE*		file)	/*!< in: file to be truncated */
4041 {
4042 	WAIT_ALLOW_WRITES();
4043 	return(!ftruncate(fileno(file), ftell(file)));
4044 }
4045 
4046 #ifdef UNIV_HOTBACKUP
4047 /** Closes a file handle.
4048 @param[in]	file		Handle to a file
4049 @return true if success */
4050 bool
os_file_close_no_error_handling(os_file_t file)4051 os_file_close_no_error_handling(
4052 	os_file_t	file)
4053 {
4054 	return(close(file) != -1);
4055 }
4056 #endif /* UNIV_HOTBACKUP */
4057 
/** Can be called when a batch of reads is about to be posted and an
i/o-handler thread should handle them all at once later. After calling this,
os_aio_simulated_wake_handler_threads must be called later to ensure the
threads are not left sleeping!
This non-Windows variant does nothing. */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	/* Intentionally empty: no op on non Windows */
}
4067 
4068 #else /* !_WIN32 */
4069 
4070 #include <WinIoCtl.h>
4071 
4072 /** Do the read/write
4073 @param[in]	request	The IO context and type
4074 @return the number of bytes read/written or negative value on error */
4075 ssize_t
execute(const IORequest & request)4076 SyncFileIO::execute(const IORequest& request)
4077 {
4078 	OVERLAPPED	seek;
4079 
4080 	memset(&seek, 0x0, sizeof(seek));
4081 
4082 	seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
4083 	seek.OffsetHigh = (DWORD) (m_offset >> 32);
4084 
4085 	BOOL	ret;
4086 	DWORD	n_bytes;
4087 
4088 	if (request.is_read()) {
4089 		ret = ReadFile(m_fh, m_buf,
4090 			static_cast<DWORD>(m_n), &n_bytes, &seek);
4091 
4092 	} else {
4093 		ut_ad(request.is_write());
4094 		ret = WriteFile(m_fh, m_buf,
4095 			static_cast<DWORD>(m_n), &n_bytes, &seek);
4096 	}
4097 
4098 	return(ret ? static_cast<ssize_t>(n_bytes) : -1);
4099 }
4100 
4101 /** Do the read/write
4102 @param[in,out]	slot	The IO slot, it has the IO context
4103 @return the number of bytes read/written or negative value on error */
4104 ssize_t
execute(Slot * slot)4105 SyncFileIO::execute(Slot* slot)
4106 {
4107 	BOOL	ret;
4108 
4109 	if (slot->type.is_read()) {
4110 		ret = ReadFile(
4111 			slot->file.m_file, slot->ptr, slot->len,
4112 			&slot->n_bytes, &slot->control);
4113 	} else {
4114 		ut_ad(slot->type.is_write());
4115 		ret = WriteFile(
4116 			slot->file.m_file, slot->ptr, slot->len,
4117 			&slot->n_bytes, &slot->control);
4118 	}
4119 
4120 	return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
4121 }
4122 
4123 /** Check if the file system supports sparse files.
4124 @param[in]	 name		File name
4125 @return true if the file system supports sparse files */
4126 static
4127 bool
os_is_sparse_file_supported_win32(const char * filename)4128 os_is_sparse_file_supported_win32(const char* filename)
4129 {
4130 	char	volname[MAX_PATH];
4131 	BOOL	result = GetVolumePathName(filename, volname, MAX_PATH);
4132 
4133 	if (!result) {
4134 
4135 		ib::error()
4136 			<< "os_is_sparse_file_supported: "
4137 			<< "Failed to get the volume path name for: "
4138 			<< filename
4139 			<< "- OS error number " << GetLastError();
4140 
4141 		return(false);
4142 	}
4143 
4144 	DWORD	flags;
4145 
4146 	GetVolumeInformation(
4147 		volname, NULL, MAX_PATH, NULL, NULL,
4148 		&flags, NULL, MAX_PATH);
4149 
4150 	return(flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
4151 }
4152 
4153 /** Free storage space associated with a section of the file.
4154 @param[in]	fh		Open file handle
4155 @param[in]	page_size	Tablespace page size
4156 @param[in]	block_size	File system block size
4157 @param[in]	off		Starting offset (SEEK_SET)
4158 @param[in]	len		Size of the hole
4159 @return 0 on success or errno */
4160 static
4161 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)4162 os_file_punch_hole_win32(
4163 	os_file_t	fh,
4164 	os_offset_t	off,
4165 	os_offset_t	len)
4166 {
4167 	FILE_ZERO_DATA_INFORMATION	punch;
4168 
4169 	punch.FileOffset.QuadPart = off;
4170 	punch.BeyondFinalZero.QuadPart = off + len;
4171 
4172 	/* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
4173 	therefore we pass a dummy parameter. */
4174 	DWORD	temp;
4175 
4176 	BOOL	result = DeviceIoControl(
4177 		fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
4178 		NULL, 0, &temp, NULL);
4179 
4180 	return(!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
4181 }
4182 
4183 /** Check the existence and type of the given file.
4184 @param[in]	path		path name of file
4185 @param[out]	exists		true if the file exists
4186 @param[out]	type		Type of the file, if it exists
4187 @return true if call succeeded */
4188 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)4189 os_file_status_win32(
4190 	const char*	path,
4191 	bool*		exists,
4192 	os_file_type_t* type)
4193 {
4194 	int		ret;
4195 	struct _stat64	statinfo;
4196 
4197 	ret = _stat64(path, &statinfo);
4198 
4199 	*exists = !ret;
4200 
4201 	if (!ret) {
4202 		/* file exists, everything OK */
4203 
4204 	} else if (errno == ENOENT || errno == ENOTDIR
4205 		  || errno == ENAMETOOLONG) {
4206 		/* file does not exist */
4207 		return(true);
4208 
4209 	} else {
4210 		/* file exists, but stat call failed */
4211 		os_file_handle_error_no_exit(path, "stat", false);
4212 		return(false);
4213 	}
4214 
4215 	if (_S_IFDIR & statinfo.st_mode) {
4216 		*type = OS_FILE_TYPE_DIR;
4217 
4218 	} else if (_S_IFREG & statinfo.st_mode) {
4219 		*type = OS_FILE_TYPE_FILE;
4220 
4221 	} else {
4222 		*type = OS_FILE_TYPE_UNKNOWN;
4223 	}
4224 
4225 	return(true);
4226 }
4227 
4228 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
4229 function!
4230 Flushes the write buffers of a given file to the disk.
4231 @param[in]	file		handle to a file
4232 @return true if success */
4233 bool
os_file_flush_func(os_file_t file)4234 os_file_flush_func(
4235 	os_file_t	file)
4236 {
4237 	WAIT_ALLOW_WRITES();
4238 	++os_n_fsyncs;
4239 
4240 	BOOL	ret = FlushFileBuffers(file);
4241 
4242 	if (ret) {
4243 		return(true);
4244 	}
4245 
4246 	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
4247 	actually a raw device, we choose to ignore that error if we are using
4248 	raw disks */
4249 
4250 	if (srv_start_raw_disk_in_use && GetLastError()
4251 	    == ERROR_INVALID_FUNCTION) {
4252 		return(true);
4253 	}
4254 
4255 	os_file_handle_error(NULL, "flush");
4256 
4257 	/* It is a fatal error if a file flush does not succeed, because then
4258 	the database can get corrupt on disk */
4259 	ut_error;
4260 
4261 	return(false);
4262 }
4263 
4264 /** Retrieves the last error number if an error occurs in a file io function.
4265 The number should be retrieved before any other OS calls (because they may
4266 overwrite the error number). If the number is not known to this program,
4267 the OS error number + 100 is returned.
4268 @param[in]	report_all_errors	true if we want an error message printed
4269 					of all errors
4270 @param[in]	on_error_silent		true then don't print any diagnostic
4271 					to the log
4272 @return error number, or OS error number + 100 */
4273 static
4274 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)4275 os_file_get_last_error_low(
4276 	bool	report_all_errors,
4277 	bool	on_error_silent)
4278 {
4279 	ulint	err = (ulint) GetLastError();
4280 
4281 	if (err == ERROR_SUCCESS) {
4282 		return(0);
4283 	}
4284 
4285 	if (report_all_errors
4286 	    || (!on_error_silent
4287 		&& err != ERROR_DISK_FULL
4288 		&& err != ERROR_FILE_EXISTS)) {
4289 
4290 		ib::error()
4291 			<< "Operating system error number " << err
4292 			<< " in a file operation.";
4293 
4294 		if (err == ERROR_PATH_NOT_FOUND) {
4295 			ib::error()
4296 				<< "The error means the system"
4297 				" cannot find the path specified.";
4298 
4299 			if (srv_is_being_started) {
4300 				ib::error()
4301 					<< "If you are installing InnoDB,"
4302 					" remember that you must create"
4303 					" directories yourself, InnoDB"
4304 					" does not create them.";
4305 			}
4306 
4307 		} else if (err == ERROR_ACCESS_DENIED) {
4308 
4309 			ib::error()
4310 				<< "The error means mysqld does not have"
4311 				" the access rights to"
4312 				" the directory. It may also be"
4313 				" you have created a subdirectory"
4314 				" of the same name as a data file.";
4315 
4316 		} else if (err == ERROR_SHARING_VIOLATION
4317 			   || err == ERROR_LOCK_VIOLATION) {
4318 
4319 			ib::error()
4320 				<< "The error means that another program"
4321 				" is using InnoDB's files."
4322 				" This might be a backup or antivirus"
4323 				" software or another instance"
4324 				" of MySQL."
4325 				" Please close it to get rid of this error.";
4326 
4327 		} else if (err == ERROR_WORKING_SET_QUOTA
4328 			   || err == ERROR_NO_SYSTEM_RESOURCES) {
4329 
4330 			ib::error()
4331 				<< "The error means that there are no"
4332 				" sufficient system resources or quota to"
4333 				" complete the operation.";
4334 
4335 		} else if (err == ERROR_OPERATION_ABORTED) {
4336 
4337 			ib::error()
4338 				<< "The error means that the I/O"
4339 				" operation has been aborted"
4340 				" because of either a thread exit"
4341 				" or an application request."
4342 				" Retry attempt is made.";
4343 		} else {
4344 
4345 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4346 		}
4347 	}
4348 
4349 	if (err == ERROR_FILE_NOT_FOUND) {
4350 		return(OS_FILE_NOT_FOUND);
4351 	} else if (err == ERROR_DISK_FULL) {
4352 		return(OS_FILE_DISK_FULL);
4353 	} else if (err == ERROR_FILE_EXISTS) {
4354 		return(OS_FILE_ALREADY_EXISTS);
4355 	} else if (err == ERROR_SHARING_VIOLATION
4356 		   || err == ERROR_LOCK_VIOLATION) {
4357 		return(OS_FILE_SHARING_VIOLATION);
4358 	} else if (err == ERROR_WORKING_SET_QUOTA
4359 		   || err == ERROR_NO_SYSTEM_RESOURCES) {
4360 		return(OS_FILE_INSUFFICIENT_RESOURCE);
4361 	} else if (err == ERROR_OPERATION_ABORTED) {
4362 		return(OS_FILE_OPERATION_ABORTED);
4363 	} else if (err == ERROR_ACCESS_DENIED) {
4364 		return(OS_FILE_ACCESS_VIOLATION);
4365 	}
4366 
4367 	return(OS_FILE_ERROR_MAX + err);
4368 }
4369 
4370 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
4371 this function!
4372 A simple function to open or create a file.
4373 @param[in]	name		name of the file or path as a null-terminated
4374 				string
4375 @param[in]	create_mode	create mode
4376 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
4377 @param[in]	read_only	if true read only mode checks are enforced
4378 @param[out]	success		true if succeed, false if error
4379 @return handle to the file, not defined if error, error number
4380 	can be retrieved with os_file_get_last_error */
4381 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4382 os_file_create_simple_func(
4383 	const char*	name,
4384 	ulint		create_mode,
4385 	ulint		access_type,
4386 	bool		read_only,
4387 	bool*		success)
4388 {
4389 	pfs_os_file_t	file;
4390 
4391 	*success = false;
4392 
4393 	DWORD		access;
4394 	DWORD		create_flag;
4395 	DWORD		attributes = 0;
4396 
4397 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4398 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4399 
4400 	if (create_mode == OS_FILE_OPEN) {
4401 
4402 		create_flag = OPEN_EXISTING;
4403 
4404 	} else if (read_only) {
4405 
4406 		create_flag = OPEN_EXISTING;
4407 
4408 	} else if (create_mode == OS_FILE_CREATE) {
4409 
4410 		create_flag = CREATE_NEW;
4411 
4412 	} else if (create_mode == OS_FILE_CREATE_PATH) {
4413 
4414 		/* Create subdirs along the path if needed. */
4415 		*success = os_file_create_subdirs_if_needed(name);
4416 
4417 		if (!*success) {
4418 
4419 			ib::error()
4420 				<< "Unable to create subdirectories '"
4421 				<< name << "'";
4422 			file.m_file = OS_FILE_CLOSED;
4423 			return(file);
4424 		}
4425 
4426 		create_flag = CREATE_NEW;
4427 		create_mode = OS_FILE_CREATE;
4428 
4429 	} else {
4430 
4431 		ib::error()
4432 			<< "Unknown file create mode ("
4433 			<< create_mode << ") for file '"
4434 			<< name << "'";
4435 
4436 		file.m_file = OS_FILE_CLOSED;
4437 		return(file);
4438 	}
4439 
4440 	if (access_type == OS_FILE_READ_ONLY) {
4441 
4442 		access = GENERIC_READ;
4443 
4444 	} else if (read_only) {
4445 
4446 		ib::info()
4447 			<< "Read only mode set. Unable to"
4448 			" open file '" << name << "' in RW mode, "
4449 			<< "trying RO mode", name;
4450 
4451 		access = GENERIC_READ;
4452 
4453 	} else if (access_type == OS_FILE_READ_WRITE) {
4454 
4455 		access = GENERIC_READ | GENERIC_WRITE;
4456 
4457 	} else {
4458 
4459 		ib::error()
4460 			<< "Unknown file access type (" << access_type << ") "
4461 			"for file '" << name << "'";
4462 
4463 		file.m_file = OS_FILE_CLOSED;
4464 		return(file);
4465 	}
4466 
4467 	bool	retry;
4468 
4469 	do {
4470 		/* Use default security attributes and no template file. */
4471 
4472 		file.m_file = CreateFile(
4473 			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
4474 			create_flag, attributes, NULL);
4475 
4476 		if (file.m_file == INVALID_HANDLE_VALUE) {
4477 
4478 			*success = false;
4479 
4480 			retry = os_file_handle_error(
4481 				name, create_mode == OS_FILE_OPEN ?
4482 				"open" : "create");
4483 
4484 		} else {
4485 
4486 			retry = false;
4487 
4488 			*success = true;
4489 
4490 			DWORD	temp;
4491 
4492 			/* This is a best effort use case, if it fails then
4493 			we will find out when we try and punch the hole. */
4494 
4495 			DeviceIoControl(
4496 				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
4497 				&temp, NULL);
4498 		}
4499 
4500 	} while (retry);
4501 
4502 	return(file);
4503 }
4504 
4505 /** This function attempts to create a directory named pathname. The new
4506 directory gets default permissions. On Unix the permissions are
4507 (0770 & ~umask). If the directory exists already, nothing is done and
4508 the call succeeds, unless the fail_if_exists arguments is true.
4509 If another error occurs, such as a permission error, this does not crash,
4510 but reports the error and returns false.
4511 @param[in]	pathname	directory name as null-terminated string
4512 @param[in]	fail_if_exists	if true, pre-existing directory is treated
4513 				as an error.
4514 @return true if call succeeds, false on error */
4515 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)4516 os_file_create_directory(
4517 	const char*	pathname,
4518 	bool		fail_if_exists)
4519 {
4520 	BOOL	rcode;
4521 
4522 	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
4523 	if (!(rcode != 0
4524 	      || (GetLastError() == ERROR_ALREADY_EXISTS
4525 		  && !fail_if_exists))) {
4526 
4527 		os_file_handle_error_no_exit(
4528 			pathname, "CreateDirectory", false);
4529 
4530 		return(false);
4531 	}
4532 
4533 	return(true);
4534 }
4535 
4536 /** The os_file_opendir() function opens a directory stream corresponding to the
4537 directory named by the dirname argument. The directory stream is positioned
4538 at the first entry. In both Unix and Windows we automatically skip the '.'
4539 and '..' items at the start of the directory listing.
4540 @param[in]	dirname		directory name; it must not contain a trailing
4541 				'\' or '/'
4542 @param[in]	is_fatal	true if we should treat an error as a fatal
4543 				error; if we try to open symlinks then we do
4544 				not wish a fatal error if it happens not to
4545 				be a directory
4546 @return directory stream, NULL if error */
4547 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)4548 os_file_opendir(
4549 	const char*	dirname,
4550 	bool		error_is_fatal)
4551 {
4552 	os_file_dir_t		dir;
4553 	LPWIN32_FIND_DATA	lpFindFileData;
4554 	char			path[OS_FILE_MAX_PATH + 3];
4555 
4556 	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
4557 
4558 	strcpy(path, dirname);
4559 	strcpy(path + strlen(path), "\\*");
4560 
4561 	/* Note that in Windows opening the 'directory stream' also retrieves
4562 	the first entry in the directory. Since it is '.', that is no problem,
4563 	as we will skip over the '.' and '..' entries anyway. */
4564 
4565 	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
4566 		ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
4567 
4568 	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
4569 
4570 	ut_free(lpFindFileData);
4571 
4572 	if (dir == INVALID_HANDLE_VALUE) {
4573 
4574 		if (error_is_fatal) {
4575 			os_file_handle_error(dirname, "opendir");
4576 		}
4577 
4578 		return(NULL);
4579 	}
4580 
4581 	return(dir);
4582 }
4583 
4584 /** Closes a directory stream.
4585 @param[in]	dir	directory stream
4586 @return 0 if success, -1 if failure */
4587 int
os_file_closedir(os_file_dir_t dir)4588 os_file_closedir(
4589 	os_file_dir_t	dir)
4590 {
4591 	BOOL		ret;
4592 
4593 	ret = FindClose(dir);
4594 
4595 	if (!ret) {
4596 		os_file_handle_error_no_exit(NULL, "closedir", false);
4597 
4598 		return(-1);
4599 	}
4600 
4601 	return(0);
4602 }
4603 
4604 /** This function returns information of the next file in the directory. We
4605 jump over the '.' and '..' entries in the directory.
4606 @param[in]	dirname		directory name or path
4607 @param[in]	dir		directory stream
4608 @param[out]	info		buffer where the info is returned
4609 @return 0 if ok, -1 if error, 1 if at the end of the directory */
4610 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4611 os_file_readdir_next_file(
4612 	const char*	dirname,
4613 	os_file_dir_t	dir,
4614 	os_file_stat_t*	info)
4615 {
4616 	BOOL		ret;
4617 	int		status;
4618 	WIN32_FIND_DATA	find_data;
4619 
4620 next_file:
4621 
4622 	ret = FindNextFile(dir, &find_data);
4623 
4624 	if (ret > 0) {
4625 
4626 		const char* name;
4627 
4628 		name = static_cast<const char*>(find_data.cFileName);
4629 
4630 		ut_a(strlen(name) < OS_FILE_MAX_PATH);
4631 
4632 		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
4633 
4634 			goto next_file;
4635 		}
4636 
4637 		strcpy(info->name, name);
4638 
4639 		info->size = find_data.nFileSizeHigh;
4640 		info->size <<= 32;
4641 		info->size |= find_data.nFileSizeLow;
4642 
4643 		if (find_data.dwFileAttributes
4644 		    & FILE_ATTRIBUTE_REPARSE_POINT) {
4645 
4646 			/* TODO: test Windows symlinks */
4647 			/* TODO: MySQL has apparently its own symlink
4648 			implementation in Windows, dbname.sym can
4649 			redirect a database directory:
4650 			REFMAN "windows-symbolic-links.html" */
4651 
4652 			info->type = OS_FILE_TYPE_LINK;
4653 
4654 		} else if (find_data.dwFileAttributes
4655 			   & FILE_ATTRIBUTE_DIRECTORY) {
4656 
4657 			info->type = OS_FILE_TYPE_DIR;
4658 
4659 		} else {
4660 
4661 			/* It is probably safest to assume that all other
4662 			file types are normal. Better to check them rather
4663 			than blindly skip them. */
4664 
4665 			info->type = OS_FILE_TYPE_FILE;
4666 		}
4667 
4668 		status = 0;
4669 
4670 	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
4671 
4672 		status = 1;
4673 
4674 	} else {
4675 
4676 		os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
4677 
4678 		status = -1;
4679 	}
4680 
4681 	return(status);
4682 }
4683 
4684 /** NOTE! Use the corresponding macro os_file_create(), not directly
4685 this function!
4686 Opens an existing file or creates a new.
4687 @param[in]	name		name of the file or path as a null-terminated
4688 				string
4689 @param[in]	create_mode	create mode
4690 @param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
4691 				is desired, OS_FILE_NORMAL, if any normal file;
4692 				NOTE that it also depends on type, os_aio_..
4693 				and srv_.. variables whether we really use async
4694 				I/O or unbuffered I/O: look in the function
4695 				source code for the exact rules
4696 @param[in]	type		OS_DATA_FILE or OS_LOG_FILE
4697 @param[in]	success		true if succeeded
4698 @return handle to the file, not defined if error, error number
4699 	can be retrieved with os_file_get_last_error */
4700 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)4701 os_file_create_func(
4702 	const char*	name,
4703 	ulint		create_mode,
4704 	ulint		purpose,
4705 	ulint		type,
4706 	bool		read_only,
4707 	bool*		success)
4708 {
4709 	pfs_os_file_t	file;
4710 	bool		retry;
4711 	bool		on_error_no_exit;
4712 	bool		on_error_silent;
4713 
4714 	*success = false;
4715 
4716 	DBUG_EXECUTE_IF(
4717 		"ib_create_table_fail_disk_full",
4718 		*success = false;
4719 		SetLastError(ERROR_DISK_FULL);
4720 		file.m_file = OS_FILE_CLOSED;
4721 		return(file);
4722 	);
4723 
4724 	DWORD		create_flag;
4725 	DWORD		share_mode = FILE_SHARE_READ;
4726 
4727 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
4728 		? true : false;
4729 
4730 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
4731 		? true : false;
4732 
4733 	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
4734 	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
4735 
4736 	if (create_mode == OS_FILE_OPEN_RAW) {
4737 
4738 		ut_a(!read_only);
4739 
4740 		create_flag = OPEN_EXISTING;
4741 
4742 		/* On Windows Physical devices require admin privileges and
4743 		have to have the write-share mode set. See the remarks
4744 		section for the CreateFile() function documentation in MSDN. */
4745 
4746 		share_mode |= FILE_SHARE_WRITE;
4747 
4748 	} else if (create_mode == OS_FILE_OPEN
4749 		   || create_mode == OS_FILE_OPEN_RETRY) {
4750 
4751 		create_flag = OPEN_EXISTING;
4752 
4753 	} else if (read_only) {
4754 
4755 		create_flag = OPEN_EXISTING;
4756 
4757 	} else if (create_mode == OS_FILE_CREATE) {
4758 
4759 		create_flag = CREATE_NEW;
4760 
4761 	} else if (create_mode == OS_FILE_OVERWRITE) {
4762 
4763 		create_flag = CREATE_ALWAYS;
4764 
4765 	} else {
4766 		ib::error()
4767 			<< "Unknown file create mode (" << create_mode << ") "
4768 			<< " for file '" << name << "'";
4769 
4770 		file.m_file = OS_FILE_CLOSED;
4771 		return(file);
4772 	}
4773 
4774 	DWORD		attributes = 0;
4775 
4776 #ifdef UNIV_HOTBACKUP
4777 	attributes |= FILE_FLAG_NO_BUFFERING;
4778 #else
4779 	if (purpose == OS_FILE_AIO) {
4780 
4781 #ifdef WIN_ASYNC_IO
4782 		/* If specified, use asynchronous (overlapped) io and no
4783 		buffering of writes in the OS */
4784 
4785 		if (srv_use_native_aio) {
4786 			attributes |= FILE_FLAG_OVERLAPPED;
4787 		}
4788 #endif /* WIN_ASYNC_IO */
4789 
4790 	} else if (purpose == OS_FILE_NORMAL) {
4791 
4792 		/* Use default setting. */
4793 
4794 	} else {
4795 
4796 		ib::error()
4797 			<< "Unknown purpose flag (" << purpose << ") "
4798 			<< "while opening file '" << name << "'";
4799 
4800 		file.m_file = OS_FILE_CLOSED;
4801 		return(file);
4802 	}
4803 
4804 #ifdef UNIV_NON_BUFFERED_IO
4805 	// TODO: Create a bug, this looks wrong. The flush log
4806 	// parameter is dynamic.
4807 	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
4808 
4809 		/* Do not use unbuffered i/o for the log files because
4810 		value 2 denotes that we do not flush the log at every
4811 		commit, but only once per second */
4812 
4813 	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
4814 
4815 		attributes |= FILE_FLAG_NO_BUFFERING;
4816 	}
4817 #endif /* UNIV_NON_BUFFERED_IO */
4818 
4819 #endif /* UNIV_HOTBACKUP */
4820 	DWORD	access = GENERIC_READ;
4821 
4822 	if (!read_only) {
4823 		access |= GENERIC_WRITE;
4824 	}
4825 
4826 	do {
4827 		/* Use default security attributes and no template file. */
4828 		file.m_file = CreateFile(
4829 			(LPCTSTR) name, access, share_mode, NULL,
4830 			create_flag, attributes, NULL);
4831 
4832 		if (file.m_file == INVALID_HANDLE_VALUE) {
4833 			const char*	operation;
4834 
4835 			operation = (create_mode == OS_FILE_CREATE
4836 				     && !read_only)
4837 				? "create" : "open";
4838 
4839 			*success = false;
4840 
4841 			if (on_error_no_exit) {
4842 				retry = os_file_handle_error_no_exit(
4843 					name, operation, on_error_silent);
4844 			} else {
4845 				retry = os_file_handle_error(name, operation);
4846 			}
4847 		} else {
4848 
4849 			retry = false;
4850 
4851 			*success = true;
4852 
4853 			DWORD	temp;
4854 
4855 			/* This is a best effort use case, if it fails then
4856 			we will find out when we try and punch the hole. */
4857 			DeviceIoControl(
4858 				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
4859 				&temp, NULL);
4860 		}
4861 
4862 	} while (retry);
4863 
4864 	return(file);
4865 }
4866 
4867 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4868 not directly this function!
4869 A simple function to open or create a file.
4870 @param[in]	name		name of the file or path as a null-terminated
4871 				string
4872 @param[in]	create_mode	create mode
4873 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4874 				OS_FILE_READ_ALLOW_DELETE; the last option is
4875 				used by a backup program reading the file
4876 @param[out]	success		true if succeeded
4877 @return own: handle to the file, not defined if error, error number
4878 	can be retrieved with os_file_get_last_error */
4879 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4880 os_file_create_simple_no_error_handling_func(
4881 	const char*	name,
4882 	ulint		create_mode,
4883 	ulint		access_type,
4884 	bool		read_only,
4885 	bool*		success)
4886 {
4887 	pfs_os_file_t	file;
4888 
4889 	*success = false;
4890 
4891 	DWORD		access;
4892 	DWORD		create_flag;
4893 	DWORD		attributes	= 0;
4894 	DWORD		share_mode	= FILE_SHARE_READ;
4895 
4896 	ut_a(name);
4897 
4898 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4899 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4900 
4901 	if (create_mode == OS_FILE_OPEN) {
4902 
4903 		create_flag = OPEN_EXISTING;
4904 
4905 	} else if (read_only) {
4906 
4907 		create_flag = OPEN_EXISTING;
4908 
4909 	} else if (create_mode == OS_FILE_CREATE) {
4910 
4911 		create_flag = CREATE_NEW;
4912 
4913 	} else {
4914 
4915 		ib::error()
4916 			<< "Unknown file create mode (" << create_mode << ") "
4917 			<< " for file '" << name << "'";
4918 
4919 		file.m_file = OS_FILE_CLOSED;
4920 		return(file);
4921 	}
4922 
4923 	if (access_type == OS_FILE_READ_ONLY) {
4924 
4925 		access = GENERIC_READ;
4926 
4927 	} else if (read_only) {
4928 
4929 		access = GENERIC_READ;
4930 
4931 	} else if (access_type == OS_FILE_READ_WRITE) {
4932 
4933 		access = GENERIC_READ | GENERIC_WRITE;
4934 
4935 	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4936 
4937 		ut_a(!read_only);
4938 
4939 		access = GENERIC_READ;
4940 
4941 		/*!< A backup program has to give mysqld the maximum
4942 		freedom to do what it likes with the file */
4943 
4944 		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4945 	} else {
4946 
4947 		ib::error()
4948 			<< "Unknown file access type (" << access_type << ") "
4949 			<< "for file '" << name << "'";
4950 
4951 		file.m_file = OS_FILE_CLOSED;
4952 		return(file);
4953 	}
4954 
4955 	file.m_file = CreateFile((LPCTSTR) name,
4956 			  access,
4957 			  share_mode,
4958 			  NULL,			// Security attributes
4959 			  create_flag,
4960 			  attributes,
4961 			  NULL);		// No template file
4962 
4963 	*success = (file.m_file != INVALID_HANDLE_VALUE);
4964 
4965 	return(file);
4966 }
4967 
4968 /** Deletes a file if it exists. The file has to be closed before calling this.
4969 @param[in]	name		file path as a null-terminated string
4970 @param[out]	exist		indicate if file pre-exist
4971 @return true if success */
4972 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4973 os_file_delete_if_exists_func(
4974 	const char*	name,
4975 	bool*		exist)
4976 {
4977 	ulint	count	= 0;
4978 
4979 	if (exist != NULL) {
4980 		*exist = true;
4981 	}
4982 
4983 	for (;;) {
4984 		/* In Windows, deleting an .ibd file may fail if ibbackup
4985 		is copying it */
4986 
4987 		bool	ret = DeleteFile((LPCTSTR) name);
4988 
4989 		if (ret) {
4990 			return(true);
4991 		}
4992 
4993 		DWORD	lasterr = GetLastError();
4994 
4995 		if (lasterr == ERROR_FILE_NOT_FOUND
4996 		    || lasterr == ERROR_PATH_NOT_FOUND) {
4997 
4998 			/* the file does not exist, this not an error */
4999 			if (exist != NULL) {
5000 				*exist = false;
5001 			}
5002 
5003 			return(true);
5004 		}
5005 
5006 		++count;
5007 
5008 		if (count > 100 && 0 == (count % 10)) {
5009 
5010 			/* Print error information */
5011 			os_file_get_last_error(true);
5012 
5013 			ib::warn() << "Delete of file '" << name << "' failed.";
5014 		}
5015 
5016 		/* Sleep for a second */
5017 		os_thread_sleep(1000000);
5018 
5019 		if (count > 2000) {
5020 
5021 			return(false);
5022 		}
5023 	}
5024 }
5025 
5026 /** Deletes a file. The file has to be closed before calling this.
5027 @param[in]	name		File path as NUL terminated string
5028 @return true if success */
5029 bool
os_file_delete_func(const char * name)5030 os_file_delete_func(
5031 	const char*	name)
5032 {
5033 	ulint	count	= 0;
5034 
5035 	for (;;) {
5036 		/* In Windows, deleting an .ibd file may fail if ibbackup
5037 		is copying it */
5038 
5039 		BOOL	ret = DeleteFile((LPCTSTR) name);
5040 
5041 		if (ret) {
5042 			return(true);
5043 		}
5044 
5045 		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
5046 			/* If the file does not exist, we classify this as
5047 			a 'mild' error and return */
5048 
5049 			return(false);
5050 		}
5051 
5052 		++count;
5053 
5054 		if (count > 100 && 0 == (count % 10)) {
5055 
5056 			/* print error information */
5057 			os_file_get_last_error(true);
5058 
5059 			ib::warn()
5060 				<< "Cannot delete file '" << name << "'. Are "
5061 				<< "you running ibbackup to back up the file?";
5062 		}
5063 
5064 		/* sleep for a second */
5065 		os_thread_sleep(1000000);
5066 
5067 		if (count > 2000) {
5068 
5069 			return(false);
5070 		}
5071 	}
5072 
5073 	ut_error;
5074 	return(false);
5075 }
5076 
5077 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
5078 function!
5079 Renames a file (can also move it to another directory). It is safest that the
5080 file is closed before calling this function.
5081 @param[in]	oldpath		old file path as a null-terminated string
5082 @param[in]	newpath		new file path
5083 @return true if success */
5084 bool
os_file_rename_func(const char * oldpath,const char * newpath)5085 os_file_rename_func(
5086 	const char*	oldpath,
5087 	const char*	newpath)
5088 {
5089 #ifdef UNIV_DEBUG
5090 	os_file_type_t	type;
5091 	bool		exists;
5092 
5093 	/* New path must not exist. */
5094 	ut_ad(os_file_status(newpath, &exists, &type));
5095 	ut_ad(!exists);
5096 
5097 	/* Old path must exist. */
5098 	ut_ad(os_file_status(oldpath, &exists, &type));
5099 	ut_ad(exists);
5100 #endif /* UNIV_DEBUG */
5101 
5102 	if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
5103 		return(true);
5104 	}
5105 
5106 	os_file_handle_error_no_exit(oldpath, "rename", false);
5107 
5108 	return(false);
5109 }
5110 
5111 /** NOTE! Use the corresponding macro os_file_close(), not directly
5112 this function!
5113 Closes a file handle. In case of error, error number can be retrieved with
5114 os_file_get_last_error.
5115 @param[in,own]	file		Handle to a file
5116 @return true if success */
5117 bool
os_file_close_func(os_file_t file)5118 os_file_close_func(
5119 	os_file_t	file)
5120 {
5121 	ut_a(file > 0);
5122 
5123 	if (CloseHandle(file)) {
5124 		return(true);
5125 	}
5126 
5127 	os_file_handle_error(NULL, "close");
5128 
5129 	return(false);
5130 }
5131 
5132 /** Gets a file size.
5133 @param[in]	file		Handle to a file
5134 @return file size, or (os_offset_t) -1 on failure */
5135 os_offset_t
os_file_get_size(pfs_os_file_t file)5136 os_file_get_size(
5137 	pfs_os_file_t	file)
5138 {
5139 	DWORD		high;
5140 	DWORD		low;
5141 
5142 	low = GetFileSize(file.m_file, &high);
5143 
5144 	if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
5145 		return((os_offset_t) -1);
5146 	}
5147 
5148 	return(os_offset_t(low | (os_offset_t(high) << 32)));
5149 }
5150 
5151 /** Gets a file size.
5152 @param[in]	filename	Full path to the filename to check
5153 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
5154 	errno */
5155 os_file_size_t
os_file_get_size(const char * filename)5156 os_file_get_size(
5157 	const char*	filename)
5158 {
5159 	struct __stat64	s;
5160 	os_file_size_t	file_size;
5161 
5162 	int		ret = _stat64(filename, &s);
5163 
5164 	if (ret == 0) {
5165 
5166 		file_size.m_total_size = s.st_size;
5167 
5168 		DWORD	low_size;
5169 		DWORD	high_size;
5170 
5171 		low_size = GetCompressedFileSize(filename, &high_size);
5172 
5173 		if (low_size != INVALID_FILE_SIZE) {
5174 
5175 			file_size.m_alloc_size = high_size;
5176 			file_size.m_alloc_size <<= 32;
5177 			file_size.m_alloc_size |= low_size;
5178 
5179 		} else {
5180 			ib::error()
5181 				<< "GetCompressedFileSize("
5182 				<< filename << ", ..) failed.";
5183 
5184 			file_size.m_alloc_size = (os_offset_t) -1;
5185 		}
5186 	} else {
5187 		file_size.m_total_size = ~0;
5188 		file_size.m_alloc_size = (os_offset_t) ret;
5189 	}
5190 
5191 	return(file_size);
5192 }
5193 
5194 /** This function returns information about the specified file
5195 @param[in]	path		pathname of the file
5196 @param[out]	stat_info	information of a file in a directory
5197 @param[in,out]	statinfo	information of a file in a directory
5198 @param[in]	check_rw_perm	for testing whether the file can be opened
5199 				in RW mode
5200 @param[in]	read_only	true if the file is opened in read-only mode
5201 @return DB_SUCCESS if all OK */
5202 static
5203 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)5204 os_file_get_status_win32(
5205 	const char*	path,
5206 	os_file_stat_t* stat_info,
5207 	struct _stat64*	statinfo,
5208 	bool		check_rw_perm,
5209 	bool		read_only)
5210 {
5211 	int	ret = _stat64(path, statinfo);
5212 
5213 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
5214 		/* file does not exist */
5215 
5216 		return(DB_NOT_FOUND);
5217 
5218 	} else if (ret) {
5219 		/* file exists, but stat call failed */
5220 
5221 		os_file_handle_error_no_exit(path, "stat", false);
5222 
5223 		return(DB_FAIL);
5224 
5225 	} else if (_S_IFDIR & statinfo->st_mode) {
5226 
5227 		stat_info->type = OS_FILE_TYPE_DIR;
5228 
5229 	} else if (_S_IFREG & statinfo->st_mode) {
5230 
5231 		DWORD	access = GENERIC_READ;
5232 
5233 		if (!read_only) {
5234 			access |= GENERIC_WRITE;
5235 		}
5236 
5237 		stat_info->type = OS_FILE_TYPE_FILE;
5238 
5239 		/* Check if we can open it in read-only mode. */
5240 
5241 		if (check_rw_perm) {
5242 			HANDLE	fh;
5243 
5244 			fh = CreateFile(
5245 				(LPCTSTR) path,		// File to open
5246 				access,
5247 				0,			// No sharing
5248 				NULL,			// Default security
5249 				OPEN_EXISTING,		// Existing file only
5250 				FILE_ATTRIBUTE_NORMAL,	// Normal file
5251 				NULL);			// No attr. template
5252 
5253 			if (fh == INVALID_HANDLE_VALUE) {
5254 				stat_info->rw_perm = false;
5255 			} else {
5256 				stat_info->rw_perm = true;
5257 				CloseHandle(fh);
5258 			}
5259 		}
5260 
5261 		char	volname[MAX_PATH];
5262 		BOOL	result = GetVolumePathName(path, volname, MAX_PATH);
5263 
5264 		if (!result) {
5265 
5266 			ib::error()
5267 				<< "os_file_get_status_win32: "
5268 				<< "Failed to get the volume path name for: "
5269 				<< path
5270 				<< "- OS error number " << GetLastError();
5271 
5272 			return(DB_FAIL);
5273 		}
5274 
5275 		DWORD	sectorsPerCluster;
5276 		DWORD	bytesPerSector;
5277 		DWORD	numberOfFreeClusters;
5278 		DWORD	totalNumberOfClusters;
5279 
5280 		result = GetDiskFreeSpace(
5281 			(LPCSTR) volname,
5282 			&sectorsPerCluster,
5283 			&bytesPerSector,
5284 			&numberOfFreeClusters,
5285 			&totalNumberOfClusters);
5286 
5287 		if (!result) {
5288 
5289 			ib::error()
5290 				<< "GetDiskFreeSpace(" << volname << ",...) "
5291 				<< "failed "
5292 				<< "- OS error number " << GetLastError();
5293 
5294 			return(DB_FAIL);
5295 		}
5296 
5297 		stat_info->block_size = bytesPerSector * sectorsPerCluster;
5298 
5299 		/* On Windows the block size is not used as the allocation
5300 		unit for sparse files. The underlying infra-structure for
5301 		sparse files is based on NTFS compression. The punch hole
5302 		is done on a "compression unit". This compression unit
5303 		is based on the cluster size. You cannot punch a hole if
5304 		the cluster size >= 8K. For smaller sizes the table is
5305 		as follows:
5306 
5307 		Cluster Size	Compression Unit
5308 		512 Bytes		 8 KB
5309 		  1 KB			16 KB
5310 		  2 KB			32 KB
5311 		  4 KB			64 KB
5312 
5313 		Default NTFS cluster size is 4K, compression unit size of 64K.
5314 		Therefore unless the user has created the file system with
5315 		a smaller cluster size and used larger page sizes there is
5316 		little benefit from compression out of the box. */
5317 
5318 		stat_info->block_size = (stat_info->block_size <= 4096)
5319 			?  stat_info->block_size * 16 : ULINT_UNDEFINED;
5320 	} else {
5321 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
5322 	}
5323 
5324 	return(DB_SUCCESS);
5325 }
5326 
5327 /** Truncates a file to a specified size in bytes.
5328 Do nothing if the size to preserve is greater or equal to the current
5329 size of the file.
5330 @param[in]	pathname	file path
5331 @param[in]	file		file to be truncated
5332 @param[in]	size		size to preserve in bytes
5333 @return true if success */
5334 static
5335 bool
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)5336 os_file_truncate_win32(
5337 	const char*	pathname,
5338 	pfs_os_file_t	file,
5339 	os_offset_t	size)
5340 {
5341 	LARGE_INTEGER	length;
5342 
5343 	length.QuadPart = size;
5344 	BOOL	success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
5345 	if (!success) {
5346 		os_file_handle_error_no_exit(
5347 			pathname, "SetFilePointerEx", false);
5348 	} else {
5349 		success = SetEndOfFile(file.m_file);
5350 		if (!success) {
5351 			os_file_handle_error_no_exit(
5352 				pathname, "SetEndOfFile", false);
5353 		}
5354 	}
5355 	return(success);
5356 }
5357 
5358 /** Truncates a file at its current position.
5359 @param[in]	file		Handle to be truncated
5360 @return true if success */
5361 bool
os_file_set_eof(FILE * file)5362 os_file_set_eof(
5363 	FILE*		file)
5364 {
5365 	HANDLE	h = (HANDLE) _get_osfhandle(fileno(file));
5366 
5367 	return(SetEndOfFile(h));
5368 }
5369 
5370 #ifdef UNIV_HOTBACKUP
5371 /** Closes a file handle.
5372 @param[in]	file		Handle to close
5373 @return true if success */
5374 bool
os_file_close_no_error_handling(os_file_t file)5375 os_file_close_no_error_handling(
5376 	os_file_t	file)
5377 {
5378 	return(CloseHandle(file) ? true : false);
5379 }
5380 #endif /* UNIV_HOTBACKUP */
5381 
5382 /** This function can be called if one wants to post a batch of reads and
5383 prefers an i/o-handler thread to handle them all at once later. You must
5384 call os_aio_simulated_wake_handler_threads later to ensure the threads
5385 are not left sleeping! */
5386 void
os_aio_simulated_put_read_threads_to_sleep()5387 os_aio_simulated_put_read_threads_to_sleep()
5388 {
5389 	AIO::simulated_put_read_threads_to_sleep();
5390 }
5391 
5392 /** This function can be called if one wants to post a batch of reads and
5393 prefers an i/o-handler thread to handle them all at once later. You must
5394 call os_aio_simulated_wake_handler_threads later to ensure the threads
5395 are not left sleeping! */
5396 void
simulated_put_read_threads_to_sleep()5397 AIO::simulated_put_read_threads_to_sleep()
5398 {
5399 	/* The idea of putting background IO threads to sleep is only for
5400 	Windows when using simulated AIO. Windows XP seems to schedule
5401 	background threads too eagerly to allow for coalescing during
5402 	readahead requests. */
5403 
5404 	if (srv_use_native_aio) {
5405 		/* We do not use simulated AIO: do nothing */
5406 
5407 		return;
5408 	}
5409 
5410 	os_aio_recommend_sleep_for_read_threads	= true;
5411 
5412 	for (ulint i = 0; i < os_aio_n_segments; i++) {
5413 		AIO*	array;
5414 
5415 		get_array_and_local_segment(&array, i);
5416 
5417 		if (array == s_reads) {
5418 
5419 			os_event_reset(os_aio_segment_wait_events[i]);
5420 		}
5421 	}
5422 }
5423 
5424 #endif /* !_WIN32*/
5425 
/** Does a synchronous read or write depending upon the type specified.
In case of partial reads/writes the function tries
NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
@param[in]	in_type		IO flags; the function works on a local copy
@param[in]	file		handle to an open file
@param[in,out]	buf		buffer to read into, or to write from
@param[in]	n		number of bytes to read/write, starting from
				offset
@param[in]	offset		file offset from the start where to read/write
@param[out]	err		DB_SUCCESS or error code
@return the originally requested number of bytes if the whole request
completed; otherwise the number of bytes transferred before giving up, in
which case *err is set to DB_IO_ERROR */
static MY_ATTRIBUTE((warn_unused_result))
ssize_t
os_file_io(
	const IORequest&in_type,
	os_file_t	file,
	void*		buf,
	ulint		n,
	os_offset_t	offset,
	dberr_t*	err)
{
	Block*		block;
	/* n is passed by address to the compress/encrypt helpers below, so
	remember the caller's value: it is what is returned on success. */
	ulint		original_n = n;
	IORequest	type = in_type;
	ssize_t		bytes_returned = 0;

	if (type.is_compressed()) {

		/* We don't compress the first page of any file. */
		ut_ad(offset > 0);

		block = os_file_compress_page(type, buf, &n);
	} else {
		block = NULL;
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
        if (type.is_encrypted() && type.is_write()) {
		/* We don't encrypt the first page of any file. */
		Block*	compressed_block = block;
		ut_ad(offset > 0);

		block = os_file_encrypt_page(type, buf, &n);

		/* The block returned by the compression step is released
		here; from now on "block" refers to the encryption block. */
		if (compressed_block != NULL) {
			os_free_block(compressed_block);
		}
        }

	SyncFileIO	sync_file_io(file, buf, n, offset);

	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {

		ssize_t	n_bytes = sync_file_io.execute(type);

		/* Check for a hard error. Not much we can do now. */
		if (n_bytes < 0) {

			break;

		} else if ((ulint) n_bytes + bytes_returned == n) {

			/* Everything requested has now been transferred. */
			bytes_returned += n_bytes;

			/* Post-processing is done for every page except the
			first one of a file, when the request is compressed
			or is a read. */
			if (offset > 0
			    && (type.is_compressed() || type.is_read())) {

				*err = os_file_io_complete(
					type, file,
					reinterpret_cast<byte*>(buf),
					NULL, original_n, offset, n);
			} else {

				*err = DB_SUCCESS;
			}

			if (block != NULL) {
				os_free_block(block);
			}

			return(original_n);
		}

		/* Handle partial read/write. */

		ut_ad((ulint) n_bytes + bytes_returned < n);

		bytes_returned += (ulint) n_bytes;

		if (!type.is_partial_io_warning_disabled()) {

			const char*	op = type.is_read()
				? "read" : "written";

			ib::warn()
				<< n
				<< " bytes should have been " << op << ". Only "
				<< bytes_returned
				<< " bytes " << op << ". Retrying"
				<< " for the remaining bytes.";
		}

		/* Advance the offset and buffer by n_bytes */
		sync_file_io.advance(n_bytes);
	}

	/* Reached after a hard error or when all retries are used up. */
	if (block != NULL) {
		os_free_block(block);
	}

	*err = DB_IO_ERROR;

	if (!type.is_partial_io_warning_disabled()) {
		ib::warn()
			<< "Retry attempts for "
			<< (type.is_read() ? "reading" : "writing")
			<< " partial data failed.";
	}

	return(bytes_returned);
}
5548 
5549 /** Does a synchronous write operation in Posix.
5550 @param[in]	type		IO context
5551 @param[in]	file		handle to an open file
5552 @param[out]	buf		buffer from which to write
5553 @param[in]	n		number of bytes to read, starting from offset
5554 @param[in]	offset		file offset from the start where to read
5555 @param[out]	err		DB_SUCCESS or error code
5556 @return number of bytes written, -1 if error */
5557 static MY_ATTRIBUTE((warn_unused_result))
5558 ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5559 os_file_pwrite(
5560 	IORequest&	type,
5561 	os_file_t	file,
5562 	const byte*	buf,
5563 	ulint		n,
5564 	os_offset_t	offset,
5565 	dberr_t*	err)
5566 {
5567 	ut_ad(type.validate());
5568 
5569 	++os_n_file_writes;
5570 
5571 	(void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
5572 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
5573 
5574 	ssize_t	n_bytes = os_file_io(type, file, (void*) buf, n, offset, err);
5575 
5576 	(void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
5577 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
5578 
5579 	return(n_bytes);
5580 }
5581 
5582 /** Requests a synchronous write operation.
5583 @param[in]	type		IO flags
5584 @param[in]	file		handle to an open file
5585 @param[out]	buf		buffer from which to write
5586 @param[in]	offset		file offset from the start where to read
5587 @param[in]	n		number of bytes to read, starting from offset
5588 @return DB_SUCCESS if request was successful, false if fail */
5589 static MY_ATTRIBUTE((warn_unused_result))
5590 dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)5591 os_file_write_page(
5592 	IORequest&	type,
5593 	const char*	name,
5594 	os_file_t	file,
5595 	const byte*	buf,
5596 	os_offset_t	offset,
5597 	ulint		n)
5598 {
5599 	dberr_t		err;
5600 	ut_ad(type.validate());
5601 	ut_ad(n > 0);
5602 
5603 	WAIT_ALLOW_WRITES();
5604 	ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
5605 
5606 	if ((ulint) n_bytes != n && !os_has_said_disk_full) {
5607 
5608 		ib::error()
5609 			<< "Write to file " << name << "failed at offset "
5610 			<< offset << ", " << n
5611 			<< " bytes should have been written,"
5612 			" only " << n_bytes << " were written."
5613 			" Operating system error number " << errno << "."
5614 			" Check that your OS and file system"
5615 			" support files of this size."
5616 			" Check also that the disk is not full"
5617 			" or a disk quota exceeded.";
5618 
5619 		if (strerror(errno) != NULL) {
5620 
5621 			ib::error()
5622 				<< "Error number " << errno
5623 				<< " means '" << strerror(errno) << "'";
5624 		}
5625 
5626 		ib::info() << OPERATING_SYSTEM_ERROR_MSG;
5627 
5628 		os_has_said_disk_full = true;
5629 	}
5630 
5631 	return(err);
5632 }
5633 
5634 /** Does a synchronous read operation in Posix.
5635 @param[in]	type		IO flags
5636 @param[in]	file		handle to an open file
5637 @param[out]	buf		buffer where to read
5638 @param[in]	offset		file offset from the start where to read
5639 @param[in]	n		number of bytes to read, starting from offset
5640 @param[out]	err		DB_SUCCESS or error code
5641 @return number of bytes read, -1 if error */
5642 static MY_ATTRIBUTE((warn_unused_result))
5643 ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5644 os_file_pread(
5645 	IORequest&	type,
5646 	os_file_t	file,
5647 	void*		buf,
5648 	ulint		n,
5649 	os_offset_t	offset,
5650 	dberr_t*	err)
5651 {
5652 	++os_n_file_reads;
5653 
5654 	(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
5655 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
5656 
5657 	ssize_t	n_bytes = os_file_io(type, file, buf, n, offset, err);
5658 
5659 	(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
5660 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
5661 
5662 	return(n_bytes);
5663 }
5664 
/** Requests a synchronous positioned read operation.
@param[in]	type		IO flags
@param[in]	file		handle to an open file
@param[out]	buf		buffer where to read
@param[in]	offset		file offset from the start where to read
@param[in]	n		number of bytes to read, starting from offset
@param[out]	o		number of bytes read by the most recent
				attempt; may be NULL
@param[in]	exit_on_err	if true then exit on error
@return DB_SUCCESS or error code */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
os_file_read_page(
	IORequest&	type,
	os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	ulint*		o,
	bool		exit_on_err)
{
	dberr_t		err;

	os_bytes_read_since_printout += n;

	ut_ad(type.validate());
	ut_ad(n > 0);

	for (;;) {
		ssize_t	n_bytes;

		n_bytes = os_file_pread(type, file, buf, n, offset, &err);

		/* Report the byte count of this attempt to the caller, if
		it asked for it. */
		if (o != NULL) {
			*o = n_bytes;
		}

		if (err != DB_SUCCESS && !exit_on_err) {

			/* The caller handles errors itself. */
			return(err);

		} else if ((ulint) n_bytes == n) {

			/** The read will succeed but decompress can fail
			for various reasons. */

			/* If compression is enabled for the request but the
			page in buf is not a compressed page, DB_SUCCESS is
			returned regardless of err. */
			if (type.is_compression_enabled()
			    && !Compression::is_compressed_page(
				    static_cast<byte*>(buf))) {

				return(DB_SUCCESS);

			} else {
				return(err);
			}
		}

		ib::error() << "Tried to read " << n
			<< " bytes at offset " << offset
			<< ", but was only able to read " << n_bytes;

		/* The error handlers return true if the operation should be
		retried. */
		if (exit_on_err) {

			if (!os_file_handle_error(NULL, "read")) {
				/* Hard error */
				break;
			}

		} else if (!os_file_handle_error_no_exit(NULL, "read", false)) {

			/* Hard error */
			break;
		}

		/* After a partial read, retry only for the remainder. */
		if (n_bytes > 0 && (ulint) n_bytes < n) {
			n -= (ulint) n_bytes;
			offset += (ulint) n_bytes;
			buf = reinterpret_cast<uchar*>(buf) + (ulint) n_bytes;
		}
	}

	/* Reached only through a "break" above, i.e. on a hard error.
	NOTE(review): ib::fatal() presumably terminates the server, which
	would make the return below unreachable - confirm. */
	ib::fatal()
		<< "Cannot read from file. OS error number "
		<< errno << ".";

	return(err);
}
5752 
5753 /** Retrieves the last error number if an error occurs in a file io function.
5754 The number should be retrieved before any other OS calls (because they may
5755 overwrite the error number). If the number is not known to this program,
5756 the OS error number + 100 is returned.
5757 @param[in]	report_all_errors	true if we want an error printed
5758 					for all errors
5759 @return error number, or OS error number + 100 */
5760 ulint
os_file_get_last_error(bool report_all_errors)5761 os_file_get_last_error(
5762 	bool	report_all_errors)
5763 {
5764 	return(os_file_get_last_error_low(report_all_errors, false));
5765 }
5766 
5767 /** Does error handling when a file operation fails.
5768 Conditionally exits (calling srv_fatal_error()) based on should_exit value
5769 and the error type, if should_exit is true then on_error_silent is ignored.
5770 @param[in]	name		name of a file or NULL
5771 @param[in]	operation	operation
5772 @param[in]	should_exit	call srv_fatal_error() on an unknown error,
5773 				if this parameter is true
5774 @param[in]	on_error_silent	if true then don't print any message to the log
5775 				iff it is an unknown non-fatal error
5776 @return true if we should retry the operation */
5777 static MY_ATTRIBUTE((warn_unused_result))
5778 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)5779 os_file_handle_error_cond_exit(
5780 	const char*	name,
5781 	const char*	operation,
5782 	bool		should_exit,
5783 	bool		on_error_silent)
5784 {
5785 	ulint	err;
5786 
5787 	err = os_file_get_last_error_low(false, on_error_silent);
5788 
5789 	switch (err) {
5790 	case OS_FILE_DISK_FULL:
5791 		/* We only print a warning about disk full once */
5792 
5793 		if (os_has_said_disk_full) {
5794 
5795 			return(false);
5796 		}
5797 
5798 		/* Disk full error is reported irrespective of the
5799 		on_error_silent setting. */
5800 
5801 		if (name) {
5802 
5803 			ib::error()
5804 				<< "Encountered a problem with file '"
5805 				<< name << "'";
5806 		}
5807 
5808 		ib::error()
5809 			<< "Disk is full. Try to clean the disk to free space.";
5810 
5811 		os_has_said_disk_full = true;
5812 
5813 		return(false);
5814 
5815 	case OS_FILE_AIO_RESOURCES_RESERVED:
5816 	case OS_FILE_AIO_INTERRUPTED:
5817 
5818 		return(true);
5819 
5820 	case OS_FILE_PATH_ERROR:
5821 	case OS_FILE_ALREADY_EXISTS:
5822 	case OS_FILE_ACCESS_VIOLATION:
5823 
5824 		return(false);
5825 
5826 	case OS_FILE_SHARING_VIOLATION:
5827 
5828 		os_thread_sleep(10000000);	/* 10 sec */
5829 		return(true);
5830 
5831 	case OS_FILE_OPERATION_ABORTED:
5832 	case OS_FILE_INSUFFICIENT_RESOURCE:
5833 
5834 		os_thread_sleep(100000);	/* 100 ms */
5835 		return(true);
5836 
5837 	default:
5838 
5839 		/* If it is an operation that can crash on error then it
5840 		is better to ignore on_error_silent and print an error message
5841 		to the log. */
5842 
5843 		if (should_exit || !on_error_silent) {
5844 			ib::error() << "File "
5845 				<< (name != NULL ? name : "(unknown)")
5846 				<< ": '" << operation << "'"
5847 				" returned OS error " << err << "."
5848 				<< (should_exit
5849 				    ? " Cannot continue operation" : "");
5850 		}
5851 
5852 		if (should_exit) {
5853 			srv_fatal_error();
5854 		}
5855 	}
5856 
5857 	return(false);
5858 }
5859 
5860 /** Does error handling when a file operation fails.
5861 @param[in]	name		name of a file or NULL
5862 @param[in]	operation	operation name that failed
5863 @return true if we should retry the operation */
5864 static
5865 bool
os_file_handle_error(const char * name,const char * operation)5866 os_file_handle_error(
5867 	const char*	name,
5868 	const char*	operation)
5869 {
5870 	/* Exit in case of unknown error */
5871 	return(os_file_handle_error_cond_exit(name, operation, true, false));
5872 }
5873 
5874 /** Does error handling when a file operation fails.
5875 @param[in]	name		name of a file or NULL
5876 @param[in]	operation	operation name that failed
5877 @param[in]	on_error_silent	if true then don't print any message to the log.
5878 @return true if we should retry the operation */
5879 static
5880 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)5881 os_file_handle_error_no_exit(
5882 	const char*	name,
5883 	const char*	operation,
5884 	bool		on_error_silent)
5885 {
5886 	/* Don't exit in case of unknown error */
5887 	return(os_file_handle_error_cond_exit(
5888 			name, operation, false, on_error_silent));
5889 }
5890 
5891 /** Tries to disable OS caching on an opened file descriptor.
5892 @param[in]	fd		file descriptor to alter
5893 @param[in]	file_name	file name, used in the diagnostic message
5894 @param[in]	name		"open" or "create"; used in the diagnostic
5895 				message */
5896 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5897 os_file_set_nocache(
5898 	int		fd		MY_ATTRIBUTE((unused)),
5899 	const char*	file_name	MY_ATTRIBUTE((unused)),
5900 	const char*	operation_name	MY_ATTRIBUTE((unused)))
5901 {
5902 	/* some versions of Solaris may not have DIRECTIO_ON */
5903 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5904 	if (directio(fd, DIRECTIO_ON) == -1) {
5905 		int	errno_save = errno;
5906 
5907 		ib::error()
5908 			<< "Failed to set DIRECTIO_ON on file "
5909 			<< file_name << ": " << operation_name
5910 			<< strerror(errno_save) << ","
5911 			" continuing anyway.";
5912 	}
5913 #elif defined(O_DIRECT)
5914 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5915 		int		errno_save = errno;
5916 		static bool	warning_message_printed = false;
5917 		if (errno_save == EINVAL) {
5918 			if (!warning_message_printed) {
5919 				warning_message_printed = true;
5920 # ifdef UNIV_LINUX
5921 				ib::warn()
5922 					<< "Failed to set O_DIRECT on file"
5923 					<< file_name << ";" << operation_name
5924 					<< ": " << strerror(errno_save) << ", "
5925 					<< "continuing anyway. O_DIRECT is "
5926 					"known to result in 'Invalid argument' "
5927 					"on Linux on tmpfs, "
5928 					"see MySQL Bug#26662.";
5929 # else /* UNIV_LINUX */
5930 				goto short_warning;
5931 # endif /* UNIV_LINUX */
5932 			}
5933 		} else {
5934 # ifndef UNIV_LINUX
5935 short_warning:
5936 # endif
5937 			ib::warn()
5938 				<< "Failed to set O_DIRECT on file "
5939 				<< file_name << "; " << operation_name
5940 				<< " : " << strerror(errno_save)
5941 				<< " continuing anyway.";
5942 		}
5943 	}
5944 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5945 }
5946 
5947 /** Write the specified number of zeros to a newly created file.
5948 @param[in]	name		name of the file or path as a null-terminated
5949 				string
5950 @param[in]	file		handle to a file
5951 @param[in]	size		file size
5952 @param[in]	read_only	Enable read-only checks if true
5953 @return true if success */
5954 bool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size,bool read_only)5955 os_file_set_size(
5956 	const char*	name,
5957 	pfs_os_file_t	file,
5958 	os_offset_t	size,
5959 	bool		read_only)
5960 {
5961 	/* Write up to 1 megabyte at a time. */
5962 	ulint	buf_size = ut_min(
5963 		static_cast<ulint>(64),
5964 		static_cast<ulint>(size / UNIV_PAGE_SIZE));
5965 
5966 	buf_size *= UNIV_PAGE_SIZE;
5967 
5968 	/* Align the buffer for possible raw i/o */
5969 	byte*	buf2;
5970 
5971 	buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
5972 
5973 	byte*	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
5974 
5975 	/* Write buffer full of zeros */
5976 	memset(buf, 0, buf_size);
5977 
5978 	if (size >= (os_offset_t) 100 << 20) {
5979 
5980 		ib::info() << "Progress in MB:";
5981 	}
5982 
5983 	os_offset_t	current_size = 0;
5984 
5985 	while (current_size < size) {
5986 		ulint	n_bytes;
5987 
5988 		if (size - current_size < (os_offset_t) buf_size) {
5989 			n_bytes = (ulint) (size - current_size);
5990 		} else {
5991 			n_bytes = buf_size;
5992 		}
5993 
5994 		dberr_t		err;
5995 		IORequest	request(IORequest::WRITE);
5996 
5997 #ifdef UNIV_HOTBACKUP
5998 
5999 		err = os_file_write(
6000 			request, name, file, buf, current_size, n_bytes);
6001 #else
6002 		/* Using OS_AIO_SYNC mode on POSIX systems will result in
6003 		fall back to os_file_write/read. On Windows it will use
6004 		special mechanism to wait before it returns back. */
6005 
6006 		err = os_aio(
6007 			request,
6008 			OS_AIO_SYNC, name,
6009 			file, buf, current_size, n_bytes,
6010 			read_only, NULL, NULL);
6011 #endif /* UNIV_HOTBACKUP */
6012 
6013 		if (err != DB_SUCCESS) {
6014 
6015 			ut_free(buf2);
6016 			return(false);
6017 		}
6018 
6019 		/* Print about progress for each 100 MB written */
6020 		if ((current_size + n_bytes) / (100 << 20)
6021 		    != current_size / (100 << 20)) {
6022 
6023 			fprintf(stderr, " %lu00",
6024 				(ulong) ((current_size + n_bytes)
6025 					 / (100 << 20)));
6026 		}
6027 
6028 		current_size += n_bytes;
6029 	}
6030 
6031 	if (size >= (os_offset_t) 100 << 20) {
6032 
6033 		fprintf(stderr, "\n");
6034 	}
6035 
6036 	ut_free(buf2);
6037 
6038 	return(os_file_flush(file));
6039 }
6040 
6041 /** Truncates a file to a specified size in bytes.
6042 Do nothing if the size to preserve is greater or equal to the current
6043 size of the file.
6044 @param[in]	pathname	file path
6045 @param[in]	file		file to be truncated
6046 @param[in]	size		size to preserve in bytes
6047 @return true if success */
6048 bool
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)6049 os_file_truncate(
6050 	const char*	pathname,
6051 	pfs_os_file_t	file,
6052 	os_offset_t	size)
6053 {
6054 	/* Do nothing if the size preserved is larger than or equal to the
6055 	current size of file */
6056 	os_offset_t	size_bytes = os_file_get_size(file);
6057 
6058 	if (size >= size_bytes) {
6059 		return(true);
6060 	}
6061 
6062 #ifdef _WIN32
6063 	return(os_file_truncate_win32(pathname, file, size));
6064 #else /* _WIN32 */
6065 	return(os_file_truncate_posix(pathname, file, size));
6066 #endif /* _WIN32 */
6067 }
6068 
6069 /** NOTE! Use the corresponding macro os_file_read(), not directly this
6070 function!
6071 Requests a synchronous positioned read operation.
6072 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6073 @param[in]	type		IO flags
6074 @param[in]	file		handle to an open file
6075 @param[out]	buf		buffer where to read
6076 @param[in]	offset		file offset from the start where to read
6077 @param[in]	n		number of bytes to read, starting from offset
6078 @return DB_SUCCESS or error code */
6079 dberr_t
os_file_read_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)6080 os_file_read_func(
6081 	IORequest&	type,
6082 	os_file_t	file,
6083 	void*		buf,
6084 	os_offset_t	offset,
6085 	ulint		n)
6086 {
6087 	ut_ad(type.is_read());
6088 
6089 	return(os_file_read_page(type, file, buf, offset, n, NULL, true));
6090 }
6091 
6092 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
6093 not directly this function!
6094 Requests a synchronous positioned read operation.
6095 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6096 @param[in]	type		IO flags
6097 @param[in]	file		handle to an open file
6098 @param[out]	buf		buffer where to read
6099 @param[in]	offset		file offset from the start where to read
6100 @param[in]	n		number of bytes to read, starting from offset
6101 @param[out]	o		number of bytes actually read
6102 @return DB_SUCCESS or error code */
6103 dberr_t
os_file_read_no_error_handling_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)6104 os_file_read_no_error_handling_func(
6105 	IORequest&	type,
6106 	os_file_t	file,
6107 	void*		buf,
6108 	os_offset_t	offset,
6109 	ulint		n,
6110 	ulint*		o)
6111 {
6112 	ut_ad(type.is_read());
6113 
6114 	return(os_file_read_page(type, file, buf, offset, n, o, false));
6115 }
6116 
6117 /** NOTE! Use the corresponding macro os_file_write(), not directly
6118 Requests a synchronous write operation.
6119 @param[in]	type		IO flags
6120 @param[in]	file		handle to an open file
6121 @param[out]	buf		buffer from which to write
6122 @param[in]	offset		file offset from the start where to read
6123 @param[in]	n		number of bytes to read, starting from offset
6124 @return DB_SUCCESS if request was successful, false if fail */
6125 dberr_t
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)6126 os_file_write_func(
6127 	IORequest&	type,
6128 	const char*	name,
6129 	os_file_t	file,
6130 	const void*	buf,
6131 	os_offset_t	offset,
6132 	ulint		n)
6133 {
6134 	ut_ad(type.validate());
6135 	ut_ad(type.is_write());
6136 
6137 	/* We never compress the first page.
6138 	Note: This assumes we always do block IO. */
6139 	if (offset == 0) {
6140 		type.clear_compressed();
6141 	}
6142 
6143 	const byte*	ptr = reinterpret_cast<const byte*>(buf);
6144 
6145 	return(os_file_write_page(type, name, file, ptr, offset, n));
6146 }
6147 
6148 /** Check the existence and type of the given file.
6149 @param[in]	path		path name of file
6150 @param[out]	exists		true if the file exists
6151 @param[out]	type		Type of the file, if it exists
6152 @return true if call succeeded */
6153 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)6154 os_file_status(
6155 	const char*	path,
6156 	bool*		exists,
6157 	os_file_type_t* type)
6158 {
6159 #ifdef _WIN32
6160 	return(os_file_status_win32(path, exists, type));
6161 #else
6162 	return(os_file_status_posix(path, exists, type));
6163 #endif /* _WIN32 */
6164 }
6165 
6166 /** Free storage space associated with a section of the file.
6167 @param[in]	fh		Open file handle
6168 @param[in]	off		Starting offset (SEEK_SET)
6169 @param[in]	len		Size of the hole
6170 @return DB_SUCCESS or error code */
6171 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)6172 os_file_punch_hole(
6173 	os_file_t	fh,
6174 	os_offset_t	off,
6175 	os_offset_t	len)
6176 {
6177 	/* In this debugging mode, we act as if punch hole is supported,
6178 	and then skip any calls to actually punch a hole here.
6179 	In this way, Transparent Page Compression is still being tested. */
6180 	DBUG_EXECUTE_IF("ignore_punch_hole",
6181 		return(DB_SUCCESS);
6182 	);
6183 
6184 #ifdef _WIN32
6185 	return(os_file_punch_hole_win32(fh, off, len));
6186 #else
6187 	return(os_file_punch_hole_posix(fh, off, len));
6188 #endif /* _WIN32 */
6189 }
6190 
6191 /** Check if the file system supports sparse files.
6192 
6193 Warning: On POSIX systems we try and punch a hole from offset 0 to
6194 the system configured page size. This should only be called on an empty
6195 file.
6196 
6197 Note: On Windows we use the name and on Unices we use the file handle.
6198 
6199 @param[in]	name		File name
6200 @param[in]	fh		File handle for the file - if opened
6201 @return true if the file system supports sparse files */
6202 bool
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)6203 os_is_sparse_file_supported(const char* path, pfs_os_file_t fh)
6204 {
6205 	/* In this debugging mode, we act as if punch hole is supported,
6206 	then we skip any calls to actually punch a hole.  In this way,
6207 	Transparent Page Compression is still being tested. */
6208 	DBUG_EXECUTE_IF("ignore_punch_hole",
6209 		return(true);
6210 	);
6211 
6212 #ifdef _WIN32
6213 	return(os_is_sparse_file_supported_win32(path));
6214 #else
6215 	dberr_t	err;
6216 
6217 	/* We don't know the FS block size, use the sector size. The FS
6218 	will do the magic. */
6219 	err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
6220 
6221 	return(err == DB_SUCCESS);
6222 #endif /* _WIN32 */
6223 }
6224 
6225 /** This function returns information about the specified file
6226 @param[in]	path		pathname of the file
6227 @param[out]	stat_info	information of a file in a directory
6228 @param[in]	check_rw_perm	for testing whether the file can be opened
6229 				in RW mode
6230 @param[in]	read_only	true if file is opened in read-only mode
6231 @return DB_SUCCESS if all OK */
6232 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)6233 os_file_get_status(
6234 	const char*	path,
6235 	os_file_stat_t* stat_info,
6236 	bool		check_rw_perm,
6237 	bool		read_only)
6238 {
6239 	dberr_t	ret;
6240 
6241 #ifdef _WIN32
6242 	struct _stat64	info;
6243 
6244 	ret = os_file_get_status_win32(
6245 		path, stat_info, &info, check_rw_perm, read_only);
6246 
6247 #else
6248 	struct stat	info;
6249 
6250 	ret = os_file_get_status_posix(
6251 		path, stat_info, &info, check_rw_perm, read_only);
6252 
6253 #endif /* _WIN32 */
6254 
6255 	if (ret == DB_SUCCESS) {
6256 		stat_info->ctime = info.st_ctime;
6257 		stat_info->atime = info.st_atime;
6258 		stat_info->mtime = info.st_mtime;
6259 		stat_info->size  = info.st_size;
6260 	}
6261 
6262 	return(ret);
6263 }
6264 
6265 /**
6266 Waits for an AIO operation to complete. This function is used to wait the
6267 for completed requests. The aio array of pending requests is divided
6268 into segments. The thread specifies which segment or slot it wants to wait
6269 for. NOTE: this function will also take care of freeing the aio slot,
6270 therefore no other thread is allowed to do the freeing!
6271 @param[in]	segment		The number of the segment in the aio arrays to
6272 				wait for; segment 0 is the ibuf I/O thread,
6273 				segment 1 the log I/O thread, then follow the
6274 				non-ibuf read threads, and as the last are the
6275 				non-ibuf write threads; if this is
6276 				ULINT_UNDEFINED, then it means that sync AIO
6277 				is used, and this parameter is ignored
6278 @param[out]	m1		the messages passed with the AIO request; note
6279 				that also in the case where the AIO operation
6280 				failed, these output parameters are valid and
6281 				can be used to restart the operation,
6282 				for example
6283 @param[out]	m2		callback message
6284 @param[out]	type		OS_FILE_WRITE or ..._READ
6285 @return DB_SUCCESS or error code */
6286 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6287 os_aio_handler(
6288 	ulint		segment,
6289 	fil_node_t**	m1,
6290 	void**		m2,
6291 	IORequest*	request)
6292 {
6293 	dberr_t	err;
6294 
6295 	if (srv_use_native_aio) {
6296 		srv_set_io_thread_op_info(segment, "native aio handle");
6297 
6298 #ifdef WIN_ASYNC_IO
6299 
6300 		err = os_aio_windows_handler(segment, 0, m1, m2, request);
6301 
6302 #elif defined(LINUX_NATIVE_AIO)
6303 
6304 		err = os_aio_linux_handler(segment, m1, m2, request);
6305 
6306 #else
6307 		ut_error;
6308 
6309 		err = DB_ERROR; /* Eliminate compiler warning */
6310 
6311 #endif /* WIN_ASYNC_IO */
6312 
6313 	} else {
6314 		srv_set_io_thread_op_info(segment, "simulated aio handle");
6315 
6316 		err = os_aio_simulated_handler(segment, m1, m2, request);
6317 	}
6318 
6319 	return(err);
6320 }
6321 
/** Constructor.
Creates the slot container, the mutex and the two events of the array.
The per-slot fields are initialised separately by init_slots().
@param[in]	id		The latch ID
@param[in]	n		Number of AIO slots; must be greater than 0
@param[in]	segments	Number of segments; must be greater than 0 */
AIO::AIO(
	latch_id_t	id,
	ulint		n,
	ulint		segments)
	:
	m_slots(n),
	m_n_segments(segments),
	m_n_reserved()
# ifdef LINUX_NATIVE_AIO
	,m_aio_ctx(),
	m_events(m_slots.size())
# elif defined(_WIN32)
	,m_handles()
# endif /* LINUX_NATIVE_AIO */
{
	ut_a(n > 0);
	ut_a(m_n_segments > 0);

	mutex_create(id, &m_mutex);

	m_not_full = os_event_create("aio_not_full");
	m_is_empty = os_event_create("aio_is_empty");

	/* NOTE(review): m_slots(n) above already sized the container to n
	elements; this call fills the same range again with
	default-constructed Slot objects - confirm this is intended. */
	std::uninitialized_fill(m_slots.begin(), m_slots.end(), Slot());
#ifdef LINUX_NATIVE_AIO
	/* Zero the whole event array; n > 0 was asserted above, so
	m_events[0] exists. */
	memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
#endif /* LINUX_NATIVE_AIO */

	/* The "is empty" event starts out in the set state. */
	os_event_set(m_is_empty);
}
6356 
6357 /** Initialise the slots */
6358 dberr_t
init_slots()6359 AIO::init_slots()
6360 {
6361 	for (ulint i = 0; i < m_slots.size(); ++i) {
6362 		Slot&	slot = m_slots[i];
6363 
6364 		slot.pos = static_cast<uint16_t>(i);
6365 
6366 		slot.is_reserved = false;
6367 
6368 #ifdef WIN_ASYNC_IO
6369 
6370 		slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6371 
6372 		OVERLAPPED*	over = &slot.control;
6373 
6374 		over->hEvent = slot.handle;
6375 
6376 		(*m_handles)[i] = over->hEvent;
6377 
6378 #elif defined(LINUX_NATIVE_AIO)
6379 
6380 		slot.ret = 0;
6381 
6382 		slot.n_bytes = 0;
6383 
6384 		memset(&slot.control, 0x0, sizeof(slot.control));
6385 
6386 #endif /* WIN_ASYNC_IO */
6387 	}
6388 
6389 	return(DB_SUCCESS);
6390 }
6391 
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface: allocate the io_context
array and create one io_context per segment.
@return DB_SUCCESS, DB_OUT_OF_MEMORY if the context array could not be
allocated, or DB_IO_ERROR if creating a context failed */
dberr_t
AIO::init_linux_native_aio()
{
	/* This must be the first and only initialisation. */
	ut_a(m_aio_ctx == NULL);

	m_aio_ctx = static_cast<io_context**>(
		ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));

	if (m_aio_ctx == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	/* Every context is sized for the slots of one segment. */
	const ulint	max_events = slots_per_segment();

	for (ulint seg = 0; seg < m_n_segments; ++seg) {

		if (linux_create_io_ctx(max_events, &m_aio_ctx[seg])) {
			continue;
		}

		/* If something bad happened during aio setup we call it
		a day and return right away. Leaks are not a concern
		here because a failure to initialize the io subsystem
		means that the server (or at least the innodb storage
		engine) is not going to start up. */
		return(DB_IO_ERROR);
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
6428 
6429 /** Initialise the array */
6430 dberr_t
init()6431 AIO::init()
6432 {
6433 	ut_a(!m_slots.empty());
6434 
6435 #ifdef _WIN32
6436 	ut_a(m_handles == NULL);
6437 
6438 	m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
6439 #endif /* _WIN32 */
6440 
6441 	if (srv_use_native_aio) {
6442 #ifdef LINUX_NATIVE_AIO
6443 		dberr_t	err = init_linux_native_aio();
6444 
6445 		if (err != DB_SUCCESS) {
6446 			return(err);
6447 		}
6448 
6449 #endif /* LINUX_NATIVE_AIO */
6450 	}
6451 
6452 	return(init_slots());
6453 }
6454 
6455 /** Creates an aio wait array. Note that we return NULL in case of failure.
6456 We don't care about freeing memory here because we assume that a
6457 failure will result in server refusing to start up.
6458 @param[in]	id		Latch ID
6459 @param[in]	n		maximum number of pending AIO operations
6460 				allowed; n must be divisible by m_n_segments
6461 @param[in]	n_segments	number of segments in the AIO array
6462 @return own: AIO array, NULL on failure */
6463 AIO*
create(latch_id_t id,ulint n,ulint n_segments)6464 AIO::create(
6465 	latch_id_t	id,
6466 	ulint		n,
6467 	ulint		n_segments)
6468 {
6469 	if ((n % n_segments)) {
6470 
6471 		ib::error()
6472 			<< "Maximum number of AIO operations must be "
6473 			<< "divisible by number of segments";
6474 
6475 		return(NULL);
6476 	}
6477 
6478 	AIO*	array = UT_NEW_NOKEY(AIO(id, n, n_segments));
6479 
6480 	if (array != NULL && array->init() != DB_SUCCESS) {
6481 
6482 		UT_DELETE(array);
6483 
6484 		array = NULL;
6485 	}
6486 
6487 	return(array);
6488 }
6489 
6490 /** AIO destructor */
~AIO()6491 AIO::~AIO()
6492 {
6493 #ifdef WIN_ASYNC_IO
6494 	for (ulint i = 0; i < m_slots.size(); ++i) {
6495 		CloseHandle(m_slots[i].handle);
6496 	}
6497 #endif /* WIN_ASYNC_IO */
6498 
6499 #ifdef _WIN32
6500 	UT_DELETE(m_handles);
6501 #endif /* _WIN32 */
6502 
6503 	mutex_destroy(&m_mutex);
6504 
6505 	os_event_destroy(m_not_full);
6506 	os_event_destroy(m_is_empty);
6507 
6508 #if defined(LINUX_NATIVE_AIO)
6509 	if (srv_use_native_aio) {
6510 		m_events.clear();
6511 		ut_free(m_aio_ctx);
6512 	}
6513 #endif /* LINUX_NATIVE_AIO */
6514 
6515 	m_slots.clear();
6516 }
6517 
6518 /** Initializes the asynchronous io system. Creates one array each for ibuf
6519 and log i/o. Also creates one array each for read and write where each
6520 array is divided logically into n_readers and n_writers
6521 respectively. The caller must create an i/o handler thread for each
6522 segment in these arrays. This function also creates the sync array.
6523 No i/o handler thread needs to be created for that
6524 @param[in]	n_per_seg	maximum number of pending aio
6525 				operations allowed per segment
6526 @param[in]	n_readers	number of reader threads
6527 @param[in]	n_writers	number of writer threads
6528 @param[in]	n_slots_sync	number of slots in the sync aio array
6529 @return true if the AIO sub-system was started successfully */
6530 bool
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)6531 AIO::start(
6532 	ulint		n_per_seg,
6533 	ulint		n_readers,
6534 	ulint		n_writers,
6535 	ulint		n_slots_sync)
6536 {
6537 #if defined(LINUX_NATIVE_AIO)
6538 	/* Check if native aio is supported on this system and tmpfs */
6539 	if (srv_use_native_aio && !is_linux_native_aio_supported()) {
6540 
6541 		ib::warn() << "Linux Native AIO disabled.";
6542 
6543 		srv_use_native_aio = FALSE;
6544 	}
6545 #endif /* LINUX_NATIVE_AIO */
6546 
6547 	srv_reset_io_thread_op_info();
6548 
6549 	s_reads = create(
6550 		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
6551 
6552 	if (s_reads == NULL) {
6553 		return(false);
6554 	}
6555 
6556 	ulint	start = srv_read_only_mode ? 0 : 2;
6557 	ulint	n_segs = n_readers + start;
6558 
6559 	/* 0 is the ibuf segment and 1 is the redo log segment. */
6560 	for (ulint i = start; i < n_segs; ++i) {
6561 		ut_a(i < SRV_MAX_N_IO_THREADS);
6562 		srv_io_thread_function[i] = "read thread";
6563 	}
6564 
6565 	ulint	n_segments = n_readers;
6566 
6567 	if (!srv_read_only_mode) {
6568 
6569 		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
6570 
6571 		if (s_ibuf == NULL) {
6572 			return(false);
6573 		}
6574 
6575 		++n_segments;
6576 
6577 		srv_io_thread_function[0] = "insert buffer thread";
6578 
6579 		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
6580 
6581 		if (s_log == NULL) {
6582 			return(false);
6583 		}
6584 
6585 		++n_segments;
6586 
6587 		srv_io_thread_function[1] = "log thread";
6588 
6589 	} else {
6590 		s_ibuf = s_log = NULL;
6591 	}
6592 
6593 	s_writes = create(
6594 		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
6595 
6596 	if (s_writes == NULL) {
6597 		return(false);
6598 	}
6599 
6600 	n_segments += n_writers;
6601 
6602 	for (ulint i = start + n_readers; i < n_segments; ++i) {
6603 		ut_a(i < SRV_MAX_N_IO_THREADS);
6604 		srv_io_thread_function[i] = "write thread";
6605 	}
6606 
6607 	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
6608 
6609 	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
6610 
6611 	if (s_sync == NULL) {
6612 
6613 		return(false);
6614 	}
6615 
6616 	os_aio_n_segments = n_segments;
6617 
6618 	os_aio_validate();
6619 
6620 	os_aio_segment_wait_events = static_cast<os_event_t*>(
6621 		ut_zalloc_nokey(
6622 			n_segments * sizeof *os_aio_segment_wait_events));
6623 
6624 	if (os_aio_segment_wait_events == NULL) {
6625 
6626 		return(false);
6627 	}
6628 
6629 	for (ulint i = 0; i < n_segments; ++i) {
6630 		os_aio_segment_wait_events[i] = os_event_create(0);
6631 	}
6632 
6633 	os_last_printout = ut_time_monotonic();
6634 
6635 	return(true);
6636 }
6637 
6638 /** Free the AIO arrays */
6639 void
shutdown()6640 AIO::shutdown()
6641 {
6642 	UT_DELETE(s_ibuf);
6643 	s_ibuf = NULL;
6644 
6645 	UT_DELETE(s_log);
6646 	s_log = NULL;
6647 
6648 	UT_DELETE(s_writes);
6649 	s_writes = NULL;
6650 
6651 	UT_DELETE(s_sync);
6652 	s_sync = NULL;
6653 
6654 	UT_DELETE(s_reads);
6655 	s_reads = NULL;
6656 }
6657 
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)

/** Max disk sector size */
static const ulint	MAX_SECTOR_SIZE = 4096;

/**
Try and get the FusionIO sector size.
Does nothing unless the flush method is O_DIRECT or O_DIRECT_NO_FSYNC.
Otherwise creates a scratch file "check_sector_size" next to
srv_data_home (or in the current directory if that is empty), opened
with O_DIRECT, and tries to write one block at offset 0 using sizes and
alignments UNIV_SECTOR_SIZE, 2 * UNIV_SECTOR_SIZE, ... until a write
succeeds or MAX_SECTOR_SIZE has been tried. The size reached is stored in
os_io_ptr_align. The scratch file is removed again and errno is reset
to 0 before returning. */
void
os_fusionio_get_sector_size()
{
	if (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
	    || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC) {
		ulint		sector_size = UNIV_SECTOR_SIZE;
		char*		path = srv_data_home;
		os_file_t	check_file;
		byte*		ptr;
		byte*		block_ptr;
		char		current_dir[3];
		char*		dir_end;
		ulint		dir_len;
		ulint		check_path_len;
		char*		check_file_name;
		ssize_t		ret;

		/* If the srv_data_home is empty, set the path to
		current dir. */
		if (*path == 0) {
			current_dir[0] = FN_CURLIB;
			current_dir[1] = FN_LIBCHAR;
			current_dir[2] = 0;
			path = current_dir;
		}

		/* Get the path of data file: everything before the last
		path separator, or the whole string if there is none. */
		dir_end = strrchr(path, OS_PATH_SEPARATOR);
		dir_len = dir_end? dir_end - path : strlen(path);

		/* allocate a new path and move the directory path to it.
		sizeof of the literal includes the terminating NUL. */
		check_path_len = dir_len + sizeof "/check_sector_size";
		check_file_name = static_cast<char*>(
			ut_zalloc_nokey(check_path_len));
		memcpy(check_file_name, path, dir_len);

		/* Construct a check file name. The buffer was zero filled,
		so the copied directory part is already NUL-terminated. */
		strcat(check_file_name + dir_len, "/check_sector_size");

		/* Create a tmp file for checking sector size. */
		check_file = ::open(check_file_name,
				    O_CREAT|O_TRUNC|O_WRONLY|O_DIRECT,
				    S_IRWXU);

		if (check_file == -1) {
			ib::error()
				<< "Failed to create check sector file, errno:"
				<< errno << " Please confirm O_DIRECT is"
				<< " supported and remove the file "
				<< check_file_name << " if it exists.";
			ut_free(check_file_name);
			errno = 0;
			return;
		}

		/* Try to write the file with different sector size
		alignment. The buffer is twice MAX_SECTOR_SIZE so that an
		aligned block of up to MAX_SECTOR_SIZE bytes always fits.
		It is allocated zero filled so that no uninitialised heap
		memory is handed to pwrite(). */
		ptr = static_cast<byte*>(ut_zalloc_nokey(2 * MAX_SECTOR_SIZE));

		while (sector_size <= MAX_SECTOR_SIZE) {
			block_ptr = static_cast<byte*>(
				ut_align(ptr, sector_size));
			ret = pwrite(check_file, block_ptr,
				    sector_size, 0);
			if (ret > 0 && (ulint) ret == sector_size) {
				break;
			}
			sector_size *= 2;
		}

		/* The sector size should <= MAX_SECTOR_SIZE.
		NOTE(review): this is only checked in debug builds; if no
		write succeeded, a non-debug build stores
		2 * MAX_SECTOR_SIZE below - confirm that is acceptable. */
		ut_ad(sector_size <= MAX_SECTOR_SIZE);

		close(check_file);
		unlink(check_file_name);

		ut_free(check_file_name);
		ut_free(ptr);
		errno = 0;

		os_io_ptr_align = sector_size;
	}
}
#endif /* !NO_FALLOCATE && UNIV_LINUX */
6749 
6750 /** Initializes the asynchronous io system. Creates one array each for ibuf
6751 and log i/o. Also creates one array each for read and write where each
6752 array is divided logically into n_readers and n_writers
6753 respectively. The caller must create an i/o handler thread for each
6754 segment in these arrays. This function also creates the sync array.
6755 No i/o handler thread needs to be created for that
6756 @param[in]	n_readers	number of reader threads
6757 @param[in]	n_writers	number of writer threads
6758 @param[in]	n_slots_sync	number of slots in the sync aio array */
6759 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6760 os_aio_init(
6761 	ulint		n_readers,
6762 	ulint		n_writers,
6763 	ulint		n_slots_sync)
6764 {
6765 	/* Maximum number of pending aio operations allowed per segment */
6766 	ulint		limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6767 
6768 #ifdef _WIN32
6769 	if (srv_use_native_aio) {
6770 		limit = SRV_N_PENDING_IOS_PER_THREAD;
6771 	}
6772 #endif /* _WIN32 */
6773 
6774 	ut_a(block_cache == NULL);
6775 
6776 	block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
6777 
6778 	for (Blocks::iterator it = block_cache->begin();
6779 	     it != block_cache->end();
6780 	     ++it) {
6781 
6782 		ut_a(it->m_in_use == 0);
6783 		ut_a(it->m_ptr == NULL);
6784 
6785 		/* Allocate double of max page size memory, since
6786 		compress could generate more bytes than orgininal
6787 		data. */
6788 		it->m_ptr = static_cast<byte*>(
6789 			ut_malloc_nokey(BUFFER_BLOCK_SIZE));
6790 
6791 		ut_a(it->m_ptr != NULL);
6792 	}
6793 
6794 	/* Get sector size for DIRECT_IO. In this case, we need to
6795 	know the sector size for aligning the write buffer. */
6796 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6797 	os_fusionio_get_sector_size();
6798 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6799 
6800 	return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
6801 }
6802 
6803 /** Frees the asynchronous io system. */
6804 void
os_aio_free()6805 os_aio_free()
6806 {
6807 	AIO::shutdown();
6808 
6809 	for (ulint i = 0; i < os_aio_n_segments; i++) {
6810 		os_event_destroy(os_aio_segment_wait_events[i]);
6811 	}
6812 
6813 	ut_free(os_aio_segment_wait_events);
6814 	os_aio_segment_wait_events = 0;
6815 	os_aio_n_segments = 0;
6816 
6817 	for (Blocks::iterator it = block_cache->begin();
6818 	     it != block_cache->end();
6819 	     ++it) {
6820 
6821 		ut_a(it->m_in_use == 0);
6822 		ut_free(it->m_ptr);
6823 	}
6824 
6825 	UT_DELETE(block_cache);
6826 
6827 	block_cache = NULL;
6828 }
6829 
6830 /** Wakes up all async i/o threads so that they know to exit themselves in
6831 shutdown. */
6832 void
os_aio_wake_all_threads_at_shutdown()6833 os_aio_wake_all_threads_at_shutdown()
6834 {
6835 #ifdef WIN_ASYNC_IO
6836 
6837 	AIO::wake_at_shutdown();
6838 
6839 #elif defined(LINUX_NATIVE_AIO)
6840 
6841 	/* When using native AIO interface the io helper threads
6842 	wait on io_getevents with a timeout value of 500ms. At
6843 	each wake up these threads check the server status.
6844 	No need to do anything to wake them up. */
6845 
6846 	if (srv_use_native_aio) {
6847 		return;
6848 	}
6849 
6850 #endif /* !WIN_ASYNC_AIO */
6851 
6852 	/* Fall through to simulated AIO handler wakeup if we are
6853 	not using native AIO. */
6854 
6855 	/* This loop wakes up all simulated ai/o threads */
6856 
6857 	for (ulint i = 0; i < os_aio_n_segments; ++i) {
6858 
6859 		os_event_set(os_aio_segment_wait_events[i]);
6860 	}
6861 }
6862 
/** Waits until there are no pending writes in AIO::s_writes. There can
be other, synchronous, pending writes. This is a free-function wrapper
that forwards to AIO::wait_until_no_pending_writes(). */
void
os_aio_wait_until_no_pending_writes()
{
	AIO::wait_until_no_pending_writes();
}
6870 
6871 /** Calculates segment number for a slot.
6872 @param[in]	array		AIO wait array
6873 @param[in]	slot		slot in this array
6874 @return segment number (which is the number used by, for example,
6875 	I/O-handler threads) */
6876 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)6877 AIO::get_segment_no_from_slot(
6878 	const AIO*	array,
6879 	const Slot*	slot)
6880 {
6881 	ulint	segment;
6882 	ulint	seg_len;
6883 
6884 	if (array == s_ibuf) {
6885 		ut_ad(!srv_read_only_mode);
6886 
6887 		segment = IO_IBUF_SEGMENT;
6888 
6889 	} else if (array == s_log) {
6890 		ut_ad(!srv_read_only_mode);
6891 
6892 		segment = IO_LOG_SEGMENT;
6893 
6894 	} else if (array == s_reads) {
6895 		seg_len = s_reads->slots_per_segment();
6896 
6897 		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6898 	} else {
6899 		ut_a(array == s_writes);
6900 
6901 		seg_len = s_writes->slots_per_segment();
6902 
6903 		segment = s_reads->m_n_segments
6904 			+ (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6905 	}
6906 
6907 	return(segment);
6908 }
6909 
/** Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled.

On return the slot is marked as reserved and filled in with the request.
When native AIO is in use, a write at a non-zero offset is additionally
compressed and/or encrypted here if the request asks for it, and the
platform specific I/O control block of the slot is prepared.

@param[in,out]	type		IO context
@param[in,out]	m1		message to be passed along with the AIO
				operation
@param[in,out]	m2		message to be passed along with the AIO
				operation
@param[in]	file		file handle
@param[in]	name		name of the file or path as a NUL-terminated
				string
@param[in,out]	buf		buffer where to read or from which to write
@param[in]	offset		file offset, where to read from or start writing
@param[in]	len		length of the block to read or write
@return pointer to slot */
Slot*
AIO::reserve_slot(
	IORequest&	type,
	fil_node_t*	m1,
	void*		m2,
	pfs_os_file_t	file,
	const char*	name,
	void*		buf,
	os_offset_t	offset,
	ulint		len)
{
#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits; it is stored as a DWORD below. */
	ut_a((len & 0xFFFFFFFFUL) == len);
#endif /* WIN_ASYNC_IO */

	/* No need of a mutex. Only reading constant fields */
	ulint		slots_per_seg;

	ut_ad(type.validate());

	slots_per_seg = slots_per_segment();

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO */
	ulint		local_seg;

	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;

	/* Wait until at least one slot is free. The loop is left with
	acquire() still in effect (no release() before the break). */
	for (;;) {

		acquire();

		if (m_n_reserved != m_slots.size()) {
			break;
		}

		release();

		if (!srv_use_native_aio) {
			/* If the handler threads are suspended,
			wake them so that we get more slots */

			os_aio_simulated_wake_handler_threads();
		}

		os_event_wait(m_not_full);
	}

	ulint	counter = 0;
	Slot*	slot = NULL;

	/* We start our search for an available slot from our preferred
	local segment and do a full scan of the array. We are
	guaranteed to find a slot in full scan. */
	for (ulint i = local_seg * slots_per_seg;
	     counter < m_slots.size();
	     ++i, ++counter) {

		/* Wrap around to the start of the array. */
		i %= m_slots.size();

		slot = at(i);

		if (slot->is_reserved == false) {
			break;
		}
	}

	/* We MUST always be able to get hold of a free slot. */
	ut_a(counter < m_slots.size());

	ut_a(slot->is_reserved == false);

	++m_n_reserved;

	/* Keep the two events in step with the reservation count: the
	array is no longer empty after the first reservation, and it is
	full once every slot is reserved. */
	if (m_n_reserved == 1) {
		os_event_reset(m_is_empty);
	}

	if (m_n_reserved == m_slots.size()) {
		os_event_reset(m_not_full);
	}

	slot->is_reserved = true;
	slot->reservation_time = ut_time_monotonic();
	slot->m1       = m1;
	slot->m2       = m2;
	slot->file     = file;
	slot->name     = name;
#ifdef _WIN32
	slot->len      = static_cast<DWORD>(len);
#else
	slot->len      = static_cast<ulint>(len);
#endif /* _WIN32 */
	slot->type     = type;
	slot->buf      = static_cast<byte*>(buf);
	slot->ptr      = slot->buf;
	slot->offset   = offset;
	slot->err      = DB_SUCCESS;
	slot->original_len = static_cast<uint32>(len);
	slot->io_already_done = false;
	slot->buf_block = NULL;

	/* Compress the page first if requested. Only done for native AIO
	writes at a non-zero offset. */
	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_compressed()) {
		ulint	compressed_len = len;

		ut_ad(!type.is_log());

		/* Do not hold the array while compressing; the slot is
		already marked as reserved. */
		release();

		/* NOTE(review): src_buf is handed to os_file_compress_page()
		and then stored back as the slot buffer - presumably the
		helper redirects it to the compressed copy; confirm against
		its declaration. */
		void* src_buf = slot->buf;
		slot->buf_block = os_file_compress_page(
			type,
			src_buf,
			&compressed_len);

		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;
#ifdef _WIN32
		slot->len = static_cast<DWORD>(compressed_len);
#else
		slot->len = static_cast<ulint>(compressed_len);
#endif /* _WIN32 */
		/* NOTE(review): skip_punch_hole is only assigned in this
		branch; other requests leave the slot's previous value. */
		slot->skip_punch_hole = !type.punch_hole();

		acquire();
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_encrypted()) {
		ulint		encrypted_len = slot->len;
		Block*		encrypted_block;

		ut_ad(!type.is_log());

		release();

		void* src_buf = slot->buf;
		encrypted_block = os_file_encrypt_page(
			type,
			src_buf,
			&encrypted_len);

		/* A block left over from the compression step is no
		longer needed once the encrypted block exists. */
		if (slot->buf_block != NULL) {
			os_free_block(slot->buf_block);
		}

		slot->buf_block = encrypted_block;
		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;

#ifdef _WIN32
		slot->len = static_cast<DWORD>(encrypted_len);
#else
		slot->len = static_cast<ulint>(encrypted_len);
#endif /* _WIN32 */

		acquire();
        }

#ifdef WIN_ASYNC_IO
	{
		/* Split the 64-bit file offset into the two 32-bit
		halves of the OVERLAPPED structure and reset the slot's
		event. */
		OVERLAPPED*	control;

		control = &slot->control;
		control->Offset = (DWORD) offset & 0xFFFFFFFF;
		control->OffsetHigh = (DWORD) (offset >> 32);

		ResetEvent(slot->handle);
	}
#elif defined(LINUX_NATIVE_AIO)

	/* If we are not using native AIO skip this part. */
	if (srv_use_native_aio) {

		off_t		aio_offset;

		/* Check if we are dealing with 64 bit arch.
		If not then make sure that offset fits in 32 bits. */
		aio_offset = (off_t) offset;

		ut_a(sizeof(aio_offset) >= sizeof(offset)
		     || ((os_offset_t) aio_offset) == offset);

		struct iocb*	iocb = &slot->control;

		/* Prepare the control block using the slot's current
		pointer and length, which the compression/encryption
		steps above may have replaced. */
		if (type.is_read()) {
			io_prep_pread(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		} else {
			ut_ad(type.is_write());
			io_prep_pwrite(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		}

		/* Store the slot in the control block's data field. */
		iocb->data = slot;

		slot->n_bytes = 0;
		slot->ret = 0;
	}
#endif /* LINUX_NATIVE_AIO */

	release();

	return(slot);
}
7139 
7140 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
7141 @param[in]	global_segment	The number of the segment in the AIO arrays */
7142 void
wake_simulated_handler_thread(ulint global_segment)7143 AIO::wake_simulated_handler_thread(ulint global_segment)
7144 {
7145 	ut_ad(!srv_use_native_aio);
7146 
7147 	AIO*	array;
7148 	ulint	segment = get_array_and_local_segment(&array, global_segment);
7149 
7150 	array->wake_simulated_handler_thread(global_segment, segment);
7151 }
7152 
7153 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
7154 for a local segment in the AIO array.
7155 @param[in]	global_segment	The number of the segment in the AIO arrays
7156 @param[in]	segment		The local segment in the AIO array */
7157 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)7158 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
7159 {
7160 	ut_ad(!srv_use_native_aio);
7161 
7162 	ulint	n = slots_per_segment();
7163 	ulint	offset = segment * n;
7164 
7165 	/* Look through n slots after the segment * n'th slot */
7166 
7167 	acquire();
7168 
7169 	const Slot*	slot = at(offset);
7170 
7171 	for (ulint i = 0; i < n; ++i, ++slot) {
7172 
7173 		if (slot->is_reserved) {
7174 
7175 			/* Found an i/o request */
7176 
7177 			release();
7178 
7179 			os_event_t	event;
7180 
7181 			event = os_aio_segment_wait_events[global_segment];
7182 
7183 			os_event_set(event);
7184 
7185 			return;
7186 		}
7187 	}
7188 
7189 	release();
7190 }
7191 
7192 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
7193 void
os_aio_simulated_wake_handler_threads()7194 os_aio_simulated_wake_handler_threads()
7195 {
7196 	if (srv_use_native_aio) {
7197 		/* We do not use simulated aio: do nothing */
7198 
7199 		return;
7200 	}
7201 
7202 	os_aio_recommend_sleep_for_read_threads	= false;
7203 
7204 	for (ulint i = 0; i < os_aio_n_segments; i++) {
7205 		AIO::wake_simulated_handler_thread(i);
7206 	}
7207 }
7208 
7209 /** Select the IO slot array
7210 @param[in]	type		Type of IO, READ or WRITE
7211 @param[in]	read_only	true if running in read-only mode
7212 @param[in]	mode		IO mode
7213 @return slot array or NULL if invalid mode specified */
7214 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)7215 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
7216 {
7217 	AIO*	array;
7218 
7219 	ut_ad(type.validate());
7220 
7221 	switch (mode) {
7222 	case OS_AIO_NORMAL:
7223 
7224 		array = type.is_read() ? AIO::s_reads : AIO::s_writes;
7225 		break;
7226 
7227 	case OS_AIO_IBUF:
7228 		ut_ad(type.is_read());
7229 
7230 		/* Reduce probability of deadlock bugs in connection with ibuf:
7231 		do not let the ibuf i/o handler sleep */
7232 
7233 		type.clear_do_not_wake();
7234 
7235 		array = read_only ? AIO::s_reads : AIO::s_ibuf;
7236 		break;
7237 
7238 	case OS_AIO_LOG:
7239 
7240 		array = read_only ? AIO::s_reads : AIO::s_log;
7241 		break;
7242 
7243 	case OS_AIO_SYNC:
7244 
7245 		array = AIO::s_sync;
7246 #if defined(LINUX_NATIVE_AIO)
7247 		/* In Linux native AIO we don't use sync IO array. */
7248 		ut_a(!srv_use_native_aio);
7249 #endif /* LINUX_NATIVE_AIO */
7250 		break;
7251 
7252 	default:
7253 		ut_error;
7254 		array = NULL; /* Eliminate compiler warning */
7255 	}
7256 
7257 	return(array);
7258 }
7259 
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		the IO request stored in the completed slot
				(OS_FILE_WRITE or ..._READ)
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	dberr_t		err;
	AIO*		array;
	ulint		orig_seg = segment;

	if (segment == ULINT_UNDEFINED) {
		segment = 0;
		array = AIO::sync_array();
	} else {
		segment = AIO::get_array_and_local_segment(&array, segment);
	}

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());

	if (array == AIO::sync_array()) {

		/* Sync AIO: wait for the one slot the caller named. */
		WaitForSingleObject(array->at(pos)->handle, INFINITE);

	} else {
		if (orig_seg != ULINT_UNDEFINED) {
			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
		}

		/* Wait for any slot of this segment; the return value
		identifies the signalled handle within the segment. */
		pos = WaitForMultipleObjects(
			(DWORD) array->slots_per_segment(),
			array->handles(segment),
			FALSE, INFINITE);
	}

	array->acquire();

	/* Woken up for shutdown with nothing left to do: report no
	request to the caller. */
	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && array->is_empty()
	    && !buf_page_cleaner_is_active) {

		*m1 = NULL;
		*m2 = NULL;

		array->release();

		return(DB_SUCCESS);
	}

	ulint	n = array->slots_per_segment();

	/* A valid index is WAIT_OBJECT_0 .. WAIT_OBJECT_0 + n - 1: the
	wait covers exactly n handles, and pos == n would address the
	first slot of the next segment. */
	ut_a(pos >= WAIT_OBJECT_0 && pos < WAIT_OBJECT_0 + n);

	slot = array->at(pos + segment * n);

	ut_a(slot->is_reserved);

	if (orig_seg != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(
			orig_seg, "get windows aio return value");
	}

	BOOL	ret;
	ret = GetOverlappedResult(
		slot->file.m_file, &slot->control, &slot->n_bytes, TRUE);
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	BOOL	retry = FALSE;

	if (ret && slot->n_bytes == slot->len) {

		err = DB_SUCCESS;

	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		/* err is assigned in the retry block below. */
		retry = TRUE;

	} else {

		err = DB_IO_ERROR;
	}

	array->release();

	if (retry) {
		/* Retry failed read/write operation synchronously.
		No need to hold array->m_mutex. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		struct PSI_file_locker* locker = NULL;
		PSI_file_locker_state   state;
		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
			/* AIO was queued successfully!
			We want a synchronous I/O operation on a
			file where we also use async I/O: in Windows
			we must use the same wait mechanism as for
			async I/O */

			ret = GetOverlappedResult(
				slot->file.m_file, &slot->control, &slot->n_bytes,
				TRUE);
			n_bytes = ret ? slot->n_bytes : -1;
		}

		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	/* Free the slot; see the NOTE in the function comment. */
	array->release_with_mutex(slot);

	return(err);
}
#endif /* WIN_ASYNC_IO */
7427 
/**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.

A plain synchronous request (mode == OS_AIO_SYNC and, when WIN_ASYNC_IO is
defined, native AIO not in use) is served directly through
os_file_read_func() / os_file_write_func(). Every other request reserves a
slot in the array chosen by AIO::select_slot_array(); with native AIO the
slot is then handed to the OS, otherwise it is left for the simulated AIO
handler thread, which is woken up only if type.is_wake().
@param[in]	type		IO request context
@param[in]	mode		IO mode
@param[in]	name		Name of the file or path as NUL terminated
				string
@param[in]	file		Open file handle
@param[in,out]	buf		buffer to read into, or to write from
@param[in]	offset		file offset where to read or write
@param[in]	n		number of bytes to read or write
@param[in]	read_only	if true read only mode checks are enforced
@param[in,out]	m1		Message for the AIO handler, (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in,out]	m2		message for the AIO handler (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@return DB_SUCCESS or error code; DB_IO_ERROR when dispatching the request
failed and os_file_handle_error() did not ask for a retry */
dberr_t
os_aio_func(
	IORequest&	type,
	ulint		mode,
	const char*	name,
	pfs_os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	bool		read_only,
	fil_node_t*	m1,
	void*		m2)
{
#ifdef WIN_ASYNC_IO
	/* Result of ReadFile()/WriteFile(); only examined when native AIO
	is in use. */
	BOOL		ret = TRUE;
#endif /* WIN_ASYNC_IO */

	/* Debug-only sanity checks: a non-empty request whose length and
	offset are both multiples of OS_FILE_LOG_BLOCK_SIZE. */
	ut_ad(n > 0);
	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad(os_aio_validate_skip());

#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits. */
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */

	if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
	    && !srv_use_native_aio
#endif /* WIN_ASYNC_IO */
	    ) {
		/* This is actually an ordinary synchronous read or write:
		no need to use an i/o-handler thread. NOTE that if we use
		Windows async i/o, Windows does not allow us to use
		ordinary synchronous os_file_read etc. on the same file,
		therefore we have built a special mechanism for synchronous
		wait in the Windows case.
		Also note that the Performance Schema instrumentation has
		been performed by current os_aio_func()'s wrapper function
		pfs_os_aio_func(). So we would no longer need to call
		Performance Schema instrumented os_file_read() and
		os_file_write(). Instead, we should use os_file_read_func()
		and os_file_write_func() */

		if (type.is_read()) {
			return(os_file_read_func(type, file.m_file, buf, offset, n));
		}

		ut_ad(type.is_write());
		return(os_file_write_func(type, name, file.m_file, buf, offset, n));
	}

	/* Retry point: err_exit jumps back here when
	os_file_handle_error() returns true, so the slot is reserved anew
	on every attempt. */
try_again:

	AIO*	array;

	array = AIO::select_slot_array(type, read_only, mode);

	Slot*	slot;

	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);

	if (type.is_read()) {

		if (srv_use_native_aio) {

			/* Native AIO: update the read statistics and hand
			the slot to the operating system. */
			++os_n_file_reads;

			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else if (type.is_wake()) {
			/* Simulated AIO: the slot stays reserved in the
			array; wake the handler thread of its segment
			because the request asks for it. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else if (type.is_write()) {

		if (srv_use_native_aio) {
			/* Native AIO: update the write statistics and hand
			the slot to the operating system. */
			++os_n_file_writes;

#ifdef WIN_ASYNC_IO
			ret = WriteFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */

		} else if (type.is_wake()) {
			/* Simulated AIO: see the read branch above. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else {
		/* A request must be either a read or a write. */
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (srv_use_native_aio) {
		/* The request counts as queued when the call either
		completed at once with the full length transferred, or
		failed with ERROR_IO_PENDING. */
		if ((ret && slot->len == slot->n_bytes)
		     || (!ret && GetLastError() == ERROR_IO_PENDING)) {
			/* aio was queued successfully! */

			if (mode == OS_AIO_SYNC) {
				/* Outputs of os_aio_windows_handler() that
				this caller has no use for. */
				IORequest	dummy_type;
				void*		dummy_mess2;
				struct fil_node_t* dummy_mess1;

				/* We want a synchronous i/o operation on a
				file where we also use async i/o: in Windows
				we must use the same wait mechanism as for
				async i/o */

				return(os_aio_windows_handler(
					ULINT_UNDEFINED, slot->pos,
					&dummy_mess1, &dummy_mess2,
					&dummy_type));
			}

			return(DB_SUCCESS);
		}

		goto err_exit;
	}
#endif /* WIN_ASYNC_IO */

	/* AIO request was queued successfully! */
	return(DB_SUCCESS);

#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */

	/* Dispatch failed: give the slot back before deciding whether the
	whole request is retried from try_again. */
	array->release_with_mutex(slot);

	if (os_file_handle_error(
		name, type.is_read() ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(DB_IO_ERROR);
}
7598 
7599 /** Simulated AIO handler for reaping IO requests */
7600 class SimulatedAIOHandler {
7601 
7602 public:
7603 
7604 	/** Constructor
7605 	@param[in,out]	array	The AIO array
7606 	@param[in]	segment	Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)7607 	SimulatedAIOHandler(AIO* array, ulint segment)
7608 		:
7609 		m_oldest(),
7610 		m_n_elems(),
7611 		m_lowest_offset(IB_UINT64_MAX),
7612 		m_array(array),
7613 		m_n_slots(),
7614 		m_segment(segment),
7615 		m_ptr(),
7616 		m_buf()
7617 	{
7618 		ut_ad(m_segment < 100);
7619 
7620 		m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
7621 	}
7622 
7623 	/** Destructor */
~SimulatedAIOHandler()7624 	~SimulatedAIOHandler()
7625 	{
7626 		if (m_ptr != NULL) {
7627 			ut_free(m_ptr);
7628 		}
7629 	}
7630 
7631 	/** Reset the state of the handler
7632 	@param[in]	n_slots	Number of pending AIO operations supported */
init(ulint n_slots)7633 	void init(ulint n_slots)
7634 	{
7635 		m_oldest = 0;
7636 		m_n_elems = 0;
7637 		m_n_slots = n_slots;
7638 		m_lowest_offset = IB_UINT64_MAX;
7639 
7640 		if (m_ptr != NULL) {
7641 			ut_free(m_ptr);
7642 			m_ptr = m_buf = NULL;
7643 		}
7644 
7645 		m_slots[0] = NULL;
7646 	}
7647 
7648 	/** Check if there is a slot for which the i/o has already been done
7649 	@param[out]	n_reserved	Number of reserved slots
7650 	@return the first completed slot that is found. */
check_completed(ulint * n_reserved)7651 	Slot* check_completed(ulint* n_reserved)
7652 	{
7653 		ulint	offset = m_segment * m_n_slots;
7654 
7655 		*n_reserved = 0;
7656 
7657 		Slot*	slot;
7658 
7659 		slot = m_array->at(offset);
7660 
7661 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7662 
7663 			if (slot->is_reserved) {
7664 
7665 				if (slot->io_already_done) {
7666 
7667 					ut_a(slot->is_reserved);
7668 
7669 					return(slot);
7670 				}
7671 
7672 				++*n_reserved;
7673 			}
7674 		}
7675 
7676 		return(NULL);
7677 	}
7678 
7679 	/** If there are at least 2 seconds old requests, then pick the
7680 	oldest one to prevent starvation.  If several requests have the
7681 	same age, then pick the one at the lowest offset.
7682 	@return true if request was selected */
select()7683 	bool select()
7684 	{
7685 		if (!select_oldest()) {
7686 
7687 			return(select_lowest_offset());
7688 		}
7689 
7690 		return(true);
7691 	}
7692 
7693 	/** Check if there are several consecutive blocks
7694 	to read or write. Merge them if found. */
merge()7695 	void merge()
7696 	{
7697 		/* if m_n_elems != 0, then we have assigned
7698 		something valid to consecutive_ios[0] */
7699 		ut_ad(m_n_elems != 0);
7700 		ut_ad(first_slot() != NULL);
7701 
7702 		Slot*	slot = first_slot();
7703 
7704 		while (!merge_adjacent(slot)) {
7705 			/* No op */
7706 		}
7707 	}
7708 
7709 	/** We have now collected n_consecutive I/O requests
7710 	in the array; allocate a single buffer which can hold
7711 	all data, and perform the I/O
7712 	@return the length of the buffer */
allocate_buffer()7713 	ulint allocate_buffer()
7714 		MY_ATTRIBUTE((warn_unused_result))
7715 	{
7716 		ulint	len;
7717 		Slot*	slot = first_slot();
7718 
7719 		ut_ad(m_ptr == NULL);
7720 
7721 		if (slot->type.is_read() && m_n_elems > 1) {
7722 
7723 			len = 0;
7724 
7725 			for (ulint i = 0; i < m_n_elems; ++i) {
7726 				len += m_slots[i]->len;
7727 			}
7728 
7729 			m_ptr = static_cast<byte*>(
7730 				ut_malloc_nokey(len + UNIV_PAGE_SIZE));
7731 
7732 			m_buf = static_cast<byte*>(
7733 				ut_align(m_ptr, UNIV_PAGE_SIZE));
7734 
7735 		} else {
7736 			len = first_slot()->len;
7737 			m_buf = first_slot()->buf;
7738 		}
7739 
7740 		return(len);
7741 	}
7742 
7743 	/** We have to compress the individual pages and punch
7744 	holes in them on a page by page basis when writing to
7745 	tables that can be compresed at the IO level.
7746 	@param[in]	len		Value returned by allocate_buffer */
copy_to_buffer(ulint len)7747 	void copy_to_buffer(ulint len)
7748 	{
7749 		Slot*	slot = first_slot();
7750 
7751 		if (len > slot->len && slot->type.is_write()) {
7752 
7753 			byte*	ptr = m_buf;
7754 
7755 			ut_ad(ptr != slot->buf);
7756 
7757 			/* Copy the buffers to the combined buffer */
7758 			for (ulint i = 0; i < m_n_elems; ++i) {
7759 
7760 				slot = m_slots[i];
7761 
7762 				memmove(ptr, slot->buf, slot->len);
7763 
7764 				ptr += slot->len;
7765 			}
7766 		}
7767 	}
7768 
7769 	/** Do the I/O with ordinary, synchronous i/o functions:
7770 	@param[in]	len		Length of buffer for IO */
io()7771 	void io()
7772 	{
7773 		if (first_slot()->type.is_write()) {
7774 
7775 			for (ulint i = 0; i < m_n_elems; ++i) {
7776 				write(m_slots[i]);
7777 			}
7778 
7779 		} else {
7780 
7781 			for (ulint i = 0; i < m_n_elems; ++i) {
7782 				read(m_slots[i]);
7783 			}
7784 		}
7785 	}
7786 
7787 	/** Do the decompression of the pages read in */
io_complete()7788 	void io_complete()
7789 	{
7790 		// Note: For non-compressed tables. Not required
7791 		// for correctness.
7792 	}
7793 
7794 	/** Mark the i/os done in slots */
done()7795 	void done()
7796 	{
7797 		for (ulint i = 0; i < m_n_elems; ++i) {
7798 			m_slots[i]->io_already_done = true;
7799 		}
7800 	}
7801 
7802 	/** @return the first slot in the consecutive array */
first_slot()7803 	Slot* first_slot()
7804 		MY_ATTRIBUTE((warn_unused_result))
7805 	{
7806 		ut_a(m_n_elems > 0);
7807 
7808 		return(m_slots[0]);
7809 	}
7810 
7811 	/** Wait for I/O requests
7812 	@param[in]	global_segment	The global segment
7813 	@param[in,out]	event		Wait on event if no active requests
7814 	@return the number of slots */
7815 	ulint check_pending(
7816 		ulint		global_segment,
7817 		os_event_t	event)
7818 		MY_ATTRIBUTE((warn_unused_result));
7819 private:
7820 
7821 	/** Do the file read
7822 	@param[in,out]	slot		Slot that has the IO context */
read(Slot * slot)7823 	void read(Slot* slot)
7824 	{
7825 		dberr_t	err = os_file_read_func(
7826 			slot->type,
7827 			slot->file.m_file,
7828 			slot->ptr,
7829 			slot->offset,
7830 			slot->len);
7831 		ut_a(err == DB_SUCCESS);
7832 	}
7833 
7834 	/** Do the file read
7835 	@param[in,out]	slot		Slot that has the IO context */
write(Slot * slot)7836 	void write(Slot* slot)
7837 	{
7838 		dberr_t	err = os_file_write_func(
7839 			slot->type,
7840 			slot->name,
7841 			slot->file.m_file,
7842 			slot->ptr,
7843 			slot->offset,
7844 			slot->len);
7845 		ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
7846 	}
7847 
7848 	/** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7849 	bool adjacent(const Slot* s1, const Slot* s2) const
7850 	{
7851 		return(s1 != s2
7852 		       && s1->file.m_file == s2->file.m_file
7853 		       && s2->offset == s1->offset + s1->len
7854 		       && s1->type == s2->type);
7855 	}
7856 
7857 	/** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7858 	bool merge_adjacent(Slot*& current)
7859 	{
7860 		Slot*	slot;
7861 		ulint	offset = m_segment * m_n_slots;
7862 
7863 		slot = m_array->at(offset);
7864 
7865 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7866 
7867 			if (slot->is_reserved && adjacent(current, slot)) {
7868 
7869 				current = slot;
7870 
7871 				/* Found a consecutive i/o request */
7872 
7873 				m_slots[m_n_elems] = slot;
7874 
7875 				++m_n_elems;
7876 
7877 				return(m_n_elems >= m_slots.capacity());
7878 			}
7879 		}
7880 
7881 		return(true);
7882 	}
7883 
7884 	/** There were no old requests. Look for an I/O request at the lowest
7885 	offset in the array (we ignore the high 32 bits of the offset in these
7886 	heuristics) */
select_lowest_offset()7887 	bool select_lowest_offset()
7888 	{
7889 		ut_ad(m_n_elems == 0);
7890 
7891 		ulint	offset = m_segment * m_n_slots;
7892 
7893 		m_lowest_offset = IB_UINT64_MAX;
7894 
7895 		for (ulint i = 0; i < m_n_slots; ++i) {
7896 			Slot*	slot;
7897 
7898 			slot = m_array->at(i + offset);
7899 
7900 			if (slot->is_reserved
7901 			    && slot->offset < m_lowest_offset) {
7902 
7903 				/* Found an i/o request */
7904 				m_slots[0] = slot;
7905 
7906 				m_n_elems = 1;
7907 
7908 				m_lowest_offset = slot->offset;
7909 			}
7910 		}
7911 
7912 		return(m_n_elems > 0);
7913 	}
7914 
7915 	/** Select the slot if it is older than the current oldest slot.
7916 	@param[in]	slot		The slot to check */
select_if_older(Slot * slot)7917 	void select_if_older(Slot* slot)
7918 	{
7919 		int64_t time_diff = ut_time_monotonic() -
7920 					slot->reservation_time;
7921 
7922 		const uint64_t age = time_diff > 0 ? (uint64_t) time_diff : 0;
7923 
7924 		if ((age >= 2 && age > m_oldest)
7925 		    || (age >= 2
7926 			&& age == m_oldest
7927 			&& slot->offset < m_lowest_offset)) {
7928 
7929 			/* Found an i/o request */
7930 			m_slots[0] = slot;
7931 
7932 			m_n_elems = 1;
7933 
7934 			m_oldest = age;
7935 
7936 			m_lowest_offset = slot->offset;
7937 		}
7938 	}
7939 
7940 	/** Select th oldest slot in the array
7941 	@return true if oldest slot found */
select_oldest()7942 	bool select_oldest()
7943 	{
7944 		ut_ad(m_n_elems == 0);
7945 
7946 		Slot*	slot;
7947 		ulint	offset = m_n_slots * m_segment;
7948 
7949 		slot = m_array->at(offset);
7950 
7951 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7952 
7953 			if (slot->is_reserved) {
7954 				select_if_older(slot);
7955 			}
7956 		}
7957 
7958 		return(m_n_elems > 0);
7959 	}
7960 
7961 	typedef std::vector<Slot*> slots_t;
7962 
7963 private:
7964 	ulint		m_oldest;
7965 	ulint		m_n_elems;
7966 	os_offset_t	m_lowest_offset;
7967 
7968 	AIO*		m_array;
7969 	ulint		m_n_slots;
7970 	ulint		m_segment;
7971 
7972 	slots_t		m_slots;
7973 
7974 	byte*		m_ptr;
7975 	byte*		m_buf;
7976 };
7977 
7978 /** Wait for I/O requests
7979 @return the number of slots */
7980 ulint
check_pending(ulint global_segment,os_event_t event)7981 SimulatedAIOHandler::check_pending(
7982 	ulint		global_segment,
7983 	os_event_t	event)
7984 {
7985 	/* NOTE! We only access constant fields in os_aio_array.
7986 	Therefore we do not have to acquire the protecting mutex yet */
7987 
7988 	ut_ad(os_aio_validate_skip());
7989 
7990 	ut_ad(m_segment < m_array->get_n_segments());
7991 
7992 	/* Look through n slots after the segment * n'th slot */
7993 
7994 	if (AIO::is_read(m_array)
7995 	    && os_aio_recommend_sleep_for_read_threads) {
7996 
7997 		/* Give other threads chance to add several
7998 		I/Os to the array at once. */
7999 
8000 		srv_set_io_thread_op_info(
8001 			global_segment, "waiting for i/o request");
8002 
8003 		os_event_wait(event);
8004 
8005 		return(0);
8006 	}
8007 
8008 	return(m_array->slots_per_segment());
8009 }
8010 
/** Does simulated AIO. This function should be called by an i/o-handler
thread. It loops until it finds a request of its segment whose I/O has
already been done, or until it can select pending request(s), which it then
performs with synchronous I/O. Exactly one request is reported per call and
its slot is released.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example; set to NULL when the
			function returns because of shutdown
@param[out]	m2	Callback argument; set to NULL when the function
			returns because of shutdown
@param[out]	type	IO context of the reported request; not written
			when the function returns because of shutdown
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	/* Translate the global segment number into the array that owns it
	and the segment number local to that array. */
	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	/* Both break statements below leave this loop with the array mutex
	still held (array->acquire() without a matching release()). */
	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		ulint	n_slots = handler.check_pending(global_segment, event);

		/* 0 means check_pending() waited on the event instead of
		reporting a slot count: start over. */
		if (n_slots == 0) {
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* A request whose I/O is already done: report it. */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* A pending request was selected; slot is still
			NULL here, which makes the code after the loop do
			the I/O. */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. The event is reset before the array
		mutex is released. */

		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* slot != NULL: the loop found a slot that has already completed
	its IO. slot == NULL: handler.select() chose pending request(s)
	and the I/O still has to be done here. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		handler.io_complete();

		/* Take the array mutex again before marking the slots. */
		array->acquire();

		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	/* Copy the request's messages and type out before the slot is
	given back. */
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	/* First release the slot itself, then the array mutex. */
	array->release(slot);

	array->release();

	return(DB_SUCCESS);
}
8163 
8164 /** Get the total number of pending IOs
8165 @return the total number of pending IOs */
8166 ulint
total_pending_io_count()8167 AIO::total_pending_io_count()
8168 {
8169 	ulint	count = s_reads->pending_io_count();
8170 
8171 	if (s_writes != NULL) {
8172 		count += s_writes->pending_io_count();
8173 	}
8174 
8175 	if (s_ibuf != NULL) {
8176 		count += s_ibuf->pending_io_count();
8177 	}
8178 
8179 	if (s_log != NULL) {
8180 		count += s_log->pending_io_count();
8181 	}
8182 
8183 	if (s_sync != NULL) {
8184 		count += s_sync->pending_io_count();
8185 	}
8186 
8187 	return(count);
8188 }
8189 
8190 /** Validates the consistency the aio system.
8191 @return true if ok */
8192 static
8193 bool
os_aio_validate()8194 os_aio_validate()
8195 {
8196 	/* The methods countds and validates, we ignore the count. */
8197 	AIO::total_pending_io_count();
8198 
8199 	return(true);
8200 }
8201 
8202 /** Prints pending IO requests per segment of an aio array.
8203 We probably don't need per segment statistics but they can help us
8204 during development phase to see if the IO requests are being
8205 distributed as expected.
8206 @param[in,out]	file		File where to print
8207 @param[in]	segments	Pending IO array */
8208 void
print_segment_info(FILE * file,const ulint * segments)8209 AIO::print_segment_info(
8210 	FILE*		file,
8211 	const ulint*	segments)
8212 {
8213 	ut_ad(m_n_segments > 0);
8214 
8215 	if (m_n_segments > 1) {
8216 
8217 		fprintf(file, " [");
8218 
8219 		for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
8220 
8221 			if (i != 0) {
8222 				fprintf(file, ", ");
8223 			}
8224 
8225 			fprintf(file, ULINTPF, *segments);
8226 		}
8227 
8228 		fprintf(file, "] ");
8229 	}
8230 }
8231 
8232 /** Prints info about the aio array.
8233 @param[in,out]	file		Where to print */
8234 void
print(FILE * file)8235 AIO::print(FILE* file)
8236 {
8237 	ulint	count = 0;
8238 	ulint	n_res_seg[SRV_MAX_N_IO_THREADS];
8239 
8240 	mutex_enter(&m_mutex);
8241 
8242 	ut_a(!m_slots.empty());
8243 	ut_a(m_n_segments > 0);
8244 
8245 	memset(n_res_seg, 0x0, sizeof(n_res_seg));
8246 
8247 	for (ulint i = 0; i < m_slots.size(); ++i) {
8248 		Slot&	slot = m_slots[i];
8249 		ulint	segment = (i * m_n_segments) / m_slots.size();
8250 
8251 		if (slot.is_reserved) {
8252 
8253 			++count;
8254 
8255 			++n_res_seg[segment];
8256 
8257 			ut_a(slot.len > 0);
8258 		}
8259 	}
8260 
8261 	ut_a(m_n_reserved == count);
8262 
8263 	print_segment_info(file, n_res_seg);
8264 
8265 	mutex_exit(&m_mutex);
8266 }
8267 
8268 /** Print all the AIO segments
8269 @param[in,out]	file		Where to print */
8270 void
print_all(FILE * file)8271 AIO::print_all(FILE* file)
8272 {
8273 	s_reads->print(file);
8274 
8275 	if (s_writes != NULL) {
8276 		fputs(", aio writes:", file);
8277 		s_writes->print(file);
8278 	}
8279 
8280 	if (s_ibuf != NULL) {
8281 		fputs(",\n ibuf aio reads:", file);
8282 		s_ibuf->print(file);
8283 	}
8284 
8285 	if (s_log != NULL) {
8286 		fputs(", log i/o's:", file);
8287 		s_log->print(file);
8288 	}
8289 
8290 	if (s_sync != NULL) {
8291 		fputs(", sync i/o's:", file);
8292 		s_sync->print(file);
8293 	}
8294 }
8295 
8296 /** Prints info of the aio arrays.
8297 @param[in,out]	file		file where to print */
8298 void
os_aio_print(FILE * file)8299 os_aio_print(FILE*	file)
8300 {
8301 	ib_time_monotonic_t 		current_time;
8302 	double	 			time_elapsed;
8303 	double				avg_bytes_read;
8304 
8305 	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
8306 		fprintf(file, "I/O thread %lu state: %s (%s)",
8307 			(ulong) i,
8308 			srv_io_thread_op_info[i],
8309 			srv_io_thread_function[i]);
8310 
8311 #ifndef _WIN32
8312 		if (os_event_is_set(os_aio_segment_wait_events[i])) {
8313 			fprintf(file, " ev set");
8314 		}
8315 #endif /* _WIN32 */
8316 
8317 		fprintf(file, "\n");
8318 	}
8319 
8320 	fputs("Pending normal aio reads:", file);
8321 
8322 	AIO::print_all(file);
8323 
8324 	putc('\n', file);
8325 	current_time = ut_time_monotonic();
8326 	time_elapsed = 0.001 + (current_time - os_last_printout);
8327 
8328 	fprintf(file,
8329 		"Pending flushes (fsync) log: " ULINTPF "; "
8330 		"buffer pool: " ULINTPF "\n"
8331 		ULINTPF " OS file reads, "
8332 		ULINTPF " OS file writes, "
8333 		ULINTPF " OS fsyncs\n",
8334 		fil_n_pending_log_flushes,
8335 		fil_n_pending_tablespace_flushes,
8336 		os_n_file_reads,
8337 		os_n_file_writes,
8338 		os_n_fsyncs);
8339 
8340 	if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
8341 		fprintf(file,
8342 			ULINTPF " pending preads, "
8343 			ULINTPF " pending pwrites\n",
8344 			os_n_pending_reads,
8345 			os_n_pending_writes);
8346 	}
8347 
8348 	if (os_n_file_reads == os_n_file_reads_old) {
8349 		avg_bytes_read = 0.0;
8350 	} else {
8351 		avg_bytes_read = (double) os_bytes_read_since_printout
8352 			/ (os_n_file_reads - os_n_file_reads_old);
8353 	}
8354 
8355 	fprintf(file,
8356 		"%.2f reads/s, %lu avg bytes/read,"
8357 		" %.2f writes/s, %.2f fsyncs/s\n",
8358 		(os_n_file_reads - os_n_file_reads_old)
8359 		/ time_elapsed,
8360 		(ulong) avg_bytes_read,
8361 		(os_n_file_writes - os_n_file_writes_old)
8362 		/ time_elapsed,
8363 		(os_n_fsyncs - os_n_fsyncs_old)
8364 		/ time_elapsed);
8365 
8366 	os_n_file_reads_old = os_n_file_reads;
8367 	os_n_file_writes_old = os_n_file_writes;
8368 	os_n_fsyncs_old = os_n_fsyncs;
8369 	os_bytes_read_since_printout = 0;
8370 
8371 	os_last_printout = current_time;
8372 }
8373 
8374 /** Refreshes the statistics used to print per-second averages. */
8375 void
os_aio_refresh_stats()8376 os_aio_refresh_stats()
8377 {
8378 	os_n_fsyncs_old = os_n_fsyncs;
8379 
8380 	os_bytes_read_since_printout = 0;
8381 
8382 	os_n_file_reads_old = os_n_file_reads;
8383 
8384 	os_n_file_writes_old = os_n_file_writes;
8385 
8386 	os_n_fsyncs_old = os_n_fsyncs;
8387 
8388 	os_bytes_read_since_printout = 0;
8389 
8390 	os_last_printout = ut_time_monotonic();
8391 }
8392 
8393 /** Checks that all slots in the system have been freed, that is, there are
8394 no pending io operations.
8395 @return true if all free */
8396 bool
os_aio_all_slots_free()8397 os_aio_all_slots_free()
8398 {
8399 	return(AIO::total_pending_io_count() == 0);
8400 }
8401 
8402 #ifdef UNIV_DEBUG
8403 /** Prints all pending IO for the array
8404 @param[in]	file	file where to print
8405 @param[in]	array	array to process */
8406 void
to_file(FILE * file) const8407 AIO::to_file(FILE* file) const
8408 {
8409 	acquire();
8410 
8411 	fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
8412 
8413 	for (ulint i = 0; i < m_slots.size(); ++i) {
8414 
8415 		const Slot&	slot = m_slots[i];
8416 
8417 		if (slot.is_reserved) {
8418 
8419 			fprintf(file,
8420 				"%s IO for %s (offset=" UINT64PF
8421 				", size=%lu)\n",
8422 				slot.type.is_read() ? "read" : "write",
8423 				slot.name, slot.offset, slot.len);
8424 		}
8425 	}
8426 
8427 	release();
8428 }
8429 
8430 /** Print pending IOs for all arrays */
8431 void
print_to_file(FILE * file)8432 AIO::print_to_file(FILE* file)
8433 {
8434 	fprintf(file, "Pending normal aio reads:");
8435 
8436 	s_reads->to_file(file);
8437 
8438 	if (s_writes != NULL) {
8439 		fprintf(file, "Pending normal aio writes:");
8440 		s_writes->to_file(file);
8441 	}
8442 
8443 	if (s_ibuf != NULL) {
8444 		fprintf(file, "Pending ibuf aio reads:");
8445 		s_ibuf->to_file(file);
8446 	}
8447 
8448 	if (s_log != NULL) {
8449 		fprintf(file, "Pending log i/o's:");
8450 		s_log->to_file(file);
8451 	}
8452 
8453 	if (s_sync != NULL) {
8454 		fprintf(file, "Pending sync i/o's:");
8455 		s_sync->to_file(file);
8456 	}
8457 }
8458 
8459 /** Prints all pending IO
8460 @param[in]	file		File where to print */
8461 void
os_aio_print_pending_io(FILE * file)8462 os_aio_print_pending_io(
8463 	FILE*	file)
8464 {
8465 	AIO::print_to_file(file);
8466 }
8467 
8468 #endif /* UNIV_DEBUG */
8469 
8470 /**
8471 Set the file create umask
8472 @param[in]	umask		The umask to use for file creation. */
8473 void
os_file_set_umask(ulint umask)8474 os_file_set_umask(ulint umask)
8475 {
8476 	os_innodb_umask = umask;
8477 }
8478 #else
8479 
8480 #include "univ.i"
8481 #include "db0err.h"
8482 #include "mach0data.h"
8483 #include "fil0fil.h"
8484 #include "os0file.h"
8485 
8486 #include <lz4.h>
8487 #include <zlib.h>
8488 
8489 #include <my_aes.h>
8490 #include <my_rnd.h>
8491 #include <mysqld.h>
8492 #include <mysql/service_mysql_keyring.h>
8493 
/** In the UNIV_INNOCHECKSUM build a block is plain bytes: os_alloc_block()
returns a pointer to the first byte of the buffer it allocates. */
typedef byte	Block;
8495 
8496 /** Allocate a page for sync IO
8497 @return pointer to page */
8498 static
8499 Block*
os_alloc_block()8500 os_alloc_block()
8501 {
8502 	return(reinterpret_cast<byte*>(malloc(UNIV_PAGE_SIZE_MAX * 2)));
8503 }
8504 
8505 /** Free a page after sync IO
8506 @param[in,own]	block		The block to free/release */
8507 static
8508 void
os_free_block(Block * block)8509 os_free_block(Block* block)
8510 {
8511 	ut_free(block);
8512 }
8513 
8514 #endif /* !UNIV_INNOCHECKSUM */
8515 
/** Minimum length needed for encryption: two AES blocks
(MY_AES_BLOCK_SIZE each) plus FIL_PAGE_DATA bytes */
const unsigned int MIN_ENCRYPTION_LEN = 2 * MY_AES_BLOCK_SIZE + FIL_PAGE_DATA;
8518 
8519 /**
8520 @param[in]      type            The compression type
8521 @return the string representation */
8522 const char*
to_string(Type type)8523 Compression::to_string(Type type)
8524 {
8525         switch(type) {
8526         case NONE:
8527                 return("None");
8528         case ZLIB:
8529                 return("Zlib");
8530         case LZ4:
8531                 return("LZ4");
8532         }
8533 
8534         ut_ad(0);
8535 
8536         return("<UNKNOWN>");
8537 }
8538 
8539 /**
8540 @param[in]      meta		Page Meta data
8541 @return the string representation */
to_string(const Compression::meta_t & meta)8542 std::string Compression::to_string(const Compression::meta_t& meta)
8543 {
8544 	std::ostringstream	stream;
8545 
8546 	stream	<< "version: " << int(meta.m_version) << " "
8547 		<< "algorithm: " << meta.m_algorithm << " "
8548 		<< "(" << to_string(meta.m_algorithm) << ") "
8549 		<< "orginal_type: " << meta.m_original_type << " "
8550 		<< "original_size: " << meta.m_original_size << " "
8551 		<< "compressed_size: " << meta.m_compressed_size;
8552 
8553 	return(stream.str());
8554 }
8555 
8556 /** @return true if it is a compressed page */
8557 bool
is_compressed_page(const byte * page)8558 Compression::is_compressed_page(const byte* page)
8559 {
8560 	return(mach_read_from_2(page + FIL_PAGE_TYPE) == FIL_PAGE_COMPRESSED);
8561 }
8562 
8563 bool
is_compressed_encrypted_page(const byte * page)8564 Compression::is_compressed_encrypted_page(const byte *page) {
8565 	return (mach_read_from_2(page + FIL_PAGE_TYPE) ==
8566 		FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
8567 }
8568 
8569 bool
is_valid_page_version(uint8_t version)8570 Compression::is_valid_page_version(uint8_t version) {
8571 	return (version == FIL_PAGE_VERSION_1 || version == FIL_PAGE_VERSION_2);
8572 }
8573 
8574 /** Deserizlise the page header compression meta-data
8575 @param[in]	page		Pointer to the page header
8576 @param[out]	control		Deserialised data */
8577 void
deserialize_header(const byte * page,Compression::meta_t * control)8578 Compression::deserialize_header(
8579 	const byte*		page,
8580 	Compression::meta_t*	control)
8581 {
8582 	ut_ad(is_compressed_page(page) || is_compressed_encrypted_page(page));
8583 
8584 	control->m_version = static_cast<uint8_t>(
8585 		mach_read_from_1(page + FIL_PAGE_VERSION));
8586 
8587 	control->m_original_type = static_cast<uint16_t>(
8588 		mach_read_from_2(page + FIL_PAGE_ORIGINAL_TYPE_V1));
8589 
8590 	control->m_compressed_size = static_cast<uint16_t>(
8591 		mach_read_from_2(page + FIL_PAGE_COMPRESS_SIZE_V1));
8592 
8593 	control->m_original_size = static_cast<uint16_t>(
8594 		mach_read_from_2(page + FIL_PAGE_ORIGINAL_SIZE_V1));
8595 
8596 	control->m_algorithm = static_cast<Type>(
8597 		mach_read_from_1(page + FIL_PAGE_ALGORITHM_V1));
8598 }
8599 
8600 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
8601 not then the source contents are left unchanged and DB_SUCCESS is returned.
@param[in]	dblwr_recover	true if double write recovery is in progress
8603 @param[in,out]	src		Data read from disk, decompressed data will be
8604 				copied to this page
8605 @param[in,out]	dst		Scratch area to use for decompression
8606 @param[in]	dst_len		Size of the scratch area in bytes
8607 @return DB_SUCCESS or error code */
8608 dberr_t
deserialize(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)8609 Compression::deserialize(
8610 	bool		dblwr_recover,
8611 	byte*		src,
8612 	byte*		dst,
8613 	ulint		dst_len)
8614 {
8615 	if (!is_compressed_page(src)) {
8616 		/* There is nothing we can do. */
8617 		return(DB_SUCCESS);
8618 	}
8619 
8620 	meta_t	header;
8621 
8622 	deserialize_header(src, &header);
8623 
8624 	byte*	ptr = src + FIL_PAGE_DATA;
8625 
8626 	ut_ad(is_valid_page_version(header.m_version));
8627 
8628 	if (!is_valid_page_version(header.m_version)
8629 	    || header.m_original_size < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8)
8630 	    || header.m_original_size > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA
8631 	    || dst_len < header.m_original_size + FIL_PAGE_DATA) {
8632 
8633 		/* The last check could potentially return DB_OVERFLOW,
8634 		the caller should be able to retry with a larger buffer. */
8635 
8636 		return(DB_CORRUPTION);
8637 	}
8638 
8639 	Block*	block;
8640 
8641 	/* The caller doesn't know what to expect */
8642 	if (dst == NULL) {
8643 
8644 		block = os_alloc_block();
8645 
8646 #ifdef UNIV_INNOCHECKSUM
8647 		dst = block;
8648 #else
8649 		dst = block->m_ptr;
8650 #endif /* UNIV_INNOCHECKSUM */
8651 
8652 	} else {
8653 		block = NULL;
8654 	}
8655 
8656 	int		ret;
8657 	Compression	compression;
8658 	ulint		len = header.m_original_size;
8659 
8660 	compression.m_type = static_cast<Compression::Type>(header.m_algorithm);
8661 
8662 	switch(compression.m_type) {
8663 	case Compression::ZLIB: {
8664 
8665 		uLongf	zlen = header.m_original_size;
8666 
8667 		if (uncompress(dst, &zlen, ptr, header.m_compressed_size)
8668 		    != Z_OK) {
8669 
8670 			if (block != NULL) {
8671 				os_free_block(block);
8672 			}
8673 
8674 			return(DB_IO_DECOMPRESS_FAIL);
8675 		}
8676 
8677 		len = static_cast<ulint>(zlen);
8678 
8679 		break;
8680 	}
8681 
8682 	case Compression::LZ4:
8683 
8684                 ret = LZ4_decompress_safe(
8685                         reinterpret_cast<char*>(ptr),
8686                         reinterpret_cast<char*>(dst),
8687                         header.m_compressed_size,
8688                         header.m_original_size);
8689 		if (ret < 0) {
8690 
8691 			if (block != NULL) {
8692 				os_free_block(block);
8693 			}
8694 
8695 			return(DB_IO_DECOMPRESS_FAIL);
8696 		}
8697 
8698 		break;
8699 
8700 	default:
8701 #if !defined(UNIV_INNOCHECKSUM)
8702 		ib::error()
8703 			<< "Compression algorithm support missing: "
8704 			<< Compression::to_string(compression.m_type);
8705 #else
8706 		fprintf(stderr, "Compression algorithm support missing: %s\n",
8707 			Compression::to_string(compression.m_type));
8708 #endif /* !UNIV_INNOCHECKSUM */
8709 
8710 		if (block != NULL) {
8711 			os_free_block(block);
8712 		}
8713 
8714 		return(DB_UNSUPPORTED);
8715 	}
8716 
8717 	/* Leave the header alone */
8718 	memmove(src + FIL_PAGE_DATA, dst, len);
8719 
8720 	mach_write_to_2(src + FIL_PAGE_TYPE, header.m_original_type);
8721 
8722 	ut_ad(dblwr_recover
8723 	      || memcmp(src + FIL_PAGE_LSN + 4,
8724 			src + (header.m_original_size + FIL_PAGE_DATA)
8725 			- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);
8726 
8727 	if (block != NULL) {
8728 		os_free_block(block);
8729 	}
8730 
8731 	return(DB_SUCCESS);
8732 }
8733 
8734 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
8735 not then the source contents are left unchanged and DB_SUCCESS is returned.
8736 @param[in]	dblwr_recover	true of double write recovery in progress
8737 @param[in,out]	src		Data read from disk, decompressed data will be
8738 				copied to this page
8739 @param[in,out]	dst		Scratch area to use for decompression
8740 @param[in]	dst_len		Size of the scratch area in bytes
8741 @return DB_SUCCESS or error code */
8742 dberr_t
os_file_decompress_page(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)8743 os_file_decompress_page(
8744 	bool		dblwr_recover,
8745 	byte*		src,
8746 	byte*		dst,
8747 	ulint		dst_len)
8748 {
8749 	return(Compression::deserialize(dblwr_recover, src, dst, dst_len));
8750 }
8751 
8752 /**
8753 @param[in]      type            The encryption type
8754 @return the string representation */
8755 const char*
to_string(Type type)8756 Encryption::to_string(Type type)
8757 {
8758         switch(type) {
8759         case NONE:
8760                 return("N");
8761         case AES:
8762                 return("Y");
8763         }
8764 
8765         ut_ad(0);
8766 
8767         return("<UNKNOWN>");
8768 }
8769 
8770 /** Generate random encryption value for key and iv.
8771 @param[in,out]	value	Encryption value */
random_value(byte * value)8772 void Encryption::random_value(byte* value)
8773 {
8774 	ut_ad(value != NULL);
8775 
8776 	my_rand_buffer(value, ENCRYPTION_KEY_LEN);
8777 }
8778 
8779 /** Create new master key for key rotation.
8780 @param[in,out]	master_key	master key */
8781 void
create_master_key(byte ** master_key)8782 Encryption::create_master_key(byte** master_key)
8783 {
8784 #ifndef UNIV_INNOCHECKSUM
8785 	char*	key_type = NULL;
8786 	size_t	key_len;
8787 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8788 	int	ret;
8789 
8790 	/* If uuid does not match with current server uuid,
8791 	set uuid as current server uuid. */
8792 	if (strcmp(uuid, server_uuid) != 0) {
8793 		memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8794 	}
8795 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8796 
8797 	/* Generate new master key */
8798 	ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8799 		    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8800 		    uuid, master_key_id + 1);
8801 
8802 	/* We call key ring API to generate master key here. */
8803 	ret = my_key_generate(key_name, "AES",
8804 			      NULL, ENCRYPTION_KEY_LEN);
8805 
8806 	/* We call key ring API to get master key here. */
8807 	ret = my_key_fetch(key_name, &key_type, NULL,
8808 			   reinterpret_cast<void**>(master_key),
8809 			   &key_len);
8810 
8811 	if (ret || *master_key == NULL) {
8812 		ib::error() << "Encryption can't find master key, please check"
8813 				" the keyring plugin is loaded.";
8814 		*master_key = NULL;
8815 	} else {
8816 		master_key_id++;
8817 	}
8818 
8819 	if (key_type) {
8820 		my_free(key_type);
8821 	}
8822 #endif
8823 }
8824 
8825 /** Get master key by key id.
8826 @param[in]	master_key_id	master key id
8827 @param[in]	srv_uuid	uuid of server instance
8828 @param[in,out]	master_key	master key */
8829 void
get_master_key(ulint master_key_id,char * srv_uuid,byte ** master_key)8830 Encryption::get_master_key(ulint master_key_id,
8831 			   char* srv_uuid,
8832 			   byte** master_key)
8833 {
8834 #ifndef UNIV_INNOCHECKSUM
8835 	char*	key_type = NULL;
8836 	size_t	key_len;
8837 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8838 	int	ret;
8839 
8840 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8841 
8842 	if (srv_uuid != NULL) {
8843 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8844 			    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8845 			    srv_uuid, master_key_id);
8846 	} else {
8847 		/* For compitable with 5.7.11, we need to get master key with
8848 		server id. */
8849 		memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8850 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8851 			    "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8852 			    server_id, master_key_id);
8853 	}
8854 
8855 	/* We call key ring API to get master key here. */
8856 	ret = my_key_fetch(key_name, &key_type, NULL,
8857 			   reinterpret_cast<void**>(master_key), &key_len);
8858 
8859 	if (key_type) {
8860 		my_free(key_type);
8861 	}
8862 
8863 	if (ret) {
8864 		*master_key = NULL;
8865 		ib::error() << "Encryption can't find master key, please check"
8866 				" the keyring plugin is loaded.";
8867 	}
8868 
8869 #ifdef UNIV_ENCRYPT_DEBUG
8870 	if (!ret && *master_key) {
8871 		fprintf(stderr, "Fetched master key:%lu ", master_key_id);
8872 		ut_print_buf(stderr, *master_key, key_len);
8873 		fprintf(stderr, "\n");
8874 	}
8875 #endif /* DEBUG_TDE */
8876 
8877 #endif
8878 }
8879 
8880 /** Current master key id */
8881 ulint	Encryption::master_key_id = 0;
8882 
8883 /** Current uuid of server instance */
8884 char	Encryption::uuid[ENCRYPTION_SERVER_UUID_LEN + 1] = {0};
8885 
8886 /** Get current master key and master key id
8887 @param[in,out]	master_key_id	master key id
8888 @param[in,out]	master_key	master key
8889 @param[in,out]	version		encryption information version */
8890 void
get_master_key(ulint * master_key_id,byte ** master_key,Encryption::Version * version)8891 Encryption::get_master_key(ulint* master_key_id,
8892 			   byte** master_key,
8893 			   Encryption::Version*  version)
8894 {
8895 #ifndef UNIV_INNOCHECKSUM
8896 	char*	key_type = NULL;
8897 	size_t	key_len;
8898 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8899 	int	ret;
8900 
8901 	memset(key_name, 0, ENCRYPTION_KEY_LEN);
8902 	*version = Encryption::ENCRYPTION_VERSION_2;
8903 
8904 	if (Encryption::master_key_id == 0) {
8905 		/* If m_master_key is 0, means there's no encrypted
8906 		tablespace, we need to generate the first master key,
8907 		and store it to key ring. */
8908 		memset(uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
8909 		memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8910 
8911 		/* Prepare the server uuid. */
8912 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8913 			    "%s-%s-1", ENCRYPTION_MASTER_KEY_PRIFIX,
8914 			    uuid);
8915 
8916 		/* We call key ring API to generate master key here. */
8917 		ret = my_key_generate(key_name, "AES",
8918 				      NULL, ENCRYPTION_KEY_LEN);
8919 
8920 		/* We call key ring API to get master key here. */
8921 		ret = my_key_fetch(key_name, &key_type, NULL,
8922 				   reinterpret_cast<void**>(master_key),
8923 				   &key_len);
8924 
8925 		if (!ret && *master_key != NULL) {
8926 			Encryption::master_key_id++;
8927 			*master_key_id = Encryption::master_key_id;
8928 		}
8929 #ifdef UNIV_ENCRYPT_DEBUG
8930 		if (!ret && *master_key) {
8931 			fprintf(stderr, "Generated new master key:");
8932 			ut_print_buf(stderr, *master_key, key_len);
8933 			fprintf(stderr, "\n");
8934 		}
8935 #endif
8936 	} else {
8937 		*master_key_id = Encryption::master_key_id;
8938 
8939 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8940 			    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8941 			    uuid, *master_key_id);
8942 
8943 		/* We call key ring API to get master key here. */
8944 		ret = my_key_fetch(key_name, &key_type, NULL,
8945 				   reinterpret_cast<void**>(master_key),
8946 				   &key_len);
8947 
8948 		/* For compitable with 5.7.11, we need to try to get master key with
8949 		server id when get master key with server uuid failure. */
8950 		if (ret || *master_key == NULL) {
8951 			if (key_type) {
8952 				my_free(key_type);
8953 			}
8954 
8955 			memset(key_name, 0,
8956 			       ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8957 			ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8958 				    "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8959 				    server_id, *master_key_id);
8960 
8961 			ret = my_key_fetch(key_name, &key_type, NULL,
8962 					   reinterpret_cast<void**>(master_key),
8963 					   &key_len);
8964 			*version = Encryption::ENCRYPTION_VERSION_1;
8965 		}
8966 #ifdef UNIV_ENCRYPT_DEBUG
8967 		if (!ret && *master_key) {
8968 			fprintf(stderr, "Fetched master key:%lu ",
8969 				*master_key_id);
8970 			ut_print_buf(stderr, *master_key, key_len);
8971 			fprintf(stderr, "\n");
8972 		}
8973 #endif
8974 	}
8975 
8976 	if (ret) {
8977 		*master_key = NULL;
8978 		ib::error() << "Encryption can't find master key, please check"
8979 				" the keyring plugin is loaded.";
8980 	}
8981 
8982 	if (key_type) {
8983 		my_free(key_type);
8984 	}
8985 #endif
8986 }
8987 
8988 /** Check if page is encrypted page or not
8989 @param[in]	page	page which need to check
8990 @return true if it is a encrypted page */
8991 bool
is_encrypted_page(const byte * page)8992 Encryption::is_encrypted_page(const byte* page)
8993 {
8994 	ulint	page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
8995 
8996 	return(page_type == FIL_PAGE_ENCRYPTED
8997 	       || page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED
8998 	       || page_type == FIL_PAGE_ENCRYPTED_RTREE);
8999 }
9000 
9001 /** Encrypt the page data contents. Page type can't be
9002 FIL_PAGE_ENCRYPTED, FIL_PAGE_COMPRESSED_AND_ENCRYPTED,
9003 FIL_PAGE_ENCRYPTED_RTREE.
9004 @param[in]	type		IORequest
9005 @param[in,out]	src		page data which need to encrypt
9006 @param[in]	src_len		Size of the source in bytes
9007 @param[in,out]	dst		destination area
9008 @param[in,out]	dst_len		Size of the destination in bytes
9009 @return buffer data, dst_len will have the length of the data */
9010 byte*
encrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint * dst_len)9011 Encryption::encrypt(
9012 	const IORequest&	type,
9013 	byte*			src,
9014 	ulint			src_len,
9015 	byte*			dst,
9016 	ulint*			dst_len)
9017 {
9018 	ut_ad(m_type != NONE);
9019 	ut_ad(!type.is_log());
9020 #ifdef UNIV_ENCRYPT_DEBUG
9021 	ulint space_id =
9022 		mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9023 	ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9024 
9025 	fprintf(stderr, "Encrypting page:%lu.%lu len:%lu\n",
9026 		space_id, page_no, src_len);
9027 #endif
9028 
9029 	/* Shouldn't encrypte an already encrypted page. */
9030 	ut_ad(!is_encrypted_page(src));
9031 
9032 	const uint16_t page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9033 
9034 	/* This is data size which need to encrypt. */
9035 	ulint src_enc_len = src_len;
9036 
9037 	/* In FIL_PAGE_VERSION_2, we encrypt the actual compressed data length. */
9038 	if (page_type == FIL_PAGE_COMPRESSED) {
9039 		src_enc_len = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1) +
9040 					       FIL_PAGE_DATA;
9041 		/* Extend src_enc_len if needed */
9042 		if (src_enc_len < MIN_ENCRYPTION_LEN) {
9043 			src_enc_len = MIN_ENCRYPTION_LEN;
9044 		}
9045 		ut_a(src_enc_len <= src_len);
9046 	}
9047 
9048 	/* Only encrypt the data + trailer, leave the header alone */
9049 
9050 	switch (m_type) {
9051 	case Encryption::NONE:
9052 		ut_error;
9053 
9054 	case Encryption::AES: {
9055 		ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9056 
9057 		/* Total length of the data to encrypt. */
9058 		const ulint data_len = src_enc_len - FIL_PAGE_DATA;
9059 
9060 		/* Server encryption functions expect input data to be in
9061 		multiples of MY_AES_BLOCK SIZE. Therefore we encrypt the
9062 		overlapping data of the chunk_len and trailer_len twice.
9063 		First we encrypt the bigger chunk of data then we do the
9064 		trailer. The trailer encryption block starts at
9065 		2 * MY_AES_BLOCK_SIZE bytes offset from the end of the enc_len.
9066 		During decryption we do the reverse of the above process. */
9067 		ut_ad(data_len >= 2 * MY_AES_BLOCK_SIZE);
9068 
9069 		const ulint chunk_len =
9070 			 (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9071 		const ulint remain_len = data_len - chunk_len;
9072 
9073 		lint elen = my_aes_encrypt(
9074 			src + FIL_PAGE_DATA, static_cast<uint32>(chunk_len),
9075 			dst + FIL_PAGE_DATA, reinterpret_cast<byte *>(m_key),
9076 			static_cast<uint32>(m_klen), my_aes_256_cbc,
9077 			reinterpret_cast<byte *>(m_iv), false);
9078 
9079 		if (elen == MY_AES_BAD_DATA) {
9080 			ulint	page_no =mach_read_from_4(
9081 				src + FIL_PAGE_OFFSET);
9082 			ulint	space_id = mach_read_from_4(
9083 				src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9084 			*dst_len = src_len;
9085 #ifndef UNIV_INNOCHECKSUM
9086 				ib::warn()
9087 					<< " Can't encrypt data of page,"
9088 					<< " page no:" << page_no
9089 					<< " space id:" << space_id;
9090 #else
9091 				fprintf(stderr, " Can't encrypt data of page,"
9092 					" page no:" ULINTPF
9093 					" space id:" ULINTPF,
9094 					page_no, space_id);
9095 #endif /* !UNIV_INNOCHECKSUM */
9096 			return(src);
9097 		}
9098 
9099 		const ulint len = static_cast<ulint>(elen);
9100 		ut_ad(len == chunk_len);
9101 
9102 		/* Encrypt the trailing bytes. */
9103 		if (remain_len != 0) {
9104 			/* Copy remaining bytes and page tailer. */
9105 			memcpy(dst + FIL_PAGE_DATA + len,
9106 			       src + FIL_PAGE_DATA + len,
9107 			       remain_len);
9108 
9109 			const ulint trailer_len = MY_AES_BLOCK_SIZE * 2;
9110 			byte buf[trailer_len];
9111 
9112 			elen = my_aes_encrypt(
9113 				dst + FIL_PAGE_DATA + data_len - trailer_len,
9114 				static_cast<uint32>(trailer_len), buf,
9115 				reinterpret_cast<unsigned char*>(m_key),
9116 				static_cast<uint32>(m_klen), my_aes_256_cbc,
9117 				reinterpret_cast<byte *>(m_iv), false);
9118 
9119 			if (elen == MY_AES_BAD_DATA) {
9120 				ulint	page_no =mach_read_from_4(
9121 					src + FIL_PAGE_OFFSET);
9122 				ulint	space_id = mach_read_from_4(
9123 					src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9124 #ifndef UNIV_INNOCHECKSUM
9125 				ib::warn()
9126 					<< " Can't encrypt data of page,"
9127 					<< " page no:" << page_no
9128 					<< " space id:" << space_id;
9129 #else
9130 				fprintf(stderr, " Can't encrypt data of page,"
9131 					" page no:" ULINTPF
9132 					" space id:" ULINTPF,
9133 					page_no, space_id);
9134 #endif /* !UNIV_INNOCHECKSUM */
9135 				*dst_len = src_len;
9136 				return(src);
9137 			}
9138 
9139 			ut_a(static_cast<ulint>(elen) == trailer_len);
9140 
9141 			memcpy(dst + FIL_PAGE_DATA + data_len - trailer_len,
9142 			       buf, trailer_len);
9143 		}
9144 
9145 
9146 		break;
9147 	}
9148 
9149 	default:
9150 		ut_error;
9151 	}
9152 
9153 	/* Copy the header as is. */
9154 	memmove(dst, src, FIL_PAGE_DATA);
9155 	ut_ad(memcmp(src, dst, FIL_PAGE_DATA) == 0);
9156 
9157 	/* Add encryption control information. Required for decrypting. */
9158 	if (page_type == FIL_PAGE_COMPRESSED) {
9159 		/* If the page is compressed, we don't need to save the
9160 		original type, since it is done in compression already. */
9161 		mach_write_to_2(dst + FIL_PAGE_TYPE,
9162 				FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9163 		ut_ad(memcmp(src+FIL_PAGE_TYPE+2,
9164 			     dst+FIL_PAGE_TYPE+2,
9165 			     FIL_PAGE_DATA-FIL_PAGE_TYPE-2) == 0);
9166 	} else if (page_type == FIL_PAGE_RTREE) {
9167 		/* If the page is R-tree page, we need to save original type. */
9168 		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED_RTREE);
9169 	} else{
9170 		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED);
9171 		mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
9172 	}
9173 
9174 #ifdef UNIV_ENCRYPT_DEBUG
9175 #ifndef UNIV_INNOCHECKSUM
9176 #if 0
9177 	byte*	check_buf = static_cast<byte*>(ut_malloc_nokey(src_len));
9178 	byte*	buf2 = static_cast<byte*>(ut_malloc_nokey(src_len));
9179 
9180 	memcpy(check_buf, dst, src_len);
9181 
9182 	dberr_t err = decrypt(type, check_buf, src_len, buf2, src_len);
9183 	if (err != DB_SUCCESS || memcmp(src + FIL_PAGE_DATA,
9184 					check_buf + FIL_PAGE_DATA,
9185 					src_len - FIL_PAGE_DATA) != 0) {
9186 		ut_print_buf(stderr, src, src_len);
9187 		ut_print_buf(stderr, check_buf, src_len);
9188 		ut_ad(0);
9189 	}
9190 	ut_free(buf2);
9191 	ut_free(check_buf);
9192 #endif
9193 	fprintf(stderr, "Encrypted page:%lu.%lu\n", space_id, page_no);
9194 #endif
9195 #endif
9196 
9197 	/* Add padding 0 for unused portion */
9198 	if (src_len > src_enc_len) {
9199 		memset(dst + src_enc_len, 0, src_len - src_enc_len);
9200 	}
9201 
9202 	*dst_len = src_len;
9203 
9204 	return(dst);
9205 }
9206 
9207 /** Decrypt the page data contents. Page type must be FIL_PAGE_ENCRYPTED,
9208 if not then the source contents are left unchanged and DB_SUCCESS is returned.
9209 @param[in]	type		IORequest
9210 @param[in,out]	src		Data read from disk, decrypted data will be
9211 				copied to this page
9212 @param[in]	src_len		source data length
9213 @param[in,out]	dst		Scratch area to use for decryption
9214 @param[in]	dst_len		Size of the scratch area in bytes
9215 @return DB_SUCCESS or error code */
9216 dberr_t
decrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)9217 Encryption::decrypt(
9218 	const IORequest&	type,
9219 	byte*			src,
9220 	ulint			src_len,
9221 	byte*			dst,
9222 	ulint			dst_len)
9223 {
9224 	ulint		data_len;
9225 	ulint		main_len;
9226 	ulint		remain_len;
9227 	ulint		original_type;
9228 	ulint		page_type;
9229 	byte		remain_buf[MY_AES_BLOCK_SIZE * 2];
9230 	Block*		block;
9231 
9232 	/* Do nothing if it's not an encrypted table. */
9233 	if (!is_encrypted_page(src)) {
9234 		return(DB_SUCCESS);
9235 	}
9236 
9237 	/* For compressed page, we need to get the compressed size
9238 	for decryption */
9239 	page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9240 	if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
9241 		src_len = static_cast<uint16_t>(
9242 			mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1))
9243 			+ FIL_PAGE_DATA;
9244 #ifndef UNIV_INNOCHECKSUM
9245 		Compression::meta_t header;
9246 		Compression::deserialize_header(src, &header);
9247 		if (header.m_version == Compression::FIL_PAGE_VERSION_1) {
9248 			src_len = ut_calc_align(src_len, type.block_size());
9249 		} else {
9250 			/* Extend src_len if needed */
9251 			if (src_len < MIN_ENCRYPTION_LEN) {
9252 				src_len = MIN_ENCRYPTION_LEN;
9253 			}
9254 		}
9255 #endif
9256 	}
9257 #ifdef UNIV_ENCRYPT_DEBUG
9258 	ulint space_id =
9259 		mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9260 	ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9261 
9262 	fprintf(stderr, "Decrypting page:%lu.%lu len:%lu\n",
9263 		space_id, page_no, src_len);
9264 #endif
9265 
9266 	original_type = static_cast<uint16_t>(
9267 		mach_read_from_2(src + FIL_PAGE_ORIGINAL_TYPE_V1));
9268 
9269 	byte*	ptr = src + FIL_PAGE_DATA;
9270 
9271 	/* The caller doesn't know what to expect */
9272 	if (dst == NULL) {
9273 
9274 		block = os_alloc_block();
9275 #ifdef UNIV_INNOCHECKSUM
9276 		dst = block;
9277 #else
9278 		dst = block->m_ptr;
9279 #endif /* UNIV_INNOCHECKSUM */
9280 
9281 	} else {
9282 		block = NULL;
9283 	}
9284 
9285 	data_len = src_len - FIL_PAGE_DATA;
9286 	main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9287 	remain_len = data_len - main_len;
9288 
9289 	switch(m_type) {
9290 	case Encryption::AES: {
9291 		lint			elen;
9292 
9293 		/* First decrypt the last 2 blocks data of data, since
9294 		data is no block aligned. */
9295 		if (remain_len != 0) {
9296 			ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9297 
9298 			remain_len = MY_AES_BLOCK_SIZE * 2;
9299 
9300 			/* Copy the last 2 blocks. */
9301 			memcpy(remain_buf,
9302 			       ptr + data_len - remain_len,
9303 			       remain_len);
9304 
9305 			elen = my_aes_decrypt(
9306 				remain_buf,
9307 				static_cast<uint32>(remain_len),
9308 				dst + data_len - remain_len,
9309 				reinterpret_cast<unsigned char*>(m_key),
9310 				static_cast<uint32>(m_klen),
9311 				my_aes_256_cbc,
9312 				reinterpret_cast<unsigned char*>(m_iv),
9313 				false);
9314 			if (elen == MY_AES_BAD_DATA) {
9315 				if (block != NULL) {
9316 					os_free_block(block);
9317 				}
9318 
9319 				return(DB_IO_DECRYPT_FAIL);
9320 			}
9321 
9322 			/* Copy the other data bytes to temp area. */
9323 			memcpy(dst, ptr, data_len - remain_len);
9324 		} else {
9325 			ut_ad(data_len == main_len);
9326 
9327 			/* Copy the data bytes to temp area. */
9328 			memcpy(dst, ptr, data_len);
9329 		}
9330 
9331 		/* Then decrypt the main data */
9332 		elen = my_aes_decrypt(
9333 				dst,
9334 				static_cast<uint32>(main_len),
9335 				ptr,
9336 				reinterpret_cast<unsigned char*>(m_key),
9337 				static_cast<uint32>(m_klen),
9338 				my_aes_256_cbc,
9339 				reinterpret_cast<unsigned char*>(m_iv),
9340 				false);
9341 		if (elen == MY_AES_BAD_DATA) {
9342 
9343 			if (block != NULL) {
9344 				os_free_block(block);
9345 			}
9346 
9347 			return(DB_IO_DECRYPT_FAIL);
9348 		}
9349 
9350 		ut_ad(static_cast<ulint>(elen) == main_len);
9351 
9352 		/* Copy the remain bytes. */
9353 		memcpy(ptr + main_len, dst + main_len, data_len - main_len);
9354 
9355 		break;
9356 	}
9357 
9358 	default:
9359 		if (!type.is_dblwr_recover()) {
9360 #if !defined(UNIV_INNOCHECKSUM)
9361 			ib::error()
9362 				<< "Encryption algorithm support missing: "
9363 				<< Encryption::to_string(m_type);
9364 #else
9365 			fprintf(stderr, "Encryption algorithm support missing: %s\n",
9366 				Encryption::to_string(m_type));
9367 #endif /* !UNIV_INNOCHECKSUM */
9368 		}
9369 
9370 		if (block != NULL) {
9371 			os_free_block(block);
9372 		}
9373 
9374 		return(DB_UNSUPPORTED);
9375 	}
9376 
9377 	/* Restore the original page type. If it's a compressed and
9378 	encrypted page, just reset it as compressed page type, since
9379 	we will do uncompress later. */
9380 
9381 	if (page_type == FIL_PAGE_ENCRYPTED) {
9382 		mach_write_to_2(src + FIL_PAGE_TYPE, original_type);
9383 		mach_write_to_2(src + FIL_PAGE_ORIGINAL_TYPE_V1, 0);
9384 	} else if (page_type == FIL_PAGE_ENCRYPTED_RTREE) {
9385 		mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_RTREE);
9386 	} else {
9387 		ut_ad(page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9388 		mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
9389 	}
9390 
9391 	if (block != NULL) {
9392 		os_free_block(block);
9393 	}
9394 
9395 #ifdef UNIV_ENCRYPT_DEBUG
9396 	fprintf(stderr, "Decrypted page:%lu.%lu\n", space_id, page_no);
9397 #endif
9398 
9399 	DBUG_EXECUTE_IF("ib_crash_during_decrypt_page", DBUG_SUICIDE(););
9400 
9401 	return(DB_SUCCESS);
9402 }
9403 
9404 /** Normalizes a directory path for the current OS:
9405 On Windows, we convert '/' to '\', else we convert '\' to '/'.
9406 @param[in,out] str A null-terminated directory and file path */
9407 void
os_normalize_path(char * str)9408 os_normalize_path(
9409 	char*	str)
9410 {
9411 	if (str != NULL) {
9412 		for (; *str; str++) {
9413 			if (*str == OS_PATH_SEPARATOR_ALT) {
9414 				*str = OS_PATH_SEPARATOR;
9415 			}
9416 		}
9417 	}
9418 }
9419