1 /***********************************************************************
2 
3 Copyright (c) 1995, 2020, Oracle and/or its affiliates.
4 Copyright (c) 2009, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 ***********************************************************************/
34 
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38 
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41 
42 #ifndef UNIV_INNOCHECKSUM
43 
44 #include "ha_prototypes.h"
45 #include "sql_const.h"
46 
47 #include "os0file.h"
48 
49 #ifdef UNIV_NONINL
50 #include "os0file.ic"
51 #endif
52 
53 #include "page0page.h"
54 #include "srv0srv.h"
55 #include "srv0start.h"
56 #include "fil0fil.h"
57 #ifndef UNIV_HOTBACKUP
58 # include "os0event.h"
59 # include "os0thread.h"
60 #else /* !UNIV_HOTBACKUP */
61 # ifdef _WIN32
62 /* Add includes for the _stat() call to compile on Windows */
63 #  include <sys/types.h>
64 #  include <sys/stat.h>
65 #  include <errno.h>
66 # endif /* _WIN32 */
67 #endif /* !UNIV_HOTBACKUP */
68 
69 #include <vector>
70 #include <functional>
71 
72 #ifdef LINUX_NATIVE_AIO
73 #include <libaio.h>
74 #endif /* LINUX_NATIVE_AIO */
75 
76 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
77 # include <fcntl.h>
78 # include <linux/falloc.h>
79 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
80 
81 #include <lz4.h>
82 #include <zlib.h>
83 
84 #ifdef UNIV_DEBUG
85 /** Set when InnoDB has invoked exit(). */
86 bool	innodb_calling_exit;
87 #endif /* UNIV_DEBUG */
88 
89 #include <my_aes.h>
90 #include <my_rnd.h>
91 #include <mysqld.h>
92 #include <mysql/service_mysql_keyring.h>
93 
94 /** Insert buffer segment id */
95 static const ulint IO_IBUF_SEGMENT = 0;
96 
97 /** Log segment id */
98 static const ulint IO_LOG_SEGMENT = 1;
99 
100 /** Number of retries for partial I/O's */
101 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
102 
103 /** Blocks for doing IO, used in the transparent compression
104 and encryption code. */
105 struct Block {
106 	/** Default constructor */
BlockBlock107 	Block() : m_ptr(), m_in_use() { }
108 
109 	byte*		m_ptr;
110 
111 	byte		pad[CACHE_LINE_SIZE - sizeof(ulint)];
112 	lock_word_t	m_in_use;
113 };
114 
115 /** For storing the allocated blocks */
116 typedef std::vector<Block> Blocks;
117 
118 /** Block collection */
119 static Blocks*	block_cache;
120 
121 /** Number of blocks to allocate for sync read/writes */
122 static const size_t	MAX_BLOCKS = 128;
123 
124 /** Block buffer size */
125 #define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))
126 
127 /** Disk sector size of aligning write buffer for DIRECT_IO */
128 static ulint	os_io_ptr_align = UNIV_SECTOR_SIZE;
129 
130 /* This specifies the file permissions InnoDB uses when it creates files in
131 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
132 my_umask */
133 
134 #ifndef _WIN32
135 /** Umask for creating files */
136 ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
137 #else
138 /** Umask for creating files */
139 ulint	os_innodb_umask	= 0;
140 
141 /* On Windows when using native AIO the number of AIO requests
142 that a thread can handle at a given time is limited to 32
143 i.e.: SRV_N_PENDING_IOS_PER_THREAD */
144 #define SRV_N_PENDING_IOS_PER_THREAD    OS_AIO_N_PENDING_IOS_PER_THREAD
145 
146 #endif /* _WIN32 */
147 
148 #ifndef UNIV_HOTBACKUP
149 
150 /** In simulated aio, merge at most this many consecutive i/os */
151 static const ulint	OS_AIO_MERGE_N_CONSECUTIVE = 64;
152 
153 /** Flag indicating if the page_cleaner is in active state. */
154 extern bool buf_page_cleaner_is_active;
155 
156 /**********************************************************************
157 
158 InnoDB AIO Implementation:
159 =========================
160 
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO by special IO-threads servicing the IO-requests.
163 
164 Simulated AIO:
165 ==============
166 
167 On platforms where we 'simulate' AIO, the following is a rough explanation
168 of the high level design.
169 There are four io-threads (for ibuf, log, read, write).
170 All synchronous IO requests are serviced by the calling thread using
171 os_file_write/os_file_read. The Asynchronous requests are queued up
172 in an array (there are four such arrays) by the calling thread.
173 Later these requests are picked up by the IO-thread and are serviced
174 synchronously.
175 
176 Windows native AIO:
177 ==================
178 
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
184 There are innodb_file_io_threads helper threads. These threads work
185 on the four arrays mentioned above in Simulated AIO. No thread is
186 required for the sync array.
187 If a synchronous IO request is made, it is first queued in the sync
188 array. Then the calling thread itself waits on the request, thus
189 making the call synchronous.
190 If an AIO request is made the calling thread not only queues it in the
191 array but also submits the requests. The helper thread then collects
192 the completed IO request and calls completion routine on it.
193 
194 Linux native AIO:
195 =================
196 
197 If we have libaio installed on the system and innodb_use_native_aio
198 is set to true we follow the code path of native AIO, otherwise we
199 do simulated AIO.
200 There are innodb_file_io_threads helper threads. These threads work
201 on the four arrays mentioned above in Simulated AIO.
202 If a synchronous IO request is made, it is handled by calling
203 os_file_write/os_file_read.
204 If an AIO request is made the calling thread not only queues it in the
205 array but also submits the requests. The helper thread then collects
206 the completed IO request and calls completion routine on it.
207 
208 **********************************************************************/
209 
210 
211 #ifdef UNIV_PFS_IO
212 /* Keys to register InnoDB I/O with performance schema */
213 mysql_pfs_key_t  innodb_data_file_key;
214 mysql_pfs_key_t  innodb_log_file_key;
215 mysql_pfs_key_t  innodb_temp_file_key;
216 #endif /* UNIV_PFS_IO */
217 
218 /** The asynchronous I/O context */
219 struct Slot {
SlotSlot220 	Slot() { memset(this, 0, sizeof(*this)); }
221 
222 	/** index of the slot in the aio array */
223 	uint16_t		pos;
224 
225 	/** true if this slot is reserved */
226 	bool			is_reserved;
227 
228 	/** time when reserved */
229 	ib_time_monotonic_t	reservation_time;
230 
231 	/** buffer used in i/o */
232 	byte*			buf;
233 
234 	/** Buffer pointer used for actual IO. We advance this
235 	when partial IO is required and not buf */
236 	byte*			ptr;
237 
238 	/** OS_FILE_READ or OS_FILE_WRITE */
239 	IORequest		type;
240 
241 	/** file offset in bytes */
242 	os_offset_t		offset;
243 
244 	/** file where to read or write */
245 	pfs_os_file_t		file;
246 
247 	/** file name or path */
248 	const char*		name;
249 
250 	/** used only in simulated aio: true if the physical i/o
251 	already made and only the slot message needs to be passed
252 	to the caller of os_aio_simulated_handle */
253 	bool			io_already_done;
254 
255 	/** The file node for which the IO is requested. */
256 	fil_node_t*		m1;
257 
258 	/** the requester of an aio operation and which can be used
259 	to identify which pending aio operation was completed */
260 	void*			m2;
261 
262 	/** AIO completion status */
263 	dberr_t			err;
264 
265 #ifdef WIN_ASYNC_IO
266 	/** handle object we need in the OVERLAPPED struct */
267 	HANDLE			handle;
268 
269 	/** Windows control block for the aio request */
270 	OVERLAPPED		control;
271 
272 	/** bytes written/read */
273 	DWORD			n_bytes;
274 
275 	/** length of the block to read or write */
276 	DWORD			len;
277 
278 #elif defined(LINUX_NATIVE_AIO)
279 	/** Linux control block for aio */
280 	struct iocb		control;
281 
282 	/** AIO return code */
283 	int			ret;
284 
285 	/** bytes written/read. */
286 	ssize_t			n_bytes;
287 
288 	/** length of the block to read or write */
289 	ulint			len;
290 #else
291 	/** length of the block to read or write */
292 	ulint			len;
293 
294 	/** bytes written/read. */
295 	ulint			n_bytes;
296 #endif /* WIN_ASYNC_IO */
297 
298 	/** Length of the block before it was compressed */
299 	uint32			original_len;
300 
301 	/** Buffer block for compressed pages or encrypted pages */
302 	Block*			buf_block;
303 
304 	/** true, if we shouldn't punch a hole after writing the page */
305 	bool			skip_punch_hole;
306 };
307 
308 /** The asynchronous i/o array structure */
309 class AIO {
310 public:
311 	/** Constructor
312 	@param[in]	id		Latch ID
313 	@param[in]	n_slots		Number of slots to configure
314 	@param[in]	segments	Number of segments to configure */
315 	AIO(latch_id_t id, ulint n_slots, ulint segments);
316 
317 	/** Destructor */
318 	~AIO();
319 
320 	/** Initialize the instance
321 	@return DB_SUCCESS or error code */
322 	dberr_t init();
323 
324 	/** Requests for a slot in the aio array. If no slot is available, waits
325 	until not_full-event becomes signaled.
326 
327 	@param[in,out]	type	IO context
328 	@param[in,out]	m1	message to be passed along with the AIO
329 				operation
330 	@param[in,out]	m2	message to be passed along with the AIO
331 				operation
332 	@param[in]	file	file handle
333 	@param[in]	name	name of the file or path as a null-terminated
334 				string
335 	@param[in,out]	buf	buffer where to read or from which to write
336 	@param[in]	offset	file offset, where to read from or start writing
337 	@param[in]	len	length of the block to read or write
338 	@return pointer to slot */
339 	Slot* reserve_slot(
340 		IORequest&	type,
341 		fil_node_t*	m1,
342 		void*		m2,
343 		pfs_os_file_t	file,
344 		const char*	name,
345 		void*		buf,
346 		os_offset_t	offset,
347 		ulint		len)
348 		MY_ATTRIBUTE((warn_unused_result));
349 
350 	/** @return number of reserved slots */
351 	ulint pending_io_count() const;
352 
353 	/** Returns a pointer to the nth slot in the aio array.
354 	@param[in]	index	Index of the slot in the array
355 	@return pointer to slot */
at(ulint i) const356 	const Slot* at(ulint i) const
357 		MY_ATTRIBUTE((warn_unused_result))
358 	{
359 		ut_a(i < m_slots.size());
360 
361 		return(&m_slots[i]);
362 	}
363 
364 	/** Non const version */
at(ulint i)365 	Slot* at(ulint i)
366 		MY_ATTRIBUTE((warn_unused_result))
367 	{
368 		ut_a(i < m_slots.size());
369 
370 		return(&m_slots[i]);
371 	}
372 
373 	/** Frees a slot in the AIO array, assumes caller owns the mutex.
374 	@param[in,out]	slot	Slot to release */
375 	void release(Slot* slot);
376 
377 	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
378 	@param[in,out]	slot	Slot to release */
379 	void release_with_mutex(Slot* slot);
380 
381 	/** Prints info about the aio array.
382 	@param[in,out]	file	Where to print */
383 	void print(FILE* file);
384 
385 	/** @return the number of slots per segment */
slots_per_segment() const386 	ulint slots_per_segment() const
387 		MY_ATTRIBUTE((warn_unused_result))
388 	{
389 		return(m_slots.size() / m_n_segments);
390 	}
391 
392 	/** @return accessor for n_segments */
get_n_segments() const393 	ulint get_n_segments() const
394 		MY_ATTRIBUTE((warn_unused_result))
395 	{
396 		return(m_n_segments);
397 	}
398 
399 #ifdef UNIV_DEBUG
400 	/** @return true if the thread owns the mutex */
is_mutex_owned() const401 	bool is_mutex_owned() const
402 		MY_ATTRIBUTE((warn_unused_result))
403 	{
404 		return(mutex_own(&m_mutex));
405 	}
406 #endif /* UNIV_DEBUG */
407 
408 	/** Acquire the mutex */
acquire() const409 	void acquire() const
410 	{
411 		mutex_enter(&m_mutex);
412 	}
413 
414 	/** Release the mutex */
release() const415 	void release() const
416 	{
417 		mutex_exit(&m_mutex);
418 	}
419 
420 	/** Write out the state to the file/stream
421 	@param[in, out]	file	File to write to */
422 	void to_file(FILE* file) const;
423 
424 #ifdef LINUX_NATIVE_AIO
425 	/** Dispatch an AIO request to the kernel.
426 	@param[in,out]	slot	an already reserved slot
427 	@return true on success. */
428 	bool linux_dispatch(Slot* slot)
429 		MY_ATTRIBUTE((warn_unused_result));
430 
431 	/** Accessor for an AIO event
432 	@param[in]	index	Index into the array
433 	@return the event at the index */
io_events(ulint index)434 	io_event* io_events(ulint index)
435 		MY_ATTRIBUTE((warn_unused_result))
436 	{
437 		ut_a(index < m_events.size());
438 
439 		return(&m_events[index]);
440 	}
441 
442 	/** Accessor for the AIO context
443 	@param[in]	segment	Segment for which to get the context
444 	@return the AIO context for the segment */
io_ctx(ulint segment)445 	io_context* io_ctx(ulint segment)
446 		MY_ATTRIBUTE((warn_unused_result))
447 	{
448 		ut_ad(segment < get_n_segments());
449 
450 		return(m_aio_ctx[segment]);
451 	}
452 
453 	/** Creates an io_context for native linux AIO.
454 	@param[in]	max_events	number of events
455 	@param[out]	io_ctx		io_ctx to initialize.
456 	@return true on success. */
457 	static bool linux_create_io_ctx(ulint max_events, io_context_t* io_ctx)
458 		MY_ATTRIBUTE((warn_unused_result));
459 
460 	/** Checks if the system supports native linux aio. On some kernel
461 	versions where native aio is supported it won't work on tmpfs. In such
462 	cases we can't use native aio as it is not possible to mix simulated
463 	and native aio.
464 	@return true if supported, false otherwise. */
465 	static bool is_linux_native_aio_supported()
466 		MY_ATTRIBUTE((warn_unused_result));
467 #endif /* LINUX_NATIVE_AIO */
468 
469 #ifdef WIN_ASYNC_IO
470 	/** Wakes up all async i/o threads in the array in Windows async I/O at
471 	shutdown. */
signal()472 	void signal()
473 	{
474 		for (ulint i = 0; i < m_slots.size(); ++i) {
475 			SetEvent(m_slots[i].handle);
476 		}
477 	}
478 
479 	/** Wake up all AIO threads in Windows native aio */
wake_at_shutdown()480 	static void wake_at_shutdown()
481 	{
482 		s_reads->signal();
483 
484 		if (s_writes != NULL) {
485 			s_writes->signal();
486 		}
487 
488 		if (s_ibuf != NULL) {
489 			s_ibuf->signal();
490 		}
491 
492 		if (s_log != NULL) {
493 			s_log->signal();
494 		}
495 	}
496 #endif /* WIN_ASYNC_IO */
497 
498 #ifdef _WIN32
499 	/** This function can be called if one wants to post a batch of reads
500 	and prefers an I/O - handler thread to handle them all at once later.You
501 	must call os_aio_simulated_wake_handler_threads later to ensure the
502 	threads are not left sleeping! */
503 	static void simulated_put_read_threads_to_sleep();
504 
505 	/** The non asynchronous IO array.
506 	@return the synchronous AIO array instance. */
sync_array()507 	static AIO* sync_array()
508 		MY_ATTRIBUTE((warn_unused_result))
509 	{
510 		return(s_sync);
511 	}
512 
513 	/**
514 	Get the AIO handles for a segment.
515 	@param[in]	segment		The local segment.
516 	@return the handles for the segment. */
handles(ulint segment)517 	HANDLE* handles(ulint segment)
518 		MY_ATTRIBUTE((warn_unused_result))
519 	{
520 		ut_ad(segment < m_handles->size() / slots_per_segment());
521 
522 		return(&(*m_handles)[segment * slots_per_segment()]);
523 	}
524 
525 	/** @return true if no slots are reserved */
is_empty() const526 	bool is_empty() const
527 		MY_ATTRIBUTE((warn_unused_result))
528 	{
529 		ut_ad(is_mutex_owned());
530 		return(m_n_reserved == 0);
531 	}
532 #endif /* _WIN32 */
533 
534 	/** Create an instance using new(std::nothrow)
535 	@param[in]	id		Latch ID
536 	@param[in]	n_slots		The number of AIO request slots
537 	@param[in]	segments	The number of segments
538 	@return a new AIO instance */
539 	static AIO* create(
540 		latch_id_t	id,
541 		ulint		n_slots,
542 		ulint		segments)
543 		MY_ATTRIBUTE((warn_unused_result));
544 
545 	/** Initializes the asynchronous io system. Creates one array each
546 	for ibuf and log I/O. Also creates one array each for read and write
547 	where each array is divided logically into n_readers and n_writers
548 	respectively. The caller must create an i/o handler thread for each
549 	segment in these arrays. This function also creates the sync array.
550 	No I/O handler thread needs to be created for that
551 	@param[in]	n_per_seg	maximum number of pending aio
552 					operations allowed per segment
553 	@param[in]	n_readers	number of reader threads
554 	@param[in]	n_writers	number of writer threads
555 	@param[in]	n_slots_sync	number of slots in the sync aio array
556 	@return true if AIO sub-system was started successfully */
557 	static bool start(
558 		ulint		n_per_seg,
559 		ulint		n_readers,
560 		ulint		n_writers,
561 		ulint		n_slots_sync)
562 		MY_ATTRIBUTE((warn_unused_result));
563 
564 	/** Free the AIO arrays */
565 	static void shutdown();
566 
567 	/** Print all the AIO segments
568 	@param[in,out]	file		Where to print */
569 	static void print_all(FILE* file);
570 
571 	/** Calculates local segment number and aio array from global
572 	segment number.
573 	@param[out]	array		AIO wait array
574 	@param[in]	segment		global segment number
575 	@return local segment number within the aio array */
576 	static ulint get_array_and_local_segment(
577 		AIO**		array,
578 		ulint		segment)
579 		MY_ATTRIBUTE((warn_unused_result));
580 
581 	/** Select the IO slot array
582 	@param[in]	type		Type of IO, READ or WRITE
583 	@param[in]	read_only	true if running in read-only mode
584 	@param[in]	mode		IO mode
585 	@return slot array or NULL if invalid mode specified */
586 	static AIO* select_slot_array(
587 		IORequest&	type,
588 		bool		read_only,
589 		ulint		mode)
590 		MY_ATTRIBUTE((warn_unused_result));
591 
592 	/** Calculates segment number for a slot.
593 	@param[in]	array		AIO wait array
594 	@param[in]	slot		slot in this array
595 	@return segment number (which is the number used by, for example,
596 		I/O handler threads) */
597 	static ulint get_segment_no_from_slot(
598 		const AIO*	array,
599 		const Slot*	slot)
600 		MY_ATTRIBUTE((warn_unused_result));
601 
602 	/** Wakes up a simulated AIO I/O-handler thread if it has something
603 	to do.
604 	@param[in]	global_segment	the number of the segment in the
605 					AIO arrays */
606 	static void wake_simulated_handler_thread(ulint global_segment);
607 
608 	/** Check if it is a read request
609 	@param[in]	aio		The AIO instance to check
610 	@return true if the AIO instance is for reading. */
is_read(const AIO * aio)611 	static bool is_read(const AIO* aio)
612 		MY_ATTRIBUTE((warn_unused_result))
613 	{
614 		return(s_reads == aio);
615 	}
616 
617 	/** Wait on an event until no pending writes */
wait_until_no_pending_writes()618 	static void wait_until_no_pending_writes()
619 	{
620 		os_event_wait(AIO::s_writes->m_is_empty);
621 	}
622 
623 	/** Print to file
624 	@param[in]	file		File to write to */
625 	static void print_to_file(FILE* file);
626 
627 	/** Check for pending IO. Gets the count and also validates the
628 	data structures.
629 	@return count of pending IO requests */
630 	static ulint total_pending_io_count();
631 
632 private:
633 	/** Initialise the slots
634 	@return DB_SUCCESS or error code */
635 	dberr_t init_slots()
636 		MY_ATTRIBUTE((warn_unused_result));
637 
638 	/** Wakes up a simulated AIO I/O-handler thread if it has something
639 	to do for a local segment in the AIO array.
640 	@param[in]	global_segment	the number of the segment in the
641 					AIO arrays
642 	@param[in]	segment		the local segment in the AIO array */
643 	void wake_simulated_handler_thread(ulint global_segment, ulint segment);
644 
645 	/** Prints pending IO requests per segment of an aio array.
646 	We probably don't need per segment statistics but they can help us
647 	during development phase to see if the IO requests are being
648 	distributed as expected.
649 	@param[in,out]	file		file where to print
650 	@param[in]	segments	pending IO array */
651 	void print_segment_info(
652 		FILE*		file,
653 		const ulint*	segments);
654 
655 #ifdef LINUX_NATIVE_AIO
656 	/** Initialise the Linux native AIO data structures
657 	@return DB_SUCCESS or error code */
658 	dberr_t init_linux_native_aio()
659 		MY_ATTRIBUTE((warn_unused_result));
660 #endif /* LINUX_NATIVE_AIO */
661 
662 private:
663 	typedef std::vector<Slot> Slots;
664 
665 	/** the mutex protecting the aio array */
666 	mutable SysMutex	m_mutex;
667 
668 	/** Pointer to the slots in the array.
669 	Number of elements must be divisible by n_threads. */
670 	Slots			m_slots;
671 
672 	/** Number of segments in the aio array of pending aio requests.
673 	A thread can wait separately for any one of the segments. */
674 	ulint			m_n_segments;
675 
676 	/** The event which is set to the signaled state when
677 	there is space in the aio outside the ibuf segment */
678 	os_event_t		m_not_full;
679 
680 	/** The event which is set to the signaled state when
681 	there are no pending i/os in this array */
682 	os_event_t		m_is_empty;
683 
684 	/** Number of reserved slots in the AIO array outside
685 	the ibuf segment */
686 	ulint			m_n_reserved;
687 
688 #ifdef _WIN32
689 	typedef std::vector<HANDLE, ut_allocator<HANDLE> > Handles;
690 
691 	/** Pointer to an array of OS native event handles where
692 	we copied the handles from slots, in the same order. This
693 	can be used in WaitForMultipleObjects; used only in Windows */
694 	Handles*		m_handles;
695 #endif /* _WIN32 */
696 
697 #if defined(LINUX_NATIVE_AIO)
698 	typedef std::vector<io_event> IOEvents;
699 
700 	/** completion queue for IO. There is one such queue per
701 	segment. Each thread will work on one ctx exclusively. */
702 	io_context_t*		m_aio_ctx;
703 
704 	/** The array to collect completed IOs. There is one such
705 	event for each possible pending IO. The size of the array
706 	is equal to m_slots.size(). */
707 	IOEvents		m_events;
708 #endif /* LINUX_NATIV_AIO */
709 
710 	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
711 	sync AIO. These are NULL when the module has not yet been
712 	initialized. */
713 
714 	/** Insert buffer */
715 	static AIO*		s_ibuf;
716 
717 	/** Redo log */
718 	static AIO*		s_log;
719 
720 	/** Reads */
721 	static AIO*		s_reads;
722 
723 	/** Writes */
724 	static AIO*		s_writes;
725 
726 	/** Synchronous I/O */
727 	static AIO*		s_sync;
728 };
729 
730 /** Static declarations */
731 AIO*	AIO::s_reads;
732 AIO*	AIO::s_writes;
733 AIO*	AIO::s_ibuf;
734 AIO*	AIO::s_log;
735 AIO*	AIO::s_sync;
736 
737 #if defined(LINUX_NATIVE_AIO)
738 /** timeout for each io_getevents() call = 500ms. */
739 static const ulint	OS_AIO_REAP_TIMEOUT = 500000000UL;
740 
741 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
742 static const ulint	OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
743 
744 /** number of attempts before giving up on io_setup(). */
745 static const int	OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
746 #endif /* LINUX_NATIVE_AIO */
747 
748 /** Array of events used in simulated AIO */
749 static os_event_t*	os_aio_segment_wait_events = NULL;
750 
751 /** Number of asynchronous I/O segments.  Set by os_aio_init(). */
752 static ulint		os_aio_n_segments = ULINT_UNDEFINED;
753 
754 /** If the following is true, read i/o handler threads try to
755 wait until a batch of new read requests have been posted */
756 static bool		os_aio_recommend_sleep_for_read_threads = false;
757 #endif /* !UNIV_HOTBACKUP */
758 
/* Global I/O statistics. They are only defined here; the code that
updates and prints them is outside this part of the file.
NOTE(review): the per-variable descriptions below are inferred from the
names -- confirm against the code that maintains them. */

/** Presumably the number of file read operations */
ulint	os_n_file_reads		= 0;
/** Presumably the bytes read since the statistics were last printed */
ulint	os_bytes_read_since_printout = 0;
/** Presumably the number of file write operations */
ulint	os_n_file_writes	= 0;
/** Presumably the number of fsync operations */
ulint	os_n_fsyncs		= 0;
/** Presumably the value of os_n_file_reads at the last printout */
ulint	os_n_file_reads_old	= 0;
/** Presumably the value of os_n_file_writes at the last printout */
ulint	os_n_file_writes_old	= 0;
/** Presumably the value of os_n_fsyncs at the last printout */
ulint	os_n_fsyncs_old		= 0;
/** Number of pending write operations */
ulint	os_n_pending_writes = 0;
/** Number of pending read operations */
ulint	os_n_pending_reads = 0;

/** Presumably the time of the last statistics printout */
ib_time_monotonic_t	os_last_printout;
/** Presumably set once a disk-full condition has been reported */
bool	os_has_said_disk_full	= false;
773 
774 /** Default Zip compression level */
775 extern uint page_zip_level;
776 
777 #if DATA_TRX_ID_LEN > 6
778 #error "COMPRESSION_ALGORITHM will not fit"
779 #endif /* DATA_TRX_ID_LEN */
780 
781 /** Validates the consistency of the aio system.
782 @return true if ok */
783 static
784 bool
785 os_aio_validate();
786 
787 /** Does error handling when a file operation fails.
788 @param[in]	name		File name or NULL
789 @param[in]	operation	Name of operation e.g., "read", "write"
790 @return true if we should retry the operation */
791 static
792 bool
793 os_file_handle_error(
794 	const char*	name,
795 	const char*	operation);
796 
797 /** Free storage space associated with a section of the file.
798 @param[in]      fh              Open file handle
799 @param[in]      off             Starting offset (SEEK_SET)
800 @param[in]      len             Size of the hole
801 @return DB_SUCCESS or error code */
802 dberr_t
803 os_file_punch_hole(
804         os_file_t   fh,
805         os_offset_t     off,
806         os_offset_t     len);
807 
808 /**
809 Does error handling when a file operation fails.
810 @param[in]	name		File name or NULL
811 @param[in]	operation	Name of operation e.g., "read", "write"
812 @param[in]	silent	if true then don't print any message to the log.
813 @return true if we should retry the operation */
814 static
815 bool
816 os_file_handle_error_no_exit(
817 	const char*	name,
818 	const char*	operation,
819 	bool		silent);
820 
/** Decompress after a read and punch a hole in the file if it was a write
@param[in]	type		IO context
@param[in]	fh		Open file handle
@param[in,out]	buf		Buffer to transform
@param[in,out]	scratch		Scratch area for read decompression
@param[in]	src_len		Length of the buffer before compression
@param[in]	offset		File offset of the IO, in bytes
@param[in]	len		Compressed buffer length for write and size
				of buf len for read
@return DB_SUCCESS or error code */
830 static
831 dberr_t
832 os_file_io_complete(
833 	const IORequest&type,
834 	os_file_t	fh,
835 	byte*		buf,
836 	byte*		scratch,
837 	ulint		src_len,
838 	os_offset_t	offset,
839 	ulint		len);
840 
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[in]	type	IO context
@return DB_SUCCESS or error code */
855 static
856 dberr_t
857 os_aio_simulated_handler(
858 	ulint		global_segment,
859 	fil_node_t**	m1,
860 	void**		m2,
861 	IORequest*	type);
862 
863 #ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
867 into segments. The thread specifies which segment or slot it wants to wait
868 for. NOTE: this function will also take care of freeing the aio slot,
869 therefore no other thread is allowed to do the freeing!
870 @param[in]	segment		The number of the segment in the aio arrays to
871 wait for; segment 0 is the ibuf I/O thread,
872 segment 1 the log I/O thread, then follow the
873 non-ibuf read threads, and as the last are the
874 non-ibuf write threads; if this is
875 ULINT_UNDEFINED, then it means that sync AIO
876 is used, and this parameter is ignored
877 @param[in]	pos		this parameter is used only in sync AIO:
878 wait for the aio slot at this position
879 @param[out]	m1		the messages passed with the AIO request; note
880 that also in the case where the AIO operation
881 failed, these output parameters are valid and
882 can be used to restart the operation,
883 for example
884 @param[out]	m2		callback message
885 @param[out]	type		OS_FILE_WRITE or ..._READ
886 @return DB_SUCCESS or error code */
887 static
888 dberr_t
889 os_aio_windows_handler(
890 	ulint		segment,
891 	ulint		pos,
892 	fil_node_t**	m1,
893 	void**		m2,
894 	IORequest*	type);
895 #endif /* WIN_ASYNC_IO */
896 
897 /** Allocate a page for sync IO
898 @return pointer to page */
899 static
900 Block*
os_alloc_block()901 os_alloc_block()
902 {
903 	size_t		pos;
904 	Blocks&		blocks = *block_cache;
905 	size_t		i = static_cast<size_t>(my_timer_cycles());
906 	const size_t	size = blocks.size();
907 	ulint		retry = 0;
908 	Block*		block;
909 
910 	DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
911 
912 	for (;;) {
913 
914 		/* After go through the block cache for 3 times,
915 		allocate a new temporary block. */
916 		if (retry == MAX_BLOCKS * 3) {
917 			byte*	ptr;
918 
919 			ptr = static_cast<byte*>(
920 				ut_malloc_nokey(sizeof(*block)
921 						+ BUFFER_BLOCK_SIZE));
922 
923 			block = new (ptr) Block();
924 			block->m_ptr = static_cast<byte*>(
925 				ptr + sizeof(*block));
926 			block->m_in_use = 1;
927 
928 			break;
929 		}
930 
931 		pos = i++ % size;
932 
933 		if (TAS(&blocks[pos].m_in_use, 1) == 0) {
934 			block = &blocks[pos];
935 			break;
936 		}
937 
938 		os_thread_yield();
939 
940 		++retry;
941 	}
942 
943 	ut_a(block->m_in_use != 0);
944 
945 	return(block);
946 }
947 
948 /** Free a page after sync IO
949 @param[in,own]	block		The block to free/release */
950 static
951 void
os_free_block(Block * block)952 os_free_block(Block* block)
953 {
954 	ut_ad(block->m_in_use == 1);
955 
956 	TAS(&block->m_in_use, 0);
957 
958 	/* When this block is not in the block cache, and it's
959 	a temporary block, we need to free it directly. */
960 	if (std::less<Block*>()(block, &block_cache->front())
961 	    || std::greater<Block*>()(block, &block_cache->back())) {
962 		ut_free(block);
963 	}
964 }
965 
966 /** Generic AIO Handler methods. Currently handles IO post processing. */
967 class AIOHandler {
968 public:
969 	/** Do any post processing after a read/write
970 	@return DB_SUCCESS or error code. */
971 	static dberr_t post_io_processing(Slot* slot);
972 
973 	/** Decompress after a read and punch a hole in the file if
974 	it was a write */
io_complete(const Slot * slot)975 	static dberr_t io_complete(const Slot* slot)
976 	{
977 		ut_a(slot->offset > 0);
978 		ut_a(slot->type.is_read() || !slot->skip_punch_hole);
979 		return(os_file_io_complete(
980 				slot->type, slot->file.m_file, slot->buf,
981 				NULL, slot->original_len,
982 				slot->offset, slot->len));
983 	}
984 
985 private:
986 	/** Check whether the page was encrypted.
987 	@param[in]	slot		The slot that contains the IO request
988 	@return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)989 	static bool is_encrypted_page(const Slot* slot)
990 	{
991 		return(Encryption::is_encrypted_page(slot->buf));
992 	}
993 
994 	/** Check whether the page was compressed.
995 	@param[in]	slot		The slot that contains the IO request
996 	@return true if it was a compressed page */
is_compressed_page(const Slot * slot)997 	static bool is_compressed_page(const Slot* slot)
998 	{
999 		const byte*	src = slot->buf;
1000 
1001 		ulint	page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1002 
1003 		return(page_type == FIL_PAGE_COMPRESSED);
1004 	}
1005 
1006 	/** Get the compressed page size.
1007 	@param[in]	slot		The slot that contains the IO request
1008 	@return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1009 	static ulint compressed_page_size(const Slot* slot)
1010 	{
1011 		ut_ad(slot->type.is_read());
1012 		ut_ad(is_compressed_page(slot));
1013 
1014 		ulint		size;
1015 		const byte*	src = slot->buf;
1016 
1017 		size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1018 
1019 		return(size + FIL_PAGE_DATA);
1020 	}
1021 
1022 	/** Check if the page contents can be decompressed.
1023 	@param[in]	slot		The slot that contains the IO request
1024 	@return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1025 	static bool can_decompress(const Slot* slot)
1026 	{
1027 		ut_ad(slot->type.is_read());
1028 		ut_ad(is_compressed_page(slot));
1029 
1030 		ulint		version;
1031 		const byte*	src = slot->buf;
1032 
1033 		version = mach_read_from_1(src + FIL_PAGE_VERSION);
1034 
1035 		ut_a(Compression::is_valid_page_version(version));
1036 
1037 		/* Includes the page header size too */
1038 		ulint		size = compressed_page_size(slot);
1039 
1040 		return(size <= (slot->ptr - slot->buf) + (ulint) slot->n_bytes);
1041 	}
1042 
1043 	/** Check if we need to read some more data.
1044 	@param[in]	slot		The slot that contains the IO request
1045 	@param[in]	n_bytes		Total bytes read so far
1046 	@return DB_SUCCESS or error code */
1047 	static dberr_t check_read(Slot* slot, ulint n_bytes);
1048 };
1049 
1050 /** Helper class for doing synchronous file IO. Currently, the objective
1051 is to hide the OS specific code, so that the higher level functions aren't
1052 peppered with #ifdef. Makes the code flow difficult to follow.  */
1053 class SyncFileIO {
1054 public:
1055 	/** Constructor
1056 	@param[in]	fh	File handle
1057 	@param[in,out]	buf	Buffer to read/write
1058 	@param[in]	n	Number of bytes to read/write
1059 	@param[in]	offset	Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1060 	SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
1061 		:
1062 		m_fh(fh),
1063 		m_buf(buf),
1064 		m_n(static_cast<ssize_t>(n)),
1065 		m_offset(offset)
1066 	{
1067 		ut_ad(m_n > 0);
1068 	}
1069 
1070 	/** Destructor */
~SyncFileIO()1071 	~SyncFileIO()
1072 	{
1073 		/* No op */
1074 	}
1075 
1076 	/** Do the read/write
1077 	@param[in]	request	The IO context and type
1078 	@return the number of bytes read/written or negative value on error */
1079 	ssize_t execute(const IORequest& request);
1080 
1081 	/** Do the read/write
1082 	@param[in,out]	slot	The IO slot, it has the IO context
1083 	@return the number of bytes read/written or negative value on error */
1084 	static ssize_t execute(Slot* slot);
1085 
1086 	/** Move the read/write offset up to where the partial IO succeeded.
1087 	@param[in]	n_bytes	The number of bytes to advance */
advance(ssize_t n_bytes)1088 	void advance(ssize_t n_bytes)
1089 	{
1090 		m_offset += n_bytes;
1091 
1092 		ut_ad(m_n >= n_bytes);
1093 
1094 		m_n -=  n_bytes;
1095 
1096 		m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1097 	}
1098 
1099 private:
1100 	/** Open file handle */
1101 	os_file_t		m_fh;
1102 
1103 	/** Buffer to read/write */
1104 	void*			m_buf;
1105 
1106 	/** Number of bytes to read/write */
1107 	ssize_t			m_n;
1108 
1109 	/** Offset from where to read/write */
1110 	os_offset_t		m_offset;
1111 };
1112 
1113 /** If it is a compressed page return the compressed page data + footer size
1114 @param[in]	buf		Buffer to check, must include header + 10 bytes
1115 @return ULINT_UNDEFINED if the page is not a compressed page or length
1116 	of the compressed data (including footer) if it is a compressed page */
1117 ulint
os_file_compressed_page_size(const byte * buf)1118 os_file_compressed_page_size(const byte* buf)
1119 {
1120 	ulint	type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1121 
1122 	if (type == FIL_PAGE_COMPRESSED) {
1123 		ulint	version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1124 		ut_a(Compression::is_valid_page_version(version));
1125 		return(mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1126 	}
1127 
1128 	return(ULINT_UNDEFINED);
1129 }
1130 
1131 /** If it is a compressed page return the original page data + footer size
1132 @param[in] buf		Buffer to check, must include header + 10 bytes
1133 @return ULINT_UNDEFINED if the page is not a compressed page or length
1134 	of the original data + footer if it is a compressed page */
1135 ulint
os_file_original_page_size(const byte * buf)1136 os_file_original_page_size(const byte* buf)
1137 {
1138 	ulint	type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1139 
1140 	if (type == FIL_PAGE_COMPRESSED) {
1141 
1142 		ulint	version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1143 		ut_a(Compression::is_valid_page_version(version));
1144 
1145 		return(mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1146 	}
1147 
1148 	return(ULINT_UNDEFINED);
1149 }
1150 
1151 /** Check if we need to read some more data.
1152 @param[in]	slot		The slot that contains the IO request
1153 @param[in]	n_bytes		Total bytes read so far
1154 @return DB_SUCCESS or error code */
1155 dberr_t
check_read(Slot * slot,ulint n_bytes)1156 AIOHandler::check_read(Slot* slot, ulint n_bytes)
1157 {
1158 	dberr_t	err;
1159 
1160 	ut_ad(slot->type.is_read());
1161 	ut_ad(slot->original_len > slot->len);
1162 
1163 	if (is_compressed_page(slot)) {
1164 
1165 		if (can_decompress(slot)) {
1166 
1167 			ut_a(slot->offset > 0);
1168 
1169 			slot->len = slot->original_len;
1170 #ifdef _WIN32
1171 			slot->n_bytes = static_cast<DWORD>(n_bytes);
1172 #else
1173 			slot->n_bytes = static_cast<ulint>(n_bytes);
1174 #endif /* _WIN32 */
1175 
1176 			err = io_complete(slot);
1177 			ut_a(err == DB_SUCCESS);
1178 		} else {
1179 			/* Read the next block in */
1180 			ut_ad(compressed_page_size(slot) >= n_bytes);
1181 
1182 			err = DB_FAIL;
1183 		}
1184 	} else if (is_encrypted_page(slot)) {
1185 			ut_a(slot->offset > 0);
1186 
1187 			slot->len = slot->original_len;
1188 #ifdef _WIN32
1189 			slot->n_bytes = static_cast<DWORD>(n_bytes);
1190 #else
1191 			slot->n_bytes = static_cast<ulint>(n_bytes);
1192 #endif /* _WIN32 */
1193 
1194 			err = io_complete(slot);
1195 			ut_a(err == DB_SUCCESS);
1196 
1197 	} else {
1198 		err = DB_FAIL;
1199 	}
1200 
1201 	if (slot->buf_block != NULL) {
1202 		os_free_block(slot->buf_block);
1203 		slot->buf_block = NULL;
1204 	}
1205 
1206 	return(err);
1207 }
1208 
1209 /** Do any post processing after a read/write
1210 @return DB_SUCCESS or error code. */
1211 dberr_t
post_io_processing(Slot * slot)1212 AIOHandler::post_io_processing(Slot* slot)
1213 {
1214 	dberr_t	err;
1215 
1216 	ut_ad(slot->is_reserved);
1217 
1218 	/* Total bytes read so far */
1219 	ulint	n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1220 
1221 	/* Compressed writes can be smaller than the original length.
1222 	Therefore they can be processed without further IO. */
1223 	if (n_bytes == slot->original_len
1224 	    || (slot->type.is_write()
1225 		&& slot->type.is_compressed()
1226 		&& slot->len == static_cast<ulint>(slot->n_bytes))) {
1227 
1228 		if (!slot->type.is_log()
1229 		    && (is_compressed_page(slot)
1230 			|| is_encrypted_page(slot))) {
1231 
1232 			ut_a(slot->offset > 0);
1233 
1234 			if (slot->type.is_read()) {
1235 				slot->len = slot->original_len;
1236 			}
1237 
1238 			/* The punch hole has been done on collect() */
1239 
1240 			if (slot->type.is_read()) {
1241 				err = io_complete(slot);
1242 			} else {
1243 				err = DB_SUCCESS;
1244 			}
1245 
1246 			ut_ad(err == DB_SUCCESS
1247 			      || err == DB_UNSUPPORTED
1248 			      || err == DB_CORRUPTION
1249 			      || err == DB_IO_DECOMPRESS_FAIL);
1250 		} else {
1251 
1252 			err = DB_SUCCESS;
1253 		}
1254 
1255 		if (slot->buf_block != NULL) {
1256 			os_free_block(slot->buf_block);
1257 			slot->buf_block = NULL;
1258 		}
1259 
1260 	} else if ((ulint) slot->n_bytes == (ulint) slot->len) {
1261 
1262 		/* It *must* be a partial read. */
1263 		ut_ad(slot->len < slot->original_len);
1264 
1265 		/* Has to be a read request, if it is less than
1266 		the original length. */
1267 		ut_ad(slot->type.is_read());
1268 		err = check_read(slot, n_bytes);
1269 
1270 	} else {
1271 		err = DB_FAIL;
1272 	}
1273 
1274 	return(err);
1275 }
1276 
1277 /** Count the number of free slots
1278 @return number of reserved slots */
1279 ulint
pending_io_count() const1280 AIO::pending_io_count() const
1281 {
1282 	acquire();
1283 
1284 #ifdef UNIV_DEBUG
1285 	ut_a(m_n_segments > 0);
1286 	ut_a(!m_slots.empty());
1287 
1288 	ulint	count = 0;
1289 
1290 	for (ulint i = 0; i < m_slots.size(); ++i) {
1291 
1292 		const Slot&	slot = m_slots[i];
1293 
1294 		if (slot.is_reserved) {
1295 			++count;
1296 			ut_a(slot.len > 0);
1297 		}
1298 	}
1299 
1300 	ut_a(m_n_reserved == count);
1301 #endif /* UNIV_DEBUG */
1302 
1303 	ulint	reserved = m_n_reserved;
1304 
1305 	release();
1306 
1307 	return(reserved);
1308 }
1309 
1310 /** Compress a data page
1311 #param[in]	block_size	File system block size
1312 @param[in]	src		Source contents to compress
1313 @param[in]	src_len		Length in bytes of the source
1314 @param[out]	dst		Compressed page contents
1315 @param[out]	dst_len		Length in bytes of dst contents
1316 @return buffer data, dst_len will have the length of the data */
1317 static
1318 byte*
os_file_compress_page(Compression compression,ulint block_size,byte * src,ulint src_len,byte * dst,ulint * dst_len)1319 os_file_compress_page(
1320 	Compression	compression,
1321 	ulint		block_size,
1322 	byte*		src,
1323 	ulint		src_len,
1324 	byte*		dst,
1325 	ulint*		dst_len)
1326 {
1327 	ulint		len = 0;
1328 	ulint		compression_level = page_zip_level;
1329 	ulint		page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1330 
1331 	/* The page size must be a multiple of the OS punch hole size. */
1332 	ut_ad(!(src_len % block_size));
1333 
1334 	/* Shouldn't compress an already compressed page. */
1335 	ut_ad(page_type != FIL_PAGE_COMPRESSED);
1336 
1337 	/* The page must be at least twice as large as the file system
1338 	block size if we are to save any space. Ignore R-Tree pages for now,
1339 	they repurpose the same 8 bytes in the page header. No point in
1340 	compressing if the file system block size >= our page size. */
1341 
1342 	if (page_type == FIL_PAGE_RTREE
1343 	    || block_size == ULINT_UNDEFINED
1344 	    || compression.m_type == Compression::NONE
1345 	    || src_len < block_size * 2) {
1346 
1347 		*dst_len = src_len;
1348 
1349 		return(src);
1350 	}
1351 
1352 	/* Leave the header alone when compressing. */
1353 	ut_ad(block_size >= FIL_PAGE_DATA * 2);
1354 
1355 	ut_ad(src_len > FIL_PAGE_DATA + block_size);
1356 
1357 	/* Must compress to <= N-1 FS blocks. */
1358 	ulint		out_len = src_len - (FIL_PAGE_DATA + block_size);
1359 
1360 	/* This is the original data page size - the page header. */
1361 	ulint		content_len = src_len - FIL_PAGE_DATA;
1362 
1363 	ut_ad(out_len >= block_size - FIL_PAGE_DATA);
1364 	ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA));
1365 
1366 	/* Only compress the data + trailer, leave the header alone */
1367 
1368 	switch (compression.m_type) {
1369 	case Compression::NONE:
1370 		ut_error;
1371 
1372 	case Compression::ZLIB: {
1373 
1374 		uLongf	zlen = static_cast<uLongf>(out_len);
1375 
1376 		if (compress2(
1377 			dst + FIL_PAGE_DATA,
1378 			&zlen,
1379 			src + FIL_PAGE_DATA,
1380 			static_cast<uLong>(content_len),
1381 			static_cast<int>(compression_level)) != Z_OK) {
1382 
1383 			*dst_len = src_len;
1384 
1385 			return(src);
1386 		}
1387 
1388 		len = static_cast<ulint>(zlen);
1389 
1390 		break;
1391 	}
1392 
1393 	case Compression::LZ4:
1394 
1395 		len = LZ4_compress_default(
1396 			reinterpret_cast<char*>(src) + FIL_PAGE_DATA,
1397 			reinterpret_cast<char*>(dst) + FIL_PAGE_DATA,
1398 			static_cast<int>(content_len),
1399 			static_cast<int>(out_len));
1400 
1401 		ut_a(len <= src_len - FIL_PAGE_DATA);
1402 
1403 		if (len == 0  || len >= out_len) {
1404 
1405 			*dst_len = src_len;
1406 
1407 			return(src);
1408 		}
1409 
1410 		break;
1411 
1412 	default:
1413 		*dst_len = src_len;
1414 		return(src);
1415 	}
1416 
1417 	ut_a(len <= out_len);
1418 
1419 	ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
1420 		     src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)
1421 	      == 0);
1422 
1423 	/* Copy the header as is. */
1424 	memmove(dst, src, FIL_PAGE_DATA);
1425 
1426 	/* Add compression control information. Required for decompressing. */
1427 	mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
1428 
1429 	mach_write_to_1(dst + FIL_PAGE_VERSION, Compression::FIL_PAGE_VERSION_2);
1430 
1431 	mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);
1432 
1433 	mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
1434 
1435 	mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);
1436 
1437 	mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);
1438 
1439 	/* Round to the next full block size */
1440 
1441 	len += FIL_PAGE_DATA;
1442 
1443 	*dst_len = ut_calc_align(len, block_size);
1444 
1445 	ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA);
1446 
1447 	/* Clear out the unused portion of the page. */
1448 	if (len % block_size) {
1449 		memset(dst + len, 0x0, block_size - (len % block_size));
1450 	}
1451 
1452 	return(dst);
1453 }
1454 
1455 #ifdef UNIV_DEBUG
1456 # ifndef UNIV_HOTBACKUP
1457 /** Validates the consistency the aio system some of the time.
1458 @return true if ok or the check was skipped */
1459 bool
os_aio_validate_skip()1460 os_aio_validate_skip()
1461 {
1462 /** Try os_aio_validate() every this many times */
1463 # define OS_AIO_VALIDATE_SKIP	13
1464 
1465 	/** The os_aio_validate() call skip counter.
1466 	Use a signed type because of the race condition below. */
1467 	static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1468 
1469 	/* There is a race condition below, but it does not matter,
1470 	because this call is only for heuristic purposes. We want to
1471 	reduce the call frequency of the costly os_aio_validate()
1472 	check in debug builds. */
1473 	--os_aio_validate_count;
1474 
1475 	if (os_aio_validate_count > 0) {
1476 		return(true);
1477 	}
1478 
1479 	os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1480 	return(os_aio_validate());
1481 }
1482 # endif /* !UNIV_HOTBACKUP */
1483 #endif /* UNIV_DEBUG */
1484 
1485 #undef USE_FILE_LOCK
1486 //#define USE_FILE_LOCK
1487 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1488 /* InnoDB Hot Backup does not lock the data files.
1489  * On Windows, mandatory locking is used.
1490  */
1491 # undef USE_FILE_LOCK
1492 #endif
1493 #ifdef USE_FILE_LOCK
1494 /** Obtain an exclusive lock on a file.
1495 @param[in]	fd		file descriptor
1496 @param[in]	name		file name
1497 @return 0 on success */
1498 static
1499 int
os_file_lock(int fd,const char * name)1500 os_file_lock(
1501 	int		fd,
1502 	const char*	name)
1503 {
1504 	struct flock lk;
1505 
1506 	lk.l_type = F_WRLCK;
1507 	lk.l_whence = SEEK_SET;
1508 	lk.l_start = lk.l_len = 0;
1509 
1510 	if (fcntl(fd, F_SETLK, &lk) == -1) {
1511 
1512 		ib::error()
1513 			<< "Unable to lock " << name
1514 			<< " error: " << errno;
1515 
1516 		if (errno == EAGAIN || errno == EACCES) {
1517 
1518 			ib::info()
1519 				<< "Check that you do not already have"
1520 				" another mysqld process using the"
1521 				" same InnoDB data or log files.";
1522 		}
1523 
1524 		return(-1);
1525 	}
1526 
1527 	return(0);
1528 }
1529 #endif /* USE_FILE_LOCK */
1530 
1531 #ifndef UNIV_HOTBACKUP
1532 
1533 /** Calculates local segment number and aio array from global segment number.
1534 @param[out]	array		aio wait array
1535 @param[in]	segment		global segment number
1536 @return local segment number within the aio array */
1537 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1538 AIO::get_array_and_local_segment(
1539 	AIO**		array,
1540 	ulint		segment)
1541 {
1542 	ulint		local_segment;
1543 	ulint		n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1544 
1545 	ut_a(segment < os_aio_n_segments);
1546 
1547 	if (!srv_read_only_mode && segment < n_extra_segs) {
1548 
1549 		/* We don't support ibuf/log IO during read only mode. */
1550 
1551 		if (segment == IO_IBUF_SEGMENT) {
1552 
1553 			*array = s_ibuf;
1554 
1555 		} else if (segment == IO_LOG_SEGMENT) {
1556 
1557 			*array = s_log;
1558 
1559 		} else {
1560 			*array = NULL;
1561 		}
1562 
1563 		local_segment = 0;
1564 
1565 	} else if (segment < s_reads->m_n_segments + n_extra_segs) {
1566 
1567 		*array = s_reads;
1568 		local_segment = segment - n_extra_segs;
1569 
1570 	} else {
1571 		*array = s_writes;
1572 
1573 		local_segment = segment
1574 			      - (s_reads->m_n_segments + n_extra_segs);
1575 	}
1576 
1577 	return(local_segment);
1578 }
1579 
1580 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1581 @param[in,out]	slot		Slot to release */
1582 void
release(Slot * slot)1583 AIO::release(Slot* slot)
1584 {
1585 	ut_ad(is_mutex_owned());
1586 
1587 	ut_ad(slot->is_reserved);
1588 
1589 	slot->is_reserved = false;
1590 
1591 	--m_n_reserved;
1592 
1593 	if (m_n_reserved == m_slots.size() - 1) {
1594 		os_event_set(m_not_full);
1595 	}
1596 
1597 	if (m_n_reserved == 0) {
1598 		os_event_set(m_is_empty);
1599 	}
1600 
1601 #ifdef WIN_ASYNC_IO
1602 
1603 	ResetEvent(slot->handle);
1604 
1605 #elif defined(LINUX_NATIVE_AIO)
1606 
1607 	if (srv_use_native_aio) {
1608 		memset(&slot->control, 0x0, sizeof(slot->control));
1609 		slot->ret = 0;
1610 		slot->n_bytes = 0;
1611 	} else {
1612 		/* These fields should not be used if we are not
1613 		using native AIO. */
1614 		ut_ad(slot->n_bytes == 0);
1615 		ut_ad(slot->ret == 0);
1616 	}
1617 
1618 #endif /* WIN_ASYNC_IO */
1619 }
1620 
1621 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1622 @param[in,out]	slot		Slot to release */
1623 void
release_with_mutex(Slot * slot)1624 AIO::release_with_mutex(Slot* slot)
1625 {
1626 	acquire();
1627 
1628 	release(slot);
1629 
1630 	release();
1631 }
1632 
1633 /** Creates a temporary file.  This function is like tmpfile(3), but
1634 the temporary file is created in the given parameter path. If the path
1635 is NULL then it will create the file in the MySQL server configuration
1636 parameter (--tmpdir).
1637 @param[in]	path	location for creating temporary file
1638 @return temporary file handle, or NULL on error */
1639 FILE*
os_file_create_tmpfile(const char * path)1640 os_file_create_tmpfile(
1641 	const char*	path)
1642 {
1643 	FILE*	file	= NULL;
1644 	int	fd	= innobase_mysql_tmpfile(path);
1645 
1646 	if (fd >= 0) {
1647 		file = fdopen(fd, "w+b");
1648 	}
1649 
1650 	if (file == NULL) {
1651 
1652 		ib::error()
1653 			<< "Unable to create temporary file; errno: "
1654 			<< errno;
1655 
1656 		if (fd >= 0) {
1657 			close(fd);
1658 		}
1659 	}
1660 
1661 	return(file);
1662 }
1663 
1664 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1665 NUL-terminate str. All errors are silently ignored. This function is
1666 mostly meant to be used with temporary files.
1667 @param[in,out]	file		File to read from
1668 @param[in,out]	str		Buffer where to read
1669 @param[in]	size		Size of buffer */
1670 void
os_file_read_string(FILE * file,char * str,ulint size)1671 os_file_read_string(
1672 	FILE*		file,
1673 	char*		str,
1674 	ulint		size)
1675 {
1676 	if (size != 0) {
1677 		rewind(file);
1678 
1679 		size_t	flen = fread(str, 1, size - 1, file);
1680 
1681 		str[flen] = '\0';
1682 	}
1683 }
1684 
1685 /** Decompress after a read and punch a hole in the file if it was a write
1686 @param[in]	type		IO context
1687 @param[in]	fh		Open file handle
1688 @param[in,out]	buf		Buffer to transform
1689 @param[in,out]	scratch		Scratch area for read decompression
1690 @param[in]	src_len		Length of the buffer before compression
1691 @param[in]	len		Used buffer length for write and output
1692 				buf len for read
1693 @return DB_SUCCESS or error code */
1694 static
1695 dberr_t
os_file_io_complete(const IORequest & type,os_file_t fh,byte * buf,byte * scratch,ulint src_len,os_offset_t offset,ulint len)1696 os_file_io_complete(
1697 	const IORequest&type,
1698 	os_file_t	fh,
1699 	byte*		buf,
1700 	byte*		scratch,
1701 	ulint		src_len,
1702 	os_offset_t	offset,
1703 	ulint		len)
1704 {
1705 	/* We never compress/decompress the first page */
1706 	ut_a(offset > 0);
1707 	ut_ad(type.validate());
1708 
1709 	if (!type.is_compression_enabled()) {
1710 
1711 		return(DB_SUCCESS);
1712 
1713 	} else if (type.is_read() && !srv_backup_mode) {
1714 		/* Do not decrypt / decompress when taking a backup.
1715 		   We actually decompress the pages in fil_cur.
1716 		   We want encrypted pages to remain encrypted. */
1717 		dberr_t		ret;
1718 		Encryption	encryption(type.encryption_algorithm());
1719 
1720 		ut_ad(!type.is_log());
1721 
1722 		ret = encryption.decrypt(type, buf, src_len, scratch, len);
1723 		if (ret == DB_SUCCESS) {
1724 			return(os_file_decompress_page(
1725 					type.is_dblwr_recover(),
1726 					buf, scratch, len));
1727 		} else {
1728 			return(ret);
1729 		}
1730 
1731 	} else if (type.punch_hole()) {
1732 
1733 		ut_ad(len <= src_len);
1734 		ut_ad(!type.is_log());
1735 		ut_ad(type.is_write());
1736 		ut_ad(type.is_compressed());
1737 
1738 		/* Nothing to do. */
1739 		if (len == src_len) {
1740 			return(DB_SUCCESS);
1741 		}
1742 
1743 #ifdef UNIV_DEBUG
1744 		const ulint	block_size = type.block_size();
1745 #endif /* UNIV_DEBUG */
1746 
1747 		/* We don't support multiple page sizes in the server
1748 		at the moment. */
1749 		ut_ad(src_len == srv_page_size);
1750 
1751 		/* Must be a multiple of the compression unit size. */
1752 		ut_ad((len % block_size) == 0);
1753 		ut_ad((offset % block_size) == 0);
1754 
1755 		ut_ad(len + block_size <= src_len);
1756 
1757 		offset += len;
1758 
1759 		return(os_file_punch_hole(fh, offset, src_len - len));
1760 	}
1761 
1762 	ut_ad(!type.is_log());
1763 
1764 	return(DB_SUCCESS);
1765 }
1766 
1767 #endif /* !UNIV_HOTBACKUP */
1768 
1769 /** This function returns a new path name after replacing the basename
1770 in an old path with a new basename.  The old_path is a full path
1771 name including the extension.  The tablename is in the normal
1772 form "databasename/tablename".  The new base name is found after
1773 the forward slash.  Both input strings are null terminated.
1774 
1775 This function allocates memory to be returned.  It is the callers
1776 responsibility to free the return value after it is no longer needed.
1777 
1778 @param[in]	old_path		Pathname
1779 @param[in]	tablename		Contains new base name
1780 @return own: new full pathname */
1781 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1782 os_file_make_new_pathname(
1783 	const char*	old_path,
1784 	const char*	tablename)
1785 {
1786 	ulint		dir_len;
1787 	char*		last_slash;
1788 	char*		base_name;
1789 	char*		new_path;
1790 	ulint		new_path_len;
1791 
1792 	/* Split the tablename into its database and table name components.
1793 	They are separated by a '/'. */
1794 	last_slash = strrchr((char*) tablename, '/');
1795 	base_name = last_slash ? last_slash + 1 : (char*) tablename;
1796 
1797 	/* Find the offset of the last slash. We will strip off the
1798 	old basename.ibd which starts after that slash. */
1799 	last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1800 	dir_len = last_slash ? last_slash - old_path : strlen(old_path);
1801 
1802 	/* allocate a new path and move the old directory path to it. */
1803 	new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1804 	new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1805 	memcpy(new_path, old_path, dir_len);
1806 
1807 	ut_snprintf(new_path + dir_len,
1808 		    new_path_len - dir_len,
1809 		    "%c%s.ibd",
1810 		    OS_PATH_SEPARATOR,
1811 		    base_name);
1812 
1813 	return(new_path);
1814 }
1815 
1816 /** This function reduces a null-terminated full remote path name into
1817 the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
1818 the 'databasename/tablename.ibd' found at the end of the path with just
1819 'tablename'.
1820 
1821 Since the result is always smaller than the path sent in, no new memory
1822 is allocated. The caller should allocate memory for the path sent in.
1823 This function manipulates that path in place.
1824 
1825 If the path format is not as expected, just return.  The result is used
1826 to inform a SHOW CREATE TABLE command.
1827 @param[in,out]	data_dir_path		Full path/data_dir_path */
1828 void
os_file_make_data_dir_path(char * data_dir_path)1829 os_file_make_data_dir_path(
1830 	char*	data_dir_path)
1831 {
1832 	/* Replace the period before the extension with a null byte. */
1833 	char*	ptr = strrchr((char*) data_dir_path, '.');
1834 
1835 	if (ptr == NULL) {
1836 		return;
1837 	}
1838 
1839 	ptr[0] = '\0';
1840 
1841 	/* The tablename starts after the last slash. */
1842 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1843 
1844 	if (ptr == NULL) {
1845 		return;
1846 	}
1847 
1848 	ptr[0] = '\0';
1849 
1850 	char*	tablename = ptr + 1;
1851 
1852 	/* The databasename starts after the next to last slash. */
1853 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1854 
1855 	if (ptr == NULL) {
1856 		return;
1857 	}
1858 
1859 	ulint	tablename_len = ut_strlen(tablename);
1860 
1861 	ut_memmove(++ptr, tablename, tablename_len);
1862 
1863 	ptr[tablename_len] = '\0';
1864 }
1865 
1866 /** Check if the path refers to the root of a drive using a pointer
1867 to the last directory separator that the caller has fixed.
1868 @param[in]	path	path name
1869 @param[in]	path	last directory separator in the path
1870 @return true if this path is a drive root, false if not */
1871 UNIV_INLINE
1872 bool
os_file_is_root(const char * path,const char * last_slash)1873 os_file_is_root(
1874 	const char*	path,
1875 	const char*	last_slash)
1876 {
1877 	return(
1878 #ifdef _WIN32
1879 	       (last_slash == path + 2 && path[1] == ':') ||
1880 #endif /* _WIN32 */
1881 	       last_slash == path);
1882 }
1883 
1884 /** Return the parent directory component of a null-terminated path.
1885 Return a new buffer containing the string up to, but not including,
1886 the final component of the path.
1887 The path returned will not contain a trailing separator.
1888 Do not return a root path, return NULL instead.
1889 The final component trimmed off may be a filename or a directory name.
1890 If the final component is the only component of the path, return NULL.
1891 It is the caller's responsibility to free the returned string after it
1892 is no longer needed.
1893 @param[in]	path		Path name
1894 @return own: parent directory of the path */
1895 static
1896 char*
os_file_get_parent_dir(const char * path)1897 os_file_get_parent_dir(
1898 	const char*	path)
1899 {
1900 	bool	has_trailing_slash = false;
1901 
1902 	/* Find the offset of the last slash */
1903 	const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
1904 
1905 	if (!last_slash) {
1906 		/* No slash in the path, return NULL */
1907 		return(NULL);
1908 	}
1909 
1910 	/* Ok, there is a slash. Is there anything after it? */
1911 	if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1912 		has_trailing_slash = true;
1913 	}
1914 
1915 	/* Reduce repetative slashes. */
1916 	while (last_slash > path
1917 		&& last_slash[-1] == OS_PATH_SEPARATOR) {
1918 		last_slash--;
1919 	}
1920 
1921 	/* Check for the root of a drive. */
1922 	if (os_file_is_root(path, last_slash)) {
1923 		return(NULL);
1924 	}
1925 
1926 	/* If a trailing slash prevented the first strrchr() from trimming
1927 	the last component of the path, trim that component now. */
1928 	if (has_trailing_slash) {
1929 		/* Back up to the previous slash. */
1930 		last_slash--;
1931 		while (last_slash > path
1932 		       && last_slash[0] != OS_PATH_SEPARATOR) {
1933 			last_slash--;
1934 		}
1935 
1936 		/* Reduce repetative slashes. */
1937 		while (last_slash > path
1938 			&& last_slash[-1] == OS_PATH_SEPARATOR) {
1939 			last_slash--;
1940 		}
1941 	}
1942 
1943 	/* Check for the root of a drive. */
1944 	if (os_file_is_root(path, last_slash)) {
1945 		return(NULL);
1946 	}
1947 
1948 	if (last_slash - path < 0) {
1949 		/* Sanity check, it prevents gcc from trying to handle this case which
1950 		 * results in warnings for some optimized builds */
1951 		return (NULL);
1952 	}
1953 
1954 	/* Non-trivial directory component */
1955 
1956 	return(mem_strdupl(path, last_slash - path));
1957 }
1958 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1959 
1960 /* Test the function os_file_get_parent_dir. */
1961 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1962 test_os_file_get_parent_dir(
1963 	const char*	child_dir,
1964 	const char*	expected_dir)
1965 {
1966 	char* child = mem_strdup(child_dir);
1967 	char* expected = expected_dir == NULL ? NULL
1968 			 : mem_strdup(expected_dir);
1969 
1970 	/* os_file_get_parent_dir() assumes that separators are
1971 	converted to OS_PATH_SEPARATOR. */
1972 	os_normalize_path(child);
1973 	os_normalize_path(expected);
1974 
1975 	char* parent = os_file_get_parent_dir(child);
1976 
1977 	bool unexpected = (expected == NULL
1978 			  ? (parent != NULL)
1979 			  : (0 != strcmp(parent, expected)));
1980 	if (unexpected) {
1981 		ib::fatal() << "os_file_get_parent_dir('" << child
1982 			<< "') returned '" << parent
1983 			<< "', instead of '" << expected << "'.";
1984 	}
1985 	ut_free(parent);
1986 	ut_free(child);
1987 	ut_free(expected);
1988 }
1989 
1990 /* Test the function os_file_get_parent_dir. */
1991 void
unit_test_os_file_get_parent_dir()1992 unit_test_os_file_get_parent_dir()
1993 {
1994 	test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1995 	test_os_file_get_parent_dir("/usr/", NULL);
1996 	test_os_file_get_parent_dir("//usr//", NULL);
1997 	test_os_file_get_parent_dir("usr", NULL);
1998 	test_os_file_get_parent_dir("usr//", NULL);
1999 	test_os_file_get_parent_dir("/", NULL);
2000 	test_os_file_get_parent_dir("//", NULL);
2001 	test_os_file_get_parent_dir(".", NULL);
2002 	test_os_file_get_parent_dir("..", NULL);
2003 # ifdef _WIN32
2004 	test_os_file_get_parent_dir("D:", NULL);
2005 	test_os_file_get_parent_dir("D:/", NULL);
2006 	test_os_file_get_parent_dir("D:\\", NULL);
2007 	test_os_file_get_parent_dir("D:/data", NULL);
2008 	test_os_file_get_parent_dir("D:/data/", NULL);
2009 	test_os_file_get_parent_dir("D:\\data\\", NULL);
2010 	test_os_file_get_parent_dir("D:///data/////", NULL);
2011 	test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
2012 	test_os_file_get_parent_dir("D:/data//a", "D:/data");
2013 	test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
2014 	test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
2015 	test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
2016 #endif  /* _WIN32 */
2017 }
2018 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
2019 
2020 
2021 /** Creates all missing subdirectories along the given path.
2022 @param[in]	path		Path name
2023 @return DB_SUCCESS if OK, otherwise error code. */
2024 dberr_t
os_file_create_subdirs_if_needed(const char * path)2025 os_file_create_subdirs_if_needed(
2026 	const char*	path)
2027 {
2028 	if (srv_read_only_mode) {
2029 
2030 		ib::error()
2031 			<< "read only mode set. Can't create "
2032 			<< "subdirectories '" << path << "'";
2033 
2034 		return(DB_READ_ONLY);
2035 
2036 	}
2037 
2038 	char*	subdir = os_file_get_parent_dir(path);
2039 
2040 	if (subdir == NULL) {
2041 		/* subdir is root or cwd, nothing to do */
2042 		return(DB_SUCCESS);
2043 	}
2044 
2045 	/* Test if subdir exists */
2046 	os_file_type_t	type;
2047 	bool	subdir_exists;
2048 	bool	success = os_file_status(subdir, &subdir_exists, &type);
2049 
2050 	if (success && !subdir_exists) {
2051 
2052 		/* Subdir does not exist, create it */
2053 		dberr_t	err = os_file_create_subdirs_if_needed(subdir);
2054 
2055 		if (err != DB_SUCCESS) {
2056 
2057 			ut_free(subdir);
2058 
2059 			return(err);
2060 		}
2061 
2062 		success = os_file_create_directory(subdir, false);
2063 	}
2064 
2065 	ut_free(subdir);
2066 
2067 	return(success ? DB_SUCCESS : DB_ERROR);
2068 }
2069 
2070 /** Allocate the buffer for IO on a transparently compressed table.
2071 @param[in]	type		IO flags
2072 @param[out]	buf		buffer to read or write
2073 @param[in,out]	n		number of bytes to read/write, starting from
2074 				offset
2075 @return pointer to allocated page, compressed data is written to the offset
2076 	that is aligned on the disk sector size */
2077 static
2078 Block*
os_file_compress_page(IORequest & type,void * & buf,ulint * n)2079 os_file_compress_page(
2080 	IORequest&	type,
2081 	void*&		buf,
2082 	ulint*		n)
2083 {
2084 	ut_ad(!type.is_log());
2085 	ut_ad(type.is_write());
2086 	ut_ad(type.is_compressed());
2087 
2088 	ulint	n_alloc = *n * 2;
2089 
2090 	ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
2091 	ut_a(type.compression_algorithm().m_type != Compression::LZ4
2092 	     || static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
2093 
2094 	Block*  block = os_alloc_block();
2095 
2096 	ulint	old_compressed_len;
2097 	ulint	compressed_len = *n;
2098 
2099 	old_compressed_len = mach_read_from_2(
2100 		reinterpret_cast<byte*>(buf)
2101 		+ FIL_PAGE_COMPRESS_SIZE_V1);
2102 
2103 	if (old_compressed_len > 0) {
2104 		old_compressed_len = ut_calc_align(
2105 			old_compressed_len + FIL_PAGE_DATA,
2106 			type.block_size());
2107 	} else {
2108 		old_compressed_len = *n;
2109 	}
2110 
2111 	byte*	compressed_page;
2112 
2113 	compressed_page = static_cast<byte*>(
2114 		ut_align(block->m_ptr, os_io_ptr_align));
2115 
2116 	byte*	buf_ptr;
2117 
2118 	buf_ptr = os_file_compress_page(
2119 		type.compression_algorithm(),
2120 		type.block_size(),
2121 		reinterpret_cast<byte*>(buf),
2122 		*n,
2123 		compressed_page,
2124 		&compressed_len);
2125 
2126 	if (buf_ptr != buf) {
2127 		/* Set new compressed size to uncompressed page. */
2128 		memcpy(reinterpret_cast<byte*>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
2129 		       buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
2130 
2131 		buf = buf_ptr;
2132 		*n = compressed_len;
2133 
2134 		if (compressed_len >= old_compressed_len) {
2135 
2136 			ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
2137 
2138 			type.clear_punch_hole();
2139 		}
2140 	}
2141 
2142 	return(block);
2143 }
2144 
2145 /** Encrypt a page content when write it to disk.
2146 @param[in]	type		IO flags
2147 @param[out]	buf		buffer to read or write
2148 @param[in,out]	n		number of bytes to read/write, starting from
2149 				offset
2150 @return pointer to the encrypted page */
2151 static
2152 Block*
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)2153 os_file_encrypt_page(
2154 	const IORequest&	type,
2155 	void*&			buf,
2156 	ulint*			n)
2157 {
2158 
2159 	byte*		encrypted_page;
2160 	ulint		encrypted_len = *n;
2161 	byte*		buf_ptr;
2162 	Encryption	encryption(type.encryption_algorithm());
2163 
2164 	ut_ad(!type.is_log());
2165 	ut_ad(type.is_write());
2166 	ut_ad(type.is_encrypted());
2167 
2168 	Block*  block = os_alloc_block();
2169 
2170 	encrypted_page = static_cast<byte*>(
2171 		ut_align(block->m_ptr, os_io_ptr_align));
2172 
2173 	buf_ptr = encryption.encrypt(type,
2174 				     reinterpret_cast<byte*>(buf), *n,
2175 				     encrypted_page, &encrypted_len);
2176 
2177 	bool	encrypted = buf_ptr != buf;
2178 
2179 	if (encrypted) {
2180 
2181 		buf = buf_ptr;
2182 		*n = encrypted_len;
2183 	}
2184 
2185 	return(block);
2186 }
2187 
2188 #ifndef _WIN32
2189 
2190 /** Do the read/write
2191 @param[in]	request	The IO context and type
2192 @return the number of bytes read/written or negative value on error */
2193 ssize_t
execute(const IORequest & request)2194 SyncFileIO::execute(const IORequest& request)
2195 {
2196 	ssize_t	n_bytes;
2197 
2198 	if (request.is_read()) {
2199 		n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2200 	} else {
2201 		ut_ad(request.is_write());
2202 		n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2203 	}
2204 
2205 	return(n_bytes);
2206 }
2207 
2208 /** Free storage space associated with a section of the file.
2209 @param[in]	fh		Open file handle
2210 @param[in]	off		Starting offset (SEEK_SET)
2211 @param[in]	len		Size of the hole
2212 @return DB_SUCCESS or error code */
2213 static
2214 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2215 os_file_punch_hole_posix(
2216 	os_file_t	fh,
2217 	os_offset_t	off,
2218 	os_offset_t	len)
2219 {
2220 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2221 	const int	mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2222 
2223 	int             ret = fallocate(fh, mode, off, len);
2224 
2225 	if (ret == 0) {
2226 		return(DB_SUCCESS);
2227 	}
2228 
2229 	ut_a(ret == -1);
2230 
2231 	if (errno == ENOTSUP) {
2232 		return(DB_IO_NO_PUNCH_HOLE);
2233 	}
2234 
2235 	ib::warn()
2236 		<< "fallocate(" << fh
2237 		<<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2238 		<< off << ", " << len << ") returned errno: "
2239 		<<  errno;
2240 
2241 	return(DB_IO_ERROR);
2242 
2243 #elif defined(UNIV_SOLARIS)
2244 
2245 	// Use F_FREESP
2246 
2247 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2248 
2249 	return(DB_IO_NO_PUNCH_HOLE);
2250 }
2251 
2252 #if defined(LINUX_NATIVE_AIO)
2253 
2254 /** Linux native AIO handler */
2255 class LinuxAIOHandler {
2256 public:
2257 	/**
2258 	@param[in] global_segment	The global segment*/
LinuxAIOHandler(ulint global_segment)2259 	LinuxAIOHandler(ulint global_segment)
2260 		:
2261 		m_global_segment(global_segment)
2262 	{
2263 		/* Should never be doing Sync IO here. */
2264 		ut_a(m_global_segment != ULINT_UNDEFINED);
2265 
2266 		/* Find the array and the local segment. */
2267 
2268 		m_segment = AIO::get_array_and_local_segment(
2269 			&m_array, m_global_segment);
2270 
2271 		m_n_slots = m_array->slots_per_segment();
2272 	}
2273 
2274 	/** Destructor */
~LinuxAIOHandler()2275 	~LinuxAIOHandler()
2276 	{
2277 		// No op
2278 	}
2279 
2280 	/**
2281 	Process a Linux AIO request
2282 	@param[out]	m1		the messages passed with the
2283 	@param[out]	m2		AIO request; note that in case the
2284 					AIO operation failed, these output
2285 					parameters are valid and can be used to
2286 					restart the operation.
2287 	@param[out]	request		IO context
2288 	@return DB_SUCCESS or error code */
2289 	dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
2290 
2291 private:
2292 	/** Resubmit an IO request that was only partially successful
2293 	@param[in,out]	slot		Request to resubmit
2294 	@return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2295 	dberr_t	resubmit(Slot* slot);
2296 
2297 	/** Check if the AIO succeeded
2298 	@param[in,out]	slot		The slot to check
2299 	@return DB_SUCCESS, DB_FAIL if the operation should be retried or
2300 		DB_IO_ERROR on all other errors */
2301 	dberr_t	check_state(Slot* slot);
2302 
2303 	/** @return true if a shutdown was detected */
is_shutdown() const2304 	bool is_shutdown() const
2305 	{
2306 		return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2307 		       && !buf_page_cleaner_is_active);
2308 	}
2309 
2310 	/** If no slot was found then the m_array->m_mutex will be released.
2311 	@param[out]	n_pending	The number of pending IOs
2312 	@return NULL or a slot that has completed IO */
2313 	Slot* find_completed_slot(ulint* n_pending);
2314 
2315 	/** This is called from within the IO-thread. If there are no completed
2316 	IO requests in the slot array, the thread calls this function to
2317 	collect more requests from the Linux kernel.
2318 	The IO-thread waits on io_getevents(), which is a blocking call, with
2319 	a timeout value. Unless the system is very heavy loaded, keeping the
2320 	IO-thread very busy, the io-thread will spend most of its time waiting
2321 	in this function.
2322 	The IO-thread also exits in this function. It checks server status at
2323 	each wakeup and that is why we use timed wait in io_getevents(). */
2324 	void collect();
2325 
2326 private:
2327 	/** Slot array */
2328 	AIO*			m_array;
2329 
2330 	/** Number of slots inthe local segment */
2331 	ulint			m_n_slots;
2332 
2333 	/** The local segment to check */
2334 	ulint			m_segment;
2335 
2336 	/** The global segment */
2337 	ulint			m_global_segment;
2338 };
2339 
2340 /** Resubmit an IO request that was only partially successful
2341 @param[in,out]	slot		Request to resubmit
2342 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2343 dberr_t
resubmit(Slot * slot)2344 LinuxAIOHandler::resubmit(Slot* slot)
2345 {
2346 #ifdef UNIV_DEBUG
2347 	/* Bytes already read/written out */
2348 	ulint	n_bytes = slot->ptr - slot->buf;
2349 
2350 	ut_ad(m_array->is_mutex_owned());
2351 
2352 	ut_ad(n_bytes < slot->original_len);
2353 	ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2354 	/* Partial read or write scenario */
2355 	ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2356 #endif /* UNIV_DEBUG */
2357 
2358 	slot->len -= slot->n_bytes;
2359 	slot->ptr += slot->n_bytes;
2360 	slot->offset += slot->n_bytes;
2361 
2362 	/* Resetting the bytes read/written */
2363 	slot->n_bytes = 0;
2364 	slot->io_already_done = false;
2365 
2366 	/* make sure that slot->offset fits in off_t */
2367 	ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2368 
2369 	struct iocb*	iocb = &slot->control;
2370 	if (slot->type.is_read()) {
2371 		io_prep_pread(
2372 			iocb,
2373 			slot->file.m_file,
2374 			slot->ptr,
2375 			slot->len,
2376 			slot->offset);
2377 
2378 	} else {
2379 
2380 		ut_a(slot->type.is_write());
2381 
2382 		io_prep_pwrite(
2383 			iocb,
2384 			slot->file.m_file,
2385 			slot->ptr,
2386 			slot->len,
2387 			slot->offset);
2388 	}
2389 
2390 	iocb->data = slot;
2391 
2392 	/* Resubmit an I/O request */
2393 	int	ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2394 
2395 	if (ret < -1)  {
2396 		errno = -ret;
2397 	}
2398 
2399 	return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2400 }
2401 
2402 /** Check if the AIO succeeded
2403 @param[in,out]	slot		The slot to check
2404 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2405 	DB_IO_ERROR on all other errors */
2406 dberr_t
check_state(Slot * slot)2407 LinuxAIOHandler::check_state(Slot* slot)
2408 {
2409 	ut_ad(m_array->is_mutex_owned());
2410 
2411 	/* Note that it may be that there is more then one completed
2412 	IO requests. We process them one at a time. We may have a case
2413 	here to improve the performance slightly by dealing with all
2414 	requests in one sweep. */
2415 
2416 	srv_set_io_thread_op_info(
2417 		m_global_segment, "processing completed aio requests");
2418 
2419 	ut_ad(slot->io_already_done);
2420 
2421 	dberr_t	err;
2422 
2423 	if (slot->ret == 0) {
2424 
2425 		err = AIOHandler::post_io_processing(slot);
2426 
2427 	} else {
2428 		errno = -slot->ret;
2429 
2430 		/* os_file_handle_error does tell us if we should retry
2431 		this IO. As it stands now, we don't do this retry when
2432 		reaping requests from a different context than
2433 		the dispatcher. This non-retry logic is the same for
2434 		Windows and Linux native AIO.
2435 		We should probably look into this to transparently
2436 		re-submit the IO. */
2437 		os_file_handle_error(slot->name, "Linux aio");
2438 
2439 		err = DB_IO_ERROR;
2440 	}
2441 
2442 	return(err);
2443 }
2444 
2445 /** If no slot was found then the m_array->m_mutex will be released.
2446 @param[out]	n_pending		The number of pending IOs
2447 @return NULL or a slot that has completed IO */
2448 Slot*
find_completed_slot(ulint * n_pending)2449 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
2450 {
2451 	ulint	offset = m_n_slots * m_segment;
2452 
2453 	*n_pending = 0;
2454 
2455 	m_array->acquire();
2456 
2457 	Slot*	slot = m_array->at(offset);
2458 
2459 	for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2460 
2461 		if (slot->is_reserved) {
2462 
2463 			++*n_pending;
2464 
2465 			if (slot->io_already_done) {
2466 
2467 				/* Something for us to work on.
2468 				Note: We don't release the mutex. */
2469 				return(slot);
2470 			}
2471 		}
2472 	}
2473 
2474 	m_array->release();
2475 
2476 	return(NULL);
2477 }
2478 
2479 /** This function is only used in Linux native asynchronous i/o. This is
2480 called from within the io-thread. If there are no completed IO requests
2481 in the slot array, the thread calls this function to collect more
2482 requests from the kernel.
2483 The io-thread waits on io_getevents(), which is a blocking call, with
2484 a timeout value. Unless the system is very heavy loaded, keeping the
2485 io-thread very busy, the io-thread will spend most of its time waiting
2486 in this function.
2487 The io-thread also exits in this function. It checks server status at
2488 each wakeup and that is why we use timed wait in io_getevents(). */
2489 void
collect()2490 LinuxAIOHandler::collect()
2491 {
2492 	ut_ad(m_n_slots > 0);
2493 	ut_ad(m_array != NULL);
2494 	ut_ad(m_segment < m_array->get_n_segments());
2495 
2496 	/* Which io_context we are going to use. */
2497 	io_context*	io_ctx = m_array->io_ctx(m_segment);
2498 
2499 	/* Starting point of the m_segment we will be working on. */
2500 	ulint	start_pos = m_segment * m_n_slots;
2501 
2502 	/* End point. */
2503 	ulint	end_pos = start_pos + m_n_slots;
2504 
2505 	for (;;) {
2506 		struct io_event*	events;
2507 
2508 		/* Which part of event array we are going to work on. */
2509 		events = m_array->io_events(m_segment * m_n_slots);
2510 
2511 		/* Initialize the events. */
2512 		memset(events, 0, sizeof(*events) * m_n_slots);
2513 
2514 		/* The timeout value is arbitrary. We probably need
2515 		to experiment with it a little. */
2516 		struct timespec		timeout;
2517 
2518 		timeout.tv_sec = 0;
2519 		timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
2520 
2521 		int	ret;
2522 
2523 		ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
2524 
2525 		for (int i = 0; i < ret; ++i) {
2526 
2527 			struct iocb*	iocb;
2528 
2529 			iocb = reinterpret_cast<struct iocb*>(events[i].obj);
2530 			ut_a(iocb != NULL);
2531 
2532 			Slot*	slot = reinterpret_cast<Slot*>(iocb->data);
2533 
2534 			/* Some sanity checks. */
2535 			ut_a(slot != NULL);
2536 			ut_a(slot->is_reserved);
2537 
2538 			/* We are not scribbling previous segment. */
2539 			ut_a(slot->pos >= start_pos);
2540 
2541 			/* We have not overstepped to next segment. */
2542 			ut_a(slot->pos < end_pos);
2543 
2544 			/* We never compress/decompress the first page */
2545 
2546 			if (slot->offset > 0
2547 			    && !slot->skip_punch_hole
2548 			    && slot->type.is_compression_enabled()
2549 			    && !slot->type.is_log()
2550 			    && slot->type.is_write()
2551 			    && slot->type.is_compressed()
2552 			    && slot->type.punch_hole()) {
2553 
2554 				slot->err = AIOHandler::io_complete(slot);
2555 			} else {
2556 				slot->err = DB_SUCCESS;
2557 			}
2558 
2559 			/* Mark this request as completed. The error handling
2560 			will be done in the calling function. */
2561 			m_array->acquire();
2562 
2563 			/* events[i].res2 should always be ZERO */
2564 			ut_ad(events[i].res2 == 0);
2565 			slot->io_already_done = true;
2566 
2567 			/*Even though events[i].res is an unsigned number
2568 			in libaio, it is used to return a negative value
2569 			(negated errno value) to indicate error and a positive
2570 			value to indicate number of bytes read or written. */
2571 
2572 			if (events[i].res > slot->len) {
2573 				/* failure */
2574 				slot->n_bytes = 0;
2575 				slot->ret = events[i].res;
2576 			} else {
2577 				/* success */
2578 				slot->n_bytes = events[i].res;
2579 				slot->ret = 0;
2580 			}
2581 			m_array->release();
2582 		}
2583 
2584 		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2585 		    || !buf_page_cleaner_is_active
2586 		    || ret > 0) {
2587 
2588 			break;
2589 		}
2590 
2591 		/* This error handling is for any error in collecting the
2592 		IO requests. The errors, if any, for any particular IO
2593 		request are simply passed on to the calling routine. */
2594 
2595 		switch (ret) {
2596 		case -EAGAIN:
2597 			/* Not enough resources! Try again. */
2598 
2599 		case -EINTR:
2600 			/* Interrupted! The behaviour in case of an interrupt.
2601 			If we have some completed IOs available then the
2602 			return code will be the number of IOs. We get EINTR
2603 			only if there are no completed IOs and we have been
2604 			interrupted. */
2605 
2606 		case 0:
2607 			/* No pending request! Go back and check again. */
2608 
2609 			continue;
2610 		}
2611 
2612 		/* All other errors should cause a trap for now. */
2613 		ib::fatal()
2614 			<< "Unexpected ret_code[" << ret
2615 			<< "] from io_getevents()!";
2616 
2617 		break;
2618 	}
2619 }
2620 
2621 /** Process a Linux AIO request
2622 @param[out]	m1		the messages passed with the
2623 @param[out]	m2		AIO request; note that in case the
2624 				AIO operation failed, these output
2625 				parameters are valid and can be used to
2626 				restart the operation.
2627 @param[out]	request		IO context
2628 @return DB_SUCCESS or error code */
2629 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)2630 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
2631 {
2632 	dberr_t		err;
2633 	Slot*		slot;
2634 
2635 	/* Loop until we have found a completed request. */
2636 	for (;;) {
2637 
2638 		ulint	n_pending;
2639 
2640 		slot = find_completed_slot(&n_pending);
2641 
2642 		if (slot != NULL) {
2643 
2644 			ut_ad(m_array->is_mutex_owned());
2645 
2646 			err = check_state(slot);
2647 
2648 			/* DB_FAIL is not a hard error, we should retry */
2649 			if (err != DB_FAIL) {
2650 				break;
2651 			}
2652 
2653 			/* Partial IO, resubmit request for
2654 			remaining bytes to read/write */
2655 			err = resubmit(slot);
2656 
2657 			if (err != DB_SUCCESS) {
2658 				break;
2659 			}
2660 
2661 			m_array->release();
2662 
2663 		} else if (is_shutdown() && n_pending == 0) {
2664 
2665 			/* There is no completed request. If there is
2666 			no pending request at all, and the system is
2667 			being shut down, exit. */
2668 
2669 			*m1 = NULL;
2670 			*m2 = NULL;
2671 
2672 			return(DB_SUCCESS);
2673 
2674 		} else {
2675 
2676 			/* Wait for some request. Note that we return
2677 			from wait if we have found a request. */
2678 
2679 			srv_set_io_thread_op_info(
2680 				m_global_segment,
2681 				"waiting for completed aio requests");
2682 
2683 			collect();
2684 		}
2685 	}
2686 
2687 	if (err == DB_IO_PARTIAL_FAILED) {
2688 		/* Aborting in case of submit failure */
2689 		ib::fatal()
2690 			<< "Native Linux AIO interface. "
2691 			"io_submit() call failed when "
2692 			"resubmitting a partial I/O "
2693 			"request on the file " << slot->name
2694 			<< ".";
2695 	}
2696 
2697 	*m1 = slot->m1;
2698 	*m2 = slot->m2;
2699 
2700 	*request = slot->type;
2701 
2702 	m_array->release(slot);
2703 
2704 	m_array->release();
2705 
2706 	return(err);
2707 }
2708 
2709 /** This function is only used in Linux native asynchronous i/o.
2710 Waits for an aio operation to complete. This function is used to wait for
2711 the completed requests. The aio array of pending requests is divided
2712 into segments. The thread specifies which segment or slot it wants to wait
2713 for. NOTE: this function will also take care of freeing the aio slot,
2714 therefore no other thread is allowed to do the freeing!
2715 
2716 @param[in]	global_seg	segment number in the aio array
2717 				to wait for; segment 0 is the ibuf
2718 				i/o thread, segment 1 is log i/o thread,
2719 				then follow the non-ibuf read threads,
2720 				and the last are the non-ibuf write
2721 				threads.
2722 @param[out]	m1		the messages passed with the
2723 @param[out]	m2			AIO request; note that in case the
2724 				AIO operation failed, these output
2725 				parameters are valid and can be used to
2726 				restart the operation.
2727 @param[out]xi	 request	IO context
2728 @return DB_SUCCESS if the IO was successful */
2729 static
2730 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2731 os_aio_linux_handler(
2732 	ulint		global_segment,
2733 	fil_node_t**	m1,
2734 	void**		m2,
2735 	IORequest*	request)
2736 {
2737 	LinuxAIOHandler	handler(global_segment);
2738 
2739 	dberr_t	err = handler.poll(m1, m2, request);
2740 
2741 	if (err == DB_IO_NO_PUNCH_HOLE) {
2742 		fil_no_punch_hole(*m1);
2743 		err = DB_SUCCESS;
2744 	}
2745 
2746 	return(err);
2747 }
2748 
2749 /** Dispatch an AIO request to the kernel.
2750 @param[in,out]	slot		an already reserved slot
2751 @return true on success. */
2752 bool
linux_dispatch(Slot * slot)2753 AIO::linux_dispatch(Slot* slot)
2754 {
2755 	ut_a(slot->is_reserved);
2756 	ut_ad(slot->type.validate());
2757 
2758 	/* Find out what we are going to work with.
2759 	The iocb struct is directly in the slot.
2760 	The io_context is one per segment. */
2761 
2762 	ulint		io_ctx_index;
2763 	struct iocb*	iocb = &slot->control;
2764 
2765 	io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2766 
2767 	int	ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
2768 
2769 	/* io_submit() returns number of successfully queued requests
2770 	or -errno. */
2771 
2772 	if (ret != 1) {
2773 		errno = -ret;
2774 	}
2775 
2776 	return(ret == 1);
2777 }
2778 
2779 /** Creates an io_context for native linux AIO.
2780 @param[in]	max_events	number of events
2781 @param[out]	io_ctx		io_ctx to initialize.
2782 @return true on success. */
2783 bool
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)2784 AIO::linux_create_io_ctx(
2785 	ulint		max_events,
2786 	io_context_t*	io_ctx)
2787 {
2788 	ssize_t		n_retries = 0;
2789 
2790 	for (;;) {
2791 
2792 		memset(io_ctx, 0x0, sizeof(*io_ctx));
2793 
2794 		/* Initialize the io_ctx. Tell it how many pending
2795 		IO requests this context will handle. */
2796 
2797 		int	ret = io_setup(max_events, io_ctx);
2798 
2799 		if (ret == 0) {
2800 			/* Success. Return now. */
2801 			return(true);
2802 		}
2803 
2804 		/* If we hit EAGAIN we'll make a few attempts before failing. */
2805 
2806 		switch (ret) {
2807 		case -EAGAIN:
2808 			if (n_retries == 0) {
2809 				/* First time around. */
2810 				ib::warn()
2811 					<< "io_setup() failed with EAGAIN."
2812 					" Will make "
2813 					<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2814 					<< " attempts before giving up.";
2815 			}
2816 
2817 			if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2818 
2819 				++n_retries;
2820 
2821 				ib::warn()
2822 					<< "io_setup() attempt "
2823 					<< n_retries << ".";
2824 
2825 				os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2826 
2827 				continue;
2828 			}
2829 
2830 			/* Have tried enough. Better call it a day. */
2831 			ib::error()
2832 				<< "io_setup() failed with EAGAIN after "
2833 				<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2834 				<< " attempts.";
2835 			break;
2836 
2837 		case -ENOSYS:
2838 			ib::error()
2839 				<< "Linux Native AIO interface"
2840 				" is not supported on this platform. Please"
2841 				" check your OS documentation and install"
2842 				" appropriate binary of InnoDB.";
2843 
2844 			break;
2845 
2846 		default:
2847 			ib::error()
2848 				<< "Linux Native AIO setup"
2849 				<< " returned following error["
2850 				<< ret << "]";
2851 			break;
2852 		}
2853 
2854 		ib::info()
2855 			<< "You can disable Linux Native AIO by"
2856 			" setting innodb_use_native_aio = 0 in my.cnf";
2857 
2858 		break;
2859 	}
2860 
2861 	return(false);
2862 }
2863 
2864 /** Checks if the system supports native linux aio. On some kernel
2865 versions where native aio is supported it won't work on tmpfs. In such
2866 cases we can't use native aio as it is not possible to mix simulated
2867 and native aio.
2868 @return: true if supported, false otherwise. */
2869 bool
is_linux_native_aio_supported()2870 AIO::is_linux_native_aio_supported()
2871 {
2872 	int		fd;
2873 	io_context_t	io_ctx;
2874 	char		name[1000];
2875 
2876 	if (!linux_create_io_ctx(1, &io_ctx)) {
2877 
2878 		/* The platform does not support native aio. */
2879 
2880 		return(false);
2881 
2882 	} else if (!srv_read_only_mode) {
2883 
2884 		/* Now check if tmpdir supports native aio ops. */
2885 		fd = innobase_mysql_tmpfile(NULL);
2886 
2887 		if (fd < 0) {
2888 			ib::warn()
2889 				<< "Unable to create temp file to check"
2890 				" native AIO support.";
2891 
2892 			return(false);
2893 		}
2894 	} else {
2895 
2896 		os_normalize_path(srv_log_group_home_dir);
2897 
2898 		ulint	dirnamelen = strlen(srv_log_group_home_dir);
2899 
2900 		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2901 
2902 		memcpy(name, srv_log_group_home_dir, dirnamelen);
2903 
2904 		/* Add a path separator if needed. */
2905 		if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2906 
2907 			name[dirnamelen++] = OS_PATH_SEPARATOR;
2908 		}
2909 
2910 		strcpy(name + dirnamelen, "ib_logfile0");
2911 
2912 		fd = ::open(name, O_RDONLY);
2913 
2914 		if (fd == -1) {
2915 
2916 			ib::warn()
2917 				<< "Unable to open"
2918 				<< " \"" << name << "\" to check native"
2919 				<< " AIO read support.";
2920 
2921 			return(false);
2922 		}
2923 	}
2924 
2925 	struct io_event	io_event;
2926 
2927 	memset(&io_event, 0x0, sizeof(io_event));
2928 
2929 	byte*	buf = static_cast<byte*>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
2930 	byte*	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
2931 
2932 	struct iocb	iocb;
2933 
2934 	/* Suppress valgrind warning. */
2935 	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
2936 	memset(&iocb, 0x0, sizeof(iocb));
2937 
2938 	struct iocb*	p_iocb = &iocb;
2939 
2940 	if (!srv_read_only_mode) {
2941 
2942 		io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
2943 
2944 	} else {
2945 		ut_a(UNIV_PAGE_SIZE >= 512);
2946 		io_prep_pread(p_iocb, fd, ptr, 512, 0);
2947 	}
2948 
2949 	int	err = io_submit(io_ctx, 1, &p_iocb);
2950 
2951 	if (err >= 1) {
2952 		/* Now collect the submitted IO request. */
2953 		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2954 	}
2955 
2956 	ut_free(buf);
2957 	close(fd);
2958 
2959 	switch (err) {
2960 	case 1:
2961 		return(true);
2962 
2963 	case -EINVAL:
2964 	case -ENOSYS:
2965 		ib::error()
2966 			<< "Linux Native AIO not supported. You can either"
2967 			" move "
2968 			<< (srv_read_only_mode ? name : "tmpdir")
2969 			<< " to a file system that supports native"
2970 			" AIO or you can set innodb_use_native_aio to"
2971 			" FALSE to avoid this message.";
2972 
2973 		/* fall through. */
2974 	default:
2975 		ib::error()
2976 			<< "Linux Native AIO check on "
2977 			<< (srv_read_only_mode ? name : "tmpdir")
2978 			<< "returned error[" << -err << "]";
2979 	}
2980 
2981 	return(false);
2982 }
2983 
2984 #endif /* LINUX_NATIVE_AIO */
2985 
2986 /** Retrieves the last error number if an error occurs in a file io function.
2987 The number should be retrieved before any other OS calls (because they may
2988 overwrite the error number). If the number is not known to this program,
2989 the OS error number + 100 is returned.
2990 @param[in]	report_all_errors	true if we want an error message
2991 					printed of all errors
2992 @param[in]	on_error_silent		true then don't print any diagnostic
2993 					to the log
2994 @return error number, or OS error number + 100 */
2995 static
2996 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2997 os_file_get_last_error_low(
2998 	bool	report_all_errors,
2999 	bool	on_error_silent)
3000 {
3001 	int	err = errno;
3002 
3003 	if (err == 0) {
3004 		return(0);
3005 	}
3006 
3007 	if (report_all_errors
3008 	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
3009 
3010 		ib::error()
3011 			<< "Operating system error number "
3012 			<< err
3013 			<< " in a file operation.";
3014 
3015 		if (err == ENOENT) {
3016 
3017 			ib::error()
3018 				<< "The error means the system"
3019 				" cannot find the path specified.";
3020 
3021 			if (srv_is_being_started) {
3022 
3023 				ib::error()
3024 					<< "If you are installing InnoDB,"
3025 					" remember that you must create"
3026 					" directories yourself, InnoDB"
3027 					" does not create them.";
3028 			}
3029 		} else if (err == EACCES) {
3030 
3031 			ib::error()
3032 				<< "The error means mysqld does not have"
3033 				" the access rights to the directory.";
3034 
3035 		} else {
3036 			if (strerror(err) != NULL) {
3037 
3038 				ib::error()
3039 					<< "Error number " << err << " means '"
3040 					<< strerror(err) << "'";
3041 			}
3042 
3043 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3044 		}
3045 	}
3046 
3047 	switch (err) {
3048 	case ENOSPC:
3049 		return(OS_FILE_DISK_FULL);
3050 	case ENOENT:
3051 		return(OS_FILE_NOT_FOUND);
3052 	case EEXIST:
3053 		return(OS_FILE_ALREADY_EXISTS);
3054 	case EXDEV:
3055 	case ENOTDIR:
3056 	case EISDIR:
3057 		return(OS_FILE_PATH_ERROR);
3058 	case EAGAIN:
3059 		if (srv_use_native_aio) {
3060 			return(OS_FILE_AIO_RESOURCES_RESERVED);
3061 		}
3062 		break;
3063 	case EINTR:
3064 		if (srv_use_native_aio) {
3065 			return(OS_FILE_AIO_INTERRUPTED);
3066 		}
3067 		break;
3068 	case EACCES:
3069 		return(OS_FILE_ACCESS_VIOLATION);
3070 	}
3071 	return(OS_FILE_ERROR_MAX + err);
3072 }
3073 
3074 /** Wrapper to fsync(2) that retries the call on some errors.
3075 Returns the value 0 if successful; otherwise the value -1 is returned and
3076 the global variable errno is set to indicate the error.
3077 @param[in]	file		open file handle
3078 @return 0 if success, -1 otherwise */
3079 static
3080 int
os_file_fsync_posix(os_file_t file)3081 os_file_fsync_posix(
3082 	os_file_t	file)
3083 {
3084 	ulint		failures = 0;
3085 
3086 	for (;;) {
3087 
3088 		++os_n_fsyncs;
3089 
3090 		int	ret = fsync(file);
3091 
3092 		if (ret == 0) {
3093 			return(ret);
3094 		}
3095 
3096 		switch(errno) {
3097 		case ENOLCK:
3098 
3099 			++failures;
3100 			ut_a(failures < 1000);
3101 
3102 			if (!(failures % 100)) {
3103 
3104 				ib::warn()
3105 					<< "fsync(): "
3106 					<< "No locks available; retrying";
3107 			}
3108 
3109 			/* 0.2 sec */
3110 			os_thread_sleep(200000);
3111 			break;
3112 
3113 		case EIO:
3114 
3115                         ib::fatal()
3116 				<< "fsync() returned EIO, aborting.";
3117 			break;
3118 
3119 		case EINTR:
3120 
3121 			++failures;
3122 			ut_a(failures < 2000);
3123 			break;
3124 
3125 		default:
3126 			ut_error;
3127 			break;
3128 		}
3129 	}
3130 
3131 	ut_error;
3132 
3133 	return(-1);
3134 }
3135 
3136 /** Check the existence and type of the given file.
3137 @param[in]	path		path name of file
3138 @param[out]	exists		true if the file exists
3139 @param[out]	type		Type of the file, if it exists
3140 @return true if call succeeded */
3141 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)3142 os_file_status_posix(
3143 	const char*	path,
3144 	bool*		exists,
3145 	os_file_type_t* type)
3146 {
3147 	struct stat	statinfo;
3148 
3149 	int	ret = stat(path, &statinfo);
3150 
3151 	*exists = !ret;
3152 
3153 	if (!ret) {
3154 		/* file exists, everything OK */
3155 
3156 	} else if (errno == ENOENT || errno == ENOTDIR
3157 		   || errno == ENAMETOOLONG) {
3158 		/* file does not exist */
3159 		return(true);
3160 
3161 	} else {
3162 		/* file exists, but stat call failed */
3163 		os_file_handle_error_no_exit(path, "stat", false);
3164 		return(false);
3165 	}
3166 
3167 	if (S_ISDIR(statinfo.st_mode)) {
3168 		*type = OS_FILE_TYPE_DIR;
3169 
3170 	} else if (S_ISLNK(statinfo.st_mode)) {
3171 		*type = OS_FILE_TYPE_LINK;
3172 
3173 	} else if (S_ISREG(statinfo.st_mode)) {
3174 		*type = OS_FILE_TYPE_FILE;
3175 
3176 	} else {
3177 		*type = OS_FILE_TYPE_UNKNOWN;
3178 	}
3179 
3180 	return(true);
3181 }
3182 
3183 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3184 function!
3185 Flushes the write buffers of a given file to the disk.
3186 @param[in]	file		handle to a file
3187 @return true if success */
3188 bool
os_file_flush_func(os_file_t file)3189 os_file_flush_func(
3190 	os_file_t	file)
3191 {
3192 	int	ret;
3193 
3194 	ret = os_file_fsync_posix(file);
3195 
3196 	if (ret == 0) {
3197 		return(true);
3198 	}
3199 
3200 	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
3201 	we choose to ignore that error if we are using raw disks */
3202 
3203 	if (srv_start_raw_disk_in_use && errno == EINVAL) {
3204 
3205 		return(true);
3206 	}
3207 
3208 	ib::error() << "The OS said file flush did not succeed";
3209 
3210 	os_file_handle_error(NULL, "flush");
3211 
3212 	/* It is a fatal error if a file flush does not succeed, because then
3213 	the database can get corrupt on disk */
3214 	ut_error;
3215 
3216 	return(false);
3217 }
3218 
3219 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3220 this function!
3221 A simple function to open or create a file.
3222 @param[in]	name		name of the file or path as a null-terminated
3223 				string
3224 @param[in]	create_mode	create mode
3225 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3226 @param[in]	read_only	if true, read only checks are enforced
3227 @param[out]	success		true if succeed, false if error
3228 @return handle to the file, not defined if error, error number
3229 	can be retrieved with os_file_get_last_error */
3230 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3231 os_file_create_simple_func(
3232 	const char*	name,
3233 	ulint		create_mode,
3234 	ulint		access_type,
3235 	bool		read_only,
3236 	bool*		success)
3237 {
3238 	pfs_os_file_t	file;
3239 
3240 	*success = false;
3241 
3242 	int		create_flag;
3243 
3244 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3245 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3246 
3247 	if (create_mode == OS_FILE_OPEN) {
3248 
3249 		if (access_type == OS_FILE_READ_ONLY) {
3250 
3251 			create_flag = O_RDONLY;
3252 
3253 		} else if (read_only) {
3254 
3255 			create_flag = O_RDONLY;
3256 
3257 		} else {
3258 			create_flag = O_RDWR;
3259 		}
3260 
3261 	} else if (read_only) {
3262 
3263 		create_flag = O_RDONLY;
3264 
3265 	} else if (create_mode == OS_FILE_CREATE) {
3266 
3267 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3268 
3269 	} else if (create_mode == OS_FILE_CREATE_PATH) {
3270 
3271 		/* Create subdirs along the path if needed. */
3272 
3273 		*success = os_file_create_subdirs_if_needed(name);
3274 
3275 		if (!*success) {
3276 
3277 			ib::error()
3278 				<< "Unable to create subdirectories '"
3279 				<< name << "'";
3280 
3281 			file.m_file = OS_FILE_CLOSED;
3282 			return(file);
3283 		}
3284 
3285 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3286 		create_mode = OS_FILE_CREATE;
3287 	} else {
3288 
3289 		ib::error()
3290 			<< "Unknown file create mode ("
3291 			<< create_mode
3292 			<< " for file '" << name << "'";
3293 
3294 		file.m_file = OS_FILE_CLOSED;
3295 		return(file);
3296 	}
3297 
3298 	bool	retry;
3299 
3300 	do {
3301 		file.m_file = ::open(name, create_flag, os_innodb_umask);
3302 
3303 		if (file.m_file == -1) {
3304 			*success = false;
3305 
3306 			retry = os_file_handle_error(
3307 				name,
3308 				create_mode == OS_FILE_OPEN
3309 				? "open" : "create");
3310 		} else {
3311 			*success = true;
3312 			retry = false;
3313 		}
3314 
3315 	} while (retry);
3316 
3317 #ifdef USE_FILE_LOCK
3318 	if (!read_only
3319 	    && *success
3320 	    && access_type == OS_FILE_READ_WRITE
3321 	    && os_file_lock(file.m_file, name)) {
3322 
3323 		*success = false;
3324 		close(file.m_file);
3325 		file.m_file = -1;
3326 	}
3327 #endif /* USE_FILE_LOCK */
3328 
3329 	return(file);
3330 }
3331 
3332 /** This function attempts to create a directory named pathname. The new
3333 directory gets default permissions. On Unix the permissions are
3334 (0770 & ~umask). If the directory exists already, nothing is done and
3335 the call succeeds, unless the fail_if_exists arguments is true.
3336 If another error occurs, such as a permission error, this does not crash,
3337 but reports the error and returns false.
3338 @param[in]	pathname	directory name as null-terminated string
3339 @param[in]	fail_if_exists	if true, pre-existing directory is treated as
3340 				an error.
3341 @return true if call succeeds, false on error */
3342 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3343 os_file_create_directory(
3344 	const char*	pathname,
3345 	bool		fail_if_exists)
3346 {
3347 	int	rcode = mkdir(pathname, 0770);
3348 
3349 	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3350 		/* failure */
3351 		os_file_handle_error_no_exit(pathname, "mkdir", false);
3352 
3353 		return(false);
3354 	}
3355 
3356 	return(true);
3357 }
3358 
3359 /**
3360 The os_file_opendir() function opens a directory stream corresponding to the
3361 directory named by the dirname argument. The directory stream is positioned
3362 at the first entry. In both Unix and Windows we automatically skip the '.'
3363 and '..' items at the start of the directory listing.
3364 @param[in]	dirname		directory name; it must not contain a trailing
3365 				'\' or '/'
3366 @param[in]	is_fatal	true if we should treat an error as a fatal
3367 				error; if we try to open symlinks then we do
3368 				not wish a fatal error if it happens not to be
3369 				a directory
3370 @return directory stream, NULL if error */
3371 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3372 os_file_opendir(
3373 	const char*	dirname,
3374 	bool		error_is_fatal)
3375 {
3376 	os_file_dir_t		dir;
3377 	dir = opendir(dirname);
3378 
3379 	if (dir == NULL && error_is_fatal) {
3380 		os_file_handle_error(dirname, "opendir");
3381 	}
3382 
3383 	return(dir);
3384 }
3385 
3386 /** Closes a directory stream.
3387 @param[in]	dir		directory stream
3388 @return 0 if success, -1 if failure */
3389 int
os_file_closedir(os_file_dir_t dir)3390 os_file_closedir(
3391 	os_file_dir_t	dir)
3392 {
3393 	int	ret = closedir(dir);
3394 
3395 	if (ret != 0) {
3396 		os_file_handle_error_no_exit(NULL, "closedir", false);
3397 	}
3398 
3399 	return(ret);
3400 }
3401 
3402 /** This function returns information of the next file in the directory. We jump
3403 over the '.' and '..' entries in the directory.
3404 @param[in]	dirname		directory name or path
3405 @param[in]	dir		directory stream
3406 @param[out]	info		buffer where the info is returned
3407 @return 0 if ok, -1 if error, 1 if at the end of the directory */
3408 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)3409 os_file_readdir_next_file(
3410 	const char*	dirname,
3411 	os_file_dir_t	dir,
3412 	os_file_stat_t*	info)
3413 {
3414 	struct dirent*	ent;
3415 	char*		full_path;
3416 	int		ret;
3417 	struct stat	statinfo;
3418 
3419 #ifdef HAVE_READDIR_R
3420 	char		dirent_buf[sizeof(struct dirent)
3421 				   + _POSIX_PATH_MAX + 100];
3422 	/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
3423 	the max file name len; but in most standards, the
3424 	length is NAME_MAX; we add 100 to be even safer */
3425 #endif /* HAVE_READDIR_R */
3426 
3427 next_file:
3428 
3429 #ifdef HAVE_READDIR_R
3430 	ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
3431 
3432 	if (ret != 0) {
3433 
3434 		ib::error()
3435 			<< "Cannot read directory " << dirname
3436 			<< " error: " << ret;
3437 
3438 		return(-1);
3439 	}
3440 
3441 	if (ent == NULL) {
3442 		/* End of directory */
3443 
3444 		return(1);
3445 	}
3446 
3447 	ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
3448 #else
3449 	ent = readdir(dir);
3450 
3451 	if (ent == NULL) {
3452 
3453 		return(1);
3454 	}
3455 #endif /* HAVE_READDIR_R */
3456 	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
3457 
3458 	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
3459 
3460 		goto next_file;
3461 	}
3462 
3463 	strcpy(info->name, ent->d_name);
3464 
3465 	full_path = static_cast<char*>(
3466 		ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
3467 
3468 	sprintf(full_path, "%s/%s", dirname, ent->d_name);
3469 
3470 	ret = stat(full_path, &statinfo);
3471 
3472 	if (ret) {
3473 
3474 		if (errno == ENOENT) {
3475 			/* readdir() returned a file that does not exist,
3476 			it must have been deleted in the meantime. Do what
3477 			would have happened if the file was deleted before
3478 			readdir() - ignore and go to the next entry.
3479 			If this is the last entry then info->name will still
3480 			contain the name of the deleted file when this
3481 			function returns, but this is not an issue since the
3482 			caller shouldn't be looking at info when end of
3483 			directory is returned. */
3484 
3485 			ut_free(full_path);
3486 
3487 			goto next_file;
3488 		}
3489 
3490 		os_file_handle_error_no_exit(full_path, "stat", false);
3491 
3492 		ut_free(full_path);
3493 
3494 		return(-1);
3495 	}
3496 
3497 	info->size = statinfo.st_size;
3498 
3499 	if (S_ISDIR(statinfo.st_mode)) {
3500 		info->type = OS_FILE_TYPE_DIR;
3501 	} else if (S_ISLNK(statinfo.st_mode)) {
3502 		info->type = OS_FILE_TYPE_LINK;
3503 	} else if (S_ISREG(statinfo.st_mode)) {
3504 		info->type = OS_FILE_TYPE_FILE;
3505 	} else {
3506 		info->type = OS_FILE_TYPE_UNKNOWN;
3507 	}
3508 
3509 	ut_free(full_path);
3510 
3511 	return(0);
3512 }
3513 
3514 /** NOTE! Use the corresponding macro os_file_create(), not directly
3515 this function!
3516 Opens an existing file or creates a new.
3517 @param[in]	name		name of the file or path as a null-terminated
3518 				string
3519 @param[in]	create_mode	create mode
3520 @param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
3521 				is desired, OS_FILE_NORMAL, if any normal file;
3522 				NOTE that it also depends on type, os_aio_..
3523 				and srv_.. variables whether we really use async
3524 				I/O or unbuffered I/O: look in the function
3525 				source code for the exact rules
3526 @param[in]	type		OS_DATA_FILE or OS_LOG_FILE
3527 @param[in]	read_only	true, if read only checks should be enforcedm
3528 @param[in]	success		true if succeeded
3529 @return handle to the file, not defined if error, error number
3530 	can be retrieved with os_file_get_last_error */
3531 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)3532 os_file_create_func(
3533 	const char*	name,
3534 	ulint		create_mode,
3535 	ulint		purpose,
3536 	ulint		type,
3537 	bool		read_only,
3538 	bool*		success)
3539 {
3540 	bool		on_error_no_exit;
3541 	bool		on_error_silent;
3542 	pfs_os_file_t	file;
3543 
3544 	*success = false;
3545 
3546 	DBUG_EXECUTE_IF(
3547 		"ib_create_table_fail_disk_full",
3548 		*success = false;
3549 		errno = ENOSPC;
3550 		file.m_file = OS_FILE_CLOSED;
3551 		return(file);
3552 	);
3553 
3554 	int		create_flag;
3555 	const char*	mode_str	= NULL;
3556 
3557 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
3558 		? true : false;
3559 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
3560 		? true : false;
3561 
3562 	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
3563 	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
3564 
3565 	if (create_mode == OS_FILE_OPEN
3566 	    || create_mode == OS_FILE_OPEN_RAW
3567 	    || create_mode == OS_FILE_OPEN_RETRY) {
3568 
3569 		mode_str = "OPEN";
3570 
3571 		create_flag = read_only ? O_RDONLY : O_RDWR;
3572 
3573 	} else if (read_only) {
3574 
3575 		mode_str = "OPEN";
3576 
3577 		create_flag = O_RDONLY;
3578 
3579 	} else if (create_mode == OS_FILE_CREATE) {
3580 
3581 		mode_str = "CREATE";
3582 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3583 
3584 	} else if (create_mode == OS_FILE_OVERWRITE) {
3585 
3586 		mode_str = "OVERWRITE";
3587 		create_flag = O_RDWR | O_CREAT | O_TRUNC;
3588 
3589 	} else {
3590 		ib::error()
3591 			<< "Unknown file create mode (" << create_mode << ")"
3592 			<< " for file '" << name << "'";
3593 
3594 		file.m_file = OS_FILE_CLOSED;
3595 		return(file);
3596 	}
3597 
3598 	ut_a(type == OS_LOG_FILE
3599 	     || type == OS_DATA_FILE
3600 	     || type == OS_DATA_TEMP_FILE);
3601 
3602 	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
3603 
3604 #ifdef O_SYNC
3605 	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
3606 	O_SYNC because the datasync options seemed to corrupt files in 2001
3607 	in both Linux and Solaris */
3608 
3609 	if (!read_only
3610 	    && type == OS_LOG_FILE
3611 	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
3612 
3613 		create_flag |= O_SYNC;
3614 	}
3615 #endif /* O_SYNC */
3616 
3617 	bool		retry;
3618 
3619 	do {
3620 		file.m_file = ::open(name, create_flag, os_innodb_umask);
3621 
3622 		if (file.m_file == -1) {
3623 			const char*	operation;
3624 
3625 			operation = (create_mode == OS_FILE_CREATE
3626 				     && !read_only) ? "create" : "open";
3627 
3628 			*success = false;
3629 
3630 			if (on_error_no_exit) {
3631 				retry = os_file_handle_error_no_exit(
3632 					name, operation, on_error_silent);
3633 			} else {
3634 				retry = os_file_handle_error(name, operation);
3635 			}
3636 		} else {
3637 			*success = true;
3638 			retry = false;
3639 		}
3640 
3641 	} while (retry);
3642 
3643 	/* We disable OS caching (O_DIRECT) only on data files */
3644 
3645 	if (!read_only
3646 	    && *success
3647 	    && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE)
3648 	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
3649 		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
3650 
3651 		os_file_set_nocache(file.m_file, name, mode_str);
3652 	} else if (!read_only
3653 	    && *success
3654 	    && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
3655 
3656 		os_file_set_nocache(file.m_file, name, mode_str);
3657 	}
3658 
3659 #ifdef USE_FILE_LOCK
3660 	if (!read_only
3661 	    && *success
3662 	    && create_mode != OS_FILE_OPEN_RAW
3663 	    && os_file_lock(file.m_file, name)) {
3664 
3665 		if (create_mode == OS_FILE_OPEN_RETRY) {
3666 
3667 			ib::info()
3668 				<< "Retrying to lock the first data file";
3669 
3670 			for (int i = 0; i < 100; i++) {
3671 				os_thread_sleep(1000000);
3672 
3673 				if (!os_file_lock(file.m_file, name)) {
3674 					*success = true;
3675 					return(file);
3676 				}
3677 			}
3678 
3679 			ib::info()
3680 				<< "Unable to open the first data file";
3681 		}
3682 
3683 		*success = false;
3684 		close(file.m_file);
3685 		file.m_file = -1;
3686 	}
3687 #endif /* USE_FILE_LOCK */
3688 
3689 	return(file);
3690 }
3691 
3692 /** NOTE! Use the corresponding macro
3693 os_file_create_simple_no_error_handling(), not directly this function!
3694 A simple function to open or create a file.
3695 @param[in]	name		name of the file or path as a null-terminated
3696 				string
3697 @param[in]	create_mode	create mode
3698 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3699 				OS_FILE_READ_ALLOW_DELETE; the last option
3700 				is used by a backup program reading the file
3701 @param[in]	read_only	if true read only mode checks are enforced
3702 @param[out]	success		true if succeeded
3703 @return own: handle to the file, not defined if error, error number
3704 	can be retrieved with os_file_get_last_error */
3705 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3706 os_file_create_simple_no_error_handling_func(
3707 	const char*	name,
3708 	ulint		create_mode,
3709 	ulint		access_type,
3710 	bool		read_only,
3711 	bool*		success)
3712 {
3713 	pfs_os_file_t	file;
3714 	int		create_flag;
3715 
3716 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3717 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3718 
3719 	*success = false;
3720 
3721 	if (create_mode == OS_FILE_OPEN) {
3722 
3723 		if (access_type == OS_FILE_READ_ONLY) {
3724 
3725 			create_flag = O_RDONLY;
3726 
3727 		} else if (read_only) {
3728 
3729 			create_flag = O_RDONLY;
3730 
3731 		} else {
3732 
3733 			ut_a(access_type == OS_FILE_READ_WRITE
3734 			     || access_type == OS_FILE_READ_ALLOW_DELETE);
3735 
3736 			create_flag = O_RDWR;
3737 		}
3738 
3739 	} else if (read_only) {
3740 
3741 		create_flag = O_RDONLY;
3742 
3743 	} else if (create_mode == OS_FILE_CREATE) {
3744 
3745 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3746 
3747 	} else {
3748 
3749 		ib::error()
3750 			<< "Unknown file create mode "
3751 			<< create_mode << " for file '" << name << "'";
3752 		file.m_file = OS_FILE_CLOSED;
3753 		return(file);
3754 	}
3755 
3756 	file.m_file = ::open(name, create_flag, os_innodb_umask);
3757 
3758 	*success = (file.m_file != -1);
3759 
3760 #ifdef USE_FILE_LOCK
3761 	if (!read_only
3762 	    && *success
3763 	    && access_type == OS_FILE_READ_WRITE
3764 	    && os_file_lock(file.m_file, name)) {
3765 
3766 		*success = false;
3767 		close(file.m_file);
3768 		file.m_file = -1;
3769 
3770 	}
3771 #endif /* USE_FILE_LOCK */
3772 
3773 	return(file);
3774 }
3775 
3776 /** Deletes a file if it exists. The file has to be closed before calling this.
3777 @param[in]	name		file path as a null-terminated string
3778 @param[out]	exist		indicate if file pre-exist
3779 @return true if success */
3780 bool
os_file_delete_if_exists_func(const char * name,bool * exist)3781 os_file_delete_if_exists_func(
3782 	const char*	name,
3783 	bool*		exist)
3784 {
3785 	if (exist != NULL) {
3786 		*exist = true;
3787 	}
3788 
3789 	int	ret = unlink(name);
3790 
3791 	if (ret != 0 && errno == ENOENT) {
3792 		if (exist != NULL) {
3793 			*exist = false;
3794 		}
3795 	} else if (ret != 0 && errno != ENOENT) {
3796 		os_file_handle_error_no_exit(name, "delete", false);
3797 
3798 		return(false);
3799 	}
3800 
3801 	return(true);
3802 }
3803 
3804 /** Deletes a file. The file has to be closed before calling this.
3805 @param[in]	name		file path as a null-terminated string
3806 @return true if success */
3807 bool
os_file_delete_func(const char * name)3808 os_file_delete_func(
3809 	const char*	name)
3810 {
3811 	int	ret = unlink(name);
3812 
3813 	if (ret != 0) {
3814 		os_file_handle_error_no_exit(name, "delete", false);
3815 
3816 		return(false);
3817 	}
3818 
3819 	return(true);
3820 }
3821 
3822 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3823 function!
3824 Renames a file (can also move it to another directory). It is safest that the
3825 file is closed before calling this function.
3826 @param[in]	oldpath		old file path as a null-terminated string
3827 @param[in]	newpath		new file path
3828 @return true if success */
3829 bool
os_file_rename_func(const char * oldpath,const char * newpath)3830 os_file_rename_func(
3831 	const char*	oldpath,
3832 	const char*	newpath)
3833 {
3834 #ifdef UNIV_DEBUG
3835 	os_file_type_t	type;
3836 	bool		exists;
3837 
3838 	/* New path must not exist. */
3839 	ut_ad(os_file_status(newpath, &exists, &type));
3840 	ut_ad(!exists);
3841 
3842 	/* Old path must exist. */
3843 	ut_ad(os_file_status(oldpath, &exists, &type));
3844 	ut_ad(exists);
3845 #endif /* UNIV_DEBUG */
3846 
3847 	int	ret = rename(oldpath, newpath);
3848 
3849 	if (ret != 0) {
3850 		os_file_handle_error_no_exit(oldpath, "rename", false);
3851 
3852 		return(false);
3853 	}
3854 
3855 	return(true);
3856 }
3857 
3858 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3859 function!
3860 Closes a file handle. In case of error, error number can be retrieved with
3861 os_file_get_last_error.
3862 @param[in]	file		Handle to close
3863 @return true if success */
3864 bool
os_file_close_func(os_file_t file)3865 os_file_close_func(
3866 	os_file_t	file)
3867 {
3868 	int	ret = close(file);
3869 
3870 	if (ret == -1) {
3871 		os_file_handle_error(NULL, "close");
3872 
3873 		return(false);
3874 	}
3875 
3876 	return(true);
3877 }
3878 
3879 /** Gets a file size.
3880 @param[in]	file		handle to an open file
3881 @return file size, or (os_offset_t) -1 on failure */
3882 os_offset_t
os_file_get_size(pfs_os_file_t file)3883 os_file_get_size(
3884 	pfs_os_file_t	file)
3885 {
3886 	/* Store current position */
3887 	os_offset_t	pos = lseek(file.m_file, 0, SEEK_CUR);
3888 	os_offset_t	file_size = lseek(file.m_file, 0, SEEK_END);
3889 
3890 	/* Restore current position as the function should not change it */
3891 	lseek(file.m_file, pos, SEEK_SET);
3892 
3893 	return(file_size);
3894 }
3895 
3896 /** Gets a file size.
3897 @param[in]	filename	Full path to the filename to check
3898 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3899 	errno */
3900 os_file_size_t
os_file_get_size(const char * filename)3901 os_file_get_size(
3902 	const char*	filename)
3903 {
3904 	struct stat	s;
3905 	os_file_size_t	file_size;
3906 
3907 	int	ret = stat(filename, &s);
3908 
3909 	if (ret == 0) {
3910 		file_size.m_total_size = s.st_size;
3911 		/* st_blocks is in 512 byte sized blocks */
3912 		file_size.m_alloc_size = s.st_blocks * 512;
3913 	} else {
3914 		file_size.m_total_size = ~0;
3915 		file_size.m_alloc_size = (os_offset_t) errno;
3916 	}
3917 
3918 	return(file_size);
3919 }
3920 
3921 /** This function returns information about the specified file
3922 @param[in]	path		pathname of the file
3923 @param[out]	stat_info	information of a file in a directory
3924 @param[in,out]	statinfo	information of a file in a directory
3925 @param[in]	check_rw_perm	for testing whether the file can be opened
3926 				in RW mode
3927 @param[in]	read_only	if true read only mode checks are enforced
3928 @return DB_SUCCESS if all OK */
3929 static
3930 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3931 os_file_get_status_posix(
3932 	const char*	path,
3933 	os_file_stat_t* stat_info,
3934 	struct stat*	statinfo,
3935 	bool		check_rw_perm,
3936 	bool		read_only)
3937 {
3938 	int	ret = stat(path, statinfo);
3939 
3940 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3941 		/* file does not exist */
3942 
3943 		return(DB_NOT_FOUND);
3944 
3945 	} else if (ret) {
3946 		/* file exists, but stat call failed */
3947 
3948 		os_file_handle_error_no_exit(path, "stat", false);
3949 
3950 		return(DB_FAIL);
3951 	}
3952 
3953 	switch (statinfo->st_mode & S_IFMT) {
3954 	case S_IFDIR:
3955 		stat_info->type = OS_FILE_TYPE_DIR;
3956 		break;
3957 	case S_IFLNK:
3958 		stat_info->type = OS_FILE_TYPE_LINK;
3959 		break;
3960 	case S_IFBLK:
3961 		/* Handle block device as regular file. */
3962 	case S_IFCHR:
3963 		/* Handle character device as regular file. */
3964 	case S_IFREG:
3965 		stat_info->type = OS_FILE_TYPE_FILE;
3966 		break;
3967 	default:
3968 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
3969 	}
3970 
3971 	stat_info->size = statinfo->st_size;
3972 	stat_info->block_size = statinfo->st_blksize;
3973 	stat_info->alloc_size = statinfo->st_blocks * 512;
3974 
3975 	if (check_rw_perm
3976 	    && (stat_info->type == OS_FILE_TYPE_FILE
3977 		|| stat_info->type == OS_FILE_TYPE_BLOCK)) {
3978 
3979 		int	access = !read_only ? O_RDWR : O_RDONLY;
3980 		int	fh = ::open(path, access, os_innodb_umask);
3981 
3982 		if (fh == -1) {
3983 			stat_info->rw_perm = false;
3984 		} else {
3985 			stat_info->rw_perm = true;
3986 			close(fh);
3987 		}
3988 	}
3989 
3990 	return(DB_SUCCESS);
3991 }
3992 
3993 /** Truncates a file to a specified size in bytes.
3994 Do nothing if the size to preserve is greater or equal to the current
3995 size of the file.
3996 @param[in]	pathname	file path
3997 @param[in]	file		file to be truncated
3998 @param[in]	size		size to preserve in bytes
3999 @return true if success */
4000 static
4001 bool
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)4002 os_file_truncate_posix(
4003 	const char*	pathname,
4004 	pfs_os_file_t	file,
4005 	os_offset_t	size)
4006 {
4007 	int     res = ftruncate(file.m_file, size);
4008 	if (res == -1) {
4009 
4010 		bool	retry;
4011 
4012 		retry = os_file_handle_error_no_exit(
4013 			pathname, "truncate", false);
4014 
4015 		if (retry) {
4016 			ib::warn()
4017 				<< "Truncate failed for '"
4018 				<< pathname << "'";
4019 		}
4020 	}
4021 
4022 	return(res == 0);
4023 }
4024 
/** Truncates a file at its current position, i.e. at the offset reported by
ftell() for the stream.
@param[in]	file		file to be truncated
@return true if success */
bool
os_file_set_eof(
	FILE*		file)
{
	const int	fd = fileno(file);
	const long	pos = ftell(file);

	return(ftruncate(fd, pos) == 0);
}
4033 
4034 #ifdef UNIV_HOTBACKUP
4035 /** Closes a file handle.
4036 @param[in]	file		Handle to a file
4037 @return true if success */
4038 bool
os_file_close_no_error_handling(os_file_t file)4039 os_file_close_no_error_handling(
4040 	os_file_t	file)
4041 {
4042 	return(close(file) != -1);
4043 }
4044 #endif /* UNIV_HOTBACKUP */
4045 
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
This variant of the function has an empty body. */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	/* Intentionally empty: no op on non Windows */
}
4055 
4056 #else /* !_WIN32 */
4057 
4058 #include <WinIoCtl.h>
4059 
4060 /** Do the read/write
4061 @param[in]	request	The IO context and type
4062 @return the number of bytes read/written or negative value on error */
4063 ssize_t
execute(const IORequest & request)4064 SyncFileIO::execute(const IORequest& request)
4065 {
4066 	OVERLAPPED	seek;
4067 
4068 	memset(&seek, 0x0, sizeof(seek));
4069 
4070 	seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
4071 	seek.OffsetHigh = (DWORD) (m_offset >> 32);
4072 
4073 	BOOL	ret;
4074 	DWORD	n_bytes;
4075 
4076 	if (request.is_read()) {
4077 		ret = ReadFile(m_fh, m_buf,
4078 			static_cast<DWORD>(m_n), &n_bytes, &seek);
4079 
4080 	} else {
4081 		ut_ad(request.is_write());
4082 		ret = WriteFile(m_fh, m_buf,
4083 			static_cast<DWORD>(m_n), &n_bytes, &seek);
4084 	}
4085 
4086 	return(ret ? static_cast<ssize_t>(n_bytes) : -1);
4087 }
4088 
4089 /** Do the read/write
4090 @param[in,out]	slot	The IO slot, it has the IO context
4091 @return the number of bytes read/written or negative value on error */
4092 ssize_t
execute(Slot * slot)4093 SyncFileIO::execute(Slot* slot)
4094 {
4095 	BOOL	ret;
4096 
4097 	if (slot->type.is_read()) {
4098 		ret = ReadFile(
4099 			slot->file.m_file, slot->ptr, slot->len,
4100 			&slot->n_bytes, &slot->control);
4101 	} else {
4102 		ut_ad(slot->type.is_write());
4103 		ret = WriteFile(
4104 			slot->file.m_file, slot->ptr, slot->len,
4105 			&slot->n_bytes, &slot->control);
4106 	}
4107 
4108 	return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
4109 }
4110 
4111 /** Check if the file system supports sparse files.
4112 @param[in]	 name		File name
4113 @return true if the file system supports sparse files */
4114 static
4115 bool
os_is_sparse_file_supported_win32(const char * filename)4116 os_is_sparse_file_supported_win32(const char* filename)
4117 {
4118 	char	volname[MAX_PATH];
4119 	BOOL	result = GetVolumePathName(filename, volname, MAX_PATH);
4120 
4121 	if (!result) {
4122 
4123 		ib::error()
4124 			<< "os_is_sparse_file_supported: "
4125 			<< "Failed to get the volume path name for: "
4126 			<< filename
4127 			<< "- OS error number " << GetLastError();
4128 
4129 		return(false);
4130 	}
4131 
4132 	DWORD	flags;
4133 
4134 	GetVolumeInformation(
4135 		volname, NULL, MAX_PATH, NULL, NULL,
4136 		&flags, NULL, MAX_PATH);
4137 
4138 	return(flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
4139 }
4140 
4141 /** Free storage space associated with a section of the file.
4142 @param[in]	fh		Open file handle
4143 @param[in]	page_size	Tablespace page size
4144 @param[in]	block_size	File system block size
4145 @param[in]	off		Starting offset (SEEK_SET)
4146 @param[in]	len		Size of the hole
4147 @return 0 on success or errno */
4148 static
4149 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)4150 os_file_punch_hole_win32(
4151 	os_file_t	fh,
4152 	os_offset_t	off,
4153 	os_offset_t	len)
4154 {
4155 	FILE_ZERO_DATA_INFORMATION	punch;
4156 
4157 	punch.FileOffset.QuadPart = off;
4158 	punch.BeyondFinalZero.QuadPart = off + len;
4159 
4160 	/* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
4161 	therefore we pass a dummy parameter. */
4162 	DWORD	temp;
4163 
4164 	BOOL	result = DeviceIoControl(
4165 		fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
4166 		NULL, 0, &temp, NULL);
4167 
4168 	return(!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
4169 }
4170 
4171 /** Check the existence and type of the given file.
4172 @param[in]	path		path name of file
4173 @param[out]	exists		true if the file exists
4174 @param[out]	type		Type of the file, if it exists
4175 @return true if call succeeded */
4176 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)4177 os_file_status_win32(
4178 	const char*	path,
4179 	bool*		exists,
4180 	os_file_type_t* type)
4181 {
4182 	int		ret;
4183 	struct _stat64	statinfo;
4184 
4185 	ret = _stat64(path, &statinfo);
4186 
4187 	*exists = !ret;
4188 
4189 	if (!ret) {
4190 		/* file exists, everything OK */
4191 
4192 	} else if (errno == ENOENT || errno == ENOTDIR
4193 		  || errno == ENAMETOOLONG) {
4194 		/* file does not exist */
4195 		return(true);
4196 
4197 	} else {
4198 		/* file exists, but stat call failed */
4199 		os_file_handle_error_no_exit(path, "stat", false);
4200 		return(false);
4201 	}
4202 
4203 	if (_S_IFDIR & statinfo.st_mode) {
4204 		*type = OS_FILE_TYPE_DIR;
4205 
4206 	} else if (_S_IFREG & statinfo.st_mode) {
4207 		*type = OS_FILE_TYPE_FILE;
4208 
4209 	} else {
4210 		*type = OS_FILE_TYPE_UNKNOWN;
4211 	}
4212 
4213 	return(true);
4214 }
4215 
4216 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
4217 function!
4218 Flushes the write buffers of a given file to the disk.
4219 @param[in]	file		handle to a file
4220 @return true if success */
4221 bool
os_file_flush_func(os_file_t file)4222 os_file_flush_func(
4223 	os_file_t	file)
4224 {
4225 	++os_n_fsyncs;
4226 
4227 	BOOL	ret = FlushFileBuffers(file);
4228 
4229 	if (ret) {
4230 		return(true);
4231 	}
4232 
4233 	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
4234 	actually a raw device, we choose to ignore that error if we are using
4235 	raw disks */
4236 
4237 	if (srv_start_raw_disk_in_use && GetLastError()
4238 	    == ERROR_INVALID_FUNCTION) {
4239 		return(true);
4240 	}
4241 
4242 	os_file_handle_error(NULL, "flush");
4243 
4244 	/* It is a fatal error if a file flush does not succeed, because then
4245 	the database can get corrupt on disk */
4246 	ut_error;
4247 
4248 	return(false);
4249 }
4250 
4251 /** Retrieves the last error number if an error occurs in a file io function.
4252 The number should be retrieved before any other OS calls (because they may
4253 overwrite the error number). If the number is not known to this program,
4254 the OS error number + 100 is returned.
4255 @param[in]	report_all_errors	true if we want an error message printed
4256 					of all errors
4257 @param[in]	on_error_silent		true then don't print any diagnostic
4258 					to the log
4259 @return error number, or OS error number + 100 */
4260 static
4261 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)4262 os_file_get_last_error_low(
4263 	bool	report_all_errors,
4264 	bool	on_error_silent)
4265 {
4266 	ulint	err = (ulint) GetLastError();
4267 
4268 	if (err == ERROR_SUCCESS) {
4269 		return(0);
4270 	}
4271 
4272 	if (report_all_errors
4273 	    || (!on_error_silent
4274 		&& err != ERROR_DISK_FULL
4275 		&& err != ERROR_FILE_EXISTS)) {
4276 
4277 		ib::error()
4278 			<< "Operating system error number " << err
4279 			<< " in a file operation.";
4280 
4281 		if (err == ERROR_PATH_NOT_FOUND) {
4282 			ib::error()
4283 				<< "The error means the system"
4284 				" cannot find the path specified.";
4285 
4286 			if (srv_is_being_started) {
4287 				ib::error()
4288 					<< "If you are installing InnoDB,"
4289 					" remember that you must create"
4290 					" directories yourself, InnoDB"
4291 					" does not create them.";
4292 			}
4293 
4294 		} else if (err == ERROR_ACCESS_DENIED) {
4295 
4296 			ib::error()
4297 				<< "The error means mysqld does not have"
4298 				" the access rights to"
4299 				" the directory. It may also be"
4300 				" you have created a subdirectory"
4301 				" of the same name as a data file.";
4302 
4303 		} else if (err == ERROR_SHARING_VIOLATION
4304 			   || err == ERROR_LOCK_VIOLATION) {
4305 
4306 			ib::error()
4307 				<< "The error means that another program"
4308 				" is using InnoDB's files."
4309 				" This might be a backup or antivirus"
4310 				" software or another instance"
4311 				" of MySQL."
4312 				" Please close it to get rid of this error.";
4313 
4314 		} else if (err == ERROR_WORKING_SET_QUOTA
4315 			   || err == ERROR_NO_SYSTEM_RESOURCES) {
4316 
4317 			ib::error()
4318 				<< "The error means that there are no"
4319 				" sufficient system resources or quota to"
4320 				" complete the operation.";
4321 
4322 		} else if (err == ERROR_OPERATION_ABORTED) {
4323 
4324 			ib::error()
4325 				<< "The error means that the I/O"
4326 				" operation has been aborted"
4327 				" because of either a thread exit"
4328 				" or an application request."
4329 				" Retry attempt is made.";
4330 		} else {
4331 
4332 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4333 		}
4334 	}
4335 
4336 	if (err == ERROR_FILE_NOT_FOUND) {
4337 		return(OS_FILE_NOT_FOUND);
4338 	} else if (err == ERROR_DISK_FULL) {
4339 		return(OS_FILE_DISK_FULL);
4340 	} else if (err == ERROR_FILE_EXISTS) {
4341 		return(OS_FILE_ALREADY_EXISTS);
4342 	} else if (err == ERROR_SHARING_VIOLATION
4343 		   || err == ERROR_LOCK_VIOLATION) {
4344 		return(OS_FILE_SHARING_VIOLATION);
4345 	} else if (err == ERROR_WORKING_SET_QUOTA
4346 		   || err == ERROR_NO_SYSTEM_RESOURCES) {
4347 		return(OS_FILE_INSUFFICIENT_RESOURCE);
4348 	} else if (err == ERROR_OPERATION_ABORTED) {
4349 		return(OS_FILE_OPERATION_ABORTED);
4350 	} else if (err == ERROR_ACCESS_DENIED) {
4351 		return(OS_FILE_ACCESS_VIOLATION);
4352 	}
4353 
4354 	return(OS_FILE_ERROR_MAX + err);
4355 }
4356 
4357 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
4358 this function!
4359 A simple function to open or create a file.
4360 @param[in]	name		name of the file or path as a null-terminated
4361 				string
4362 @param[in]	create_mode	create mode
4363 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
4364 @param[in]	read_only	if true read only mode checks are enforced
4365 @param[out]	success		true if succeed, false if error
4366 @return handle to the file, not defined if error, error number
4367 	can be retrieved with os_file_get_last_error */
4368 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4369 os_file_create_simple_func(
4370 	const char*	name,
4371 	ulint		create_mode,
4372 	ulint		access_type,
4373 	bool		read_only,
4374 	bool*		success)
4375 {
4376 	pfs_os_file_t	file;
4377 
4378 	*success = false;
4379 
4380 	DWORD		access;
4381 	DWORD		create_flag;
4382 	DWORD		attributes = 0;
4383 
4384 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4385 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4386 
4387 	if (create_mode == OS_FILE_OPEN) {
4388 
4389 		create_flag = OPEN_EXISTING;
4390 
4391 	} else if (read_only) {
4392 
4393 		create_flag = OPEN_EXISTING;
4394 
4395 	} else if (create_mode == OS_FILE_CREATE) {
4396 
4397 		create_flag = CREATE_NEW;
4398 
4399 	} else if (create_mode == OS_FILE_CREATE_PATH) {
4400 
4401 		/* Create subdirs along the path if needed. */
4402 		*success = os_file_create_subdirs_if_needed(name);
4403 
4404 		if (!*success) {
4405 
4406 			ib::error()
4407 				<< "Unable to create subdirectories '"
4408 				<< name << "'";
4409 			file.m_file = OS_FILE_CLOSED;
4410 			return(file);
4411 		}
4412 
4413 		create_flag = CREATE_NEW;
4414 		create_mode = OS_FILE_CREATE;
4415 
4416 	} else {
4417 
4418 		ib::error()
4419 			<< "Unknown file create mode ("
4420 			<< create_mode << ") for file '"
4421 			<< name << "'";
4422 
4423 		file.m_file = OS_FILE_CLOSED;
4424 		return(file);
4425 	}
4426 
4427 	if (access_type == OS_FILE_READ_ONLY) {
4428 
4429 		access = GENERIC_READ;
4430 
4431 	} else if (read_only) {
4432 
4433 		ib::info()
4434 			<< "Read only mode set. Unable to"
4435 			" open file '" << name << "' in RW mode, "
4436 			<< "trying RO mode", name;
4437 
4438 		access = GENERIC_READ;
4439 
4440 	} else if (access_type == OS_FILE_READ_WRITE) {
4441 
4442 		access = GENERIC_READ | GENERIC_WRITE;
4443 
4444 	} else {
4445 
4446 		ib::error()
4447 			<< "Unknown file access type (" << access_type << ") "
4448 			"for file '" << name << "'";
4449 
4450 		file.m_file = OS_FILE_CLOSED;
4451 		return(file);
4452 	}
4453 
4454 	bool	retry;
4455 
4456 	do {
4457 		/* Use default security attributes and no template file. */
4458 
4459 		file.m_file = CreateFile(
4460 			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
4461 			create_flag, attributes, NULL);
4462 
4463 		if (file.m_file == INVALID_HANDLE_VALUE) {
4464 
4465 			*success = false;
4466 
4467 			retry = os_file_handle_error(
4468 				name, create_mode == OS_FILE_OPEN ?
4469 				"open" : "create");
4470 
4471 		} else {
4472 
4473 			retry = false;
4474 
4475 			*success = true;
4476 
4477 			DWORD	temp;
4478 
4479 			/* This is a best effort use case, if it fails then
4480 			we will find out when we try and punch the hole. */
4481 
4482 			DeviceIoControl(
4483 				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
4484 				&temp, NULL);
4485 		}
4486 
4487 	} while (retry);
4488 
4489 	return(file);
4490 }
4491 
4492 /** This function attempts to create a directory named pathname. The new
4493 directory gets default permissions. On Unix the permissions are
4494 (0770 & ~umask). If the directory exists already, nothing is done and
4495 the call succeeds, unless the fail_if_exists arguments is true.
4496 If another error occurs, such as a permission error, this does not crash,
4497 but reports the error and returns false.
4498 @param[in]	pathname	directory name as null-terminated string
4499 @param[in]	fail_if_exists	if true, pre-existing directory is treated
4500 				as an error.
4501 @return true if call succeeds, false on error */
4502 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)4503 os_file_create_directory(
4504 	const char*	pathname,
4505 	bool		fail_if_exists)
4506 {
4507 	BOOL	rcode;
4508 
4509 	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
4510 	if (!(rcode != 0
4511 	      || (GetLastError() == ERROR_ALREADY_EXISTS
4512 		  && !fail_if_exists))) {
4513 
4514 		os_file_handle_error_no_exit(
4515 			pathname, "CreateDirectory", false);
4516 
4517 		return(false);
4518 	}
4519 
4520 	return(true);
4521 }
4522 
4523 /** The os_file_opendir() function opens a directory stream corresponding to the
4524 directory named by the dirname argument. The directory stream is positioned
4525 at the first entry. In both Unix and Windows we automatically skip the '.'
4526 and '..' items at the start of the directory listing.
4527 @param[in]	dirname		directory name; it must not contain a trailing
4528 				'\' or '/'
4529 @param[in]	is_fatal	true if we should treat an error as a fatal
4530 				error; if we try to open symlinks then we do
4531 				not wish a fatal error if it happens not to
4532 				be a directory
4533 @return directory stream, NULL if error */
4534 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)4535 os_file_opendir(
4536 	const char*	dirname,
4537 	bool		error_is_fatal)
4538 {
4539 	os_file_dir_t		dir;
4540 	LPWIN32_FIND_DATA	lpFindFileData;
4541 	char			path[OS_FILE_MAX_PATH + 3];
4542 
4543 	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
4544 
4545 	strcpy(path, dirname);
4546 	strcpy(path + strlen(path), "\\*");
4547 
4548 	/* Note that in Windows opening the 'directory stream' also retrieves
4549 	the first entry in the directory. Since it is '.', that is no problem,
4550 	as we will skip over the '.' and '..' entries anyway. */
4551 
4552 	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
4553 		ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
4554 
4555 	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
4556 
4557 	ut_free(lpFindFileData);
4558 
4559 	if (dir == INVALID_HANDLE_VALUE) {
4560 
4561 		if (error_is_fatal) {
4562 			os_file_handle_error(dirname, "opendir");
4563 		}
4564 
4565 		return(NULL);
4566 	}
4567 
4568 	return(dir);
4569 }
4570 
4571 /** Closes a directory stream.
4572 @param[in]	dir	directory stream
4573 @return 0 if success, -1 if failure */
4574 int
os_file_closedir(os_file_dir_t dir)4575 os_file_closedir(
4576 	os_file_dir_t	dir)
4577 {
4578 	BOOL		ret;
4579 
4580 	ret = FindClose(dir);
4581 
4582 	if (!ret) {
4583 		os_file_handle_error_no_exit(NULL, "closedir", false);
4584 
4585 		return(-1);
4586 	}
4587 
4588 	return(0);
4589 }
4590 
4591 /** This function returns information of the next file in the directory. We
4592 jump over the '.' and '..' entries in the directory.
4593 @param[in]	dirname		directory name or path
4594 @param[in]	dir		directory stream
4595 @param[out]	info		buffer where the info is returned
4596 @return 0 if ok, -1 if error, 1 if at the end of the directory */
4597 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4598 os_file_readdir_next_file(
4599 	const char*	dirname,
4600 	os_file_dir_t	dir,
4601 	os_file_stat_t*	info)
4602 {
4603 	BOOL		ret;
4604 	int		status;
4605 	WIN32_FIND_DATA	find_data;
4606 
4607 next_file:
4608 
4609 	ret = FindNextFile(dir, &find_data);
4610 
4611 	if (ret > 0) {
4612 
4613 		const char* name;
4614 
4615 		name = static_cast<const char*>(find_data.cFileName);
4616 
4617 		ut_a(strlen(name) < OS_FILE_MAX_PATH);
4618 
4619 		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
4620 
4621 			goto next_file;
4622 		}
4623 
4624 		strcpy(info->name, name);
4625 
4626 		info->size = find_data.nFileSizeHigh;
4627 		info->size <<= 32;
4628 		info->size |= find_data.nFileSizeLow;
4629 
4630 		if (find_data.dwFileAttributes
4631 		    & FILE_ATTRIBUTE_REPARSE_POINT) {
4632 
4633 			/* TODO: test Windows symlinks */
4634 			/* TODO: MySQL has apparently its own symlink
4635 			implementation in Windows, dbname.sym can
4636 			redirect a database directory:
4637 			REFMAN "windows-symbolic-links.html" */
4638 
4639 			info->type = OS_FILE_TYPE_LINK;
4640 
4641 		} else if (find_data.dwFileAttributes
4642 			   & FILE_ATTRIBUTE_DIRECTORY) {
4643 
4644 			info->type = OS_FILE_TYPE_DIR;
4645 
4646 		} else {
4647 
4648 			/* It is probably safest to assume that all other
4649 			file types are normal. Better to check them rather
4650 			than blindly skip them. */
4651 
4652 			info->type = OS_FILE_TYPE_FILE;
4653 		}
4654 
4655 		status = 0;
4656 
4657 	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
4658 
4659 		status = 1;
4660 
4661 	} else {
4662 
4663 		os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
4664 
4665 		status = -1;
4666 	}
4667 
4668 	return(status);
4669 }
4670 
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
The OS_FILE_ON_ERROR_NO_EXIT and OS_FILE_ON_ERROR_SILENT flags may be OR-ed
into create_mode; they select which error handler is called when
CreateFile() fails and are masked off before the mode itself is examined.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode: OS_FILE_OPEN_RAW, OS_FILE_OPEN,
				OS_FILE_OPEN_RETRY, OS_FILE_CREATE or
				OS_FILE_OVERWRITE, optionally combined with
				the OS_FILE_ON_ERROR_* flags
@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
				is desired, OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use async
				I/O or unbuffered I/O: look in the function
				source code for the exact rules
@param[in]	type		OS_DATA_FILE or OS_LOG_FILE; only consulted
				when UNIV_NON_BUFFERED_IO is defined
@param[in]	read_only	if true, the file is opened with read access
				only and OS_FILE_CREATE / OS_FILE_OVERWRITE
				degrade to opening an existing file
@param[out]	success		true if succeeded
@return handle to the file, not defined if error, error number
	can be retrieved with os_file_get_last_error */
pfs_os_file_t
os_file_create_func(
	const char*	name,
	ulint		create_mode,
	ulint		purpose,
	ulint		type,
	bool		read_only,
	bool*		success)
{
	pfs_os_file_t	file;
	bool		retry;
	bool		on_error_no_exit;
	bool		on_error_silent;

	*success = false;

	/* NOTE(review): presumably a debug-build fault injection hook that
	makes the call fail as if the disk were full - the macro's expansion
	is not visible here, confirm against its definition. */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = false;
		SetLastError(ERROR_DISK_FULL);
		file.m_file = OS_FILE_CLOSED;
		return(file);
	);

	DWORD		create_flag;
	DWORD		share_mode = FILE_SHARE_READ;

	/* Pull the two error-handling flags out of create_mode, then strip
	them so that the comparisons below see the bare mode. */
	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? true : false;

	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? true : false;

	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
	create_mode &= ~OS_FILE_ON_ERROR_SILENT;

	if (create_mode == OS_FILE_OPEN_RAW) {

		ut_a(!read_only);

		create_flag = OPEN_EXISTING;

		/* On Windows Physical devices require admin privileges and
		have to have the write-share mode set. See the remarks
		section for the CreateFile() function documentation in MSDN. */

		share_mode |= FILE_SHARE_WRITE;

	} else if (create_mode == OS_FILE_OPEN
		   || create_mode == OS_FILE_OPEN_RETRY) {

		create_flag = OPEN_EXISTING;

	} else if (read_only) {

		/* This test comes before the create/overwrite modes on
		purpose of ordering: in read-only mode they only open an
		existing file. */
		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		create_flag = CREATE_ALWAYS;

	} else {
		ib::error()
			<< "Unknown file create mode (" << create_mode << ") "
			<< " for file '" << name << "'";

		file.m_file = OS_FILE_CLOSED;
		return(file);
	}

	DWORD		attributes = 0;

#ifdef UNIV_HOTBACKUP
	/* Hot backup builds always request unbuffered I/O; 'purpose' is
	not examined in this configuration. */
	attributes |= FILE_FLAG_NO_BUFFERING;
#else
	if (purpose == OS_FILE_AIO) {

#ifdef WIN_ASYNC_IO
		/* If specified, use asynchronous (overlapped) io and no
		buffering of writes in the OS */

		if (srv_use_native_aio) {
			attributes |= FILE_FLAG_OVERLAPPED;
		}
#endif /* WIN_ASYNC_IO */

	} else if (purpose == OS_FILE_NORMAL) {

		/* Use default setting. */

	} else {

		ib::error()
			<< "Unknown purpose flag (" << purpose << ") "
			<< "while opening file '" << name << "'";

		file.m_file = OS_FILE_CLOSED;
		return(file);
	}

#ifdef UNIV_NON_BUFFERED_IO
	// TODO: Create a bug, this looks wrong. The flush log
	// parameter is dynamic.
	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {

		/* Do not use unbuffered i/o for the log files because
		value 2 denotes that we do not flush the log at every
		commit, but only once per second */

	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {

		attributes |= FILE_FLAG_NO_BUFFERING;
	}
#endif /* UNIV_NON_BUFFERED_IO */

#endif /* UNIV_HOTBACKUP */
	/* Read access always; write access unless in read-only mode. */
	DWORD	access = GENERIC_READ;

	if (!read_only) {
		access |= GENERIC_WRITE;
	}

	/* Repeat CreateFile() for as long as the error handler returns
	true, i.e. asks for another attempt. */
	do {
		/* Use default security attributes and no template file. */
		file.m_file = CreateFile(
			(LPCTSTR) name, access, share_mode, NULL,
			create_flag, attributes, NULL);

		if (file.m_file == INVALID_HANDLE_VALUE) {
			const char*	operation;

			/* Only used as the operation name passed to the
			error handler. */
			operation = (create_mode == OS_FILE_CREATE
				     && !read_only)
				? "create" : "open";

			*success = false;

			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {

			retry = false;

			*success = true;

			DWORD	temp;

			/* This is a best effort use case, if it fails then
			we will find out when we try and punch the hole. */
			DeviceIoControl(
				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
				&temp, NULL);
		}

	} while (retry);

	return(file);
}
4853 
4854 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4855 not directly this function!
4856 A simple function to open or create a file.
4857 @param[in]	name		name of the file or path as a null-terminated
4858 				string
4859 @param[in]	create_mode	create mode
4860 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4861 				OS_FILE_READ_ALLOW_DELETE; the last option is
4862 				used by a backup program reading the file
4863 @param[out]	success		true if succeeded
4864 @return own: handle to the file, not defined if error, error number
4865 	can be retrieved with os_file_get_last_error */
4866 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4867 os_file_create_simple_no_error_handling_func(
4868 	const char*	name,
4869 	ulint		create_mode,
4870 	ulint		access_type,
4871 	bool		read_only,
4872 	bool*		success)
4873 {
4874 	pfs_os_file_t	file;
4875 
4876 	*success = false;
4877 
4878 	DWORD		access;
4879 	DWORD		create_flag;
4880 	DWORD		attributes	= 0;
4881 	DWORD		share_mode	= FILE_SHARE_READ;
4882 
4883 	ut_a(name);
4884 
4885 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4886 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4887 
4888 	if (create_mode == OS_FILE_OPEN) {
4889 
4890 		create_flag = OPEN_EXISTING;
4891 
4892 	} else if (read_only) {
4893 
4894 		create_flag = OPEN_EXISTING;
4895 
4896 	} else if (create_mode == OS_FILE_CREATE) {
4897 
4898 		create_flag = CREATE_NEW;
4899 
4900 	} else {
4901 
4902 		ib::error()
4903 			<< "Unknown file create mode (" << create_mode << ") "
4904 			<< " for file '" << name << "'";
4905 
4906 		file.m_file = OS_FILE_CLOSED;
4907 		return(file);
4908 	}
4909 
4910 	if (access_type == OS_FILE_READ_ONLY) {
4911 
4912 		access = GENERIC_READ;
4913 
4914 	} else if (read_only) {
4915 
4916 		access = GENERIC_READ;
4917 
4918 	} else if (access_type == OS_FILE_READ_WRITE) {
4919 
4920 		access = GENERIC_READ | GENERIC_WRITE;
4921 
4922 	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4923 
4924 		ut_a(!read_only);
4925 
4926 		access = GENERIC_READ;
4927 
4928 		/*!< A backup program has to give mysqld the maximum
4929 		freedom to do what it likes with the file */
4930 
4931 		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4932 	} else {
4933 
4934 		ib::error()
4935 			<< "Unknown file access type (" << access_type << ") "
4936 			<< "for file '" << name << "'";
4937 
4938 		file.m_file = OS_FILE_CLOSED;
4939 		return(file);
4940 	}
4941 
4942 	file.m_file = CreateFile((LPCTSTR) name,
4943 			  access,
4944 			  share_mode,
4945 			  NULL,			// Security attributes
4946 			  create_flag,
4947 			  attributes,
4948 			  NULL);		// No template file
4949 
4950 	*success = (file.m_file != INVALID_HANDLE_VALUE);
4951 
4952 	return(file);
4953 }
4954 
4955 /** Deletes a file if it exists. The file has to be closed before calling this.
4956 @param[in]	name		file path as a null-terminated string
4957 @param[out]	exist		indicate if file pre-exist
4958 @return true if success */
4959 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4960 os_file_delete_if_exists_func(
4961 	const char*	name,
4962 	bool*		exist)
4963 {
4964 	ulint	count	= 0;
4965 
4966 	if (exist != NULL) {
4967 		*exist = true;
4968 	}
4969 
4970 	for (;;) {
4971 		/* In Windows, deleting an .ibd file may fail if ibbackup
4972 		is copying it */
4973 
4974 		bool	ret = DeleteFile((LPCTSTR) name);
4975 
4976 		if (ret) {
4977 			return(true);
4978 		}
4979 
4980 		DWORD	lasterr = GetLastError();
4981 
4982 		if (lasterr == ERROR_FILE_NOT_FOUND
4983 		    || lasterr == ERROR_PATH_NOT_FOUND) {
4984 
4985 			/* the file does not exist, this not an error */
4986 			if (exist != NULL) {
4987 				*exist = false;
4988 			}
4989 
4990 			return(true);
4991 		}
4992 
4993 		++count;
4994 
4995 		if (count > 100 && 0 == (count % 10)) {
4996 
4997 			/* Print error information */
4998 			os_file_get_last_error(true);
4999 
5000 			ib::warn() << "Delete of file '" << name << "' failed.";
5001 		}
5002 
5003 		/* Sleep for a second */
5004 		os_thread_sleep(1000000);
5005 
5006 		if (count > 2000) {
5007 
5008 			return(false);
5009 		}
5010 	}
5011 }
5012 
5013 /** Deletes a file. The file has to be closed before calling this.
5014 @param[in]	name		File path as NUL terminated string
5015 @return true if success */
5016 bool
os_file_delete_func(const char * name)5017 os_file_delete_func(
5018 	const char*	name)
5019 {
5020 	ulint	count	= 0;
5021 
5022 	for (;;) {
5023 		/* In Windows, deleting an .ibd file may fail if ibbackup
5024 		is copying it */
5025 
5026 		BOOL	ret = DeleteFile((LPCTSTR) name);
5027 
5028 		if (ret) {
5029 			return(true);
5030 		}
5031 
5032 		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
5033 			/* If the file does not exist, we classify this as
5034 			a 'mild' error and return */
5035 
5036 			return(false);
5037 		}
5038 
5039 		++count;
5040 
5041 		if (count > 100 && 0 == (count % 10)) {
5042 
5043 			/* print error information */
5044 			os_file_get_last_error(true);
5045 
5046 			ib::warn()
5047 				<< "Cannot delete file '" << name << "'. Are "
5048 				<< "you running ibbackup to back up the file?";
5049 		}
5050 
5051 		/* sleep for a second */
5052 		os_thread_sleep(1000000);
5053 
5054 		if (count > 2000) {
5055 
5056 			return(false);
5057 		}
5058 	}
5059 
5060 	ut_error;
5061 	return(false);
5062 }
5063 
5064 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
5065 function!
5066 Renames a file (can also move it to another directory). It is safest that the
5067 file is closed before calling this function.
5068 @param[in]	oldpath		old file path as a null-terminated string
5069 @param[in]	newpath		new file path
5070 @return true if success */
5071 bool
os_file_rename_func(const char * oldpath,const char * newpath)5072 os_file_rename_func(
5073 	const char*	oldpath,
5074 	const char*	newpath)
5075 {
5076 #ifdef UNIV_DEBUG
5077 	os_file_type_t	type;
5078 	bool		exists;
5079 
5080 	/* New path must not exist. */
5081 	ut_ad(os_file_status(newpath, &exists, &type));
5082 	ut_ad(!exists);
5083 
5084 	/* Old path must exist. */
5085 	ut_ad(os_file_status(oldpath, &exists, &type));
5086 	ut_ad(exists);
5087 #endif /* UNIV_DEBUG */
5088 
5089 	if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
5090 		return(true);
5091 	}
5092 
5093 	os_file_handle_error_no_exit(oldpath, "rename", false);
5094 
5095 	return(false);
5096 }
5097 
5098 /** NOTE! Use the corresponding macro os_file_close(), not directly
5099 this function!
5100 Closes a file handle. In case of error, error number can be retrieved with
5101 os_file_get_last_error.
5102 @param[in,own]	file		Handle to a file
5103 @return true if success */
5104 bool
os_file_close_func(os_file_t file)5105 os_file_close_func(
5106 	os_file_t	file)
5107 {
5108 	ut_a(file > 0);
5109 
5110 	if (CloseHandle(file)) {
5111 		return(true);
5112 	}
5113 
5114 	os_file_handle_error(NULL, "close");
5115 
5116 	return(false);
5117 }
5118 
5119 /** Gets a file size.
5120 @param[in]	file		Handle to a file
5121 @return file size, or (os_offset_t) -1 on failure */
5122 os_offset_t
os_file_get_size(pfs_os_file_t file)5123 os_file_get_size(
5124 	pfs_os_file_t	file)
5125 {
5126 	DWORD		high;
5127 	DWORD		low;
5128 
5129 	low = GetFileSize(file.m_file, &high);
5130 
5131 	if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
5132 		return((os_offset_t) -1);
5133 	}
5134 
5135 	return(os_offset_t(low | (os_offset_t(high) << 32)));
5136 }
5137 
5138 /** Gets a file size.
5139 @param[in]	filename	Full path to the filename to check
5140 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
5141 	errno */
5142 os_file_size_t
os_file_get_size(const char * filename)5143 os_file_get_size(
5144 	const char*	filename)
5145 {
5146 	struct __stat64	s;
5147 	os_file_size_t	file_size;
5148 
5149 	int		ret = _stat64(filename, &s);
5150 
5151 	if (ret == 0) {
5152 
5153 		file_size.m_total_size = s.st_size;
5154 
5155 		DWORD	low_size;
5156 		DWORD	high_size;
5157 
5158 		low_size = GetCompressedFileSize(filename, &high_size);
5159 
5160 		if (low_size != INVALID_FILE_SIZE) {
5161 
5162 			file_size.m_alloc_size = high_size;
5163 			file_size.m_alloc_size <<= 32;
5164 			file_size.m_alloc_size |= low_size;
5165 
5166 		} else {
5167 			ib::error()
5168 				<< "GetCompressedFileSize("
5169 				<< filename << ", ..) failed.";
5170 
5171 			file_size.m_alloc_size = (os_offset_t) -1;
5172 		}
5173 	} else {
5174 		file_size.m_total_size = ~0;
5175 		file_size.m_alloc_size = (os_offset_t) ret;
5176 	}
5177 
5178 	return(file_size);
5179 }
5180 
5181 /** This function returns information about the specified file
5182 @param[in]	path		pathname of the file
5183 @param[out]	stat_info	information of a file in a directory
5184 @param[in,out]	statinfo	information of a file in a directory
5185 @param[in]	check_rw_perm	for testing whether the file can be opened
5186 				in RW mode
5187 @param[in]	read_only	true if the file is opened in read-only mode
5188 @return DB_SUCCESS if all OK */
5189 static
5190 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)5191 os_file_get_status_win32(
5192 	const char*	path,
5193 	os_file_stat_t* stat_info,
5194 	struct _stat64*	statinfo,
5195 	bool		check_rw_perm,
5196 	bool		read_only)
5197 {
5198 	int	ret = _stat64(path, statinfo);
5199 
5200 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
5201 		/* file does not exist */
5202 
5203 		return(DB_NOT_FOUND);
5204 
5205 	} else if (ret) {
5206 		/* file exists, but stat call failed */
5207 
5208 		os_file_handle_error_no_exit(path, "stat", false);
5209 
5210 		return(DB_FAIL);
5211 
5212 	} else if (_S_IFDIR & statinfo->st_mode) {
5213 
5214 		stat_info->type = OS_FILE_TYPE_DIR;
5215 
5216 	} else if (_S_IFREG & statinfo->st_mode) {
5217 
5218 		DWORD	access = GENERIC_READ;
5219 
5220 		if (!read_only) {
5221 			access |= GENERIC_WRITE;
5222 		}
5223 
5224 		stat_info->type = OS_FILE_TYPE_FILE;
5225 
5226 		/* Check if we can open it in read-only mode. */
5227 
5228 		if (check_rw_perm) {
5229 			HANDLE	fh;
5230 
5231 			fh = CreateFile(
5232 				(LPCTSTR) path,		// File to open
5233 				access,
5234 				0,			// No sharing
5235 				NULL,			// Default security
5236 				OPEN_EXISTING,		// Existing file only
5237 				FILE_ATTRIBUTE_NORMAL,	// Normal file
5238 				NULL);			// No attr. template
5239 
5240 			if (fh == INVALID_HANDLE_VALUE) {
5241 				stat_info->rw_perm = false;
5242 			} else {
5243 				stat_info->rw_perm = true;
5244 				CloseHandle(fh);
5245 			}
5246 		}
5247 
5248 		char	volname[MAX_PATH];
5249 		BOOL	result = GetVolumePathName(path, volname, MAX_PATH);
5250 
5251 		if (!result) {
5252 
5253 			ib::error()
5254 				<< "os_file_get_status_win32: "
5255 				<< "Failed to get the volume path name for: "
5256 				<< path
5257 				<< "- OS error number " << GetLastError();
5258 
5259 			return(DB_FAIL);
5260 		}
5261 
5262 		DWORD	sectorsPerCluster;
5263 		DWORD	bytesPerSector;
5264 		DWORD	numberOfFreeClusters;
5265 		DWORD	totalNumberOfClusters;
5266 
5267 		result = GetDiskFreeSpace(
5268 			(LPCSTR) volname,
5269 			&sectorsPerCluster,
5270 			&bytesPerSector,
5271 			&numberOfFreeClusters,
5272 			&totalNumberOfClusters);
5273 
5274 		if (!result) {
5275 
5276 			ib::error()
5277 				<< "GetDiskFreeSpace(" << volname << ",...) "
5278 				<< "failed "
5279 				<< "- OS error number " << GetLastError();
5280 
5281 			return(DB_FAIL);
5282 		}
5283 
5284 		stat_info->block_size = bytesPerSector * sectorsPerCluster;
5285 
5286 		/* On Windows the block size is not used as the allocation
5287 		unit for sparse files. The underlying infra-structure for
5288 		sparse files is based on NTFS compression. The punch hole
5289 		is done on a "compression unit". This compression unit
5290 		is based on the cluster size. You cannot punch a hole if
5291 		the cluster size >= 8K. For smaller sizes the table is
5292 		as follows:
5293 
5294 		Cluster Size	Compression Unit
5295 		512 Bytes		 8 KB
5296 		  1 KB			16 KB
5297 		  2 KB			32 KB
5298 		  4 KB			64 KB
5299 
5300 		Default NTFS cluster size is 4K, compression unit size of 64K.
5301 		Therefore unless the user has created the file system with
5302 		a smaller cluster size and used larger page sizes there is
5303 		little benefit from compression out of the box. */
5304 
5305 		stat_info->block_size = (stat_info->block_size <= 4096)
5306 			?  stat_info->block_size * 16 : ULINT_UNDEFINED;
5307 	} else {
5308 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
5309 	}
5310 
5311 	return(DB_SUCCESS);
5312 }
5313 
5314 /** Truncates a file to a specified size in bytes.
5315 Do nothing if the size to preserve is greater or equal to the current
5316 size of the file.
5317 @param[in]	pathname	file path
5318 @param[in]	file		file to be truncated
5319 @param[in]	size		size to preserve in bytes
5320 @return true if success */
5321 static
5322 bool
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)5323 os_file_truncate_win32(
5324 	const char*	pathname,
5325 	pfs_os_file_t	file,
5326 	os_offset_t	size)
5327 {
5328 	LARGE_INTEGER	length;
5329 
5330 	length.QuadPart = size;
5331 	BOOL	success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
5332 	if (!success) {
5333 		os_file_handle_error_no_exit(
5334 			pathname, "SetFilePointerEx", false);
5335 	} else {
5336 		success = SetEndOfFile(file.m_file);
5337 		if (!success) {
5338 			os_file_handle_error_no_exit(
5339 				pathname, "SetEndOfFile", false);
5340 		}
5341 	}
5342 	return(success);
5343 }
5344 
5345 /** Truncates a file at its current position.
5346 @param[in]	file		Handle to be truncated
5347 @return true if success */
5348 bool
os_file_set_eof(FILE * file)5349 os_file_set_eof(
5350 	FILE*		file)
5351 {
5352 	HANDLE	h = (HANDLE) _get_osfhandle(fileno(file));
5353 
5354 	return(SetEndOfFile(h));
5355 }
5356 
5357 #ifdef UNIV_HOTBACKUP
5358 /** Closes a file handle.
5359 @param[in]	file		Handle to close
5360 @return true if success */
5361 bool
os_file_close_no_error_handling(os_file_t file)5362 os_file_close_no_error_handling(
5363 	os_file_t	file)
5364 {
5365 	return(CloseHandle(file) ? true : false);
5366 }
5367 #endif /* UNIV_HOTBACKUP */
5368 
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
This is a free-function entry point that simply forwards to
AIO::simulated_put_read_threads_to_sleep(). */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	AIO::simulated_put_read_threads_to_sleep();
}
5378 
5379 /** This function can be called if one wants to post a batch of reads and
5380 prefers an i/o-handler thread to handle them all at once later. You must
5381 call os_aio_simulated_wake_handler_threads later to ensure the threads
5382 are not left sleeping! */
5383 void
simulated_put_read_threads_to_sleep()5384 AIO::simulated_put_read_threads_to_sleep()
5385 {
5386 	/* The idea of putting background IO threads to sleep is only for
5387 	Windows when using simulated AIO. Windows XP seems to schedule
5388 	background threads too eagerly to allow for coalescing during
5389 	readahead requests. */
5390 
5391 	if (srv_use_native_aio) {
5392 		/* We do not use simulated AIO: do nothing */
5393 
5394 		return;
5395 	}
5396 
5397 	os_aio_recommend_sleep_for_read_threads	= true;
5398 
5399 	for (ulint i = 0; i < os_aio_n_segments; i++) {
5400 		AIO*	array;
5401 
5402 		get_array_and_local_segment(&array, i);
5403 
5404 		if (array == s_reads) {
5405 
5406 			os_event_reset(os_aio_segment_wait_events[i]);
5407 		}
5408 	}
5409 }
5410 
5411 #endif /* !_WIN32*/
5412 
/** Does a synchronous read or write depending upon the type specified.
In case of partial reads/writes the function tries
NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
@param[in]	in_type		IO flags; the function works on a local copy
@param[in]	file		handle to an open file
@param[in,out]	buf		buffer to read into or to write from
@param[in]	n		number of bytes to read/write, starting from
				offset
@param[in]	offset		file offset from the start where to read/write
@param[out]	err		DB_SUCCESS or error code
@return the requested byte count n when the whole request completed;
otherwise the number of bytes transferred before giving up, in which case
*err is set to DB_IO_ERROR */
static MY_ATTRIBUTE((warn_unused_result))
ssize_t
os_file_io(
	const IORequest&in_type,
	os_file_t	file,
	void*		buf,
	ulint		n,
	os_offset_t	offset,
	dberr_t*	err)
{
	/* Temporary block handed back by the compress/encrypt helpers, or
	NULL when neither was applied. It is released on every exit path. */
	Block*		block;
	/* n is passed by address to the helpers below, so remember the
	size the caller asked for. */
	ulint		original_n = n;
	IORequest	type = in_type;
	/* Running total of bytes transferred over all attempts. */
	ssize_t		bytes_returned = 0;

	if (type.is_compressed()) {

		/* We don't compress the first page of any file. */
		ut_ad(offset > 0);

		block = os_file_compress_page(type, buf, &n);
	} else {
		block = NULL;
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
        if (type.is_encrypted() && type.is_write()) {
		/* We don't encrypt the first page of any file. */
		Block*	compressed_block = block;
		ut_ad(offset > 0);

		block = os_file_encrypt_page(type, buf, &n);

		/* The block from the compression step is released here;
		from now on only the block returned by the encryption
		helper is tracked. */
		if (compressed_block != NULL) {
			os_free_block(compressed_block);
		}
        }

	SyncFileIO	sync_file_io(file, buf, n, offset);

	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {

		ssize_t	n_bytes = sync_file_io.execute(type);

		/* Check for a hard error. Not much we can do now. */
		if (n_bytes < 0) {

			break;

		} else if ((ulint) n_bytes + bytes_returned == n) {

			/* This attempt completed the request. */
			bytes_returned += n_bytes;

			/* Post-processing is done for every read and for
			compressed requests, but never for offset 0. */
			if (offset > 0
			    && (type.is_compressed() || type.is_read())) {

				*err = os_file_io_complete(
					type, file,
					reinterpret_cast<byte*>(buf),
					NULL, original_n, offset, n);
			} else {

				*err = DB_SUCCESS;
			}

			if (block != NULL) {
				os_free_block(block);
			}

			/* Report the size the caller asked for, not n,
			which the helpers above may have changed. */
			return(original_n);
		}

		/* Handle partial read/write. */

		ut_ad((ulint) n_bytes + bytes_returned < n);

		bytes_returned += (ulint) n_bytes;

		if (!type.is_partial_io_warning_disabled()) {

			const char*	op = type.is_read()
				? "read" : "written";

			ib::warn()
				<< n
				<< " bytes should have been " << op << ". Only "
				<< bytes_returned
				<< " bytes " << op << ". Retrying"
				<< " for the remaining bytes.";
		}

		/* Advance the offset and buffer by n_bytes */
		sync_file_io.advance(n_bytes);
	}

	/* Reached after a hard error (break above) or when all retry
	attempts were used up without completing the request. */

	if (block != NULL) {
		os_free_block(block);
	}

	*err = DB_IO_ERROR;

	if (!type.is_partial_io_warning_disabled()) {
		ib::warn()
			<< "Retry attempts for "
			<< (type.is_read() ? "reading" : "writing")
			<< " partial data failed.";
	}

	return(bytes_returned);
}
5535 
5536 /** Does a synchronous write operation in Posix.
5537 @param[in]	type		IO context
5538 @param[in]	file		handle to an open file
5539 @param[out]	buf		buffer from which to write
5540 @param[in]	n		number of bytes to read, starting from offset
5541 @param[in]	offset		file offset from the start where to read
5542 @param[out]	err		DB_SUCCESS or error code
5543 @return number of bytes written, -1 if error */
5544 static MY_ATTRIBUTE((warn_unused_result))
5545 ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5546 os_file_pwrite(
5547 	IORequest&	type,
5548 	os_file_t	file,
5549 	const byte*	buf,
5550 	ulint		n,
5551 	os_offset_t	offset,
5552 	dberr_t*	err)
5553 {
5554 	ut_ad(type.validate());
5555 
5556 	++os_n_file_writes;
5557 
5558 	(void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
5559 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
5560 
5561 	ssize_t	n_bytes = os_file_io(type, file, (void*) buf, n, offset, err);
5562 
5563 	(void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
5564 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
5565 
5566 	return(n_bytes);
5567 }
5568 
5569 /** Requests a synchronous write operation.
5570 @param[in]	type		IO flags
5571 @param[in]	file		handle to an open file
5572 @param[out]	buf		buffer from which to write
5573 @param[in]	offset		file offset from the start where to read
5574 @param[in]	n		number of bytes to read, starting from offset
5575 @return DB_SUCCESS if request was successful, false if fail */
5576 static MY_ATTRIBUTE((warn_unused_result))
5577 dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)5578 os_file_write_page(
5579 	IORequest&	type,
5580 	const char*	name,
5581 	os_file_t	file,
5582 	const byte*	buf,
5583 	os_offset_t	offset,
5584 	ulint		n)
5585 {
5586 	dberr_t		err;
5587 	ut_ad(type.validate());
5588 	ut_ad(n > 0);
5589 
5590 	ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
5591 
5592 	if ((ulint) n_bytes != n && !os_has_said_disk_full) {
5593 
5594 		ib::error()
5595 			<< "Write to file " << name << "failed at offset "
5596 			<< offset << ", " << n
5597 			<< " bytes should have been written,"
5598 			" only " << n_bytes << " were written."
5599 			" Operating system error number " << errno << "."
5600 			" Check that your OS and file system"
5601 			" support files of this size."
5602 			" Check also that the disk is not full"
5603 			" or a disk quota exceeded.";
5604 
5605 		if (strerror(errno) != NULL) {
5606 
5607 			ib::error()
5608 				<< "Error number " << errno
5609 				<< " means '" << strerror(errno) << "'";
5610 		}
5611 
5612 		ib::info() << OPERATING_SYSTEM_ERROR_MSG;
5613 
5614 		os_has_said_disk_full = true;
5615 	}
5616 
5617 	return(err);
5618 }
5619 
5620 /** Does a synchronous read operation in Posix.
5621 @param[in]	type		IO flags
5622 @param[in]	file		handle to an open file
5623 @param[out]	buf		buffer where to read
5624 @param[in]	offset		file offset from the start where to read
5625 @param[in]	n		number of bytes to read, starting from offset
5626 @param[out]	err		DB_SUCCESS or error code
5627 @return number of bytes read, -1 if error */
5628 static MY_ATTRIBUTE((warn_unused_result))
5629 ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5630 os_file_pread(
5631 	IORequest&	type,
5632 	os_file_t	file,
5633 	void*		buf,
5634 	ulint		n,
5635 	os_offset_t	offset,
5636 	dberr_t*	err)
5637 {
5638 	++os_n_file_reads;
5639 
5640 	(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
5641 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
5642 
5643 	ssize_t	n_bytes = os_file_io(type, file, buf, n, offset, err);
5644 
5645 	(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
5646 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
5647 
5648 	return(n_bytes);
5649 }
5650 
5651 /** Requests a synchronous positioned read operation.
5652 @return DB_SUCCESS if request was successful, false if fail
5653 @param[in]	type		IO flags
5654 @param[in]	file		handle to an open file
5655 @param[out]	buf		buffer where to read
5656 @param[in]	offset		file offset from the start where to read
5657 @param[in]	n		number of bytes to read, starting from offset
5658 @param[out]	o		number of bytes actually read
5659 @param[in]	exit_on_err	if true then exit on error
5660 @return DB_SUCCESS or error code */
5661 static MY_ATTRIBUTE((warn_unused_result))
5662 dberr_t
os_file_read_page(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err)5663 os_file_read_page(
5664 	IORequest&	type,
5665 	os_file_t	file,
5666 	void*		buf,
5667 	os_offset_t	offset,
5668 	ulint		n,
5669 	ulint*		o,
5670 	bool		exit_on_err)
5671 {
5672 	dberr_t		err;
5673 
5674 	os_bytes_read_since_printout += n;
5675 
5676 	ut_ad(type.validate());
5677 	ut_ad(n > 0);
5678 
5679 	for (;;) {
5680 		ssize_t	n_bytes;
5681 
5682 		n_bytes = os_file_pread(type, file, buf, n, offset, &err);
5683 
5684 		if (o != NULL) {
5685 			*o = n_bytes;
5686 		}
5687 
5688 		if (err != DB_SUCCESS && !exit_on_err) {
5689 
5690 			return(err);
5691 
5692 		} else if ((ulint) n_bytes == n) {
5693 
5694 			/** The read will succeed but decompress can fail
5695 			for various reasons. */
5696 
5697 			if (type.is_compression_enabled()
5698 			    && !Compression::is_compressed_page(
5699 				    static_cast<byte*>(buf))) {
5700 
5701 				return(DB_SUCCESS);
5702 
5703 			} else {
5704 				return(err);
5705 			}
5706 		}
5707 
5708 		ib::error() << "Tried to read " << n
5709 			<< " bytes at offset " << offset
5710 			<< ", but was only able to read " << n_bytes;
5711 
5712 		if (exit_on_err) {
5713 
5714 			if (!os_file_handle_error(NULL, "read")) {
5715 				/* Hard error */
5716 				break;
5717 			}
5718 
5719 		} else if (!os_file_handle_error_no_exit(NULL, "read", false)) {
5720 
5721 			/* Hard error */
5722 			break;
5723 		}
5724 
5725 		if (n_bytes > 0 && (ulint) n_bytes < n) {
5726 			n -= (ulint) n_bytes;
5727 			offset += (ulint) n_bytes;
5728 			buf = reinterpret_cast<uchar*>(buf) + (ulint) n_bytes;
5729 		}
5730 	}
5731 
5732 	ib::fatal()
5733 		<< "Cannot read from file. OS error number "
5734 		<< errno << ".";
5735 
5736 	return(err);
5737 }
5738 
5739 /** Retrieves the last error number if an error occurs in a file io function.
5740 The number should be retrieved before any other OS calls (because they may
5741 overwrite the error number). If the number is not known to this program,
5742 the OS error number + 100 is returned.
5743 @param[in]	report_all_errors	true if we want an error printed
5744 					for all errors
5745 @return error number, or OS error number + 100 */
5746 ulint
os_file_get_last_error(bool report_all_errors)5747 os_file_get_last_error(
5748 	bool	report_all_errors)
5749 {
5750 	return(os_file_get_last_error_low(report_all_errors, false));
5751 }
5752 
5753 /** Does error handling when a file operation fails.
5754 Conditionally exits (calling srv_fatal_error()) based on should_exit value
5755 and the error type, if should_exit is true then on_error_silent is ignored.
5756 @param[in]	name		name of a file or NULL
5757 @param[in]	operation	operation
5758 @param[in]	should_exit	call srv_fatal_error() on an unknown error,
5759 				if this parameter is true
5760 @param[in]	on_error_silent	if true then don't print any message to the log
5761 				iff it is an unknown non-fatal error
5762 @return true if we should retry the operation */
5763 static MY_ATTRIBUTE((warn_unused_result))
5764 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)5765 os_file_handle_error_cond_exit(
5766 	const char*	name,
5767 	const char*	operation,
5768 	bool		should_exit,
5769 	bool		on_error_silent)
5770 {
5771 	ulint	err;
5772 
5773 	err = os_file_get_last_error_low(false, on_error_silent);
5774 
5775 	switch (err) {
5776 	case OS_FILE_DISK_FULL:
5777 		/* We only print a warning about disk full once */
5778 
5779 		if (os_has_said_disk_full) {
5780 
5781 			return(false);
5782 		}
5783 
5784 		/* Disk full error is reported irrespective of the
5785 		on_error_silent setting. */
5786 
5787 		if (name) {
5788 
5789 			ib::error()
5790 				<< "Encountered a problem with file '"
5791 				<< name << "'";
5792 		}
5793 
5794 		ib::error()
5795 			<< "Disk is full. Try to clean the disk to free space.";
5796 
5797 		os_has_said_disk_full = true;
5798 
5799 		return(false);
5800 
5801 	case OS_FILE_AIO_RESOURCES_RESERVED:
5802 	case OS_FILE_AIO_INTERRUPTED:
5803 
5804 		return(true);
5805 
5806 	case OS_FILE_PATH_ERROR:
5807 	case OS_FILE_ALREADY_EXISTS:
5808 	case OS_FILE_ACCESS_VIOLATION:
5809 
5810 		return(false);
5811 
5812 	case OS_FILE_SHARING_VIOLATION:
5813 
5814 		os_thread_sleep(10000000);	/* 10 sec */
5815 		return(true);
5816 
5817 	case OS_FILE_OPERATION_ABORTED:
5818 	case OS_FILE_INSUFFICIENT_RESOURCE:
5819 
5820 		os_thread_sleep(100000);	/* 100 ms */
5821 		return(true);
5822 
5823 	default:
5824 
5825 		/* If it is an operation that can crash on error then it
5826 		is better to ignore on_error_silent and print an error message
5827 		to the log. */
5828 
5829 		if (should_exit || !on_error_silent) {
5830 			ib::error() << "File "
5831 				<< (name != NULL ? name : "(unknown)")
5832 				<< ": '" << operation << "'"
5833 				" returned OS error " << err << "."
5834 				<< (should_exit
5835 				    ? " Cannot continue operation" : "");
5836 		}
5837 
5838 		if (should_exit) {
5839 			srv_fatal_error();
5840 		}
5841 	}
5842 
5843 	return(false);
5844 }
5845 
5846 /** Does error handling when a file operation fails.
5847 @param[in]	name		name of a file or NULL
5848 @param[in]	operation	operation name that failed
5849 @return true if we should retry the operation */
5850 static
5851 bool
os_file_handle_error(const char * name,const char * operation)5852 os_file_handle_error(
5853 	const char*	name,
5854 	const char*	operation)
5855 {
5856 	/* Exit in case of unknown error */
5857 	return(os_file_handle_error_cond_exit(name, operation, true, false));
5858 }
5859 
5860 /** Does error handling when a file operation fails.
5861 @param[in]	name		name of a file or NULL
5862 @param[in]	operation	operation name that failed
5863 @param[in]	on_error_silent	if true then don't print any message to the log.
5864 @return true if we should retry the operation */
5865 static
5866 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)5867 os_file_handle_error_no_exit(
5868 	const char*	name,
5869 	const char*	operation,
5870 	bool		on_error_silent)
5871 {
5872 	/* Don't exit in case of unknown error */
5873 	return(os_file_handle_error_cond_exit(
5874 			name, operation, false, on_error_silent));
5875 }
5876 
5877 /** Tries to disable OS caching on an opened file descriptor.
5878 @param[in]	fd		file descriptor to alter
5879 @param[in]	file_name	file name, used in the diagnostic message
5880 @param[in]	name		"open" or "create"; used in the diagnostic
5881 				message */
5882 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5883 os_file_set_nocache(
5884 	int		fd		MY_ATTRIBUTE((unused)),
5885 	const char*	file_name	MY_ATTRIBUTE((unused)),
5886 	const char*	operation_name	MY_ATTRIBUTE((unused)))
5887 {
5888 	/* some versions of Solaris may not have DIRECTIO_ON */
5889 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5890 	if (directio(fd, DIRECTIO_ON) == -1) {
5891 		int	errno_save = errno;
5892 
5893 		ib::error()
5894 			<< "Failed to set DIRECTIO_ON on file "
5895 			<< file_name << ": " << operation_name
5896 			<< strerror(errno_save) << ","
5897 			" continuing anyway.";
5898 	}
5899 #elif defined(O_DIRECT)
5900 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5901 		int		errno_save = errno;
5902 		static bool	warning_message_printed = false;
5903 		if (errno_save == EINVAL) {
5904 			if (!warning_message_printed) {
5905 				warning_message_printed = true;
5906 # ifdef UNIV_LINUX
5907 				ib::warn()
5908 					<< "Failed to set O_DIRECT on file"
5909 					<< file_name << ";" << operation_name
5910 					<< ": " << strerror(errno_save) << ", "
5911 					<< "continuing anyway. O_DIRECT is "
5912 					"known to result in 'Invalid argument' "
5913 					"on Linux on tmpfs, "
5914 					"see MySQL Bug#26662.";
5915 # else /* UNIV_LINUX */
5916 				goto short_warning;
5917 # endif /* UNIV_LINUX */
5918 			}
5919 		} else {
5920 # ifndef UNIV_LINUX
5921 short_warning:
5922 # endif
5923 			ib::warn()
5924 				<< "Failed to set O_DIRECT on file "
5925 				<< file_name << "; " << operation_name
5926 				<< " : " << strerror(errno_save)
5927 				<< " continuing anyway.";
5928 		}
5929 	}
5930 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5931 }
5932 
5933 /** Write the specified number of zeros to a newly created file.
5934 @param[in]	name		name of the file or path as a null-terminated
5935 				string
5936 @param[in]	file		handle to a file
5937 @param[in]	size		file size
5938 @param[in]	read_only	Enable read-only checks if true
5939 @return true if success */
5940 bool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size,bool read_only)5941 os_file_set_size(
5942 	const char*	name,
5943 	pfs_os_file_t	file,
5944 	os_offset_t	size,
5945 	bool		read_only)
5946 {
5947 	/* Write up to 1 megabyte at a time. */
5948 	ulint	buf_size = ut_min(
5949 		static_cast<ulint>(64),
5950 		static_cast<ulint>(size / UNIV_PAGE_SIZE));
5951 
5952 	buf_size *= UNIV_PAGE_SIZE;
5953 
5954 	/* Align the buffer for possible raw i/o */
5955 	byte*	buf2;
5956 
5957 	buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
5958 
5959 	byte*	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
5960 
5961 	/* Write buffer full of zeros */
5962 	memset(buf, 0, buf_size);
5963 
5964 	if (size >= (os_offset_t) 100 << 20) {
5965 
5966 		ib::info() << "Progress in MB:";
5967 	}
5968 
5969 	os_offset_t	current_size = 0;
5970 
5971 	while (current_size < size) {
5972 		ulint	n_bytes;
5973 
5974 		if (size - current_size < (os_offset_t) buf_size) {
5975 			n_bytes = (ulint) (size - current_size);
5976 		} else {
5977 			n_bytes = buf_size;
5978 		}
5979 
5980 		dberr_t		err;
5981 		IORequest	request(IORequest::WRITE);
5982 
5983 #ifdef UNIV_HOTBACKUP
5984 
5985 		err = os_file_write(
5986 			request, name, file, buf, current_size, n_bytes);
5987 #else
5988 		/* Using OS_AIO_SYNC mode on POSIX systems will result in
5989 		fall back to os_file_write/read. On Windows it will use
5990 		special mechanism to wait before it returns back. */
5991 
5992 		err = os_aio(
5993 			request,
5994 			OS_AIO_SYNC, name,
5995 			file, buf, current_size, n_bytes,
5996 			read_only, NULL, NULL);
5997 #endif /* UNIV_HOTBACKUP */
5998 
5999 		if (err != DB_SUCCESS) {
6000 
6001 			ut_free(buf2);
6002 			return(false);
6003 		}
6004 
6005 		/* Print about progress for each 100 MB written */
6006 		if ((current_size + n_bytes) / (100 << 20)
6007 		    != current_size / (100 << 20)) {
6008 
6009 			fprintf(stderr, " %lu00",
6010 				(ulong) ((current_size + n_bytes)
6011 					 / (100 << 20)));
6012 		}
6013 
6014 		current_size += n_bytes;
6015 	}
6016 
6017 	if (size >= (os_offset_t) 100 << 20) {
6018 
6019 		fprintf(stderr, "\n");
6020 	}
6021 
6022 	ut_free(buf2);
6023 
6024 	return(os_file_flush(file));
6025 }
6026 
6027 /** Truncates a file to a specified size in bytes.
6028 Do nothing if the size to preserve is greater or equal to the current
6029 size of the file.
6030 @param[in]	pathname	file path
6031 @param[in]	file		file to be truncated
6032 @param[in]	size		size to preserve in bytes
6033 @return true if success */
6034 bool
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)6035 os_file_truncate(
6036 	const char*	pathname,
6037 	pfs_os_file_t	file,
6038 	os_offset_t	size)
6039 {
6040 	/* Do nothing if the size preserved is larger than or equal to the
6041 	current size of file */
6042 	os_offset_t	size_bytes = os_file_get_size(file);
6043 
6044 	if (size >= size_bytes) {
6045 		return(true);
6046 	}
6047 
6048 #ifdef _WIN32
6049 	return(os_file_truncate_win32(pathname, file, size));
6050 #else /* _WIN32 */
6051 	return(os_file_truncate_posix(pathname, file, size));
6052 #endif /* _WIN32 */
6053 }
6054 
6055 /** NOTE! Use the corresponding macro os_file_read(), not directly this
6056 function!
6057 Requests a synchronous positioned read operation.
6058 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6059 @param[in]	type		IO flags
6060 @param[in]	file		handle to an open file
6061 @param[out]	buf		buffer where to read
6062 @param[in]	offset		file offset from the start where to read
6063 @param[in]	n		number of bytes to read, starting from offset
6064 @return DB_SUCCESS or error code */
6065 dberr_t
os_file_read_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)6066 os_file_read_func(
6067 	IORequest&	type,
6068 	os_file_t	file,
6069 	void*		buf,
6070 	os_offset_t	offset,
6071 	ulint		n)
6072 {
6073 	ut_ad(type.is_read());
6074 
6075 	return(os_file_read_page(type, file, buf, offset, n, NULL, true));
6076 }
6077 
6078 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
6079 not directly this function!
6080 Requests a synchronous positioned read operation.
6081 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6082 @param[in]	type		IO flags
6083 @param[in]	file		handle to an open file
6084 @param[out]	buf		buffer where to read
6085 @param[in]	offset		file offset from the start where to read
6086 @param[in]	n		number of bytes to read, starting from offset
6087 @param[out]	o		number of bytes actually read
6088 @return DB_SUCCESS or error code */
6089 dberr_t
os_file_read_no_error_handling_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)6090 os_file_read_no_error_handling_func(
6091 	IORequest&	type,
6092 	os_file_t	file,
6093 	void*		buf,
6094 	os_offset_t	offset,
6095 	ulint		n,
6096 	ulint*		o)
6097 {
6098 	ut_ad(type.is_read());
6099 
6100 	return(os_file_read_page(type, file, buf, offset, n, o, false));
6101 }
6102 
6103 /** NOTE! Use the corresponding macro os_file_write(), not directly
6104 Requests a synchronous write operation.
6105 @param[in]	type		IO flags
6106 @param[in]	file		handle to an open file
6107 @param[out]	buf		buffer from which to write
6108 @param[in]	offset		file offset from the start where to read
6109 @param[in]	n		number of bytes to read, starting from offset
6110 @return DB_SUCCESS if request was successful, false if fail */
6111 dberr_t
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)6112 os_file_write_func(
6113 	IORequest&	type,
6114 	const char*	name,
6115 	os_file_t	file,
6116 	const void*	buf,
6117 	os_offset_t	offset,
6118 	ulint		n)
6119 {
6120 	ut_ad(type.validate());
6121 	ut_ad(type.is_write());
6122 
6123 	/* We never compress the first page.
6124 	Note: This assumes we always do block IO. */
6125 	if (offset == 0) {
6126 		type.clear_compressed();
6127 	}
6128 
6129 	const byte*	ptr = reinterpret_cast<const byte*>(buf);
6130 
6131 	return(os_file_write_page(type, name, file, ptr, offset, n));
6132 }
6133 
6134 /** Check the existence and type of the given file.
6135 @param[in]	path		path name of file
6136 @param[out]	exists		true if the file exists
6137 @param[out]	type		Type of the file, if it exists
6138 @return true if call succeeded */
6139 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)6140 os_file_status(
6141 	const char*	path,
6142 	bool*		exists,
6143 	os_file_type_t* type)
6144 {
6145 #ifdef _WIN32
6146 	return(os_file_status_win32(path, exists, type));
6147 #else
6148 	return(os_file_status_posix(path, exists, type));
6149 #endif /* _WIN32 */
6150 }
6151 
6152 /** Free storage space associated with a section of the file.
6153 @param[in]	fh		Open file handle
6154 @param[in]	off		Starting offset (SEEK_SET)
6155 @param[in]	len		Size of the hole
6156 @return DB_SUCCESS or error code */
6157 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)6158 os_file_punch_hole(
6159 	os_file_t	fh,
6160 	os_offset_t	off,
6161 	os_offset_t	len)
6162 {
6163 	/* In this debugging mode, we act as if punch hole is supported,
6164 	and then skip any calls to actually punch a hole here.
6165 	In this way, Transparent Page Compression is still being tested. */
6166 	DBUG_EXECUTE_IF("ignore_punch_hole",
6167 		return(DB_SUCCESS);
6168 	);
6169 
6170 #ifdef _WIN32
6171 	return(os_file_punch_hole_win32(fh, off, len));
6172 #else
6173 	return(os_file_punch_hole_posix(fh, off, len));
6174 #endif /* _WIN32 */
6175 }
6176 
6177 /** Check if the file system supports sparse files.
6178 
6179 Warning: On POSIX systems we try and punch a hole from offset 0 to
6180 the system configured page size. This should only be called on an empty
6181 file.
6182 
6183 Note: On Windows we use the name and on Unices we use the file handle.
6184 
6185 @param[in]	name		File name
6186 @param[in]	fh		File handle for the file - if opened
6187 @return true if the file system supports sparse files */
6188 bool
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)6189 os_is_sparse_file_supported(const char* path, pfs_os_file_t fh)
6190 {
6191 	/* In this debugging mode, we act as if punch hole is supported,
6192 	then we skip any calls to actually punch a hole.  In this way,
6193 	Transparent Page Compression is still being tested. */
6194 	DBUG_EXECUTE_IF("ignore_punch_hole",
6195 		return(true);
6196 	);
6197 
6198 #ifdef _WIN32
6199 	return(os_is_sparse_file_supported_win32(path));
6200 #else
6201 	dberr_t	err;
6202 
6203 	/* We don't know the FS block size, use the sector size. The FS
6204 	will do the magic. */
6205 	err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
6206 
6207 	return(err == DB_SUCCESS);
6208 #endif /* _WIN32 */
6209 }
6210 
6211 /** This function returns information about the specified file
6212 @param[in]	path		pathname of the file
6213 @param[out]	stat_info	information of a file in a directory
6214 @param[in]	check_rw_perm	for testing whether the file can be opened
6215 				in RW mode
6216 @param[in]	read_only	true if file is opened in read-only mode
6217 @return DB_SUCCESS if all OK */
6218 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)6219 os_file_get_status(
6220 	const char*	path,
6221 	os_file_stat_t* stat_info,
6222 	bool		check_rw_perm,
6223 	bool		read_only)
6224 {
6225 	dberr_t	ret;
6226 
6227 #ifdef _WIN32
6228 	struct _stat64	info;
6229 
6230 	ret = os_file_get_status_win32(
6231 		path, stat_info, &info, check_rw_perm, read_only);
6232 
6233 #else
6234 	struct stat	info;
6235 
6236 	ret = os_file_get_status_posix(
6237 		path, stat_info, &info, check_rw_perm, read_only);
6238 
6239 #endif /* _WIN32 */
6240 
6241 	if (ret == DB_SUCCESS) {
6242 		stat_info->ctime = info.st_ctime;
6243 		stat_info->atime = info.st_atime;
6244 		stat_info->mtime = info.st_mtime;
6245 		stat_info->size  = info.st_size;
6246 	}
6247 
6248 	return(ret);
6249 }
6250 
6251 /**
6252 Waits for an AIO operation to complete. This function is used to wait the
6253 for completed requests. The aio array of pending requests is divided
6254 into segments. The thread specifies which segment or slot it wants to wait
6255 for. NOTE: this function will also take care of freeing the aio slot,
6256 therefore no other thread is allowed to do the freeing!
6257 @param[in]	segment		The number of the segment in the aio arrays to
6258 				wait for; segment 0 is the ibuf I/O thread,
6259 				segment 1 the log I/O thread, then follow the
6260 				non-ibuf read threads, and as the last are the
6261 				non-ibuf write threads; if this is
6262 				ULINT_UNDEFINED, then it means that sync AIO
6263 				is used, and this parameter is ignored
6264 @param[out]	m1		the messages passed with the AIO request; note
6265 				that also in the case where the AIO operation
6266 				failed, these output parameters are valid and
6267 				can be used to restart the operation,
6268 				for example
6269 @param[out]	m2		callback message
6270 @param[out]	type		OS_FILE_WRITE or ..._READ
6271 @return DB_SUCCESS or error code */
6272 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6273 os_aio_handler(
6274 	ulint		segment,
6275 	fil_node_t**	m1,
6276 	void**		m2,
6277 	IORequest*	request)
6278 {
6279 	dberr_t	err;
6280 
6281 	if (srv_use_native_aio) {
6282 		srv_set_io_thread_op_info(segment, "native aio handle");
6283 
6284 #ifdef WIN_ASYNC_IO
6285 
6286 		err = os_aio_windows_handler(segment, 0, m1, m2, request);
6287 
6288 #elif defined(LINUX_NATIVE_AIO)
6289 
6290 		err = os_aio_linux_handler(segment, m1, m2, request);
6291 
6292 #else
6293 		ut_error;
6294 
6295 		err = DB_ERROR; /* Eliminate compiler warning */
6296 
6297 #endif /* WIN_ASYNC_IO */
6298 
6299 	} else {
6300 		srv_set_io_thread_op_info(segment, "simulated aio handle");
6301 
6302 		err = os_aio_simulated_handler(segment, m1, m2, request);
6303 	}
6304 
6305 	return(err);
6306 }
6307 
/** Constructor. Sizes the slot array, creates the mutex and the
"not full" / "is empty" events, and leaves the "is empty" event set.
@param[in]	id		The latch ID used when creating m_mutex
@param[in]	n		Number of AIO slots; must be greater than 0
@param[in]	segments	Number of segments; must be greater than 0 */
AIO::AIO(
	latch_id_t	id,
	ulint		n,
	ulint		segments)
	:
	m_slots(n),
	m_n_segments(segments),
	m_n_reserved()
# ifdef LINUX_NATIVE_AIO
	,m_aio_ctx(),
	m_events(m_slots.size())
# elif defined(_WIN32)
	,m_handles()
# endif /* LINUX_NATIVE_AIO */
{
	ut_a(n > 0);
	ut_a(m_n_segments > 0);

	mutex_create(id, &m_mutex);

	m_not_full = os_event_create("aio_not_full");
	m_is_empty = os_event_create("aio_is_empty");

	/* NOTE(review): m_slots was already given n elements by its
	initializer above; this constructs copies of a default Slot over
	those elements - confirm that this is intended. */
	std::uninitialized_fill(m_slots.begin(), m_slots.end(), Slot());
#ifdef LINUX_NATIVE_AIO
	/* Start with all completion event records zeroed. */
	memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
#endif /* LINUX_NATIVE_AIO */

	/* Signal the "is empty" event right away; presumably because no
	slot is reserved yet - verify against the users of the event. */
	os_event_set(m_is_empty);
}
6342 
6343 /** Initialise the slots */
6344 dberr_t
init_slots()6345 AIO::init_slots()
6346 {
6347 	for (ulint i = 0; i < m_slots.size(); ++i) {
6348 		Slot&	slot = m_slots[i];
6349 
6350 		slot.pos = static_cast<uint16_t>(i);
6351 
6352 		slot.is_reserved = false;
6353 
6354 #ifdef WIN_ASYNC_IO
6355 
6356 		slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6357 
6358 		OVERLAPPED*	over = &slot.control;
6359 
6360 		over->hEvent = slot.handle;
6361 
6362 		(*m_handles)[i] = over->hEvent;
6363 
6364 #elif defined(LINUX_NATIVE_AIO)
6365 
6366 		slot.ret = 0;
6367 
6368 		slot.n_bytes = 0;
6369 
6370 		memset(&slot.control, 0x0, sizeof(slot.control));
6371 
6372 #endif /* WIN_ASYNC_IO */
6373 	}
6374 
6375 	return(DB_SUCCESS);
6376 }
6377 
6378 #ifdef LINUX_NATIVE_AIO
6379 /** Initialise the Linux Native AIO interface */
6380 dberr_t
init_linux_native_aio()6381 AIO::init_linux_native_aio()
6382 {
6383 	/* Initialize the io_context array. One io_context
6384 	per segment in the array. */
6385 
6386 	ut_a(m_aio_ctx == NULL);
6387 
6388 	m_aio_ctx = static_cast<io_context**>(
6389 		ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));
6390 
6391 	if (m_aio_ctx == NULL) {
6392 		return(DB_OUT_OF_MEMORY);
6393 	}
6394 
6395 	io_context**	ctx = m_aio_ctx;
6396 	ulint		max_events = slots_per_segment();
6397 
6398 	for (ulint i = 0; i < m_n_segments; ++i, ++ctx) {
6399 
6400 		if (!linux_create_io_ctx(max_events, ctx)) {
6401 			/* If something bad happened during aio setup
6402 			we should call it a day and return right away.
6403 			We don't care about any leaks because a failure
6404 			to initialize the io subsystem means that the
6405 			server (or atleast the innodb storage engine)
6406 			is not going to startup. */
6407 			return(DB_IO_ERROR);
6408 		}
6409 	}
6410 
6411 	return(DB_SUCCESS);
6412 }
6413 #endif /* LINUX_NATIVE_AIO */
6414 
6415 /** Initialise the array */
6416 dberr_t
init()6417 AIO::init()
6418 {
6419 	ut_a(!m_slots.empty());
6420 
6421 #ifdef _WIN32
6422 	ut_a(m_handles == NULL);
6423 
6424 	m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
6425 #endif /* _WIN32 */
6426 
6427 	if (srv_use_native_aio) {
6428 #ifdef LINUX_NATIVE_AIO
6429 		dberr_t	err = init_linux_native_aio();
6430 
6431 		if (err != DB_SUCCESS) {
6432 			return(err);
6433 		}
6434 
6435 #endif /* LINUX_NATIVE_AIO */
6436 	}
6437 
6438 	return(init_slots());
6439 }
6440 
6441 /** Creates an aio wait array. Note that we return NULL in case of failure.
6442 We don't care about freeing memory here because we assume that a
6443 failure will result in server refusing to start up.
6444 @param[in]	id		Latch ID
6445 @param[in]	n		maximum number of pending AIO operations
6446 				allowed; n must be divisible by m_n_segments
6447 @param[in]	n_segments	number of segments in the AIO array
6448 @return own: AIO array, NULL on failure */
6449 AIO*
create(latch_id_t id,ulint n,ulint n_segments)6450 AIO::create(
6451 	latch_id_t	id,
6452 	ulint		n,
6453 	ulint		n_segments)
6454 {
6455 	if ((n % n_segments)) {
6456 
6457 		ib::error()
6458 			<< "Maximum number of AIO operations must be "
6459 			<< "divisible by number of segments";
6460 
6461 		return(NULL);
6462 	}
6463 
6464 	AIO*	array = UT_NEW_NOKEY(AIO(id, n, n_segments));
6465 
6466 	if (array != NULL && array->init() != DB_SUCCESS) {
6467 
6468 		UT_DELETE(array);
6469 
6470 		array = NULL;
6471 	}
6472 
6473 	return(array);
6474 }
6475 
6476 /** AIO destructor */
~AIO()6477 AIO::~AIO()
6478 {
6479 #ifdef WIN_ASYNC_IO
6480 	for (ulint i = 0; i < m_slots.size(); ++i) {
6481 		CloseHandle(m_slots[i].handle);
6482 	}
6483 #endif /* WIN_ASYNC_IO */
6484 
6485 #ifdef _WIN32
6486 	UT_DELETE(m_handles);
6487 #endif /* _WIN32 */
6488 
6489 	mutex_destroy(&m_mutex);
6490 
6491 	os_event_destroy(m_not_full);
6492 	os_event_destroy(m_is_empty);
6493 
6494 #if defined(LINUX_NATIVE_AIO)
6495 	if (srv_use_native_aio) {
6496 		m_events.clear();
6497 		ut_free(m_aio_ctx);
6498 	}
6499 #endif /* LINUX_NATIVE_AIO */
6500 
6501 	m_slots.clear();
6502 }
6503 
/** Initializes the asynchronous io system. Creates one array each for ibuf
and log i/o (skipped in read-only mode). Also creates one array each for
read and write where each array is divided logically into n_readers and
n_writers segments respectively. The caller must create an i/o handler
thread for each segment in these arrays. This function also creates the
sync array; no i/o handler thread needs to be created for that.
On failure the function returns immediately; arrays created up to that
point are not freed here.
@param[in]	n_per_seg	maximum number of pending aio
				operations allowed per segment
@param[in]	n_readers	number of reader threads
@param[in]	n_writers	number of writer threads
@param[in]	n_slots_sync	number of slots in the sync aio array
@return true if the AIO sub-system was started successfully */
bool
AIO::start(
	ulint		n_per_seg,
	ulint		n_readers,
	ulint		n_writers,
	ulint		n_slots_sync)
{
#if defined(LINUX_NATIVE_AIO)
	/* Check if native aio is supported on this system and tmpfs.
	If it is not, fall back to simulated AIO for the whole server. */
	if (srv_use_native_aio && !is_linux_native_aio_supported()) {

		ib::warn() << "Linux Native AIO disabled.";

		srv_use_native_aio = FALSE;
	}
#endif /* LINUX_NATIVE_AIO */

	srv_reset_io_thread_op_info();

	s_reads = create(
		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);

	if (s_reads == NULL) {
		return(false);
	}

	/* Global segment number of the first read segment: the read
	segments come first in read-only mode, otherwise they follow the
	ibuf and log segments. */
	ulint	start = srv_read_only_mode ? 0 : 2;
	ulint	n_segs = n_readers + start;

	/* 0 is the ibuf segment and 1 is the redo log segment. */
	for (ulint i = start; i < n_segs; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "read thread";
	}

	/* Running total of global segments created so far. */
	ulint	n_segments = n_readers;

	if (!srv_read_only_mode) {

		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);

		if (s_ibuf == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[0] = "insert buffer thread";

		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);

		if (s_log == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[1] = "log thread";

	} else {
		/* No ibuf or log array exists in read-only mode. */
		s_ibuf = s_log = NULL;
	}

	s_writes = create(
		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);

	if (s_writes == NULL) {
		return(false);
	}

	n_segments += n_writers;

	/* The write segments follow directly after the read segments. */
	for (ulint i = start + n_readers; i < n_segments; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "write thread";
	}

	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));

	/* The sync array is a single segment and is not counted in
	n_segments. */
	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);

	if (s_sync == NULL) {

		return(false);
	}

	/* NOTE(review): the segment count is published before the wait
	event array below is allocated; if that allocation fails the count
	stays non-zero while os_aio_segment_wait_events is NULL. Confirm
	that os_aio_free() cannot be reached in that state. */
	os_aio_n_segments = n_segments;

	os_aio_validate();

	/* One wait event per global segment. */
	os_aio_segment_wait_events = static_cast<os_event_t*>(
		ut_zalloc_nokey(
			n_segments * sizeof *os_aio_segment_wait_events));

	if (os_aio_segment_wait_events == NULL) {

		return(false);
	}

	for (ulint i = 0; i < n_segments; ++i) {
		os_aio_segment_wait_events[i] = os_event_create(0);
	}

	os_last_printout = ut_time_monotonic();

	return(true);
}
6623 
6624 /** Free the AIO arrays */
6625 void
shutdown()6626 AIO::shutdown()
6627 {
6628 	UT_DELETE(s_ibuf);
6629 	s_ibuf = NULL;
6630 
6631 	UT_DELETE(s_log);
6632 	s_log = NULL;
6633 
6634 	UT_DELETE(s_writes);
6635 	s_writes = NULL;
6636 
6637 	UT_DELETE(s_sync);
6638 	s_sync = NULL;
6639 
6640 	UT_DELETE(s_reads);
6641 	s_reads = NULL;
6642 }
6643 
6644 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6645 
6646 /** Max disk sector size */
6647 static const ulint	MAX_SECTOR_SIZE = 4096;
6648 
6649 /**
6650 Try and get the FusionIO sector size. */
6651 void
os_fusionio_get_sector_size()6652 os_fusionio_get_sector_size()
6653 {
6654 	if (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
6655 	    || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC) {
6656 		ulint		sector_size = UNIV_SECTOR_SIZE;
6657 		char*		path = srv_data_home;
6658 		os_file_t	check_file;
6659 		byte*		ptr;
6660 		byte*		block_ptr;
6661 		char		current_dir[3];
6662 		char*		dir_end;
6663 		ulint		dir_len;
6664 		ulint		check_path_len;
6665 		char*		check_file_name;
6666 		ssize_t		ret;
6667 
6668 		/* If the srv_data_home is empty, set the path to
6669 		current dir. */
6670 		if (*path == 0) {
6671 			current_dir[0] = FN_CURLIB;
6672 			current_dir[1] = FN_LIBCHAR;
6673 			current_dir[2] = 0;
6674 			path = current_dir;
6675 		}
6676 
6677 		/* Get the path of data file */
6678 		dir_end = strrchr(path, OS_PATH_SEPARATOR);
6679 		dir_len = dir_end? dir_end - path : strlen(path);
6680 
6681 		/* allocate a new path and move the directory path to it. */
6682 		check_path_len = dir_len + sizeof "/check_sector_size";
6683 		check_file_name = static_cast<char*>(
6684 			ut_zalloc_nokey(check_path_len));
6685 		memcpy(check_file_name, path, dir_len);
6686 
6687 		/* Construct a check file name. */
6688 		strcat(check_file_name + dir_len, "/check_sector_size");
6689 
6690 		/* Create a tmp file for checking sector size. */
6691 		check_file = ::open(check_file_name,
6692 				    O_CREAT|O_TRUNC|O_WRONLY|O_DIRECT,
6693 				    S_IRWXU);
6694 
6695 		if (check_file == -1) {
6696 			ib::error()
6697 				<< "Failed to create check sector file, errno:"
6698 				<< errno << " Please confirm O_DIRECT is"
6699 				<< " supported and remove the file "
6700 				<< check_file_name << " if it exists.";
6701 			ut_free(check_file_name);
6702 			errno = 0;
6703 			return;
6704 		}
6705 
6706 		/* Try to write the file with different sector size
6707 		alignment. */
6708 		ptr = static_cast<byte*>(ut_malloc_nokey(2 * MAX_SECTOR_SIZE));
6709 
6710 		while (sector_size <= MAX_SECTOR_SIZE) {
6711 			block_ptr = static_cast<byte*>(
6712 				ut_align(ptr, sector_size));
6713 			ret = pwrite(check_file, block_ptr,
6714 				    sector_size, 0);
6715 			if (ret > 0 && (ulint) ret == sector_size) {
6716 				break;
6717 			}
6718 			sector_size *= 2;
6719 		}
6720 
6721 		/* The sector size should <= MAX_SECTOR_SIZE. */
6722 		ut_ad(sector_size <= MAX_SECTOR_SIZE);
6723 
6724 		close(check_file);
6725 		unlink(check_file_name);
6726 
6727 		ut_free(check_file_name);
6728 		ut_free(ptr);
6729 		errno = 0;
6730 
6731 		os_io_ptr_align = sector_size;
6732 	}
6733 }
6734 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6735 
6736 /** Initializes the asynchronous io system. Creates one array each for ibuf
6737 and log i/o. Also creates one array each for read and write where each
6738 array is divided logically into n_readers and n_writers
6739 respectively. The caller must create an i/o handler thread for each
6740 segment in these arrays. This function also creates the sync array.
6741 No i/o handler thread needs to be created for that
6742 @param[in]	n_readers	number of reader threads
6743 @param[in]	n_writers	number of writer threads
6744 @param[in]	n_slots_sync	number of slots in the sync aio array */
6745 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6746 os_aio_init(
6747 	ulint		n_readers,
6748 	ulint		n_writers,
6749 	ulint		n_slots_sync)
6750 {
6751 	/* Maximum number of pending aio operations allowed per segment */
6752 	ulint		limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6753 
6754 #ifdef _WIN32
6755 	if (srv_use_native_aio) {
6756 		limit = SRV_N_PENDING_IOS_PER_THREAD;
6757 	}
6758 #endif /* _WIN32 */
6759 
6760 	ut_a(block_cache == NULL);
6761 
6762 	block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
6763 
6764 	for (Blocks::iterator it = block_cache->begin();
6765 	     it != block_cache->end();
6766 	     ++it) {
6767 
6768 		ut_a(it->m_in_use == 0);
6769 		ut_a(it->m_ptr == NULL);
6770 
6771 		/* Allocate double of max page size memory, since
6772 		compress could generate more bytes than orgininal
6773 		data. */
6774 		it->m_ptr = static_cast<byte*>(
6775 			ut_malloc_nokey(BUFFER_BLOCK_SIZE));
6776 
6777 		ut_a(it->m_ptr != NULL);
6778 	}
6779 
6780 	/* Get sector size for DIRECT_IO. In this case, we need to
6781 	know the sector size for aligning the write buffer. */
6782 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6783 	os_fusionio_get_sector_size();
6784 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6785 
6786 	return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
6787 }
6788 
6789 /** Frees the asynchronous io system. */
6790 void
os_aio_free()6791 os_aio_free()
6792 {
6793 	AIO::shutdown();
6794 
6795 	for (ulint i = 0; i < os_aio_n_segments; i++) {
6796 		os_event_destroy(os_aio_segment_wait_events[i]);
6797 	}
6798 
6799 	ut_free(os_aio_segment_wait_events);
6800 	os_aio_segment_wait_events = 0;
6801 	os_aio_n_segments = 0;
6802 
6803 	for (Blocks::iterator it = block_cache->begin();
6804 	     it != block_cache->end();
6805 	     ++it) {
6806 
6807 		ut_a(it->m_in_use == 0);
6808 		ut_free(it->m_ptr);
6809 	}
6810 
6811 	UT_DELETE(block_cache);
6812 
6813 	block_cache = NULL;
6814 }
6815 
6816 /** Wakes up all async i/o threads so that they know to exit themselves in
6817 shutdown. */
6818 void
os_aio_wake_all_threads_at_shutdown()6819 os_aio_wake_all_threads_at_shutdown()
6820 {
6821 #ifdef WIN_ASYNC_IO
6822 
6823 	AIO::wake_at_shutdown();
6824 
6825 #elif defined(LINUX_NATIVE_AIO)
6826 
6827 	/* When using native AIO interface the io helper threads
6828 	wait on io_getevents with a timeout value of 500ms. At
6829 	each wake up these threads check the server status.
6830 	No need to do anything to wake them up. */
6831 
6832 	if (srv_use_native_aio) {
6833 		return;
6834 	}
6835 
6836 #endif /* !WIN_ASYNC_AIO */
6837 
6838 	/* Fall through to simulated AIO handler wakeup if we are
6839 	not using native AIO. */
6840 
6841 	/* This loop wakes up all simulated ai/o threads */
6842 
6843 	for (ulint i = 0; i < os_aio_n_segments; ++i) {
6844 
6845 		os_event_set(os_aio_segment_wait_events[i]);
6846 	}
6847 }
6848 
/** Waits until there are no pending writes in AIO::s_writes. There can
be other, synchronous, pending writes. This is a free-function wrapper
that simply forwards to AIO::wait_until_no_pending_writes(). */
void
os_aio_wait_until_no_pending_writes()
{
	AIO::wait_until_no_pending_writes();
}
6856 
6857 /** Calculates segment number for a slot.
6858 @param[in]	array		AIO wait array
6859 @param[in]	slot		slot in this array
6860 @return segment number (which is the number used by, for example,
6861 	I/O-handler threads) */
6862 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)6863 AIO::get_segment_no_from_slot(
6864 	const AIO*	array,
6865 	const Slot*	slot)
6866 {
6867 	ulint	segment;
6868 	ulint	seg_len;
6869 
6870 	if (array == s_ibuf) {
6871 		ut_ad(!srv_read_only_mode);
6872 
6873 		segment = IO_IBUF_SEGMENT;
6874 
6875 	} else if (array == s_log) {
6876 		ut_ad(!srv_read_only_mode);
6877 
6878 		segment = IO_LOG_SEGMENT;
6879 
6880 	} else if (array == s_reads) {
6881 		seg_len = s_reads->slots_per_segment();
6882 
6883 		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6884 	} else {
6885 		ut_a(array == s_writes);
6886 
6887 		seg_len = s_writes->slots_per_segment();
6888 
6889 		segment = s_reads->m_n_segments
6890 			+ (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6891 	}
6892 
6893 	return(segment);
6894 }
6895 
/** Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled. The chosen slot is marked reserved and
filled in with the request. For native AIO writes at a non-zero offset the
page is additionally compressed and/or encrypted here (the array mutex is
released while that work is done), and the platform specific control
block is prepared for submission.

@param[in,out]	type		IO context
@param[in,out]	m1		message to be passed along with the AIO
				operation
@param[in,out]	m2		message to be passed along with the AIO
				operation
@param[in]	file		file handle
@param[in]	name		name of the file or path as a NUL-terminated
				string
@param[in,out]	buf		buffer where to read or from which to write
@param[in]	offset		file offset, where to read from or start writing
@param[in]	len		length of the block to read or write
@return pointer to slot */
Slot*
AIO::reserve_slot(
	IORequest&	type,
	fil_node_t*	m1,
	void*		m2,
	pfs_os_file_t	file,
	const char*	name,
	void*		buf,
	os_offset_t	offset,
	ulint		len)
{
#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits: it is stored as a DWORD. */
	ut_a((len & 0xFFFFFFFFUL) == len);
#endif /* WIN_ASYNC_IO */

	/* No need of a mutex. Only reading constant fields */
	ulint		slots_per_seg;

	ut_ad(type.validate());

	slots_per_seg = slots_per_segment();

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO */
	ulint		local_seg;

	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;

	/* Wait until at least one slot is free. The loop is left with
	the array mutex held. */
	for (;;) {

		acquire();

		if (m_n_reserved != m_slots.size()) {
			break;
		}

		release();

		if (!srv_use_native_aio) {
			/* If the handler threads are suspended,
			wake them so that we get more slots */

			os_aio_simulated_wake_handler_threads();
		}

		os_event_wait(m_not_full);
	}

	ulint	counter = 0;
	Slot*	slot = NULL;

	/* We start our search for an available slot from our preferred
	local segment and do a full scan of the array. We are
	guaranteed to find a slot in full scan. */
	for (ulint i = local_seg * slots_per_seg;
	     counter < m_slots.size();
	     ++i, ++counter) {

		/* Wrap around at the end of the array. */
		i %= m_slots.size();

		slot = at(i);

		if (slot->is_reserved == false) {
			break;
		}
	}

	/* We MUST always be able to get hold of a free (not yet
	reserved) slot. */
	ut_a(counter < m_slots.size());

	ut_a(slot->is_reserved == false);

	++m_n_reserved;

	/* First reservation: the array is no longer empty. */
	if (m_n_reserved == 1) {
		os_event_reset(m_is_empty);
	}

	/* Last free slot taken: later callers must wait. */
	if (m_n_reserved == m_slots.size()) {
		os_event_reset(m_not_full);
	}

	slot->is_reserved = true;
	slot->reservation_time = ut_time_monotonic();
	slot->m1       = m1;
	slot->m2       = m2;
	slot->file     = file;
	slot->name     = name;
#ifdef _WIN32
	slot->len      = static_cast<DWORD>(len);
#else
	slot->len      = static_cast<ulint>(len);
#endif /* _WIN32 */
	slot->type     = type;
	slot->buf      = static_cast<byte*>(buf);
	slot->ptr      = slot->buf;
	slot->offset   = offset;
	slot->err      = DB_SUCCESS;
	slot->original_len = static_cast<uint32>(len);
	slot->io_already_done = false;
	slot->buf_block = NULL;

	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_compressed()) {
		ulint	compressed_len = len;

		ut_ad(!type.is_log());

		/* The slot is already marked reserved, so the mutex is
		dropped while the page is being compressed. */
		release();

		/* NOTE(review): src_buf and compressed_len are read back
		after the call, so os_file_compress_page() presumably
		updates both to describe the compressed data; confirm
		against its declaration. */
		void* src_buf = slot->buf;
		slot->buf_block = os_file_compress_page(
			type,
			src_buf,
			&compressed_len);

		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;
#ifdef _WIN32
		slot->len = static_cast<DWORD>(compressed_len);
#else
		slot->len = static_cast<ulint>(compressed_len);
#endif /* _WIN32 */
		slot->skip_punch_hole = !type.punch_hole();

		acquire();
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_encrypted()) {
		ulint		encrypted_len = slot->len;
		Block*		encrypted_block;

		ut_ad(!type.is_log());

		/* As above: encrypt without holding the array mutex. */
		release();

		void* src_buf = slot->buf;
		encrypted_block = os_file_encrypt_page(
			type,
			src_buf,
			&encrypted_len);

		/* A block left over from the compression step is no
		longer needed once the encrypted block replaces it. */
		if (slot->buf_block != NULL) {
			os_free_block(slot->buf_block);
		}

		slot->buf_block = encrypted_block;
		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;

#ifdef _WIN32
		slot->len = static_cast<DWORD>(encrypted_len);
#else
		slot->len = static_cast<ulint>(encrypted_len);
#endif /* _WIN32 */

		acquire();
        }

#ifdef WIN_ASYNC_IO
	{
		OVERLAPPED*	control;

		/* Split the 64-bit file offset into the two 32-bit
		halves of the OVERLAPPED structure and make sure the
		slot's event is not signaled. */
		control = &slot->control;
		control->Offset = (DWORD) offset & 0xFFFFFFFF;
		control->OffsetHigh = (DWORD) (offset >> 32);

		ResetEvent(slot->handle);
	}
#elif defined(LINUX_NATIVE_AIO)

	/* If we are not using native AIO skip this part. */
	if (srv_use_native_aio) {

		off_t		aio_offset;

		/* Check if we are dealing with 64 bit arch.
		If not then make sure that offset fits in 32 bits. */
		aio_offset = (off_t) offset;

		ut_a(sizeof(aio_offset) >= sizeof(offset)
		     || ((os_offset_t) aio_offset) == offset);

		struct iocb*	iocb = &slot->control;

		/* Prepare the control block for a read or a write of
		slot->len bytes at aio_offset. */
		if (type.is_read()) {
			io_prep_pread(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		} else {
			ut_ad(type.is_write());
			io_prep_pwrite(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		}

		/* Lets the slot be found again from the control block. */
		iocb->data = slot;

		slot->n_bytes = 0;
		slot->ret = 0;
	}
#endif /* LINUX_NATIVE_AIO */

	release();

	return(slot);
}
7125 
7126 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
7127 @param[in]	global_segment	The number of the segment in the AIO arrays */
7128 void
wake_simulated_handler_thread(ulint global_segment)7129 AIO::wake_simulated_handler_thread(ulint global_segment)
7130 {
7131 	ut_ad(!srv_use_native_aio);
7132 
7133 	AIO*	array;
7134 	ulint	segment = get_array_and_local_segment(&array, global_segment);
7135 
7136 	array->wake_simulated_handler_thread(global_segment, segment);
7137 }
7138 
7139 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
7140 for a local segment in the AIO array.
7141 @param[in]	global_segment	The number of the segment in the AIO arrays
7142 @param[in]	segment		The local segment in the AIO array */
7143 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)7144 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
7145 {
7146 	ut_ad(!srv_use_native_aio);
7147 
7148 	ulint	n = slots_per_segment();
7149 	ulint	offset = segment * n;
7150 
7151 	/* Look through n slots after the segment * n'th slot */
7152 
7153 	acquire();
7154 
7155 	const Slot*	slot = at(offset);
7156 
7157 	for (ulint i = 0; i < n; ++i, ++slot) {
7158 
7159 		if (slot->is_reserved) {
7160 
7161 			/* Found an i/o request */
7162 
7163 			release();
7164 
7165 			os_event_t	event;
7166 
7167 			event = os_aio_segment_wait_events[global_segment];
7168 
7169 			os_event_set(event);
7170 
7171 			return;
7172 		}
7173 	}
7174 
7175 	release();
7176 }
7177 
7178 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
7179 void
os_aio_simulated_wake_handler_threads()7180 os_aio_simulated_wake_handler_threads()
7181 {
7182 	if (srv_use_native_aio) {
7183 		/* We do not use simulated aio: do nothing */
7184 
7185 		return;
7186 	}
7187 
7188 	os_aio_recommend_sleep_for_read_threads	= false;
7189 
7190 	for (ulint i = 0; i < os_aio_n_segments; i++) {
7191 		AIO::wake_simulated_handler_thread(i);
7192 	}
7193 }
7194 
7195 /** Select the IO slot array
7196 @param[in]	type		Type of IO, READ or WRITE
7197 @param[in]	read_only	true if running in read-only mode
7198 @param[in]	mode		IO mode
7199 @return slot array or NULL if invalid mode specified */
7200 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)7201 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
7202 {
7203 	AIO*	array;
7204 
7205 	ut_ad(type.validate());
7206 
7207 	switch (mode) {
7208 	case OS_AIO_NORMAL:
7209 
7210 		array = type.is_read() ? AIO::s_reads : AIO::s_writes;
7211 		break;
7212 
7213 	case OS_AIO_IBUF:
7214 		ut_ad(type.is_read());
7215 
7216 		/* Reduce probability of deadlock bugs in connection with ibuf:
7217 		do not let the ibuf i/o handler sleep */
7218 
7219 		type.clear_do_not_wake();
7220 
7221 		array = read_only ? AIO::s_reads : AIO::s_ibuf;
7222 		break;
7223 
7224 	case OS_AIO_LOG:
7225 
7226 		array = read_only ? AIO::s_reads : AIO::s_log;
7227 		break;
7228 
7229 	case OS_AIO_SYNC:
7230 
7231 		array = AIO::s_sync;
7232 #if defined(LINUX_NATIVE_AIO)
7233 		/* In Linux native AIO we don't use sync IO array. */
7234 		ut_a(!srv_use_native_aio);
7235 #endif /* LINUX_NATIVE_AIO */
7236 		break;
7237 
7238 	default:
7239 		ut_error;
7240 		array = NULL; /* Eliminate compiler warning */
7241 	}
7242 
7243 	return(array);
7244 }
7245 
7246 #ifdef WIN_ASYNC_IO
7247 /** This function is only used in Windows asynchronous i/o.
7248 Waits for an aio operation to complete. This function is used to wait the
7249 for completed requests. The aio array of pending requests is divided
7250 into segments. The thread specifies which segment or slot it wants to wait
7251 for. NOTE: this function will also take care of freeing the aio slot,
7252 therefore no other thread is allowed to do the freeing!
7253 @param[in]	segment		The number of the segment in the aio arrays to
7254 				wait for; segment 0 is the ibuf I/O thread,
7255 				segment 1 the log I/O thread, then follow the
7256 				non-ibuf read threads, and as the last are the
7257 				non-ibuf write threads; if this is
7258 				ULINT_UNDEFINED, then it means that sync AIO
7259 				is used, and this parameter is ignored
7260 @param[in]	pos		this parameter is used only in sync AIO:
7261 				wait for the aio slot at this position
7262 @param[out]	m1		the messages passed with the AIO request; note
7263 				that also in the case where the AIO operation
7264 				failed, these output parameters are valid and
7265 				can be used to restart the operation,
7266 				for example
7267 @param[out]	m2		callback message
7268 @param[out]	type		OS_FILE_WRITE or ..._READ
7269 @return DB_SUCCESS or error code */
7270 static
7271 dberr_t
os_aio_windows_handler(ulint segment,ulint pos,fil_node_t ** m1,void ** m2,IORequest * type)7272 os_aio_windows_handler(
7273 	ulint		segment,
7274 	ulint		pos,
7275 	fil_node_t**	m1,
7276 	void**		m2,
7277 	IORequest*	type)
7278 {
7279 	Slot*		slot;
7280 	dberr_t		err;
7281 	AIO*		array;
7282 	ulint		orig_seg = segment;
7283 
7284 	if (segment == ULINT_UNDEFINED) {
7285 		segment = 0;
7286 		array = AIO::sync_array();
7287 	} else {
7288 		segment = AIO::get_array_and_local_segment(&array, segment);
7289 	}
7290 
7291 	/* NOTE! We only access constant fields in os_aio_array. Therefore
7292 	we do not have to acquire the protecting mutex yet */
7293 
7294 	ut_ad(os_aio_validate_skip());
7295 
7296 	if (array == AIO::sync_array()) {
7297 
7298 		WaitForSingleObject(array->at(pos)->handle, INFINITE);
7299 
7300 	} else {
7301 		if (orig_seg != ULINT_UNDEFINED) {
7302 			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
7303 		}
7304 
7305 		pos = WaitForMultipleObjects(
7306 			(DWORD) array->slots_per_segment(),
7307 			array->handles(segment),
7308 			FALSE, INFINITE);
7309 	}
7310 
7311 	array->acquire();
7312 
7313 	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
7314 	    && array->is_empty()
7315 	    && !buf_page_cleaner_is_active) {
7316 
7317 		*m1 = NULL;
7318 		*m2 = NULL;
7319 
7320 		array->release();
7321 
7322 		return(DB_SUCCESS);
7323 	}
7324 
7325 	ulint	n = array->slots_per_segment();
7326 
7327 	ut_a(pos >= WAIT_OBJECT_0 && pos <= WAIT_OBJECT_0 + n);
7328 
7329 	slot = array->at(pos + segment * n);
7330 
7331 	ut_a(slot->is_reserved);
7332 
7333 	if (orig_seg != ULINT_UNDEFINED) {
7334 		srv_set_io_thread_op_info(
7335 			orig_seg, "get windows aio return value");
7336 	}
7337 
7338 	BOOL	ret;
7339 	ret = GetOverlappedResult(
7340 		slot->file.m_file, &slot->control, &slot->n_bytes, TRUE);
7341 	*m1 = slot->m1;
7342 	*m2 = slot->m2;
7343 
7344 	*type = slot->type;
7345 
7346 	BOOL	retry = FALSE;
7347 
7348 	if (ret && slot->n_bytes == slot->len) {
7349 
7350 		err = DB_SUCCESS;
7351 
7352 	} else if (os_file_handle_error(slot->name, "Windows aio")) {
7353 
7354 		retry = true;
7355 
7356 	} else {
7357 
7358 		err = DB_IO_ERROR;
7359 	}
7360 
7361 	array->release();
7362 
7363 	if (retry) {
7364 		/* Retry failed read/write operation synchronously.
7365 		No need to hold array->m_mutex. */
7366 
7367 #ifdef UNIV_PFS_IO
7368 		/* This read/write does not go through os_file_read
7369 		and os_file_write APIs, need to register with
7370 		performance schema explicitly here. */
7371 		struct PSI_file_locker* locker = NULL;
7372 		PSI_file_locker_state   state;
7373 		register_pfs_file_io_begin(
7374 			&state, locker, slot->file, slot->len,
7375 			slot->type.is_write()
7376 			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
7377 #endif /* UNIV_PFS_IO */
7378 
7379 		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
7380 
7381 		ssize_t	n_bytes = SyncFileIO::execute(slot);
7382 
7383 #ifdef UNIV_PFS_IO
7384 		register_pfs_file_io_end(locker, slot->len);
7385 #endif /* UNIV_PFS_IO */
7386 
7387 		if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
7388 			/* AIO was queued successfully!
7389 			We want a synchronous I/O operation on a
7390 			file where we also use async I/O: in Windows
7391 			we must use the same wait mechanism as for
7392 			async I/O */
7393 
7394 			BOOL	ret;
7395 			ret = GetOverlappedResult(
7396 				slot->file.m_file, &slot->control, &slot->n_bytes,
7397 				TRUE);
7398 			n_bytes = ret ? slot->n_bytes : -1;
7399 		}
7400 
7401 		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
7402 	}
7403 
7404 	if (err == DB_SUCCESS) {
7405 		err = AIOHandler::post_io_processing(slot);
7406 	}
7407 
7408 	array->release_with_mutex(slot);
7409 
7410 	return(err);
7411 }
7412 #endif /* WIN_ASYNC_IO */
7413 
/**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation. The function handles both reads and
writes, depending on what the request context reports.

A plain OS_AIO_SYNC request is executed right away with os_file_read_func()
or os_file_write_func() (when WIN_ASYNC_IO is defined, only if native AIO is
not in use). Every other request reserves a slot in the AIO array selected
for it; with native AIO the slot is dispatched to the operating system,
otherwise it stays in the array and, if type.is_wake(), the simulated handler
thread of the slot's segment is woken up.
@param[in]	type		IO request context
@param[in]	mode		IO mode
@param[in]	name		Name of the file or path as NUL terminated
				string
@param[in]	file		Open file handle
@param[in,out]	buf		buffer where to read to, or to write from
@param[in]	offset		file offset where to read or write
@param[in]	n		number of bytes to read or write
@param[in]	read_only	if true read only mode checks are enforced
@param[in,out]	m1		Message for the AIO handler, (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in,out]	m2		message for the AIO handler (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@return DB_SUCCESS or error code; DB_IO_ERROR when dispatching the request
failed and os_file_handle_error() did not ask for a retry */
dberr_t
os_aio_func(
	IORequest&	type,
	ulint		mode,
	const char*	name,
	pfs_os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	bool		read_only,
	fil_node_t*	m1,
	void*		m2)
{
#ifdef WIN_ASYNC_IO
	BOOL		ret = TRUE;
#endif /* WIN_ASYNC_IO */

	/* Debug-only sanity checks: a non-empty request whose length and
	offset are multiples of OS_MIN_LOG_BLOCK_SIZE. */
	ut_ad(n > 0);
	ut_ad((n % OS_MIN_LOG_BLOCK_SIZE) == 0);
	ut_ad((offset % OS_MIN_LOG_BLOCK_SIZE) == 0);
	ut_ad(os_aio_validate_skip());

#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits */
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */

	if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
	    && !srv_use_native_aio
#endif /* WIN_ASYNC_IO */
	    ) {
		/* This is actually an ordinary synchronous read or write:
		no need to use an i/o-handler thread. NOTE that if we use
		Windows async i/o, Windows does not allow us to use
		ordinary synchronous os_file_read etc. on the same file,
		therefore we have built a special mechanism for synchronous
		wait in the Windows case.
		Also note that the Performance Schema instrumentation has
		been performed by current os_aio_func()'s wrapper function
		pfs_os_aio_func(). So we would no longer need to call
		Performance Schema instrumented os_file_read() and
		os_file_write(). Instead, we should use os_file_read_func()
		and os_file_write_func() */

		if (type.is_read()) {
			return(os_file_read_func(type, file.m_file, buf, offset, n));
		}

		ut_ad(type.is_write());
		return(os_file_write_func(type, name, file.m_file, buf, offset, n));
	}

	/* Retry point: after a failed dispatch the slot is released at
	err_exit and, if os_file_handle_error() asks for it, control comes
	back here and a new slot is reserved. */
try_again:

	AIO*	array;

	array = AIO::select_slot_array(type, read_only, mode);

	Slot*	slot;

	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);

	if (type.is_read()) {

		if (srv_use_native_aio) {

			/* Read statistics are updated at dispatch time */
			++os_n_file_reads;

			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else if (type.is_wake()) {
			/* Simulated AIO: the request stays in its slot;
			wake the handler thread of the slot's segment. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else if (type.is_write()) {

		if (srv_use_native_aio) {
			++os_n_file_writes;

#ifdef WIN_ASYNC_IO
			ret = WriteFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */

		} else if (type.is_wake()) {
			/* Simulated AIO: the request stays in its slot;
			wake the handler thread of the slot's segment. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else {
		/* Neither a read nor a write */
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (srv_use_native_aio) {
		/* Accept the request if the call succeeded and reported the
		full length, or if it failed with ERROR_IO_PENDING. */
		if ((ret && slot->len == slot->n_bytes)
		     || (!ret && GetLastError() == ERROR_IO_PENDING)) {
			/* aio was queued successfully! */

			if (mode == OS_AIO_SYNC) {
				IORequest	dummy_type;
				void*		dummy_mess2;
				struct fil_node_t* dummy_mess1;

				/* We want a synchronous i/o operation on a
				file where we also use async i/o: in Windows
				we must use the same wait mechanism as for
				async i/o */

				return(os_aio_windows_handler(
					ULINT_UNDEFINED, slot->pos,
					&dummy_mess1, &dummy_mess2,
					&dummy_type));
			}

			return(DB_SUCCESS);
		}

		goto err_exit;
	}
#endif /* WIN_ASYNC_IO */

	/* AIO request was queued successfully! */
	return(DB_SUCCESS);

#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */

	/* Dispatching failed: give the slot back before deciding whether
	to retry. */
	array->release_with_mutex(slot);

	if (os_file_handle_error(
		name, type.is_read() ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(DB_IO_ERROR);
}
7584 
7585 /** Simulated AIO handler for reaping IO requests */
7586 class SimulatedAIOHandler {
7587 
7588 public:
7589 
7590 	/** Constructor
7591 	@param[in,out]	array	The AIO array
7592 	@param[in]	segment	Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)7593 	SimulatedAIOHandler(AIO* array, ulint segment)
7594 		:
7595 		m_oldest(),
7596 		m_n_elems(),
7597 		m_lowest_offset(IB_UINT64_MAX),
7598 		m_array(array),
7599 		m_n_slots(),
7600 		m_segment(segment),
7601 		m_ptr(),
7602 		m_buf()
7603 	{
7604 		ut_ad(m_segment < 100);
7605 
7606 		m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
7607 	}
7608 
7609 	/** Destructor */
~SimulatedAIOHandler()7610 	~SimulatedAIOHandler()
7611 	{
7612 		if (m_ptr != NULL) {
7613 			ut_free(m_ptr);
7614 		}
7615 	}
7616 
7617 	/** Reset the state of the handler
7618 	@param[in]	n_slots	Number of pending AIO operations supported */
init(ulint n_slots)7619 	void init(ulint n_slots)
7620 	{
7621 		m_oldest = 0;
7622 		m_n_elems = 0;
7623 		m_n_slots = n_slots;
7624 		m_lowest_offset = IB_UINT64_MAX;
7625 
7626 		if (m_ptr != NULL) {
7627 			ut_free(m_ptr);
7628 			m_ptr = m_buf = NULL;
7629 		}
7630 
7631 		m_slots[0] = NULL;
7632 	}
7633 
7634 	/** Check if there is a slot for which the i/o has already been done
7635 	@param[out]	n_reserved	Number of reserved slots
7636 	@return the first completed slot that is found. */
check_completed(ulint * n_reserved)7637 	Slot* check_completed(ulint* n_reserved)
7638 	{
7639 		ulint	offset = m_segment * m_n_slots;
7640 
7641 		*n_reserved = 0;
7642 
7643 		Slot*	slot;
7644 
7645 		slot = m_array->at(offset);
7646 
7647 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7648 
7649 			if (slot->is_reserved) {
7650 
7651 				if (slot->io_already_done) {
7652 
7653 					ut_a(slot->is_reserved);
7654 
7655 					return(slot);
7656 				}
7657 
7658 				++*n_reserved;
7659 			}
7660 		}
7661 
7662 		return(NULL);
7663 	}
7664 
7665 	/** If there are at least 2 seconds old requests, then pick the
7666 	oldest one to prevent starvation.  If several requests have the
7667 	same age, then pick the one at the lowest offset.
7668 	@return true if request was selected */
select()7669 	bool select()
7670 	{
7671 		if (!select_oldest()) {
7672 
7673 			return(select_lowest_offset());
7674 		}
7675 
7676 		return(true);
7677 	}
7678 
7679 	/** Check if there are several consecutive blocks
7680 	to read or write. Merge them if found. */
merge()7681 	void merge()
7682 	{
7683 		/* if m_n_elems != 0, then we have assigned
7684 		something valid to consecutive_ios[0] */
7685 		ut_ad(m_n_elems != 0);
7686 		ut_ad(first_slot() != NULL);
7687 
7688 		Slot*	slot = first_slot();
7689 
7690 		while (!merge_adjacent(slot)) {
7691 			/* No op */
7692 		}
7693 	}
7694 
7695 	/** We have now collected n_consecutive I/O requests
7696 	in the array; allocate a single buffer which can hold
7697 	all data, and perform the I/O
7698 	@return the length of the buffer */
allocate_buffer()7699 	ulint allocate_buffer()
7700 		MY_ATTRIBUTE((warn_unused_result))
7701 	{
7702 		ulint	len;
7703 		Slot*	slot = first_slot();
7704 
7705 		ut_ad(m_ptr == NULL);
7706 
7707 		if (slot->type.is_read() && m_n_elems > 1) {
7708 
7709 			len = 0;
7710 
7711 			for (ulint i = 0; i < m_n_elems; ++i) {
7712 				len += m_slots[i]->len;
7713 			}
7714 
7715 			m_ptr = static_cast<byte*>(
7716 				ut_malloc_nokey(len + UNIV_PAGE_SIZE));
7717 
7718 			m_buf = static_cast<byte*>(
7719 				ut_align(m_ptr, UNIV_PAGE_SIZE));
7720 
7721 		} else {
7722 			len = first_slot()->len;
7723 			m_buf = first_slot()->buf;
7724 		}
7725 
7726 		return(len);
7727 	}
7728 
7729 	/** We have to compress the individual pages and punch
7730 	holes in them on a page by page basis when writing to
7731 	tables that can be compresed at the IO level.
7732 	@param[in]	len		Value returned by allocate_buffer */
copy_to_buffer(ulint len)7733 	void copy_to_buffer(ulint len)
7734 	{
7735 		Slot*	slot = first_slot();
7736 
7737 		if (len > slot->len && slot->type.is_write()) {
7738 
7739 			byte*	ptr = m_buf;
7740 
7741 			ut_ad(ptr != slot->buf);
7742 
7743 			/* Copy the buffers to the combined buffer */
7744 			for (ulint i = 0; i < m_n_elems; ++i) {
7745 
7746 				slot = m_slots[i];
7747 
7748 				memmove(ptr, slot->buf, slot->len);
7749 
7750 				ptr += slot->len;
7751 			}
7752 		}
7753 	}
7754 
7755 	/** Do the I/O with ordinary, synchronous i/o functions:
7756 	@param[in]	len		Length of buffer for IO */
io()7757 	void io()
7758 	{
7759 		if (first_slot()->type.is_write()) {
7760 
7761 			for (ulint i = 0; i < m_n_elems; ++i) {
7762 				write(m_slots[i]);
7763 			}
7764 
7765 		} else {
7766 
7767 			for (ulint i = 0; i < m_n_elems; ++i) {
7768 				read(m_slots[i]);
7769 			}
7770 		}
7771 	}
7772 
7773 	/** Do the decompression of the pages read in */
io_complete()7774 	void io_complete()
7775 	{
7776 		// Note: For non-compressed tables. Not required
7777 		// for correctness.
7778 	}
7779 
7780 	/** Mark the i/os done in slots */
done()7781 	void done()
7782 	{
7783 		for (ulint i = 0; i < m_n_elems; ++i) {
7784 			m_slots[i]->io_already_done = true;
7785 		}
7786 	}
7787 
7788 	/** @return the first slot in the consecutive array */
first_slot()7789 	Slot* first_slot()
7790 		MY_ATTRIBUTE((warn_unused_result))
7791 	{
7792 		ut_a(m_n_elems > 0);
7793 
7794 		return(m_slots[0]);
7795 	}
7796 
7797 	/** Wait for I/O requests
7798 	@param[in]	global_segment	The global segment
7799 	@param[in,out]	event		Wait on event if no active requests
7800 	@return the number of slots */
7801 	ulint check_pending(
7802 		ulint		global_segment,
7803 		os_event_t	event)
7804 		MY_ATTRIBUTE((warn_unused_result));
7805 private:
7806 
7807 	/** Do the file read
7808 	@param[in,out]	slot		Slot that has the IO context */
read(Slot * slot)7809 	void read(Slot* slot)
7810 	{
7811 		dberr_t	err = os_file_read_func(
7812 			slot->type,
7813 			slot->file.m_file,
7814 			slot->ptr,
7815 			slot->offset,
7816 			slot->len);
7817 		ut_a(err == DB_SUCCESS);
7818 	}
7819 
7820 	/** Do the file read
7821 	@param[in,out]	slot		Slot that has the IO context */
write(Slot * slot)7822 	void write(Slot* slot)
7823 	{
7824 		dberr_t	err = os_file_write_func(
7825 			slot->type,
7826 			slot->name,
7827 			slot->file.m_file,
7828 			slot->ptr,
7829 			slot->offset,
7830 			slot->len);
7831 		ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
7832 	}
7833 
7834 	/** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7835 	bool adjacent(const Slot* s1, const Slot* s2) const
7836 	{
7837 		return(s1 != s2
7838 		       && s1->file.m_file == s2->file.m_file
7839 		       && s2->offset == s1->offset + s1->len
7840 		       && s1->type == s2->type);
7841 	}
7842 
7843 	/** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7844 	bool merge_adjacent(Slot*& current)
7845 	{
7846 		Slot*	slot;
7847 		ulint	offset = m_segment * m_n_slots;
7848 
7849 		slot = m_array->at(offset);
7850 
7851 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7852 
7853 			if (slot->is_reserved && adjacent(current, slot)) {
7854 
7855 				current = slot;
7856 
7857 				/* Found a consecutive i/o request */
7858 
7859 				m_slots[m_n_elems] = slot;
7860 
7861 				++m_n_elems;
7862 
7863 				return(m_n_elems >= m_slots.capacity());
7864 			}
7865 		}
7866 
7867 		return(true);
7868 	}
7869 
7870 	/** There were no old requests. Look for an I/O request at the lowest
7871 	offset in the array (we ignore the high 32 bits of the offset in these
7872 	heuristics) */
select_lowest_offset()7873 	bool select_lowest_offset()
7874 	{
7875 		ut_ad(m_n_elems == 0);
7876 
7877 		ulint	offset = m_segment * m_n_slots;
7878 
7879 		m_lowest_offset = IB_UINT64_MAX;
7880 
7881 		for (ulint i = 0; i < m_n_slots; ++i) {
7882 			Slot*	slot;
7883 
7884 			slot = m_array->at(i + offset);
7885 
7886 			if (slot->is_reserved
7887 			    && slot->offset < m_lowest_offset) {
7888 
7889 				/* Found an i/o request */
7890 				m_slots[0] = slot;
7891 
7892 				m_n_elems = 1;
7893 
7894 				m_lowest_offset = slot->offset;
7895 			}
7896 		}
7897 
7898 		return(m_n_elems > 0);
7899 	}
7900 
7901 	/** Select the slot if it is older than the current oldest slot.
7902 	@param[in]	slot		The slot to check */
select_if_older(Slot * slot)7903 	void select_if_older(Slot* slot)
7904 	{
7905 		int64_t time_diff = ut_time_monotonic() -
7906 					slot->reservation_time;
7907 
7908 		const uint64_t age = time_diff > 0 ? (uint64_t) time_diff : 0;
7909 
7910 		if ((age >= 2 && age > m_oldest)
7911 		    || (age >= 2
7912 			&& age == m_oldest
7913 			&& slot->offset < m_lowest_offset)) {
7914 
7915 			/* Found an i/o request */
7916 			m_slots[0] = slot;
7917 
7918 			m_n_elems = 1;
7919 
7920 			m_oldest = age;
7921 
7922 			m_lowest_offset = slot->offset;
7923 		}
7924 	}
7925 
7926 	/** Select th oldest slot in the array
7927 	@return true if oldest slot found */
select_oldest()7928 	bool select_oldest()
7929 	{
7930 		ut_ad(m_n_elems == 0);
7931 
7932 		Slot*	slot;
7933 		ulint	offset = m_n_slots * m_segment;
7934 
7935 		slot = m_array->at(offset);
7936 
7937 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7938 
7939 			if (slot->is_reserved) {
7940 				select_if_older(slot);
7941 			}
7942 		}
7943 
7944 		return(m_n_elems > 0);
7945 	}
7946 
7947 	typedef std::vector<Slot*> slots_t;
7948 
7949 private:
7950 	ulint		m_oldest;
7951 	ulint		m_n_elems;
7952 	os_offset_t	m_lowest_offset;
7953 
7954 	AIO*		m_array;
7955 	ulint		m_n_slots;
7956 	ulint		m_segment;
7957 
7958 	slots_t		m_slots;
7959 
7960 	byte*		m_ptr;
7961 	byte*		m_buf;
7962 };
7963 
7964 /** Wait for I/O requests
7965 @return the number of slots */
7966 ulint
check_pending(ulint global_segment,os_event_t event)7967 SimulatedAIOHandler::check_pending(
7968 	ulint		global_segment,
7969 	os_event_t	event)
7970 {
7971 	/* NOTE! We only access constant fields in os_aio_array.
7972 	Therefore we do not have to acquire the protecting mutex yet */
7973 
7974 	ut_ad(os_aio_validate_skip());
7975 
7976 	ut_ad(m_segment < m_array->get_n_segments());
7977 
7978 	/* Look through n slots after the segment * n'th slot */
7979 
7980 	if (AIO::is_read(m_array)
7981 	    && os_aio_recommend_sleep_for_read_threads) {
7982 
7983 		/* Give other threads chance to add several
7984 		I/Os to the array at once. */
7985 
7986 		srv_set_io_thread_op_info(
7987 			global_segment, "waiting for i/o request");
7988 
7989 		os_event_wait(event);
7990 
7991 		return(0);
7992 	}
7993 
7994 	return(m_array->slots_per_segment());
7995 }
7996 
/** Does simulated AIO. This function should be called by an i/o-handler
thread. It loops until it finds, in the segment, either a request whose I/O
has already been done or pending requests to perform; in the latter case it
performs them, with the array mutex released during the I/O. The messages of
one completed request are returned and its slot is released.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example; set to NULL when the
			function returns because of shutdown
@param[out]	m2	Callback argument; set to NULL when the function
			returns because of shutdown
@param[out]	type	IO context of the completed request; not written
			when the function returns because of shutdown
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	/* Map the global segment to its array and local segment */
	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		ulint	n_slots = handler.check_pending(global_segment, event);

		if (n_slots == 0) {
			/* check_pending() waited on the event; try again */
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		/* The array mutex is held from here until one of the
		release() calls below. */
		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* Leave the loop with the mutex held */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* Leave the loop with the mutex held; slot is NULL */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. */

		/* The event is reset before the mutex is released */
		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* If slot != NULL, check_completed() found a request whose I/O has
	already been done. Otherwise select() picked a pending request and
	the I/O still has to be performed. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		handler.io_complete();

		array->acquire();

		/* Flag every selected slot as done while holding the mutex */
		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	/* Copy the messages out before the slot is released */
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	array->release(slot);

	array->release();

	return(DB_SUCCESS);
}
8149 
8150 /** Get the total number of pending IOs
8151 @return the total number of pending IOs */
8152 ulint
total_pending_io_count()8153 AIO::total_pending_io_count()
8154 {
8155 	ulint	count = s_reads->pending_io_count();
8156 
8157 	if (s_writes != NULL) {
8158 		count += s_writes->pending_io_count();
8159 	}
8160 
8161 	if (s_ibuf != NULL) {
8162 		count += s_ibuf->pending_io_count();
8163 	}
8164 
8165 	if (s_log != NULL) {
8166 		count += s_log->pending_io_count();
8167 	}
8168 
8169 	if (s_sync != NULL) {
8170 		count += s_sync->pending_io_count();
8171 	}
8172 
8173 	return(count);
8174 }
8175 
8176 /** Validates the consistency the aio system.
8177 @return true if ok */
8178 static
8179 bool
os_aio_validate()8180 os_aio_validate()
8181 {
8182 	/* The methods countds and validates, we ignore the count. */
8183 	AIO::total_pending_io_count();
8184 
8185 	return(true);
8186 }
8187 
8188 /** Prints pending IO requests per segment of an aio array.
8189 We probably don't need per segment statistics but they can help us
8190 during development phase to see if the IO requests are being
8191 distributed as expected.
8192 @param[in,out]	file		File where to print
8193 @param[in]	segments	Pending IO array */
8194 void
print_segment_info(FILE * file,const ulint * segments)8195 AIO::print_segment_info(
8196 	FILE*		file,
8197 	const ulint*	segments)
8198 {
8199 	ut_ad(m_n_segments > 0);
8200 
8201 	if (m_n_segments > 1) {
8202 
8203 		fprintf(file, " [");
8204 
8205 		for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
8206 
8207 			if (i != 0) {
8208 				fprintf(file, ", ");
8209 			}
8210 
8211 			fprintf(file, ULINTPF, *segments);
8212 		}
8213 
8214 		fprintf(file, "] ");
8215 	}
8216 }
8217 
8218 /** Prints info about the aio array.
8219 @param[in,out]	file		Where to print */
8220 void
print(FILE * file)8221 AIO::print(FILE* file)
8222 {
8223 	ulint	count = 0;
8224 	ulint	n_res_seg[SRV_MAX_N_IO_THREADS];
8225 
8226 	mutex_enter(&m_mutex);
8227 
8228 	ut_a(!m_slots.empty());
8229 	ut_a(m_n_segments > 0);
8230 
8231 	memset(n_res_seg, 0x0, sizeof(n_res_seg));
8232 
8233 	for (ulint i = 0; i < m_slots.size(); ++i) {
8234 		Slot&	slot = m_slots[i];
8235 		ulint	segment = (i * m_n_segments) / m_slots.size();
8236 
8237 		if (slot.is_reserved) {
8238 
8239 			++count;
8240 
8241 			++n_res_seg[segment];
8242 
8243 			ut_a(slot.len > 0);
8244 		}
8245 	}
8246 
8247 	ut_a(m_n_reserved == count);
8248 
8249 	print_segment_info(file, n_res_seg);
8250 
8251 	mutex_exit(&m_mutex);
8252 }
8253 
8254 /** Print all the AIO segments
8255 @param[in,out]	file		Where to print */
8256 void
print_all(FILE * file)8257 AIO::print_all(FILE* file)
8258 {
8259 	s_reads->print(file);
8260 
8261 	if (s_writes != NULL) {
8262 		fputs(", aio writes:", file);
8263 		s_writes->print(file);
8264 	}
8265 
8266 	if (s_ibuf != NULL) {
8267 		fputs(",\n ibuf aio reads:", file);
8268 		s_ibuf->print(file);
8269 	}
8270 
8271 	if (s_log != NULL) {
8272 		fputs(", log i/o's:", file);
8273 		s_log->print(file);
8274 	}
8275 
8276 	if (s_sync != NULL) {
8277 		fputs(", sync i/o's:", file);
8278 		s_sync->print(file);
8279 	}
8280 }
8281 
8282 /** Prints info of the aio arrays.
8283 @param[in,out]	file		file where to print */
8284 void
os_aio_print(FILE * file)8285 os_aio_print(FILE*	file)
8286 {
8287 	ib_time_monotonic_t 		current_time;
8288 	double	 			time_elapsed;
8289 	double				avg_bytes_read;
8290 
8291 	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
8292 		fprintf(file, "I/O thread %lu state: %s (%s)",
8293 			(ulong) i,
8294 			srv_io_thread_op_info[i],
8295 			srv_io_thread_function[i]);
8296 
8297 #ifndef _WIN32
8298 		if (os_event_is_set(os_aio_segment_wait_events[i])) {
8299 			fprintf(file, " ev set");
8300 		}
8301 #endif /* _WIN32 */
8302 
8303 		fprintf(file, "\n");
8304 	}
8305 
8306 	fputs("Pending normal aio reads:", file);
8307 
8308 	AIO::print_all(file);
8309 
8310 	putc('\n', file);
8311 	current_time = ut_time_monotonic();
8312 	time_elapsed = 0.001 + (current_time - os_last_printout);
8313 
8314 	fprintf(file,
8315 		"Pending flushes (fsync) log: " ULINTPF "; "
8316 		"buffer pool: " ULINTPF "\n"
8317 		ULINTPF " OS file reads, "
8318 		ULINTPF " OS file writes, "
8319 		ULINTPF " OS fsyncs\n",
8320 		fil_n_pending_log_flushes,
8321 		fil_n_pending_tablespace_flushes,
8322 		os_n_file_reads,
8323 		os_n_file_writes,
8324 		os_n_fsyncs);
8325 
8326 	if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
8327 		fprintf(file,
8328 			ULINTPF " pending preads, "
8329 			ULINTPF " pending pwrites\n",
8330 			os_n_pending_reads,
8331 			os_n_pending_writes);
8332 	}
8333 
8334 	if (os_n_file_reads == os_n_file_reads_old) {
8335 		avg_bytes_read = 0.0;
8336 	} else {
8337 		avg_bytes_read = (double) os_bytes_read_since_printout
8338 			/ (os_n_file_reads - os_n_file_reads_old);
8339 	}
8340 
8341 	fprintf(file,
8342 		"%.2f reads/s, %lu avg bytes/read,"
8343 		" %.2f writes/s, %.2f fsyncs/s\n",
8344 		(os_n_file_reads - os_n_file_reads_old)
8345 		/ time_elapsed,
8346 		(ulong) avg_bytes_read,
8347 		(os_n_file_writes - os_n_file_writes_old)
8348 		/ time_elapsed,
8349 		(os_n_fsyncs - os_n_fsyncs_old)
8350 		/ time_elapsed);
8351 
8352 	os_n_file_reads_old = os_n_file_reads;
8353 	os_n_file_writes_old = os_n_file_writes;
8354 	os_n_fsyncs_old = os_n_fsyncs;
8355 	os_bytes_read_since_printout = 0;
8356 
8357 	os_last_printout = current_time;
8358 }
8359 
8360 /** Refreshes the statistics used to print per-second averages. */
8361 void
os_aio_refresh_stats()8362 os_aio_refresh_stats()
8363 {
8364 	os_n_fsyncs_old = os_n_fsyncs;
8365 
8366 	os_bytes_read_since_printout = 0;
8367 
8368 	os_n_file_reads_old = os_n_file_reads;
8369 
8370 	os_n_file_writes_old = os_n_file_writes;
8371 
8372 	os_n_fsyncs_old = os_n_fsyncs;
8373 
8374 	os_bytes_read_since_printout = 0;
8375 
8376 	os_last_printout = ut_time_monotonic();
8377 }
8378 
8379 /** Checks that all slots in the system have been freed, that is, there are
8380 no pending io operations.
8381 @return true if all free */
8382 bool
os_aio_all_slots_free()8383 os_aio_all_slots_free()
8384 {
8385 	return(AIO::total_pending_io_count() == 0);
8386 }
8387 
8388 #ifdef UNIV_DEBUG
8389 /** Prints all pending IO for the array
8390 @param[in]	file	file where to print
8391 @param[in]	array	array to process */
8392 void
to_file(FILE * file) const8393 AIO::to_file(FILE* file) const
8394 {
8395 	acquire();
8396 
8397 	fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
8398 
8399 	for (ulint i = 0; i < m_slots.size(); ++i) {
8400 
8401 		const Slot&	slot = m_slots[i];
8402 
8403 		if (slot.is_reserved) {
8404 
8405 			fprintf(file,
8406 				"%s IO for %s (offset=" UINT64PF
8407 				", size=%lu)\n",
8408 				slot.type.is_read() ? "read" : "write",
8409 				slot.name, slot.offset, slot.len);
8410 		}
8411 	}
8412 
8413 	release();
8414 }
8415 
8416 /** Print pending IOs for all arrays */
8417 void
print_to_file(FILE * file)8418 AIO::print_to_file(FILE* file)
8419 {
8420 	fprintf(file, "Pending normal aio reads:");
8421 
8422 	s_reads->to_file(file);
8423 
8424 	if (s_writes != NULL) {
8425 		fprintf(file, "Pending normal aio writes:");
8426 		s_writes->to_file(file);
8427 	}
8428 
8429 	if (s_ibuf != NULL) {
8430 		fprintf(file, "Pending ibuf aio reads:");
8431 		s_ibuf->to_file(file);
8432 	}
8433 
8434 	if (s_log != NULL) {
8435 		fprintf(file, "Pending log i/o's:");
8436 		s_log->to_file(file);
8437 	}
8438 
8439 	if (s_sync != NULL) {
8440 		fprintf(file, "Pending sync i/o's:");
8441 		s_sync->to_file(file);
8442 	}
8443 }
8444 
8445 /** Prints all pending IO
8446 @param[in]	file		File where to print */
8447 void
os_aio_print_pending_io(FILE * file)8448 os_aio_print_pending_io(
8449 	FILE*	file)
8450 {
8451 	AIO::print_to_file(file);
8452 }
8453 
8454 #endif /* UNIV_DEBUG */
8455 
8456 /**
8457 Set the file create umask
8458 @param[in]	umask		The umask to use for file creation. */
8459 void
os_file_set_umask(ulint umask)8460 os_file_set_umask(ulint umask)
8461 {
8462 	os_innodb_umask = umask;
8463 }
8464 #else
8465 
8466 #include "univ.i"
8467 #include "db0err.h"
8468 #include "mach0data.h"
8469 #include "fil0fil.h"
8470 #include "os0file.h"
8471 
8472 #include <lz4.h>
8473 #include <zlib.h>
8474 
8475 #include <my_aes.h>
8476 #include <my_rnd.h>
8477 #include <mysqld.h>
8478 #include <mysql/service_mysql_keyring.h>
8479 
8480 typedef byte	Block;
8481 
8482 /** Allocate a page for sync IO
8483 @return pointer to page */
8484 static
8485 Block*
os_alloc_block()8486 os_alloc_block()
8487 {
8488 	return(reinterpret_cast<byte*>(malloc(UNIV_PAGE_SIZE_MAX * 2)));
8489 }
8490 
8491 /** Free a page after sync IO
8492 @param[in,own]	block		The block to free/release */
8493 static
8494 void
os_free_block(Block * block)8495 os_free_block(Block* block)
8496 {
8497 	ut_free(block);
8498 }
8499 
8500 #endif /* !UNIV_INNOCHECKSUM */
8501 
/** Minimum length needed for encryption: two AES blocks of
MY_AES_BLOCK_SIZE bytes each, plus FIL_PAGE_DATA bytes.
NOTE(review): FIL_PAGE_DATA presumably accounts for the page header that
precedes the encrypted data; confirm against the encryption code. */
const unsigned int MIN_ENCRYPTION_LEN = 2 * MY_AES_BLOCK_SIZE + FIL_PAGE_DATA;
8504 
8505 /**
8506 @param[in]      type            The compression type
8507 @return the string representation */
8508 const char*
to_string(Type type)8509 Compression::to_string(Type type)
8510 {
8511         switch(type) {
8512         case NONE:
8513                 return("None");
8514         case ZLIB:
8515                 return("Zlib");
8516         case LZ4:
8517                 return("LZ4");
8518         }
8519 
8520         ut_ad(0);
8521 
8522         return("<UNKNOWN>");
8523 }
8524 
8525 /**
8526 @param[in]      meta		Page Meta data
8527 @return the string representation */
to_string(const Compression::meta_t & meta)8528 std::string Compression::to_string(const Compression::meta_t& meta)
8529 {
8530 	std::ostringstream	stream;
8531 
8532 	stream	<< "version: " << int(meta.m_version) << " "
8533 		<< "algorithm: " << meta.m_algorithm << " "
8534 		<< "(" << to_string(meta.m_algorithm) << ") "
8535 		<< "orginal_type: " << meta.m_original_type << " "
8536 		<< "original_size: " << meta.m_original_size << " "
8537 		<< "compressed_size: " << meta.m_compressed_size;
8538 
8539 	return(stream.str());
8540 }
8541 
8542 /** @return true if it is a compressed page */
8543 bool
is_compressed_page(const byte * page)8544 Compression::is_compressed_page(const byte* page)
8545 {
8546 	return(mach_read_from_2(page + FIL_PAGE_TYPE) == FIL_PAGE_COMPRESSED);
8547 }
8548 
8549 bool
is_compressed_encrypted_page(const byte * page)8550 Compression::is_compressed_encrypted_page(const byte *page) {
8551 	return (mach_read_from_2(page + FIL_PAGE_TYPE) ==
8552 		FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
8553 }
8554 
8555 bool
is_valid_page_version(uint8_t version)8556 Compression::is_valid_page_version(uint8_t version) {
8557 	return (version == FIL_PAGE_VERSION_1 || version == FIL_PAGE_VERSION_2);
8558 }
8559 
8560 /** Deserizlise the page header compression meta-data
8561 @param[in]	page		Pointer to the page header
8562 @param[out]	control		Deserialised data */
8563 void
deserialize_header(const byte * page,Compression::meta_t * control)8564 Compression::deserialize_header(
8565 	const byte*		page,
8566 	Compression::meta_t*	control)
8567 {
8568 	ut_ad(is_compressed_page(page) || is_compressed_encrypted_page(page));
8569 
8570 	control->m_version = static_cast<uint8_t>(
8571 		mach_read_from_1(page + FIL_PAGE_VERSION));
8572 
8573 	control->m_original_type = static_cast<uint16_t>(
8574 		mach_read_from_2(page + FIL_PAGE_ORIGINAL_TYPE_V1));
8575 
8576 	control->m_compressed_size = static_cast<uint16_t>(
8577 		mach_read_from_2(page + FIL_PAGE_COMPRESS_SIZE_V1));
8578 
8579 	control->m_original_size = static_cast<uint16_t>(
8580 		mach_read_from_2(page + FIL_PAGE_ORIGINAL_SIZE_V1));
8581 
8582 	control->m_algorithm = static_cast<Type>(
8583 		mach_read_from_1(page + FIL_PAGE_ALGORITHM_V1));
8584 }
8585 
8586 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
8587 not then the source contents are left unchanged and DB_SUCCESS is returned.
8588 @param[in]	dblwr_recover	true of double write recovery in progress
8589 @param[in,out]	src		Data read from disk, decompressed data will be
8590 				copied to this page
8591 @param[in,out]	dst		Scratch area to use for decompression
8592 @param[in]	dst_len		Size of the scratch area in bytes
8593 @return DB_SUCCESS or error code */
8594 dberr_t
deserialize(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)8595 Compression::deserialize(
8596 	bool		dblwr_recover,
8597 	byte*		src,
8598 	byte*		dst,
8599 	ulint		dst_len)
8600 {
8601 	if (!is_compressed_page(src)) {
8602 		/* There is nothing we can do. */
8603 		return(DB_SUCCESS);
8604 	}
8605 
8606 	meta_t	header;
8607 
8608 	deserialize_header(src, &header);
8609 
8610 	byte*	ptr = src + FIL_PAGE_DATA;
8611 
8612 	ut_ad(is_valid_page_version(header.m_version));
8613 
8614 	if (!is_valid_page_version(header.m_version)
8615 	    || header.m_original_size < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8)
8616 	    || header.m_original_size > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA
8617 	    || dst_len < header.m_original_size + FIL_PAGE_DATA) {
8618 
8619 		/* The last check could potentially return DB_OVERFLOW,
8620 		the caller should be able to retry with a larger buffer. */
8621 
8622 		return(DB_CORRUPTION);
8623 	}
8624 
8625 	Block*	block;
8626 
8627 	/* The caller doesn't know what to expect */
8628 	if (dst == NULL) {
8629 
8630 		block = os_alloc_block();
8631 
8632 #ifdef UNIV_INNOCHECKSUM
8633 		dst = block;
8634 #else
8635 		dst = block->m_ptr;
8636 #endif /* UNIV_INNOCHECKSUM */
8637 
8638 	} else {
8639 		block = NULL;
8640 	}
8641 
8642 	int		ret;
8643 	Compression	compression;
8644 	ulint		len = header.m_original_size;
8645 
8646 	compression.m_type = static_cast<Compression::Type>(header.m_algorithm);
8647 
8648 	switch(compression.m_type) {
8649 	case Compression::ZLIB: {
8650 
8651 		uLongf	zlen = header.m_original_size;
8652 
8653 		if (uncompress(dst, &zlen, ptr, header.m_compressed_size)
8654 		    != Z_OK) {
8655 
8656 			if (block != NULL) {
8657 				os_free_block(block);
8658 			}
8659 
8660 			return(DB_IO_DECOMPRESS_FAIL);
8661 		}
8662 
8663 		len = static_cast<ulint>(zlen);
8664 
8665 		break;
8666 	}
8667 
8668 	case Compression::LZ4:
8669 
8670 		if (dblwr_recover) {
8671 
8672 			ret = LZ4_decompress_safe(
8673 				reinterpret_cast<char*>(ptr),
8674 				reinterpret_cast<char*>(dst),
8675 				header.m_compressed_size,
8676 				header.m_original_size);
8677 
8678 		} else {
8679 
8680 			/* This can potentially read beyond the input
8681 			buffer if the data is malformed. According to
8682 			the LZ4 documentation it is a little faster
8683 			than the above function. When recovering from
8684 			the double write buffer we can afford to us the
8685 			slower function above. */
8686 
8687 			ret = LZ4_decompress_fast(
8688 				reinterpret_cast<char*>(ptr),
8689 				reinterpret_cast<char*>(dst),
8690 				header.m_original_size);
8691 		}
8692 
8693 		if (ret < 0) {
8694 
8695 			if (block != NULL) {
8696 				os_free_block(block);
8697 			}
8698 
8699 			return(DB_IO_DECOMPRESS_FAIL);
8700 		}
8701 
8702 		break;
8703 
8704 	default:
8705 #if !defined(UNIV_INNOCHECKSUM)
8706 		ib::error()
8707 			<< "Compression algorithm support missing: "
8708 			<< Compression::to_string(compression.m_type);
8709 #else
8710 		fprintf(stderr, "Compression algorithm support missing: %s\n",
8711 			Compression::to_string(compression.m_type));
8712 #endif /* !UNIV_INNOCHECKSUM */
8713 
8714 		if (block != NULL) {
8715 			os_free_block(block);
8716 		}
8717 
8718 		return(DB_UNSUPPORTED);
8719 	}
8720 
8721 	/* Leave the header alone */
8722 	memmove(src + FIL_PAGE_DATA, dst, len);
8723 
8724 	mach_write_to_2(src + FIL_PAGE_TYPE, header.m_original_type);
8725 
8726 	ut_ad(dblwr_recover
8727 	      || memcmp(src + FIL_PAGE_LSN + 4,
8728 			src + (header.m_original_size + FIL_PAGE_DATA)
8729 			- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);
8730 
8731 	if (block != NULL) {
8732 		os_free_block(block);
8733 	}
8734 
8735 	return(DB_SUCCESS);
8736 }
8737 
8738 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
8739 not then the source contents are left unchanged and DB_SUCCESS is returned.
8740 @param[in]	dblwr_recover	true of double write recovery in progress
8741 @param[in,out]	src		Data read from disk, decompressed data will be
8742 				copied to this page
8743 @param[in,out]	dst		Scratch area to use for decompression
8744 @param[in]	dst_len		Size of the scratch area in bytes
8745 @return DB_SUCCESS or error code */
8746 dberr_t
os_file_decompress_page(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)8747 os_file_decompress_page(
8748 	bool		dblwr_recover,
8749 	byte*		src,
8750 	byte*		dst,
8751 	ulint		dst_len)
8752 {
8753 	return(Compression::deserialize(dblwr_recover, src, dst, dst_len));
8754 }
8755 
8756 /**
8757 @param[in]      type            The encryption type
8758 @return the string representation */
8759 const char*
to_string(Type type)8760 Encryption::to_string(Type type)
8761 {
8762         switch(type) {
8763         case NONE:
8764                 return("N");
8765         case AES:
8766                 return("Y");
8767         }
8768 
8769         ut_ad(0);
8770 
8771         return("<UNKNOWN>");
8772 }
8773 
8774 /** Generate random encryption value for key and iv.
8775 @param[in,out]	value	Encryption value */
random_value(byte * value)8776 void Encryption::random_value(byte* value)
8777 {
8778 	ut_ad(value != NULL);
8779 
8780 	my_rand_buffer(value, ENCRYPTION_KEY_LEN);
8781 }
8782 
8783 /** Create new master key
8784 @param[in,out]	master_key	master key */
8785 void
create_master_key_v0(byte ** master_key)8786 Encryption::create_master_key_v0(byte** master_key)
8787 {
8788 #ifndef UNIV_INNOCHECKSUM
8789 	char*	key_type = NULL;
8790 	size_t	key_len;
8791 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8792 	int	ret;
8793 
8794 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8795 
8796 	/* Generate new master key */
8797 	sprintf(key_name, "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8798 		server_id, master_key_id + 1);
8799 
8800 	/* We call key ring API to generate master key here. */
8801 	ret = my_key_generate(key_name, "AES",
8802 			      NULL, ENCRYPTION_KEY_LEN);
8803 
8804 	/* We call key ring API to get master key here. */
8805 	ret = my_key_fetch(key_name, &key_type, NULL,
8806 			   reinterpret_cast<void**>(master_key),
8807 			   &key_len);
8808 
8809 	if (ret) {
8810 		ib::error() << "Encryption can't find master key, please check"
8811 				" the keyring plugin is loaded.";
8812 		*master_key = NULL;
8813 	}
8814 
8815 	master_key_id++;
8816 
8817 	if (key_type) {
8818 		my_free(key_type);
8819 	}
8820 #endif
8821 }
8822 
8823 /** Create new master key for key rotation.
8824 @param[in,out]	master_key	master key */
8825 void
create_master_key(byte ** master_key)8826 Encryption::create_master_key(byte** master_key)
8827 {
8828 #ifndef UNIV_INNOCHECKSUM
8829 	char*	key_type = NULL;
8830 	size_t	key_len;
8831 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8832 	int	ret;
8833 
8834 	/* If uuid does not match with current server uuid,
8835 	set uuid as current server uuid. */
8836 	if (strcmp(uuid, server_uuid) != 0) {
8837 		memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8838 	}
8839 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8840 
8841 	/* Generate new master key */
8842 	ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8843 		    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8844 		    uuid, master_key_id + 1);
8845 
8846 	/* We call key ring API to generate master key here. */
8847 	ret = my_key_generate(key_name, "AES",
8848 			      NULL, ENCRYPTION_KEY_LEN);
8849 
8850 	/* We call key ring API to get master key here. */
8851 	ret = my_key_fetch(key_name, &key_type, NULL,
8852 			   reinterpret_cast<void**>(master_key),
8853 			   &key_len);
8854 
8855 	if (ret || *master_key == NULL) {
8856 		ib::error() << "Encryption can't find master key, please check"
8857 				" the keyring plugin is loaded.";
8858 		*master_key = NULL;
8859 	} else {
8860 		master_key_id++;
8861 	}
8862 
8863 	if (key_type) {
8864 		my_free(key_type);
8865 	}
8866 #endif
8867 }
8868 
8869 /** Get master key by key id.
8870 @param[in]	master_key_id	master key id
8871 @param[in]	srv_uuid	uuid of server instance
8872 @param[in,out]	master_key	master key */
8873 void
get_master_key(ulint master_key_id,char * srv_uuid,byte ** master_key)8874 Encryption::get_master_key(ulint master_key_id,
8875 			   char* srv_uuid,
8876 			   byte** master_key)
8877 {
8878 #ifndef UNIV_INNOCHECKSUM
8879 	char*	key_type = NULL;
8880 	size_t	key_len;
8881 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8882 	int	ret;
8883 
8884 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8885 
8886 	if (srv_uuid != NULL) {
8887 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8888 			    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8889 			    srv_uuid, master_key_id);
8890 	} else {
8891 		/* For compitable with 5.7.11, we need to get master key with
8892 		server id. */
8893 		memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8894 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8895 			    "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8896 			    server_id, master_key_id);
8897 	}
8898 
8899 	/* We call key ring API to get master key here. */
8900 	ret = my_key_fetch(key_name, &key_type, NULL,
8901 			   reinterpret_cast<void**>(master_key), &key_len);
8902 
8903 	if (key_type) {
8904 		my_free(key_type);
8905 	}
8906 
8907 	if (ret) {
8908 		*master_key = NULL;
8909 		ib::error() << "Encryption can't find master key, please check"
8910 				" the keyring plugin is loaded.";
8911 	}
8912 
8913 #ifdef UNIV_ENCRYPT_DEBUG
8914 	if (!ret && *master_key) {
8915 		fprintf(stderr, "Fetched master key:%lu ", master_key_id);
8916 		ut_print_buf(stderr, *master_key, key_len);
8917 		fprintf(stderr, "\n");
8918 	}
8919 #endif /* DEBUG_TDE */
8920 
8921 #endif
8922 }
8923 
8924 /** Current master key id */
8925 ulint	Encryption::master_key_id = 0;
8926 
8927 /** Current uuid of server instance */
8928 char	Encryption::uuid[ENCRYPTION_SERVER_UUID_LEN + 1] = {0};
8929 
8930 /** Default max_version */
8931 Encryption::Version Encryption::max_version = Encryption::ENCRYPTION_VERSION_2;
8932 
8933 /** Get current master key and master key id
8934 @param[in,out]	master_key_id	master key id
8935 @param[in,out]	master_key	master key
8936 @param[in,out]	version		encryption information version */
8937 void
get_master_key(ulint * master_key_id,byte ** master_key,Encryption::Version * version)8938 Encryption::get_master_key(ulint* master_key_id,
8939 			   byte** master_key,
8940 			   Encryption::Version*  version)
8941 {
8942 #ifndef UNIV_INNOCHECKSUM
8943 	char*	key_type = NULL;
8944 	size_t	key_len;
8945 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8946 	int	ret;
8947 
8948 	memset(key_name, 0, ENCRYPTION_KEY_LEN);
8949 	*version = Encryption::max_version;
8950 
8951 	if (Encryption::master_key_id == 0) {
8952 		/* If m_master_key is 0, means there's no encrypted
8953 		tablespace, we need to generate the first master key,
8954 		and store it to key ring. */
8955 		memset(uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
8956 		memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8957 
8958 		/* Prepare the server uuid. */
8959 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8960 			    "%s-%s-1", ENCRYPTION_MASTER_KEY_PRIFIX,
8961 			    uuid);
8962 
8963 		/* We call key ring API to generate master key here. */
8964 		ret = my_key_generate(key_name, "AES",
8965 				      NULL, ENCRYPTION_KEY_LEN);
8966 
8967 		/* We call key ring API to get master key here. */
8968 		ret = my_key_fetch(key_name, &key_type, NULL,
8969 				   reinterpret_cast<void**>(master_key),
8970 				   &key_len);
8971 
8972 		if (!ret && *master_key != NULL) {
8973 			Encryption::master_key_id++;
8974 			*master_key_id = Encryption::master_key_id;
8975 		}
8976 #ifdef UNIV_ENCRYPT_DEBUG
8977 		if (!ret && *master_key) {
8978 			fprintf(stderr, "Generated new master key:");
8979 			ut_print_buf(stderr, *master_key, key_len);
8980 			fprintf(stderr, "\n");
8981 		}
8982 #endif
8983 	} else {
8984 		*master_key_id = Encryption::master_key_id;
8985 
8986 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8987 			    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8988 			    uuid, *master_key_id);
8989 
8990 		/* We call key ring API to get master key here. */
8991 		ret = my_key_fetch(key_name, &key_type, NULL,
8992 				   reinterpret_cast<void**>(master_key),
8993 				   &key_len);
8994 
8995 		/* For compitable with 5.7.11, we need to try to get master key with
8996 		server id when get master key with server uuid failure. */
8997 		if (ret || *master_key == NULL) {
8998 			if (key_type) {
8999 				my_free(key_type);
9000 			}
9001 
9002 			memset(key_name, 0,
9003 			       ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9004 			ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9005 				    "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9006 				    server_id, *master_key_id);
9007 
9008 			ret = my_key_fetch(key_name, &key_type, NULL,
9009 					   reinterpret_cast<void**>(master_key),
9010 					   &key_len);
9011 			*version = Encryption::ENCRYPTION_VERSION_1;
9012 		}
9013 #ifdef UNIV_ENCRYPT_DEBUG
9014 		if (!ret && *master_key) {
9015 			fprintf(stderr, "Fetched master key:%lu ",
9016 				*master_key_id);
9017 			ut_print_buf(stderr, *master_key, key_len);
9018 			fprintf(stderr, "\n");
9019 		}
9020 #endif
9021 	}
9022 
9023 	if (ret) {
9024 		*master_key = NULL;
9025 		ib::error() << "Encryption can't find master key, please check"
9026 				" the keyring plugin is loaded.";
9027 	}
9028 
9029 	if (key_type) {
9030 		my_free(key_type);
9031 	}
9032 #endif
9033 }
9034 
9035 /** Check if page is encrypted page or not
9036 @param[in]	page	page which need to check
9037 @return true if it is a encrypted page */
9038 bool
is_encrypted_page(const byte * page)9039 Encryption::is_encrypted_page(const byte* page)
9040 {
9041 	ulint	page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
9042 
9043 	return(page_type == FIL_PAGE_ENCRYPTED
9044 	       || page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED
9045 	       || page_type == FIL_PAGE_ENCRYPTED_RTREE);
9046 }
9047 
9048 /** Encrypt the page data contents. Page type can't be
9049 FIL_PAGE_ENCRYPTED, FIL_PAGE_COMPRESSED_AND_ENCRYPTED,
9050 FIL_PAGE_ENCRYPTED_RTREE.
9051 @param[in]	type		IORequest
9052 @param[in,out]	src		page data which need to encrypt
9053 @param[in]	src_len		Size of the source in bytes
9054 @param[in,out]	dst		destination area
9055 @param[in,out]	dst_len		Size of the destination in bytes
9056 @return buffer data, dst_len will have the length of the data */
9057 byte*
encrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint * dst_len)9058 Encryption::encrypt(
9059 	const IORequest&	type,
9060 	byte*			src,
9061 	ulint			src_len,
9062 	byte*			dst,
9063 	ulint*			dst_len)
9064 {
9065 	ut_ad(m_type != NONE);
9066 	ut_ad(!type.is_log());
9067 #ifdef UNIV_ENCRYPT_DEBUG
9068 	ulint space_id =
9069 		mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9070 	ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9071 
9072 	fprintf(stderr, "Encrypting page:%lu.%lu len:%lu\n",
9073 		space_id, page_no, src_len);
9074 #endif
9075 
9076 	/* Shouldn't encrypte an already encrypted page. */
9077 	ut_ad(!is_encrypted_page(src));
9078 
9079 	const uint16_t page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9080 
9081 	/* This is data size which need to encrypt. */
9082 	ulint src_enc_len = src_len;
9083 
9084 	/* In FIL_PAGE_VERSION_2, we encrypt the actual compressed data length. */
9085 	if (page_type == FIL_PAGE_COMPRESSED) {
9086 		src_enc_len = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1) +
9087 					       FIL_PAGE_DATA;
9088 		/* Extend src_enc_len if needed */
9089 		if (src_enc_len < MIN_ENCRYPTION_LEN) {
9090 			src_enc_len = MIN_ENCRYPTION_LEN;
9091 		}
9092 		ut_a(src_enc_len <= src_len);
9093 	}
9094 
9095 	/* Only encrypt the data + trailer, leave the header alone */
9096 
9097 	switch (m_type) {
9098 	case Encryption::NONE:
9099 		ut_error;
9100 
9101 	case Encryption::AES: {
9102 		ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9103 
9104 		/* Total length of the data to encrypt. */
9105 		const ulint data_len = src_enc_len - FIL_PAGE_DATA;
9106 
9107 		/* Server encryption functions expect input data to be in
9108 		multiples of MY_AES_BLOCK SIZE. Therefore we encrypt the
9109 		overlapping data of the chunk_len and trailer_len twice.
9110 		First we encrypt the bigger chunk of data then we do the
9111 		trailer. The trailer encryption block starts at
9112 		2 * MY_AES_BLOCK_SIZE bytes offset from the end of the enc_len.
9113 		During decryption we do the reverse of the above process. */
9114 		ut_ad(data_len >= 2 * MY_AES_BLOCK_SIZE);
9115 
9116 		const ulint chunk_len =
9117 			 (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9118 		const ulint remain_len = data_len - chunk_len;
9119 
9120 		lint elen = my_aes_encrypt(
9121 			src + FIL_PAGE_DATA, static_cast<uint32>(chunk_len),
9122 			dst + FIL_PAGE_DATA, reinterpret_cast<byte *>(m_key),
9123 			static_cast<uint32>(m_klen), my_aes_256_cbc,
9124 			reinterpret_cast<byte *>(m_iv), false);
9125 
9126 		if (elen == MY_AES_BAD_DATA) {
9127 			ulint	page_no =mach_read_from_4(
9128 				src + FIL_PAGE_OFFSET);
9129 			ulint	space_id = mach_read_from_4(
9130 				src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9131 			*dst_len = src_len;
9132 #ifndef UNIV_INNOCHECKSUM
9133 				ib::warn()
9134 					<< " Can't encrypt data of page,"
9135 					<< " page no:" << page_no
9136 					<< " space id:" << space_id;
9137 #else
9138 				fprintf(stderr, " Can't encrypt data of page,"
9139 					" page no:" ULINTPF
9140 					" space id:" ULINTPF,
9141 					page_no, space_id);
9142 #endif /* !UNIV_INNOCHECKSUM */
9143 			return(src);
9144 		}
9145 
9146 		const ulint len = static_cast<ulint>(elen);
9147 		ut_ad(len == chunk_len);
9148 
9149 		/* Encrypt the trailing bytes. */
9150 		if (remain_len != 0) {
9151 			/* Copy remaining bytes and page tailer. */
9152 			memcpy(dst + FIL_PAGE_DATA + len,
9153 			       src + FIL_PAGE_DATA + len,
9154 			       remain_len);
9155 
9156 			const ulint trailer_len = MY_AES_BLOCK_SIZE * 2;
9157 			byte buf[trailer_len];
9158 
9159 			elen = my_aes_encrypt(
9160 				dst + FIL_PAGE_DATA + data_len - trailer_len,
9161 				static_cast<uint32>(trailer_len), buf,
9162 				reinterpret_cast<unsigned char*>(m_key),
9163 				static_cast<uint32>(m_klen), my_aes_256_cbc,
9164 				reinterpret_cast<byte *>(m_iv), false);
9165 
9166 			if (elen == MY_AES_BAD_DATA) {
9167 				ulint	page_no =mach_read_from_4(
9168 					src + FIL_PAGE_OFFSET);
9169 				ulint	space_id = mach_read_from_4(
9170 					src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9171 #ifndef UNIV_INNOCHECKSUM
9172 				ib::warn()
9173 					<< " Can't encrypt data of page,"
9174 					<< " page no:" << page_no
9175 					<< " space id:" << space_id;
9176 #else
9177 				fprintf(stderr, " Can't encrypt data of page,"
9178 					" page no:" ULINTPF
9179 					" space id:" ULINTPF,
9180 					page_no, space_id);
9181 #endif /* !UNIV_INNOCHECKSUM */
9182 				*dst_len = src_len;
9183 				return(src);
9184 			}
9185 
9186 			ut_a(static_cast<ulint>(elen) == trailer_len);
9187 
9188 			memcpy(dst + FIL_PAGE_DATA + data_len - trailer_len,
9189 			       buf, trailer_len);
9190 		}
9191 
9192 
9193 		break;
9194 	}
9195 
9196 	default:
9197 		ut_error;
9198 	}
9199 
9200 	/* Copy the header as is. */
9201 	memmove(dst, src, FIL_PAGE_DATA);
9202 	ut_ad(memcmp(src, dst, FIL_PAGE_DATA) == 0);
9203 
9204 	/* Add encryption control information. Required for decrypting. */
9205 	if (page_type == FIL_PAGE_COMPRESSED) {
9206 		/* If the page is compressed, we don't need to save the
9207 		original type, since it is done in compression already. */
9208 		mach_write_to_2(dst + FIL_PAGE_TYPE,
9209 				FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9210 		ut_ad(memcmp(src+FIL_PAGE_TYPE+2,
9211 			     dst+FIL_PAGE_TYPE+2,
9212 			     FIL_PAGE_DATA-FIL_PAGE_TYPE-2) == 0);
9213 	} else if (page_type == FIL_PAGE_RTREE) {
9214 		/* If the page is R-tree page, we need to save original type. */
9215 		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED_RTREE);
9216 	} else{
9217 		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED);
9218 		mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
9219 	}
9220 
9221 #ifdef UNIV_ENCRYPT_DEBUG
9222 #ifndef UNIV_INNOCHECKSUM
9223 #if 0
9224 	byte*	check_buf = static_cast<byte*>(ut_malloc_nokey(src_len));
9225 	byte*	buf2 = static_cast<byte*>(ut_malloc_nokey(src_len));
9226 
9227 	memcpy(check_buf, dst, src_len);
9228 
9229 	dberr_t err = decrypt(type, check_buf, src_len, buf2, src_len);
9230 	if (err != DB_SUCCESS || memcmp(src + FIL_PAGE_DATA,
9231 					check_buf + FIL_PAGE_DATA,
9232 					src_len - FIL_PAGE_DATA) != 0) {
9233 		ut_print_buf(stderr, src, src_len);
9234 		ut_print_buf(stderr, check_buf, src_len);
9235 		ut_ad(0);
9236 	}
9237 	ut_free(buf2);
9238 	ut_free(check_buf);
9239 #endif
9240 	fprintf(stderr, "Encrypted page:%lu.%lu\n", space_id, page_no);
9241 #endif
9242 #endif
9243 
9244 	/* Add padding 0 for unused portion */
9245 	if (src_len > src_enc_len) {
9246 		memset(dst + src_enc_len, 0, src_len - src_enc_len);
9247 	}
9248 
9249 	*dst_len = src_len;
9250 
9251 	return(dst);
9252 }
9253 
9254 /** Decrypt the page data contents. Page type must be FIL_PAGE_ENCRYPTED,
9255 if not then the source contents are left unchanged and DB_SUCCESS is returned.
9256 @param[in]	type		IORequest
9257 @param[in,out]	src		Data read from disk, decrypted data will be
9258 				copied to this page
9259 @param[in]	src_len		source data length
9260 @param[in,out]	dst		Scratch area to use for decryption
9261 @param[in]	dst_len		Size of the scratch area in bytes
9262 @return DB_SUCCESS or error code */
9263 dberr_t
decrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)9264 Encryption::decrypt(
9265 	const IORequest&	type,
9266 	byte*			src,
9267 	ulint			src_len,
9268 	byte*			dst,
9269 	ulint			dst_len)
9270 {
9271 	ulint		data_len;
9272 	ulint		main_len;
9273 	ulint		remain_len;
9274 	ulint		original_type;
9275 	ulint		page_type;
9276 	byte		remain_buf[MY_AES_BLOCK_SIZE * 2];
9277 	Block*		block;
9278 
9279 	/* Do nothing if it's not an encrypted table. */
9280 	if (!is_encrypted_page(src)) {
9281 		return(DB_SUCCESS);
9282 	}
9283 
9284 	/* For compressed page, we need to get the compressed size
9285 	for decryption */
9286 	page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9287 	if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
9288 		src_len = static_cast<uint16_t>(
9289 			mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1))
9290 			+ FIL_PAGE_DATA;
9291 #ifndef UNIV_INNOCHECKSUM
9292 		Compression::meta_t header;
9293 		Compression::deserialize_header(src, &header);
9294 		if (header.m_version == Compression::FIL_PAGE_VERSION_1) {
9295 			src_len = ut_calc_align(src_len, type.block_size());
9296 		} else {
9297 			/* Extend src_len if needed */
9298 			if (src_len < MIN_ENCRYPTION_LEN) {
9299 				src_len = MIN_ENCRYPTION_LEN;
9300 			}
9301 		}
9302 #endif
9303 	}
9304 #ifdef UNIV_ENCRYPT_DEBUG
9305 	ulint space_id =
9306 		mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9307 	ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9308 
9309 	fprintf(stderr, "Decrypting page:%lu.%lu len:%lu\n",
9310 		space_id, page_no, src_len);
9311 #endif
9312 
9313 	original_type = static_cast<uint16_t>(
9314 		mach_read_from_2(src + FIL_PAGE_ORIGINAL_TYPE_V1));
9315 
9316 	byte*	ptr = src + FIL_PAGE_DATA;
9317 
9318 	/* The caller doesn't know what to expect */
9319 	if (dst == NULL) {
9320 
9321 		block = os_alloc_block();
9322 #ifdef UNIV_INNOCHECKSUM
9323 		dst = block;
9324 #else
9325 		dst = block->m_ptr;
9326 #endif /* UNIV_INNOCHECKSUM */
9327 
9328 	} else {
9329 		block = NULL;
9330 	}
9331 
9332 	data_len = src_len - FIL_PAGE_DATA;
9333 	main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9334 	remain_len = data_len - main_len;
9335 
9336 	switch(m_type) {
9337 	case Encryption::AES: {
9338 		lint			elen;
9339 
9340 		/* First decrypt the last 2 blocks data of data, since
9341 		data is no block aligned. */
9342 		if (remain_len != 0) {
9343 			ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9344 
9345 			remain_len = MY_AES_BLOCK_SIZE * 2;
9346 
9347 			/* Copy the last 2 blocks. */
9348 			memcpy(remain_buf,
9349 			       ptr + data_len - remain_len,
9350 			       remain_len);
9351 
9352 			elen = my_aes_decrypt(
9353 				remain_buf,
9354 				static_cast<uint32>(remain_len),
9355 				dst + data_len - remain_len,
9356 				reinterpret_cast<unsigned char*>(m_key),
9357 				static_cast<uint32>(m_klen),
9358 				my_aes_256_cbc,
9359 				reinterpret_cast<unsigned char*>(m_iv),
9360 				false);
9361 			if (elen == MY_AES_BAD_DATA) {
9362 				if (block != NULL) {
9363 					os_free_block(block);
9364 				}
9365 
9366 				return(DB_IO_DECRYPT_FAIL);
9367 			}
9368 
9369 			/* Copy the other data bytes to temp area. */
9370 			memcpy(dst, ptr, data_len - remain_len);
9371 		} else {
9372 			ut_ad(data_len == main_len);
9373 
9374 			/* Copy the data bytes to temp area. */
9375 			memcpy(dst, ptr, data_len);
9376 		}
9377 
9378 		/* Then decrypt the main data */
9379 		elen = my_aes_decrypt(
9380 				dst,
9381 				static_cast<uint32>(main_len),
9382 				ptr,
9383 				reinterpret_cast<unsigned char*>(m_key),
9384 				static_cast<uint32>(m_klen),
9385 				my_aes_256_cbc,
9386 				reinterpret_cast<unsigned char*>(m_iv),
9387 				false);
9388 		if (elen == MY_AES_BAD_DATA) {
9389 
9390 			if (block != NULL) {
9391 				os_free_block(block);
9392 			}
9393 
9394 			return(DB_IO_DECRYPT_FAIL);
9395 		}
9396 
9397 		ut_ad(static_cast<ulint>(elen) == main_len);
9398 
9399 		/* Copy the remain bytes. */
9400 		memcpy(ptr + main_len, dst + main_len, data_len - main_len);
9401 
9402 		break;
9403 	}
9404 
9405 	default:
9406 		if (!type.is_dblwr_recover()) {
9407 #if !defined(UNIV_INNOCHECKSUM)
9408 			ib::error()
9409 				<< "Encryption algorithm support missing: "
9410 				<< Encryption::to_string(m_type);
9411 #else
9412 			fprintf(stderr, "Encryption algorithm support missing: %s\n",
9413 				Encryption::to_string(m_type));
9414 #endif /* !UNIV_INNOCHECKSUM */
9415 		}
9416 
9417 		if (block != NULL) {
9418 			os_free_block(block);
9419 		}
9420 
9421 		return(DB_UNSUPPORTED);
9422 	}
9423 
9424 	/* Restore the original page type. If it's a compressed and
9425 	encrypted page, just reset it as compressed page type, since
9426 	we will do uncompress later. */
9427 
9428 	if (page_type == FIL_PAGE_ENCRYPTED) {
9429 		mach_write_to_2(src + FIL_PAGE_TYPE, original_type);
9430 		mach_write_to_2(src + FIL_PAGE_ORIGINAL_TYPE_V1, 0);
9431 	} else if (page_type == FIL_PAGE_ENCRYPTED_RTREE) {
9432 		mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_RTREE);
9433 	} else {
9434 		ut_ad(page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9435 		mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
9436 	}
9437 
9438 	if (block != NULL) {
9439 		os_free_block(block);
9440 	}
9441 
9442 #ifdef UNIV_ENCRYPT_DEBUG
9443 	fprintf(stderr, "Decrypted page:%lu.%lu\n", space_id, page_no);
9444 #endif
9445 
9446 	DBUG_EXECUTE_IF("ib_crash_during_decrypt_page", DBUG_SUICIDE(););
9447 
9448 	return(DB_SUCCESS);
9449 }
9450 
9451 /** Normalizes a directory path for the current OS:
9452 On Windows, we convert '/' to '\', else we convert '\' to '/'.
9453 @param[in,out] str A null-terminated directory and file path */
9454 void
os_normalize_path(char * str)9455 os_normalize_path(
9456 	char*	str)
9457 {
9458 	if (str != NULL) {
9459 		for (; *str; str++) {
9460 			if (*str == OS_PATH_SEPARATOR_ALT) {
9461 				*str = OS_PATH_SEPARATOR;
9462 			}
9463 		}
9464 	}
9465 }
9466