1 /***********************************************************************
2 
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 ***********************************************************************/
34 
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38 
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41 
42 #ifndef UNIV_INNOCHECKSUM
43 
44 #include "ha_prototypes.h"
45 #include "sql_const.h"
46 
47 #include "os0file.h"
48 
49 #ifdef UNIV_NONINL
50 #include "os0file.ic"
51 #endif
52 
53 #include "page0page.h"
54 #include "srv0srv.h"
55 #include "srv0start.h"
56 #include "fil0fil.h"
57 #ifndef UNIV_HOTBACKUP
58 # include "os0event.h"
59 # include "os0thread.h"
60 #else /* !UNIV_HOTBACKUP */
61 # ifdef _WIN32
62 /* Add includes for the _stat() call to compile on Windows */
63 #  include <sys/types.h>
64 #  include <sys/stat.h>
65 #  include <errno.h>
66 # endif /* _WIN32 */
67 #endif /* !UNIV_HOTBACKUP */
68 
69 #include <vector>
70 #include <functional>
71 
72 #ifdef LINUX_NATIVE_AIO
73 #include <libaio.h>
74 #endif /* LINUX_NATIVE_AIO */
75 
76 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
77 # include <fcntl.h>
78 # include <linux/falloc.h>
79 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
80 
81 #include <lz4.h>
82 #include <zlib.h>
83 
#ifdef UNIV_DEBUG
/** Set when InnoDB has invoked exit(). Only defined in UNIV_DEBUG
builds; as a namespace-scope object it starts out as false. */
bool	innodb_calling_exit;
#endif /* UNIV_DEBUG */
88 
89 #include <my_aes.h>
90 #include <my_rnd.h>
91 #include <mysqld.h>
92 #include <mysql/service_mysql_keyring.h>
93 
94 /** Insert buffer segment id */
95 static const ulint IO_IBUF_SEGMENT = 0;
96 
97 /** Log segment id */
98 static const ulint IO_LOG_SEGMENT = 1;
99 
100 /** Number of retries for partial I/O's */
101 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
102 
103 /** Blocks for doing IO, used in the transparent compression
104 and encryption code. */
105 struct Block {
106 	/** Default constructor */
BlockBlock107 	Block() : m_ptr(), m_in_use() { }
108 
109 	byte*		m_ptr;
110 
111 	byte		pad[CACHE_LINE_SIZE - sizeof(ulint)];
112 	lock_word_t	m_in_use;
113 };
114 
115 /** For storing the allocated blocks */
116 typedef std::vector<Block> Blocks;
117 
118 /** Block collection */
119 static Blocks*	block_cache;
120 
121 /** Number of blocks to allocate for sync read/writes */
122 static const size_t	MAX_BLOCKS = 128;
123 
124 /** Block buffer size */
125 #define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))
126 
127 /** Disk sector size of aligning write buffer for DIRECT_IO */
128 static ulint	os_io_ptr_align = UNIV_SECTOR_SIZE;
129 
130 /* This specifies the file permissions InnoDB uses when it creates files in
131 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
132 my_umask */
133 
134 #ifndef _WIN32
135 /** Umask for creating files */
136 static ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
137 #else
138 /** Umask for creating files */
139 static ulint	os_innodb_umask	= 0;
140 
141 /* On Windows when using native AIO the number of AIO requests
142 that a thread can handle at a given time is limited to 32
143 i.e.: SRV_N_PENDING_IOS_PER_THREAD */
144 #define SRV_N_PENDING_IOS_PER_THREAD    OS_AIO_N_PENDING_IOS_PER_THREAD
145 
146 #endif /* _WIN32 */
147 
148 #ifndef UNIV_HOTBACKUP
149 
150 /** In simulated aio, merge at most this many consecutive i/os */
151 static const ulint	OS_AIO_MERGE_N_CONSECUTIVE = 64;
152 
153 /** Flag indicating if the page_cleaner is in active state. */
154 extern bool buf_page_cleaner_is_active;
155 
156 /**********************************************************************
157 
158 InnoDB AIO Implementation:
159 =========================
160 
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO with special IO threads that service the IO requests.
163 
164 Simulated AIO:
165 ==============
166 
167 On platforms where we 'simulate' AIO, the following is a rough explanation
168 of the high level design.
169 There are four io-threads (for ibuf, log, read, write).
170 All synchronous IO requests are serviced by the calling thread using
171 os_file_write/os_file_read. The Asynchronous requests are queued up
172 in an array (there are four such arrays) by the calling thread.
173 Later these requests are picked up by the IO-thread and are serviced
174 synchronously.
175 
176 Windows native AIO:
177 ==================
178 
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
184 There are innodb_file_io_threads helper threads. These threads work
185 on the four arrays mentioned above in Simulated AIO. No thread is
186 required for the sync array.
187 If a synchronous IO request is made, it is first queued in the sync
188 array. Then the calling thread itself waits on the request, thus
189 making the call synchronous.
190 If an AIO request is made the calling thread not only queues it in the
191 array but also submits the requests. The helper thread then collects
192 the completed IO request and calls completion routine on it.
193 
194 Linux native AIO:
195 =================
196 
197 If we have libaio installed on the system and innodb_use_native_aio
198 is set to true we follow the code path of native AIO, otherwise we
199 do simulated AIO.
200 There are innodb_file_io_threads helper threads. These threads work
201 on the four arrays mentioned above in Simulated AIO.
202 If a synchronous IO request is made, it is handled by calling
203 os_file_write/os_file_read.
204 If an AIO request is made the calling thread not only queues it in the
205 array but also submits the requests. The helper thread then collects
206 the completed IO request and calls completion routine on it.
207 
208 **********************************************************************/
209 
210 
#ifdef UNIV_PFS_IO
/* Keys to register InnoDB I/O with performance schema */
mysql_pfs_key_t  innodb_data_file_key;
mysql_pfs_key_t  innodb_log_file_key;
mysql_pfs_key_t  innodb_temp_file_key;
#endif /* UNIV_PFS_IO */
217 
218 /** The asynchronous I/O context */
219 struct Slot {
SlotSlot220 	Slot() { memset(this, 0, sizeof(*this)); }
221 
222 	/** index of the slot in the aio array */
223 	uint16_t		pos;
224 
225 	/** true if this slot is reserved */
226 	bool			is_reserved;
227 
228 	/** time when reserved */
229 	ib_time_monotonic_t	reservation_time;
230 
231 	/** buffer used in i/o */
232 	byte*			buf;
233 
234 	/** Buffer pointer used for actual IO. We advance this
235 	when partial IO is required and not buf */
236 	byte*			ptr;
237 
238 	/** OS_FILE_READ or OS_FILE_WRITE */
239 	IORequest		type;
240 
241 	/** file offset in bytes */
242 	os_offset_t		offset;
243 
244 	/** file where to read or write */
245 	pfs_os_file_t		file;
246 
247 	/** file name or path */
248 	const char*		name;
249 
250 	/** used only in simulated aio: true if the physical i/o
251 	already made and only the slot message needs to be passed
252 	to the caller of os_aio_simulated_handle */
253 	bool			io_already_done;
254 
255 	/** The file node for which the IO is requested. */
256 	fil_node_t*		m1;
257 
258 	/** the requester of an aio operation and which can be used
259 	to identify which pending aio operation was completed */
260 	void*			m2;
261 
262 	/** AIO completion status */
263 	dberr_t			err;
264 
265 #ifdef WIN_ASYNC_IO
266 	/** handle object we need in the OVERLAPPED struct */
267 	HANDLE			handle;
268 
269 	/** Windows control block for the aio request */
270 	OVERLAPPED		control;
271 
272 	/** bytes written/read */
273 	DWORD			n_bytes;
274 
275 	/** length of the block to read or write */
276 	DWORD			len;
277 
278 #elif defined(LINUX_NATIVE_AIO)
279 	/** Linux control block for aio */
280 	struct iocb		control;
281 
282 	/** AIO return code */
283 	int			ret;
284 
285 	/** bytes written/read. */
286 	ssize_t			n_bytes;
287 
288 	/** length of the block to read or write */
289 	ulint			len;
290 #else
291 	/** length of the block to read or write */
292 	ulint			len;
293 
294 	/** bytes written/read. */
295 	ulint			n_bytes;
296 #endif /* WIN_ASYNC_IO */
297 
298 	/** Length of the block before it was compressed */
299 	uint32			original_len;
300 
301 	/** Buffer block for compressed pages or encrypted pages */
302 	Block*			buf_block;
303 
304 	/** true, if we shouldn't punch a hole after writing the page */
305 	bool			skip_punch_hole;
306 };
307 
308 /** The asynchronous i/o array structure */
309 class AIO {
310 public:
311 	/** Constructor
312 	@param[in]	id		Latch ID
313 	@param[in]	n_slots		Number of slots to configure
314 	@param[in]	segments	Number of segments to configure */
315 	AIO(latch_id_t id, ulint n_slots, ulint segments);
316 
317 	/** Destructor */
318 	~AIO();
319 
320 	/** Initialize the instance
321 	@return DB_SUCCESS or error code */
322 	dberr_t init();
323 
324 	/** Requests for a slot in the aio array. If no slot is available, waits
325 	until not_full-event becomes signaled.
326 
327 	@param[in,out]	type	IO context
328 	@param[in,out]	m1	message to be passed along with the AIO
329 				operation
330 	@param[in,out]	m2	message to be passed along with the AIO
331 				operation
332 	@param[in]	file	file handle
333 	@param[in]	name	name of the file or path as a null-terminated
334 				string
335 	@param[in,out]	buf	buffer where to read or from which to write
336 	@param[in]	offset	file offset, where to read from or start writing
337 	@param[in]	len	length of the block to read or write
338 	@return pointer to slot */
339 	Slot* reserve_slot(
340 		IORequest&	type,
341 		fil_node_t*	m1,
342 		void*		m2,
343 		pfs_os_file_t	file,
344 		const char*	name,
345 		void*		buf,
346 		os_offset_t	offset,
347 		ulint		len)
348 		MY_ATTRIBUTE((warn_unused_result));
349 
350 	/** @return number of reserved slots */
351 	ulint pending_io_count() const;
352 
353 	/** Returns a pointer to the nth slot in the aio array.
354 	@param[in]	index	Index of the slot in the array
355 	@return pointer to slot */
at(ulint i) const356 	const Slot* at(ulint i) const
357 		MY_ATTRIBUTE((warn_unused_result))
358 	{
359 		ut_a(i < m_slots.size());
360 
361 		return(&m_slots[i]);
362 	}
363 
364 	/** Non const version */
at(ulint i)365 	Slot* at(ulint i)
366 		MY_ATTRIBUTE((warn_unused_result))
367 	{
368 		ut_a(i < m_slots.size());
369 
370 		return(&m_slots[i]);
371 	}
372 
373 	/** Frees a slot in the AIO array, assumes caller owns the mutex.
374 	@param[in,out]	slot	Slot to release */
375 	void release(Slot* slot);
376 
377 	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
378 	@param[in,out]	slot	Slot to release */
379 	void release_with_mutex(Slot* slot);
380 
381 	/** Prints info about the aio array.
382 	@param[in,out]	file	Where to print */
383 	void print(FILE* file);
384 
385 	/** @return the number of slots per segment */
slots_per_segment() const386 	ulint slots_per_segment() const
387 		MY_ATTRIBUTE((warn_unused_result))
388 	{
389 		return(m_slots.size() / m_n_segments);
390 	}
391 
392 	/** @return accessor for n_segments */
get_n_segments() const393 	ulint get_n_segments() const
394 		MY_ATTRIBUTE((warn_unused_result))
395 	{
396 		return(m_n_segments);
397 	}
398 
399 #ifdef UNIV_DEBUG
400 	/** @return true if the thread owns the mutex */
is_mutex_owned() const401 	bool is_mutex_owned() const
402 		MY_ATTRIBUTE((warn_unused_result))
403 	{
404 		return(mutex_own(&m_mutex));
405 	}
406 #endif /* UNIV_DEBUG */
407 
408 	/** Acquire the mutex */
acquire() const409 	void acquire() const
410 	{
411 		mutex_enter(&m_mutex);
412 	}
413 
414 	/** Release the mutex */
release() const415 	void release() const
416 	{
417 		mutex_exit(&m_mutex);
418 	}
419 
420 	/** Write out the state to the file/stream
421 	@param[in, out]	file	File to write to */
422 	void to_file(FILE* file) const;
423 
424 #ifdef LINUX_NATIVE_AIO
425 	/** Dispatch an AIO request to the kernel.
426 	@param[in,out]	slot	an already reserved slot
427 	@return true on success. */
428 	bool linux_dispatch(Slot* slot)
429 		MY_ATTRIBUTE((warn_unused_result));
430 
431 	/** Accessor for an AIO event
432 	@param[in]	index	Index into the array
433 	@return the event at the index */
io_events(ulint index)434 	io_event* io_events(ulint index)
435 		MY_ATTRIBUTE((warn_unused_result))
436 	{
437 		ut_a(index < m_events.size());
438 
439 		return(&m_events[index]);
440 	}
441 
442 	/** Accessor for the AIO context
443 	@param[in]	segment	Segment for which to get the context
444 	@return the AIO context for the segment */
io_ctx(ulint segment)445 	io_context* io_ctx(ulint segment)
446 		MY_ATTRIBUTE((warn_unused_result))
447 	{
448 		ut_ad(segment < get_n_segments());
449 
450 		return(m_aio_ctx[segment]);
451 	}
452 
453 	/** Creates an io_context for native linux AIO.
454 	@param[in]	max_events	number of events
455 	@param[out]	io_ctx		io_ctx to initialize.
456 	@return true on success. */
457 	static bool linux_create_io_ctx(ulint max_events, io_context_t* io_ctx)
458 		MY_ATTRIBUTE((warn_unused_result));
459 
460 	/** Checks if the system supports native linux aio. On some kernel
461 	versions where native aio is supported it won't work on tmpfs. In such
462 	cases we can't use native aio as it is not possible to mix simulated
463 	and native aio.
464 	@return true if supported, false otherwise. */
465 	static bool is_linux_native_aio_supported()
466 		MY_ATTRIBUTE((warn_unused_result));
467 #endif /* LINUX_NATIVE_AIO */
468 
469 #ifdef WIN_ASYNC_IO
470 	/** Wakes up all async i/o threads in the array in Windows async I/O at
471 	shutdown. */
signal()472 	void signal()
473 	{
474 		for (ulint i = 0; i < m_slots.size(); ++i) {
475 			SetEvent(m_slots[i].handle);
476 		}
477 	}
478 
479 	/** Wake up all AIO threads in Windows native aio */
wake_at_shutdown()480 	static void wake_at_shutdown()
481 	{
482 		s_reads->signal();
483 
484 		if (s_writes != NULL) {
485 			s_writes->signal();
486 		}
487 
488 		if (s_ibuf != NULL) {
489 			s_ibuf->signal();
490 		}
491 
492 		if (s_log != NULL) {
493 			s_log->signal();
494 		}
495 	}
496 #endif /* WIN_ASYNC_IO */
497 
498 #ifdef _WIN32
499 	/** This function can be called if one wants to post a batch of reads
500 	and prefers an I/O - handler thread to handle them all at once later.You
501 	must call os_aio_simulated_wake_handler_threads later to ensure the
502 	threads are not left sleeping! */
503 	static void simulated_put_read_threads_to_sleep();
504 
505 	/** The non asynchronous IO array.
506 	@return the synchronous AIO array instance. */
sync_array()507 	static AIO* sync_array()
508 		MY_ATTRIBUTE((warn_unused_result))
509 	{
510 		return(s_sync);
511 	}
512 
513 	/**
514 	Get the AIO handles for a segment.
515 	@param[in]	segment		The local segment.
516 	@return the handles for the segment. */
handles(ulint segment)517 	HANDLE* handles(ulint segment)
518 		MY_ATTRIBUTE((warn_unused_result))
519 	{
520 		ut_ad(segment < m_handles->size() / slots_per_segment());
521 
522 		return(&(*m_handles)[segment * slots_per_segment()]);
523 	}
524 
525 	/** @return true if no slots are reserved */
is_empty() const526 	bool is_empty() const
527 		MY_ATTRIBUTE((warn_unused_result))
528 	{
529 		ut_ad(is_mutex_owned());
530 		return(m_n_reserved == 0);
531 	}
532 #endif /* _WIN32 */
533 
534 	/** Create an instance using new(std::nothrow)
535 	@param[in]	id		Latch ID
536 	@param[in]	n_slots		The number of AIO request slots
537 	@param[in]	segments	The number of segments
538 	@return a new AIO instance */
539 	static AIO* create(
540 		latch_id_t	id,
541 		ulint		n_slots,
542 		ulint		segments)
543 		MY_ATTRIBUTE((warn_unused_result));
544 
545 	/** Initializes the asynchronous io system. Creates one array each
546 	for ibuf and log I/O. Also creates one array each for read and write
547 	where each array is divided logically into n_readers and n_writers
548 	respectively. The caller must create an i/o handler thread for each
549 	segment in these arrays. This function also creates the sync array.
550 	No I/O handler thread needs to be created for that
551 	@param[in]	n_per_seg	maximum number of pending aio
552 					operations allowed per segment
553 	@param[in]	n_readers	number of reader threads
554 	@param[in]	n_writers	number of writer threads
555 	@param[in]	n_slots_sync	number of slots in the sync aio array
556 	@return true if AIO sub-system was started successfully */
557 	static bool start(
558 		ulint		n_per_seg,
559 		ulint		n_readers,
560 		ulint		n_writers,
561 		ulint		n_slots_sync)
562 		MY_ATTRIBUTE((warn_unused_result));
563 
564 	/** Free the AIO arrays */
565 	static void shutdown();
566 
567 	/** Print all the AIO segments
568 	@param[in,out]	file		Where to print */
569 	static void print_all(FILE* file);
570 
571 	/** Calculates local segment number and aio array from global
572 	segment number.
573 	@param[out]	array		AIO wait array
574 	@param[in]	segment		global segment number
575 	@return local segment number within the aio array */
576 	static ulint get_array_and_local_segment(
577 		AIO**		array,
578 		ulint		segment)
579 		MY_ATTRIBUTE((warn_unused_result));
580 
581 	/** Select the IO slot array
582 	@param[in]	type		Type of IO, READ or WRITE
583 	@param[in]	read_only	true if running in read-only mode
584 	@param[in]	mode		IO mode
585 	@return slot array or NULL if invalid mode specified */
586 	static AIO* select_slot_array(
587 		IORequest&	type,
588 		bool		read_only,
589 		ulint		mode)
590 		MY_ATTRIBUTE((warn_unused_result));
591 
592 	/** Calculates segment number for a slot.
593 	@param[in]	array		AIO wait array
594 	@param[in]	slot		slot in this array
595 	@return segment number (which is the number used by, for example,
596 		I/O handler threads) */
597 	static ulint get_segment_no_from_slot(
598 		const AIO*	array,
599 		const Slot*	slot)
600 		MY_ATTRIBUTE((warn_unused_result));
601 
602 	/** Wakes up a simulated AIO I/O-handler thread if it has something
603 	to do.
604 	@param[in]	global_segment	the number of the segment in the
605 					AIO arrays */
606 	static void wake_simulated_handler_thread(ulint global_segment);
607 
608 	/** Check if it is a read request
609 	@param[in]	aio		The AIO instance to check
610 	@return true if the AIO instance is for reading. */
is_read(const AIO * aio)611 	static bool is_read(const AIO* aio)
612 		MY_ATTRIBUTE((warn_unused_result))
613 	{
614 		return(s_reads == aio);
615 	}
616 
617 	/** Wait on an event until no pending writes */
wait_until_no_pending_writes()618 	static void wait_until_no_pending_writes()
619 	{
620 		os_event_wait(AIO::s_writes->m_is_empty);
621 	}
622 
623 	/** Print to file
624 	@param[in]	file		File to write to */
625 	static void print_to_file(FILE* file);
626 
627 	/** Check for pending IO. Gets the count and also validates the
628 	data structures.
629 	@return count of pending IO requests */
630 	static ulint total_pending_io_count();
631 
632 private:
633 	/** Initialise the slots
634 	@return DB_SUCCESS or error code */
635 	dberr_t init_slots()
636 		MY_ATTRIBUTE((warn_unused_result));
637 
638 	/** Wakes up a simulated AIO I/O-handler thread if it has something
639 	to do for a local segment in the AIO array.
640 	@param[in]	global_segment	the number of the segment in the
641 					AIO arrays
642 	@param[in]	segment		the local segment in the AIO array */
643 	void wake_simulated_handler_thread(ulint global_segment, ulint segment);
644 
645 	/** Prints pending IO requests per segment of an aio array.
646 	We probably don't need per segment statistics but they can help us
647 	during development phase to see if the IO requests are being
648 	distributed as expected.
649 	@param[in,out]	file		file where to print
650 	@param[in]	segments	pending IO array */
651 	void print_segment_info(
652 		FILE*		file,
653 		const ulint*	segments);
654 
655 #ifdef LINUX_NATIVE_AIO
656 	/** Initialise the Linux native AIO data structures
657 	@return DB_SUCCESS or error code */
658 	dberr_t init_linux_native_aio()
659 		MY_ATTRIBUTE((warn_unused_result));
660 #endif /* LINUX_NATIVE_AIO */
661 
662 private:
663 	typedef std::vector<Slot> Slots;
664 
665 	/** the mutex protecting the aio array */
666 	mutable SysMutex	m_mutex;
667 
668 	/** Pointer to the slots in the array.
669 	Number of elements must be divisible by n_threads. */
670 	Slots			m_slots;
671 
672 	/** Number of segments in the aio array of pending aio requests.
673 	A thread can wait separately for any one of the segments. */
674 	ulint			m_n_segments;
675 
676 	/** The event which is set to the signaled state when
677 	there is space in the aio outside the ibuf segment */
678 	os_event_t		m_not_full;
679 
680 	/** The event which is set to the signaled state when
681 	there are no pending i/os in this array */
682 	os_event_t		m_is_empty;
683 
684 	/** Number of reserved slots in the AIO array outside
685 	the ibuf segment */
686 	ulint			m_n_reserved;
687 
688 #ifdef _WIN32
689 	typedef std::vector<HANDLE, ut_allocator<HANDLE> > Handles;
690 
691 	/** Pointer to an array of OS native event handles where
692 	we copied the handles from slots, in the same order. This
693 	can be used in WaitForMultipleObjects; used only in Windows */
694 	Handles*		m_handles;
695 #endif /* _WIN32 */
696 
697 #if defined(LINUX_NATIVE_AIO)
698 	typedef std::vector<io_event> IOEvents;
699 
700 	/** completion queue for IO. There is one such queue per
701 	segment. Each thread will work on one ctx exclusively. */
702 	io_context_t*		m_aio_ctx;
703 
704 	/** The array to collect completed IOs. There is one such
705 	event for each possible pending IO. The size of the array
706 	is equal to m_slots.size(). */
707 	IOEvents		m_events;
708 #endif /* LINUX_NATIV_AIO */
709 
710 	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
711 	sync AIO. These are NULL when the module has not yet been
712 	initialized. */
713 
714 	/** Insert buffer */
715 	static AIO*		s_ibuf;
716 
717 	/** Redo log */
718 	static AIO*		s_log;
719 
720 	/** Reads */
721 	static AIO*		s_reads;
722 
723 	/** Writes */
724 	static AIO*		s_writes;
725 
726 	/** Synchronous I/O */
727 	static AIO*		s_sync;
728 };
729 
730 /** Static declarations */
731 AIO*	AIO::s_reads;
732 AIO*	AIO::s_writes;
733 AIO*	AIO::s_ibuf;
734 AIO*	AIO::s_log;
735 AIO*	AIO::s_sync;
736 
737 #if defined(LINUX_NATIVE_AIO)
738 /** timeout for each io_getevents() call = 500ms. */
739 static const ulint	OS_AIO_REAP_TIMEOUT = 500000000UL;
740 
741 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
742 static const ulint	OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
743 
744 /** number of attempts before giving up on io_setup(). */
745 static const int	OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
746 #endif /* LINUX_NATIVE_AIO */
747 
748 /** Array of events used in simulated AIO */
749 static os_event_t*	os_aio_segment_wait_events = NULL;
750 
751 /** Number of asynchronous I/O segments.  Set by os_aio_init(). */
752 static ulint		os_aio_n_segments = ULINT_UNDEFINED;
753 
754 /** If the following is true, read i/o handler threads try to
755 wait until a batch of new read requests have been posted */
756 static bool		os_aio_recommend_sleep_for_read_threads = false;
757 #endif /* !UNIV_HOTBACKUP */
758 
759 ulint	os_n_file_reads		= 0;
760 ulint	os_bytes_read_since_printout = 0;
761 ulint	os_n_file_writes	= 0;
762 ulint	os_n_fsyncs		= 0;
763 ulint	os_n_file_reads_old	= 0;
764 ulint	os_n_file_writes_old	= 0;
765 ulint	os_n_fsyncs_old		= 0;
766 /** Number of pending write operations */
767 ulint	os_n_pending_writes = 0;
768 /** Number of pending read operations */
769 ulint	os_n_pending_reads = 0;
770 
771 ib_time_monotonic_t	os_last_printout;
772 bool	os_has_said_disk_full	= false;
773 
774 /** Default Zip compression level */
775 extern uint page_zip_level;
776 
777 #if DATA_TRX_ID_LEN > 6
778 #error "COMPRESSION_ALGORITHM will not fit"
779 #endif /* DATA_TRX_ID_LEN */
780 
/** Validates the consistency of the aio system.
@return true if ok */
static
bool
os_aio_validate();
786 
/** Does error handling when a file operation fails.
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@return true if we should retry the operation */
static
bool
os_file_handle_error(
	const char*	name,
	const char*	operation);
796 
797 /** Free storage space associated with a section of the file.
798 @param[in]      fh              Open file handle
799 @param[in]      off             Starting offset (SEEK_SET)
800 @param[in]      len             Size of the hole
801 @return DB_SUCCESS or error code */
802 dberr_t
803 os_file_punch_hole(
804         os_file_t   fh,
805         os_offset_t     off,
806         os_offset_t     len);
807 
/** Does error handling when a file operation fails.
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@param[in]	silent		if true then don't print any message to
				the log.
@return true if we should retry the operation */
static
bool
os_file_handle_error_no_exit(
	const char*	name,
	const char*	operation,
	bool		silent);
820 
821 /** Decompress after a read and punch a hole in the file if it was a write
822 @param[in]	type		IO context
823 @param[in]	fh		Open file handle
824 @param[in,out]	buf		Buffer to transform
825 @param[in,out]	scratch		Scratch area for read decompression
826 @param[in]	src_len		Length of the buffer before compression
827 @param[in]	len		Compressed buffer length for write and size
828 				of buf len for read
829 @return DB_SUCCESS or error code */
830 static
831 dberr_t
832 os_file_io_complete(
833 	const IORequest&type,
834 	os_file_t	fh,
835 	byte*		buf,
836 	byte*		scratch,
837 	ulint		src_len,
838 	os_offset_t	offset,
839 	ulint		len);
840 
841 /** Does simulated AIO. This function should be called by an i/o-handler
842 thread.
843 
844 @param[in]	segment	The number of the segment in the aio arrays to wait
845 			for; segment 0 is the ibuf i/o thread, segment 1 the
846 			log i/o thread, then follow the non-ibuf read threads,
847 			and as the last are the non-ibuf write threads
848 @param[out]	m1	the messages passed with the AIO request; note that
849 			also in the case where the AIO operation failed, these
850 			output parameters are valid and can be used to restart
851 			the operation, for example
852 @param[out]	m2	Callback argument
853 @param[in]	type	IO context
854 @return DB_SUCCESS or error code */
855 static
856 dberr_t
857 os_aio_simulated_handler(
858 	ulint		global_segment,
859 	fil_node_t**	m1,
860 	void**		m2,
861 	IORequest*	type);
862 
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
#endif /* WIN_ASYNC_IO */
896 
897 /** Allocate a page for sync IO
898 @return pointer to page */
899 static
900 Block*
os_alloc_block()901 os_alloc_block()
902 {
903 	size_t		pos;
904 	Blocks&		blocks = *block_cache;
905 	size_t		i = static_cast<size_t>(my_timer_cycles());
906 	const size_t	size = blocks.size();
907 	ulint		retry = 0;
908 	Block*		block;
909 
910 	DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
911 
912 	for (;;) {
913 
914 		/* After go through the block cache for 3 times,
915 		allocate a new temporary block. */
916 		if (retry == MAX_BLOCKS * 3) {
917 			byte*	ptr;
918 
919 			ptr = static_cast<byte*>(
920 				ut_malloc_nokey(sizeof(*block)
921 						+ BUFFER_BLOCK_SIZE));
922 
923 			block = new (ptr) Block();
924 			block->m_ptr = static_cast<byte*>(
925 				ptr + sizeof(*block));
926 			block->m_in_use = 1;
927 
928 			break;
929 		}
930 
931 		pos = i++ % size;
932 
933 		if (TAS(&blocks[pos].m_in_use, 1) == 0) {
934 			block = &blocks[pos];
935 			break;
936 		}
937 
938 		os_thread_yield();
939 
940 		++retry;
941 	}
942 
943 	ut_a(block->m_in_use != 0);
944 
945 	return(block);
946 }
947 
948 /** Free a page after sync IO
949 @param[in,own]	block		The block to free/release */
950 static
951 void
os_free_block(Block * block)952 os_free_block(Block* block)
953 {
954 	ut_ad(block->m_in_use == 1);
955 
956 	TAS(&block->m_in_use, 0);
957 
958 	/* When this block is not in the block cache, and it's
959 	a temporary block, we need to free it directly. */
960 	if (std::less<Block*>()(block, &block_cache->front())
961 	    || std::greater<Block*>()(block, &block_cache->back())) {
962 		ut_free(block);
963 	}
964 }
965 
966 /** Generic AIO Handler methods. Currently handles IO post processing. */
967 class AIOHandler {
968 public:
969 	/** Do any post processing after a read/write
970 	@return DB_SUCCESS or error code. */
971 	static dberr_t post_io_processing(Slot* slot);
972 
973 	/** Decompress after a read and punch a hole in the file if
974 	it was a write */
io_complete(const Slot * slot)975 	static dberr_t io_complete(const Slot* slot)
976 	{
977 		ut_a(slot->offset > 0);
978 		ut_a(slot->type.is_read() || !slot->skip_punch_hole);
979 		return(os_file_io_complete(
980 				slot->type, slot->file.m_file, slot->buf,
981 				NULL, slot->original_len,
982 				slot->offset, slot->len));
983 	}
984 
985 private:
986 	/** Check whether the page was encrypted.
987 	@param[in]	slot		The slot that contains the IO request
988 	@return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)989 	static bool is_encrypted_page(const Slot* slot)
990 	{
991 		return(Encryption::is_encrypted_page(slot->buf));
992 	}
993 
994 	/** Check whether the page was compressed.
995 	@param[in]	slot		The slot that contains the IO request
996 	@return true if it was a compressed page */
is_compressed_page(const Slot * slot)997 	static bool is_compressed_page(const Slot* slot)
998 	{
999 		const byte*	src = slot->buf;
1000 
1001 		ulint	page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1002 
1003 		return(page_type == FIL_PAGE_COMPRESSED);
1004 	}
1005 
1006 	/** Get the compressed page size.
1007 	@param[in]	slot		The slot that contains the IO request
1008 	@return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1009 	static ulint compressed_page_size(const Slot* slot)
1010 	{
1011 		ut_ad(slot->type.is_read());
1012 		ut_ad(is_compressed_page(slot));
1013 
1014 		ulint		size;
1015 		const byte*	src = slot->buf;
1016 
1017 		size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1018 
1019 		return(size + FIL_PAGE_DATA);
1020 	}
1021 
1022 	/** Check if the page contents can be decompressed.
1023 	@param[in]	slot		The slot that contains the IO request
1024 	@return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1025 	static bool can_decompress(const Slot* slot)
1026 	{
1027 		ut_ad(slot->type.is_read());
1028 		ut_ad(is_compressed_page(slot));
1029 
1030 		ulint		version;
1031 		const byte*	src = slot->buf;
1032 
1033 		version = mach_read_from_1(src + FIL_PAGE_VERSION);
1034 
1035 		ut_a(Compression::is_valid_page_version(version));
1036 
1037 		/* Includes the page header size too */
1038 		ulint		size = compressed_page_size(slot);
1039 
1040 		return(size <= (slot->ptr - slot->buf) + (ulint) slot->n_bytes);
1041 	}
1042 
1043 	/** Check if we need to read some more data.
1044 	@param[in]	slot		The slot that contains the IO request
1045 	@param[in]	n_bytes		Total bytes read so far
1046 	@return DB_SUCCESS or error code */
1047 	static dberr_t check_read(Slot* slot, ulint n_bytes);
1048 };
1049 
1050 /** Helper class for doing synchronous file IO. Currently, the objective
1051 is to hide the OS specific code, so that the higher level functions aren't
1052 peppered with #ifdef. Makes the code flow difficult to follow.  */
1053 class SyncFileIO {
1054 public:
1055 	/** Constructor
1056 	@param[in]	fh	File handle
1057 	@param[in,out]	buf	Buffer to read/write
1058 	@param[in]	n	Number of bytes to read/write
1059 	@param[in]	offset	Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1060 	SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
1061 		:
1062 		m_fh(fh),
1063 		m_buf(buf),
1064 		m_n(static_cast<ssize_t>(n)),
1065 		m_offset(offset)
1066 	{
1067 		ut_ad(m_n > 0);
1068 	}
1069 
1070 	/** Destructor */
~SyncFileIO()1071 	~SyncFileIO()
1072 	{
1073 		/* No op */
1074 	}
1075 
1076 	/** Do the read/write
1077 	@param[in]	request	The IO context and type
1078 	@return the number of bytes read/written or negative value on error */
1079 	ssize_t execute(const IORequest& request);
1080 
1081 	/** Do the read/write
1082 	@param[in,out]	slot	The IO slot, it has the IO context
1083 	@return the number of bytes read/written or negative value on error */
1084 	static ssize_t execute(Slot* slot);
1085 
1086 	/** Move the read/write offset up to where the partial IO succeeded.
1087 	@param[in]	n_bytes	The number of bytes to advance */
advance(ssize_t n_bytes)1088 	void advance(ssize_t n_bytes)
1089 	{
1090 		m_offset += n_bytes;
1091 
1092 		ut_ad(m_n >= n_bytes);
1093 
1094 		m_n -=  n_bytes;
1095 
1096 		m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1097 	}
1098 
1099 private:
1100 	/** Open file handle */
1101 	os_file_t		m_fh;
1102 
1103 	/** Buffer to read/write */
1104 	void*			m_buf;
1105 
1106 	/** Number of bytes to read/write */
1107 	ssize_t			m_n;
1108 
1109 	/** Offset from where to read/write */
1110 	os_offset_t		m_offset;
1111 };
1112 
1113 /** If it is a compressed page return the compressed page data + footer size
1114 @param[in]	buf		Buffer to check, must include header + 10 bytes
1115 @return ULINT_UNDEFINED if the page is not a compressed page or length
1116 	of the compressed data (including footer) if it is a compressed page */
1117 ulint
os_file_compressed_page_size(const byte * buf)1118 os_file_compressed_page_size(const byte* buf)
1119 {
1120 	ulint	type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1121 
1122 	if (type == FIL_PAGE_COMPRESSED) {
1123 		ulint	version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1124 		ut_a(Compression::is_valid_page_version(version));
1125 		return(mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1126 	}
1127 
1128 	return(ULINT_UNDEFINED);
1129 }
1130 
1131 /** If it is a compressed page return the original page data + footer size
1132 @param[in] buf		Buffer to check, must include header + 10 bytes
1133 @return ULINT_UNDEFINED if the page is not a compressed page or length
1134 	of the original data + footer if it is a compressed page */
1135 ulint
os_file_original_page_size(const byte * buf)1136 os_file_original_page_size(const byte* buf)
1137 {
1138 	ulint	type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1139 
1140 	if (type == FIL_PAGE_COMPRESSED) {
1141 
1142 		ulint	version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1143 		ut_a(Compression::is_valid_page_version(version));
1144 
1145 		return(mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1146 	}
1147 
1148 	return(ULINT_UNDEFINED);
1149 }
1150 
1151 /** Check if we need to read some more data.
1152 @param[in]	slot		The slot that contains the IO request
1153 @param[in]	n_bytes		Total bytes read so far
1154 @return DB_SUCCESS or error code */
1155 dberr_t
check_read(Slot * slot,ulint n_bytes)1156 AIOHandler::check_read(Slot* slot, ulint n_bytes)
1157 {
1158 	dberr_t	err;
1159 
1160 	ut_ad(slot->type.is_read());
1161 	ut_ad(slot->original_len > slot->len);
1162 
1163 	if (is_compressed_page(slot)) {
1164 
1165 		if (can_decompress(slot)) {
1166 
1167 			ut_a(slot->offset > 0);
1168 
1169 			slot->len = slot->original_len;
1170 #ifdef _WIN32
1171 			slot->n_bytes = static_cast<DWORD>(n_bytes);
1172 #else
1173 			slot->n_bytes = static_cast<ulint>(n_bytes);
1174 #endif /* _WIN32 */
1175 
1176 			err = io_complete(slot);
1177 			ut_a(err == DB_SUCCESS);
1178 		} else {
1179 			/* Read the next block in */
1180 			ut_ad(compressed_page_size(slot) >= n_bytes);
1181 
1182 			err = DB_FAIL;
1183 		}
1184 	} else if (is_encrypted_page(slot)) {
1185 			ut_a(slot->offset > 0);
1186 
1187 			slot->len = slot->original_len;
1188 #ifdef _WIN32
1189 			slot->n_bytes = static_cast<DWORD>(n_bytes);
1190 #else
1191 			slot->n_bytes = static_cast<ulint>(n_bytes);
1192 #endif /* _WIN32 */
1193 
1194 			err = io_complete(slot);
1195 			ut_a(err == DB_SUCCESS);
1196 
1197 	} else {
1198 		err = DB_FAIL;
1199 	}
1200 
1201 	if (slot->buf_block != NULL) {
1202 		os_free_block(slot->buf_block);
1203 		slot->buf_block = NULL;
1204 	}
1205 
1206 	return(err);
1207 }
1208 
1209 /** Do any post processing after a read/write
1210 @return DB_SUCCESS or error code. */
1211 dberr_t
post_io_processing(Slot * slot)1212 AIOHandler::post_io_processing(Slot* slot)
1213 {
1214 	dberr_t	err;
1215 
1216 	ut_ad(slot->is_reserved);
1217 
1218 	/* Total bytes read so far */
1219 	ulint	n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1220 
1221 	/* Compressed writes can be smaller than the original length.
1222 	Therefore they can be processed without further IO. */
1223 	if (n_bytes == slot->original_len
1224 	    || (slot->type.is_write()
1225 		&& slot->type.is_compressed()
1226 		&& slot->len == static_cast<ulint>(slot->n_bytes))) {
1227 
1228 		if (!slot->type.is_log()
1229 		    && (is_compressed_page(slot)
1230 			|| is_encrypted_page(slot))) {
1231 
1232 			ut_a(slot->offset > 0);
1233 
1234 			if (slot->type.is_read()) {
1235 				slot->len = slot->original_len;
1236 			}
1237 
1238 			/* The punch hole has been done on collect() */
1239 
1240 			if (slot->type.is_read()) {
1241 				err = io_complete(slot);
1242 			} else {
1243 				err = DB_SUCCESS;
1244 			}
1245 
1246 			ut_ad(err == DB_SUCCESS
1247 			      || err == DB_UNSUPPORTED
1248 			      || err == DB_CORRUPTION
1249 			      || err == DB_IO_DECOMPRESS_FAIL);
1250 		} else {
1251 
1252 			err = DB_SUCCESS;
1253 		}
1254 
1255 		if (slot->buf_block != NULL) {
1256 			os_free_block(slot->buf_block);
1257 			slot->buf_block = NULL;
1258 		}
1259 
1260 	} else if ((ulint) slot->n_bytes == (ulint) slot->len) {
1261 
1262 		/* It *must* be a partial read. */
1263 		ut_ad(slot->len < slot->original_len);
1264 
1265 		/* Has to be a read request, if it is less than
1266 		the original length. */
1267 		ut_ad(slot->type.is_read());
1268 		err = check_read(slot, n_bytes);
1269 
1270 	} else {
1271 		err = DB_FAIL;
1272 	}
1273 
1274 	return(err);
1275 }
1276 
1277 /** Count the number of free slots
1278 @return number of reserved slots */
1279 ulint
pending_io_count() const1280 AIO::pending_io_count() const
1281 {
1282 	acquire();
1283 
1284 #ifdef UNIV_DEBUG
1285 	ut_a(m_n_segments > 0);
1286 	ut_a(!m_slots.empty());
1287 
1288 	ulint	count = 0;
1289 
1290 	for (ulint i = 0; i < m_slots.size(); ++i) {
1291 
1292 		const Slot&	slot = m_slots[i];
1293 
1294 		if (slot.is_reserved) {
1295 			++count;
1296 			ut_a(slot.len > 0);
1297 		}
1298 	}
1299 
1300 	ut_a(m_n_reserved == count);
1301 #endif /* UNIV_DEBUG */
1302 
1303 	ulint	reserved = m_n_reserved;
1304 
1305 	release();
1306 
1307 	return(reserved);
1308 }
1309 
/** Compress a data page
The page header (first FIL_PAGE_DATA bytes) is copied uncompressed to
dst and then patched with the compression control fields; only the
bytes after the header are compressed. When compression is skipped or
fails, src is returned unchanged and *dst_len is set to src_len.
@param[in]	compression	Compression algorithm to use (m_type)
@param[in]	block_size	File system block size
@param[in]	src		Source contents to compress
@param[in]	src_len		Length in bytes of the source
@param[out]	dst		Compressed page contents
@param[out]	dst_len		Length in bytes of dst contents
@return buffer data, dst_len will have the length of the data */
static
byte*
os_file_compress_page(
	Compression	compression,
	ulint		block_size,
	byte*		src,
	ulint		src_len,
	byte*		dst,
	ulint*		dst_len)
{
	ulint		len = 0;
	ulint		compression_level = page_zip_level;
	ulint		page_type = mach_read_from_2(src + FIL_PAGE_TYPE);

	/* The page size must be a multiple of the OS punch hole size. */
	ut_ad(!(src_len % block_size));

	/* Shouldn't compress an already compressed page. */
	ut_ad(page_type != FIL_PAGE_COMPRESSED);

	/* The page must be at least twice as large as the file system
	block size if we are to save any space. Ignore R-Tree pages for now,
	they repurpose the same 8 bytes in the page header. No point in
	compressing if the file system block size >= our page size. */

	if (page_type == FIL_PAGE_RTREE
	    || block_size == ULINT_UNDEFINED
	    || compression.m_type == Compression::NONE
	    || src_len < block_size * 2) {

		*dst_len = src_len;

		return(src);
	}

	/* Leave the header alone when compressing. */
	ut_ad(block_size >= FIL_PAGE_DATA * 2);

	ut_ad(src_len > FIL_PAGE_DATA + block_size);

	/* Must compress to <= N-1 FS blocks. */
	ulint		out_len = src_len - (FIL_PAGE_DATA + block_size);

	/* This is the original data page size - the page header. */
	ulint		content_len = src_len - FIL_PAGE_DATA;

	ut_ad(out_len >= block_size - FIL_PAGE_DATA);
	ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA));

	/* Only compress the data + trailer, leave the header alone */

	switch (compression.m_type) {
	case Compression::NONE:
		/* Already handled by the early return above. */
		ut_error;

	case Compression::ZLIB: {

		/* In: capacity of the output area; out: bytes produced. */
		uLongf	zlen = static_cast<uLongf>(out_len);

		if (compress2(
			dst + FIL_PAGE_DATA,
			&zlen,
			src + FIL_PAGE_DATA,
			static_cast<uLong>(content_len),
			static_cast<int>(compression_level)) != Z_OK) {

			/* Fall back to writing the page uncompressed. */
			*dst_len = src_len;

			return(src);
		}

		len = static_cast<ulint>(zlen);

		break;
	}

	case Compression::LZ4:

		len = LZ4_compress_default(
			reinterpret_cast<char*>(src) + FIL_PAGE_DATA,
			reinterpret_cast<char*>(dst) + FIL_PAGE_DATA,
			static_cast<int>(content_len),
			static_cast<int>(out_len));

		ut_a(len <= src_len - FIL_PAGE_DATA);

		/* No output, or no space saved: write the page
		uncompressed. */
		if (len == 0  || len >= out_len) {

			*dst_len = src_len;

			return(src);
		}

		break;

	default:
		/* Unknown algorithm: write the page uncompressed. */
		*dst_len = src_len;
		return(src);
	}

	ut_a(len <= out_len);

	/* Debug check: the 4 bytes at FIL_PAGE_LSN + 4 in the header
	must equal the 4 bytes at FIL_PAGE_END_LSN_OLD_CHKSUM - 4 from
	the end of the page. */
	ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
		     src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)
	      == 0);

	/* Copy the header as is. */
	memmove(dst, src, FIL_PAGE_DATA);

	/* Add compression control information. Required for decompressing. */
	mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);

	mach_write_to_1(dst + FIL_PAGE_VERSION, Compression::FIL_PAGE_VERSION_2);

	mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);

	mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);

	mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);

	mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);

	/* Round to the next full block size */

	len += FIL_PAGE_DATA;

	*dst_len = ut_calc_align(len, block_size);

	ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA);

	/* Clear out the unused portion of the page. */
	if (len % block_size) {
		memset(dst + len, 0x0, block_size - (len % block_size));
	}

	return(dst);
}
1454 
1455 #ifdef UNIV_DEBUG
1456 # ifndef UNIV_HOTBACKUP
1457 /** Validates the consistency the aio system some of the time.
1458 @return true if ok or the check was skipped */
1459 bool
os_aio_validate_skip()1460 os_aio_validate_skip()
1461 {
1462 /** Try os_aio_validate() every this many times */
1463 # define OS_AIO_VALIDATE_SKIP	13
1464 
1465 	/** The os_aio_validate() call skip counter.
1466 	Use a signed type because of the race condition below. */
1467 	static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1468 
1469 	/* There is a race condition below, but it does not matter,
1470 	because this call is only for heuristic purposes. We want to
1471 	reduce the call frequency of the costly os_aio_validate()
1472 	check in debug builds. */
1473 	--os_aio_validate_count;
1474 
1475 	if (os_aio_validate_count > 0) {
1476 		return(true);
1477 	}
1478 
1479 	os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1480 	return(os_aio_validate());
1481 }
1482 # endif /* !UNIV_HOTBACKUP */
1483 #endif /* UNIV_DEBUG */
1484 
1485 #undef USE_FILE_LOCK
1486 #define USE_FILE_LOCK
1487 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1488 /* InnoDB Hot Backup does not lock the data files.
1489  * On Windows, mandatory locking is used.
1490  */
1491 # undef USE_FILE_LOCK
1492 #endif
1493 #ifdef USE_FILE_LOCK
1494 /** Obtain an exclusive lock on a file.
1495 @param[in]	fd		file descriptor
1496 @param[in]	name		file name
1497 @return 0 on success */
1498 static
1499 int
os_file_lock(int fd,const char * name)1500 os_file_lock(
1501 	int		fd,
1502 	const char*	name)
1503 {
1504 	struct flock lk;
1505 
1506 	lk.l_type = F_WRLCK;
1507 	lk.l_whence = SEEK_SET;
1508 	lk.l_start = lk.l_len = 0;
1509 
1510 	if (fcntl(fd, F_SETLK, &lk) == -1) {
1511 
1512 		ib::error()
1513 			<< "Unable to lock " << name
1514 			<< " error: " << errno;
1515 
1516 		if (errno == EAGAIN || errno == EACCES) {
1517 
1518 			ib::info()
1519 				<< "Check that you do not already have"
1520 				" another mysqld process using the"
1521 				" same InnoDB data or log files.";
1522 		}
1523 
1524 		return(-1);
1525 	}
1526 
1527 	return(0);
1528 }
1529 #endif /* USE_FILE_LOCK */
1530 
1531 #ifndef UNIV_HOTBACKUP
1532 
1533 /** Calculates local segment number and aio array from global segment number.
1534 @param[out]	array		aio wait array
1535 @param[in]	segment		global segment number
1536 @return local segment number within the aio array */
1537 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1538 AIO::get_array_and_local_segment(
1539 	AIO**		array,
1540 	ulint		segment)
1541 {
1542 	ulint		local_segment;
1543 	ulint		n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1544 
1545 	ut_a(segment < os_aio_n_segments);
1546 
1547 	if (!srv_read_only_mode && segment < n_extra_segs) {
1548 
1549 		/* We don't support ibuf/log IO during read only mode. */
1550 
1551 		if (segment == IO_IBUF_SEGMENT) {
1552 
1553 			*array = s_ibuf;
1554 
1555 		} else if (segment == IO_LOG_SEGMENT) {
1556 
1557 			*array = s_log;
1558 
1559 		} else {
1560 			*array = NULL;
1561 		}
1562 
1563 		local_segment = 0;
1564 
1565 	} else if (segment < s_reads->m_n_segments + n_extra_segs) {
1566 
1567 		*array = s_reads;
1568 		local_segment = segment - n_extra_segs;
1569 
1570 	} else {
1571 		*array = s_writes;
1572 
1573 		local_segment = segment
1574 			      - (s_reads->m_n_segments + n_extra_segs);
1575 	}
1576 
1577 	return(local_segment);
1578 }
1579 
1580 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1581 @param[in,out]	slot		Slot to release */
1582 void
release(Slot * slot)1583 AIO::release(Slot* slot)
1584 {
1585 	ut_ad(is_mutex_owned());
1586 
1587 	ut_ad(slot->is_reserved);
1588 
1589 	slot->is_reserved = false;
1590 
1591 	--m_n_reserved;
1592 
1593 	if (m_n_reserved == m_slots.size() - 1) {
1594 		os_event_set(m_not_full);
1595 	}
1596 
1597 	if (m_n_reserved == 0) {
1598 		os_event_set(m_is_empty);
1599 	}
1600 
1601 #ifdef WIN_ASYNC_IO
1602 
1603 	ResetEvent(slot->handle);
1604 
1605 #elif defined(LINUX_NATIVE_AIO)
1606 
1607 	if (srv_use_native_aio) {
1608 		memset(&slot->control, 0x0, sizeof(slot->control));
1609 		slot->ret = 0;
1610 		slot->n_bytes = 0;
1611 	} else {
1612 		/* These fields should not be used if we are not
1613 		using native AIO. */
1614 		ut_ad(slot->n_bytes == 0);
1615 		ut_ad(slot->ret == 0);
1616 	}
1617 
1618 #endif /* WIN_ASYNC_IO */
1619 }
1620 
1621 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1622 @param[in,out]	slot		Slot to release */
1623 void
release_with_mutex(Slot * slot)1624 AIO::release_with_mutex(Slot* slot)
1625 {
1626 	acquire();
1627 
1628 	release(slot);
1629 
1630 	release();
1631 }
1632 
1633 /** Creates a temporary file.  This function is like tmpfile(3), but
1634 the temporary file is created in the given parameter path. If the path
1635 is NULL then it will create the file in the MySQL server configuration
1636 parameter (--tmpdir).
1637 @param[in]	path	location for creating temporary file
1638 @return temporary file handle, or NULL on error */
1639 FILE*
os_file_create_tmpfile(const char * path)1640 os_file_create_tmpfile(
1641 	const char*	path)
1642 {
1643 	FILE*	file	= NULL;
1644 	int	fd	= innobase_mysql_tmpfile(path);
1645 
1646 	if (fd >= 0) {
1647 		file = fdopen(fd, "w+b");
1648 	}
1649 
1650 	if (file == NULL) {
1651 
1652 		ib::error()
1653 			<< "Unable to create temporary file; errno: "
1654 			<< errno;
1655 
1656 		if (fd >= 0) {
1657 			close(fd);
1658 		}
1659 	}
1660 
1661 	return(file);
1662 }
1663 
1664 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1665 NUL-terminate str. All errors are silently ignored. This function is
1666 mostly meant to be used with temporary files.
1667 @param[in,out]	file		File to read from
1668 @param[in,out]	str		Buffer where to read
1669 @param[in]	size		Size of buffer */
1670 void
os_file_read_string(FILE * file,char * str,ulint size)1671 os_file_read_string(
1672 	FILE*		file,
1673 	char*		str,
1674 	ulint		size)
1675 {
1676 	if (size != 0) {
1677 		rewind(file);
1678 
1679 		size_t	flen = fread(str, 1, size - 1, file);
1680 
1681 		str[flen] = '\0';
1682 	}
1683 }
1684 
1685 /** Decompress after a read and punch a hole in the file if it was a write
1686 @param[in]	type		IO context
1687 @param[in]	fh		Open file handle
1688 @param[in,out]	buf		Buffer to transform
1689 @param[in,out]	scratch		Scratch area for read decompression
1690 @param[in]	src_len		Length of the buffer before compression
1691 @param[in]	len		Used buffer length for write and output
1692 				buf len for read
1693 @return DB_SUCCESS or error code */
1694 static
1695 dberr_t
os_file_io_complete(const IORequest & type,os_file_t fh,byte * buf,byte * scratch,ulint src_len,os_offset_t offset,ulint len)1696 os_file_io_complete(
1697 	const IORequest&type,
1698 	os_file_t	fh,
1699 	byte*		buf,
1700 	byte*		scratch,
1701 	ulint		src_len,
1702 	os_offset_t	offset,
1703 	ulint		len)
1704 {
1705 	/* We never compress/decompress the first page */
1706 	ut_a(offset > 0);
1707 	ut_ad(type.validate());
1708 
1709 	if (!type.is_compression_enabled()) {
1710 
1711 		return(DB_SUCCESS);
1712 
1713 	} else if (type.is_read()) {
1714 		dberr_t		ret;
1715 		Encryption	encryption(type.encryption_algorithm());
1716 
1717 		ut_ad(!type.is_log());
1718 		ut_ad(!type.is_row_log());
1719 
1720 		ret = encryption.decrypt(type, buf, src_len, scratch, len);
1721 		if (ret == DB_SUCCESS) {
1722 			return(os_file_decompress_page(
1723 					type.is_dblwr_recover(),
1724 					buf, scratch, len));
1725 		} else {
1726 			return(ret);
1727 		}
1728 
1729 	} else if (type.punch_hole()) {
1730 
1731 		ut_ad(len <= src_len);
1732 		ut_ad(!type.is_log());
1733 		ut_ad(type.is_write());
1734 		ut_ad(type.is_compressed());
1735 
1736 		/* Nothing to do. */
1737 		if (len == src_len) {
1738 			return(DB_SUCCESS);
1739 		}
1740 
1741 #ifdef UNIV_DEBUG
1742 		const ulint	block_size = type.block_size();
1743 #endif /* UNIV_DEBUG */
1744 
1745 		/* We don't support multiple page sizes in the server
1746 		at the moment. */
1747 		ut_ad(src_len == srv_page_size);
1748 
1749 		/* Must be a multiple of the compression unit size. */
1750 		ut_ad((len % block_size) == 0);
1751 		ut_ad((offset % block_size) == 0);
1752 
1753 		ut_ad(len + block_size <= src_len);
1754 
1755 		offset += len;
1756 
1757 		return(os_file_punch_hole(fh, offset, src_len - len));
1758 	}
1759 
1760 	ut_ad(!type.is_log());
1761 
1762 	return(DB_SUCCESS);
1763 }
1764 
1765 #endif /* !UNIV_HOTBACKUP */
1766 
1767 /** This function returns a new path name after replacing the basename
1768 in an old path with a new basename.  The old_path is a full path
1769 name including the extension.  The tablename is in the normal
1770 form "databasename/tablename".  The new base name is found after
1771 the forward slash.  Both input strings are null terminated.
1772 
1773 This function allocates memory to be returned.  It is the callers
1774 responsibility to free the return value after it is no longer needed.
1775 
1776 @param[in]	old_path		Pathname
1777 @param[in]	tablename		Contains new base name
1778 @return own: new full pathname */
1779 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1780 os_file_make_new_pathname(
1781 	const char*	old_path,
1782 	const char*	tablename)
1783 {
1784 	ulint		dir_len;
1785 	char*		last_slash;
1786 	char*		base_name;
1787 	char*		new_path;
1788 	ulint		new_path_len;
1789 
1790 	/* Split the tablename into its database and table name components.
1791 	They are separated by a '/'. */
1792 	last_slash = strrchr((char*) tablename, '/');
1793 	base_name = last_slash ? last_slash + 1 : (char*) tablename;
1794 
1795 	/* Find the offset of the last slash. We will strip off the
1796 	old basename.ibd which starts after that slash. */
1797 	last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1798 	dir_len = last_slash ? last_slash - old_path : strlen(old_path);
1799 
1800 	/* allocate a new path and move the old directory path to it. */
1801 	new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1802 	new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1803 	memcpy(new_path, old_path, dir_len);
1804 
1805 	ut_snprintf(new_path + dir_len,
1806 		    new_path_len - dir_len,
1807 		    "%c%s.ibd",
1808 		    OS_PATH_SEPARATOR,
1809 		    base_name);
1810 
1811 	return(new_path);
1812 }
1813 
1814 /** This function reduces a null-terminated full remote path name into
1815 the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
1816 the 'databasename/tablename.ibd' found at the end of the path with just
1817 'tablename'.
1818 
1819 Since the result is always smaller than the path sent in, no new memory
1820 is allocated. The caller should allocate memory for the path sent in.
1821 This function manipulates that path in place.
1822 
1823 If the path format is not as expected, just return.  The result is used
1824 to inform a SHOW CREATE TABLE command.
1825 @param[in,out]	data_dir_path		Full path/data_dir_path */
1826 void
os_file_make_data_dir_path(char * data_dir_path)1827 os_file_make_data_dir_path(
1828 	char*	data_dir_path)
1829 {
1830 	/* Replace the period before the extension with a null byte. */
1831 	char*	ptr = strrchr((char*) data_dir_path, '.');
1832 
1833 	if (ptr == NULL) {
1834 		return;
1835 	}
1836 
1837 	ptr[0] = '\0';
1838 
1839 	/* The tablename starts after the last slash. */
1840 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1841 
1842 	if (ptr == NULL) {
1843 		return;
1844 	}
1845 
1846 	ptr[0] = '\0';
1847 
1848 	char*	tablename = ptr + 1;
1849 
1850 	/* The databasename starts after the next to last slash. */
1851 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1852 
1853 	if (ptr == NULL) {
1854 		return;
1855 	}
1856 
1857 	ulint	tablename_len = ut_strlen(tablename);
1858 
1859 	ut_memmove(++ptr, tablename, tablename_len);
1860 
1861 	ptr[tablename_len] = '\0';
1862 }
1863 
1864 /** Check if the path refers to the root of a drive using a pointer
1865 to the last directory separator that the caller has fixed.
1866 @param[in]	path	path name
1867 @param[in]	path	last directory separator in the path
1868 @return true if this path is a drive root, false if not */
1869 UNIV_INLINE
1870 bool
os_file_is_root(const char * path,const char * last_slash)1871 os_file_is_root(
1872 	const char*	path,
1873 	const char*	last_slash)
1874 {
1875 	return(
1876 #ifdef _WIN32
1877 	       (last_slash == path + 2 && path[1] == ':') ||
1878 #endif /* _WIN32 */
1879 	       last_slash == path);
1880 }
1881 
/** Return the parent directory component of a null-terminated path.
Return a new buffer containing the string up to, but not including,
the final component of the path.
The path returned will not contain a trailing separator.
Do not return a root path, return NULL instead.
The final component trimmed off may be a filename or a directory name.
If the final component is the only component of the path, return NULL.
It is the caller's responsibility to free the returned string after it
is no longer needed.
@param[in]	path		Path name
@return own: parent directory of the path, or NULL */
static
char*
os_file_get_parent_dir(
	const char*	path)
{
	bool	has_trailing_slash = false;

	/* Find the offset of the last slash */
	const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);

	if (!last_slash) {
		/* No slash in the path, return NULL */
		return(NULL);
	}

	/* Ok, there is a slash. Is there anything after it? */
	if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
		has_trailing_slash = true;
	}

	/* Reduce repetitive slashes: move to the first separator of a
	run of consecutive separators. */
	while (last_slash > path
		&& last_slash[-1] == OS_PATH_SEPARATOR) {
		last_slash--;
	}

	/* Check for the root of a drive. */
	if (os_file_is_root(path, last_slash)) {
		return(NULL);
	}

	/* If a trailing slash prevented the first strrchr() from trimming
	the last component of the path, trim that component now. */
	if (has_trailing_slash) {
		/* Back up to the previous slash. */
		last_slash--;
		while (last_slash > path
		       && last_slash[0] != OS_PATH_SEPARATOR) {
			last_slash--;
		}

		/* Reduce repetitive slashes. */
		while (last_slash > path
			&& last_slash[-1] == OS_PATH_SEPARATOR) {
			last_slash--;
		}
	}

	/* Check for the root of a drive. */
	if (os_file_is_root(path, last_slash)) {
		return(NULL);
	}

	if (last_slash - path < 0) {
		/* Sanity check, it prevents gcc from trying to handle this case which
		 * results in warnings for some optimized builds */
		return (NULL);
	}

	/* Non-trivial directory component: copy everything in front of
	last_slash. */

	return(mem_strdupl(path, last_slash - path));
}
1956 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1957 
1958 /* Test the function os_file_get_parent_dir. */
1959 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1960 test_os_file_get_parent_dir(
1961 	const char*	child_dir,
1962 	const char*	expected_dir)
1963 {
1964 	char* child = mem_strdup(child_dir);
1965 	char* expected = expected_dir == NULL ? NULL
1966 			 : mem_strdup(expected_dir);
1967 
1968 	/* os_file_get_parent_dir() assumes that separators are
1969 	converted to OS_PATH_SEPARATOR. */
1970 	os_normalize_path(child);
1971 	os_normalize_path(expected);
1972 
1973 	char* parent = os_file_get_parent_dir(child);
1974 
1975 	bool unexpected = (expected == NULL
1976 			  ? (parent != NULL)
1977 			  : (0 != strcmp(parent, expected)));
1978 	if (unexpected) {
1979 		ib::fatal() << "os_file_get_parent_dir('" << child
1980 			<< "') returned '" << parent
1981 			<< "', instead of '" << expected << "'.";
1982 	}
1983 	ut_free(parent);
1984 	ut_free(child);
1985 	ut_free(expected);
1986 }
1987 
1988 /* Test the function os_file_get_parent_dir. */
1989 void
unit_test_os_file_get_parent_dir()1990 unit_test_os_file_get_parent_dir()
1991 {
1992 	test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1993 	test_os_file_get_parent_dir("/usr/", NULL);
1994 	test_os_file_get_parent_dir("//usr//", NULL);
1995 	test_os_file_get_parent_dir("usr", NULL);
1996 	test_os_file_get_parent_dir("usr//", NULL);
1997 	test_os_file_get_parent_dir("/", NULL);
1998 	test_os_file_get_parent_dir("//", NULL);
1999 	test_os_file_get_parent_dir(".", NULL);
2000 	test_os_file_get_parent_dir("..", NULL);
2001 # ifdef _WIN32
2002 	test_os_file_get_parent_dir("D:", NULL);
2003 	test_os_file_get_parent_dir("D:/", NULL);
2004 	test_os_file_get_parent_dir("D:\\", NULL);
2005 	test_os_file_get_parent_dir("D:/data", NULL);
2006 	test_os_file_get_parent_dir("D:/data/", NULL);
2007 	test_os_file_get_parent_dir("D:\\data\\", NULL);
2008 	test_os_file_get_parent_dir("D:///data/////", NULL);
2009 	test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
2010 	test_os_file_get_parent_dir("D:/data//a", "D:/data");
2011 	test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
2012 	test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
2013 	test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
2014 #endif  /* _WIN32 */
2015 }
2016 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
2017 
2018 
2019 /** Creates all missing subdirectories along the given path.
2020 @param[in]	path		Path name
2021 @return DB_SUCCESS if OK, otherwise error code. */
2022 dberr_t
os_file_create_subdirs_if_needed(const char * path)2023 os_file_create_subdirs_if_needed(
2024 	const char*	path)
2025 {
2026 	if (srv_read_only_mode) {
2027 
2028 		ib::error()
2029 			<< "read only mode set. Can't create "
2030 			<< "subdirectories '" << path << "'";
2031 
2032 		return(DB_READ_ONLY);
2033 
2034 	}
2035 
2036 	char*	subdir = os_file_get_parent_dir(path);
2037 
2038 	if (subdir == NULL) {
2039 		/* subdir is root or cwd, nothing to do */
2040 		return(DB_SUCCESS);
2041 	}
2042 
2043 	/* Test if subdir exists */
2044 	os_file_type_t	type;
2045 	bool	subdir_exists;
2046 	bool	success = os_file_status(subdir, &subdir_exists, &type);
2047 
2048 	if (success && !subdir_exists) {
2049 
2050 		/* Subdir does not exist, create it */
2051 		dberr_t	err = os_file_create_subdirs_if_needed(subdir);
2052 
2053 		if (err != DB_SUCCESS) {
2054 
2055 			ut_free(subdir);
2056 
2057 			return(err);
2058 		}
2059 
2060 		success = os_file_create_directory(subdir, false);
2061 	}
2062 
2063 	ut_free(subdir);
2064 
2065 	return(success ? DB_SUCCESS : DB_ERROR);
2066 }
2067 
2068 /** Allocate the buffer for IO on a transparently compressed table.
2069 @param[in]	type		IO flags
2070 @param[out]	buf		buffer to read or write
2071 @param[in,out]	n		number of bytes to read/write, starting from
2072 				offset
2073 @return pointer to allocated page, compressed data is written to the offset
2074 	that is aligned on the disk sector size */
2075 static
2076 Block*
os_file_compress_page(IORequest & type,void * & buf,ulint * n)2077 os_file_compress_page(
2078 	IORequest&	type,
2079 	void*&		buf,
2080 	ulint*		n)
2081 {
2082 	ut_ad(!type.is_log());
2083 	ut_ad(type.is_write());
2084 	ut_ad(type.is_compressed());
2085 
2086 	ulint	n_alloc = *n * 2;
2087 
2088 	ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
2089 	ut_a(type.compression_algorithm().m_type != Compression::LZ4
2090 	     || static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
2091 
2092 	Block*  block = os_alloc_block();
2093 
2094 	ulint	old_compressed_len;
2095 	ulint	compressed_len = *n;
2096 
2097 	old_compressed_len = mach_read_from_2(
2098 		reinterpret_cast<byte*>(buf)
2099 		+ FIL_PAGE_COMPRESS_SIZE_V1);
2100 
2101 	if (old_compressed_len > 0) {
2102 		old_compressed_len = ut_calc_align(
2103 			old_compressed_len + FIL_PAGE_DATA,
2104 			type.block_size());
2105 	} else {
2106 		old_compressed_len = *n;
2107 	}
2108 
2109 	byte*	compressed_page;
2110 
2111 	compressed_page = static_cast<byte*>(
2112 		ut_align(block->m_ptr, os_io_ptr_align));
2113 
2114 	byte*	buf_ptr;
2115 
2116 	buf_ptr = os_file_compress_page(
2117 		type.compression_algorithm(),
2118 		type.block_size(),
2119 		reinterpret_cast<byte*>(buf),
2120 		*n,
2121 		compressed_page,
2122 		&compressed_len);
2123 
2124 	if (buf_ptr != buf) {
2125 		/* Set new compressed size to uncompressed page. */
2126 		memcpy(reinterpret_cast<byte*>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
2127 		       buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
2128 
2129 		buf = buf_ptr;
2130 		*n = compressed_len;
2131 
2132 		if (compressed_len >= old_compressed_len) {
2133 
2134 			ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
2135 
2136 			type.clear_punch_hole();
2137 		}
2138 	}
2139 
2140 	return(block);
2141 }
2142 
2143 /** Encrypt a page content when write it to disk.
2144 @param[in]	type		IO flags
2145 @param[out]	buf		buffer to read or write
2146 @param[in,out]	n		number of bytes to read/write, starting from
2147 				offset
2148 @return pointer to the encrypted page */
2149 static
2150 Block*
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)2151 os_file_encrypt_page(
2152 	const IORequest&	type,
2153 	void*&			buf,
2154 	ulint*			n)
2155 {
2156 
2157 	byte*		encrypted_page;
2158 	ulint		encrypted_len = *n;
2159 	byte*		buf_ptr;
2160 	Encryption	encryption(type.encryption_algorithm());
2161 
2162 	ut_ad(!type.is_log());
2163 	ut_ad(type.is_write());
2164 	ut_ad(type.is_encrypted());
2165 
2166 	Block*  block = os_alloc_block();
2167 
2168 	encrypted_page = static_cast<byte*>(
2169 		ut_align(block->m_ptr, os_io_ptr_align));
2170 
2171 	buf_ptr = encryption.encrypt(type,
2172 				     reinterpret_cast<byte*>(buf), *n,
2173 				     encrypted_page, &encrypted_len);
2174 
2175 	bool	encrypted = buf_ptr != buf;
2176 
2177 	if (encrypted) {
2178 
2179 		buf = buf_ptr;
2180 		*n = encrypted_len;
2181 	}
2182 
2183 	return(block);
2184 }
2185 
2186 #ifndef _WIN32
2187 
2188 /** Do the read/write
2189 @param[in]	request	The IO context and type
2190 @return the number of bytes read/written or negative value on error */
2191 ssize_t
execute(const IORequest & request)2192 SyncFileIO::execute(const IORequest& request)
2193 {
2194 	ssize_t	n_bytes;
2195 
2196 	if (request.is_read()) {
2197 		n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2198 	} else {
2199 		ut_ad(request.is_write());
2200 		n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2201 	}
2202 
2203 	return(n_bytes);
2204 }
2205 
2206 /** Free storage space associated with a section of the file.
2207 @param[in]	fh		Open file handle
2208 @param[in]	off		Starting offset (SEEK_SET)
2209 @param[in]	len		Size of the hole
2210 @return DB_SUCCESS or error code */
2211 static
2212 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2213 os_file_punch_hole_posix(
2214 	os_file_t	fh,
2215 	os_offset_t	off,
2216 	os_offset_t	len)
2217 {
2218 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2219 	const int	mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2220 
2221 	int             ret = fallocate(fh, mode, off, len);
2222 
2223 	if (ret == 0) {
2224 		return(DB_SUCCESS);
2225 	}
2226 
2227 	ut_a(ret == -1);
2228 
2229 	if (errno == ENOTSUP) {
2230 		return(DB_IO_NO_PUNCH_HOLE);
2231 	}
2232 
2233 	ib::warn()
2234 		<< "fallocate(" << fh
2235 		<<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2236 		<< off << ", " << len << ") returned errno: "
2237 		<<  errno;
2238 
2239 	return(DB_IO_ERROR);
2240 
2241 #elif defined(UNIV_SOLARIS)
2242 
2243 	// Use F_FREESP
2244 
2245 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2246 
2247 	return(DB_IO_NO_PUNCH_HOLE);
2248 }
2249 
2250 #if defined(LINUX_NATIVE_AIO)
2251 
2252 /** Linux native AIO handler */
2253 class LinuxAIOHandler {
2254 public:
2255 	/**
2256 	@param[in] global_segment	The global segment*/
LinuxAIOHandler(ulint global_segment)2257 	LinuxAIOHandler(ulint global_segment)
2258 		:
2259 		m_global_segment(global_segment)
2260 	{
2261 		/* Should never be doing Sync IO here. */
2262 		ut_a(m_global_segment != ULINT_UNDEFINED);
2263 
2264 		/* Find the array and the local segment. */
2265 
2266 		m_segment = AIO::get_array_and_local_segment(
2267 			&m_array, m_global_segment);
2268 
2269 		m_n_slots = m_array->slots_per_segment();
2270 	}
2271 
2272 	/** Destructor */
~LinuxAIOHandler()2273 	~LinuxAIOHandler()
2274 	{
2275 		// No op
2276 	}
2277 
2278 	/**
2279 	Process a Linux AIO request
2280 	@param[out]	m1		the messages passed with the
2281 	@param[out]	m2		AIO request; note that in case the
2282 					AIO operation failed, these output
2283 					parameters are valid and can be used to
2284 					restart the operation.
2285 	@param[out]	request		IO context
2286 	@return DB_SUCCESS or error code */
2287 	dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
2288 
2289 private:
2290 	/** Resubmit an IO request that was only partially successful
2291 	@param[in,out]	slot		Request to resubmit
2292 	@return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2293 	dberr_t	resubmit(Slot* slot);
2294 
2295 	/** Check if the AIO succeeded
2296 	@param[in,out]	slot		The slot to check
2297 	@return DB_SUCCESS, DB_FAIL if the operation should be retried or
2298 		DB_IO_ERROR on all other errors */
2299 	dberr_t	check_state(Slot* slot);
2300 
2301 	/** @return true if a shutdown was detected */
is_shutdown() const2302 	bool is_shutdown() const
2303 	{
2304 		return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2305 		       && !buf_page_cleaner_is_active);
2306 	}
2307 
2308 	/** If no slot was found then the m_array->m_mutex will be released.
2309 	@param[out]	n_pending	The number of pending IOs
2310 	@return NULL or a slot that has completed IO */
2311 	Slot* find_completed_slot(ulint* n_pending);
2312 
2313 	/** This is called from within the IO-thread. If there are no completed
2314 	IO requests in the slot array, the thread calls this function to
2315 	collect more requests from the Linux kernel.
2316 	The IO-thread waits on io_getevents(), which is a blocking call, with
2317 	a timeout value. Unless the system is very heavy loaded, keeping the
2318 	IO-thread very busy, the io-thread will spend most of its time waiting
2319 	in this function.
2320 	The IO-thread also exits in this function. It checks server status at
2321 	each wakeup and that is why we use timed wait in io_getevents(). */
2322 	void collect();
2323 
2324 private:
2325 	/** Slot array */
2326 	AIO*			m_array;
2327 
2328 	/** Number of slots inthe local segment */
2329 	ulint			m_n_slots;
2330 
2331 	/** The local segment to check */
2332 	ulint			m_segment;
2333 
2334 	/** The global segment */
2335 	ulint			m_global_segment;
2336 };
2337 
2338 /** Resubmit an IO request that was only partially successful
2339 @param[in,out]	slot		Request to resubmit
2340 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2341 dberr_t
resubmit(Slot * slot)2342 LinuxAIOHandler::resubmit(Slot* slot)
2343 {
2344 #ifdef UNIV_DEBUG
2345 	/* Bytes already read/written out */
2346 	ulint	n_bytes = slot->ptr - slot->buf;
2347 
2348 	ut_ad(m_array->is_mutex_owned());
2349 
2350 	ut_ad(n_bytes < slot->original_len);
2351 	ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2352 	/* Partial read or write scenario */
2353 	ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2354 #endif /* UNIV_DEBUG */
2355 
2356 	slot->len -= slot->n_bytes;
2357 	slot->ptr += slot->n_bytes;
2358 	slot->offset += slot->n_bytes;
2359 
2360 	/* Resetting the bytes read/written */
2361 	slot->n_bytes = 0;
2362 	slot->io_already_done = false;
2363 
2364 	/* make sure that slot->offset fits in off_t */
2365 	ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2366 
2367 	struct iocb*	iocb = &slot->control;
2368 	if (slot->type.is_read()) {
2369 		io_prep_pread(
2370 			iocb,
2371 			slot->file.m_file,
2372 			slot->ptr,
2373 			slot->len,
2374 			slot->offset);
2375 
2376 	} else {
2377 
2378 		ut_a(slot->type.is_write());
2379 
2380 		io_prep_pwrite(
2381 			iocb,
2382 			slot->file.m_file,
2383 			slot->ptr,
2384 			slot->len,
2385 			slot->offset);
2386 	}
2387 
2388 	iocb->data = slot;
2389 
2390 	/* Resubmit an I/O request */
2391 	int	ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2392 
2393 	if (ret < -1)  {
2394 		errno = -ret;
2395 	}
2396 
2397 	return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2398 }
2399 
2400 /** Check if the AIO succeeded
2401 @param[in,out]	slot		The slot to check
2402 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2403 	DB_IO_ERROR on all other errors */
2404 dberr_t
check_state(Slot * slot)2405 LinuxAIOHandler::check_state(Slot* slot)
2406 {
2407 	ut_ad(m_array->is_mutex_owned());
2408 
2409 	/* Note that it may be that there is more then one completed
2410 	IO requests. We process them one at a time. We may have a case
2411 	here to improve the performance slightly by dealing with all
2412 	requests in one sweep. */
2413 
2414 	srv_set_io_thread_op_info(
2415 		m_global_segment, "processing completed aio requests");
2416 
2417 	ut_ad(slot->io_already_done);
2418 
2419 	dberr_t	err;
2420 
2421 	if (slot->ret == 0) {
2422 
2423 		err = AIOHandler::post_io_processing(slot);
2424 
2425 	} else {
2426 		errno = -slot->ret;
2427 
2428 		/* os_file_handle_error does tell us if we should retry
2429 		this IO. As it stands now, we don't do this retry when
2430 		reaping requests from a different context than
2431 		the dispatcher. This non-retry logic is the same for
2432 		Windows and Linux native AIO.
2433 		We should probably look into this to transparently
2434 		re-submit the IO. */
2435 		os_file_handle_error(slot->name, "Linux aio");
2436 
2437 		err = DB_IO_ERROR;
2438 	}
2439 
2440 	return(err);
2441 }
2442 
2443 /** If no slot was found then the m_array->m_mutex will be released.
2444 @param[out]	n_pending		The number of pending IOs
2445 @return NULL or a slot that has completed IO */
2446 Slot*
find_completed_slot(ulint * n_pending)2447 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
2448 {
2449 	ulint	offset = m_n_slots * m_segment;
2450 
2451 	*n_pending = 0;
2452 
2453 	m_array->acquire();
2454 
2455 	Slot*	slot = m_array->at(offset);
2456 
2457 	for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2458 
2459 		if (slot->is_reserved) {
2460 
2461 			++*n_pending;
2462 
2463 			if (slot->io_already_done) {
2464 
2465 				/* Something for us to work on.
2466 				Note: We don't release the mutex. */
2467 				return(slot);
2468 			}
2469 		}
2470 	}
2471 
2472 	m_array->release();
2473 
2474 	return(NULL);
2475 }
2476 
2477 /** This function is only used in Linux native asynchronous i/o. This is
2478 called from within the io-thread. If there are no completed IO requests
2479 in the slot array, the thread calls this function to collect more
2480 requests from the kernel.
2481 The io-thread waits on io_getevents(), which is a blocking call, with
2482 a timeout value. Unless the system is very heavy loaded, keeping the
2483 io-thread very busy, the io-thread will spend most of its time waiting
2484 in this function.
2485 The io-thread also exits in this function. It checks server status at
2486 each wakeup and that is why we use timed wait in io_getevents(). */
2487 void
collect()2488 LinuxAIOHandler::collect()
2489 {
2490 	ut_ad(m_n_slots > 0);
2491 	ut_ad(m_array != NULL);
2492 	ut_ad(m_segment < m_array->get_n_segments());
2493 
2494 	/* Which io_context we are going to use. */
2495 	io_context*	io_ctx = m_array->io_ctx(m_segment);
2496 
2497 	/* Starting point of the m_segment we will be working on. */
2498 	ulint	start_pos = m_segment * m_n_slots;
2499 
2500 	/* End point. */
2501 	ulint	end_pos = start_pos + m_n_slots;
2502 
2503 	for (;;) {
2504 		struct io_event*	events;
2505 
2506 		/* Which part of event array we are going to work on. */
2507 		events = m_array->io_events(m_segment * m_n_slots);
2508 
2509 		/* Initialize the events. */
2510 		memset(events, 0, sizeof(*events) * m_n_slots);
2511 
2512 		/* The timeout value is arbitrary. We probably need
2513 		to experiment with it a little. */
2514 		struct timespec		timeout;
2515 
2516 		timeout.tv_sec = 0;
2517 		timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
2518 
2519 		int	ret;
2520 
2521 		ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
2522 
2523 		for (int i = 0; i < ret; ++i) {
2524 
2525 			struct iocb*	iocb;
2526 
2527 			iocb = reinterpret_cast<struct iocb*>(events[i].obj);
2528 			ut_a(iocb != NULL);
2529 
2530 			Slot*	slot = reinterpret_cast<Slot*>(iocb->data);
2531 
2532 			/* Some sanity checks. */
2533 			ut_a(slot != NULL);
2534 			ut_a(slot->is_reserved);
2535 
2536 			/* We are not scribbling previous segment. */
2537 			ut_a(slot->pos >= start_pos);
2538 
2539 			/* We have not overstepped to next segment. */
2540 			ut_a(slot->pos < end_pos);
2541 
2542 			/* We never compress/decompress the first page */
2543 
2544 			if (slot->offset > 0
2545 			    && !slot->skip_punch_hole
2546 			    && slot->type.is_compression_enabled()
2547 			    && !slot->type.is_log()
2548 			    && slot->type.is_write()
2549 			    && slot->type.is_compressed()
2550 			    && slot->type.punch_hole()) {
2551 
2552 				slot->err = AIOHandler::io_complete(slot);
2553 			} else {
2554 				slot->err = DB_SUCCESS;
2555 			}
2556 
2557 			/* Mark this request as completed. The error handling
2558 			will be done in the calling function. */
2559 			m_array->acquire();
2560 
2561 			/* events[i].res2 should always be ZERO */
2562 			ut_ad(events[i].res2 == 0);
2563 			slot->io_already_done = true;
2564 
2565 			/*Even though events[i].res is an unsigned number
2566 			in libaio, it is used to return a negative value
2567 			(negated errno value) to indicate error and a positive
2568 			value to indicate number of bytes read or written. */
2569 
2570 			if (events[i].res > slot->len) {
2571 				/* failure */
2572 				slot->n_bytes = 0;
2573 				slot->ret = events[i].res;
2574 			} else {
2575 				/* success */
2576 				slot->n_bytes = events[i].res;
2577 				slot->ret = 0;
2578 			}
2579 			m_array->release();
2580 		}
2581 
2582 		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2583 		    || !buf_page_cleaner_is_active
2584 		    || ret > 0) {
2585 
2586 			break;
2587 		}
2588 
2589 		/* This error handling is for any error in collecting the
2590 		IO requests. The errors, if any, for any particular IO
2591 		request are simply passed on to the calling routine. */
2592 
2593 		switch (ret) {
2594 		case -EAGAIN:
2595 			/* Not enough resources! Try again. */
2596 
2597 		case -EINTR:
2598 			/* Interrupted! The behaviour in case of an interrupt.
2599 			If we have some completed IOs available then the
2600 			return code will be the number of IOs. We get EINTR
2601 			only if there are no completed IOs and we have been
2602 			interrupted. */
2603 
2604 		case 0:
2605 			/* No pending request! Go back and check again. */
2606 
2607 			continue;
2608 		}
2609 
2610 		/* All other errors should cause a trap for now. */
2611 		ib::fatal()
2612 			<< "Unexpected ret_code[" << ret
2613 			<< "] from io_getevents()!";
2614 
2615 		break;
2616 	}
2617 }
2618 
2619 /** Process a Linux AIO request
2620 @param[out]	m1		the messages passed with the
2621 @param[out]	m2		AIO request; note that in case the
2622 				AIO operation failed, these output
2623 				parameters are valid and can be used to
2624 				restart the operation.
2625 @param[out]	request		IO context
2626 @return DB_SUCCESS or error code */
2627 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)2628 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
2629 {
2630 	dberr_t		err;
2631 	Slot*		slot;
2632 
2633 	/* Loop until we have found a completed request. */
2634 	for (;;) {
2635 
2636 		ulint	n_pending;
2637 
2638 		slot = find_completed_slot(&n_pending);
2639 
2640 		if (slot != NULL) {
2641 
2642 			ut_ad(m_array->is_mutex_owned());
2643 
2644 			err = check_state(slot);
2645 
2646 			/* DB_FAIL is not a hard error, we should retry */
2647 			if (err != DB_FAIL) {
2648 				break;
2649 			}
2650 
2651 			/* Partial IO, resubmit request for
2652 			remaining bytes to read/write */
2653 			err = resubmit(slot);
2654 
2655 			if (err != DB_SUCCESS) {
2656 				break;
2657 			}
2658 
2659 			m_array->release();
2660 
2661 		} else if (is_shutdown() && n_pending == 0) {
2662 
2663 			/* There is no completed request. If there is
2664 			no pending request at all, and the system is
2665 			being shut down, exit. */
2666 
2667 			*m1 = NULL;
2668 			*m2 = NULL;
2669 
2670 			return(DB_SUCCESS);
2671 
2672 		} else {
2673 
2674 			/* Wait for some request. Note that we return
2675 			from wait if we have found a request. */
2676 
2677 			srv_set_io_thread_op_info(
2678 				m_global_segment,
2679 				"waiting for completed aio requests");
2680 
2681 			collect();
2682 		}
2683 	}
2684 
2685 	if (err == DB_IO_PARTIAL_FAILED) {
2686 		/* Aborting in case of submit failure */
2687 		ib::fatal()
2688 			<< "Native Linux AIO interface. "
2689 			"io_submit() call failed when "
2690 			"resubmitting a partial I/O "
2691 			"request on the file " << slot->name
2692 			<< ".";
2693 	}
2694 
2695 	*m1 = slot->m1;
2696 	*m2 = slot->m2;
2697 
2698 	*request = slot->type;
2699 
2700 	m_array->release(slot);
2701 
2702 	m_array->release();
2703 
2704 	return(err);
2705 }
2706 
2707 /** This function is only used in Linux native asynchronous i/o.
2708 Waits for an aio operation to complete. This function is used to wait for
2709 the completed requests. The aio array of pending requests is divided
2710 into segments. The thread specifies which segment or slot it wants to wait
2711 for. NOTE: this function will also take care of freeing the aio slot,
2712 therefore no other thread is allowed to do the freeing!
2713 
2714 @param[in]	global_seg	segment number in the aio array
2715 				to wait for; segment 0 is the ibuf
2716 				i/o thread, segment 1 is log i/o thread,
2717 				then follow the non-ibuf read threads,
2718 				and the last are the non-ibuf write
2719 				threads.
2720 @param[out]	m1		the messages passed with the
2721 @param[out]	m2			AIO request; note that in case the
2722 				AIO operation failed, these output
2723 				parameters are valid and can be used to
2724 				restart the operation.
2725 @param[out]xi	 request	IO context
2726 @return DB_SUCCESS if the IO was successful */
2727 static
2728 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2729 os_aio_linux_handler(
2730 	ulint		global_segment,
2731 	fil_node_t**	m1,
2732 	void**		m2,
2733 	IORequest*	request)
2734 {
2735 	LinuxAIOHandler	handler(global_segment);
2736 
2737 	dberr_t	err = handler.poll(m1, m2, request);
2738 
2739 	if (err == DB_IO_NO_PUNCH_HOLE) {
2740 		fil_no_punch_hole(*m1);
2741 		err = DB_SUCCESS;
2742 	}
2743 
2744 	return(err);
2745 }
2746 
2747 /** Dispatch an AIO request to the kernel.
2748 @param[in,out]	slot		an already reserved slot
2749 @return true on success. */
2750 bool
linux_dispatch(Slot * slot)2751 AIO::linux_dispatch(Slot* slot)
2752 {
2753 	ut_a(slot->is_reserved);
2754 	ut_ad(slot->type.validate());
2755 
2756 	/* Find out what we are going to work with.
2757 	The iocb struct is directly in the slot.
2758 	The io_context is one per segment. */
2759 
2760 	ulint		io_ctx_index;
2761 	struct iocb*	iocb = &slot->control;
2762 
2763 	io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2764 
2765 	int	ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
2766 
2767 	/* io_submit() returns number of successfully queued requests
2768 	or -errno. */
2769 
2770 	if (ret != 1) {
2771 		errno = -ret;
2772 	}
2773 
2774 	return(ret == 1);
2775 }
2776 
2777 /** Creates an io_context for native linux AIO.
2778 @param[in]	max_events	number of events
2779 @param[out]	io_ctx		io_ctx to initialize.
2780 @return true on success. */
2781 bool
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)2782 AIO::linux_create_io_ctx(
2783 	ulint		max_events,
2784 	io_context_t*	io_ctx)
2785 {
2786 	ssize_t		n_retries = 0;
2787 
2788 	for (;;) {
2789 
2790 		memset(io_ctx, 0x0, sizeof(*io_ctx));
2791 
2792 		/* Initialize the io_ctx. Tell it how many pending
2793 		IO requests this context will handle. */
2794 
2795 		int	ret = io_setup(max_events, io_ctx);
2796 
2797 		if (ret == 0) {
2798 			/* Success. Return now. */
2799 			return(true);
2800 		}
2801 
2802 		/* If we hit EAGAIN we'll make a few attempts before failing. */
2803 
2804 		switch (ret) {
2805 		case -EAGAIN:
2806 			if (n_retries == 0) {
2807 				/* First time around. */
2808 				ib::warn()
2809 					<< "io_setup() failed with EAGAIN."
2810 					" Will make "
2811 					<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2812 					<< " attempts before giving up.";
2813 			}
2814 
2815 			if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2816 
2817 				++n_retries;
2818 
2819 				ib::warn()
2820 					<< "io_setup() attempt "
2821 					<< n_retries << ".";
2822 
2823 				os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2824 
2825 				continue;
2826 			}
2827 
2828 			/* Have tried enough. Better call it a day. */
2829 			ib::error()
2830 				<< "io_setup() failed with EAGAIN after "
2831 				<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2832 				<< " attempts.";
2833 			break;
2834 
2835 		case -ENOSYS:
2836 			ib::error()
2837 				<< "Linux Native AIO interface"
2838 				" is not supported on this platform. Please"
2839 				" check your OS documentation and install"
2840 				" appropriate binary of InnoDB.";
2841 
2842 			break;
2843 
2844 		default:
2845 			ib::error()
2846 				<< "Linux Native AIO setup"
2847 				<< " returned following error["
2848 				<< ret << "]";
2849 			break;
2850 		}
2851 
2852 		ib::info()
2853 			<< "You can disable Linux Native AIO by"
2854 			" setting innodb_use_native_aio = 0 in my.cnf";
2855 
2856 		break;
2857 	}
2858 
2859 	return(false);
2860 }
2861 
2862 /** Checks if the system supports native linux aio. On some kernel
2863 versions where native aio is supported it won't work on tmpfs. In such
2864 cases we can't use native aio as it is not possible to mix simulated
2865 and native aio.
2866 @return: true if supported, false otherwise. */
2867 bool
is_linux_native_aio_supported()2868 AIO::is_linux_native_aio_supported()
2869 {
2870 	int		fd;
2871 	io_context_t	io_ctx;
2872 	char		name[1000];
2873 
2874 	if (!linux_create_io_ctx(1, &io_ctx)) {
2875 
2876 		/* The platform does not support native aio. */
2877 
2878 		return(false);
2879 
2880 	} else if (!srv_read_only_mode) {
2881 
2882 		/* Now check if tmpdir supports native aio ops. */
2883 		fd = innobase_mysql_tmpfile(NULL);
2884 
2885 		if (fd < 0) {
2886 			ib::warn()
2887 				<< "Unable to create temp file to check"
2888 				" native AIO support.";
2889 
2890 			return(false);
2891 		}
2892 	} else {
2893 
2894 		os_normalize_path(srv_log_group_home_dir);
2895 
2896 		ulint	dirnamelen = strlen(srv_log_group_home_dir);
2897 
2898 		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2899 
2900 		memcpy(name, srv_log_group_home_dir, dirnamelen);
2901 
2902 		/* Add a path separator if needed. */
2903 		if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2904 
2905 			name[dirnamelen++] = OS_PATH_SEPARATOR;
2906 		}
2907 
2908 		strcpy(name + dirnamelen, "ib_logfile0");
2909 
2910 		fd = ::open(name, O_RDONLY);
2911 
2912 		if (fd == -1) {
2913 
2914 			ib::warn()
2915 				<< "Unable to open"
2916 				<< " \"" << name << "\" to check native"
2917 				<< " AIO read support.";
2918 
2919 			return(false);
2920 		}
2921 	}
2922 
2923 	struct io_event	io_event;
2924 
2925 	memset(&io_event, 0x0, sizeof(io_event));
2926 
2927 	byte*	buf = static_cast<byte*>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
2928 	byte*	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
2929 
2930 	struct iocb	iocb;
2931 
2932 	/* Suppress valgrind warning. */
2933 	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
2934 	memset(&iocb, 0x0, sizeof(iocb));
2935 
2936 	struct iocb*	p_iocb = &iocb;
2937 
2938 	if (!srv_read_only_mode) {
2939 
2940 		io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
2941 
2942 	} else {
2943 		ut_a(UNIV_PAGE_SIZE >= 512);
2944 		io_prep_pread(p_iocb, fd, ptr, 512, 0);
2945 	}
2946 
2947 	int	err = io_submit(io_ctx, 1, &p_iocb);
2948 
2949 	if (err >= 1) {
2950 		/* Now collect the submitted IO request. */
2951 		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2952 	}
2953 
2954 	ut_free(buf);
2955 	close(fd);
2956 
2957 	switch (err) {
2958 	case 1:
2959 		return(true);
2960 
2961 	case -EINVAL:
2962 	case -ENOSYS:
2963 		ib::error()
2964 			<< "Linux Native AIO not supported. You can either"
2965 			" move "
2966 			<< (srv_read_only_mode ? name : "tmpdir")
2967 			<< " to a file system that supports native"
2968 			" AIO or you can set innodb_use_native_aio to"
2969 			" FALSE to avoid this message.";
2970 
2971 		/* fall through. */
2972 	default:
2973 		ib::error()
2974 			<< "Linux Native AIO check on "
2975 			<< (srv_read_only_mode ? name : "tmpdir")
2976 			<< "returned error[" << -err << "]";
2977 	}
2978 
2979 	return(false);
2980 }
2981 
2982 #endif /* LINUX_NATIVE_AIO */
2983 
2984 /** Retrieves the last error number if an error occurs in a file io function.
2985 The number should be retrieved before any other OS calls (because they may
2986 overwrite the error number). If the number is not known to this program,
2987 the OS error number + 100 is returned.
2988 @param[in]	report_all_errors	true if we want an error message
2989 					printed of all errors
2990 @param[in]	on_error_silent		true then don't print any diagnostic
2991 					to the log
2992 @return error number, or OS error number + 100 */
2993 static
2994 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2995 os_file_get_last_error_low(
2996 	bool	report_all_errors,
2997 	bool	on_error_silent)
2998 {
2999 	int	err = errno;
3000 
3001 	if (err == 0) {
3002 		return(0);
3003 	}
3004 
3005 	if (report_all_errors
3006 	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
3007 
3008 		ib::error()
3009 			<< "Operating system error number "
3010 			<< err
3011 			<< " in a file operation.";
3012 
3013 		if (err == ENOENT) {
3014 
3015 			ib::error()
3016 				<< "The error means the system"
3017 				" cannot find the path specified.";
3018 
3019 			if (srv_is_being_started) {
3020 
3021 				ib::error()
3022 					<< "If you are installing InnoDB,"
3023 					" remember that you must create"
3024 					" directories yourself, InnoDB"
3025 					" does not create them.";
3026 			}
3027 		} else if (err == EACCES) {
3028 
3029 			ib::error()
3030 				<< "The error means mysqld does not have"
3031 				" the access rights to the directory.";
3032 
3033 		} else {
3034 			if (strerror(err) != NULL) {
3035 
3036 				ib::error()
3037 					<< "Error number " << err << " means '"
3038 					<< strerror(err) << "'";
3039 			}
3040 
3041 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3042 		}
3043 	}
3044 
3045 	switch (err) {
3046 	case ENOSPC:
3047 		return(OS_FILE_DISK_FULL);
3048 	case ENOENT:
3049 		return(OS_FILE_NOT_FOUND);
3050 	case EEXIST:
3051 		return(OS_FILE_ALREADY_EXISTS);
3052 	case EXDEV:
3053 	case ENOTDIR:
3054 	case EISDIR:
3055 		return(OS_FILE_PATH_ERROR);
3056 	case EAGAIN:
3057 		if (srv_use_native_aio) {
3058 			return(OS_FILE_AIO_RESOURCES_RESERVED);
3059 		}
3060 		break;
3061 	case EINTR:
3062 		if (srv_use_native_aio) {
3063 			return(OS_FILE_AIO_INTERRUPTED);
3064 		}
3065 		break;
3066 	case EACCES:
3067 		return(OS_FILE_ACCESS_VIOLATION);
3068 	}
3069 	return(OS_FILE_ERROR_MAX + err);
3070 }
3071 
3072 /** Wrapper to fsync(2) that retries the call on some errors.
3073 Returns the value 0 if successful; otherwise the value -1 is returned and
3074 the global variable errno is set to indicate the error.
3075 @param[in]	file		open file handle
3076 @return 0 if success, -1 otherwise */
3077 static
3078 int
os_file_fsync_posix(os_file_t file)3079 os_file_fsync_posix(
3080 	os_file_t	file)
3081 {
3082 	ulint		failures = 0;
3083 
3084 	for (;;) {
3085 
3086 		++os_n_fsyncs;
3087 
3088 		int	ret = fsync(file);
3089 
3090 		if (ret == 0) {
3091 			return(ret);
3092 		}
3093 
3094 		switch(errno) {
3095 		case ENOLCK:
3096 
3097 			++failures;
3098 			ut_a(failures < 1000);
3099 
3100 			if (!(failures % 100)) {
3101 
3102 				ib::warn()
3103 					<< "fsync(): "
3104 					<< "No locks available; retrying";
3105 			}
3106 
3107 			/* 0.2 sec */
3108 			os_thread_sleep(200000);
3109 			break;
3110 
3111 		case EIO:
3112 
3113                         ib::fatal()
3114 				<< "fsync() returned EIO, aborting.";
3115 			break;
3116 
3117 		case EINTR:
3118 
3119 			++failures;
3120 			ut_a(failures < 2000);
3121 			break;
3122 
3123 		default:
3124 			ut_error;
3125 			break;
3126 		}
3127 	}
3128 
3129 	ut_error;
3130 
3131 	return(-1);
3132 }
3133 
3134 /** Check the existence and type of the given file.
3135 @param[in]	path		path name of file
3136 @param[out]	exists		true if the file exists
3137 @param[out]	type		Type of the file, if it exists
3138 @return true if call succeeded */
3139 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)3140 os_file_status_posix(
3141 	const char*	path,
3142 	bool*		exists,
3143 	os_file_type_t* type)
3144 {
3145 	struct stat	statinfo;
3146 
3147 	int	ret = stat(path, &statinfo);
3148 
3149 	*exists = !ret;
3150 
3151 	if (!ret) {
3152 		/* file exists, everything OK */
3153 
3154 	} else if (errno == ENOENT || errno == ENOTDIR
3155 		   || errno == ENAMETOOLONG) {
3156 		/* file does not exist */
3157 		return(true);
3158 
3159 	} else {
3160 		/* file exists, but stat call failed */
3161 		os_file_handle_error_no_exit(path, "stat", false);
3162 		return(false);
3163 	}
3164 
3165 	if (S_ISDIR(statinfo.st_mode)) {
3166 		*type = OS_FILE_TYPE_DIR;
3167 
3168 	} else if (S_ISLNK(statinfo.st_mode)) {
3169 		*type = OS_FILE_TYPE_LINK;
3170 
3171 	} else if (S_ISREG(statinfo.st_mode)) {
3172 		*type = OS_FILE_TYPE_FILE;
3173 
3174 	} else {
3175 		*type = OS_FILE_TYPE_UNKNOWN;
3176 	}
3177 
3178 	return(true);
3179 }
3180 
3181 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3182 function!
3183 Flushes the write buffers of a given file to the disk.
3184 @param[in]	file		handle to a file
3185 @return true if success */
3186 bool
os_file_flush_func(os_file_t file)3187 os_file_flush_func(
3188 	os_file_t	file)
3189 {
3190 	int	ret;
3191 
3192 	ret = os_file_fsync_posix(file);
3193 
3194 	if (ret == 0) {
3195 		return(true);
3196 	}
3197 
3198 	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
3199 	we choose to ignore that error if we are using raw disks */
3200 
3201 	if (srv_start_raw_disk_in_use && errno == EINVAL) {
3202 
3203 		return(true);
3204 	}
3205 
3206 	ib::error() << "The OS said file flush did not succeed";
3207 
3208 	os_file_handle_error(NULL, "flush");
3209 
3210 	/* It is a fatal error if a file flush does not succeed, because then
3211 	the database can get corrupt on disk */
3212 	ut_error;
3213 
3214 	return(false);
3215 }
3216 
3217 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3218 this function!
3219 A simple function to open or create a file.
3220 @param[in]	name		name of the file or path as a null-terminated
3221 				string
3222 @param[in]	create_mode	create mode
3223 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3224 @param[in]	read_only	if true, read only checks are enforced
3225 @param[out]	success		true if succeed, false if error
3226 @return handle to the file, not defined if error, error number
3227 	can be retrieved with os_file_get_last_error */
3228 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3229 os_file_create_simple_func(
3230 	const char*	name,
3231 	ulint		create_mode,
3232 	ulint		access_type,
3233 	bool		read_only,
3234 	bool*		success)
3235 {
3236 	pfs_os_file_t	file;
3237 
3238 	*success = false;
3239 
3240 	int		create_flag;
3241 
3242 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3243 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3244 
3245 	if (create_mode == OS_FILE_OPEN) {
3246 
3247 		if (access_type == OS_FILE_READ_ONLY) {
3248 
3249 			create_flag = O_RDONLY;
3250 
3251 		} else if (read_only) {
3252 
3253 			create_flag = O_RDONLY;
3254 
3255 		} else {
3256 			create_flag = O_RDWR;
3257 		}
3258 
3259 	} else if (read_only) {
3260 
3261 		create_flag = O_RDONLY;
3262 
3263 	} else if (create_mode == OS_FILE_CREATE) {
3264 
3265 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3266 
3267 	} else if (create_mode == OS_FILE_CREATE_PATH) {
3268 
3269 		/* Create subdirs along the path if needed. */
3270 
3271 		*success = os_file_create_subdirs_if_needed(name);
3272 
3273 		if (!*success) {
3274 
3275 			ib::error()
3276 				<< "Unable to create subdirectories '"
3277 				<< name << "'";
3278 
3279 			file.m_file = OS_FILE_CLOSED;
3280 			return(file);
3281 		}
3282 
3283 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3284 		create_mode = OS_FILE_CREATE;
3285 	} else {
3286 
3287 		ib::error()
3288 			<< "Unknown file create mode ("
3289 			<< create_mode
3290 			<< " for file '" << name << "'";
3291 
3292 		file.m_file = OS_FILE_CLOSED;
3293 		return(file);
3294 	}
3295 
3296 	bool	retry;
3297 
3298 	do {
3299 		file.m_file = ::open(name, create_flag, os_innodb_umask);
3300 
3301 		if (file.m_file == -1) {
3302 			*success = false;
3303 
3304 			retry = os_file_handle_error(
3305 				name,
3306 				create_mode == OS_FILE_OPEN
3307 				? "open" : "create");
3308 		} else {
3309 			*success = true;
3310 			retry = false;
3311 		}
3312 
3313 	} while (retry);
3314 
3315 #ifdef USE_FILE_LOCK
3316 	if (!read_only
3317 	    && *success
3318 	    && access_type == OS_FILE_READ_WRITE
3319 	    && os_file_lock(file.m_file, name)) {
3320 
3321 		*success = false;
3322 		close(file.m_file);
3323 		file.m_file = -1;
3324 	}
3325 #endif /* USE_FILE_LOCK */
3326 
3327 	return(file);
3328 }
3329 
3330 /** This function attempts to create a directory named pathname. The new
3331 directory gets default permissions. On Unix the permissions are
3332 (0770 & ~umask). If the directory exists already, nothing is done and
3333 the call succeeds, unless the fail_if_exists arguments is true.
3334 If another error occurs, such as a permission error, this does not crash,
3335 but reports the error and returns false.
3336 @param[in]	pathname	directory name as null-terminated string
3337 @param[in]	fail_if_exists	if true, pre-existing directory is treated as
3338 				an error.
3339 @return true if call succeeds, false on error */
3340 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3341 os_file_create_directory(
3342 	const char*	pathname,
3343 	bool		fail_if_exists)
3344 {
3345 	int	rcode = mkdir(pathname, 0770);
3346 
3347 	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3348 		/* failure */
3349 		os_file_handle_error_no_exit(pathname, "mkdir", false);
3350 
3351 		return(false);
3352 	}
3353 
3354 	return(true);
3355 }
3356 
3357 /**
3358 The os_file_opendir() function opens a directory stream corresponding to the
3359 directory named by the dirname argument. The directory stream is positioned
3360 at the first entry. In both Unix and Windows we automatically skip the '.'
3361 and '..' items at the start of the directory listing.
3362 @param[in]	dirname		directory name; it must not contain a trailing
3363 				'\' or '/'
3364 @param[in]	is_fatal	true if we should treat an error as a fatal
3365 				error; if we try to open symlinks then we do
3366 				not wish a fatal error if it happens not to be
3367 				a directory
3368 @return directory stream, NULL if error */
3369 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3370 os_file_opendir(
3371 	const char*	dirname,
3372 	bool		error_is_fatal)
3373 {
3374 	os_file_dir_t		dir;
3375 	dir = opendir(dirname);
3376 
3377 	if (dir == NULL && error_is_fatal) {
3378 		os_file_handle_error(dirname, "opendir");
3379 	}
3380 
3381 	return(dir);
3382 }
3383 
3384 /** Closes a directory stream.
3385 @param[in]	dir		directory stream
3386 @return 0 if success, -1 if failure */
3387 int
os_file_closedir(os_file_dir_t dir)3388 os_file_closedir(
3389 	os_file_dir_t	dir)
3390 {
3391 	int	ret = closedir(dir);
3392 
3393 	if (ret != 0) {
3394 		os_file_handle_error_no_exit(NULL, "closedir", false);
3395 	}
3396 
3397 	return(ret);
3398 }
3399 
3400 /** This function returns information of the next file in the directory. We jump
3401 over the '.' and '..' entries in the directory.
3402 @param[in]	dirname		directory name or path
3403 @param[in]	dir		directory stream
3404 @param[out]	info		buffer where the info is returned
3405 @return 0 if ok, -1 if error, 1 if at the end of the directory */
3406 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)3407 os_file_readdir_next_file(
3408 	const char*	dirname,
3409 	os_file_dir_t	dir,
3410 	os_file_stat_t*	info)
3411 {
3412 	struct dirent*	ent;
3413 	char*		full_path;
3414 	int		ret;
3415 	struct stat	statinfo;
3416 
3417 #ifdef HAVE_READDIR_R
3418 	char		dirent_buf[sizeof(struct dirent)
3419 				   + _POSIX_PATH_MAX + 100];
3420 	/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
3421 	the max file name len; but in most standards, the
3422 	length is NAME_MAX; we add 100 to be even safer */
3423 #endif /* HAVE_READDIR_R */
3424 
3425 next_file:
3426 
3427 #ifdef HAVE_READDIR_R
3428 	ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
3429 
3430 	if (ret != 0) {
3431 
3432 		ib::error()
3433 			<< "Cannot read directory " << dirname
3434 			<< " error: " << ret;
3435 
3436 		return(-1);
3437 	}
3438 
3439 	if (ent == NULL) {
3440 		/* End of directory */
3441 
3442 		return(1);
3443 	}
3444 
3445 	ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
3446 #else
3447 	ent = readdir(dir);
3448 
3449 	if (ent == NULL) {
3450 
3451 		return(1);
3452 	}
3453 #endif /* HAVE_READDIR_R */
3454 	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
3455 
3456 	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
3457 
3458 		goto next_file;
3459 	}
3460 
3461 	strcpy(info->name, ent->d_name);
3462 
3463 	full_path = static_cast<char*>(
3464 		ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
3465 
3466 	sprintf(full_path, "%s/%s", dirname, ent->d_name);
3467 
3468 	ret = stat(full_path, &statinfo);
3469 
3470 	if (ret) {
3471 
3472 		if (errno == ENOENT) {
3473 			/* readdir() returned a file that does not exist,
3474 			it must have been deleted in the meantime. Do what
3475 			would have happened if the file was deleted before
3476 			readdir() - ignore and go to the next entry.
3477 			If this is the last entry then info->name will still
3478 			contain the name of the deleted file when this
3479 			function returns, but this is not an issue since the
3480 			caller shouldn't be looking at info when end of
3481 			directory is returned. */
3482 
3483 			ut_free(full_path);
3484 
3485 			goto next_file;
3486 		}
3487 
3488 		os_file_handle_error_no_exit(full_path, "stat", false);
3489 
3490 		ut_free(full_path);
3491 
3492 		return(-1);
3493 	}
3494 
3495 	info->size = statinfo.st_size;
3496 
3497 	if (S_ISDIR(statinfo.st_mode)) {
3498 		info->type = OS_FILE_TYPE_DIR;
3499 	} else if (S_ISLNK(statinfo.st_mode)) {
3500 		info->type = OS_FILE_TYPE_LINK;
3501 	} else if (S_ISREG(statinfo.st_mode)) {
3502 		info->type = OS_FILE_TYPE_FILE;
3503 	} else {
3504 		info->type = OS_FILE_TYPE_UNKNOWN;
3505 	}
3506 
3507 	ut_free(full_path);
3508 
3509 	return(0);
3510 }
3511 
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new one. The open(2) call is repeated
for as long as the error handler asks for a retry; afterwards the function
optionally switches off OS caching and, when USE_FILE_LOCK is defined,
locks the file.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode; may be combined with the
				OS_FILE_ON_ERROR_NO_EXIT and
				OS_FILE_ON_ERROR_SILENT flags, which are
				stripped off before the mode is interpreted
@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
				is desired, OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use async
				I/O or unbuffered I/O: look in the function
				source code for the exact rules
@param[in]	type		OS_DATA_FILE, OS_LOG_FILE or OS_DATA_TEMP_FILE
@param[in]	read_only	true, if read only checks should be enforced
@param[out]	success		true if succeeded
@return handle to the file, not defined if error, error number
	can be retrieved with os_file_get_last_error */
pfs_os_file_t
os_file_create_func(
	const char*	name,
	ulint		create_mode,
	ulint		purpose,
	ulint		type,
	bool		read_only,
	bool*		success)
{
	bool		on_error_no_exit;
	bool		on_error_silent;
	pfs_os_file_t	file;

	*success = false;

	/* Debug injection point: under the "ib_create_table_fail_disk_full"
	keyword the function reports failure with errno set to ENOSPC
	without touching the file system. */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = false;
		errno = ENOSPC;
		file.m_file = OS_FILE_CLOSED;
		return(file);
	);

	int		create_flag;
	const char*	mode_str	= NULL;

	/* Remember the error-policy flags, then remove them so that
	create_mode can be compared against the plain mode constants. */
	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? true : false;
	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? true : false;

	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
	create_mode &= ~OS_FILE_ON_ERROR_SILENT;

	if (create_mode == OS_FILE_OPEN
	    || create_mode == OS_FILE_OPEN_RAW
	    || create_mode == OS_FILE_OPEN_RETRY) {

		mode_str = "OPEN";

		create_flag = read_only ? O_RDONLY : O_RDWR;

	} else if (read_only) {

		/* In read-only mode a create/overwrite request degrades to
		a plain read-only open. */
		mode_str = "OPEN";

		create_flag = O_RDONLY;

	} else if (create_mode == OS_FILE_CREATE) {

		mode_str = "CREATE";
		create_flag = O_RDWR | O_CREAT | O_EXCL;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		mode_str = "OVERWRITE";
		create_flag = O_RDWR | O_CREAT | O_TRUNC;

	} else {
		ib::error()
			<< "Unknown file create mode (" << create_mode << ")"
			<< " for file '" << name << "'";

		file.m_file = OS_FILE_CLOSED;
		return(file);
	}

	ut_a(type == OS_LOG_FILE
	     || type == OS_DATA_FILE
	     || type == OS_DATA_TEMP_FILE);

	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);

#ifdef O_SYNC
	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
	O_SYNC because the datasync options seemed to corrupt files in 2001
	in both Linux and Solaris */

	if (!read_only
	    && type == OS_LOG_FILE
	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {

		create_flag |= O_SYNC;
	}
#endif /* O_SYNC */

	bool		retry;

	/* Keep calling open() for as long as the chosen error handler
	returns true. */
	do {
		file.m_file = ::open(name, create_flag, os_innodb_umask);

		if (file.m_file == -1) {
			const char*	operation;

			operation = (create_mode == OS_FILE_CREATE
				     && !read_only) ? "create" : "open";

			*success = false;

			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {
			*success = true;
			retry = false;
		}

	} while (retry);

	/* We disable OS caching (O_DIRECT) only on data files */

	if (!read_only
	    && *success
	    && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE)
	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {

		os_file_set_nocache(file.m_file, name, mode_str);
	}

#ifdef USE_FILE_LOCK
	/* A nonzero result from os_file_lock() is treated as failure. Files
	opened with OS_FILE_OPEN_RAW are never locked. */
	if (!read_only
	    && *success
	    && create_mode != OS_FILE_OPEN_RAW
	    && os_file_lock(file.m_file, name)) {

		if (create_mode == OS_FILE_OPEN_RETRY) {

			ib::info()
				<< "Retrying to lock the first data file";

			/* Up to 100 further attempts, one second apart. */
			for (int i = 0; i < 100; i++) {
				os_thread_sleep(1000000);

				if (!os_file_lock(file.m_file, name)) {
					*success = true;
					return(file);
				}
			}

			ib::info()
				<< "Unable to open the first data file";
		}

		/* Could not lock: give the descriptor back and fail. */
		*success = false;
		close(file.m_file);
		file.m_file = -1;
	}
#endif /* USE_FILE_LOCK */

	return(file);
}
3684 
3685 /** NOTE! Use the corresponding macro
3686 os_file_create_simple_no_error_handling(), not directly this function!
3687 A simple function to open or create a file.
3688 @param[in]	name		name of the file or path as a null-terminated
3689 				string
3690 @param[in]	create_mode	create mode
3691 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3692 				OS_FILE_READ_ALLOW_DELETE; the last option
3693 				is used by a backup program reading the file
3694 @param[in]	read_only	if true read only mode checks are enforced
3695 @param[out]	success		true if succeeded
3696 @return own: handle to the file, not defined if error, error number
3697 	can be retrieved with os_file_get_last_error */
3698 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3699 os_file_create_simple_no_error_handling_func(
3700 	const char*	name,
3701 	ulint		create_mode,
3702 	ulint		access_type,
3703 	bool		read_only,
3704 	bool*		success)
3705 {
3706 	pfs_os_file_t	file;
3707 	int		create_flag;
3708 
3709 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3710 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3711 
3712 	*success = false;
3713 
3714 	if (create_mode == OS_FILE_OPEN) {
3715 
3716 		if (access_type == OS_FILE_READ_ONLY) {
3717 
3718 			create_flag = O_RDONLY;
3719 
3720 		} else if (read_only) {
3721 
3722 			create_flag = O_RDONLY;
3723 
3724 		} else {
3725 
3726 			ut_a(access_type == OS_FILE_READ_WRITE
3727 			     || access_type == OS_FILE_READ_ALLOW_DELETE);
3728 
3729 			create_flag = O_RDWR;
3730 		}
3731 
3732 	} else if (read_only) {
3733 
3734 		create_flag = O_RDONLY;
3735 
3736 	} else if (create_mode == OS_FILE_CREATE) {
3737 
3738 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3739 
3740 	} else {
3741 
3742 		ib::error()
3743 			<< "Unknown file create mode "
3744 			<< create_mode << " for file '" << name << "'";
3745 		file.m_file = OS_FILE_CLOSED;
3746 		return(file);
3747 	}
3748 
3749 	file.m_file = ::open(name, create_flag, os_innodb_umask);
3750 
3751 	*success = (file.m_file != -1);
3752 
3753 #ifdef USE_FILE_LOCK
3754 	if (!read_only
3755 	    && *success
3756 	    && access_type == OS_FILE_READ_WRITE
3757 	    && os_file_lock(file.m_file, name)) {
3758 
3759 		*success = false;
3760 		close(file.m_file);
3761 		file.m_file = -1;
3762 
3763 	}
3764 #endif /* USE_FILE_LOCK */
3765 
3766 	return(file);
3767 }
3768 
3769 /** Deletes a file if it exists. The file has to be closed before calling this.
3770 @param[in]	name		file path as a null-terminated string
3771 @param[out]	exist		indicate if file pre-exist
3772 @return true if success */
3773 bool
os_file_delete_if_exists_func(const char * name,bool * exist)3774 os_file_delete_if_exists_func(
3775 	const char*	name,
3776 	bool*		exist)
3777 {
3778 	if (exist != NULL) {
3779 		*exist = true;
3780 	}
3781 
3782 	int	ret = unlink(name);
3783 
3784 	if (ret != 0 && errno == ENOENT) {
3785 		if (exist != NULL) {
3786 			*exist = false;
3787 		}
3788 	} else if (ret != 0 && errno != ENOENT) {
3789 		os_file_handle_error_no_exit(name, "delete", false);
3790 
3791 		return(false);
3792 	}
3793 
3794 	return(true);
3795 }
3796 
3797 /** Deletes a file. The file has to be closed before calling this.
3798 @param[in]	name		file path as a null-terminated string
3799 @return true if success */
3800 bool
os_file_delete_func(const char * name)3801 os_file_delete_func(
3802 	const char*	name)
3803 {
3804 	int	ret = unlink(name);
3805 
3806 	if (ret != 0) {
3807 		os_file_handle_error_no_exit(name, "delete", false);
3808 
3809 		return(false);
3810 	}
3811 
3812 	return(true);
3813 }
3814 
3815 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3816 function!
3817 Renames a file (can also move it to another directory). It is safest that the
3818 file is closed before calling this function.
3819 @param[in]	oldpath		old file path as a null-terminated string
3820 @param[in]	newpath		new file path
3821 @return true if success */
3822 bool
os_file_rename_func(const char * oldpath,const char * newpath)3823 os_file_rename_func(
3824 	const char*	oldpath,
3825 	const char*	newpath)
3826 {
3827 #ifdef UNIV_DEBUG
3828 	os_file_type_t	type;
3829 	bool		exists;
3830 
3831 	/* New path must not exist. */
3832 	ut_ad(os_file_status(newpath, &exists, &type));
3833 	ut_ad(!exists);
3834 
3835 	/* Old path must exist. */
3836 	ut_ad(os_file_status(oldpath, &exists, &type));
3837 	ut_ad(exists);
3838 #endif /* UNIV_DEBUG */
3839 
3840 	int	ret = rename(oldpath, newpath);
3841 
3842 	if (ret != 0) {
3843 		os_file_handle_error_no_exit(oldpath, "rename", false);
3844 
3845 		return(false);
3846 	}
3847 
3848 	return(true);
3849 }
3850 
3851 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3852 function!
3853 Closes a file handle. In case of error, error number can be retrieved with
3854 os_file_get_last_error.
3855 @param[in]	file		Handle to close
3856 @return true if success */
3857 bool
os_file_close_func(os_file_t file)3858 os_file_close_func(
3859 	os_file_t	file)
3860 {
3861 	int	ret = close(file);
3862 
3863 	if (ret == -1) {
3864 		os_file_handle_error(NULL, "close");
3865 
3866 		return(false);
3867 	}
3868 
3869 	return(true);
3870 }
3871 
3872 /** Gets a file size.
3873 @param[in]	file		handle to an open file
3874 @return file size, or (os_offset_t) -1 on failure */
3875 os_offset_t
os_file_get_size(pfs_os_file_t file)3876 os_file_get_size(
3877 	pfs_os_file_t	file)
3878 {
3879 	/* Store current position */
3880 	os_offset_t	pos = lseek(file.m_file, 0, SEEK_CUR);
3881 	os_offset_t	file_size = lseek(file.m_file, 0, SEEK_END);
3882 
3883 	/* Restore current position as the function should not change it */
3884 	lseek(file.m_file, pos, SEEK_SET);
3885 
3886 	return(file_size);
3887 }
3888 
3889 /** Gets a file size.
3890 @param[in]	filename	Full path to the filename to check
3891 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3892 	errno */
3893 os_file_size_t
os_file_get_size(const char * filename)3894 os_file_get_size(
3895 	const char*	filename)
3896 {
3897 	struct stat	s;
3898 	os_file_size_t	file_size;
3899 
3900 	int	ret = stat(filename, &s);
3901 
3902 	if (ret == 0) {
3903 		file_size.m_total_size = s.st_size;
3904 		/* st_blocks is in 512 byte sized blocks */
3905 		file_size.m_alloc_size = s.st_blocks * 512;
3906 	} else {
3907 		file_size.m_total_size = ~0;
3908 		file_size.m_alloc_size = (os_offset_t) errno;
3909 	}
3910 
3911 	return(file_size);
3912 }
3913 
3914 /** This function returns information about the specified file
3915 @param[in]	path		pathname of the file
3916 @param[out]	stat_info	information of a file in a directory
3917 @param[in,out]	statinfo	information of a file in a directory
3918 @param[in]	check_rw_perm	for testing whether the file can be opened
3919 				in RW mode
3920 @param[in]	read_only	if true read only mode checks are enforced
3921 @return DB_SUCCESS if all OK */
3922 static
3923 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3924 os_file_get_status_posix(
3925 	const char*	path,
3926 	os_file_stat_t* stat_info,
3927 	struct stat*	statinfo,
3928 	bool		check_rw_perm,
3929 	bool		read_only)
3930 {
3931 	int	ret = stat(path, statinfo);
3932 
3933 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3934 		/* file does not exist */
3935 
3936 		return(DB_NOT_FOUND);
3937 
3938 	} else if (ret) {
3939 		/* file exists, but stat call failed */
3940 
3941 		os_file_handle_error_no_exit(path, "stat", false);
3942 
3943 		return(DB_FAIL);
3944 	}
3945 
3946 	switch (statinfo->st_mode & S_IFMT) {
3947 	case S_IFDIR:
3948 		stat_info->type = OS_FILE_TYPE_DIR;
3949 		break;
3950 	case S_IFLNK:
3951 		stat_info->type = OS_FILE_TYPE_LINK;
3952 		break;
3953 	case S_IFBLK:
3954 		/* Handle block device as regular file. */
3955 	case S_IFCHR:
3956 		/* Handle character device as regular file. */
3957 	case S_IFREG:
3958 		stat_info->type = OS_FILE_TYPE_FILE;
3959 		break;
3960 	default:
3961 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
3962 	}
3963 
3964 	stat_info->size = statinfo->st_size;
3965 	stat_info->block_size = statinfo->st_blksize;
3966 	stat_info->alloc_size = statinfo->st_blocks * 512;
3967 
3968 	if (check_rw_perm
3969 	    && (stat_info->type == OS_FILE_TYPE_FILE
3970 		|| stat_info->type == OS_FILE_TYPE_BLOCK)) {
3971 
3972 		int	access = !read_only ? O_RDWR : O_RDONLY;
3973 		int	fh = ::open(path, access, os_innodb_umask);
3974 
3975 		if (fh == -1) {
3976 			stat_info->rw_perm = false;
3977 		} else {
3978 			stat_info->rw_perm = true;
3979 			close(fh);
3980 		}
3981 	}
3982 
3983 	return(DB_SUCCESS);
3984 }
3985 
3986 /** Truncates a file to a specified size in bytes.
3987 Do nothing if the size to preserve is greater or equal to the current
3988 size of the file.
3989 @param[in]	pathname	file path
3990 @param[in]	file		file to be truncated
3991 @param[in]	size		size to preserve in bytes
3992 @return true if success */
3993 static
3994 bool
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)3995 os_file_truncate_posix(
3996 	const char*	pathname,
3997 	pfs_os_file_t	file,
3998 	os_offset_t	size)
3999 {
4000 	int     res = ftruncate(file.m_file, size);
4001 	if (res == -1) {
4002 
4003 		bool	retry;
4004 
4005 		retry = os_file_handle_error_no_exit(
4006 			pathname, "truncate", false);
4007 
4008 		if (retry) {
4009 			ib::warn()
4010 				<< "Truncate failed for '"
4011 				<< pathname << "'";
4012 		}
4013 	}
4014 
4015 	return(res == 0);
4016 }
4017 
/** Truncates a stdio stream's underlying file at the stream's current
position, as reported by ftell(3), using ftruncate(2) on the descriptor
obtained from fileno(3).
@param[in]	file		file to be truncated
@return true if success */
bool
os_file_set_eof(
	FILE*		file)
{
	const int	fd = fileno(file);
	const long	cur_pos = ftell(file);

	return(ftruncate(fd, cur_pos) == 0);
}
4026 
4027 #ifdef UNIV_HOTBACKUP
4028 /** Closes a file handle.
4029 @param[in]	file		Handle to a file
4030 @return true if success */
4031 bool
os_file_close_no_error_handling(os_file_t file)4032 os_file_close_no_error_handling(
4033 	os_file_t	file)
4034 {
4035 	return(close(file) != -1);
4036 }
4037 #endif /* UNIV_HOTBACKUP */
4038 
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
The definition in this (non-Windows) branch of the file has an empty
body: it takes no arguments, returns nothing and performs no work. */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	/* No op on non Windows */
}
4048 
4049 #else /* !_WIN32 */
4050 
4051 #include <WinIoCtl.h>
4052 
4053 /** Do the read/write
4054 @param[in]	request	The IO context and type
4055 @return the number of bytes read/written or negative value on error */
4056 ssize_t
execute(const IORequest & request)4057 SyncFileIO::execute(const IORequest& request)
4058 {
4059 	OVERLAPPED	seek;
4060 
4061 	memset(&seek, 0x0, sizeof(seek));
4062 
4063 	seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
4064 	seek.OffsetHigh = (DWORD) (m_offset >> 32);
4065 
4066 	BOOL	ret;
4067 	DWORD	n_bytes;
4068 
4069 	if (request.is_read()) {
4070 		ret = ReadFile(m_fh, m_buf,
4071 			static_cast<DWORD>(m_n), &n_bytes, &seek);
4072 
4073 	} else {
4074 		ut_ad(request.is_write());
4075 		ret = WriteFile(m_fh, m_buf,
4076 			static_cast<DWORD>(m_n), &n_bytes, &seek);
4077 	}
4078 
4079 	return(ret ? static_cast<ssize_t>(n_bytes) : -1);
4080 }
4081 
4082 /** Do the read/write
4083 @param[in,out]	slot	The IO slot, it has the IO context
4084 @return the number of bytes read/written or negative value on error */
4085 ssize_t
execute(Slot * slot)4086 SyncFileIO::execute(Slot* slot)
4087 {
4088 	BOOL	ret;
4089 
4090 	if (slot->type.is_read()) {
4091 		ret = ReadFile(
4092 			slot->file.m_file, slot->ptr, slot->len,
4093 			&slot->n_bytes, &slot->control);
4094 	} else {
4095 		ut_ad(slot->type.is_write());
4096 		ret = WriteFile(
4097 			slot->file.m_file, slot->ptr, slot->len,
4098 			&slot->n_bytes, &slot->control);
4099 	}
4100 
4101 	return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
4102 }
4103 
4104 /** Check if the file system supports sparse files.
4105 @param[in]	 name		File name
4106 @return true if the file system supports sparse files */
4107 static
4108 bool
os_is_sparse_file_supported_win32(const char * filename)4109 os_is_sparse_file_supported_win32(const char* filename)
4110 {
4111 	char	volname[MAX_PATH];
4112 	BOOL	result = GetVolumePathName(filename, volname, MAX_PATH);
4113 
4114 	if (!result) {
4115 
4116 		ib::error()
4117 			<< "os_is_sparse_file_supported: "
4118 			<< "Failed to get the volume path name for: "
4119 			<< filename
4120 			<< "- OS error number " << GetLastError();
4121 
4122 		return(false);
4123 	}
4124 
4125 	DWORD	flags;
4126 
4127 	GetVolumeInformation(
4128 		volname, NULL, MAX_PATH, NULL, NULL,
4129 		&flags, NULL, MAX_PATH);
4130 
4131 	return(flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
4132 }
4133 
4134 /** Free storage space associated with a section of the file.
4135 @param[in]	fh		Open file handle
4136 @param[in]	page_size	Tablespace page size
4137 @param[in]	block_size	File system block size
4138 @param[in]	off		Starting offset (SEEK_SET)
4139 @param[in]	len		Size of the hole
4140 @return 0 on success or errno */
4141 static
4142 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)4143 os_file_punch_hole_win32(
4144 	os_file_t	fh,
4145 	os_offset_t	off,
4146 	os_offset_t	len)
4147 {
4148 	FILE_ZERO_DATA_INFORMATION	punch;
4149 
4150 	punch.FileOffset.QuadPart = off;
4151 	punch.BeyondFinalZero.QuadPart = off + len;
4152 
4153 	/* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
4154 	therefore we pass a dummy parameter. */
4155 	DWORD	temp;
4156 
4157 	BOOL	result = DeviceIoControl(
4158 		fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
4159 		NULL, 0, &temp, NULL);
4160 
4161 	return(!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
4162 }
4163 
4164 /** Check the existence and type of the given file.
4165 @param[in]	path		path name of file
4166 @param[out]	exists		true if the file exists
4167 @param[out]	type		Type of the file, if it exists
4168 @return true if call succeeded */
4169 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)4170 os_file_status_win32(
4171 	const char*	path,
4172 	bool*		exists,
4173 	os_file_type_t* type)
4174 {
4175 	int		ret;
4176 	struct _stat64	statinfo;
4177 
4178 	ret = _stat64(path, &statinfo);
4179 
4180 	*exists = !ret;
4181 
4182 	if (!ret) {
4183 		/* file exists, everything OK */
4184 
4185 	} else if (errno == ENOENT || errno == ENOTDIR
4186 		  || errno == ENAMETOOLONG) {
4187 		/* file does not exist */
4188 		return(true);
4189 
4190 	} else {
4191 		/* file exists, but stat call failed */
4192 		os_file_handle_error_no_exit(path, "stat", false);
4193 		return(false);
4194 	}
4195 
4196 	if (_S_IFDIR & statinfo.st_mode) {
4197 		*type = OS_FILE_TYPE_DIR;
4198 
4199 	} else if (_S_IFREG & statinfo.st_mode) {
4200 		*type = OS_FILE_TYPE_FILE;
4201 
4202 	} else {
4203 		*type = OS_FILE_TYPE_UNKNOWN;
4204 	}
4205 
4206 	return(true);
4207 }
4208 
4209 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
4210 function!
4211 Flushes the write buffers of a given file to the disk.
4212 @param[in]	file		handle to a file
4213 @return true if success */
4214 bool
os_file_flush_func(os_file_t file)4215 os_file_flush_func(
4216 	os_file_t	file)
4217 {
4218 	++os_n_fsyncs;
4219 
4220 	BOOL	ret = FlushFileBuffers(file);
4221 
4222 	if (ret) {
4223 		return(true);
4224 	}
4225 
4226 	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
4227 	actually a raw device, we choose to ignore that error if we are using
4228 	raw disks */
4229 
4230 	if (srv_start_raw_disk_in_use && GetLastError()
4231 	    == ERROR_INVALID_FUNCTION) {
4232 		return(true);
4233 	}
4234 
4235 	os_file_handle_error(NULL, "flush");
4236 
4237 	/* It is a fatal error if a file flush does not succeed, because then
4238 	the database can get corrupt on disk */
4239 	ut_error;
4240 
4241 	return(false);
4242 }
4243 
4244 /** Retrieves the last error number if an error occurs in a file io function.
4245 The number should be retrieved before any other OS calls (because they may
4246 overwrite the error number). If the number is not known to this program,
4247 the OS error number + 100 is returned.
4248 @param[in]	report_all_errors	true if we want an error message printed
4249 					of all errors
4250 @param[in]	on_error_silent		true then don't print any diagnostic
4251 					to the log
4252 @return error number, or OS error number + 100 */
4253 static
4254 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)4255 os_file_get_last_error_low(
4256 	bool	report_all_errors,
4257 	bool	on_error_silent)
4258 {
4259 	ulint	err = (ulint) GetLastError();
4260 
4261 	if (err == ERROR_SUCCESS) {
4262 		return(0);
4263 	}
4264 
4265 	if (report_all_errors
4266 	    || (!on_error_silent
4267 		&& err != ERROR_DISK_FULL
4268 		&& err != ERROR_FILE_EXISTS)) {
4269 
4270 		ib::error()
4271 			<< "Operating system error number " << err
4272 			<< " in a file operation.";
4273 
4274 		if (err == ERROR_PATH_NOT_FOUND) {
4275 			ib::error()
4276 				<< "The error means the system"
4277 				" cannot find the path specified.";
4278 
4279 			if (srv_is_being_started) {
4280 				ib::error()
4281 					<< "If you are installing InnoDB,"
4282 					" remember that you must create"
4283 					" directories yourself, InnoDB"
4284 					" does not create them.";
4285 			}
4286 
4287 		} else if (err == ERROR_ACCESS_DENIED) {
4288 
4289 			ib::error()
4290 				<< "The error means mysqld does not have"
4291 				" the access rights to"
4292 				" the directory. It may also be"
4293 				" you have created a subdirectory"
4294 				" of the same name as a data file.";
4295 
4296 		} else if (err == ERROR_SHARING_VIOLATION
4297 			   || err == ERROR_LOCK_VIOLATION) {
4298 
4299 			ib::error()
4300 				<< "The error means that another program"
4301 				" is using InnoDB's files."
4302 				" This might be a backup or antivirus"
4303 				" software or another instance"
4304 				" of MySQL."
4305 				" Please close it to get rid of this error.";
4306 
4307 		} else if (err == ERROR_WORKING_SET_QUOTA
4308 			   || err == ERROR_NO_SYSTEM_RESOURCES) {
4309 
4310 			ib::error()
4311 				<< "The error means that there are no"
4312 				" sufficient system resources or quota to"
4313 				" complete the operation.";
4314 
4315 		} else if (err == ERROR_OPERATION_ABORTED) {
4316 
4317 			ib::error()
4318 				<< "The error means that the I/O"
4319 				" operation has been aborted"
4320 				" because of either a thread exit"
4321 				" or an application request."
4322 				" Retry attempt is made.";
4323 		} else {
4324 
4325 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4326 		}
4327 	}
4328 
4329 	if (err == ERROR_FILE_NOT_FOUND) {
4330 		return(OS_FILE_NOT_FOUND);
4331 	} else if (err == ERROR_DISK_FULL) {
4332 		return(OS_FILE_DISK_FULL);
4333 	} else if (err == ERROR_FILE_EXISTS) {
4334 		return(OS_FILE_ALREADY_EXISTS);
4335 	} else if (err == ERROR_SHARING_VIOLATION
4336 		   || err == ERROR_LOCK_VIOLATION) {
4337 		return(OS_FILE_SHARING_VIOLATION);
4338 	} else if (err == ERROR_WORKING_SET_QUOTA
4339 		   || err == ERROR_NO_SYSTEM_RESOURCES) {
4340 		return(OS_FILE_INSUFFICIENT_RESOURCE);
4341 	} else if (err == ERROR_OPERATION_ABORTED) {
4342 		return(OS_FILE_OPERATION_ABORTED);
4343 	} else if (err == ERROR_ACCESS_DENIED) {
4344 		return(OS_FILE_ACCESS_VIOLATION);
4345 	}
4346 
4347 	return(OS_FILE_ERROR_MAX + err);
4348 }
4349 
4350 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
4351 this function!
4352 A simple function to open or create a file.
4353 @param[in]	name		name of the file or path as a null-terminated
4354 				string
4355 @param[in]	create_mode	create mode
4356 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
4357 @param[in]	read_only	if true read only mode checks are enforced
4358 @param[out]	success		true if succeed, false if error
4359 @return handle to the file, not defined if error, error number
4360 	can be retrieved with os_file_get_last_error */
4361 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4362 os_file_create_simple_func(
4363 	const char*	name,
4364 	ulint		create_mode,
4365 	ulint		access_type,
4366 	bool		read_only,
4367 	bool*		success)
4368 {
4369 	pfs_os_file_t	file;
4370 
4371 	*success = false;
4372 
4373 	DWORD		access;
4374 	DWORD		create_flag;
4375 	DWORD		attributes = 0;
4376 
4377 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4378 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4379 
4380 	if (create_mode == OS_FILE_OPEN) {
4381 
4382 		create_flag = OPEN_EXISTING;
4383 
4384 	} else if (read_only) {
4385 
4386 		create_flag = OPEN_EXISTING;
4387 
4388 	} else if (create_mode == OS_FILE_CREATE) {
4389 
4390 		create_flag = CREATE_NEW;
4391 
4392 	} else if (create_mode == OS_FILE_CREATE_PATH) {
4393 
4394 		/* Create subdirs along the path if needed. */
4395 		*success = os_file_create_subdirs_if_needed(name);
4396 
4397 		if (!*success) {
4398 
4399 			ib::error()
4400 				<< "Unable to create subdirectories '"
4401 				<< name << "'";
4402 			file.m_file = OS_FILE_CLOSED;
4403 			return(file);
4404 		}
4405 
4406 		create_flag = CREATE_NEW;
4407 		create_mode = OS_FILE_CREATE;
4408 
4409 	} else {
4410 
4411 		ib::error()
4412 			<< "Unknown file create mode ("
4413 			<< create_mode << ") for file '"
4414 			<< name << "'";
4415 
4416 		file.m_file = OS_FILE_CLOSED;
4417 		return(file);
4418 	}
4419 
4420 	if (access_type == OS_FILE_READ_ONLY) {
4421 
4422 		access = GENERIC_READ;
4423 
4424 	} else if (read_only) {
4425 
4426 		ib::info()
4427 			<< "Read only mode set. Unable to"
4428 			" open file '" << name << "' in RW mode, "
4429 			<< "trying RO mode", name;
4430 
4431 		access = GENERIC_READ;
4432 
4433 	} else if (access_type == OS_FILE_READ_WRITE) {
4434 
4435 		access = GENERIC_READ | GENERIC_WRITE;
4436 
4437 	} else {
4438 
4439 		ib::error()
4440 			<< "Unknown file access type (" << access_type << ") "
4441 			"for file '" << name << "'";
4442 
4443 		file.m_file = OS_FILE_CLOSED;
4444 		return(file);
4445 	}
4446 
4447 	bool	retry;
4448 
4449 	do {
4450 		/* Use default security attributes and no template file. */
4451 
4452 		file.m_file = CreateFile(
4453 			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
4454 			create_flag, attributes, NULL);
4455 
4456 		if (file.m_file == INVALID_HANDLE_VALUE) {
4457 
4458 			*success = false;
4459 
4460 			retry = os_file_handle_error(
4461 				name, create_mode == OS_FILE_OPEN ?
4462 				"open" : "create");
4463 
4464 		} else {
4465 
4466 			retry = false;
4467 
4468 			*success = true;
4469 
4470 			DWORD	temp;
4471 
4472 			/* This is a best effort use case, if it fails then
4473 			we will find out when we try and punch the hole. */
4474 
4475 			DeviceIoControl(
4476 				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
4477 				&temp, NULL);
4478 		}
4479 
4480 	} while (retry);
4481 
4482 	return(file);
4483 }
4484 
4485 /** This function attempts to create a directory named pathname. The new
4486 directory gets default permissions. On Unix the permissions are
4487 (0770 & ~umask). If the directory exists already, nothing is done and
4488 the call succeeds, unless the fail_if_exists arguments is true.
4489 If another error occurs, such as a permission error, this does not crash,
4490 but reports the error and returns false.
4491 @param[in]	pathname	directory name as null-terminated string
4492 @param[in]	fail_if_exists	if true, pre-existing directory is treated
4493 				as an error.
4494 @return true if call succeeds, false on error */
4495 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)4496 os_file_create_directory(
4497 	const char*	pathname,
4498 	bool		fail_if_exists)
4499 {
4500 	BOOL	rcode;
4501 
4502 	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
4503 	if (!(rcode != 0
4504 	      || (GetLastError() == ERROR_ALREADY_EXISTS
4505 		  && !fail_if_exists))) {
4506 
4507 		os_file_handle_error_no_exit(
4508 			pathname, "CreateDirectory", false);
4509 
4510 		return(false);
4511 	}
4512 
4513 	return(true);
4514 }
4515 
4516 /** The os_file_opendir() function opens a directory stream corresponding to the
4517 directory named by the dirname argument. The directory stream is positioned
4518 at the first entry. In both Unix and Windows we automatically skip the '.'
4519 and '..' items at the start of the directory listing.
4520 @param[in]	dirname		directory name; it must not contain a trailing
4521 				'\' or '/'
4522 @param[in]	is_fatal	true if we should treat an error as a fatal
4523 				error; if we try to open symlinks then we do
4524 				not wish a fatal error if it happens not to
4525 				be a directory
4526 @return directory stream, NULL if error */
4527 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)4528 os_file_opendir(
4529 	const char*	dirname,
4530 	bool		error_is_fatal)
4531 {
4532 	os_file_dir_t		dir;
4533 	LPWIN32_FIND_DATA	lpFindFileData;
4534 	char			path[OS_FILE_MAX_PATH + 3];
4535 
4536 	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
4537 
4538 	strcpy(path, dirname);
4539 	strcpy(path + strlen(path), "\\*");
4540 
4541 	/* Note that in Windows opening the 'directory stream' also retrieves
4542 	the first entry in the directory. Since it is '.', that is no problem,
4543 	as we will skip over the '.' and '..' entries anyway. */
4544 
4545 	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
4546 		ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
4547 
4548 	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
4549 
4550 	ut_free(lpFindFileData);
4551 
4552 	if (dir == INVALID_HANDLE_VALUE) {
4553 
4554 		if (error_is_fatal) {
4555 			os_file_handle_error(dirname, "opendir");
4556 		}
4557 
4558 		return(NULL);
4559 	}
4560 
4561 	return(dir);
4562 }
4563 
4564 /** Closes a directory stream.
4565 @param[in]	dir	directory stream
4566 @return 0 if success, -1 if failure */
4567 int
os_file_closedir(os_file_dir_t dir)4568 os_file_closedir(
4569 	os_file_dir_t	dir)
4570 {
4571 	BOOL		ret;
4572 
4573 	ret = FindClose(dir);
4574 
4575 	if (!ret) {
4576 		os_file_handle_error_no_exit(NULL, "closedir", false);
4577 
4578 		return(-1);
4579 	}
4580 
4581 	return(0);
4582 }
4583 
4584 /** This function returns information of the next file in the directory. We
4585 jump over the '.' and '..' entries in the directory.
4586 @param[in]	dirname		directory name or path
4587 @param[in]	dir		directory stream
4588 @param[out]	info		buffer where the info is returned
4589 @return 0 if ok, -1 if error, 1 if at the end of the directory */
4590 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4591 os_file_readdir_next_file(
4592 	const char*	dirname,
4593 	os_file_dir_t	dir,
4594 	os_file_stat_t*	info)
4595 {
4596 	BOOL		ret;
4597 	int		status;
4598 	WIN32_FIND_DATA	find_data;
4599 
4600 next_file:
4601 
4602 	ret = FindNextFile(dir, &find_data);
4603 
4604 	if (ret > 0) {
4605 
4606 		const char* name;
4607 
4608 		name = static_cast<const char*>(find_data.cFileName);
4609 
4610 		ut_a(strlen(name) < OS_FILE_MAX_PATH);
4611 
4612 		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
4613 
4614 			goto next_file;
4615 		}
4616 
4617 		strcpy(info->name, name);
4618 
4619 		info->size = find_data.nFileSizeHigh;
4620 		info->size <<= 32;
4621 		info->size |= find_data.nFileSizeLow;
4622 
4623 		if (find_data.dwFileAttributes
4624 		    & FILE_ATTRIBUTE_REPARSE_POINT) {
4625 
4626 			/* TODO: test Windows symlinks */
4627 			/* TODO: MySQL has apparently its own symlink
4628 			implementation in Windows, dbname.sym can
4629 			redirect a database directory:
4630 			REFMAN "windows-symbolic-links.html" */
4631 
4632 			info->type = OS_FILE_TYPE_LINK;
4633 
4634 		} else if (find_data.dwFileAttributes
4635 			   & FILE_ATTRIBUTE_DIRECTORY) {
4636 
4637 			info->type = OS_FILE_TYPE_DIR;
4638 
4639 		} else {
4640 
4641 			/* It is probably safest to assume that all other
4642 			file types are normal. Better to check them rather
4643 			than blindly skip them. */
4644 
4645 			info->type = OS_FILE_TYPE_FILE;
4646 		}
4647 
4648 		status = 0;
4649 
4650 	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
4651 
4652 		status = 1;
4653 
4654 	} else {
4655 
4656 		os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
4657 
4658 		status = -1;
4659 	}
4660 
4661 	return(status);
4662 }
4663 
4664 /** NOTE! Use the corresponding macro os_file_create(), not directly
4665 this function!
4666 Opens an existing file or creates a new.
4667 @param[in]	name		name of the file or path as a null-terminated
4668 				string
4669 @param[in]	create_mode	create mode
4670 @param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
4671 				is desired, OS_FILE_NORMAL, if any normal file;
4672 				NOTE that it also depends on type, os_aio_..
4673 				and srv_.. variables whether we really use async
4674 				I/O or unbuffered I/O: look in the function
4675 				source code for the exact rules
4676 @param[in]	type		OS_DATA_FILE or OS_LOG_FILE
4677 @param[in]	success		true if succeeded
4678 @return handle to the file, not defined if error, error number
4679 	can be retrieved with os_file_get_last_error */
4680 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)4681 os_file_create_func(
4682 	const char*	name,
4683 	ulint		create_mode,
4684 	ulint		purpose,
4685 	ulint		type,
4686 	bool		read_only,
4687 	bool*		success)
4688 {
4689 	pfs_os_file_t	file;
4690 	bool		retry;
4691 	bool		on_error_no_exit;
4692 	bool		on_error_silent;
4693 
4694 	*success = false;
4695 
4696 	DBUG_EXECUTE_IF(
4697 		"ib_create_table_fail_disk_full",
4698 		*success = false;
4699 		SetLastError(ERROR_DISK_FULL);
4700 		file.m_file = OS_FILE_CLOSED;
4701 		return(file);
4702 	);
4703 
4704 	DWORD		create_flag;
4705 	DWORD		share_mode = FILE_SHARE_READ;
4706 
4707 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
4708 		? true : false;
4709 
4710 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
4711 		? true : false;
4712 
4713 	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
4714 	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
4715 
4716 	if (create_mode == OS_FILE_OPEN_RAW) {
4717 
4718 		ut_a(!read_only);
4719 
4720 		create_flag = OPEN_EXISTING;
4721 
4722 		/* On Windows Physical devices require admin privileges and
4723 		have to have the write-share mode set. See the remarks
4724 		section for the CreateFile() function documentation in MSDN. */
4725 
4726 		share_mode |= FILE_SHARE_WRITE;
4727 
4728 	} else if (create_mode == OS_FILE_OPEN
4729 		   || create_mode == OS_FILE_OPEN_RETRY) {
4730 
4731 		create_flag = OPEN_EXISTING;
4732 
4733 	} else if (read_only) {
4734 
4735 		create_flag = OPEN_EXISTING;
4736 
4737 	} else if (create_mode == OS_FILE_CREATE) {
4738 
4739 		create_flag = CREATE_NEW;
4740 
4741 	} else if (create_mode == OS_FILE_OVERWRITE) {
4742 
4743 		create_flag = CREATE_ALWAYS;
4744 
4745 	} else {
4746 		ib::error()
4747 			<< "Unknown file create mode (" << create_mode << ") "
4748 			<< " for file '" << name << "'";
4749 
4750 		file.m_file = OS_FILE_CLOSED;
4751 		return(file);
4752 	}
4753 
4754 	DWORD		attributes = 0;
4755 
4756 #ifdef UNIV_HOTBACKUP
4757 	attributes |= FILE_FLAG_NO_BUFFERING;
4758 #else
4759 	if (purpose == OS_FILE_AIO) {
4760 
4761 #ifdef WIN_ASYNC_IO
4762 		/* If specified, use asynchronous (overlapped) io and no
4763 		buffering of writes in the OS */
4764 
4765 		if (srv_use_native_aio) {
4766 			attributes |= FILE_FLAG_OVERLAPPED;
4767 		}
4768 #endif /* WIN_ASYNC_IO */
4769 
4770 	} else if (purpose == OS_FILE_NORMAL) {
4771 
4772 		/* Use default setting. */
4773 
4774 	} else {
4775 
4776 		ib::error()
4777 			<< "Unknown purpose flag (" << purpose << ") "
4778 			<< "while opening file '" << name << "'";
4779 
4780 		file.m_file = OS_FILE_CLOSED;
4781 		return(file);
4782 	}
4783 
4784 #ifdef UNIV_NON_BUFFERED_IO
4785 	// TODO: Create a bug, this looks wrong. The flush log
4786 	// parameter is dynamic.
4787 	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
4788 
4789 		/* Do not use unbuffered i/o for the log files because
4790 		value 2 denotes that we do not flush the log at every
4791 		commit, but only once per second */
4792 
4793 	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
4794 
4795 		attributes |= FILE_FLAG_NO_BUFFERING;
4796 	}
4797 #endif /* UNIV_NON_BUFFERED_IO */
4798 
4799 #endif /* UNIV_HOTBACKUP */
4800 	DWORD	access = GENERIC_READ;
4801 
4802 	if (!read_only) {
4803 		access |= GENERIC_WRITE;
4804 	}
4805 
4806 	do {
4807 		/* Use default security attributes and no template file. */
4808 		file.m_file = CreateFile(
4809 			(LPCTSTR) name, access, share_mode, NULL,
4810 			create_flag, attributes, NULL);
4811 
4812 		if (file.m_file == INVALID_HANDLE_VALUE) {
4813 			const char*	operation;
4814 
4815 			operation = (create_mode == OS_FILE_CREATE
4816 				     && !read_only)
4817 				? "create" : "open";
4818 
4819 			*success = false;
4820 
4821 			if (on_error_no_exit) {
4822 				retry = os_file_handle_error_no_exit(
4823 					name, operation, on_error_silent);
4824 			} else {
4825 				retry = os_file_handle_error(name, operation);
4826 			}
4827 		} else {
4828 
4829 			retry = false;
4830 
4831 			*success = true;
4832 
4833 			DWORD	temp;
4834 
4835 			/* This is a best effort use case, if it fails then
4836 			we will find out when we try and punch the hole. */
4837 			DeviceIoControl(
4838 				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
4839 				&temp, NULL);
4840 		}
4841 
4842 	} while (retry);
4843 
4844 	return(file);
4845 }
4846 
4847 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4848 not directly this function!
4849 A simple function to open or create a file.
4850 @param[in]	name		name of the file or path as a null-terminated
4851 				string
4852 @param[in]	create_mode	create mode
4853 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4854 				OS_FILE_READ_ALLOW_DELETE; the last option is
4855 				used by a backup program reading the file
4856 @param[out]	success		true if succeeded
4857 @return own: handle to the file, not defined if error, error number
4858 	can be retrieved with os_file_get_last_error */
4859 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4860 os_file_create_simple_no_error_handling_func(
4861 	const char*	name,
4862 	ulint		create_mode,
4863 	ulint		access_type,
4864 	bool		read_only,
4865 	bool*		success)
4866 {
4867 	pfs_os_file_t	file;
4868 
4869 	*success = false;
4870 
4871 	DWORD		access;
4872 	DWORD		create_flag;
4873 	DWORD		attributes	= 0;
4874 	DWORD		share_mode	= FILE_SHARE_READ;
4875 
4876 	ut_a(name);
4877 
4878 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4879 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4880 
4881 	if (create_mode == OS_FILE_OPEN) {
4882 
4883 		create_flag = OPEN_EXISTING;
4884 
4885 	} else if (read_only) {
4886 
4887 		create_flag = OPEN_EXISTING;
4888 
4889 	} else if (create_mode == OS_FILE_CREATE) {
4890 
4891 		create_flag = CREATE_NEW;
4892 
4893 	} else {
4894 
4895 		ib::error()
4896 			<< "Unknown file create mode (" << create_mode << ") "
4897 			<< " for file '" << name << "'";
4898 
4899 		file.m_file = OS_FILE_CLOSED;
4900 		return(file);
4901 	}
4902 
4903 	if (access_type == OS_FILE_READ_ONLY) {
4904 
4905 		access = GENERIC_READ;
4906 
4907 	} else if (read_only) {
4908 
4909 		access = GENERIC_READ;
4910 
4911 	} else if (access_type == OS_FILE_READ_WRITE) {
4912 
4913 		access = GENERIC_READ | GENERIC_WRITE;
4914 
4915 	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4916 
4917 		ut_a(!read_only);
4918 
4919 		access = GENERIC_READ;
4920 
4921 		/*!< A backup program has to give mysqld the maximum
4922 		freedom to do what it likes with the file */
4923 
4924 		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4925 	} else {
4926 
4927 		ib::error()
4928 			<< "Unknown file access type (" << access_type << ") "
4929 			<< "for file '" << name << "'";
4930 
4931 		file.m_file = OS_FILE_CLOSED;
4932 		return(file);
4933 	}
4934 
4935 	file.m_file = CreateFile((LPCTSTR) name,
4936 			  access,
4937 			  share_mode,
4938 			  NULL,			// Security attributes
4939 			  create_flag,
4940 			  attributes,
4941 			  NULL);		// No template file
4942 
4943 	*success = (file.m_file != INVALID_HANDLE_VALUE);
4944 
4945 	return(file);
4946 }
4947 
4948 /** Deletes a file if it exists. The file has to be closed before calling this.
4949 @param[in]	name		file path as a null-terminated string
4950 @param[out]	exist		indicate if file pre-exist
4951 @return true if success */
4952 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4953 os_file_delete_if_exists_func(
4954 	const char*	name,
4955 	bool*		exist)
4956 {
4957 	ulint	count	= 0;
4958 
4959 	if (exist != NULL) {
4960 		*exist = true;
4961 	}
4962 
4963 	for (;;) {
4964 		/* In Windows, deleting an .ibd file may fail if ibbackup
4965 		is copying it */
4966 
4967 		bool	ret = DeleteFile((LPCTSTR) name);
4968 
4969 		if (ret) {
4970 			return(true);
4971 		}
4972 
4973 		DWORD	lasterr = GetLastError();
4974 
4975 		if (lasterr == ERROR_FILE_NOT_FOUND
4976 		    || lasterr == ERROR_PATH_NOT_FOUND) {
4977 
4978 			/* the file does not exist, this not an error */
4979 			if (exist != NULL) {
4980 				*exist = false;
4981 			}
4982 
4983 			return(true);
4984 		}
4985 
4986 		++count;
4987 
4988 		if (count > 100 && 0 == (count % 10)) {
4989 
4990 			/* Print error information */
4991 			os_file_get_last_error(true);
4992 
4993 			ib::warn() << "Delete of file '" << name << "' failed.";
4994 		}
4995 
4996 		/* Sleep for a second */
4997 		os_thread_sleep(1000000);
4998 
4999 		if (count > 2000) {
5000 
5001 			return(false);
5002 		}
5003 	}
5004 }
5005 
5006 /** Deletes a file. The file has to be closed before calling this.
5007 @param[in]	name		File path as NUL terminated string
5008 @return true if success */
5009 bool
os_file_delete_func(const char * name)5010 os_file_delete_func(
5011 	const char*	name)
5012 {
5013 	ulint	count	= 0;
5014 
5015 	for (;;) {
5016 		/* In Windows, deleting an .ibd file may fail if ibbackup
5017 		is copying it */
5018 
5019 		BOOL	ret = DeleteFile((LPCTSTR) name);
5020 
5021 		if (ret) {
5022 			return(true);
5023 		}
5024 
5025 		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
5026 			/* If the file does not exist, we classify this as
5027 			a 'mild' error and return */
5028 
5029 			return(false);
5030 		}
5031 
5032 		++count;
5033 
5034 		if (count > 100 && 0 == (count % 10)) {
5035 
5036 			/* print error information */
5037 			os_file_get_last_error(true);
5038 
5039 			ib::warn()
5040 				<< "Cannot delete file '" << name << "'. Are "
5041 				<< "you running ibbackup to back up the file?";
5042 		}
5043 
5044 		/* sleep for a second */
5045 		os_thread_sleep(1000000);
5046 
5047 		if (count > 2000) {
5048 
5049 			return(false);
5050 		}
5051 	}
5052 
5053 	ut_error;
5054 	return(false);
5055 }
5056 
5057 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
5058 function!
5059 Renames a file (can also move it to another directory). It is safest that the
5060 file is closed before calling this function.
5061 @param[in]	oldpath		old file path as a null-terminated string
5062 @param[in]	newpath		new file path
5063 @return true if success */
5064 bool
os_file_rename_func(const char * oldpath,const char * newpath)5065 os_file_rename_func(
5066 	const char*	oldpath,
5067 	const char*	newpath)
5068 {
5069 #ifdef UNIV_DEBUG
5070 	os_file_type_t	type;
5071 	bool		exists;
5072 
5073 	/* New path must not exist. */
5074 	ut_ad(os_file_status(newpath, &exists, &type));
5075 	ut_ad(!exists);
5076 
5077 	/* Old path must exist. */
5078 	ut_ad(os_file_status(oldpath, &exists, &type));
5079 	ut_ad(exists);
5080 #endif /* UNIV_DEBUG */
5081 
5082 	if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
5083 		return(true);
5084 	}
5085 
5086 	os_file_handle_error_no_exit(oldpath, "rename", false);
5087 
5088 	return(false);
5089 }
5090 
5091 /** NOTE! Use the corresponding macro os_file_close(), not directly
5092 this function!
5093 Closes a file handle. In case of error, error number can be retrieved with
5094 os_file_get_last_error.
5095 @param[in,own]	file		Handle to a file
5096 @return true if success */
5097 bool
os_file_close_func(os_file_t file)5098 os_file_close_func(
5099 	os_file_t	file)
5100 {
5101 	ut_a(file > 0);
5102 
5103 	if (CloseHandle(file)) {
5104 		return(true);
5105 	}
5106 
5107 	os_file_handle_error(NULL, "close");
5108 
5109 	return(false);
5110 }
5111 
5112 /** Gets a file size.
5113 @param[in]	file		Handle to a file
5114 @return file size, or (os_offset_t) -1 on failure */
5115 os_offset_t
os_file_get_size(pfs_os_file_t file)5116 os_file_get_size(
5117 	pfs_os_file_t	file)
5118 {
5119 	DWORD		high;
5120 	DWORD		low;
5121 
5122 	low = GetFileSize(file.m_file, &high);
5123 
5124 	if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
5125 		return((os_offset_t) -1);
5126 	}
5127 
5128 	return(os_offset_t(low | (os_offset_t(high) << 32)));
5129 }
5130 
5131 /** Gets a file size.
5132 @param[in]	filename	Full path to the filename to check
5133 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
5134 	errno */
5135 os_file_size_t
os_file_get_size(const char * filename)5136 os_file_get_size(
5137 	const char*	filename)
5138 {
5139 	struct __stat64	s;
5140 	os_file_size_t	file_size;
5141 
5142 	int		ret = _stat64(filename, &s);
5143 
5144 	if (ret == 0) {
5145 
5146 		file_size.m_total_size = s.st_size;
5147 
5148 		DWORD	low_size;
5149 		DWORD	high_size;
5150 
5151 		low_size = GetCompressedFileSize(filename, &high_size);
5152 
5153 		if (low_size != INVALID_FILE_SIZE) {
5154 
5155 			file_size.m_alloc_size = high_size;
5156 			file_size.m_alloc_size <<= 32;
5157 			file_size.m_alloc_size |= low_size;
5158 
5159 		} else {
5160 			ib::error()
5161 				<< "GetCompressedFileSize("
5162 				<< filename << ", ..) failed.";
5163 
5164 			file_size.m_alloc_size = (os_offset_t) -1;
5165 		}
5166 	} else {
5167 		file_size.m_total_size = ~0;
5168 		file_size.m_alloc_size = (os_offset_t) ret;
5169 	}
5170 
5171 	return(file_size);
5172 }
5173 
5174 /** This function returns information about the specified file
5175 @param[in]	path		pathname of the file
5176 @param[out]	stat_info	information of a file in a directory
5177 @param[in,out]	statinfo	information of a file in a directory
5178 @param[in]	check_rw_perm	for testing whether the file can be opened
5179 				in RW mode
5180 @param[in]	read_only	true if the file is opened in read-only mode
5181 @return DB_SUCCESS if all OK */
5182 static
5183 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)5184 os_file_get_status_win32(
5185 	const char*	path,
5186 	os_file_stat_t* stat_info,
5187 	struct _stat64*	statinfo,
5188 	bool		check_rw_perm,
5189 	bool		read_only)
5190 {
5191 	int	ret = _stat64(path, statinfo);
5192 
5193 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
5194 		/* file does not exist */
5195 
5196 		return(DB_NOT_FOUND);
5197 
5198 	} else if (ret) {
5199 		/* file exists, but stat call failed */
5200 
5201 		os_file_handle_error_no_exit(path, "stat", false);
5202 
5203 		return(DB_FAIL);
5204 
5205 	} else if (_S_IFDIR & statinfo->st_mode) {
5206 
5207 		stat_info->type = OS_FILE_TYPE_DIR;
5208 
5209 	} else if (_S_IFREG & statinfo->st_mode) {
5210 
5211 		DWORD	access = GENERIC_READ;
5212 
5213 		if (!read_only) {
5214 			access |= GENERIC_WRITE;
5215 		}
5216 
5217 		stat_info->type = OS_FILE_TYPE_FILE;
5218 
5219 		/* Check if we can open it in read-only mode. */
5220 
5221 		if (check_rw_perm) {
5222 			HANDLE	fh;
5223 
5224 			fh = CreateFile(
5225 				(LPCTSTR) path,		// File to open
5226 				access,
5227 				0,			// No sharing
5228 				NULL,			// Default security
5229 				OPEN_EXISTING,		// Existing file only
5230 				FILE_ATTRIBUTE_NORMAL,	// Normal file
5231 				NULL);			// No attr. template
5232 
5233 			if (fh == INVALID_HANDLE_VALUE) {
5234 				stat_info->rw_perm = false;
5235 			} else {
5236 				stat_info->rw_perm = true;
5237 				CloseHandle(fh);
5238 			}
5239 		}
5240 
5241 		char	volname[MAX_PATH];
5242 		BOOL	result = GetVolumePathName(path, volname, MAX_PATH);
5243 
5244 		if (!result) {
5245 
5246 			ib::error()
5247 				<< "os_file_get_status_win32: "
5248 				<< "Failed to get the volume path name for: "
5249 				<< path
5250 				<< "- OS error number " << GetLastError();
5251 
5252 			return(DB_FAIL);
5253 		}
5254 
5255 		DWORD	sectorsPerCluster;
5256 		DWORD	bytesPerSector;
5257 		DWORD	numberOfFreeClusters;
5258 		DWORD	totalNumberOfClusters;
5259 
5260 		result = GetDiskFreeSpace(
5261 			(LPCSTR) volname,
5262 			&sectorsPerCluster,
5263 			&bytesPerSector,
5264 			&numberOfFreeClusters,
5265 			&totalNumberOfClusters);
5266 
5267 		if (!result) {
5268 
5269 			ib::error()
5270 				<< "GetDiskFreeSpace(" << volname << ",...) "
5271 				<< "failed "
5272 				<< "- OS error number " << GetLastError();
5273 
5274 			return(DB_FAIL);
5275 		}
5276 
5277 		stat_info->block_size = bytesPerSector * sectorsPerCluster;
5278 
5279 		/* On Windows the block size is not used as the allocation
5280 		unit for sparse files. The underlying infra-structure for
5281 		sparse files is based on NTFS compression. The punch hole
5282 		is done on a "compression unit". This compression unit
5283 		is based on the cluster size. You cannot punch a hole if
5284 		the cluster size >= 8K. For smaller sizes the table is
5285 		as follows:
5286 
5287 		Cluster Size	Compression Unit
5288 		512 Bytes		 8 KB
5289 		  1 KB			16 KB
5290 		  2 KB			32 KB
5291 		  4 KB			64 KB
5292 
5293 		Default NTFS cluster size is 4K, compression unit size of 64K.
5294 		Therefore unless the user has created the file system with
5295 		a smaller cluster size and used larger page sizes there is
5296 		little benefit from compression out of the box. */
5297 
5298 		stat_info->block_size = (stat_info->block_size <= 4096)
5299 			?  stat_info->block_size * 16 : ULINT_UNDEFINED;
5300 	} else {
5301 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
5302 	}
5303 
5304 	return(DB_SUCCESS);
5305 }
5306 
5307 /** Truncates a file to a specified size in bytes.
5308 Do nothing if the size to preserve is greater or equal to the current
5309 size of the file.
5310 @param[in]	pathname	file path
5311 @param[in]	file		file to be truncated
5312 @param[in]	size		size to preserve in bytes
5313 @return true if success */
5314 static
5315 bool
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)5316 os_file_truncate_win32(
5317 	const char*	pathname,
5318 	pfs_os_file_t	file,
5319 	os_offset_t	size)
5320 {
5321 	LARGE_INTEGER	length;
5322 
5323 	length.QuadPart = size;
5324 	BOOL	success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
5325 	if (!success) {
5326 		os_file_handle_error_no_exit(
5327 			pathname, "SetFilePointerEx", false);
5328 	} else {
5329 		success = SetEndOfFile(file.m_file);
5330 		if (!success) {
5331 			os_file_handle_error_no_exit(
5332 				pathname, "SetEndOfFile", false);
5333 		}
5334 	}
5335 	return(success);
5336 }
5337 
5338 /** Truncates a file at its current position.
5339 @param[in]	file		Handle to be truncated
5340 @return true if success */
5341 bool
os_file_set_eof(FILE * file)5342 os_file_set_eof(
5343 	FILE*		file)
5344 {
5345 	HANDLE	h = (HANDLE) _get_osfhandle(fileno(file));
5346 
5347 	return(SetEndOfFile(h));
5348 }
5349 
#ifdef UNIV_HOTBACKUP
/** Closes a file handle without reporting a failure to the error handlers.
@param[in]	file		Handle to close
@return true if success */
bool
os_file_close_no_error_handling(
	os_file_t	file)
{
	return(CloseHandle(file) != 0);
}
#endif /* UNIV_HOTBACKUP */
5361 
5362 /** This function can be called if one wants to post a batch of reads and
5363 prefers an i/o-handler thread to handle them all at once later. You must
5364 call os_aio_simulated_wake_handler_threads later to ensure the threads
5365 are not left sleeping! */
5366 void
os_aio_simulated_put_read_threads_to_sleep()5367 os_aio_simulated_put_read_threads_to_sleep()
5368 {
5369 	AIO::simulated_put_read_threads_to_sleep();
5370 }
5371 
5372 /** This function can be called if one wants to post a batch of reads and
5373 prefers an i/o-handler thread to handle them all at once later. You must
5374 call os_aio_simulated_wake_handler_threads later to ensure the threads
5375 are not left sleeping! */
5376 void
simulated_put_read_threads_to_sleep()5377 AIO::simulated_put_read_threads_to_sleep()
5378 {
5379 	/* The idea of putting background IO threads to sleep is only for
5380 	Windows when using simulated AIO. Windows XP seems to schedule
5381 	background threads too eagerly to allow for coalescing during
5382 	readahead requests. */
5383 
5384 	if (srv_use_native_aio) {
5385 		/* We do not use simulated AIO: do nothing */
5386 
5387 		return;
5388 	}
5389 
5390 	os_aio_recommend_sleep_for_read_threads	= true;
5391 
5392 	for (ulint i = 0; i < os_aio_n_segments; i++) {
5393 		AIO*	array;
5394 
5395 		get_array_and_local_segment(&array, i);
5396 
5397 		if (array == s_reads) {
5398 
5399 			os_event_reset(os_aio_segment_wait_events[i]);
5400 		}
5401 	}
5402 }
5403 
5404 #endif /* !_WIN32*/
5405 
/** Does a synchronous read or write depending upon the type specified.
In case of partial reads/writes the function tries
NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
For a compressed request the page is first passed to
os_file_compress_page(), and for an encrypted write to
os_file_encrypt_page(), before the actual IO is issued.
@param[in]	in_type		IO flags; the function works on a local copy
@param[in]	file		handle to an open file
@param[in,out]	buf		buffer to read into, or to write from
@param[in]	n		number of bytes to read/write, starting from
				offset
@param[in]	offset		file offset from the start where to read/write
@param[out]	err		DB_SUCCESS or error code; DB_IO_ERROR when the
				transfer could not be completed
@return the requested number of bytes n when the whole transfer completed,
otherwise the number of bytes transferred before giving up */
static MY_ATTRIBUTE((warn_unused_result))
ssize_t
os_file_io(
	const IORequest&in_type,
	os_file_t	file,
	void*		buf,
	ulint		n,
	os_offset_t	offset,
	dberr_t*	err)
{
	Block*		block;
	/* The caller's length, kept because n is handed by address to the
	compress/encrypt helpers below (presumably they shrink it to the
	transformed length -- confirm against those helpers). */
	ulint		original_n = n;
	IORequest	type = in_type;
	/* Running total of bytes transferred over all attempts */
	ssize_t		bytes_returned = 0;

	if (type.is_compressed()) {

		/* We don't compress the first page of any file. */
		ut_ad(offset > 0);

		block = os_file_compress_page(type, buf, &n);
	} else {
		block = NULL;
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
        if (type.is_encrypted() && type.is_write()) {
		/* We don't encrypt the first page of any file. */
		Block*	compressed_block = block;
		ut_ad(offset > 0);

		block = os_file_encrypt_page(type, buf, &n);

		/* The block returned by the compression step is released
		here; from now on only the encryption block is tracked. */
		if (compressed_block != NULL) {
			os_free_block(compressed_block);
		}
        }

	SyncFileIO	sync_file_io(file, buf, n, offset);

	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {

		ssize_t	n_bytes = sync_file_io.execute(type);

		/* Check for a hard error. Not much we can do now. */
		if (n_bytes < 0) {

			break;

		} else if ((ulint) n_bytes + bytes_returned == n) {

			/* This attempt transferred the last missing bytes */
			bytes_returned += n_bytes;

			/* Post-processing via os_file_io_complete() is done
			for every read and for compressed requests, except at
			offset 0. */
			if (offset > 0
			    && (type.is_compressed() || type.is_read())) {

				*err = os_file_io_complete(
					type, file,
					reinterpret_cast<byte*>(buf),
					NULL, original_n, offset, n);
			} else {

				*err = DB_SUCCESS;
			}

			if (block != NULL) {
				os_free_block(block);
			}

			/* Report the length the caller asked for, not the
			possibly different length that was transferred. */
			return(original_n);
		}

		/* Handle partial read/write. */

		ut_ad((ulint) n_bytes + bytes_returned < n);

		bytes_returned += (ulint) n_bytes;

		if (!type.is_partial_io_warning_disabled()) {

			const char*	op = type.is_read()
				? "read" : "written";

			ib::warn()
				<< n
				<< " bytes should have been " << op << ". Only "
				<< bytes_returned
				<< " bytes " << op << ". Retrying"
				<< " for the remaining bytes.";
		}

		/* Advance the offset and buffer by n_bytes */
		sync_file_io.advance(n_bytes);
	}

	/* Reached after a hard error or when all retries were used up */

	if (block != NULL) {
		os_free_block(block);
	}

	*err = DB_IO_ERROR;

	if (!type.is_partial_io_warning_disabled()) {
		ib::warn()
			<< "Retry attempts for "
			<< (type.is_read() ? "reading" : "writing")
			<< " partial data failed.";
	}

	return(bytes_returned);
}
5528 
5529 /** Does a synchronous write operation in Posix.
5530 @param[in]	type		IO context
5531 @param[in]	file		handle to an open file
5532 @param[out]	buf		buffer from which to write
5533 @param[in]	n		number of bytes to read, starting from offset
5534 @param[in]	offset		file offset from the start where to read
5535 @param[out]	err		DB_SUCCESS or error code
5536 @return number of bytes written, -1 if error */
5537 static MY_ATTRIBUTE((warn_unused_result))
5538 ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5539 os_file_pwrite(
5540 	IORequest&	type,
5541 	os_file_t	file,
5542 	const byte*	buf,
5543 	ulint		n,
5544 	os_offset_t	offset,
5545 	dberr_t*	err)
5546 {
5547 	ut_ad(type.validate());
5548 
5549 	++os_n_file_writes;
5550 
5551 	(void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
5552 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
5553 
5554 	ssize_t	n_bytes = os_file_io(type, file, (void*) buf, n, offset, err);
5555 
5556 	(void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
5557 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
5558 
5559 	return(n_bytes);
5560 }
5561 
5562 /** Requests a synchronous write operation.
5563 @param[in]	type		IO flags
5564 @param[in]	file		handle to an open file
5565 @param[out]	buf		buffer from which to write
5566 @param[in]	offset		file offset from the start where to read
5567 @param[in]	n		number of bytes to read, starting from offset
5568 @return DB_SUCCESS if request was successful, false if fail */
5569 static MY_ATTRIBUTE((warn_unused_result))
5570 dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)5571 os_file_write_page(
5572 	IORequest&	type,
5573 	const char*	name,
5574 	os_file_t	file,
5575 	const byte*	buf,
5576 	os_offset_t	offset,
5577 	ulint		n)
5578 {
5579 	dberr_t		err;
5580 	ut_ad(type.validate());
5581 	ut_ad(n > 0);
5582 
5583 	ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
5584 
5585 	if ((ulint) n_bytes != n && !os_has_said_disk_full) {
5586 
5587 		ib::error()
5588 			<< "Write to file " << name << "failed at offset "
5589 			<< offset << ", " << n
5590 			<< " bytes should have been written,"
5591 			" only " << n_bytes << " were written."
5592 			" Operating system error number " << errno << "."
5593 			" Check that your OS and file system"
5594 			" support files of this size."
5595 			" Check also that the disk is not full"
5596 			" or a disk quota exceeded.";
5597 
5598 		if (strerror(errno) != NULL) {
5599 
5600 			ib::error()
5601 				<< "Error number " << errno
5602 				<< " means '" << strerror(errno) << "'";
5603 		}
5604 
5605 		ib::info() << OPERATING_SYSTEM_ERROR_MSG;
5606 
5607 		os_has_said_disk_full = true;
5608 	}
5609 
5610 	return(err);
5611 }
5612 
5613 /** Does a synchronous read operation in Posix.
5614 @param[in]	type		IO flags
5615 @param[in]	file		handle to an open file
5616 @param[out]	buf		buffer where to read
5617 @param[in]	offset		file offset from the start where to read
5618 @param[in]	n		number of bytes to read, starting from offset
5619 @param[out]	err		DB_SUCCESS or error code
5620 @return number of bytes read, -1 if error */
5621 static MY_ATTRIBUTE((warn_unused_result))
5622 ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5623 os_file_pread(
5624 	IORequest&	type,
5625 	os_file_t	file,
5626 	void*		buf,
5627 	ulint		n,
5628 	os_offset_t	offset,
5629 	dberr_t*	err)
5630 {
5631 	++os_n_file_reads;
5632 
5633 	(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
5634 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
5635 
5636 	ssize_t	n_bytes = os_file_io(type, file, buf, n, offset, err);
5637 
5638 	(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
5639 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
5640 
5641 	return(n_bytes);
5642 }
5643 
/** Requests a synchronous positioned read operation.
The read is repeated, continuing after the bytes already received, for as
long as the error handler asks for a retry. When the error handler reports
a hard error the function ends in ib::fatal().
@param[in]	type		IO flags
@param[in]	file		handle to an open file
@param[out]	buf		buffer where to read
@param[in]	offset		file offset from the start where to read
@param[in]	n		number of bytes to read, starting from offset
@param[out]	o		if not NULL, receives the byte count returned
				by the most recent read attempt
@param[in]	exit_on_err	if true, a failed read is handled by
				os_file_handle_error(); if false, an error
				code from the read is returned to the caller
@return DB_SUCCESS or error code */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
os_file_read_page(
	IORequest&	type,
	os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	ulint*		o,
	bool		exit_on_err)
{
	dberr_t		err;

	/* The requested size is accounted for before the read is attempted */
	os_bytes_read_since_printout += n;

	ut_ad(type.validate());
	ut_ad(n > 0);

	for (;;) {
		ssize_t	n_bytes;

		n_bytes = os_file_pread(type, file, buf, n, offset, &err);

		if (o != NULL) {
			*o = n_bytes;
		}

		if (err != DB_SUCCESS && !exit_on_err) {

			/* The caller asked to handle errors itself */
			return(err);

		} else if ((ulint) n_bytes == n) {

			/** The read will succeed but decompress can fail
			for various reasons. */

			if (type.is_compression_enabled()
			    && !Compression::is_compressed_page(
				    static_cast<byte*>(buf))) {

				return(DB_SUCCESS);

			} else {
				return(err);
			}
		}

		ib::error() << "Tried to read " << n
			<< " bytes at offset " << offset
			<< ", but was only able to read " << n_bytes;

		/* Ask the error handler whether the read should be retried;
		a false answer ends the loop and leads to ib::fatal(). */
		if (exit_on_err) {

			if (!os_file_handle_error(NULL, "read")) {
				/* Hard error */
				break;
			}

		} else if (!os_file_handle_error_no_exit(NULL, "read", false)) {

			/* Hard error */
			break;
		}

		/* After a partial read, continue behind the bytes that
		were already received. */
		if (n_bytes > 0 && (ulint) n_bytes < n) {
			n -= (ulint) n_bytes;
			offset += (ulint) n_bytes;
			buf = reinterpret_cast<uchar*>(buf) + (ulint) n_bytes;
		}
	}

	ib::fatal()
		<< "Cannot read from file. OS error number "
		<< errno << ".";

	return(err);
}
5731 
5732 /** Retrieves the last error number if an error occurs in a file io function.
5733 The number should be retrieved before any other OS calls (because they may
5734 overwrite the error number). If the number is not known to this program,
5735 the OS error number + 100 is returned.
5736 @param[in]	report_all_errors	true if we want an error printed
5737 					for all errors
5738 @return error number, or OS error number + 100 */
5739 ulint
os_file_get_last_error(bool report_all_errors)5740 os_file_get_last_error(
5741 	bool	report_all_errors)
5742 {
5743 	return(os_file_get_last_error_low(report_all_errors, false));
5744 }
5745 
5746 /** Does error handling when a file operation fails.
5747 Conditionally exits (calling srv_fatal_error()) based on should_exit value
5748 and the error type, if should_exit is true then on_error_silent is ignored.
5749 @param[in]	name		name of a file or NULL
5750 @param[in]	operation	operation
5751 @param[in]	should_exit	call srv_fatal_error() on an unknown error,
5752 				if this parameter is true
5753 @param[in]	on_error_silent	if true then don't print any message to the log
5754 				iff it is an unknown non-fatal error
5755 @return true if we should retry the operation */
5756 static MY_ATTRIBUTE((warn_unused_result))
5757 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)5758 os_file_handle_error_cond_exit(
5759 	const char*	name,
5760 	const char*	operation,
5761 	bool		should_exit,
5762 	bool		on_error_silent)
5763 {
5764 	ulint	err;
5765 
5766 	err = os_file_get_last_error_low(false, on_error_silent);
5767 
5768 	switch (err) {
5769 	case OS_FILE_DISK_FULL:
5770 		/* We only print a warning about disk full once */
5771 
5772 		if (os_has_said_disk_full) {
5773 
5774 			return(false);
5775 		}
5776 
5777 		/* Disk full error is reported irrespective of the
5778 		on_error_silent setting. */
5779 
5780 		if (name) {
5781 
5782 			ib::error()
5783 				<< "Encountered a problem with file '"
5784 				<< name << "'";
5785 		}
5786 
5787 		ib::error()
5788 			<< "Disk is full. Try to clean the disk to free space.";
5789 
5790 		os_has_said_disk_full = true;
5791 
5792 		return(false);
5793 
5794 	case OS_FILE_AIO_RESOURCES_RESERVED:
5795 	case OS_FILE_AIO_INTERRUPTED:
5796 
5797 		return(true);
5798 
5799 	case OS_FILE_PATH_ERROR:
5800 	case OS_FILE_ALREADY_EXISTS:
5801 	case OS_FILE_ACCESS_VIOLATION:
5802 
5803 		return(false);
5804 
5805 	case OS_FILE_SHARING_VIOLATION:
5806 
5807 		os_thread_sleep(10000000);	/* 10 sec */
5808 		return(true);
5809 
5810 	case OS_FILE_OPERATION_ABORTED:
5811 	case OS_FILE_INSUFFICIENT_RESOURCE:
5812 
5813 		os_thread_sleep(100000);	/* 100 ms */
5814 		return(true);
5815 
5816 	default:
5817 
5818 		/* If it is an operation that can crash on error then it
5819 		is better to ignore on_error_silent and print an error message
5820 		to the log. */
5821 
5822 		if (should_exit || !on_error_silent) {
5823 			ib::error() << "File "
5824 				<< (name != NULL ? name : "(unknown)")
5825 				<< ": '" << operation << "'"
5826 				" returned OS error " << err << "."
5827 				<< (should_exit
5828 				    ? " Cannot continue operation" : "");
5829 		}
5830 
5831 		if (should_exit) {
5832 			srv_fatal_error();
5833 		}
5834 	}
5835 
5836 	return(false);
5837 }
5838 
5839 /** Does error handling when a file operation fails.
5840 @param[in]	name		name of a file or NULL
5841 @param[in]	operation	operation name that failed
5842 @return true if we should retry the operation */
5843 static
5844 bool
os_file_handle_error(const char * name,const char * operation)5845 os_file_handle_error(
5846 	const char*	name,
5847 	const char*	operation)
5848 {
5849 	/* Exit in case of unknown error */
5850 	return(os_file_handle_error_cond_exit(name, operation, true, false));
5851 }
5852 
5853 /** Does error handling when a file operation fails.
5854 @param[in]	name		name of a file or NULL
5855 @param[in]	operation	operation name that failed
5856 @param[in]	on_error_silent	if true then don't print any message to the log.
5857 @return true if we should retry the operation */
5858 static
5859 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)5860 os_file_handle_error_no_exit(
5861 	const char*	name,
5862 	const char*	operation,
5863 	bool		on_error_silent)
5864 {
5865 	/* Don't exit in case of unknown error */
5866 	return(os_file_handle_error_cond_exit(
5867 			name, operation, false, on_error_silent));
5868 }
5869 
5870 /** Tries to disable OS caching on an opened file descriptor.
5871 @param[in]	fd		file descriptor to alter
5872 @param[in]	file_name	file name, used in the diagnostic message
5873 @param[in]	name		"open" or "create"; used in the diagnostic
5874 				message */
5875 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5876 os_file_set_nocache(
5877 	int		fd		MY_ATTRIBUTE((unused)),
5878 	const char*	file_name	MY_ATTRIBUTE((unused)),
5879 	const char*	operation_name	MY_ATTRIBUTE((unused)))
5880 {
5881 	/* some versions of Solaris may not have DIRECTIO_ON */
5882 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5883 	if (directio(fd, DIRECTIO_ON) == -1) {
5884 		int	errno_save = errno;
5885 
5886 		ib::error()
5887 			<< "Failed to set DIRECTIO_ON on file "
5888 			<< file_name << ": " << operation_name
5889 			<< strerror(errno_save) << ","
5890 			" continuing anyway.";
5891 	}
5892 #elif defined(O_DIRECT)
5893 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5894 		int		errno_save = errno;
5895 		static bool	warning_message_printed = false;
5896 		if (errno_save == EINVAL) {
5897 			if (!warning_message_printed) {
5898 				warning_message_printed = true;
5899 # ifdef UNIV_LINUX
5900 				ib::warn()
5901 					<< "Failed to set O_DIRECT on file"
5902 					<< file_name << ";" << operation_name
5903 					<< ": " << strerror(errno_save) << ", "
5904 					<< "continuing anyway. O_DIRECT is "
5905 					"known to result in 'Invalid argument' "
5906 					"on Linux on tmpfs, "
5907 					"see MySQL Bug#26662.";
5908 # else /* UNIV_LINUX */
5909 				goto short_warning;
5910 # endif /* UNIV_LINUX */
5911 			}
5912 		} else {
5913 # ifndef UNIV_LINUX
5914 short_warning:
5915 # endif
5916 			ib::warn()
5917 				<< "Failed to set O_DIRECT on file "
5918 				<< file_name << "; " << operation_name
5919 				<< " : " << strerror(errno_save)
5920 				<< " continuing anyway.";
5921 		}
5922 	}
5923 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5924 }
5925 
5926 /** Write the specified number of zeros to a newly created file.
5927 @param[in]	name		name of the file or path as a null-terminated
5928 				string
5929 @param[in]	file		handle to a file
5930 @param[in]	size		file size
5931 @param[in]	read_only	Enable read-only checks if true
5932 @return true if success */
5933 bool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size,bool read_only)5934 os_file_set_size(
5935 	const char*	name,
5936 	pfs_os_file_t	file,
5937 	os_offset_t	size,
5938 	bool		read_only)
5939 {
5940 	/* Write up to 1 megabyte at a time. */
5941 	ulint	buf_size = ut_min(
5942 		static_cast<ulint>(64),
5943 		static_cast<ulint>(size / UNIV_PAGE_SIZE));
5944 
5945 	buf_size *= UNIV_PAGE_SIZE;
5946 
5947 	/* Align the buffer for possible raw i/o */
5948 	byte*	buf2;
5949 
5950 	buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
5951 
5952 	byte*	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
5953 
5954 	/* Write buffer full of zeros */
5955 	memset(buf, 0, buf_size);
5956 
5957 	if (size >= (os_offset_t) 100 << 20) {
5958 
5959 		ib::info() << "Progress in MB:";
5960 	}
5961 
5962 	os_offset_t	current_size = 0;
5963 
5964 	while (current_size < size) {
5965 		ulint	n_bytes;
5966 
5967 		if (size - current_size < (os_offset_t) buf_size) {
5968 			n_bytes = (ulint) (size - current_size);
5969 		} else {
5970 			n_bytes = buf_size;
5971 		}
5972 
5973 		dberr_t		err;
5974 		IORequest	request(IORequest::WRITE);
5975 
5976 #ifdef UNIV_HOTBACKUP
5977 
5978 		err = os_file_write(
5979 			request, name, file, buf, current_size, n_bytes);
5980 #else
5981 		/* Using OS_AIO_SYNC mode on POSIX systems will result in
5982 		fall back to os_file_write/read. On Windows it will use
5983 		special mechanism to wait before it returns back. */
5984 
5985 		err = os_aio(
5986 			request,
5987 			OS_AIO_SYNC, name,
5988 			file, buf, current_size, n_bytes,
5989 			read_only, NULL, NULL);
5990 #endif /* UNIV_HOTBACKUP */
5991 
5992 		if (err != DB_SUCCESS) {
5993 
5994 			ut_free(buf2);
5995 			return(false);
5996 		}
5997 
5998 		/* Print about progress for each 100 MB written */
5999 		if ((current_size + n_bytes) / (100 << 20)
6000 		    != current_size / (100 << 20)) {
6001 
6002 			fprintf(stderr, " %lu00",
6003 				(ulong) ((current_size + n_bytes)
6004 					 / (100 << 20)));
6005 		}
6006 
6007 		current_size += n_bytes;
6008 	}
6009 
6010 	if (size >= (os_offset_t) 100 << 20) {
6011 
6012 		fprintf(stderr, "\n");
6013 	}
6014 
6015 	ut_free(buf2);
6016 
6017 	return(os_file_flush(file));
6018 }
6019 
6020 /** Truncates a file to a specified size in bytes.
6021 Do nothing if the size to preserve is greater or equal to the current
6022 size of the file.
6023 @param[in]	pathname	file path
6024 @param[in]	file		file to be truncated
6025 @param[in]	size		size to preserve in bytes
6026 @return true if success */
6027 bool
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)6028 os_file_truncate(
6029 	const char*	pathname,
6030 	pfs_os_file_t	file,
6031 	os_offset_t	size)
6032 {
6033 	/* Do nothing if the size preserved is larger than or equal to the
6034 	current size of file */
6035 	os_offset_t	size_bytes = os_file_get_size(file);
6036 
6037 	if (size >= size_bytes) {
6038 		return(true);
6039 	}
6040 
6041 #ifdef _WIN32
6042 	return(os_file_truncate_win32(pathname, file, size));
6043 #else /* _WIN32 */
6044 	return(os_file_truncate_posix(pathname, file, size));
6045 #endif /* _WIN32 */
6046 }
6047 
6048 /** NOTE! Use the corresponding macro os_file_read(), not directly this
6049 function!
6050 Requests a synchronous positioned read operation.
6051 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6052 @param[in]	type		IO flags
6053 @param[in]	file		handle to an open file
6054 @param[out]	buf		buffer where to read
6055 @param[in]	offset		file offset from the start where to read
6056 @param[in]	n		number of bytes to read, starting from offset
6057 @return DB_SUCCESS or error code */
6058 dberr_t
os_file_read_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)6059 os_file_read_func(
6060 	IORequest&	type,
6061 	os_file_t	file,
6062 	void*		buf,
6063 	os_offset_t	offset,
6064 	ulint		n)
6065 {
6066 	ut_ad(type.is_read());
6067 
6068 	return(os_file_read_page(type, file, buf, offset, n, NULL, true));
6069 }
6070 
6071 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
6072 not directly this function!
6073 Requests a synchronous positioned read operation.
6074 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6075 @param[in]	type		IO flags
6076 @param[in]	file		handle to an open file
6077 @param[out]	buf		buffer where to read
6078 @param[in]	offset		file offset from the start where to read
6079 @param[in]	n		number of bytes to read, starting from offset
6080 @param[out]	o		number of bytes actually read
6081 @return DB_SUCCESS or error code */
6082 dberr_t
os_file_read_no_error_handling_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)6083 os_file_read_no_error_handling_func(
6084 	IORequest&	type,
6085 	os_file_t	file,
6086 	void*		buf,
6087 	os_offset_t	offset,
6088 	ulint		n,
6089 	ulint*		o)
6090 {
6091 	ut_ad(type.is_read());
6092 
6093 	return(os_file_read_page(type, file, buf, offset, n, o, false));
6094 }
6095 
6096 /** NOTE! Use the corresponding macro os_file_write(), not directly
6097 Requests a synchronous write operation.
6098 @param[in]	type		IO flags
6099 @param[in]	file		handle to an open file
6100 @param[out]	buf		buffer from which to write
6101 @param[in]	offset		file offset from the start where to read
6102 @param[in]	n		number of bytes to read, starting from offset
6103 @return DB_SUCCESS if request was successful, false if fail */
6104 dberr_t
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)6105 os_file_write_func(
6106 	IORequest&	type,
6107 	const char*	name,
6108 	os_file_t	file,
6109 	const void*	buf,
6110 	os_offset_t	offset,
6111 	ulint		n)
6112 {
6113 	ut_ad(type.validate());
6114 	ut_ad(type.is_write());
6115 
6116 	/* We never compress the first page.
6117 	Note: This assumes we always do block IO. */
6118 	if (offset == 0) {
6119 		type.clear_compressed();
6120 	}
6121 
6122 	const byte*	ptr = reinterpret_cast<const byte*>(buf);
6123 
6124 	return(os_file_write_page(type, name, file, ptr, offset, n));
6125 }
6126 
6127 /** Check the existence and type of the given file.
6128 @param[in]	path		path name of file
6129 @param[out]	exists		true if the file exists
6130 @param[out]	type		Type of the file, if it exists
6131 @return true if call succeeded */
6132 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)6133 os_file_status(
6134 	const char*	path,
6135 	bool*		exists,
6136 	os_file_type_t* type)
6137 {
6138 #ifdef _WIN32
6139 	return(os_file_status_win32(path, exists, type));
6140 #else
6141 	return(os_file_status_posix(path, exists, type));
6142 #endif /* _WIN32 */
6143 }
6144 
6145 /** Free storage space associated with a section of the file.
6146 @param[in]	fh		Open file handle
6147 @param[in]	off		Starting offset (SEEK_SET)
6148 @param[in]	len		Size of the hole
6149 @return DB_SUCCESS or error code */
6150 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)6151 os_file_punch_hole(
6152 	os_file_t	fh,
6153 	os_offset_t	off,
6154 	os_offset_t	len)
6155 {
6156 	/* In this debugging mode, we act as if punch hole is supported,
6157 	and then skip any calls to actually punch a hole here.
6158 	In this way, Transparent Page Compression is still being tested. */
6159 	DBUG_EXECUTE_IF("ignore_punch_hole",
6160 		return(DB_SUCCESS);
6161 	);
6162 
6163 #ifdef _WIN32
6164 	return(os_file_punch_hole_win32(fh, off, len));
6165 #else
6166 	return(os_file_punch_hole_posix(fh, off, len));
6167 #endif /* _WIN32 */
6168 }
6169 
6170 /** Check if the file system supports sparse files.
6171 
6172 Warning: On POSIX systems we try and punch a hole from offset 0 to
6173 the system configured page size. This should only be called on an empty
6174 file.
6175 
6176 Note: On Windows we use the name and on Unices we use the file handle.
6177 
6178 @param[in]	name		File name
6179 @param[in]	fh		File handle for the file - if opened
6180 @return true if the file system supports sparse files */
6181 bool
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)6182 os_is_sparse_file_supported(const char* path, pfs_os_file_t fh)
6183 {
6184 	/* In this debugging mode, we act as if punch hole is supported,
6185 	then we skip any calls to actually punch a hole.  In this way,
6186 	Transparent Page Compression is still being tested. */
6187 	DBUG_EXECUTE_IF("ignore_punch_hole",
6188 		return(true);
6189 	);
6190 
6191 #ifdef _WIN32
6192 	return(os_is_sparse_file_supported_win32(path));
6193 #else
6194 	dberr_t	err;
6195 
6196 	/* We don't know the FS block size, use the sector size. The FS
6197 	will do the magic. */
6198 	err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
6199 
6200 	return(err == DB_SUCCESS);
6201 #endif /* _WIN32 */
6202 }
6203 
6204 /** This function returns information about the specified file
6205 @param[in]	path		pathname of the file
6206 @param[out]	stat_info	information of a file in a directory
6207 @param[in]	check_rw_perm	for testing whether the file can be opened
6208 				in RW mode
6209 @param[in]	read_only	true if file is opened in read-only mode
6210 @return DB_SUCCESS if all OK */
6211 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)6212 os_file_get_status(
6213 	const char*	path,
6214 	os_file_stat_t* stat_info,
6215 	bool		check_rw_perm,
6216 	bool		read_only)
6217 {
6218 	dberr_t	ret;
6219 
6220 #ifdef _WIN32
6221 	struct _stat64	info;
6222 
6223 	ret = os_file_get_status_win32(
6224 		path, stat_info, &info, check_rw_perm, read_only);
6225 
6226 #else
6227 	struct stat	info;
6228 
6229 	ret = os_file_get_status_posix(
6230 		path, stat_info, &info, check_rw_perm, read_only);
6231 
6232 #endif /* _WIN32 */
6233 
6234 	if (ret == DB_SUCCESS) {
6235 		stat_info->ctime = info.st_ctime;
6236 		stat_info->atime = info.st_atime;
6237 		stat_info->mtime = info.st_mtime;
6238 		stat_info->size  = info.st_size;
6239 	}
6240 
6241 	return(ret);
6242 }
6243 
6244 /**
6245 Waits for an AIO operation to complete. This function is used to wait the
6246 for completed requests. The aio array of pending requests is divided
6247 into segments. The thread specifies which segment or slot it wants to wait
6248 for. NOTE: this function will also take care of freeing the aio slot,
6249 therefore no other thread is allowed to do the freeing!
6250 @param[in]	segment		The number of the segment in the aio arrays to
6251 				wait for; segment 0 is the ibuf I/O thread,
6252 				segment 1 the log I/O thread, then follow the
6253 				non-ibuf read threads, and as the last are the
6254 				non-ibuf write threads; if this is
6255 				ULINT_UNDEFINED, then it means that sync AIO
6256 				is used, and this parameter is ignored
6257 @param[out]	m1		the messages passed with the AIO request; note
6258 				that also in the case where the AIO operation
6259 				failed, these output parameters are valid and
6260 				can be used to restart the operation,
6261 				for example
6262 @param[out]	m2		callback message
6263 @param[out]	type		OS_FILE_WRITE or ..._READ
6264 @return DB_SUCCESS or error code */
6265 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6266 os_aio_handler(
6267 	ulint		segment,
6268 	fil_node_t**	m1,
6269 	void**		m2,
6270 	IORequest*	request)
6271 {
6272 	dberr_t	err;
6273 
6274 	if (srv_use_native_aio) {
6275 		srv_set_io_thread_op_info(segment, "native aio handle");
6276 
6277 #ifdef WIN_ASYNC_IO
6278 
6279 		err = os_aio_windows_handler(segment, 0, m1, m2, request);
6280 
6281 #elif defined(LINUX_NATIVE_AIO)
6282 
6283 		err = os_aio_linux_handler(segment, m1, m2, request);
6284 
6285 #else
6286 		ut_error;
6287 
6288 		err = DB_ERROR; /* Eliminate compiler warning */
6289 
6290 #endif /* WIN_ASYNC_IO */
6291 
6292 	} else {
6293 		srv_set_io_thread_op_info(segment, "simulated aio handle");
6294 
6295 		err = os_aio_simulated_handler(segment, m1, m2, request);
6296 	}
6297 
6298 	return(err);
6299 }
6300 
/** Constructor.
Sizes the slot array, creates the array mutex and the "not full" and
"is empty" events, and leaves the "is empty" event set. The platform
specific parts (Linux io contexts, Windows handles) are only
value-initialized here.
@param[in]	id		The latch ID, used for creating the mutex
@param[in]	n		Number of AIO slots, must be greater than 0
@param[in]	segments	Number of segments, must be greater than 0 */
AIO::AIO(
	latch_id_t	id,
	ulint		n,
	ulint		segments)
	:
	m_slots(n),
	m_n_segments(segments),
	m_n_reserved()
# ifdef LINUX_NATIVE_AIO
	/* One event entry per slot; sized from m_slots, which is listed
	earlier in this initializer list */
	,m_aio_ctx(),
	m_events(m_slots.size())
# elif defined(_WIN32)
	,m_handles()
# endif /* LINUX_NATIVE_AIO */
{
	ut_a(n > 0);
	ut_a(m_n_segments > 0);

	mutex_create(id, &m_mutex);

	m_not_full = os_event_create("aio_not_full");
	m_is_empty = os_event_create("aio_is_empty");

	/* Set every slot to a default-constructed Slot */
	std::uninitialized_fill(m_slots.begin(), m_slots.end(), Slot());
#ifdef LINUX_NATIVE_AIO
	/* Zero the whole event array */
	memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
#endif /* LINUX_NATIVE_AIO */

	/* Nothing is reserved yet, so the array starts out empty */
	os_event_set(m_is_empty);
}
6335 
6336 /** Initialise the slots */
6337 dberr_t
init_slots()6338 AIO::init_slots()
6339 {
6340 	for (ulint i = 0; i < m_slots.size(); ++i) {
6341 		Slot&	slot = m_slots[i];
6342 
6343 		slot.pos = static_cast<uint16_t>(i);
6344 
6345 		slot.is_reserved = false;
6346 
6347 #ifdef WIN_ASYNC_IO
6348 
6349 		slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6350 
6351 		OVERLAPPED*	over = &slot.control;
6352 
6353 		over->hEvent = slot.handle;
6354 
6355 		(*m_handles)[i] = over->hEvent;
6356 
6357 #elif defined(LINUX_NATIVE_AIO)
6358 
6359 		slot.ret = 0;
6360 
6361 		slot.n_bytes = 0;
6362 
6363 		memset(&slot.control, 0x0, sizeof(slot.control));
6364 
6365 #endif /* WIN_ASYNC_IO */
6366 	}
6367 
6368 	return(DB_SUCCESS);
6369 }
6370 
#ifdef LINUX_NATIVE_AIO
/** Set up the Linux native AIO state of this array: allocate the table of
io_context pointers and create one io_context per segment.
@return DB_SUCCESS, DB_OUT_OF_MEMORY if the table could not be allocated,
or DB_IO_ERROR if an io_context could not be created */
dberr_t
AIO::init_linux_native_aio()
{
	/* The context table must not have been created before. */
	ut_a(m_aio_ctx == NULL);

	m_aio_ctx = static_cast<io_context**>(
		ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));

	if (m_aio_ctx == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	const ulint	events_per_ctx = slots_per_segment();

	for (ulint seg = 0; seg < m_n_segments; ++seg) {

		if (linux_create_io_ctx(events_per_ctx, &m_aio_ctx[seg])) {
			continue;
		}

		/* AIO setup failed. Give up immediately and do not
		bother about leaks: when the io subsystem cannot be
		initialized the server (or at least the InnoDB storage
		engine) is not going to start up. */
		return(DB_IO_ERROR);
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
6407 
6408 /** Initialise the array */
6409 dberr_t
init()6410 AIO::init()
6411 {
6412 	ut_a(!m_slots.empty());
6413 
6414 #ifdef _WIN32
6415 	ut_a(m_handles == NULL);
6416 
6417 	m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
6418 #endif /* _WIN32 */
6419 
6420 	if (srv_use_native_aio) {
6421 #ifdef LINUX_NATIVE_AIO
6422 		dberr_t	err = init_linux_native_aio();
6423 
6424 		if (err != DB_SUCCESS) {
6425 			return(err);
6426 		}
6427 
6428 #endif /* LINUX_NATIVE_AIO */
6429 	}
6430 
6431 	return(init_slots());
6432 }
6433 
6434 /** Creates an aio wait array. Note that we return NULL in case of failure.
6435 We don't care about freeing memory here because we assume that a
6436 failure will result in server refusing to start up.
6437 @param[in]	id		Latch ID
6438 @param[in]	n		maximum number of pending AIO operations
6439 				allowed; n must be divisible by m_n_segments
6440 @param[in]	n_segments	number of segments in the AIO array
6441 @return own: AIO array, NULL on failure */
6442 AIO*
create(latch_id_t id,ulint n,ulint n_segments)6443 AIO::create(
6444 	latch_id_t	id,
6445 	ulint		n,
6446 	ulint		n_segments)
6447 {
6448 	if ((n % n_segments)) {
6449 
6450 		ib::error()
6451 			<< "Maximum number of AIO operations must be "
6452 			<< "divisible by number of segments";
6453 
6454 		return(NULL);
6455 	}
6456 
6457 	AIO*	array = UT_NEW_NOKEY(AIO(id, n, n_segments));
6458 
6459 	if (array != NULL && array->init() != DB_SUCCESS) {
6460 
6461 		UT_DELETE(array);
6462 
6463 		array = NULL;
6464 	}
6465 
6466 	return(array);
6467 }
6468 
6469 /** AIO destructor */
~AIO()6470 AIO::~AIO()
6471 {
6472 #ifdef WIN_ASYNC_IO
6473 	for (ulint i = 0; i < m_slots.size(); ++i) {
6474 		CloseHandle(m_slots[i].handle);
6475 	}
6476 #endif /* WIN_ASYNC_IO */
6477 
6478 #ifdef _WIN32
6479 	UT_DELETE(m_handles);
6480 #endif /* _WIN32 */
6481 
6482 	mutex_destroy(&m_mutex);
6483 
6484 	os_event_destroy(m_not_full);
6485 	os_event_destroy(m_is_empty);
6486 
6487 #if defined(LINUX_NATIVE_AIO)
6488 	if (srv_use_native_aio) {
6489 		m_events.clear();
6490 		ut_free(m_aio_ctx);
6491 	}
6492 #endif /* LINUX_NATIVE_AIO */
6493 
6494 	m_slots.clear();
6495 }
6496 
6497 /** Initializes the asynchronous io system. Creates one array each for ibuf
6498 and log i/o. Also creates one array each for read and write where each
6499 array is divided logically into n_readers and n_writers
6500 respectively. The caller must create an i/o handler thread for each
6501 segment in these arrays. This function also creates the sync array.
6502 No i/o handler thread needs to be created for that
6503 @param[in]	n_per_seg	maximum number of pending aio
6504 				operations allowed per segment
6505 @param[in]	n_readers	number of reader threads
6506 @param[in]	n_writers	number of writer threads
6507 @param[in]	n_slots_sync	number of slots in the sync aio array
6508 @return true if the AIO sub-system was started successfully */
6509 bool
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)6510 AIO::start(
6511 	ulint		n_per_seg,
6512 	ulint		n_readers,
6513 	ulint		n_writers,
6514 	ulint		n_slots_sync)
6515 {
6516 #if defined(LINUX_NATIVE_AIO)
6517 	/* Check if native aio is supported on this system and tmpfs */
6518 	if (srv_use_native_aio && !is_linux_native_aio_supported()) {
6519 
6520 		ib::warn() << "Linux Native AIO disabled.";
6521 
6522 		srv_use_native_aio = FALSE;
6523 	}
6524 #endif /* LINUX_NATIVE_AIO */
6525 
6526 	srv_reset_io_thread_op_info();
6527 
6528 	s_reads = create(
6529 		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
6530 
6531 	if (s_reads == NULL) {
6532 		return(false);
6533 	}
6534 
6535 	ulint	start = srv_read_only_mode ? 0 : 2;
6536 	ulint	n_segs = n_readers + start;
6537 
6538 	/* 0 is the ibuf segment and 1 is the redo log segment. */
6539 	for (ulint i = start; i < n_segs; ++i) {
6540 		ut_a(i < SRV_MAX_N_IO_THREADS);
6541 		srv_io_thread_function[i] = "read thread";
6542 	}
6543 
6544 	ulint	n_segments = n_readers;
6545 
6546 	if (!srv_read_only_mode) {
6547 
6548 		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
6549 
6550 		if (s_ibuf == NULL) {
6551 			return(false);
6552 		}
6553 
6554 		++n_segments;
6555 
6556 		srv_io_thread_function[0] = "insert buffer thread";
6557 
6558 		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
6559 
6560 		if (s_log == NULL) {
6561 			return(false);
6562 		}
6563 
6564 		++n_segments;
6565 
6566 		srv_io_thread_function[1] = "log thread";
6567 
6568 	} else {
6569 		s_ibuf = s_log = NULL;
6570 	}
6571 
6572 	s_writes = create(
6573 		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
6574 
6575 	if (s_writes == NULL) {
6576 		return(false);
6577 	}
6578 
6579 	n_segments += n_writers;
6580 
6581 	for (ulint i = start + n_readers; i < n_segments; ++i) {
6582 		ut_a(i < SRV_MAX_N_IO_THREADS);
6583 		srv_io_thread_function[i] = "write thread";
6584 	}
6585 
6586 	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
6587 
6588 	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
6589 
6590 	if (s_sync == NULL) {
6591 
6592 		return(false);
6593 	}
6594 
6595 	os_aio_n_segments = n_segments;
6596 
6597 	os_aio_validate();
6598 
6599 	os_aio_segment_wait_events = static_cast<os_event_t*>(
6600 		ut_zalloc_nokey(
6601 			n_segments * sizeof *os_aio_segment_wait_events));
6602 
6603 	if (os_aio_segment_wait_events == NULL) {
6604 
6605 		return(false);
6606 	}
6607 
6608 	for (ulint i = 0; i < n_segments; ++i) {
6609 		os_aio_segment_wait_events[i] = os_event_create(0);
6610 	}
6611 
6612 	os_last_printout = ut_time_monotonic();
6613 
6614 	return(true);
6615 }
6616 
6617 /** Free the AIO arrays */
6618 void
shutdown()6619 AIO::shutdown()
6620 {
6621 	UT_DELETE(s_ibuf);
6622 	s_ibuf = NULL;
6623 
6624 	UT_DELETE(s_log);
6625 	s_log = NULL;
6626 
6627 	UT_DELETE(s_writes);
6628 	s_writes = NULL;
6629 
6630 	UT_DELETE(s_sync);
6631 	s_sync = NULL;
6632 
6633 	UT_DELETE(s_reads);
6634 	s_reads = NULL;
6635 }
6636 
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)

/** Max disk sector size */
static const ulint	MAX_SECTOR_SIZE = 4096;

/**
Try and get the FusionIO sector size.
Only does anything when the flush method is O_DIRECT or
O_DIRECT_NO_FSYNC. A temporary file "check_sector_size" is created with
O_DIRECT next to the data home directory and written with buffers aligned
to growing powers of two, starting at UNIV_SECTOR_SIZE. The first alignment
for which the write succeeds is stored in os_io_ptr_align. When no write
succeeds the alignment is capped at MAX_SECTOR_SIZE. The temporary file is
removed again and errno is reset to 0 before returning. */
void
os_fusionio_get_sector_size()
{
	if (srv_unix_file_flush_method != SRV_UNIX_O_DIRECT
	    && srv_unix_file_flush_method != SRV_UNIX_O_DIRECT_NO_FSYNC) {
		return;
	}

	static const char	check_file_suffix[] = "/check_sector_size";

	ulint		sector_size = UNIV_SECTOR_SIZE;
	char*		path = srv_data_home;
	char		current_dir[3];

	/* If the srv_data_home is empty, set the path to
	current dir. */
	if (*path == 0) {
		current_dir[0] = FN_CURLIB;
		current_dir[1] = FN_LIBCHAR;
		current_dir[2] = 0;
		path = current_dir;
	}

	/* Get the path of data file: everything up to the last path
	separator, or the whole string when there is none. */
	const char*	dir_end = strrchr(path, OS_PATH_SEPARATOR);
	const ulint	dir_len = (dir_end != NULL)
		? static_cast<ulint>(dir_end - path)
		: strlen(path);

	/* Allocate a new path; sizeof includes the terminating NUL. */
	char*	check_file_name = static_cast<char*>(
		ut_zalloc_nokey(dir_len + sizeof(check_file_suffix)));

	if (check_file_name == NULL) {
		ib::error()
			<< "Failed to allocate memory for the check sector"
			<< " file name.";
		return;
	}

	/* Construct the check file name: directory followed by the
	suffix (copied together with its NUL terminator). */
	memcpy(check_file_name, path, dir_len);
	memcpy(check_file_name + dir_len, check_file_suffix,
	       sizeof(check_file_suffix));

	/* Create a tmp file for checking sector size. */
	os_file_t	check_file = ::open(
		check_file_name,
		O_CREAT|O_TRUNC|O_WRONLY|O_DIRECT,
		S_IRWXU);

	if (check_file == -1) {
		/* Save errno before the logger gets a chance to
		overwrite it. */
		const int	open_errno = errno;

		ib::error()
			<< "Failed to create check sector file, errno:"
			<< open_errno << " Please confirm O_DIRECT is"
			<< " supported and remove the file "
			<< check_file_name << " if it exists.";
		ut_free(check_file_name);
		errno = 0;
		return;
	}

	/* Try to write the file with different sector size
	alignment. The buffer is zero-filled so that no uninitialised
	heap contents are written to disk, and twice the maximum sector
	size so that an aligned block of any probed size fits. */
	byte*	ptr = static_cast<byte*>(
		ut_zalloc_nokey(2 * MAX_SECTOR_SIZE));

	if (ptr == NULL) {
		ib::error()
			<< "Failed to allocate memory for the sector size"
			<< " check buffer.";
		close(check_file);
		unlink(check_file_name);
		ut_free(check_file_name);
		errno = 0;
		return;
	}

	while (sector_size <= MAX_SECTOR_SIZE) {
		byte*	block_ptr = static_cast<byte*>(
			ut_align(ptr, sector_size));

		ssize_t	ret = pwrite(check_file, block_ptr, sector_size, 0);

		if (ret > 0 && (ulint) ret == sector_size) {
			break;
		}

		sector_size *= 2;
	}

	/* The sector size should <= MAX_SECTOR_SIZE. */
	ut_ad(sector_size <= MAX_SECTOR_SIZE);

	if (sector_size > MAX_SECTOR_SIZE) {
		/* No probe write succeeded. Do not publish a value
		beyond the supported maximum; the maximum alignment
		also satisfies every smaller sector size. */
		ib::warn()
			<< "Could not determine the sector size using "
			<< check_file_name << "; aligning I/O buffers to "
			<< MAX_SECTOR_SIZE << " bytes.";

		sector_size = MAX_SECTOR_SIZE;
	}

	close(check_file);
	unlink(check_file_name);

	ut_free(check_file_name);
	ut_free(ptr);
	errno = 0;

	os_io_ptr_align = sector_size;
}
#endif /* !NO_FALLOCATE && UNIV_LINUX */
6728 
6729 /** Initializes the asynchronous io system. Creates one array each for ibuf
6730 and log i/o. Also creates one array each for read and write where each
6731 array is divided logically into n_readers and n_writers
6732 respectively. The caller must create an i/o handler thread for each
6733 segment in these arrays. This function also creates the sync array.
6734 No i/o handler thread needs to be created for that
6735 @param[in]	n_readers	number of reader threads
6736 @param[in]	n_writers	number of writer threads
6737 @param[in]	n_slots_sync	number of slots in the sync aio array */
6738 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6739 os_aio_init(
6740 	ulint		n_readers,
6741 	ulint		n_writers,
6742 	ulint		n_slots_sync)
6743 {
6744 	/* Maximum number of pending aio operations allowed per segment */
6745 	ulint		limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6746 
6747 #ifdef _WIN32
6748 	if (srv_use_native_aio) {
6749 		limit = SRV_N_PENDING_IOS_PER_THREAD;
6750 	}
6751 #endif /* _WIN32 */
6752 
6753 	ut_a(block_cache == NULL);
6754 
6755 	block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
6756 
6757 	for (Blocks::iterator it = block_cache->begin();
6758 	     it != block_cache->end();
6759 	     ++it) {
6760 
6761 		ut_a(it->m_in_use == 0);
6762 		ut_a(it->m_ptr == NULL);
6763 
6764 		/* Allocate double of max page size memory, since
6765 		compress could generate more bytes than orgininal
6766 		data. */
6767 		it->m_ptr = static_cast<byte*>(
6768 			ut_malloc_nokey(BUFFER_BLOCK_SIZE));
6769 
6770 		ut_a(it->m_ptr != NULL);
6771 	}
6772 
6773 	/* Get sector size for DIRECT_IO. In this case, we need to
6774 	know the sector size for aligning the write buffer. */
6775 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6776 	os_fusionio_get_sector_size();
6777 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6778 
6779 	return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
6780 }
6781 
6782 /** Frees the asynchronous io system. */
6783 void
os_aio_free()6784 os_aio_free()
6785 {
6786 	AIO::shutdown();
6787 
6788 	for (ulint i = 0; i < os_aio_n_segments; i++) {
6789 		os_event_destroy(os_aio_segment_wait_events[i]);
6790 	}
6791 
6792 	ut_free(os_aio_segment_wait_events);
6793 	os_aio_segment_wait_events = 0;
6794 	os_aio_n_segments = 0;
6795 
6796 	for (Blocks::iterator it = block_cache->begin();
6797 	     it != block_cache->end();
6798 	     ++it) {
6799 
6800 		ut_a(it->m_in_use == 0);
6801 		ut_free(it->m_ptr);
6802 	}
6803 
6804 	UT_DELETE(block_cache);
6805 
6806 	block_cache = NULL;
6807 }
6808 
6809 /** Wakes up all async i/o threads so that they know to exit themselves in
6810 shutdown. */
6811 void
os_aio_wake_all_threads_at_shutdown()6812 os_aio_wake_all_threads_at_shutdown()
6813 {
6814 #ifdef WIN_ASYNC_IO
6815 
6816 	AIO::wake_at_shutdown();
6817 
6818 #elif defined(LINUX_NATIVE_AIO)
6819 
6820 	/* When using native AIO interface the io helper threads
6821 	wait on io_getevents with a timeout value of 500ms. At
6822 	each wake up these threads check the server status.
6823 	No need to do anything to wake them up. */
6824 
6825 	if (srv_use_native_aio) {
6826 		return;
6827 	}
6828 
6829 #endif /* !WIN_ASYNC_AIO */
6830 
6831 	/* Fall through to simulated AIO handler wakeup if we are
6832 	not using native AIO. */
6833 
6834 	/* This loop wakes up all simulated ai/o threads */
6835 
6836 	for (ulint i = 0; i < os_aio_n_segments; ++i) {
6837 
6838 		os_event_set(os_aio_segment_wait_events[i]);
6839 	}
6840 }
6841 
/** Waits until there are no pending writes in AIO::s_writes. There can
be other, synchronous, pending writes. This is a thin free-function wrapper
that forwards to AIO::wait_until_no_pending_writes(). */
void
os_aio_wait_until_no_pending_writes()
{
	AIO::wait_until_no_pending_writes();
}
6849 
6850 /** Calculates segment number for a slot.
6851 @param[in]	array		AIO wait array
6852 @param[in]	slot		slot in this array
6853 @return segment number (which is the number used by, for example,
6854 	I/O-handler threads) */
6855 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)6856 AIO::get_segment_no_from_slot(
6857 	const AIO*	array,
6858 	const Slot*	slot)
6859 {
6860 	ulint	segment;
6861 	ulint	seg_len;
6862 
6863 	if (array == s_ibuf) {
6864 		ut_ad(!srv_read_only_mode);
6865 
6866 		segment = IO_IBUF_SEGMENT;
6867 
6868 	} else if (array == s_log) {
6869 		ut_ad(!srv_read_only_mode);
6870 
6871 		segment = IO_LOG_SEGMENT;
6872 
6873 	} else if (array == s_reads) {
6874 		seg_len = s_reads->slots_per_segment();
6875 
6876 		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6877 	} else {
6878 		ut_a(array == s_writes);
6879 
6880 		seg_len = s_writes->slots_per_segment();
6881 
6882 		segment = s_reads->m_n_segments
6883 			+ (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6884 	}
6885 
6886 	return(segment);
6887 }
6888 
/** Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled. The reserved slot is filled in with the
request details. For native AIO writes at a non-zero offset the page is
compressed and/or encrypted here if the request asks for it, and the
platform specific control block is prepared.

@param[in,out]	type		IO context
@param[in,out]	m1		message to be passed along with the AIO
				operation
@param[in,out]	m2		message to be passed along with the AIO
				operation
@param[in]	file		file handle
@param[in]	name		name of the file or path as a NUL-terminated
				string
@param[in,out]	buf		buffer where to read or from which to write
@param[in]	offset		file offset, where to read from or start writing
@param[in]	len		length of the block to read or write
@return pointer to slot */
Slot*
AIO::reserve_slot(
	IORequest&	type,
	fil_node_t*	m1,
	void*		m2,
	pfs_os_file_t	file,
	const char*	name,
	void*		buf,
	os_offset_t	offset,
	ulint		len)
{
#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits. */
	ut_a((len & 0xFFFFFFFFUL) == len);
#endif /* WIN_ASYNC_IO */

	/* No need of a mutex. Only reading constant fields */
	ulint		slots_per_seg;

	ut_ad(type.validate());

	slots_per_seg = slots_per_segment();

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO */
	ulint		local_seg;

	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;

	/* Loop until the array has at least one free slot. The loop is
	left with the array mutex held. */
	for (;;) {

		acquire();

		if (m_n_reserved != m_slots.size()) {
			break;
		}

		release();

		if (!srv_use_native_aio) {
			/* If the handler threads are suspended,
			wake them so that we get more slots */

			os_aio_simulated_wake_handler_threads();
		}

		os_event_wait(m_not_full);
	}

	ulint	counter = 0;
	Slot*	slot = NULL;

	/* We start our search for an available slot from our preferred
	local segment and do a full scan of the array. We are
	guaranteed to find a slot in full scan. */
	for (ulint i = local_seg * slots_per_seg;
	     counter < m_slots.size();
	     ++i, ++counter) {

		/* Wrap around at the end of the array. */
		i %= m_slots.size();

		slot = at(i);

		if (slot->is_reserved == false) {
			break;
		}
	}

	/* We MUST always be able to get hold of a free slot. */
	ut_a(counter < m_slots.size());

	ut_a(slot->is_reserved == false);

	++m_n_reserved;

	/* First reservation: the array is no longer empty. */
	if (m_n_reserved == 1) {
		os_event_reset(m_is_empty);
	}

	/* Last free slot taken: later callers have to wait. */
	if (m_n_reserved == m_slots.size()) {
		os_event_reset(m_not_full);
	}

	slot->is_reserved = true;
	slot->reservation_time = ut_time_monotonic();
	slot->m1       = m1;
	slot->m2       = m2;
	slot->file     = file;
	slot->name     = name;
#ifdef _WIN32
	slot->len      = static_cast<DWORD>(len);
#else
	slot->len      = static_cast<ulint>(len);
#endif /* _WIN32 */
	slot->type     = type;
	slot->buf      = static_cast<byte*>(buf);
	slot->ptr      = slot->buf;
	slot->offset   = offset;
	slot->err      = DB_SUCCESS;
	slot->original_len = static_cast<uint32>(len);
	slot->io_already_done = false;
	slot->buf_block = NULL;

	/* Compression is only done here for native AIO writes at a
	non-zero offset. */
	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_compressed()) {
		ulint	compressed_len = len;

		ut_ad(!type.is_log());

		/* The array mutex is not held while compressing; the
		slot is already marked as reserved. */
		release();

		void* src_buf = slot->buf;
		slot->buf_block = os_file_compress_page(
			type,
			src_buf,
			&compressed_len);

		/* Use the buffer pointer and length as left by the
		compression call. */
		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;
#ifdef _WIN32
		slot->len = static_cast<DWORD>(compressed_len);
#else
		slot->len = static_cast<ulint>(compressed_len);
#endif /* _WIN32 */
		slot->skip_punch_hole = !type.punch_hole();

		acquire();
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_encrypted()) {
		ulint		encrypted_len = slot->len;
		Block*		encrypted_block;

		ut_ad(!type.is_log());

		/* As for compression, the array mutex is not held
		while encrypting. */
		release();

		void* src_buf = slot->buf;
		encrypted_block = os_file_encrypt_page(
			type,
			src_buf,
			&encrypted_len);

		/* A block obtained by the compression step is no
		longer needed once the encrypted block replaces it. */
		if (slot->buf_block != NULL) {
			os_free_block(slot->buf_block);
		}

		slot->buf_block = encrypted_block;
		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;

#ifdef _WIN32
		slot->len = static_cast<DWORD>(encrypted_len);
#else
		slot->len = static_cast<ulint>(encrypted_len);
#endif /* _WIN32 */

		acquire();
        }

#ifdef WIN_ASYNC_IO
	{
		OVERLAPPED*	control;

		/* Split the 64-bit file offset into the low and high
		32-bit halves of the OVERLAPPED structure and reset the
		slot event. */
		control = &slot->control;
		control->Offset = (DWORD) offset & 0xFFFFFFFF;
		control->OffsetHigh = (DWORD) (offset >> 32);

		ResetEvent(slot->handle);
	}
#elif defined(LINUX_NATIVE_AIO)

	/* If we are not using native AIO skip this part. */
	if (srv_use_native_aio) {

		off_t		aio_offset;

		/* Check if we are dealing with 64 bit arch.
		If not then make sure that offset fits in 32 bits. */
		aio_offset = (off_t) offset;

		ut_a(sizeof(aio_offset) >= sizeof(offset)
		     || ((os_offset_t) aio_offset) == offset);

		struct iocb*	iocb = &slot->control;

		/* Prepare the control block for a read or a write of
		slot->len bytes at aio_offset. */
		if (type.is_read()) {
			io_prep_pread(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		} else {
			ut_ad(type.is_write());
			io_prep_pwrite(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		}

		/* Store the slot in the control block's data field. */
		iocb->data = slot;

		slot->n_bytes = 0;
		slot->ret = 0;
	}
#endif /* LINUX_NATIVE_AIO */

	release();

	return(slot);
}
7118 
7119 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
7120 @param[in]	global_segment	The number of the segment in the AIO arrays */
7121 void
wake_simulated_handler_thread(ulint global_segment)7122 AIO::wake_simulated_handler_thread(ulint global_segment)
7123 {
7124 	ut_ad(!srv_use_native_aio);
7125 
7126 	AIO*	array;
7127 	ulint	segment = get_array_and_local_segment(&array, global_segment);
7128 
7129 	array->wake_simulated_handler_thread(global_segment, segment);
7130 }
7131 
7132 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
7133 for a local segment in the AIO array.
7134 @param[in]	global_segment	The number of the segment in the AIO arrays
7135 @param[in]	segment		The local segment in the AIO array */
7136 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)7137 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
7138 {
7139 	ut_ad(!srv_use_native_aio);
7140 
7141 	ulint	n = slots_per_segment();
7142 	ulint	offset = segment * n;
7143 
7144 	/* Look through n slots after the segment * n'th slot */
7145 
7146 	acquire();
7147 
7148 	const Slot*	slot = at(offset);
7149 
7150 	for (ulint i = 0; i < n; ++i, ++slot) {
7151 
7152 		if (slot->is_reserved) {
7153 
7154 			/* Found an i/o request */
7155 
7156 			release();
7157 
7158 			os_event_t	event;
7159 
7160 			event = os_aio_segment_wait_events[global_segment];
7161 
7162 			os_event_set(event);
7163 
7164 			return;
7165 		}
7166 	}
7167 
7168 	release();
7169 }
7170 
7171 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
7172 void
os_aio_simulated_wake_handler_threads()7173 os_aio_simulated_wake_handler_threads()
7174 {
7175 	if (srv_use_native_aio) {
7176 		/* We do not use simulated aio: do nothing */
7177 
7178 		return;
7179 	}
7180 
7181 	os_aio_recommend_sleep_for_read_threads	= false;
7182 
7183 	for (ulint i = 0; i < os_aio_n_segments; i++) {
7184 		AIO::wake_simulated_handler_thread(i);
7185 	}
7186 }
7187 
7188 /** Select the IO slot array
7189 @param[in]	type		Type of IO, READ or WRITE
7190 @param[in]	read_only	true if running in read-only mode
7191 @param[in]	mode		IO mode
7192 @return slot array or NULL if invalid mode specified */
7193 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)7194 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
7195 {
7196 	AIO*	array;
7197 
7198 	ut_ad(type.validate());
7199 
7200 	switch (mode) {
7201 	case OS_AIO_NORMAL:
7202 
7203 		array = type.is_read() ? AIO::s_reads : AIO::s_writes;
7204 		break;
7205 
7206 	case OS_AIO_IBUF:
7207 		ut_ad(type.is_read());
7208 
7209 		/* Reduce probability of deadlock bugs in connection with ibuf:
7210 		do not let the ibuf i/o handler sleep */
7211 
7212 		type.clear_do_not_wake();
7213 
7214 		array = read_only ? AIO::s_reads : AIO::s_ibuf;
7215 		break;
7216 
7217 	case OS_AIO_LOG:
7218 
7219 		array = read_only ? AIO::s_reads : AIO::s_log;
7220 		break;
7221 
7222 	case OS_AIO_SYNC:
7223 
7224 		array = AIO::s_sync;
7225 #if defined(LINUX_NATIVE_AIO)
7226 		/* In Linux native AIO we don't use sync IO array. */
7227 		ut_a(!srv_use_native_aio);
7228 #endif /* LINUX_NATIVE_AIO */
7229 		break;
7230 
7231 	default:
7232 		ut_error;
7233 		array = NULL; /* Eliminate compiler warning */
7234 	}
7235 
7236 	return(array);
7237 }
7238 
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	dberr_t		err;
	AIO*		array;
	ulint		orig_seg = segment;

	if (segment == ULINT_UNDEFINED) {
		segment = 0;
		array = AIO::sync_array();
	} else {
		segment = AIO::get_array_and_local_segment(&array, segment);
	}

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());

	if (array == AIO::sync_array()) {

		/* Sync AIO: wait for the one slot the caller named. */
		WaitForSingleObject(array->at(pos)->handle, INFINITE);

	} else {
		if (orig_seg != ULINT_UNDEFINED) {
			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
		}

		/* Wait for any slot of the local segment; the return
		value identifies the slot within the segment. */
		pos = WaitForMultipleObjects(
			(DWORD) array->slots_per_segment(),
			array->handles(segment),
			FALSE, INFINITE);
	}

	array->acquire();

	/* At shutdown, with nothing reserved in the array and the page
	cleaner no longer active, report "no request" to the caller. */
	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && array->is_empty()
	    && !buf_page_cleaner_is_active) {

		*m1 = NULL;
		*m2 = NULL;

		array->release();

		return(DB_SUCCESS);
	}

	ulint	n = array->slots_per_segment();

	/* A segment has n slots, so the valid positions are
	WAIT_OBJECT_0 .. WAIT_OBJECT_0 + n - 1. A value of
	WAIT_OBJECT_0 + n would already address the first slot of the
	next segment (or lie beyond the array). */
	ut_a(pos >= WAIT_OBJECT_0 && pos < WAIT_OBJECT_0 + n);

	slot = array->at(pos + segment * n);

	ut_a(slot->is_reserved);

	if (orig_seg != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(
			orig_seg, "get windows aio return value");
	}

	BOOL	ret;
	ret = GetOverlappedResult(
		slot->file.m_file, &slot->control, &slot->n_bytes, TRUE);
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	BOOL	retry = FALSE;

	if (ret && slot->n_bytes == slot->len) {

		err = DB_SUCCESS;

	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		/* err is assigned by the retry below. */
		retry = TRUE;

	} else {

		err = DB_IO_ERROR;
	}

	array->release();

	if (retry) {
		/* Retry failed read/write operation synchronously.
		No need to hold array->m_mutex. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		struct PSI_file_locker* locker = NULL;
		PSI_file_locker_state   state;
		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
			/* AIO was queued successfully!
			We want a synchronous I/O operation on a
			file where we also use async I/O: in Windows
			we must use the same wait mechanism as for
			async I/O */

			ret = GetOverlappedResult(
				slot->file.m_file, &slot->control, &slot->n_bytes,
				TRUE);
			n_bytes = ret ? slot->n_bytes : -1;
		}

		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	/* Hand the slot back to the array. */
	array->release_with_mutex(slot);

	return(err);
}
#endif /* WIN_ASYNC_IO */
7406 
7407 /**
7408 NOTE! Use the corresponding macro os_aio(), not directly this function!
7409 Requests an asynchronous i/o operation.
7410 @param[in]	type		IO request context
7411 @param[in]	mode		IO mode
7412 @param[in]	name		Name of the file or path as NUL terminated
7413 				string
7414 @param[in]	file		Open file handle
7415 @param[out]	buf		buffer where to read
7416 @param[in]	offset		file offset where to read
7417 @param[in]	n		number of bytes to read
7418 @param[in]	read_only	if true read only mode checks are enforced
7419 @param[in,out]	m1		Message for the AIO handler, (can be used to
7420 				identify a completed AIO operation); ignored
7421 				if mode is OS_AIO_SYNC
7422 @param[in,out]	m2		message for the AIO handler (can be used to
7423 				identify a completed AIO operation); ignored
7424 				if mode is OS_AIO_SYNC
7425 @return DB_SUCCESS or error code */
7426 dberr_t
os_aio_func(IORequest & type,ulint mode,const char * name,pfs_os_file_t file,void * buf,os_offset_t offset,ulint n,bool read_only,fil_node_t * m1,void * m2)7427 os_aio_func(
7428 	IORequest&	type,
7429 	ulint		mode,
7430 	const char*	name,
7431 	pfs_os_file_t	file,
7432 	void*		buf,
7433 	os_offset_t	offset,
7434 	ulint		n,
7435 	bool		read_only,
7436 	fil_node_t*	m1,
7437 	void*		m2)
7438 {
7439 #ifdef WIN_ASYNC_IO
7440 	BOOL		ret = TRUE;
7441 #endif /* WIN_ASYNC_IO */
7442 
7443 	ut_ad(n > 0);
7444 	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
7445 	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
7446 	ut_ad(os_aio_validate_skip());
7447 
7448 #ifdef WIN_ASYNC_IO
7449 	ut_ad((n & 0xFFFFFFFFUL) == n);
7450 #endif /* WIN_ASYNC_IO */
7451 
7452 	if (mode == OS_AIO_SYNC
7453 #ifdef WIN_ASYNC_IO
7454 	    && !srv_use_native_aio
7455 #endif /* WIN_ASYNC_IO */
7456 	    ) {
7457 		/* This is actually an ordinary synchronous read or write:
7458 		no need to use an i/o-handler thread. NOTE that if we use
7459 		Windows async i/o, Windows does not allow us to use
7460 		ordinary synchronous os_file_read etc. on the same file,
7461 		therefore we have built a special mechanism for synchronous
7462 		wait in the Windows case.
7463 		Also note that the Performance Schema instrumentation has
7464 		been performed by current os_aio_func()'s wrapper function
7465 		pfs_os_aio_func(). So we would no longer need to call
7466 		Performance Schema instrumented os_file_read() and
7467 		os_file_write(). Instead, we should use os_file_read_func()
7468 		and os_file_write_func() */
7469 
7470 		if (type.is_read()) {
7471 			return(os_file_read_func(type, file.m_file, buf, offset, n));
7472 		}
7473 
7474 		ut_ad(type.is_write());
7475 		return(os_file_write_func(type, name, file.m_file, buf, offset, n));
7476 	}
7477 
7478 try_again:
7479 
7480 	AIO*	array;
7481 
7482 	array = AIO::select_slot_array(type, read_only, mode);
7483 
7484 	Slot*	slot;
7485 
7486 	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);
7487 
7488 	if (type.is_read()) {
7489 
7490 		if (srv_use_native_aio) {
7491 
7492 			++os_n_file_reads;
7493 
7494 			os_bytes_read_since_printout += n;
7495 #ifdef WIN_ASYNC_IO
7496 			ret = ReadFile(
7497 				file.m_file, slot->ptr, slot->len,
7498 				&slot->n_bytes, &slot->control);
7499 #elif defined(LINUX_NATIVE_AIO)
7500 			if (!array->linux_dispatch(slot)) {
7501 				goto err_exit;
7502 			}
7503 #endif /* WIN_ASYNC_IO */
7504 		} else if (type.is_wake()) {
7505 			AIO::wake_simulated_handler_thread(
7506 				AIO::get_segment_no_from_slot(array, slot));
7507 		}
7508 	} else if (type.is_write()) {
7509 
7510 		if (srv_use_native_aio) {
7511 			++os_n_file_writes;
7512 
7513 #ifdef WIN_ASYNC_IO
7514 			ret = WriteFile(
7515 				file.m_file, slot->ptr, slot->len,
7516 				&slot->n_bytes, &slot->control);
7517 #elif defined(LINUX_NATIVE_AIO)
7518 			if (!array->linux_dispatch(slot)) {
7519 				goto err_exit;
7520 			}
7521 #endif /* WIN_ASYNC_IO */
7522 
7523 		} else if (type.is_wake()) {
7524 			AIO::wake_simulated_handler_thread(
7525 				AIO::get_segment_no_from_slot(array, slot));
7526 		}
7527 	} else {
7528 		ut_error;
7529 	}
7530 
7531 #ifdef WIN_ASYNC_IO
7532 	if (srv_use_native_aio) {
7533 		if ((ret && slot->len == slot->n_bytes)
7534 		     || (!ret && GetLastError() == ERROR_IO_PENDING)) {
7535 			/* aio was queued successfully! */
7536 
7537 			if (mode == OS_AIO_SYNC) {
7538 				IORequest	dummy_type;
7539 				void*		dummy_mess2;
7540 				struct fil_node_t* dummy_mess1;
7541 
7542 				/* We want a synchronous i/o operation on a
7543 				file where we also use async i/o: in Windows
7544 				we must use the same wait mechanism as for
7545 				async i/o */
7546 
7547 				return(os_aio_windows_handler(
7548 					ULINT_UNDEFINED, slot->pos,
7549 					&dummy_mess1, &dummy_mess2,
7550 					&dummy_type));
7551 			}
7552 
7553 			return(DB_SUCCESS);
7554 		}
7555 
7556 		goto err_exit;
7557 	}
7558 #endif /* WIN_ASYNC_IO */
7559 
7560 	/* AIO request was queued successfully! */
7561 	return(DB_SUCCESS);
7562 
7563 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
7564 err_exit:
7565 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
7566 
7567 	array->release_with_mutex(slot);
7568 
7569 	if (os_file_handle_error(
7570 		name, type.is_read() ? "aio read" : "aio write")) {
7571 
7572 		goto try_again;
7573 	}
7574 
7575 	return(DB_IO_ERROR);
7576 }
7577 
7578 /** Simulated AIO handler for reaping IO requests */
7579 class SimulatedAIOHandler {
7580 
7581 public:
7582 
7583 	/** Constructor
7584 	@param[in,out]	array	The AIO array
7585 	@param[in]	segment	Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)7586 	SimulatedAIOHandler(AIO* array, ulint segment)
7587 		:
7588 		m_oldest(),
7589 		m_n_elems(),
7590 		m_lowest_offset(IB_UINT64_MAX),
7591 		m_array(array),
7592 		m_n_slots(),
7593 		m_segment(segment),
7594 		m_ptr(),
7595 		m_buf()
7596 	{
7597 		ut_ad(m_segment < 100);
7598 
7599 		m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
7600 	}
7601 
7602 	/** Destructor */
~SimulatedAIOHandler()7603 	~SimulatedAIOHandler()
7604 	{
7605 		if (m_ptr != NULL) {
7606 			ut_free(m_ptr);
7607 		}
7608 	}
7609 
7610 	/** Reset the state of the handler
7611 	@param[in]	n_slots	Number of pending AIO operations supported */
init(ulint n_slots)7612 	void init(ulint n_slots)
7613 	{
7614 		m_oldest = 0;
7615 		m_n_elems = 0;
7616 		m_n_slots = n_slots;
7617 		m_lowest_offset = IB_UINT64_MAX;
7618 
7619 		if (m_ptr != NULL) {
7620 			ut_free(m_ptr);
7621 			m_ptr = m_buf = NULL;
7622 		}
7623 
7624 		m_slots[0] = NULL;
7625 	}
7626 
7627 	/** Check if there is a slot for which the i/o has already been done
7628 	@param[out]	n_reserved	Number of reserved slots
7629 	@return the first completed slot that is found. */
check_completed(ulint * n_reserved)7630 	Slot* check_completed(ulint* n_reserved)
7631 	{
7632 		ulint	offset = m_segment * m_n_slots;
7633 
7634 		*n_reserved = 0;
7635 
7636 		Slot*	slot;
7637 
7638 		slot = m_array->at(offset);
7639 
7640 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7641 
7642 			if (slot->is_reserved) {
7643 
7644 				if (slot->io_already_done) {
7645 
7646 					ut_a(slot->is_reserved);
7647 
7648 					return(slot);
7649 				}
7650 
7651 				++*n_reserved;
7652 			}
7653 		}
7654 
7655 		return(NULL);
7656 	}
7657 
7658 	/** If there are at least 2 seconds old requests, then pick the
7659 	oldest one to prevent starvation.  If several requests have the
7660 	same age, then pick the one at the lowest offset.
7661 	@return true if request was selected */
select()7662 	bool select()
7663 	{
7664 		if (!select_oldest()) {
7665 
7666 			return(select_lowest_offset());
7667 		}
7668 
7669 		return(true);
7670 	}
7671 
7672 	/** Check if there are several consecutive blocks
7673 	to read or write. Merge them if found. */
merge()7674 	void merge()
7675 	{
7676 		/* if m_n_elems != 0, then we have assigned
7677 		something valid to consecutive_ios[0] */
7678 		ut_ad(m_n_elems != 0);
7679 		ut_ad(first_slot() != NULL);
7680 
7681 		Slot*	slot = first_slot();
7682 
7683 		while (!merge_adjacent(slot)) {
7684 			/* No op */
7685 		}
7686 	}
7687 
7688 	/** We have now collected n_consecutive I/O requests
7689 	in the array; allocate a single buffer which can hold
7690 	all data, and perform the I/O
7691 	@return the length of the buffer */
allocate_buffer()7692 	ulint allocate_buffer()
7693 		MY_ATTRIBUTE((warn_unused_result))
7694 	{
7695 		ulint	len;
7696 		Slot*	slot = first_slot();
7697 
7698 		ut_ad(m_ptr == NULL);
7699 
7700 		if (slot->type.is_read() && m_n_elems > 1) {
7701 
7702 			len = 0;
7703 
7704 			for (ulint i = 0; i < m_n_elems; ++i) {
7705 				len += m_slots[i]->len;
7706 			}
7707 
7708 			m_ptr = static_cast<byte*>(
7709 				ut_malloc_nokey(len + UNIV_PAGE_SIZE));
7710 
7711 			m_buf = static_cast<byte*>(
7712 				ut_align(m_ptr, UNIV_PAGE_SIZE));
7713 
7714 		} else {
7715 			len = first_slot()->len;
7716 			m_buf = first_slot()->buf;
7717 		}
7718 
7719 		return(len);
7720 	}
7721 
7722 	/** We have to compress the individual pages and punch
7723 	holes in them on a page by page basis when writing to
7724 	tables that can be compresed at the IO level.
7725 	@param[in]	len		Value returned by allocate_buffer */
copy_to_buffer(ulint len)7726 	void copy_to_buffer(ulint len)
7727 	{
7728 		Slot*	slot = first_slot();
7729 
7730 		if (len > slot->len && slot->type.is_write()) {
7731 
7732 			byte*	ptr = m_buf;
7733 
7734 			ut_ad(ptr != slot->buf);
7735 
7736 			/* Copy the buffers to the combined buffer */
7737 			for (ulint i = 0; i < m_n_elems; ++i) {
7738 
7739 				slot = m_slots[i];
7740 
7741 				memmove(ptr, slot->buf, slot->len);
7742 
7743 				ptr += slot->len;
7744 			}
7745 		}
7746 	}
7747 
7748 	/** Do the I/O with ordinary, synchronous i/o functions:
7749 	@param[in]	len		Length of buffer for IO */
io()7750 	void io()
7751 	{
7752 		if (first_slot()->type.is_write()) {
7753 
7754 			for (ulint i = 0; i < m_n_elems; ++i) {
7755 				write(m_slots[i]);
7756 			}
7757 
7758 		} else {
7759 
7760 			for (ulint i = 0; i < m_n_elems; ++i) {
7761 				read(m_slots[i]);
7762 			}
7763 		}
7764 	}
7765 
7766 	/** Do the decompression of the pages read in */
io_complete()7767 	void io_complete()
7768 	{
7769 		// Note: For non-compressed tables. Not required
7770 		// for correctness.
7771 	}
7772 
7773 	/** Mark the i/os done in slots */
done()7774 	void done()
7775 	{
7776 		for (ulint i = 0; i < m_n_elems; ++i) {
7777 			m_slots[i]->io_already_done = true;
7778 		}
7779 	}
7780 
7781 	/** @return the first slot in the consecutive array */
first_slot()7782 	Slot* first_slot()
7783 		MY_ATTRIBUTE((warn_unused_result))
7784 	{
7785 		ut_a(m_n_elems > 0);
7786 
7787 		return(m_slots[0]);
7788 	}
7789 
7790 	/** Wait for I/O requests
7791 	@param[in]	global_segment	The global segment
7792 	@param[in,out]	event		Wait on event if no active requests
7793 	@return the number of slots */
7794 	ulint check_pending(
7795 		ulint		global_segment,
7796 		os_event_t	event)
7797 		MY_ATTRIBUTE((warn_unused_result));
7798 private:
7799 
7800 	/** Do the file read
7801 	@param[in,out]	slot		Slot that has the IO context */
read(Slot * slot)7802 	void read(Slot* slot)
7803 	{
7804 		dberr_t	err = os_file_read_func(
7805 			slot->type,
7806 			slot->file.m_file,
7807 			slot->ptr,
7808 			slot->offset,
7809 			slot->len);
7810 		ut_a(err == DB_SUCCESS);
7811 	}
7812 
7813 	/** Do the file read
7814 	@param[in,out]	slot		Slot that has the IO context */
write(Slot * slot)7815 	void write(Slot* slot)
7816 	{
7817 		dberr_t	err = os_file_write_func(
7818 			slot->type,
7819 			slot->name,
7820 			slot->file.m_file,
7821 			slot->ptr,
7822 			slot->offset,
7823 			slot->len);
7824 		ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
7825 	}
7826 
7827 	/** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7828 	bool adjacent(const Slot* s1, const Slot* s2) const
7829 	{
7830 		return(s1 != s2
7831 		       && s1->file.m_file == s2->file.m_file
7832 		       && s2->offset == s1->offset + s1->len
7833 		       && s1->type == s2->type);
7834 	}
7835 
7836 	/** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7837 	bool merge_adjacent(Slot*& current)
7838 	{
7839 		Slot*	slot;
7840 		ulint	offset = m_segment * m_n_slots;
7841 
7842 		slot = m_array->at(offset);
7843 
7844 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7845 
7846 			if (slot->is_reserved && adjacent(current, slot)) {
7847 
7848 				current = slot;
7849 
7850 				/* Found a consecutive i/o request */
7851 
7852 				m_slots[m_n_elems] = slot;
7853 
7854 				++m_n_elems;
7855 
7856 				return(m_n_elems >= m_slots.capacity());
7857 			}
7858 		}
7859 
7860 		return(true);
7861 	}
7862 
7863 	/** There were no old requests. Look for an I/O request at the lowest
7864 	offset in the array (we ignore the high 32 bits of the offset in these
7865 	heuristics) */
select_lowest_offset()7866 	bool select_lowest_offset()
7867 	{
7868 		ut_ad(m_n_elems == 0);
7869 
7870 		ulint	offset = m_segment * m_n_slots;
7871 
7872 		m_lowest_offset = IB_UINT64_MAX;
7873 
7874 		for (ulint i = 0; i < m_n_slots; ++i) {
7875 			Slot*	slot;
7876 
7877 			slot = m_array->at(i + offset);
7878 
7879 			if (slot->is_reserved
7880 			    && slot->offset < m_lowest_offset) {
7881 
7882 				/* Found an i/o request */
7883 				m_slots[0] = slot;
7884 
7885 				m_n_elems = 1;
7886 
7887 				m_lowest_offset = slot->offset;
7888 			}
7889 		}
7890 
7891 		return(m_n_elems > 0);
7892 	}
7893 
7894 	/** Select the slot if it is older than the current oldest slot.
7895 	@param[in]	slot		The slot to check */
select_if_older(Slot * slot)7896 	void select_if_older(Slot* slot)
7897 	{
7898 		int64_t time_diff = ut_time_monotonic() -
7899 					slot->reservation_time;
7900 
7901 		const uint64_t age = time_diff > 0 ? (uint64_t) time_diff : 0;
7902 
7903 		if ((age >= 2 && age > m_oldest)
7904 		    || (age >= 2
7905 			&& age == m_oldest
7906 			&& slot->offset < m_lowest_offset)) {
7907 
7908 			/* Found an i/o request */
7909 			m_slots[0] = slot;
7910 
7911 			m_n_elems = 1;
7912 
7913 			m_oldest = age;
7914 
7915 			m_lowest_offset = slot->offset;
7916 		}
7917 	}
7918 
7919 	/** Select th oldest slot in the array
7920 	@return true if oldest slot found */
select_oldest()7921 	bool select_oldest()
7922 	{
7923 		ut_ad(m_n_elems == 0);
7924 
7925 		Slot*	slot;
7926 		ulint	offset = m_n_slots * m_segment;
7927 
7928 		slot = m_array->at(offset);
7929 
7930 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7931 
7932 			if (slot->is_reserved) {
7933 				select_if_older(slot);
7934 			}
7935 		}
7936 
7937 		return(m_n_elems > 0);
7938 	}
7939 
7940 	typedef std::vector<Slot*> slots_t;
7941 
7942 private:
7943 	ulint		m_oldest;
7944 	ulint		m_n_elems;
7945 	os_offset_t	m_lowest_offset;
7946 
7947 	AIO*		m_array;
7948 	ulint		m_n_slots;
7949 	ulint		m_segment;
7950 
7951 	slots_t		m_slots;
7952 
7953 	byte*		m_ptr;
7954 	byte*		m_buf;
7955 };
7956 
7957 /** Wait for I/O requests
7958 @return the number of slots */
7959 ulint
check_pending(ulint global_segment,os_event_t event)7960 SimulatedAIOHandler::check_pending(
7961 	ulint		global_segment,
7962 	os_event_t	event)
7963 {
7964 	/* NOTE! We only access constant fields in os_aio_array.
7965 	Therefore we do not have to acquire the protecting mutex yet */
7966 
7967 	ut_ad(os_aio_validate_skip());
7968 
7969 	ut_ad(m_segment < m_array->get_n_segments());
7970 
7971 	/* Look through n slots after the segment * n'th slot */
7972 
7973 	if (AIO::is_read(m_array)
7974 	    && os_aio_recommend_sleep_for_read_threads) {
7975 
7976 		/* Give other threads chance to add several
7977 		I/Os to the array at once. */
7978 
7979 		srv_set_io_thread_op_info(
7980 			global_segment, "waiting for i/o request");
7981 
7982 		os_event_wait(event);
7983 
7984 		return(0);
7985 	}
7986 
7987 	return(m_array->slots_per_segment());
7988 }
7989 
7990 /** Does simulated AIO. This function should be called by an i/o-handler
7991 thread.
7992 
7993 @param[in]	segment	The number of the segment in the aio arrays to wait
7994 			for; segment 0 is the ibuf i/o thread, segment 1 the
7995 			log i/o thread, then follow the non-ibuf read threads,
7996 			and as the last are the non-ibuf write threads
7997 @param[out]	m1	the messages passed with the AIO request; note that
7998 			also in the case where the AIO operation failed, these
7999 			output parameters are valid and can be used to restart
8000 			the operation, for example
8001 @param[out]	m2	Callback argument
8002 @param[in]	type	IO context
8003 @return DB_SUCCESS or error code */
8004 static
8005 dberr_t
os_aio_simulated_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * type)8006 os_aio_simulated_handler(
8007 	ulint		global_segment,
8008 	fil_node_t**	m1,
8009 	void**		m2,
8010 	IORequest*	type)
8011 {
8012 	Slot*		slot;
8013 	AIO*		array;
8014 	ulint		segment;
8015 	os_event_t	event = os_aio_segment_wait_events[global_segment];
8016 
8017 	segment = AIO::get_array_and_local_segment(&array, global_segment);
8018 
8019 	SimulatedAIOHandler	handler(array, segment);
8020 
8021 	for (;;) {
8022 
8023 		srv_set_io_thread_op_info(
8024 			global_segment, "looking for i/o requests (a)");
8025 
8026 		ulint	n_slots = handler.check_pending(global_segment, event);
8027 
8028 		if (n_slots == 0) {
8029 			continue;
8030 		}
8031 
8032 		handler.init(n_slots);
8033 
8034 		srv_set_io_thread_op_info(
8035 			global_segment, "looking for i/o requests (b)");
8036 
8037 		array->acquire();
8038 
8039 		ulint	n_reserved;
8040 
8041 		slot = handler.check_completed(&n_reserved);
8042 
8043 		if (slot != NULL) {
8044 
8045 			break;
8046 
8047 		} else if (n_reserved == 0
8048 			   && !buf_page_cleaner_is_active
8049 			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
8050 
8051 			/* There is no completed request. If there
8052 			are no pending request at all, and the system
8053 			is being shut down, exit. */
8054 
8055 			array->release();
8056 
8057 			*m1 = NULL;
8058 
8059 			*m2 = NULL;
8060 
8061 			return(DB_SUCCESS);
8062 
8063 		} else if (handler.select()) {
8064 
8065 			break;
8066 		}
8067 
8068 		/* No I/O requested at the moment */
8069 
8070 		srv_set_io_thread_op_info(
8071 			global_segment, "resetting wait event");
8072 
8073 		/* We wait here until tbere are more IO requests
8074 		for this segment. */
8075 
8076 		os_event_reset(event);
8077 
8078 		array->release();
8079 
8080 		srv_set_io_thread_op_info(
8081 			global_segment, "waiting for i/o request");
8082 
8083 		os_event_wait(event);
8084 	}
8085 
8086 	/** Found a slot that has already completed its IO */
8087 
8088 	if (slot == NULL) {
8089 		/* Merge adjacent requests */
8090 		handler.merge();
8091 
8092 		/* Check if there are several consecutive blocks
8093 		to read or write */
8094 
8095 		srv_set_io_thread_op_info(
8096 			global_segment, "consecutive i/o requests");
8097 
8098 		// Note: We don't support write combining for simulated AIO.
8099 		//ulint	total_len = handler.allocate_buffer();
8100 
8101 		/* We release the array mutex for the time of the I/O: NOTE that
8102 		this assumes that there is just one i/o-handler thread serving
8103 		a single segment of slots! */
8104 
8105 		array->release();
8106 
8107 		// Note: We don't support write combining for simulated AIO.
8108 		//handler.copy_to_buffer(total_len);
8109 
8110 		srv_set_io_thread_op_info(global_segment, "doing file i/o");
8111 
8112 		handler.io();
8113 
8114 		srv_set_io_thread_op_info(global_segment, "file i/o done");
8115 
8116 		handler.io_complete();
8117 
8118 		array->acquire();
8119 
8120 		handler.done();
8121 
8122 		/* We return the messages for the first slot now, and if there
8123 		were several slots, the messages will be returned with
8124 		subsequent calls of this function */
8125 
8126 		slot = handler.first_slot();
8127 	}
8128 
8129 	ut_ad(slot->is_reserved);
8130 
8131 	*m1 = slot->m1;
8132 	*m2 = slot->m2;
8133 
8134 	*type = slot->type;
8135 
8136 	array->release(slot);
8137 
8138 	array->release();
8139 
8140 	return(DB_SUCCESS);
8141 }
8142 
8143 /** Get the total number of pending IOs
8144 @return the total number of pending IOs */
8145 ulint
total_pending_io_count()8146 AIO::total_pending_io_count()
8147 {
8148 	ulint	count = s_reads->pending_io_count();
8149 
8150 	if (s_writes != NULL) {
8151 		count += s_writes->pending_io_count();
8152 	}
8153 
8154 	if (s_ibuf != NULL) {
8155 		count += s_ibuf->pending_io_count();
8156 	}
8157 
8158 	if (s_log != NULL) {
8159 		count += s_log->pending_io_count();
8160 	}
8161 
8162 	if (s_sync != NULL) {
8163 		count += s_sync->pending_io_count();
8164 	}
8165 
8166 	return(count);
8167 }
8168 
8169 /** Validates the consistency the aio system.
8170 @return true if ok */
8171 static
8172 bool
os_aio_validate()8173 os_aio_validate()
8174 {
8175 	/* The methods countds and validates, we ignore the count. */
8176 	AIO::total_pending_io_count();
8177 
8178 	return(true);
8179 }
8180 
8181 /** Prints pending IO requests per segment of an aio array.
8182 We probably don't need per segment statistics but they can help us
8183 during development phase to see if the IO requests are being
8184 distributed as expected.
8185 @param[in,out]	file		File where to print
8186 @param[in]	segments	Pending IO array */
8187 void
print_segment_info(FILE * file,const ulint * segments)8188 AIO::print_segment_info(
8189 	FILE*		file,
8190 	const ulint*	segments)
8191 {
8192 	ut_ad(m_n_segments > 0);
8193 
8194 	if (m_n_segments > 1) {
8195 
8196 		fprintf(file, " [");
8197 
8198 		for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
8199 
8200 			if (i != 0) {
8201 				fprintf(file, ", ");
8202 			}
8203 
8204 			fprintf(file, ULINTPF, *segments);
8205 		}
8206 
8207 		fprintf(file, "] ");
8208 	}
8209 }
8210 
8211 /** Prints info about the aio array.
8212 @param[in,out]	file		Where to print */
8213 void
print(FILE * file)8214 AIO::print(FILE* file)
8215 {
8216 	ulint	count = 0;
8217 	ulint	n_res_seg[SRV_MAX_N_IO_THREADS];
8218 
8219 	mutex_enter(&m_mutex);
8220 
8221 	ut_a(!m_slots.empty());
8222 	ut_a(m_n_segments > 0);
8223 
8224 	memset(n_res_seg, 0x0, sizeof(n_res_seg));
8225 
8226 	for (ulint i = 0; i < m_slots.size(); ++i) {
8227 		Slot&	slot = m_slots[i];
8228 		ulint	segment = (i * m_n_segments) / m_slots.size();
8229 
8230 		if (slot.is_reserved) {
8231 
8232 			++count;
8233 
8234 			++n_res_seg[segment];
8235 
8236 			ut_a(slot.len > 0);
8237 		}
8238 	}
8239 
8240 	ut_a(m_n_reserved == count);
8241 
8242 	print_segment_info(file, n_res_seg);
8243 
8244 	mutex_exit(&m_mutex);
8245 }
8246 
8247 /** Print all the AIO segments
8248 @param[in,out]	file		Where to print */
8249 void
print_all(FILE * file)8250 AIO::print_all(FILE* file)
8251 {
8252 	s_reads->print(file);
8253 
8254 	if (s_writes != NULL) {
8255 		fputs(", aio writes:", file);
8256 		s_writes->print(file);
8257 	}
8258 
8259 	if (s_ibuf != NULL) {
8260 		fputs(",\n ibuf aio reads:", file);
8261 		s_ibuf->print(file);
8262 	}
8263 
8264 	if (s_log != NULL) {
8265 		fputs(", log i/o's:", file);
8266 		s_log->print(file);
8267 	}
8268 
8269 	if (s_sync != NULL) {
8270 		fputs(", sync i/o's:", file);
8271 		s_sync->print(file);
8272 	}
8273 }
8274 
8275 /** Prints info of the aio arrays.
8276 @param[in,out]	file		file where to print */
8277 void
os_aio_print(FILE * file)8278 os_aio_print(FILE*	file)
8279 {
8280 	ib_time_monotonic_t 		current_time;
8281 	double	 			time_elapsed;
8282 	double				avg_bytes_read;
8283 
8284 	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
8285 		fprintf(file, "I/O thread %lu state: %s (%s)",
8286 			(ulong) i,
8287 			srv_io_thread_op_info[i],
8288 			srv_io_thread_function[i]);
8289 
8290 #ifndef _WIN32
8291 		if (os_event_is_set(os_aio_segment_wait_events[i])) {
8292 			fprintf(file, " ev set");
8293 		}
8294 #endif /* _WIN32 */
8295 
8296 		fprintf(file, "\n");
8297 	}
8298 
8299 	fputs("Pending normal aio reads:", file);
8300 
8301 	AIO::print_all(file);
8302 
8303 	putc('\n', file);
8304 	current_time = ut_time_monotonic();
8305 	time_elapsed = 0.001 + (current_time - os_last_printout);
8306 
8307 	fprintf(file,
8308 		"Pending flushes (fsync) log: " ULINTPF "; "
8309 		"buffer pool: " ULINTPF "\n"
8310 		ULINTPF " OS file reads, "
8311 		ULINTPF " OS file writes, "
8312 		ULINTPF " OS fsyncs\n",
8313 		fil_n_pending_log_flushes,
8314 		fil_n_pending_tablespace_flushes,
8315 		os_n_file_reads,
8316 		os_n_file_writes,
8317 		os_n_fsyncs);
8318 
8319 	if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
8320 		fprintf(file,
8321 			ULINTPF " pending preads, "
8322 			ULINTPF " pending pwrites\n",
8323 			os_n_pending_reads,
8324 			os_n_pending_writes);
8325 	}
8326 
8327 	if (os_n_file_reads == os_n_file_reads_old) {
8328 		avg_bytes_read = 0.0;
8329 	} else {
8330 		avg_bytes_read = (double) os_bytes_read_since_printout
8331 			/ (os_n_file_reads - os_n_file_reads_old);
8332 	}
8333 
8334 	fprintf(file,
8335 		"%.2f reads/s, %lu avg bytes/read,"
8336 		" %.2f writes/s, %.2f fsyncs/s\n",
8337 		(os_n_file_reads - os_n_file_reads_old)
8338 		/ time_elapsed,
8339 		(ulong) avg_bytes_read,
8340 		(os_n_file_writes - os_n_file_writes_old)
8341 		/ time_elapsed,
8342 		(os_n_fsyncs - os_n_fsyncs_old)
8343 		/ time_elapsed);
8344 
8345 	os_n_file_reads_old = os_n_file_reads;
8346 	os_n_file_writes_old = os_n_file_writes;
8347 	os_n_fsyncs_old = os_n_fsyncs;
8348 	os_bytes_read_since_printout = 0;
8349 
8350 	os_last_printout = current_time;
8351 }
8352 
8353 /** Refreshes the statistics used to print per-second averages. */
8354 void
os_aio_refresh_stats()8355 os_aio_refresh_stats()
8356 {
8357 	os_n_fsyncs_old = os_n_fsyncs;
8358 
8359 	os_bytes_read_since_printout = 0;
8360 
8361 	os_n_file_reads_old = os_n_file_reads;
8362 
8363 	os_n_file_writes_old = os_n_file_writes;
8364 
8365 	os_n_fsyncs_old = os_n_fsyncs;
8366 
8367 	os_bytes_read_since_printout = 0;
8368 
8369 	os_last_printout = ut_time_monotonic();
8370 }
8371 
8372 /** Checks that all slots in the system have been freed, that is, there are
8373 no pending io operations.
8374 @return true if all free */
8375 bool
os_aio_all_slots_free()8376 os_aio_all_slots_free()
8377 {
8378 	return(AIO::total_pending_io_count() == 0);
8379 }
8380 
8381 #ifdef UNIV_DEBUG
8382 /** Prints all pending IO for the array
8383 @param[in]	file	file where to print
8384 @param[in]	array	array to process */
8385 void
to_file(FILE * file) const8386 AIO::to_file(FILE* file) const
8387 {
8388 	acquire();
8389 
8390 	fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
8391 
8392 	for (ulint i = 0; i < m_slots.size(); ++i) {
8393 
8394 		const Slot&	slot = m_slots[i];
8395 
8396 		if (slot.is_reserved) {
8397 
8398 			fprintf(file,
8399 				"%s IO for %s (offset=" UINT64PF
8400 				", size=%lu)\n",
8401 				slot.type.is_read() ? "read" : "write",
8402 				slot.name, slot.offset, slot.len);
8403 		}
8404 	}
8405 
8406 	release();
8407 }
8408 
8409 /** Print pending IOs for all arrays */
8410 void
print_to_file(FILE * file)8411 AIO::print_to_file(FILE* file)
8412 {
8413 	fprintf(file, "Pending normal aio reads:");
8414 
8415 	s_reads->to_file(file);
8416 
8417 	if (s_writes != NULL) {
8418 		fprintf(file, "Pending normal aio writes:");
8419 		s_writes->to_file(file);
8420 	}
8421 
8422 	if (s_ibuf != NULL) {
8423 		fprintf(file, "Pending ibuf aio reads:");
8424 		s_ibuf->to_file(file);
8425 	}
8426 
8427 	if (s_log != NULL) {
8428 		fprintf(file, "Pending log i/o's:");
8429 		s_log->to_file(file);
8430 	}
8431 
8432 	if (s_sync != NULL) {
8433 		fprintf(file, "Pending sync i/o's:");
8434 		s_sync->to_file(file);
8435 	}
8436 }
8437 
8438 /** Prints all pending IO
8439 @param[in]	file		File where to print */
8440 void
os_aio_print_pending_io(FILE * file)8441 os_aio_print_pending_io(
8442 	FILE*	file)
8443 {
8444 	AIO::print_to_file(file);
8445 }
8446 
8447 #endif /* UNIV_DEBUG */
8448 
8449 /**
8450 Set the file create umask
8451 @param[in]	umask		The umask to use for file creation. */
8452 void
os_file_set_umask(ulint umask)8453 os_file_set_umask(ulint umask)
8454 {
8455 	os_innodb_umask = umask;
8456 }
8457 #else
8458 
8459 #include "univ.i"
8460 #include "db0err.h"
8461 #include "mach0data.h"
8462 #include "fil0fil.h"
8463 #include "os0file.h"
8464 
8465 #include <lz4.h>
8466 #include <zlib.h>
8467 
8468 #include <my_aes.h>
8469 #include <my_rnd.h>
8470 #include <mysqld.h>
8471 #include <mysql/service_mysql_keyring.h>
8472 
8473 typedef byte	Block;
8474 
8475 /** Allocate a page for sync IO
8476 @return pointer to page */
8477 static
8478 Block*
os_alloc_block()8479 os_alloc_block()
8480 {
8481 	return(reinterpret_cast<byte*>(malloc(UNIV_PAGE_SIZE_MAX * 2)));
8482 }
8483 
8484 /** Free a page after sync IO
8485 @param[in,own]	block		The block to free/release */
8486 static
8487 void
os_free_block(Block * block)8488 os_free_block(Block* block)
8489 {
8490 	ut_free(block);
8491 }
8492 
8493 #endif /* !UNIV_INNOCHECKSUM */
8494 
8495 /** Minimum length needed for encryption: two AES blocks plus FIL_PAGE_DATA bytes */
8496 const unsigned int MIN_ENCRYPTION_LEN = 2 * MY_AES_BLOCK_SIZE + FIL_PAGE_DATA;
8497 
8498 /**
8499 @param[in]      type            The compression type
8500 @return the string representation */
8501 const char*
to_string(Type type)8502 Compression::to_string(Type type)
8503 {
8504         switch(type) {
8505         case NONE:
8506                 return("None");
8507         case ZLIB:
8508                 return("Zlib");
8509         case LZ4:
8510                 return("LZ4");
8511         }
8512 
8513         ut_ad(0);
8514 
8515         return("<UNKNOWN>");
8516 }
8517 
8518 /**
8519 @param[in]      meta		Page Meta data
8520 @return the string representation */
to_string(const Compression::meta_t & meta)8521 std::string Compression::to_string(const Compression::meta_t& meta)
8522 {
8523 	std::ostringstream	stream;
8524 
8525 	stream	<< "version: " << int(meta.m_version) << " "
8526 		<< "algorithm: " << meta.m_algorithm << " "
8527 		<< "(" << to_string(meta.m_algorithm) << ") "
8528 		<< "orginal_type: " << meta.m_original_type << " "
8529 		<< "original_size: " << meta.m_original_size << " "
8530 		<< "compressed_size: " << meta.m_compressed_size;
8531 
8532 	return(stream.str());
8533 }
8534 
8535 /** @return true if it is a compressed page */
8536 bool
is_compressed_page(const byte * page)8537 Compression::is_compressed_page(const byte* page)
8538 {
8539 	return(mach_read_from_2(page + FIL_PAGE_TYPE) == FIL_PAGE_COMPRESSED);
8540 }
8541 
8542 bool
is_compressed_encrypted_page(const byte * page)8543 Compression::is_compressed_encrypted_page(const byte *page) {
8544 	return (mach_read_from_2(page + FIL_PAGE_TYPE) ==
8545 		FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
8546 }
8547 
8548 bool
is_valid_page_version(uint8_t version)8549 Compression::is_valid_page_version(uint8_t version) {
8550 	return (version == FIL_PAGE_VERSION_1 || version == FIL_PAGE_VERSION_2);
8551 }
8552 
8553 /** Deserizlise the page header compression meta-data
8554 @param[in]	page		Pointer to the page header
8555 @param[out]	control		Deserialised data */
8556 void
deserialize_header(const byte * page,Compression::meta_t * control)8557 Compression::deserialize_header(
8558 	const byte*		page,
8559 	Compression::meta_t*	control)
8560 {
8561 	ut_ad(is_compressed_page(page) || is_compressed_encrypted_page(page));
8562 
8563 	control->m_version = static_cast<uint8_t>(
8564 		mach_read_from_1(page + FIL_PAGE_VERSION));
8565 
8566 	control->m_original_type = static_cast<uint16_t>(
8567 		mach_read_from_2(page + FIL_PAGE_ORIGINAL_TYPE_V1));
8568 
8569 	control->m_compressed_size = static_cast<uint16_t>(
8570 		mach_read_from_2(page + FIL_PAGE_COMPRESS_SIZE_V1));
8571 
8572 	control->m_original_size = static_cast<uint16_t>(
8573 		mach_read_from_2(page + FIL_PAGE_ORIGINAL_SIZE_V1));
8574 
8575 	control->m_algorithm = static_cast<Type>(
8576 		mach_read_from_1(page + FIL_PAGE_ALGORITHM_V1));
8577 }
8578 
8579 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
8580 not then the source contents are left unchanged and DB_SUCCESS is returned.
8581 @param[in]	dblwr_recover	true if double write recovery is in progress
8582 @param[in,out]	src		Data read from disk, decompressed data will be
8583 				copied to this page
8584 @param[in,out]	dst		Scratch area to use for decompression
8585 @param[in]	dst_len		Size of the scratch area in bytes
8586 @return DB_SUCCESS or error code */
8587 dberr_t
deserialize(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)8588 Compression::deserialize(
8589 	bool		dblwr_recover,
8590 	byte*		src,
8591 	byte*		dst,
8592 	ulint		dst_len)
8593 {
8594 	if (!is_compressed_page(src)) {
8595 		/* There is nothing we can do. */
8596 		return(DB_SUCCESS);
8597 	}
8598 
8599 	meta_t	header;
8600 
8601 	deserialize_header(src, &header);
8602 
8603 	byte*	ptr = src + FIL_PAGE_DATA;
8604 
8605 	ut_ad(is_valid_page_version(header.m_version));
8606 
8607 	if (!is_valid_page_version(header.m_version)
8608 	    || header.m_original_size < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8)
8609 	    || header.m_original_size > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA
8610 	    || dst_len < header.m_original_size + FIL_PAGE_DATA) {
8611 
8612 		/* The last check could potentially return DB_OVERFLOW,
8613 		the caller should be able to retry with a larger buffer. */
8614 
8615 		return(DB_CORRUPTION);
8616 	}
8617 
8618 	Block*	block;
8619 
8620 	/* The caller doesn't know what to expect */
8621 	if (dst == NULL) {
8622 
8623 		block = os_alloc_block();
8624 
8625 #ifdef UNIV_INNOCHECKSUM
8626 		dst = block;
8627 #else
8628 		dst = block->m_ptr;
8629 #endif /* UNIV_INNOCHECKSUM */
8630 
8631 	} else {
8632 		block = NULL;
8633 	}
8634 
8635 	int		ret;
8636 	Compression	compression;
8637 	ulint		len = header.m_original_size;
8638 
8639 	compression.m_type = static_cast<Compression::Type>(header.m_algorithm);
8640 
8641 	switch(compression.m_type) {
8642 	case Compression::ZLIB: {
8643 
8644 		uLongf	zlen = header.m_original_size;
8645 
8646 		if (uncompress(dst, &zlen, ptr, header.m_compressed_size)
8647 		    != Z_OK) {
8648 
8649 			if (block != NULL) {
8650 				os_free_block(block);
8651 			}
8652 
8653 			return(DB_IO_DECOMPRESS_FAIL);
8654 		}
8655 
8656 		len = static_cast<ulint>(zlen);
8657 
8658 		break;
8659 	}
8660 
8661 	case Compression::LZ4:
8662 
8663                 ret = LZ4_decompress_safe(
8664                         reinterpret_cast<char*>(ptr),
8665                         reinterpret_cast<char*>(dst),
8666                         header.m_compressed_size,
8667                         header.m_original_size);
8668 		if (ret < 0) {
8669 
8670 			if (block != NULL) {
8671 				os_free_block(block);
8672 			}
8673 
8674 			return(DB_IO_DECOMPRESS_FAIL);
8675 		}
8676 
8677 		break;
8678 
8679 	default:
8680 #if !defined(UNIV_INNOCHECKSUM)
8681 		ib::error()
8682 			<< "Compression algorithm support missing: "
8683 			<< Compression::to_string(compression.m_type);
8684 #else
8685 		fprintf(stderr, "Compression algorithm support missing: %s\n",
8686 			Compression::to_string(compression.m_type));
8687 #endif /* !UNIV_INNOCHECKSUM */
8688 
8689 		if (block != NULL) {
8690 			os_free_block(block);
8691 		}
8692 
8693 		return(DB_UNSUPPORTED);
8694 	}
8695 
8696 	/* Leave the header alone */
8697 	memmove(src + FIL_PAGE_DATA, dst, len);
8698 
8699 	mach_write_to_2(src + FIL_PAGE_TYPE, header.m_original_type);
8700 
8701 	ut_ad(dblwr_recover
8702 	      || memcmp(src + FIL_PAGE_LSN + 4,
8703 			src + (header.m_original_size + FIL_PAGE_DATA)
8704 			- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);
8705 
8706 	if (block != NULL) {
8707 		os_free_block(block);
8708 	}
8709 
8710 	return(DB_SUCCESS);
8711 }
8712 
8713 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
8714 not then the source contents are left unchanged and DB_SUCCESS is returned.
8715 @param[in]	dblwr_recover	true of double write recovery in progress
8716 @param[in,out]	src		Data read from disk, decompressed data will be
8717 				copied to this page
8718 @param[in,out]	dst		Scratch area to use for decompression
8719 @param[in]	dst_len		Size of the scratch area in bytes
8720 @return DB_SUCCESS or error code */
8721 dberr_t
os_file_decompress_page(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)8722 os_file_decompress_page(
8723 	bool		dblwr_recover,
8724 	byte*		src,
8725 	byte*		dst,
8726 	ulint		dst_len)
8727 {
8728 	return(Compression::deserialize(dblwr_recover, src, dst, dst_len));
8729 }
8730 
8731 /**
8732 @param[in]      type            The encryption type
8733 @return the string representation */
8734 const char*
to_string(Type type)8735 Encryption::to_string(Type type)
8736 {
8737         switch(type) {
8738         case NONE:
8739                 return("N");
8740         case AES:
8741                 return("Y");
8742         }
8743 
8744         ut_ad(0);
8745 
8746         return("<UNKNOWN>");
8747 }
8748 
8749 /** Generate random encryption value for key and iv.
8750 @param[in,out]	value	Encryption value */
random_value(byte * value)8751 void Encryption::random_value(byte* value)
8752 {
8753 	ut_ad(value != NULL);
8754 
8755 	my_rand_buffer(value, ENCRYPTION_KEY_LEN);
8756 }
8757 
8758 /** Create new master key for key rotation.
8759 @param[in,out]	master_key	master key */
8760 void
create_master_key(byte ** master_key)8761 Encryption::create_master_key(byte** master_key)
8762 {
8763 #ifndef UNIV_INNOCHECKSUM
8764 	char*	key_type = NULL;
8765 	size_t	key_len;
8766 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8767 	int	ret;
8768 
8769 	/* If uuid does not match with current server uuid,
8770 	set uuid as current server uuid. */
8771 	if (strcmp(uuid, server_uuid) != 0) {
8772 		memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8773 	}
8774 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8775 
8776 	/* Generate new master key */
8777 	ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8778 		    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8779 		    uuid, master_key_id + 1);
8780 
8781 	/* We call key ring API to generate master key here. */
8782 	ret = my_key_generate(key_name, "AES",
8783 			      NULL, ENCRYPTION_KEY_LEN);
8784 
8785 	/* We call key ring API to get master key here. */
8786 	ret = my_key_fetch(key_name, &key_type, NULL,
8787 			   reinterpret_cast<void**>(master_key),
8788 			   &key_len);
8789 
8790 	if (ret || *master_key == NULL) {
8791 		ib::error() << "Encryption can't find master key, please check"
8792 				" the keyring plugin is loaded.";
8793 		*master_key = NULL;
8794 	} else {
8795 		master_key_id++;
8796 	}
8797 
8798 	if (key_type) {
8799 		my_free(key_type);
8800 	}
8801 #endif
8802 }
8803 
8804 /** Get master key by key id.
8805 @param[in]	master_key_id	master key id
8806 @param[in]	srv_uuid	uuid of server instance
8807 @param[in,out]	master_key	master key */
8808 void
get_master_key(ulint master_key_id,char * srv_uuid,byte ** master_key)8809 Encryption::get_master_key(ulint master_key_id,
8810 			   char* srv_uuid,
8811 			   byte** master_key)
8812 {
8813 #ifndef UNIV_INNOCHECKSUM
8814 	char*	key_type = NULL;
8815 	size_t	key_len;
8816 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8817 	int	ret;
8818 
8819 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8820 
8821 	if (srv_uuid != NULL) {
8822 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8823 			    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8824 			    srv_uuid, master_key_id);
8825 	} else {
8826 		/* For compitable with 5.7.11, we need to get master key with
8827 		server id. */
8828 		memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8829 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8830 			    "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8831 			    server_id, master_key_id);
8832 	}
8833 
8834 	/* We call key ring API to get master key here. */
8835 	ret = my_key_fetch(key_name, &key_type, NULL,
8836 			   reinterpret_cast<void**>(master_key), &key_len);
8837 
8838 	if (key_type) {
8839 		my_free(key_type);
8840 	}
8841 
8842 	if (ret) {
8843 		*master_key = NULL;
8844 		ib::error() << "Encryption can't find master key, please check"
8845 				" the keyring plugin is loaded.";
8846 	}
8847 
8848 #ifdef UNIV_ENCRYPT_DEBUG
8849 	if (!ret && *master_key) {
8850 		fprintf(stderr, "Fetched master key:%lu ", master_key_id);
8851 		ut_print_buf(stderr, *master_key, key_len);
8852 		fprintf(stderr, "\n");
8853 	}
8854 #endif /* DEBUG_TDE */
8855 
8856 #endif
8857 }
8858 
/** Id of the current master key. It starts at 0; the three-argument
get_master_key() overload generates the first key while it is still 0, and
the id is incremented each time a new master key is successfully fetched. */
ulint	Encryption::master_key_id = 0;

/** Current uuid of server instance, used when composing master key names.
Sized with one extra byte beyond ENCRYPTION_SERVER_UUID_LEN and
zero-initialized. */
char	Encryption::uuid[ENCRYPTION_SERVER_UUID_LEN + 1] = {0};
8864 
8865 /** Get current master key and master key id
8866 @param[in,out]	master_key_id	master key id
8867 @param[in,out]	master_key	master key
8868 @param[in,out]	version		encryption information version */
8869 void
get_master_key(ulint * master_key_id,byte ** master_key,Encryption::Version * version)8870 Encryption::get_master_key(ulint* master_key_id,
8871 			   byte** master_key,
8872 			   Encryption::Version*  version)
8873 {
8874 #ifndef UNIV_INNOCHECKSUM
8875 	char*	key_type = NULL;
8876 	size_t	key_len;
8877 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8878 	int	ret;
8879 
8880 	memset(key_name, 0, ENCRYPTION_KEY_LEN);
8881 	*version = Encryption::ENCRYPTION_VERSION_2;
8882 
8883 	if (Encryption::master_key_id == 0) {
8884 		/* If m_master_key is 0, means there's no encrypted
8885 		tablespace, we need to generate the first master key,
8886 		and store it to key ring. */
8887 		memset(uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
8888 		memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8889 
8890 		/* Prepare the server uuid. */
8891 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8892 			    "%s-%s-1", ENCRYPTION_MASTER_KEY_PRIFIX,
8893 			    uuid);
8894 
8895 		/* We call key ring API to generate master key here. */
8896 		ret = my_key_generate(key_name, "AES",
8897 				      NULL, ENCRYPTION_KEY_LEN);
8898 
8899 		/* We call key ring API to get master key here. */
8900 		ret = my_key_fetch(key_name, &key_type, NULL,
8901 				   reinterpret_cast<void**>(master_key),
8902 				   &key_len);
8903 
8904 		if (!ret && *master_key != NULL) {
8905 			Encryption::master_key_id++;
8906 			*master_key_id = Encryption::master_key_id;
8907 		}
8908 #ifdef UNIV_ENCRYPT_DEBUG
8909 		if (!ret && *master_key) {
8910 			fprintf(stderr, "Generated new master key:");
8911 			ut_print_buf(stderr, *master_key, key_len);
8912 			fprintf(stderr, "\n");
8913 		}
8914 #endif
8915 	} else {
8916 		*master_key_id = Encryption::master_key_id;
8917 
8918 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8919 			    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8920 			    uuid, *master_key_id);
8921 
8922 		/* We call key ring API to get master key here. */
8923 		ret = my_key_fetch(key_name, &key_type, NULL,
8924 				   reinterpret_cast<void**>(master_key),
8925 				   &key_len);
8926 
8927 		/* For compitable with 5.7.11, we need to try to get master key with
8928 		server id when get master key with server uuid failure. */
8929 		if (ret || *master_key == NULL) {
8930 			if (key_type) {
8931 				my_free(key_type);
8932 			}
8933 
8934 			memset(key_name, 0,
8935 			       ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8936 			ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8937 				    "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8938 				    server_id, *master_key_id);
8939 
8940 			ret = my_key_fetch(key_name, &key_type, NULL,
8941 					   reinterpret_cast<void**>(master_key),
8942 					   &key_len);
8943 			*version = Encryption::ENCRYPTION_VERSION_1;
8944 		}
8945 #ifdef UNIV_ENCRYPT_DEBUG
8946 		if (!ret && *master_key) {
8947 			fprintf(stderr, "Fetched master key:%lu ",
8948 				*master_key_id);
8949 			ut_print_buf(stderr, *master_key, key_len);
8950 			fprintf(stderr, "\n");
8951 		}
8952 #endif
8953 	}
8954 
8955 	if (ret) {
8956 		*master_key = NULL;
8957 		ib::error() << "Encryption can't find master key, please check"
8958 				" the keyring plugin is loaded.";
8959 	}
8960 
8961 	if (key_type) {
8962 		my_free(key_type);
8963 	}
8964 #endif
8965 }
8966 
8967 /** Check if page is encrypted page or not
8968 @param[in]	page	page which need to check
8969 @return true if it is a encrypted page */
8970 bool
is_encrypted_page(const byte * page)8971 Encryption::is_encrypted_page(const byte* page)
8972 {
8973 	ulint	page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
8974 
8975 	return(page_type == FIL_PAGE_ENCRYPTED
8976 	       || page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED
8977 	       || page_type == FIL_PAGE_ENCRYPTED_RTREE);
8978 }
8979 
8980 /** Encrypt the page data contents. Page type can't be
8981 FIL_PAGE_ENCRYPTED, FIL_PAGE_COMPRESSED_AND_ENCRYPTED,
8982 FIL_PAGE_ENCRYPTED_RTREE.
8983 @param[in]	type		IORequest
8984 @param[in,out]	src		page data which need to encrypt
8985 @param[in]	src_len		Size of the source in bytes
8986 @param[in,out]	dst		destination area
8987 @param[in,out]	dst_len		Size of the destination in bytes
8988 @return buffer data, dst_len will have the length of the data */
8989 byte*
encrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint * dst_len)8990 Encryption::encrypt(
8991 	const IORequest&	type,
8992 	byte*			src,
8993 	ulint			src_len,
8994 	byte*			dst,
8995 	ulint*			dst_len)
8996 {
8997 	ut_ad(m_type != NONE);
8998 	ut_ad(!type.is_log());
8999 #ifdef UNIV_ENCRYPT_DEBUG
9000 	ulint space_id =
9001 		mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9002 	ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9003 
9004 	fprintf(stderr, "Encrypting page:%lu.%lu len:%lu\n",
9005 		space_id, page_no, src_len);
9006 #endif
9007 
9008 	/* Shouldn't encrypte an already encrypted page. */
9009 	ut_ad(!is_encrypted_page(src));
9010 
9011 	const uint16_t page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9012 
9013 	/* This is data size which need to encrypt. */
9014 	ulint src_enc_len = src_len;
9015 
9016 	/* In FIL_PAGE_VERSION_2, we encrypt the actual compressed data length. */
9017 	if (page_type == FIL_PAGE_COMPRESSED) {
9018 		src_enc_len = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1) +
9019 					       FIL_PAGE_DATA;
9020 		/* Extend src_enc_len if needed */
9021 		if (src_enc_len < MIN_ENCRYPTION_LEN) {
9022 			src_enc_len = MIN_ENCRYPTION_LEN;
9023 		}
9024 		ut_a(src_enc_len <= src_len);
9025 	}
9026 
9027 	/* Only encrypt the data + trailer, leave the header alone */
9028 
9029 	switch (m_type) {
9030 	case Encryption::NONE:
9031 		ut_error;
9032 
9033 	case Encryption::AES: {
9034 		ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9035 
9036 		/* Total length of the data to encrypt. */
9037 		const ulint data_len = src_enc_len - FIL_PAGE_DATA;
9038 
9039 		/* Server encryption functions expect input data to be in
9040 		multiples of MY_AES_BLOCK SIZE. Therefore we encrypt the
9041 		overlapping data of the chunk_len and trailer_len twice.
9042 		First we encrypt the bigger chunk of data then we do the
9043 		trailer. The trailer encryption block starts at
9044 		2 * MY_AES_BLOCK_SIZE bytes offset from the end of the enc_len.
9045 		During decryption we do the reverse of the above process. */
9046 		ut_ad(data_len >= 2 * MY_AES_BLOCK_SIZE);
9047 
9048 		const ulint chunk_len =
9049 			 (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9050 		const ulint remain_len = data_len - chunk_len;
9051 
9052 		lint elen = my_aes_encrypt(
9053 			src + FIL_PAGE_DATA, static_cast<uint32>(chunk_len),
9054 			dst + FIL_PAGE_DATA, reinterpret_cast<byte *>(m_key),
9055 			static_cast<uint32>(m_klen), my_aes_256_cbc,
9056 			reinterpret_cast<byte *>(m_iv), false);
9057 
9058 		if (elen == MY_AES_BAD_DATA) {
9059 			ulint	page_no =mach_read_from_4(
9060 				src + FIL_PAGE_OFFSET);
9061 			ulint	space_id = mach_read_from_4(
9062 				src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9063 			*dst_len = src_len;
9064 #ifndef UNIV_INNOCHECKSUM
9065 				ib::warn()
9066 					<< " Can't encrypt data of page,"
9067 					<< " page no:" << page_no
9068 					<< " space id:" << space_id;
9069 #else
9070 				fprintf(stderr, " Can't encrypt data of page,"
9071 					" page no:" ULINTPF
9072 					" space id:" ULINTPF,
9073 					page_no, space_id);
9074 #endif /* !UNIV_INNOCHECKSUM */
9075 			return(src);
9076 		}
9077 
9078 		const ulint len = static_cast<ulint>(elen);
9079 		ut_ad(len == chunk_len);
9080 
9081 		/* Encrypt the trailing bytes. */
9082 		if (remain_len != 0) {
9083 			/* Copy remaining bytes and page tailer. */
9084 			memcpy(dst + FIL_PAGE_DATA + len,
9085 			       src + FIL_PAGE_DATA + len,
9086 			       remain_len);
9087 
9088 			const ulint trailer_len = MY_AES_BLOCK_SIZE * 2;
9089 			byte buf[trailer_len];
9090 
9091 			elen = my_aes_encrypt(
9092 				dst + FIL_PAGE_DATA + data_len - trailer_len,
9093 				static_cast<uint32>(trailer_len), buf,
9094 				reinterpret_cast<unsigned char*>(m_key),
9095 				static_cast<uint32>(m_klen), my_aes_256_cbc,
9096 				reinterpret_cast<byte *>(m_iv), false);
9097 
9098 			if (elen == MY_AES_BAD_DATA) {
9099 				ulint	page_no =mach_read_from_4(
9100 					src + FIL_PAGE_OFFSET);
9101 				ulint	space_id = mach_read_from_4(
9102 					src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9103 #ifndef UNIV_INNOCHECKSUM
9104 				ib::warn()
9105 					<< " Can't encrypt data of page,"
9106 					<< " page no:" << page_no
9107 					<< " space id:" << space_id;
9108 #else
9109 				fprintf(stderr, " Can't encrypt data of page,"
9110 					" page no:" ULINTPF
9111 					" space id:" ULINTPF,
9112 					page_no, space_id);
9113 #endif /* !UNIV_INNOCHECKSUM */
9114 				*dst_len = src_len;
9115 				return(src);
9116 			}
9117 
9118 			ut_a(static_cast<ulint>(elen) == trailer_len);
9119 
9120 			memcpy(dst + FIL_PAGE_DATA + data_len - trailer_len,
9121 			       buf, trailer_len);
9122 		}
9123 
9124 
9125 		break;
9126 	}
9127 
9128 	default:
9129 		ut_error;
9130 	}
9131 
9132 	/* Copy the header as is. */
9133 	memmove(dst, src, FIL_PAGE_DATA);
9134 	ut_ad(memcmp(src, dst, FIL_PAGE_DATA) == 0);
9135 
9136 	/* Add encryption control information. Required for decrypting. */
9137 	if (page_type == FIL_PAGE_COMPRESSED) {
9138 		/* If the page is compressed, we don't need to save the
9139 		original type, since it is done in compression already. */
9140 		mach_write_to_2(dst + FIL_PAGE_TYPE,
9141 				FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9142 		ut_ad(memcmp(src+FIL_PAGE_TYPE+2,
9143 			     dst+FIL_PAGE_TYPE+2,
9144 			     FIL_PAGE_DATA-FIL_PAGE_TYPE-2) == 0);
9145 	} else if (page_type == FIL_PAGE_RTREE) {
9146 		/* If the page is R-tree page, we need to save original type. */
9147 		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED_RTREE);
9148 	} else{
9149 		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED);
9150 		mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
9151 	}
9152 
9153 #ifdef UNIV_ENCRYPT_DEBUG
9154 #ifndef UNIV_INNOCHECKSUM
9155 #if 0
9156 	byte*	check_buf = static_cast<byte*>(ut_malloc_nokey(src_len));
9157 	byte*	buf2 = static_cast<byte*>(ut_malloc_nokey(src_len));
9158 
9159 	memcpy(check_buf, dst, src_len);
9160 
9161 	dberr_t err = decrypt(type, check_buf, src_len, buf2, src_len);
9162 	if (err != DB_SUCCESS || memcmp(src + FIL_PAGE_DATA,
9163 					check_buf + FIL_PAGE_DATA,
9164 					src_len - FIL_PAGE_DATA) != 0) {
9165 		ut_print_buf(stderr, src, src_len);
9166 		ut_print_buf(stderr, check_buf, src_len);
9167 		ut_ad(0);
9168 	}
9169 	ut_free(buf2);
9170 	ut_free(check_buf);
9171 #endif
9172 	fprintf(stderr, "Encrypted page:%lu.%lu\n", space_id, page_no);
9173 #endif
9174 #endif
9175 
9176 	/* Add padding 0 for unused portion */
9177 	if (src_len > src_enc_len) {
9178 		memset(dst + src_enc_len, 0, src_len - src_enc_len);
9179 	}
9180 
9181 	*dst_len = src_len;
9182 
9183 	return(dst);
9184 }
9185 
9186 /** Decrypt the page data contents. Page type must be FIL_PAGE_ENCRYPTED,
9187 if not then the source contents are left unchanged and DB_SUCCESS is returned.
9188 @param[in]	type		IORequest
9189 @param[in,out]	src		Data read from disk, decrypted data will be
9190 				copied to this page
9191 @param[in]	src_len		source data length
9192 @param[in,out]	dst		Scratch area to use for decryption
9193 @param[in]	dst_len		Size of the scratch area in bytes
9194 @return DB_SUCCESS or error code */
9195 dberr_t
decrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)9196 Encryption::decrypt(
9197 	const IORequest&	type,
9198 	byte*			src,
9199 	ulint			src_len,
9200 	byte*			dst,
9201 	ulint			dst_len)
9202 {
9203 	ulint		data_len;
9204 	ulint		main_len;
9205 	ulint		remain_len;
9206 	ulint		original_type;
9207 	ulint		page_type;
9208 	byte		remain_buf[MY_AES_BLOCK_SIZE * 2];
9209 	Block*		block;
9210 
9211 	/* Do nothing if it's not an encrypted table. */
9212 	if (!is_encrypted_page(src)) {
9213 		return(DB_SUCCESS);
9214 	}
9215 
9216 	/* For compressed page, we need to get the compressed size
9217 	for decryption */
9218 	page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9219 	if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
9220 		src_len = static_cast<uint16_t>(
9221 			mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1))
9222 			+ FIL_PAGE_DATA;
9223 #ifndef UNIV_INNOCHECKSUM
9224 		Compression::meta_t header;
9225 		Compression::deserialize_header(src, &header);
9226 		if (header.m_version == Compression::FIL_PAGE_VERSION_1) {
9227 			src_len = ut_calc_align(src_len, type.block_size());
9228 		} else {
9229 			/* Extend src_len if needed */
9230 			if (src_len < MIN_ENCRYPTION_LEN) {
9231 				src_len = MIN_ENCRYPTION_LEN;
9232 			}
9233 		}
9234 #endif
9235 	}
9236 #ifdef UNIV_ENCRYPT_DEBUG
9237 	ulint space_id =
9238 		mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9239 	ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9240 
9241 	fprintf(stderr, "Decrypting page:%lu.%lu len:%lu\n",
9242 		space_id, page_no, src_len);
9243 #endif
9244 
9245 	original_type = static_cast<uint16_t>(
9246 		mach_read_from_2(src + FIL_PAGE_ORIGINAL_TYPE_V1));
9247 
9248 	byte*	ptr = src + FIL_PAGE_DATA;
9249 
9250 	/* The caller doesn't know what to expect */
9251 	if (dst == NULL) {
9252 
9253 		block = os_alloc_block();
9254 #ifdef UNIV_INNOCHECKSUM
9255 		dst = block;
9256 #else
9257 		dst = block->m_ptr;
9258 #endif /* UNIV_INNOCHECKSUM */
9259 
9260 	} else {
9261 		block = NULL;
9262 	}
9263 
9264 	data_len = src_len - FIL_PAGE_DATA;
9265 	main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9266 	remain_len = data_len - main_len;
9267 
9268 	switch(m_type) {
9269 	case Encryption::AES: {
9270 		lint			elen;
9271 
9272 		/* First decrypt the last 2 blocks data of data, since
9273 		data is no block aligned. */
9274 		if (remain_len != 0) {
9275 			ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9276 
9277 			remain_len = MY_AES_BLOCK_SIZE * 2;
9278 
9279 			/* Copy the last 2 blocks. */
9280 			memcpy(remain_buf,
9281 			       ptr + data_len - remain_len,
9282 			       remain_len);
9283 
9284 			elen = my_aes_decrypt(
9285 				remain_buf,
9286 				static_cast<uint32>(remain_len),
9287 				dst + data_len - remain_len,
9288 				reinterpret_cast<unsigned char*>(m_key),
9289 				static_cast<uint32>(m_klen),
9290 				my_aes_256_cbc,
9291 				reinterpret_cast<unsigned char*>(m_iv),
9292 				false);
9293 			if (elen == MY_AES_BAD_DATA) {
9294 				if (block != NULL) {
9295 					os_free_block(block);
9296 				}
9297 
9298 				return(DB_IO_DECRYPT_FAIL);
9299 			}
9300 
9301 			/* Copy the other data bytes to temp area. */
9302 			memcpy(dst, ptr, data_len - remain_len);
9303 		} else {
9304 			ut_ad(data_len == main_len);
9305 
9306 			/* Copy the data bytes to temp area. */
9307 			memcpy(dst, ptr, data_len);
9308 		}
9309 
9310 		/* Then decrypt the main data */
9311 		elen = my_aes_decrypt(
9312 				dst,
9313 				static_cast<uint32>(main_len),
9314 				ptr,
9315 				reinterpret_cast<unsigned char*>(m_key),
9316 				static_cast<uint32>(m_klen),
9317 				my_aes_256_cbc,
9318 				reinterpret_cast<unsigned char*>(m_iv),
9319 				false);
9320 		if (elen == MY_AES_BAD_DATA) {
9321 
9322 			if (block != NULL) {
9323 				os_free_block(block);
9324 			}
9325 
9326 			return(DB_IO_DECRYPT_FAIL);
9327 		}
9328 
9329 		ut_ad(static_cast<ulint>(elen) == main_len);
9330 
9331 		/* Copy the remain bytes. */
9332 		memcpy(ptr + main_len, dst + main_len, data_len - main_len);
9333 
9334 		break;
9335 	}
9336 
9337 	default:
9338 		if (!type.is_dblwr_recover()) {
9339 #if !defined(UNIV_INNOCHECKSUM)
9340 			ib::error()
9341 				<< "Encryption algorithm support missing: "
9342 				<< Encryption::to_string(m_type);
9343 #else
9344 			fprintf(stderr, "Encryption algorithm support missing: %s\n",
9345 				Encryption::to_string(m_type));
9346 #endif /* !UNIV_INNOCHECKSUM */
9347 		}
9348 
9349 		if (block != NULL) {
9350 			os_free_block(block);
9351 		}
9352 
9353 		return(DB_UNSUPPORTED);
9354 	}
9355 
9356 	/* Restore the original page type. If it's a compressed and
9357 	encrypted page, just reset it as compressed page type, since
9358 	we will do uncompress later. */
9359 
9360 	if (page_type == FIL_PAGE_ENCRYPTED) {
9361 		mach_write_to_2(src + FIL_PAGE_TYPE, original_type);
9362 		mach_write_to_2(src + FIL_PAGE_ORIGINAL_TYPE_V1, 0);
9363 	} else if (page_type == FIL_PAGE_ENCRYPTED_RTREE) {
9364 		mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_RTREE);
9365 	} else {
9366 		ut_ad(page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9367 		mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
9368 	}
9369 
9370 	if (block != NULL) {
9371 		os_free_block(block);
9372 	}
9373 
9374 #ifdef UNIV_ENCRYPT_DEBUG
9375 	fprintf(stderr, "Decrypted page:%lu.%lu\n", space_id, page_no);
9376 #endif
9377 
9378 	DBUG_EXECUTE_IF("ib_crash_during_decrypt_page", DBUG_SUICIDE(););
9379 
9380 	return(DB_SUCCESS);
9381 }
9382 
9383 /** Normalizes a directory path for the current OS:
9384 On Windows, we convert '/' to '\', else we convert '\' to '/'.
9385 @param[in,out] str A null-terminated directory and file path */
9386 void
os_normalize_path(char * str)9387 os_normalize_path(
9388 	char*	str)
9389 {
9390 	if (str != NULL) {
9391 		for (; *str; str++) {
9392 			if (*str == OS_PATH_SEPARATOR_ALT) {
9393 				*str = OS_PATH_SEPARATOR;
9394 			}
9395 		}
9396 	}
9397 }
9398