1 /***********************************************************************
2 
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, 2016, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 ***********************************************************************/
34 
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38 
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41 
42 #ifndef UNIV_INNOCHECKSUM
43 
44 #include "ha_prototypes.h"
45 #include "sql_const.h"
46 #include "system_key.h"
47 
48 #include "os0file.h"
49 
50 #include "fil0crypt.h"
51 #include "system_key.h"
52 
53 #ifdef UNIV_NONINL
54 #include "os0file.ic"
55 #endif
56 
57 #include "page0page.h"
58 #include "srv0srv.h"
59 #include "srv0start.h"
60 #include "fil0fil.h"
61 #include "btr0types.h"
62 #include "trx0trx.h"
63 #ifndef UNIV_HOTBACKUP
64 # include "os0event.h"
65 # include "os0thread.h"
66 #else /* !UNIV_HOTBACKUP */
67 # ifdef _WIN32
68 /* Add includes for the _stat() call to compile on Windows */
69 #  include <sys/types.h>
70 #  include <sys/stat.h>
71 #  include <errno.h>
72 # endif /* _WIN32 */
73 #endif /* !UNIV_HOTBACKUP */
74 
75 #include <vector>
76 #include <functional>
77 
78 #include "fil0crypt.h"
79 
80 #ifdef LINUX_NATIVE_AIO
81 #include <libaio.h>
82 #endif /* LINUX_NATIVE_AIO */
83 
84 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
85 # include <fcntl.h>
86 # include <linux/falloc.h>
87 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
88 
89 #include <lz4.h>
90 #include <zlib.h>
91 
92 #ifdef UNIV_DEBUG
93 /** Set when InnoDB has invoked exit(). */
94 bool	innodb_calling_exit;
95 #include <ut0ut.h>
96 #endif /* UNIV_DEBUG */
97 
98 #include <my_aes.h>
99 #include <my_rnd.h>
100 #include <mysqld.h>
101 #include "fil0crypt.h"
102 #include <mysql/service_mysql_keyring.h>
103 #include "buf0buf.h"
104 
/** Insert buffer segment id: global AIO segment 0 is the one serviced
by the ibuf i/o thread */
static const ulint IO_IBUF_SEGMENT = 0;

/** Log segment id: global AIO segment 1 is the one serviced by the
log i/o thread */
static const ulint IO_LOG_SEGMENT = 1;

/** Number of retries for partial I/O's */
static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
113 
114 /** Blocks for doing IO, used in the transparent compression
115 and encryption code. */
116 struct Block {
117 	/** Default constructor */
BlockBlock118 	Block() : m_ptr(), m_in_use() { }
119 
120 	byte*		m_ptr;
121 
122 	byte		pad[CACHE_LINE_SIZE - sizeof(ulint)];
123 	lock_word_t	m_in_use;
124 };
125 
/** For storing the allocated blocks */
typedef std::vector<Block> Blocks;

/** Block collection; os_alloc_block() hands out its entries and
os_free_block() marks them free again */
static Blocks*	block_cache;

/** Number of blocks to allocate for sync read/writes */
static const size_t	MAX_BLOCKS = 128;

/** Block buffer size: 1.3 times UNIV_PAGE_SIZE, truncated to ulint */
#define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))

/** Disk sector size of aligning write buffer for DIRECT_IO;
starts out as UNIV_SECTOR_SIZE */
static ulint	os_io_ptr_align = UNIV_SECTOR_SIZE;
140 
/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask */

#ifndef _WIN32
/** Umask for creating files; the initial value grants read and write
permission to the owner and to the group */
static ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
#else
/** Umask for creating files; initially 0 on Windows */
static ulint	os_innodb_umask	= 0;

/* On Windows when using native AIO the number of AIO requests
that a thread can handle at a given time is limited to 32
i.e.: SRV_N_PENDING_IOS_PER_THREAD */
#define SRV_N_PENDING_IOS_PER_THREAD    OS_AIO_N_PENDING_IOS_PER_THREAD

#endif /* _WIN32 */
158 
159 #ifndef UNIV_HOTBACKUP
160 
/** In simulated aio, merge at most this many consecutive i/os */
static const ulint	OS_AIO_MERGE_N_CONSECUTIVE = 64;

/** Flag indicating if the page_cleaner is in active state.
Only declared here (extern); it is not defined at this point. */
extern bool buf_page_cleaner_is_active;
166 
167 /**********************************************************************
168 
169 InnoDB AIO Implementation:
170 =========================
171 
172 We support native AIO for Windows and Linux. For rest of the platforms
173 we simulate AIO by special IO-threads servicing the IO-requests.
174 
175 Simulated AIO:
176 ==============
177 
178 On platforms where we 'simulate' AIO, the following is a rough explanation
179 of the high level design.
180 There are four io-threads (for ibuf, log, read, write).
181 All synchronous IO requests are serviced by the calling thread using
182 os_file_write/os_file_read. The Asynchronous requests are queued up
183 in an array (there are four such arrays) by the calling thread.
184 Later these requests are picked up by the IO-thread and are serviced
185 synchronously.
186 
187 Windows native AIO:
188 ==================
189 
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
There are innodb_file_io_threads helper threads. These threads work
on the four arrays mentioned above in Simulated AIO. No thread is
required for the sync array.
198 If a synchronous IO request is made, it is first queued in the sync
199 array. Then the calling thread itself waits on the request, thus
200 making the call synchronous.
201 If an AIO request is made the calling thread not only queues it in the
202 array but also submits the requests. The helper thread then collects
203 the completed IO request and calls completion routine on it.
204 
205 Linux native AIO:
206 =================
207 
208 If we have libaio installed on the system and innodb_use_native_aio
209 is set to true we follow the code path of native AIO, otherwise we
210 do simulated AIO.
211 There are innodb_file_io_threads helper threads. These threads work
212 on the four arrays mentioned above in Simulated AIO.
213 If a synchronous IO request is made, it is handled by calling
214 os_file_write/os_file_read.
215 If an AIO request is made the calling thread not only queues it in the
216 array but also submits the requests. The helper thread then collects
217 the completed IO request and calls completion routine on it.
218 
219 **********************************************************************/
220 
221 
#ifdef UNIV_PFS_IO
/* Keys to register InnoDB I/O with performance schema.
They exist only when UNIV_PFS_IO is defined. */
mysql_pfs_key_t  innodb_data_file_key;
mysql_pfs_key_t  innodb_log_file_key;
mysql_pfs_key_t  innodb_temp_file_key;
mysql_pfs_key_t	 innodb_bmp_file_key;
mysql_pfs_key_t  innodb_parallel_dblwrite_file_key;
#endif /* UNIV_PFS_IO */
230 
/** The asynchronous I/O context. One Slot describes a single queued
I/O request; the members present depend on which AIO flavour is
compiled in (WIN_ASYNC_IO, LINUX_NATIVE_AIO or neither). */
struct Slot {
	/** Zero-fill every byte of the object. */
	Slot() { memset(this, 0, sizeof(*this)); }

	/** index of the slot in the aio array */
	uint16_t		pos;

	/** true if this slot is reserved */
	bool			is_reserved;

	/** time when reserved */
	ib_time_monotonic_t	reservation_time;

	/** buffer used in i/o */
	byte*			buf;

	/** Buffer pointer used for actual IO. We advance this
	when partial IO is required and not buf */
	byte*			ptr;

	/** OS_FILE_READ or OS_FILE_WRITE */
	IORequest		type;

	/** file offset in bytes */
	os_offset_t		offset;

	/** file where to read or write */
	pfs_os_file_t		file;

	/** file name or path */
	const char*		name;

	/** used only in simulated aio: true if the physical i/o
	already made and only the slot message needs to be passed
	to the caller of os_aio_simulated_handle */
	bool			io_already_done;

	/** Space id stored with the request (presumably the
	tablespace id -- TODO confirm against the callers) */
	ulint			space_id;

	/** The file node for which the IO is requested. */
	fil_node_t*		m1;

	/** the requester of an aio operation and which can be used
	to identify which pending aio operation was completed */
	void*			m2;

	/** AIO completion status */
	dberr_t			err;

#ifdef WIN_ASYNC_IO
	/** handle object we need in the OVERLAPPED struct */
	HANDLE			handle;

	/** Windows control block for the aio request */
	OVERLAPPED		control;

	/** bytes written/read */
	DWORD			n_bytes;

	/** length of the block to read or write */
	DWORD			len;

#elif defined(LINUX_NATIVE_AIO)
	/** Linux control block for aio */
	struct iocb		control;

	/** AIO return code */
	int			ret;

	/** bytes written/read. */
	ssize_t			n_bytes;

	/** length of the block to read or write */
	ulint			len;
#else
	/** length of the block to read or write */
	ulint			len;

	/** bytes written/read. */
	ulint			n_bytes;
#endif /* WIN_ASYNC_IO */

	/** Length of the block before it was compressed */
	uint32			original_len;

	/** Buffer block for compressed pages or encrypted pages */
	Block*			buf_block;

	/** true, if we shouldn't punch a hole after writing the page */
	bool			skip_punch_hole;

	/** Buffer for encrypt log */
	void*			encrypt_log_buf;
};
325 
/** The asynchronous i/o array structure. An instance owns a vector of
Slot objects split into segments, together with the mutex and events
that guard it; the static members hold the per-purpose arrays
(ibuf, log, reads, writes, sync). */
class AIO {
public:
	/** Constructor
	@param[in]	id		Latch ID
	@param[in]	n_slots		Number of slots to configure
	@param[in]	segments	Number of segments to configure */
	AIO(latch_id_t id, ulint n_slots, ulint segments);

	/** Destructor */
	~AIO();

	/** Initialize the instance
	@return DB_SUCCESS or error code */
	dberr_t init();

	/** Requests for a slot in the aio array. If no slot is available, waits
	until not_full-event becomes signaled.

	@param[in,out]	type	IO context
	@param[in,out]	m1	message to be passed along with the AIO
				operation
	@param[in,out]	m2	message to be passed along with the AIO
				operation
	@param[in]	file	file handle
	@param[in]	name	name of the file or path as a null-terminated
				string
	@param[in,out]	buf	buffer where to read or from which to write
	@param[in]	offset	file offset, where to read from or start writing
	@param[in]	len	length of the block to read or write
	@param[in]	space_id	space id of the request (presumably the
				tablespace id -- TODO confirm against the
				definition)
	@return pointer to slot */
	Slot* reserve_slot(
		IORequest&	type,
		fil_node_t*	m1,
		void*		m2,
		pfs_os_file_t	file,
		const char*	name,
		void*		buf,
		os_offset_t	offset,
		ulint		len,
		ulint		space_id)
		MY_ATTRIBUTE((warn_unused_result));

	/** @return number of reserved slots */
	ulint pending_io_count() const;

	/** Returns a pointer to the nth slot in the aio array.
	Asserts that the index is within the array.
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	const Slot* at(ulint i) const
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Non const version of at().
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	Slot* at(ulint i)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Frees a slot in the AIO array, assumes caller owns the mutex.
	@param[in,out]	slot	Slot to release */
	void release(Slot* slot);

	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
	@param[in,out]	slot	Slot to release */
	void release_with_mutex(Slot* slot);

	/** Prints info about the aio array.
	@param[in,out]	file	Where to print */
	void print(FILE* file);

	/** @return the number of slots per segment, i.e. the total slot
	count divided by the number of segments */
	ulint slots_per_segment() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_slots.size() / m_n_segments);
	}

	/** @return the number of segments in this array */
	ulint get_n_segments() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_n_segments);
	}

#ifdef UNIV_DEBUG
	/** @return true if the thread owns the mutex */
	bool is_mutex_owned() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(mutex_own(&m_mutex));
	}
#endif /* UNIV_DEBUG */

	/** Acquire the mutex */
	void acquire() const
	{
		mutex_enter(&m_mutex);
	}

	/** Release the mutex. Not to be confused with the
	release(Slot*) overload, which frees a slot. */
	void release() const
	{
		mutex_exit(&m_mutex);
	}

	/** Write out the state to the file/stream
	@param[in, out]	file	File to write to */
	void to_file(FILE* file) const;

	/** Submit buffered AIO requests on the given segment to the kernel.
	(low level function).
	@param[in] acquire_mutex specifies whether to lock array mutex */
	static void os_aio_dispatch_read_array_submit_low(
		bool acquire_mutex);

#ifdef LINUX_NATIVE_AIO
	/** Dispatch an AIO request to the kernel.
	@param[in,out]	slot	an already reserved slot
	@param[in]	should_buffer	should buffer the request
					rather than submit
	@return true on success. */
	bool linux_dispatch(Slot* slot, bool should_buffer)
		MY_ATTRIBUTE((warn_unused_result));

	/** Accessor for an AIO event
	@param[in]	index	Index into the array
	@return the event at the index */
	io_event* io_events(ulint index)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(index < m_events.size());

		return(&m_events[index]);
	}

	/** Accessor for the AIO context
	@param[in]	segment	Segment for which to get the context
	@return the AIO context for the segment */
	io_context* io_ctx(ulint segment)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(segment < get_n_segments());

		return(m_aio_ctx[segment]);
	}

	/** Creates an io_context for native linux AIO.
	@param[in]	max_events	number of events
	@param[out]	io_ctx		io_ctx to initialize.
	@return true on success. */
	static bool linux_create_io_ctx(ulint max_events, io_context_t* io_ctx)
		MY_ATTRIBUTE((warn_unused_result));

	/** Checks if the system supports native linux aio. On some kernel
	versions where native aio is supported it won't work on tmpfs. In such
	cases we can't use native aio as it is not possible to mix simulated
	and native aio.
	@return true if supported, false otherwise. */
	static bool is_linux_native_aio_supported()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

#ifdef WIN_ASYNC_IO
	/** Wakes up all async i/o threads in the array in Windows async I/O at
	shutdown, by setting the event handle of every slot. */
	void signal()
	{
		for (ulint i = 0; i < m_slots.size(); ++i) {
			SetEvent(m_slots[i].handle);
		}
	}

	/** Wake up all AIO threads in Windows native aio. The read array
	is signalled unconditionally; the write, ibuf and log arrays only
	when they exist (are not NULL). */
	static void wake_at_shutdown()
	{
		s_reads->signal();

		if (s_writes != NULL) {
			s_writes->signal();
		}

		if (s_ibuf != NULL) {
			s_ibuf->signal();
		}

		if (s_log != NULL) {
			s_log->signal();
		}
	}
#endif /* WIN_ASYNC_IO */

#ifdef _WIN32
	/** This function can be called if one wants to post a batch of reads
	and prefers an I/O - handler thread to handle them all at once later.You
	must call os_aio_simulated_wake_handler_threads later to ensure the
	threads are not left sleeping! */
	static void simulated_put_read_threads_to_sleep();

	/** The non asynchronous IO array.
	@return the synchronous AIO array instance. */
	static AIO* sync_array()
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(s_sync);
	}

	/**
	Get the AIO handles for a segment.
	@param[in]	segment		The local segment.
	@return pointer to the first handle of the segment. */
	HANDLE* handles(ulint segment)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(segment < m_handles->size() / slots_per_segment());

		return(&(*m_handles)[segment * slots_per_segment()]);
	}

	/** The caller must own the mutex (checked in debug builds).
	@return true if no slots are reserved */
	bool is_empty() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(is_mutex_owned());
		return(m_n_reserved == 0);
	}
#endif /* _WIN32 */

	/** Create an instance using new(std::nothrow)
	@param[in]	id		Latch ID
	@param[in]	n_slots		The number of AIO request slots
	@param[in]	segments	The number of segments
	@return a new AIO instance */
	static AIO* create(
		latch_id_t	id,
		ulint		n_slots,
		ulint		segments)
		MY_ATTRIBUTE((warn_unused_result));

	/** Initializes the asynchronous io system. Creates one array each
	for ibuf and log I/O. Also creates one array each for read and write
	where each array is divided logically into n_readers and n_writers
	respectively. The caller must create an i/o handler thread for each
	segment in these arrays. This function also creates the sync array.
	No I/O handler thread needs to be created for that
	@param[in]	n_per_seg	maximum number of pending aio
					operations allowed per segment
	@param[in]	n_readers	number of reader threads
	@param[in]	n_writers	number of writer threads
	@param[in]	n_slots_sync	number of slots in the sync aio array
	@return true if AIO sub-system was started successfully */
	static bool start(
		ulint		n_per_seg,
		ulint		n_readers,
		ulint		n_writers,
		ulint		n_slots_sync)
		MY_ATTRIBUTE((warn_unused_result));

	/** Free the AIO arrays */
	static void shutdown();

	/** Print all the AIO segments
	@param[in,out]	file		Where to print */
	static void print_all(FILE* file);

	/** Calculates local segment number and aio array from global
	segment number.
	@param[out]	array		AIO wait array
	@param[in]	segment		global segment number
	@return local segment number within the aio array */
	static ulint get_array_and_local_segment(
		AIO**		array,
		ulint		segment)
		MY_ATTRIBUTE((warn_unused_result));

	/** Select the IO slot array
	@param[in]	type		Type of IO, READ or WRITE
	@param[in]	read_only	true if running in read-only mode
	@param[in]	mode		IO mode
	@return slot array or NULL if invalid mode specified */
	static AIO* select_slot_array(
		IORequest&	type,
		bool		read_only,
		ulint		mode)
		MY_ATTRIBUTE((warn_unused_result));

	/** Calculates segment number for a slot.
	@param[in]	array		AIO wait array
	@param[in]	slot		slot in this array
	@return segment number (which is the number used by, for example,
		I/O handler threads) */
	static ulint get_segment_no_from_slot(
		const AIO*	array,
		const Slot*	slot)
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays */
	static void wake_simulated_handler_thread(ulint global_segment);

	/** Check if it is a read request
	@param[in]	aio		The AIO instance to check
	@return true if the AIO instance is the read array (s_reads). */
	static bool is_read(const AIO* aio)
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(s_reads == aio);
	}

	/** Wait on the write array's m_is_empty event, i.e. until there
	are no pending writes. Dereferences s_writes without a NULL check. */
	static void wait_until_no_pending_writes()
	{
		os_event_wait(AIO::s_writes->m_is_empty);
	}

	/** Print to file
	@param[in]	file		File to write to */
	static void print_to_file(FILE* file);

	/** Check for pending IO. Gets the count and also validates the
	data structures.
	@return count of pending IO requests */
	static ulint total_pending_io_count();

private:
	/** Initialise the slots
	@return DB_SUCCESS or error code */
	dberr_t init_slots()
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do for a local segment in the AIO array.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays
	@param[in]	segment		the local segment in the AIO array */
	void wake_simulated_handler_thread(ulint global_segment, ulint segment);

	/** Prints pending IO requests per segment of an aio array.
	We probably don't need per segment statistics but they can help us
	during development phase to see if the IO requests are being
	distributed as expected.
	@param[in,out]	file		file where to print
	@param[in]	segments	pending IO array */
	void print_segment_info(
		FILE*		file,
		const ulint*	segments);

#ifdef LINUX_NATIVE_AIO
	/** Initialise the Linux native AIO data structures
	@return DB_SUCCESS or error code */
	dberr_t init_linux_native_aio()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

	/** Submit buffered AIO requests on the array to the kernel.
	(low level function).
	@param[in] acquire_mutex specifies whether to lock array mutex
	@param[in] arr array for which to submit IO */
	static void os_aio_dispatch_read_array_submit_low_for_array(
		bool acquire_mutex MY_ATTRIBUTE((unused)), const AIO* arr);

private:
	typedef std::vector<Slot> Slots;

	/** the mutex protecting the aio array; mutable so that the
	const acquire()/release() methods can lock it */
	mutable SysMutex	m_mutex;

	/** Pointer to the slots in the array.
	Number of elements must be divisible by n_threads. */
	Slots			m_slots;

	/** Number of segments in the aio array of pending aio requests.
	A thread can wait separately for any one of the segments. */
	ulint			m_n_segments;

	/** The event which is set to the signaled state when
	there is space in the aio outside the ibuf segment */
	os_event_t		m_not_full;

	/** The event which is set to the signaled state when
	there are no pending i/os in this array */
	os_event_t		m_is_empty;

	/** Number of reserved slots in the AIO array outside
	the ibuf segment */
	ulint			m_n_reserved;

#ifdef _WIN32
	typedef std::vector<HANDLE, ut_allocator<HANDLE> > Handles;

	/** Pointer to an array of OS native event handles where
	we copied the handles from slots, in the same order. This
	can be used in WaitForMultipleObjects; used only in Windows */
	Handles*		m_handles;
#endif /* _WIN32 */

#if defined(LINUX_NATIVE_AIO)
	typedef std::vector<io_event> IOEvents;

	/** completion queue for IO. There is one such queue per
	segment. Each thread will work on one ctx exclusively. */
	io_context_t*		m_aio_ctx;

	/** The array to collect completed IOs. There is one such
	event for each possible pending IO. The size of the array
	is equal to m_slots.size(). */
	IOEvents		m_events;

	/** Array to buffer the not-submitted aio requests. The array length
	is n_slots. It is divided into n_segments segments. Pending requests
	on each segment are buffered separately. */
	struct iocb**		m_pending;

	/** Array of length n_segments. Each element counts the number of not
	submitted aio request on that segment. */
	ulint*			m_count;
#endif /* LINUX_NATIVE_AIO */

	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
	sync AIO. These are NULL when the module has not yet been
	initialized. */

	/** Insert buffer */
	static AIO*		s_ibuf;

	/** Redo log */
	static AIO*		s_log;

	/** Reads */
	static AIO*		s_reads;

	/** Writes */
	static AIO*		s_writes;

	/** Synchronous I/O */
	static AIO*		s_sync;
};
772 
/** Definitions of the static AIO array pointers declared in class AIO */
AIO*	AIO::s_reads;
AIO*	AIO::s_writes;
AIO*	AIO::s_ibuf;
AIO*	AIO::s_log;
AIO*	AIO::s_sync;

#if defined(LINUX_NATIVE_AIO)
/** timeout for each io_getevents() call = 500ms.
(The literal equals 500 ms when read as nanoseconds.) */
static const ulint	OS_AIO_REAP_TIMEOUT = 500000000UL;

/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
static const ulint	OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;

/** number of attempts before giving up on io_setup(). */
static const int	OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
#endif /* LINUX_NATIVE_AIO */

/** Array of events used in simulated AIO; NULL until allocated */
static os_event_t*	os_aio_segment_wait_events = NULL;

/** Number of asynchronous I/O segments.  Set by os_aio_init().
Holds ULINT_UNDEFINED until then. */
static ulint		os_aio_n_segments = ULINT_UNDEFINED;

/** If the following is true, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static bool		os_aio_recommend_sleep_for_read_threads = false;
800 #endif /* !UNIV_HOTBACKUP */
801 
/* Global I/O bookkeeping variables; every counter starts at zero.
NOTE(review): the *_old variables look like snapshots of the matching
counters taken at the previous printout -- confirm against the code
that updates them. */
ulint	os_n_file_reads		= 0;
ulint	os_bytes_read_since_printout = 0;
ulint	os_n_file_writes	= 0;
ulint	os_n_fsyncs		= 0;
ulint	os_n_file_reads_old	= 0;
ulint	os_n_file_writes_old	= 0;
ulint	os_n_fsyncs_old		= 0;
/** Number of pending write operations */
ulint	os_n_pending_writes = 0;
/** Number of pending read operations */
ulint	os_n_pending_reads = 0;

ib_time_monotonic_t	os_last_printout;
bool	os_has_said_disk_full	= false;

/** Default Zip compression level */
extern uint page_zip_level;

/* Compile-time guard: refuse to build when DATA_TRX_ID_LEN exceeds 6 */
#if DATA_TRX_ID_LEN > 6
#error "COMPRESSION_ALGORITHM will not fit"
#endif /* DATA_TRX_ID_LEN */
823 
/** Validates the consistency of the aio system.
Forward declaration; the function has internal linkage (static).
@return true if ok */
static
bool
os_aio_validate();
829 
/** Does error handling when a file operation fails.
Forward declaration; the function has internal linkage (static).
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@return true if we should retry the operation */
static
bool
os_file_handle_error(
	const char*	name,
	const char*	operation);
839 
/** Free storage space associated with a section of the file.
Unlike the neighbouring forward declarations this one is not static.
@param[in]      fh              Open file handle
@param[in]      off             Starting offset (SEEK_SET)
@param[in]      len             Size of the hole
@return DB_SUCCESS or error code */
dberr_t
os_file_punch_hole(
        os_file_t   fh,
        os_offset_t     off,
        os_offset_t     len);
850 
/**
Does error handling when a file operation fails.
Forward declaration; the function has internal linkage (static).
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@param[in]	silent		if true then don't print any message to
				the log.
@return true if we should retry the operation */
static
bool
os_file_handle_error_no_exit(
	const char*	name,
	const char*	operation,
	bool		silent);
863 
/** Decompress after a read and punch a hole in the file if it was a write
@param[in]	type		IO context
@param[in]	fh		Open file handle
@param[in,out]	buf		Buffer to transform
@param[in,out]	scratch		Scratch area for read decompression
@param[in]	src_len		Length of the buffer before compression
@param[in]	offset		File offset of the IO (presumably in
				bytes -- TODO confirm against the definition)
@param[in]	len		Compressed buffer length for write and size
				of buf len for read
@return DB_SUCCESS or error code */
static
dberr_t
os_file_io_complete(
	const IORequest&type,
	os_file_t	fh,
	byte*		buf,
	byte*		scratch,
	ulint		src_len,
	os_offset_t	offset,
	ulint		len);
883 
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[in]	type	IO context
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
905 
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
#endif /* WIN_ASYNC_IO */
939 
940 /** Allocate a page for sync IO
941 @return pointer to page */
942 static
943 Block*
os_alloc_block()944 os_alloc_block()
945 {
946 	size_t		pos;
947 	Blocks&		blocks = *block_cache;
948 	size_t		i = static_cast<size_t>(my_timer_cycles());
949 	const size_t	size = blocks.size();
950 	ulint		retry = 0;
951 	Block*		block;
952 
953 	DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
954 
955 	for (;;) {
956 
957 		/* After go through the block cache for 3 times,
958 		allocate a new temporary block. */
959 		if (retry == MAX_BLOCKS * 3) {
960 			byte*	ptr;
961 
962 			ptr = static_cast<byte*>(
963 				ut_malloc_nokey(sizeof(*block)
964 						+ BUFFER_BLOCK_SIZE));
965 
966 			block = new (ptr) Block();
967 			block->m_ptr = static_cast<byte*>(
968 				ptr + sizeof(*block));
969 			block->m_in_use = 1;
970 
971 			break;
972 		}
973 
974 		pos = i++ % size;
975 
976 		if (TAS(&blocks[pos].m_in_use, 1) == 0) {
977 			block = &blocks[pos];
978 			break;
979 		}
980 
981 		os_thread_yield();
982 
983 		++retry;
984 	}
985 
986 	ut_a(block->m_in_use != 0);
987 
988 	return(block);
989 }
990 
991 /** Free a page after sync IO
992 @param[in,own]	block		The block to free/release */
993 static
994 void
os_free_block(Block * block)995 os_free_block(Block* block)
996 {
997 	ut_ad(block->m_in_use == 1);
998 
999 	TAS(&block->m_in_use, 0);
1000 
1001 	/* When this block is not in the block cache, and it's
1002 	a temporary block, we need to free it directly. */
1003 	if (std::less<Block*>()(block, &block_cache->front())
1004 	    || std::greater<Block*>()(block, &block_cache->back())) {
1005 		ut_free(block);
1006 	}
1007 }
1008 
1009 /** Generic AIO Handler methods. Currently handles IO post processing. */
1010 class AIOHandler {
1011 public:
1012 	/** Do any post processing after a read/write
1013 	@return DB_SUCCESS or error code. */
1014 	static dberr_t post_io_processing(Slot* slot);
1015 
1016 	/** Decompress after a read and punch a hole in the file if
1017 	it was a write */
io_complete(const Slot * slot)1018 	static dberr_t io_complete(const Slot* slot)
1019 	{
1020 		ut_a(slot->offset > 0);
1021 		ut_a(slot->type.is_read() || !slot->skip_punch_hole);
1022 		return(os_file_io_complete(
1023 				slot->type, slot->file.m_file, slot->buf,
1024 				NULL, slot->original_len,
1025 				slot->offset, slot->len));
1026 	}
1027 
1028 private:
1029 	/** Check whether the page was encrypted.
1030 	@param[in]	slot		The slot that contains the IO request
1031 	@return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)1032 	static bool is_encrypted_page(const Slot* slot)
1033 	{
1034 		return(Encryption::is_encrypted_page(slot->buf));
1035 	}
1036 
1037 	/** Check whether the page was compressed.
1038 	@param[in]	slot		The slot that contains the IO request
1039 	@return true if it was a compressed page */
is_compressed_page(const Slot * slot)1040 	static bool is_compressed_page(const Slot* slot)
1041 	{
1042 		const byte*	src = slot->buf;
1043 
1044 		ulint	page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1045 
1046 		return(page_type == FIL_PAGE_COMPRESSED);
1047 	}
1048 
1049 	/** Get the compressed page size.
1050 	@param[in]	slot		The slot that contains the IO request
1051 	@return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1052 	static ulint compressed_page_size(const Slot* slot)
1053 	{
1054 		ut_ad(slot->type.is_read());
1055 		ut_ad(is_compressed_page(slot));
1056 
1057 		ulint		size;
1058 		const byte*	src = slot->buf;
1059 
1060 		size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1061 
1062 		return(size + FIL_PAGE_DATA);
1063 	}
1064 
1065 	/** Check if the page contents can be decompressed.
1066 	@param[in]	slot		The slot that contains the IO request
1067 	@return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1068 	static bool can_decompress(const Slot* slot)
1069 	{
1070 		ut_ad(slot->type.is_read());
1071 		ut_ad(is_compressed_page(slot));
1072 
1073 		ulint		version;
1074 		const byte*	src = slot->buf;
1075 
1076 		version = mach_read_from_1(src + FIL_PAGE_VERSION);
1077 
1078 		ut_a(Compression::is_valid_page_version(version));
1079 
1080 		/* Includes the page header size too */
1081 		ulint		size = compressed_page_size(slot);
1082 
1083 		return(size <= (slot->ptr - slot->buf) + (ulint) slot->n_bytes);
1084 	}
1085 
1086 	/** Check if we need to read some more data.
1087 	@param[in]	slot		The slot that contains the IO request
1088 	@param[in]	n_bytes		Total bytes read so far
1089 	@return DB_SUCCESS or error code */
1090 	static dberr_t check_read(Slot* slot, ulint n_bytes);
1091 };
1092 
1093 /** Helper class for doing synchronous file IO. Currently, the objective
1094 is to hide the OS specific code, so that the higher level functions aren't
1095 peppered with #ifdef. Makes the code flow difficult to follow.  */
1096 class SyncFileIO {
1097 public:
1098 	/** Constructor
1099 	@param[in]	fh	File handle
1100 	@param[in,out]	buf	Buffer to read/write
1101 	@param[in]	n	Number of bytes to read/write
1102 	@param[in]	offset	Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1103 	SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
1104 		:
1105 		m_fh(fh),
1106 		m_buf(buf),
1107 		m_n(static_cast<ssize_t>(n)),
1108 		m_offset(offset)
1109 	{
1110 		ut_ad(m_n > 0);
1111 	}
1112 
1113 	/** Destructor */
~SyncFileIO()1114 	~SyncFileIO()
1115 	{
1116 		/* No op */
1117 	}
1118 
1119 	/** Do the read/write
1120 	@param[in]	request	The IO context and type
1121 	@return the number of bytes read/written or negative value on error */
1122 	ssize_t execute(const IORequest& request);
1123 
1124 	/** Do the read/write
1125 	@param[in,out]	slot	The IO slot, it has the IO context
1126 	@return the number of bytes read/written or negative value on error */
1127 	static ssize_t execute(Slot* slot);
1128 
1129 	/** Move the read/write offset up to where the partial IO succeeded.
1130 	@param[in]	n_bytes	The number of bytes to advance */
advance(ssize_t n_bytes)1131 	void advance(ssize_t n_bytes)
1132 	{
1133 		m_offset += n_bytes;
1134 
1135 		ut_ad(m_n >= n_bytes);
1136 
1137 		m_n -=  n_bytes;
1138 
1139 		m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1140 	}
1141 
1142 private:
1143 	/** Open file handle */
1144 	os_file_t		m_fh;
1145 
1146 	/** Buffer to read/write */
1147 	void*			m_buf;
1148 
1149 	/** Number of bytes to read/write */
1150 	ssize_t			m_n;
1151 
1152 	/** Offset from where to read/write */
1153 	os_offset_t		m_offset;
1154 };
1155 
1156 /** If it is a compressed page return the compressed page data + footer size
1157 @param[in]	buf		Buffer to check, must include header + 10 bytes
1158 @return ULINT_UNDEFINED if the page is not a compressed page or length
1159 	of the compressed data (including footer) if it is a compressed page */
1160 ulint
os_file_compressed_page_size(const byte * buf)1161 os_file_compressed_page_size(const byte* buf)
1162 {
1163 	ulint	type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1164 
1165 	if (type == FIL_PAGE_COMPRESSED) {
1166 		ulint	version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1167 		ut_a(Compression::is_valid_page_version(version));
1168 		return(mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1169 	}
1170 
1171 	return(ULINT_UNDEFINED);
1172 }
1173 
1174 /** If it is a compressed page return the original page data + footer size
1175 @param[in] buf		Buffer to check, must include header + 10 bytes
1176 @return ULINT_UNDEFINED if the page is not a compressed page or length
1177 	of the original data + footer if it is a compressed page */
1178 ulint
os_file_original_page_size(const byte * buf)1179 os_file_original_page_size(const byte* buf)
1180 {
1181 	ulint	type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1182 
1183 	if (type == FIL_PAGE_COMPRESSED) {
1184 
1185 		ulint	version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1186 		ut_a(Compression::is_valid_page_version(version));
1187 
1188 		return(mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1189 	}
1190 
1191 	return(ULINT_UNDEFINED);
1192 }
1193 
1194 /** Check if we need to read some more data.
1195 @param[in]	slot		The slot that contains the IO request
1196 @param[in]	n_bytes		Total bytes read so far
1197 @return DB_SUCCESS or error code */
1198 dberr_t
check_read(Slot * slot,ulint n_bytes)1199 AIOHandler::check_read(Slot* slot, ulint n_bytes)
1200 {
1201 	dberr_t	err;
1202 
1203 	ut_ad(slot->type.is_read());
1204 	ut_ad(slot->original_len > slot->len);
1205 
1206 	if (is_compressed_page(slot)) {
1207 
1208 		if (can_decompress(slot)) {
1209 
1210 			ut_a(slot->offset > 0);
1211 
1212 			slot->len = slot->original_len;
1213 #ifdef _WIN32
1214 			slot->n_bytes = static_cast<DWORD>(n_bytes);
1215 #else
1216 			slot->n_bytes = static_cast<ulint>(n_bytes);
1217 #endif /* _WIN32 */
1218 
1219 			err = io_complete(slot);
1220 			ut_a(err == DB_SUCCESS);
1221 		} else {
1222 			/* Read the next block in */
1223 			ut_ad(compressed_page_size(slot) >= n_bytes);
1224 
1225 			err = DB_FAIL;
1226 		}
1227 	} else if (is_encrypted_page(slot)
1228 		   || (slot->type.is_log()
1229 		       && slot->offset >= LOG_FILE_HDR_SIZE)) {
1230 			ut_a(slot->offset > 0);
1231 
1232 			slot->len = slot->original_len;
1233 #ifdef _WIN32
1234 			slot->n_bytes = static_cast<DWORD>(n_bytes);
1235 #else
1236 			slot->n_bytes = static_cast<ulint>(n_bytes);
1237 #endif /* _WIN32 */
1238 
1239 			err = io_complete(slot);
1240 			ut_a(err == DB_SUCCESS);
1241 
1242 	} else {
1243 		err = DB_FAIL;
1244 	}
1245 
1246 	if (slot->buf_block != NULL) {
1247 		os_free_block(slot->buf_block);
1248 		slot->buf_block = NULL;
1249 	}
1250 
1251 	if (slot->encrypt_log_buf != NULL) {
1252 		ut_free(slot->encrypt_log_buf);
1253 		slot->encrypt_log_buf = NULL;
1254 	}
1255 
1256 	return(err);
1257 }
1258 
1259 /** Do any post processing after a read/write
1260 @return DB_SUCCESS or error code. */
1261 dberr_t
post_io_processing(Slot * slot)1262 AIOHandler::post_io_processing(Slot* slot)
1263 {
1264 	dberr_t	err;
1265 
1266 	ut_ad(slot->is_reserved);
1267 
1268 	/* Total bytes read so far */
1269 	ulint	n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1270 
1271 	/* Compressed writes can be smaller than the original length.
1272 	Therefore they can be processed without further IO. */
1273 	if (n_bytes == slot->original_len
1274 	    || (slot->type.is_write()
1275 		&& slot->type.is_compressed()
1276 		&& slot->len == static_cast<ulint>(slot->n_bytes))) {
1277 
1278 		if ((slot->type.is_log() && slot->offset >= LOG_FILE_HDR_SIZE)
1279 		    || is_compressed_page(slot) || is_encrypted_page(slot)) {
1280 
1281 			ut_a(slot->offset > 0);
1282 
1283 			if (slot->type.is_read()) {
1284 				slot->len = slot->original_len;
1285 			}
1286 
1287 			/* The punch hole has been done on collect() */
1288 
1289 			if (slot->type.is_read()) {
1290 				err = io_complete(slot);
1291 			} else {
1292 				err = DB_SUCCESS;
1293 			}
1294 
1295 			ut_ad(err == DB_SUCCESS
1296 			      || err == DB_UNSUPPORTED
1297 			      || err == DB_CORRUPTION
1298 			      || err == DB_IO_DECOMPRESS_FAIL);
1299 		} else if (!slot->type.is_log() && slot->type.is_read() && Encryption::can_page_be_keyring_encrypted(slot->buf)
1300 			   && !slot->type.is_encryption_disabled()) {
1301 			ut_ad(is_encrypted_page(slot) == false);
1302 			// we did not go to io_complete - so mark read page as unencrypted here
1303 			mach_write_to_4(slot->buf + FIL_PAGE_ENCRYPTION_KEY_VERSION, ENCRYPTION_KEY_VERSION_NOT_ENCRYPTED);
1304 			err = DB_SUCCESS;
1305                 }
1306 		else {
1307 
1308 			err = DB_SUCCESS;
1309 		}
1310 
1311 		if (slot->buf_block != NULL) {
1312 			os_free_block(slot->buf_block);
1313 			slot->buf_block = NULL;
1314 		}
1315 
1316 		if (slot->encrypt_log_buf != NULL) {
1317 			ut_free(slot->encrypt_log_buf);
1318 			slot->encrypt_log_buf = NULL;
1319 		}
1320 	} else if ((ulint) slot->n_bytes == (ulint) slot->len) {
1321 
1322 		/* It *must* be a partial read. */
1323 		ut_ad(slot->len < slot->original_len);
1324 
1325 		/* Has to be a read request, if it is less than
1326 		the original length. */
1327 		ut_ad(slot->type.is_read());
1328 		err = check_read(slot, n_bytes);
1329 
1330 	} else {
1331 		err = DB_FAIL;
1332 	}
1333 
1334 	return(err);
1335 }
1336 
1337 /** Count the number of free slots
1338 @return number of reserved slots */
1339 ulint
pending_io_count() const1340 AIO::pending_io_count() const
1341 {
1342 	acquire();
1343 
1344 #ifdef UNIV_DEBUG
1345 	ut_a(m_n_segments > 0);
1346 	ut_a(!m_slots.empty());
1347 
1348 	ulint	count = 0;
1349 
1350 	for (ulint i = 0; i < m_slots.size(); ++i) {
1351 
1352 		const Slot&	slot = m_slots[i];
1353 
1354 		if (slot.is_reserved) {
1355 			++count;
1356 			ut_a(slot.len > 0);
1357 		}
1358 	}
1359 
1360 	ut_a(m_n_reserved == count);
1361 #endif /* UNIV_DEBUG */
1362 
1363 	ulint	reserved = m_n_reserved;
1364 
1365 	release();
1366 
1367 	return(reserved);
1368 }
1369 
/** Compress a data page.
The first FIL_PAGE_DATA bytes (the page header) are copied to dst
uncompressed; only the data + trailer that follow are compressed. The
source buffer is returned unchanged, with *dst_len set to src_len, when
the page is an R-Tree page, block_size is ULINT_UNDEFINED, no compression
algorithm is selected, the page is smaller than two file system blocks,
the algorithm is not handled here, or the compressor fails to fit its
output into the space available.
@param[in]	compression	Compression algorithm to use (m_type)
@param[in]	block_size	File system block size
@param[in]	src		Source contents to compress
@param[in]	src_len		Length in bytes of the source
@param[out]	dst		Compressed page contents
@param[out]	dst_len		Length in bytes of dst contents
@param[in]	will_be_encrypted_with_keyring
				if true, 8 extra bytes are reserved (and
				zero filled) right after the compressed data
@return buffer data, dst_len will have the length of the data */
static
byte*
os_file_compress_page(
	Compression	compression,
	ulint		block_size,
	byte*		src,
	ulint		src_len,
	byte*		dst,
	ulint*		dst_len,
	bool            will_be_encrypted_with_keyring)
{
	ulint		len = 0;
	ulint		compression_level = page_zip_level;
	ulint		page_type = mach_read_from_2(src + FIL_PAGE_TYPE);

	/* The page size must be a multiple of the OS punch hole size. */
	ut_ad(!(src_len % block_size));

	/* Shouldn't compress an already compressed page. */
	ut_ad(page_type != FIL_PAGE_COMPRESSED);

	/* The page must be at least twice as large as the file system
	block size if we are to save any space. Ignore R-Tree pages for now,
	they repurpose the same 8 bytes in the page header. No point in
	compressing if the file system block size >= our page size. */

	if (page_type == FIL_PAGE_RTREE
	    || block_size == ULINT_UNDEFINED
	    || compression.m_type == Compression::NONE
	    || src_len < block_size * 2) {

		*dst_len = src_len;

		return(src);
	}

	/* Leave the header alone when compressing. */
	ut_ad(block_size >= FIL_PAGE_DATA * 2);

	ut_ad(src_len > FIL_PAGE_DATA + block_size);

	/* Must compress to <= N-1 FS blocks. */
	/* When the page will be keyring encrypted, 8 more bytes have to be
	left over: 4 bytes for the key version and 4 bytes for the post
	encryption checksum. */
	ulint		out_len = src_len - (FIL_PAGE_DATA + block_size + ((will_be_encrypted_with_keyring) ? 8 : 0));

	/* This is the original data page size - the page header. */
	ulint		content_len = src_len - FIL_PAGE_DATA;

	ut_ad(out_len >= block_size - FIL_PAGE_DATA + ((will_be_encrypted_with_keyring) ? 8 : 0));
	ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA + (will_be_encrypted_with_keyring ? 8 : 0)));

	/* Only compress the data + trailer, leave the header alone */

	switch (compression.m_type) {
	case Compression::NONE:
		/* Compression::NONE already returned above. */
		ut_error;

	case Compression::ZLIB: {

		/* In: capacity of the output area, out: bytes produced. */
		uLongf	zlen = static_cast<uLongf>(out_len);

		if (compress2(
			dst + FIL_PAGE_DATA,
			&zlen,
			src + FIL_PAGE_DATA,
			static_cast<uLong>(content_len),
			static_cast<int>(compression_level)) != Z_OK) {

			/* Fall back to writing the page uncompressed. */
			*dst_len = src_len;

			return(src);
		}

		len = static_cast<ulint>(zlen);

		break;
	}

	case Compression::LZ4:

		len = LZ4_compress_default(
			reinterpret_cast<char*>(src) + FIL_PAGE_DATA,
			reinterpret_cast<char*>(dst) + FIL_PAGE_DATA,
			static_cast<int>(content_len),
			static_cast<int>(out_len));

		ut_a(len <= src_len - FIL_PAGE_DATA);

		/* A zero length, or a result that fills the whole output
		area, is treated as "not compressible": fall back to the
		uncompressed page. */
		if (len == 0  || len >= out_len) {

			*dst_len = src_len;

			return(src);
		}

		break;

	default:
		/* Algorithm not handled here: write the page as is. */
		*dst_len = src_len;
		return(src);
	}

	ut_a(len <= out_len);

	/* Debug check: the 4 bytes at FIL_PAGE_LSN + 4 in the header must
	equal the 4 bytes at the matching position in the page trailer. */
	ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
		     src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)
	      == 0);

	/* Copy the header as is. */
	memmove(dst, src, FIL_PAGE_DATA);

	/* Add compression control information. Required for decompressing. */
	mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);

	mach_write_to_1(dst + FIL_PAGE_VERSION, Compression::FIL_PAGE_VERSION_2);

	mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);

	mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);

	mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);

	mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);

	/* From here on len counts the page header as well. */

	len += FIL_PAGE_DATA;

	if (will_be_encrypted_with_keyring) {
		/* Zero fill the 8 bytes reserved after the compressed data. */
		mach_write_to_8(dst + len, 0);
		len += 8;
	}

	/* Round to the next full block size. For encryption with keyring
	keys the 8 reserved bytes (4 bytes for the key version and 4 bytes
	for the post encryption checksum) are already included in len. */
	*dst_len = ut_calc_align(len, block_size);

	ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA + (will_be_encrypted_with_keyring ? 8 : 0));

	/* Clear out the unused portion of the page. */
	if (len % block_size) {
		memset(dst + len, 0x0, block_size - (len % block_size));
	}

	return(dst);
}
1524 
1525 #ifdef UNIV_DEBUG
1526 # ifndef UNIV_HOTBACKUP
1527 /** Validates the consistency the aio system some of the time.
1528 @return true if ok or the check was skipped */
1529 bool
os_aio_validate_skip()1530 os_aio_validate_skip()
1531 {
1532 /** Try os_aio_validate() every this many times */
1533 # define OS_AIO_VALIDATE_SKIP	13
1534 
1535 	/** The os_aio_validate() call skip counter.
1536 	Use a signed type because of the race condition below. */
1537 	static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1538 
1539 	/* There is a race condition below, but it does not matter,
1540 	because this call is only for heuristic purposes. We want to
1541 	reduce the call frequency of the costly os_aio_validate()
1542 	check in debug builds. */
1543 	--os_aio_validate_count;
1544 
1545 	if (os_aio_validate_count > 0) {
1546 		return(true);
1547 	}
1548 
1549 	os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1550 	return(os_aio_validate());
1551 }
1552 # endif /* !UNIV_HOTBACKUP */
1553 #endif /* UNIV_DEBUG */
1554 
1555 #undef USE_FILE_LOCK
1556 #define USE_FILE_LOCK
1557 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1558 /* InnoDB Hot Backup does not lock the data files.
1559  * On Windows, mandatory locking is used.
1560  */
1561 # undef USE_FILE_LOCK
1562 #endif
1563 #ifdef USE_FILE_LOCK
1564 /** Obtain an exclusive lock on a file.
1565 @param[in]	fd		file descriptor
1566 @param[in]	name		file name
1567 @return 0 on success */
1568 static
1569 int
os_file_lock(int fd,const char * name)1570 os_file_lock(
1571 	int		fd,
1572 	const char*	name)
1573 {
1574 	struct flock lk;
1575 
1576 	lk.l_type = F_WRLCK;
1577 	lk.l_whence = SEEK_SET;
1578 	lk.l_start = lk.l_len = 0;
1579 
1580 	if (fcntl(fd, F_SETLK, &lk) == -1) {
1581 
1582 		ib::error()
1583 			<< "Unable to lock " << name
1584 			<< " error: " << errno;
1585 
1586 		if (errno == EAGAIN || errno == EACCES) {
1587 
1588 			ib::info()
1589 				<< "Check that you do not already have"
1590 				" another mysqld process using the"
1591 				" same InnoDB data or log files.";
1592 		}
1593 
1594 		return(-1);
1595 	}
1596 
1597 	return(0);
1598 }
1599 #endif /* USE_FILE_LOCK */
1600 
1601 #ifndef UNIV_HOTBACKUP
1602 
1603 /** Calculates local segment number and aio array from global segment number.
1604 @param[out]	array		aio wait array
1605 @param[in]	segment		global segment number
1606 @return local segment number within the aio array */
1607 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1608 AIO::get_array_and_local_segment(
1609 	AIO**		array,
1610 	ulint		segment)
1611 {
1612 	ulint		local_segment;
1613 	ulint		n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1614 
1615 	ut_a(segment < os_aio_n_segments);
1616 
1617 	if (!srv_read_only_mode && segment < n_extra_segs) {
1618 
1619 		/* We don't support ibuf/log IO during read only mode. */
1620 
1621 		if (segment == IO_IBUF_SEGMENT) {
1622 
1623 			*array = s_ibuf;
1624 
1625 		} else if (segment == IO_LOG_SEGMENT) {
1626 
1627 			*array = s_log;
1628 
1629 		} else {
1630 			*array = NULL;
1631 		}
1632 
1633 		local_segment = 0;
1634 
1635 	} else if (segment < s_reads->m_n_segments + n_extra_segs) {
1636 
1637 		*array = s_reads;
1638 		local_segment = segment - n_extra_segs;
1639 
1640 	} else {
1641 		*array = s_writes;
1642 
1643 		local_segment = segment
1644 			      - (s_reads->m_n_segments + n_extra_segs);
1645 	}
1646 
1647 	return(local_segment);
1648 }
1649 
1650 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1651 @param[in,out]	slot		Slot to release */
1652 void
release(Slot * slot)1653 AIO::release(Slot* slot)
1654 {
1655 	ut_ad(is_mutex_owned());
1656 
1657 	ut_ad(slot->is_reserved);
1658 
1659 	slot->is_reserved = false;
1660 
1661 	--m_n_reserved;
1662 
1663 	if (m_n_reserved == m_slots.size() - 1) {
1664 		os_event_set(m_not_full);
1665 	}
1666 
1667 	if (m_n_reserved == 0) {
1668 		os_event_set(m_is_empty);
1669 	}
1670 
1671 #ifdef WIN_ASYNC_IO
1672 
1673 	ResetEvent(slot->handle);
1674 
1675 #elif defined(LINUX_NATIVE_AIO)
1676 
1677 	if (srv_use_native_aio) {
1678 		memset(&slot->control, 0x0, sizeof(slot->control));
1679 		slot->ret = 0;
1680 		slot->n_bytes = 0;
1681 	} else {
1682 		/* These fields should not be used if we are not
1683 		using native AIO. */
1684 		ut_ad(slot->n_bytes == 0);
1685 		ut_ad(slot->ret == 0);
1686 	}
1687 
1688 #endif /* WIN_ASYNC_IO */
1689 }
1690 
/** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
Convenience wrapper: brackets release(slot) with acquire() and release(),
because release(slot) asserts that the mutex is owned.
@param[in,out]	slot		Slot to release */
void
AIO::release_with_mutex(Slot* slot)
{
	acquire();

	/* Must run between acquire() and release(); see above. */
	release(slot);

	release();
}
1702 
1703 /** Creates a temporary file.  This function is like tmpfile(3), but
1704 the temporary file is created in the given parameter path. If the path
1705 is NULL then it will create the file in the MySQL server configuration
1706 parameter (--tmpdir).
1707 @param[in]	path	location for creating temporary file
1708 @return temporary file handle, or NULL on error */
1709 FILE*
os_file_create_tmpfile(const char * path)1710 os_file_create_tmpfile(
1711 	const char*	path)
1712 {
1713 	FILE*	file	= NULL;
1714 	int	fd	= innobase_mysql_tmpfile(path);
1715 
1716 	if (fd >= 0) {
1717 		file = fdopen(fd, "w+b");
1718 	}
1719 
1720 	if (file == NULL) {
1721 
1722 		ib::error()
1723 			<< "Unable to create temporary file; errno: "
1724 			<< errno;
1725 
1726 		if (fd >= 0) {
1727 			close(fd);
1728 		}
1729 	}
1730 
1731 	return(file);
1732 }
1733 
1734 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1735 NUL-terminate str. All errors are silently ignored. This function is
1736 mostly meant to be used with temporary files.
1737 @param[in,out]	file		File to read from
1738 @param[in,out]	str		Buffer where to read
1739 @param[in]	size		Size of buffer */
1740 void
os_file_read_string(FILE * file,char * str,ulint size)1741 os_file_read_string(
1742 	FILE*		file,
1743 	char*		str,
1744 	ulint		size)
1745 {
1746 	if (size != 0) {
1747 		rewind(file);
1748 
1749 		size_t	flen = fread(str, 1, size - 1, file);
1750 
1751 		str[flen] = '\0';
1752 	}
1753 }
1754 
1755 static
1756 dberr_t
verify_post_encryption_checksum(const IORequest & type,Encryption & encryption,byte * buf,ulint src_len)1757 verify_post_encryption_checksum(const IORequest &type, Encryption &encryption,
1758 				byte *buf, ulint src_len)
1759 {
1760 	bool is_crypt_checksum_correct = false; // For MK encryption is_crypt_checksum_correct stays false
1761 	ulint original_type = static_cast<uint16_t>(
1762 		mach_read_from_2(buf + FIL_PAGE_ORIGINAL_TYPE_V1));
1763 
1764 	if (encryption.m_type == Encryption::KEYRING && Encryption::can_page_be_keyring_encrypted(original_type)) {
1765 		if (type.is_page_zip_compressed()) {
1766 		byte zip_magic[ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN];
1767 		memcpy(zip_magic, buf + FIL_PAGE_ZIP_KEYRING_ENCRYPTION_MAGIC,
1768 		ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN);
1769 		is_crypt_checksum_correct =	 memcmp(zip_magic, ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC,
1770 							ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN) == 0;
1771 		} else
1772 			is_crypt_checksum_correct = fil_space_verify_crypt_checksum(buf, src_len, type.is_page_zip_compressed(),
1773 										    encryption.is_encrypted_and_compressed(buf));
1774 
1775 		if (encryption.m_encryption_rotation == Encryption::NO_ROTATION && !is_crypt_checksum_correct) { // There is no re-encryption going on
1776 			ulint space_id = mach_read_from_4(buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
1777 			ulint page_no = mach_read_from_4(buf + FIL_PAGE_OFFSET);
1778 			ib::error() << "Post - encryption checksum verification failed - decryption failed for space id = " << space_id
1779 				    << " page_no = " << page_no;
1780 
1781 			return (DB_IO_DECRYPT_FAIL);
1782 		}
1783 	}
1784 
1785 	if (encryption.m_encryption_rotation == Encryption::MASTER_KEY_TO_KEYRING) { // There is re-encryption going on
1786 		encryption.m_type = is_crypt_checksum_correct
1787 		 ? Encryption::KEYRING // assume page is RK encrypted
1788 		 : Encryption::AES; // assume page is MK encrypted
1789 	}
1790 
1791 	return DB_SUCCESS;
1792 }
1793 
1794 static
1795 void
assing_key_version(byte * buf,Encryption & encryption,bool is_page_encrypted)1796 assing_key_version(
1797 	byte* buf,
1798 	Encryption	&encryption,
1799 	bool is_page_encrypted)
1800 {
1801 	if (is_page_encrypted && encryption.m_type == Encryption::KEYRING)
1802 	{
1803 		mach_write_to_2(buf + FIL_PAGE_ORIGINAL_TYPE_V1, FIL_PAGE_ENCRYPTED);
1804 		ut_ad(encryption.m_key_version != ENCRYPTION_KEY_VERSION_NOT_ENCRYPTED);
1805 		mach_write_to_4(buf + FIL_PAGE_ENCRYPTION_KEY_VERSION, encryption.m_key_version);
1806 	}
1807 	else
1808 		mach_write_to_4(buf + FIL_PAGE_ENCRYPTION_KEY_VERSION, ENCRYPTION_KEY_VERSION_NOT_ENCRYPTED);
1809 }
1810 
1811 static
1812 bool
load_key_needed_for_decryption(const IORequest & type,Encryption & encryption,byte * buf)1813 load_key_needed_for_decryption(
1814 	const IORequest& type,
1815 	Encryption &encryption,
1816 	byte *buf)
1817 {
1818 	if (encryption.m_type == Encryption::KEYRING)
1819 	{
1820 		ulint key_version_read_from_page = ENCRYPTION_KEY_VERSION_INVALID;
1821 		ulint page_type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1822 		if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED)
1823 			key_version_read_from_page= mach_read_from_4(buf +  FIL_PAGE_DATA + 4);
1824 		else
1825 		{
1826 			ut_ad(page_type == FIL_PAGE_ENCRYPTED);
1827 			key_version_read_from_page= mach_read_from_4(buf + FIL_PAGE_ENCRYPTION_KEY_VERSION);
1828 		}
1829 
1830 		ut_ad(key_version_read_from_page != ENCRYPTION_KEY_VERSION_INVALID);
1831 		ut_ad(key_version_read_from_page != ENCRYPTION_KEY_VERSION_NOT_ENCRYPTED);
1832 
1833 		// in rare cases - when (re-)encryption was aborted there can be pages encrypted with
1834 		// different key versions in a given tablespace - retrieve needed key here
1835 
1836 		byte *key_read;
1837 
1838 		size_t key_len;
1839 		if (Encryption::get_tablespace_key(encryption.m_key_id,
1840 						   key_version_read_from_page,
1841 						   &key_read, &key_len) == false)
1842 		{
1843 			return false;
1844 			ut_ad(0);
1845 		}
1846 
1847 		//For test
1848 		if (key_version_read_from_page == encryption.m_key_version) {
1849 				ut_ad(memcmp(key_read, encryption.m_key, key_len) == 0);
1850 		}
1851 
1852 		// TODO: Allocated or not depends on whether key was taken from cache or keyring
1853                 encryption.set_key(key_read, static_cast<ulint>(key_len), true);
1854                 //encryption.m_key = key_read;
1855 		//******
1856 
1857 		//encryption.m_klen = static_cast<ulint>(key_len);
1858 		encryption.m_key_version = key_version_read_from_page;
1859 		//encryption.m_free_key_on_delete= true; // we own the key
1860 	}
1861 	else {
1862 		ut_ad(encryption.m_type == Encryption::AES);
1863 		if (encryption.m_encryption_rotation == Encryption::NO_ROTATION)
1864 			return true; // we are all set - needed key was alread loaded into encryption module
1865 
1866 		ut_ad(encryption.m_encryption_rotation == Encryption::MASTER_KEY_TO_KEYRING);
1867 		ut_ad(encryption.m_tablespace_iv != NULL);
1868 		encryption.m_iv = encryption.m_tablespace_iv; // iv comes from tablespace header for MK encryption
1869 		ut_ad(encryption.m_tablespace_key != NULL);
1870 		encryption.set_key(encryption.m_tablespace_key,
1871 				   ENCRYPTION_KEY_LEN, false);
1872 	}
1873 
1874 	return true;
1875 }
1876 
/** Decompress after a read and punch a hole in the file if it was a write.
When compression is not enabled for the request, the only work done is
decrypting redo log data located at or after LOG_FILE_HDR_SIZE (unless
encryption is disabled). For reads, an encrypted page is first verified
and decrypted, then the page is decompressed, and finally the key version
field of the page is set. For writes that punch holes, the part of the
page range not used by the compressed data is released.
@param[in]	type		IO context
@param[in]	fh		Open file handle, used for hole punching
@param[in,out]	buf		Buffer to transform
@param[in,out]	scratch		Scratch area for read decompression
@param[in]	src_len		Length of the buffer before compression
@param[in]	offset		Offset of the IO in the file, must be > 0
@param[in]	len		Used buffer length for write and output
				buf len for read
@return DB_SUCCESS or error code */
static
dberr_t
os_file_io_complete(
	const IORequest&type,
	os_file_t	fh,
	byte*		buf,
	byte*		scratch,
	ulint		src_len,
	os_offset_t	offset,
	ulint		len)
{
	dberr_t		ret = DB_SUCCESS;

	/* We never compress/decompress the first page */
	ut_a(offset > 0);
	ut_ad(type.validate());

	if (!type.is_compression_enabled()) {
		/* Only redo log data past the log file header may need
		decrypting here. */
		if (type.is_log() && offset >= LOG_FILE_HDR_SIZE
		    && !type.is_encryption_disabled()) {
			Encryption encryption(type.encryption_algorithm());

			ret = encryption.decrypt_log(type, buf, src_len,
						     scratch, len);
		}

		return(ret);
	} else if (type.is_read()) {
		Encryption	encryption(type.encryption_algorithm());

		/* Remember whether the page was encrypted as read from
		the file; it is needed after decryption below. */
		bool is_page_encrypted= type.is_encryption_disabled()
					? false
					: encryption.is_encrypted_page(buf);

		if (is_page_encrypted)
		{
			/* Verify, fetch the key, decrypt - in that order;
			the first failing step ends the processing. */
			dberr_t err = verify_post_encryption_checksum(type, encryption, buf, src_len);
			if (err != DB_SUCCESS)
				return err;

			if (!load_key_needed_for_decryption(type, encryption, buf))
				return DB_DECRYPTION_FAILED;

			ret = encryption.decrypt(type, buf, src_len, scratch, len);
			if (ret != DB_SUCCESS)
				return ret;
		}

		ret = os_file_decompress_page(type.is_dblwr_recover(),
					      buf, scratch, len);
		if (ret != DB_SUCCESS)
			return ret;
		if (Encryption::can_page_be_keyring_encrypted(buf) && !type.is_encryption_disabled())
			assing_key_version(buf, encryption, is_page_encrypted); // is_page_encrypted meaning page was encrypted before calling decrypt


	} else if (type.punch_hole()) {

		ut_ad(len <= src_len);
		ut_ad(!type.is_log());
		ut_ad(type.is_write());
		ut_ad(type.is_compressed());

		/* Nothing to do. */
		if (len == src_len) {
			return(DB_SUCCESS);
		}

#ifdef UNIV_DEBUG
		const ulint	block_size = type.block_size();
#endif /* UNIV_DEBUG */

		/* We don't support multiple page sizes in the server
		at the moment. */
		ut_ad(src_len == srv_page_size);

		/* Must be a multiple of the compression unit size. */
		ut_ad((len % block_size) == 0);
		ut_ad((offset % block_size) == 0);

		ut_ad(len + block_size <= src_len);

		/* The hole starts right after the bytes that are in use
		and covers the rest of the original page range. */
		offset += len;

		return(os_file_punch_hole(fh, offset, src_len - len));
	}
#ifdef UNIV_DEBUG
	/* Debug check for keyring writes: a page that is encrypted must
	pass the post encryption checksum verification. */
	if (type.is_write() && type.encryption_algorithm().m_type == Encryption::KEYRING) {
		Encryption	encryption(type.encryption_algorithm());
		bool was_page_encrypted= encryption.is_encrypted_page(buf);

		//TODO:Robert does this work without type.is_page_zip_compressed - it should
		ut_ad(!was_page_encrypted || //!type.is_page_zip_compressed() ||
		fil_space_verify_crypt_checksum(buf, src_len, type.is_page_zip_compressed(), encryption.is_encrypted_and_compressed(buf)));
	}
#endif

	ut_ad(!type.is_log());

	return(DB_SUCCESS);
}
1987 
1988 #endif /* !UNIV_HOTBACKUP */
1989 
1990 /** This function returns a new path name after replacing the basename
1991 in an old path with a new basename.  The old_path is a full path
1992 name including the extension.  The tablename is in the normal
1993 form "databasename/tablename".  The new base name is found after
1994 the forward slash.  Both input strings are null terminated.
1995 
1996 This function allocates memory to be returned.  It is the callers
1997 responsibility to free the return value after it is no longer needed.
1998 
1999 @param[in]	old_path		Pathname
2000 @param[in]	tablename		Contains new base name
2001 @return own: new full pathname */
2002 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)2003 os_file_make_new_pathname(
2004 	const char*	old_path,
2005 	const char*	tablename)
2006 {
2007 	ulint		dir_len;
2008 	char*		last_slash;
2009 	char*		base_name;
2010 	char*		new_path;
2011 	ulint		new_path_len;
2012 
2013 	/* Split the tablename into its database and table name components.
2014 	They are separated by a '/'. */
2015 	last_slash = strrchr((char*) tablename, '/');
2016 	base_name = last_slash ? last_slash + 1 : (char*) tablename;
2017 
2018 	/* Find the offset of the last slash. We will strip off the
2019 	old basename.ibd which starts after that slash. */
2020 	last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
2021 	dir_len = last_slash ? last_slash - old_path : strlen(old_path);
2022 
2023 	/* allocate a new path and move the old directory path to it. */
2024 	new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
2025 	new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
2026 	memcpy(new_path, old_path, dir_len);
2027 
2028 	ut_snprintf(new_path + dir_len,
2029 		    new_path_len - dir_len,
2030 		    "%c%s.ibd",
2031 		    OS_PATH_SEPARATOR,
2032 		    base_name);
2033 
2034 	return(new_path);
2035 }
2036 
2037 /** This function reduces a null-terminated full remote path name into
2038 the path that is sent by MySQL for DATA DIRECTORY clause.  It replaces
2039 the 'databasename/tablename.ibd' found at the end of the path with just
2040 'tablename'.
2041 
2042 Since the result is always smaller than the path sent in, no new memory
2043 is allocated. The caller should allocate memory for the path sent in.
2044 This function manipulates that path in place.
2045 
2046 If the path format is not as expected, just return.  The result is used
2047 to inform a SHOW CREATE TABLE command.
2048 @param[in,out]	data_dir_path		Full path/data_dir_path */
2049 void
os_file_make_data_dir_path(char * data_dir_path)2050 os_file_make_data_dir_path(
2051 	char*	data_dir_path)
2052 {
2053 	/* Replace the period before the extension with a null byte. */
2054 	char*	ptr = strrchr((char*) data_dir_path, '.');
2055 
2056 	if (ptr == NULL) {
2057 		return;
2058 	}
2059 
2060 	ptr[0] = '\0';
2061 
2062 	/* The tablename starts after the last slash. */
2063 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
2064 
2065 	if (ptr == NULL) {
2066 		return;
2067 	}
2068 
2069 	ptr[0] = '\0';
2070 
2071 	char*	tablename = ptr + 1;
2072 
2073 	/* The databasename starts after the next to last slash. */
2074 	ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
2075 
2076 	if (ptr == NULL) {
2077 		return;
2078 	}
2079 
2080 	ulint	tablename_len = ut_strlen(tablename);
2081 
2082 	ut_memmove(++ptr, tablename, tablename_len);
2083 
2084 	ptr[tablename_len] = '\0';
2085 }
2086 
2087 /** Check if the path refers to the root of a drive using a pointer
2088 to the last directory separator that the caller has fixed.
2089 @param[in]	path	path name
2090 @param[in]	path	last directory separator in the path
2091 @return true if this path is a drive root, false if not */
2092 UNIV_INLINE
2093 bool
os_file_is_root(const char * path,const char * last_slash)2094 os_file_is_root(
2095 	const char*	path,
2096 	const char*	last_slash)
2097 {
2098 	return(
2099 #ifdef _WIN32
2100 	       (last_slash == path + 2 && path[1] == ':') ||
2101 #endif /* _WIN32 */
2102 	       last_slash == path);
2103 }
2104 
2105 /** Return the parent directory component of a null-terminated path.
2106 Return a new buffer containing the string up to, but not including,
2107 the final component of the path.
2108 The path returned will not contain a trailing separator.
2109 Do not return a root path, return NULL instead.
2110 The final component trimmed off may be a filename or a directory name.
2111 If the final component is the only component of the path, return NULL.
2112 It is the caller's responsibility to free the returned string after it
2113 is no longer needed.
2114 @param[in]	path		Path name
2115 @return own: parent directory of the path */
2116 static
2117 char*
os_file_get_parent_dir(const char * path)2118 os_file_get_parent_dir(
2119 	const char*	path)
2120 {
2121 	bool	has_trailing_slash = false;
2122 
2123 	/* Find the offset of the last slash */
2124 	const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
2125 
2126 	if (!last_slash) {
2127 		/* No slash in the path, return NULL */
2128 		return(NULL);
2129 	}
2130 
2131 	/* Ok, there is a slash. Is there anything after it? */
2132 	if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
2133 		has_trailing_slash = true;
2134 	}
2135 
2136 	/* Reduce repetative slashes. */
2137 	while (last_slash > path
2138 		&& last_slash[-1] == OS_PATH_SEPARATOR) {
2139 		last_slash--;
2140 	}
2141 
2142 	/* Check for the root of a drive. */
2143 	if (os_file_is_root(path, last_slash)) {
2144 		return(NULL);
2145 	}
2146 
2147 	/* If a trailing slash prevented the first strrchr() from trimming
2148 	the last component of the path, trim that component now. */
2149 	if (has_trailing_slash) {
2150 		/* Back up to the previous slash. */
2151 		last_slash--;
2152 		while (last_slash > path
2153 		       && last_slash[0] != OS_PATH_SEPARATOR) {
2154 			last_slash--;
2155 		}
2156 
2157 		/* Reduce repetative slashes. */
2158 		while (last_slash > path
2159 			&& last_slash[-1] == OS_PATH_SEPARATOR) {
2160 			last_slash--;
2161 		}
2162 	}
2163 
2164 	/* Check for the root of a drive. */
2165 	if (os_file_is_root(path, last_slash)) {
2166 		return(NULL);
2167 	}
2168 
2169 	if (last_slash - path < 0) {
2170 		/* Sanity check, it prevents gcc from trying to handle this case which
2171 		 * results in warnings for some optimized builds */
2172 		return (NULL);
2173 	}
2174 
2175 	/* Non-trivial directory component */
2176 
2177 	return(mem_strdupl(path, last_slash - path));
2178 }
2179 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
2180 
2181 /* Test the function os_file_get_parent_dir. */
2182 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)2183 test_os_file_get_parent_dir(
2184 	const char*	child_dir,
2185 	const char*	expected_dir)
2186 {
2187 	char* child = mem_strdup(child_dir);
2188 	char* expected = expected_dir == NULL ? NULL
2189 			 : mem_strdup(expected_dir);
2190 
2191 	/* os_file_get_parent_dir() assumes that separators are
2192 	converted to OS_PATH_SEPARATOR. */
2193 	os_normalize_path(child);
2194 	os_normalize_path(expected);
2195 
2196 	char* parent = os_file_get_parent_dir(child);
2197 
2198 	bool unexpected = (expected == NULL
2199 			  ? (parent != NULL)
2200 			  : (0 != strcmp(parent, expected)));
2201 	if (unexpected) {
2202 		ib::fatal() << "os_file_get_parent_dir('" << child
2203 			<< "') returned '" << parent
2204 			<< "', instead of '" << expected << "'.";
2205 	}
2206 	ut_free(parent);
2207 	ut_free(child);
2208 	ut_free(expected);
2209 }
2210 
2211 /* Test the function os_file_get_parent_dir. */
2212 void
unit_test_os_file_get_parent_dir()2213 unit_test_os_file_get_parent_dir()
2214 {
2215 	test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
2216 	test_os_file_get_parent_dir("/usr/", NULL);
2217 	test_os_file_get_parent_dir("//usr//", NULL);
2218 	test_os_file_get_parent_dir("usr", NULL);
2219 	test_os_file_get_parent_dir("usr//", NULL);
2220 	test_os_file_get_parent_dir("/", NULL);
2221 	test_os_file_get_parent_dir("//", NULL);
2222 	test_os_file_get_parent_dir(".", NULL);
2223 	test_os_file_get_parent_dir("..", NULL);
2224 # ifdef _WIN32
2225 	test_os_file_get_parent_dir("D:", NULL);
2226 	test_os_file_get_parent_dir("D:/", NULL);
2227 	test_os_file_get_parent_dir("D:\\", NULL);
2228 	test_os_file_get_parent_dir("D:/data", NULL);
2229 	test_os_file_get_parent_dir("D:/data/", NULL);
2230 	test_os_file_get_parent_dir("D:\\data\\", NULL);
2231 	test_os_file_get_parent_dir("D:///data/////", NULL);
2232 	test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
2233 	test_os_file_get_parent_dir("D:/data//a", "D:/data");
2234 	test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
2235 	test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
2236 	test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
2237 #endif  /* _WIN32 */
2238 }
2239 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
2240 
2241 
2242 /** Creates all missing subdirectories along the given path.
2243 @param[in]	path		Path name
2244 @return DB_SUCCESS if OK, otherwise error code. */
2245 dberr_t
os_file_create_subdirs_if_needed(const char * path)2246 os_file_create_subdirs_if_needed(
2247 	const char*	path)
2248 {
2249 	if (srv_read_only_mode) {
2250 
2251 		ib::error()
2252 			<< "read only mode set. Can't create "
2253 			<< "subdirectories '" << path << "'";
2254 
2255 		return(DB_READ_ONLY);
2256 
2257 	}
2258 
2259 	char*	subdir = os_file_get_parent_dir(path);
2260 
2261 	if (subdir == NULL) {
2262 		/* subdir is root or cwd, nothing to do */
2263 		return(DB_SUCCESS);
2264 	}
2265 
2266 	/* Test if subdir exists */
2267 	os_file_type_t	type;
2268 	bool	subdir_exists;
2269 	bool	success = os_file_status(subdir, &subdir_exists, &type);
2270 
2271 	if (success && !subdir_exists) {
2272 
2273 		/* Subdir does not exist, create it */
2274 		dberr_t	err = os_file_create_subdirs_if_needed(subdir);
2275 
2276 		if (err != DB_SUCCESS) {
2277 
2278 			ut_free(subdir);
2279 
2280 			return(err);
2281 		}
2282 
2283 		success = os_file_create_directory(subdir, false);
2284 	}
2285 
2286 	ut_free(subdir);
2287 
2288 	return(success ? DB_SUCCESS : DB_ERROR);
2289 }
2290 
2291 /** Allocate the buffer for IO on a transparently compressed table.
2292 @param[in]	type		IO flags
2293 @param[out]	buf		buffer to read or write
2294 @param[in,out]	n		number of bytes to read/write, starting from
2295 				offset
2296 @return pointer to allocated page, compressed data is written to the offset
2297 	that is aligned on the disk sector size */
2298 static
2299 Block*
os_file_compress_page(IORequest & type,void * & buf,ulint * n)2300 os_file_compress_page(
2301 	IORequest&	type,
2302 	void*&		buf,
2303 	ulint*		n)
2304 {
2305 	ut_ad(!type.is_log());
2306 	ut_ad(type.is_write());
2307 	ut_ad(type.is_compressed());
2308 
2309 	ulint	n_alloc = *n * 2;
2310 
2311 	ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
2312 	ut_a(type.compression_algorithm().m_type != Compression::LZ4
2313 	     || static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
2314 
2315 	Block*  block = os_alloc_block();
2316 
2317 	ulint	old_compressed_len;
2318 	ulint	compressed_len = *n;
2319 
2320 	old_compressed_len = mach_read_from_2(
2321 		reinterpret_cast<byte*>(buf)
2322 		+ FIL_PAGE_COMPRESS_SIZE_V1);
2323 
2324 	if (old_compressed_len > 0) {
2325 		old_compressed_len = ut_calc_align(
2326 			old_compressed_len + FIL_PAGE_DATA,
2327 			type.block_size());
2328 	} else {
2329 		old_compressed_len = *n;
2330 	}
2331 
2332 	byte*	compressed_page;
2333 
2334 	compressed_page = static_cast<byte*>(
2335 		ut_align(block->m_ptr, os_io_ptr_align));
2336 
2337 	byte*	buf_ptr;
2338 
2339 	buf_ptr = os_file_compress_page(
2340 		type.compression_algorithm(),
2341 		type.block_size(),
2342 		reinterpret_cast<byte*>(buf),
2343 		*n,
2344 		compressed_page,
2345 		&compressed_len,
2346 		type.encryption_algorithm().m_type == Encryption::KEYRING &&
2347 		type.encryption_algorithm().m_key != NULL);
2348 
2349 	if (buf_ptr != buf) {
2350 		/* Set new compressed size to uncompressed page. */
2351 		memcpy(reinterpret_cast<byte*>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
2352 		       buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
2353 
2354 		buf = buf_ptr;
2355 		*n = compressed_len;
2356 
2357 		if (compressed_len >= old_compressed_len) {
2358 
2359 			ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
2360 
2361 			type.clear_punch_hole();
2362 		}
2363 	}
2364 
2365 	return(block);
2366 }
2367 
2368 /** Encrypt a page content when write it to disk.
2369 @param[in]	type		IO flags
2370 @param[out]	buf		buffer to read or write
2371 @param[in,out]	n		number of bytes to read/write, starting from
2372 				offset
2373 @return pointer to the encrypted page */
2374 static
2375 Block*
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)2376 os_file_encrypt_page(
2377 	const IORequest&	type,
2378 	void*&			buf,
2379 	ulint*			n)
2380 {
2381 
2382 	byte*		encrypted_page;
2383 	ulint		encrypted_len = *n;
2384 	byte*		buf_ptr;
2385 	Encryption	encryption(type.encryption_algorithm());
2386 
2387 	ut_ad(type.is_write());
2388 	ut_ad(type.is_encrypted());
2389 
2390 	Block*  block = os_alloc_block();
2391 
2392 	encrypted_page = static_cast<byte*>(
2393 		ut_align(block->m_ptr, os_io_ptr_align));
2394 
2395 	buf_ptr = encryption.encrypt(type,
2396 				     reinterpret_cast<byte*>(buf), *n,
2397 				     encrypted_page, &encrypted_len);
2398 
2399 	bool	encrypted = buf_ptr != buf;
2400 
2401 	if (encrypted) {
2402 
2403 		buf = buf_ptr;
2404 		*n = encrypted_len;
2405 	}
2406 
2407 	return(block);
2408 }
2409 
2410 /** Encrypt log blocks content when write it to disk.
2411 @param[in]	type		IO flags
2412 @param[in,out]	buf		buffer to read or write
2413 @param[in,out]	scratch		buffer for encrypting log
2414 @param[in,out]	n		number of bytes to read/write, starting from
2415 				offset
2416 @return pointer to the encrypted log blocks */
2417 static
2418 Block*
os_file_encrypt_log(const IORequest & type,void * & buf,byte * & scratch,ulint * n)2419 os_file_encrypt_log(
2420 	const IORequest&	type,
2421 	void*&			buf,
2422 	byte*&			scratch,
2423 	ulint*			n)
2424 {
2425 
2426 	byte*		buf_ptr;
2427 	Block*		block = NULL;
2428 
2429 	ut_ad(type.is_write());
2430 	ut_ad(type.is_encrypted());
2431 	ut_ad(type.is_log());
2432 	ut_ad(*n % OS_FILE_LOG_BLOCK_SIZE == 0);
2433 
2434 	if (*n <= BUFFER_BLOCK_SIZE - os_io_ptr_align) {
2435 		block = os_alloc_block();
2436 		buf_ptr = block->m_ptr;
2437 		scratch = NULL;
2438 	} else {
2439 		buf_ptr = static_cast<byte*>(
2440 			ut_malloc_nokey(*n + os_io_ptr_align));
2441 		scratch = buf_ptr;
2442 	}
2443 
2444 	byte*		encrypted_log;
2445 	encrypted_log = static_cast<byte*>(ut_align(buf_ptr, os_io_ptr_align));
2446 
2447 	ulint encrypted_len = *n;
2448 	Encryption encryption(type.encryption_algorithm());
2449 	encrypted_log = encryption.encrypt_log(type,
2450 					       reinterpret_cast<byte*>(buf),
2451 					       *n, encrypted_log,
2452 					       &encrypted_len);
2453 
2454 	bool	encrypted = encrypted_log != buf;
2455 
2456 	if (encrypted) {
2457 		buf = encrypted_log;
2458 		*n = encrypted_len;
2459 	}
2460 
2461 	return(block);
2462 }
2463 
2464 #ifndef _WIN32
2465 
2466 /** Do the read/write
2467 @param[in]	request	The IO context and type
2468 @return the number of bytes read/written or negative value on error */
2469 ssize_t
execute(const IORequest & request)2470 SyncFileIO::execute(const IORequest& request)
2471 {
2472 	ssize_t	n_bytes;
2473 
2474 	if (request.is_read()) {
2475 		n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2476 	} else {
2477 		ut_ad(request.is_write());
2478 		n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2479 	}
2480 
2481 	return(n_bytes);
2482 }
2483 
2484 MY_ATTRIBUTE((warn_unused_result))
2485 static std::string
os_file_find_path_for_fd(os_file_t fd)2486 os_file_find_path_for_fd(
2487 	os_file_t fd)
2488 {
2489 	char fdname[FN_REFLEN];
2490 	snprintf(fdname, sizeof fdname, "/proc/%d/fd/%d", getpid(), fd);
2491 	char filename[FN_REFLEN];
2492 	const int err_filename = my_readlink(filename, fdname, MYF(0));
2493 	return std::string((err_filename != -1) ? filename : "");
2494 }
2495 
2496 /** Free storage space associated with a section of the file.
2497 @param[in]	fh		Open file handle
2498 @param[in]	off		Starting offset (SEEK_SET)
2499 @param[in]	len		Size of the hole
2500 @return DB_SUCCESS or error code */
2501 static
2502 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2503 os_file_punch_hole_posix(
2504 	os_file_t	fh,
2505 	os_offset_t	off,
2506 	os_offset_t	len)
2507 {
2508 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2509 	const int	mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2510 
2511 	int             ret = fallocate(fh, mode, off, len);
2512 
2513 	if (ret == 0) {
2514 		return(DB_SUCCESS);
2515 	}
2516 
2517 	ut_a(ret == -1);
2518 
2519 	if (errno == ENOTSUP) {
2520 		return(DB_IO_NO_PUNCH_HOLE);
2521 	}
2522 
2523 	const std::string fd_path = os_file_find_path_for_fd(fh);
2524 	if (!fd_path.empty()) {
2525 		ib::warn()
2526 			<< "fallocate(" << fh << " ("
2527 			<< fd_path << "), FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2528 			<< off << ", " << len << ") returned errno: "
2529 			<<  errno;
2530 	} else {
2531 		ib::warn()
2532 			<< "fallocate(" << fh
2533 			<<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2534 			<< off << ", " << len << ") returned errno: "
2535 			<<  errno;
2536 	}
2537 
2538 	return(DB_IO_ERROR);
2539 
2540 #elif defined(UNIV_SOLARIS)
2541 
2542 	// Use F_FREESP
2543 
2544 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2545 
2546 	return(DB_IO_NO_PUNCH_HOLE);
2547 }
2548 
2549 #if defined(LINUX_NATIVE_AIO)
2550 
2551 /** Linux native AIO handler */
2552 class LinuxAIOHandler {
2553 public:
2554 	/**
2555 	@param[in] global_segment	The global segment*/
LinuxAIOHandler(ulint global_segment)2556 	LinuxAIOHandler(ulint global_segment)
2557 		:
2558 		m_global_segment(global_segment)
2559 	{
2560 		/* Should never be doing Sync IO here. */
2561 		ut_a(m_global_segment != ULINT_UNDEFINED);
2562 
2563 		/* Find the array and the local segment. */
2564 
2565 		m_segment = AIO::get_array_and_local_segment(
2566 			&m_array, m_global_segment);
2567 
2568 		m_n_slots = m_array->slots_per_segment();
2569 	}
2570 
2571 	/** Destructor */
~LinuxAIOHandler()2572 	~LinuxAIOHandler()
2573 	{
2574 		// No op
2575 	}
2576 
2577 	/**
2578 	Process a Linux AIO request
2579 	@param[out]	m1		the messages passed with the
2580 	@param[out]	m2		AIO request; note that in case the
2581 					AIO operation failed, these output
2582 					parameters are valid and can be used to
2583 					restart the operation.
2584 	@param[out]	request		IO context
2585 	@return DB_SUCCESS or error code */
2586 	dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
2587 
2588 private:
2589 	/** Resubmit an IO request that was only partially successful
2590 	@param[in,out]	slot		Request to resubmit
2591 	@return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2592 	dberr_t	resubmit(Slot* slot);
2593 
2594 	/** Check if the AIO succeeded
2595 	@param[in,out]	slot		The slot to check
2596 	@return DB_SUCCESS, DB_FAIL if the operation should be retried or
2597 		DB_IO_ERROR on all other errors */
2598 	dberr_t	check_state(Slot* slot);
2599 
2600 	/** @return true if a shutdown was detected */
is_shutdown() const2601 	bool is_shutdown() const
2602 	{
2603 		return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2604 		       && !buf_page_cleaner_is_active);
2605 	}
2606 
2607 	/** If no slot was found then the m_array->m_mutex will be released.
2608 	@param[out]	n_pending	The number of pending IOs
2609 	@return NULL or a slot that has completed IO */
2610 	Slot* find_completed_slot(ulint* n_pending);
2611 
2612 	/** This is called from within the IO-thread. If there are no completed
2613 	IO requests in the slot array, the thread calls this function to
2614 	collect more requests from the Linux kernel.
2615 	The IO-thread waits on io_getevents(), which is a blocking call, with
2616 	a timeout value. Unless the system is very heavy loaded, keeping the
2617 	IO-thread very busy, the io-thread will spend most of its time waiting
2618 	in this function.
2619 	The IO-thread also exits in this function. It checks server status at
2620 	each wakeup and that is why we use timed wait in io_getevents(). */
2621 	void collect();
2622 
2623 private:
2624 	/** Slot array */
2625 	AIO*			m_array;
2626 
2627 	/** Number of slots inthe local segment */
2628 	ulint			m_n_slots;
2629 
2630 	/** The local segment to check */
2631 	ulint			m_segment;
2632 
2633 	/** The global segment */
2634 	ulint			m_global_segment;
2635 };
2636 
2637 /** Resubmit an IO request that was only partially successful
2638 @param[in,out]	slot		Request to resubmit
2639 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2640 dberr_t
resubmit(Slot * slot)2641 LinuxAIOHandler::resubmit(Slot* slot)
2642 {
2643 #ifdef UNIV_DEBUG
2644 	/* Bytes already read/written out */
2645 	ulint	n_bytes = slot->ptr - slot->buf;
2646 
2647 	ut_ad(m_array->is_mutex_owned());
2648 
2649 	ut_ad(n_bytes < slot->original_len);
2650 	ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2651 	/* Partial read or write scenario */
2652 	ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2653 #endif /* UNIV_DEBUG */
2654 
2655 	slot->len -= slot->n_bytes;
2656 	slot->ptr += slot->n_bytes;
2657 	slot->offset += slot->n_bytes;
2658 
2659 	/* Resetting the bytes read/written */
2660 	slot->n_bytes = 0;
2661 	slot->io_already_done = false;
2662 
2663 	/* make sure that slot->offset fits in off_t */
2664 	ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2665 
2666 	struct iocb*	iocb = &slot->control;
2667 	if (slot->type.is_read()) {
2668 		io_prep_pread(
2669 			iocb,
2670 			slot->file.m_file,
2671 			slot->ptr,
2672 			slot->len,
2673 			slot->offset);
2674 
2675 	} else {
2676 
2677 		ut_a(slot->type.is_write());
2678 
2679 		io_prep_pwrite(
2680 			iocb,
2681 			slot->file.m_file,
2682 			slot->ptr,
2683 			slot->len,
2684 			slot->offset);
2685 	}
2686 
2687 	iocb->data = slot;
2688 
2689 	/* Resubmit an I/O request */
2690 	int	ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2691 
2692 	if (ret < -1)  {
2693 		errno = -ret;
2694 	}
2695 
2696 	return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2697 }
2698 
2699 /** Check if the AIO succeeded
2700 @param[in,out]	slot		The slot to check
2701 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2702 	DB_IO_ERROR on all other errors */
2703 dberr_t
check_state(Slot * slot)2704 LinuxAIOHandler::check_state(Slot* slot)
2705 {
2706 	ut_ad(m_array->is_mutex_owned());
2707 
2708 	/* Note that it may be that there is more then one completed
2709 	IO requests. We process them one at a time. We may have a case
2710 	here to improve the performance slightly by dealing with all
2711 	requests in one sweep. */
2712 
2713 	srv_set_io_thread_op_info(
2714 		m_global_segment, "processing completed aio requests");
2715 
2716 	ut_ad(slot->io_already_done);
2717 
2718 	dberr_t	err;
2719 
2720 	if (slot->ret == 0) {
2721 
2722 		err = AIOHandler::post_io_processing(slot);
2723 
2724 	} else {
2725 		errno = -slot->ret;
2726 
2727 		/* os_file_handle_error does tell us if we should retry
2728 		this IO. As it stands now, we don't do this retry when
2729 		reaping requests from a different context than
2730 		the dispatcher. This non-retry logic is the same for
2731 		Windows and Linux native AIO.
2732 		We should probably look into this to transparently
2733 		re-submit the IO. */
2734 		os_file_handle_error(slot->name, "Linux aio");
2735 
2736 		err = DB_IO_ERROR;
2737 	}
2738 
2739 	return(err);
2740 }
2741 
2742 /** If no slot was found then the m_array->m_mutex will be released.
2743 @param[out]	n_pending		The number of pending IOs
2744 @return NULL or a slot that has completed IO */
2745 Slot*
find_completed_slot(ulint * n_pending)2746 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
2747 {
2748 	ulint	offset = m_n_slots * m_segment;
2749 
2750 	*n_pending = 0;
2751 
2752 	m_array->acquire();
2753 
2754 	Slot*	slot = m_array->at(offset);
2755 
2756 	for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2757 
2758 		if (slot->is_reserved) {
2759 
2760 			++*n_pending;
2761 
2762 			if (slot->io_already_done) {
2763 
2764 				/* Something for us to work on.
2765 				Note: We don't release the mutex. */
2766 				return(slot);
2767 			}
2768 		}
2769 	}
2770 
2771 	m_array->release();
2772 
2773 	return(NULL);
2774 }
2775 
2776 /** This function is only used in Linux native asynchronous i/o. This is
2777 called from within the io-thread. If there are no completed IO requests
2778 in the slot array, the thread calls this function to collect more
2779 requests from the kernel.
2780 The io-thread waits on io_getevents(), which is a blocking call, with
2781 a timeout value. Unless the system is very heavy loaded, keeping the
2782 io-thread very busy, the io-thread will spend most of its time waiting
2783 in this function.
2784 The io-thread also exits in this function. It checks server status at
2785 each wakeup and that is why we use timed wait in io_getevents(). */
2786 void
collect()2787 LinuxAIOHandler::collect()
2788 {
2789 	ut_ad(m_n_slots > 0);
2790 	ut_ad(m_array != NULL);
2791 	ut_ad(m_segment < m_array->get_n_segments());
2792 
2793 	/* Which io_context we are going to use. */
2794 	io_context*	io_ctx = m_array->io_ctx(m_segment);
2795 
2796 	/* Starting point of the m_segment we will be working on. */
2797 	ulint	start_pos = m_segment * m_n_slots;
2798 
2799 	/* End point. */
2800 	ulint	end_pos = start_pos + m_n_slots;
2801 
2802 	for (;;) {
2803 		struct io_event*	events;
2804 
2805 		/* Which part of event array we are going to work on. */
2806 		events = m_array->io_events(m_segment * m_n_slots);
2807 
2808 		/* Initialize the events. */
2809 		memset(events, 0, sizeof(*events) * m_n_slots);
2810 
2811 		/* The timeout value is arbitrary. We probably need
2812 		to experiment with it a little. */
2813 		struct timespec		timeout;
2814 
2815 		timeout.tv_sec = 0;
2816 		timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
2817 
2818 		int	ret;
2819 
2820 		ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
2821 
2822 		for (int i = 0; i < ret; ++i) {
2823 
2824 			struct iocb*	iocb;
2825 
2826 			iocb = reinterpret_cast<struct iocb*>(events[i].obj);
2827 			ut_a(iocb != NULL);
2828 
2829 			Slot*	slot = reinterpret_cast<Slot*>(iocb->data);
2830 
2831 			/* Some sanity checks. */
2832 			ut_a(slot != NULL);
2833 			ut_a(slot->is_reserved);
2834 
2835 			/* We are not scribbling previous segment. */
2836 			ut_a(slot->pos >= start_pos);
2837 
2838 			/* We have not overstepped to next segment. */
2839 			ut_a(slot->pos < end_pos);
2840 
2841 			/* We never compress/decompress the first page */
2842 
2843 			if (slot->offset > 0
2844 			    && !slot->skip_punch_hole
2845 			    && slot->type.is_compression_enabled()
2846 			    && !slot->type.is_log()
2847 			    && slot->type.is_write()
2848 			    && slot->type.is_compressed()
2849 			    && slot->type.punch_hole()) {
2850 
2851 				slot->err = AIOHandler::io_complete(slot);
2852 			} else {
2853 				slot->err = DB_SUCCESS;
2854 			}
2855 
2856 			/* Mark this request as completed. The error handling
2857 			will be done in the calling function. */
2858 			m_array->acquire();
2859 
2860 			/* events[i].res2 should always be ZERO */
2861 			ut_ad(events[i].res2 == 0);
2862 			slot->io_already_done = true;
2863 
2864 			/*Even though events[i].res is an unsigned number
2865 			in libaio, it is used to return a negative value
2866 			(negated errno value) to indicate error and a positive
2867 			value to indicate number of bytes read or written. */
2868 
2869 			if (events[i].res > slot->len) {
2870 				/* failure */
2871 				slot->n_bytes = 0;
2872 				slot->ret = events[i].res;
2873 			} else {
2874 				/* success */
2875 				slot->n_bytes = events[i].res;
2876 				slot->ret = 0;
2877 			}
2878 			m_array->release();
2879 		}
2880 
2881 		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2882 		    || !buf_page_cleaner_is_active
2883 		    || ret > 0) {
2884 
2885 			break;
2886 		}
2887 
2888 		/* This error handling is for any error in collecting the
2889 		IO requests. The errors, if any, for any particular IO
2890 		request are simply passed on to the calling routine. */
2891 
2892 		switch (ret) {
2893 		case -EAGAIN:
2894 			/* Not enough resources! Try again. */
2895 
2896 		case -EINTR:
2897 			/* Interrupted! The behaviour in case of an interrupt.
2898 			If we have some completed IOs available then the
2899 			return code will be the number of IOs. We get EINTR
2900 			only if there are no completed IOs and we have been
2901 			interrupted. */
2902 
2903 		case 0:
2904 			/* No pending request! Go back and check again. */
2905 
2906 			continue;
2907 		}
2908 
2909 		/* All other errors should cause a trap for now. */
2910 		ib::fatal()
2911 			<< "Unexpected ret_code[" << ret
2912 			<< "] from io_getevents()!";
2913 
2914 		break;
2915 	}
2916 }
2917 
2918 /** Process a Linux AIO request
2919 @param[out]	m1		the messages passed with the
2920 @param[out]	m2		AIO request; note that in case the
2921 				AIO operation failed, these output
2922 				parameters are valid and can be used to
2923 				restart the operation.
2924 @param[out]	request		IO context
2925 @return DB_SUCCESS or error code */
2926 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)2927 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
2928 {
2929 	dberr_t		err;
2930 	Slot*		slot;
2931 
2932 	/* Loop until we have found a completed request. */
2933 	for (;;) {
2934 
2935 		ulint	n_pending;
2936 
2937 		slot = find_completed_slot(&n_pending);
2938 
2939 		if (slot != NULL) {
2940 
2941 			ut_ad(m_array->is_mutex_owned());
2942 
2943 			err = check_state(slot);
2944 
2945 			/* DB_FAIL is not a hard error, we should retry */
2946 			if (err != DB_FAIL) {
2947 				break;
2948 			}
2949 
2950 			/* Partial IO, resubmit request for
2951 			remaining bytes to read/write */
2952 			err = resubmit(slot);
2953 
2954 			if (err != DB_SUCCESS) {
2955 				break;
2956 			}
2957 
2958 			m_array->release();
2959 
2960 		} else if (is_shutdown() && n_pending == 0) {
2961 
2962 			/* There is no completed request. If there is
2963 			no pending request at all, and the system is
2964 			being shut down, exit. */
2965 
2966 			*m1 = NULL;
2967 			*m2 = NULL;
2968 
2969 			return(DB_SUCCESS);
2970 
2971 		} else {
2972 
2973 			/* Wait for some request. Note that we return
2974 			from wait if we have found a request. */
2975 
2976 			srv_set_io_thread_op_info(
2977 				m_global_segment,
2978 				"waiting for completed aio requests");
2979 
2980 			collect();
2981 		}
2982 	}
2983 
2984 	if (err == DB_IO_PARTIAL_FAILED) {
2985 		/* Aborting in case of submit failure */
2986 		ib::fatal()
2987 			<< "Native Linux AIO interface. "
2988 			"io_submit() call failed when "
2989 			"resubmitting a partial I/O "
2990 			"request on the file " << slot->name
2991 			<< ".";
2992 	}
2993 
2994 	*m1 = slot->m1;
2995 	*m2 = slot->m2;
2996 
2997 	*request = slot->type;
2998 
2999 	m_array->release(slot);
3000 
3001 	m_array->release();
3002 
3003 	return(err);
3004 }
3005 
3006 /** This function is only used in Linux native asynchronous i/o.
3007 Waits for an aio operation to complete. This function is used to wait for
3008 the completed requests. The aio array of pending requests is divided
3009 into segments. The thread specifies which segment or slot it wants to wait
3010 for. NOTE: this function will also take care of freeing the aio slot,
3011 therefore no other thread is allowed to do the freeing!
3012 
3013 @param[in]	global_seg	segment number in the aio array
3014 				to wait for; segment 0 is the ibuf
3015 				i/o thread, segment 1 is log i/o thread,
3016 				then follow the non-ibuf read threads,
3017 				and the last are the non-ibuf write
3018 				threads.
3019 @param[out]	m1		the messages passed with the
3020 @param[out]	m2			AIO request; note that in case the
3021 				AIO operation failed, these output
3022 				parameters are valid and can be used to
3023 				restart the operation.
3024 @param[out]xi	 request	IO context
3025 @return DB_SUCCESS if the IO was successful */
3026 static
3027 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)3028 os_aio_linux_handler(
3029 	ulint		global_segment,
3030 	fil_node_t**	m1,
3031 	void**		m2,
3032 	IORequest*	request)
3033 {
3034 	LinuxAIOHandler	handler(global_segment);
3035 
3036 	dberr_t	err = handler.poll(m1, m2, request);
3037 
3038 	if (err == DB_IO_NO_PUNCH_HOLE) {
3039 		fil_no_punch_hole(*m1);
3040 		err = DB_SUCCESS;
3041 	}
3042 
3043 	return(err);
3044 }
3045 #endif
3046 
3047 /** Submit buffered AIO requests on the given segment to the kernel.
3048 (low level function).
3049 @param[in] acquire_mutex specifies whether to lock array mutex */
3050 void
os_aio_dispatch_read_array_submit_low(bool acquire_mutex MY_ATTRIBUTE ((unused)))3051 AIO::os_aio_dispatch_read_array_submit_low(
3052 	bool acquire_mutex MY_ATTRIBUTE((unused)))
3053 {
3054 	os_aio_dispatch_read_array_submit_low_for_array(acquire_mutex, s_reads);
3055 	if (s_ibuf != NULL) {
3056 		os_aio_dispatch_read_array_submit_low_for_array(acquire_mutex, s_ibuf);
3057 	}
3058 }
3059 
3060 /** Submit buffered AIO requests on the array to the kernel.
3061 (low level function).
3062 @param[in] acquire_mutex specifies whether to lock array mutex
3063 @param[in] array for which to submit IO */
3064 void
os_aio_dispatch_read_array_submit_low_for_array(bool acquire_mutex MY_ATTRIBUTE ((unused)),const AIO * arr)3065 AIO::os_aio_dispatch_read_array_submit_low_for_array(
3066 	bool acquire_mutex MY_ATTRIBUTE((unused)), const AIO* arr)
3067 {
3068 	if (!srv_use_native_aio) {
3069 		return;
3070 	}
3071 #if defined(LINUX_NATIVE_AIO)
3072 	const AIO* array = arr;
3073 	ulint total_submitted = 0;
3074 	if (acquire_mutex)
3075 		array->acquire();
3076 	/* Submit aio requests buffered on all segments. */
3077 	ut_ad(array->m_pending);
3078 	ut_ad(array->m_count);
3079 	for (ulint i = 0; i < array->m_n_segments; i++) {
3080 		const int	count = array->m_count[i];
3081 		int	offset = 0;
3082 		while (offset != count) {
3083 			struct iocb** const	iocb_array = array->m_pending
3084 				+ i * array->m_slots.size()
3085 					/ array->m_n_segments
3086 				+ offset;
3087 			const int	partial_count = count - offset;
3088 			/* io_submit() returns number of successfully queued
3089 			requests or (-errno).
3090 			It returns 0 only if the number of iocb blocks passed
3091 			is also 0. */
3092 			const int	submitted = io_submit(
3093 						array->m_aio_ctx[i],
3094 						partial_count, iocb_array);
3095 
3096 			/* This assertion prevents infinite loop in both
3097 			debug and release modes. */
3098 			ut_a(submitted != 0);
3099 
3100 			if (submitted < 0) {
3101 				/* Terminating with fatal error */
3102 				const char*	errmsg =
3103 					strerror(-submitted);
3104 				ib::fatal() << "Trying to sumbit " << count
3105 					<< " aio requests, io_submit() set "
3106 					<< "errno to " << -submitted << ": "
3107 					<< (errmsg ? errmsg : "<unknown>");
3108 			}
3109 			ut_ad(submitted <= partial_count);
3110 			if (submitted < partial_count)
3111 			{
3112 				ib::warn() << "Trying to sumbit " << count
3113 					<< " aio requests, io_submit() "
3114 					<< "submitted only " << submitted;
3115 			}
3116 			offset += submitted;
3117 		}
3118 		total_submitted += count;
3119 	}
3120 	/* Reset the aio request buffer. */
3121 	memset(array->m_pending, 0x0,
3122 		sizeof(struct iocb*) * array->m_slots.size());
3123 	memset(array->m_count, 0x0, sizeof(ulint) * array->m_n_segments);
3124 	if (acquire_mutex)
3125 		array->release();
3126 
3127 	srv_stats.n_aio_submitted.add(total_submitted);
3128 #endif
3129 }
3130 
3131 /** Submit buffered AIO requests on the given segment to the kernel. */
3132 void
os_aio_dispatch_read_array_submit()3133 os_aio_dispatch_read_array_submit()
3134 {
3135 	AIO::os_aio_dispatch_read_array_submit_low(true);
3136 }
3137 
3138 #if defined(LINUX_NATIVE_AIO)
3139 /** Dispatch an AIO request to the kernel.
3140 @param[in,out]	slot		an already reserved slot
3141 @param[in]	should_buffer	should buffer the request
3142 rather than submit
3143 @return true on success. */
3144 bool
linux_dispatch(Slot * slot,bool should_buffer)3145 AIO::linux_dispatch(Slot* slot, bool should_buffer)
3146 {
3147 	ut_ad(slot);
3148 	ut_a(slot->is_reserved);
3149 	ut_ad(slot->type.validate());
3150 
3151 	/* Find out what we are going to work with.
3152 	The iocb struct is directly in the slot.
3153 	The io_context is one per segment. */
3154 
3155 	struct iocb*	iocb = &slot->control;
3156 
3157 	ulint	slots_per_segment = m_slots.size() / m_n_segments;
3158 	ulint	io_ctx_index = slot->pos / slots_per_segment;
3159 
3160 	if (should_buffer) {
3161 		ut_ad(this == s_reads || this == s_ibuf);
3162 
3163 		acquire();
3164 		/* There are m_slots.size() elements in m_pending,
3165 		which is divided into m_n_segments area of equal size.
3166 		The iocb of each segment are buffered in its corresponding area
3167 		in the pending array consecutively as they come.
3168 		m_count[i] records the number of buffered aio requests
3169 		in the ith segment.*/
3170 		ut_ad(m_count);
3171 		ulint&	count = m_count[io_ctx_index];
3172 		ut_ad(count != slots_per_segment);
3173 		ulint	n = io_ctx_index * slots_per_segment + count;
3174 		ut_ad(m_pending);
3175 		m_pending[n] = iocb;
3176 		++count;
3177 		if (count == slots_per_segment) {
3178 			AIO::os_aio_dispatch_read_array_submit_low_for_array(false, this);
3179 		}
3180 		release();
3181 		return(true);
3182 	}
3183 	/* Submit the given request. */
3184 	int	ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
3185 
3186 	/* io_submit() returns number of successfully queued requests
3187 	or -errno. */
3188 
3189 	if (ret != 1) {
3190 		errno = -ret;
3191 	}
3192 
3193 	return(ret == 1);
3194 }
3195 
3196 /** Creates an io_context for native linux AIO.
3197 @param[in]	max_events	number of events
3198 @param[out]	io_ctx		io_ctx to initialize.
3199 @return true on success. */
3200 bool
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)3201 AIO::linux_create_io_ctx(
3202 	ulint		max_events,
3203 	io_context_t*	io_ctx)
3204 {
3205 	ssize_t		n_retries = 0;
3206 
3207 	for (;;) {
3208 
3209 		memset(io_ctx, 0x0, sizeof(*io_ctx));
3210 
3211 		/* Initialize the io_ctx. Tell it how many pending
3212 		IO requests this context will handle. */
3213 
3214 		int	ret = io_setup(max_events, io_ctx);
3215 
3216 		if (ret == 0) {
3217 			/* Success. Return now. */
3218 			return(true);
3219 		}
3220 
3221 		/* If we hit EAGAIN we'll make a few attempts before failing. */
3222 
3223 		switch (ret) {
3224 		case -EAGAIN:
3225 			if (n_retries == 0) {
3226 				/* First time around. */
3227 				ib::warn()
3228 					<< "io_setup() failed with EAGAIN."
3229 					" Will make "
3230 					<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
3231 					<< " attempts before giving up.";
3232 			}
3233 
3234 			if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3235 
3236 				++n_retries;
3237 
3238 				ib::warn()
3239 					<< "io_setup() attempt "
3240 					<< n_retries << ".";
3241 
3242 				os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3243 
3244 				continue;
3245 			}
3246 
3247 			/* Have tried enough. Better call it a day. */
3248 			ib::error()
3249 				<< "io_setup() failed with EAGAIN after "
3250 				<< OS_AIO_IO_SETUP_RETRY_ATTEMPTS
3251 				<< " attempts.";
3252 			break;
3253 
3254 		case -ENOSYS:
3255 			ib::error()
3256 				<< "Linux Native AIO interface"
3257 				" is not supported on this platform. Please"
3258 				" check your OS documentation and install"
3259 				" appropriate binary of InnoDB.";
3260 
3261 			break;
3262 
3263 		default:
3264 			ib::error()
3265 				<< "Linux Native AIO setup"
3266 				<< " returned following error["
3267 				<< ret << "]";
3268 			break;
3269 		}
3270 
3271 		ib::info()
3272 			<< "You can disable Linux Native AIO by"
3273 			" setting innodb_use_native_aio = 0 in my.cnf";
3274 
3275 		break;
3276 	}
3277 
3278 	return(false);
3279 }
3280 
3281 /** Checks if the system supports native linux aio. On some kernel
3282 versions where native aio is supported it won't work on tmpfs. In such
3283 cases we can't use native aio as it is not possible to mix simulated
3284 and native aio.
3285 @return: true if supported, false otherwise. */
3286 bool
is_linux_native_aio_supported()3287 AIO::is_linux_native_aio_supported()
3288 {
3289 	int		fd;
3290 	io_context_t	io_ctx;
3291 	char		name[1000];
3292 
3293 	if (!linux_create_io_ctx(1, &io_ctx)) {
3294 
3295 		/* The platform does not support native aio. */
3296 
3297 		return(false);
3298 
3299 	} else if (!srv_read_only_mode) {
3300 
3301 		/* Now check if tmpdir supports native aio ops. */
3302 		fd = innobase_mysql_tmpfile(NULL);
3303 
3304 		if (fd < 0) {
3305 			ib::warn()
3306 				<< "Unable to create temp file to check"
3307 				" native AIO support.";
3308 
3309 			return(false);
3310 		}
3311 	} else {
3312 
3313 		os_normalize_path(srv_log_group_home_dir);
3314 
3315 		ulint	dirnamelen = strlen(srv_log_group_home_dir);
3316 
3317 		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
3318 
3319 		memcpy(name, srv_log_group_home_dir, dirnamelen);
3320 
3321 		/* Add a path separator if needed. */
3322 		if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
3323 
3324 			name[dirnamelen++] = OS_PATH_SEPARATOR;
3325 		}
3326 
3327 		strcpy(name + dirnamelen, "ib_logfile0");
3328 
3329 		fd = ::open(name, O_RDONLY);
3330 
3331 		if (fd == -1) {
3332 
3333 			ib::warn()
3334 				<< "Unable to open"
3335 				<< " \"" << name << "\" to check native"
3336 				<< " AIO read support.";
3337 
3338 			return(false);
3339 		}
3340 	}
3341 
3342 	struct io_event	io_event;
3343 
3344 	memset(&io_event, 0x0, sizeof(io_event));
3345 
3346 	byte*	buf = static_cast<byte*>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
3347 	byte*	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
3348 
3349 	struct iocb	iocb;
3350 
3351 	/* Suppress valgrind warning. */
3352 	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
3353 	memset(&iocb, 0x0, sizeof(iocb));
3354 
3355 	struct iocb*	p_iocb = &iocb;
3356 
3357 	if (!srv_read_only_mode) {
3358 
3359 		io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
3360 
3361 	} else {
3362 		ut_a(UNIV_PAGE_SIZE >= 512);
3363 		io_prep_pread(p_iocb, fd, ptr, 512, 0);
3364 	}
3365 
3366 	int	err = io_submit(io_ctx, 1, &p_iocb);
3367 
3368 	if (err >= 1) {
3369 		/* Now collect the submitted IO request. */
3370 		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
3371 	}
3372 
3373 	ut_free(buf);
3374 	close(fd);
3375 
3376 	switch (err) {
3377 	case 1:
3378 		return(true);
3379 
3380 	case -EINVAL:
3381 	case -ENOSYS:
3382 		ib::error()
3383 			<< "Linux Native AIO not supported. You can either"
3384 			" move "
3385 			<< (srv_read_only_mode ? name : "tmpdir")
3386 			<< " to a file system that supports native"
3387 			" AIO or you can set innodb_use_native_aio to"
3388 			" FALSE to avoid this message.";
3389 
3390 		/* fall through. */
3391 	default:
3392 		ib::error()
3393 			<< "Linux Native AIO check on "
3394 			<< (srv_read_only_mode ? name : "tmpdir")
3395 			<< "returned error[" << -err << "]";
3396 	}
3397 
3398 	return(false);
3399 }
3400 
3401 #endif /* LINUX_NATIVE_AIO */
3402 
3403 /** For an EINVAL I/O error, prints a diagnostic message if innodb_flush_method
3404 == ALL_O_DIRECT.
3405 @param[in]	err	C error code
3406 @return true if the diagnostic message was printed
3407 @return false if the diagnostic message does not apply */
3408 static
3409 bool
os_diagnose_all_o_direct_einval(ulint err)3410 os_diagnose_all_o_direct_einval(
3411 	ulint	err)
3412 {
3413 	if ((err == EINVAL)
3414 	    && (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT)) {
3415 		ib::info() << "The error might be caused by redo log I/O not "
3416 			"satisfying innodb_flush_method=ALL_O_DIRECT "
3417 			"requirements by the underlying file system.";
3418 		if (srv_log_write_ahead_size
3419 		    != DEFAULT_SRV_LOG_WRITE_AHEAD_SIZE)
3420 			ib::info() <<
3421 				"This might be caused by an incompatible "
3422 				"non-default innodb_log_write_ahead_size "
3423 				"value " << srv_log_write_ahead_size;
3424 		ib::info() <<
3425 			"Please file a bug at https://bugs.percona.com and "
3426 			"include this error message, my.cnf settings, ";
3427 		ib::info() <<
3428 			"and information about the file system where the redo "
3429 			"log resides.";
3430 		ib::info() <<
3431 			"A possible workaround is to change "
3432 			"innodb_flush_method value to something else "
3433 			"than ALL_O_DIRECT.";
3434 		return(true);
3435 	}
3436 	return(false);
3437 }
3438 
3439 /** Retrieves the last error number if an error occurs in a file io function.
3440 The number should be retrieved before any other OS calls (because they may
3441 overwrite the error number). If the number is not known to this program,
3442 the OS error number + 100 is returned.
3443 @param[in]	report_all_errors	true if we want an error message
3444 					printed of all errors
3445 @param[in]	on_error_silent		true then don't print any diagnostic
3446 					to the log
3447 @return error number, or OS error number + 100 */
3448 static
3449 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3450 os_file_get_last_error_low(
3451 	bool	report_all_errors,
3452 	bool	on_error_silent)
3453 {
3454 	int	err = errno;
3455 
3456 	if (err == 0) {
3457 		return(0);
3458 	}
3459 
3460 	if (report_all_errors
3461 	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
3462 
3463 		ib::error()
3464 			<< "Operating system error number "
3465 			<< err
3466 			<< " in a file operation.";
3467 
3468 		if (err == ENOENT) {
3469 
3470 			ib::error()
3471 				<< "The error means the system"
3472 				" cannot find the path specified.";
3473 
3474 			if (srv_is_being_started) {
3475 
3476 				ib::error()
3477 					<< "If you are installing InnoDB,"
3478 					" remember that you must create"
3479 					" directories yourself, InnoDB"
3480 					" does not create them.";
3481 			}
3482 		} else if (err == EACCES) {
3483 
3484 			ib::error()
3485 				<< "The error means mysqld does not have"
3486 				" the access rights to the directory.";
3487 
3488 		} else if (!os_diagnose_all_o_direct_einval(err)) {
3489 			if (strerror(err) != NULL) {
3490 
3491 				ib::error()
3492 					<< "Error number " << err << " means '"
3493 					<< strerror(err) << "'";
3494 			}
3495 
3496 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3497 		}
3498 	}
3499 
3500 	switch (err) {
3501 	case ENOSPC:
3502 		return(OS_FILE_DISK_FULL);
3503 	case ENOENT:
3504 		return(OS_FILE_NOT_FOUND);
3505 	case EEXIST:
3506 		return(OS_FILE_ALREADY_EXISTS);
3507 	case EXDEV:
3508 	case ENOTDIR:
3509 	case EISDIR:
3510 		return(OS_FILE_PATH_ERROR);
3511 	case EAGAIN:
3512 		if (srv_use_native_aio) {
3513 			return(OS_FILE_AIO_RESOURCES_RESERVED);
3514 		}
3515 		break;
3516 	case EINTR:
3517 		return(OS_FILE_AIO_INTERRUPTED);
3518 		break;
3519 	case EACCES:
3520 		return(OS_FILE_ACCESS_VIOLATION);
3521 	}
3522 	return(OS_FILE_ERROR_MAX + err);
3523 }
3524 
3525 /** Wrapper to fsync(2) that retries the call on some errors.
3526 Returns the value 0 if successful; otherwise the value -1 is returned and
3527 the global variable errno is set to indicate the error.
3528 @param[in]	file		open file handle
3529 @return 0 if success, -1 otherwise */
3530 static
3531 int
os_file_fsync_posix(os_file_t file)3532 os_file_fsync_posix(
3533 	os_file_t	file)
3534 {
3535 	ulint		failures = 0;
3536 
3537 	for (;;) {
3538 
3539 		++os_n_fsyncs;
3540 
3541 		int	ret = fsync(file);
3542 
3543 		if (ret == 0) {
3544 			return(ret);
3545 		}
3546 
3547 		switch(errno) {
3548 		case ENOLCK:
3549 
3550 			++failures;
3551 			ut_a(failures < 1000);
3552 
3553 			if (!(failures % 100)) {
3554 
3555 				ib::warn()
3556 					<< "fsync(): "
3557 					<< "No locks available; retrying";
3558 			}
3559 
3560 			/* 0.2 sec */
3561 			os_thread_sleep(200000);
3562 			break;
3563 
3564 		case EIO: {
3565 
3566 			const std::string fd_path
3567 				= os_file_find_path_for_fd(file);
3568 			if (!fd_path.empty())
3569 				ib::fatal() << "fsync(\"" << fd_path
3570 					    << "\") returned EIO, aborting.";
3571 			else
3572 				ib::fatal() << "fsync() returned EIO, aborting.";
3573 			break;
3574 		}
3575 
3576 		case EINTR:
3577 
3578 			++failures;
3579 			ut_a(failures < 2000);
3580 			break;
3581 
3582 		default:
3583 			ut_error;
3584 			break;
3585 		}
3586 	}
3587 
3588 	ut_error;
3589 
3590 	return(-1);
3591 }
3592 
3593 /** Check the existence and type of the given file.
3594 @param[in]	path		path name of file
3595 @param[out]	exists		true if the file exists
3596 @param[out]	type		Type of the file, if it exists
3597 @return true if call succeeded */
3598 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)3599 os_file_status_posix(
3600 	const char*	path,
3601 	bool*		exists,
3602 	os_file_type_t* type)
3603 {
3604 	struct stat	statinfo;
3605 
3606 	int	ret = stat(path, &statinfo);
3607 
3608 	*exists = !ret;
3609 
3610 	if (!ret) {
3611 		/* file exists, everything OK */
3612 
3613 	} else if (errno == ENOENT || errno == ENOTDIR
3614 		   || errno == ENAMETOOLONG) {
3615 		/* file does not exist */
3616 		return(true);
3617 
3618 	} else {
3619 		/* file exists, but stat call failed */
3620 		os_file_handle_error_no_exit(path, "stat", false);
3621 		return(false);
3622 	}
3623 
3624 	if (S_ISDIR(statinfo.st_mode)) {
3625 		*type = OS_FILE_TYPE_DIR;
3626 
3627 	} else if (S_ISLNK(statinfo.st_mode)) {
3628 		*type = OS_FILE_TYPE_LINK;
3629 
3630 	} else if (S_ISREG(statinfo.st_mode)) {
3631 		*type = OS_FILE_TYPE_FILE;
3632 
3633 	} else {
3634 		*type = OS_FILE_TYPE_UNKNOWN;
3635 	}
3636 
3637 	return(true);
3638 }
3639 
3640 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3641 function!
3642 Flushes the write buffers of a given file to the disk.
3643 @param[in]	file		handle to a file
3644 @return true if success */
3645 bool
os_file_flush_func(os_file_t file)3646 os_file_flush_func(
3647 	os_file_t	file)
3648 {
3649 	int	ret;
3650 
3651 	ret = os_file_fsync_posix(file);
3652 
3653 	if (ret == 0) {
3654 		return(true);
3655 	}
3656 
3657 	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
3658 	we choose to ignore that error if we are using raw disks */
3659 
3660 	if (srv_start_raw_disk_in_use && errno == EINVAL) {
3661 
3662 		return(true);
3663 	}
3664 
3665 	ib::error() << "The OS said file flush did not succeed";
3666 
3667 	os_file_handle_error(NULL, "flush");
3668 
3669 	/* It is a fatal error if a file flush does not succeed, because then
3670 	the database can get corrupt on disk */
3671 	ut_error;
3672 
3673 	return(false);
3674 }
3675 
3676 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3677 this function!
3678 A simple function to open or create a file.
3679 @param[in]	name		name of the file or path as a null-terminated
3680 				string
3681 @param[in]	create_mode	create mode
3682 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3683 @param[in]	read_only	if true, read only checks are enforced
3684 @param[out]	success		true if succeed, false if error
3685 @return handle to the file, not defined if error, error number
3686 	can be retrieved with os_file_get_last_error */
3687 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3688 os_file_create_simple_func(
3689 	const char*	name,
3690 	ulint		create_mode,
3691 	ulint		access_type,
3692 	bool		read_only,
3693 	bool*		success)
3694 {
3695 	pfs_os_file_t	file;
3696 
3697 	*success = false;
3698 
3699 	int		create_flag;
3700 
3701 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3702 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3703 
3704 	int		create_o_sync;
3705 	if (create_mode & OS_FILE_O_SYNC) {
3706 
3707 		create_o_sync = O_SYNC;
3708 		create_mode &= ~(static_cast<ulint>(OS_FILE_O_SYNC));
3709 	} else {
3710 		create_o_sync = 0;
3711 	}
3712 
3713 	if (create_mode == OS_FILE_OPEN) {
3714 
3715 		if (access_type == OS_FILE_READ_ONLY) {
3716 
3717 			create_flag = O_RDONLY;
3718 
3719 		} else if (read_only) {
3720 
3721 			create_flag = O_RDONLY;
3722 
3723 		} else {
3724 			create_flag = O_RDWR;
3725 		}
3726 
3727 	} else if (read_only) {
3728 
3729 		create_flag = O_RDONLY;
3730 
3731 	} else if (create_mode == OS_FILE_CREATE) {
3732 
3733 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3734 
3735 	} else if (create_mode == OS_FILE_CREATE_PATH) {
3736 
3737 		/* Create subdirs along the path if needed. */
3738 
3739 		*success = os_file_create_subdirs_if_needed(name);
3740 
3741 		if (!*success) {
3742 
3743 			ib::error()
3744 				<< "Unable to create subdirectories '"
3745 				<< name << "'";
3746 
3747 			file.m_file = OS_FILE_CLOSED;
3748 			return(file);
3749 		}
3750 
3751 		create_flag = O_RDWR | O_CREAT | O_EXCL;
3752 		create_mode = OS_FILE_CREATE;
3753 	} else {
3754 
3755 		ib::error()
3756 			<< "Unknown file create mode ("
3757 			<< create_mode
3758 			<< " for file '" << name << "'";
3759 
3760 		file.m_file = OS_FILE_CLOSED;
3761 		return(file);
3762 	}
3763 
3764 	bool	retry;
3765 
3766 	do {
3767 		file.m_file = ::open(name, create_flag | create_o_sync,
3768 			      os_innodb_umask);
3769 
3770 		if (file.m_file == -1) {
3771 			*success = false;
3772 
3773 			retry = os_file_handle_error(
3774 				name,
3775 				create_mode == OS_FILE_OPEN
3776 				? "open" : "create");
3777 		} else {
3778 			*success = true;
3779 			retry = false;
3780 		}
3781 
3782 	} while (retry);
3783 
3784 #ifdef USE_FILE_LOCK
3785 	if (!read_only
3786 	    && *success
3787 	    && access_type == OS_FILE_READ_WRITE
3788 	    && os_file_lock(file.m_file, name)) {
3789 
3790 		*success = false;
3791 		close(file.m_file);
3792 		file.m_file = -1;
3793 	}
3794 #endif /* USE_FILE_LOCK */
3795 
3796 	return(file);
3797 }
3798 
3799 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3800 function!
3801 Truncates a file at the specified position.
3802 @param[in]	file	file to truncate
3803 @param[in]	new_len	new file length
3804 @return true if success */
3805 bool
os_file_set_eof_at_func(os_file_t file,ib_uint64_t new_len)3806 os_file_set_eof_at_func(
3807 	os_file_t	file,
3808 	ib_uint64_t	new_len)
3809 {
3810 #ifdef __WIN__
3811 	LARGE_INTEGER li, li2;
3812 	li.QuadPart = new_len;
3813 	return(SetFilePointerEx(file, li, &li2,FILE_BEGIN)
3814 	       && SetEndOfFile(file));
3815 #else
3816 	/* TODO: works only with -D_FILE_OFFSET_BITS=64 ? */
3817 	return(!ftruncate(file, new_len));
3818 #endif
3819 }
3820 
3821 /** This function attempts to create a directory named pathname. The new
3822 directory gets default permissions. On Unix the permissions are
3823 (0770 & ~umask). If the directory exists already, nothing is done and
3824 the call succeeds, unless the fail_if_exists arguments is true.
3825 If another error occurs, such as a permission error, this does not crash,
3826 but reports the error and returns false.
3827 @param[in]	pathname	directory name as null-terminated string
3828 @param[in]	fail_if_exists	if true, pre-existing directory is treated as
3829 				an error.
3830 @return true if call succeeds, false on error */
3831 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3832 os_file_create_directory(
3833 	const char*	pathname,
3834 	bool		fail_if_exists)
3835 {
3836 	int	rcode = mkdir(pathname, 0770);
3837 
3838 	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3839 		/* failure */
3840 		os_file_handle_error_no_exit(pathname, "mkdir", false);
3841 
3842 		return(false);
3843 	}
3844 
3845 	return(true);
3846 }
3847 
3848 /**
3849 The os_file_opendir() function opens a directory stream corresponding to the
3850 directory named by the dirname argument. The directory stream is positioned
3851 at the first entry. In both Unix and Windows we automatically skip the '.'
3852 and '..' items at the start of the directory listing.
3853 @param[in]	dirname		directory name; it must not contain a trailing
3854 				'\' or '/'
3855 @param[in]	is_fatal	true if we should treat an error as a fatal
3856 				error; if we try to open symlinks then we do
3857 				not wish a fatal error if it happens not to be
3858 				a directory
3859 @return directory stream, NULL if error */
3860 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3861 os_file_opendir(
3862 	const char*	dirname,
3863 	bool		error_is_fatal)
3864 {
3865 	os_file_dir_t		dir;
3866 	dir = opendir(dirname);
3867 
3868 	if (dir == NULL && error_is_fatal) {
3869 		os_file_handle_error(dirname, "opendir");
3870 	}
3871 
3872 	return(dir);
3873 }
3874 
3875 /** Closes a directory stream.
3876 @param[in]	dir		directory stream
3877 @return 0 if success, -1 if failure */
3878 int
os_file_closedir(os_file_dir_t dir)3879 os_file_closedir(
3880 	os_file_dir_t	dir)
3881 {
3882 	int	ret = closedir(dir);
3883 
3884 	if (ret != 0) {
3885 		os_file_handle_error_no_exit(NULL, "closedir", false);
3886 	}
3887 
3888 	return(ret);
3889 }
3890 
3891 /** This function returns information of the next file in the directory. We jump
3892 over the '.' and '..' entries in the directory.
3893 @param[in]	dirname		directory name or path
3894 @param[in]	dir		directory stream
3895 @param[out]	info		buffer where the info is returned
3896 @return 0 if ok, -1 if error, 1 if at the end of the directory */
3897 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)3898 os_file_readdir_next_file(
3899 	const char*	dirname,
3900 	os_file_dir_t	dir,
3901 	os_file_stat_t*	info)
3902 {
3903 	struct dirent*	ent;
3904 	char*		full_path;
3905 	int		ret;
3906 	struct stat	statinfo;
3907 
3908 #ifdef HAVE_READDIR_R
3909 	char		dirent_buf[sizeof(struct dirent)
3910 				   + _POSIX_PATH_MAX + 100];
3911 	/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
3912 	the max file name len; but in most standards, the
3913 	length is NAME_MAX; we add 100 to be even safer */
3914 #endif /* HAVE_READDIR_R */
3915 
3916 next_file:
3917 
3918 #ifdef HAVE_READDIR_R
3919 	ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
3920 
3921 	if (ret != 0) {
3922 
3923 		ib::error()
3924 			<< "Cannot read directory " << dirname
3925 			<< " error: " << ret;
3926 
3927 		return(-1);
3928 	}
3929 
3930 	if (ent == NULL) {
3931 		/* End of directory */
3932 
3933 		return(1);
3934 	}
3935 
3936 	ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
3937 #else
3938 	ent = readdir(dir);
3939 
3940 	if (ent == NULL) {
3941 
3942 		return(1);
3943 	}
3944 #endif /* HAVE_READDIR_R */
3945 	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
3946 
3947 	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
3948 
3949 		goto next_file;
3950 	}
3951 
3952 	strcpy(info->name, ent->d_name);
3953 
3954 	full_path = static_cast<char*>(
3955 		ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
3956 
3957 	sprintf(full_path, "%s/%s", dirname, ent->d_name);
3958 
3959 	ret = stat(full_path, &statinfo);
3960 
3961 	if (ret) {
3962 
3963 		if (errno == ENOENT) {
3964 			/* readdir() returned a file that does not exist,
3965 			it must have been deleted in the meantime. Do what
3966 			would have happened if the file was deleted before
3967 			readdir() - ignore and go to the next entry.
3968 			If this is the last entry then info->name will still
3969 			contain the name of the deleted file when this
3970 			function returns, but this is not an issue since the
3971 			caller shouldn't be looking at info when end of
3972 			directory is returned. */
3973 
3974 			ut_free(full_path);
3975 
3976 			goto next_file;
3977 		}
3978 
3979 		os_file_handle_error_no_exit(full_path, "stat", false);
3980 
3981 		ut_free(full_path);
3982 
3983 		return(-1);
3984 	}
3985 
3986 	info->size = statinfo.st_size;
3987 
3988 	if (S_ISDIR(statinfo.st_mode)) {
3989 		info->type = OS_FILE_TYPE_DIR;
3990 	} else if (S_ISLNK(statinfo.st_mode)) {
3991 		info->type = OS_FILE_TYPE_LINK;
3992 	} else if (S_ISREG(statinfo.st_mode)) {
3993 		info->type = OS_FILE_TYPE_FILE;
3994 	} else {
3995 		info->type = OS_FILE_TYPE_UNKNOWN;
3996 	}
3997 
3998 	ut_free(full_path);
3999 
4000 	return(0);
4001 }
4002 
4003 /** NOTE! Use the corresponding macro os_file_create(), not directly
4004 this function!
4005 Opens an existing file or creates a new.
4006 @param[in]	name		name of the file or path as a null-terminated
4007 				string
4008 @param[in]	create_mode	create mode
4009 @param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
4010 				is desired, OS_FILE_NORMAL, if any normal file;
4011 				NOTE that it also depends on type, os_aio_..
4012 				and srv_.. variables whether we really use async
4013 				I/O or unbuffered I/O: look in the function
4014 				source code for the exact rules
4015 @param[in]	type		OS_DATA_FILE or OS_LOG_FILE
4016 @param[in]	read_only	true, if read only checks should be enforcedm
4017 @param[in]	success		true if succeeded
4018 @return handle to the file, not defined if error, error number
4019 	can be retrieved with os_file_get_last_error */
4020 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)4021 os_file_create_func(
4022 	const char*	name,
4023 	ulint		create_mode,
4024 	ulint		purpose,
4025 	ulint		type,
4026 	bool		read_only,
4027 	bool*		success)
4028 {
4029 	bool		on_error_no_exit;
4030 	bool		on_error_silent;
4031 	pfs_os_file_t	file;
4032 
4033 	*success = false;
4034 
4035 	DBUG_EXECUTE_IF(
4036 		"ib_create_table_fail_disk_full",
4037 		*success = false;
4038 		errno = ENOSPC;
4039 		file.m_file = OS_FILE_CLOSED;
4040 		return(file);
4041 	);
4042 
4043 	int		create_flag;
4044 	const char*	mode_str	= NULL;
4045 
4046 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
4047 		? true : false;
4048 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
4049 		? true : false;
4050 
4051 	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
4052 	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
4053 
4054 	if (create_mode == OS_FILE_OPEN
4055 	    || create_mode == OS_FILE_OPEN_RAW
4056 	    || create_mode == OS_FILE_OPEN_RETRY) {
4057 
4058 		mode_str = "OPEN";
4059 
4060 		create_flag = read_only ? O_RDONLY : O_RDWR;
4061 
4062 	} else if (read_only) {
4063 
4064 		mode_str = "OPEN";
4065 
4066 		create_flag = O_RDONLY;
4067 
4068 	} else if (create_mode == OS_FILE_CREATE) {
4069 
4070 		mode_str = "CREATE";
4071 		create_flag = O_RDWR | O_CREAT | O_EXCL;
4072 
4073 	} else if (create_mode == OS_FILE_OVERWRITE) {
4074 
4075 		mode_str = "OVERWRITE";
4076 		create_flag = O_RDWR | O_CREAT | O_TRUNC;
4077 
4078 	} else {
4079 		ib::error()
4080 			<< "Unknown file create mode (" << create_mode << ")"
4081 			<< " for file '" << name << "'";
4082 
4083 		file.m_file = OS_FILE_CLOSED;
4084 		return(file);
4085 	}
4086 
4087 	ut_a(type == OS_LOG_FILE
4088 	     || type == OS_DATA_FILE
4089 	     || type == OS_DATA_TEMP_FILE);
4090 
4091 	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
4092 
4093 #ifdef O_SYNC
4094 	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
4095 	O_SYNC because the datasync options seemed to corrupt files in 2001
4096 	in both Linux and Solaris */
4097 
4098 	if (!read_only
4099 	    && type == OS_LOG_FILE
4100 	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
4101 
4102 		create_flag |= O_SYNC;
4103 	}
4104 #endif /* O_SYNC */
4105 
4106 	bool		retry;
4107 
4108 	do {
4109 		file.m_file = ::open(name, create_flag, os_innodb_umask);
4110 
4111 		if (file.m_file == -1) {
4112 			const char*	operation;
4113 
4114 			operation = (create_mode == OS_FILE_CREATE
4115 				     && !read_only) ? "create" : "open";
4116 
4117 			*success = false;
4118 
4119 			if (on_error_no_exit) {
4120 				retry = os_file_handle_error_no_exit(
4121 					name, operation, on_error_silent);
4122 			} else {
4123 				retry = os_file_handle_error(name, operation);
4124 			}
4125 		} else {
4126 			*success = true;
4127 			retry = false;
4128 		}
4129 
4130 	} while (retry);
4131 
4132 	/* We disable OS caching (O_DIRECT) only on data files */
4133 
4134 	if (!read_only
4135 	    && *success
4136 	    && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE)
4137 	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
4138 		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
4139 
4140 		os_file_set_nocache(file.m_file, name, mode_str);
4141 	} else if (!srv_read_only_mode
4142 		   && *success
4143 		   && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
4144 		/* Do fsync() on log and files when setting O_DIRECT fails.
4145 		See log_io_complete() */
4146 		if (!os_file_set_nocache(file.m_file, name, mode_str)) {
4147 			srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
4148 		}
4149 	}
4150 
4151 #ifdef USE_FILE_LOCK
4152 	if (!read_only
4153 	    && *success
4154 	    && create_mode != OS_FILE_OPEN_RAW
4155 	    && os_file_lock(file.m_file, name)) {
4156 
4157 		if (create_mode == OS_FILE_OPEN_RETRY) {
4158 
4159 			ib::info()
4160 				<< "Retrying to lock the first data file";
4161 
4162 			for (int i = 0; i < 100; i++) {
4163 				os_thread_sleep(1000000);
4164 
4165 				if (!os_file_lock(file.m_file, name)) {
4166 					*success = true;
4167 					return(file);
4168 				}
4169 			}
4170 
4171 			ib::info()
4172 				<< "Unable to open the first data file";
4173 		}
4174 
4175 		*success = false;
4176 		close(file.m_file);
4177 		file.m_file = -1;
4178 	}
4179 #endif /* USE_FILE_LOCK */
4180 
4181 	return(file);
4182 }
4183 
4184 /** NOTE! Use the corresponding macro
4185 os_file_create_simple_no_error_handling(), not directly this function!
4186 A simple function to open or create a file.
4187 @param[in]	name		name of the file or path as a null-terminated
4188 				string
4189 @param[in]	create_mode	create mode
4190 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4191 				OS_FILE_READ_ALLOW_DELETE; the last option
4192 				is used by a backup program reading the file
4193 @param[in]	read_only	if true read only mode checks are enforced
4194 @param[out]	success		true if succeeded
4195 @return own: handle to the file, not defined if error, error number
4196 	can be retrieved with os_file_get_last_error */
4197 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4198 os_file_create_simple_no_error_handling_func(
4199 	const char*	name,
4200 	ulint		create_mode,
4201 	ulint		access_type,
4202 	bool		read_only,
4203 	bool*		success)
4204 {
4205 	pfs_os_file_t	file;
4206 	int		create_flag;
4207 
4208 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4209 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4210 
4211 	*success = false;
4212 
4213 	if (create_mode == OS_FILE_OPEN) {
4214 
4215 		if (access_type == OS_FILE_READ_ONLY) {
4216 
4217 			create_flag = O_RDONLY;
4218 
4219 		} else if (read_only) {
4220 
4221 			create_flag = O_RDONLY;
4222 
4223 		} else {
4224 
4225 			ut_a(access_type == OS_FILE_READ_WRITE
4226 			     || access_type == OS_FILE_READ_ALLOW_DELETE);
4227 
4228 			create_flag = O_RDWR;
4229 		}
4230 
4231 	} else if (read_only) {
4232 
4233 		create_flag = O_RDONLY;
4234 
4235 	} else if (create_mode == OS_FILE_CREATE) {
4236 
4237 		create_flag = O_RDWR | O_CREAT | O_EXCL;
4238 
4239 	} else {
4240 
4241 		ib::error()
4242 			<< "Unknown file create mode "
4243 			<< create_mode << " for file '" << name << "'";
4244 		file.m_file = OS_FILE_CLOSED;
4245 		return(file);
4246 	}
4247 
4248 	file.m_file = ::open(name, create_flag, os_innodb_umask);
4249 
4250 	*success = (file.m_file != -1);
4251 
4252 #ifdef USE_FILE_LOCK
4253 	if (!read_only
4254 	    && *success
4255 	    && access_type == OS_FILE_READ_WRITE
4256 	    && os_file_lock(file.m_file, name)) {
4257 
4258 		*success = false;
4259 		close(file.m_file);
4260 		file.m_file = -1;
4261 
4262 	}
4263 #endif /* USE_FILE_LOCK */
4264 
4265 	return(file);
4266 }
4267 
4268 /** Deletes a file if it exists. The file has to be closed before calling this.
4269 @param[in]	name		file path as a null-terminated string
4270 @param[out]	exist		indicate if file pre-exist
4271 @return true if success */
4272 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4273 os_file_delete_if_exists_func(
4274 	const char*	name,
4275 	bool*		exist)
4276 {
4277 	if (exist != NULL) {
4278 		*exist = true;
4279 	}
4280 
4281 	int	ret = unlink(name);
4282 
4283 	if (ret != 0 && errno == ENOENT) {
4284 		if (exist != NULL) {
4285 			*exist = false;
4286 		}
4287 	} else if (ret != 0 && errno != ENOENT) {
4288 		os_file_handle_error_no_exit(name, "delete", false);
4289 
4290 		return(false);
4291 	}
4292 
4293 	return(true);
4294 }
4295 
4296 /** Deletes a file. The file has to be closed before calling this.
4297 @param[in]	name		file path as a null-terminated string
4298 @return true if success */
4299 bool
os_file_delete_func(const char * name)4300 os_file_delete_func(
4301 	const char*	name)
4302 {
4303 	int	ret = unlink(name);
4304 
4305 	if (ret != 0) {
4306 		os_file_handle_error_no_exit(name, "delete", false);
4307 
4308 		return(false);
4309 	}
4310 
4311 	return(true);
4312 }
4313 
4314 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
4315 function!
4316 Renames a file (can also move it to another directory). It is safest that the
4317 file is closed before calling this function.
4318 @param[in]	oldpath		old file path as a null-terminated string
4319 @param[in]	newpath		new file path
4320 @return true if success */
4321 bool
os_file_rename_func(const char * oldpath,const char * newpath)4322 os_file_rename_func(
4323 	const char*	oldpath,
4324 	const char*	newpath)
4325 {
4326 #ifdef UNIV_DEBUG
4327 	os_file_type_t	type;
4328 	bool		exists;
4329 
4330 	/* New path must not exist. */
4331 	ut_ad(os_file_status(newpath, &exists, &type));
4332 	ut_ad(!exists);
4333 
4334 	/* Old path must exist. */
4335 	ut_ad(os_file_status(oldpath, &exists, &type));
4336 	ut_ad(exists);
4337 #endif /* UNIV_DEBUG */
4338 
4339 	int	ret = rename(oldpath, newpath);
4340 
4341 	if (ret != 0) {
4342 		os_file_handle_error_no_exit(oldpath, "rename", false);
4343 
4344 		return(false);
4345 	}
4346 
4347 	return(true);
4348 }
4349 
4350 /** NOTE! Use the corresponding macro os_file_close(), not directly this
4351 function!
4352 Closes a file handle. In case of error, error number can be retrieved with
4353 os_file_get_last_error.
4354 @param[in]	file		Handle to close
4355 @return true if success */
4356 bool
os_file_close_func(os_file_t file)4357 os_file_close_func(
4358 	os_file_t	file)
4359 {
4360 	int	ret = close(file);
4361 
4362 	if (ret == -1) {
4363 		os_file_handle_error(NULL, "close");
4364 
4365 		return(false);
4366 	}
4367 
4368 	return(true);
4369 }
4370 
4371 /** Announces an intention to access file data in a specific pattern in the
4372 future.
4373 @param[in, own]	file	handle to a file
4374 @param[in]	offset	file region offset
4375 @param[in]	len	file region length
4376 @param[in]	advice	advice for access pattern
4377 @return true if success */
4378 bool
os_file_advise(pfs_os_file_t file,os_offset_t offset,os_offset_t len,ulint advice)4379 os_file_advise(pfs_os_file_t file, os_offset_t offset, os_offset_t len,
4380 	       ulint advice)
4381 {
4382 #ifdef __WIN__
4383 	return(true);
4384 #else
4385 #ifdef UNIV_LINUX
4386 	int     native_advice = 0;
4387 	if ((advice & OS_FILE_ADVISE_NORMAL) != 0)
4388 		native_advice |= POSIX_FADV_NORMAL;
4389 	if ((advice & OS_FILE_ADVISE_RANDOM) != 0)
4390 		native_advice |= POSIX_FADV_RANDOM;
4391 	if ((advice & OS_FILE_ADVISE_SEQUENTIAL) != 0)
4392 		native_advice |= POSIX_FADV_SEQUENTIAL;
4393 	if ((advice & OS_FILE_ADVISE_WILLNEED) != 0)
4394 		native_advice |= POSIX_FADV_WILLNEED;
4395 	if ((advice & OS_FILE_ADVISE_DONTNEED) != 0)
4396 		native_advice |= POSIX_FADV_DONTNEED;
4397 	if ((advice & OS_FILE_ADVISE_NOREUSE) != 0)
4398 		native_advice |= POSIX_FADV_NOREUSE;
4399 
4400 	return(posix_fadvise(file.m_file, offset, len, native_advice) == 0);
4401 #else
4402 	return(true);
4403 #endif
4404 #endif /* __WIN__ */
4405 }
4406 
4407 
4408 /** Gets a file size.
4409 @param[in]	file		handle to an open file
4410 @return file size, or (os_offset_t) -1 on failure */
4411 os_offset_t
os_file_get_size(pfs_os_file_t file)4412 os_file_get_size(
4413 	pfs_os_file_t	file)
4414 {
4415 	/* Store current position */
4416 	os_offset_t	pos = lseek(file.m_file, 0, SEEK_CUR);
4417 	os_offset_t	file_size = lseek(file.m_file, 0, SEEK_END);
4418 
4419 	/* Restore current position as the function should not change it */
4420 	lseek(file.m_file, pos, SEEK_SET);
4421 
4422 	return(file_size);
4423 }
4424 
4425 /** Gets a file size.
4426 @param[in]	filename	Full path to the filename to check
4427 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
4428 	errno */
4429 os_file_size_t
os_file_get_size(const char * filename)4430 os_file_get_size(
4431 	const char*	filename)
4432 {
4433 	struct stat	s;
4434 	os_file_size_t	file_size;
4435 
4436 	int	ret = stat(filename, &s);
4437 
4438 	if (ret == 0) {
4439 		file_size.m_total_size = s.st_size;
4440 		/* st_blocks is in 512 byte sized blocks */
4441 		file_size.m_alloc_size = s.st_blocks * 512;
4442 	} else {
4443 		file_size.m_total_size = ~0;
4444 		file_size.m_alloc_size = (os_offset_t) errno;
4445 	}
4446 
4447 	return(file_size);
4448 }
4449 
4450 /** This function returns information about the specified file
4451 @param[in]	path		pathname of the file
4452 @param[out]	stat_info	information of a file in a directory
4453 @param[in,out]	statinfo	information of a file in a directory
4454 @param[in]	check_rw_perm	for testing whether the file can be opened
4455 				in RW mode
4456 @param[in]	read_only	if true read only mode checks are enforced
4457 @return DB_SUCCESS if all OK */
4458 static
4459 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)4460 os_file_get_status_posix(
4461 	const char*	path,
4462 	os_file_stat_t* stat_info,
4463 	struct stat*	statinfo,
4464 	bool		check_rw_perm,
4465 	bool		read_only)
4466 {
4467 	int	ret = stat(path, statinfo);
4468 
4469 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
4470 		/* file does not exist */
4471 
4472 		return(DB_NOT_FOUND);
4473 
4474 	} else if (ret) {
4475 		/* file exists, but stat call failed */
4476 
4477 		os_file_handle_error_no_exit(path, "stat", false);
4478 
4479 		return(DB_FAIL);
4480 	}
4481 
4482 	switch (statinfo->st_mode & S_IFMT) {
4483 	case S_IFDIR:
4484 		stat_info->type = OS_FILE_TYPE_DIR;
4485 		break;
4486 	case S_IFLNK:
4487 		stat_info->type = OS_FILE_TYPE_LINK;
4488 		break;
4489 	case S_IFBLK:
4490 		/* Handle block device as regular file. */
4491 	case S_IFCHR:
4492 		/* Handle character device as regular file. */
4493 	case S_IFREG:
4494 		stat_info->type = OS_FILE_TYPE_FILE;
4495 		break;
4496 	default:
4497 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
4498 	}
4499 
4500 	stat_info->size = statinfo->st_size;
4501 	stat_info->block_size = statinfo->st_blksize;
4502 	stat_info->alloc_size = statinfo->st_blocks * 512;
4503 
4504 	if (check_rw_perm
4505 	    && (stat_info->type == OS_FILE_TYPE_FILE
4506 		|| stat_info->type == OS_FILE_TYPE_BLOCK)) {
4507 
4508 		int	access = !read_only ? O_RDWR : O_RDONLY;
4509 		int	fh = ::open(path, access, os_innodb_umask);
4510 
4511 		if (fh == -1) {
4512 			stat_info->rw_perm = false;
4513 		} else {
4514 			stat_info->rw_perm = true;
4515 			close(fh);
4516 		}
4517 	}
4518 
4519 	return(DB_SUCCESS);
4520 }
4521 
4522 /** Truncates a file to a specified size in bytes.
4523 Do nothing if the size to preserve is greater or equal to the current
4524 size of the file.
4525 @param[in]	pathname	file path
4526 @param[in]	file		file to be truncated
4527 @param[in]	size		size to preserve in bytes
4528 @return true if success */
4529 static
4530 bool
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)4531 os_file_truncate_posix(
4532 	const char*	pathname,
4533 	pfs_os_file_t	file,
4534 	os_offset_t	size)
4535 {
4536 	int     res = ftruncate(file.m_file, size);
4537 	if (res == -1) {
4538 
4539 		bool	retry;
4540 
4541 		retry = os_file_handle_error_no_exit(
4542 			pathname, "truncate", false);
4543 
4544 		if (retry) {
4545 			ib::warn()
4546 				<< "Truncate failed for '"
4547 				<< pathname << "'";
4548 		}
4549 	}
4550 
4551 	return(res == 0);
4552 }
4553 
/** Truncates a file at its current position, as reported by ftell().
@param[in]	file		stream of the file to be truncated
@return true if success */
bool
os_file_set_eof(
	FILE*		file)
{
	return(ftruncate(fileno(file), ftell(file)) == 0);
}
4562 
4563 /** Closes a file handle.
4564 @param[in]	file		Handle to a file
4565 @return true if success */
4566 bool
os_file_close_no_error_handling_func(os_file_t file)4567 os_file_close_no_error_handling_func(
4568 	os_file_t	file)
4569 {
4570 	return(close(file) != -1);
4571 }
4572 
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
This is the non-Windows variant, which does nothing. */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	/* Intentionally empty outside Windows */
	return;
}
4582 
4583 #else /* !_WIN32 */
4584 
4585 #include <WinIoCtl.h>
4586 
4587 /** Do the read/write
4588 @param[in]	request	The IO context and type
4589 @return the number of bytes read/written or negative value on error */
4590 ssize_t
execute(const IORequest & request)4591 SyncFileIO::execute(const IORequest& request)
4592 {
4593 	OVERLAPPED	seek;
4594 
4595 	memset(&seek, 0x0, sizeof(seek));
4596 
4597 	seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
4598 	seek.OffsetHigh = (DWORD) (m_offset >> 32);
4599 
4600 	BOOL	ret;
4601 	DWORD	n_bytes;
4602 
4603 	if (request.is_read()) {
4604 		ret = ReadFile(m_fh, m_buf,
4605 			static_cast<DWORD>(m_n), &n_bytes, &seek);
4606 
4607 	} else {
4608 		ut_ad(request.is_write());
4609 		ret = WriteFile(m_fh, m_buf,
4610 			static_cast<DWORD>(m_n), &n_bytes, &seek);
4611 	}
4612 
4613 	return(ret ? static_cast<ssize_t>(n_bytes) : -1);
4614 }
4615 
4616 /** Do the read/write
4617 @param[in,out]	slot	The IO slot, it has the IO context
4618 @return the number of bytes read/written or negative value on error */
4619 ssize_t
execute(Slot * slot)4620 SyncFileIO::execute(Slot* slot)
4621 {
4622 	BOOL	ret;
4623 
4624 	if (slot->type.is_read()) {
4625 		ret = ReadFile(
4626 			slot->file.m_file, slot->ptr, slot->len,
4627 			&slot->n_bytes, &slot->control);
4628 	} else {
4629 		ut_ad(slot->type.is_write());
4630 		ret = WriteFile(
4631 			slot->file.m_file, slot->ptr, slot->len,
4632 			&slot->n_bytes, &slot->control);
4633 	}
4634 
4635 	return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
4636 }
4637 
4638 /** Check if the file system supports sparse files.
4639 @param[in]	 name		File name
4640 @return true if the file system supports sparse files */
4641 static
4642 bool
os_is_sparse_file_supported_win32(const char * filename)4643 os_is_sparse_file_supported_win32(const char* filename)
4644 {
4645 	char	volname[MAX_PATH];
4646 	BOOL	result = GetVolumePathName(filename, volname, MAX_PATH);
4647 
4648 	if (!result) {
4649 
4650 		ib::error()
4651 			<< "os_is_sparse_file_supported: "
4652 			<< "Failed to get the volume path name for: "
4653 			<< filename
4654 			<< "- OS error number " << GetLastError();
4655 
4656 		return(false);
4657 	}
4658 
4659 	DWORD	flags;
4660 
4661 	GetVolumeInformation(
4662 		volname, NULL, MAX_PATH, NULL, NULL,
4663 		&flags, NULL, MAX_PATH);
4664 
4665 	return(flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
4666 }
4667 
4668 /** Free storage space associated with a section of the file.
4669 @param[in]	fh		Open file handle
4670 @param[in]	page_size	Tablespace page size
4671 @param[in]	block_size	File system block size
4672 @param[in]	off		Starting offset (SEEK_SET)
4673 @param[in]	len		Size of the hole
4674 @return 0 on success or errno */
4675 static
4676 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)4677 os_file_punch_hole_win32(
4678 	os_file_t	fh,
4679 	os_offset_t	off,
4680 	os_offset_t	len)
4681 {
4682 	FILE_ZERO_DATA_INFORMATION	punch;
4683 
4684 	punch.FileOffset.QuadPart = off;
4685 	punch.BeyondFinalZero.QuadPart = off + len;
4686 
4687 	/* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
4688 	therefore we pass a dummy parameter. */
4689 	DWORD	temp;
4690 
4691 	BOOL	result = DeviceIoControl(
4692 		fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
4693 		NULL, 0, &temp, NULL);
4694 
4695 	return(!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
4696 }
4697 
4698 /** Check the existence and type of the given file.
4699 @param[in]	path		path name of file
4700 @param[out]	exists		true if the file exists
4701 @param[out]	type		Type of the file, if it exists
4702 @return true if call succeeded */
4703 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)4704 os_file_status_win32(
4705 	const char*	path,
4706 	bool*		exists,
4707 	os_file_type_t* type)
4708 {
4709 	int		ret;
4710 	struct _stat64	statinfo;
4711 
4712 	ret = _stat64(path, &statinfo);
4713 
4714 	*exists = !ret;
4715 
4716 	if (!ret) {
4717 		/* file exists, everything OK */
4718 
4719 	} else if (errno == ENOENT || errno == ENOTDIR
4720 		  || errno == ENAMETOOLONG) {
4721 		/* file does not exist */
4722 		return(true);
4723 
4724 	} else {
4725 		/* file exists, but stat call failed */
4726 		os_file_handle_error_no_exit(path, "stat", false);
4727 		return(false);
4728 	}
4729 
4730 	if (_S_IFDIR & statinfo.st_mode) {
4731 		*type = OS_FILE_TYPE_DIR;
4732 
4733 	} else if (_S_IFREG & statinfo.st_mode) {
4734 		*type = OS_FILE_TYPE_FILE;
4735 
4736 	} else {
4737 		*type = OS_FILE_TYPE_UNKNOWN;
4738 	}
4739 
4740 	return(true);
4741 }
4742 
4743 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
4744 function!
4745 Flushes the write buffers of a given file to the disk.
4746 @param[in]	file		handle to a file
4747 @return true if success */
4748 bool
os_file_flush_func(os_file_t file)4749 os_file_flush_func(
4750 	os_file_t	file)
4751 {
4752 	++os_n_fsyncs;
4753 
4754 	BOOL	ret = FlushFileBuffers(file);
4755 
4756 	if (ret) {
4757 		return(true);
4758 	}
4759 
4760 	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
4761 	actually a raw device, we choose to ignore that error if we are using
4762 	raw disks */
4763 
4764 	if (srv_start_raw_disk_in_use && GetLastError()
4765 	    == ERROR_INVALID_FUNCTION) {
4766 		return(true);
4767 	}
4768 
4769 	os_file_handle_error(NULL, "flush");
4770 
4771 	/* It is a fatal error if a file flush does not succeed, because then
4772 	the database can get corrupt on disk */
4773 	ut_error;
4774 
4775 	return(false);
4776 }
4777 
4778 /** Retrieves the last error number if an error occurs in a file io function.
4779 The number should be retrieved before any other OS calls (because they may
4780 overwrite the error number). If the number is not known to this program,
4781 the OS error number + 100 is returned.
4782 @param[in]	report_all_errors	true if we want an error message printed
4783 					of all errors
4784 @param[in]	on_error_silent		true then don't print any diagnostic
4785 					to the log
4786 @return error number, or OS error number + 100 */
4787 static
4788 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)4789 os_file_get_last_error_low(
4790 	bool	report_all_errors,
4791 	bool	on_error_silent)
4792 {
4793 	ulint	err = (ulint) GetLastError();
4794 
4795 	if (err == ERROR_SUCCESS) {
4796 		return(0);
4797 	}
4798 
4799 	if (report_all_errors
4800 	    || (!on_error_silent
4801 		&& err != ERROR_DISK_FULL
4802 		&& err != ERROR_FILE_EXISTS)) {
4803 
4804 		ib::error()
4805 			<< "Operating system error number " << err
4806 			<< " in a file operation.";
4807 
4808 		if (err == ERROR_PATH_NOT_FOUND) {
4809 			ib::error()
4810 				<< "The error means the system"
4811 				" cannot find the path specified.";
4812 
4813 			if (srv_is_being_started) {
4814 				ib::error()
4815 					<< "If you are installing InnoDB,"
4816 					" remember that you must create"
4817 					" directories yourself, InnoDB"
4818 					" does not create them.";
4819 			}
4820 
4821 		} else if (err == ERROR_ACCESS_DENIED) {
4822 
4823 			ib::error()
4824 				<< "The error means mysqld does not have"
4825 				" the access rights to"
4826 				" the directory. It may also be"
4827 				" you have created a subdirectory"
4828 				" of the same name as a data file.";
4829 
4830 		} else if (err == ERROR_SHARING_VIOLATION
4831 			   || err == ERROR_LOCK_VIOLATION) {
4832 
4833 			ib::error()
4834 				<< "The error means that another program"
4835 				" is using InnoDB's files."
4836 				" This might be a backup or antivirus"
4837 				" software or another instance"
4838 				" of MySQL."
4839 				" Please close it to get rid of this error.";
4840 
4841 		} else if (err == ERROR_WORKING_SET_QUOTA
4842 			   || err == ERROR_NO_SYSTEM_RESOURCES) {
4843 
4844 			ib::error()
4845 				<< "The error means that there are no"
4846 				" sufficient system resources or quota to"
4847 				" complete the operation.";
4848 
4849 		} else if (err == ERROR_OPERATION_ABORTED) {
4850 
4851 			ib::error()
4852 				<< "The error means that the I/O"
4853 				" operation has been aborted"
4854 				" because of either a thread exit"
4855 				" or an application request."
4856 				" Retry attempt is made.";
4857 		} else {
4858 
4859 			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4860 		}
4861 	}
4862 
4863 	if (err == ERROR_FILE_NOT_FOUND) {
4864 		return(OS_FILE_NOT_FOUND);
4865 	} else if (err == ERROR_DISK_FULL) {
4866 		return(OS_FILE_DISK_FULL);
4867 	} else if (err == ERROR_FILE_EXISTS) {
4868 		return(OS_FILE_ALREADY_EXISTS);
4869 	} else if (err == ERROR_SHARING_VIOLATION
4870 		   || err == ERROR_LOCK_VIOLATION) {
4871 		return(OS_FILE_SHARING_VIOLATION);
4872 	} else if (err == ERROR_WORKING_SET_QUOTA
4873 		   || err == ERROR_NO_SYSTEM_RESOURCES) {
4874 		return(OS_FILE_INSUFFICIENT_RESOURCE);
4875 	} else if (err == ERROR_OPERATION_ABORTED) {
4876 		return(OS_FILE_OPERATION_ABORTED);
4877 	} else if (err == ERROR_ACCESS_DENIED) {
4878 		return(OS_FILE_ACCESS_VIOLATION);
4879 	}
4880 
4881 	return(OS_FILE_ERROR_MAX + err);
4882 }
4883 
4884 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
4885 this function!
4886 A simple function to open or create a file.
4887 @param[in]	name		name of the file or path as a null-terminated
4888 				string
4889 @param[in]	create_mode	create mode
4890 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
4891 @param[in]	read_only	if true read only mode checks are enforced
4892 @param[out]	success		true if succeed, false if error
4893 @return handle to the file, not defined if error, error number
4894 	can be retrieved with os_file_get_last_error */
4895 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4896 os_file_create_simple_func(
4897 	const char*	name,
4898 	ulint		create_mode,
4899 	ulint		access_type,
4900 	bool		read_only,
4901 	bool*		success)
4902 {
4903 	pfs_os_file_t	file;
4904 
4905 	*success = false;
4906 
4907 	DWORD		access;
4908 	DWORD		create_flag;
4909 	DWORD		attributes = 0;
4910 
4911 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4912 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4913 
4914 	if (create_mode == OS_FILE_OPEN) {
4915 
4916 		create_flag = OPEN_EXISTING;
4917 
4918 	} else if (read_only) {
4919 
4920 		create_flag = OPEN_EXISTING;
4921 
4922 	} else if (create_mode == OS_FILE_CREATE) {
4923 
4924 		create_flag = CREATE_NEW;
4925 
4926 	} else if (create_mode == OS_FILE_CREATE_PATH) {
4927 
4928 		/* Create subdirs along the path if needed. */
4929 		*success = os_file_create_subdirs_if_needed(name);
4930 
4931 		if (!*success) {
4932 
4933 			ib::error()
4934 				<< "Unable to create subdirectories '"
4935 				<< name << "'";
4936 			file.m_file = OS_FILE_CLOSED;
4937 			return(file);
4938 		}
4939 
4940 		create_flag = CREATE_NEW;
4941 		create_mode = OS_FILE_CREATE;
4942 
4943 	} else {
4944 
4945 		ib::error()
4946 			<< "Unknown file create mode ("
4947 			<< create_mode << ") for file '"
4948 			<< name << "'";
4949 
4950 		file.m_file = OS_FILE_CLOSED;
4951 		return(file);
4952 	}
4953 
4954 	if (access_type == OS_FILE_READ_ONLY) {
4955 
4956 		access = GENERIC_READ;
4957 
4958 	} else if (read_only) {
4959 
4960 		ib::info()
4961 			<< "Read only mode set. Unable to"
4962 			" open file '" << name << "' in RW mode, "
4963 			<< "trying RO mode", name;
4964 
4965 		access = GENERIC_READ;
4966 
4967 	} else if (access_type == OS_FILE_READ_WRITE) {
4968 
4969 		access = GENERIC_READ | GENERIC_WRITE;
4970 
4971 	} else {
4972 
4973 		ib::error()
4974 			<< "Unknown file access type (" << access_type << ") "
4975 			"for file '" << name << "'";
4976 
4977 		file.m_file = OS_FILE_CLOSED;
4978 		return(file);
4979 	}
4980 
4981 	bool	retry;
4982 
4983 	do {
4984 		/* Use default security attributes and no template file. */
4985 
4986 		file.m_file = CreateFile(
4987 			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
4988 			create_flag, attributes, NULL);
4989 
4990 		if (file.m_file == INVALID_HANDLE_VALUE) {
4991 
4992 			*success = false;
4993 
4994 			retry = os_file_handle_error(
4995 				name, create_mode == OS_FILE_OPEN ?
4996 				"open" : "create");
4997 
4998 		} else {
4999 
5000 			retry = false;
5001 
5002 			*success = true;
5003 
5004 			DWORD	temp;
5005 
5006 			/* This is a best effort use case, if it fails then
5007 			we will find out when we try and punch the hole. */
5008 
5009 			DeviceIoControl(
5010 				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
5011 				&temp, NULL);
5012 		}
5013 
5014 	} while (retry);
5015 
5016 	return(file);
5017 }
5018 
5019 /** This function attempts to create a directory named pathname. The new
5020 directory gets default permissions. On Unix the permissions are
5021 (0770 & ~umask). If the directory exists already, nothing is done and
5022 the call succeeds, unless the fail_if_exists arguments is true.
5023 If another error occurs, such as a permission error, this does not crash,
5024 but reports the error and returns false.
5025 @param[in]	pathname	directory name as null-terminated string
5026 @param[in]	fail_if_exists	if true, pre-existing directory is treated
5027 				as an error.
5028 @return true if call succeeds, false on error */
5029 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)5030 os_file_create_directory(
5031 	const char*	pathname,
5032 	bool		fail_if_exists)
5033 {
5034 	BOOL	rcode;
5035 
5036 	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
5037 	if (!(rcode != 0
5038 	      || (GetLastError() == ERROR_ALREADY_EXISTS
5039 		  && !fail_if_exists))) {
5040 
5041 		os_file_handle_error_no_exit(
5042 			pathname, "CreateDirectory", false);
5043 
5044 		return(false);
5045 	}
5046 
5047 	return(true);
5048 }
5049 
5050 /** The os_file_opendir() function opens a directory stream corresponding to the
5051 directory named by the dirname argument. The directory stream is positioned
5052 at the first entry. In both Unix and Windows we automatically skip the '.'
5053 and '..' items at the start of the directory listing.
5054 @param[in]	dirname		directory name; it must not contain a trailing
5055 				'\' or '/'
5056 @param[in]	is_fatal	true if we should treat an error as a fatal
5057 				error; if we try to open symlinks then we do
5058 				not wish a fatal error if it happens not to
5059 				be a directory
5060 @return directory stream, NULL if error */
5061 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)5062 os_file_opendir(
5063 	const char*	dirname,
5064 	bool		error_is_fatal)
5065 {
5066 	os_file_dir_t		dir;
5067 	LPWIN32_FIND_DATA	lpFindFileData;
5068 	char			path[OS_FILE_MAX_PATH + 3];
5069 
5070 	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
5071 
5072 	strcpy(path, dirname);
5073 	strcpy(path + strlen(path), "\\*");
5074 
5075 	/* Note that in Windows opening the 'directory stream' also retrieves
5076 	the first entry in the directory. Since it is '.', that is no problem,
5077 	as we will skip over the '.' and '..' entries anyway. */
5078 
5079 	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
5080 		ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
5081 
5082 	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
5083 
5084 	ut_free(lpFindFileData);
5085 
5086 	if (dir == INVALID_HANDLE_VALUE) {
5087 
5088 		if (error_is_fatal) {
5089 			os_file_handle_error(dirname, "opendir");
5090 		}
5091 
5092 		return(NULL);
5093 	}
5094 
5095 	return(dir);
5096 }
5097 
5098 /** Closes a directory stream.
5099 @param[in]	dir	directory stream
5100 @return 0 if success, -1 if failure */
5101 int
os_file_closedir(os_file_dir_t dir)5102 os_file_closedir(
5103 	os_file_dir_t	dir)
5104 {
5105 	BOOL		ret;
5106 
5107 	ret = FindClose(dir);
5108 
5109 	if (!ret) {
5110 		os_file_handle_error_no_exit(NULL, "closedir", false);
5111 
5112 		return(-1);
5113 	}
5114 
5115 	return(0);
5116 }
5117 
5118 /** This function returns information of the next file in the directory. We
5119 jump over the '.' and '..' entries in the directory.
5120 @param[in]	dirname		directory name or path
5121 @param[in]	dir		directory stream
5122 @param[out]	info		buffer where the info is returned
5123 @return 0 if ok, -1 if error, 1 if at the end of the directory */
5124 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)5125 os_file_readdir_next_file(
5126 	const char*	dirname,
5127 	os_file_dir_t	dir,
5128 	os_file_stat_t*	info)
5129 {
5130 	BOOL		ret;
5131 	int		status;
5132 	WIN32_FIND_DATA	find_data;
5133 
5134 next_file:
5135 
5136 	ret = FindNextFile(dir, &find_data);
5137 
5138 	if (ret > 0) {
5139 
5140 		const char* name;
5141 
5142 		name = static_cast<const char*>(find_data.cFileName);
5143 
5144 		ut_a(strlen(name) < OS_FILE_MAX_PATH);
5145 
5146 		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
5147 
5148 			goto next_file;
5149 		}
5150 
5151 		strcpy(info->name, name);
5152 
5153 		info->size = find_data.nFileSizeHigh;
5154 		info->size <<= 32;
5155 		info->size |= find_data.nFileSizeLow;
5156 
5157 		if (find_data.dwFileAttributes
5158 		    & FILE_ATTRIBUTE_REPARSE_POINT) {
5159 
5160 			/* TODO: test Windows symlinks */
5161 			/* TODO: MySQL has apparently its own symlink
5162 			implementation in Windows, dbname.sym can
5163 			redirect a database directory:
5164 			REFMAN "windows-symbolic-links.html" */
5165 
5166 			info->type = OS_FILE_TYPE_LINK;
5167 
5168 		} else if (find_data.dwFileAttributes
5169 			   & FILE_ATTRIBUTE_DIRECTORY) {
5170 
5171 			info->type = OS_FILE_TYPE_DIR;
5172 
5173 		} else {
5174 
5175 			/* It is probably safest to assume that all other
5176 			file types are normal. Better to check them rather
5177 			than blindly skip them. */
5178 
5179 			info->type = OS_FILE_TYPE_FILE;
5180 		}
5181 
5182 		status = 0;
5183 
5184 	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
5185 
5186 		status = 1;
5187 
5188 	} else {
5189 
5190 		os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
5191 
5192 		status = -1;
5193 	}
5194 
5195 	return(status);
5196 }
5197 
5198 /** NOTE! Use the corresponding macro os_file_create(), not directly
5199 this function!
5200 Opens an existing file or creates a new.
5201 @param[in]	name		name of the file or path as a null-terminated
5202 				string
5203 @param[in]	create_mode	create mode
5204 @param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
5205 				is desired, OS_FILE_NORMAL, if any normal file;
5206 				NOTE that it also depends on type, os_aio_..
5207 				and srv_.. variables whether we really use async
5208 				I/O or unbuffered I/O: look in the function
5209 				source code for the exact rules
5210 @param[in]	type		OS_DATA_FILE or OS_LOG_FILE
5211 @param[in]	success		true if succeeded
5212 @return handle to the file, not defined if error, error number
5213 	can be retrieved with os_file_get_last_error */
5214 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)5215 os_file_create_func(
5216 	const char*	name,
5217 	ulint		create_mode,
5218 	ulint		purpose,
5219 	ulint		type,
5220 	bool		read_only,
5221 	bool*		success)
5222 {
5223 	pfs_os_file_t	file;
5224 	bool		retry;
5225 	bool		on_error_no_exit;
5226 	bool		on_error_silent;
5227 
5228 	*success = false;
5229 
5230 	DBUG_EXECUTE_IF(
5231 		"ib_create_table_fail_disk_full",
5232 		*success = false;
5233 		SetLastError(ERROR_DISK_FULL);
5234 		file.m_file = OS_FILE_CLOSED;
5235 		return(file);
5236 	);
5237 
5238 	DWORD		create_flag;
5239 	DWORD		share_mode = FILE_SHARE_READ;
5240 
5241 	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
5242 		? true : false;
5243 
5244 	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
5245 		? true : false;
5246 
5247 	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
5248 	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
5249 
5250 	if (create_mode == OS_FILE_OPEN_RAW) {
5251 
5252 		ut_a(!read_only);
5253 
5254 		create_flag = OPEN_EXISTING;
5255 
5256 		/* On Windows Physical devices require admin privileges and
5257 		have to have the write-share mode set. See the remarks
5258 		section for the CreateFile() function documentation in MSDN. */
5259 
5260 		share_mode |= FILE_SHARE_WRITE;
5261 
5262 	} else if (create_mode == OS_FILE_OPEN
5263 		   || create_mode == OS_FILE_OPEN_RETRY) {
5264 
5265 		create_flag = OPEN_EXISTING;
5266 
5267 	} else if (read_only) {
5268 
5269 		create_flag = OPEN_EXISTING;
5270 
5271 	} else if (create_mode == OS_FILE_CREATE) {
5272 
5273 		create_flag = CREATE_NEW;
5274 
5275 	} else if (create_mode == OS_FILE_OVERWRITE) {
5276 
5277 		create_flag = CREATE_ALWAYS;
5278 
5279 	} else {
5280 		ib::error()
5281 			<< "Unknown file create mode (" << create_mode << ") "
5282 			<< " for file '" << name << "'";
5283 
5284 		file.m_file = OS_FILE_CLOSED;
5285 		return(file);
5286 	}
5287 
5288 	DWORD		attributes = 0;
5289 
5290 #ifdef UNIV_HOTBACKUP
5291 	attributes |= FILE_FLAG_NO_BUFFERING;
5292 #else
5293 	if (purpose == OS_FILE_AIO) {
5294 
5295 #ifdef WIN_ASYNC_IO
5296 		/* If specified, use asynchronous (overlapped) io and no
5297 		buffering of writes in the OS */
5298 
5299 		if (srv_use_native_aio) {
5300 			attributes |= FILE_FLAG_OVERLAPPED;
5301 		}
5302 #endif /* WIN_ASYNC_IO */
5303 
5304 	} else if (purpose == OS_FILE_NORMAL) {
5305 
5306 		/* Use default setting. */
5307 
5308 	} else {
5309 
5310 		ib::error()
5311 			<< "Unknown purpose flag (" << purpose << ") "
5312 			<< "while opening file '" << name << "'";
5313 
5314 		file.m_file = OS_FILE_CLOSED;
5315 		return(file);
5316 	}
5317 
5318 #ifdef UNIV_NON_BUFFERED_IO
5319 	// TODO: Create a bug, this looks wrong. The flush log
5320 	// parameter is dynamic.
5321 	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
5322 
5323 		/* Do not use unbuffered i/o for the log files because
5324 		value 2 denotes that we do not flush the log at every
5325 		commit, but only once per second */
5326 
5327 	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
5328 
5329 		attributes |= FILE_FLAG_NO_BUFFERING;
5330 	}
5331 #endif /* UNIV_NON_BUFFERED_IO */
5332 
5333 #endif /* UNIV_HOTBACKUP */
5334 	DWORD	access = GENERIC_READ;
5335 
5336 	if (!read_only) {
5337 		access |= GENERIC_WRITE;
5338 	}
5339 
5340 	do {
5341 		/* Use default security attributes and no template file. */
5342 		file.m_file = CreateFile(
5343 			(LPCTSTR) name, access, share_mode, NULL,
5344 			create_flag, attributes, NULL);
5345 
5346 		if (file.m_file == INVALID_HANDLE_VALUE) {
5347 			const char*	operation;
5348 
5349 			operation = (create_mode == OS_FILE_CREATE
5350 				     && !read_only)
5351 				? "create" : "open";
5352 
5353 			*success = false;
5354 
5355 			if (on_error_no_exit) {
5356 				retry = os_file_handle_error_no_exit(
5357 					name, operation, on_error_silent);
5358 			} else {
5359 				retry = os_file_handle_error(name, operation);
5360 			}
5361 		} else {
5362 
5363 			retry = false;
5364 
5365 			*success = true;
5366 
5367 			DWORD	temp;
5368 
5369 			/* This is a best effort use case, if it fails then
5370 			we will find out when we try and punch the hole. */
5371 			DeviceIoControl(
5372 				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
5373 				&temp, NULL);
5374 		}
5375 
5376 	} while (retry);
5377 
5378 	return(file);
5379 }
5380 
5381 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
5382 not directly this function!
5383 A simple function to open or create a file.
5384 @param[in]	name		name of the file or path as a null-terminated
5385 				string
5386 @param[in]	create_mode	create mode
5387 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
5388 				OS_FILE_READ_ALLOW_DELETE; the last option is
5389 				used by a backup program reading the file
5390 @param[out]	success		true if succeeded
5391 @return own: handle to the file, not defined if error, error number
5392 	can be retrieved with os_file_get_last_error */
5393 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)5394 os_file_create_simple_no_error_handling_func(
5395 	const char*	name,
5396 	ulint		create_mode,
5397 	ulint		access_type,
5398 	bool		read_only,
5399 	bool*		success)
5400 {
5401 	pfs_os_file_t	file;
5402 
5403 	*success = false;
5404 
5405 	DWORD		access;
5406 	DWORD		create_flag;
5407 	DWORD		attributes	= 0;
5408 	DWORD		share_mode	= FILE_SHARE_READ;
5409 
5410 	ut_a(name);
5411 
5412 	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
5413 	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
5414 
5415 	if (create_mode == OS_FILE_OPEN) {
5416 
5417 		create_flag = OPEN_EXISTING;
5418 
5419 	} else if (read_only) {
5420 
5421 		create_flag = OPEN_EXISTING;
5422 
5423 	} else if (create_mode == OS_FILE_CREATE) {
5424 
5425 		create_flag = CREATE_NEW;
5426 
5427 	} else {
5428 
5429 		ib::error()
5430 			<< "Unknown file create mode (" << create_mode << ") "
5431 			<< " for file '" << name << "'";
5432 
5433 		file.m_file = OS_FILE_CLOSED;
5434 		return(file);
5435 	}
5436 
5437 	if (access_type == OS_FILE_READ_ONLY) {
5438 
5439 		access = GENERIC_READ;
5440 
5441 	} else if (read_only) {
5442 
5443 		access = GENERIC_READ;
5444 
5445 	} else if (access_type == OS_FILE_READ_WRITE) {
5446 
5447 		access = GENERIC_READ | GENERIC_WRITE;
5448 
5449 	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
5450 
5451 		ut_a(!read_only);
5452 
5453 		access = GENERIC_READ;
5454 
5455 		/*!< A backup program has to give mysqld the maximum
5456 		freedom to do what it likes with the file */
5457 
5458 		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
5459 	} else {
5460 
5461 		ib::error()
5462 			<< "Unknown file access type (" << access_type << ") "
5463 			<< "for file '" << name << "'";
5464 
5465 		file.m_file = OS_FILE_CLOSED;
5466 		return(file);
5467 	}
5468 
5469 	file.m_file = CreateFile((LPCTSTR) name,
5470 			  access,
5471 			  share_mode,
5472 			  NULL,			// Security attributes
5473 			  create_flag,
5474 			  attributes,
5475 			  NULL);		// No template file
5476 
5477 	*success = (file.m_file != INVALID_HANDLE_VALUE);
5478 
5479 	return(file);
5480 }
5481 
5482 /** Deletes a file if it exists. The file has to be closed before calling this.
5483 @param[in]	name		file path as a null-terminated string
5484 @param[out]	exist		indicate if file pre-exist
5485 @return true if success */
5486 bool
os_file_delete_if_exists_func(const char * name,bool * exist)5487 os_file_delete_if_exists_func(
5488 	const char*	name,
5489 	bool*		exist)
5490 {
5491 	ulint	count	= 0;
5492 
5493 	if (exist != NULL) {
5494 		*exist = true;
5495 	}
5496 
5497 	for (;;) {
5498 		/* In Windows, deleting an .ibd file may fail if ibbackup
5499 		is copying it */
5500 
5501 		bool	ret = DeleteFile((LPCTSTR) name);
5502 
5503 		if (ret) {
5504 			return(true);
5505 		}
5506 
5507 		DWORD	lasterr = GetLastError();
5508 
5509 		if (lasterr == ERROR_FILE_NOT_FOUND
5510 		    || lasterr == ERROR_PATH_NOT_FOUND) {
5511 
5512 			/* the file does not exist, this not an error */
5513 			if (exist != NULL) {
5514 				*exist = false;
5515 			}
5516 
5517 			return(true);
5518 		}
5519 
5520 		++count;
5521 
5522 		if (count > 100 && 0 == (count % 10)) {
5523 
5524 			/* Print error information */
5525 			os_file_get_last_error(true);
5526 
5527 			ib::warn() << "Delete of file '" << name << "' failed.";
5528 		}
5529 
5530 		/* Sleep for a second */
5531 		os_thread_sleep(1000000);
5532 
5533 		if (count > 2000) {
5534 
5535 			return(false);
5536 		}
5537 	}
5538 }
5539 
5540 /** Deletes a file. The file has to be closed before calling this.
5541 @param[in]	name		File path as NUL terminated string
5542 @return true if success */
5543 bool
os_file_delete_func(const char * name)5544 os_file_delete_func(
5545 	const char*	name)
5546 {
5547 	ulint	count	= 0;
5548 
5549 	for (;;) {
5550 		/* In Windows, deleting an .ibd file may fail if ibbackup
5551 		is copying it */
5552 
5553 		BOOL	ret = DeleteFile((LPCTSTR) name);
5554 
5555 		if (ret) {
5556 			return(true);
5557 		}
5558 
5559 		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
5560 			/* If the file does not exist, we classify this as
5561 			a 'mild' error and return */
5562 
5563 			return(false);
5564 		}
5565 
5566 		++count;
5567 
5568 		if (count > 100 && 0 == (count % 10)) {
5569 
5570 			/* print error information */
5571 			os_file_get_last_error(true);
5572 
5573 			ib::warn()
5574 				<< "Cannot delete file '" << name << "'. Are "
5575 				<< "you running ibbackup to back up the file?";
5576 		}
5577 
5578 		/* sleep for a second */
5579 		os_thread_sleep(1000000);
5580 
5581 		if (count > 2000) {
5582 
5583 			return(false);
5584 		}
5585 	}
5586 
5587 	ut_error;
5588 	return(false);
5589 }
5590 
5591 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
5592 function!
5593 Renames a file (can also move it to another directory). It is safest that the
5594 file is closed before calling this function.
5595 @param[in]	oldpath		old file path as a null-terminated string
5596 @param[in]	newpath		new file path
5597 @return true if success */
5598 bool
os_file_rename_func(const char * oldpath,const char * newpath)5599 os_file_rename_func(
5600 	const char*	oldpath,
5601 	const char*	newpath)
5602 {
5603 #ifdef UNIV_DEBUG
5604 	os_file_type_t	type;
5605 	bool		exists;
5606 
5607 	/* New path must not exist. */
5608 	ut_ad(os_file_status(newpath, &exists, &type));
5609 	ut_ad(!exists);
5610 
5611 	/* Old path must exist. */
5612 	ut_ad(os_file_status(oldpath, &exists, &type));
5613 	ut_ad(exists);
5614 #endif /* UNIV_DEBUG */
5615 
5616 	if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
5617 		return(true);
5618 	}
5619 
5620 	os_file_handle_error_no_exit(oldpath, "rename", false);
5621 
5622 	return(false);
5623 }
5624 
5625 /** NOTE! Use the corresponding macro os_file_close(), not directly
5626 this function!
5627 Closes a file handle. In case of error, error number can be retrieved with
5628 os_file_get_last_error.
5629 @param[in,own]	file		Handle to a file
5630 @return true if success */
5631 bool
os_file_close_func(os_file_t file)5632 os_file_close_func(
5633 	os_file_t	file)
5634 {
5635 	ut_a(file > 0);
5636 
5637 	if (CloseHandle(file)) {
5638 		return(true);
5639 	}
5640 
5641 	os_file_handle_error(NULL, "close");
5642 
5643 	return(false);
5644 }
5645 
5646 /** Gets a file size.
5647 @param[in]	file		Handle to a file
5648 @return file size, or (os_offset_t) -1 on failure */
5649 os_offset_t
os_file_get_size(pfs_os_file_t file)5650 os_file_get_size(
5651 	pfs_os_file_t	file)
5652 {
5653 	DWORD		high;
5654 	DWORD		low;
5655 
5656 	low = GetFileSize(file.m_file, &high);
5657 
5658 	if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
5659 		return((os_offset_t) -1);
5660 	}
5661 
5662 	return(os_offset_t(low | (os_offset_t(high) << 32)));
5663 }
5664 
5665 /** Gets a file size.
5666 @param[in]	filename	Full path to the filename to check
5667 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
5668 	errno */
5669 os_file_size_t
os_file_get_size(const char * filename)5670 os_file_get_size(
5671 	const char*	filename)
5672 {
5673 	struct __stat64	s;
5674 	os_file_size_t	file_size;
5675 
5676 	int		ret = _stat64(filename, &s);
5677 
5678 	if (ret == 0) {
5679 
5680 		file_size.m_total_size = s.st_size;
5681 
5682 		DWORD	low_size;
5683 		DWORD	high_size;
5684 
5685 		low_size = GetCompressedFileSize(filename, &high_size);
5686 
5687 		if (low_size != INVALID_FILE_SIZE) {
5688 
5689 			file_size.m_alloc_size = high_size;
5690 			file_size.m_alloc_size <<= 32;
5691 			file_size.m_alloc_size |= low_size;
5692 
5693 		} else {
5694 			ib::error()
5695 				<< "GetCompressedFileSize("
5696 				<< filename << ", ..) failed.";
5697 
5698 			file_size.m_alloc_size = (os_offset_t) -1;
5699 		}
5700 	} else {
5701 		file_size.m_total_size = ~0;
5702 		file_size.m_alloc_size = (os_offset_t) ret;
5703 	}
5704 
5705 	return(file_size);
5706 }
5707 
5708 /** This function returns information about the specified file
5709 @param[in]	path		pathname of the file
5710 @param[out]	stat_info	information of a file in a directory
5711 @param[in,out]	statinfo	information of a file in a directory
5712 @param[in]	check_rw_perm	for testing whether the file can be opened
5713 				in RW mode
5714 @param[in]	read_only	true if the file is opened in read-only mode
5715 @return DB_SUCCESS if all OK */
5716 static
5717 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)5718 os_file_get_status_win32(
5719 	const char*	path,
5720 	os_file_stat_t* stat_info,
5721 	struct _stat64*	statinfo,
5722 	bool		check_rw_perm,
5723 	bool		read_only)
5724 {
5725 	int	ret = _stat64(path, statinfo);
5726 
5727 	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
5728 		/* file does not exist */
5729 
5730 		return(DB_NOT_FOUND);
5731 
5732 	} else if (ret) {
5733 		/* file exists, but stat call failed */
5734 
5735 		os_file_handle_error_no_exit(path, "stat", false);
5736 
5737 		return(DB_FAIL);
5738 
5739 	} else if (_S_IFDIR & statinfo->st_mode) {
5740 
5741 		stat_info->type = OS_FILE_TYPE_DIR;
5742 
5743 	} else if (_S_IFREG & statinfo->st_mode) {
5744 
5745 		DWORD	access = GENERIC_READ;
5746 
5747 		if (!read_only) {
5748 			access |= GENERIC_WRITE;
5749 		}
5750 
5751 		stat_info->type = OS_FILE_TYPE_FILE;
5752 
5753 		/* Check if we can open it in read-only mode. */
5754 
5755 		if (check_rw_perm) {
5756 			HANDLE	fh;
5757 
5758 			fh = CreateFile(
5759 				(LPCTSTR) path,		// File to open
5760 				access,
5761 				0,			// No sharing
5762 				NULL,			// Default security
5763 				OPEN_EXISTING,		// Existing file only
5764 				FILE_ATTRIBUTE_NORMAL,	// Normal file
5765 				NULL);			// No attr. template
5766 
5767 			if (fh == INVALID_HANDLE_VALUE) {
5768 				stat_info->rw_perm = false;
5769 			} else {
5770 				stat_info->rw_perm = true;
5771 				CloseHandle(fh);
5772 			}
5773 		}
5774 
5775 		char	volname[MAX_PATH];
5776 		BOOL	result = GetVolumePathName(path, volname, MAX_PATH);
5777 
5778 		if (!result) {
5779 
5780 			ib::error()
5781 				<< "os_file_get_status_win32: "
5782 				<< "Failed to get the volume path name for: "
5783 				<< path
5784 				<< "- OS error number " << GetLastError();
5785 
5786 			return(DB_FAIL);
5787 		}
5788 
5789 		DWORD	sectorsPerCluster;
5790 		DWORD	bytesPerSector;
5791 		DWORD	numberOfFreeClusters;
5792 		DWORD	totalNumberOfClusters;
5793 
5794 		result = GetDiskFreeSpace(
5795 			(LPCSTR) volname,
5796 			&sectorsPerCluster,
5797 			&bytesPerSector,
5798 			&numberOfFreeClusters,
5799 			&totalNumberOfClusters);
5800 
5801 		if (!result) {
5802 
5803 			ib::error()
5804 				<< "GetDiskFreeSpace(" << volname << ",...) "
5805 				<< "failed "
5806 				<< "- OS error number " << GetLastError();
5807 
5808 			return(DB_FAIL);
5809 		}
5810 
5811 		stat_info->block_size = bytesPerSector * sectorsPerCluster;
5812 
5813 		/* On Windows the block size is not used as the allocation
5814 		unit for sparse files. The underlying infra-structure for
5815 		sparse files is based on NTFS compression. The punch hole
5816 		is done on a "compression unit". This compression unit
5817 		is based on the cluster size. You cannot punch a hole if
5818 		the cluster size >= 8K. For smaller sizes the table is
5819 		as follows:
5820 
5821 		Cluster Size	Compression Unit
5822 		512 Bytes		 8 KB
5823 		  1 KB			16 KB
5824 		  2 KB			32 KB
5825 		  4 KB			64 KB
5826 
5827 		Default NTFS cluster size is 4K, compression unit size of 64K.
5828 		Therefore unless the user has created the file system with
5829 		a smaller cluster size and used larger page sizes there is
5830 		little benefit from compression out of the box. */
5831 
5832 		stat_info->block_size = (stat_info->block_size <= 4096)
5833 			?  stat_info->block_size * 16 : ULINT_UNDEFINED;
5834 	} else {
5835 		stat_info->type = OS_FILE_TYPE_UNKNOWN;
5836 	}
5837 
5838 	return(DB_SUCCESS);
5839 }
5840 
5841 /** Truncates a file to a specified size in bytes.
5842 Do nothing if the size to preserve is greater or equal to the current
5843 size of the file.
5844 @param[in]	pathname	file path
5845 @param[in]	file		file to be truncated
5846 @param[in]	size		size to preserve in bytes
5847 @return true if success */
5848 static
5849 bool
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)5850 os_file_truncate_win32(
5851 	const char*	pathname,
5852 	pfs_os_file_t	file,
5853 	os_offset_t	size)
5854 {
5855 	LARGE_INTEGER	length;
5856 
5857 	length.QuadPart = size;
5858 	BOOL	success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
5859 	if (!success) {
5860 		os_file_handle_error_no_exit(
5861 			pathname, "SetFilePointerEx", false);
5862 	} else {
5863 		success = SetEndOfFile(file.m_file);
5864 		if (!success) {
5865 			os_file_handle_error_no_exit(
5866 				pathname, "SetEndOfFile", false);
5867 		}
5868 	}
5869 	return(success);
5870 }
5871 
5872 /** Truncates a file at its current position.
5873 @param[in]	file		Handle to be truncated
5874 @return true if success */
5875 bool
os_file_set_eof(FILE * file)5876 os_file_set_eof(
5877 	FILE*		file)
5878 {
5879 	HANDLE	h = (HANDLE) _get_osfhandle(fileno(file));
5880 
5881 	return(SetEndOfFile(h));
5882 }
5883 
5884 /** Closes a file handle.
5885 @param[in]	file		Handle to close
5886 @return true if success */
5887 bool
os_file_close_no_error_handling_func(os_file_t file)5888 os_file_close_no_error_handling_func(
5889 	os_file_t	file)
5890 {
5891 	return(CloseHandle(file) ? true : false);
5892 }
5893 
5894 /** This function can be called if one wants to post a batch of reads and
5895 prefers an i/o-handler thread to handle them all at once later. You must
5896 call os_aio_simulated_wake_handler_threads later to ensure the threads
5897 are not left sleeping! */
5898 void
os_aio_simulated_put_read_threads_to_sleep()5899 os_aio_simulated_put_read_threads_to_sleep()
5900 {
5901 	AIO::simulated_put_read_threads_to_sleep();
5902 }
5903 
5904 /** This function can be called if one wants to post a batch of reads and
5905 prefers an i/o-handler thread to handle them all at once later. You must
5906 call os_aio_simulated_wake_handler_threads later to ensure the threads
5907 are not left sleeping! */
5908 void
simulated_put_read_threads_to_sleep()5909 AIO::simulated_put_read_threads_to_sleep()
5910 {
5911 	/* The idea of putting background IO threads to sleep is only for
5912 	Windows when using simulated AIO. Windows XP seems to schedule
5913 	background threads too eagerly to allow for coalescing during
5914 	readahead requests. */
5915 
5916 	if (srv_use_native_aio) {
5917 		/* We do not use simulated AIO: do nothing */
5918 
5919 		return;
5920 	}
5921 
5922 	os_aio_recommend_sleep_for_read_threads	= true;
5923 
5924 	for (ulint i = 0; i < os_aio_n_segments; i++) {
5925 		AIO*	array;
5926 
5927 		get_array_and_local_segment(&array, i);
5928 
5929 		if (array == s_reads) {
5930 
5931 			os_event_reset(os_aio_segment_wait_events[i]);
5932 		}
5933 	}
5934 }
5935 
5936 #endif /* !_WIN32*/
5937 
5938 /** Does a syncronous read or write depending upon the type specified
5939 In case of partial reads/writes the function tries
5940 NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
5941 @param[in]	type,		IO flags
5942 @param[in]	file		handle to an open file
5943 @param[out]	buf		buffer where to read
5944 @param[in]	offset		file offset from the start where to read
5945 @param[in]	n		number of bytes to read, starting from offset
5946 @param[out]	err		DB_SUCCESS or error code
5947 @return number of bytes read/written, -1 if error */
5948 static MY_ATTRIBUTE((warn_unused_result))
5949 ssize_t
os_file_io(const IORequest & in_type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5950 os_file_io(
5951 	const IORequest&in_type,
5952 	os_file_t	file,
5953 	void*		buf,
5954 	ulint		n,
5955 	os_offset_t	offset,
5956 	dberr_t*	err)
5957 {
5958 	Block*		block = NULL;
5959 	ulint		original_n = n;
5960 	IORequest	type = in_type;
5961 	ssize_t		bytes_returned = 0;
5962 	byte*		encrypt_log_buf = NULL;
5963 
5964 	if (type.is_compressed()) {
5965 
5966 		/* We don't compress the first page of any file. */
5967 		ut_ad(offset > 0);
5968 
5969 		block = os_file_compress_page(type, buf, &n);
5970 	} else {
5971 		block = NULL;
5972 	}
5973 
5974 	/* We do encryption after compression, since if we do encryption
5975 	before compression, the encrypted data will cause compression fail
5976 	or low compression rate. */
5977 	if (type.is_encrypted() && type.is_write() &&
5978 	    (type.encryption_algorithm().m_type != Encryption::KEYRING ||
5979 		(type.encryption_algorithm().m_key != NULL &&
5980 		 Encryption::can_page_be_keyring_encrypted(reinterpret_cast<byte*>(buf))
5981 		)
5982 	    )) {
5983 		if (!type.is_log()) {
5984 			/* We don't encrypt the first page of any file. */
5985 			Block*	compressed_block = block;
5986 			ut_ad(offset > 0);
5987 
5988 			ut_ad(type.encryption_algorithm().m_key != NULL);
5989 			block = os_file_encrypt_page(type, buf, &n);
5990 
5991 			if (compressed_block != NULL) {
5992 				os_free_block(compressed_block);
5993 			}
5994 		} else {
5995 			/* Skip encrypt log file header */
5996 			if (offset >= LOG_FILE_HDR_SIZE) {
5997 				block = os_file_encrypt_log(type,
5998 							    buf,
5999 							    encrypt_log_buf,
6000 							    &n);
6001 			}
6002 		}
6003         }
6004 
6005 	SyncFileIO	sync_file_io(file, buf, n, offset);
6006 
6007 	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
6008 
6009 		ssize_t	n_bytes = sync_file_io.execute(type);
6010 
6011 		/* Check for a hard error. Not much we can do now. */
6012 		if (n_bytes < 0) {
6013 
6014 			break;
6015 
6016 		} else if ((ulint) n_bytes + bytes_returned == n) {
6017 
6018 			bytes_returned += n_bytes;
6019 
6020 			if (offset > 0
6021 			    && (type.is_compressed() || type.is_read())) {
6022 
6023 				*err = os_file_io_complete(
6024 					type, file,
6025 					reinterpret_cast<byte*>(buf),
6026 					NULL, original_n, offset, n);
6027 			} else {
6028 
6029 				*err = DB_SUCCESS;
6030 			}
6031 
6032 			if (block != NULL) {
6033 				os_free_block(block);
6034 			}
6035 
6036 			if (encrypt_log_buf != NULL) {
6037 				ut_free(encrypt_log_buf);
6038 			}
6039 
6040 			return(original_n);
6041 		}
6042 
6043 		/* Handle partial read/write. */
6044 
6045 		ut_ad((ulint) n_bytes + bytes_returned < n);
6046 
6047 		bytes_returned += (ulint) n_bytes;
6048 
6049 		if (!type.is_partial_io_warning_disabled()) {
6050 
6051 			const char*	op = type.is_read()
6052 				? "read" : "written";
6053 
6054 			ib::warn()
6055 				<< n
6056 				<< " bytes should have been " << op << ". Only "
6057 				<< bytes_returned
6058 				<< " bytes " << op << ". Retrying"
6059 				<< " for the remaining bytes.";
6060 		}
6061 
6062 		/* Advance the offset and buffer by n_bytes */
6063 		sync_file_io.advance(n_bytes);
6064 	}
6065 
6066 	if (block != NULL) {
6067 		os_free_block(block);
6068 	}
6069 
6070 	if (encrypt_log_buf != NULL) {
6071 		ut_free(encrypt_log_buf);
6072 	}
6073 
6074 	*err = DB_IO_ERROR;
6075 
6076 	if (!type.is_partial_io_warning_disabled()) {
6077 		ib::warn()
6078 			<< "Retry attempts for "
6079 			<< (type.is_read() ? "reading" : "writing")
6080 			<< " partial data failed.";
6081 	}
6082 
6083 	return(bytes_returned);
6084 }
6085 
6086 /** Does a synchronous write operation in Posix.
6087 @param[in]	type		IO context
6088 @param[in]	file		handle to an open file
6089 @param[out]	buf		buffer from which to write
6090 @param[in]	n		number of bytes to read, starting from offset
6091 @param[in]	offset		file offset from the start where to read
6092 @param[out]	err		DB_SUCCESS or error code
6093 @return number of bytes written, -1 if error */
6094 static MY_ATTRIBUTE((warn_unused_result))
6095 ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)6096 os_file_pwrite(
6097 	IORequest&	type,
6098 	os_file_t	file,
6099 	const byte*	buf,
6100 	ulint		n,
6101 	os_offset_t	offset,
6102 	dberr_t*	err)
6103 {
6104 	ut_ad(type.validate());
6105 
6106 	++os_n_file_writes;
6107 
6108 	(void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
6109 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
6110 
6111 	ssize_t	n_bytes = os_file_io(type, file, (void*) buf, n, offset, err);
6112 
6113 	DBUG_EXECUTE_IF("xb_simulate_all_o_direct_write_failure",
6114 			n_bytes = -1;
6115 			errno = EINVAL;
6116 			*err = DB_IO_ERROR;);
6117 
6118 	(void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
6119 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
6120 
6121 	return(n_bytes);
6122 }
6123 
6124 /** Requests a synchronous write operation.
6125 @param[in]	type		IO flags
6126 @param[in]	file		handle to an open file
6127 @param[out]	buf		buffer from which to write
6128 @param[in]	offset		file offset from the start where to read
6129 @param[in]	n		number of bytes to read, starting from offset
6130 @return DB_SUCCESS if request was successful, false if fail */
6131 static MY_ATTRIBUTE((warn_unused_result))
6132 dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)6133 os_file_write_page(
6134 	IORequest&	type,
6135 	const char*	name,
6136 	os_file_t	file,
6137 	const byte*	buf,
6138 	os_offset_t	offset,
6139 	ulint		n)
6140 {
6141 	dberr_t		err;
6142 	ut_ad(type.validate());
6143 	ut_ad(n > 0);
6144 
6145 	ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
6146 
6147 	if ((ulint) n_bytes != n && !os_has_said_disk_full) {
6148 
6149 		ib::error()
6150 			<< "Write to file " << name << "failed at offset "
6151 			<< offset << ", " << n
6152 			<< " bytes should have been written,"
6153 			" only " << n_bytes << " were written."
6154 			" Operating system error number " << errno << "."
6155 			" Check that your OS and file system"
6156 			" support files of this size."
6157 			" Check also that the disk is not full"
6158 			" or a disk quota exceeded.";
6159 
6160 		if (strerror(errno) != NULL) {
6161 
6162 			ib::error()
6163 				<< "Error number " << errno
6164 				<< " means '" << strerror(errno) << "'";
6165 		}
6166 
6167 		ib::info() << OPERATING_SYSTEM_ERROR_MSG;
6168 
6169 		os_diagnose_all_o_direct_einval(errno);
6170 
6171 		os_has_said_disk_full = true;
6172 	}
6173 
6174 	return(err);
6175 }
6176 
6177 /** Does a synchronous read operation in Posix.
6178 @param[in]	type		IO flags
6179 @param[in]	file		handle to an open file
6180 @param[out]	buf		buffer where to read
6181 @param[in]	offset		file offset from the start where to read
6182 @param[in]	n		number of bytes to read, starting from offset
6183 @param[out]	err		DB_SUCCESS or error code
6184 @return number of bytes read, -1 if error */
6185 static MY_ATTRIBUTE((warn_unused_result))
6186 ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,trx_t * trx,dberr_t * err)6187 os_file_pread(
6188 	IORequest&	type,
6189 	os_file_t	file,
6190 	void*		buf,
6191 	ulint		n,
6192 	os_offset_t	offset,
6193 	trx_t*		trx,
6194 	dberr_t*	err)
6195 {
6196 	++os_n_file_reads;
6197 
6198 	const ib_time_monotonic_us_t start_time = trx_stats::start_io_read(trx, n);
6199 
6200 	(void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
6201 	MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
6202 
6203 	ssize_t	n_bytes = os_file_io(type, file, buf, n, offset, err);
6204 
6205 	DBUG_EXECUTE_IF("xb_simulate_all_o_direct_read_failure",
6206 			n_bytes = -1;
6207 			errno = EINVAL;);
6208 
6209 	trx_stats::end_io_read(trx, start_time);
6210 
6211 	(void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
6212 	MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
6213 
6214 	return(n_bytes);
6215 }
6216 
6217 /** Requests a synchronous positioned read operation.
6218 @return DB_SUCCESS if request was successful, false if fail
6219 @param[in]	type		IO flags
6220 @param[in]	file		handle to an open file
6221 @param[out]	buf		buffer where to read
6222 @param[in]	offset		file offset from the start where to read
6223 @param[in]	n		number of bytes to read, starting from offset
6224 @param[out]	o		number of bytes actually read
6225 @param[in]	exit_on_err	if true then exit on error
6226 @return DB_SUCCESS or error code */
6227 static MY_ATTRIBUTE((warn_unused_result))
6228 dberr_t
os_file_read_page(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err,trx_t * trx)6229 os_file_read_page(
6230 	IORequest&	type,
6231 	os_file_t	file,
6232 	void*		buf,
6233 	os_offset_t	offset,
6234 	ulint		n,
6235 	ulint*		o,
6236 	bool		exit_on_err,
6237 	trx_t*		trx)
6238 {
6239 	dberr_t		err;
6240 
6241 	os_bytes_read_since_printout += n;
6242 
6243 	ut_ad(type.validate());
6244 	ut_ad(n > 0);
6245 
6246 	for (;;) {
6247 		ssize_t	n_bytes;
6248 
6249 		n_bytes = os_file_pread(type, file, buf, n, offset, trx, &err);
6250 
6251 		if (o != NULL) {
6252 			*o = n_bytes;
6253 		}
6254 
6255 		if (err != DB_SUCCESS && !exit_on_err) {
6256 
6257 			return(err);
6258 
6259 		} else if ((ulint) n_bytes == n) {
6260 
6261 			/*The page decryption failed - will handled by buf_io_comptelete*/
6262 			if (err == DB_IO_DECRYPT_FAIL)
6263 				return (DB_IO_DECRYPT_FAIL);
6264 
6265 			/** The read will succeed but decompress can fail
6266 			for various reasons. */
6267 
6268 			if (type.is_compression_enabled()
6269 			    && !Compression::is_compressed_page(
6270 				    static_cast<byte*>(buf))) {
6271 
6272 				return(DB_SUCCESS);
6273 
6274 			} else {
6275 				return(err);
6276 			}
6277 		}
6278 
6279 		const std::string fd_path = os_file_find_path_for_fd(file);
6280 		if (!fd_path.empty()) {
6281 			ib::error() << "Tried to read " << n
6282 				    << " bytes at offset " << offset
6283 				    << ", but was only able to read " << n_bytes
6284 				    << " of FD " << file
6285 				    << ", filename " << fd_path;
6286 		} else {
6287 			ib::error() << "Tried to read " << n
6288 				    << " bytes at offset " << offset
6289 				    << ", but was only able to read " << n_bytes;
6290 		}
6291 
6292 		if (exit_on_err) {
6293 
6294 			if (!os_file_handle_error(NULL, "read")) {
6295 				/* Hard error */
6296 				break;
6297 			}
6298 
6299 		} else if (!os_file_handle_error_no_exit(NULL, "read", false)) {
6300 
6301 			/* Hard error */
6302 			break;
6303 		}
6304 
6305 		if (n_bytes > 0 && (ulint) n_bytes < n) {
6306 			n -= (ulint) n_bytes;
6307 			offset += (ulint) n_bytes;
6308 			buf = reinterpret_cast<uchar*>(buf) + (ulint) n_bytes;
6309 		}
6310 	}
6311 
6312 	ib::fatal()
6313 		<< "Cannot read from file. OS error number "
6314 		<< errno << ".";
6315 
6316 	return(err);
6317 }
6318 
6319 /** Retrieves the last error number if an error occurs in a file io function.
6320 The number should be retrieved before any other OS calls (because they may
6321 overwrite the error number). If the number is not known to this program,
6322 the OS error number + 100 is returned.
6323 @param[in]	report_all_errors	true if we want an error printed
6324 					for all errors
6325 @return error number, or OS error number + 100 */
6326 ulint
os_file_get_last_error(bool report_all_errors)6327 os_file_get_last_error(
6328 	bool	report_all_errors)
6329 {
6330 	return(os_file_get_last_error_low(report_all_errors, false));
6331 }
6332 
6333 /** Does error handling when a file operation fails.
6334 Conditionally exits (calling srv_fatal_error()) based on should_exit value
6335 and the error type, if should_exit is true then on_error_silent is ignored.
6336 @param[in]	name		name of a file or NULL
6337 @param[in]	operation	operation
6338 @param[in]	should_exit	call srv_fatal_error() on an unknown error,
6339 				if this parameter is true
6340 @param[in]	on_error_silent	if true then don't print any message to the log
6341 				iff it is an unknown non-fatal error
6342 @return true if we should retry the operation */
6343 static MY_ATTRIBUTE((warn_unused_result))
6344 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)6345 os_file_handle_error_cond_exit(
6346 	const char*	name,
6347 	const char*	operation,
6348 	bool		should_exit,
6349 	bool		on_error_silent)
6350 {
6351 	ulint	err;
6352 
6353 	err = os_file_get_last_error_low(false, on_error_silent);
6354 
6355 	switch (err) {
6356 	case OS_FILE_DISK_FULL:
6357 		/* We only print a warning about disk full once */
6358 
6359 		if (os_has_said_disk_full) {
6360 
6361 			return(false);
6362 		}
6363 
6364 		/* Disk full error is reported irrespective of the
6365 		on_error_silent setting. */
6366 
6367 		if (name) {
6368 
6369 			ib::error()
6370 				<< "Encountered a problem with file '"
6371 				<< name << "'";
6372 		}
6373 
6374 		ib::error()
6375 			<< "Disk is full. Try to clean the disk to free space.";
6376 
6377 		os_has_said_disk_full = true;
6378 
6379 		return(false);
6380 
6381 	case OS_FILE_AIO_RESOURCES_RESERVED:
6382 	case OS_FILE_AIO_INTERRUPTED:
6383 
6384 		return(true);
6385 
6386 	case OS_FILE_PATH_ERROR:
6387 	case OS_FILE_ALREADY_EXISTS:
6388 	case OS_FILE_ACCESS_VIOLATION:
6389 
6390 		return(false);
6391 
6392 	case OS_FILE_SHARING_VIOLATION:
6393 
6394 		os_thread_sleep(10000000);	/* 10 sec */
6395 		return(true);
6396 
6397 	case OS_FILE_OPERATION_ABORTED:
6398 	case OS_FILE_INSUFFICIENT_RESOURCE:
6399 
6400 		os_thread_sleep(100000);	/* 100 ms */
6401 		return(true);
6402 
6403 	default:
6404 
6405 		/* If it is an operation that can crash on error then it
6406 		is better to ignore on_error_silent and print an error message
6407 		to the log. */
6408 
6409 		if (should_exit || !on_error_silent) {
6410 			ib::error() << "File "
6411 				<< (name != NULL ? name : "(unknown)")
6412 				<< ": '" << operation << "'"
6413 				" returned OS error " << err << "."
6414 				<< (should_exit
6415 				    ? " Cannot continue operation" : "");
6416 		}
6417 
6418 		if (should_exit) {
6419 			srv_fatal_error();
6420 		}
6421 	}
6422 
6423 	return(false);
6424 }
6425 
6426 /** Does error handling when a file operation fails.
6427 @param[in]	name		name of a file or NULL
6428 @param[in]	operation	operation name that failed
6429 @return true if we should retry the operation */
6430 static
6431 bool
os_file_handle_error(const char * name,const char * operation)6432 os_file_handle_error(
6433 	const char*	name,
6434 	const char*	operation)
6435 {
6436 	/* Exit in case of unknown error */
6437 	return(os_file_handle_error_cond_exit(name, operation, true, false));
6438 }
6439 
6440 /** Does error handling when a file operation fails.
6441 @param[in]	name		name of a file or NULL
6442 @param[in]	operation	operation name that failed
6443 @param[in]	on_error_silent	if true then don't print any message to the log.
6444 @return true if we should retry the operation */
6445 static
6446 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)6447 os_file_handle_error_no_exit(
6448 	const char*	name,
6449 	const char*	operation,
6450 	bool		on_error_silent)
6451 {
6452 	/* Don't exit in case of unknown error */
6453 	return(os_file_handle_error_cond_exit(
6454 			name, operation, false, on_error_silent));
6455 }
6456 
6457 /** Tries to disable OS caching on an opened file descriptor.
6458 @param[in]	fd		file descriptor to alter
6459 @param[in]	file_name	file name, used in the diagnostic message
6460 @param[in]	name		"open" or "create"; used in the diagnostic
6461 				message
6462 @param[in]	failure_warning	if true (the default), the failure to disable
6463 caching is diagnosed at warning severity, and at note severity otherwise
6464 @return true if operation is success and false */
6465 bool
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)),bool failure_warning MY_ATTRIBUTE ((unused)))6466 os_file_set_nocache(
6467 	int		fd		MY_ATTRIBUTE((unused)),
6468 	const char*	file_name	MY_ATTRIBUTE((unused)),
6469 	const char*	operation_name	MY_ATTRIBUTE((unused)),
6470 	bool		failure_warning MY_ATTRIBUTE((unused)))
6471 {
6472 	/* some versions of Solaris may not have DIRECTIO_ON */
6473 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
6474 	if (directio(fd, DIRECTIO_ON) == -1) {
6475 		int	errno_save = errno;
6476 
6477 		ib::error()
6478 			<< "Failed to set DIRECTIO_ON on file "
6479 			<< file_name << ": " << operation_name
6480 			<< strerror(errno_save) << ","
6481 			" continuing anyway.";
6482 		return false;
6483 	}
6484 #elif defined(O_DIRECT)
6485 	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
6486 		int		errno_save = errno;
6487 		static bool	warning_message_printed = false;
6488 		if (errno_save == EINVAL) {
6489 			if (!warning_message_printed) {
6490 				warning_message_printed = true;
6491 # ifdef UNIV_LINUX
6492 				ib::warn_or_info(failure_warning)
6493 					<< "Failed to set O_DIRECT on file "
6494 					<< file_name << ";" << operation_name
6495 					<< ": " << strerror(errno_save) << ", "
6496 					<< "continuing anyway. O_DIRECT is "
6497 					"known to result in 'Invalid argument' "
6498 					"on Linux on tmpfs, "
6499 					"see MySQL Bug#26662.";
6500 # else /* UNIV_LINUX */
6501 				goto short_warning;
6502 # endif /* UNIV_LINUX */
6503 			}
6504 		} else {
6505 # ifndef UNIV_LINUX
6506 short_warning:
6507 # endif
6508 			ib::warn_or_info(failure_warning)
6509 				<< "Failed to set O_DIRECT on file "
6510 				<< file_name << "; " << operation_name
6511 				<< " : " << strerror(errno_save)
6512 				<< " continuing anyway.";
6513 		}
6514 		return false;
6515 	}
6516 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
6517 	return true;
6518 }
6519 
6520 /** Write the specified number of zeros to a newly created file.
6521 @param[in]	name		name of the file or path as a null-terminated
6522 				string
6523 @param[in]	file		handle to a file
6524 @param[in]	size		file size
6525 @param[in]	read_only	Enable read-only checks if true
6526 @return true if success */
6527 bool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size,bool read_only)6528 os_file_set_size(
6529 	const char*	name,
6530 	pfs_os_file_t	file,
6531 	os_offset_t	size,
6532 	bool		read_only)
6533 {
6534 	/* Write up to 1 megabyte at a time. */
6535 	ulint	buf_size = ut_min(
6536 		static_cast<ulint>(64),
6537 		static_cast<ulint>(size / UNIV_PAGE_SIZE));
6538 
6539 	buf_size *= UNIV_PAGE_SIZE;
6540 
6541 	/* Align the buffer for possible raw i/o */
6542 	byte*	buf2;
6543 
6544 	buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
6545 
6546 	byte*	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
6547 
6548 	/* Write buffer full of zeros */
6549 	memset(buf, 0, buf_size);
6550 
6551 	if (size >= (os_offset_t) 100 << 20) {
6552 
6553 		ib::info() << "Progress in MB:";
6554 	}
6555 
6556 	os_offset_t	current_size = 0;
6557 
6558 	while (current_size < size) {
6559 		ulint	n_bytes;
6560 
6561 		if (size - current_size < (os_offset_t) buf_size) {
6562 			n_bytes = (ulint) (size - current_size);
6563 		} else {
6564 			n_bytes = buf_size;
6565 		}
6566 
6567 		dberr_t		err;
6568 		IORequest	request(IORequest::WRITE);
6569 
6570 #ifdef UNIV_HOTBACKUP
6571 
6572 		err = os_file_write(
6573 			request, name, file, buf, current_size, n_bytes);
6574 #else
6575 		/* Using OS_AIO_SYNC mode on POSIX systems will result in
6576 		fall back to os_file_write/read. On Windows it will use
6577 		special mechanism to wait before it returns back. */
6578 
6579 		err = os_aio(
6580 			request,
6581 			OS_AIO_SYNC, name,
6582 			file, buf, current_size, n_bytes,
6583 			read_only, NULL, NULL, 0, NULL, false);
6584 #endif /* UNIV_HOTBACKUP */
6585 
6586 		if (err != DB_SUCCESS) {
6587 
6588 			ut_free(buf2);
6589 			return(false);
6590 		}
6591 
6592 		/* Print about progress for each 100 MB written */
6593 		if ((current_size + n_bytes) / (100 << 20)
6594 		    != current_size / (100 << 20)) {
6595 
6596 			fprintf(stderr, " %lu00",
6597 				(ulong) ((current_size + n_bytes)
6598 					 / (100 << 20)));
6599 		}
6600 
6601 		current_size += n_bytes;
6602 	}
6603 
6604 	if (size >= (os_offset_t) 100 << 20) {
6605 
6606 		fprintf(stderr, "\n");
6607 	}
6608 
6609 	ut_free(buf2);
6610 
6611 	return(os_file_flush(file));
6612 }
6613 
6614 /** Truncates a file to a specified size in bytes.
6615 Do nothing if the size to preserve is greater or equal to the current
6616 size of the file.
6617 @param[in]	pathname	file path
6618 @param[in]	file		file to be truncated
6619 @param[in]	size		size to preserve in bytes
6620 @return true if success */
6621 bool
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)6622 os_file_truncate(
6623 	const char*	pathname,
6624 	pfs_os_file_t	file,
6625 	os_offset_t	size)
6626 {
6627 	/* Do nothing if the size preserved is larger than or equal to the
6628 	current size of file */
6629 	os_offset_t	size_bytes = os_file_get_size(file);
6630 
6631 	if (size >= size_bytes) {
6632 		return(true);
6633 	}
6634 
6635 #ifdef _WIN32
6636 	return(os_file_truncate_win32(pathname, file, size));
6637 #else /* _WIN32 */
6638 	return(os_file_truncate_posix(pathname, file, size));
6639 #endif /* _WIN32 */
6640 }
6641 
6642 /** NOTE! Use the corresponding macro os_file_read(), not directly this
6643 function!
6644 Requests a synchronous positioned read operation.
6645 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6646 @param[in]	type		IO flags
6647 @param[in]	file		handle to an open file
6648 @param[out]	buf		buffer where to read
6649 @param[in]	offset		file offset from the start where to read
6650 @param[in]	n		number of bytes to read, starting from offset
6651 @return DB_SUCCESS or error code */
6652 dberr_t
os_file_read_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,trx_t * trx)6653 os_file_read_func(
6654 	IORequest&	type,
6655 	os_file_t	file,
6656 	void*		buf,
6657 	os_offset_t	offset,
6658 	ulint		n,
6659 	trx_t*		trx)
6660 {
6661 	ut_ad(type.is_read());
6662 
6663 	return(os_file_read_page(type, file, buf, offset, n, NULL, true, trx));
6664 }
6665 
6666 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
6667 not directly this function!
6668 Requests a synchronous positioned read operation.
6669 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6670 @param[in]	type		IO flags
6671 @param[in]	file		handle to an open file
6672 @param[out]	buf		buffer where to read
6673 @param[in]	offset		file offset from the start where to read
6674 @param[in]	n		number of bytes to read, starting from offset
6675 @param[out]	o		number of bytes actually read
6676 @return DB_SUCCESS or error code */
6677 dberr_t
os_file_read_no_error_handling_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)6678 os_file_read_no_error_handling_func(
6679 	IORequest&	type,
6680 	os_file_t	file,
6681 	void*		buf,
6682 	os_offset_t	offset,
6683 	ulint		n,
6684 	ulint*		o)
6685 {
6686 	ut_ad(type.is_read());
6687 
6688 	return(os_file_read_page(type, file, buf, offset, n, o, false, NULL));
6689 }
6690 
6691 /** NOTE! Use the corresponding macro os_file_write(), not directly
6692 Requests a synchronous write operation.
6693 @param[in]	type		IO flags
6694 @param[in]	file		handle to an open file
6695 @param[out]	buf		buffer from which to write
6696 @param[in]	offset		file offset from the start where to read
6697 @param[in]	n		number of bytes to read, starting from offset
6698 @return DB_SUCCESS if request was successful, false if fail */
6699 dberr_t
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)6700 os_file_write_func(
6701 	IORequest&	type,
6702 	const char*	name,
6703 	os_file_t	file,
6704 	const void*	buf,
6705 	os_offset_t	offset,
6706 	ulint		n)
6707 {
6708 	ut_ad(type.validate());
6709 	ut_ad(type.is_write());
6710 
6711 	/* We never compress the first page.
6712 	Note: This assumes we always do block IO. */
6713 	if (offset == 0) {
6714 		type.clear_compressed();
6715 	}
6716 
6717 	const byte*	ptr = reinterpret_cast<const byte*>(buf);
6718 
6719 	return(os_file_write_page(type, name, file, ptr, offset, n));
6720 }
6721 
6722 /** Check the existence and type of the given file.
6723 @param[in]	path		path name of file
6724 @param[out]	exists		true if the file exists
6725 @param[out]	type		Type of the file, if it exists
6726 @return true if call succeeded */
6727 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)6728 os_file_status(
6729 	const char*	path,
6730 	bool*		exists,
6731 	os_file_type_t* type)
6732 {
6733 #ifdef _WIN32
6734 	return(os_file_status_win32(path, exists, type));
6735 #else
6736 	return(os_file_status_posix(path, exists, type));
6737 #endif /* _WIN32 */
6738 }
6739 
6740 /** Free storage space associated with a section of the file.
6741 @param[in]	fh		Open file handle
6742 @param[in]	off		Starting offset (SEEK_SET)
6743 @param[in]	len		Size of the hole
6744 @return DB_SUCCESS or error code */
6745 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)6746 os_file_punch_hole(
6747 	os_file_t	fh,
6748 	os_offset_t	off,
6749 	os_offset_t	len)
6750 {
6751 	/* In this debugging mode, we act as if punch hole is supported,
6752 	and then skip any calls to actually punch a hole here.
6753 	In this way, Transparent Page Compression is still being tested. */
6754 	DBUG_EXECUTE_IF("ignore_punch_hole",
6755 		return(DB_SUCCESS);
6756 	);
6757 
6758 #ifdef _WIN32
6759 	return(os_file_punch_hole_win32(fh, off, len));
6760 #else
6761 	return(os_file_punch_hole_posix(fh, off, len));
6762 #endif /* _WIN32 */
6763 }
6764 
6765 /** Check if the file system supports sparse files.
6766 
6767 Warning: On POSIX systems we try and punch a hole from offset 0 to
6768 the system configured page size. This should only be called on an empty
6769 file.
6770 
6771 Note: On Windows we use the name and on Unices we use the file handle.
6772 
6773 @param[in]	name		File name
6774 @param[in]	fh		File handle for the file - if opened
6775 @return true if the file system supports sparse files */
6776 bool
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)6777 os_is_sparse_file_supported(const char* path, pfs_os_file_t fh)
6778 {
6779 	/* In this debugging mode, we act as if punch hole is supported,
6780 	then we skip any calls to actually punch a hole.  In this way,
6781 	Transparent Page Compression is still being tested. */
6782 	DBUG_EXECUTE_IF("ignore_punch_hole",
6783 		return(true);
6784 	);
6785 
6786 #ifdef _WIN32
6787 	return(os_is_sparse_file_supported_win32(path));
6788 #else
6789 	dberr_t	err;
6790 
6791 	/* We don't know the FS block size, use the sector size. The FS
6792 	will do the magic. */
6793 	err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
6794 
6795 	return(err == DB_SUCCESS);
6796 #endif /* _WIN32 */
6797 }
6798 
6799 /** This function returns information about the specified file
6800 @param[in]	path		pathname of the file
6801 @param[out]	stat_info	information of a file in a directory
6802 @param[in]	check_rw_perm	for testing whether the file can be opened
6803 				in RW mode
6804 @param[in]	read_only	true if file is opened in read-only mode
6805 @return DB_SUCCESS if all OK */
6806 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)6807 os_file_get_status(
6808 	const char*	path,
6809 	os_file_stat_t* stat_info,
6810 	bool		check_rw_perm,
6811 	bool		read_only)
6812 {
6813 	dberr_t	ret;
6814 
6815 #ifdef _WIN32
6816 	struct _stat64	info;
6817 
6818 	ret = os_file_get_status_win32(
6819 		path, stat_info, &info, check_rw_perm, read_only);
6820 
6821 #else
6822 	struct stat	info;
6823 
6824 	ret = os_file_get_status_posix(
6825 		path, stat_info, &info, check_rw_perm, read_only);
6826 
6827 #endif /* _WIN32 */
6828 
6829 	if (ret == DB_SUCCESS) {
6830 		stat_info->ctime = info.st_ctime;
6831 		stat_info->atime = info.st_atime;
6832 		stat_info->mtime = info.st_mtime;
6833 		stat_info->size  = info.st_size;
6834 	}
6835 
6836 	return(ret);
6837 }
6838 
6839 /**
6840 Waits for an AIO operation to complete. This function is used to wait the
6841 for completed requests. The aio array of pending requests is divided
6842 into segments. The thread specifies which segment or slot it wants to wait
6843 for. NOTE: this function will also take care of freeing the aio slot,
6844 therefore no other thread is allowed to do the freeing!
6845 @param[in]	segment		The number of the segment in the aio arrays to
6846 				wait for; segment 0 is the ibuf I/O thread,
6847 				segment 1 the log I/O thread, then follow the
6848 				non-ibuf read threads, and as the last are the
6849 				non-ibuf write threads; if this is
6850 				ULINT_UNDEFINED, then it means that sync AIO
6851 				is used, and this parameter is ignored
6852 @param[out]	m1		the messages passed with the AIO request; note
6853 				that also in the case where the AIO operation
6854 				failed, these output parameters are valid and
6855 				can be used to restart the operation,
6856 				for example
6857 @param[out]	m2		callback message
6858 @param[out]	type		OS_FILE_WRITE or ..._READ
6859 @return DB_SUCCESS or error code */
6860 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6861 os_aio_handler(
6862 	ulint		segment,
6863 	fil_node_t**	m1,
6864 	void**		m2,
6865 	IORequest*	request)
6866 {
6867 	dberr_t	err;
6868 
6869 	if (srv_use_native_aio) {
6870 		srv_set_io_thread_op_info(segment, "native aio handle");
6871 
6872 #ifdef WIN_ASYNC_IO
6873 
6874 		err = os_aio_windows_handler(segment, 0, m1, m2, request);
6875 
6876 #elif defined(LINUX_NATIVE_AIO)
6877 
6878 		err = os_aio_linux_handler(segment, m1, m2, request);
6879 
6880 #else
6881 		ut_error;
6882 
6883 		err = DB_ERROR; /* Eliminate compiler warning */
6884 
6885 #endif /* WIN_ASYNC_IO */
6886 
6887 	} else {
6888 		srv_set_io_thread_op_info(segment, "simulated aio handle");
6889 
6890 		err = os_aio_simulated_handler(segment, m1, m2, request);
6891 	}
6892 
6893 	return(err);
6894 }
6895 
6896 /** Constructor
6897 @param[in]	id		The latch ID
6898 @param[in]	n		Number of AIO slots
6899 @param[in]	segments	Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)6900 AIO::AIO(
6901 	latch_id_t	id,
6902 	ulint		n,
6903 	ulint		segments)
6904 	:
6905 	m_slots(n),
6906 	m_n_segments(segments),
6907 	m_n_reserved()
6908 # ifdef LINUX_NATIVE_AIO
6909 	,m_aio_ctx(),
6910 	m_events(m_slots.size())
6911 	,m_pending(NULL)
6912 	,m_count(NULL)
6913 # elif defined(_WIN32)
6914 	,m_handles()
6915 # endif /* LINUX_NATIVE_AIO */
6916 {
6917 	ut_a(n > 0);
6918 	ut_a(m_n_segments > 0);
6919 
6920 	mutex_create(id, &m_mutex);
6921 
6922 	m_not_full = os_event_create("aio_not_full");
6923 	m_is_empty = os_event_create("aio_is_empty");
6924 
6925 	std::uninitialized_fill(m_slots.begin(), m_slots.end(), Slot());
6926 #ifdef LINUX_NATIVE_AIO
6927 	memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
6928 #endif /* LINUX_NATIVE_AIO */
6929 
6930 	os_event_set(m_is_empty);
6931 }
6932 
6933 /** Initialise the slots */
6934 dberr_t
init_slots()6935 AIO::init_slots()
6936 {
6937 	for (ulint i = 0; i < m_slots.size(); ++i) {
6938 		Slot&	slot = m_slots[i];
6939 
6940 		slot.pos = static_cast<uint16_t>(i);
6941 
6942 		slot.is_reserved = false;
6943 
6944 #ifdef WIN_ASYNC_IO
6945 
6946 		slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6947 
6948 		OVERLAPPED*	over = &slot.control;
6949 
6950 		over->hEvent = slot.handle;
6951 
6952 		(*m_handles)[i] = over->hEvent;
6953 
6954 #elif defined(LINUX_NATIVE_AIO)
6955 
6956 		slot.ret = 0;
6957 
6958 		slot.n_bytes = 0;
6959 
6960 		memset(&slot.control, 0x0, sizeof(slot.control));
6961 
6962 #endif /* WIN_ASYNC_IO */
6963 	}
6964 
6965 	return(DB_SUCCESS);
6966 }
6967 
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface: one io_context per segment
plus the arrays used to track pending requests.
@return DB_SUCCESS, DB_OUT_OF_MEMORY if an allocation failed, or
DB_IO_ERROR if an io_context could not be created */
dberr_t
AIO::init_linux_native_aio()
{
	/* Initialize the io_context array. One io_context
	per segment in the array. */

	ut_a(m_aio_ctx == NULL);

	m_aio_ctx = static_cast<io_context**>(
		ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));

	if (m_aio_ctx == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	io_context**	ctx = m_aio_ctx;
	ulint		max_events = slots_per_segment();

	for (ulint i = 0; i < m_n_segments; ++i, ++ctx) {

		if (!linux_create_io_ctx(max_events, ctx)) {
			/* If something bad happened during aio setup
			we should call it a day and return right away.
			We don't care about any leaks because a failure
			to initialize the io subsystem means that the
			server (or atleast the innodb storage engine)
			is not going to startup. */
			return(DB_IO_ERROR);
		}
	}

	/* Check these allocations the same way as m_aio_ctx above, so
	that a failed allocation is reported instead of leaving a NULL
	array behind an apparently successful initialisation. */
	m_pending = static_cast<struct iocb**>(
		ut_zalloc_nokey(m_slots.size() * sizeof(struct iocb*)));

	if (m_pending == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	m_count = static_cast<ulint*>(
		ut_zalloc_nokey(m_n_segments * sizeof(ulint)));

	if (m_count == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
7009 
7010 /** Initialise the array */
7011 dberr_t
init()7012 AIO::init()
7013 {
7014 	ut_a(!m_slots.empty());
7015 
7016 #ifdef _WIN32
7017 	ut_a(m_handles == NULL);
7018 
7019 	m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
7020 #endif /* _WIN32 */
7021 
7022 	if (srv_use_native_aio) {
7023 #ifdef LINUX_NATIVE_AIO
7024 		dberr_t	err = init_linux_native_aio();
7025 
7026 		if (err != DB_SUCCESS) {
7027 			return(err);
7028 		}
7029 
7030 #endif /* LINUX_NATIVE_AIO */
7031 	}
7032 
7033 	return(init_slots());
7034 }
7035 
7036 /** Creates an aio wait array. Note that we return NULL in case of failure.
7037 We don't care about freeing memory here because we assume that a
7038 failure will result in server refusing to start up.
7039 @param[in]	id		Latch ID
7040 @param[in]	n		maximum number of pending AIO operations
7041 				allowed; n must be divisible by m_n_segments
7042 @param[in]	n_segments	number of segments in the AIO array
7043 @return own: AIO array, NULL on failure */
7044 AIO*
create(latch_id_t id,ulint n,ulint n_segments)7045 AIO::create(
7046 	latch_id_t	id,
7047 	ulint		n,
7048 	ulint		n_segments)
7049 {
7050 	if ((n % n_segments)) {
7051 
7052 		ib::error()
7053 			<< "Maximum number of AIO operations must be "
7054 			<< "divisible by number of segments";
7055 
7056 		return(NULL);
7057 	}
7058 
7059 	AIO*	array = UT_NEW_NOKEY(AIO(id, n, n_segments));
7060 
7061 	if (array != NULL && array->init() != DB_SUCCESS) {
7062 
7063 		UT_DELETE(array);
7064 
7065 		array = NULL;
7066 	}
7067 
7068 	return(array);
7069 }
7070 
7071 /** AIO destructor */
~AIO()7072 AIO::~AIO()
7073 {
7074 #ifdef WIN_ASYNC_IO
7075 	for (ulint i = 0; i < m_slots.size(); ++i) {
7076 		CloseHandle(m_slots[i].handle);
7077 	}
7078 #endif /* WIN_ASYNC_IO */
7079 
7080 #ifdef _WIN32
7081 	UT_DELETE(m_handles);
7082 #endif /* _WIN32 */
7083 
7084 	mutex_destroy(&m_mutex);
7085 
7086 	os_event_destroy(m_not_full);
7087 	os_event_destroy(m_is_empty);
7088 
7089 #if defined(LINUX_NATIVE_AIO)
7090 	if (srv_use_native_aio) {
7091 		m_events.clear();
7092 		ut_free(m_aio_ctx);
7093 #ifdef UNIV_DEBUG
7094 		if (m_pending) {
7095 			for (size_t idx = 0; idx < m_slots.size(); ++idx)
7096 				ut_ad(m_pending[idx] == NULL);
7097 		}
7098 		if (m_count) {
7099 			for (size_t idx = 0; idx < m_n_segments; ++idx)
7100 				ut_ad(m_count[idx] == 0);
7101 		}
7102 #endif
7103 		ut_free(m_pending);
7104 		ut_free(m_count);
7105 }
7106 #endif /* LINUX_NATIVE_AIO */
7107 
7108 	m_slots.clear();
7109 }
7110 
7111 /** Initializes the asynchronous io system. Creates one array each for ibuf
7112 and log i/o. Also creates one array each for read and write where each
7113 array is divided logically into n_readers and n_writers
7114 respectively. The caller must create an i/o handler thread for each
7115 segment in these arrays. This function also creates the sync array.
7116 No i/o handler thread needs to be created for that
7117 @param[in]	n_per_seg	maximum number of pending aio
7118 				operations allowed per segment
7119 @param[in]	n_readers	number of reader threads
7120 @param[in]	n_writers	number of writer threads
7121 @param[in]	n_slots_sync	number of slots in the sync aio array
7122 @return true if the AIO sub-system was started successfully */
7123 bool
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)7124 AIO::start(
7125 	ulint		n_per_seg,
7126 	ulint		n_readers,
7127 	ulint		n_writers,
7128 	ulint		n_slots_sync)
7129 {
7130 #if defined(LINUX_NATIVE_AIO)
7131 	/* Check if native aio is supported on this system and tmpfs */
7132 	if (srv_use_native_aio && !is_linux_native_aio_supported()) {
7133 
7134 		ib::warn() << "Linux Native AIO disabled.";
7135 
7136 		srv_use_native_aio = FALSE;
7137 	}
7138 #endif /* LINUX_NATIVE_AIO */
7139 
7140 	srv_reset_io_thread_op_info();
7141 
7142 	s_reads = create(
7143 		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
7144 
7145 	if (s_reads == NULL) {
7146 		return(false);
7147 	}
7148 
7149 	ulint	start = srv_read_only_mode ? 0 : 2;
7150 	ulint	n_segs = n_readers + start;
7151 
7152 	/* 0 is the ibuf segment and 1 is the redo log segment. */
7153 	for (ulint i = start; i < n_segs; ++i) {
7154 		ut_a(i < SRV_MAX_N_IO_THREADS);
7155 		srv_io_thread_function[i] = "read thread";
7156 	}
7157 
7158 	ulint	n_segments = n_readers;
7159 
7160 	if (!srv_read_only_mode) {
7161 
7162 		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
7163 
7164 		if (s_ibuf == NULL) {
7165 			return(false);
7166 		}
7167 
7168 		++n_segments;
7169 
7170 		srv_io_thread_function[0] = "insert buffer thread";
7171 
7172 		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
7173 
7174 		if (s_log == NULL) {
7175 			return(false);
7176 		}
7177 
7178 		++n_segments;
7179 
7180 		srv_io_thread_function[1] = "log thread";
7181 
7182 	} else {
7183 		s_ibuf = s_log = NULL;
7184 	}
7185 
7186 	s_writes = create(
7187 		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
7188 
7189 	if (s_writes == NULL) {
7190 		return(false);
7191 	}
7192 
7193 	n_segments += n_writers;
7194 
7195 	for (ulint i = start + n_readers; i < n_segments; ++i) {
7196 		ut_a(i < SRV_MAX_N_IO_THREADS);
7197 		srv_io_thread_function[i] = "write thread";
7198 	}
7199 
7200 	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
7201 
7202 	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
7203 
7204 	if (s_sync == NULL) {
7205 
7206 		return(false);
7207 	}
7208 
7209 	os_aio_n_segments = n_segments;
7210 
7211 	os_aio_validate();
7212 
7213 	os_aio_segment_wait_events = static_cast<os_event_t*>(
7214 		ut_zalloc_nokey(
7215 			n_segments * sizeof *os_aio_segment_wait_events));
7216 
7217 	if (os_aio_segment_wait_events == NULL) {
7218 
7219 		return(false);
7220 	}
7221 
7222 	for (ulint i = 0; i < n_segments; ++i) {
7223 		os_aio_segment_wait_events[i] = os_event_create(0);
7224 	}
7225 
7226 	os_last_printout = ut_time_monotonic();
7227 
7228 	return(true);
7229 }
7230 
7231 /** Free the AIO arrays */
7232 void
shutdown()7233 AIO::shutdown()
7234 {
7235 	UT_DELETE(s_ibuf);
7236 	s_ibuf = NULL;
7237 
7238 	UT_DELETE(s_log);
7239 	s_log = NULL;
7240 
7241 	UT_DELETE(s_writes);
7242 	s_writes = NULL;
7243 
7244 	UT_DELETE(s_sync);
7245 	s_sync = NULL;
7246 
7247 	UT_DELETE(s_reads);
7248 	s_reads = NULL;
7249 }
7250 
7251 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
7252 
7253 /** Max disk sector size */
7254 static const ulint	MAX_SECTOR_SIZE = 4096;
7255 
7256 /**
7257 Try and get the FusionIO sector size. */
7258 void
os_fusionio_get_sector_size()7259 os_fusionio_get_sector_size()
7260 {
7261 	if (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
7262 	    || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC) {
7263 		ulint		sector_size = UNIV_SECTOR_SIZE;
7264 		char*		path = srv_data_home;
7265 		os_file_t	check_file;
7266 		byte*		ptr;
7267 		byte*		block_ptr;
7268 		char		current_dir[3];
7269 		char*		dir_end;
7270 		ulint		dir_len;
7271 		ulint		check_path_len;
7272 		char*		check_file_name;
7273 		ssize_t		ret;
7274 
7275 		/* If the srv_data_home is empty, set the path to
7276 		current dir. */
7277 		if (*path == 0) {
7278 			current_dir[0] = FN_CURLIB;
7279 			current_dir[1] = FN_LIBCHAR;
7280 			current_dir[2] = 0;
7281 			path = current_dir;
7282 		}
7283 
7284 		/* Get the path of data file */
7285 		dir_end = strrchr(path, OS_PATH_SEPARATOR);
7286 		dir_len = dir_end? dir_end - path : strlen(path);
7287 
7288 		/* allocate a new path and move the directory path to it. */
7289 		check_path_len = dir_len + sizeof "/check_sector_size";
7290 		check_file_name = static_cast<char*>(
7291 			ut_zalloc_nokey(check_path_len));
7292 		memcpy(check_file_name, path, dir_len);
7293 
7294 		/* Construct a check file name. */
7295 		strcat(check_file_name + dir_len, "/check_sector_size");
7296 
7297 		/* Create a tmp file for checking sector size. */
7298 		check_file = ::open(check_file_name,
7299 				    O_CREAT|O_TRUNC|O_WRONLY|O_DIRECT,
7300 				    S_IRWXU);
7301 
7302 		if (check_file == -1) {
7303 			ib::error()
7304 				<< "Failed to create check sector file, errno:"
7305 				<< errno << " Please confirm O_DIRECT is"
7306 				<< " supported and remove the file "
7307 				<< check_file_name << " if it exists.";
7308 			ut_free(check_file_name);
7309 			errno = 0;
7310 			return;
7311 		}
7312 
7313 		/* Try to write the file with different sector size
7314 		alignment. */
7315 		ptr = static_cast<byte*>(ut_malloc_nokey(2 * MAX_SECTOR_SIZE));
7316 
7317 		while (sector_size <= MAX_SECTOR_SIZE) {
7318 			block_ptr = static_cast<byte*>(
7319 				ut_align(ptr, sector_size));
7320 			ret = pwrite(check_file, block_ptr,
7321 				    sector_size, 0);
7322 			if (ret > 0 && (ulint) ret == sector_size) {
7323 				break;
7324 			}
7325 			sector_size *= 2;
7326 		}
7327 
7328 		/* The sector size should <= MAX_SECTOR_SIZE. */
7329 		ut_ad(sector_size <= MAX_SECTOR_SIZE);
7330 
7331 		close(check_file);
7332 		unlink(check_file_name);
7333 
7334 		ut_free(check_file_name);
7335 		ut_free(ptr);
7336 		errno = 0;
7337 
7338 		os_io_ptr_align = sector_size;
7339 	}
7340 }
7341 #endif /* !NO_FALLOCATE && UNIV_LINUX */
7342 
7343 /** Initializes the asynchronous io system. Creates one array each for ibuf
7344 and log i/o. Also creates one array each for read and write where each
7345 array is divided logically into n_readers and n_writers
7346 respectively. The caller must create an i/o handler thread for each
7347 segment in these arrays. This function also creates the sync array.
7348 No i/o handler thread needs to be created for that
7349 @param[in]	n_readers	number of reader threads
7350 @param[in]	n_writers	number of writer threads
7351 @param[in]	n_slots_sync	number of slots in the sync aio array */
7352 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)7353 os_aio_init(
7354 	ulint		n_readers,
7355 	ulint		n_writers,
7356 	ulint		n_slots_sync)
7357 {
7358 	/* Maximum number of pending aio operations allowed per segment */
7359 	ulint		limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
7360 
7361 #ifdef _WIN32
7362 	if (srv_use_native_aio) {
7363 		limit = SRV_N_PENDING_IOS_PER_THREAD;
7364 	}
7365 #endif /* _WIN32 */
7366 
7367 	ut_a(block_cache == NULL);
7368 
7369 	block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
7370 
7371 	for (Blocks::iterator it = block_cache->begin();
7372 	     it != block_cache->end();
7373 	     ++it) {
7374 
7375 		ut_a(it->m_in_use == 0);
7376 		ut_a(it->m_ptr == NULL);
7377 
7378 		/* Allocate double of max page size memory, since
7379 		compress could generate more bytes than orgininal
7380 		data. */
7381 		it->m_ptr = static_cast<byte*>(
7382 			ut_malloc_nokey(BUFFER_BLOCK_SIZE));
7383 
7384 		ut_a(it->m_ptr != NULL);
7385 	}
7386 
7387 	/* Get sector size for DIRECT_IO. In this case, we need to
7388 	know the sector size for aligning the write buffer. */
7389 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
7390 	os_fusionio_get_sector_size();
7391 #endif /* !NO_FALLOCATE && UNIV_LINUX */
7392 
7393 	return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
7394 }
7395 
7396 /** Frees the asynchronous io system. */
7397 void
os_aio_free()7398 os_aio_free()
7399 {
7400 	AIO::shutdown();
7401 
7402 	for (ulint i = 0; i < os_aio_n_segments; i++) {
7403 		os_event_destroy(os_aio_segment_wait_events[i]);
7404 	}
7405 
7406 	ut_free(os_aio_segment_wait_events);
7407 	os_aio_segment_wait_events = 0;
7408 	os_aio_n_segments = 0;
7409 
7410 	for (Blocks::iterator it = block_cache->begin();
7411 	     it != block_cache->end();
7412 	     ++it) {
7413 
7414 		ut_a(it->m_in_use == 0);
7415 		ut_free(it->m_ptr);
7416 	}
7417 
7418 	UT_DELETE(block_cache);
7419 
7420 	block_cache = NULL;
7421 }
7422 
7423 /** Wakes up all async i/o threads so that they know to exit themselves in
7424 shutdown. */
7425 void
os_aio_wake_all_threads_at_shutdown()7426 os_aio_wake_all_threads_at_shutdown()
7427 {
7428 #ifdef WIN_ASYNC_IO
7429 
7430 	AIO::wake_at_shutdown();
7431 
7432 #elif defined(LINUX_NATIVE_AIO)
7433 
7434 	/* When using native AIO interface the io helper threads
7435 	wait on io_getevents with a timeout value of 500ms. At
7436 	each wake up these threads check the server status.
7437 	No need to do anything to wake them up. */
7438 
7439 	if (srv_use_native_aio) {
7440 		return;
7441 	}
7442 
7443 #endif /* !WIN_ASYNC_AIO */
7444 
7445 	/* Fall through to simulated AIO handler wakeup if we are
7446 	not using native AIO. */
7447 
7448 	/* This loop wakes up all simulated ai/o threads */
7449 
7450 	for (ulint i = 0; i < os_aio_n_segments; ++i) {
7451 
7452 		os_event_set(os_aio_segment_wait_events[i]);
7453 	}
7454 }
7455 
/** Waits until there are no pending writes in AIO::s_writes. There can
be other, synchronous, pending writes.
This is a free-function wrapper that forwards to
AIO::wait_until_no_pending_writes(). */
void
os_aio_wait_until_no_pending_writes()
{
	AIO::wait_until_no_pending_writes();
}
7463 
7464 /** Calculates segment number for a slot.
7465 @param[in]	array		AIO wait array
7466 @param[in]	slot		slot in this array
7467 @return segment number (which is the number used by, for example,
7468 	I/O-handler threads) */
7469 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)7470 AIO::get_segment_no_from_slot(
7471 	const AIO*	array,
7472 	const Slot*	slot)
7473 {
7474 	ulint	segment;
7475 	ulint	seg_len;
7476 
7477 	if (array == s_ibuf) {
7478 		ut_ad(!srv_read_only_mode);
7479 
7480 		segment = IO_IBUF_SEGMENT;
7481 
7482 	} else if (array == s_log) {
7483 		ut_ad(!srv_read_only_mode);
7484 
7485 		segment = IO_LOG_SEGMENT;
7486 
7487 	} else if (array == s_reads) {
7488 		seg_len = s_reads->slots_per_segment();
7489 
7490 		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
7491 	} else {
7492 		ut_a(array == s_writes);
7493 
7494 		seg_len = s_writes->slots_per_segment();
7495 
7496 		segment = s_reads->m_n_segments
7497 			+ (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
7498 	}
7499 
7500 	return(segment);
7501 }
7502 
/** Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled.

The slot is marked reserved and filled in under the array mutex. For native
AIO writes at a non-zero offset the data may additionally be compressed
and/or encrypted; the mutex is released while that work is done and
re-acquired afterwards.

@param[in,out]	type		IO context
@param[in,out]	m1		message to be passed along with the AIO
				operation
@param[in,out]	m2		message to be passed along with the AIO
				operation
@param[in]	file		file handle
@param[in]	name		name of the file or path as a NUL-terminated
				string
@param[in,out]	buf		buffer where to read or from which to write
@param[in]	offset		file offset, where to read from or start writing
@param[in]	len		length of the block to read or write
@param[in]	space_id	value stored in the slot's space_id field
				(presumably the tablespace id - confirm
				against callers)
@return pointer to slot */
Slot*
AIO::reserve_slot(
	IORequest&	type,
	fil_node_t*	m1,
	void*		m2,
	pfs_os_file_t	file,
	const char*	name,
	void*		buf,
	os_offset_t	offset,
	ulint		len,
	ulint		space_id)
{
#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits. */
	ut_a((len & 0xFFFFFFFFUL) == len);
#endif /* WIN_ASYNC_IO */

	/* No need of a mutex. Only reading constant fields */
	ulint		slots_per_seg;

	ut_ad(type.validate());

	slots_per_seg = slots_per_segment();

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO */
	ulint		local_seg;

	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;

	/* Loop until the array has at least one free slot. The loop is
	left with the array mutex held. */
	for (;;) {

		acquire();

		if (m_n_reserved != m_slots.size()) {
			break;
		}

		release();

		if (!srv_use_native_aio) {
			/* If the handler threads are suspended,
			wake them so that we get more slots */

			os_aio_simulated_wake_handler_threads();
		}

		os_event_wait(m_not_full);
	}

	ulint	counter = 0;
	Slot*	slot = NULL;

	/* We start our search for an available slot from our preferred
	local segment and do a full scan of the array. We are
	guaranteed to find a slot in full scan. */
	for (ulint i = local_seg * slots_per_seg;
	     counter < m_slots.size();
	     ++i, ++counter) {

		/* Wrap around at the end of the array. */
		i %= m_slots.size();

		slot = at(i);

		if (slot->is_reserved == false) {
			break;
		}
	}

	/* We MUST always be able to get hold of a free slot. */
	ut_a(counter < m_slots.size());

	ut_a(slot->is_reserved == false);

	++m_n_reserved;

	/* First reservation: the array is no longer empty. */
	if (m_n_reserved == 1) {
		os_event_reset(m_is_empty);
	}

	/* Last free slot taken: later callers have to wait. */
	if (m_n_reserved == m_slots.size()) {
		os_event_reset(m_not_full);
	}

	slot->is_reserved = true;
	slot->reservation_time = ut_time_monotonic();
	slot->m1       = m1;
	slot->m2       = m2;
	slot->file     = file;
	slot->name     = name;
#ifdef _WIN32
	slot->len      = static_cast<DWORD>(len);
#else
	slot->len      = static_cast<ulint>(len);
#endif /* _WIN32 */
	slot->type     = type;
	slot->buf      = static_cast<byte*>(buf);
	slot->ptr      = slot->buf;
	slot->offset   = offset;
	slot->err      = DB_SUCCESS;
	slot->original_len = static_cast<uint32>(len);
	slot->io_already_done = false;
	slot->space_id = space_id;
	slot->buf_block = NULL;
	slot->encrypt_log_buf = NULL;

	/* Compression is only done here for native AIO writes at a
	non-zero offset. */
	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_compressed()) {
		ulint	compressed_len = len;

		ut_ad(!type.is_log());

		/* The slot is already marked reserved, so the scan above
		will not hand it to another caller while the mutex is
		released for the compression. */
		release();

		void* src_buf = slot->buf;
		slot->buf_block = os_file_compress_page(
			type,
			src_buf,
			&compressed_len);

		/* NOTE(review): src_buf and compressed_len appear to be
		updated by os_file_compress_page() - confirm. */
		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;
#ifdef _WIN32
		slot->len = static_cast<DWORD>(compressed_len);
#else
		slot->len = static_cast<ulint>(compressed_len);
#endif /* _WIN32 */
		slot->skip_punch_hole = !type.punch_hole();

		acquire();
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_encrypted()
	    && (type.encryption_algorithm().m_type != Encryption::KEYRING ||
		(type.encryption_algorithm().m_key != NULL &&
		 Encryption::can_page_be_keyring_encrypted(slot->buf)))) {

		ulint		encrypted_len = slot->len;
		Block*		encrypted_block;
		/* NOTE(review): not initialized here; assumed to be an
		output argument of os_file_encrypt_log() - confirm. */
		byte*		encrypt_log_buf;

		/* As above, the mutex is not held during encryption. */
		release();

		void* src_buf = slot->buf;
		if (!type.is_log()) {
			encrypted_block = os_file_encrypt_page(
				type,
				src_buf,
				&encrypted_len);

			/* Free the block from the compression step, if
			any, before replacing it. */
			if (slot->buf_block != NULL) {
				os_free_block(slot->buf_block);
			}

			slot->buf_block = encrypted_block;
		} else {
			/* Skip encrypted log file header */
			if (offset >= LOG_FILE_HDR_SIZE) {
				encrypted_block = os_file_encrypt_log(
					type,
					src_buf,
					encrypt_log_buf,
					&encrypted_len);

				if (slot->buf_block != NULL) {
					os_free_block(slot->buf_block);
				}

				slot->buf_block = encrypted_block;

				if (slot->encrypt_log_buf != NULL) {
					ut_free(slot->encrypt_log_buf);
				}

				slot->encrypt_log_buf = encrypt_log_buf;
			}
		}

		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;

#ifdef _WIN32
		slot->len = static_cast<DWORD>(encrypted_len);
#else
		slot->len = static_cast<ulint>(encrypted_len);
#endif /* _WIN32 */

		acquire();
        }

#ifdef WIN_ASYNC_IO
	{
		OVERLAPPED*	control;

		/* Split the 64-bit file offset into the two 32-bit
		halves of the OVERLAPPED structure. */
		control = &slot->control;
		control->Offset = (DWORD) offset & 0xFFFFFFFF;
		control->OffsetHigh = (DWORD) (offset >> 32);

		ResetEvent(slot->handle);
	}
#elif defined(LINUX_NATIVE_AIO)

	/* If we are not using native AIO skip this part. */
	if (srv_use_native_aio) {

		off_t		aio_offset;

		/* Check if we are dealing with 64 bit arch.
		If not then make sure that offset fits in 32 bits. */
		aio_offset = (off_t) offset;

		ut_a(sizeof(aio_offset) >= sizeof(offset)
		     || ((os_offset_t) aio_offset) == offset);

		struct iocb*	iocb = &slot->control;

		/* Prepare the control block with the (possibly
		compressed/encrypted) buffer and length of the slot. */
		if (type.is_read()) {
			io_prep_pread(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		} else {
			ut_ad(type.is_write());
			io_prep_pwrite(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		}

		/* Lets the slot be found again from the control block. */
		iocb->data = slot;

		slot->n_bytes = 0;
		slot->ret = 0;
	}
#endif /* LINUX_NATIVE_AIO */

	release();

	return(slot);
}
7762 
7763 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
7764 @param[in]	global_segment	The number of the segment in the AIO arrays */
7765 void
wake_simulated_handler_thread(ulint global_segment)7766 AIO::wake_simulated_handler_thread(ulint global_segment)
7767 {
7768 	ut_ad(!srv_use_native_aio);
7769 
7770 	AIO*	array;
7771 	ulint	segment = get_array_and_local_segment(&array, global_segment);
7772 
7773 	array->wake_simulated_handler_thread(global_segment, segment);
7774 }
7775 
7776 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
7777 for a local segment in the AIO array.
7778 @param[in]	global_segment	The number of the segment in the AIO arrays
7779 @param[in]	segment		The local segment in the AIO array */
7780 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)7781 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
7782 {
7783 	ut_ad(!srv_use_native_aio);
7784 
7785 	ulint	n = slots_per_segment();
7786 	ulint	offset = segment * n;
7787 
7788 	/* Look through n slots after the segment * n'th slot */
7789 
7790 	acquire();
7791 
7792 	const Slot*	slot = at(offset);
7793 
7794 	for (ulint i = 0; i < n; ++i, ++slot) {
7795 
7796 		if (slot->is_reserved) {
7797 
7798 			/* Found an i/o request */
7799 
7800 			release();
7801 
7802 			os_event_t	event;
7803 
7804 			event = os_aio_segment_wait_events[global_segment];
7805 
7806 			os_event_set(event);
7807 
7808 			return;
7809 		}
7810 	}
7811 
7812 	release();
7813 }
7814 
7815 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
7816 void
os_aio_simulated_wake_handler_threads()7817 os_aio_simulated_wake_handler_threads()
7818 {
7819 	if (srv_use_native_aio) {
7820 		/* We do not use simulated aio: do nothing */
7821 
7822 		return;
7823 	}
7824 
7825 	os_aio_recommend_sleep_for_read_threads	= false;
7826 
7827 	for (ulint i = 0; i < os_aio_n_segments; i++) {
7828 		AIO::wake_simulated_handler_thread(i);
7829 	}
7830 }
7831 
7832 /** Select the IO slot array
7833 @param[in]	type		Type of IO, READ or WRITE
7834 @param[in]	read_only	true if running in read-only mode
7835 @param[in]	mode		IO mode
7836 @return slot array or NULL if invalid mode specified */
7837 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)7838 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
7839 {
7840 	AIO*	array;
7841 
7842 	ut_ad(type.validate());
7843 
7844 	switch (mode) {
7845 	case OS_AIO_NORMAL:
7846 
7847 		array = type.is_read() ? AIO::s_reads : AIO::s_writes;
7848 		break;
7849 
7850 	case OS_AIO_IBUF:
7851 		ut_ad(type.is_read());
7852 
7853 		/* Reduce probability of deadlock bugs in connection with ibuf:
7854 		do not let the ibuf i/o handler sleep */
7855 
7856 		type.clear_do_not_wake();
7857 
7858 		array = read_only ? AIO::s_reads : AIO::s_ibuf;
7859 		break;
7860 
7861 	case OS_AIO_LOG:
7862 
7863 		array = read_only ? AIO::s_reads : AIO::s_log;
7864 		break;
7865 
7866 	case OS_AIO_SYNC:
7867 
7868 		array = AIO::s_sync;
7869 #if defined(LINUX_NATIVE_AIO)
7870 		/* In Linux native AIO we don't use sync IO array. */
7871 		ut_a(!srv_use_native_aio);
7872 #endif /* LINUX_NATIVE_AIO */
7873 		break;
7874 
7875 	default:
7876 		ut_error;
7877 		array = NULL; /* Eliminate compiler warning */
7878 	}
7879 
7880 	return(array);
7881 }
7882 
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!

If the operation failed and os_file_handle_error() asks for a retry, the
request is repeated synchronously before the slot is released.

@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example; set to NULL when the function
				returns early during shutdown
@param[out]	m2		callback message; set to NULL when the
				function returns early during shutdown
@param[out]	type		copy of the IO request stored in the slot; not
				written when the function returns early during
				shutdown
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	dberr_t		err = DB_IO_ERROR;
	AIO*		array;
	ulint		orig_seg = segment;

	if (segment == ULINT_UNDEFINED) {
		segment = 0;
		array = AIO::sync_array();
	} else {
		segment = AIO::get_array_and_local_segment(&array, segment);
	}

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());

	if (array == AIO::sync_array()) {

		/* Sync AIO: wait for exactly the slot the caller named. */
		WaitForSingleObject(array->at(pos)->handle, INFINITE);

	} else {
		if (orig_seg != ULINT_UNDEFINED) {
			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
		}

		/* Wait for any slot of this segment; the return value
		identifies the signaled handle within the segment. */
		pos = WaitForMultipleObjects(
			(DWORD) array->slots_per_segment(),
			array->handles(segment),
			FALSE, INFINITE);
	}

	array->acquire();

	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && array->is_empty()
	    && !buf_page_cleaner_is_active) {

		/* Woken up for shutdown with nothing left to process. */
		*m1 = NULL;
		*m2 = NULL;

		array->release();

		return(DB_SUCCESS);
	}

	ulint	n = array->slots_per_segment();

	/* A successful wait on n handles returns a value in the range
	[WAIT_OBJECT_0, WAIT_OBJECT_0 + n - 1], and in the sync case pos is
	a slot position below n. The upper bound is therefore exclusive:
	pos == n would address a slot outside this segment. */
	ut_a(pos >= WAIT_OBJECT_0 && pos < WAIT_OBJECT_0 + n);

	slot = array->at(pos + segment * n);

	ut_a(slot->is_reserved);

	if (orig_seg != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(
			orig_seg, "get windows aio return value");
	}

	BOOL	ret;
	ret = GetOverlappedResult(
		slot->file.m_file, &slot->control, &slot->n_bytes, TRUE);
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	BOOL	retry = FALSE;

	if (ret && slot->n_bytes == slot->len) {

		err = DB_SUCCESS;

	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		retry = TRUE;

	} else {

		err = DB_IO_ERROR;
	}

	array->release();

	if (retry) {
		/* Retry failed read/write operation synchronously.
		No need to hold array->m_mutex. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		struct PSI_file_locker* locker = NULL;
		PSI_file_locker_state   state;
		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
			/* AIO was queued successfully!
			We want a synchronous I/O operation on a
			file where we also use async I/O: in Windows
			we must use the same wait mechanism as for
			async I/O */

			ret = GetOverlappedResult(
				slot->file.m_file, &slot->control, &slot->n_bytes,
				TRUE);
			n_bytes = ret ? slot->n_bytes : -1;
		}

		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	/* The slot is freed here, whatever the outcome. */
	array->release_with_mutex(slot);

	return(err);
}
#endif /* WIN_ASYNC_IO */
8050 
8051 /**
8052 NOTE! Use the corresponding macro os_aio(), not directly this function!
8053 Requests an asynchronous i/o operation.
8054 @param[in]	type		IO request context
8055 @param[in]	mode		IO mode
8056 @param[in]	name		Name of the file or path as NUL terminated
8057 				string
8058 @param[in]	file		Open file handle
8059 @param[out]	buf		buffer where to read
8060 @param[in]	offset		file offset where to read
8061 @param[in]	n		number of bytes to read
8062 @param[in]	read_only	if true read only mode checks are enforced
8063 @param[in,out]	m1		Message for the AIO handler, (can be used to
8064 				identify a completed AIO operation); ignored
8065 				if mode is OS_AIO_SYNC
8066 @param[in,out]	m2		message for the AIO handler (can be used to
8067 				identify a completed AIO operation); ignored
8068 				if mode is OS_AIO_SYNC
8069 @param[in]	should_buffer	Whether to buffer an aio request.
8070 				AIO read ahead uses this. If you plan to
8071 				use this parameter, make sure you remember to
8072 				call os_aio_dispatch_read_array_submit()
8073 				when you're ready to commit all your
8074 				requests.
8075 
8076 @return DB_SUCCESS or error code */
8077 dberr_t
os_aio_func(IORequest & type,ulint mode,const char * name,pfs_os_file_t file,void * buf,os_offset_t offset,ulint n,bool read_only,fil_node_t * m1,void * m2,ulint space_id,trx_t * trx,bool should_buffer)8078 os_aio_func(
8079 	IORequest&	type,
8080 	ulint		mode,
8081 	const char*	name,
8082 	pfs_os_file_t	file,
8083 	void*		buf,
8084 	os_offset_t	offset,
8085 	ulint		n,
8086 	bool		read_only,
8087 	fil_node_t*	m1,
8088 	void*		m2,
8089 	ulint		space_id,
8090 	trx_t*		trx,
8091 	bool		should_buffer)
8092 {
8093 #ifdef WIN_ASYNC_IO
8094 	BOOL		ret = TRUE;
8095 #endif /* WIN_ASYNC_IO */
8096 
8097 	ut_ad(n > 0);
8098 	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
8099 	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
8100 	ut_ad(os_aio_validate_skip());
8101 
8102 #ifdef WIN_ASYNC_IO
8103 	ut_ad((n & 0xFFFFFFFFUL) == n);
8104 #endif /* WIN_ASYNC_IO */
8105 
8106 	if (mode == OS_AIO_SYNC
8107 #ifdef WIN_ASYNC_IO
8108 	    && !srv_use_native_aio
8109 #endif /* WIN_ASYNC_IO */
8110 	    ) {
8111 		/* This is actually an ordinary synchronous read or write:
8112 		no need to use an i/o-handler thread. NOTE that if we use
8113 		Windows async i/o, Windows does not allow us to use
8114 		ordinary synchronous os_file_read etc. on the same file,
8115 		therefore we have built a special mechanism for synchronous
8116 		wait in the Windows case.
8117 		Also note that the Performance Schema instrumentation has
8118 		been performed by current os_aio_func()'s wrapper function
8119 		pfs_os_aio_func(). So we would no longer need to call
8120 		Performance Schema instrumented os_file_read() and
8121 		os_file_write(). Instead, we should use os_file_read_func()
8122 		and os_file_write_func() */
8123 
8124 		if (type.is_read()) {
8125 			return(os_file_read_func(type, file.m_file, buf,
8126 						 offset, n, trx));
8127 		}
8128 
8129 		ut_ad(type.is_write());
8130 		return(os_file_write_func(type, name, file.m_file, buf, offset, n));
8131 	}
8132 
8133 try_again:
8134 
8135 	AIO*	array;
8136 
8137 	array = AIO::select_slot_array(type, read_only, mode);
8138 
8139 	Slot*	slot;
8140 
8141 	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n,
8142 				   space_id);
8143 
8144 	if (type.is_read()) {
8145 		trx_stats::bump_io_read(trx, n);
8146 
8147 		if (srv_use_native_aio) {
8148 
8149 			++os_n_file_reads;
8150 
8151 			os_bytes_read_since_printout += n;
8152 #ifdef WIN_ASYNC_IO
8153 			ret = ReadFile(
8154 				file.m_file, slot->ptr, slot->len,
8155 				&slot->n_bytes, &slot->control);
8156 #elif defined(LINUX_NATIVE_AIO)
8157 			if (!array->linux_dispatch(slot, should_buffer)) {
8158 				goto err_exit;
8159 			}
8160 #endif /* WIN_ASYNC_IO */
8161 		} else if (type.is_wake()) {
8162 			AIO::wake_simulated_handler_thread(
8163 				AIO::get_segment_no_from_slot(array, slot));
8164 		}
8165 	} else if (type.is_write()) {
8166 
8167 		if (srv_use_native_aio) {
8168 			++os_n_file_writes;
8169 
8170 #ifdef WIN_ASYNC_IO
8171 			ret = WriteFile(
8172 				file.m_file, slot->ptr, slot->len,
8173 				&slot->n_bytes, &slot->control);
8174 #elif defined(LINUX_NATIVE_AIO)
8175 			if (!array->linux_dispatch(slot, false)) {
8176 				goto err_exit;
8177 			}
8178 #endif /* WIN_ASYNC_IO */
8179 
8180 		} else if (type.is_wake()) {
8181 			AIO::wake_simulated_handler_thread(
8182 				AIO::get_segment_no_from_slot(array, slot));
8183 		}
8184 	} else {
8185 		ut_error;
8186 	}
8187 
8188 #ifdef WIN_ASYNC_IO
8189 	if (srv_use_native_aio) {
8190 		if ((ret && slot->len == slot->n_bytes)
8191 		     || (!ret && GetLastError() == ERROR_IO_PENDING)) {
8192 			/* aio was queued successfully! */
8193 
8194 			if (mode == OS_AIO_SYNC) {
8195 				IORequest	dummy_type;
8196 				void*		dummy_mess2;
8197 				struct fil_node_t* dummy_mess1;
8198 
8199 				/* We want a synchronous i/o operation on a
8200 				file where we also use async i/o: in Windows
8201 				we must use the same wait mechanism as for
8202 				async i/o */
8203 
8204 				return(os_aio_windows_handler(
8205 					ULINT_UNDEFINED, slot->pos,
8206 					&dummy_mess1, &dummy_mess2,
8207 					&dummy_type));
8208 			}
8209 
8210 			return(DB_SUCCESS);
8211 		}
8212 
8213 		goto err_exit;
8214 	}
8215 #endif /* WIN_ASYNC_IO */
8216 
8217 	/* AIO request was queued successfully! */
8218 	return(DB_SUCCESS);
8219 
8220 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
8221 err_exit:
8222 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
8223 
8224 	array->release_with_mutex(slot);
8225 
8226 	if (os_file_handle_error(
8227 		name, type.is_read() ? "aio read" : "aio write")) {
8228 
8229 		goto try_again;
8230 	}
8231 
8232 	return(DB_IO_ERROR);
8233 }
8234 
8235 /** Simulated AIO handler for reaping IO requests */
8236 class SimulatedAIOHandler {
8237 
8238 public:
8239 
	/** Constructor
	Value-initializes the selection state, sets the lowest offset to
	IB_UINT64_MAX and sizes m_slots to hold OS_AIO_MERGE_N_CONSECUTIVE
	entries.
	@param[in,out]	array	The AIO array
	@param[in]	segment	Local segment in the array */
	SimulatedAIOHandler(AIO* array, ulint segment)
		:
		m_oldest(),
		m_n_elems(),
		m_lowest_offset(IB_UINT64_MAX),
		m_array(array),
		m_n_slots(),
		m_segment(segment),
		m_ptr(),
		m_buf()
	{
		/* Debug-only sanity check on the local segment number */
		ut_ad(m_segment < 100);

		m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
	}
8258 
	/** Destructor
	Frees the combined I/O buffer if allocate_buffer() allocated one. */
	~SimulatedAIOHandler()
	{
		if (m_ptr != NULL) {
			ut_free(m_ptr);
		}
	}
8266 
8267 	/** Reset the state of the handler
8268 	@param[in]	n_slots	Number of pending AIO operations supported */
init(ulint n_slots)8269 	void init(ulint n_slots)
8270 	{
8271 		m_oldest = 0;
8272 		m_n_elems = 0;
8273 		m_n_slots = n_slots;
8274 		m_lowest_offset = IB_UINT64_MAX;
8275 
8276 		if (m_ptr != NULL) {
8277 			ut_free(m_ptr);
8278 			m_ptr = m_buf = NULL;
8279 		}
8280 
8281 		m_slots[0] = NULL;
8282 	}
8283 
8284 	/** Check if there is a slot for which the i/o has already been done
8285 	@param[out]	n_reserved	Number of reserved slots
8286 	@return the first completed slot that is found. */
check_completed(ulint * n_reserved)8287 	Slot* check_completed(ulint* n_reserved)
8288 	{
8289 		ulint	offset = m_segment * m_n_slots;
8290 
8291 		*n_reserved = 0;
8292 
8293 		Slot*	slot;
8294 
8295 		slot = m_array->at(offset);
8296 
8297 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
8298 
8299 			if (slot->is_reserved) {
8300 
8301 				if (slot->io_already_done) {
8302 
8303 					ut_a(slot->is_reserved);
8304 
8305 					return(slot);
8306 				}
8307 
8308 				++*n_reserved;
8309 			}
8310 		}
8311 
8312 		return(NULL);
8313 	}
8314 
8315 	/** If there are at least 2 seconds old requests, then pick the
8316 	oldest one to prevent starvation.  If several requests have the
8317 	same age, then pick the one at the lowest offset.
8318 	@return true if request was selected */
select()8319 	bool select()
8320 	{
8321 		if (!select_oldest()) {
8322 
8323 			return(select_lowest_offset());
8324 		}
8325 
8326 		return(true);
8327 	}
8328 
8329 	/** Check if there are several consecutive blocks
8330 	to read or write. Merge them if found. */
merge()8331 	void merge()
8332 	{
8333 		/* if m_n_elems != 0, then we have assigned
8334 		something valid to consecutive_ios[0] */
8335 		ut_ad(m_n_elems != 0);
8336 		ut_ad(first_slot() != NULL);
8337 
8338 		Slot*	slot = first_slot();
8339 
8340 		while (!merge_adjacent(slot)) {
8341 			/* No op */
8342 		}
8343 	}
8344 
8345 	/** We have now collected n_consecutive I/O requests
8346 	in the array; allocate a single buffer which can hold
8347 	all data, and perform the I/O
8348 	@return the length of the buffer */
allocate_buffer()8349 	ulint allocate_buffer()
8350 		MY_ATTRIBUTE((warn_unused_result))
8351 	{
8352 		ulint	len;
8353 		Slot*	slot = first_slot();
8354 
8355 		ut_ad(m_ptr == NULL);
8356 
8357 		if (slot->type.is_read() && m_n_elems > 1) {
8358 
8359 			len = 0;
8360 
8361 			for (ulint i = 0; i < m_n_elems; ++i) {
8362 				len += m_slots[i]->len;
8363 			}
8364 
8365 			m_ptr = static_cast<byte*>(
8366 				ut_malloc_nokey(len + UNIV_PAGE_SIZE));
8367 
8368 			m_buf = static_cast<byte*>(
8369 				ut_align(m_ptr, UNIV_PAGE_SIZE));
8370 
8371 		} else {
8372 			len = first_slot()->len;
8373 			m_buf = first_slot()->buf;
8374 		}
8375 
8376 		return(len);
8377 	}
8378 
	/** We have to compress the individual pages and punch
	holes in them on a page by page basis when writing to
	tables that can be compressed at the IO level.
	@param[in]	len		Value returned by allocate_buffer */
copy_to_buffer(ulint len)8383 	void copy_to_buffer(ulint len)
8384 	{
8385 		Slot*	slot = first_slot();
8386 
8387 		if (len > slot->len && slot->type.is_write()) {
8388 
8389 			byte*	ptr = m_buf;
8390 
8391 			ut_ad(ptr != slot->buf);
8392 
8393 			/* Copy the buffers to the combined buffer */
8394 			for (ulint i = 0; i < m_n_elems; ++i) {
8395 
8396 				slot = m_slots[i];
8397 
8398 				memmove(ptr, slot->buf, slot->len);
8399 
8400 				ptr += slot->len;
8401 			}
8402 		}
8403 	}
8404 
	/** Do the I/O with ordinary, synchronous i/o functions, one call
	per selected slot. */
io()8407 	void io()
8408 	{
8409 		if (first_slot()->type.is_write()) {
8410 
8411 			for (ulint i = 0; i < m_n_elems; ++i) {
8412 				write(m_slots[i]);
8413 			}
8414 
8415 		} else {
8416 
8417 			for (ulint i = 0; i < m_n_elems; ++i) {
8418 				read(m_slots[i]);
8419 			}
8420 		}
8421 	}
8422 
	/** Post-I/O hook; os_aio_simulated_handler() calls it right after
	io(). The body is intentionally empty: nothing is done to the pages
	here. */
	void io_complete()
	{
		// Note: For non-compressed tables. Not required
		// for correctness.
	}
8429 
8430 	/** Mark the i/os done in slots */
done()8431 	void done()
8432 	{
8433 		for (ulint i = 0; i < m_n_elems; ++i) {
8434 			m_slots[i]->io_already_done = true;
8435 		}
8436 	}
8437 
8438 	/** @return the first slot in the consecutive array */
first_slot()8439 	Slot* first_slot()
8440 		MY_ATTRIBUTE((warn_unused_result))
8441 	{
8442 		ut_a(m_n_elems > 0);
8443 
8444 		return(m_slots[0]);
8445 	}
8446 
	/** Decide whether the caller should scan the slots now. For a read
	array with os_aio_recommend_sleep_for_read_threads set, the definition
	first waits on the event and returns 0; otherwise it returns the
	number of slots per segment of the array.
	@param[in]	global_segment	The global segment; used to report
					the state of the i/o thread
	@param[in,out]	event		Event that is waited on in the
					sleep case
	@return 0 after having waited on the event, otherwise the number
	of slots per segment */
	ulint check_pending(
		ulint		global_segment,
		os_event_t	event)
		MY_ATTRIBUTE((warn_unused_result));
8455 private:
8456 
8457 	/** Do the file read
8458 	@param[in,out]	slot		Slot that has the IO context */
read(Slot * slot)8459 	void read(Slot* slot)
8460 	{
8461 		dberr_t	err = os_file_read_func(
8462 			slot->type,
8463 			slot->file.m_file,
8464 			slot->ptr,
8465 			slot->offset,
8466 			slot->len, NULL);
8467 		ut_a(err == DB_SUCCESS);
8468 	}
8469 
	/** Do the file write
	@param[in,out]	slot		Slot that has the IO context */
write(Slot * slot)8472 	void write(Slot* slot)
8473 	{
8474 		dberr_t	err = os_file_write_func(
8475 			slot->type,
8476 			slot->name,
8477 			slot->file.m_file,
8478 			slot->ptr,
8479 			slot->offset,
8480 			slot->len);
8481 		ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
8482 	}
8483 
8484 	/** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const8485 	bool adjacent(const Slot* s1, const Slot* s2) const
8486 	{
8487 		return(s1 != s2
8488 		       && s1->file.m_file == s2->file.m_file
8489 		       && s2->offset == s1->offset + s1->len
8490 		       && s1->type == s2->type);
8491 	}
8492 
8493 	/** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)8494 	bool merge_adjacent(Slot*& current)
8495 	{
8496 		Slot*	slot;
8497 		ulint	offset = m_segment * m_n_slots;
8498 
8499 		slot = m_array->at(offset);
8500 
8501 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
8502 
8503 			if (slot->is_reserved && adjacent(current, slot)) {
8504 
8505 				current = slot;
8506 
8507 				/* Found a consecutive i/o request */
8508 
8509 				m_slots[m_n_elems] = slot;
8510 
8511 				++m_n_elems;
8512 
8513 				return(m_n_elems >= m_slots.capacity());
8514 			}
8515 		}
8516 
8517 		return(true);
8518 	}
8519 
	/** There were no old requests. Look for the reserved I/O request at
	the lowest offset among the slots of this segment (the offsets are
	compared as they are, without masking).
	@return true if a request was selected */
select_lowest_offset()8523 	bool select_lowest_offset()
8524 	{
8525 		ut_ad(m_n_elems == 0);
8526 
8527 		ulint	offset = m_segment * m_n_slots;
8528 
8529 		m_lowest_offset = IB_UINT64_MAX;
8530 
8531 		for (ulint i = 0; i < m_n_slots; ++i) {
8532 			Slot*	slot;
8533 
8534 			slot = m_array->at(i + offset);
8535 
8536 			if (slot->is_reserved
8537 			    && slot->offset < m_lowest_offset) {
8538 
8539 				/* Found an i/o request */
8540 				m_slots[0] = slot;
8541 
8542 				m_n_elems = 1;
8543 
8544 				m_lowest_offset = slot->offset;
8545 			}
8546 		}
8547 
8548 		return(m_n_elems > 0);
8549 	}
8550 
8551 	/** Select the slot if it is older than the current oldest slot.
8552 	@param[in]	slot		The slot to check */
select_if_older(Slot * slot)8553 	void select_if_older(Slot* slot)
8554 	{
8555 		int64_t time_diff = ut_time_monotonic() -
8556 					slot->reservation_time;
8557 
8558 		const uint64_t age = time_diff > 0 ? (uint64_t) time_diff : 0;
8559 
8560 		if ((age >= 2 && age > m_oldest)
8561 		    || (age >= 2
8562 			&& age == m_oldest
8563 			&& slot->offset < m_lowest_offset)) {
8564 
8565 			/* Found an i/o request */
8566 			m_slots[0] = slot;
8567 
8568 			m_n_elems = 1;
8569 
8570 			m_oldest = age;
8571 
8572 			m_lowest_offset = slot->offset;
8573 		}
8574 	}
8575 
	/** Select the oldest slot in the array
	@return true if oldest slot found */
select_oldest()8578 	bool select_oldest()
8579 	{
8580 		ut_ad(m_n_elems == 0);
8581 
8582 		Slot*	slot;
8583 		ulint	offset = m_n_slots * m_segment;
8584 
8585 		slot = m_array->at(offset);
8586 
8587 		for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
8588 
8589 			if (slot->is_reserved) {
8590 				select_if_older(slot);
8591 			}
8592 		}
8593 
8594 		return(m_n_elems > 0);
8595 	}
8596 
	/** Container of pointers to the slots selected for one round of I/O */
	typedef std::vector<Slot*> slots_t;

private:
	/** Age of the request chosen by select_if_older() */
	ulint		m_oldest;

	/** Number of entries of m_slots that are in use */
	ulint		m_n_elems;

	/** Offset of the currently selected request; used by the
	selection functions to compare candidates */
	os_offset_t	m_lowest_offset;

	/** The AIO array whose slots are scanned */
	AIO*		m_array;

	/** Number of slots scanned per segment */
	ulint		m_n_slots;

	/** Segment served by this handler; its first slot is at index
	m_segment * m_n_slots of the array */
	ulint		m_segment;

	/** The selected slots; m_slots[0] is the first selected request */
	slots_t		m_slots;

	/** Memory allocated by allocate_buffer() for a combined buffer */
	byte*		m_ptr;

	/** Buffer chosen by allocate_buffer(): either m_ptr aligned to
	UNIV_PAGE_SIZE or the buffer of the first slot */
	byte*		m_buf;
8612 };
8613 
8614 /** Wait for I/O requests
8615 @return the number of slots */
8616 ulint
check_pending(ulint global_segment,os_event_t event)8617 SimulatedAIOHandler::check_pending(
8618 	ulint		global_segment,
8619 	os_event_t	event)
8620 {
8621 	/* NOTE! We only access constant fields in os_aio_array.
8622 	Therefore we do not have to acquire the protecting mutex yet */
8623 
8624 	ut_ad(os_aio_validate_skip());
8625 
8626 	ut_ad(m_segment < m_array->get_n_segments());
8627 
8628 	/* Look through n slots after the segment * n'th slot */
8629 
8630 	if (AIO::is_read(m_array)
8631 	    && os_aio_recommend_sleep_for_read_threads) {
8632 
8633 		/* Give other threads chance to add several
8634 		I/Os to the array at once. */
8635 
8636 		srv_set_io_thread_op_info(
8637 			global_segment, "waiting for i/o request");
8638 
8639 		os_event_wait(event);
8640 
8641 		return(0);
8642 	}
8643 
8644 	return(m_array->slots_per_segment());
8645 }
8646 
/** Does simulated AIO. This function should be called by an i/o-handler
thread. It loops until it either finds a slot whose i/o has already been
done, or selects pending requests (merging adjacent ones) and performs them
with synchronous i/o. It then returns the messages of one slot and releases
that slot.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[out]	type	IO context; set to the type of the returned slot
@return DB_SUCCESS (every return path in this function returns DB_SUCCESS);
*m1 and *m2 are set to NULL when the function returns because of shutdown */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	/* Translate the global segment into the array that owns it and
	the segment number local to that array. */
	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	/* The loop is left (with the array mutex held) as soon as there is
	a completed slot or a request has been selected. */
	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		ulint	n_slots = handler.check_pending(global_segment, event);

		/* 0 means check_pending() waited on the event: start over */
		if (n_slots == 0) {
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* A slot whose i/o has already been done */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* A pending request was selected; slot is NULL here */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. The event is reset while the array
		mutex is still held. */

		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* slot != NULL: a slot that has already completed its IO was found.
	slot == NULL: a request was selected and its i/o is done below. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		handler.io_complete();

		array->acquire();

		/* Mark all the selected slots as having their i/o done */
		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	/* Free the slot, then give up the array mutex */
	array->release(slot);

	array->release();

	return(DB_SUCCESS);
}
8799 
8800 /** Get the total number of pending IOs
8801 @return the total number of pending IOs */
8802 ulint
total_pending_io_count()8803 AIO::total_pending_io_count()
8804 {
8805 	ulint	count = s_reads->pending_io_count();
8806 
8807 	if (s_writes != NULL) {
8808 		count += s_writes->pending_io_count();
8809 	}
8810 
8811 	if (s_ibuf != NULL) {
8812 		count += s_ibuf->pending_io_count();
8813 	}
8814 
8815 	if (s_log != NULL) {
8816 		count += s_log->pending_io_count();
8817 	}
8818 
8819 	if (s_sync != NULL) {
8820 		count += s_sync->pending_io_count();
8821 	}
8822 
8823 	return(count);
8824 }
8825 
8826 /** Validates the consistency the aio system.
8827 @return true if ok */
8828 static
8829 bool
os_aio_validate()8830 os_aio_validate()
8831 {
8832 	/* The methods countds and validates, we ignore the count. */
8833 	AIO::total_pending_io_count();
8834 
8835 	return(true);
8836 }
8837 
8838 /** Prints pending IO requests per segment of an aio array.
8839 We probably don't need per segment statistics but they can help us
8840 during development phase to see if the IO requests are being
8841 distributed as expected.
8842 @param[in,out]	file		File where to print
8843 @param[in]	segments	Pending IO array */
8844 void
print_segment_info(FILE * file,const ulint * segments)8845 AIO::print_segment_info(
8846 	FILE*		file,
8847 	const ulint*	segments)
8848 {
8849 	ut_ad(m_n_segments > 0);
8850 
8851 	if (m_n_segments > 1) {
8852 
8853 		fprintf(file, " [");
8854 
8855 		for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
8856 
8857 			if (i != 0) {
8858 				fprintf(file, ", ");
8859 			}
8860 
8861 			fprintf(file, ULINTPF, *segments);
8862 		}
8863 
8864 		fprintf(file, "] ");
8865 	}
8866 }
8867 
8868 /** Prints info about the aio array.
8869 @param[in,out]	file		Where to print */
8870 void
print(FILE * file)8871 AIO::print(FILE* file)
8872 {
8873 	ulint	count = 0;
8874 	ulint	n_res_seg[SRV_MAX_N_IO_THREADS];
8875 
8876 	mutex_enter(&m_mutex);
8877 
8878 	ut_a(!m_slots.empty());
8879 	ut_a(m_n_segments > 0);
8880 
8881 	memset(n_res_seg, 0x0, sizeof(n_res_seg));
8882 
8883 	for (ulint i = 0; i < m_slots.size(); ++i) {
8884 		Slot&	slot = m_slots[i];
8885 		ulint	segment = (i * m_n_segments) / m_slots.size();
8886 
8887 		if (slot.is_reserved) {
8888 
8889 			++count;
8890 
8891 			++n_res_seg[segment];
8892 
8893 			ut_a(slot.len > 0);
8894 		}
8895 	}
8896 
8897 	ut_a(m_n_reserved == count);
8898 
8899 	print_segment_info(file, n_res_seg);
8900 
8901 	mutex_exit(&m_mutex);
8902 }
8903 
8904 /** Print all the AIO segments
8905 @param[in,out]	file		Where to print */
8906 void
print_all(FILE * file)8907 AIO::print_all(FILE* file)
8908 {
8909 	s_reads->print(file);
8910 
8911 	if (s_writes != NULL) {
8912 		fputs(", aio writes:", file);
8913 		s_writes->print(file);
8914 	}
8915 
8916 	if (s_ibuf != NULL) {
8917 		fputs(",\n ibuf aio reads:", file);
8918 		s_ibuf->print(file);
8919 	}
8920 
8921 	if (s_log != NULL) {
8922 		fputs(", log i/o's:", file);
8923 		s_log->print(file);
8924 	}
8925 
8926 	if (s_sync != NULL) {
8927 		fputs(", sync i/o's:", file);
8928 		s_sync->print(file);
8929 	}
8930 }
8931 
/** Prints info of the aio arrays: the state of every i/o thread, the
pending requests of the arrays, the file operation counters, and the
per-second rates since the previous printout. The counters that serve as
the baseline for the next printout are updated at the end.
@param[in,out]	file		file where to print */
void
os_aio_print(FILE*	file)
{
	ib_time_monotonic_t 		current_time;
	double	 			time_elapsed;
	double				avg_bytes_read;

	/* One line per i/o thread: its current state and its function */
	for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
		fprintf(file, "I/O thread %lu state: %s (%s)",
			(ulong) i,
			srv_io_thread_op_info[i],
			srv_io_thread_function[i]);

#ifndef _WIN32
		if (os_event_is_set(os_aio_segment_wait_events[i])) {
			fprintf(file, " ev set");
		}
#endif /* !_WIN32 */

		fprintf(file, "\n");
	}

	fputs("Pending normal aio reads:", file);

	AIO::print_all(file);

	putc('\n', file);
	current_time = ut_time_monotonic();
	/* 0.001 is added to the elapsed time, presumably so that the
	divisions below never divide by zero. */
	time_elapsed = 0.001 + (current_time - os_last_printout);

	fprintf(file,
		"Pending flushes (fsync) log: " ULINTPF "; "
		"buffer pool: " ULINTPF "\n"
		ULINTPF " OS file reads, "
		ULINTPF " OS file writes, "
		ULINTPF " OS fsyncs\n",
		fil_n_pending_log_flushes,
		fil_n_pending_tablespace_flushes,
		os_n_file_reads,
		os_n_file_writes,
		os_n_fsyncs);

	/* This line is only printed when something is pending */
	if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
		fprintf(file,
			ULINTPF " pending preads, "
			ULINTPF " pending pwrites\n",
			os_n_pending_reads,
			os_n_pending_writes);
	}

	/* Average over the reads done since the previous printout; 0 when
	there were none. */
	if (os_n_file_reads == os_n_file_reads_old) {
		avg_bytes_read = 0.0;
	} else {
		avg_bytes_read = (double) os_bytes_read_since_printout
			/ (os_n_file_reads - os_n_file_reads_old);
	}

	fprintf(file,
		"%.2f reads/s, %lu avg bytes/read,"
		" %.2f writes/s, %.2f fsyncs/s\n",
		(os_n_file_reads - os_n_file_reads_old)
		/ time_elapsed,
		(ulong) avg_bytes_read,
		(os_n_file_writes - os_n_file_writes_old)
		/ time_elapsed,
		(os_n_fsyncs - os_n_fsyncs_old)
		/ time_elapsed);

	/* Remember the current values as the baseline of the next printout */
	os_n_file_reads_old = os_n_file_reads;
	os_n_file_writes_old = os_n_file_writes;
	os_n_fsyncs_old = os_n_fsyncs;
	os_bytes_read_since_printout = 0;

	os_last_printout = current_time;
}
9009 
9010 /** Refreshes the statistics used to print per-second averages. */
9011 void
os_aio_refresh_stats()9012 os_aio_refresh_stats()
9013 {
9014 	os_n_fsyncs_old = os_n_fsyncs;
9015 
9016 	os_bytes_read_since_printout = 0;
9017 
9018 	os_n_file_reads_old = os_n_file_reads;
9019 
9020 	os_n_file_writes_old = os_n_file_writes;
9021 
9022 	os_n_fsyncs_old = os_n_fsyncs;
9023 
9024 	os_bytes_read_since_printout = 0;
9025 
9026 	os_last_printout = ut_time_monotonic();
9027 }
9028 
9029 /** Checks that all slots in the system have been freed, that is, there are
9030 no pending io operations.
9031 @return true if all free */
9032 bool
os_aio_all_slots_free()9033 os_aio_all_slots_free()
9034 {
9035 	return(AIO::total_pending_io_count() == 0);
9036 }
9037 
9038 #ifdef UNIV_DEBUG
9039 /** Prints all pending IO for the array
9040 @param[in]	file	file where to print
9041 @param[in]	array	array to process */
9042 void
to_file(FILE * file) const9043 AIO::to_file(FILE* file) const
9044 {
9045 	acquire();
9046 
9047 	fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
9048 
9049 	for (ulint i = 0; i < m_slots.size(); ++i) {
9050 
9051 		const Slot&	slot = m_slots[i];
9052 
9053 		if (slot.is_reserved) {
9054 
9055 			fprintf(file,
9056 				"%s IO for %s (offset=" UINT64PF
9057 				", size=%lu)\n",
9058 				slot.type.is_read() ? "read" : "write",
9059 				slot.name, slot.offset, slot.len);
9060 		}
9061 	}
9062 
9063 	release();
9064 }
9065 
9066 /** Print pending IOs for all arrays */
9067 void
print_to_file(FILE * file)9068 AIO::print_to_file(FILE* file)
9069 {
9070 	fprintf(file, "Pending normal aio reads:");
9071 
9072 	s_reads->to_file(file);
9073 
9074 	if (s_writes != NULL) {
9075 		fprintf(file, "Pending normal aio writes:");
9076 		s_writes->to_file(file);
9077 	}
9078 
9079 	if (s_ibuf != NULL) {
9080 		fprintf(file, "Pending ibuf aio reads:");
9081 		s_ibuf->to_file(file);
9082 	}
9083 
9084 	if (s_log != NULL) {
9085 		fprintf(file, "Pending log i/o's:");
9086 		s_log->to_file(file);
9087 	}
9088 
9089 	if (s_sync != NULL) {
9090 		fprintf(file, "Pending sync i/o's:");
9091 		s_sync->to_file(file);
9092 	}
9093 }
9094 
9095 /** Prints all pending IO
9096 @param[in]	file		File where to print */
9097 void
os_aio_print_pending_io(FILE * file)9098 os_aio_print_pending_io(
9099 	FILE*	file)
9100 {
9101 	AIO::print_to_file(file);
9102 }
9103 
9104 #endif /* UNIV_DEBUG */
9105 
9106 /**
9107 Set the file create umask
9108 @param[in]	umask		The umask to use for file creation. */
9109 void
os_file_set_umask(ulint umask)9110 os_file_set_umask(ulint umask)
9111 {
9112 	os_innodb_umask = umask;
9113 }
9114 #else
9115 
9116 #include "univ.i"
9117 #include "db0err.h"
9118 #include "mach0data.h"
9119 #include "fsp0fsp.h"
9120 #include "fil0fil.h"
9121 #include "os0file.h"
9122 
9123 #ifdef UNIV_NONINL
9124 #include "os0file.ic"
9125 #endif
9126 
9127 #include <lz4.h>
9128 #include <zlib.h>
9129 
9130 #include <my_aes.h>
9131 #include <my_rnd.h>
9132 #include <mysqld.h>
9133 #include <mysql/service_mysql_keyring.h>
9134 
9135 typedef byte	Block;
9136 
9137 /** Allocate a page for sync IO
9138 @return pointer to page */
9139 static
9140 Block*
os_alloc_block()9141 os_alloc_block()
9142 {
9143 	return(reinterpret_cast<byte*>(malloc(UNIV_PAGE_SIZE_MAX * 2)));
9144 }
9145 
9146 /** Free a page after sync IO
9147 @param[in,own]	block		The block to free/release */
9148 static
9149 void
os_free_block(Block * block)9150 os_free_block(Block* block)
9151 {
9152 	ut_free(block);
9153 }
9154 
9155 #endif /* !UNIV_INNOCHECKSUM */
9156 
/** Minimum length needed for encryption: FIL_PAGE_DATA bytes plus two
blocks of MY_AES_BLOCK_SIZE bytes */
const unsigned int MIN_ENCRYPTION_LEN = 2 * MY_AES_BLOCK_SIZE + FIL_PAGE_DATA;
9159 
9160 /**
9161 @param[in]      type            The compression type
9162 @return the string representation */
9163 const char*
to_string(Type type)9164 Compression::to_string(Type type)
9165 {
9166         switch(type) {
9167         case NONE:
9168                 return("None");
9169         case ZLIB:
9170                 return("Zlib");
9171         case LZ4:
9172                 return("LZ4");
9173         }
9174 
9175         ut_ad(0);
9176 
9177         return("<UNKNOWN>");
9178 }
9179 
9180 /**
9181 @param[in]      meta		Page Meta data
9182 @return the string representation */
to_string(const Compression::meta_t & meta)9183 std::string Compression::to_string(const Compression::meta_t& meta)
9184 {
9185 	std::ostringstream	stream;
9186 
9187 	stream	<< "version: " << int(meta.m_version) << " "
9188 		<< "algorithm: " << meta.m_algorithm << " "
9189 		<< "(" << to_string(meta.m_algorithm) << ") "
9190 		<< "orginal_type: " << meta.m_original_type << " "
9191 		<< "original_size: " << meta.m_original_size << " "
9192 		<< "compressed_size: " << meta.m_compressed_size;
9193 
9194 	return(stream.str());
9195 }
9196 
9197 /** @return true if it is a compressed page */
9198 bool
is_compressed_page(const byte * page)9199 Compression::is_compressed_page(const byte* page)
9200 {
9201 	return(mach_read_from_2(page + FIL_PAGE_TYPE) == FIL_PAGE_COMPRESSED);
9202 }
9203 
9204 bool
is_compressed_encrypted_page(const byte * page)9205 Compression::is_compressed_encrypted_page(const byte *page) {
9206 	return (mach_read_from_2(page + FIL_PAGE_TYPE) ==
9207 		FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9208 }
9209 
9210 bool
is_valid_page_version(uint8_t version)9211 Compression::is_valid_page_version(uint8_t version) {
9212 	return (version == FIL_PAGE_VERSION_1 || version == FIL_PAGE_VERSION_2);
9213 }
9214 
9215 /** Deserizlise the page header compression meta-data
9216 @param[in]	page		Pointer to the page header
9217 @param[out]	control		Deserialised data */
9218 void
deserialize_header(const byte * page,Compression::meta_t * control)9219 Compression::deserialize_header(
9220 	const byte*		page,
9221 	Compression::meta_t*	control)
9222 {
9223 	ut_ad(is_compressed_page(page) || is_compressed_encrypted_page(page));
9224 
9225 	control->m_version = static_cast<uint8_t>(
9226 		mach_read_from_1(page + FIL_PAGE_VERSION));
9227 
9228 	control->m_original_type = static_cast<uint16_t>(
9229 		mach_read_from_2(page + FIL_PAGE_ORIGINAL_TYPE_V1));
9230 
9231 	control->m_compressed_size = static_cast<uint16_t>(
9232 		mach_read_from_2(page + FIL_PAGE_COMPRESS_SIZE_V1));
9233 
9234 	control->m_original_size = static_cast<uint16_t>(
9235 		mach_read_from_2(page + FIL_PAGE_ORIGINAL_SIZE_V1));
9236 
9237 	control->m_algorithm = static_cast<Type>(
9238 		mach_read_from_1(page + FIL_PAGE_ALGORITHM_V1));
9239 }
9240 
/** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
not then the source contents are left unchanged and DB_SUCCESS is returned.
The data is decompressed into the scratch area, copied back into src after
the page header, and the page type in src is set to the original type.
@param[in]	dblwr_recover	true of double write recovery in progress
@param[in,out]	src		Data read from disk, decompressed data will be
				copied to this page
@param[in,out]	dst		Scratch area to use for decompression; if NULL
				a temporary block is allocated and freed here
@param[in]	dst_len		Size of the scratch area in bytes
@return DB_SUCCESS, DB_CORRUPTION if the header fails the checks,
DB_IO_DECOMPRESS_FAIL if the decompression fails, DB_UNSUPPORTED if the
algorithm is not handled here */
dberr_t
Compression::deserialize(
	bool		dblwr_recover,
	byte*		src,
	byte*		dst,
	ulint		dst_len)
{
	if (!is_compressed_page(src)) {
		/* There is nothing we can do. */
		return(DB_SUCCESS);
	}

	meta_t	header;

	deserialize_header(src, &header);

	/* The compressed payload follows the page header */
	byte*	ptr = src + FIL_PAGE_DATA;

	ut_ad(is_valid_page_version(header.m_version));

	/* Reject headers with an unknown version or an original size that
	is out of range or does not fit into the scratch area. */
	if (!is_valid_page_version(header.m_version)
	    || header.m_original_size < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8)
	    || header.m_original_size > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA
	    || dst_len < header.m_original_size + FIL_PAGE_DATA) {

		/* The last check could potentially return DB_OVERFLOW,
		the caller should be able to retry with a larger buffer. */

		return(DB_CORRUPTION);
	}

	/* Non-NULL only when the scratch area is allocated here; it is
	freed on every return path below. */
	Block*	block;

	/* The caller doesn't know what to expect */
	if (dst == NULL) {

		block = os_alloc_block();

#ifdef UNIV_INNOCHECKSUM
		dst = block;
#else
		dst = block->m_ptr;
#endif /* UNIV_INNOCHECKSUM */

	} else {
		block = NULL;
	}

	int		ret;
	Compression	compression;
	/* Number of bytes copied back to src; only the ZLIB branch
	replaces it with the length reported by the decompression. */
	ulint		len = header.m_original_size;

	compression.m_type = static_cast<Compression::Type>(header.m_algorithm);

	switch(compression.m_type) {
	case Compression::ZLIB: {

		uLongf	zlen = header.m_original_size;

		if (uncompress(dst, &zlen, ptr, header.m_compressed_size)
		    != Z_OK) {

			if (block != NULL) {
				os_free_block(block);
			}

			return(DB_IO_DECOMPRESS_FAIL);
		}

		len = static_cast<ulint>(zlen);

		break;
	}

	case Compression::LZ4:

                ret = LZ4_decompress_safe(
                        reinterpret_cast<char*>(ptr),
                        reinterpret_cast<char*>(dst),
                        header.m_compressed_size,
                        header.m_original_size);
		/* A negative result is treated as a failure */
		if (ret < 0) {

			if (block != NULL) {
				os_free_block(block);
			}

			return(DB_IO_DECOMPRESS_FAIL);
		}

		break;

	default:
#if !defined(UNIV_INNOCHECKSUM)
		ib::error()
			<< "Compression algorithm support missing: "
			<< Compression::to_string(compression.m_type);
#else
		fprintf(stderr, "Compression algorithm support missing: %s\n",
			Compression::to_string(compression.m_type));
#endif /* !UNIV_INNOCHECKSUM */

		if (block != NULL) {
			os_free_block(block);
		}

		return(DB_UNSUPPORTED);
	}

	/* Leave the header alone */
	memmove(src + FIL_PAGE_DATA, dst, len);

	/* Restore the page type that the page had before compression */
	mach_write_to_2(src + FIL_PAGE_TYPE, header.m_original_type);

	/* Debug check, skipped during double write recovery: 4 bytes at
	FIL_PAGE_LSN + 4 must match the 4 bytes found near the end of the
	decompressed page. */
	ut_ad(dblwr_recover
	      || memcmp(src + FIL_PAGE_LSN + 4,
			src + (header.m_original_size + FIL_PAGE_DATA)
			- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);

	if (block != NULL) {
		os_free_block(block);
	}

	return(DB_SUCCESS);
}
9374 
9375 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
9376 not then the source contents are left unchanged and DB_SUCCESS is returned.
9377 @param[in]	dblwr_recover	true of double write recovery in progress
9378 @param[in,out]	src		Data read from disk, decompressed data will be
9379 				copied to this page
9380 @param[in,out]	dst		Scratch area to use for decompression
9381 @param[in]	dst_len		Size of the scratch area in bytes
9382 @return DB_SUCCESS or error code */
9383 dberr_t
os_file_decompress_page(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)9384 os_file_decompress_page(
9385 	bool		dblwr_recover,
9386 	byte*		src,
9387 	byte*		dst,
9388 	ulint		dst_len)
9389 {
9390 	return(Compression::deserialize(dblwr_recover, src, dst, dst_len));
9391 }
9392 
9393 /**
9394 @param[in]      type            The encryption type
9395 @return the string representation */
9396 const char*
to_string(Type type)9397 Encryption::to_string(Type type)
9398 {
9399         switch(type) {
9400         case NONE:
9401                 return("N");
9402         case AES:
9403                 return("Y");
9404         case KEYRING:
9405                 return("KEYRING");
9406         }
9407 
9408         ut_ad(0);
9409 
9410         return("<UNKNOWN>");
9411 }
9412 
9413 /** Generate random encryption value for key and iv.
9414 @param[in,out]	value	Encryption value */
random_value(byte * value)9415 void Encryption::random_value(byte* value)
9416 {
9417 	ut_ad(value != NULL);
9418 
9419 	my_rand_buffer(value, ENCRYPTION_KEY_LEN);
9420 }
9421 
9422 void
fill_key_name(char * key_name,uint key_id)9423 Encryption::fill_key_name(char *key_name, uint key_id)
9424 {
9425 #ifndef UNIV_INNOCHECKSUM
9426 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9427 
9428 	ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9429 		    "%s-%u", ENCRYPTION_PERCONA_SYSTEM_KEY_PREFIX,
9430 		    key_id);
9431 #endif
9432 }
9433 
9434 void
fill_key_name(char * key_name,uint key_id,uint key_version)9435 Encryption::fill_key_name(char* key_name, uint key_id, uint key_version)
9436 {
9437 #ifndef UNIV_INNOCHECKSUM
9438 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9439 
9440 	ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9441 		    "%s-%u:%u", ENCRYPTION_PERCONA_SYSTEM_KEY_PREFIX,
9442 		    key_id, key_version);
9443 #endif
9444 }
9445 
9446 void
create_tablespace_key(byte ** tablespace_key,uint key_id)9447 Encryption::create_tablespace_key(byte** tablespace_key,
9448 				  uint key_id)
9449 {
9450 #ifndef UNIV_INNOCHECKSUM
9451 	char*	key_type = NULL;
9452 	size_t	key_len;
9453 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9454 	int	ret;
9455 
9456 
9457 	fill_key_name(key_name, key_id);
9458 
9459 	/* We call key ring API to generate tablespace key here. */
9460 	ret = my_key_generate(key_name, "AES",
9461 			      NULL, ENCRYPTION_KEY_LEN);
9462 
9463 	if (ret) {
9464 		ib::error() << "Encryption can't generate tablespace key : " << key_name;
9465 		*tablespace_key = NULL;
9466 		return;
9467 	}
9468 
9469 	byte *system_tablespace_key = NULL;
9470 	/* We call key ring API to get tablespace key here. */
9471 	ret = my_key_fetch(key_name, &key_type, NULL,
9472 			   reinterpret_cast<void**>(&system_tablespace_key),
9473 			   &key_len);
9474 
9475 	if (ret || system_tablespace_key == NULL) {
9476 		ib::error() << "Encryption can't find tablespace key " << key_name << " please check"
9477 				" that the keyring plugin is loaded.";
9478 		*tablespace_key = NULL;
9479 		my_free(key_type);
9480 		return;
9481 	}
9482 	my_free(key_type);
9483 
9484 	uint tablespace_key_version = 0;
9485 	size_t tablespace_key_data_length = 0;
9486 
9487 	if (parse_system_key(system_tablespace_key, key_len, &tablespace_key_version,
9488 			     tablespace_key, &tablespace_key_data_length) == NULL) {
9489 		my_free(system_tablespace_key);
9490 		return;
9491 	}
9492 	my_free(system_tablespace_key);
9493 	// Newly created key should have 1 assigned as its key version
9494 	ut_ad(tablespace_key_version == 1 && tablespace_key_data_length == ENCRYPTION_KEY_LEN);
9495 #endif
9496 }
9497 
9498 
9499 /** Create new master key for key rotation.
9500 @param[in,out]	master_key	master key */
9501 void
create_master_key(byte ** master_key)9502 Encryption::create_master_key(byte** master_key)
9503 {
9504 #ifndef UNIV_INNOCHECKSUM
9505 	char*	key_type = NULL;
9506 	size_t	key_len;
9507 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9508 	int	ret;
9509 
9510 	/* If uuid does not match with current server uuid,
9511 	set uuid as current server uuid. */
9512 	if (strcmp(uuid, server_uuid) != 0) {
9513 		memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
9514 	}
9515 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9516 
9517 	/* Generate new master key */
9518 	ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9519 		    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9520 		    uuid, master_key_id + 1);
9521 
9522 	/* We call key ring API to generate master key here. */
9523 	ret = my_key_generate(key_name, "AES",
9524 			      NULL, ENCRYPTION_KEY_LEN);
9525 
9526 	/* We call key ring API to get master key here. */
9527 	ret = my_key_fetch(key_name, &key_type, NULL,
9528 			   reinterpret_cast<void**>(master_key),
9529 			   &key_len);
9530 
9531 	if (ret || *master_key == NULL) {
9532 		ib::error() << "Encryption can't find master key, please check"
9533 			        " the keyring plugin is loaded.";
9534 		*master_key = NULL;
9535 	} else {
9536 		master_key_id++;
9537 	}
9538 
9539 	if (key_type) {
9540 		my_free(key_type);
9541 	}
9542 #endif
9543 }
9544 
9545 void
get_keyring_key(const char * key_name,byte ** key,size_t * key_len)9546 Encryption::get_keyring_key(const char *key_name,
9547 			    byte** key, size_t *key_len)
9548 {
9549 #ifndef UNIV_INNOCHECKSUM
9550 	int ret;
9551 	char*	key_type = NULL;
9552 	//size_t	key_len;
9553 	/* We call key ring API to get master key here. */
9554 	ret = my_key_fetch(key_name, &key_type, NULL,
9555 			   reinterpret_cast<void**>(key), key_len);
9556 
9557 	if (key_type) {
9558 		my_free(key_type);
9559 	}
9560 
9561 	if (ret) {
9562 		*key = NULL;
9563 	}
9564 #endif
9565 }
9566 
9567 bool
get_tablespace_key(uint key_id,uint tablespace_key_version,byte ** tablespace_key,size_t * key_len)9568 Encryption::get_tablespace_key(uint key_id,
9569 			       uint tablespace_key_version,
9570 			       byte** tablespace_key,
9571 			       size_t *key_len)
9572 {
9573 	bool result = true;
9574 #ifndef UNIV_INNOCHECKSUM
9575 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9576 
9577 	fill_key_name(key_name, key_id, tablespace_key_version);
9578 
9579 	Encryption::get_keyring_key(key_name, tablespace_key, key_len);
9580 
9581 	if (*tablespace_key == NULL) {
9582 		ib::error() << "Encryption can't find tablespace key, please check"
9583 				" the keyring plugin is loaded.";
9584 		 result = false;
9585 	}
9586 
9587 #ifdef UNIV_ENCRYPT_DEBUG
9588 	if (*tablespace_key) {
9589 		fprintf(stderr, "Fetched tablespace key:%s ", key_name);
9590 		ut_print_buf(stderr, *tablespace_key, *key_len);
9591 		fprintf(stderr, "\n");
9592 	}
9593 #endif /* DEBUG_TDE */
9594 #endif
9595 	return result;
9596 }
9597 
9598 void
get_latest_system_key(const char * system_key_name,byte ** key,uint * key_version,size_t * key_length)9599 Encryption::get_latest_system_key(const char *system_key_name,
9600 				  byte **key,
9601 				  uint *key_version,
9602 				  size_t *key_length)
9603 {
9604 #ifndef UNIV_INNOCHECKSUM
9605 	size_t system_key_len = 0;
9606 	uchar *system_key = NULL;
9607 	get_keyring_key(system_key_name, &system_key, &system_key_len);
9608 	if (system_key == NULL)
9609 	{
9610 		*key = NULL;
9611 		return;
9612 	}
9613 
9614 	parse_system_key(system_key, system_key_len, key_version, (uchar**)key, key_length);
9615 	my_free(system_key);
9616 #endif
9617 }
9618 
9619 // tablespace_key_version as output parameter
9620 void
get_latest_tablespace_key(uint key_id,uint * tablespace_key_version,byte ** tablespace_key)9621 Encryption::get_latest_tablespace_key(uint key_id,
9622 				      uint *tablespace_key_version,
9623 				      byte** tablespace_key)
9624 {
9625 #ifndef UNIV_INNOCHECKSUM
9626 	size_t	key_len;
9627 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9628 
9629 	fill_key_name(key_name, key_id);
9630 
9631 	get_latest_system_key(key_name, tablespace_key, tablespace_key_version, &key_len);
9632 
9633 #ifdef UNIV_ENCRYPT_DEBUG
9634 	if (*tablespace_key) {
9635 		fprintf(stderr, "Fetched tablespace key:%s ", key_name);
9636 		ut_print_buf(stderr, *tablespace_key, key_len);
9637 		fprintf(stderr, "\n");
9638 	}
9639 #endif /* DEBUG_TDE */
9640 
9641 #endif
9642 }
9643 
tablespace_key_exists(uint key_id)9644 bool Encryption::tablespace_key_exists(uint key_id)
9645 {
9646 	uint tablespace_key_version;
9647 	byte *tablespace_key;
9648 
9649 	get_latest_tablespace_key(key_id, &tablespace_key_version, &tablespace_key);
9650 
9651 	if(tablespace_key == NULL)
9652 		return false;
9653 
9654 	my_free(tablespace_key);
9655 	return true;
9656 }
9657 
tablespace_key_exists_or_create_new_one_if_does_not_exist(uint key_id)9658 bool Encryption::tablespace_key_exists_or_create_new_one_if_does_not_exist(uint key_id)
9659 {
9660 	uint tablespace_key_version;
9661 	byte *tablespace_key;
9662 
9663 	get_latest_tablespace_key_or_create_new_one(key_id, &tablespace_key_version, &tablespace_key);
9664 
9665 	if (tablespace_key == NULL)
9666 		return false;
9667 
9668 	my_free(tablespace_key);
9669 	return true;
9670 }
9671 
9672 void
get_latest_tablespace_key_or_create_new_one(uint key_id,uint * tablespace_key_version,byte ** tablespace_key)9673 Encryption::get_latest_tablespace_key_or_create_new_one(uint key_id,
9674 							uint *tablespace_key_version,
9675 							byte** tablespace_key)
9676 {
9677 	get_latest_tablespace_key(key_id, tablespace_key_version, tablespace_key);
9678 	if (*tablespace_key == NULL) {
9679 		Encryption::create_tablespace_key(tablespace_key, key_id);
9680 		*tablespace_key_version = 1;
9681 	}
9682 }
9683 
is_keyring_alive()9684 bool Encryption::is_keyring_alive()
9685 {
9686 	return Encryption::tablespace_key_exists_or_create_new_one_if_does_not_exist(0); //DEFAULT ENCRYPTION KEY
9687 }
9688 
can_page_be_keyring_encrypted(ulint page_type)9689 bool Encryption::can_page_be_keyring_encrypted(ulint page_type)
9690 {
9691 	switch (page_type) {
9692 		case FIL_PAGE_TYPE_FSP_HDR:
9693 		case FIL_PAGE_TYPE_XDES:
9694 		case FIL_PAGE_RTREE:
9695 		/* File space header, extent descriptor or spatial index
9696 		are not encrypted. */
9697 		return false;
9698 	}
9699 	return true;
9700 }
9701 
can_page_be_keyring_encrypted(byte * page)9702 bool Encryption::can_page_be_keyring_encrypted(byte* page)
9703 {
9704 	ut_ad(page != NULL);
9705 	return can_page_be_keyring_encrypted(mach_read_from_2(page+FIL_PAGE_TYPE));
9706 }
9707 
9708 
encryption_get_latest_version(uint key_id)9709 uint Encryption::encryption_get_latest_version(uint key_id)
9710 {
9711 #ifndef UNIV_INNOCHECKSUM
9712 	uint tablespace_key_version;
9713 	byte *tablespace_key;
9714 
9715 	get_latest_tablespace_key(key_id, &tablespace_key_version, &tablespace_key);
9716 
9717 	if(tablespace_key == NULL)
9718 		return ENCRYPTION_KEY_VERSION_INVALID;
9719 
9720 	my_free(tablespace_key);
9721 	return tablespace_key_version;
9722 #endif
9723 	return ENCRYPTION_KEY_VERSION_INVALID;
9724 }
9725 
9726 /** Get master key by key id.
9727 @param[in]	master_key_id	master key id
9728 @param[in]	srv_uuid	uuid of server instance
9729 @param[in,out]	master_key	master key */
9730 void
get_master_key(ulint master_key_id,char * srv_uuid,byte ** master_key)9731 Encryption::get_master_key(ulint master_key_id,
9732 			   char* srv_uuid,
9733 			   byte** master_key)
9734 {
9735 #ifndef UNIV_INNOCHECKSUM
9736 	size_t	key_len;
9737 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9738 
9739 	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9740 
9741 	if (srv_uuid != NULL) {
9742 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9743 			    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9744 			    srv_uuid, master_key_id);
9745 	} else {
9746 		/* For compitable with 5.7.11, we need to get master key with
9747 		server id. */
9748 		memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9749 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9750 			    "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9751 			    server_id, master_key_id);
9752 	}
9753 
9754 	/* We call key ring API to get master key here. */
9755 	get_keyring_key(key_name, master_key, &key_len);
9756 	if (*master_key == NULL) {
9757 		ib::error() << "Encryption can't find master key, please check"
9758 				" the keyring plugin is loaded.";
9759 	}
9760 
9761 #ifdef UNIV_ENCRYPT_DEBUG
9762 	if (*master_key) {
9763 		fprintf(stderr, "Fetched master key:%lu ", master_key_id);
9764 		ut_print_buf(stderr, *master_key, key_len);
9765 		fprintf(stderr, "\n");
9766 	}
9767 #endif /* DEBUG_TDE */
9768 
9769 #endif
9770 }
9771 
9772 /** Current master key id */
9773 ulint	Encryption::master_key_id = 0;
9774 
9775 /** Current uuid of server instance */
9776 char	Encryption::uuid[ENCRYPTION_SERVER_UUID_LEN + 1] = {0};
9777 
9778 /** Get current master key and master key id
9779 @param[in,out]	master_key_id	master key id
9780 @param[in,out]	master_key	master key
9781 @param[in,out]	version		encryption information version */
9782 void
get_master_key(ulint * master_key_id,byte ** master_key,Encryption::Version * version)9783 Encryption::get_master_key(ulint* master_key_id,
9784 			   byte** master_key,
9785 			   Encryption::Version*  version)
9786 {
9787 #ifndef UNIV_INNOCHECKSUM
9788 	char*	key_type = NULL;
9789 	size_t	key_len;
9790 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9791 	int	ret;
9792 
9793 	memset(key_name, 0, ENCRYPTION_KEY_LEN);
9794 	*version = Encryption::ENCRYPTION_VERSION_3;
9795 
9796 	DBUG_EXECUTE_IF("force_v2_encryption",{
9797 			*version = Encryption::ENCRYPTION_VERSION_2;
9798 			});
9799 
9800 
9801 	if (Encryption::master_key_id == 0) {
9802 		/* If m_master_key is 0, means there's no encrypted
9803 		tablespace, we need to generate the first master key,
9804 		and store it to key ring. */
9805 		memset(uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
9806 		memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
9807 
9808 		/* Prepare the server uuid. */
9809 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9810 			    "%s-%s-1", ENCRYPTION_MASTER_KEY_PRIFIX,
9811 			    uuid);
9812 
9813 		/* We call key ring API to generate master key here. */
9814 		ret = my_key_generate(key_name, "AES",
9815 				      NULL, ENCRYPTION_KEY_LEN);
9816 
9817 		/* We call key ring API to get master key here. */
9818 		ret = my_key_fetch(key_name, &key_type, NULL,
9819 				   reinterpret_cast<void**>(master_key),
9820 				   &key_len);
9821 
9822 		if (!ret && *master_key != NULL) {
9823 			Encryption::master_key_id++;
9824 			*master_key_id = Encryption::master_key_id;
9825 		}
9826 #ifdef UNIV_ENCRYPT_DEBUG
9827 		if (!ret && *master_key) {
9828 			fprintf(stderr, "Generated new master key:");
9829 			ut_print_buf(stderr, *master_key, key_len);
9830 			fprintf(stderr, "\n");
9831 		}
9832 #endif
9833 	} else {
9834 		*master_key_id = Encryption::master_key_id;
9835 
9836 		ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9837 			    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9838 			    uuid, *master_key_id);
9839 
9840 		/* We call key ring API to get master key here. */
9841 		ret = my_key_fetch(key_name, &key_type, NULL,
9842 				   reinterpret_cast<void**>(master_key),
9843 				   &key_len);
9844 
9845 		/* For compitable with 5.7.11, we need to try to get master key with
9846 		server id when get master key with server uuid failure. */
9847 		if (ret || *master_key == NULL) {
9848 			if (key_type) {
9849 				my_free(key_type);
9850 			}
9851 
9852 			memset(key_name, 0,
9853 			       ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9854 			ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9855 				    "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9856 				    server_id, *master_key_id);
9857 
9858 			ret = my_key_fetch(key_name, &key_type, NULL,
9859 					   reinterpret_cast<void**>(master_key),
9860 					   &key_len);
9861 			*version = Encryption::ENCRYPTION_VERSION_1;
9862 		}
9863 #ifdef UNIV_ENCRYPT_DEBUG
9864 		if (!ret && *master_key) {
9865 			fprintf(stderr, "Fetched master key:%lu ",
9866 				*master_key_id);
9867 			ut_print_buf(stderr, *master_key, key_len);
9868 			fprintf(stderr, "\n");
9869 		}
9870 #endif
9871 	}
9872 
9873 	if (ret) {
9874 		*master_key = NULL;
9875 		ib::error() << "Encryption can't find master key, please check"
9876 			       " the keyring plugin is loaded.";
9877 	}
9878 
9879 	if (key_type) {
9880 		my_free(key_type);
9881 	}
9882 #endif
9883 }
9884 
9885 #ifndef UNIV_INNOCHECKSUM
9886 
9887 /** Fill the encryption information.
9888 @param[in]	key		encryption key
9889 @param[in]	iv		encryption iv
9890 @param[in,out]	encrypt_info	encryption information
9891 @return true if success */
fill_encryption_info(byte * key,byte * iv,byte * encrypt_info)9892 bool Encryption::fill_encryption_info(byte*	key,
9893 				      byte*	iv,
9894 				      byte*	encrypt_info)
9895 {
9896 	byte			key_info[ENCRYPTION_KEY_LEN * 2];
9897 
9898 	/* Get master key from key ring. For bootstrap, we use a default
9899 	master key which master_key_id is 0. */
9900 	byte*			master_key;
9901 	ulint master_key_id;
9902 	Version			version;
9903 	get_master_key(&master_key_id, &master_key, &version);
9904 	if (master_key == NULL) {
9905 		return(false);
9906 	}
9907 
9908 	memset(encrypt_info, 0, ENCRYPTION_INFO_SIZE_V2);
9909 	memset(key_info, 0, ENCRYPTION_KEY_LEN * 2);
9910 
9911 	/* Use the new master key to encrypt the key. */
9912 	ut_ad(encrypt_info != NULL);
9913 	byte* ptr = encrypt_info;
9914 
9915 	if (version == ENCRYPTION_VERSION_1) {
9916 		memcpy(ptr, ENCRYPTION_KEY_MAGIC_V1, ENCRYPTION_MAGIC_SIZE);
9917 	} else if (version == ENCRYPTION_VERSION_2) {
9918 		memcpy(ptr, ENCRYPTION_KEY_MAGIC_V2, ENCRYPTION_MAGIC_SIZE);
9919 	} else  {
9920 		memcpy(ptr, ENCRYPTION_KEY_MAGIC_V3, ENCRYPTION_MAGIC_SIZE);
9921 	}
9922 	ptr += ENCRYPTION_MAGIC_SIZE;
9923 
9924 	mach_write_to_4(ptr, master_key_id);
9925 	if (version == ENCRYPTION_VERSION_3) {
9926 		ptr += sizeof(uint32);
9927 	} else {
9928 		ptr += sizeof(master_key_id);
9929 	}
9930 
9931 	if (version >= ENCRYPTION_VERSION_2) {
9932 		memcpy(ptr, uuid, ENCRYPTION_SERVER_UUID_LEN);
9933 		ptr += ENCRYPTION_SERVER_UUID_LEN;
9934 	}
9935 
9936 	memcpy(key_info, key, ENCRYPTION_KEY_LEN);
9937 
9938 	memcpy(key_info + ENCRYPTION_KEY_LEN, iv, ENCRYPTION_KEY_LEN);
9939 
9940 	/* Encrypt key and iv. */
9941 	const lint elen = my_aes_encrypt(key_info,
9942 			                 ENCRYPTION_KEY_LEN * 2,
9943 			                 ptr,
9944 			                 master_key,
9945 			                 ENCRYPTION_KEY_LEN,
9946 			                 my_aes_256_ecb,
9947 			                 NULL, false);
9948 
9949 	if (elen == MY_AES_BAD_DATA) {
9950 		my_free(master_key);
9951 		return(false);
9952 	}
9953 
9954 	ptr += ENCRYPTION_KEY_LEN * 2;
9955 
9956 	/* Write checksum bytes. */
9957 	ulint crc = ut_crc32(key_info, ENCRYPTION_KEY_LEN * 2);
9958 	mach_write_to_4(ptr, crc);
9959 
9960 	my_free(master_key);
9961 
9962 	return(true);
9963 }
9964 
9965 /** Decoding the encryption info
9966 from the first page of a tablespace.
9967 @param[in,out]	key		key
9968 @param[in,out]	iv		iv
9969 @param[in]	encryption_info	encrytion info.
9970 @return true if success */
9971 bool
decode_encryption_info(byte * key,byte * iv,byte * encryption_info)9972 Encryption::decode_encryption_info(byte*	key,
9973 				   byte*	iv,
9974 				   byte*	encryption_info)
9975 {
9976 	byte*			master_key = NULL;
9977 	byte			key_info[ENCRYPTION_KEY_LEN * 2];
9978 	char			srv_uuid[ENCRYPTION_SERVER_UUID_LEN + 1];
9979 
9980 	byte* ptr = encryption_info;
9981 
9982 	/* For compatibility with 5.7.11, we need to handle the
9983 	encryption information which created in this old version. */
9984 	Version			version;
9985 	if (memcmp(ptr, ENCRYPTION_KEY_MAGIC_V1,
9986 		     ENCRYPTION_MAGIC_SIZE) == 0) {
9987 		version = ENCRYPTION_VERSION_1;
9988 	} else if (memcmp(ptr, ENCRYPTION_KEY_MAGIC_V2,
9989 		     ENCRYPTION_MAGIC_SIZE) == 0) {
9990 		version = ENCRYPTION_VERSION_2;
9991 	} else {
9992 		version = ENCRYPTION_VERSION_3;
9993 	}
9994 
9995 	/* Check magic. */
9996 	if (version >= ENCRYPTION_VERSION_2
9997 	    && memcmp(ptr, ENCRYPTION_KEY_MAGIC_V2, ENCRYPTION_MAGIC_SIZE) != 0
9998 	    && memcmp(ptr, ENCRYPTION_KEY_MAGIC_V3, ENCRYPTION_MAGIC_SIZE) != 0) {
9999 		/* We ignore report error for recovery,
10000 		since the encryption info maybe hasn't writen
10001 		into datafile when the table is newly created. */
10002 		return recv_recovery_is_on();
10003 	}
10004 
10005 	ptr += ENCRYPTION_MAGIC_SIZE;
10006 
10007 	/* Get master key id. */
10008 	const ulint m_key_id = mach_read_from_4(ptr);
10009 	if (version == ENCRYPTION_VERSION_3) {
10010 		ptr += sizeof(uint32);
10011 	} else {
10012 		ptr += sizeof(ptr);
10013 	}
10014 
10015 	/* Get server uuid. */
10016 	if (version >= ENCRYPTION_VERSION_2) {
10017 		memset(srv_uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
10018 		memcpy(srv_uuid, ptr, ENCRYPTION_SERVER_UUID_LEN);
10019 		ptr += ENCRYPTION_SERVER_UUID_LEN;
10020 	}
10021 
10022 	/* Get master key by key id. */
10023 	memset(key_info, 0, ENCRYPTION_KEY_LEN * 2);
10024 	if (version == ENCRYPTION_VERSION_1) {
10025 		get_master_key(m_key_id, NULL, &master_key);
10026 	} else {
10027 		if (m_key_id == 0) {
10028 			/* When m_key_id is 0, which means it's the
10029 			default master key for bootstrap. */
10030 			master_key = static_cast<byte*>(ut_zalloc_nokey(
10031 				ENCRYPTION_KEY_LEN));
10032 			memcpy(master_key, ENCRYPTION_DEFAULT_MASTER_KEY,
10033 			       strlen(ENCRYPTION_DEFAULT_MASTER_KEY));
10034 		} else {
10035 			get_master_key(m_key_id, srv_uuid, &master_key);
10036 		}
10037 	}
10038 
10039         if (master_key == NULL) {
10040                 return(false);
10041         }
10042 
10043 #ifdef	UNIV_ENCRYPT_DEBUG
10044 	fprintf(stderr, "%lu ", m_key_id);
10045 	for (const byte* data = (const byte*) master_key, ulint i = 0;
10046 	     i < ENCRYPTION_KEY_LEN; i++)
10047 		fprintf(stderr, "%02lx", (ulong)*data++);
10048 #endif
10049 
10050 	/* Decrypt tablespace key and iv. */
10051 	const lint elen = my_aes_decrypt(
10052 		ptr,
10053 		ENCRYPTION_KEY_LEN * 2,
10054 		key_info,
10055 		master_key,
10056 		ENCRYPTION_KEY_LEN,
10057 		my_aes_256_ecb, NULL, false);
10058 
10059 	if (elen == MY_AES_BAD_DATA) {
10060 		if (m_key_id == 0) {
10061 			ut_free(master_key);
10062 		} else {
10063 			my_free(master_key);
10064 		}
10065 		return(NULL);
10066 	}
10067 
10068 	/* Check checksum bytes. */
10069 	ptr += ENCRYPTION_KEY_LEN * 2;
10070 
10071 	const ulint crc1 = mach_read_from_4(ptr);
10072 	const ulint crc2 = ut_crc32(key_info, ENCRYPTION_KEY_LEN * 2);
10073 	if (crc1 != crc2) {
10074 		ib::error() << "Failed to decrypt encryption information,"
10075 			<< " please check whether key file has been changed!";
10076 		if (m_key_id == 0) {
10077 			ut_free(master_key);
10078 		} else {
10079 			my_free(master_key);
10080 		}
10081 		return(false);
10082 	}
10083 
10084 	/* Get tablespace key */
10085 	memcpy(key, key_info, ENCRYPTION_KEY_LEN);
10086 
10087 	/* Get tablespace iv */
10088 	memcpy(iv, key_info + ENCRYPTION_KEY_LEN,
10089 	       ENCRYPTION_KEY_LEN);
10090 
10091 #ifdef	UNIV_ENCRYPT_DEBUG
10092 	fprintf(stderr, " ");
10093 	for (const byte* data = (const byte*) key,
10094 	     ulint i = 0; i < ENCRYPTION_KEY_LEN; i++)
10095 		fprintf(stderr, "%02lx", (ulong)*data++);
10096 	fprintf(stderr, " ");
10097 	for (const byte* data = (const byte*) iv,
10098 	     ulint i = 0; i < ENCRYPTION_KEY_LEN; i++)
10099 		fprintf(stderr, "%02lx", (ulong)*data++);
10100 	fprintf(stderr, "\n");
10101 #endif
10102 
10103 	if (m_key_id == 0) {
10104 		ut_free(master_key);
10105 	} else {
10106 		my_free(master_key);
10107 	}
10108 
10109 	if (master_key_id < m_key_id) {
10110 		master_key_id = m_key_id;
10111 		memcpy(uuid, srv_uuid, ENCRYPTION_SERVER_UUID_LEN);
10112 	}
10113 
10114 	return(true);
10115 }
10116 
10117 bool
is_encrypted_and_compressed(const byte * page)10118 Encryption::is_encrypted_and_compressed(const byte *page)
10119 {
10120 	ulint	page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
10121 
10122 	return page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED;
10123 }
10124 
10125 /** Check if redo log block is encrypted block or not
10126 @param[in]	block	log block to check
10127 @return true if it is an encrypted block */
10128 bool
is_encrypted_log(const byte * block)10129 Encryption::is_encrypted_log(const byte *block) {
10130 	return (log_block_get_encrypt_bit(block));
10131 }
10132 
10133 /** Encrypt the redo log block.
10134 @param[in]	type		IORequest
10135 @param[in]	src_ptr		log block which need to encrypt
10136 @param[in,out]	dst_ptr		destination area
10137 @return true if success. */
10138 bool
encrypt_log_block(const IORequest & type,byte * src_ptr,byte * dst_ptr)10139 Encryption::encrypt_log_block(const IORequest &type, byte* src_ptr,
10140 			      byte* dst_ptr) {
10141 	byte remain_buf[MY_AES_BLOCK_SIZE * 2];
10142 
10143 #ifdef UNIV_ENCRYPT_DEBUG
10144 	fprintf(stderr, "Encrypting block %lu.\n",
10145 		log_block_get_hdr_no(src_ptr));
10146 	ut_print_buf_hex(std::cerr, src_ptr, OS_FILE_LOG_BLOCK_SIZE);
10147 	fprintf(stderr, "\n");
10148 #endif
10149 	/* This is data size which need to encrypt. */
10150 	const ulint unencrypted_trailer_size =
10151 	    (m_type == Encryption::KEYRING) ? LOG_BLOCK_TRL_SIZE : 0;
10152 	const ulint data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE -
10153 			       unencrypted_trailer_size;
10154 	const ulint main_len =
10155 	    (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
10156 	ulint remain_len = data_len - main_len;
10157 
10158 	/* Encrypt the block. */
10159 	/* Copy the header as is. */
10160 	memmove(dst_ptr, src_ptr, LOG_BLOCK_HDR_SIZE);
10161 	ut_ad(memcmp(src_ptr, dst_ptr, LOG_BLOCK_HDR_SIZE) == 0);
10162 
10163 	switch (m_type) {
10164 		case Encryption::NONE:
10165 			ut_error;
10166 
10167 		case Encryption::KEYRING:
10168 		case Encryption::AES: {
10169 			lint elen;
10170 
10171 			ut_ad(m_klen == ENCRYPTION_KEY_LEN);
10172 
10173 			elen = my_aes_encrypt(
10174 			    src_ptr + LOG_BLOCK_HDR_SIZE,
10175 			    static_cast<uint32>(main_len),
10176 			    dst_ptr + LOG_BLOCK_HDR_SIZE,
10177 			    reinterpret_cast<unsigned char *>(m_key),
10178 			    static_cast<uint32>(m_klen), my_aes_256_cbc,
10179 			    reinterpret_cast<unsigned char *>(m_iv), false);
10180 
10181 			if (elen == MY_AES_BAD_DATA) {
10182 				return (false);
10183 			}
10184 
10185 			const ulint len = static_cast<ulint>(elen);
10186 			ut_ad(len == main_len);
10187 
10188 			/* Copy remaining bytes. */
10189 			memcpy(dst_ptr + LOG_BLOCK_HDR_SIZE + len,
10190 			       src_ptr + LOG_BLOCK_HDR_SIZE + len,
10191 			       OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE -
10192 				   len);
10193 
10194 			/* Encrypt the remaining bytes. Since my_aes_encrypt
10195 			request the content which need to encrypt is
10196 			multiple of MY_AES_BLOCK_SIZE, but the block
10197 			content is possiblly not, so, we need to handle
10198 			the tail bytes first. */
10199 			if (remain_len != 0) {
10200 				remain_len = MY_AES_BLOCK_SIZE * 2;
10201 
10202 				elen = my_aes_encrypt(
10203 				    dst_ptr + LOG_BLOCK_HDR_SIZE + data_len -
10204 					remain_len,
10205 				    static_cast<uint32>(remain_len),
10206 				    remain_buf,
10207 				    reinterpret_cast<unsigned char *>(m_key),
10208 				    static_cast<uint32>(m_klen),
10209 				    my_aes_256_cbc,
10210 				    reinterpret_cast<unsigned char *>(m_iv),
10211 				    false);
10212 
10213 				if (elen == MY_AES_BAD_DATA) {
10214 					return (false);
10215 				}
10216 
10217 				memcpy(dst_ptr + LOG_BLOCK_HDR_SIZE +
10218 					   data_len - remain_len,
10219 				       remain_buf, remain_len);
10220 			}
10221 
10222 			break;
10223 		}
10224 
10225 		default:
10226 			ut_error;
10227 	}
10228 
10229 	/* Set the encrypted flag. */
10230 	log_block_set_encrypt_bit(dst_ptr, true);
10231 
10232 	if (m_type == Encryption::KEYRING) {
10233 		const ulint crc = log_block_calc_checksum_crc32(dst_ptr);
10234 		log_block_set_checksum(dst_ptr, crc + m_key_version);
10235 	}
10236 
10237 
10238 #ifdef UNIV_ENCRYPT_DEBUG
10239 	fprintf(stderr, "Encrypted block %lu.\n",
10240 		log_block_get_hdr_no(dst_ptr));
10241 	ut_print_buf_hex(std::cerr, dst_ptr, OS_FILE_LOG_BLOCK_SIZE);
10242 	fprintf(stderr, "\n");
10243 
10244 	byte* check_buf =
10245 	    static_cast<byte *>(ut_malloc_nokey(OS_FILE_LOG_BLOCK_SIZE));
10246 	byte* buf2 =
10247 	    static_cast<byte *>(ut_malloc_nokey(OS_FILE_LOG_BLOCK_SIZE));
10248 
10249 	memcpy(check_buf, dst_ptr, OS_FILE_LOG_BLOCK_SIZE);
10250 	dberr_t err = decrypt_log(type, check_buf, OS_FILE_LOG_BLOCK_SIZE,
10251 				  buf2, OS_FILE_LOG_BLOCK_SIZE);
10252 	if (err != DB_SUCCESS ||
10253 	    memcmp(src_ptr, check_buf, OS_FILE_LOG_BLOCK_SIZE) != 0) {
10254 		ut_print_buf_hex(std::cerr, src_ptr, OS_FILE_LOG_BLOCK_SIZE);
10255 		ut_print_buf_hex(std::cerr, check_buf,
10256 				 OS_FILE_LOG_BLOCK_SIZE);
10257 		ut_ad(0);
10258 	}
10259 	ut_free(buf2);
10260 	ut_free(check_buf);
10261 #endif
10262 
10263 	return (true);
10264 }
10265 
10266 /** Encrypt the redo log data contents.
10267 @param[in]	type		IORequest
10268 @param[in]	src		page data which need to encrypt
10269 @param[in]	src_len		Size of the source in bytes
10270 @param[in,out]	dst		destination area
10271 @param[in,out]	dst_len		Size of the destination in bytes
10272 @return buffer data, dst_len will have the length of the data */
10273 byte *
encrypt_log(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint * dst_len)10274 Encryption::encrypt_log(const IORequest &type, byte* src, ulint src_len,
10275 			byte* dst, ulint* dst_len) {
10276 	byte* src_ptr = src;
10277 	byte* dst_ptr = dst;
10278 
10279 	ut_ad(type.is_log());
10280 	ut_ad(src_len % OS_FILE_LOG_BLOCK_SIZE == 0);
10281 	ut_ad(m_type != Encryption::NONE);
10282 
10283 	/* Encrypt the log blocks one by one. */
10284 	while (src_ptr != src + src_len) {
10285 		if (!encrypt_log_block(type, src_ptr, dst_ptr)) {
10286 			*dst_len = src_len;
10287 			ib::error() << " Can't encrypt data of"
10288 				    << " redo log";
10289 			return (src);
10290 		}
10291 
10292 		src_ptr += OS_FILE_LOG_BLOCK_SIZE;
10293 		dst_ptr += OS_FILE_LOG_BLOCK_SIZE;
10294 	}
10295 
10296 #ifdef UNIV_ENCRYPT_DEBUG
10297 	byte* check_buf = static_cast<byte *>(ut_malloc_nokey(src_len));
10298 	byte* buf2 = static_cast<byte *>(ut_malloc_nokey(src_len));
10299 
10300 	memcpy(check_buf, dst, src_len);
10301 
10302 	dberr_t err = decrypt_log(type, check_buf, src_len, buf2, src_len);
10303 	if (err != DB_SUCCESS || memcmp(src, check_buf, src_len) != 0) {
10304 		ut_print_buf_hex(std::cerr, src, src_len);
10305 		ut_print_buf_hex(std::cerr, check_buf, src_len);
10306 		ut_ad(0);
10307 	}
10308 	ut_free(buf2);
10309 	ut_free(check_buf);
10310 #endif
10311 
10312 	return (dst);
10313 }
10314 
10315 
10316 #endif
10317 
10318 /** Check if page is encrypted page or not
10319 @param[in]	page	page which need to check
10320 @return true if it is an encrypted page */
10321 bool
is_encrypted_page(const byte * page)10322 Encryption::is_encrypted_page(const byte* page)
10323 {
10324 	ulint	page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
10325 
10326 	return(page_type == FIL_PAGE_ENCRYPTED
10327 				 || page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED
10328 				 || page_type == FIL_PAGE_ENCRYPTED_RTREE);
10329 }
10330 
10331 /** Encrypt the page data contents. Page type can't be
10332 FIL_PAGE_ENCRYPTED, FIL_PAGE_COMPRESSED_AND_ENCRYPTED,
10333 FIL_PAGE_ENCRYPTED_RTREE.
10334 @param[in]	type		IORequest
10335 @param[in]	src		page data which need to encrypt
10336 @param[in]	src_len		Size of the source in bytes
10337 @param[in,out]	dst		destination area
10338 @param[in,out]	dst_len		Size of the destination in bytes
10339 @return buffer data, dst_len will have the length of the data */
10340 byte*
encrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint * dst_len)10341 Encryption::encrypt(
10342 	const IORequest&	type,
10343 	byte*			src,
10344 	ulint			src_len,
10345 	byte*			dst,
10346 	ulint*			dst_len)
10347 {
10348 	ut_ad(m_type != Encryption::NONE);
10349 	ut_ad(m_type != Encryption::KEYRING || m_key != NULL);
10350 	/* For encrypting redo log, take another way. */
10351 	ut_ad(!type.is_log());
10352 	/* Shouldn't encrypte an already encrypted page. */
10353 	ut_ad(!is_encrypted_page(src));
10354 
10355 #ifdef UNIV_ENCRYPT_DEBUG
10356 	ulint space_id =
10357 		mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
10358 	ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
10359 
10360 	fprintf(stderr, "Encrypting page:%lu.%lu len:%lu\n",
10361 		space_id, page_no, src_len);
10362 #endif
10363 
10364 	const uint16_t page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
10365 	// Destination header might need to acommodate key_version and checksum after encryption
10366 	const uint DST_HEADER_SIZE = (m_type == Encryption::KEYRING && page_type == FIL_PAGE_COMPRESSED)
10367 				     ? FIL_PAGE_DATA + 8 : FIL_PAGE_DATA;
10368 
10369 	/* This is data size which need to encrypt. */
10370 	ulint src_enc_len = src_len;
10371 
10372 	/* In FIL_PAGE_VERSION_2, we encrypt the actual compressed data length. */
10373 	if (page_type == FIL_PAGE_COMPRESSED) {
10374 		src_enc_len = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1) +
10375 									FIL_PAGE_DATA;
10376 		/* Extend src_enc_len if needed */
10377 		if (src_enc_len < MIN_ENCRYPTION_LEN) {
10378 			src_enc_len = MIN_ENCRYPTION_LEN;
10379 		}
10380 		ut_a(src_enc_len <= src_len);
10381 	}
10382 
10383 	/* Total length of the data to encrypt. */
10384 	ulint data_len = 0;
10385 	if (m_type == Encryption::KEYRING && page_type == FIL_PAGE_COMPRESSED) {
10386 		data_len = src_enc_len - FIL_PAGE_DATA;
10387 		// We need those 8 bytes for key_version and post-encryption checksum
10388 		// There need to be at least 8 bytes left
10389 		//ut_ad((uint)(*(src + src_len -8)) == 0);
10390 	} else if (m_type == Encryption::KEYRING && !type.is_page_zip_compressed()) {
10391 		// For keyring encryption we do not encrypt last four bytes which are
10392 		// equal to the LSN bytes in header so they are not encrypted anyway
10393 		data_len = src_enc_len - FIL_PAGE_DATA - 4;
10394 	} else {
10395 		data_len = src_enc_len - FIL_PAGE_DATA;
10396 	}
10397 
10398 	/* Only encrypt the data + trailer, leave the header alone */
10399 	switch (m_type) {
10400 	case Encryption::NONE:
10401 		ut_error;
10402 
10403 	case Encryption::KEYRING :
10404 		//fallthrough
10405 
10406 	case Encryption::AES: {
10407 		ut_ad(m_klen == ENCRYPTION_KEY_LEN);
10408 		ut_ad(m_iv != NULL);
10409 
10410 		/* Server encryption functions expect input data to be in
10411 		multiples of MY_AES_BLOCK SIZE. Therefore we encrypt the
10412 		overlapping data of the chunk_len and trailer_len twice.
10413 		First we encrypt the bigger chunk of data then we do the
10414 		trailer. The trailer encryption block starts at
10415 		2 * MY_AES_BLOCK_SIZE bytes offset from the end of the enc_len.
10416 		During decryption we do the reverse of the above process. */
10417 		ut_ad(data_len >= 2 * MY_AES_BLOCK_SIZE);
10418 
10419 		const ulint chunk_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
10420 		ulint remain_len = data_len - chunk_len;
10421 
10422 		lint elen = my_aes_encrypt(
10423 			src + FIL_PAGE_DATA, static_cast<uint32>(chunk_len),
10424 			dst + DST_HEADER_SIZE, reinterpret_cast<byte *>(m_key),
10425 			static_cast<uint32>(m_klen), my_aes_256_cbc,
10426 			reinterpret_cast<byte *>(m_iv), false);
10427 
10428 		ut_ad(elen != MY_AES_BAD_DATA);
10429 
10430 		if (elen == MY_AES_BAD_DATA) {
10431 			ulint	page_no =mach_read_from_4(
10432 				src + FIL_PAGE_OFFSET);
10433 			ulint	space_id = mach_read_from_4(
10434 				src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
10435 			*dst_len = src_len;
10436 #ifndef UNIV_INNOCHECKSUM
10437 				ib::error()
10438 					<< " Can't encrypt data of page,"
10439 					<< " page no:" << page_no
10440 					<< " space id:" << space_id;
10441 #else
10442 				fprintf(stderr, " Can't encrypt data of page,"
10443 					" page no:" ULINTPF
10444 					" space id:" ULINTPF,
10445 					page_no, space_id);
10446 #endif /* !UNIV_INNOCHECKSUM */
10447 			return(src);
10448 		}
10449 
10450 		const ulint len = static_cast<ulint>(elen);
10451 		ut_ad(len == chunk_len);
10452 
10453 		/* Encrypt the trailing bytes. */
10454 		if (remain_len != 0) {
10455 			/* Copy remaining bytes and page trailer. */
10456 			memcpy(dst + DST_HEADER_SIZE + len,
10457 			       src + FIL_PAGE_DATA + len,
10458 			       remain_len);
10459 
10460 			const ulint trailer_len = MY_AES_BLOCK_SIZE * 2;
10461 			byte buf[trailer_len];
10462 
10463 			elen = my_aes_encrypt(
10464 				dst + DST_HEADER_SIZE + data_len - trailer_len,
10465 				static_cast<uint32>(trailer_len), buf,
10466 				reinterpret_cast<unsigned char*>(m_key),
10467 				static_cast<uint32>(m_klen), my_aes_256_cbc,
10468 				reinterpret_cast<byte *>(m_iv), false);
10469 
10470 			ut_ad(elen != MY_AES_BAD_DATA);
10471 
10472 			if (elen == MY_AES_BAD_DATA) {
10473 				ulint	page_no =mach_read_from_4(
10474 					src + FIL_PAGE_OFFSET);
10475 				ulint	space_id = mach_read_from_4(
10476 					src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
10477 #ifndef UNIV_INNOCHECKSUM
10478 				ib::error()
10479 					<< " Can't encrypt data of page,"
10480 					<< " page no:" << page_no
10481 					<< " space id:" << space_id;
10482 #else
10483 				fprintf(stderr, " Can't encrypt data of page,"
10484 					" page no:" ULINTPF
10485 					" space id:" ULINTPF,
10486 					page_no, space_id);
10487 #endif /* !UNIV_INNOCHECKSUM */
10488 				*dst_len = src_len;
10489 				return(src);
10490 			}
10491 
10492       ut_a(static_cast<ulint>(elen) == trailer_len);
10493 
10494 			memcpy(dst + DST_HEADER_SIZE + data_len - trailer_len,
10495 			       buf, trailer_len);
10496 		}
10497 
10498 		break;
10499 	}
10500 
10501 	default:
10502 		ut_error;
10503 	}
10504 
10505 	/* Copy the header as is. */
10506 	memmove(dst, src, FIL_PAGE_DATA);
10507 	ut_ad(memcmp(src, dst, FIL_PAGE_DATA) == 0);
10508 
10509 	/* Add encryption control information. Required for decrypting. */
10510 	if (page_type == FIL_PAGE_COMPRESSED) {
10511 		/* If the page is compressed, we don't need to save the
10512 		original type, since it is done in compression already. */
10513 		mach_write_to_2(dst + FIL_PAGE_TYPE,
10514 				FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
10515 		ut_ad(memcmp(src+FIL_PAGE_TYPE+2,
10516 			     dst+FIL_PAGE_TYPE+2,
10517 			     FIL_PAGE_DATA-FIL_PAGE_TYPE-2) == 0);
10518 	} else if (page_type == FIL_PAGE_RTREE) {
10519 		/* If the page is R-tree page, we need to save original type. */
10520 		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED_RTREE);
10521 	} else{
10522 		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED);
10523 		mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
10524 	}
10525 
10526 	if (m_type == Encryption::KEYRING) {
10527 		/* handle post encryption checksum */
10528 		m_checksum = 0;
10529 
10530 		ut_ad(*dst_len == src_len);
10531 
10532 		if (page_type == FIL_PAGE_COMPRESSED) {
10533 			memset(dst + FIL_PAGE_DATA, 0, 4); // set the checksum data to 0s before the checksum is calculated
10534 			mach_write_to_4(dst + FIL_PAGE_DATA + 4, m_key_version); // Add it here so it would be included in the checksum
10535 		}
10536 
10537 		if (type.is_page_zip_compressed())
10538 			memcpy(dst + FIL_PAGE_ZIP_KEYRING_ENCRYPTION_MAGIC, ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC,
10539 			       ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN);
10540 
10541 #ifndef UNIV_INNOCHECKSUM //TODO: Robert - this might need to be included in innodbchecksum
10542 		uint page_size = *dst_len;
10543 		if (page_type == FIL_PAGE_COMPRESSED) {
10544 			page_size = static_cast<uint16_t>(mach_read_from_2(dst + FIL_PAGE_COMPRESS_SIZE_V1));
10545 		} else if (type.is_page_zip_compressed()) {
10546 			page_size = type.get_zip_page_physical_size();
10547 		}
10548 		m_checksum = fil_crypt_calculate_checksum(page_size, dst, type.is_page_zip_compressed());
10549 #endif
10550 		ut_ad(m_key_version != 0); // Since we are encrypting key_version cannot be 0 (i.e. page unencrypted)
10551 
10552 
10553 		if (page_type == FIL_PAGE_COMPRESSED) {
10554 			mach_write_to_4(dst +  FIL_PAGE_DATA, m_checksum);
10555 		} else if (!type.is_page_zip_compressed()) {
10556 			mach_write_to_4(dst +  FIL_PAGE_ENCRYPTION_KEY_VERSION, m_key_version);
10557 			ut_ad(m_checksum != 0);
10558 			mach_write_to_4(dst + *dst_len - 4, m_checksum);
10559 		}
10560 		else if (type.is_page_zip_compressed()) {
10561 			mach_write_to_4(dst +  FIL_PAGE_ENCRYPTION_KEY_VERSION, m_key_version);
10562 			ut_ad(m_key_version != 0);
10563 			uint32 innodb_checksum = mach_read_from_4(dst + FIL_PAGE_SPACE_OR_CHKSUM);
10564 			uint32 xor_checksum = innodb_checksum ^ m_checksum;
10565 			mach_write_to_4(dst +  FIL_PAGE_SPACE_OR_CHKSUM, xor_checksum);
10566 			ut_ad(m_checksum != 0);
10567 		}
10568 
10569 		#ifdef UNIV_ENCRYPT_DEBUG
10570 		ut_ad(type.is_page_zip_compressed() ||
10571 		fil_space_verify_crypt_checksum(dst, *dst_len, type.is_page_zip_compressed(), type.is_compressed())); // This works only for not zipped compressed pages
10572 		#endif
10573 	}
10574 
10575 #ifdef UNIV_ENCRYPT_DEBUG
10576 #ifndef UNIV_INNOCHECKSUM
10577 #if 1
10578         if (m_type == Encryption::KEYRING)
10579         {
10580 
10581           byte*	check_buf = static_cast<byte*>(ut_malloc_nokey(src_len));
10582           byte*	buf2 = static_cast<byte*>(ut_malloc_nokey(src_len));
10583 
10584           memcpy(check_buf, dst, src_len);
10585 
10586           fprintf(stderr, "Robert: Comparing before and after encryption");
10587 
10588           byte *m_key_used = m_key;
10589 
10590           if (m_type == Encryption::KEYRING) // TODO:Robert:For decryption KEYRING page key needs to be set to NULL
10591             m_key = NULL;
10592 
10593           dberr_t err = decrypt(type, check_buf, src_len, buf2, src_len);
10594           if (space_id == 23 && page_no == 1)
10595           {
10596               fprintf(stderr, "Robert: After encrypting page 23:1:");
10597               ut_print_buf(stderr, dst, src_len);
10598           }
10599 
10600           if (err != DB_SUCCESS || memcmp(src + FIL_PAGE_DATA,
10601                                           check_buf + FIL_PAGE_DATA,
10602                                           src_len - FIL_PAGE_DATA - 4) != 0) {
10603 
10604                   fprintf(stderr, "Robert: After and before encryption are different. "
10605                                   " key_version used for encryption: %d, key used for encryption:", m_key_version);
10606                   ut_print_buf(stderr, m_key_used, 32);
10607 	          m_key_version= mach_read_from_4(check_buf + FIL_PAGE_ENCRYPTION_KEY_VERSION);
10608                   fprintf(stderr, "Robert: After and before encryption are different. "
10609                                   " key_version used for decryption: %d, key used for decryption:", m_key_version);
10610 
10611                   size_t key_len;
10612                   get_tablespace_key(m_key_id, uuid, m_key_version, &m_key, &key_len);
10613                   ut_print_buf(stderr, m_key, 32);
10614 
10615                   ut_ad(0);
10616           }
10617           ut_free(buf2);
10618           ut_free(check_buf);
10619 
10620           ut_ad(type.is_page_zip_compressed() ||
10621                 fil_space_verify_crypt_checksum(dst, *dst_len, type.is_page_zip_compressed(), type.is_compressed()));
10622 
10623           ut_ad(type.is_page_zip_compressed() ||
10624                 fil_space_verify_crypt_checksum(dst, *dst_len, type.is_page_zip_compressed(), type.is_compressed()));
10625         }
10626 #endif
10627 	fprintf(stderr, "Encrypted page:%lu.%lu\n", space_id, page_no);
10628 
10629 #endif
10630 #endif
10631 
10632 #ifdef UNIV_ENCRYPT_DEBUG
10633 	fprintf(stderr, "Robert:Encrypted page:%lu.%lu\n", space_id, page_no);
10634 #endif
10635 
10636 #if !defined(UNIV_INNOCHECKSUM)
10637         srv_stats.pages_encrypted.inc();
10638 #endif
10639 
10640 	/* Add padding 0 for unused portion */
10641 	if (src_len > src_enc_len) {
10642 		memset(dst + DST_HEADER_SIZE + data_len, 0,
10643 					 src_len - DST_HEADER_SIZE - data_len);
10644 	}
10645 
10646 	*dst_len = src_len;
10647 
10648 	return(dst);
10649 }
10650 
10651 #ifndef UNIV_INNOCHECKSUM
10652 
10653 /** Decrypt the log block.
10654 @param[in]	type		IORequest
10655 @param[in,out]	src		Data read from disk, decrypted data will be
10656 				copied to this page
10657 @param[in,out]	dst		Scratch area to use for decryption
10658 @return DB_SUCCESS or error code */
10659 dberr_t
decrypt_log_block(const IORequest & type,byte * src,byte * dst)10660 Encryption::decrypt_log_block(const IORequest &type, byte* src, byte* dst) {
10661 	byte remain_buf[MY_AES_BLOCK_SIZE * 2];
10662 
10663 	const ulint unencrypted_trailer_size =
10664 	    (m_type == Encryption::KEYRING) ? LOG_BLOCK_TRL_SIZE : 0;
10665 	const ulint data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE -
10666 			       unencrypted_trailer_size;
10667 	const ulint main_len =
10668 	    (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
10669 	ulint remain_len = data_len - main_len;
10670 	byte* ptr = src + LOG_BLOCK_HDR_SIZE;
10671 
10672 	switch (m_type) {
10673 		case Encryption::KEYRING: {
10674 			const ulint block_crc =
10675 			    log_block_calc_checksum_crc32(src);
10676 			const ulint written_crc = log_block_get_checksum(src);
10677 
10678 			const ulint enc_key_version = written_crc - block_crc;
10679 
10680 			if (m_key_version != enc_key_version &&
10681 			    enc_key_version != REDO_LOG_ENCRYPT_NO_VERSION) {
10682 				redo_log_key* mkey =
10683 				    redo_log_key_mgr.load_key_version(
10684 					NULL, enc_key_version);
10685 				m_key_version = mkey->version;
10686 				m_key = reinterpret_cast<unsigned char *>(
10687 				    mkey->key);
10688 			}
10689 		}
10690 		/* FALLTHROUGH */
10691 		case Encryption::AES: {
10692 			lint elen;
10693 
10694 			/* First decrypt the last 2 blocks data of data, since
10695 			data is no block aligned. */
10696 			if (remain_len != 0) {
10697 				ut_ad(m_klen == ENCRYPTION_KEY_LEN);
10698 
10699 				remain_len = MY_AES_BLOCK_SIZE * 2;
10700 
10701 				/* Copy the last 2 blocks. */
10702 				memcpy(remain_buf,
10703 				       ptr + data_len - remain_len,
10704 				       remain_len);
10705 
10706 				elen = my_aes_decrypt(
10707 				    remain_buf,
10708 				    static_cast<uint32>(remain_len),
10709 				    dst + data_len - remain_len,
10710 				    reinterpret_cast<unsigned char *>(m_key),
10711 				    static_cast<uint32>(m_klen),
10712 				    my_aes_256_cbc,
10713 				    reinterpret_cast<unsigned char *>(m_iv),
10714 				    false);
10715 				if (elen == MY_AES_BAD_DATA) {
10716 					return (DB_IO_DECRYPT_FAIL);
10717 				}
10718 
10719 				/* Copy the other data bytes to temp area. */
10720 				memcpy(dst, ptr, data_len - remain_len);
10721 			} else {
10722 				ut_ad(data_len == main_len);
10723 
10724 				/* Copy the data bytes to temp area. */
10725 				memcpy(dst, ptr, data_len);
10726 			}
10727 
10728 			/* Then decrypt the main data */
10729 			elen = my_aes_decrypt(
10730 			    dst, static_cast<uint32>(main_len), ptr,
10731 			    reinterpret_cast<unsigned char *>(m_key),
10732 			    static_cast<uint32>(m_klen), my_aes_256_cbc,
10733 			    reinterpret_cast<unsigned char *>(m_iv), false);
10734 			if (elen == MY_AES_BAD_DATA) {
10735 				return (DB_IO_DECRYPT_FAIL);
10736 			}
10737 
10738 			ut_ad(static_cast<ulint>(elen) == main_len);
10739 
10740 			/* Copy the remaining bytes. */
10741 			memcpy(ptr + main_len, dst + main_len,
10742 			       data_len - main_len);
10743 
10744 			break;
10745 		}
10746 
10747 		default:
10748 			ib::error()
10749 			    << "Encryption algorithm support missing: "
10750 			    << Encryption::to_string(m_type);
10751 			return (DB_UNSUPPORTED);
10752 	}
10753 
10754 	ptr -= LOG_BLOCK_HDR_SIZE;
10755 
10756 #ifdef UNIV_ENCRYPT_DEBUG
10757 	fprintf(stderr, "Decrypted block %lu.\n", log_block_get_hdr_no(ptr));
10758 	ut_print_buf_hex(std::cerr, ptr, OS_FILE_LOG_BLOCK_SIZE);
10759 	fprintf(stderr, "\n");
10760 #endif
10761 
10762 	/* Reset the encrypted flag. */
10763 	log_block_set_encrypt_bit(ptr, false);
10764 
10765 	if (m_type == Encryption::KEYRING) {
10766 		const ulint crc = log_block_calc_checksum_crc32(src);
10767 		log_block_set_checksum(src, crc);
10768 	}
10769 
10770 	return (DB_SUCCESS);
10771 }
10772 
10773 /** Decrypt the log data contents.
10774 @param[in]	type		IORequest
10775 @param[in,out]	src		Data read from disk, decrypted data will be
10776 				copied to this page
10777 @param[in]	src_len		source data length
10778 @param[in,out]	dst		Scratch area to use for decryption
10779 @param[in]	dst_len		Size of the scratch area in bytes
10780 @return DB_SUCCESS or error code */
10781 dberr_t
decrypt_log(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)10782 Encryption::decrypt_log(const IORequest &type, byte* src, ulint src_len,
10783 			byte* dst, ulint dst_len) {
10784 	Block*  block;
10785 	byte*   ptr = src;
10786 	dberr_t ret;
10787 
10788 	ut_ad(type.is_log());
10789 
10790 	/* The caller doesn't know what to expect */
10791 	if (dst == NULL) {
10792 		block = os_alloc_block();
10793 #ifdef UNIV_INNOCHECKSUM
10794 		dst = block;
10795 #else
10796 		dst = block->m_ptr;
10797 #endif
10798 	} else {
10799 		block = NULL;
10800 	}
10801 
10802 	/* Encrypt the log blocks one by one. */
10803 	while (ptr != src + src_len) {
10804 		/* If it's not an encrypted block, skip it. */
10805 		if (!is_encrypted_log(ptr)) {
10806 			ptr += OS_FILE_LOG_BLOCK_SIZE;
10807 			continue;
10808 		}
10809 #ifdef UNIV_ENCRYPT_DEBUG
10810 		fprintf(stderr, "Decrypting block %lu.\n",
10811 			log_block_get_hdr_no(ptr));
10812 		ut_print_buf_hex(std::cerr, ptr, OS_FILE_LOG_BLOCK_SIZE);
10813 		fprintf(stderr, "\n");
10814 		ut_print_buf(stderr, m_key, 32);
10815 		ut_print_buf(stderr, m_iv, 32);
10816 #endif
10817 
10818 		/* Decrypt block */
10819 		ret = decrypt_log_block(type, ptr, dst);
10820 		if (ret != DB_SUCCESS) {
10821 			if (block != NULL) {
10822 				os_free_block(block);
10823 			}
10824 
10825 			return (ret);
10826 		}
10827 
10828 		ptr += OS_FILE_LOG_BLOCK_SIZE;
10829 	}
10830 
10831 	if (block != NULL) {
10832 		os_free_block(block);
10833 	}
10834 
10835 	return (DB_SUCCESS);
10836 }
10837 #endif
10838 
10839 /** Decrypt the page data contents. Page type must be FIL_PAGE_ENCRYPTED,
10840 if not then the source contents are left unchanged and DB_SUCCESS is returned.
10841 @param[in]	type		IORequest
10842 @param[in,out]	src		Data read from disk, decrypted data will be
10843 				copied to this page
10844 @param[in]	src_len		source data length
10845 @param[in,out]	dst		Scratch area to use for decryption
10846 @param[in]	dst_len		Size of the scratch area in bytes
10847 @return DB_SUCCESS or error code */
10848 dberr_t
decrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)10849 Encryption::decrypt(
10850 	const IORequest&	type,
10851 	byte*			src,
10852 	ulint			src_len,
10853 	byte*			dst,
10854 	ulint			dst_len)
10855 {
10856 	ulint		data_len;
10857 	ulint		main_len;
10858 	ulint		remain_len;
10859 	ulint		original_type;
10860 	ulint		page_type;
10861 	byte		remain_buf[MY_AES_BLOCK_SIZE * 2];
10862 	Block*		block;
10863 
10864 #ifndef UNIV_INNOCHECKSUM
10865 	/* Do nothing if it's not an encrypted table. */
10866 	if (!Encryption::is_encrypted_page(src)) {
10867 		return(DB_SUCCESS);
10868 	}
10869 	if (m_type == Encryption::KEYRING && type.is_page_zip_compressed()) {
10870 		uint32 post_enc_checksum = fil_crypt_calculate_checksum(type.get_zip_page_physical_size(), src, type.is_page_zip_compressed());
10871 		 uint32 xor_checksum = mach_read_from_4(src + FIL_PAGE_SPACE_OR_CHKSUM);
10872 		ut_ad(xor_checksum != 0);
10873 		uint32 innodb_checksum = xor_checksum ^ post_enc_checksum;
10874 		mach_write_to_4(src +  FIL_PAGE_SPACE_OR_CHKSUM, innodb_checksum);
10875 	}
10876 #endif
10877 
10878 	/* For compressed page, we need to get the compressed size
10879 	for decryption */
10880 	page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
10881 	if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
10882 		src_len = static_cast<uint16_t>(
10883 			mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1))
10884 			+ FIL_PAGE_DATA;
10885 #ifndef UNIV_INNOCHECKSUM
10886 		Compression::meta_t header;
10887 		Compression::deserialize_header(src, &header);
10888 		if (header.m_version == Compression::FIL_PAGE_VERSION_1) {
10889 			src_len = ut_calc_align(src_len, type.block_size());
10890 		} else {
10891 			/* Extend src_len if needed */
10892 			if (src_len < MIN_ENCRYPTION_LEN) {
10893 				src_len = MIN_ENCRYPTION_LEN;
10894 			}
10895 		}
10896 #endif
10897 	}
10898 #ifdef UNIV_ENCRYPT_DEBUG
10899 	ulint space_id =
10900 		mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
10901 	ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
10902 
10903 	fprintf(stderr, "Decrypting page:%lu.%lu len:%lu\n",
10904 		space_id, page_no, src_len);
10905 #endif
10906 	const uint HEADER_SIZE = (m_type == Encryption::KEYRING && page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED)
10907 				  ? FIL_PAGE_DATA + 8 : FIL_PAGE_DATA;
10908 
10909 	original_type = static_cast<uint16_t>(
10910 		mach_read_from_2(src + FIL_PAGE_ORIGINAL_TYPE_V1));
10911 
10912 	byte*	ptr = src + HEADER_SIZE;
10913 
10914 	/* The caller doesn't know what to expect */
10915 	if (dst == NULL) {
10916 
10917 		block = os_alloc_block();
10918 #ifdef UNIV_INNOCHECKSUM
10919 		dst = block;
10920 #else
10921 		dst = block->m_ptr;
10922 #endif /* UNIV_INNOCHECKSUM */
10923 
10924 	} else {
10925 		block = NULL;
10926 	}
10927 
10928 	ut_ad(m_key != NULL);
10929 
10930   data_len = src_len - HEADER_SIZE;
10931 
10932 	if (m_type == Encryption::KEYRING
10933 			&& page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
10934 		// There are 8 bytes after the header used for key_version and checksum
10935 		data_len += 8;
10936 	} else if (page_type == FIL_PAGE_ENCRYPTED && m_type == Encryption::KEYRING
10937 						 && !type.is_page_zip_compressed()) {
10938 		data_len -= 4;  // Last 4 bytes are not encrypted
10939 	}
10940 
10941 	main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
10942 	remain_len = data_len - main_len;
10943 
10944 	switch(m_type) {
10945 	case Encryption::KEYRING:
10946 	case Encryption::AES: {
10947 		lint			elen;
10948 
10949 		/* First decrypt the last 2 blocks data of data, since
10950 		data is no block aligned. */
10951 		if (remain_len != 0) {
10952 			ut_ad(m_klen == ENCRYPTION_KEY_LEN);
10953 			ut_ad(m_iv != NULL);
10954 
10955 			remain_len = MY_AES_BLOCK_SIZE * 2;
10956 
10957 			/* Copy the last 2 blocks. */
10958 			memcpy(remain_buf,
10959 			       ptr + data_len - remain_len,
10960 			       remain_len);
10961 
10962 			elen = my_aes_decrypt(
10963 				remain_buf,
10964 				static_cast<uint32>(remain_len),
10965 				dst + data_len - remain_len,
10966 				reinterpret_cast<unsigned char*>(m_key),
10967 				static_cast<uint32>(m_klen),
10968 				my_aes_256_cbc,
10969 				reinterpret_cast<unsigned char*>(m_iv),
10970 				false);
10971 
10972 			ut_ad(elen != MY_AES_BAD_DATA);
10973 
10974 			if (elen == MY_AES_BAD_DATA) {
10975 				if (block != NULL) {
10976 					os_free_block(block);
10977 				}
10978 
10979 				return(DB_IO_DECRYPT_FAIL);
10980 			}
10981 
10982 			/* Copy the other data bytes to temp area. */
10983 			memcpy(dst, ptr, data_len - remain_len);
10984 		} else {
10985 			ut_ad(data_len == main_len);
10986 
10987 			/* Copy the data bytes to temp area. */
10988 			memcpy(dst, ptr, data_len);
10989 		}
10990 
10991 		if (m_type == Encryption::KEYRING && page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
10992 			ptr -= 8; //This much is unused as it was previously used by key version and encrypted checksum
10993 			//It is not needed - overwrite this with decrypted data
10994 			memset(ptr + data_len, 0, 8);
10995 		}
10996 
10997 		/* Then decrypt the main data */
10998 		elen = my_aes_decrypt(
10999 				dst,
11000 				static_cast<uint32>(main_len),
11001 				ptr,
11002 				reinterpret_cast<unsigned char*>(m_key),
11003 				static_cast<uint32>(m_klen),
11004 				my_aes_256_cbc,
11005 				reinterpret_cast<unsigned char*>(m_iv),
11006 				false);
11007 		if (elen == MY_AES_BAD_DATA) {
11008 
11009 			if (block != NULL) {
11010 				os_free_block(block);
11011 			}
11012 
11013 			return(DB_IO_DECRYPT_FAIL);
11014 		}
11015 
11016 		ut_ad(static_cast<ulint>(elen) == main_len);
11017 
11018 		/* Copy the remain bytes. */
11019 		memcpy(ptr + main_len, dst + main_len, data_len - main_len);
11020 
11021 		break;
11022 	}
11023 
11024 	default:
11025 		if (!type.is_dblwr_recover()) {
11026 #if !defined(UNIV_INNOCHECKSUM)
11027 			ib::error()
11028 				<< "Encryption algorithm support missing: "
11029 				<< Encryption::to_string(m_type);
11030 #else
11031 			fprintf(stderr, "Encryption algorithm support missing: %s\n",
11032 				Encryption::to_string(m_type));
11033 #endif /* !UNIV_INNOCHECKSUM */
11034 		}
11035 
11036 		if (block != NULL) {
11037 			os_free_block(block);
11038 		}
11039 
11040 		return(DB_UNSUPPORTED);
11041 	}
11042 
11043 	if (m_type == Encryption::KEYRING && page_type != FIL_PAGE_COMPRESSED_AND_ENCRYPTED
11044 	    && !type.is_page_zip_compressed()) {
11045 		//restore LSN
11046 		memcpy(src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, src + FIL_PAGE_LSN + 4, 4);
11047 	}
11048 
11049 	/* Restore the original page type. If it's a compressed and
11050 	encrypted page, just reset it as compressed page type, since
11051 	we will do uncompress later. */
11052 	if (page_type == FIL_PAGE_ENCRYPTED) {
11053 		mach_write_to_2(src + FIL_PAGE_TYPE, original_type);
11054 	} else if (page_type == FIL_PAGE_ENCRYPTED_RTREE) {
11055 		mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_RTREE);
11056 	} else {
11057 		ut_ad(page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
11058 		mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
11059 	}
11060 
11061 	// mark orignal page_type as encrypted - so that when checksum check fail - we will be able
11062 	// to report that if failed because decryption failed
11063 	if (original_type != FIL_PAGE_TYPE_ALLOCATED && page_type != FIL_PAGE_COMPRESSED_AND_ENCRYPTED)
11064 		mach_write_to_2(src + FIL_PAGE_ORIGINAL_TYPE_V1, FIL_PAGE_ENCRYPTED);
11065 
11066 	if (block != NULL) {
11067 		os_free_block(block);
11068 	}
11069 
11070 	if (m_type == Encryption::KEYRING && type.is_page_zip_compressed())
11071 		memset(src + FIL_PAGE_ZIP_KEYRING_ENCRYPTION_MAGIC, 0, ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN);
11072 #ifdef UNIV_ENCRYPT_DEBUG
11073 	fprintf(stderr, "Decrypted page:%lu.%lu\n", space_id, page_no);
11074 #endif
11075 
11076 	DBUG_EXECUTE_IF("ib_crash_during_decrypt_page", DBUG_SUICIDE(););
11077 
11078 #if !defined(UNIV_INNOCHECKSUM)
11079 	srv_stats.pages_decrypted.inc();
11080 #endif
11081 
11082 	return(DB_SUCCESS);
11083 }
11084 
11085 #ifndef UNIV_INNOCHECKSUM
11086 
11087 /** Check if keyring plugin loaded. */
check_keyring()11088 bool Encryption::check_keyring()
11089 {
11090 	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
11091 	memset(key_name, 0, ENCRYPTION_KEY_LEN);
11092 	strcpy(key_name, ENCRYPTION_DEFAULT_MASTER_KEY);
11093 
11094 	/* We call key ring API to generate master key here. */
11095 	int	my_ret = my_key_generate(key_name, "AES",
11096 					 NULL, ENCRYPTION_KEY_LEN);
11097 
11098 	/* We call key ring API to get master key here. */
11099 	if (my_ret != 0) {
11100 		char*	key_type = NULL;
11101 		char*	master_key = NULL;
11102 		size_t	key_len;
11103 		my_ret = my_key_fetch(key_name, &key_type, NULL,
11104 				      reinterpret_cast<void**>(&master_key),
11105 				      &key_len);
11106 
11107 		my_free(key_type);
11108 		my_free(master_key);
11109 	}
11110 
11111 	if (my_ret) {
11112 		ib::error() << "keyring error: please check that a"
11113 			       " keyring plugin is loaded.";
11114 	} else {
11115 		my_key_remove(key_name, NULL);
11116 		return(true);
11117 	}
11118 
11119 	return(false);
11120 }
11121 
11122 /** Encrypt a doublewrite buffer page. The page is encrypted
11123 using the key of tablespace object provided.
11124 Caller should allocate buffer for encrypted page
11125 @param[in]	space			tablespace object
11126 @param[in]	in_page			unencrypted page
11127 @param[in,out]	encrypted_buf		buffer to hold the encrypted page
11128 @param[in]	encrypted_buf_len	length of the encrypted buffer
11129 @return true on success, false on failure */
11130 bool
os_dblwr_encrypt_page(fil_space_t * space,page_t * in_page,page_t * encrypted_buf,ulint encrypted_buf_len)11131 os_dblwr_encrypt_page(
11132 	fil_space_t*	space,
11133 	page_t*		in_page,
11134 	page_t*		encrypted_buf,
11135 	ulint		encrypted_buf_len)
11136 {
11137 	if (!FSP_FLAGS_GET_ENCRYPTION(space->flags)) {
11138 		return(false);
11139 	}
11140 
11141 	IORequest	write_request(IORequest::WRITE);
11142 	page_size_t	page_size(space->flags);
11143 
11144 	write_request.encryption_key(
11145 		space->encryption_key,
11146 		space->encryption_klen,
11147 		false,
11148 		space->encryption_iv,
11149 		0, 0, NULL, NULL);
11150 	write_request.encryption_algorithm(
11151 		Encryption::AES);
11152 
11153 	ulint 	bytes = page_size.physical();
11154 
11155 	/* After successful encryption, in_page will point
11156 	to a new memory block which is encrypted and
11157 	the bytes will have value of length of encrypted data */
11158 	void*	in_page_before	= in_page;
11159 	Block*	block = os_file_encrypt_page(
11160 		write_request,
11161 		in_page_before,
11162 		&bytes);
11163 
11164 	ut_ad(block != NULL);
11165 
11166 	if (in_page_before == in_page) {
11167 		os_free_block(block);
11168 		return(false);
11169 	}
11170 
11171 	ut_ad(bytes == page_size.physical());
11172 	ut_ad(bytes <= encrypted_buf_len);
11173 
11174 	memcpy(encrypted_buf, in_page_before /*encrypted page*/,
11175 	       bytes);
11176 
11177 	os_free_block(block);
11178 	return(true);
11179 }
11180 
11181 /** Decrypt a page from doublewrite buffer. Tablespace object
11182 (fil_space_t) must have encryption key, iv set properly.
11183 The decrpyted page will be written in the same buffer of input page.
11184 @param[in]	space	tablespace obejct
11185 @param[in,out]	page	in: encrypted page
11186 			out: decrypted page
11187 @return DB_SUCCESS on success, others on failure */
11188 dberr_t
os_dblwr_decrypt_page(fil_space_t * space,page_t * page)11189 os_dblwr_decrypt_page(
11190 	fil_space_t*		space,
11191 	page_t*			page)
11192 {
11193 	if (!FSP_FLAGS_GET_ENCRYPTION(space->flags)) {
11194 		return(DB_SUCCESS);
11195 	}
11196 
11197 	page_size_t	page_size(space->flags);
11198 
11199 	IORequest	decrypt_request;
11200 
11201 	decrypt_request.encryption_key(
11202 			space->encryption_key,
11203 			space->encryption_klen,
11204 			false,
11205 			space->encryption_iv,
11206 			0, 0, NULL, NULL);
11207 
11208 	decrypt_request.encryption_algorithm(
11209 		Encryption::AES);
11210 
11211 	Encryption	encryption(
11212 		decrypt_request.encryption_algorithm());
11213 
11214 	dberr_t	err = encryption.decrypt(
11215 		decrypt_request,
11216 		page, page_size.physical(), NULL,
11217 		page_size.physical());
11218 
11219 	ut_ad(err == DB_SUCCESS);
11220 	return(err);
11221 }
11222 
11223 #endif
11224 
11225 /** Normalizes a directory path for the current OS:
11226 On Windows, we convert '/' to '\', else we convert '\' to '/'.
11227 @param[in,out] str A null-terminated directory and file path */
11228 void
os_normalize_path(char * str)11229 os_normalize_path(
11230 	char*	str)
11231 {
11232 	if (str != NULL) {
11233 		for (; *str; str++) {
11234 			if (*str == OS_PATH_SEPARATOR_ALT) {
11235 				*str = OS_PATH_SEPARATOR;
11236 			}
11237 		}
11238 	}
11239 }
11240