1 /***********************************************************************
2 
3 Copyright (c) 1995, 2020, Oracle and/or its affiliates.
4 Copyright (c) 2009, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License
30 along with this program; if not, write to the Free Software
31 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
32 
33 ***********************************************************************/
34 
35 /** @file os/os0file.cc
36  The interface to the operating system file i/o primitives
37 
38  Created 10/21/1995 Heikki Tuuri
39  *******************************************************/
40 
41 #include "os0file.h"
42 #include "fil0fil.h"
43 #include "ha_prototypes.h"
44 #include "log0log.h"
45 #include "my_dbug.h"
46 #include "my_io.h"
47 
48 #include "fil0fil.h"
49 #include "ha_prototypes.h"
50 #include "os0file.h"
51 #include "sql_const.h"
52 #include "srv0srv.h"
53 #include "srv0start.h"
54 #ifndef UNIV_HOTBACKUP
55 #include "os0event.h"
56 #include "os0thread.h"
57 #endif /* !UNIV_HOTBACKUP */
58 
59 #ifdef _WIN32
60 #include <errno.h>
61 #include <mbstring.h>
62 #include <sys/stat.h>
63 #include <tchar.h>
64 #include <codecvt>
65 #endif /* _WIN32 */
66 
67 #ifdef __linux__
68 #include <sys/sendfile.h>
69 #endif /* __linux__ */
70 
71 #ifdef LINUX_NATIVE_AIO
72 #ifndef UNIV_HOTBACKUP
73 #include <libaio.h>
74 #else /* !UNIV_HOTBACKUP */
75 #undef LINUX_NATIVE_AIO
76 #endif /* !UNIV_HOTBACKUP */
77 #endif /* LINUX_NATIVE_AIO */
78 
79 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
80 #include <fcntl.h>
81 #include <linux/falloc.h>
82 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
83 
84 #include <errno.h>
85 #include <lz4.h>
86 #include "my_aes.h"
87 #include "my_rnd.h"
88 #include "mysql/service_mysql_keyring.h"
89 #include "mysqld.h"
90 
91 #include <sys/types.h>
92 #include <zlib.h>
93 #include <ctime>
94 #include <functional>
95 #include <new>
96 #include <vector>
97 
98 #ifdef UNIV_HOTBACKUP
99 #include <data0type.h>
100 #endif /* UNIV_HOTBACKUP */
101 
/** Flush after each os_fsync_threshold bytes. Initialized to 0 here. */
unsigned long long os_fsync_threshold = 0;

/** Insert buffer segment id */
static const ulint IO_IBUF_SEGMENT = 0;

/** Log segment id */
static const ulint IO_LOG_SEGMENT = 1;

/** Number of retries for partial I/O's */
static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;

/** For storing the allocated blocks */
using Blocks = std::vector<file::Block>;

/** Block collection. Declared without an initializer, so as an object
with static storage duration it starts out as a null pointer. */
static Blocks *block_cache;

/** Number of blocks to allocate for sync read/writes */
static const size_t MAX_BLOCKS = 128;

/** Block buffer size: 1.3 times UNIV_PAGE_SIZE, converted to ulint (the
cast discards any fractional part of the product). */
#define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))

/** Disk sector size of aligning write buffer for DIRECT_IO. Starts out as
UNIV_SECTOR_SIZE; it is not const, so it may be changed later. */
static ulint os_io_ptr_align = UNIV_SECTOR_SIZE;
128 
/** Probe whether files under the data home directory can be opened with
O_DIRECT. A scratch file named "o_direct_test" is created there (or in the
current directory when srv_data_home is empty) and removed again when the
open succeeded. On builds where the probe is compiled out the answer is
always false.
@retval	true	if the scratch file could be opened with O_DIRECT.
@retval	false	if the open failed or the probe is not compiled in. */
bool os_is_o_direct_supported() {
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
  static const char probe_name[] = "o_direct_test";

  /* With an empty data home, probe relative to the current directory. */
  char cwd[] = {FN_CURLIB, FN_LIBCHAR, 0};
  char *dir = srv_data_home;

  if (dir[0] == 0) {
    dir = cwd;
  }

  /* Work out whether a separator has to be inserted between the
  directory and the file name. */
  const ulint dir_chars = strlen(dir);
  const bool needs_separator = (dir[dir_chars - 1] != OS_PATH_SEPARATOR);
  const ulint prefix_len = needs_separator ? dir_chars + 1 : dir_chars;

  /* sizeof probe_name includes the terminating NUL of the full path. */
  char *probe_path =
      static_cast<char *>(ut_zalloc_nokey(prefix_len + sizeof probe_name));

  memcpy(probe_path, dir, dir_chars);

  if (needs_separator) {
    probe_path[dir_chars] = OS_PATH_SEPARATOR;
  }

  memcpy(probe_path + prefix_len, probe_name, sizeof probe_name);

  /* The actual probe: create the scratch file with O_DIRECT. */
  const os_file_t fd =
      ::open(probe_path, O_CREAT | O_TRUNC | O_WRONLY | O_DIRECT, S_IRWXU);

  const bool supported = (fd != -1);

  if (supported) {
    /* Only a successfully created file is closed and removed. */
    ::close(fd);
    unlink(probe_path);
  }

  ut_free(probe_path);

  return (supported);
#else
  return (false);
#endif /* !NO_FALLOCATE && UNIV_LINUX */
}
192 
/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask */

#ifndef _WIN32
/** Umask for creating files. The initial value grants read and write
permission to the owner and to the group. */
ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
#else
/** Umask for creating files. Initialized to 0 on Windows builds. */
ulint os_innodb_umask = 0;

/* On Windows when using native AIO the number of AIO requests
that a thread can handle at a given time is limited to 32
i.e.: SRV_N_PENDING_IOS_PER_THREAD */
#define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD

#endif /* _WIN32 */

/** In simulated aio, merge at most this many consecutive i/os */
static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;

/** Checks if the page_cleaner is in active state.
Only declared here; the definition is not in this part of the file.
NOTE(review): presumably returns true when the page cleaner is active --
confirm against the definition. */
bool buf_flush_page_cleaner_is_active();
217 #ifndef UNIV_HOTBACKUP
218 /**********************************************************************
219 
220 InnoDB AIO Implementation:
221 =========================
222 
223 We support native AIO for Windows and Linux. For rest of the platforms
224 we simulate AIO by special IO-threads servicing the IO-requests.
225 
226 Simulated AIO:
227 ==============
228 
229 On platforms where we 'simulate' AIO, the following is a rough explanation
230 of the high level design.
231 There are four io-threads (for ibuf, log, read, write).
232 All synchronous IO requests are serviced by the calling thread using
233 os_file_write/os_file_read. The Asynchronous requests are queued up
234 in an array (there are four such arrays) by the calling thread.
235 Later these requests are picked up by the IO-thread and are serviced
236 synchronously.
237 
238 Windows native AIO:
239 ==================
240 
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file is
opened for AIO no synchronous IO can be done on it. Therefore we have an
extra fifth array to queue up synchronous IO requests.
246 There are innodb_file_io_threads helper threads. These threads work
247 on the four arrays mentioned above in Simulated AIO. No thread is
248 required for the sync array.
249 If a synchronous IO request is made, it is first queued in the sync
250 array. Then the calling thread itself waits on the request, thus
251 making the call synchronous.
252 If an AIO request is made the calling thread not only queues it in the
253 array but also submits the requests. The helper thread then collects
254 the completed IO request and calls completion routine on it.
255 
256 Linux native AIO:
257 =================
258 
259 If we have libaio installed on the system and innodb_use_native_aio
260 is set to true we follow the code path of native AIO, otherwise we
261 do simulated AIO.
262 There are innodb_file_io_threads helper threads. These threads work
263 on the four arrays mentioned above in Simulated AIO.
264 If a synchronous IO request is made, it is handled by calling
265 os_file_write/os_file_read.
266 If an AIO request is made the calling thread not only queues it in the
267 array but also submits the requests. The helper thread then collects
268 the completed IO request and calls completion routine on it.
269 
270 **********************************************************************/
271 
#ifdef UNIV_PFS_IO
/* Keys to register InnoDB I/O with performance schema. They are only
defined when UNIV_PFS_IO is enabled.
NOTE(review): going by the names there is one key per file category (redo
log, data, temporary, doublewrite, archive, clone) -- confirm against the
code that registers them. */
mysql_pfs_key_t innodb_log_file_key;
mysql_pfs_key_t innodb_data_file_key;
mysql_pfs_key_t innodb_temp_file_key;
mysql_pfs_key_t innodb_dblwr_file_key;
mysql_pfs_key_t innodb_arch_file_key;
mysql_pfs_key_t innodb_clone_file_key;
#endif /* UNIV_PFS_IO */
281 
282 #endif /* !UNIV_HOTBACKUP */
283 /** The asynchronous I/O context */
284 struct Slot {
285   /** Default constructor/assignment etc. are OK */
286 
287   /** index of the slot in the aio array */
288   uint16_t pos{0};
289 
290   /** true if this slot is reserved */
291   bool is_reserved{false};
292 
293   /** time when reserved */
294   ib_time_monotonic_t reservation_time{0};
295 
296   /** buffer used in i/o */
297   byte *buf{nullptr};
298 
299   /** Buffer pointer used for actual IO. We advance this
300   when partial IO is required and not buf */
301   byte *ptr{nullptr};
302 
303   /** OS_FILE_READ or OS_FILE_WRITE */
304   IORequest type{IORequest::UNSET};
305 
306   /** file offset in bytes */
307   os_offset_t offset{0};
308 
309   /** file where to read or write */
310   pfs_os_file_t file{
311 #ifdef UNIV_PFS_IO
312       nullptr,  // m_psi
313 #endif
314       0  // m_file
315   };
316 
317   /** file name or path */
318   const char *name{nullptr};
319 
320   /** used only in simulated aio: true if the physical i/o
321   already made and only the slot message needs to be passed
322   to the caller of os_aio_simulated_handle */
323   bool io_already_done{false};
324 
325   /** The file node for which the IO is requested. */
326   fil_node_t *m1{nullptr};
327 
328   /** the requester of an aio operation and which can be used
329   to identify which pending aio operation was completed */
330   void *m2{nullptr};
331 
332   /** AIO completion status */
333   dberr_t err{DB_ERROR_UNSET};
334 
335 #ifdef WIN_ASYNC_IO
336   /** handle object we need in the OVERLAPPED struct */
337   HANDLE handle{INVALID_HANDLE_VALUE};
338 
339   /** Windows control block for the aio request */
340   OVERLAPPED control{0, 0};
341 
342   /** bytes written/read */
343   DWORD n_bytes{0};
344 
345   /** length of the block to read or write */
346   DWORD len{0};
347 
348 #elif defined(LINUX_NATIVE_AIO)
349   /** Linux control block for aio */
350   struct iocb control;
351 
352   /** AIO return code */
353   int ret{0};
354 
355   /** bytes written/read. */
356   ssize_t n_bytes{0};
357 
358   /** length of the block to read or write */
359   ulint len{0};
360 #else
361   /** length of the block to read or write */
362   ulint len{0};
363 
364   /** bytes written/read. */
365   ulint n_bytes{0};
366 #endif /* WIN_ASYNC_IO */
367 
368   /** Length of the block before it was compressed */
369   uint32 original_len{0};
370 
371   /** Buffer block for compressed pages or encrypted pages */
372   file::Block *buf_block{nullptr};
373 
374   /** true, if we shouldn't punch a hole after writing the page */
375   bool skip_punch_hole{false};
376 
377   /** Buffer for encrypt log */
378   void *encrypt_log_buf{nullptr};
379 
SlotSlot380   Slot() {
381 #if defined(LINUX_NATIVE_AIO)
382     memset(&control, 0, sizeof(control));
383 #endif /* LINUX_NATIVE_AIO */
384   }
385 };
386 
387 /** The asynchronous i/o array structure */
388 class AIO {
389  public:
390   /** Constructor
391   @param[in]	id		Latch ID
392   @param[in]	n		Number of slots to configure
393   @param[in]	segments	Number of segments to configure */
394   AIO(latch_id_t id, ulint n, ulint segments);
395 
396   /** Destructor */
397   ~AIO();
398 
399   /** Initialize the instance
400   @return DB_SUCCESS or error code */
401   dberr_t init();
402 
403   /** Requests for a slot in the aio array. If no slot is available, waits
404   until not_full-event becomes signaled.
405 
406   @param[in,out]	type	IO context
407   @param[in,out]	m1	message to be passed along with the AIO
408                           operation
409   @param[in,out]	m2	message to be passed along with the AIO
410                           operation
411   @param[in]	file	file handle
412   @param[in]	name	name of the file or path as a null-terminated
413                           string
414   @param[in,out]	buf	buffer where to read or from which to write
415   @param[in]	offset	file offset, where to read from or start writing
416   @param[in]	len	length of the block to read or write
417   @return pointer to slot */
418   Slot *reserve_slot(IORequest &type, fil_node_t *m1, void *m2,
419                      pfs_os_file_t file, const char *name, void *buf,
420                      os_offset_t offset, ulint len)
421       MY_ATTRIBUTE((warn_unused_result));
422 
423   /** @return number of reserved slots */
424   ulint pending_io_count() const;
425 
426   /** Returns a pointer to the nth slot in the aio array.
427   @param[in]	i	Index of the slot in the array
428   @return pointer to slot */
at(ulint i) const429   const Slot *at(ulint i) const MY_ATTRIBUTE((warn_unused_result)) {
430     ut_a(i < m_slots.size());
431 
432     return (&m_slots[i]);
433   }
434 
435   /** Non const version */
at(ulint i)436   Slot *at(ulint i) MY_ATTRIBUTE((warn_unused_result)) {
437     if (i >= m_slots.size()) {
438       ib::fatal(ER_IB_MSG_1357) << "i: " << i << " slots: " << m_slots.size();
439     }
440 
441     return (&m_slots[i]);
442   }
443 
444   /** Frees a slot in the AIO array, assumes caller owns the mutex.
445   @param[in,out]	slot	Slot to release */
446   void release(Slot *slot);
447 
448   /** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
449   @param[in,out]	slot	Slot to release */
450   void release_with_mutex(Slot *slot);
451 
452   /** Prints info about the aio array.
453   @param[in,out]	file	Where to print */
454   void print(FILE *file);
455 
456   /** @return the number of slots per segment */
slots_per_segment() const457   ulint slots_per_segment() const MY_ATTRIBUTE((warn_unused_result)) {
458     return (m_slots.size() / m_n_segments);
459   }
460 
461   /** @return accessor for n_segments */
get_n_segments() const462   ulint get_n_segments() const MY_ATTRIBUTE((warn_unused_result)) {
463     return (m_n_segments);
464   }
465 
466 #ifdef UNIV_DEBUG
467   /** @return true if the thread owns the mutex */
is_mutex_owned() const468   bool is_mutex_owned() const MY_ATTRIBUTE((warn_unused_result)) {
469     return (mutex_own(&m_mutex));
470   }
471 #endif /* UNIV_DEBUG */
472 
473   /** Acquire the mutex */
acquire() const474   void acquire() const { mutex_enter(&m_mutex); }
475 
476   /** Release the mutex */
release() const477   void release() const { mutex_exit(&m_mutex); }
478 
479   /** Write out the state to the file/stream
480   @param[in, out]	file	File to write to */
481   void to_file(FILE *file) const;
482 
483 #ifdef LINUX_NATIVE_AIO
484   /** Dispatch an AIO request to the kernel.
485   @param[in,out]	slot	an already reserved slot
486   @return true on success. */
487   bool linux_dispatch(Slot *slot) MY_ATTRIBUTE((warn_unused_result));
488 
489   /** Accessor for an AIO event
490   @param[in]	index	Index into the array
491   @return the event at the index */
io_events(ulint index)492   io_event *io_events(ulint index) MY_ATTRIBUTE((warn_unused_result)) {
493     ut_a(index < m_events.size());
494 
495     return (&m_events[index]);
496   }
497 
498   /** Accessor for the AIO context
499   @param[in]	segment	Segment for which to get the context
500   @return the AIO context for the segment */
io_ctx(ulint segment)501   io_context *io_ctx(ulint segment) MY_ATTRIBUTE((warn_unused_result)) {
502     ut_ad(segment < get_n_segments());
503 
504     return (m_aio_ctx[segment]);
505   }
506 
507   /** Creates an io_context for native linux AIO.
508   @param[in]	max_events	number of events
509   @param[out]	io_ctx		io_ctx to initialize.
510   @return true on success. */
511   static bool linux_create_io_ctx(ulint max_events, io_context_t *io_ctx)
512       MY_ATTRIBUTE((warn_unused_result));
513 
514   /** Checks if the system supports native linux aio. On some kernel
515   versions where native aio is supported it won't work on tmpfs. In such
516   cases we can't use native aio as it is not possible to mix simulated
517   and native aio.
518   @return true if supported, false otherwise. */
519   static bool is_linux_native_aio_supported()
520       MY_ATTRIBUTE((warn_unused_result));
521 #endif /* LINUX_NATIVE_AIO */
522 
523 #ifdef WIN_ASYNC_IO
524   /** Wakes up all async i/o threads in the array in Windows async I/O at
525   shutdown. */
signal()526   void signal() {
527     for (ulint i = 0; i < m_slots.size(); ++i) {
528       SetEvent(m_slots[i].handle);
529     }
530   }
531 
532   /** Wake up all AIO threads in Windows native aio */
wake_at_shutdown()533   static void wake_at_shutdown() {
534     s_reads->signal();
535 
536     if (s_writes != NULL) {
537       s_writes->signal();
538     }
539 
540     if (s_ibuf != NULL) {
541       s_ibuf->signal();
542     }
543 
544     if (s_log != NULL) {
545       s_log->signal();
546     }
547   }
548 #endif /* WIN_ASYNC_IO */
549 
550 #ifdef _WIN32
551   /** This function can be called if one wants to post a batch of reads
552   and prefers an I/O - handler thread to handle them all at once later.You
553   must call os_aio_simulated_wake_handler_threads later to ensure the
554   threads are not left sleeping! */
555   static void simulated_put_read_threads_to_sleep();
556 
557   /** The non asynchronous IO array.
558   @return the synchronous AIO array instance. */
sync_array()559   static AIO *sync_array() MY_ATTRIBUTE((warn_unused_result)) { return s_sync; }
560 
561   /**
562   Get the AIO handles for a segment.
563   @param[in]	segment		The local segment.
564   @return the handles for the segment. */
handles(ulint segment)565   HANDLE *handles(ulint segment) MY_ATTRIBUTE((warn_unused_result)) {
566     ut_ad(segment < m_handles->size() / slots_per_segment());
567 
568     return (&(*m_handles)[segment * slots_per_segment()]);
569   }
570 
571   /** @return true if no slots are reserved */
is_empty() const572   bool is_empty() const MY_ATTRIBUTE((warn_unused_result)) {
573     ut_ad(is_mutex_owned());
574     return (m_n_reserved == 0);
575   }
576 #endif /* _WIN32 */
577 
578   /** Create an instance using new(std::nothrow)
579   @param[in]	id		Latch ID
580   @param[in]	n		The number of AIO request slots
581   @param[in]	n_segments	The number of segments
582   @return a new AIO instance */
583   static AIO *create(latch_id_t id, ulint n, ulint n_segments)
584       MY_ATTRIBUTE((warn_unused_result));
585 
586   /** Initializes the asynchronous io system. Creates one array each
587   for ibuf and log I/O. Also creates one array each for read and write
588   where each array is divided logically into n_readers and n_writers
589   respectively. The caller must create an i/o handler thread for each
590   segment in these arrays. This function also creates the sync array.
591   No I/O handler thread needs to be created for that
592   @param[in]	n_per_seg	maximum number of pending aio
593                                   operations allowed per segment
594   @param[in]	n_readers	number of reader threads
595   @param[in]	n_writers	number of writer threads
596   @param[in]	n_slots_sync	number of slots in the sync aio array
597   @return true if AIO sub-system was started successfully */
598   static bool start(ulint n_per_seg, ulint n_readers, ulint n_writers,
599                     ulint n_slots_sync) MY_ATTRIBUTE((warn_unused_result));
600 
601   /** Free the AIO arrays */
602   static void shutdown();
603 
604   /** Print all the AIO segments
605   @param[in,out]	file		Where to print */
606   static void print_all(FILE *file);
607 
608   /** Calculates local segment number and aio array from global
609   segment number.
610   @param[out]	array		AIO wait array
611   @param[in]	segment		global segment number
612   @return local segment number within the aio array */
613   static ulint get_array_and_local_segment(AIO *&array, ulint segment)
614       MY_ATTRIBUTE((warn_unused_result));
615 
616   /** Select the IO slot array
617   @param[in,out]	type		Type of IO, READ or WRITE
618   @param[in]	read_only	true if running in read-only mode
619   @param[in]	aio_mode	IO mode
620   @return slot array or NULL if invalid mode specified */
621   static AIO *select_slot_array(IORequest &type, bool read_only,
622                                 AIO_mode aio_mode)
623       MY_ATTRIBUTE((warn_unused_result));
624 
625   /** Calculates segment number for a slot.
626   @param[in]	array		AIO wait array
627   @param[in]	slot		slot in this array
628   @return segment number (which is the number used by, for example,
629           I/O handler threads) */
630   static ulint get_segment_no_from_slot(const AIO *array, const Slot *slot)
631       MY_ATTRIBUTE((warn_unused_result));
632 
633   /** Wakes up a simulated AIO I/O-handler thread if it has something
634   to do.
635   @param[in]	global_segment	the number of the segment in the
636                                   AIO arrays */
637   static void wake_simulated_handler_thread(ulint global_segment);
638 
639   /** Check if it is a read request
640   @param[in]	aio		The AIO instance to check
641   @return true if the AIO instance is for reading. */
is_read(const AIO * aio)642   static bool is_read(const AIO *aio) MY_ATTRIBUTE((warn_unused_result)) {
643     return (s_reads == aio);
644   }
645 
646   /** Wait on an event until no pending writes */
wait_until_no_pending_writes()647   static void wait_until_no_pending_writes() {
648     os_event_wait(AIO::s_writes->m_is_empty);
649   }
650 
651   /** Print to file
652   @param[in]	file		File to write to */
653   static void print_to_file(FILE *file);
654 
655   /** Check for pending IO. Gets the count and also validates the
656   data structures.
657   @return count of pending IO requests */
658   static ulint total_pending_io_count();
659 
660  private:
661   /** Initialise the slots
662   @return DB_SUCCESS or error code */
663   dberr_t init_slots() MY_ATTRIBUTE((warn_unused_result));
664 
665   /** Wakes up a simulated AIO I/O-handler thread if it has something
666   to do for a local segment in the AIO array.
667   @param[in]	global_segment	the number of the segment in the
668                                   AIO arrays
669   @param[in]	segment		the local segment in the AIO array */
670   void wake_simulated_handler_thread(ulint global_segment, ulint segment);
671 
672   /** Prints pending IO requests per segment of an aio array.
673   We probably don't need per segment statistics but they can help us
674   during development phase to see if the IO requests are being
675   distributed as expected.
676   @param[in,out]	file		file where to print
677   @param[in]	segments	pending IO array */
678   void print_segment_info(FILE *file, const ulint *segments);
679 
680 #ifdef LINUX_NATIVE_AIO
681   /** Initialise the Linux native AIO data structures
682   @return DB_SUCCESS or error code */
683   dberr_t init_linux_native_aio() MY_ATTRIBUTE((warn_unused_result));
684 #endif /* LINUX_NATIVE_AIO */
685 
686  private:
687   typedef std::vector<Slot> Slots;
688 
689   /** the mutex protecting the aio array */
690   mutable SysMutex m_mutex;
691 
692   /** Pointer to the slots in the array.
693   Number of elements must be divisible by n_threads. */
694   Slots m_slots;
695 
696   /** Number of segments in the aio array of pending aio requests.
697   A thread can wait separately for any one of the segments. */
698   ulint m_n_segments;
699 
700   /** The event which is set to the signaled state when
701   there is space in the aio outside the ibuf segment */
702   os_event_t m_not_full;
703 
704   /** The event which is set to the signaled state when
705   there are no pending i/os in this array */
706   os_event_t m_is_empty;
707 
708   /** Number of reserved slots in the AIO array outside
709   the ibuf segment */
710   ulint m_n_reserved;
711 
712 #ifdef _WIN32
713   typedef std::vector<HANDLE, ut_allocator<HANDLE>> Handles;
714 
715   /** Pointer to an array of OS native event handles where
716   we copied the handles from slots, in the same order. This
717   can be used in WaitForMultipleObjects; used only in Windows */
718   Handles *m_handles;
719 #endif /* _WIN32 */
720 
721 #if defined(LINUX_NATIVE_AIO)
722   typedef std::vector<io_event> IOEvents;
723 
724   /** completion queue for IO. There is one such queue per
725   segment. Each thread will work on one ctx exclusively. */
726   io_context_t *m_aio_ctx;
727 
728   /** The array to collect completed IOs. There is one such
729   event for each possible pending IO. The size of the array
730   is equal to m_slots.size(). */
731   IOEvents m_events;
732 #endif /* LINUX_NATIV_AIO */
733 
734   /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
735   sync AIO. These are NULL when the module has not yet been
736   initialized. */
737 
738   /** Insert buffer */
739   static AIO *s_ibuf;
740 
741   /** Redo log */
742   static AIO *s_log;
743 
744   /** Reads */
745   static AIO *s_reads;
746 
747   /** Writes */
748   static AIO *s_writes;
749 
750   /** Synchronous I/O */
751   static AIO *s_sync;
752 };
753 
/** Definitions of the static AIO array pointers declared in class AIO.
None has an initializer, so as objects with static storage duration they
all start out as null pointers. */
AIO *AIO::s_reads;
AIO *AIO::s_writes;
AIO *AIO::s_ibuf;
AIO *AIO::s_log;
AIO *AIO::s_sync;
760 
#if defined(LINUX_NATIVE_AIO)
/* The constants below are only available with Linux native AIO. */

/** timeout for each io_getevents() call = 500ms.
The stored value 500000000 equals 500ms when read as nanoseconds. */
static const ulint OS_AIO_REAP_TIMEOUT = 500000000UL;

/** time to sleep, in microseconds if io_setup() returns EAGAIN.
500000 microseconds, i.e. half a second. */
static const ulint OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;

/** number of attempts before giving up on io_setup(). */
static const int OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
#endif /* LINUX_NATIVE_AIO */
771 
/** Array of events used in simulated AIO */
static os_event_t *os_aio_segment_wait_events = nullptr;

/** Number of asynchronous I/O segments.  Set by os_aio_init().
Holds ULINT_UNDEFINED until then. */
static ulint os_aio_n_segments = ULINT_UNDEFINED;

/** If the following is true, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static bool os_aio_recommend_sleep_for_read_threads = false;

/* I/O statistics, all starting at 0.
NOTE(review): going by the names these count file reads, bytes read since
the last printout, file writes and fsyncs, and the *_old variables keep the
values seen at the previous printout -- confirm against the code that
updates and prints them. */
ulint os_n_file_reads = 0;
static ulint os_bytes_read_since_printout = 0;
ulint os_n_file_writes = 0;
ulint os_n_fsyncs = 0;
static ulint os_n_file_reads_old = 0;
static ulint os_n_file_writes_old = 0;
static ulint os_n_fsyncs_old = 0;
/** Number of pending write operations */
ulint os_n_pending_writes = 0;
/** Number of pending read operations */
ulint os_n_pending_reads = 0;

/* NOTE(review): presumably the time of the last statistics printout and a
flag remembering that a disk full condition was already reported --
confirm against the users of these variables. */
static ib_time_monotonic_t os_last_printout;
bool os_has_said_disk_full = false;

/** Default Zip compression level. Declared extern here; the definition
is not in this part of the file. */
extern uint page_zip_level;

/* Compile time check: the build fails with the message below if
DATA_TRX_ID_LEN is larger than 6. */
static_assert(DATA_TRX_ID_LEN <= 6, "COMPRESSION_ALGORITHM will not fit!");
801 
/* Forward declarations of functions that are used before their
definitions appear. */

/** Validates the consistency of the aio system.
@return true if ok */
static bool os_aio_validate();

/** Does error handling when a file operation fails.
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@return true if we should retry the operation */
static bool os_file_handle_error(const char *name, const char *operation);

/** Free storage space associated with a section of the file.
@param[in]	fh		Open file handle
@param[in]	off		Starting offset (SEEK_SET)
@param[in]	len		Size of the hole
@return DB_SUCCESS or error code */
dberr_t os_file_punch_hole(os_file_t fh, os_offset_t off, os_offset_t len);

/**
Does error handling when a file operation fails. Unlike
os_file_handle_error() this variant takes a flag that suppresses logging.
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@param[in]	on_error_silent	if true then don't print any message to the log.
@return true if we should retry the operation */
static bool os_file_handle_error_no_exit(const char *name,
                                         const char *operation,
                                         bool on_error_silent);

/** Decompress after a read and punch a hole in the file if it was a write
@param[in]	type		IO context
@param[in]	fh		Open file handle
@param[in,out]	buf		Buffer to transform
@param[in,out]	scratch		Scratch area for read decompression
@param[in]	src_len		Length of the buffer before compression
@param[in]	offset		file offset from the start where to read
@param[in]	len		Compressed buffer length for write and size
                                of buf len for read
@return DB_SUCCESS or error code */
static dberr_t os_file_io_complete(const IORequest &type, os_file_t fh,
                                   byte *buf, byte *scratch, ulint src_len,
                                   os_offset_t offset, ulint len);

/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays to
                                await for; segment 0 is the ibuf i/o thread,
                                segment 1 the log i/o thread, then follow the
                                non-ibuf read threads, and as the last are the
                                non-ibuf write threads
@param[out]	m1		the messages passed with the AIO request; note
                                that also in the case where the AIO operation
                                failed, these output parameters are valid and
                                can be used to restart the operation, for
                                example
@param[out]	m2		Callback argument
@param[in]	type		IO context; NOTE(review): passed as a non-const
                                pointer, so it may really be an output
                                parameter -- confirm against the definition
@return DB_SUCCESS or error code */
static dberr_t os_aio_simulated_handler(ulint global_segment, fil_node_t **m1,
                                        void **m2, IORequest *type);
861 
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
                                wait for; segment 0 is the ibuf I/O thread,
                                segment 1 the log I/O thread, then follow the
                                non-ibuf read threads, and as the last are the
                                non-ibuf write threads; if this is
                                ULINT_UNDEFINED, then it means that sync AIO
                                is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
                                wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
                                that also in the case where the AIO operation
                                failed, these output parameters are valid and
                                can be used to restart the operation,
                                for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static dberr_t os_aio_windows_handler(ulint segment, ulint pos, fil_node_t **m1,
                                      void **m2, IORequest *type);
#endif /* WIN_ASYNC_IO */
889 
890 /** Check the file type and determine if it can be deleted.
891 @param[in]	name		Filename/Path to check
892 @return true if it's a file or a symlink and can be deleted */
os_file_can_delete(const char * name)893 static bool os_file_can_delete(const char *name) {
894   switch (Fil_path::get_file_type(name)) {
895     case OS_FILE_TYPE_FILE:
896     case OS_FILE_TYPE_LINK:
897       return (true);
898 
899     case OS_FILE_TYPE_DIR:
900 
901       ib::warn(ER_IB_MSG_743) << "'" << name << "'"
902                               << " is a directory, can't delete!";
903       break;
904 
905     case OS_FILE_TYPE_BLOCK:
906 
907       ib::warn(ER_IB_MSG_744) << "'" << name << "'"
908                               << " is a block device, can't delete!";
909       break;
910 
911     case OS_FILE_TYPE_FAILED:
912 
913       ib::warn(ER_IB_MSG_745) << "'" << name << "'"
914                               << " get file type failed, won't delete!";
915       break;
916 
917     case OS_FILE_TYPE_UNKNOWN:
918 
919       ib::warn(ER_IB_MSG_746) << "'" << name << "'"
920                               << " unknown file type, won't delete!";
921       break;
922 
923     case OS_FILE_TYPE_NAME_TOO_LONG:
924 
925       ib::warn(ER_IB_MSG_747) << "'" << name << "'"
926                               << " name too long, can't delete!";
927       break;
928 
929     case OS_FILE_PERMISSION_ERROR:
930       ib::warn(ER_IB_MSG_748) << "'" << name << "'"
931                               << " permission error, can't delete!";
932       break;
933 
934     case OS_FILE_TYPE_MISSING:
935       break;
936   }
937 
938   return (false);
939 }
940 
os_alloc_block()941 file::Block *os_alloc_block() noexcept {
942   size_t pos;
943   Blocks &blocks = *block_cache;
944   size_t i = static_cast<size_t>(my_timer_cycles());
945   const size_t size = blocks.size();
946   ulint retry = 0;
947   file::Block *block;
948 
949   DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
950 
951   for (;;) {
952     /* After go through the block cache for 3 times,
953     allocate a new temporary block. */
954     if (retry == MAX_BLOCKS * 3) {
955       byte *ptr;
956 
957       ptr = static_cast<byte *>(
958           ut_malloc_nokey(sizeof(*block) + BUFFER_BLOCK_SIZE));
959 
960       block = new (ptr) file::Block();
961       block->m_ptr = static_cast<byte *>(ptr + sizeof(*block));
962       block->m_in_use = 1;
963 
964       break;
965     }
966 
967     pos = i++ % size;
968 
969     if (TAS(&blocks[pos].m_in_use, 1) == 0) {
970       block = &blocks[pos];
971       break;
972     }
973 
974     os_thread_yield();
975 
976     ++retry;
977   }
978 
979   ut_a(block->m_in_use != 0);
980 
981   return (block);
982 }
983 
os_free_block(file::Block * block)984 void os_free_block(file::Block *block) noexcept {
985   ut_ad(block->m_in_use == 1);
986 
987   TAS(&block->m_in_use, 0);
988 
989   /* When this block is not in the block cache, and it's
990   a temporary block, we need to free it directly. */
991   if (std::less<file::Block *>()(block, &block_cache->front()) ||
992       std::greater<file::Block *>()(block, &block_cache->back())) {
993     ut_free(block);
994   }
995 }
996 
997 /** Generic AIO Handler methods. Currently handles IO post processing. */
998 class AIOHandler {
999  public:
1000   /** Do any post processing after a read/write
1001   @return DB_SUCCESS or error code. */
1002   static dberr_t post_io_processing(Slot *slot);
1003 
1004   /** Decompress after a read and punch a hole in the file if
1005   it was a write */
io_complete(const Slot * slot)1006   static dberr_t io_complete(const Slot *slot) {
1007     ut_a(slot->offset > 0);
1008     ut_a(slot->type.is_read() || !slot->skip_punch_hole);
1009     return (os_file_io_complete(slot->type, slot->file.m_file, slot->buf,
1010                                 nullptr, slot->original_len, slot->offset,
1011                                 slot->len));
1012   }
1013 
1014  private:
1015   /** Check whether the page was encrypted.
1016   @param[in]	slot		The slot that contains the IO request
1017   @return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)1018   static bool is_encrypted_page(const Slot *slot) {
1019     return (Encryption::is_encrypted_page(slot->buf));
1020   }
1021 
1022   /** Check whether the page was compressed.
1023   @param[in]	slot		The slot that contains the IO request
1024   @return true if it was a compressed page */
is_compressed_page(const Slot * slot)1025   static bool is_compressed_page(const Slot *slot) {
1026     const byte *src = slot->buf;
1027 
1028     ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1029 
1030     return (page_type == FIL_PAGE_COMPRESSED);
1031   }
1032 
1033   /** Get the compressed page size.
1034   @param[in]	slot		The slot that contains the IO request
1035   @return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1036   static ulint compressed_page_size(const Slot *slot) {
1037     ut_ad(slot->type.is_read());
1038     ut_ad(is_compressed_page(slot));
1039 
1040     ulint size;
1041     const byte *src = slot->buf;
1042 
1043     size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1044 
1045     return (size + FIL_PAGE_DATA);
1046   }
1047 
1048   /** Check if the page contents can be decompressed.
1049   @param[in]	slot		The slot that contains the IO request
1050   @return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1051   static bool can_decompress(const Slot *slot) {
1052     ut_ad(slot->type.is_read());
1053     ut_ad(is_compressed_page(slot));
1054 
1055     ulint version;
1056     const byte *src = slot->buf;
1057 
1058     version = mach_read_from_1(src + FIL_PAGE_VERSION);
1059 
1060     ut_a(version == 1);
1061 
1062     /* Includes the page header size too */
1063     ulint size = compressed_page_size(slot);
1064 
1065     return (size <= (slot->ptr - slot->buf) + (ulint)slot->n_bytes);
1066   }
1067 
1068   /** Check if we need to read some more data.
1069   @param[in]	slot		The slot that contains the IO request
1070   @param[in]	n_bytes		Total bytes read so far
1071   @return DB_SUCCESS or error code */
1072   static dberr_t check_read(Slot *slot, ulint n_bytes);
1073 };
1074 
1075 /** Helper class for doing synchronous file IO. Currently, the objective
1076 is to hide the OS specific code, so that the higher level functions aren't
1077 peppered with "#ifdef". Makes the code flow difficult to follow.  */
1078 class SyncFileIO {
1079  public:
1080   /** Constructor
1081   @param[in]	fh	File handle
1082   @param[in,out]	buf	Buffer to read/write
1083   @param[in]	n	Number of bytes to read/write
1084   @param[in]	offset	Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1085   SyncFileIO(os_file_t fh, void *buf, ulint n, os_offset_t offset)
1086       : m_fh(fh), m_buf(buf), m_n(static_cast<ssize_t>(n)), m_offset(offset) {
1087     ut_ad(m_n > 0);
1088   }
1089 
1090   /** Destructor */
~SyncFileIO()1091   ~SyncFileIO() { /* No op */
1092   }
1093 
1094   /** Do the read/write
1095   @param[in]	request	The IO context and type
1096   @return the number of bytes read/written or negative value on error */
1097   ssize_t execute(const IORequest &request);
1098 
1099   /** Do the read/write
1100   @param[in,out]	slot	The IO slot, it has the IO context
1101   @return the number of bytes read/written or negative value on error */
1102   static ssize_t execute(Slot *slot);
1103 
1104   /** Move the read/write offset up to where the partial IO succeeded.
1105   @param[in]	n_bytes	The number of bytes to advance */
advance(ssize_t n_bytes)1106   void advance(ssize_t n_bytes) {
1107     m_offset += n_bytes;
1108 
1109     ut_ad(m_n >= n_bytes);
1110 
1111     m_n -= n_bytes;
1112 
1113     m_buf = reinterpret_cast<uchar *>(m_buf) + n_bytes;
1114   }
1115 
1116  private:
1117   /** Open file handle */
1118   os_file_t m_fh;
1119 
1120   /** Buffer to read/write */
1121   void *m_buf;
1122 
1123   /** Number of bytes to read/write */
1124   ssize_t m_n;
1125 
1126   /** Offset from where to read/write */
1127   os_offset_t m_offset;
1128 };
1129 
1130 /** If it is a compressed page return the compressed page data + footer size
1131 @param[in]	buf		Buffer to check, must include header + 10 bytes
1132 @return ULINT_UNDEFINED if the page is not a compressed page or length
1133         of the compressed data (including footer) if it is a compressed page */
os_file_compressed_page_size(const byte * buf)1134 ulint os_file_compressed_page_size(const byte *buf) {
1135   ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1136 
1137   if (type == FIL_PAGE_COMPRESSED) {
1138     ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1139     ut_a(version == 1);
1140     return (mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1141   }
1142 
1143   return (ULINT_UNDEFINED);
1144 }
1145 
1146 /** If it is a compressed page return the original page data + footer size
1147 @param[in] buf		Buffer to check, must include header + 10 bytes
1148 @return ULINT_UNDEFINED if the page is not a compressed page or length
1149         of the original data + footer if it is a compressed page */
os_file_original_page_size(const byte * buf)1150 ulint os_file_original_page_size(const byte *buf) {
1151   ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1152 
1153   if (type == FIL_PAGE_COMPRESSED) {
1154     ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1155     ut_a(version == 1);
1156 
1157     return (mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1158   }
1159 
1160   return (ULINT_UNDEFINED);
1161 }
1162 
1163 /** Check if we need to read some more data.
1164 @param[in]	slot		The slot that contains the IO request
1165 @param[in]	n_bytes		Total bytes read so far
1166 @return DB_SUCCESS or error code */
check_read(Slot * slot,ulint n_bytes)1167 dberr_t AIOHandler::check_read(Slot *slot, ulint n_bytes) {
1168   dberr_t err;
1169 
1170   ut_ad(slot->type.is_read());
1171   ut_ad(slot->original_len > slot->len);
1172 
1173   if (is_compressed_page(slot)) {
1174     if (can_decompress(slot)) {
1175       ut_a(slot->offset > 0);
1176 
1177       slot->len = slot->original_len;
1178 #ifdef _WIN32
1179       slot->n_bytes = static_cast<DWORD>(n_bytes);
1180 #else
1181       slot->n_bytes = static_cast<ulint>(n_bytes);
1182 #endif /* _WIN32 */
1183 
1184       err = io_complete(slot);
1185       ut_a(err == DB_SUCCESS);
1186     } else {
1187       /* Read the next block in */
1188       ut_ad(compressed_page_size(slot) >= n_bytes);
1189 
1190       err = DB_FAIL;
1191     }
1192   } else if (is_encrypted_page(slot) ||
1193              (slot->type.is_log() && slot->offset >= LOG_FILE_HDR_SIZE)) {
1194     ut_a(slot->offset > 0);
1195 
1196     slot->len = slot->original_len;
1197 #ifdef _WIN32
1198     slot->n_bytes = static_cast<DWORD>(n_bytes);
1199 #else
1200     slot->n_bytes = static_cast<ulint>(n_bytes);
1201 #endif /* _WIN32 */
1202 
1203     err = io_complete(slot);
1204     ut_a(err == DB_SUCCESS);
1205 
1206   } else {
1207     err = DB_FAIL;
1208   }
1209 
1210   if (slot->buf_block != nullptr) {
1211     os_free_block(slot->buf_block);
1212     slot->buf_block = nullptr;
1213   }
1214 
1215   if (slot->encrypt_log_buf != nullptr) {
1216     ut_free(slot->encrypt_log_buf);
1217     slot->encrypt_log_buf = nullptr;
1218   }
1219 
1220   return (err);
1221 }
1222 
1223 /** Do any post processing after a read/write
1224 @return DB_SUCCESS or error code. */
post_io_processing(Slot * slot)1225 dberr_t AIOHandler::post_io_processing(Slot *slot) {
1226   dberr_t err;
1227 
1228   ut_ad(slot->is_reserved);
1229 
1230   /* Total bytes read so far */
1231   ulint n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1232 
1233   /* Compressed writes can be smaller than the original length.
1234   Therefore they can be processed without further IO. */
1235   if (n_bytes == slot->original_len ||
1236       (slot->type.is_write() && slot->type.is_compressed() &&
1237        slot->len == static_cast<ulint>(slot->n_bytes))) {
1238     if ((slot->type.is_log() && slot->offset >= LOG_FILE_HDR_SIZE) ||
1239         is_compressed_page(slot) || is_encrypted_page(slot)) {
1240       ut_a(slot->offset > 0);
1241 
1242       if (slot->type.is_read()) {
1243         slot->len = slot->original_len;
1244       }
1245 
1246       /* The punch hole has been done on collect() */
1247 
1248       if (slot->type.is_read()) {
1249         err = io_complete(slot);
1250       } else {
1251         err = DB_SUCCESS;
1252       }
1253 
1254       ut_ad(err == DB_SUCCESS || err == DB_UNSUPPORTED ||
1255             err == DB_CORRUPTION || err == DB_IO_DECOMPRESS_FAIL);
1256     } else {
1257       err = DB_SUCCESS;
1258     }
1259 
1260     if (slot->buf_block != nullptr) {
1261       os_free_block(slot->buf_block);
1262       slot->buf_block = nullptr;
1263     }
1264 
1265     if (slot->encrypt_log_buf != nullptr) {
1266       ut_free(slot->encrypt_log_buf);
1267       slot->encrypt_log_buf = nullptr;
1268     }
1269   } else if ((ulint)slot->n_bytes == (ulint)slot->len) {
1270     /* It *must* be a partial read. */
1271     ut_ad(slot->len < slot->original_len);
1272 
1273     /* Has to be a read request, if it is less than
1274     the original length. */
1275     ut_ad(slot->type.is_read());
1276     err = check_read(slot, n_bytes);
1277 
1278   } else {
1279     err = DB_FAIL;
1280   }
1281 
1282   return (err);
1283 }
1284 
1285 /** Count the number of free slots
1286 @return number of reserved slots */
pending_io_count() const1287 ulint AIO::pending_io_count() const {
1288   acquire();
1289 
1290 #ifdef UNIV_DEBUG
1291   ut_a(m_n_segments > 0);
1292   ut_a(!m_slots.empty());
1293 
1294   ulint count = 0;
1295 
1296   for (ulint i = 0; i < m_slots.size(); ++i) {
1297     const Slot &slot = m_slots[i];
1298 
1299     if (slot.is_reserved) {
1300       ++count;
1301       ut_a(slot.len > 0);
1302     }
1303   }
1304 
1305   ut_a(m_n_reserved == count);
1306 #endif /* UNIV_DEBUG */
1307 
1308   ulint reserved = m_n_reserved;
1309 
1310   release();
1311 
1312   return (reserved);
1313 }
1314 
1315 /** Compress a data page
1316 @param[in]	compression	Compression algorithm
1317 @param[in]	block_size	File system block size
1318 @param[in]	src		Source contents to compress
1319 @param[in]	src_len		Length in bytes of the source
1320 @param[out]	dst		Compressed page contents
1321 @param[out]	dst_len		Length in bytes of dst contents
1322 @return buffer data, dst_len will have the length of the data */
os_file_compress_page(Compression compression,ulint block_size,byte * src,ulint src_len,byte * dst,ulint * dst_len)1323 byte *os_file_compress_page(Compression compression, ulint block_size,
1324                             byte *src, ulint src_len, byte *dst,
1325                             ulint *dst_len) {
1326   ulint len = 0;
1327   ulint compression_level = page_zip_level;
1328   ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1329 
1330   /* The page size must be a multiple of the OS punch hole size. */
1331   ut_ad(!(src_len % block_size));
1332 
1333   /* Shouldn't compress an already compressed page. */
1334   ut_ad(page_type != FIL_PAGE_COMPRESSED);
1335 
1336   /* The page must be at least twice as large as the file system
1337   block size if we are to save any space. Ignore R-Tree pages for now,
1338   they repurpose the same 8 bytes in the page header. No point in
1339   compressing if the file system block size >= our page size. */
1340 
1341   if (page_type == FIL_PAGE_RTREE || block_size == ULINT_UNDEFINED ||
1342       compression.m_type == Compression::NONE || src_len < block_size * 2) {
1343     *dst_len = src_len;
1344 
1345     return (src);
1346   }
1347 
1348   /* Leave the header alone when compressing. */
1349   ut_ad(block_size >= FIL_PAGE_DATA * 2);
1350 
1351   ut_ad(src_len > FIL_PAGE_DATA + block_size);
1352 
1353   /* Must compress to <= N-1 FS blocks. */
1354   ulint out_len = src_len - (FIL_PAGE_DATA + block_size);
1355 
1356   /* This is the original data page size - the page header. */
1357   ulint content_len = src_len - FIL_PAGE_DATA;
1358 
1359   ut_ad(out_len >= block_size - FIL_PAGE_DATA);
1360   ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA));
1361 
1362   /* Only compress the data + trailer, leave the header alone */
1363 
1364   switch (compression.m_type) {
1365     case Compression::NONE:
1366       ut_error;
1367 
1368     case Compression::ZLIB: {
1369       uLongf zlen = static_cast<uLongf>(out_len);
1370 
1371       if (compress2(dst + FIL_PAGE_DATA, &zlen, src + FIL_PAGE_DATA,
1372                     static_cast<uLong>(content_len),
1373                     static_cast<int>(compression_level)) != Z_OK) {
1374         *dst_len = src_len;
1375 
1376         return (src);
1377       }
1378 
1379       len = static_cast<ulint>(zlen);
1380 
1381       break;
1382     }
1383 
1384     case Compression::LZ4:
1385 
1386       len = LZ4_compress_default(reinterpret_cast<char *>(src) + FIL_PAGE_DATA,
1387                                  reinterpret_cast<char *>(dst) + FIL_PAGE_DATA,
1388                                  static_cast<int>(content_len),
1389                                  static_cast<int>(out_len));
1390 
1391       ut_a(len <= src_len - FIL_PAGE_DATA);
1392 
1393       if (len == 0 || len >= out_len) {
1394         *dst_len = src_len;
1395 
1396         return (src);
1397       }
1398 
1399       break;
1400 
1401     default:
1402       *dst_len = src_len;
1403       return (src);
1404   }
1405 
1406   ut_a(len <= out_len);
1407 
1408   ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
1409                src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);
1410 
1411   /* Copy the header as is. */
1412   memmove(dst, src, FIL_PAGE_DATA);
1413 
1414   /* Add compression control information. Required for decompressing. */
1415   mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
1416 
1417   mach_write_to_1(dst + FIL_PAGE_VERSION, 1);
1418 
1419   mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);
1420 
1421   mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
1422 
1423   mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);
1424 
1425   mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);
1426 
1427   /* Round to the next full block size */
1428 
1429   len += FIL_PAGE_DATA;
1430 
1431   *dst_len = ut_calc_align(len, block_size);
1432 
1433   ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA);
1434 
1435   /* Clear out the unused portion of the page. */
1436   if (len % block_size) {
1437     memset(dst + len, 0x0, block_size - (len % block_size));
1438   }
1439 
1440   return (dst);
1441 }
1442 
1443 #ifdef UNIV_DEBUG
1444 #ifndef UNIV_HOTBACKUP
1445 /** Validates the consistency the aio system some of the time.
1446 @return true if ok or the check was skipped */
os_aio_validate_skip()1447 static bool os_aio_validate_skip() {
1448 /** Try os_aio_validate() every this many times */
1449 #define OS_AIO_VALIDATE_SKIP 13
1450 
1451   /** The os_aio_validate() call skip counter.
1452   Use a signed type because of the race condition below. */
1453   static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1454 
1455   /* There is a race condition below, but it does not matter,
1456   because this call is only for heuristic purposes. We want to
1457   reduce the call frequency of the costly os_aio_validate()
1458   check in debug builds. */
1459   --os_aio_validate_count;
1460 
1461   if (os_aio_validate_count > 0) {
1462     return (true);
1463   }
1464 
1465   os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1466   return (os_aio_validate());
1467 }
1468 #endif /* !UNIV_HOTBACKUP */
1469 #endif /* UNIV_DEBUG */
1470 
1471 #undef USE_FILE_LOCK
1472 #define USE_FILE_LOCK
1473 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1474 /* InnoDB Hot Backup does not lock the data files.
1475  * On Windows, mandatory locking is used.
1476  */
1477 #undef USE_FILE_LOCK
1478 #endif /* UNIV_HOTBACKUP || _WIN32 */
1479 #ifdef USE_FILE_LOCK
1480 /** Obtain an exclusive lock on a file.
1481 @param[in]	fd		file descriptor
1482 @param[in]	name		file name
1483 @return 0 on success */
os_file_lock(int fd,const char * name)1484 static int os_file_lock(int fd, const char *name) {
1485   struct flock lk;
1486 
1487   lk.l_type = F_WRLCK;
1488   lk.l_whence = SEEK_SET;
1489   lk.l_start = lk.l_len = 0;
1490 
1491   if (fcntl(fd, F_SETLK, &lk) == -1) {
1492     ib::error(ER_IB_MSG_749)
1493         << "Unable to lock " << name << " error: " << errno;
1494 
1495     if (errno == EAGAIN || errno == EACCES) {
1496       ib::info(ER_IB_MSG_750) << "Check that you do not already have"
1497                                  " another mysqld process using the"
1498                                  " same InnoDB data or log files.";
1499     }
1500 
1501     return (-1);
1502   }
1503 
1504   return (0);
1505 }
1506 #endif /* USE_FILE_LOCK */
1507 
1508 /** Calculates local segment number and aio array from global segment number.
1509 @param[out]	array		aio wait array
1510 @param[in]	segment		global segment number
1511 @return local segment number within the aio array */
get_array_and_local_segment(AIO * & array,ulint segment)1512 ulint AIO::get_array_and_local_segment(AIO *&array, ulint segment) {
1513   ulint limit = srv_read_only_mode ? 0 : 2;
1514 
1515   ut_a(segment < os_aio_n_segments);
1516 
1517   if (!srv_read_only_mode && segment < limit) {
1518     /* We don't support ibuf/log IO during read only mode. */
1519 
1520     if (segment == IO_IBUF_SEGMENT) {
1521       array = s_ibuf;
1522 
1523     } else if (segment == IO_LOG_SEGMENT) {
1524       array = s_log;
1525 
1526     } else {
1527       array = nullptr;
1528     }
1529 
1530     return 0;
1531   }
1532 
1533   if (segment < s_reads->m_n_segments + limit) {
1534     array = s_reads;
1535 
1536     return segment - limit;
1537   }
1538 
1539   limit += s_reads->m_n_segments;
1540 
1541   ut_a(segment < s_writes->m_n_segments + limit);
1542 
1543   array = s_writes;
1544 
1545   return segment - limit;
1546 }
1547 
1548 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1549 @param[in,out]	slot		Slot to release */
release(Slot * slot)1550 void AIO::release(Slot *slot) {
1551   ut_ad(is_mutex_owned());
1552 
1553   ut_ad(slot->is_reserved);
1554 
1555   slot->is_reserved = false;
1556 
1557   --m_n_reserved;
1558 
1559   if (m_n_reserved == m_slots.size() - 1) {
1560     os_event_set(m_not_full);
1561   }
1562 
1563   if (m_n_reserved == 0) {
1564     os_event_set(m_is_empty);
1565   }
1566 
1567 #ifdef WIN_ASYNC_IO
1568 
1569   ResetEvent(slot->handle);
1570 
1571 #elif defined(LINUX_NATIVE_AIO)
1572 
1573   if (srv_use_native_aio) {
1574     memset(&slot->control, 0x0, sizeof(slot->control));
1575     slot->ret = 0;
1576     slot->n_bytes = 0;
1577   } else {
1578     /* These fields should not be used if we are not
1579     using native AIO. */
1580     ut_ad(slot->n_bytes == 0);
1581     ut_ad(slot->ret == 0);
1582   }
1583 
1584 #endif /* WIN_ASYNC_IO */
1585 }
1586 
1587 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1588 @param[in,out]	slot		Slot to release */
release_with_mutex(Slot * slot)1589 void AIO::release_with_mutex(Slot *slot) {
1590   acquire();
1591 
1592   release(slot);
1593 
1594   release();
1595 }
1596 
1597 #ifndef UNIV_HOTBACKUP
1598 /** Creates a temporary file.  This function is like tmpfile(3), but
1599 the temporary file is created in the given parameter path. If the path
1600 is NULL then it will create the file in the MySQL server configuration
1601 parameter (--tmpdir).
1602 @param[in]	path	location for creating temporary file
1603 @return temporary file handle, or NULL on error */
os_file_create_tmpfile(const char * path)1604 FILE *os_file_create_tmpfile(const char *path) {
1605   FILE *file = nullptr;
1606   int fd = innobase_mysql_tmpfile(path);
1607 
1608   if (fd >= 0) {
1609     file = fdopen(fd, "w+b");
1610   }
1611 
1612   if (file == nullptr) {
1613     ib::error(ER_IB_MSG_751)
1614         << "Unable to create temporary file; errno: " << errno;
1615 
1616     if (fd >= 0) {
1617       close(fd);
1618     }
1619   }
1620 
1621   return (file);
1622 }
1623 #endif /* !UNIV_HOTBACKUP */
1624 
1625 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1626 NUL-terminate str. All errors are silently ignored. This function is
1627 mostly meant to be used with temporary files.
1628 @param[in,out]	file		File to read from
1629 @param[in,out]	str		Buffer where to read
1630 @param[in]	size		Size of buffer */
os_file_read_string(FILE * file,char * str,ulint size)1631 void os_file_read_string(FILE *file, char *str, ulint size) {
1632   if (size != 0) {
1633     rewind(file);
1634 
1635     size_t flen = fread(str, 1, size - 1, file);
1636 
1637     str[flen] = '\0';
1638   }
1639 }
1640 
1641 /** Decompress after a read and punch a hole in the file if it was a write
1642 @param[in]	type		IO context
1643 @param[in]	fh		Open file handle
1644 @param[in,out]	buf		Buffer to transform
1645 @param[in,out]	scratch		Scratch area for read decompression
1646 @param[in]	src_len		Length of the buffer before compression
1647 @param[in]	offset		file offset from the start where to read
1648 @param[in]	len		Used buffer length for write and output
1649                                 buf len for read
1650 @return DB_SUCCESS or error code */
os_file_io_complete(const IORequest & type,os_file_t fh,byte * buf,byte * scratch,ulint src_len,os_offset_t offset,ulint len)1651 static dberr_t os_file_io_complete(const IORequest &type, os_file_t fh,
1652                                    byte *buf, byte *scratch, ulint src_len,
1653                                    os_offset_t offset, ulint len) {
1654   dberr_t ret = DB_SUCCESS;
1655 
1656   /* We never compress/decompress the first page */
1657   ut_a(offset > 0);
1658   ut_ad(type.validate());
1659 
1660   if (!type.is_compression_enabled()) {
1661     if (type.is_log() && offset >= LOG_FILE_HDR_SIZE) {
1662       Encryption encryption(type.encryption_algorithm());
1663 
1664       ret = encryption.decrypt_log(type, buf, src_len, scratch, len);
1665     }
1666 
1667     return (ret);
1668   } else if (type.is_read()) {
1669     Encryption encryption(type.encryption_algorithm());
1670 
1671     ret = encryption.decrypt(type, buf, src_len, scratch, len);
1672 
1673     if (ret == DB_SUCCESS) {
1674       return (os_file_decompress_page(type.is_dblwr(), buf, scratch, len));
1675     } else {
1676       return (ret);
1677     }
1678   } else if (type.punch_hole()) {
1679     ut_ad(len <= src_len);
1680     ut_ad(!type.is_log());
1681     ut_ad(type.is_write());
1682     ut_ad(type.is_compressed());
1683 
1684     /* Nothing to do. */
1685     if (len == src_len) {
1686       return (DB_SUCCESS);
1687     }
1688 
1689 #ifdef UNIV_DEBUG
1690     const ulint block_size = type.block_size();
1691 #endif /* UNIV_DEBUG */
1692 
1693     /* We don't support multiple page sizes in the server
1694     at the moment. */
1695     ut_ad(src_len == srv_page_size);
1696 
1697     /* Must be a multiple of the compression unit size. */
1698     ut_ad((len % block_size) == 0);
1699     ut_ad((offset % block_size) == 0);
1700 
1701     ut_ad(len + block_size <= src_len);
1702 
1703     offset += len;
1704 
1705     return (os_file_punch_hole(fh, offset, src_len - len));
1706   }
1707 
1708   ut_ad(!type.is_log());
1709 
1710   return (DB_SUCCESS);
1711 }
1712 
1713 /** Check if the path refers to the root of a drive using a pointer
1714 to the last directory separator that the caller has fixed.
1715 @param[in]	path		path name
1716 @param[in]	last_slash	last directory separator in the path
1717 @return true if this path is a drive root, false if not */
1718 UNIV_INLINE
os_file_is_root(const char * path,const char * last_slash)1719 bool os_file_is_root(const char *path, const char *last_slash) {
1720   return (
1721 #ifdef _WIN32
1722       (last_slash == path + 2 && path[1] == ':') ||
1723 #endif /* _WIN32 */
1724       last_slash == path);
1725 }
1726 
1727 /** Return the parent directory component of a null-terminated path.
1728 Return a new buffer containing the string up to, but not including,
1729 the final component of the path.
1730 The path returned will not contain a trailing separator.
1731 Do not return a root path, return NULL instead.
1732 The final component trimmed off may be a filename or a directory name.
1733 If the final component is the only component of the path, return NULL.
1734 It is the caller's responsibility to free the returned string after it
1735 is no longer needed.
1736 @param[in]	path		Path name
1737 @return own: parent directory of the path */
os_file_get_parent_dir(const char * path)1738 static char *os_file_get_parent_dir(const char *path) {
1739   bool has_trailing_slash = false;
1740 
1741   /* Find the offset of the last slash */
1742   const char *last_slash = strrchr(path, OS_PATH_SEPARATOR);
1743 
1744   if (!last_slash) {
1745     /* No slash in the path, return NULL */
1746     return (nullptr);
1747   }
1748 
1749   /* Ok, there is a slash. Is there anything after it? */
1750   if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1751     has_trailing_slash = true;
1752   }
1753 
1754   /* Reduce repetative slashes. */
1755   while (last_slash > path && last_slash[-1] == OS_PATH_SEPARATOR) {
1756     last_slash--;
1757   }
1758 
1759   /* Check for the root of a drive. */
1760   if (os_file_is_root(path, last_slash)) {
1761     return (nullptr);
1762   }
1763 
1764   /* If a trailing slash prevented the first strrchr() from trimming
1765   the last component of the path, trim that component now. */
1766   if (has_trailing_slash) {
1767     /* Back up to the previous slash. */
1768     last_slash--;
1769     while (last_slash > path && last_slash[0] != OS_PATH_SEPARATOR) {
1770       last_slash--;
1771     }
1772 
1773     /* Reduce repetative slashes. */
1774     while (last_slash > path && last_slash[-1] == OS_PATH_SEPARATOR) {
1775       last_slash--;
1776     }
1777   }
1778 
1779   /* Check for the root of a drive. */
1780   if (os_file_is_root(path, last_slash)) {
1781     return (nullptr);
1782   }
1783 
1784   if (last_slash - path < 0) {
1785     /* Sanity check, it prevents gcc from trying to handle this case which
1786      * results in warnings for some optimized builds */
1787     return (nullptr);
1788   }
1789 
1790   /* Non-trivial directory component */
1791 
1792   return (mem_strdupl(path, last_slash - path));
1793 }
1794 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1795 
1796 /* Test the function os_file_get_parent_dir. */
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1797 void test_os_file_get_parent_dir(const char *child_dir,
1798                                  const char *expected_dir) {
1799   char *child = mem_strdup(child_dir);
1800   char *expected = expected_dir == NULL ? NULL : mem_strdup(expected_dir);
1801 
1802   /* os_file_get_parent_dir() assumes that separators are
1803   converted to OS_PATH_SEPARATOR. */
1804   Fil_path::normalize(child);
1805   Fil_path::normalize(expected);
1806 
1807   char *parent = os_file_get_parent_dir(child);
1808 
1809   bool unexpected =
1810       (expected == NULL ? (parent != NULL) : (0 != strcmp(parent, expected)));
1811   if (unexpected) {
1812     ib::fatal(ER_IB_MSG_752)
1813         << "os_file_get_parent_dir('" << child << "') returned '" << parent
1814         << "', instead of '" << expected << "'.";
1815   }
1816   ut_free(parent);
1817   ut_free(child);
1818   ut_free(expected);
1819 }
1820 
1821 /* Test the function os_file_get_parent_dir. */
unit_test_os_file_get_parent_dir()1822 void unit_test_os_file_get_parent_dir() {
1823   test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1824   test_os_file_get_parent_dir("/usr/", NULL);
1825   test_os_file_get_parent_dir("//usr//", NULL);
1826   test_os_file_get_parent_dir("usr", NULL);
1827   test_os_file_get_parent_dir("usr//", NULL);
1828   test_os_file_get_parent_dir("/", NULL);
1829   test_os_file_get_parent_dir("//", NULL);
1830   test_os_file_get_parent_dir(".", NULL);
1831   test_os_file_get_parent_dir("..", NULL);
1832 #ifdef _WIN32
1833   test_os_file_get_parent_dir("D:", NULL);
1834   test_os_file_get_parent_dir("D:/", NULL);
1835   test_os_file_get_parent_dir("D:\\", NULL);
1836   test_os_file_get_parent_dir("D:/data", NULL);
1837   test_os_file_get_parent_dir("D:/data/", NULL);
1838   test_os_file_get_parent_dir("D:\\data\\", NULL);
1839   test_os_file_get_parent_dir("D:///data/////", NULL);
1840   test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
1841   test_os_file_get_parent_dir("D:/data//a", "D:/data");
1842   test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
1843   test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
1844   test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\",
1845                               "D:\\\\\\data\\\\a");
1846 #endif /* _WIN32 */
1847 }
1848 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
1849 
1850 /** Creates all missing subdirectories along the given path.
1851 @param[in]	path		Path name
1852 @return DB_SUCCESS if OK, otherwise error code. */
os_file_create_subdirs_if_needed(const char * path)1853 dberr_t os_file_create_subdirs_if_needed(const char *path) {
1854   if (srv_read_only_mode) {
1855     ib::error(ER_IB_MSG_753) << "read only mode set. Can't create "
1856                              << "subdirectories '" << path << "'";
1857 
1858     return (DB_READ_ONLY);
1859   }
1860 
1861   char *subdir = os_file_get_parent_dir(path);
1862 
1863   if (subdir == nullptr) {
1864     /* subdir is root or cwd, nothing to do */
1865     return (DB_SUCCESS);
1866   }
1867 
1868   /* Test if subdir exists */
1869   os_file_type_t type;
1870   bool subdir_exists;
1871   bool success = os_file_status(subdir, &subdir_exists, &type);
1872 
1873   if (success && !subdir_exists) {
1874     /* Subdir does not exist, create it */
1875     dberr_t err = os_file_create_subdirs_if_needed(subdir);
1876 
1877     if (err != DB_SUCCESS) {
1878       ut_free(subdir);
1879 
1880       return (err);
1881     }
1882 
1883     success = os_file_create_directory(subdir, false);
1884   }
1885 
1886   ut_free(subdir);
1887 
1888   return (success ? DB_SUCCESS : DB_ERROR);
1889 }
1890 
1891 /** Allocate the buffer for IO on a transparently compressed table.
1892 @param[in]	type		IO flags
1893 @param[out]	buf		buffer to read or write
1894 @param[in,out]	n		number of bytes to read/write, starting from
1895                                 offset
1896 @return pointer to allocated page, compressed data is written to the offset
1897         that is aligned on the disk sector size */
os_file_compress_page(IORequest & type,void * & buf,ulint * n)1898 static file::Block *os_file_compress_page(IORequest &type, void *&buf,
1899                                           ulint *n) {
1900   ut_ad(!type.is_log());
1901   ut_ad(type.is_write());
1902   ut_ad(type.is_compressed());
1903 
1904   ulint n_alloc = *n * 2;
1905 
1906   ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
1907   ut_a(type.compression_algorithm().m_type != Compression::LZ4 ||
1908        static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
1909 
1910   auto block = os_alloc_block();
1911 
1912   ulint old_compressed_len;
1913   ulint compressed_len = *n;
1914 
1915   old_compressed_len = mach_read_from_2(reinterpret_cast<byte *>(buf) +
1916                                         FIL_PAGE_COMPRESS_SIZE_V1);
1917 
1918   if (old_compressed_len > 0) {
1919     old_compressed_len =
1920         ut_calc_align(old_compressed_len + FIL_PAGE_DATA, type.block_size());
1921   } else {
1922     old_compressed_len = *n;
1923   }
1924 
1925   byte *compressed_page;
1926 
1927   compressed_page =
1928       static_cast<byte *>(ut_align(block->m_ptr, os_io_ptr_align));
1929 
1930   byte *buf_ptr;
1931 
1932   buf_ptr = os_file_compress_page(
1933       type.compression_algorithm(), type.block_size(),
1934       reinterpret_cast<byte *>(buf), *n, compressed_page, &compressed_len);
1935 
1936   if (buf_ptr != buf) {
1937     /* Set new compressed size to uncompressed page. */
1938     memcpy(reinterpret_cast<byte *>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
1939            buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
1940 
1941     buf = buf_ptr;
1942     *n = compressed_len;
1943 
1944     if (compressed_len >= old_compressed_len) {
1945       ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
1946 
1947       type.clear_punch_hole();
1948     }
1949   }
1950 
1951   return (block);
1952 }
1953 
1954 /** Encrypt a page content when write it to disk.
1955 @param[in]	type		IO flags
1956 @param[out]	buf		buffer to read or write
1957 @param[in,out]	n		number of bytes to read/write, starting from
1958                                 offset
1959 @return pointer to the encrypted page */
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)1960 static file::Block *os_file_encrypt_page(const IORequest &type, void *&buf,
1961                                          ulint *n) {
1962   byte *encrypted_page;
1963   ulint encrypted_len = *n;
1964   byte *buf_ptr;
1965   Encryption encryption(type.encryption_algorithm());
1966 
1967   ut_ad(type.is_write());
1968   ut_ad(type.is_encrypted());
1969 
1970   auto block = os_alloc_block();
1971 
1972   encrypted_page = static_cast<byte *>(ut_align(block->m_ptr, os_io_ptr_align));
1973 
1974   buf_ptr = encryption.encrypt(type, reinterpret_cast<byte *>(buf), *n,
1975                                encrypted_page, &encrypted_len);
1976 
1977   bool encrypted = buf_ptr != buf;
1978 
1979   if (encrypted) {
1980     buf = buf_ptr;
1981     *n = encrypted_len;
1982   }
1983 
1984   return (block);
1985 }
1986 
1987 /** Encrypt log blocks content when write it to disk.
1988 @param[in]	type		IO flags
1989 @param[in,out]	buf		buffer to read or write
1990 @param[in,out]	scratch		buffer for encrypting log
1991 @param[in,out]	n		number of bytes to read/write, starting from
1992                                 offset
1993 @return pointer to the encrypted log blocks */
os_file_encrypt_log(const IORequest & type,void * & buf,byte * & scratch,ulint * n)1994 static file::Block *os_file_encrypt_log(const IORequest &type, void *&buf,
1995                                         byte *&scratch, ulint *n) {
1996   byte *encrypted_log;
1997   ulint encrypted_len = *n;
1998   byte *buf_ptr;
1999   Encryption encryption(type.encryption_algorithm());
2000   file::Block *block{};
2001 
2002   ut_ad(type.is_write() && type.is_encrypted() && type.is_log());
2003   ut_ad(*n % OS_FILE_LOG_BLOCK_SIZE == 0);
2004 
2005   if (*n <= BUFFER_BLOCK_SIZE - os_io_ptr_align) {
2006     block = os_alloc_block();
2007     buf_ptr = block->m_ptr;
2008     scratch = nullptr;
2009   } else {
2010     buf_ptr = static_cast<byte *>(ut_malloc_nokey(*n + os_io_ptr_align));
2011     scratch = buf_ptr;
2012   }
2013 
2014   encrypted_log = static_cast<byte *>(ut_align(buf_ptr, os_io_ptr_align));
2015 
2016   encrypted_log = encryption.encrypt_log(type, reinterpret_cast<byte *>(buf),
2017                                          *n, encrypted_log, &encrypted_len);
2018 
2019   bool encrypted = encrypted_log != buf;
2020 
2021   if (encrypted) {
2022     buf = encrypted_log;
2023     *n = encrypted_len;
2024   }
2025 
2026   return (block);
2027 }
2028 
2029 #ifndef _WIN32
2030 
2031 /** Do the read/write
2032 @param[in]	request	The IO context and type
2033 @return the number of bytes read/written or negative value on error */
execute(const IORequest & request)2034 ssize_t SyncFileIO::execute(const IORequest &request) {
2035   ssize_t n_bytes;
2036 
2037   if (request.is_read()) {
2038     n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2039   } else {
2040     ut_ad(request.is_write());
2041     n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2042   }
2043 
2044   return (n_bytes);
2045 }
2046 
2047 /** Free storage space associated with a section of the file.
2048 @param[in]	fh		Open file handle
2049 @param[in]	off		Starting offset (SEEK_SET)
2050 @param[in]	len		Size of the hole
2051 @return DB_SUCCESS or error code */
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2052 static dberr_t os_file_punch_hole_posix(os_file_t fh, os_offset_t off,
2053                                         os_offset_t len) {
2054 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2055   const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2056 
2057   int ret = fallocate(fh, mode, off, len);
2058 
2059   if (ret == 0) {
2060     return (DB_SUCCESS);
2061   }
2062 
2063   ut_a(ret == -1);
2064 
2065   if (errno == ENOTSUP) {
2066     return (DB_IO_NO_PUNCH_HOLE);
2067   }
2068 
2069   ib::warn(ER_IB_MSG_754) << "fallocate(" << fh
2070                           << ", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2071                           << off << ", " << len
2072                           << ") returned errno: " << errno;
2073 
2074   return (DB_IO_ERROR);
2075 
2076 #elif defined(UNIV_SOLARIS)
2077 
2078   // Use F_FREESP
2079 
2080 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2081 
2082   return (DB_IO_NO_PUNCH_HOLE);
2083 }
2084 
2085 #if defined(LINUX_NATIVE_AIO)
2086 
2087 /** Linux native AIO handler */
2088 class LinuxAIOHandler {
2089  public:
2090   /**
2091   @param[in] global_segment	The global segment*/
LinuxAIOHandler(ulint global_segment)2092   LinuxAIOHandler(ulint global_segment) : m_global_segment(global_segment) {
2093     /* Should never be doing Sync IO here. */
2094     ut_a(m_global_segment != ULINT_UNDEFINED);
2095 
2096     /* Find the array and the local segment. */
2097 
2098     m_segment = AIO::get_array_and_local_segment(m_array, m_global_segment);
2099 
2100     m_n_slots = m_array->slots_per_segment();
2101   }
2102 
2103   /** Destructor */
~LinuxAIOHandler()2104   ~LinuxAIOHandler() {
2105     // No op
2106   }
2107 
2108   /**
2109   Process a Linux AIO request
2110   @param[out]	m1		the messages passed with the
2111   @param[out]	m2		AIO request; note that in case the
2112                                   AIO operation failed, these output
2113                                   parameters are valid and can be used to
2114                                   restart the operation.
2115   @param[out]	request		IO context
2116   @return DB_SUCCESS or error code */
2117   dberr_t poll(fil_node_t **m1, void **m2, IORequest *request);
2118 
2119  private:
2120   /** Resubmit an IO request that was only partially successful
2121   @param[in,out]	slot		Request to resubmit
2122   @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2123   dberr_t resubmit(Slot *slot);
2124 
2125   /** Check if the AIO succeeded
2126   @param[in,out]	slot		The slot to check
2127   @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2128           DB_IO_ERROR on all other errors */
2129   dberr_t check_state(Slot *slot);
2130 
2131   /** @return true if a shutdown was detected */
is_shutdown() const2132   bool is_shutdown() const {
2133     return (srv_shutdown_state.load() == SRV_SHUTDOWN_EXIT_THREADS &&
2134             !buf_flush_page_cleaner_is_active());
2135   }
2136 
2137   /** If no slot was found then the m_array->m_mutex will be released.
2138   @param[out]	n_pending	The number of pending IOs
2139   @return NULL or a slot that has completed IO */
2140   Slot *find_completed_slot(ulint *n_pending);
2141 
2142   /** This is called from within the IO-thread. If there are no completed
2143   IO requests in the slot array, the thread calls this function to
2144   collect more requests from the Linux kernel.
2145   The IO-thread waits on io_getevents(), which is a blocking call, with
2146   a timeout value. Unless the system is very heavy loaded, keeping the
2147   IO-thread very busy, the io-thread will spend most of its time waiting
2148   in this function.
2149   The IO-thread also exits in this function. It checks server status at
2150   each wakeup and that is why we use timed wait in io_getevents(). */
2151   void collect();
2152 
2153  private:
2154   /** Slot array */
2155   AIO *m_array;
2156 
2157   /** Number of slots inthe local segment */
2158   ulint m_n_slots;
2159 
2160   /** The local segment to check */
2161   ulint m_segment;
2162 
2163   /** The global segment */
2164   ulint m_global_segment;
2165 };
2166 
2167 /** Resubmit an IO request that was only partially successful
2168 @param[in,out]	slot		Request to resubmit
2169 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
resubmit(Slot * slot)2170 dberr_t LinuxAIOHandler::resubmit(Slot *slot) {
2171 #ifdef UNIV_DEBUG
2172   /* Bytes already read/written out */
2173   ulint n_bytes = slot->ptr - slot->buf;
2174 
2175   ut_ad(m_array->is_mutex_owned());
2176 
2177   ut_ad(n_bytes < slot->original_len);
2178   ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2179   /* Partial read or write scenario */
2180   ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2181 #endif /* UNIV_DEBUG */
2182 
2183   slot->len -= slot->n_bytes;
2184   slot->ptr += slot->n_bytes;
2185   slot->offset += slot->n_bytes;
2186 
2187   /* Resetting the bytes read/written */
2188   slot->n_bytes = 0;
2189   slot->io_already_done = false;
2190 
2191   /* make sure that slot->offset fits in off_t */
2192   ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2193   struct iocb *iocb = &slot->control;
2194   if (slot->type.is_read()) {
2195     io_prep_pread(iocb, slot->file.m_file, slot->ptr, slot->len, slot->offset);
2196 
2197   } else {
2198     ut_a(slot->type.is_write());
2199 
2200     io_prep_pwrite(iocb, slot->file.m_file, slot->ptr, slot->len, slot->offset);
2201   }
2202   iocb->data = slot;
2203 
2204   /* Resubmit an I/O request */
2205   int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2206 
2207   if (ret < -1) {
2208     errno = -ret;
2209   }
2210 
2211   return (ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2212 }
2213 
2214 /** Check if the AIO succeeded
2215 @param[in,out]	slot		The slot to check
2216 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2217         DB_IO_ERROR on all other errors */
check_state(Slot * slot)2218 dberr_t LinuxAIOHandler::check_state(Slot *slot) {
2219   ut_ad(m_array->is_mutex_owned());
2220 
2221   /* Note that it may be that there is more then one completed
2222   IO requests. We process them one at a time. We may have a case
2223   here to improve the performance slightly by dealing with all
2224   requests in one sweep. */
2225 
2226   srv_set_io_thread_op_info(m_global_segment,
2227                             "processing completed aio requests");
2228 
2229   ut_ad(slot->io_already_done);
2230 
2231   dberr_t err;
2232 
2233   if (slot->ret == 0) {
2234     err = AIOHandler::post_io_processing(slot);
2235 
2236   } else {
2237     errno = -slot->ret;
2238 
2239     /* os_file_handle_error does tell us if we should retry
2240     this IO. As it stands now, we don't do this retry when
2241     reaping requests from a different context than
2242     the dispatcher. This non-retry logic is the same for
2243     Windows and Linux native AIO.
2244     We should probably look into this to transparently
2245     re-submit the IO. */
2246     os_file_handle_error(slot->name, "Linux aio");
2247 
2248     err = DB_IO_ERROR;
2249   }
2250 
2251   return (err);
2252 }
2253 
2254 /** If no slot was found then the m_array->m_mutex will be released.
2255 @param[out]	n_pending		The number of pending IOs
2256 @return NULL or a slot that has completed IO */
find_completed_slot(ulint * n_pending)2257 Slot *LinuxAIOHandler::find_completed_slot(ulint *n_pending) {
2258   ulint offset = m_n_slots * m_segment;
2259 
2260   *n_pending = 0;
2261 
2262   m_array->acquire();
2263 
2264   Slot *slot = m_array->at(offset);
2265 
2266   for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2267     if (slot->is_reserved) {
2268       ++*n_pending;
2269 
2270       if (slot->io_already_done) {
2271         /* Something for us to work on.
2272         Note: We don't release the mutex. */
2273         return (slot);
2274       }
2275     }
2276   }
2277 
2278   m_array->release();
2279 
2280   return (nullptr);
2281 }
2282 
2283 /** This function is only used in Linux native asynchronous i/o. This is
2284 called from within the io-thread. If there are no completed IO requests
2285 in the slot array, the thread calls this function to collect more
2286 requests from the kernel.
2287 The io-thread waits on io_getevents(), which is a blocking call, with
2288 a timeout value. Unless the system is very heavy loaded, keeping the
2289 io-thread very busy, the io-thread will spend most of its time waiting
2290 in this function.
2291 The io-thread also exits in this function. It checks server status at
2292 each wakeup and that is why we use timed wait in io_getevents(). */
collect()2293 void LinuxAIOHandler::collect() {
2294   ut_ad(m_n_slots > 0);
2295   ut_ad(m_segment < m_array->get_n_segments());
2296 
2297   /* Which io_context we are going to use. */
2298   io_context *io_ctx = m_array->io_ctx(m_segment);
2299 
2300   /* Starting point of the m_segment we will be working on. */
2301   ulint start_pos = m_segment * m_n_slots;
2302 
2303   /* End point. */
2304   ulint end_pos = start_pos + m_n_slots;
2305 
2306   for (;;) {
2307     struct io_event *events;
2308 
2309     /* Which part of event array we are going to work on. */
2310     events = m_array->io_events(m_segment * m_n_slots);
2311 
2312     /* Initialize the events. */
2313     memset(events, 0, sizeof(*events) * m_n_slots);
2314 
2315     /* The timeout value is arbitrary. We probably need
2316     to experiment with it a little. */
2317     struct timespec timeout;
2318 
2319     timeout.tv_sec = 0;
2320     timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
2321 
2322     auto ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
2323 
2324     for (int i = 0; i < ret; ++i) {
2325       auto iocb = reinterpret_cast<struct iocb *>(events[i].obj);
2326       ut_a(iocb != nullptr);
2327 
2328       auto slot = reinterpret_cast<Slot *>(iocb->data);
2329 
2330       /* Some sanity checks. */
2331       ut_a(slot != nullptr);
2332       ut_a(slot->is_reserved);
2333 
2334       /* We are not scribbling previous segment. */
2335       ut_a(slot->pos >= start_pos);
2336 
2337       /* We have not overstepped to next segment. */
2338       ut_a(slot->pos < end_pos);
2339 
2340       if (slot->offset > 0 && !slot->skip_punch_hole &&
2341           slot->type.is_compression_enabled() && !slot->type.is_log() &&
2342           slot->type.is_write() && slot->type.is_compressed() &&
2343           slot->type.punch_hole() && !slot->type.is_dblwr()) {
2344         slot->err = AIOHandler::io_complete(slot);
2345       } else {
2346         slot->err = DB_SUCCESS;
2347       }
2348 
2349       /* Mark this request as completed. The error handling
2350       will be done in the calling function. */
2351       m_array->acquire();
2352 
2353       /* events[i].res2 should always be ZERO */
2354       ut_ad(events[i].res2 == 0);
2355       slot->io_already_done = true;
2356 
2357       /* Even though events[i].res is an unsigned number in libaio, it is
2358       used to return a negative value (negated errno value) to indicate
2359       error and a positive value to indicate number of bytes read or
2360       written. */
2361 
2362       if (events[i].res > slot->len) {
2363         /* failure */
2364         slot->n_bytes = 0;
2365         slot->ret = events[i].res;
2366       } else {
2367         /* success */
2368         slot->n_bytes = events[i].res;
2369         slot->ret = 0;
2370       }
2371       m_array->release();
2372     }
2373 
2374     if (srv_shutdown_state.load() == SRV_SHUTDOWN_EXIT_THREADS ||
2375         !buf_flush_page_cleaner_is_active() || ret > 0) {
2376       break;
2377     }
2378 
2379     /* This error handling is for any error in collecting the
2380     IO requests. The errors, if any, for any particular IO
2381     request are simply passed on to the calling routine. */
2382 
2383     switch (ret) {
2384       case -EAGAIN:
2385         /* Not enough resources! Try again. */
2386 
2387       case -EINTR:
2388         /* Interrupted! The behaviour in case of an interrupt.
2389         If we have some completed IOs available then the
2390         return code will be the number of IOs. We get EINTR
2391         only if there are no completed IOs and we have been
2392         interrupted. */
2393 
2394       case 0:
2395         /* No pending request! Go back and check again. */
2396 
2397         continue;
2398     }
2399 
2400     /* All other errors should cause a trap for now. */
2401     ib::fatal(ER_IB_MSG_755)
2402         << "Unexpected ret_code[" << ret << "] from io_getevents()!";
2403 
2404     break;
2405   }
2406 }
2407 
2408 /** Process a Linux AIO request
2409 @param[out]	m1		the messages passed with the
2410 @param[out]	m2		AIO request; note that in case the
2411                                 AIO operation failed, these output
2412                                 parameters are valid and can be used to
2413                                 restart the operation.
2414 @param[out]	request		IO context
2415 @return DB_SUCCESS or error code */
poll(fil_node_t ** m1,void ** m2,IORequest * request)2416 dberr_t LinuxAIOHandler::poll(fil_node_t **m1, void **m2, IORequest *request) {
2417   dberr_t err;
2418   Slot *slot;
2419 
2420   /* Loop until we have found a completed request. */
2421   for (;;) {
2422     ulint n_pending;
2423 
2424     slot = find_completed_slot(&n_pending);
2425 
2426     if (slot != nullptr) {
2427       ut_ad(m_array->is_mutex_owned());
2428 
2429       err = check_state(slot);
2430 
2431       /* DB_FAIL is not a hard error, we should retry */
2432       if (err != DB_FAIL) {
2433         break;
2434       }
2435 
2436       /* Partial IO, resubmit request for
2437       remaining bytes to read/write */
2438       err = resubmit(slot);
2439 
2440       if (err != DB_SUCCESS) {
2441         break;
2442       }
2443 
2444       m_array->release();
2445 
2446     } else if (is_shutdown() && n_pending == 0) {
2447       /* There is no completed request. If there is
2448       no pending request at all, and the system is
2449       being shut down, exit. */
2450 
2451       *m1 = nullptr;
2452       *m2 = nullptr;
2453 
2454       return (DB_SUCCESS);
2455 
2456     } else {
2457       /* Wait for some request. Note that we return
2458       from wait if we have found a request. */
2459 
2460       srv_set_io_thread_op_info(m_global_segment,
2461                                 "waiting for completed aio requests");
2462 
2463       collect();
2464     }
2465   }
2466 
2467   if (err == DB_IO_PARTIAL_FAILED) {
2468     /* Aborting in case of submit failure */
2469     ib::fatal(ER_IB_MSG_756) << "Native Linux AIO interface. "
2470                                 "io_submit() call failed when "
2471                                 "resubmitting a partial I/O "
2472                                 "request on the file "
2473                              << slot->name << ".";
2474   }
2475 
2476   *m1 = slot->m1;
2477   *m2 = slot->m2;
2478 
2479   *request = slot->type;
2480 
2481   m_array->release(slot);
2482 
2483   m_array->release();
2484 
2485   return (err);
2486 }
2487 
2488 /** This function is only used in Linux native asynchronous i/o.
2489 Waits for an aio operation to complete. This function is used to wait for
2490 the completed requests. The aio array of pending requests is divided
2491 into segments. The thread specifies which segment or slot it wants to wait
2492 for. NOTE: this function will also take care of freeing the aio slot,
2493 therefore no other thread is allowed to do the freeing!
2494 
2495 @param[in]	global_segment	segment number in the aio array
2496                                 to wait for; segment 0 is the ibuf
2497                                 i/o thread, segment 1 is log i/o thread,
2498                                 then follow the non-ibuf read threads,
2499                                 and the last are the non-ibuf write
2500                                 threads.
2501 @param[out]	m1		the messages passed with the
2502 @param[out]	m2			AIO request; note that in case the
2503                                 AIO operation failed, these output
2504                                 parameters are valid and can be used to
2505                                 restart the operation.
2506 @param[out]	request		IO context
2507 @return DB_SUCCESS if the IO was successful */
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2508 static dberr_t os_aio_linux_handler(ulint global_segment, fil_node_t **m1,
2509                                     void **m2, IORequest *request) {
2510   LinuxAIOHandler handler(global_segment);
2511 
2512   dberr_t err = handler.poll(m1, m2, request);
2513 
2514   if (err == DB_IO_NO_PUNCH_HOLE) {
2515     if (!request->is_dblwr()) {
2516       fil_no_punch_hole(*m1);
2517       err = DB_SUCCESS;
2518     }
2519   }
2520 
2521   return (err);
2522 }
2523 
2524 /** Dispatch an AIO request to the kernel.
2525 @param[in,out]	slot		an already reserved slot
2526 @return true on success. */
linux_dispatch(Slot * slot)2527 bool AIO::linux_dispatch(Slot *slot) {
2528   ut_a(slot->is_reserved);
2529   ut_ad(slot->type.validate());
2530 
2531   /* Find out what we are going to work with.
2532   The iocb struct is directly in the slot.
2533   The io_context is one per segment. */
2534 
2535   ulint io_ctx_index;
2536   struct iocb *iocb = &slot->control;
2537 
2538   io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2539 
2540   int ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
2541 
2542   /* io_submit() returns number of successfully queued requests
2543   or -errno. */
2544 
2545   if (ret != 1) {
2546     errno = -ret;
2547   }
2548 
2549   return (ret == 1);
2550 }
2551 
2552 /** Creates an io_context for native linux AIO.
2553 @param[in]	max_events	number of events
2554 @param[out]	io_ctx		io_ctx to initialize.
2555 @return true on success. */
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)2556 bool AIO::linux_create_io_ctx(ulint max_events, io_context_t *io_ctx) {
2557   ssize_t n_retries = 0;
2558 
2559   for (;;) {
2560     memset(io_ctx, 0x0, sizeof(*io_ctx));
2561 
2562     /* Initialize the io_ctx. Tell it how many pending
2563     IO requests this context will handle. */
2564 
2565     int ret = io_setup(max_events, io_ctx);
2566 
2567     if (ret == 0) {
2568       /* Success. Return now. */
2569       return (true);
2570     }
2571 
2572     /* If we hit EAGAIN we'll make a few attempts before failing. */
2573 
2574     switch (ret) {
2575       case -EAGAIN:
2576         if (n_retries == 0) {
2577           /* First time around. */
2578           ib::warn(ER_IB_MSG_757) << "io_setup() failed with EAGAIN."
2579                                      " Will make "
2580                                   << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2581                                   << " attempts before giving up.";
2582         }
2583 
2584         if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2585           ++n_retries;
2586 
2587           ib::warn(ER_IB_MSG_758) << "io_setup() attempt " << n_retries << ".";
2588 
2589           os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2590 
2591           continue;
2592         }
2593 
2594         /* Have tried enough. Better call it a day. */
2595         ib::error(ER_IB_MSG_759)
2596             << "io_setup() failed with EAGAIN after "
2597             << OS_AIO_IO_SETUP_RETRY_ATTEMPTS << " attempts.";
2598         break;
2599 
2600       case -ENOSYS:
2601         ib::error(ER_IB_MSG_760) << "Linux Native AIO interface"
2602                                     " is not supported on this platform. Please"
2603                                     " check your OS documentation and install"
2604                                     " appropriate binary of InnoDB.";
2605 
2606         break;
2607 
2608       default:
2609         ib::error(ER_IB_MSG_761) << "Linux Native AIO setup"
2610                                  << " returned following error[" << ret << "]";
2611         break;
2612     }
2613 
2614     ib::info(ER_IB_MSG_762) << "You can disable Linux Native AIO by"
2615                                " setting innodb_use_native_aio = 0 in my.cnf";
2616 
2617     break;
2618   }
2619 
2620   return (false);
2621 }
2622 
2623 /** Checks if the system supports native linux aio. On some kernel
2624 versions where native aio is supported it won't work on tmpfs. In such
2625 cases we can't use native aio as it is not possible to mix simulated
2626 and native aio.
2627 @return: true if supported, false otherwise. */
is_linux_native_aio_supported()2628 bool AIO::is_linux_native_aio_supported() {
2629   int fd;
2630   io_context_t io_ctx;
2631   char name[1000];
2632 
2633   if (!linux_create_io_ctx(1, &io_ctx)) {
2634     /* The platform does not support native aio. */
2635 
2636     return (false);
2637 
2638   } else if (!srv_read_only_mode) {
2639     /* Now check if tmpdir supports native aio ops. */
2640     fd = innobase_mysql_tmpfile(nullptr);
2641 
2642     if (fd < 0) {
2643       ib::warn(ER_IB_MSG_763) << "Unable to create temp file to check"
2644                                  " native AIO support.";
2645 
2646       return (false);
2647     }
2648   } else {
2649     ulint dirnamelen = strlen(srv_log_group_home_dir);
2650 
2651     ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2652 
2653     memcpy(name, srv_log_group_home_dir, dirnamelen);
2654 
2655     /* Add a path separator if needed. */
2656     if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2657       name[dirnamelen++] = OS_PATH_SEPARATOR;
2658     }
2659 
2660     strcpy(name + dirnamelen, "ib_logfile0");
2661 
2662     fd = ::open(name, O_RDONLY);
2663 
2664     if (fd == -1) {
2665       ib::warn(ER_IB_MSG_764) << "Unable to open"
2666                               << " \"" << name << "\" to check native"
2667                               << " AIO read support.";
2668 
2669       return (false);
2670     }
2671   }
2672 
2673   struct io_event io_event;
2674 
2675   memset(&io_event, 0x0, sizeof(io_event));
2676 
2677   byte *buf = static_cast<byte *>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
2678   byte *ptr = static_cast<byte *>(ut_align(buf, UNIV_PAGE_SIZE));
2679 
2680   struct iocb iocb;
2681 
2682   /* Suppress valgrind warning. */
2683   memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
2684   memset(&iocb, 0x0, sizeof(iocb));
2685 
2686   struct iocb *p_iocb = &iocb;
2687 
2688   if (!srv_read_only_mode) {
2689     io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
2690 
2691   } else {
2692     ut_a(UNIV_PAGE_SIZE >= 512);
2693     io_prep_pread(p_iocb, fd, ptr, 512, 0);
2694   }
2695 
2696   int err = io_submit(io_ctx, 1, &p_iocb);
2697 
2698   if (err >= 1) {
2699     /* Now collect the submitted IO request. */
2700     err = io_getevents(io_ctx, 1, 1, &io_event, nullptr);
2701   }
2702 
2703   ut_free(buf);
2704   close(fd);
2705 
2706   switch (err) {
2707     case 1:
2708       return (true);
2709 
2710     case -EINVAL:
2711     case -ENOSYS:
2712       ib::error(ER_IB_MSG_765)
2713           << "Linux Native AIO not supported. You can either"
2714              " move "
2715           << (srv_read_only_mode ? name : "tmpdir")
2716           << " to a file system that supports native"
2717              " AIO or you can set innodb_use_native_aio to"
2718              " FALSE to avoid this message.";
2719 
2720       /* fall through. */
2721     default:
2722       ib::error(ER_IB_MSG_766) << "Linux Native AIO check on "
2723                                << (srv_read_only_mode ? name : "tmpdir")
2724                                << "returned error[" << -err << "]";
2725   }
2726 
2727   return (false);
2728 }
2729 
2730 #endif /* LINUX_NATIVE_AIO */
2731 
2732 /** Retrieves the last error number if an error occurs in a file io function.
2733 The number should be retrieved before any other OS calls (because they may
2734 overwrite the error number). If the number is not known to this program,
2735 the OS error number + 100 is returned.
2736 @param[in]	report_all_errors	true if we want an error message
2737                                         printed of all errors
2738 @param[in]	on_error_silent		true then don't print any diagnostic
2739                                         to the log
2740 @return error number, or OS error number + 100 */
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2741 static ulint os_file_get_last_error_low(bool report_all_errors,
2742                                         bool on_error_silent) {
2743   int err = errno;
2744 
2745   if (err == 0) {
2746     return (0);
2747   }
2748 
2749   if (report_all_errors ||
2750       (err != ENOSPC && err != EEXIST && !on_error_silent)) {
2751     ib::error(ER_IB_MSG_767)
2752         << "Operating system error number " << err << " in a file operation.";
2753 
2754     if (err == ENOENT) {
2755       ib::error(ER_IB_MSG_768) << "The error means the system"
2756                                   " cannot find the path specified.";
2757 
2758 #ifndef UNIV_HOTBACKUP
2759       if (srv_is_being_started) {
2760         ib::error(ER_IB_MSG_769) << "If you are installing InnoDB,"
2761                                     " remember that you must create"
2762                                     " directories yourself, InnoDB"
2763                                     " does not create them.";
2764       }
2765 #endif /* !UNIV_HOTBACKUP */
2766     } else if (err == EACCES) {
2767       ib::error(ER_IB_MSG_770) << "The error means mysqld does not have"
2768                                   " the access rights to the directory.";
2769 
2770     } else {
2771       if (strerror(err) != nullptr) {
2772         ib::error(ER_IB_MSG_771)
2773             << "Error number " << err << " means '" << strerror(err) << "'";
2774       }
2775 
2776       ib::info(ER_IB_MSG_772) << OPERATING_SYSTEM_ERROR_MSG;
2777     }
2778   }
2779 
2780   switch (err) {
2781     case ENOSPC:
2782       return (OS_FILE_DISK_FULL);
2783     case ENOENT:
2784       return (OS_FILE_NOT_FOUND);
2785     case EEXIST:
2786       return (OS_FILE_ALREADY_EXISTS);
2787     case EXDEV:
2788     case ENOTDIR:
2789     case EISDIR:
2790       return (OS_FILE_PATH_ERROR);
2791     case EAGAIN:
2792       if (srv_use_native_aio) {
2793         return (OS_FILE_AIO_RESOURCES_RESERVED);
2794       }
2795       break;
2796     case EINTR:
2797       if (srv_use_native_aio) {
2798         return (OS_FILE_AIO_INTERRUPTED);
2799       }
2800       break;
2801     case EACCES:
2802       return (OS_FILE_ACCESS_VIOLATION);
2803     case ENAMETOOLONG:
2804       return (OS_FILE_NAME_TOO_LONG);
2805   }
2806   return (OS_FILE_ERROR_MAX + err);
2807 }
2808 
2809 /** Wrapper to fsync(2) that retries the call on some errors.
2810 Returns the value 0 if successful; otherwise the value -1 is returned and
2811 the global variable errno is set to indicate the error.
2812 @param[in]	file		open file handle
2813 @return 0 if success, -1 otherwise */
os_file_fsync_posix(os_file_t file)2814 static int os_file_fsync_posix(os_file_t file) {
2815   ulint failures = 0;
2816 #ifdef UNIV_HOTBACKUP
2817   static meb::Mutex meb_mutex;
2818 #endif /* UNIV_HOTBACKUP */
2819 
2820   for (;;) {
2821 #ifdef UNIV_HOTBACKUP
2822     meb_mutex.lock();
2823 #endif /* UNIV_HOTBACKUP */
2824     ++os_n_fsyncs;
2825 #ifdef UNIV_HOTBACKUP
2826     meb_mutex.unlock();
2827 #endif /* UNIV_HOTBACKUP */
2828 
2829     int ret = fsync(file);
2830 
2831     if (ret == 0) {
2832       return (ret);
2833     }
2834 
2835     switch (errno) {
2836       case ENOLCK:
2837 
2838         ++failures;
2839         ut_a(failures < 1000);
2840 
2841         if (!(failures % 100)) {
2842           ib::warn(ER_IB_MSG_773) << "fsync(): "
2843                                   << "No locks available; retrying";
2844         }
2845 
2846         /* 0.2 sec */
2847         os_thread_sleep(200000);
2848         break;
2849 
2850       case EIO:
2851 
2852         ib::fatal(ER_IB_MSG_1358) << "fsync() returned EIO, aborting.";
2853         break;
2854 
2855       case EINTR:
2856 
2857         ++failures;
2858         ut_a(failures < 2000);
2859         break;
2860 
2861       default:
2862         ut_error;
2863         break;
2864     }
2865   }
2866 
2867   ut_error;
2868 
2869   return (-1);
2870 }
2871 
2872 /** Check the existence and type of the given file.
2873 @param[in]	path		path name of file
2874 @param[out]	exists		true if the file exists
2875 @param[out]	type		Type of the file, if it exists
2876 @return true if call succeeded */
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)2877 static bool os_file_status_posix(const char *path, bool *exists,
2878                                  os_file_type_t *type) {
2879   struct stat statinfo;
2880 
2881   int ret = stat(path, &statinfo);
2882 
2883   if (exists != nullptr) {
2884     *exists = !ret;
2885   }
2886 
2887   if (ret == 0) {
2888     /* file exists, everything OK */
2889 
2890   } else if (errno == ENOENT || errno == ENOTDIR) {
2891     if (exists != nullptr) {
2892       *exists = false;
2893     }
2894 
2895     /* file does not exist */
2896     *type = OS_FILE_TYPE_MISSING;
2897     return (true);
2898 
2899   } else if (errno == ENAMETOOLONG) {
2900     *type = OS_FILE_TYPE_NAME_TOO_LONG;
2901     return (false);
2902   } else if (errno == EACCES) {
2903     *type = OS_FILE_PERMISSION_ERROR;
2904     return (false);
2905   } else {
2906     *type = OS_FILE_TYPE_FAILED;
2907 
2908     /* The stat() call failed with some other error. */
2909     os_file_handle_error_no_exit(path, "file_status_posix_stat", false);
2910     return (false);
2911   }
2912 
2913   if (exists != nullptr) {
2914     *exists = true;
2915   }
2916 
2917   if (S_ISDIR(statinfo.st_mode)) {
2918     *type = OS_FILE_TYPE_DIR;
2919 
2920   } else if (S_ISLNK(statinfo.st_mode)) {
2921     *type = OS_FILE_TYPE_LINK;
2922 
2923   } else if (S_ISREG(statinfo.st_mode)) {
2924     *type = OS_FILE_TYPE_FILE;
2925 
2926   } else {
2927     *type = OS_FILE_TYPE_UNKNOWN;
2928   }
2929 
2930   return (true);
2931 }
2932 
2933 /** Check the existence and usefulness of a given path.
2934 @param[in]  path  path name
2935 @retval true if the path exists and can be used
2936 @retval false if the path does not exist or if the path is
2937 unuseable to get to a possibly existing file or directory. */
os_file_exists_posix(const char * path)2938 static bool os_file_exists_posix(const char *path) {
2939   struct stat statinfo;
2940 
2941   int ret = stat(path, &statinfo);
2942 
2943   if (ret == 0) {
2944     return (true);
2945   }
2946 
2947   if (!(errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG ||
2948         errno == EACCES)) {
2949     os_file_handle_error_no_exit(path, "file_exists_posix_stat", false);
2950   }
2951 
2952   return (false);
2953 }
2954 
2955 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
2956 function!
2957 Flushes the write buffers of a given file to the disk.
2958 @param[in]	file		handle to a file
2959 @return true if success */
os_file_flush_func(os_file_t file)2960 bool os_file_flush_func(os_file_t file) {
2961   int ret;
2962 
2963   ret = os_file_fsync_posix(file);
2964 
2965   if (ret == 0) {
2966     return (true);
2967   }
2968 
2969   /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2970   we choose to ignore that error if we are using raw disks */
2971 
2972   if (srv_start_raw_disk_in_use && errno == EINVAL) {
2973     return (true);
2974   }
2975 
2976   ib::error(ER_IB_MSG_775) << "The OS said file flush did not succeed";
2977 
2978   os_file_handle_error(nullptr, "flush");
2979 
2980   /* It is a fatal error if a file flush does not succeed, because then
2981   the database can get corrupt on disk */
2982   ut_error;
2983 
2984   return (false);
2985 }
2986 
2987 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
2988 this function!
2989 A simple function to open or create a file.
2990 @param[in]	name		name of the file or path as a null-terminated
2991                                 string
2992 @param[in]	create_mode	create mode
2993 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
2994 @param[in]	read_only	if true, read only checks are enforced
2995 @param[out]	success		true if succeed, false if error
2996 @return handle to the file, not defined if error, error number
2997         can be retrieved with os_file_get_last_error */
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)2998 os_file_t os_file_create_simple_func(const char *name, ulint create_mode,
2999                                      ulint access_type, bool read_only,
3000                                      bool *success) {
3001   os_file_t file;
3002 
3003   *success = false;
3004 
3005   int create_flag;
3006 
3007   ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3008   ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3009 
3010   if (create_mode == OS_FILE_OPEN) {
3011     if (access_type == OS_FILE_READ_ONLY) {
3012       create_flag = O_RDONLY;
3013 
3014     } else if (read_only) {
3015       create_flag = O_RDONLY;
3016 
3017     } else {
3018       create_flag = O_RDWR;
3019     }
3020 
3021   } else if (read_only) {
3022     create_flag = O_RDONLY;
3023 
3024   } else if (create_mode == OS_FILE_CREATE) {
3025     create_flag = O_RDWR | O_CREAT | O_EXCL;
3026 
3027   } else if (create_mode == OS_FILE_CREATE_PATH) {
3028     /* Create subdirs along the path if needed. */
3029     dberr_t err;
3030 
3031     err = os_file_create_subdirs_if_needed(name);
3032 
3033     if (err != DB_SUCCESS) {
3034       *success = false;
3035       ib::error(ER_IB_MSG_776)
3036           << "Unable to create subdirectories '" << name << "'";
3037 
3038       return (OS_FILE_CLOSED);
3039     }
3040 
3041     create_flag = O_RDWR | O_CREAT | O_EXCL;
3042     create_mode = OS_FILE_CREATE;
3043   } else {
3044     ib::error(ER_IB_MSG_777) << "Unknown file create mode (" << create_mode
3045                              << " for file '" << name << "'";
3046 
3047     return (OS_FILE_CLOSED);
3048   }
3049 
3050   bool retry;
3051 
3052   do {
3053     file = ::open(name, create_flag, os_innodb_umask);
3054 
3055     if (file == -1) {
3056       *success = false;
3057 
3058       retry = os_file_handle_error(
3059           name, create_mode == OS_FILE_OPEN ? "open" : "create");
3060     } else {
3061       *success = true;
3062       retry = false;
3063     }
3064 
3065   } while (retry);
3066 
3067 #ifdef USE_FILE_LOCK
3068   if (!read_only && *success && access_type == OS_FILE_READ_WRITE &&
3069       os_file_lock(file, name)) {
3070     *success = false;
3071     close(file);
3072     file = -1;
3073   }
3074 #endif /* USE_FILE_LOCK */
3075 
3076   return (file);
3077 }
3078 
3079 /** This function attempts to create a directory named pathname. The new
3080 directory gets default permissions. On Unix the permissions are
3081 (0770 & ~umask). If the directory exists already, nothing is done and
3082 the call succeeds, unless the fail_if_exists arguments is true.
3083 If another error occurs, such as a permission error, this does not crash,
3084 but reports the error and returns false.
3085 @param[in]	pathname	directory name as null-terminated string
3086 @param[in]	fail_if_exists	if true, pre-existing directory is treated as
3087                                 an error.
3088 @return true if call succeeds, false on error */
os_file_create_directory(const char * pathname,bool fail_if_exists)3089 bool os_file_create_directory(const char *pathname, bool fail_if_exists) {
3090   int rcode = mkdir(pathname, 0770);
3091 
3092   if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3093     /* failure */
3094     os_file_handle_error_no_exit(pathname, "mkdir", false);
3095 
3096     return (false);
3097   }
3098 
3099   return (true);
3100 }
3101 
3102 /** This function scans the contents of a directory and invokes the callback
3103 for each entry.
3104 @param[in]	path		directory name as null-terminated string
3105 @param[in]	scan_cbk	use callback to be called for each entry
3106 @param[in]	is_drop		attempt to drop the directory after scan
3107 @return true if call succeeds, false on error */
os_file_scan_directory(const char * path,os_dir_cbk_t scan_cbk,bool is_drop)3108 bool os_file_scan_directory(const char *path, os_dir_cbk_t scan_cbk,
3109                             bool is_drop) {
3110   DIR *directory;
3111   dirent *entry;
3112 
3113   directory = opendir(path);
3114 
3115   if (directory == nullptr) {
3116     os_file_handle_error_no_exit(path, "opendir", false);
3117     return (false);
3118   }
3119 
3120   entry = readdir(directory);
3121 
3122   while (entry != nullptr) {
3123     scan_cbk(path, entry->d_name);
3124     entry = readdir(directory);
3125   }
3126 
3127   closedir(directory);
3128 
3129   if (is_drop) {
3130     int err;
3131     err = rmdir(path);
3132 
3133     if (err != 0) {
3134       os_file_handle_error_no_exit(path, "rmdir", false);
3135       return (false);
3136     }
3137   }
3138 
3139   return (true);
3140 }
3141 
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)3142 pfs_os_file_t os_file_create_func(const char *name, ulint create_mode,
3143                                   ulint purpose, ulint type, bool read_only,
3144                                   bool *success) {
3145   bool on_error_no_exit;
3146   bool on_error_silent;
3147   pfs_os_file_t file;
3148 
3149   *success = false;
3150 
3151   DBUG_EXECUTE_IF("ib_create_table_fail_disk_full", *success = false;
3152                   errno = ENOSPC; file.m_file = OS_FILE_CLOSED; return (file););
3153 
3154   int create_flag;
3155   const char *mode_str = nullptr;
3156 
3157   on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? true : false;
3158   on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT ? true : false;
3159 
3160   create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
3161   create_mode &= ~OS_FILE_ON_ERROR_SILENT;
3162 
3163   if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW ||
3164       create_mode == OS_FILE_OPEN_RETRY) {
3165     mode_str = "OPEN";
3166 
3167     create_flag = read_only ? O_RDONLY : O_RDWR;
3168 
3169   } else if (read_only) {
3170     mode_str = "OPEN";
3171 
3172     create_flag = O_RDONLY;
3173 
3174   } else if (create_mode == OS_FILE_CREATE) {
3175     mode_str = "CREATE";
3176     create_flag = O_RDWR | O_CREAT | O_EXCL;
3177 
3178   } else if (create_mode == OS_FILE_CREATE_PATH) {
3179     /* Create subdirs along the path if needed. */
3180     dberr_t err;
3181 
3182     err = os_file_create_subdirs_if_needed(name);
3183 
3184     if (err != DB_SUCCESS) {
3185       *success = false;
3186       ib::error(ER_IB_MSG_778)
3187           << "Unable to create subdirectories '" << name << "'";
3188 
3189       file.m_file = OS_FILE_CLOSED;
3190       return (file);
3191     }
3192 
3193     create_flag = O_RDWR | O_CREAT | O_EXCL;
3194     create_mode = OS_FILE_CREATE;
3195 
3196   } else {
3197     ib::error(ER_IB_MSG_779)
3198         << "Unknown file create mode (" << create_mode << ")"
3199         << " for file '" << name << "'";
3200 
3201     file.m_file = OS_FILE_CLOSED;
3202     return (file);
3203   }
3204 
3205   ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE || type == OS_DBLWR_FILE ||
3206        type == OS_CLONE_DATA_FILE || type == OS_CLONE_LOG_FILE ||
3207        type == OS_BUFFERED_FILE || type == OS_REDO_LOG_ARCHIVE_FILE);
3208 
3209   ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
3210 
3211 #ifdef O_SYNC
3212   /* We let O_SYNC only affect log files; note that we map O_DSYNC to
3213   O_SYNC because the datasync options seemed to corrupt files in 2001
3214   in both Linux and Solaris */
3215 
3216   if (!read_only && type == OS_LOG_FILE &&
3217       srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
3218     create_flag |= O_SYNC;
3219   }
3220 #endif /* O_SYNC */
3221 
3222   bool retry;
3223 
3224   do {
3225     file.m_file = ::open(name, create_flag, os_innodb_umask);
3226 
3227     if (file.m_file == -1) {
3228       const char *operation;
3229 
3230       operation =
3231           (create_mode == OS_FILE_CREATE && !read_only) ? "create" : "open";
3232 
3233       *success = false;
3234 
3235       if (on_error_no_exit) {
3236         retry = os_file_handle_error_no_exit(name, operation, on_error_silent);
3237       } else {
3238         retry = os_file_handle_error(name, operation);
3239       }
3240     } else {
3241       *success = true;
3242       retry = false;
3243     }
3244 
3245   } while (retry);
3246 
3247   /* We disable OS caching (O_DIRECT) only on data files. For clone we
3248   need to set O_DIRECT even for read_only mode. */
3249 
3250   if ((!read_only || type == OS_CLONE_DATA_FILE) && *success &&
3251       (type == OS_DATA_FILE || type == OS_CLONE_DATA_FILE ||
3252        type == OS_DBLWR_FILE) &&
3253       (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT ||
3254        srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
3255     os_file_set_nocache(file.m_file, name, mode_str);
3256   }
3257 
3258 #ifdef USE_FILE_LOCK
3259   if (!read_only && *success && create_mode != OS_FILE_OPEN_RAW &&
3260       /* Don't acquire file lock while cloning files. */
3261       type != OS_CLONE_DATA_FILE && type != OS_CLONE_LOG_FILE &&
3262       os_file_lock(file.m_file, name)) {
3263     if (create_mode == OS_FILE_OPEN_RETRY) {
3264       ib::info(ER_IB_MSG_780) << "Retrying to lock the first data file";
3265 
3266       for (int i = 0; i < 100; i++) {
3267         os_thread_sleep(1000000);
3268 
3269         if (!os_file_lock(file.m_file, name)) {
3270           *success = true;
3271           return (file);
3272         }
3273       }
3274 
3275       ib::info(ER_IB_MSG_781) << "Unable to open the first data file";
3276     }
3277 
3278     *success = false;
3279     close(file.m_file);
3280     file.m_file = -1;
3281   }
3282 #endif /* USE_FILE_LOCK */
3283 
3284   return (file);
3285 }
3286 
3287 /** NOTE! Use the corresponding macro
3288 os_file_create_simple_no_error_handling(), not directly this function!
3289 A simple function to open or create a file.
3290 @param[in]	name		name of the file or path as a null-terminated
3291                                 string
3292 @param[in]	create_mode	create mode
3293 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3294                                 OS_FILE_READ_ALLOW_DELETE; the last option
3295                                 is used by a backup program reading the file
3296 @param[in]	read_only	if true read only mode checks are enforced
3297 @param[out]	success		true if succeeded
3298 @return own: handle to the file, not defined if error, error number
3299         can be retrieved with os_file_get_last_error */
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3300 pfs_os_file_t os_file_create_simple_no_error_handling_func(const char *name,
3301                                                            ulint create_mode,
3302                                                            ulint access_type,
3303                                                            bool read_only,
3304                                                            bool *success) {
3305   pfs_os_file_t file;
3306   int create_flag;
3307 
3308   ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3309   ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3310 
3311   *success = false;
3312 
3313   if (create_mode == OS_FILE_OPEN) {
3314     if (access_type == OS_FILE_READ_ONLY) {
3315       create_flag = O_RDONLY;
3316 
3317     } else if (read_only) {
3318       create_flag = O_RDONLY;
3319 
3320     } else {
3321       ut_a(access_type == OS_FILE_READ_WRITE ||
3322            access_type == OS_FILE_READ_ALLOW_DELETE);
3323 
3324       create_flag = O_RDWR;
3325     }
3326 
3327   } else if (read_only) {
3328     create_flag = O_RDONLY;
3329 
3330   } else if (create_mode == OS_FILE_CREATE) {
3331     create_flag = O_RDWR | O_CREAT | O_EXCL;
3332 
3333   } else {
3334     ib::error(ER_IB_MSG_782) << "Unknown file create mode " << create_mode
3335                              << " for file '" << name << "'";
3336     file.m_file = OS_FILE_CLOSED;
3337     return (file);
3338   }
3339 
3340   file.m_file = ::open(name, create_flag, os_innodb_umask);
3341 
3342   *success = (file.m_file != -1);
3343 
3344 #ifdef USE_FILE_LOCK
3345   if (!read_only && *success && access_type == OS_FILE_READ_WRITE &&
3346       os_file_lock(file.m_file, name)) {
3347     *success = false;
3348     close(file.m_file);
3349     file.m_file = -1;
3350   }
3351 #endif /* USE_FILE_LOCK */
3352 
3353   return (file);
3354 }
3355 
3356 /** Deletes a file if it exists. The file has to be closed before calling this.
3357 @param[in]	name		file path as a null-terminated string
3358 @param[out]	exist		indicate if file pre-exist
3359 @return true if success */
os_file_delete_if_exists_func(const char * name,bool * exist)3360 bool os_file_delete_if_exists_func(const char *name, bool *exist) {
3361   if (!os_file_can_delete(name)) {
3362     return (false);
3363   }
3364 
3365   if (exist != nullptr) {
3366     *exist = true;
3367   }
3368 
3369   int ret = unlink(name);
3370 
3371   if (ret != 0 && errno == ENOENT) {
3372     if (exist != nullptr) {
3373       *exist = false;
3374     }
3375 
3376   } else if (ret != 0 && errno != ENOENT) {
3377     os_file_handle_error_no_exit(name, "delete", false);
3378 
3379     return (false);
3380   }
3381 
3382   return (true);
3383 }
3384 
3385 /** Deletes a file. The file has to be closed before calling this.
3386 @param[in]	name		file path as a null-terminated string
3387 @return true if success */
os_file_delete_func(const char * name)3388 bool os_file_delete_func(const char *name) {
3389   int ret = unlink(name);
3390 
3391   if (ret != 0) {
3392     os_file_handle_error_no_exit(name, "delete", false);
3393 
3394     return (false);
3395   }
3396 
3397   return (true);
3398 }
3399 
3400 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3401 function!
3402 Renames a file (can also move it to another directory). It is safest that the
3403 file is closed before calling this function.
3404 @param[in]	oldpath		old file path as a null-terminated string
3405 @param[in]	newpath		new file path
3406 @return true if success */
os_file_rename_func(const char * oldpath,const char * newpath)3407 bool os_file_rename_func(const char *oldpath, const char *newpath) {
3408 #ifdef UNIV_DEBUG
3409   /* New path must be valid but not exist. */
3410   os_file_type_t type;
3411   bool exists;
3412   ut_ad(os_file_status(newpath, &exists, &type));
3413   ut_ad(!exists);
3414 
3415   /* Old path must exist. */
3416   ut_ad(os_file_exists(oldpath));
3417 #endif /* UNIV_DEBUG */
3418 
3419   int ret = rename(oldpath, newpath);
3420 
3421   if (ret != 0) {
3422     os_file_handle_error_no_exit(oldpath, "rename", false);
3423 
3424     return (false);
3425   }
3426 
3427   return (true);
3428 }
3429 
3430 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3431 function!
3432 Closes a file handle. In case of error, error number can be retrieved with
3433 os_file_get_last_error.
3434 @param[in]	file		Handle to close
3435 @return true if success */
os_file_close_func(os_file_t file)3436 bool os_file_close_func(os_file_t file) {
3437   int ret = close(file);
3438 
3439   if (ret == -1) {
3440     os_file_handle_error(nullptr, "close");
3441 
3442     return (false);
3443   }
3444 
3445   return (true);
3446 }
3447 
3448 /** Gets a file size.
3449 @param[in]	file		handle to an open file
3450 @return file size, or (os_offset_t) -1 on failure */
os_file_get_size(pfs_os_file_t file)3451 os_offset_t os_file_get_size(pfs_os_file_t file) {
3452   /* Store current position */
3453   os_offset_t pos = lseek(file.m_file, 0, SEEK_CUR);
3454   os_offset_t file_size = lseek(file.m_file, 0, SEEK_END);
3455   /* Restore current position as the function should not change it */
3456   lseek(file.m_file, pos, SEEK_SET);
3457   return (file_size);
3458 }
3459 
3460 /** Gets a file size.
3461 @param[in]	filename	Full path to the filename to check
3462 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3463         errno */
os_file_get_size(const char * filename)3464 os_file_size_t os_file_get_size(const char *filename) {
3465   struct stat s;
3466   os_file_size_t file_size;
3467 
3468   int ret = stat(filename, &s);
3469 
3470   if (ret == 0) {
3471     file_size.m_total_size = s.st_size;
3472     /* st_blocks is in 512 byte sized blocks */
3473     file_size.m_alloc_size = s.st_blocks * 512;
3474   } else {
3475     file_size.m_total_size = ~0;
3476     file_size.m_alloc_size = (os_offset_t)errno;
3477   }
3478 
3479   return (file_size);
3480 }
3481 
3482 /** Get available free space on disk
3483 @param[in]	path		pathname of a directory or file in disk
3484 @param[out]	free_space	free space available in bytes
3485 @return DB_SUCCESS if all OK */
os_get_free_space_posix(const char * path,uint64_t & free_space)3486 static dberr_t os_get_free_space_posix(const char *path, uint64_t &free_space) {
3487   struct statvfs stat;
3488   auto ret = statvfs(path, &stat);
3489 
3490   if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3491     /* file or directory  does not exist */
3492     return (DB_NOT_FOUND);
3493 
3494   } else if (ret) {
3495     /* file exists, but stat call failed */
3496     os_file_handle_error_no_exit(path, "statvfs", false);
3497     return (DB_FAIL);
3498   }
3499 
3500   free_space = stat.f_bsize;
3501   free_space *= stat.f_bavail;
3502   return (DB_SUCCESS);
3503 }
3504 
3505 /** This function returns information about the specified file
3506 @param[in]	path		pathname of the file
3507 @param[out]	stat_info	information of a file in a directory
3508 @param[in,out]	statinfo	information of a file in a directory
3509 @param[in]	check_rw_perm	for testing whether the file can be opened
3510                                 in RW mode
3511 @param[in]	read_only	if true read only mode checks are enforced
3512 @return DB_SUCCESS if all OK */
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3513 static dberr_t os_file_get_status_posix(const char *path,
3514                                         os_file_stat_t *stat_info,
3515                                         struct stat *statinfo,
3516                                         bool check_rw_perm, bool read_only) {
3517   int ret = stat(path, statinfo);
3518 
3519   if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3520     /* file does not exist */
3521 
3522     return (DB_NOT_FOUND);
3523 
3524   } else if (ret) {
3525     /* file exists, but stat call failed */
3526 
3527     os_file_handle_error_no_exit(path, "stat", false);
3528 
3529     return (DB_FAIL);
3530   }
3531 
3532   switch (statinfo->st_mode & S_IFMT) {
3533     case S_IFDIR:
3534       stat_info->type = OS_FILE_TYPE_DIR;
3535       break;
3536     case S_IFLNK:
3537       stat_info->type = OS_FILE_TYPE_LINK;
3538       break;
3539     case S_IFBLK:
3540       /* Handle block device as regular file. */
3541     case S_IFCHR:
3542       /* Handle character device as regular file. */
3543     case S_IFREG:
3544       stat_info->type = OS_FILE_TYPE_FILE;
3545       break;
3546     default:
3547       stat_info->type = OS_FILE_TYPE_UNKNOWN;
3548   }
3549 
3550   stat_info->size = statinfo->st_size;
3551   stat_info->block_size = statinfo->st_blksize;
3552   stat_info->alloc_size = statinfo->st_blocks * 512;
3553 
3554   if (check_rw_perm && (stat_info->type == OS_FILE_TYPE_FILE ||
3555                         stat_info->type == OS_FILE_TYPE_BLOCK)) {
3556     int access = !read_only ? O_RDWR : O_RDONLY;
3557     int fh = ::open(path, access, os_innodb_umask);
3558 
3559     if (fh == -1) {
3560       stat_info->rw_perm = false;
3561     } else {
3562       stat_info->rw_perm = true;
3563       close(fh);
3564     }
3565   }
3566 
3567   return (DB_SUCCESS);
3568 }
3569 
3570 /** Truncates a file to a specified size in bytes.
3571 Do nothing if the size to preserve is greater or equal to the current
3572 size of the file.
3573 @param[in]	pathname	file path
3574 @param[in]	file		file to be truncated
3575 @param[in]	size		size to preserve in bytes
3576 @return true if success */
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)3577 static bool os_file_truncate_posix(const char *pathname, pfs_os_file_t file,
3578                                    os_offset_t size) {
3579   int res = ftruncate(file.m_file, size);
3580   if (res == -1) {
3581     bool retry;
3582 
3583     retry = os_file_handle_error_no_exit(pathname, "truncate", false);
3584 
3585     if (retry) {
3586       ib::warn(ER_IB_MSG_783) << "Truncate failed for '" << pathname << "'";
3587     }
3588   }
3589 
3590   return (res == 0);
3591 }
3592 
/** Truncates a file at its current position, i.e. cuts the file underlying
the stream off at the offset reported by ftell().
@param[in]	file	file to be truncated
@return true if success */
bool os_file_set_eof(FILE *file) {
  const int fd = fileno(file);
  const long cur_pos = ftell(file);

  return (ftruncate(fd, cur_pos) == 0);
}
3599 
3600 #ifdef UNIV_HOTBACKUP
3601 /** Closes a file handle.
3602 @param[in]	file		Handle to a file
3603 @return true if success */
os_file_close_no_error_handling(os_file_t file)3604 bool os_file_close_no_error_handling(os_file_t file) {
3605   return (close(file) != -1);
3606 }
3607 #endif /* UNIV_HOTBACKUP */
3608 
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
This implementation has an empty body. */
void os_aio_simulated_put_read_threads_to_sleep() {
  /* No op on non Windows */
}
3615 
3616 /** Depth first traversal of the directory starting from basedir
3617 @param[in]  basedir     Start scanning from this directory
3618 @param[in]  recursive  `true` if scan should be recursive
3619 @param[in]  f           Function to call for each entry */
walk_posix(const Path & basedir,bool recursive,Function && f)3620 void Dir_Walker::walk_posix(const Path &basedir, bool recursive, Function &&f) {
3621   using Stack = std::stack<Entry>;
3622 
3623   Stack directories;
3624 
3625   directories.push(Entry(basedir, 0));
3626 
3627   while (!directories.empty()) {
3628     Entry current = directories.top();
3629 
3630     directories.pop();
3631 
3632     /* Ignore hidden directories and files. */
3633     if (Fil_path::is_hidden(current.m_path)) {
3634       ib::info(ER_IB_MSG_SKIP_HIDDEN_DIR, current.m_path.c_str());
3635       continue;
3636     }
3637 
3638     DIR *parent = opendir(current.m_path.c_str());
3639 
3640     if (parent == nullptr) {
3641       ib::info(ER_IB_MSG_784) << "Failed to walk directory"
3642                               << " '" << current.m_path << "'";
3643 
3644       continue;
3645     }
3646 
3647     if (!is_directory(current.m_path)) {
3648       f(current.m_path, current.m_depth);
3649     }
3650 
3651     struct dirent *dirent = nullptr;
3652 
3653     for (;;) {
3654       dirent = readdir(parent);
3655 
3656       if (dirent == nullptr) {
3657         break;
3658       }
3659 
3660       if (strcmp(dirent->d_name, ".") == 0 ||
3661           strcmp(dirent->d_name, "..") == 0) {
3662         continue;
3663       }
3664 
3665       Path path(current.m_path);
3666 
3667       if (path.back() != '/' && path.back() != '\\') {
3668         path += OS_PATH_SEPARATOR;
3669       }
3670 
3671       path.append(dirent->d_name);
3672 
3673       /* Ignore hidden subdirectories and files. */
3674       if (Fil_path::is_hidden(path)) {
3675         ib::info(ER_IB_MSG_SKIP_HIDDEN_DIR, path.c_str());
3676         continue;
3677       }
3678 
3679       if (is_directory(path) && recursive) {
3680         directories.push(Entry(path, current.m_depth + 1));
3681       } else {
3682         f(path, current.m_depth + 1);
3683       }
3684     }
3685 
3686     closedir(parent);
3687   }
3688 }
3689 
3690 #else /* !_WIN32 */
3691 
3692 #include <WinIoCtl.h>
3693 
3694 /** Do the read/write
3695 @param[in]	request	The IO context and type
3696 @return the number of bytes read/written or negative value on error */
execute(const IORequest & request)3697 ssize_t SyncFileIO::execute(const IORequest &request) {
3698   OVERLAPPED seek;
3699 
3700   memset(&seek, 0x0, sizeof(seek));
3701 
3702   seek.Offset = (DWORD)m_offset & 0xFFFFFFFF;
3703   seek.OffsetHigh = (DWORD)(m_offset >> 32);
3704 
3705   BOOL ret;
3706   DWORD n_bytes;
3707 
3708   if (request.is_read()) {
3709     ret = ReadFile(m_fh, m_buf, static_cast<DWORD>(m_n), &n_bytes, &seek);
3710 
3711   } else {
3712     ut_ad(request.is_write());
3713     ret = WriteFile(m_fh, m_buf, static_cast<DWORD>(m_n), &n_bytes, &seek);
3714   }
3715 
3716   /* Sync IO can't be done on a file opened in AIO mode. */
3717   // ut_a(GetLastError() != ERROR_IO_PENDING);
3718 
3719   return (ret ? static_cast<ssize_t>(n_bytes) : -1);
3720 }
3721 
3722 /** Do the read/write
3723 @param[in,out]	slot	The IO slot, it has the IO context
3724 @return the number of bytes read/written or negative value on error */
execute(Slot * slot)3725 ssize_t SyncFileIO::execute(Slot *slot) {
3726   BOOL ret;
3727 
3728   if (slot->type.is_read()) {
3729     ret = ReadFile(slot->file.m_file, slot->ptr, slot->len, &slot->n_bytes,
3730                    &slot->control);
3731   } else {
3732     ut_ad(slot->type.is_write());
3733     ret = WriteFile(slot->file.m_file, slot->ptr, slot->len, &slot->n_bytes,
3734                     &slot->control);
3735   }
3736 
3737   /* Sync IO can't be done on a file opened in AIO mode. */
3738   // ut_a(GetLastError() != ERROR_IO_PENDING);
3739 
3740   return (ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
3741 }
3742 
3743 /** Check if the file system supports sparse files.
3744 @param[in]	 name		File name
3745 @return true if the file system supports sparse files */
os_is_sparse_file_supported_win32(const char * filename)3746 static bool os_is_sparse_file_supported_win32(const char *filename) {
3747   char volname[MAX_PATH];
3748   BOOL result = GetVolumePathName(filename, volname, MAX_PATH);
3749 
3750   if (!result) {
3751     ib::error(ER_IB_MSG_785)
3752         << "os_is_sparse_file_supported: "
3753         << "Failed to get the volume path name for: " << filename
3754         << "- OS error number " << GetLastError();
3755 
3756     return (false);
3757   }
3758 
3759   DWORD flags;
3760 
3761   GetVolumeInformation(volname, NULL, MAX_PATH, NULL, NULL, &flags, NULL,
3762                        MAX_PATH);
3763 
3764   return (flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
3765 }
3766 
3767 /** Free storage space associated with a section of the file.
3768 @param[in]	fh		Open file handle
3769 @param[in]	page_size	Tablespace page size
3770 @param[in]	block_size	File system block size
3771 @param[in]	off		Starting offset (SEEK_SET)
3772 @param[in]	len		Size of the hole
3773 @return 0 on success or errno */
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)3774 static dberr_t os_file_punch_hole_win32(os_file_t fh, os_offset_t off,
3775                                         os_offset_t len) {
3776   FILE_ZERO_DATA_INFORMATION punch;
3777 
3778   punch.FileOffset.QuadPart = off;
3779   punch.BeyondFinalZero.QuadPart = off + len;
3780 
3781   /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
3782   therefore we pass a dummy parameter. */
3783   DWORD temp;
3784 
3785   BOOL result = DeviceIoControl(fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
3786                                 NULL, 0, &temp, NULL);
3787 
3788   return (!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
3789 }
3790 
3791 /** Check the existence and type of a given path.
3792 @param[in]   path    pathname of the file
3793 @param[out]  exists  true if file exists
3794 @param[out]  type    type of the file (if it exists)
3795 @return true if call succeeded */
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)3796 static bool os_file_status_win32(const char *path, bool *exists,
3797                                  os_file_type_t *type) {
3798   struct _stat64 statinfo;
3799 
3800   int ret = _stat64(path, &statinfo);
3801 
3802   if (exists != nullptr) {
3803     *exists = !ret;
3804   }
3805 
3806   if (ret == 0) {
3807     /* file exists, everything OK */
3808 
3809   } else if (errno == ENOENT || errno == ENOTDIR) {
3810     *type = OS_FILE_TYPE_MISSING;
3811 
3812     /* file does not exist */
3813 
3814     if (exists != nullptr) {
3815       *exists = false;
3816     }
3817 
3818     return (true);
3819 
3820   } else if (errno == EACCES) {
3821     *type = OS_FILE_PERMISSION_ERROR;
3822     return (false);
3823 
3824   } else {
3825     *type = OS_FILE_TYPE_FAILED;
3826 
3827     /* The _stat64() call failed with some other error */
3828     os_file_handle_error_no_exit(path, "file_status_win_stat64", false);
3829     return (false);
3830   }
3831 
3832   if (exists != nullptr) {
3833     *exists = true;
3834   }
3835 
3836   if (_S_IFDIR & statinfo.st_mode) {
3837     *type = OS_FILE_TYPE_DIR;
3838 
3839   } else if (_S_IFREG & statinfo.st_mode) {
3840     *type = OS_FILE_TYPE_FILE;
3841 
3842   } else {
3843     *type = OS_FILE_TYPE_UNKNOWN;
3844   }
3845 
3846   return (true);
3847 }
3848 
3849 /** Check the existence and usefulness of a given path.
3850 @param[in]  path  path name
3851 @retval true if the path exists and can be used
3852 @retval false if the path does not exist or if the path is
3853 unuseable to get to a possibly existing file or directory. */
os_file_exists_win32(const char * path)3854 static bool os_file_exists_win32(const char *path) {
3855   struct _stat64 statinfo;
3856 
3857   int ret = _stat64(path, &statinfo);
3858 
3859   if (ret == 0) {
3860     return (true);
3861   }
3862 
3863   if (!(errno == ENOENT || errno == EINVAL || errno == EACCES)) {
3864     /* The _stat64() call failed with an unknown error */
3865     os_file_handle_error_no_exit(path, "file_exists_win_stat64", false);
3866   }
3867 
3868   return (false);
3869 }
3870 
3871 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3872 function!
3873 Flushes the write buffers of a given file to the disk.
3874 @param[in]	file		handle to a file
3875 @return true if success */
os_file_flush_func(os_file_t file)3876 bool os_file_flush_func(os_file_t file) {
3877   ++os_n_fsyncs;
3878 
3879   BOOL ret = FlushFileBuffers(file);
3880 
3881   if (ret) {
3882     return (true);
3883   }
3884 
3885   /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
3886   actually a raw device, we choose to ignore that error if we are using
3887   raw disks */
3888 
3889   if (srv_start_raw_disk_in_use && GetLastError() == ERROR_INVALID_FUNCTION) {
3890     return (true);
3891   }
3892 
3893   os_file_handle_error(NULL, "flush");
3894 
3895   /* It is a fatal error if a file flush does not succeed, because then
3896   the database can get corrupt on disk */
3897   ut_error;
3898 }
3899 
3900 /** Retrieves the last error number if an error occurs in a file io function.
3901 The number should be retrieved before any other OS calls (because they may
3902 overwrite the error number). If the number is not known to this program,
3903 the OS error number + 100 is returned.
3904 @param[in]	report_all_errors	true if we want an error message printed
3905                                         of all errors
3906 @param[in]	on_error_silent		true then don't print any diagnostic
3907                                         to the log
3908 @return error number, or OS error number + 100 */
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3909 static ulint os_file_get_last_error_low(bool report_all_errors,
3910                                         bool on_error_silent) {
3911   ulint err = (ulint)GetLastError();
3912 
3913   if (err == ERROR_SUCCESS) {
3914     return (0);
3915   }
3916 
3917   if (report_all_errors || (!on_error_silent && err != ERROR_DISK_FULL &&
3918                             err != ERROR_FILE_EXISTS)) {
3919     ib::error(ER_IB_MSG_786)
3920         << "Operating system error number " << err << " in a file operation.";
3921 
3922     if (err == ERROR_PATH_NOT_FOUND) {
3923       ib::error(ER_IB_MSG_787) << "The error means the system cannot find"
3924                                   " the path specified. It might be too long"
3925                                   " or it might not exist.";
3926 
3927 #ifndef UNIV_HOTBACKUP
3928       if (srv_is_being_started) {
3929         ib::error(ER_IB_MSG_788) << "If you are installing InnoDB,"
3930                                     " remember that you must create"
3931                                     " directories yourself, InnoDB"
3932                                     " does not create them.";
3933       }
3934 #endif /* !UNIV_HOTBACKUP */
3935 
3936     } else if (err == ERROR_ACCESS_DENIED) {
3937       ib::error(ER_IB_MSG_789) << "The error means mysqld does not have"
3938                                   " the access rights to"
3939                                   " the directory. It may also be"
3940                                   " you have created a subdirectory"
3941                                   " of the same name as a data file.";
3942 
3943     } else if (err == ERROR_SHARING_VIOLATION || err == ERROR_LOCK_VIOLATION) {
3944       ib::error(ER_IB_MSG_790) << "The error means that another program"
3945                                   " is using InnoDB's files."
3946                                   " This might be a backup or antivirus"
3947                                   " software or another instance"
3948                                   " of MySQL."
3949                                   " Please close it to get rid of this error.";
3950 
3951     } else if (err == ERROR_WORKING_SET_QUOTA ||
3952                err == ERROR_NO_SYSTEM_RESOURCES) {
3953       ib::error(ER_IB_MSG_791) << "The error means that there are no"
3954                                   " sufficient system resources or quota to"
3955                                   " complete the operation.";
3956 
3957     } else if (err == ERROR_OPERATION_ABORTED) {
3958       ib::error(ER_IB_MSG_792) << "The error means that the I/O"
3959                                   " operation has been aborted"
3960                                   " because of either a thread exit"
3961                                   " or an application request."
3962                                   " Retry attempt is made.";
3963     } else {
3964       ib::info(ER_IB_MSG_793) << OPERATING_SYSTEM_ERROR_MSG;
3965     }
3966   }
3967 
3968   if (err == ERROR_FILE_NOT_FOUND) {
3969     return (OS_FILE_NOT_FOUND);
3970   } else if (err == ERROR_PATH_NOT_FOUND) {
3971     return (OS_FILE_NAME_TOO_LONG);
3972   } else if (err == ERROR_DISK_FULL) {
3973     return (OS_FILE_DISK_FULL);
3974   } else if (err == ERROR_FILE_EXISTS) {
3975     return (OS_FILE_ALREADY_EXISTS);
3976   } else if (err == ERROR_SHARING_VIOLATION || err == ERROR_LOCK_VIOLATION) {
3977     return (OS_FILE_SHARING_VIOLATION);
3978   } else if (err == ERROR_WORKING_SET_QUOTA ||
3979              err == ERROR_NO_SYSTEM_RESOURCES) {
3980     return (OS_FILE_INSUFFICIENT_RESOURCE);
3981   } else if (err == ERROR_OPERATION_ABORTED) {
3982     return (OS_FILE_OPERATION_ABORTED);
3983   } else if (err == ERROR_ACCESS_DENIED) {
3984     return (OS_FILE_ACCESS_VIOLATION);
3985   }
3986 
3987   return (OS_FILE_ERROR_MAX + err);
3988 }
3989 
3990 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3991 this function!
3992 A simple function to open or create a file.
3993 @param[in]	name		name of the file or path as a null-terminated
3994                                 string
3995 @param[in]	create_mode	create mode
3996 @param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3997 @param[in]	read_only	if true read only mode checks are enforced
3998 @param[out]	success		true if succeed, false if error
3999 @return handle to the file, not defined if error, error number
4000         can be retrieved with os_file_get_last_error */
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4001 os_file_t os_file_create_simple_func(const char *name, ulint create_mode,
4002                                      ulint access_type, bool read_only,
4003                                      bool *success) {
4004   os_file_t file;
4005 
4006   *success = false;
4007 
4008   DWORD access;
4009   DWORD create_flag;
4010   DWORD attributes = 0;
4011 #ifdef UNIV_HOTBACKUP
4012   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
4013 #else
4014   DWORD share_mode = FILE_SHARE_READ;
4015 #endif /* UNIV_HOTBACKUP */
4016 
4017   ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4018   ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4019 
4020   if (create_mode == OS_FILE_OPEN) {
4021     create_flag = OPEN_EXISTING;
4022 
4023   } else if (read_only) {
4024     create_flag = OPEN_EXISTING;
4025 
4026   } else if (create_mode == OS_FILE_CREATE) {
4027     create_flag = CREATE_NEW;
4028 
4029   } else if (create_mode == OS_FILE_CREATE_PATH) {
4030     /* Create subdirs along the path if needed. */
4031     dberr_t err;
4032 
4033     err = os_file_create_subdirs_if_needed(name);
4034 
4035     if (err != DB_SUCCESS) {
4036       *success = false;
4037       ib::error(ER_IB_MSG_794)
4038           << "Unable to create subdirectories '" << name << "'";
4039 
4040       return (OS_FILE_CLOSED);
4041     }
4042 
4043     create_flag = CREATE_NEW;
4044     create_mode = OS_FILE_CREATE;
4045 
4046   } else {
4047     ib::error(ER_IB_MSG_795) << "Unknown file create mode (" << create_mode
4048                              << ") for file '" << name << "'";
4049 
4050     return (OS_FILE_CLOSED);
4051   }
4052 
4053   if (access_type == OS_FILE_READ_ONLY) {
4054     access = GENERIC_READ;
4055 
4056   } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4057     ut_ad(read_only);
4058 
4059     access = GENERIC_READ;
4060     share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4061 
4062   } else if (read_only) {
4063     ib::info(ER_IB_MSG_796) << "Read only mode set. Unable to"
4064                                " open file '"
4065                             << name << "' in RW mode, "
4066                             << "trying RO mode",
4067         name;
4068 
4069     access = GENERIC_READ;
4070 
4071   } else if (access_type == OS_FILE_READ_WRITE) {
4072     access = GENERIC_READ | GENERIC_WRITE;
4073 
4074   } else {
4075     ib::error(ER_IB_MSG_797) << "Unknown file access type (" << access_type
4076                              << ") "
4077                                 "for file '"
4078                              << name << "'";
4079 
4080     return (OS_FILE_CLOSED);
4081   }
4082 
4083   bool retry;
4084 
4085   do {
4086     /* Use default security attributes and no template file. */
4087 
4088     file = CreateFile((LPCTSTR)name, access, share_mode, NULL, create_flag,
4089                       attributes, NULL);
4090 
4091     if (file == INVALID_HANDLE_VALUE) {
4092       *success = false;
4093 
4094       retry = os_file_handle_error(
4095           name, create_mode == OS_FILE_OPEN ? "open" : "create");
4096 
4097     } else {
4098       retry = false;
4099 
4100       *success = true;
4101 
4102       DWORD temp;
4103 
4104       /* This is a best effort use case, if it fails then
4105       we will find out when we try and punch the hole. */
4106 
4107       DeviceIoControl(file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0, &temp, NULL);
4108     }
4109 
4110   } while (retry);
4111 
4112   return (file);
4113 }
4114 
4115 /** This function attempts to create a directory named pathname. The new
4116 directory gets default permissions. On Unix the permissions are
4117 (0770 & ~umask). If the directory exists already, nothing is done and
4118 the call succeeds, unless the fail_if_exists arguments is true.
4119 If another error occurs, such as a permission error, this does not crash,
4120 but reports the error and returns false.
4121 @param[in]	pathname	directory name as null-terminated string
4122 @param[in]	fail_if_exists	if true, pre-existing directory is treated
4123                                 as an error.
4124 @return true if call succeeds, false on error */
os_file_create_directory(const char * pathname,bool fail_if_exists)4125 bool os_file_create_directory(const char *pathname, bool fail_if_exists) {
4126   BOOL rcode;
4127 
4128   rcode = CreateDirectory((LPCTSTR)pathname, NULL);
4129   if (!(rcode != 0 ||
4130         (GetLastError() == ERROR_ALREADY_EXISTS && !fail_if_exists))) {
4131     os_file_handle_error_no_exit(pathname, "CreateDirectory", false);
4132 
4133     return (false);
4134   }
4135 
4136   return (true);
4137 }
4138 
4139 /** This function scans the contents of a directory and invokes the callback
4140 for each entry.
4141 @param[in]	path		directory name as null-terminated string
4142 @param[in]	scan_cbk	use callback to be called for each entry
4143 @param[in]	is_drop		attempt to drop the directory after scan
4144 @return true if call succeeds, false on error */
os_file_scan_directory(const char * path,os_dir_cbk_t scan_cbk,bool is_drop)4145 bool os_file_scan_directory(const char *path, os_dir_cbk_t scan_cbk,
4146                             bool is_drop) {
4147   bool file_found;
4148   HANDLE find_hdl;
4149   WIN32_FIND_DATA find_data;
4150   char wild_card_path[MAX_PATH];
4151 
4152   snprintf(wild_card_path, MAX_PATH, "%s\\*", path);
4153 
4154   find_hdl = FindFirstFile((LPCTSTR)wild_card_path, &find_data);
4155 
4156   if (find_hdl == INVALID_HANDLE_VALUE) {
4157     os_file_handle_error_no_exit(path, "FindFirstFile", false);
4158     return (false);
4159   }
4160 
4161   do {
4162     scan_cbk(path, find_data.cFileName);
4163     file_found = FindNextFile(find_hdl, &find_data);
4164 
4165   } while (file_found);
4166 
4167   FindClose(find_hdl);
4168 
4169   if (is_drop) {
4170     bool ret;
4171 
4172     ret = RemoveDirectory((LPCSTR)path);
4173 
4174     if (!ret) {
4175       os_file_handle_error_no_exit(path, "RemoveDirectory", false);
4176       return (false);
4177     }
4178   }
4179 
4180   return (true);
4181 }
4182 
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)4183 pfs_os_file_t os_file_create_func(const char *name, ulint create_mode,
4184                                   ulint purpose, ulint type, bool read_only,
4185                                   bool *success) {
4186   pfs_os_file_t file;
4187   bool retry;
4188   bool on_error_no_exit;
4189   bool on_error_silent;
4190 
4191   *success = false;
4192 
4193   DBUG_EXECUTE_IF("ib_create_table_fail_disk_full", *success = false;
4194                   SetLastError(ERROR_DISK_FULL); file.m_file = OS_FILE_CLOSED;
4195                   return (file););
4196 
4197   DWORD create_flag;
4198   DWORD share_mode = FILE_SHARE_READ;
4199 
4200   on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? true : false;
4201 
4202   on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT ? true : false;
4203 
4204   create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
4205   create_mode &= ~OS_FILE_ON_ERROR_SILENT;
4206 
4207   if (create_mode == OS_FILE_OPEN_RAW) {
4208     ut_a(!read_only);
4209 
4210     create_flag = OPEN_EXISTING;
4211 
4212     /* On Windows Physical devices require admin privileges and
4213     have to have the write-share mode set. See the remarks
4214     section for the CreateFile() function documentation in MSDN. */
4215 
4216     share_mode |= FILE_SHARE_WRITE;
4217 
4218   } else if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RETRY) {
4219     create_flag = OPEN_EXISTING;
4220 
4221   } else if (read_only) {
4222     create_flag = OPEN_EXISTING;
4223 
4224   } else if (create_mode == OS_FILE_CREATE) {
4225     create_flag = CREATE_NEW;
4226 
4227   } else if (create_mode == OS_FILE_CREATE_PATH) {
4228     /* Create subdirs along the path if needed. */
4229     dberr_t err;
4230 
4231     err = os_file_create_subdirs_if_needed(name);
4232 
4233     if (err != DB_SUCCESS) {
4234       *success = false;
4235       ib::error(ER_IB_MSG_798)
4236           << "Unable to create subdirectories '" << name << "'";
4237 
4238       file.m_file = OS_FILE_CLOSED;
4239       return (file);
4240     }
4241 
4242     create_flag = CREATE_NEW;
4243     create_mode = OS_FILE_CREATE;
4244 
4245   } else {
4246     ib::error(ER_IB_MSG_799)
4247         << "Unknown file create mode (" << create_mode << ") "
4248         << " for file '" << name << "'";
4249 
4250     file.m_file = OS_FILE_CLOSED;
4251     return (file);
4252   }
4253 
4254   DWORD attributes = 0;
4255 
4256 #ifdef UNIV_HOTBACKUP
4257   attributes |= FILE_FLAG_NO_BUFFERING;
4258 #else /* UNIV_HOTBACKUP */
4259 
4260   if (purpose == OS_FILE_AIO) {
4261 #ifdef WIN_ASYNC_IO
4262     /* If specified, use asynchronous (overlapped) io and no
4263     buffering of writes in the OS */
4264 
4265     if (srv_use_native_aio) {
4266       attributes |= FILE_FLAG_OVERLAPPED;
4267     }
4268 #endif /* WIN_ASYNC_IO */
4269 
4270   } else if (purpose == OS_FILE_NORMAL) {
4271     /* Use default setting. */
4272 
4273   } else {
4274     ib::error(ER_IB_MSG_800) << "Unknown purpose flag (" << purpose << ") "
4275                              << "while opening file '" << name << "'";
4276 
4277     file.m_file = OS_FILE_CLOSED;
4278     return (file);
4279   }
4280 
4281 #ifdef UNIV_NON_BUFFERED_IO
4282   // TODO: Create a bug, this looks wrong. The flush log
4283   // parameter is dynamic.
4284   if (type == OS_BUFFERED_FILE || type == OS_CLONE_LOG_FILE ||
4285       type == OS_LOG_FILE) {
4286     /* Do not use unbuffered i/o for the log files because
4287     we write really a lot and we have log flusher for fsyncs. */
4288 
4289   } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
4290     attributes |= FILE_FLAG_NO_BUFFERING;
4291   }
4292 #endif /* UNIV_NON_BUFFERED_IO */
4293 
4294 #endif /* UNIV_HOTBACKUP */
4295   DWORD access = GENERIC_READ;
4296 
4297   if (!read_only) {
4298     access |= GENERIC_WRITE;
4299   }
4300 
4301   /* Clone must allow concurrent write to file. */
4302   if (type == OS_CLONE_LOG_FILE || type == OS_CLONE_DATA_FILE) {
4303     share_mode |= FILE_SHARE_WRITE;
4304   }
4305 
4306   do {
4307     /* Use default security attributes and no template file. */
4308     file.m_file = CreateFile((LPCTSTR)name, access, share_mode, NULL,
4309                              create_flag, attributes, NULL);
4310 
4311     if (file.m_file == INVALID_HANDLE_VALUE) {
4312       const char *operation;
4313 
4314       operation =
4315           (create_mode == OS_FILE_CREATE && !read_only) ? "create" : "open";
4316 
4317       *success = false;
4318 
4319       if (on_error_no_exit) {
4320         retry = os_file_handle_error_no_exit(name, operation, on_error_silent);
4321       } else {
4322         retry = os_file_handle_error(name, operation);
4323       }
4324     } else {
4325       retry = false;
4326 
4327       *success = true;
4328 
4329       DWORD temp;
4330 
4331       /* This is a best effort use case, if it fails then
4332       we will find out when we try and punch the hole. */
4333       DeviceIoControl(file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0, &temp,
4334                       NULL);
4335     }
4336 
4337   } while (retry);
4338 
4339   return (file);
4340 }
4341 
4342 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4343 not directly this function!
4344 A simple function to open or create a file.
4345 @param[in]	name		name of the file or path as a null-terminated
4346                                 string
4347 @param[in]	create_mode	create mode
4348 @param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4349                                 OS_FILE_READ_ALLOW_DELETE; the last option is
4350                                 used by a backup program reading the file
4351 @param[out]	success		true if succeeded
4352 @return own: handle to the file, not defined if error, error number
4353         can be retrieved with os_file_get_last_error */
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4354 pfs_os_file_t os_file_create_simple_no_error_handling_func(const char *name,
4355                                                            ulint create_mode,
4356                                                            ulint access_type,
4357                                                            bool read_only,
4358                                                            bool *success) {
4359   pfs_os_file_t file;
4360 
4361   *success = false;
4362 
4363   DWORD access;
4364   DWORD create_flag;
4365   DWORD attributes = 0;
4366   DWORD share_mode = FILE_SHARE_READ;
4367 
4368 #ifdef UNIV_HOTBACKUP
4369   share_mode |= FILE_SHARE_WRITE;
4370 #endif /* UNIV_HOTBACKUP */
4371 
4372   ut_a(name);
4373 
4374   ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4375   ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4376 
4377   if (create_mode == OS_FILE_OPEN) {
4378     create_flag = OPEN_EXISTING;
4379 
4380   } else if (read_only) {
4381     create_flag = OPEN_EXISTING;
4382 
4383   } else if (create_mode == OS_FILE_CREATE) {
4384     create_flag = CREATE_NEW;
4385 
4386   } else {
4387     ib::error(ER_IB_MSG_801)
4388         << "Unknown file create mode (" << create_mode << ") "
4389         << " for file '" << name << "'";
4390 
4391     file.m_file = OS_FILE_CLOSED;
4392     return (file);
4393   }
4394 
4395   if (access_type == OS_FILE_READ_ONLY) {
4396     access = GENERIC_READ;
4397 
4398   } else if (read_only) {
4399     access = GENERIC_READ;
4400 
4401   } else if (access_type == OS_FILE_READ_WRITE) {
4402     access = GENERIC_READ | GENERIC_WRITE;
4403 
4404   } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4405     ut_a(!read_only);
4406 
4407     access = GENERIC_READ;
4408 
4409     /* A backup program has to give mysqld the maximum
4410     freedom to do what it likes with the file */
4411 
4412     share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4413   } else {
4414     ib::error(ER_IB_MSG_802)
4415         << "Unknown file access type (" << access_type << ") "
4416         << "for file '" << name << "'";
4417 
4418     file.m_file = OS_FILE_CLOSED;
4419     return (file);
4420   }
4421 
4422   file.m_file = CreateFile((LPCTSTR)name, access, share_mode,
4423                            NULL,  // Security attributes
4424                            create_flag, attributes,
4425                            NULL);  // No template file
4426 
4427   *success = (file.m_file != INVALID_HANDLE_VALUE);
4428 
4429   return (file);
4430 }
4431 
4432 /** Deletes a file if it exists. The file has to be closed before calling this.
4433 @param[in]	name		file path as a null-terminated string
4434 @param[out]	exist		indicate if file pre-exist
4435 @return true if success */
os_file_delete_if_exists_func(const char * name,bool * exist)4436 bool os_file_delete_if_exists_func(const char *name, bool *exist) {
4437   if (!os_file_can_delete(name)) {
4438     return (false);
4439   }
4440 
4441   if (exist != nullptr) {
4442     *exist = true;
4443   }
4444 
4445   ulint count = 0;
4446 
4447   for (;;) {
4448     /* In Windows, deleting an .ibd file may fail if mysqlbackup
4449     is copying it */
4450 
4451     bool ret = DeleteFile((LPCTSTR)name);
4452 
4453     if (ret) {
4454       return (true);
4455     }
4456 
4457     DWORD lasterr = GetLastError();
4458 
4459     if (lasterr == ERROR_FILE_NOT_FOUND || lasterr == ERROR_PATH_NOT_FOUND) {
4460       /* The file does not exist, this not an error */
4461       if (exist != NULL) {
4462         *exist = false;
4463       }
4464 
4465       return (true);
4466     }
4467 
4468     ++count;
4469 
4470     if (count > 100 && 0 == (count % 10)) {
4471       /* Print error information */
4472       os_file_get_last_error(true);
4473 
4474       ib::warn(ER_IB_MSG_803) << "Delete of file '" << name << "' failed.";
4475     }
4476 
4477     /* Sleep for a second */
4478     os_thread_sleep(1000000);
4479 
4480     if (count > 2000) {
4481       return (false);
4482     }
4483   }
4484 }
4485 
4486 /** Deletes a file. The file has to be closed before calling this.
4487 @param[in]	name		File path as NUL terminated string
4488 @return true if success */
os_file_delete_func(const char * name)4489 bool os_file_delete_func(const char *name) {
4490   ulint count = 0;
4491 
4492   for (;;) {
4493     /* In Windows, deleting an .ibd file may fail if mysqlbackup
4494     is copying it */
4495 
4496     BOOL ret = DeleteFile((LPCTSTR)name);
4497 
4498     if (ret) {
4499       return (true);
4500     }
4501 
4502     if (GetLastError() == ERROR_FILE_NOT_FOUND) {
4503       /* If the file does not exist, we classify this as
4504       a 'mild' error and return */
4505 
4506       return (false);
4507     }
4508 
4509     ++count;
4510 
4511     if (count > 100 && 0 == (count % 10)) {
4512       /* print error information */
4513       os_file_get_last_error(true);
4514 
4515       ib::warn(ER_IB_MSG_804)
4516           << "Cannot delete file '" << name << "'. Are you running mysqlbackup"
4517           << " to back up the file?";
4518     }
4519 
4520     /* sleep for a second */
4521     os_thread_sleep(1000000);
4522 
4523     if (count > 2000) {
4524       return (false);
4525     }
4526   }
4527 
4528   ut_error;
4529   return (false);
4530 }
4531 
4532 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
4533 function!
4534 Renames a file (can also move it to another directory). It is safest that the
4535 file is closed before calling this function.
4536 @param[in]	oldpath		old file path as a null-terminated string
4537 @param[in]	newpath		new file path
4538 @return true if success */
os_file_rename_func(const char * oldpath,const char * newpath)4539 bool os_file_rename_func(const char *oldpath, const char *newpath) {
4540 #ifdef UNIV_DEBUG
4541   /* New path must be valid but not exist. */
4542   os_file_type_t type;
4543   bool exists;
4544   ut_ad(os_file_status(newpath, &exists, &type));
4545   ut_ad(!exists);
4546 
4547   /* Old path must exist. */
4548   ut_ad(os_file_exists(oldpath));
4549 #endif /* UNIV_DEBUG */
4550 
4551   if (MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath)) {
4552     return (true);
4553   }
4554 
4555   os_file_handle_error_no_exit(oldpath, "rename", false);
4556 
4557   return (false);
4558 }
4559 
4560 /** NOTE! Use the corresponding macro os_file_close(), not directly
4561 this function!
4562 Closes a file handle. In case of error, error number can be retrieved with
4563 os_file_get_last_error.
4564 @param[in,own]	file		Handle to a file
4565 @return true if success */
os_file_close_func(os_file_t file)4566 bool os_file_close_func(os_file_t file) {
4567   ut_a(file != INVALID_HANDLE_VALUE);
4568 
4569   if (CloseHandle(file)) {
4570     return (true);
4571   }
4572 
4573   os_file_handle_error(NULL, "close");
4574 
4575   return (false);
4576 }
4577 
4578 /** Gets a file size.
4579 @param[in]	file		Handle to a file
4580 @return file size, or (os_offset_t) -1 on failure */
os_file_get_size(pfs_os_file_t file)4581 os_offset_t os_file_get_size(pfs_os_file_t file) {
4582   DWORD high;
4583   DWORD low;
4584 
4585   low = GetFileSize(file.m_file, &high);
4586   if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
4587     return ((os_offset_t)-1);
4588   }
4589 
4590   return (os_offset_t(low | (os_offset_t(high) << 32)));
4591 }
4592 
4593 /** Gets a file size.
4594 @param[in]	filename	Full path to the filename to check
4595 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
4596         errno */
os_file_get_size(const char * filename)4597 os_file_size_t os_file_get_size(const char *filename) {
4598   struct __stat64 s;
4599   os_file_size_t file_size;
4600 
4601   int ret = _stat64(filename, &s);
4602 
4603   if (ret == 0) {
4604     file_size.m_total_size = s.st_size;
4605 
4606     DWORD low_size;
4607     DWORD high_size;
4608 
4609     low_size = GetCompressedFileSize(filename, &high_size);
4610 
4611     if (low_size != INVALID_FILE_SIZE) {
4612       file_size.m_alloc_size = high_size;
4613       file_size.m_alloc_size <<= 32;
4614       file_size.m_alloc_size |= low_size;
4615 
4616     } else {
4617       ib::error(ER_IB_MSG_805)
4618           << "GetCompressedFileSize(" << filename << ", ..) failed.";
4619 
4620       file_size.m_alloc_size = (os_offset_t)-1;
4621     }
4622   } else {
4623     file_size.m_total_size = ~0;
4624     file_size.m_alloc_size = (os_offset_t)ret;
4625   }
4626 
4627   return (file_size);
4628 }
4629 
4630 /** Get available free space on disk
4631 @param[in]	path		pathname of a directory or file in disk
4632 @param[out]	block_size	Block size to use for IO in bytes
4633 @param[out]	free_space	free space available in bytes
4634 @return DB_SUCCESS if all OK */
os_get_free_space_win32(const char * path,uint32_t & block_size,uint64_t & free_space)4635 static dberr_t os_get_free_space_win32(const char *path, uint32_t &block_size,
4636                                        uint64_t &free_space) {
4637   char volname[MAX_PATH];
4638   BOOL result = GetVolumePathName(path, volname, MAX_PATH);
4639 
4640   if (!result) {
4641     ib::error(ER_IB_MSG_806)
4642         << "os_file_get_status_win32: "
4643         << "Failed to get the volume path name for: " << path
4644         << "- OS error number " << GetLastError();
4645 
4646     return (DB_FAIL);
4647   }
4648 
4649   DWORD sectorsPerCluster;
4650   DWORD bytesPerSector;
4651   DWORD numberOfFreeClusters;
4652   DWORD totalNumberOfClusters;
4653 
4654   result =
4655       GetDiskFreeSpace((LPCSTR)volname, &sectorsPerCluster, &bytesPerSector,
4656                        &numberOfFreeClusters, &totalNumberOfClusters);
4657 
4658   if (!result) {
4659     ib::error(ER_IB_MSG_807) << "GetDiskFreeSpace(" << volname << ",...) "
4660                              << "failed "
4661                              << "- OS error number " << GetLastError();
4662 
4663     return (DB_FAIL);
4664   }
4665 
4666   block_size = bytesPerSector * sectorsPerCluster;
4667 
4668   free_space = static_cast<uint64_t>(block_size);
4669   free_space *= numberOfFreeClusters;
4670 
4671   return (DB_SUCCESS);
4672 }
4673 
4674 /** This function returns information about the specified file
4675 @param[in]	path		pathname of the file
4676 @param[out]	stat_info	information of a file in a directory
4677 @param[in,out]	statinfo	information of a file in a directory
4678 @param[in]	check_rw_perm	for testing whether the file can be opened
4679                                 in RW mode
4680 @param[in]	read_only	true if the file is opened in read-only mode
4681 @return DB_SUCCESS if all OK */
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)4682 static dberr_t os_file_get_status_win32(const char *path,
4683                                         os_file_stat_t *stat_info,
4684                                         struct _stat64 *statinfo,
4685                                         bool check_rw_perm, bool read_only) {
4686   int ret = _stat64(path, statinfo);
4687 
4688   if (ret && (errno == ENOENT || errno == ENOTDIR)) {
4689     /* file does not exist */
4690 
4691     return (DB_NOT_FOUND);
4692 
4693   } else if (ret) {
4694     /* file exists, but stat call failed */
4695 
4696     os_file_handle_error_no_exit(path, "stat", false);
4697 
4698     return (DB_FAIL);
4699 
4700   } else if (_S_IFDIR & statinfo->st_mode) {
4701     stat_info->type = OS_FILE_TYPE_DIR;
4702 
4703   } else if (_S_IFREG & statinfo->st_mode) {
4704     DWORD access = GENERIC_READ;
4705 
4706     if (!read_only) {
4707       access |= GENERIC_WRITE;
4708     }
4709 
4710     stat_info->type = OS_FILE_TYPE_FILE;
4711 
4712     /* Check if we can open it in read-only mode. */
4713 
4714     if (check_rw_perm) {
4715       HANDLE fh;
4716 
4717       fh = CreateFile((LPCTSTR)path,  // File to open
4718                       access, FILE_SHARE_READ,
4719                       NULL,                   // Default security
4720                       OPEN_EXISTING,          // Existing file only
4721                       FILE_ATTRIBUTE_NORMAL,  // Normal file
4722                       NULL);                  // No attr. template
4723 
4724       if (fh == INVALID_HANDLE_VALUE) {
4725         stat_info->rw_perm = false;
4726       } else {
4727         stat_info->rw_perm = true;
4728         CloseHandle(fh);
4729       }
4730     }
4731 
4732     uint64_t free_space;
4733     auto err = os_get_free_space_win32(path, stat_info->block_size, free_space);
4734 
4735     if (err != DB_SUCCESS) {
4736       return (err);
4737     }
4738     /* On Windows the block size is not used as the allocation
4739     unit for sparse files. The underlying infra-structure for
4740     sparse files is based on NTFS compression. The punch hole
4741     is done on a "compression unit". This compression unit
4742     is based on the cluster size. You cannot punch a hole if
4743     the cluster size >= 8K. For smaller sizes the table is
4744     as follows:
4745 
4746     Cluster Size	Compression Unit
4747     512 Bytes		 8 KB
4748       1 KB			16 KB
4749       2 KB			32 KB
4750       4 KB			64 KB
4751 
4752     Default NTFS cluster size is 4K, compression unit size of 64K.
4753     Therefore unless the user has created the file system with
4754     a smaller cluster size and used larger page sizes there is
4755     little benefit from compression out of the box. */
4756 
4757     stat_info->block_size = (stat_info->block_size <= 4096)
4758                                 ? stat_info->block_size * 16
4759                                 : UINT32_UNDEFINED;
4760   } else {
4761     stat_info->type = OS_FILE_TYPE_UNKNOWN;
4762   }
4763 
4764   return (DB_SUCCESS);
4765 }
4766 
4767 /** Truncates a file to a specified size in bytes.
4768 Do nothing if the size to preserve is greater or equal to the current
4769 size of the file.
4770 @param[in]	pathname	file path
4771 @param[in]	file		file to be truncated
4772 @param[in]	size		size to preserve in bytes
4773 @return true if success */
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)4774 static bool os_file_truncate_win32(const char *pathname, pfs_os_file_t file,
4775                                    os_offset_t size) {
4776   LARGE_INTEGER length;
4777 
4778   length.QuadPart = size;
4779 
4780   BOOL success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
4781 
4782   if (!success) {
4783     os_file_handle_error_no_exit(pathname, "SetFilePointerEx", false);
4784   } else {
4785     success = SetEndOfFile(file.m_file);
4786     if (!success) {
4787       os_file_handle_error_no_exit(pathname, "SetEndOfFile", false);
4788     }
4789   }
4790   return (success);
4791 }
4792 
4793 /** Truncates a file at its current position.
4794 @param[in]	file		Handle to be truncated
4795 @return true if success */
os_file_set_eof(FILE * file)4796 bool os_file_set_eof(FILE *file) {
4797   HANDLE h = (HANDLE)_get_osfhandle(fileno(file));
4798 
4799   return (SetEndOfFile(h));
4800 }
4801 
4802 #ifdef UNIV_HOTBACKUP
4803 /** Closes a file handle.
4804 @param[in]	file		Handle to close
4805 @return true if success */
os_file_close_no_error_handling(os_file_t file)4806 bool os_file_close_no_error_handling(os_file_t file) {
4807   return (CloseHandle(file) ? true : false);
4808 }
4809 #endif /* UNIV_HOTBACKUP */
4810 
4811 /** This function can be called if one wants to post a batch of reads and
4812 prefers an i/o-handler thread to handle them all at once later. You must
4813 call os_aio_simulated_wake_handler_threads later to ensure the threads
4814 are not left sleeping! */
os_aio_simulated_put_read_threads_to_sleep()4815 void os_aio_simulated_put_read_threads_to_sleep() {
4816   AIO::simulated_put_read_threads_to_sleep();
4817 }
4818 
4819 /** This function can be called if one wants to post a batch of reads and
4820 prefers an i/o-handler thread to handle them all at once later. You must
4821 call os_aio_simulated_wake_handler_threads later to ensure the threads
4822 are not left sleeping! */
simulated_put_read_threads_to_sleep()4823 void AIO::simulated_put_read_threads_to_sleep() {
4824   /* The idea of putting background IO threads to sleep is only for
4825   Windows when using simulated AIO. Windows XP seems to schedule
4826   background threads too eagerly to allow for coalescing during
4827   readahead requests. */
4828 
4829   if (srv_use_native_aio) {
4830     /* We do not use simulated AIO: do nothing */
4831 
4832     return;
4833   }
4834 
4835   os_aio_recommend_sleep_for_read_threads = true;
4836 
4837   for (ulint i = 0; i < os_aio_n_segments; i++) {
4838     AIO *array{};
4839 
4840     get_array_and_local_segment(array, i);
4841 
4842     if (array == s_reads) {
4843       os_event_reset(os_aio_segment_wait_events[i]);
4844     }
4845   }
4846 }
4847 
4848 /** Depth first traversal of the directory starting from basedir
4849 @param[in]      basedir    Start scanning from this directory
4850 @param[in]      recursive  `true` if scan should be recursive
4851 @param[in]      f          Callback for each entry found */
walk_win32(const Path & basedir,bool recursive,Function && f)4852 void Dir_Walker::walk_win32(const Path &basedir, bool recursive, Function &&f) {
4853   using Stack = std::stack<Entry>;
4854 
4855   HRESULT res;
4856   size_t length;
4857   Stack directories;
4858   TCHAR directory[MAX_PATH];
4859 
4860   res = StringCchLength(basedir.c_str(), MAX_PATH, &length);
4861 
4862   /* Check if the name is too long. */
4863   if (!SUCCEEDED(res)) {
4864     ib::warn(ER_IB_MSG_808) << "StringCchLength() call failed!";
4865     return;
4866 
4867   } else if (length > (MAX_PATH - 3)) {
4868     ib::warn(ER_IB_MSG_809) << "Directory name too long: '" << basedir << "'";
4869     return;
4870   }
4871 
4872   StringCchCopy(directory, MAX_PATH, basedir.c_str());
4873 
4874   if (directory[_tcslen(directory) - 1] != TEXT('\\')) {
4875     StringCchCat(directory, MAX_PATH, TEXT("\\*"));
4876   } else {
4877     StringCchCat(directory, MAX_PATH, TEXT("*"));
4878   }
4879 
4880   directories.push(Entry(directory, 0));
4881 
4882   using Type = std::codecvt_utf8<wchar_t>;
4883   using Converter = std::wstring_convert<Type, wchar_t>;
4884 
4885   Converter converter;
4886 
4887   while (!directories.empty()) {
4888     Entry current = directories.top();
4889 
4890     directories.pop();
4891 
4892     if (Fil_path::is_hidden(current.m_path)) {
4893       ib::info(ER_IB_MSG_SKIP_HIDDEN_DIR, current.m_path.c_str());
4894       continue;
4895     }
4896 
4897     HANDLE h;
4898     WIN32_FIND_DATA dirent;
4899 
4900     h = FindFirstFile(current.m_path.c_str(), &dirent);
4901 
4902     if (h == INVALID_HANDLE_VALUE) {
4903       ib::info(ER_IB_MSG_810) << "Directory read failed:"
4904                               << " '" << current.m_path << "' during scan";
4905 
4906       continue;
4907     }
4908 
4909     do {
4910       /* dirent.cFileName is a TCHAR. */
4911       if (_tcscmp(dirent.cFileName, _T(".")) == 0 ||
4912           _tcscmp(dirent.cFileName, _T("..")) == 0) {
4913         continue;
4914       }
4915 
4916       Path path(current.m_path);
4917 
4918       /* Shorten the path to remove the trailing '*'. */
4919       ut_ad(path.substr(path.size() - 2).compare("\\*") == 0);
4920 
4921       path.resize(path.size() - 1);
4922       path.append(dirent.cFileName);
4923 
4924       /* Ignore hidden files and directories. */
4925       if (Fil_path::is_hidden(dirent) || Fil_path::is_hidden(path)) {
4926         ib::info(ER_IB_MSG_SKIP_HIDDEN_DIR, path.c_str());
4927         continue;
4928       }
4929 
4930       if ((dirent.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) && recursive) {
4931         path.append("\\*");
4932 
4933         using value_type = Stack::value_type;
4934 
4935         value_type dir(path, current.m_depth + 1);
4936 
4937         directories.push(dir);
4938 
4939       } else {
4940         f(path, current.m_depth + 1);
4941       }
4942 
4943     } while (FindNextFile(h, &dirent) != 0);
4944 
4945     if (GetLastError() != ERROR_NO_MORE_FILES) {
4946       ib::error(ER_IB_MSG_811) << "Scanning '" << directory << "'"
4947                                << " - FindNextFile(): returned error";
4948     }
4949 
4950     FindClose(h);
4951   }
4952 }
4953 #endif /* !_WIN32*/
4954 
/** Does a synchronous read or write depending upon the type specified.
In case of partial reads/writes the function tries
NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
For compressed requests the page is compressed first, and for encrypted
writes the data is encrypted after compression, before the I/O is issued.
@param[in]	in_type		IO flags
@param[in]	file		handle to an open file
@param[in,out]	buf		buffer to read into or to write from
@param[in]	n		number of bytes to read/write, starting from
                                offset
@param[in]	offset		file offset from the start where to read/write
@param[out]	err		DB_SUCCESS or error code
@return the requested number of bytes (n as passed in) when the whole
        request completed; otherwise the number of bytes transferred before
        giving up, with *err set to DB_IO_ERROR */
static MY_ATTRIBUTE((warn_unused_result)) ssize_t
    os_file_io(const IORequest &in_type, os_file_t file, void *buf, ulint n,
               os_offset_t offset, dberr_t *err) {
  /* Remember the size the caller asked for: n is passed by address to
  the compress/encrypt helpers below, and original_n is what is
  returned on success. */
  ulint original_n = n;
  file::Block *block{};
  /* Work on a private copy of the request; in_type is const. */
  IORequest type = in_type;
  ssize_t bytes_returned = 0;
  byte *encrypt_log_buf = nullptr;

  if (type.is_compressed()) {
    /* We don't compress the first page of any file. */
    ut_ad(offset > 0);
    block = os_file_compress_page(type, buf, &n);
  } else {
    block = nullptr;
  }

  /* We do encryption after compression, since if we do encryption
  before compression, the encrypted data will cause compression fail
  or low compression rate. */
  if (type.is_encrypted() && type.is_write()) {
    if (!type.is_log()) {
      /* We don't encrypt the first page of any file. */
      auto compressed_block = block;
      ut_ad(offset > 0);

      block = os_file_encrypt_page(type, buf, &n);

      /* The block from the compression step is released here, after
      the encryption call; from now on block refers to the block
      returned by os_file_encrypt_page(). */
      if (compressed_block != nullptr) {
        os_free_block(compressed_block);
      }
    } else {
      /* Skip encrypt log file header */
      if (offset >= LOG_FILE_HDR_SIZE) {
        block = os_file_encrypt_log(type, buf, encrypt_log_buf, &n);
      }
    }
  }

  SyncFileIO sync_file_io(file, buf, n, offset);

  for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
    ssize_t n_bytes = sync_file_io.execute(type);

    /* Check for a hard error. Not much we can do now. */
    if (n_bytes < 0) {
      break;

    } else if ((ulint)n_bytes + bytes_returned == n) {
      /* Together with earlier partial transfers, all n bytes are
      now accounted for. */
      bytes_returned += n_bytes;

      if (offset > 0 && (type.is_compressed() || type.is_read())) {
        *err = os_file_io_complete(type, file, reinterpret_cast<byte *>(buf),
                                   nullptr, original_n, offset, n);
      } else {
        *err = DB_SUCCESS;
      }

      if (block != nullptr) {
        os_free_block(block);
      }

      if (encrypt_log_buf != nullptr) {
        ut_free(encrypt_log_buf);
      }

      return (original_n);
    }

    /* Handle partial read/write. */

    ut_ad((ulint)n_bytes + bytes_returned < n);

    bytes_returned += (ulint)n_bytes;

    if (!type.is_partial_io_warning_disabled()) {
      const char *op = type.is_read() ? "read" : "written";

      ib::warn(ER_IB_MSG_812)
          << n << " bytes should have been " << op << ". Only "
          << bytes_returned << " bytes " << op << ". Retrying"
          << " for the remaining bytes.";
    }

    /* Advance the offset and buffer by n_bytes */
    sync_file_io.advance(n_bytes);
  }

  /* Reached after a hard error or once all retries have been used
  up: release the temporary buffers and report an IO error. */
  if (block != nullptr) {
    os_free_block(block);
  }

  if (encrypt_log_buf != nullptr) {
    ut_free(encrypt_log_buf);
  }

  *err = DB_IO_ERROR;

  if (!type.is_partial_io_warning_disabled()) {
    ib::warn(ER_IB_MSG_813)
        << "Retry attempts for " << (type.is_read() ? "reading" : "writing")
        << " partial data failed.";
  }

  return (bytes_returned);
}
5071 
5072 /** Does a synchronous write operation in Posix.
5073 @param[in]	type		IO context
5074 @param[in]	file		handle to an open file
5075 @param[out]	buf		buffer from which to write
5076 @param[in]	n		number of bytes to read, starting from offset
5077 @param[in]	offset		file offset from the start where to read
5078 @param[out]	err		DB_SUCCESS or error code
5079 @return number of bytes written, -1 if error */
5080 static MY_ATTRIBUTE((warn_unused_result)) ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5081     os_file_pwrite(IORequest &type, os_file_t file, const byte *buf, ulint n,
5082                    os_offset_t offset, dberr_t *err) {
5083 #ifdef UNIV_HOTBACKUP
5084   static meb::Mutex meb_mutex;
5085 #endif /* UNIV_HOTBACKUP */
5086 
5087   ut_ad(type.validate());
5088 
5089 #ifdef UNIV_HOTBACKUP
5090   meb_mutex.lock();
5091 #endif /* UNIV_HOTBACKUP */
5092   ++os_n_file_writes;
5093 #ifdef UNIV_HOTBACKUP
5094   meb_mutex.unlock();
5095 #endif /* UNIV_HOTBACKUP */
5096 
5097   (void)os_atomic_increment_ulint(&os_n_pending_writes, 1);
5098   MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
5099 
5100   ssize_t n_bytes = os_file_io(type, file, (void *)buf, n, offset, err);
5101 
5102   (void)os_atomic_decrement_ulint(&os_n_pending_writes, 1);
5103   MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
5104 
5105   return (n_bytes);
5106 }
5107 
5108 /** Requests a synchronous write operation.
5109 @param[in]	type		IO flags
5110 @param[in]	name		name of the file or path as a null-terminated
5111                                 string
5112 @param[in]	file		handle to an open file
5113 @param[out]	buf		buffer from which to write
5114 @param[in]	offset		file offset from the start where to read
5115 @param[in]	n		number of bytes to read, starting from offset
5116 @return DB_SUCCESS if request was successful, false if fail */
5117 static MY_ATTRIBUTE((warn_unused_result)) dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)5118     os_file_write_page(IORequest &type, const char *name, os_file_t file,
5119                        const byte *buf, os_offset_t offset, ulint n) {
5120   dberr_t err;
5121 
5122   ut_ad(type.validate());
5123   ut_ad(n > 0);
5124 
5125   ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
5126 
5127   if ((ulint)n_bytes != n && !os_has_said_disk_full) {
5128     ib::error(ER_IB_MSG_814) << "Write to file " << name << " failed at offset "
5129                              << offset << ", " << n
5130                              << " bytes should have been written,"
5131                                 " only "
5132                              << n_bytes
5133                              << " were written."
5134                                 " Operating system error number "
5135                              << errno
5136                              << "."
5137                                 " Check that your OS and file system"
5138                                 " support files of this size."
5139                                 " Check also that the disk is not full"
5140                                 " or a disk quota exceeded.";
5141 
5142     if (strerror(errno) != nullptr) {
5143       ib::error(ER_IB_MSG_815)
5144           << "Error number " << errno << " means '" << strerror(errno) << "'";
5145     }
5146 
5147     ib::info(ER_IB_MSG_816) << OPERATING_SYSTEM_ERROR_MSG;
5148 
5149     os_has_said_disk_full = true;
5150   }
5151 
5152   return (err);
5153 }
5154 
5155 /** Does a synchronous read operation in Posix.
5156 @param[in]	type		IO flags
5157 @param[in]	file		handle to an open file
5158 @param[out]	buf		buffer where to read
5159 @param[in]	offset		file offset from the start where to read
5160 @param[in]	n		number of bytes to read, starting from offset
5161 @param[out]	err		DB_SUCCESS or error code
5162 @return number of bytes read, -1 if error */
5163 static MY_ATTRIBUTE((warn_unused_result)) ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5164     os_file_pread(IORequest &type, os_file_t file, void *buf, ulint n,
5165                   os_offset_t offset, dberr_t *err) {
5166 #ifdef UNIV_HOTBACKUP
5167   static meb::Mutex meb_mutex;
5168 
5169   meb_mutex.lock();
5170 #endif /* UNIV_HOTBACKUP */
5171   ++os_n_file_reads;
5172 #ifdef UNIV_HOTBACKUP
5173   meb_mutex.unlock();
5174 #endif /* UNIV_HOTBACKUP */
5175 
5176   (void)os_atomic_increment_ulint(&os_n_pending_reads, 1);
5177   MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
5178 
5179   ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
5180 
5181   (void)os_atomic_decrement_ulint(&os_n_pending_reads, 1);
5182   MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
5183 
5184   return (n_bytes);
5185 }
5186 
5187 /** Requests a synchronous positioned read operation.
5188 @return DB_SUCCESS if request was successful, false if fail
5189 @param[in]	type		IO flags
5190 @param[in]  file_name file name
5191 @param[in]	file		handle to an open file
5192 @param[out]	buf		buffer where to read
5193 @param[in]	offset		file offset from the start where to read
5194 @param[in]	n		number of bytes to read, starting from offset
5195 @param[out]	o		number of bytes actually read
5196 @param[in]	exit_on_err	if true then exit on error
5197 @return DB_SUCCESS or error code */
5198 static MY_ATTRIBUTE((warn_unused_result)) dberr_t
os_file_read_page(IORequest & type,const char * file_name,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err)5199     os_file_read_page(IORequest &type, const char *file_name, os_file_t file,
5200                       void *buf, os_offset_t offset, ulint n, ulint *o,
5201                       bool exit_on_err) {
5202   dberr_t err;
5203 
5204 #ifdef UNIV_HOTBACKUP
5205   static meb::Mutex meb_mutex;
5206 
5207   meb_mutex.lock();
5208 #endif /* UNIV_HOTBACKUP */
5209   os_bytes_read_since_printout += n;
5210 #ifdef UNIV_HOTBACKUP
5211   meb_mutex.unlock();
5212 #endif /* UNIV_HOTBACKUP */
5213 
5214   ut_ad(type.validate());
5215   ut_ad(n > 0);
5216 
5217   for (;;) {
5218     ssize_t n_bytes;
5219 
5220     n_bytes = os_file_pread(type, file, buf, n, offset, &err);
5221 
5222     if (o != nullptr) {
5223       *o = n_bytes;
5224     }
5225 
5226     if (err != DB_SUCCESS && !exit_on_err) {
5227       return (err);
5228 
5229     } else if ((ulint)n_bytes == n) {
5230       /** The read will succeed but decompress can fail
5231       for various reasons. */
5232 
5233       if (type.is_compression_enabled() &&
5234           !Compression::is_compressed_page(static_cast<byte *>(buf))) {
5235         return (DB_SUCCESS);
5236 
5237       } else {
5238         return (err);
5239       }
5240     }
5241 
5242     ib::error(ER_IB_MSG_817)
5243         << "Tried to read " << n << " bytes at offset " << offset
5244         << ", but was only able to read " << n_bytes;
5245 
5246     if (exit_on_err) {
5247       if (!os_file_handle_error(file_name, "read")) {
5248         /* Hard error */
5249         break;
5250       }
5251 
5252     } else if (!os_file_handle_error_no_exit(file_name, "read", false)) {
5253       /* Hard error */
5254       break;
5255     }
5256 
5257     if (n_bytes > 0 && (ulint)n_bytes < n) {
5258       n -= (ulint)n_bytes;
5259       offset += (ulint)n_bytes;
5260       buf = reinterpret_cast<uchar *>(buf) + (ulint)n_bytes;
5261     }
5262   }
5263 
5264   ib::fatal(ER_IB_MSG_818) << "Cannot read from file. OS error number " << errno
5265                            << ".";
5266 
5267   return (err);
5268 }
5269 
5270 /** Retrieves the last error number if an error occurs in a file io function.
5271 The number should be retrieved before any other OS calls (because they may
5272 overwrite the error number). If the number is not known to this program,
5273 the OS error number + 100 is returned.
5274 @param[in]	report_all_errors	true if we want an error printed
5275                                         for all errors
5276 @return error number, or OS error number + 100 */
os_file_get_last_error(bool report_all_errors)5277 ulint os_file_get_last_error(bool report_all_errors) {
5278   return (os_file_get_last_error_low(report_all_errors, false));
5279 }
5280 
5281 /** Does error handling when a file operation fails.
5282 Conditionally exits (calling srv_fatal_error()) based on should_exit value
5283 and the error type, if should_exit is true then on_error_silent is ignored.
5284 @param[in]	name		name of a file or NULL
5285 @param[in]	operation	operation
5286 @param[in]	should_exit	call srv_fatal_error() on an unknown error,
5287                                 if this parameter is true
5288 @param[in]	on_error_silent	if true then don't print any message to the log
5289                                 iff it is an unknown non-fatal error
5290 @return true if we should retry the operation */
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)5291 static MY_ATTRIBUTE((warn_unused_result)) bool os_file_handle_error_cond_exit(
5292     const char *name, const char *operation, bool should_exit,
5293     bool on_error_silent) {
5294   ulint err;
5295 
5296   err = os_file_get_last_error_low(false, on_error_silent);
5297 
5298   switch (err) {
5299     case OS_FILE_DISK_FULL:
5300       /* We only print a warning about disk full once */
5301 
5302       if (os_has_said_disk_full) {
5303         return (false);
5304       }
5305 
5306       /* Disk full error is reported irrespective of the
5307       on_error_silent setting. */
5308 
5309       if (name) {
5310         ib::error(ER_IB_MSG_819)
5311             << "Encountered a problem with file '" << name << "'";
5312       }
5313 
5314       ib::error(ER_IB_MSG_820)
5315           << "Disk is full. Try to clean the disk to free space.";
5316 
5317       os_has_said_disk_full = true;
5318 
5319       return (false);
5320 
5321     case OS_FILE_AIO_RESOURCES_RESERVED:
5322     case OS_FILE_AIO_INTERRUPTED:
5323 
5324       return (true);
5325 
5326     case OS_FILE_PATH_ERROR:
5327     case OS_FILE_ALREADY_EXISTS:
5328     case OS_FILE_ACCESS_VIOLATION:
5329 
5330       return (false);
5331 
5332     case OS_FILE_SHARING_VIOLATION:
5333 
5334       os_thread_sleep(10000000); /* 10 sec */
5335       return (true);
5336 
5337     case OS_FILE_OPERATION_ABORTED:
5338     case OS_FILE_INSUFFICIENT_RESOURCE:
5339 
5340       os_thread_sleep(100000); /* 100 ms */
5341       return (true);
5342 
5343     case OS_FILE_NAME_TOO_LONG:
5344       return (false);
5345 
5346     default:
5347 
5348       /* If it is an operation that can crash on error then it
5349       is better to ignore on_error_silent and print an error message
5350       to the log. */
5351 
5352       if (should_exit || !on_error_silent) {
5353         ib::error(ER_IB_MSG_821)
5354             << "File " << (name != nullptr ? name : "(unknown)") << ": '"
5355             << operation
5356             << "'"
5357                " returned OS error "
5358             << err << "." << (should_exit ? " Cannot continue operation" : "");
5359       }
5360 
5361       if (should_exit) {
5362 #ifndef UNIV_HOTBACKUP
5363         srv_fatal_error();
5364 #else  /* !UNIV_HOTBACKUP */
5365         ib::fatal(ER_IB_MSG_822) << "Internal error,"
5366                                  << " cannot continue operation.";
5367 #endif /* !UNIV_HOTBACKUP */
5368       }
5369   }
5370 
5371   return (false);
5372 }
5373 
5374 /** Does error handling when a file operation fails.
5375 @param[in]	name		name of a file or NULL
5376 @param[in]	operation	operation name that failed
5377 @return true if we should retry the operation */
os_file_handle_error(const char * name,const char * operation)5378 static bool os_file_handle_error(const char *name, const char *operation) {
5379   /* Exit in case of unknown error */
5380   return (os_file_handle_error_cond_exit(name, operation, true, false));
5381 }
5382 
5383 /** Does error handling when a file operation fails.
5384 @param[in]	name		name of a file or NULL
5385 @param[in]	operation	operation name that failed
5386 @param[in]	on_error_silent	if true then don't print any message to the log.
5387 @return true if we should retry the operation */
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)5388 static bool os_file_handle_error_no_exit(const char *name,
5389                                          const char *operation,
5390                                          bool on_error_silent) {
5391   /* Don't exit in case of unknown error */
5392   return (
5393       os_file_handle_error_cond_exit(name, operation, false, on_error_silent));
5394 }
5395 
5396 /** Tries to disable OS caching on an opened file descriptor.
5397 @param[in]	fd		file descriptor to alter
5398 @param[in]	file_name	file name, used in the diagnostic message
5399 @param[in]	operation_name	"open" or "create"; used in the diagnostic
5400                                 message */
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5401 void os_file_set_nocache(int fd MY_ATTRIBUTE((unused)),
5402                          const char *file_name MY_ATTRIBUTE((unused)),
5403                          const char *operation_name MY_ATTRIBUTE((unused))) {
5404 /* some versions of Solaris may not have DIRECTIO_ON */
5405 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5406   if (directio(fd, DIRECTIO_ON) == -1) {
5407     int errno_save = errno;
5408 
5409     ib::error(ER_IB_MSG_823)
5410         << "Failed to set DIRECTIO_ON on file " << file_name << "; "
5411         << operation_name << ": " << strerror(errno_save)
5412         << ","
5413            " continuing anyway.";
5414   }
5415 #elif defined(O_DIRECT)
5416   if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5417     int errno_save = errno;
5418     static bool warning_message_printed = false;
5419     if (errno_save == EINVAL) {
5420       if (!warning_message_printed) {
5421         warning_message_printed = true;
5422 #ifdef UNIV_LINUX
5423         ib::warn(ER_IB_MSG_824)
5424             << "Failed to set O_DIRECT on file" << file_name << "; "
5425             << operation_name << ": " << strerror(errno_save)
5426             << ", "
5427                "continuing anyway. O_DIRECT is "
5428                "known to result in 'Invalid argument' "
5429                "on Linux on tmpfs, "
5430                "see MySQL Bug#26662.";
5431 #else  /* UNIV_LINUX */
5432         goto short_warning;
5433 #endif /* UNIV_LINUX */
5434       }
5435     } else {
5436 #ifndef UNIV_LINUX
5437     short_warning:
5438 #endif
5439       ib::warn(ER_IB_MSG_825) << "Failed to set O_DIRECT on file " << file_name
5440                               << "; " << operation_name << " : "
5441                               << strerror(errno_save) << ", continuing anyway.";
5442     }
5443   }
5444 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5445 }
5446 
os_file_set_size_fast(const char * name,pfs_os_file_t pfs_file,os_offset_t offset,os_offset_t size,bool read_only,bool flush)5447 bool os_file_set_size_fast(const char *name, pfs_os_file_t pfs_file,
5448                            os_offset_t offset, os_offset_t size, bool read_only,
5449                            bool flush) {
5450 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX) && \
5451     defined(HAVE_FALLOC_FL_ZERO_RANGE)
5452   ut_a(size >= offset);
5453 
5454   static bool print_message = true;
5455 
5456   int ret =
5457       fallocate(pfs_file.m_file, FALLOC_FL_ZERO_RANGE, offset, size - offset);
5458 
5459   if (ret == 0) {
5460     if (flush) {
5461       return os_file_flush(pfs_file);
5462     }
5463 
5464     return true;
5465   }
5466 
5467   ut_a(ret == -1);
5468 
5469   /* Print the failure message only once for all the redo log files. */
5470   if (print_message) {
5471     ib::info(ER_IB_MSG_1359) << "fallocate() failed with errno " << errno
5472                              << " - falling back to writing NULLs.";
5473     print_message = false;
5474   }
5475 #endif /* !NO_FALLOCATE && UNIV_LINUX && HAVE_FALLOC_FL_ZERO_RANGE */
5476 
5477   return os_file_set_size(name, pfs_file, offset, size, read_only, flush);
5478 }
5479 
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t offset,os_offset_t size,bool read_only,bool flush)5480 bool os_file_set_size(const char *name, pfs_os_file_t file, os_offset_t offset,
5481                       os_offset_t size, bool read_only, bool flush) {
5482   /* Write up to FSP_EXTENT_SIZE bytes at a time. */
5483   ulint buf_size = 0;
5484 
5485   if (size <= UNIV_PAGE_SIZE) {
5486     buf_size = 1;
5487   } else {
5488     buf_size = ut_min(static_cast<ulint>(64),
5489                       static_cast<ulint>(size / UNIV_PAGE_SIZE));
5490   }
5491 
5492   ut_ad(buf_size != 0);
5493 
5494   buf_size *= UNIV_PAGE_SIZE;
5495 
5496   /* Align the buffer for possible raw i/o */
5497   byte *buf2;
5498 
5499   buf2 = static_cast<byte *>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
5500 
5501   byte *buf = static_cast<byte *>(ut_align(buf2, UNIV_PAGE_SIZE));
5502 
5503   /* Write buffer full of zeros */
5504   memset(buf, 0, buf_size);
5505 
5506   os_offset_t current_size = offset;
5507 
5508   /* Count to check and print progress of file write for file_size > 100 MB. */
5509   uint percentage_count = 10;
5510 
5511   while (current_size < size) {
5512     ulint n_bytes;
5513 
5514     if (size - current_size < (os_offset_t)buf_size) {
5515       n_bytes = (ulint)(size - current_size);
5516     } else {
5517       n_bytes = buf_size;
5518     }
5519 
5520     dberr_t err;
5521     IORequest request(IORequest::WRITE);
5522 
5523 #ifdef UNIV_HOTBACKUP
5524 
5525     err = os_file_write(request, name, file, buf, current_size, n_bytes);
5526 #else
5527     /* Using AIO_mode::SYNC mode on POSIX systems will result in
5528     fall back to os_file_write/read. On Windows it will use
5529     special mechanism to wait before it returns back. */
5530 
5531     err = os_aio(request, AIO_mode::SYNC, name, file, buf, current_size,
5532                  n_bytes, read_only, nullptr, nullptr);
5533 #endif /* UNIV_HOTBACKUP */
5534 
5535     if (err != DB_SUCCESS) {
5536       ut_free(buf2);
5537       return (false);
5538     }
5539 
5540     /* Flush after each os_fsync_threhold bytes */
5541     if (flush && os_fsync_threshold != 0) {
5542       if ((current_size + n_bytes) / os_fsync_threshold !=
5543           current_size / os_fsync_threshold) {
5544         DBUG_EXECUTE_IF("flush_after_reaching_threshold",
5545                         std::cerr << os_fsync_threshold
5546                                   << " bytes being flushed at once"
5547                                   << std::endl;);
5548 
5549         bool ret = os_file_flush(file);
5550 
5551         if (!ret) {
5552           ut_free(buf2);
5553           return (false);
5554         }
5555       }
5556     }
5557 
5558     /* Print percentage of progress if the size is more than 100MB */
5559     if ((size >> 20) > 100) {
5560       float progress_percentage =
5561           ((float)(current_size + n_bytes) / (float)size) * 100;
5562 
5563       if (progress_percentage >= percentage_count) {
5564         ib::info(ER_IB_MSG_1062, name, ulonglong{size >> 20}, percentage_count);
5565         percentage_count += 10;
5566       }
5567     }
5568 
5569     current_size += n_bytes;
5570   }
5571 
5572   ut_free(buf2);
5573 
5574   if (flush) {
5575     return (os_file_flush(file));
5576   }
5577 
5578   return (true);
5579 }
5580 
5581 /** Truncates a file to a specified size in bytes.
5582 Do nothing if the size to preserve is greater or equal to the current
5583 size of the file.
5584 @param[in]	pathname	file path
5585 @param[in]	file		file to be truncated
5586 @param[in]	size		size to preserve in bytes
5587 @return true if success */
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)5588 bool os_file_truncate(const char *pathname, pfs_os_file_t file,
5589                       os_offset_t size) {
5590   /* Do nothing if the size preserved is larger than or equal to the
5591   current size of file */
5592   os_offset_t size_bytes = os_file_get_size(file);
5593 
5594   if (size >= size_bytes) {
5595     return (true);
5596   }
5597 
5598 #ifdef _WIN32
5599   return (os_file_truncate_win32(pathname, file, size));
5600 #else  /* _WIN32 */
5601   return (os_file_truncate_posix(pathname, file, size));
5602 #endif /* _WIN32 */
5603 }
5604 
5605 /** Set read/write position of a file handle to specific offset.
5606 @param[in]	pathname	file path
5607 @param[in]	file		file handle
5608 @param[in]	offset		read/write offset
5609 @return true if success */
os_file_seek(const char * pathname,os_file_t file,os_offset_t offset)5610 bool os_file_seek(const char *pathname, os_file_t file, os_offset_t offset) {
5611   bool success = true;
5612 
5613 #ifdef _WIN32
5614   LARGE_INTEGER length;
5615 
5616   length.QuadPart = offset;
5617 
5618   success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);
5619 
5620 #else  /* _WIN32 */
5621   off_t ret;
5622 
5623   ret = lseek(file, offset, SEEK_SET);
5624 
5625   if (ret == -1) {
5626     success = false;
5627   }
5628 #endif /* _WIN32 */
5629 
5630   if (!success) {
5631     os_file_handle_error_no_exit(pathname, "os_file_set", false);
5632   }
5633 
5634   return (success);
5635 }
5636 
5637 /** NOTE! Use the corresponding macro os_file_read(), not directly this
5638 function!
5639 Requests a synchronous positioned read operation.
5640 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5641 @param[in]	type		IO flags
5642 @param[in]  file_name file name
5643 @param[in]	file		handle to an open file
5644 @param[out]	buf		buffer where to read
5645 @param[in]	offset		file offset from the start where to read
5646 @param[in]	n		number of bytes to read, starting from offset
5647 @return DB_SUCCESS or error code */
os_file_read_func(IORequest & type,const char * file_name,os_file_t file,void * buf,os_offset_t offset,ulint n)5648 dberr_t os_file_read_func(IORequest &type, const char *file_name,
5649                           os_file_t file, void *buf, os_offset_t offset,
5650                           ulint n) {
5651   ut_ad(type.is_read());
5652 
5653   return (
5654       os_file_read_page(type, file_name, file, buf, offset, n, nullptr, true));
5655 }
5656 
5657 /** NOTE! Use the corresponding macro os_file_read_first_page(), not
5658 directly this function!
5659 Requests a synchronous positioned read operation of page 0 of IBD file
5660 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5661 @param[in]	type		IO flags
5662 @param[in]  file_name file name
5663 @param[in]	file		handle to an open file
5664 @param[out]	buf		buffer where to read
5665 @param[in]	n		number of bytes to read, starting from offset
5666 @return DB_SUCCESS or error code */
os_file_read_first_page_func(IORequest & type,const char * file_name,os_file_t file,void * buf,ulint n)5667 dberr_t os_file_read_first_page_func(IORequest &type, const char *file_name,
5668                                      os_file_t file, void *buf, ulint n) {
5669   ut_ad(type.is_read());
5670 
5671   dberr_t err = os_file_read_page(type, file_name, file, buf, 0,
5672                                   UNIV_ZIP_SIZE_MIN, nullptr, true);
5673 
5674   if (err == DB_SUCCESS) {
5675     uint32_t flags = fsp_header_get_flags(static_cast<byte *>(buf));
5676     const page_size_t page_size(flags);
5677     ut_ad(page_size.physical() <= n);
5678     err = os_file_read_page(type, file_name, file, buf, 0, page_size.physical(),
5679                             nullptr, true);
5680   }
5681   return (err);
5682 }
5683 
5684 /** copy data from one file to another file using read, write.
5685 @param[in]	src_file	file handle to copy from
5686 @param[in]	src_offset	offset to copy from
5687 @param[in]	dest_file	file handle to copy to
5688 @param[in]	dest_offset	offset to copy to
5689 @param[in]	size		number of bytes to copy
5690 @return DB_SUCCESS if successful */
os_file_copy_read_write(os_file_t src_file,os_offset_t src_offset,os_file_t dest_file,os_offset_t dest_offset,uint size)5691 static dberr_t os_file_copy_read_write(os_file_t src_file,
5692                                        os_offset_t src_offset,
5693                                        os_file_t dest_file,
5694                                        os_offset_t dest_offset, uint size) {
5695   dberr_t err;
5696   uint request_size;
5697   const uint BUF_SIZE = 4 * UNIV_SECTOR_SIZE;
5698 
5699   char buf[BUF_SIZE + UNIV_SECTOR_SIZE];
5700   char *buf_ptr;
5701 
5702   buf_ptr = static_cast<char *>(ut_align(buf, UNIV_SECTOR_SIZE));
5703 
5704   IORequest read_request(IORequest::READ);
5705   read_request.disable_compression();
5706   read_request.clear_encrypted();
5707 
5708   IORequest write_request(IORequest::WRITE);
5709   write_request.disable_compression();
5710   write_request.clear_encrypted();
5711 
5712   while (size > 0) {
5713     if (size > BUF_SIZE) {
5714       request_size = BUF_SIZE;
5715     } else {
5716       request_size = size;
5717     }
5718 
5719     err = os_file_read_func(read_request, nullptr, src_file, buf_ptr,
5720                             src_offset, request_size);
5721 
5722     if (err != DB_SUCCESS) {
5723       return (err);
5724     }
5725     src_offset += request_size;
5726 
5727     err = os_file_write_func(write_request, "file copy", dest_file, buf_ptr,
5728                              dest_offset, request_size);
5729 
5730     if (err != DB_SUCCESS) {
5731       return (err);
5732     }
5733     dest_offset += request_size;
5734     size -= request_size;
5735   }
5736 
5737   return (DB_SUCCESS);
5738 }
5739 
5740 /** copy data from one file to another file.
5741 @param[in]	src_file	file handle to copy from
5742 @param[in]	src_offset	offset to copy from
5743 @param[in]	dest_file	file handle to copy to
5744 @param[in]	dest_offset	offset to copy to
5745 @param[in]	size		number of bytes to copy
5746 @return DB_SUCCESS if successful */
5747 #ifdef __linux__
os_file_copy_func(os_file_t src_file,os_offset_t src_offset,os_file_t dest_file,os_offset_t dest_offset,uint size)5748 dberr_t os_file_copy_func(os_file_t src_file, os_offset_t src_offset,
5749                           os_file_t dest_file, os_offset_t dest_offset,
5750                           uint size) {
5751   dberr_t err;
5752   static bool use_sendfile = true;
5753 
5754   uint actual_size;
5755   int ret_size;
5756 
5757   int src_fd;
5758   int dest_fd;
5759 
5760   if (!os_file_seek(nullptr, src_file, src_offset)) {
5761     return (DB_IO_ERROR);
5762   }
5763 
5764   if (!os_file_seek(nullptr, dest_file, dest_offset)) {
5765     return (DB_IO_ERROR);
5766   }
5767 
5768   src_fd = OS_FD_FROM_FILE(src_file);
5769   dest_fd = OS_FD_FROM_FILE(dest_file);
5770 
5771   while (use_sendfile && size > 0) {
5772     ret_size = sendfile(dest_fd, src_fd, nullptr, size);
5773 
5774     if (ret_size == -1) {
5775       /* Fall through read/write path. */
5776       ib::info(ER_IB_MSG_827) << "sendfile failed to copy data"
5777                                  " : trying read/write ";
5778 
5779       use_sendfile = false;
5780       break;
5781     }
5782 
5783     actual_size = static_cast<uint>(ret_size);
5784 
5785     ut_ad(size >= actual_size);
5786     size -= actual_size;
5787   }
5788 
5789   if (size == 0) {
5790     return (DB_SUCCESS);
5791   }
5792 
5793   err = os_file_copy_read_write(src_file, src_offset, dest_file, dest_offset,
5794                                 size);
5795 
5796   return (err);
5797 }
5798 #else
os_file_copy_func(os_file_t src_file,os_offset_t src_offset,os_file_t dest_file,os_offset_t dest_offset,uint size)5799 dberr_t os_file_copy_func(os_file_t src_file, os_offset_t src_offset,
5800                           os_file_t dest_file, os_offset_t dest_offset,
5801                           uint size) {
5802   dberr_t err;
5803 
5804   err = os_file_copy_read_write(src_file, src_offset, dest_file, dest_offset,
5805                                 size);
5806   return (err);
5807 }
5808 #endif
5809 
5810 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
5811 not directly this function!
5812 Requests a synchronous positioned read operation.
5813 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5814 @param[in]	type		IO flags
5815 @param[in]  file_name file name
5816 @param[in]	file		handle to an open file
5817 @param[out]	buf		buffer where to read
5818 @param[in]	offset		file offset from the start where to read
5819 @param[in]	n		number of bytes to read, starting from offset
5820 @param[out]	o		number of bytes actually read
5821 @return DB_SUCCESS or error code */
os_file_read_no_error_handling_func(IORequest & type,const char * file_name,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)5822 dberr_t os_file_read_no_error_handling_func(IORequest &type,
5823                                             const char *file_name,
5824                                             os_file_t file, void *buf,
5825                                             os_offset_t offset, ulint n,
5826                                             ulint *o) {
5827   ut_ad(type.is_read());
5828 
5829   return (os_file_read_page(type, file_name, file, buf, offset, n, o, false));
5830 }
5831 
5832 /** NOTE! Use the corresponding macro os_file_write(), not directly
5833 Requests a synchronous write operation.
5834 @param[in]	type		IO flags
5835 @param[in]	name		name of the file or path as a null-terminated
5836                                 string
5837 @param[in]	file		handle to an open file
5838 @param[out]	buf		buffer from which to write
5839 @param[in]	offset		file offset from the start where to read
5840 @param[in]	n		number of bytes to read, starting from offset
5841 @return DB_SUCCESS if request was successful, false if fail */
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)5842 dberr_t os_file_write_func(IORequest &type, const char *name, os_file_t file,
5843                            const void *buf, os_offset_t offset, ulint n) {
5844   ut_ad(type.validate());
5845   ut_ad(type.is_write());
5846 
5847   /* We never compress the first page.
5848   Note: This assumes we always do block IO. */
5849   if (offset == 0) {
5850     type.clear_compressed();
5851   }
5852 
5853   const byte *ptr = reinterpret_cast<const byte *>(buf);
5854 
5855   return (os_file_write_page(type, name, file, ptr, offset, n));
5856 }
5857 
os_file_status(const char * path,bool * exists,os_file_type_t * type)5858 bool os_file_status(const char *path, bool *exists, os_file_type_t *type) {
5859 #ifdef _WIN32
5860   return (os_file_status_win32(path, exists, type));
5861 #else
5862   return (os_file_status_posix(path, exists, type));
5863 #endif /* _WIN32 */
5864 }
5865 
os_file_exists(const char * path)5866 bool os_file_exists(const char *path) {
5867 #ifdef _WIN32
5868   return (os_file_exists_win32(path));
5869 #else
5870   return (os_file_exists_posix(path));
5871 #endif /* _WIN32 */
5872 }
5873 
5874 /** Free storage space associated with a section of the file.
5875 @param[in]	fh		Open file handle
5876 @param[in]	off		Starting offset (SEEK_SET)
5877 @param[in]	len		Size of the hole
5878 @return DB_SUCCESS or error code */
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)5879 dberr_t os_file_punch_hole(os_file_t fh, os_offset_t off, os_offset_t len) {
5880   /* In this debugging mode, we act as if punch hole is supported,
5881   and then skip any calls to actually punch a hole here.
5882   In this way, Transparent Page Compression is still being tested. */
5883   DBUG_EXECUTE_IF("ignore_punch_hole", return (DB_SUCCESS););
5884 
5885 #ifdef _WIN32
5886   return (os_file_punch_hole_win32(fh, off, len));
5887 #else
5888   return (os_file_punch_hole_posix(fh, off, len));
5889 #endif /* _WIN32 */
5890 }
5891 
5892 /** Check if the file system supports sparse files.
5893 
5894 Warning: On POSIX systems we try and punch a hole from offset 0 to
5895 the system configured page size. This should only be called on an empty
5896 file.
5897 
5898 Note: On Windows we use the name and on Unices we use the file handle.
5899 
5900 @param[in]	path		File name
5901 @param[in]	fh		File handle for the file - if opened
5902 @return true if the file system supports sparse files */
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)5903 bool os_is_sparse_file_supported(const char *path, pfs_os_file_t fh) {
5904   /* In this debugging mode, we act as if punch hole is supported,
5905   then we skip any calls to actually punch a hole.  In this way,
5906   Transparent Page Compression is still being tested. */
5907   DBUG_EXECUTE_IF("ignore_punch_hole", return (true););
5908 
5909 #ifdef _WIN32
5910   return (os_is_sparse_file_supported_win32(path));
5911 #else
5912   dberr_t err;
5913 
5914   /* We don't know the FS block size, use the sector size. The FS
5915   will do the magic. */
5916   err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
5917 
5918   return (err == DB_SUCCESS);
5919 #endif /* _WIN32 */
5920 }
5921 
os_get_free_space(const char * path,uint64_t & free_space)5922 dberr_t os_get_free_space(const char *path, uint64_t &free_space) {
5923 #ifdef _WIN32
5924   uint32_t block_size;
5925   auto err = os_get_free_space_win32(path, block_size, free_space);
5926 
5927 #else
5928   auto err = os_get_free_space_posix(path, free_space);
5929 
5930 #endif /* _WIN32 */
5931   return (err);
5932 }
5933 
5934 /** This function returns information about the specified file
5935 @param[in]	path		pathname of the file
5936 @param[out]	stat_info	information of a file in a directory
5937 @param[in]	check_rw_perm	for testing whether the file can be opened
5938                                 in RW mode
5939 @param[in]	read_only	true if file is opened in read-only mode
5940 @return DB_SUCCESS if all OK */
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)5941 dberr_t os_file_get_status(const char *path, os_file_stat_t *stat_info,
5942                            bool check_rw_perm, bool read_only) {
5943   dberr_t ret;
5944 
5945 #ifdef _WIN32
5946   struct _stat64 info;
5947 
5948   ret = os_file_get_status_win32(path, stat_info, &info, check_rw_perm,
5949                                  read_only);
5950 
5951 #else
5952   struct stat info;
5953 
5954   ret = os_file_get_status_posix(path, stat_info, &info, check_rw_perm,
5955                                  read_only);
5956 
5957 #endif /* _WIN32 */
5958 
5959   if (ret == DB_SUCCESS) {
5960     stat_info->ctime = info.st_ctime;
5961     stat_info->atime = info.st_atime;
5962     stat_info->mtime = info.st_mtime;
5963     stat_info->size = info.st_size;
5964   }
5965 
5966   return (ret);
5967 }
5968 
5969 /** Fill the pages with NULs
5970 @param[in] file		File handle
5971 @param[in] name		File name
5972 @param[in] page_size	physical page size
5973 @param[in] start	Offset from the start of the file in bytes
5974 @param[in] len		Length in bytes
5975 @param[in] read_only_mode
5976                         if true, then read only mode checks are enforced.
5977 @return DB_SUCCESS or error code */
os_file_write_zeros(pfs_os_file_t file,const char * name,ulint page_size,os_offset_t start,ulint len,bool read_only_mode)5978 dberr_t os_file_write_zeros(pfs_os_file_t file, const char *name,
5979                             ulint page_size, os_offset_t start, ulint len,
5980                             bool read_only_mode) {
5981   ut_a(len > 0);
5982 
5983   /* Extend at most 1M at a time */
5984   ulint n_bytes = ut_min(static_cast<ulint>(1024 * 1024), len);
5985 
5986   byte *ptr = reinterpret_cast<byte *>(ut_zalloc_nokey(n_bytes + page_size));
5987 
5988   byte *buf = reinterpret_cast<byte *>(ut_align(ptr, page_size));
5989 
5990   os_offset_t offset = start;
5991   dberr_t err = DB_SUCCESS;
5992   const os_offset_t end = start + len;
5993   IORequest request(IORequest::WRITE);
5994 
5995   while (offset < end) {
5996 #ifdef UNIV_HOTBACKUP
5997     err = os_file_write(request, name, file, buf, offset, n_bytes);
5998 #else
5999     err = os_aio(request, AIO_mode::SYNC, name, file, buf, offset, n_bytes,
6000                  read_only_mode, NULL, NULL);
6001 #endif /* UNIV_HOTBACKUP */
6002 
6003     if (err != DB_SUCCESS) {
6004       break;
6005     }
6006 
6007     offset += n_bytes;
6008 
6009     n_bytes = ut_min(n_bytes, static_cast<ulint>(end - offset));
6010 
6011     DBUG_EXECUTE_IF("ib_crash_during_tablespace_extension", DBUG_SUICIDE(););
6012   }
6013 
6014   ut_free(ptr);
6015 
6016   return (err);
6017 }
6018 
6019 /** Waits for an AIO operation to complete. This function is used to wait the
6020 for completed requests. The aio array of pending requests is divided
6021 into segments. The thread specifies which segment or slot it wants to wait
6022 for. NOTE: this function will also take care of freeing the aio slot,
6023 therefore no other thread is allowed to do the freeing!
6024 @param[in]	segment		The number of the segment in the aio arrays to
6025                                 wait for; segment 0 is the ibuf I/O thread,
6026                                 segment 1 the log I/O thread, then follow the
6027                                 non-ibuf read threads, and as the last are the
6028                                 non-ibuf write threads; if this is
6029                                 ULINT_UNDEFINED, then it means that sync AIO
6030                                 is used, and this parameter is ignored
6031 @param[out]	m1		the messages passed with the AIO request; note
6032                                 that also in the case where the AIO operation
6033                                 failed, these output parameters are valid and
6034                                 can be used to restart the operation,
6035                                 for example
6036 @param[out]	m2		callback message
6037 @param[out]	request		OS_FILE_WRITE or ..._READ
6038 @return DB_SUCCESS or error code */
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6039 dberr_t os_aio_handler(ulint segment, fil_node_t **m1, void **m2,
6040                        IORequest *request) {
6041   dberr_t err;
6042 
6043   if (srv_use_native_aio) {
6044     srv_set_io_thread_op_info(segment, "native aio handle");
6045 
6046 #ifdef WIN_ASYNC_IO
6047 
6048     err = os_aio_windows_handler(segment, 0, m1, m2, request);
6049 
6050 #elif defined(LINUX_NATIVE_AIO)
6051 
6052     err = os_aio_linux_handler(segment, m1, m2, request);
6053 #else
6054     ut_error;
6055 
6056     err = DB_ERROR; /* Eliminate compiler warning */
6057 
6058 #endif /* WIN_ASYNC_IO */
6059 
6060   } else {
6061     srv_set_io_thread_op_info(segment, "simulated aio handle");
6062 
6063     err = os_aio_simulated_handler(segment, m1, m2, request);
6064   }
6065 
6066   return (err);
6067 }
6068 
6069 /** Constructor
6070 @param[in]	id		The latch ID
6071 @param[in]	n		Number of AIO slots
6072 @param[in]	segments	Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)6073 AIO::AIO(latch_id_t id, ulint n, ulint segments)
6074     : m_slots(n),
6075       m_n_segments(segments),
6076       m_n_reserved()
6077 #ifdef LINUX_NATIVE_AIO
6078       ,
6079       m_aio_ctx(),
6080       m_events(m_slots.size())
6081 #elif defined(_WIN32)
6082       ,
6083       m_handles()
6084 #endif /* LINUX_NATIVE_AIO */
6085 {
6086   ut_a(n > 0);
6087   ut_a(m_n_segments > 0);
6088 
6089   mutex_create(id, &m_mutex);
6090 
6091   m_not_full = os_event_create();
6092   m_is_empty = os_event_create();
6093 
6094 #ifdef LINUX_NATIVE_AIO
6095   memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
6096 #endif /* LINUX_NATIVE_AIO */
6097 
6098   os_event_set(m_is_empty);
6099 }
6100 
6101 /** Initialise the slots */
init_slots()6102 dberr_t AIO::init_slots() {
6103   for (ulint i = 0; i < m_slots.size(); ++i) {
6104     Slot &slot = m_slots[i];
6105 
6106     slot.pos = static_cast<uint16_t>(i);
6107 
6108     slot.is_reserved = false;
6109 
6110 #ifdef WIN_ASYNC_IO
6111 
6112     slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6113 
6114     OVERLAPPED *over = &slot.control;
6115 
6116     over->hEvent = slot.handle;
6117 
6118     (*m_handles)[i] = over->hEvent;
6119 
6120 #elif defined(LINUX_NATIVE_AIO)
6121 
6122     slot.ret = 0;
6123 
6124     slot.n_bytes = 0;
6125 
6126     memset(&slot.control, 0x0, sizeof(slot.control));
6127 
6128 #endif /* WIN_ASYNC_IO */
6129   }
6130 
6131   return (DB_SUCCESS);
6132 }
6133 
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface: allocates the io_context
array (one entry per segment) and creates an io context for each segment.
@return DB_SUCCESS, DB_OUT_OF_MEMORY if the array could not be allocated,
or DB_IO_ERROR if an io context could not be created */
dberr_t AIO::init_linux_native_aio() {
  ut_a(m_aio_ctx == nullptr);

  /* One io_context per segment in the array. */
  m_aio_ctx = static_cast<io_context **>(
      ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));

  if (m_aio_ctx == nullptr) {
    return DB_OUT_OF_MEMORY;
  }

  ulint max_events = slots_per_segment();

  for (ulint seg = 0; seg < m_n_segments; ++seg) {
    if (linux_create_io_ctx(max_events, &m_aio_ctx[seg])) {
      continue;
    }

    /* If something bad happened during aio setup
    we should call it a day and return right away.
    We don't care about any leaks because a failure
    to initialize the io subsystem means that the
    server (or at least the innodb storage engine)
    is not going to startup. */
    return DB_IO_ERROR;
  }

  return DB_SUCCESS;
}
#endif /* LINUX_NATIVE_AIO */
6167 
6168 /** Initialise the array */
init()6169 dberr_t AIO::init() {
6170   ut_a(!m_slots.empty());
6171 
6172 #ifdef _WIN32
6173   ut_a(m_handles == NULL);
6174 
6175   m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
6176 #endif /* _WIN32 */
6177 
6178   if (srv_use_native_aio) {
6179 #ifdef LINUX_NATIVE_AIO
6180     dberr_t err = init_linux_native_aio();
6181 
6182     if (err != DB_SUCCESS) {
6183       return (err);
6184     }
6185 
6186 #endif /* LINUX_NATIVE_AIO */
6187   }
6188 
6189   return (init_slots());
6190 }
6191 
6192 /** Creates an aio wait array. Note that we return NULL in case of failure.
6193 We don't care about freeing memory here because we assume that a
6194 failure will result in server refusing to start up.
6195 @param[in]	id		Latch ID
6196 @param[in]	n		maximum number of pending AIO operations
6197                                 allowed; n must be divisible by m_n_segments
6198 @param[in]	n_segments	number of segments in the AIO array
6199 @return own: AIO array, NULL on failure */
create(latch_id_t id,ulint n,ulint n_segments)6200 AIO *AIO::create(latch_id_t id, ulint n, ulint n_segments) {
6201   ut_a(n_segments > 0);
6202 
6203   if ((n % n_segments)) {
6204     ib::error(ER_IB_MSG_828) << "Maximum number of AIO operations must be "
6205                              << "divisible by number of segments";
6206 
6207     return (nullptr);
6208   }
6209 
6210   AIO *array = UT_NEW_NOKEY(AIO(id, n, n_segments));
6211 
6212   if (array != nullptr && array->init() != DB_SUCCESS) {
6213     UT_DELETE(array);
6214 
6215     array = nullptr;
6216   }
6217 
6218   return (array);
6219 }
6220 
6221 /** AIO destructor */
~AIO()6222 AIO::~AIO() {
6223 #ifdef WIN_ASYNC_IO
6224   for (ulint i = 0; i < m_slots.size(); ++i) {
6225     CloseHandle(m_slots[i].handle);
6226   }
6227 #endif /* WIN_ASYNC_IO */
6228 
6229 #ifdef _WIN32
6230   UT_DELETE(m_handles);
6231 #endif /* _WIN32 */
6232 
6233   mutex_destroy(&m_mutex);
6234 
6235   os_event_destroy(m_not_full);
6236   os_event_destroy(m_is_empty);
6237 
6238 #if defined(LINUX_NATIVE_AIO)
6239   if (srv_use_native_aio) {
6240     m_events.clear();
6241     ut_free(m_aio_ctx);
6242   }
6243 #endif /* LINUX_NATIVE_AIO */
6244 
6245   m_slots.clear();
6246 }
6247 
6248 /** Initializes the asynchronous io system. Creates one array each for ibuf
6249 and log i/o. Also creates one array each for read and write where each
6250 array is divided logically into n_readers and n_writers
6251 respectively. The caller must create an i/o handler thread for each
6252 segment in these arrays. This function also creates the sync array.
6253 No i/o handler thread needs to be created for that
6254 @param[in]	n_per_seg	maximum number of pending aio
6255                                 operations allowed per segment
6256 @param[in]	n_readers	number of reader threads
6257 @param[in]	n_writers	number of writer threads
6258 @param[in]	n_slots_sync	number of slots in the sync aio array
6259 @return true if the AIO sub-system was started successfully */
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)6260 bool AIO::start(ulint n_per_seg, ulint n_readers, ulint n_writers,
6261                 ulint n_slots_sync) {
6262 #if defined(LINUX_NATIVE_AIO)
6263   /* Check if native aio is supported on this system and tmpfs */
6264   if (srv_use_native_aio && !is_linux_native_aio_supported()) {
6265     ib::warn(ER_IB_MSG_829) << "Linux Native AIO disabled.";
6266 
6267     srv_use_native_aio = FALSE;
6268   }
6269 #endif /* LINUX_NATIVE_AIO */
6270 
6271   srv_reset_io_thread_op_info();
6272 
6273   s_reads =
6274       create(LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
6275 
6276   if (s_reads == nullptr) {
6277     return false;
6278   }
6279 
6280   ulint start = srv_read_only_mode ? 0 : 2;
6281   ulint n_segs = n_readers + start;
6282 
6283 #ifndef UNIV_HOTBACKUP
6284   /* 0 is the ibuf segment and 1 is the redo log segment. */
6285   for (ulint i = start; i < n_segs; ++i) {
6286     ut_a(i < SRV_MAX_N_IO_THREADS);
6287     srv_io_thread_function[i] = "read thread";
6288   }
6289 #endif /* !UNIV_HOTBACKUP */
6290 
6291   ulint n_segments = n_readers;
6292 
6293   if (!srv_read_only_mode) {
6294     s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
6295 
6296     if (s_ibuf == nullptr) {
6297       return false;
6298     }
6299 
6300     ++n_segments;
6301 
6302 #ifndef UNIV_HOTBACKUP
6303     srv_io_thread_function[0] = "insert buffer thread";
6304 #endif /* !UNIV_HOTBACKUP */
6305 
6306     s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
6307 
6308     if (s_log == nullptr) {
6309       return false;
6310     }
6311 
6312     ++n_segments;
6313 
6314 #ifndef UNIV_HOTBACKUP
6315     srv_io_thread_function[1] = "log thread";
6316 #endif /* !UNIV_HOTBAKUP */
6317 
6318   } else {
6319     s_ibuf = s_log = nullptr;
6320   }
6321 
6322   s_writes =
6323       create(LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
6324 
6325   if (s_writes == nullptr) {
6326     return false;
6327   }
6328 
6329   n_segments += n_writers;
6330 
6331 #ifndef UNIV_HOTBACKUP
6332   for (ulint i = start + n_readers; i < n_segments; ++i) {
6333     ut_a(i < SRV_MAX_N_IO_THREADS);
6334     srv_io_thread_function[i] = "write thread";
6335   }
6336 #endif /* !UNIV_HOTBACKUP */
6337 
6338   ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
6339 
6340   s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
6341 
6342   if (s_sync == nullptr) {
6343     return false;
6344   }
6345 
6346   os_aio_n_segments = n_segments;
6347 
6348   os_aio_validate();
6349 
6350   os_aio_segment_wait_events = static_cast<os_event_t *>(
6351       ut_zalloc_nokey(n_segments * sizeof *os_aio_segment_wait_events));
6352 
6353   if (os_aio_segment_wait_events == nullptr) {
6354     return false;
6355   }
6356 
6357   for (ulint i = 0; i < n_segments; ++i) {
6358     os_aio_segment_wait_events[i] = os_event_create();
6359   }
6360 
6361   os_last_printout = ut_time_monotonic();
6362 
6363   return true;
6364 }
6365 
6366 /** Free the AIO arrays */
shutdown()6367 void AIO::shutdown() {
6368   UT_DELETE(s_ibuf);
6369   s_ibuf = nullptr;
6370 
6371   UT_DELETE(s_log);
6372   s_log = nullptr;
6373 
6374   UT_DELETE(s_writes);
6375   s_writes = nullptr;
6376 
6377   UT_DELETE(s_sync);
6378   s_sync = nullptr;
6379 
6380   UT_DELETE(s_reads);
6381   s_reads = nullptr;
6382 }
6383 
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)

/** Largest disk sector size that is probed */
static const ulint MAX_SECTOR_SIZE = 4096;

/** Tries to determine the FusionIO sector size. Only does anything when
the flush method is O_DIRECT or O_DIRECT_NO_FSYNC: a temporary file named
"check_sector_size" is created with O_DIRECT and written with doubling
sector sizes, starting at UNIV_SECTOR_SIZE, until a write of the full
size succeeds. The resulting size is stored in os_io_ptr_align. If the
file cannot be created, an error is logged and os_io_ptr_align is left
untouched. errno is reset to 0 on both paths. */
void os_fusionio_get_sector_size() {
  if (srv_unix_file_flush_method != SRV_UNIX_O_DIRECT &&
      srv_unix_file_flush_method != SRV_UNIX_O_DIRECT_NO_FSYNC) {
    return;
  }

  static const char CHECK_FILE_SUFFIX[] = "/check_sector_size";

  char current_dir[3];
  char *path = srv_data_home;

  /* An empty srv_data_home means the current directory. */
  if (*path == 0) {
    current_dir[0] = FN_CURLIB;
    current_dir[1] = FN_LIBCHAR;
    current_dir[2] = 0;
    path = current_dir;
  }

  /* The directory part is everything before the last path separator,
  or the whole path when it contains no separator. */
  const char *last_sep = strrchr(path, OS_PATH_SEPARATOR);
  const ulint dir_len = (last_sep != nullptr)
                            ? static_cast<ulint>(last_sep - path)
                            : strlen(path);

  /* sizeof of the suffix includes the terminating NUL. The buffer is
  zero-filled, so appending at dir_len yields a terminated name. */
  const ulint check_path_len = dir_len + sizeof CHECK_FILE_SUFFIX;
  char *check_file_name = static_cast<char *>(ut_zalloc_nokey(check_path_len));

  memcpy(check_file_name, path, dir_len);
  strcat(check_file_name + dir_len, CHECK_FILE_SUFFIX);

  /* Create a temporary file for probing the sector size. */
  const os_file_t check_file = ::open(
      check_file_name, O_CREAT | O_TRUNC | O_WRONLY | O_DIRECT, S_IRWXU);

  if (check_file == -1) {
    ib::error(ER_IB_MSG_830)
        << "Failed to create check sector file, errno:" << errno
        << " Please confirm O_DIRECT is"
        << " supported and remove the file " << check_file_name
        << " if it exists.";
    ut_free(check_file_name);
    errno = 0;
    return;
  }

  /* Twice MAX_SECTOR_SIZE bytes, so that an aligned block of up to
  MAX_SECTOR_SIZE bytes always fits inside the buffer. */
  byte *raw_buf = static_cast<byte *>(ut_zalloc_nokey(2 * MAX_SECTOR_SIZE));

  ulint sector_size = UNIV_SECTOR_SIZE;

  for (; sector_size <= MAX_SECTOR_SIZE; sector_size *= 2) {
    byte *aligned = static_cast<byte *>(ut_align(raw_buf, sector_size));
    const ssize_t written = pwrite(check_file, aligned, sector_size, 0);

    if (written > 0 && static_cast<ulint>(written) == sector_size) {
      break;
    }
  }

  /* The sector size should be <= MAX_SECTOR_SIZE. */
  ut_ad(sector_size <= MAX_SECTOR_SIZE);

  close(check_file);
  unlink(check_file_name);

  ut_free(check_file_name);
  ut_free(raw_buf);
  errno = 0;

  os_io_ptr_align = sector_size;
}
#endif /* !NO_FALLOCATE && UNIV_LINUX */
6469 
6470 /** Creates and initializes block_cache. Creates array of MAX_BLOCKS
6471 and allocates the memory in each block to hold BUFFER_BLOCK_SIZE
6472 of data.
6473 
6474 This function is called by InnoDB during srv_start().
6475 It is also called by MEB while applying the redo logs on TDE tablespaces,
6476 the "Blocks" allocated in this block_cache are used to hold the decrypted
6477 page data. */
os_create_block_cache()6478 void os_create_block_cache() {
6479   ut_a(block_cache == nullptr);
6480 
6481   block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
6482 
6483   for (Blocks::iterator it = block_cache->begin(); it != block_cache->end();
6484        ++it) {
6485     ut_a(it->m_in_use == 0);
6486     ut_a(it->m_ptr == nullptr);
6487 
6488     /* Allocate double of max page size memory, since
6489     compress could generate more bytes than orgininal
6490     data. */
6491     it->m_ptr = static_cast<byte *>(ut_malloc_nokey(BUFFER_BLOCK_SIZE));
6492 
6493     ut_a(it->m_ptr != nullptr);
6494   }
6495 }
6496 
#ifdef UNIV_HOTBACKUP
/** De-allocates the block cache at InnoDB shutdown: frees the buffer of
every block, deletes the cache and resets block_cache to nullptr. Does
nothing when the cache does not exist. */
void meb_free_block_cache() {
  if (block_cache != nullptr) {
    for (auto &block : *block_cache) {
      /* No block may still be in use when the cache is freed. */
      ut_a(block.m_in_use == 0);
      ut_free(block.m_ptr);
    }

    UT_DELETE(block_cache);

    block_cache = nullptr;
  }
}
#endif /* UNIV_HOTBACKUP */
6515 
6516 /** Initializes the asynchronous io system. Creates one array each for ibuf
6517 and log i/o. Also creates one array each for read and write where each
6518 array is divided logically into n_readers and n_writers
6519 respectively. The caller must create an i/o handler thread for each
6520 segment in these arrays. This function also creates the sync array.
6521 No i/o handler thread needs to be created for that
6522 @param[in]	n_readers	number of reader threads
6523 @param[in]	n_writers	number of writer threads
6524 @param[in]	n_slots_sync	number of dblwr slots in the sync aio array */
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6525 bool os_aio_init(ulint n_readers, ulint n_writers, ulint n_slots_sync) {
6526   /* Maximum number of pending aio operations allowed per segment */
6527   ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6528 
6529 #ifdef _WIN32
6530   if (srv_use_native_aio) {
6531     limit = SRV_N_PENDING_IOS_PER_THREAD;
6532   }
6533 #endif /* _WIN32 */
6534 
6535   /* Get sector size for DIRECT_IO. In this case, we need to
6536   know the sector size for aligning the write buffer. */
6537 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6538   os_fusionio_get_sector_size();
6539 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6540 
6541   return (AIO::start(limit, n_readers, n_writers, n_slots_sync));
6542 }
6543 
6544 /** Frees the asynchronous io system. */
os_aio_free()6545 void os_aio_free() {
6546   AIO::shutdown();
6547 
6548   for (ulint i = 0; i < os_aio_n_segments; i++) {
6549     os_event_destroy(os_aio_segment_wait_events[i]);
6550   }
6551 
6552   ut_free(os_aio_segment_wait_events);
6553   os_aio_segment_wait_events = nullptr;
6554   os_aio_n_segments = 0;
6555 
6556   for (Blocks::iterator it = block_cache->begin(); it != block_cache->end();
6557        ++it) {
6558     ut_a(it->m_in_use == 0);
6559     ut_free(it->m_ptr);
6560   }
6561 
6562   UT_DELETE(block_cache);
6563 
6564   block_cache = nullptr;
6565 }
6566 
6567 /** Wakes up all async i/o threads so that they know to exit themselves in
6568 shutdown. */
os_aio_wake_all_threads_at_shutdown()6569 void os_aio_wake_all_threads_at_shutdown() {
6570 #ifdef WIN_ASYNC_IO
6571 
6572   AIO::wake_at_shutdown();
6573 
6574 #elif defined(LINUX_NATIVE_AIO)
6575 
6576   /* When using native AIO interface the io helper threads
6577   wait on io_getevents with a timeout value of 500ms. At
6578   each wake up these threads check the server status.
6579   No need to do anything to wake them up. */
6580 
6581   if (srv_use_native_aio) {
6582     return;
6583   }
6584 
6585 #endif /* !WIN_ASYNC_AIO */
6586 
6587   /* Fall through to simulated AIO handler wakeup if we are
6588   not using native AIO. */
6589 
6590   /* This loop wakes up all simulated ai/o threads */
6591 
6592   for (ulint i = 0; i < os_aio_n_segments; ++i) {
6593     os_event_set(os_aio_segment_wait_events[i]);
6594   }
6595 }
6596 
6597 /** Waits until there are no pending writes in AIO::s_writes. There can
6598 be other, synchronous, pending writes. */
os_aio_wait_until_no_pending_writes()6599 void os_aio_wait_until_no_pending_writes() {
6600   AIO::wait_until_no_pending_writes();
6601 }
6602 
6603 /** Calculates segment number for a slot.
6604 @param[in]	array		AIO wait array
6605 @param[in]	slot		slot in this array
6606 @return segment number (which is the number used by, for example,
6607         I/O-handler threads) */
get_segment_no_from_slot(const AIO * array,const Slot * slot)6608 ulint AIO::get_segment_no_from_slot(const AIO *array, const Slot *slot) {
6609   ulint segment;
6610   ulint seg_len;
6611 
6612   if (array == s_ibuf) {
6613     ut_ad(!srv_read_only_mode);
6614 
6615     segment = IO_IBUF_SEGMENT;
6616 
6617   } else if (array == s_log) {
6618     ut_ad(!srv_read_only_mode);
6619 
6620     segment = IO_LOG_SEGMENT;
6621 
6622   } else if (array == s_reads) {
6623     seg_len = s_reads->slots_per_segment();
6624 
6625     segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6626   } else {
6627     ut_a(array == s_writes);
6628 
6629     seg_len = s_writes->slots_per_segment();
6630 
6631     segment = s_reads->m_n_segments + (srv_read_only_mode ? 0 : 2) +
6632               slot->pos / seg_len;
6633   }
6634   return (segment);
6635 }
6636 
/** Requests a slot in the aio array. If no slot is available, waits until
the not_full event becomes signaled. The slot is filled in from the
arguments; for native AIO writes at a non-zero offset the buffer is
additionally compressed and/or encrypted as requested by the IO context,
and the platform specific control block is prepared. The function returns
after calling release().

@param[in,out]	type		IO context
@param[in,out]	m1		message to be passed along with the AIO
                                operation
@param[in,out]	m2		message to be passed along with the AIO
                                operation
@param[in]	file		file handle
@param[in]	name		name of the file or path as a NUL-terminated
                                string
@param[in,out]	buf		buffer where to read or from which to write
@param[in]	offset		file offset, where to read from or start writing
@param[in]	len		length of the block to read or write
@return pointer to slot */
Slot *AIO::reserve_slot(IORequest &type, fil_node_t *m1, void *m2,
                        pfs_os_file_t file, const char *name, void *buf,
                        os_offset_t offset, ulint len) {
#ifdef WIN_ASYNC_IO
  /* The length must fit in 32 bits: it is stored as a DWORD below. */
  ut_a((len & 0xFFFFFFFFUL) == len);
#endif /* WIN_ASYNC_IO */

  /* No need of a mutex. Only reading constant fields */
  ut_ad(type.validate());

  const auto slots_per_seg = slots_per_segment();

  /* We attempt to keep adjacent blocks in the same local
  segment. This can help in merging IO requests when we are
  doing simulated AIO */
  ulint local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;

  /* Wait until at least one slot is free. The loop is left through
  the break, i.e. with acquire() still in effect. */
  for (;;) {
    acquire();

    if (m_n_reserved != m_slots.size()) {
      break;
    }

    release();

    if (!srv_use_native_aio) {
      /* If the handler threads are suspended,
      wake them so that we get more slots */

      os_aio_simulated_wake_handler_threads();
    }

    os_event_wait(m_not_full);
  }

  ulint counter = 0;
  Slot *slot = nullptr;

  /* We start our search for an available slot from our preferred
  local segment and do a full scan of the array. We are
  guaranteed to find a slot in full scan. */
  for (ulint i = local_seg * slots_per_seg; counter < m_slots.size();
       ++i, ++counter) {
    /* Wrap around at the end of the array */
    i %= m_slots.size();

    slot = at(i);

    if (slot->is_reserved == false) {
      break;
    }
  }

  /* We MUST always be able to get hold of a free slot. */
  ut_a(counter < m_slots.size());

  ut_a(slot->is_reserved == false);

  ++m_n_reserved;

  /* The array just became non-empty */
  if (m_n_reserved == 1) {
    os_event_reset(m_is_empty);
  }

  /* The array just became full: later callers have to wait */
  if (m_n_reserved == m_slots.size()) {
    os_event_reset(m_not_full);
  }

  slot->is_reserved = true;
  slot->reservation_time = ut_time_monotonic();
  slot->m1 = m1;
  slot->m2 = m2;
  slot->file = file;
  slot->name = name;
#ifdef _WIN32
  slot->len = static_cast<DWORD>(len);
#else
  slot->len = static_cast<ulint>(len);
#endif /* _WIN32 */
  slot->type = type;
  slot->buf = static_cast<byte *>(buf);
  slot->ptr = slot->buf;
  slot->offset = offset;
  slot->err = DB_SUCCESS;
  slot->original_len = static_cast<uint32>(len);
  slot->io_already_done = false;
  slot->buf_block = nullptr;
  slot->encrypt_log_buf = nullptr;
  /* NOTE(review): slot->skip_punch_hole is not reset here; it is only
  assigned in the compression branch below - confirm that a value left
  over from an earlier use of the slot is harmless. */

  /* Compression is only done here for native AIO writes at a non-zero
  offset. release()/acquire() bracket the compression call; the slot is
  already marked as reserved at this point. */
  if (srv_use_native_aio && offset > 0 && type.is_write() &&
      type.is_compressed()) {
    ulint compressed_len = len;

    ut_ad(!type.is_log());

    release();

    void *src_buf = slot->buf;
    slot->buf_block = os_file_compress_page(type, src_buf, &compressed_len);

    /* src_buf and compressed_len are passed to the helper as in/out
    arguments; the slot adopts whatever values they hold afterwards. */
    slot->buf = static_cast<byte *>(src_buf);
    slot->ptr = slot->buf;
#ifdef _WIN32
    slot->len = static_cast<DWORD>(compressed_len);
#else
    slot->len = static_cast<ulint>(compressed_len);
#endif /* _WIN32 */
    slot->skip_punch_hole = !type.punch_hole();

    acquire();
  }

  /* We do encryption after compression, since if we do encryption
  before compression, the encrypted data will cause compression fail
  or low compression rate. */
  if (srv_use_native_aio && offset > 0 && type.is_write() &&
      type.is_encrypted()) {
    ulint encrypted_len = slot->len;
    file::Block *encrypted_block;
    byte *encrypt_log_buf;

    release();

    void *src_buf = slot->buf;
    if (!type.is_log()) {
      encrypted_block = os_file_encrypt_page(type, src_buf, &encrypted_len);

      /* A block left by the compression step is no longer needed once
      the encrypted block replaces it. */
      if (slot->buf_block != nullptr) {
        os_free_block(slot->buf_block);
      }

      slot->buf_block = encrypted_block;
    } else {
      /* Skip encrypt log file header */
      /* For offsets inside the header, encrypted_block and
      encrypt_log_buf stay uninitialized and are not used. */
      if (offset >= LOG_FILE_HDR_SIZE) {
        encrypted_block =
            os_file_encrypt_log(type, src_buf, encrypt_log_buf, &encrypted_len);

        if (slot->buf_block != nullptr) {
          os_free_block(slot->buf_block);
        }

        slot->buf_block = encrypted_block;

        /* NOTE(review): slot->encrypt_log_buf was reset to nullptr
        above, so this check looks redundant unless something else
        can modify the reserved slot in between - confirm. */
        if (slot->encrypt_log_buf != nullptr) {
          ut_free(slot->encrypt_log_buf);
        }

        slot->encrypt_log_buf = encrypt_log_buf;
      }
    }

    slot->buf = static_cast<byte *>(src_buf);

    slot->ptr = slot->buf;

#ifdef _WIN32
    slot->len = static_cast<DWORD>(encrypted_len);
#else
    slot->len = static_cast<ulint>(encrypted_len);
#endif /* _WIN32 */

    acquire();
  }

#ifdef WIN_ASYNC_IO
  {
    OVERLAPPED *control;

    /* Split the 64-bit file offset into the low and high 32-bit
    halves of the OVERLAPPED structure. */
    control = &slot->control;
    control->Offset = (DWORD)offset & 0xFFFFFFFF;
    control->OffsetHigh = (DWORD)(offset >> 32);

    ResetEvent(slot->handle);
  }
#elif defined(LINUX_NATIVE_AIO)

  /* If we are not using native AIO skip this part. */
  if (srv_use_native_aio) {
    off_t aio_offset;

    /* Check if we are dealing with 64 bit arch.
    If not then make sure that offset fits in 32 bits. */
    aio_offset = (off_t)offset;

    ut_a(sizeof(aio_offset) >= sizeof(offset) ||
         ((os_offset_t)aio_offset) == offset);

    auto iocb = &slot->control;

    /* Prepare the control block for a read or a write of slot->len
    bytes at aio_offset, using the (possibly replaced) slot->ptr. */
    if (type.is_read()) {
      io_prep_pread(iocb, file.m_file, slot->ptr, slot->len, aio_offset);
    } else {
      ut_ad(type.is_write());
      io_prep_pwrite(iocb, file.m_file, slot->ptr, slot->len, aio_offset);
    }

    /* Store the slot in the control block's data field */
    iocb->data = slot;

    slot->n_bytes = 0;
    slot->ret = 0;
  }
#endif /* LINUX_NATIVE_AIO */

  release();

  return (slot);
}
6860 
6861 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
6862 @param[in]	global_segment	The number of the segment in the AIO arrays */
wake_simulated_handler_thread(ulint global_segment)6863 void AIO::wake_simulated_handler_thread(ulint global_segment) {
6864   ut_ad(!srv_use_native_aio);
6865 
6866   AIO *array{};
6867 
6868   auto segment = get_array_and_local_segment(array, global_segment);
6869 
6870   array->wake_simulated_handler_thread(global_segment, segment);
6871 }
6872 
6873 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
6874 for a local segment in the AIO array.
6875 @param[in]	global_segment	The number of the segment in the AIO arrays
6876 @param[in]	segment		The local segment in the AIO array */
wake_simulated_handler_thread(ulint global_segment,ulint segment)6877 void AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment) {
6878   ut_ad(!srv_use_native_aio);
6879 
6880   ulint n = slots_per_segment();
6881   ulint offset = segment * n;
6882 
6883   /* Look through n slots after the segment * n'th slot */
6884 
6885   acquire();
6886 
6887   const Slot *slot = at(offset);
6888 
6889   for (ulint i = 0; i < n; ++i, ++slot) {
6890     if (slot->is_reserved) {
6891       /* Found an i/o request */
6892 
6893       release();
6894 
6895       os_event_set(os_aio_segment_wait_events[global_segment]);
6896 
6897       return;
6898     }
6899   }
6900 
6901   release();
6902 }
6903 
6904 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
os_aio_simulated_wake_handler_threads()6905 void os_aio_simulated_wake_handler_threads() {
6906   if (srv_use_native_aio) {
6907     /* We do not use simulated aio: do nothing */
6908 
6909     return;
6910   }
6911 
6912   os_aio_recommend_sleep_for_read_threads = false;
6913 
6914   for (ulint i = 0; i < os_aio_n_segments; ++i) {
6915     AIO::wake_simulated_handler_thread(i);
6916   }
6917 }
6918 
6919 /** Select the IO slot array
6920 @param[in,out]	type		Type of IO, READ or WRITE
6921 @param[in]	read_only	true if running in read-only mode
6922 @param[in]	aio_mode	IO mode
6923 @return slot array or NULL if invalid mode specified */
select_slot_array(IORequest & type,bool read_only,AIO_mode aio_mode)6924 AIO *AIO::select_slot_array(IORequest &type, bool read_only,
6925                             AIO_mode aio_mode) {
6926   AIO *array;
6927 
6928   ut_ad(type.validate());
6929 
6930   switch (aio_mode) {
6931     case AIO_mode::NORMAL:
6932       array = type.is_read() ? AIO::s_reads : AIO::s_writes;
6933       break;
6934 
6935     case AIO_mode::IBUF:
6936       ut_ad(type.is_read());
6937 
6938       /* Reduce probability of deadlock bugs in connection with ibuf:
6939       do not let the ibuf i/o handler sleep */
6940 
6941       type.clear_do_not_wake();
6942 
6943       array = read_only ? AIO::s_reads : AIO::s_ibuf;
6944       break;
6945 
6946     case AIO_mode::LOG:
6947       array = read_only ? AIO::s_reads : AIO::s_log;
6948       break;
6949 
6950     case AIO_mode::SYNC:
6951 
6952       array = AIO::s_sync;
6953 #if defined(LINUX_NATIVE_AIO)
6954       /* In Linux native AIO we don't use sync IO array. */
6955       ut_a(!srv_use_native_aio);
6956 #endif /* LINUX_NATIVE_AIO */
6957       break;
6958 
6959     default:
6960       ut_error;
6961   }
6962 
6963   return (array);
6964 }
6965 
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. The aio array of pending requests
is divided into segments. The thread specifies which segment or slot it
wants to wait for. If the request did not complete successfully and
os_file_handle_error() asks for a retry, the operation is retried
synchronously. NOTE: this function will also take care of freeing the aio
slot, therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
                                wait for; segment 0 is the ibuf I/O thread,
                                segment 1 the log I/O thread, then follow the
                                non-ibuf read threads, and as the last are the
                                non-ibuf write threads; if this is
                                ULINT_UNDEFINED, then it means that sync AIO
                                is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
                                wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
                                that also in the case where the AIO operation
                                failed, these output parameters are valid and
                                can be used to restart the operation,
                                for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static dberr_t os_aio_windows_handler(ulint segment, ulint pos, fil_node_t **m1,
                                      void **m2, IORequest *type) {
  Slot *slot;
  dberr_t err;
  AIO *array{};
  ulint orig_seg = segment;

  if (segment == ULINT_UNDEFINED) {
    segment = 0;
    array = AIO::sync_array();
  } else {
    segment = AIO::get_array_and_local_segment(array, segment);
  }

  /* NOTE! We only access constant fields in os_aio_array. Therefore
  we do not have to acquire the protecting mutex yet */

#ifndef UNIV_HOTBACKUP
  ut_ad(os_aio_validate_skip());
#endif /* !UNIV_HOTBACKUP */

  if (array == AIO::sync_array()) {
    /* Sync AIO: wait for exactly the slot the caller reserved. */
    WaitForSingleObject(array->at(pos)->handle, INFINITE);

  } else {
    if (orig_seg != ULINT_UNDEFINED) {
      srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
    }

    /* Wait for any slot of the local segment; the return value
    identifies the signaled handle within the segment. */
    pos = WaitForMultipleObjects((DWORD)array->slots_per_segment(),
                                 array->handles(segment), FALSE, INFINITE);
  }

  array->acquire();

  if (
#ifndef UNIV_HOTBACKUP
      srv_shutdown_state.load() == SRV_SHUTDOWN_EXIT_THREADS
#else  /* !UNIV_HOTBACKUP */
      true
#endif /* !UNIV_HOTBACKUP */
      && array->is_empty() && !buf_flush_page_cleaner_is_active()) {

    /* Nothing is pending and the threads are being told to exit:
    report "no request" to the caller. */
    *m1 = nullptr;
    *m2 = nullptr;

    array->release();

    return (DB_SUCCESS);
  }

  ulint n = array->slots_per_segment();

  /* WaitForMultipleObjects() returns WAIT_OBJECT_0 + index with
  index < nCount, and in the sync case pos is a slot position of a
  single-segment array. Either way pos must be strictly below n,
  otherwise the lookup below would leave the segment. */
  ut_a(pos >= WAIT_OBJECT_0 && pos < WAIT_OBJECT_0 + n);

  slot = array->at(pos + segment * n);

  ut_a(slot->is_reserved);

  if (orig_seg != ULINT_UNDEFINED) {
    srv_set_io_thread_op_info(orig_seg, "get windows aio return value");
  }

  BOOL ret;
  ret = GetOverlappedResult(slot->file.m_file, &slot->control, &slot->n_bytes,
                            TRUE);

  *m1 = slot->m1;
  *m2 = slot->m2;

  *type = slot->type;

  bool retry = false;

  if (ret && slot->n_bytes == slot->len) {
    err = DB_SUCCESS;

  } else if (os_file_handle_error(slot->name, "Windows aio")) {
    /* err is assigned by the synchronous retry below */
    retry = true;

  } else {
    err = DB_IO_ERROR;
  }

  array->release();

  if (retry) {
    /* Retry failed read/write operation synchronously.
    No need to hold array->m_mutex. */

#ifdef UNIV_PFS_IO
    /* This read/write does not go through os_file_read
    and os_file_write APIs, need to register with
    performance schema explicitly here. */
    struct PSI_file_locker *locker = nullptr;
    PSI_file_locker_state state;
    register_pfs_file_io_begin(
        &state, locker, slot->file, slot->len,
        slot->type.is_write() ? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__,
        __LINE__);
#endif /* UNIV_PFS_IO */

    ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

    ssize_t n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
    register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

    if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
      /* AIO was queued successfully!
      We want a synchronous I/O operation on a
      file where we also use async I/O: in Windows
      we must use the same wait mechanism as for
      async I/O */

      ret = GetOverlappedResult(slot->file.m_file, &slot->control,
                                &slot->n_bytes, TRUE);

      n_bytes = ret ? slot->n_bytes : -1;
    }

    err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
  }

  if (err == DB_SUCCESS) {
    err = AIOHandler::post_io_processing(slot);
  }

  /* Hand the slot back to the array; see the NOTE in the header. */
  array->release_with_mutex(slot);

  return (err);
}
#endif /* WIN_ASYNC_IO */
7126 
os_aio_func(IORequest & type,AIO_mode aio_mode,const char * name,pfs_os_file_t file,void * buf,os_offset_t offset,ulint n,bool read_only,fil_node_t * m1,void * m2)7127 dberr_t os_aio_func(IORequest &type, AIO_mode aio_mode, const char *name,
7128                     pfs_os_file_t file, void *buf, os_offset_t offset, ulint n,
7129                     bool read_only, fil_node_t *m1, void *m2) {
7130 #ifdef WIN_ASYNC_IO
7131   BOOL ret = TRUE;
7132 #endif /* WIN_ASYNC_IO */
7133 
7134   ut_ad(n > 0);
7135   ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
7136   ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
7137 #ifndef UNIV_HOTBACKUP
7138   ut_ad(os_aio_validate_skip());
7139 #endif /* !UNIV_HOTBACKUP */
7140 
7141 #ifdef WIN_ASYNC_IO
7142   ut_ad((n & 0xFFFFFFFFUL) == n);
7143 #endif /* WIN_ASYNC_IO */
7144 
7145   if (aio_mode == AIO_mode::SYNC
7146 #ifdef WIN_ASYNC_IO
7147       && !srv_use_native_aio
7148 #endif /* WIN_ASYNC_IO */
7149   ) {
7150     /* This is actually an ordinary synchronous read or write:
7151     no need to use an i/o-handler thread. NOTE that if we use
7152     Windows async i/o, Windows does not allow us to use
7153     ordinary synchronous os_file_read etc. on the same file,
7154     therefore we have built a special mechanism for synchronous
7155     wait in the Windows case.
7156     Also note that the Performance Schema instrumentation has
7157     been performed by current os_aio_func()'s wrapper function
7158     pfs_os_aio_func(). So we would no longer need to call
7159     Performance Schema instrumented os_file_read() and
7160     os_file_write(). Instead, we should use os_file_read_func()
7161     and os_file_write_func() */
7162 
7163     if (type.is_read()) {
7164       return (os_file_read_func(type, name, file.m_file, buf, offset, n));
7165     }
7166 
7167     ut_ad(type.is_write());
7168     return (os_file_write_func(type, name, file.m_file, buf, offset, n));
7169   }
7170 
7171 try_again:
7172 
7173   auto array = AIO::select_slot_array(type, read_only, aio_mode);
7174 
7175   auto slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);
7176 
7177   if (type.is_read()) {
7178     if (srv_use_native_aio) {
7179       ++os_n_file_reads;
7180 
7181       os_bytes_read_since_printout += n;
7182 #ifdef WIN_ASYNC_IO
7183       ret = ReadFile(file.m_file, slot->ptr, slot->len, &slot->n_bytes,
7184                      &slot->control);
7185 #elif defined(LINUX_NATIVE_AIO)
7186       if (!array->linux_dispatch(slot)) {
7187         goto err_exit;
7188       }
7189 #endif /* WIN_ASYNC_IO */
7190     } else if (type.is_wake()) {
7191       AIO::wake_simulated_handler_thread(
7192           AIO::get_segment_no_from_slot(array, slot));
7193     }
7194   } else if (type.is_write()) {
7195     if (srv_use_native_aio) {
7196       ++os_n_file_writes;
7197 
7198 #ifdef WIN_ASYNC_IO
7199       ret = WriteFile(file.m_file, slot->ptr, slot->len, &slot->n_bytes,
7200                       &slot->control);
7201 #elif defined(LINUX_NATIVE_AIO)
7202       if (!array->linux_dispatch(slot)) {
7203         goto err_exit;
7204       }
7205 #endif /* WIN_ASYNC_IO */
7206 
7207     } else if (type.is_wake()) {
7208       AIO::wake_simulated_handler_thread(
7209           AIO::get_segment_no_from_slot(array, slot));
7210     }
7211   } else {
7212     ut_error;
7213   }
7214 
7215 #ifdef WIN_ASYNC_IO
7216   if (srv_use_native_aio) {
7217     if ((ret && slot->len == slot->n_bytes) ||
7218         (!ret && GetLastError() == ERROR_IO_PENDING)) {
7219       /* AIO was queued successfully! */
7220 
7221       if (aio_mode == AIO_mode::SYNC) {
7222         void *dummy_mess2;
7223         IORequest dummy_type;
7224         fil_node_t *dummy_mess1;
7225 
7226         /* We want a synchronous i/o operation on a
7227         file where we also use async i/o: in Windows
7228         we must use the same wait mechanism as for
7229         async i/o */
7230 
7231         return (os_aio_windows_handler(ULINT_UNDEFINED, slot->pos, &dummy_mess1,
7232                                        &dummy_mess2, &dummy_type));
7233       }
7234 
7235       return (DB_SUCCESS);
7236     }
7237 
7238     goto err_exit;
7239   }
7240 #endif /* WIN_ASYNC_IO */
7241 
7242   /* AIO request was queued successfully! */
7243   return (DB_SUCCESS);
7244 
7245 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
7246 err_exit:
7247 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
7248 
7249   array->release_with_mutex(slot);
7250 
7251   if (os_file_handle_error(name, type.is_read() ? "aio read" : "aio write")) {
7252     goto try_again;
7253   }
7254 
7255   return (DB_IO_ERROR);
7256 }
7257 
7258 /** Simulated AIO handler for reaping IO requests */
7259 class SimulatedAIOHandler {
7260  public:
7261   /** Constructor
7262   @param[in,out]	array	The AIO array
7263   @param[in]	segment	Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)7264   SimulatedAIOHandler(AIO *array, ulint segment)
7265       : m_oldest(),
7266         m_n_elems(),
7267         m_lowest_offset(IB_UINT64_MAX),
7268         m_array(array),
7269         m_n_slots(),
7270         m_segment(segment),
7271         m_ptr(),
7272         m_buf() {
7273     ut_ad(m_segment < 100);
7274 
7275     m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
7276   }
7277 
7278   /** Destructor */
~SimulatedAIOHandler()7279   ~SimulatedAIOHandler() {
7280     if (m_ptr != nullptr) {
7281       ut_free(m_ptr);
7282     }
7283   }
7284 
7285   /** Reset the state of the handler
7286   @param[in]	n_slots	Number of pending AIO operations supported */
init(ulint n_slots)7287   void init(ulint n_slots) {
7288     m_oldest = 0;
7289     m_n_elems = 0;
7290     m_n_slots = n_slots;
7291     m_lowest_offset = IB_UINT64_MAX;
7292 
7293     if (m_ptr != nullptr) {
7294       ut_free(m_ptr);
7295       m_ptr = m_buf = nullptr;
7296     }
7297 
7298     m_slots[0] = nullptr;
7299   }
7300 
7301   /** Check if there is a slot for which the i/o has already been done
7302   @param[out]	n_reserved	Number of reserved slots
7303   @return the first completed slot that is found. */
check_completed(ulint * n_reserved)7304   Slot *check_completed(ulint *n_reserved) {
7305     ulint offset = m_segment * m_n_slots;
7306 
7307     *n_reserved = 0;
7308 
7309     Slot *slot;
7310 
7311     slot = m_array->at(offset);
7312 
7313     for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7314       if (slot->is_reserved) {
7315         if (slot->io_already_done) {
7316           ut_a(slot->is_reserved);
7317 
7318           return (slot);
7319         }
7320 
7321         ++*n_reserved;
7322       }
7323     }
7324 
7325     return (nullptr);
7326   }
7327 
7328   /** If there are at least 2 seconds old requests, then pick the
7329   oldest one to prevent starvation.  If several requests have the
7330   same age, then pick the one at the lowest offset.
7331   @return true if request was selected */
select()7332   bool select() {
7333     if (!select_oldest()) {
7334       return (select_lowest_offset());
7335     }
7336 
7337     return (true);
7338   }
7339 
7340   /** Check if there are several consecutive blocks
7341   to read or write. Merge them if found. */
merge()7342   void merge() {
7343     /* if m_n_elems != 0, then we have assigned
7344     something valid to consecutive_ios[0] */
7345     ut_ad(m_n_elems != 0);
7346     ut_ad(first_slot() != nullptr);
7347 
7348     Slot *slot = first_slot();
7349 
7350     while (!merge_adjacent(slot)) {
7351       /* No op */
7352     }
7353   }
7354 
7355   /** We have now collected n_consecutive I/O requests
7356   in the array; allocate a single buffer which can hold
7357   all data, and perform the I/O
7358   @return the length of the buffer */
allocate_buffer()7359   ulint allocate_buffer() MY_ATTRIBUTE((warn_unused_result)) {
7360     ulint len;
7361     Slot *slot = first_slot();
7362 
7363     ut_ad(m_ptr == nullptr);
7364 
7365     if (slot->type.is_read() && m_n_elems > 1) {
7366       len = 0;
7367 
7368       for (ulint i = 0; i < m_n_elems; ++i) {
7369         len += m_slots[i]->len;
7370       }
7371 
7372       m_ptr = static_cast<byte *>(ut_malloc_nokey(len + UNIV_PAGE_SIZE));
7373 
7374       m_buf = static_cast<byte *>(ut_align(m_ptr, UNIV_PAGE_SIZE));
7375 
7376     } else {
7377       len = first_slot()->len;
7378       m_buf = first_slot()->buf;
7379     }
7380 
7381     return (len);
7382   }
7383 
7384   /** We have to compress the individual pages and punch
7385   holes in them on a page by page basis when writing to
7386   tables that can be compresed at the IO level.
7387   @param[in]	len		Value returned by allocate_buffer */
copy_to_buffer(ulint len)7388   void copy_to_buffer(ulint len) {
7389     Slot *slot = first_slot();
7390 
7391     if (len > slot->len && slot->type.is_write()) {
7392       byte *ptr = m_buf;
7393 
7394       ut_ad(ptr != slot->buf);
7395 
7396       /* Copy the buffers to the combined buffer */
7397       for (ulint i = 0; i < m_n_elems; ++i) {
7398         slot = m_slots[i];
7399 
7400         memmove(ptr, slot->buf, slot->len);
7401 
7402         ptr += slot->len;
7403       }
7404     }
7405   }
7406 
7407   /** Do the I/O with ordinary, synchronous i/o functions: */
io()7408   void io() {
7409     if (first_slot()->type.is_write()) {
7410       for (ulint i = 0; i < m_n_elems; ++i) {
7411         write(m_slots[i]);
7412       }
7413 
7414     } else {
7415       for (ulint i = 0; i < m_n_elems; ++i) {
7416         read(m_slots[i]);
7417       }
7418     }
7419   }
7420 
7421   /** Do the decompression of the pages read in */
io_complete()7422   void io_complete() {
7423     // Note: For non-compressed tables. Not required
7424     // for correctness.
7425   }
7426 
7427   /** Mark the i/os done in slots */
done()7428   void done() {
7429     for (ulint i = 0; i < m_n_elems; ++i) {
7430       m_slots[i]->io_already_done = true;
7431     }
7432   }
7433 
7434   /** @return the first slot in the consecutive array */
first_slot()7435   Slot *first_slot() MY_ATTRIBUTE((warn_unused_result)) {
7436     ut_a(m_n_elems > 0);
7437 
7438     return (m_slots[0]);
7439   }
7440 
7441   /** Wait for I/O requests
7442   @param[in]	global_segment	The global segment
7443   @param[in,out]	event		Wait on event if no active requests
7444   @return the number of slots */
7445   ulint check_pending(ulint global_segment, os_event_t event)
7446       MY_ATTRIBUTE((warn_unused_result));
7447 
7448  private:
7449   /** Do the file read
7450   @param[in,out]	slot		Slot that has the IO context */
read(Slot * slot)7451   void read(Slot *slot) {
7452     dberr_t err = os_file_read_func(slot->type, slot->name, slot->file.m_file,
7453                                     slot->ptr, slot->offset, slot->len);
7454     ut_a(err == DB_SUCCESS);
7455   }
7456 
7457   /** Do the file write
7458   @param[in,out]	slot		Slot that has the IO context */
write(Slot * slot)7459   void write(Slot *slot) {
7460     dberr_t err = os_file_write_func(slot->type, slot->name, slot->file.m_file,
7461                                      slot->ptr, slot->offset, slot->len);
7462     ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
7463   }
7464 
7465   /** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7466   bool adjacent(const Slot *s1, const Slot *s2) const {
7467     return (s1 != s2 && s1->file.m_file == s2->file.m_file &&
7468             s2->offset == s1->offset + s1->len && s1->type == s2->type);
7469   }
7470 
7471   /** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7472   bool merge_adjacent(Slot *&current) {
7473     Slot *slot;
7474     ulint offset = m_segment * m_n_slots;
7475 
7476     slot = m_array->at(offset);
7477 
7478     for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7479       if (slot->is_reserved && adjacent(current, slot)) {
7480         current = slot;
7481 
7482         /* Found a consecutive i/o request */
7483 
7484         m_slots[m_n_elems] = slot;
7485 
7486         ++m_n_elems;
7487 
7488         return (m_n_elems >= m_slots.capacity());
7489       }
7490     }
7491 
7492     return (true);
7493   }
7494 
7495   /** There were no old requests. Look for an I/O request at the lowest
7496   offset in the array (we ignore the high 32 bits of the offset in these
7497   heuristics) */
select_lowest_offset()7498   bool select_lowest_offset() {
7499     ut_ad(m_n_elems == 0);
7500 
7501     ulint offset = m_segment * m_n_slots;
7502 
7503     m_lowest_offset = IB_UINT64_MAX;
7504 
7505     for (ulint i = 0; i < m_n_slots; ++i) {
7506       Slot *slot;
7507 
7508       slot = m_array->at(i + offset);
7509 
7510       if (slot->is_reserved && slot->offset < m_lowest_offset) {
7511         /* Found an i/o request */
7512         m_slots[0] = slot;
7513 
7514         m_n_elems = 1;
7515 
7516         m_lowest_offset = slot->offset;
7517       }
7518     }
7519 
7520     return (m_n_elems > 0);
7521   }
7522 
7523   /** Select the slot if it is older than the current oldest slot.
7524   @param[in]	slot		The slot to check */
select_if_older(Slot * slot)7525   void select_if_older(Slot *slot) {
7526     const auto time_diff = ut_time_monotonic() - slot->reservation_time;
7527 
7528     const uint64_t age = time_diff > 0 ? (uint64_t)time_diff : 0;
7529 
7530     if ((age >= 2 && age > m_oldest) ||
7531         (age >= 2 && age == m_oldest && slot->offset < m_lowest_offset)) {
7532       /* Found an i/o request */
7533       m_slots[0] = slot;
7534 
7535       m_n_elems = 1;
7536 
7537       m_oldest = age;
7538 
7539       m_lowest_offset = slot->offset;
7540     }
7541   }
7542 
7543   /** Select th oldest slot in the array
7544   @return true if oldest slot found */
select_oldest()7545   bool select_oldest() {
7546     ut_ad(m_n_elems == 0);
7547 
7548     Slot *slot;
7549     ulint offset = m_n_slots * m_segment;
7550 
7551     slot = m_array->at(offset);
7552 
7553     for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7554       if (slot->is_reserved) {
7555         select_if_older(slot);
7556       }
7557     }
7558 
7559     return (m_n_elems > 0);
7560   }
7561 
7562   typedef std::vector<Slot *> slots_t;
7563 
7564  private:
7565   ulint m_oldest;
7566   ulint m_n_elems;
7567   os_offset_t m_lowest_offset;
7568 
7569   AIO *m_array;
7570   ulint m_n_slots;
7571   ulint m_segment;
7572 
7573   slots_t m_slots;
7574 
7575   byte *m_ptr;
7576   byte *m_buf;
7577 };
7578 
7579 /** Wait for I/O requests
7580 @return the number of slots */
check_pending(ulint global_segment,os_event_t event)7581 ulint SimulatedAIOHandler::check_pending(ulint global_segment,
7582                                          os_event_t event) {
7583   /* NOTE! We only access constant fields in os_aio_array.
7584   Therefore we do not have to acquire the protecting mutex yet */
7585 
7586 #ifndef UNIV_HOTBACKUP
7587   ut_ad(os_aio_validate_skip());
7588 #endif /* !UNIV_HOTBACKUP */
7589 
7590   ut_ad(m_segment < m_array->get_n_segments());
7591 
7592   /* Look through n slots after the segment * n'th slot */
7593 
7594   if (AIO::is_read(m_array) && os_aio_recommend_sleep_for_read_threads) {
7595     /* Give other threads chance to add several
7596     I/Os to the array at once. */
7597 
7598     srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
7599 
7600     os_event_wait(event);
7601 
7602     return (0);
7603   }
7604 
7605   return (m_array->slots_per_segment());
7606 }
7607 
7608 /** Does simulated AIO. This function should be called by an i/o-handler
7609 thread.
7610 
7611 @param[in]	global_segment	The number of the segment in the aio arrays to
7612                                 wait for; segment 0 is the ibuf i/o thread,
7613                                 segment 1 the log i/o thread, then follow the
7614                                 non-ibuf read threads, and as the last are the
7615                                 non-ibuf write threads
7616 @param[out]	m1		the messages passed with the AIO request; note
7617                                 that also in the case where the AIO operation
7618                                 failed, these output parameters are valid and
7619                                 can be used to restart
7620                                 the operation, for example
7621 @param[out]	m2		Callback argument
7622 @param[in]	type		IO context
7623 @return DB_SUCCESS or error code */
os_aio_simulated_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * type)7624 static dberr_t os_aio_simulated_handler(ulint global_segment, fil_node_t **m1,
7625                                         void **m2, IORequest *type) {
7626   Slot *slot;
7627   AIO *array{};
7628   os_event_t event = os_aio_segment_wait_events[global_segment];
7629 
7630   auto segment = AIO::get_array_and_local_segment(array, global_segment);
7631 
7632   SimulatedAIOHandler handler(array, segment);
7633 
7634   for (;;) {
7635     srv_set_io_thread_op_info(global_segment, "looking for i/o requests (a)");
7636 
7637     ulint n_slots = handler.check_pending(global_segment, event);
7638 
7639     if (n_slots == 0) {
7640       continue;
7641     }
7642 
7643     handler.init(n_slots);
7644 
7645     srv_set_io_thread_op_info(global_segment, "looking for i/o requests (b)");
7646 
7647     array->acquire();
7648 
7649     ulint n_reserved;
7650 
7651     slot = handler.check_completed(&n_reserved);
7652 
7653     if (slot != nullptr) {
7654       break;
7655 
7656     } else if (n_reserved == 0
7657 #ifndef UNIV_HOTBACKUP
7658                && !buf_flush_page_cleaner_is_active() &&
7659                srv_shutdown_state.load() == SRV_SHUTDOWN_EXIT_THREADS
7660 #endif /* !UNIV_HOTBACKUP */
7661     ) {
7662 
7663       /* There is no completed request. If there
7664       are no pending request at all, and the system
7665       is being shut down, exit. */
7666 
7667       array->release();
7668 
7669       *m1 = nullptr;
7670 
7671       *m2 = nullptr;
7672 
7673       return (DB_SUCCESS);
7674 
7675     } else if (handler.select()) {
7676       break;
7677     }
7678 
7679     /* No I/O requested at the moment */
7680 
7681     srv_set_io_thread_op_info(global_segment, "resetting wait event");
7682 
7683     /* We wait here until tbere are more IO requests
7684     for this segment. */
7685 
7686     os_event_reset(event);
7687 
7688     array->release();
7689 
7690     srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
7691 
7692     os_event_wait(event);
7693   }
7694 
7695   /** Found a slot that has already completed its IO */
7696 
7697   if (slot == nullptr) {
7698     /* Merge adjacent requests */
7699     handler.merge();
7700 
7701     /* Check if there are several consecutive blocks
7702     to read or write */
7703 
7704     srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
7705 
7706     // Note: We don't support write combining for simulated AIO.
7707     // ulint	total_len = handler.allocate_buffer();
7708 
7709     /* We release the array mutex for the time of the I/O: NOTE that
7710     this assumes that there is just one i/o-handler thread serving
7711     a single segment of slots! */
7712 
7713     array->release();
7714 
7715     // Note: We don't support write combining for simulated AIO.
7716     // handler.copy_to_buffer(total_len);
7717 
7718     srv_set_io_thread_op_info(global_segment, "doing file i/o");
7719 
7720     handler.io();
7721 
7722     srv_set_io_thread_op_info(global_segment, "file i/o done");
7723 
7724     handler.io_complete();
7725 
7726     array->acquire();
7727 
7728     handler.done();
7729 
7730     /* We return the messages for the first slot now, and if there
7731     were several slots, the messages will be returned with
7732     subsequent calls of this function */
7733 
7734     slot = handler.first_slot();
7735   }
7736 
7737   ut_ad(slot->is_reserved);
7738 
7739   *m1 = slot->m1;
7740   *m2 = slot->m2;
7741 
7742   *type = slot->type;
7743 
7744   array->release(slot);
7745 
7746   array->release();
7747 
7748   return (DB_SUCCESS);
7749 }
7750 
7751 /** Get the total number of pending IOs
7752 @return the total number of pending IOs */
total_pending_io_count()7753 ulint AIO::total_pending_io_count() {
7754   ulint count = s_reads->pending_io_count();
7755 
7756   if (s_writes != nullptr) {
7757     count += s_writes->pending_io_count();
7758   }
7759 
7760   if (s_ibuf != nullptr) {
7761     count += s_ibuf->pending_io_count();
7762   }
7763 
7764   if (s_log != nullptr) {
7765     count += s_log->pending_io_count();
7766   }
7767 
7768   if (s_sync != nullptr) {
7769     count += s_sync->pending_io_count();
7770   }
7771 
7772   return (count);
7773 }
7774 
7775 /** Validates the consistency the aio system.
7776 @return true if ok */
os_aio_validate()7777 static bool os_aio_validate() {
7778   /* The methods countds and validates, we ignore the count. */
7779   AIO::total_pending_io_count();
7780 
7781   return (true);
7782 }
7783 
7784 /** Prints pending IO requests per segment of an aio array.
7785 We probably don't need per segment statistics but they can help us
7786 during development phase to see if the IO requests are being
7787 distributed as expected.
7788 @param[in,out]	file		File where to print
7789 @param[in]	segments	Pending IO array */
print_segment_info(FILE * file,const ulint * segments)7790 void AIO::print_segment_info(FILE *file, const ulint *segments) {
7791   ut_ad(m_n_segments > 0);
7792 
7793   if (m_n_segments > 1) {
7794     fprintf(file, " [");
7795 
7796     for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
7797       if (i != 0) {
7798         fprintf(file, ", ");
7799       }
7800 
7801       fprintf(file, ULINTPF, *segments);
7802     }
7803 
7804     fprintf(file, "] ");
7805   }
7806 }
7807 
7808 /** Prints info about the aio array.
7809 @param[in,out]	file		Where to print */
print(FILE * file)7810 void AIO::print(FILE *file) {
7811   ulint count = 0;
7812   ulint n_res_seg[SRV_MAX_N_IO_THREADS];
7813 
7814   mutex_enter(&m_mutex);
7815 
7816   ut_a(!m_slots.empty());
7817   ut_a(m_n_segments > 0);
7818 
7819   memset(n_res_seg, 0x0, sizeof(n_res_seg));
7820 
7821   for (ulint i = 0; i < m_slots.size(); ++i) {
7822     Slot &slot = m_slots[i];
7823     ulint segment = (i * m_n_segments) / m_slots.size();
7824 
7825     if (slot.is_reserved) {
7826       ++count;
7827 
7828       ++n_res_seg[segment];
7829 
7830       ut_a(slot.len > 0);
7831     }
7832   }
7833 
7834   ut_a(m_n_reserved == count);
7835 
7836   print_segment_info(file, n_res_seg);
7837 
7838   mutex_exit(&m_mutex);
7839 }
7840 
7841 /** Print all the AIO segments
7842 @param[in,out]	file		Where to print */
print_all(FILE * file)7843 void AIO::print_all(FILE *file) {
7844   s_reads->print(file);
7845 
7846   if (s_writes != nullptr) {
7847     fputs(", aio writes:", file);
7848     s_writes->print(file);
7849   }
7850 
7851   if (s_ibuf != nullptr) {
7852     fputs(",\n ibuf aio reads:", file);
7853     s_ibuf->print(file);
7854   }
7855 
7856   if (s_log != nullptr) {
7857     fputs(", log i/o's:", file);
7858     s_log->print(file);
7859   }
7860 
7861   if (s_sync != nullptr) {
7862     fputs(", sync i/o's:", file);
7863     s_sync->print(file);
7864   }
7865 }
7866 
7867 /** Prints info of the aio arrays.
7868 @param[in,out]	file		file where to print */
os_aio_print(FILE * file)7869 void os_aio_print(FILE *file) {
7870   double avg_bytes_read;
7871 
7872 #ifndef UNIV_HOTBACKUP
7873   for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
7874     fprintf(file, "I/O thread %lu state: %s (%s)", (ulong)i,
7875             srv_io_thread_op_info[i], srv_io_thread_function[i]);
7876 
7877 #ifndef _WIN32
7878     if (os_event_is_set(os_aio_segment_wait_events[i])) {
7879       fprintf(file, " ev set");
7880     }
7881 #endif /* _WIN32 */
7882 
7883     fprintf(file, "\n");
7884   }
7885 #endif /* !UNIV_HOTBACKUP */
7886 
7887   fputs("Pending normal aio reads:", file);
7888 
7889   AIO::print_all(file);
7890 
7891   putc('\n', file);
7892   const auto current_time = ut_time_monotonic();
7893   const auto time_elapsed = 0.001 + (current_time - os_last_printout);
7894 
7895   fprintf(file,
7896           "Pending flushes (fsync) log: " ULINTPF
7897           "; "
7898           "buffer pool: " ULINTPF "\n" ULINTPF " OS file reads, " ULINTPF
7899           " OS file writes, " ULINTPF " OS fsyncs\n",
7900           fil_n_pending_log_flushes, fil_n_pending_tablespace_flushes,
7901           os_n_file_reads, os_n_file_writes, os_n_fsyncs);
7902 
7903   if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
7904     fprintf(file, ULINTPF " pending preads, " ULINTPF " pending pwrites\n",
7905             os_n_pending_reads, os_n_pending_writes);
7906   }
7907 
7908   if (os_n_file_reads == os_n_file_reads_old) {
7909     avg_bytes_read = 0.0;
7910   } else {
7911     avg_bytes_read = (double)os_bytes_read_since_printout /
7912                      (os_n_file_reads - os_n_file_reads_old);
7913   }
7914 
7915   fprintf(file,
7916           "%.2f reads/s, %lu avg bytes/read,"
7917           " %.2f writes/s, %.2f fsyncs/s\n",
7918           (os_n_file_reads - os_n_file_reads_old) / time_elapsed,
7919           (ulong)avg_bytes_read,
7920           (os_n_file_writes - os_n_file_writes_old) / time_elapsed,
7921           (os_n_fsyncs - os_n_fsyncs_old) / time_elapsed);
7922 
7923   os_n_file_reads_old = os_n_file_reads;
7924   os_n_file_writes_old = os_n_file_writes;
7925   os_n_fsyncs_old = os_n_fsyncs;
7926   os_bytes_read_since_printout = 0;
7927 
7928   os_last_printout = current_time;
7929 }
7930 
7931 /** Refreshes the statistics used to print per-second averages. */
os_aio_refresh_stats()7932 void os_aio_refresh_stats() {
7933   os_n_fsyncs_old = os_n_fsyncs;
7934 
7935   os_bytes_read_since_printout = 0;
7936 
7937   os_n_file_reads_old = os_n_file_reads;
7938 
7939   os_n_file_writes_old = os_n_file_writes;
7940 
7941   os_n_fsyncs_old = os_n_fsyncs;
7942 
7943   os_bytes_read_since_printout = 0;
7944 
7945   os_last_printout = ut_time_monotonic();
7946 }
7947 
7948 /** Checks that all slots in the system have been freed, that is, there are
7949 no pending io operations.
7950 @return true if all free */
os_aio_all_slots_free()7951 bool os_aio_all_slots_free() { return (AIO::total_pending_io_count() == 0); }
7952 
7953 #ifdef UNIV_DEBUG
7954 /** Prints all pending IO for the array
7955 @param[in]	file	file where to print */
to_file(FILE * file) const7956 void AIO::to_file(FILE *file) const {
7957   acquire();
7958 
7959   fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
7960 
7961   for (ulint i = 0; i < m_slots.size(); ++i) {
7962     const Slot &slot = m_slots[i];
7963 
7964     if (slot.is_reserved) {
7965       fprintf(file, "%s IO for %s (offset=" UINT64PF ", size=%lu)\n",
7966               slot.type.is_read() ? "read" : "write", slot.name, slot.offset,
7967               slot.len);
7968     }
7969   }
7970 
7971   release();
7972 }
7973 
7974 /** Print pending IOs for all arrays */
print_to_file(FILE * file)7975 void AIO::print_to_file(FILE *file) {
7976   fprintf(file, "Pending normal aio reads:");
7977 
7978   s_reads->to_file(file);
7979 
7980   if (s_writes != nullptr) {
7981     fprintf(file, "Pending normal aio writes:");
7982     s_writes->to_file(file);
7983   }
7984 
7985   if (s_ibuf != nullptr) {
7986     fprintf(file, "Pending ibuf aio reads:");
7987     s_ibuf->to_file(file);
7988   }
7989 
7990   if (s_log != nullptr) {
7991     fprintf(file, "Pending log i/o's:");
7992     s_log->to_file(file);
7993   }
7994 
7995   if (s_sync != nullptr) {
7996     fprintf(file, "Pending sync i/o's:");
7997     s_sync->to_file(file);
7998   }
7999 }
8000 
8001 /** Prints all pending IO
8002 @param[in]	file		File where to print */
os_aio_print_pending_io(FILE * file)8003 void os_aio_print_pending_io(FILE *file) { AIO::print_to_file(file); }
8004 
8005 #endif /* UNIV_DEBUG */
8006 
8007 /**
8008 Set the file create umask
8009 @param[in]	umask		The umask to use for file creation. */
os_file_set_umask(ulint umask)8010 void os_file_set_umask(ulint umask) { os_innodb_umask = umask; }
8011 
8012 /** Get the file create umask
8013 @return the umask to use for file creation. */
os_file_get_umask()8014 ulint os_file_get_umask() { return (os_innodb_umask); }
8015 
8016 /** Check if the path is a directory. The file/directory must exist.
8017 @param[in]	path		The path to check
8018 @return true if it is a directory */
is_directory(const Path & path)8019 bool Dir_Walker::is_directory(const Path &path) {
8020   os_file_type_t type;
8021   bool exists;
8022 
8023   if (os_file_status(path.c_str(), &exists, &type)) {
8024     ut_ad(exists);
8025     ut_ad(type != OS_FILE_TYPE_MISSING);
8026 
8027     return (type == OS_FILE_TYPE_DIR);
8028   }
8029 
8030   ut_ad(exists || type == OS_FILE_TYPE_FAILED);
8031   ut_ad(type != OS_FILE_TYPE_MISSING);
8032 
8033   return (false);
8034 }
8035