1 /***********************************************************************
2
3 Copyright (c) 1995, 2020, Oracle and/or its affiliates.
4 Copyright (c) 2009, Percona Inc.
5
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation. The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License, version 2.0, for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program; if not, write to the Free Software
31 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32
33 ***********************************************************************/
34
35 /** @file os/os0file.cc
36 The interface to the operating system file i/o primitives
37
38 Created 10/21/1995 Heikki Tuuri
39 *******************************************************/
40
41 #include "os0file.h"
42 #include "fil0fil.h"
43 #include "ha_prototypes.h"
44 #include "log0log.h"
45 #include "my_dbug.h"
46 #include "my_io.h"
47
48 #include "fil0fil.h"
49 #include "ha_prototypes.h"
50 #include "os0file.h"
51 #include "sql_const.h"
52 #include "srv0srv.h"
53 #include "srv0start.h"
54 #ifndef UNIV_HOTBACKUP
55 #include "os0event.h"
56 #include "os0thread.h"
57 #endif /* !UNIV_HOTBACKUP */
58
59 #ifdef _WIN32
60 #include <errno.h>
61 #include <mbstring.h>
62 #include <sys/stat.h>
63 #include <tchar.h>
64 #include <codecvt>
65 #endif /* _WIN32 */
66
67 #ifdef __linux__
68 #include <sys/sendfile.h>
69 #endif /* __linux__ */
70
71 #ifdef LINUX_NATIVE_AIO
72 #ifndef UNIV_HOTBACKUP
73 #include <libaio.h>
74 #else /* !UNIV_HOTBACKUP */
75 #undef LINUX_NATIVE_AIO
76 #endif /* !UNIV_HOTBACKUP */
77 #endif /* LINUX_NATIVE_AIO */
78
79 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
80 #include <fcntl.h>
81 #include <linux/falloc.h>
82 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
83
84 #include <errno.h>
85 #include <lz4.h>
86 #include "my_aes.h"
87 #include "my_rnd.h"
88 #include "mysql/service_mysql_keyring.h"
89 #include "mysqld.h"
90
91 #include <sys/types.h>
92 #include <zlib.h>
93 #include <ctime>
94 #include <functional>
95 #include <new>
96 #include <vector>
97
98 #ifdef UNIV_HOTBACKUP
99 #include <data0type.h>
100 #endif /* UNIV_HOTBACKUP */
101
102 /* Flush after each os_fsync_threshold bytes */
103 unsigned long long os_fsync_threshold = 0;
104
105 /** Insert buffer segment id */
106 static const ulint IO_IBUF_SEGMENT = 0;
107
108 /** Log segment id */
109 static const ulint IO_LOG_SEGMENT = 1;
110
111 /** Number of retries for partial I/O's */
112 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
113
114 /** For storing the allocated blocks */
115 using Blocks = std::vector<file::Block>;
116
117 /** Block collection */
118 static Blocks *block_cache;
119
120 /** Number of blocks to allocate for sync read/writes */
121 static const size_t MAX_BLOCKS = 128;
122
123 /** Block buffer size */
124 #define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))
125
126 /** Disk sector size of aligning write buffer for DIRECT_IO */
127 static ulint os_io_ptr_align = UNIV_SECTOR_SIZE;
128
/** Determine if O_DIRECT is supported.
The check is done by creating a scratch file named "o_direct_test" with the
O_DIRECT flag in srv_data_home (or in the current directory when
srv_data_home is empty) and removing it again. On builds where the probe is
compiled out (NO_FALLOCATE defined or UNIV_LINUX not defined) the answer is
always false.
@retval true if O_DIRECT is supported.
@retval false if O_DIRECT is not supported, or if the probe could not be
carried out (including failure to allocate the scratch file name). */
bool os_is_o_direct_supported() {
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
  static const char probe_name[] = "o_direct_test";

  const char *path = srv_data_home;

  /* If the srv_data_home is empty, set the path to current dir. */
  const char current_dir[3] = {FN_CURLIB, FN_LIBCHAR, 0};
  if (*path == 0) {
    path = current_dir;
  }

  /* path is never empty at this point, so inspecting its last character
  is safe. */
  const size_t home_len = strlen(path);
  const bool add_os_path_separator = (path[home_len - 1] != OS_PATH_SEPARATOR);

  /* Length of the directory part including its trailing separator. */
  const size_t dir_len = home_len + (add_os_path_separator ? 1 : 0);

  /* sizeof(probe_name) accounts for the terminating NUL. */
  const size_t path_len = dir_len + sizeof(probe_name);

  char *file_name = static_cast<char *>(ut_zalloc_nokey(path_len));

  if (file_name == nullptr) {
    /* Without a file name the probe cannot run; report "unsupported"
    rather than dereferencing a null pointer below. */
    return (false);
  }

  /* Build "<dir><separator>o_direct_test". */
  memcpy(file_name, path, home_len);
  if (add_os_path_separator) {
    file_name[home_len] = OS_PATH_SEPARATOR;
  }
  memcpy(file_name + dir_len, probe_name, sizeof(probe_name));

  /* Try to create a temp file with O_DIRECT flag. */
  const os_file_t file_handle =
      ::open(file_name, O_CREAT | O_TRUNC | O_WRONLY | O_DIRECT, S_IRWXU);

  const bool supported = (file_handle != -1);

  if (supported) {
    /* The file was only needed for the probe. */
    ::close(file_handle);
    unlink(file_name);
  }

  ut_free(file_name);

  return (supported);
#else
  return (false);
#endif /* !NO_FALLOCATE && UNIV_LINUX */
}
192
193 /* This specifies the file permissions InnoDB uses when it creates files in
194 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
195 my_umask */
196
197 #ifndef _WIN32
198 /** Umask for creating files */
199 ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
200 #else
201 /** Umask for creating files */
202 ulint os_innodb_umask = 0;
203
204 /* On Windows when using native AIO the number of AIO requests
205 that a thread can handle at a given time is limited to 32
206 i.e.: SRV_N_PENDING_IOS_PER_THREAD */
207 #define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD
208
209 #endif /* _WIN32 */
210
211 /** In simulated aio, merge at most this many consecutive i/os */
212 static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;
213
214 /** Checks if the page_cleaner is in active state. */
215 bool buf_flush_page_cleaner_is_active();
216
217 #ifndef UNIV_HOTBACKUP
218 /**********************************************************************
219
220 InnoDB AIO Implementation:
221 =========================
222
223 We support native AIO for Windows and Linux. For rest of the platforms
224 we simulate AIO by special IO-threads servicing the IO-requests.
225
226 Simulated AIO:
227 ==============
228
229 On platforms where we 'simulate' AIO, the following is a rough explanation
230 of the high level design.
231 There are four io-threads (for ibuf, log, read, write).
232 All synchronous IO requests are serviced by the calling thread using
233 os_file_write/os_file_read. The Asynchronous requests are queued up
234 in an array (there are four such arrays) by the calling thread.
235 Later these requests are picked up by the IO-thread and are serviced
236 synchronously.
237
238 Windows native AIO:
239 ==================
240
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
246 There are innodb_file_io_threads helper threads. These threads work
247 on the four arrays mentioned above in Simulated AIO. No thread is
248 required for the sync array.
249 If a synchronous IO request is made, it is first queued in the sync
250 array. Then the calling thread itself waits on the request, thus
251 making the call synchronous.
252 If an AIO request is made the calling thread not only queues it in the
253 array but also submits the requests. The helper thread then collects
254 the completed IO request and calls completion routine on it.
255
256 Linux native AIO:
257 =================
258
259 If we have libaio installed on the system and innodb_use_native_aio
260 is set to true we follow the code path of native AIO, otherwise we
261 do simulated AIO.
262 There are innodb_file_io_threads helper threads. These threads work
263 on the four arrays mentioned above in Simulated AIO.
264 If a synchronous IO request is made, it is handled by calling
265 os_file_write/os_file_read.
266 If an AIO request is made the calling thread not only queues it in the
267 array but also submits the requests. The helper thread then collects
268 the completed IO request and calls completion routine on it.
269
270 **********************************************************************/
271
272 #ifdef UNIV_PFS_IO
273 /* Keys to register InnoDB I/O with performance schema */
274 mysql_pfs_key_t innodb_log_file_key;
275 mysql_pfs_key_t innodb_data_file_key;
276 mysql_pfs_key_t innodb_temp_file_key;
277 mysql_pfs_key_t innodb_dblwr_file_key;
278 mysql_pfs_key_t innodb_arch_file_key;
279 mysql_pfs_key_t innodb_clone_file_key;
280 #endif /* UNIV_PFS_IO */
281
282 #endif /* !UNIV_HOTBACKUP */
283 /** The asynchronous I/O context */
284 struct Slot {
285 /** Default constructor/assignment etc. are OK */
286
287 /** index of the slot in the aio array */
288 uint16_t pos{0};
289
290 /** true if this slot is reserved */
291 bool is_reserved{false};
292
293 /** time when reserved */
294 ib_time_monotonic_t reservation_time{0};
295
296 /** buffer used in i/o */
297 byte *buf{nullptr};
298
299 /** Buffer pointer used for actual IO. We advance this
300 when partial IO is required and not buf */
301 byte *ptr{nullptr};
302
303 /** OS_FILE_READ or OS_FILE_WRITE */
304 IORequest type{IORequest::UNSET};
305
306 /** file offset in bytes */
307 os_offset_t offset{0};
308
309 /** file where to read or write */
310 pfs_os_file_t file{
311 #ifdef UNIV_PFS_IO
312 nullptr, // m_psi
313 #endif
314 0 // m_file
315 };
316
317 /** file name or path */
318 const char *name{nullptr};
319
320 /** used only in simulated aio: true if the physical i/o
321 already made and only the slot message needs to be passed
322 to the caller of os_aio_simulated_handle */
323 bool io_already_done{false};
324
325 /** The file node for which the IO is requested. */
326 fil_node_t *m1{nullptr};
327
328 /** the requester of an aio operation and which can be used
329 to identify which pending aio operation was completed */
330 void *m2{nullptr};
331
332 /** AIO completion status */
333 dberr_t err{DB_ERROR_UNSET};
334
335 #ifdef WIN_ASYNC_IO
336 /** handle object we need in the OVERLAPPED struct */
337 HANDLE handle{INVALID_HANDLE_VALUE};
338
339 /** Windows control block for the aio request */
340 OVERLAPPED control{0, 0};
341
342 /** bytes written/read */
343 DWORD n_bytes{0};
344
345 /** length of the block to read or write */
346 DWORD len{0};
347
348 #elif defined(LINUX_NATIVE_AIO)
349 /** Linux control block for aio */
350 struct iocb control;
351
352 /** AIO return code */
353 int ret{0};
354
355 /** bytes written/read. */
356 ssize_t n_bytes{0};
357
358 /** length of the block to read or write */
359 ulint len{0};
360 #else
361 /** length of the block to read or write */
362 ulint len{0};
363
364 /** bytes written/read. */
365 ulint n_bytes{0};
366 #endif /* WIN_ASYNC_IO */
367
368 /** Length of the block before it was compressed */
369 uint32 original_len{0};
370
371 /** Buffer block for compressed pages or encrypted pages */
372 file::Block *buf_block{nullptr};
373
374 /** true, if we shouldn't punch a hole after writing the page */
375 bool skip_punch_hole{false};
376
377 /** Buffer for encrypt log */
378 void *encrypt_log_buf{nullptr};
379
SlotSlot380 Slot() {
381 #if defined(LINUX_NATIVE_AIO)
382 memset(&control, 0, sizeof(control));
383 #endif /* LINUX_NATIVE_AIO */
384 }
385 };
386
387 /** The asynchronous i/o array structure */
388 class AIO {
389 public:
390 /** Constructor
391 @param[in] id Latch ID
392 @param[in] n Number of slots to configure
393 @param[in] segments Number of segments to configure */
394 AIO(latch_id_t id, ulint n, ulint segments);
395
396 /** Destructor */
397 ~AIO();
398
399 /** Initialize the instance
400 @return DB_SUCCESS or error code */
401 dberr_t init();
402
403 /** Requests for a slot in the aio array. If no slot is available, waits
404 until not_full-event becomes signaled.
405
406 @param[in,out] type IO context
407 @param[in,out] m1 message to be passed along with the AIO
408 operation
409 @param[in,out] m2 message to be passed along with the AIO
410 operation
411 @param[in] file file handle
412 @param[in] name name of the file or path as a null-terminated
413 string
414 @param[in,out] buf buffer where to read or from which to write
415 @param[in] offset file offset, where to read from or start writing
416 @param[in] len length of the block to read or write
417 @return pointer to slot */
418 Slot *reserve_slot(IORequest &type, fil_node_t *m1, void *m2,
419 pfs_os_file_t file, const char *name, void *buf,
420 os_offset_t offset, ulint len)
421 MY_ATTRIBUTE((warn_unused_result));
422
423 /** @return number of reserved slots */
424 ulint pending_io_count() const;
425
426 /** Returns a pointer to the nth slot in the aio array.
427 @param[in] i Index of the slot in the array
428 @return pointer to slot */
at(ulint i) const429 const Slot *at(ulint i) const MY_ATTRIBUTE((warn_unused_result)) {
430 ut_a(i < m_slots.size());
431
432 return (&m_slots[i]);
433 }
434
435 /** Non const version */
at(ulint i)436 Slot *at(ulint i) MY_ATTRIBUTE((warn_unused_result)) {
437 if (i >= m_slots.size()) {
438 ib::fatal(ER_IB_MSG_1357) << "i: " << i << " slots: " << m_slots.size();
439 }
440
441 return (&m_slots[i]);
442 }
443
444 /** Frees a slot in the AIO array, assumes caller owns the mutex.
445 @param[in,out] slot Slot to release */
446 void release(Slot *slot);
447
448 /** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
449 @param[in,out] slot Slot to release */
450 void release_with_mutex(Slot *slot);
451
452 /** Prints info about the aio array.
453 @param[in,out] file Where to print */
454 void print(FILE *file);
455
456 /** @return the number of slots per segment */
slots_per_segment() const457 ulint slots_per_segment() const MY_ATTRIBUTE((warn_unused_result)) {
458 return (m_slots.size() / m_n_segments);
459 }
460
461 /** @return accessor for n_segments */
get_n_segments() const462 ulint get_n_segments() const MY_ATTRIBUTE((warn_unused_result)) {
463 return (m_n_segments);
464 }
465
466 #ifdef UNIV_DEBUG
467 /** @return true if the thread owns the mutex */
is_mutex_owned() const468 bool is_mutex_owned() const MY_ATTRIBUTE((warn_unused_result)) {
469 return (mutex_own(&m_mutex));
470 }
471 #endif /* UNIV_DEBUG */
472
473 /** Acquire the mutex */
acquire() const474 void acquire() const { mutex_enter(&m_mutex); }
475
476 /** Release the mutex */
release() const477 void release() const { mutex_exit(&m_mutex); }
478
479 /** Write out the state to the file/stream
480 @param[in, out] file File to write to */
481 void to_file(FILE *file) const;
482
483 #ifdef LINUX_NATIVE_AIO
484 /** Dispatch an AIO request to the kernel.
485 @param[in,out] slot an already reserved slot
486 @return true on success. */
487 bool linux_dispatch(Slot *slot) MY_ATTRIBUTE((warn_unused_result));
488
489 /** Accessor for an AIO event
490 @param[in] index Index into the array
491 @return the event at the index */
io_events(ulint index)492 io_event *io_events(ulint index) MY_ATTRIBUTE((warn_unused_result)) {
493 ut_a(index < m_events.size());
494
495 return (&m_events[index]);
496 }
497
498 /** Accessor for the AIO context
499 @param[in] segment Segment for which to get the context
500 @return the AIO context for the segment */
io_ctx(ulint segment)501 io_context *io_ctx(ulint segment) MY_ATTRIBUTE((warn_unused_result)) {
502 ut_ad(segment < get_n_segments());
503
504 return (m_aio_ctx[segment]);
505 }
506
507 /** Creates an io_context for native linux AIO.
508 @param[in] max_events number of events
509 @param[out] io_ctx io_ctx to initialize.
510 @return true on success. */
511 static bool linux_create_io_ctx(ulint max_events, io_context_t *io_ctx)
512 MY_ATTRIBUTE((warn_unused_result));
513
514 /** Checks if the system supports native linux aio. On some kernel
515 versions where native aio is supported it won't work on tmpfs. In such
516 cases we can't use native aio as it is not possible to mix simulated
517 and native aio.
518 @return true if supported, false otherwise. */
519 static bool is_linux_native_aio_supported()
520 MY_ATTRIBUTE((warn_unused_result));
521 #endif /* LINUX_NATIVE_AIO */
522
523 #ifdef WIN_ASYNC_IO
524 /** Wakes up all async i/o threads in the array in Windows async I/O at
525 shutdown. */
signal()526 void signal() {
527 for (ulint i = 0; i < m_slots.size(); ++i) {
528 SetEvent(m_slots[i].handle);
529 }
530 }
531
532 /** Wake up all AIO threads in Windows native aio */
wake_at_shutdown()533 static void wake_at_shutdown() {
534 s_reads->signal();
535
536 if (s_writes != NULL) {
537 s_writes->signal();
538 }
539
540 if (s_ibuf != NULL) {
541 s_ibuf->signal();
542 }
543
544 if (s_log != NULL) {
545 s_log->signal();
546 }
547 }
548 #endif /* WIN_ASYNC_IO */
549
550 #ifdef _WIN32
551 /** This function can be called if one wants to post a batch of reads
552 and prefers an I/O - handler thread to handle them all at once later.You
553 must call os_aio_simulated_wake_handler_threads later to ensure the
554 threads are not left sleeping! */
555 static void simulated_put_read_threads_to_sleep();
556
557 /** The non asynchronous IO array.
558 @return the synchronous AIO array instance. */
sync_array()559 static AIO *sync_array() MY_ATTRIBUTE((warn_unused_result)) { return s_sync; }
560
561 /**
562 Get the AIO handles for a segment.
563 @param[in] segment The local segment.
564 @return the handles for the segment. */
handles(ulint segment)565 HANDLE *handles(ulint segment) MY_ATTRIBUTE((warn_unused_result)) {
566 ut_ad(segment < m_handles->size() / slots_per_segment());
567
568 return (&(*m_handles)[segment * slots_per_segment()]);
569 }
570
571 /** @return true if no slots are reserved */
is_empty() const572 bool is_empty() const MY_ATTRIBUTE((warn_unused_result)) {
573 ut_ad(is_mutex_owned());
574 return (m_n_reserved == 0);
575 }
576 #endif /* _WIN32 */
577
578 /** Create an instance using new(std::nothrow)
579 @param[in] id Latch ID
580 @param[in] n The number of AIO request slots
581 @param[in] n_segments The number of segments
582 @return a new AIO instance */
583 static AIO *create(latch_id_t id, ulint n, ulint n_segments)
584 MY_ATTRIBUTE((warn_unused_result));
585
586 /** Initializes the asynchronous io system. Creates one array each
587 for ibuf and log I/O. Also creates one array each for read and write
588 where each array is divided logically into n_readers and n_writers
589 respectively. The caller must create an i/o handler thread for each
590 segment in these arrays. This function also creates the sync array.
591 No I/O handler thread needs to be created for that
592 @param[in] n_per_seg maximum number of pending aio
593 operations allowed per segment
594 @param[in] n_readers number of reader threads
595 @param[in] n_writers number of writer threads
596 @param[in] n_slots_sync number of slots in the sync aio array
597 @return true if AIO sub-system was started successfully */
598 static bool start(ulint n_per_seg, ulint n_readers, ulint n_writers,
599 ulint n_slots_sync) MY_ATTRIBUTE((warn_unused_result));
600
601 /** Free the AIO arrays */
602 static void shutdown();
603
604 /** Print all the AIO segments
605 @param[in,out] file Where to print */
606 static void print_all(FILE *file);
607
608 /** Calculates local segment number and aio array from global
609 segment number.
610 @param[out] array AIO wait array
611 @param[in] segment global segment number
612 @return local segment number within the aio array */
613 static ulint get_array_and_local_segment(AIO *&array, ulint segment)
614 MY_ATTRIBUTE((warn_unused_result));
615
616 /** Select the IO slot array
617 @param[in,out] type Type of IO, READ or WRITE
618 @param[in] read_only true if running in read-only mode
619 @param[in] aio_mode IO mode
620 @return slot array or NULL if invalid mode specified */
621 static AIO *select_slot_array(IORequest &type, bool read_only,
622 AIO_mode aio_mode)
623 MY_ATTRIBUTE((warn_unused_result));
624
625 /** Calculates segment number for a slot.
626 @param[in] array AIO wait array
627 @param[in] slot slot in this array
628 @return segment number (which is the number used by, for example,
629 I/O handler threads) */
630 static ulint get_segment_no_from_slot(const AIO *array, const Slot *slot)
631 MY_ATTRIBUTE((warn_unused_result));
632
633 /** Wakes up a simulated AIO I/O-handler thread if it has something
634 to do.
635 @param[in] global_segment the number of the segment in the
636 AIO arrays */
637 static void wake_simulated_handler_thread(ulint global_segment);
638
639 /** Check if it is a read request
640 @param[in] aio The AIO instance to check
641 @return true if the AIO instance is for reading. */
is_read(const AIO * aio)642 static bool is_read(const AIO *aio) MY_ATTRIBUTE((warn_unused_result)) {
643 return (s_reads == aio);
644 }
645
646 /** Wait on an event until no pending writes */
wait_until_no_pending_writes()647 static void wait_until_no_pending_writes() {
648 os_event_wait(AIO::s_writes->m_is_empty);
649 }
650
651 /** Print to file
652 @param[in] file File to write to */
653 static void print_to_file(FILE *file);
654
655 /** Check for pending IO. Gets the count and also validates the
656 data structures.
657 @return count of pending IO requests */
658 static ulint total_pending_io_count();
659
660 private:
661 /** Initialise the slots
662 @return DB_SUCCESS or error code */
663 dberr_t init_slots() MY_ATTRIBUTE((warn_unused_result));
664
665 /** Wakes up a simulated AIO I/O-handler thread if it has something
666 to do for a local segment in the AIO array.
667 @param[in] global_segment the number of the segment in the
668 AIO arrays
669 @param[in] segment the local segment in the AIO array */
670 void wake_simulated_handler_thread(ulint global_segment, ulint segment);
671
672 /** Prints pending IO requests per segment of an aio array.
673 We probably don't need per segment statistics but they can help us
674 during development phase to see if the IO requests are being
675 distributed as expected.
676 @param[in,out] file file where to print
677 @param[in] segments pending IO array */
678 void print_segment_info(FILE *file, const ulint *segments);
679
680 #ifdef LINUX_NATIVE_AIO
681 /** Initialise the Linux native AIO data structures
682 @return DB_SUCCESS or error code */
683 dberr_t init_linux_native_aio() MY_ATTRIBUTE((warn_unused_result));
684 #endif /* LINUX_NATIVE_AIO */
685
686 private:
687 typedef std::vector<Slot> Slots;
688
689 /** the mutex protecting the aio array */
690 mutable SysMutex m_mutex;
691
692 /** Pointer to the slots in the array.
693 Number of elements must be divisible by n_threads. */
694 Slots m_slots;
695
696 /** Number of segments in the aio array of pending aio requests.
697 A thread can wait separately for any one of the segments. */
698 ulint m_n_segments;
699
700 /** The event which is set to the signaled state when
701 there is space in the aio outside the ibuf segment */
702 os_event_t m_not_full;
703
704 /** The event which is set to the signaled state when
705 there are no pending i/os in this array */
706 os_event_t m_is_empty;
707
708 /** Number of reserved slots in the AIO array outside
709 the ibuf segment */
710 ulint m_n_reserved;
711
712 #ifdef _WIN32
713 typedef std::vector<HANDLE, ut_allocator<HANDLE>> Handles;
714
715 /** Pointer to an array of OS native event handles where
716 we copied the handles from slots, in the same order. This
717 can be used in WaitForMultipleObjects; used only in Windows */
718 Handles *m_handles;
719 #endif /* _WIN32 */
720
721 #if defined(LINUX_NATIVE_AIO)
722 typedef std::vector<io_event> IOEvents;
723
724 /** completion queue for IO. There is one such queue per
725 segment. Each thread will work on one ctx exclusively. */
726 io_context_t *m_aio_ctx;
727
728 /** The array to collect completed IOs. There is one such
729 event for each possible pending IO. The size of the array
730 is equal to m_slots.size(). */
731 IOEvents m_events;
732 #endif /* LINUX_NATIV_AIO */
733
734 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
735 sync AIO. These are NULL when the module has not yet been
736 initialized. */
737
738 /** Insert buffer */
739 static AIO *s_ibuf;
740
741 /** Redo log */
742 static AIO *s_log;
743
744 /** Reads */
745 static AIO *s_reads;
746
747 /** Writes */
748 static AIO *s_writes;
749
750 /** Synchronous I/O */
751 static AIO *s_sync;
752 };
753
/** Definitions of the static data members of AIO */
755 AIO *AIO::s_reads;
756 AIO *AIO::s_writes;
757 AIO *AIO::s_ibuf;
758 AIO *AIO::s_log;
759 AIO *AIO::s_sync;
760
761 #if defined(LINUX_NATIVE_AIO)
762 /** timeout for each io_getevents() call = 500ms. */
763 static const ulint OS_AIO_REAP_TIMEOUT = 500000000UL;
764
765 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
766 static const ulint OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
767
768 /** number of attempts before giving up on io_setup(). */
769 static const int OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
770 #endif /* LINUX_NATIVE_AIO */
771
772 /** Array of events used in simulated AIO */
773 static os_event_t *os_aio_segment_wait_events = nullptr;
774
775 /** Number of asynchronous I/O segments. Set by os_aio_init(). */
776 static ulint os_aio_n_segments = ULINT_UNDEFINED;
777
778 /** If the following is true, read i/o handler threads try to
779 wait until a batch of new read requests have been posted */
780 static bool os_aio_recommend_sleep_for_read_threads = false;
781
782 ulint os_n_file_reads = 0;
783 static ulint os_bytes_read_since_printout = 0;
784 ulint os_n_file_writes = 0;
785 ulint os_n_fsyncs = 0;
786 static ulint os_n_file_reads_old = 0;
787 static ulint os_n_file_writes_old = 0;
788 static ulint os_n_fsyncs_old = 0;
789 /** Number of pending write operations */
790 ulint os_n_pending_writes = 0;
791 /** Number of pending read operations */
792 ulint os_n_pending_reads = 0;
793
794 static ib_time_monotonic_t os_last_printout;
795 bool os_has_said_disk_full = false;
796
797 /** Default Zip compression level */
798 extern uint page_zip_level;
799
800 static_assert(DATA_TRX_ID_LEN <= 6, "COMPRESSION_ALGORITHM will not fit!");
801
802 /** Validates the consistency of the aio system.
803 @return true if ok */
804 static bool os_aio_validate();
805
806 /** Does error handling when a file operation fails.
807 @param[in] name File name or NULL
808 @param[in] operation Name of operation e.g., "read", "write"
809 @return true if we should retry the operation */
810 static bool os_file_handle_error(const char *name, const char *operation);
811
812 /** Free storage space associated with a section of the file.
813 @param[in] fh Open file handle
814 @param[in] off Starting offset (SEEK_SET)
815 @param[in] len Size of the hole
816 @return DB_SUCCESS or error code */
817 dberr_t os_file_punch_hole(os_file_t fh, os_offset_t off, os_offset_t len);
818
819 /**
820 Does error handling when a file operation fails.
821 @param[in] name File name or NULL
822 @param[in] operation Name of operation e.g., "read", "write"
823 @param[in] on_error_silent if true then don't print any message to the log.
824 @return true if we should retry the operation */
825 static bool os_file_handle_error_no_exit(const char *name,
826 const char *operation,
827 bool on_error_silent);
828
829 /** Decompress after a read and punch a hole in the file if it was a write
830 @param[in] type IO context
831 @param[in] fh Open file handle
832 @param[in,out] buf Buffer to transform
833 @param[in,out] scratch Scratch area for read decompression
834 @param[in] src_len Length of the buffer before compression
835 @param[in] offset file offset from the start where to read
836 @param[in] len Compressed buffer length for write and size
837 of buf len for read
838 @return DB_SUCCESS or error code */
839 static dberr_t os_file_io_complete(const IORequest &type, os_file_t fh,
840 byte *buf, byte *scratch, ulint src_len,
841 os_offset_t offset, ulint len);
842
843 /** Does simulated AIO. This function should be called by an i/o-handler
844 thread.
845
846 @param[in] global_segment The number of the segment in the aio arrays to
847 await for; segment 0 is the ibuf i/o thread,
848 segment 1 the log i/o thread, then follow the
849 non-ibuf read threads, and as the last are the
850 non-ibuf write threads
851 @param[out] m1 the messages passed with the AIO request; note
852 that also in the case where the AIO operation
853 failed, these output parameters are valid and
854 can be used to restart the operation, for
855 example
856 @param[out] m2 Callback argument
857 @param[in] type IO context
858 @return DB_SUCCESS or error code */
859 static dberr_t os_aio_simulated_handler(ulint global_segment, fil_node_t **m1,
860 void **m2, IORequest *type);
861
862 #ifdef WIN_ASYNC_IO
863 /** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
completed requests. The aio array of pending requests is divided
866 into segments. The thread specifies which segment or slot it wants to wait
867 for. NOTE: this function will also take care of freeing the aio slot,
868 therefore no other thread is allowed to do the freeing!
869 @param[in] segment The number of the segment in the aio arrays to
870 wait for; segment 0 is the ibuf I/O thread,
871 segment 1 the log I/O thread, then follow the
872 non-ibuf read threads, and as the last are the
873 non-ibuf write threads; if this is
874 ULINT_UNDEFINED, then it means that sync AIO
875 is used, and this parameter is ignored
876 @param[in] pos this parameter is used only in sync AIO:
877 wait for the aio slot at this position
878 @param[out] m1 the messages passed with the AIO request; note
879 that also in the case where the AIO operation
880 failed, these output parameters are valid and
881 can be used to restart the operation,
882 for example
883 @param[out] m2 callback message
884 @param[out] type OS_FILE_WRITE or ..._READ
885 @return DB_SUCCESS or error code */
886 static dberr_t os_aio_windows_handler(ulint segment, ulint pos, fil_node_t **m1,
887 void **m2, IORequest *type);
888 #endif /* WIN_ASYNC_IO */
889
890 /** Check the file type and determine if it can be deleted.
891 @param[in] name Filename/Path to check
892 @return true if it's a file or a symlink and can be deleted */
os_file_can_delete(const char * name)893 static bool os_file_can_delete(const char *name) {
894 switch (Fil_path::get_file_type(name)) {
895 case OS_FILE_TYPE_FILE:
896 case OS_FILE_TYPE_LINK:
897 return (true);
898
899 case OS_FILE_TYPE_DIR:
900
901 ib::warn(ER_IB_MSG_743) << "'" << name << "'"
902 << " is a directory, can't delete!";
903 break;
904
905 case OS_FILE_TYPE_BLOCK:
906
907 ib::warn(ER_IB_MSG_744) << "'" << name << "'"
908 << " is a block device, can't delete!";
909 break;
910
911 case OS_FILE_TYPE_FAILED:
912
913 ib::warn(ER_IB_MSG_745) << "'" << name << "'"
914 << " get file type failed, won't delete!";
915 break;
916
917 case OS_FILE_TYPE_UNKNOWN:
918
919 ib::warn(ER_IB_MSG_746) << "'" << name << "'"
920 << " unknown file type, won't delete!";
921 break;
922
923 case OS_FILE_TYPE_NAME_TOO_LONG:
924
925 ib::warn(ER_IB_MSG_747) << "'" << name << "'"
926 << " name too long, can't delete!";
927 break;
928
929 case OS_FILE_PERMISSION_ERROR:
930 ib::warn(ER_IB_MSG_748) << "'" << name << "'"
931 << " permission error, can't delete!";
932 break;
933
934 case OS_FILE_TYPE_MISSING:
935 break;
936 }
937
938 return (false);
939 }
940
os_alloc_block()941 file::Block *os_alloc_block() noexcept {
942 size_t pos;
943 Blocks &blocks = *block_cache;
944 size_t i = static_cast<size_t>(my_timer_cycles());
945 const size_t size = blocks.size();
946 ulint retry = 0;
947 file::Block *block;
948
949 DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
950
951 for (;;) {
952 /* After go through the block cache for 3 times,
953 allocate a new temporary block. */
954 if (retry == MAX_BLOCKS * 3) {
955 byte *ptr;
956
957 ptr = static_cast<byte *>(
958 ut_malloc_nokey(sizeof(*block) + BUFFER_BLOCK_SIZE));
959
960 block = new (ptr) file::Block();
961 block->m_ptr = static_cast<byte *>(ptr + sizeof(*block));
962 block->m_in_use = 1;
963
964 break;
965 }
966
967 pos = i++ % size;
968
969 if (TAS(&blocks[pos].m_in_use, 1) == 0) {
970 block = &blocks[pos];
971 break;
972 }
973
974 os_thread_yield();
975
976 ++retry;
977 }
978
979 ut_a(block->m_in_use != 0);
980
981 return (block);
982 }
983
os_free_block(file::Block * block)984 void os_free_block(file::Block *block) noexcept {
985 ut_ad(block->m_in_use == 1);
986
987 TAS(&block->m_in_use, 0);
988
989 /* When this block is not in the block cache, and it's
990 a temporary block, we need to free it directly. */
991 if (std::less<file::Block *>()(block, &block_cache->front()) ||
992 std::greater<file::Block *>()(block, &block_cache->back())) {
993 ut_free(block);
994 }
995 }
996
997 /** Generic AIO Handler methods. Currently handles IO post processing. */
998 class AIOHandler {
999 public:
1000 /** Do any post processing after a read/write
1001 @return DB_SUCCESS or error code. */
1002 static dberr_t post_io_processing(Slot *slot);
1003
1004 /** Decompress after a read and punch a hole in the file if
1005 it was a write */
io_complete(const Slot * slot)1006 static dberr_t io_complete(const Slot *slot) {
1007 ut_a(slot->offset > 0);
1008 ut_a(slot->type.is_read() || !slot->skip_punch_hole);
1009 return (os_file_io_complete(slot->type, slot->file.m_file, slot->buf,
1010 nullptr, slot->original_len, slot->offset,
1011 slot->len));
1012 }
1013
1014 private:
1015 /** Check whether the page was encrypted.
1016 @param[in] slot The slot that contains the IO request
1017 @return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)1018 static bool is_encrypted_page(const Slot *slot) {
1019 return (Encryption::is_encrypted_page(slot->buf));
1020 }
1021
1022 /** Check whether the page was compressed.
1023 @param[in] slot The slot that contains the IO request
1024 @return true if it was a compressed page */
is_compressed_page(const Slot * slot)1025 static bool is_compressed_page(const Slot *slot) {
1026 const byte *src = slot->buf;
1027
1028 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1029
1030 return (page_type == FIL_PAGE_COMPRESSED);
1031 }
1032
1033 /** Get the compressed page size.
1034 @param[in] slot The slot that contains the IO request
1035 @return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1036 static ulint compressed_page_size(const Slot *slot) {
1037 ut_ad(slot->type.is_read());
1038 ut_ad(is_compressed_page(slot));
1039
1040 ulint size;
1041 const byte *src = slot->buf;
1042
1043 size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1044
1045 return (size + FIL_PAGE_DATA);
1046 }
1047
1048 /** Check if the page contents can be decompressed.
1049 @param[in] slot The slot that contains the IO request
1050 @return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1051 static bool can_decompress(const Slot *slot) {
1052 ut_ad(slot->type.is_read());
1053 ut_ad(is_compressed_page(slot));
1054
1055 ulint version;
1056 const byte *src = slot->buf;
1057
1058 version = mach_read_from_1(src + FIL_PAGE_VERSION);
1059
1060 ut_a(version == 1);
1061
1062 /* Includes the page header size too */
1063 ulint size = compressed_page_size(slot);
1064
1065 return (size <= (slot->ptr - slot->buf) + (ulint)slot->n_bytes);
1066 }
1067
1068 /** Check if we need to read some more data.
1069 @param[in] slot The slot that contains the IO request
1070 @param[in] n_bytes Total bytes read so far
1071 @return DB_SUCCESS or error code */
1072 static dberr_t check_read(Slot *slot, ulint n_bytes);
1073 };
1074
1075 /** Helper class for doing synchronous file IO. Currently, the objective
1076 is to hide the OS specific code, so that the higher level functions aren't
1077 peppered with "#ifdef". Makes the code flow difficult to follow. */
1078 class SyncFileIO {
1079 public:
1080 /** Constructor
1081 @param[in] fh File handle
1082 @param[in,out] buf Buffer to read/write
1083 @param[in] n Number of bytes to read/write
1084 @param[in] offset Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1085 SyncFileIO(os_file_t fh, void *buf, ulint n, os_offset_t offset)
1086 : m_fh(fh), m_buf(buf), m_n(static_cast<ssize_t>(n)), m_offset(offset) {
1087 ut_ad(m_n > 0);
1088 }
1089
1090 /** Destructor */
~SyncFileIO()1091 ~SyncFileIO() { /* No op */
1092 }
1093
1094 /** Do the read/write
1095 @param[in] request The IO context and type
1096 @return the number of bytes read/written or negative value on error */
1097 ssize_t execute(const IORequest &request);
1098
1099 /** Do the read/write
1100 @param[in,out] slot The IO slot, it has the IO context
1101 @return the number of bytes read/written or negative value on error */
1102 static ssize_t execute(Slot *slot);
1103
1104 /** Move the read/write offset up to where the partial IO succeeded.
1105 @param[in] n_bytes The number of bytes to advance */
advance(ssize_t n_bytes)1106 void advance(ssize_t n_bytes) {
1107 m_offset += n_bytes;
1108
1109 ut_ad(m_n >= n_bytes);
1110
1111 m_n -= n_bytes;
1112
1113 m_buf = reinterpret_cast<uchar *>(m_buf) + n_bytes;
1114 }
1115
1116 private:
1117 /** Open file handle */
1118 os_file_t m_fh;
1119
1120 /** Buffer to read/write */
1121 void *m_buf;
1122
1123 /** Number of bytes to read/write */
1124 ssize_t m_n;
1125
1126 /** Offset from where to read/write */
1127 os_offset_t m_offset;
1128 };
1129
1130 /** If it is a compressed page return the compressed page data + footer size
1131 @param[in] buf Buffer to check, must include header + 10 bytes
1132 @return ULINT_UNDEFINED if the page is not a compressed page or length
1133 of the compressed data (including footer) if it is a compressed page */
os_file_compressed_page_size(const byte * buf)1134 ulint os_file_compressed_page_size(const byte *buf) {
1135 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1136
1137 if (type == FIL_PAGE_COMPRESSED) {
1138 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1139 ut_a(version == 1);
1140 return (mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1141 }
1142
1143 return (ULINT_UNDEFINED);
1144 }
1145
1146 /** If it is a compressed page return the original page data + footer size
1147 @param[in] buf Buffer to check, must include header + 10 bytes
1148 @return ULINT_UNDEFINED if the page is not a compressed page or length
1149 of the original data + footer if it is a compressed page */
os_file_original_page_size(const byte * buf)1150 ulint os_file_original_page_size(const byte *buf) {
1151 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1152
1153 if (type == FIL_PAGE_COMPRESSED) {
1154 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1155 ut_a(version == 1);
1156
1157 return (mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1158 }
1159
1160 return (ULINT_UNDEFINED);
1161 }
1162
1163 /** Check if we need to read some more data.
1164 @param[in] slot The slot that contains the IO request
1165 @param[in] n_bytes Total bytes read so far
1166 @return DB_SUCCESS or error code */
check_read(Slot * slot,ulint n_bytes)1167 dberr_t AIOHandler::check_read(Slot *slot, ulint n_bytes) {
1168 dberr_t err;
1169
1170 ut_ad(slot->type.is_read());
1171 ut_ad(slot->original_len > slot->len);
1172
1173 if (is_compressed_page(slot)) {
1174 if (can_decompress(slot)) {
1175 ut_a(slot->offset > 0);
1176
1177 slot->len = slot->original_len;
1178 #ifdef _WIN32
1179 slot->n_bytes = static_cast<DWORD>(n_bytes);
1180 #else
1181 slot->n_bytes = static_cast<ulint>(n_bytes);
1182 #endif /* _WIN32 */
1183
1184 err = io_complete(slot);
1185 ut_a(err == DB_SUCCESS);
1186 } else {
1187 /* Read the next block in */
1188 ut_ad(compressed_page_size(slot) >= n_bytes);
1189
1190 err = DB_FAIL;
1191 }
1192 } else if (is_encrypted_page(slot) ||
1193 (slot->type.is_log() && slot->offset >= LOG_FILE_HDR_SIZE)) {
1194 ut_a(slot->offset > 0);
1195
1196 slot->len = slot->original_len;
1197 #ifdef _WIN32
1198 slot->n_bytes = static_cast<DWORD>(n_bytes);
1199 #else
1200 slot->n_bytes = static_cast<ulint>(n_bytes);
1201 #endif /* _WIN32 */
1202
1203 err = io_complete(slot);
1204 ut_a(err == DB_SUCCESS);
1205
1206 } else {
1207 err = DB_FAIL;
1208 }
1209
1210 if (slot->buf_block != nullptr) {
1211 os_free_block(slot->buf_block);
1212 slot->buf_block = nullptr;
1213 }
1214
1215 if (slot->encrypt_log_buf != nullptr) {
1216 ut_free(slot->encrypt_log_buf);
1217 slot->encrypt_log_buf = nullptr;
1218 }
1219
1220 return (err);
1221 }
1222
1223 /** Do any post processing after a read/write
1224 @return DB_SUCCESS or error code. */
post_io_processing(Slot * slot)1225 dberr_t AIOHandler::post_io_processing(Slot *slot) {
1226 dberr_t err;
1227
1228 ut_ad(slot->is_reserved);
1229
1230 /* Total bytes read so far */
1231 ulint n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1232
1233 /* Compressed writes can be smaller than the original length.
1234 Therefore they can be processed without further IO. */
1235 if (n_bytes == slot->original_len ||
1236 (slot->type.is_write() && slot->type.is_compressed() &&
1237 slot->len == static_cast<ulint>(slot->n_bytes))) {
1238 if ((slot->type.is_log() && slot->offset >= LOG_FILE_HDR_SIZE) ||
1239 is_compressed_page(slot) || is_encrypted_page(slot)) {
1240 ut_a(slot->offset > 0);
1241
1242 if (slot->type.is_read()) {
1243 slot->len = slot->original_len;
1244 }
1245
1246 /* The punch hole has been done on collect() */
1247
1248 if (slot->type.is_read()) {
1249 err = io_complete(slot);
1250 } else {
1251 err = DB_SUCCESS;
1252 }
1253
1254 ut_ad(err == DB_SUCCESS || err == DB_UNSUPPORTED ||
1255 err == DB_CORRUPTION || err == DB_IO_DECOMPRESS_FAIL);
1256 } else {
1257 err = DB_SUCCESS;
1258 }
1259
1260 if (slot->buf_block != nullptr) {
1261 os_free_block(slot->buf_block);
1262 slot->buf_block = nullptr;
1263 }
1264
1265 if (slot->encrypt_log_buf != nullptr) {
1266 ut_free(slot->encrypt_log_buf);
1267 slot->encrypt_log_buf = nullptr;
1268 }
1269 } else if ((ulint)slot->n_bytes == (ulint)slot->len) {
1270 /* It *must* be a partial read. */
1271 ut_ad(slot->len < slot->original_len);
1272
1273 /* Has to be a read request, if it is less than
1274 the original length. */
1275 ut_ad(slot->type.is_read());
1276 err = check_read(slot, n_bytes);
1277
1278 } else {
1279 err = DB_FAIL;
1280 }
1281
1282 return (err);
1283 }
1284
1285 /** Count the number of free slots
1286 @return number of reserved slots */
pending_io_count() const1287 ulint AIO::pending_io_count() const {
1288 acquire();
1289
1290 #ifdef UNIV_DEBUG
1291 ut_a(m_n_segments > 0);
1292 ut_a(!m_slots.empty());
1293
1294 ulint count = 0;
1295
1296 for (ulint i = 0; i < m_slots.size(); ++i) {
1297 const Slot &slot = m_slots[i];
1298
1299 if (slot.is_reserved) {
1300 ++count;
1301 ut_a(slot.len > 0);
1302 }
1303 }
1304
1305 ut_a(m_n_reserved == count);
1306 #endif /* UNIV_DEBUG */
1307
1308 ulint reserved = m_n_reserved;
1309
1310 release();
1311
1312 return (reserved);
1313 }
1314
1315 /** Compress a data page
1316 @param[in] compression Compression algorithm
1317 @param[in] block_size File system block size
1318 @param[in] src Source contents to compress
1319 @param[in] src_len Length in bytes of the source
1320 @param[out] dst Compressed page contents
1321 @param[out] dst_len Length in bytes of dst contents
1322 @return buffer data, dst_len will have the length of the data */
os_file_compress_page(Compression compression,ulint block_size,byte * src,ulint src_len,byte * dst,ulint * dst_len)1323 byte *os_file_compress_page(Compression compression, ulint block_size,
1324 byte *src, ulint src_len, byte *dst,
1325 ulint *dst_len) {
1326 ulint len = 0;
1327 ulint compression_level = page_zip_level;
1328 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1329
1330 /* The page size must be a multiple of the OS punch hole size. */
1331 ut_ad(!(src_len % block_size));
1332
1333 /* Shouldn't compress an already compressed page. */
1334 ut_ad(page_type != FIL_PAGE_COMPRESSED);
1335
1336 /* The page must be at least twice as large as the file system
1337 block size if we are to save any space. Ignore R-Tree pages for now,
1338 they repurpose the same 8 bytes in the page header. No point in
1339 compressing if the file system block size >= our page size. */
1340
1341 if (page_type == FIL_PAGE_RTREE || block_size == ULINT_UNDEFINED ||
1342 compression.m_type == Compression::NONE || src_len < block_size * 2) {
1343 *dst_len = src_len;
1344
1345 return (src);
1346 }
1347
1348 /* Leave the header alone when compressing. */
1349 ut_ad(block_size >= FIL_PAGE_DATA * 2);
1350
1351 ut_ad(src_len > FIL_PAGE_DATA + block_size);
1352
1353 /* Must compress to <= N-1 FS blocks. */
1354 ulint out_len = src_len - (FIL_PAGE_DATA + block_size);
1355
1356 /* This is the original data page size - the page header. */
1357 ulint content_len = src_len - FIL_PAGE_DATA;
1358
1359 ut_ad(out_len >= block_size - FIL_PAGE_DATA);
1360 ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA));
1361
1362 /* Only compress the data + trailer, leave the header alone */
1363
1364 switch (compression.m_type) {
1365 case Compression::NONE:
1366 ut_error;
1367
1368 case Compression::ZLIB: {
1369 uLongf zlen = static_cast<uLongf>(out_len);
1370
1371 if (compress2(dst + FIL_PAGE_DATA, &zlen, src + FIL_PAGE_DATA,
1372 static_cast<uLong>(content_len),
1373 static_cast<int>(compression_level)) != Z_OK) {
1374 *dst_len = src_len;
1375
1376 return (src);
1377 }
1378
1379 len = static_cast<ulint>(zlen);
1380
1381 break;
1382 }
1383
1384 case Compression::LZ4:
1385
1386 len = LZ4_compress_default(reinterpret_cast<char *>(src) + FIL_PAGE_DATA,
1387 reinterpret_cast<char *>(dst) + FIL_PAGE_DATA,
1388 static_cast<int>(content_len),
1389 static_cast<int>(out_len));
1390
1391 ut_a(len <= src_len - FIL_PAGE_DATA);
1392
1393 if (len == 0 || len >= out_len) {
1394 *dst_len = src_len;
1395
1396 return (src);
1397 }
1398
1399 break;
1400
1401 default:
1402 *dst_len = src_len;
1403 return (src);
1404 }
1405
1406 ut_a(len <= out_len);
1407
1408 ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
1409 src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);
1410
1411 /* Copy the header as is. */
1412 memmove(dst, src, FIL_PAGE_DATA);
1413
1414 /* Add compression control information. Required for decompressing. */
1415 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
1416
1417 mach_write_to_1(dst + FIL_PAGE_VERSION, 1);
1418
1419 mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);
1420
1421 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
1422
1423 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);
1424
1425 mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);
1426
1427 /* Round to the next full block size */
1428
1429 len += FIL_PAGE_DATA;
1430
1431 *dst_len = ut_calc_align(len, block_size);
1432
1433 ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA);
1434
1435 /* Clear out the unused portion of the page. */
1436 if (len % block_size) {
1437 memset(dst + len, 0x0, block_size - (len % block_size));
1438 }
1439
1440 return (dst);
1441 }
1442
1443 #ifdef UNIV_DEBUG
1444 #ifndef UNIV_HOTBACKUP
1445 /** Validates the consistency the aio system some of the time.
1446 @return true if ok or the check was skipped */
os_aio_validate_skip()1447 static bool os_aio_validate_skip() {
1448 /** Try os_aio_validate() every this many times */
1449 #define OS_AIO_VALIDATE_SKIP 13
1450
1451 /** The os_aio_validate() call skip counter.
1452 Use a signed type because of the race condition below. */
1453 static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1454
1455 /* There is a race condition below, but it does not matter,
1456 because this call is only for heuristic purposes. We want to
1457 reduce the call frequency of the costly os_aio_validate()
1458 check in debug builds. */
1459 --os_aio_validate_count;
1460
1461 if (os_aio_validate_count > 0) {
1462 return (true);
1463 }
1464
1465 os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1466 return (os_aio_validate());
1467 }
1468 #endif /* !UNIV_HOTBACKUP */
1469 #endif /* UNIV_DEBUG */
1470
1471 #undef USE_FILE_LOCK
1472 #define USE_FILE_LOCK
1473 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1474 /* InnoDB Hot Backup does not lock the data files.
1475 * On Windows, mandatory locking is used.
1476 */
1477 #undef USE_FILE_LOCK
1478 #endif /* UNIV_HOTBACKUP || _WIN32 */
1479 #ifdef USE_FILE_LOCK
1480 /** Obtain an exclusive lock on a file.
1481 @param[in] fd file descriptor
1482 @param[in] name file name
1483 @return 0 on success */
os_file_lock(int fd,const char * name)1484 static int os_file_lock(int fd, const char *name) {
1485 struct flock lk;
1486
1487 lk.l_type = F_WRLCK;
1488 lk.l_whence = SEEK_SET;
1489 lk.l_start = lk.l_len = 0;
1490
1491 if (fcntl(fd, F_SETLK, &lk) == -1) {
1492 ib::error(ER_IB_MSG_749)
1493 << "Unable to lock " << name << " error: " << errno;
1494
1495 if (errno == EAGAIN || errno == EACCES) {
1496 ib::info(ER_IB_MSG_750) << "Check that you do not already have"
1497 " another mysqld process using the"
1498 " same InnoDB data or log files.";
1499 }
1500
1501 return (-1);
1502 }
1503
1504 return (0);
1505 }
1506 #endif /* USE_FILE_LOCK */
1507
1508 /** Calculates local segment number and aio array from global segment number.
1509 @param[out] array aio wait array
1510 @param[in] segment global segment number
1511 @return local segment number within the aio array */
get_array_and_local_segment(AIO * & array,ulint segment)1512 ulint AIO::get_array_and_local_segment(AIO *&array, ulint segment) {
1513 ulint limit = srv_read_only_mode ? 0 : 2;
1514
1515 ut_a(segment < os_aio_n_segments);
1516
1517 if (!srv_read_only_mode && segment < limit) {
1518 /* We don't support ibuf/log IO during read only mode. */
1519
1520 if (segment == IO_IBUF_SEGMENT) {
1521 array = s_ibuf;
1522
1523 } else if (segment == IO_LOG_SEGMENT) {
1524 array = s_log;
1525
1526 } else {
1527 array = nullptr;
1528 }
1529
1530 return 0;
1531 }
1532
1533 if (segment < s_reads->m_n_segments + limit) {
1534 array = s_reads;
1535
1536 return segment - limit;
1537 }
1538
1539 limit += s_reads->m_n_segments;
1540
1541 ut_a(segment < s_writes->m_n_segments + limit);
1542
1543 array = s_writes;
1544
1545 return segment - limit;
1546 }
1547
1548 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1549 @param[in,out] slot Slot to release */
release(Slot * slot)1550 void AIO::release(Slot *slot) {
1551 ut_ad(is_mutex_owned());
1552
1553 ut_ad(slot->is_reserved);
1554
1555 slot->is_reserved = false;
1556
1557 --m_n_reserved;
1558
1559 if (m_n_reserved == m_slots.size() - 1) {
1560 os_event_set(m_not_full);
1561 }
1562
1563 if (m_n_reserved == 0) {
1564 os_event_set(m_is_empty);
1565 }
1566
1567 #ifdef WIN_ASYNC_IO
1568
1569 ResetEvent(slot->handle);
1570
1571 #elif defined(LINUX_NATIVE_AIO)
1572
1573 if (srv_use_native_aio) {
1574 memset(&slot->control, 0x0, sizeof(slot->control));
1575 slot->ret = 0;
1576 slot->n_bytes = 0;
1577 } else {
1578 /* These fields should not be used if we are not
1579 using native AIO. */
1580 ut_ad(slot->n_bytes == 0);
1581 ut_ad(slot->ret == 0);
1582 }
1583
1584 #endif /* WIN_ASYNC_IO */
1585 }
1586
1587 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1588 @param[in,out] slot Slot to release */
release_with_mutex(Slot * slot)1589 void AIO::release_with_mutex(Slot *slot) {
1590 acquire();
1591
1592 release(slot);
1593
1594 release();
1595 }
1596
1597 #ifndef UNIV_HOTBACKUP
1598 /** Creates a temporary file. This function is like tmpfile(3), but
1599 the temporary file is created in the given parameter path. If the path
1600 is NULL then it will create the file in the MySQL server configuration
1601 parameter (--tmpdir).
1602 @param[in] path location for creating temporary file
1603 @return temporary file handle, or NULL on error */
os_file_create_tmpfile(const char * path)1604 FILE *os_file_create_tmpfile(const char *path) {
1605 FILE *file = nullptr;
1606 int fd = innobase_mysql_tmpfile(path);
1607
1608 if (fd >= 0) {
1609 file = fdopen(fd, "w+b");
1610 }
1611
1612 if (file == nullptr) {
1613 ib::error(ER_IB_MSG_751)
1614 << "Unable to create temporary file; errno: " << errno;
1615
1616 if (fd >= 0) {
1617 close(fd);
1618 }
1619 }
1620
1621 return (file);
1622 }
1623 #endif /* !UNIV_HOTBACKUP */
1624
1625 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1626 NUL-terminate str. All errors are silently ignored. This function is
1627 mostly meant to be used with temporary files.
1628 @param[in,out] file File to read from
1629 @param[in,out] str Buffer where to read
1630 @param[in] size Size of buffer */
os_file_read_string(FILE * file,char * str,ulint size)1631 void os_file_read_string(FILE *file, char *str, ulint size) {
1632 if (size != 0) {
1633 rewind(file);
1634
1635 size_t flen = fread(str, 1, size - 1, file);
1636
1637 str[flen] = '\0';
1638 }
1639 }
1640
1641 /** Decompress after a read and punch a hole in the file if it was a write
1642 @param[in] type IO context
1643 @param[in] fh Open file handle
1644 @param[in,out] buf Buffer to transform
1645 @param[in,out] scratch Scratch area for read decompression
1646 @param[in] src_len Length of the buffer before compression
1647 @param[in] offset file offset from the start where to read
1648 @param[in] len Used buffer length for write and output
1649 buf len for read
1650 @return DB_SUCCESS or error code */
os_file_io_complete(const IORequest & type,os_file_t fh,byte * buf,byte * scratch,ulint src_len,os_offset_t offset,ulint len)1651 static dberr_t os_file_io_complete(const IORequest &type, os_file_t fh,
1652 byte *buf, byte *scratch, ulint src_len,
1653 os_offset_t offset, ulint len) {
1654 dberr_t ret = DB_SUCCESS;
1655
1656 /* We never compress/decompress the first page */
1657 ut_a(offset > 0);
1658 ut_ad(type.validate());
1659
1660 if (!type.is_compression_enabled()) {
1661 if (type.is_log() && offset >= LOG_FILE_HDR_SIZE) {
1662 Encryption encryption(type.encryption_algorithm());
1663
1664 ret = encryption.decrypt_log(type, buf, src_len, scratch, len);
1665 }
1666
1667 return (ret);
1668 } else if (type.is_read()) {
1669 Encryption encryption(type.encryption_algorithm());
1670
1671 ret = encryption.decrypt(type, buf, src_len, scratch, len);
1672
1673 if (ret == DB_SUCCESS) {
1674 return (os_file_decompress_page(type.is_dblwr(), buf, scratch, len));
1675 } else {
1676 return (ret);
1677 }
1678 } else if (type.punch_hole()) {
1679 ut_ad(len <= src_len);
1680 ut_ad(!type.is_log());
1681 ut_ad(type.is_write());
1682 ut_ad(type.is_compressed());
1683
1684 /* Nothing to do. */
1685 if (len == src_len) {
1686 return (DB_SUCCESS);
1687 }
1688
1689 #ifdef UNIV_DEBUG
1690 const ulint block_size = type.block_size();
1691 #endif /* UNIV_DEBUG */
1692
1693 /* We don't support multiple page sizes in the server
1694 at the moment. */
1695 ut_ad(src_len == srv_page_size);
1696
1697 /* Must be a multiple of the compression unit size. */
1698 ut_ad((len % block_size) == 0);
1699 ut_ad((offset % block_size) == 0);
1700
1701 ut_ad(len + block_size <= src_len);
1702
1703 offset += len;
1704
1705 return (os_file_punch_hole(fh, offset, src_len - len));
1706 }
1707
1708 ut_ad(!type.is_log());
1709
1710 return (DB_SUCCESS);
1711 }
1712
1713 /** Check if the path refers to the root of a drive using a pointer
1714 to the last directory separator that the caller has fixed.
1715 @param[in] path path name
1716 @param[in] last_slash last directory separator in the path
1717 @return true if this path is a drive root, false if not */
1718 UNIV_INLINE
os_file_is_root(const char * path,const char * last_slash)1719 bool os_file_is_root(const char *path, const char *last_slash) {
1720 return (
1721 #ifdef _WIN32
1722 (last_slash == path + 2 && path[1] == ':') ||
1723 #endif /* _WIN32 */
1724 last_slash == path);
1725 }
1726
1727 /** Return the parent directory component of a null-terminated path.
1728 Return a new buffer containing the string up to, but not including,
1729 the final component of the path.
1730 The path returned will not contain a trailing separator.
1731 Do not return a root path, return NULL instead.
1732 The final component trimmed off may be a filename or a directory name.
1733 If the final component is the only component of the path, return NULL.
1734 It is the caller's responsibility to free the returned string after it
1735 is no longer needed.
1736 @param[in] path Path name
1737 @return own: parent directory of the path */
os_file_get_parent_dir(const char * path)1738 static char *os_file_get_parent_dir(const char *path) {
1739 bool has_trailing_slash = false;
1740
1741 /* Find the offset of the last slash */
1742 const char *last_slash = strrchr(path, OS_PATH_SEPARATOR);
1743
1744 if (!last_slash) {
1745 /* No slash in the path, return NULL */
1746 return (nullptr);
1747 }
1748
1749 /* Ok, there is a slash. Is there anything after it? */
1750 if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1751 has_trailing_slash = true;
1752 }
1753
1754 /* Reduce repetative slashes. */
1755 while (last_slash > path && last_slash[-1] == OS_PATH_SEPARATOR) {
1756 last_slash--;
1757 }
1758
1759 /* Check for the root of a drive. */
1760 if (os_file_is_root(path, last_slash)) {
1761 return (nullptr);
1762 }
1763
1764 /* If a trailing slash prevented the first strrchr() from trimming
1765 the last component of the path, trim that component now. */
1766 if (has_trailing_slash) {
1767 /* Back up to the previous slash. */
1768 last_slash--;
1769 while (last_slash > path && last_slash[0] != OS_PATH_SEPARATOR) {
1770 last_slash--;
1771 }
1772
1773 /* Reduce repetative slashes. */
1774 while (last_slash > path && last_slash[-1] == OS_PATH_SEPARATOR) {
1775 last_slash--;
1776 }
1777 }
1778
1779 /* Check for the root of a drive. */
1780 if (os_file_is_root(path, last_slash)) {
1781 return (nullptr);
1782 }
1783
1784 if (last_slash - path < 0) {
1785 /* Sanity check, it prevents gcc from trying to handle this case which
1786 * results in warnings for some optimized builds */
1787 return (nullptr);
1788 }
1789
1790 /* Non-trivial directory component */
1791
1792 return (mem_strdupl(path, last_slash - path));
1793 }
1794 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1795
1796 /* Test the function os_file_get_parent_dir. */
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1797 void test_os_file_get_parent_dir(const char *child_dir,
1798 const char *expected_dir) {
1799 char *child = mem_strdup(child_dir);
1800 char *expected = expected_dir == NULL ? NULL : mem_strdup(expected_dir);
1801
1802 /* os_file_get_parent_dir() assumes that separators are
1803 converted to OS_PATH_SEPARATOR. */
1804 Fil_path::normalize(child);
1805 Fil_path::normalize(expected);
1806
1807 char *parent = os_file_get_parent_dir(child);
1808
1809 bool unexpected =
1810 (expected == NULL ? (parent != NULL) : (0 != strcmp(parent, expected)));
1811 if (unexpected) {
1812 ib::fatal(ER_IB_MSG_752)
1813 << "os_file_get_parent_dir('" << child << "') returned '" << parent
1814 << "', instead of '" << expected << "'.";
1815 }
1816 ut_free(parent);
1817 ut_free(child);
1818 ut_free(expected);
1819 }
1820
1821 /* Test the function os_file_get_parent_dir. */
unit_test_os_file_get_parent_dir()1822 void unit_test_os_file_get_parent_dir() {
1823 test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1824 test_os_file_get_parent_dir("/usr/", NULL);
1825 test_os_file_get_parent_dir("//usr//", NULL);
1826 test_os_file_get_parent_dir("usr", NULL);
1827 test_os_file_get_parent_dir("usr//", NULL);
1828 test_os_file_get_parent_dir("/", NULL);
1829 test_os_file_get_parent_dir("//", NULL);
1830 test_os_file_get_parent_dir(".", NULL);
1831 test_os_file_get_parent_dir("..", NULL);
1832 #ifdef _WIN32
1833 test_os_file_get_parent_dir("D:", NULL);
1834 test_os_file_get_parent_dir("D:/", NULL);
1835 test_os_file_get_parent_dir("D:\\", NULL);
1836 test_os_file_get_parent_dir("D:/data", NULL);
1837 test_os_file_get_parent_dir("D:/data/", NULL);
1838 test_os_file_get_parent_dir("D:\\data\\", NULL);
1839 test_os_file_get_parent_dir("D:///data/////", NULL);
1840 test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
1841 test_os_file_get_parent_dir("D:/data//a", "D:/data");
1842 test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
1843 test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
1844 test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\",
1845 "D:\\\\\\data\\\\a");
1846 #endif /* _WIN32 */
1847 }
1848 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
1849
1850 /** Creates all missing subdirectories along the given path.
1851 @param[in] path Path name
1852 @return DB_SUCCESS if OK, otherwise error code. */
os_file_create_subdirs_if_needed(const char * path)1853 dberr_t os_file_create_subdirs_if_needed(const char *path) {
1854 if (srv_read_only_mode) {
1855 ib::error(ER_IB_MSG_753) << "read only mode set. Can't create "
1856 << "subdirectories '" << path << "'";
1857
1858 return (DB_READ_ONLY);
1859 }
1860
1861 char *subdir = os_file_get_parent_dir(path);
1862
1863 if (subdir == nullptr) {
1864 /* subdir is root or cwd, nothing to do */
1865 return (DB_SUCCESS);
1866 }
1867
1868 /* Test if subdir exists */
1869 os_file_type_t type;
1870 bool subdir_exists;
1871 bool success = os_file_status(subdir, &subdir_exists, &type);
1872
1873 if (success && !subdir_exists) {
1874 /* Subdir does not exist, create it */
1875 dberr_t err = os_file_create_subdirs_if_needed(subdir);
1876
1877 if (err != DB_SUCCESS) {
1878 ut_free(subdir);
1879
1880 return (err);
1881 }
1882
1883 success = os_file_create_directory(subdir, false);
1884 }
1885
1886 ut_free(subdir);
1887
1888 return (success ? DB_SUCCESS : DB_ERROR);
1889 }
1890
1891 /** Allocate the buffer for IO on a transparently compressed table.
1892 @param[in] type IO flags
1893 @param[out] buf buffer to read or write
1894 @param[in,out] n number of bytes to read/write, starting from
1895 offset
1896 @return pointer to allocated page, compressed data is written to the offset
1897 that is aligned on the disk sector size */
os_file_compress_page(IORequest & type,void * & buf,ulint * n)1898 static file::Block *os_file_compress_page(IORequest &type, void *&buf,
1899 ulint *n) {
1900 ut_ad(!type.is_log());
1901 ut_ad(type.is_write());
1902 ut_ad(type.is_compressed());
1903
1904 ulint n_alloc = *n * 2;
1905
1906 ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
1907 ut_a(type.compression_algorithm().m_type != Compression::LZ4 ||
1908 static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
1909
1910 auto block = os_alloc_block();
1911
1912 ulint old_compressed_len;
1913 ulint compressed_len = *n;
1914
1915 old_compressed_len = mach_read_from_2(reinterpret_cast<byte *>(buf) +
1916 FIL_PAGE_COMPRESS_SIZE_V1);
1917
1918 if (old_compressed_len > 0) {
1919 old_compressed_len =
1920 ut_calc_align(old_compressed_len + FIL_PAGE_DATA, type.block_size());
1921 } else {
1922 old_compressed_len = *n;
1923 }
1924
1925 byte *compressed_page;
1926
1927 compressed_page =
1928 static_cast<byte *>(ut_align(block->m_ptr, os_io_ptr_align));
1929
1930 byte *buf_ptr;
1931
1932 buf_ptr = os_file_compress_page(
1933 type.compression_algorithm(), type.block_size(),
1934 reinterpret_cast<byte *>(buf), *n, compressed_page, &compressed_len);
1935
1936 if (buf_ptr != buf) {
1937 /* Set new compressed size to uncompressed page. */
1938 memcpy(reinterpret_cast<byte *>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
1939 buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
1940
1941 buf = buf_ptr;
1942 *n = compressed_len;
1943
1944 if (compressed_len >= old_compressed_len) {
1945 ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
1946
1947 type.clear_punch_hole();
1948 }
1949 }
1950
1951 return (block);
1952 }
1953
1954 /** Encrypt a page content when write it to disk.
1955 @param[in] type IO flags
1956 @param[out] buf buffer to read or write
1957 @param[in,out] n number of bytes to read/write, starting from
1958 offset
1959 @return pointer to the encrypted page */
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)1960 static file::Block *os_file_encrypt_page(const IORequest &type, void *&buf,
1961 ulint *n) {
1962 byte *encrypted_page;
1963 ulint encrypted_len = *n;
1964 byte *buf_ptr;
1965 Encryption encryption(type.encryption_algorithm());
1966
1967 ut_ad(type.is_write());
1968 ut_ad(type.is_encrypted());
1969
1970 auto block = os_alloc_block();
1971
1972 encrypted_page = static_cast<byte *>(ut_align(block->m_ptr, os_io_ptr_align));
1973
1974 buf_ptr = encryption.encrypt(type, reinterpret_cast<byte *>(buf), *n,
1975 encrypted_page, &encrypted_len);
1976
1977 bool encrypted = buf_ptr != buf;
1978
1979 if (encrypted) {
1980 buf = buf_ptr;
1981 *n = encrypted_len;
1982 }
1983
1984 return (block);
1985 }
1986
1987 /** Encrypt log blocks content when write it to disk.
1988 @param[in] type IO flags
1989 @param[in,out] buf buffer to read or write
1990 @param[in,out] scratch buffer for encrypting log
1991 @param[in,out] n number of bytes to read/write, starting from
1992 offset
1993 @return pointer to the encrypted log blocks */
os_file_encrypt_log(const IORequest & type,void * & buf,byte * & scratch,ulint * n)1994 static file::Block *os_file_encrypt_log(const IORequest &type, void *&buf,
1995 byte *&scratch, ulint *n) {
1996 byte *encrypted_log;
1997 ulint encrypted_len = *n;
1998 byte *buf_ptr;
1999 Encryption encryption(type.encryption_algorithm());
2000 file::Block *block{};
2001
2002 ut_ad(type.is_write() && type.is_encrypted() && type.is_log());
2003 ut_ad(*n % OS_FILE_LOG_BLOCK_SIZE == 0);
2004
2005 if (*n <= BUFFER_BLOCK_SIZE - os_io_ptr_align) {
2006 block = os_alloc_block();
2007 buf_ptr = block->m_ptr;
2008 scratch = nullptr;
2009 } else {
2010 buf_ptr = static_cast<byte *>(ut_malloc_nokey(*n + os_io_ptr_align));
2011 scratch = buf_ptr;
2012 }
2013
2014 encrypted_log = static_cast<byte *>(ut_align(buf_ptr, os_io_ptr_align));
2015
2016 encrypted_log = encryption.encrypt_log(type, reinterpret_cast<byte *>(buf),
2017 *n, encrypted_log, &encrypted_len);
2018
2019 bool encrypted = encrypted_log != buf;
2020
2021 if (encrypted) {
2022 buf = encrypted_log;
2023 *n = encrypted_len;
2024 }
2025
2026 return (block);
2027 }
2028
2029 #ifndef _WIN32
2030
2031 /** Do the read/write
2032 @param[in] request The IO context and type
2033 @return the number of bytes read/written or negative value on error */
execute(const IORequest & request)2034 ssize_t SyncFileIO::execute(const IORequest &request) {
2035 ssize_t n_bytes;
2036
2037 if (request.is_read()) {
2038 n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2039 } else {
2040 ut_ad(request.is_write());
2041 n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2042 }
2043
2044 return (n_bytes);
2045 }
2046
2047 /** Free storage space associated with a section of the file.
2048 @param[in] fh Open file handle
2049 @param[in] off Starting offset (SEEK_SET)
2050 @param[in] len Size of the hole
2051 @return DB_SUCCESS or error code */
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2052 static dberr_t os_file_punch_hole_posix(os_file_t fh, os_offset_t off,
2053 os_offset_t len) {
2054 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2055 const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2056
2057 int ret = fallocate(fh, mode, off, len);
2058
2059 if (ret == 0) {
2060 return (DB_SUCCESS);
2061 }
2062
2063 ut_a(ret == -1);
2064
2065 if (errno == ENOTSUP) {
2066 return (DB_IO_NO_PUNCH_HOLE);
2067 }
2068
2069 ib::warn(ER_IB_MSG_754) << "fallocate(" << fh
2070 << ", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2071 << off << ", " << len
2072 << ") returned errno: " << errno;
2073
2074 return (DB_IO_ERROR);
2075
2076 #elif defined(UNIV_SOLARIS)
2077
2078 // Use F_FREESP
2079
2080 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2081
2082 return (DB_IO_NO_PUNCH_HOLE);
2083 }
2084
2085 #if defined(LINUX_NATIVE_AIO)
2086
2087 /** Linux native AIO handler */
2088 class LinuxAIOHandler {
2089 public:
2090 /**
2091 @param[in] global_segment The global segment*/
LinuxAIOHandler(ulint global_segment)2092 LinuxAIOHandler(ulint global_segment) : m_global_segment(global_segment) {
2093 /* Should never be doing Sync IO here. */
2094 ut_a(m_global_segment != ULINT_UNDEFINED);
2095
2096 /* Find the array and the local segment. */
2097
2098 m_segment = AIO::get_array_and_local_segment(m_array, m_global_segment);
2099
2100 m_n_slots = m_array->slots_per_segment();
2101 }
2102
2103 /** Destructor */
~LinuxAIOHandler()2104 ~LinuxAIOHandler() {
2105 // No op
2106 }
2107
2108 /**
2109 Process a Linux AIO request
2110 @param[out] m1 the messages passed with the
2111 @param[out] m2 AIO request; note that in case the
2112 AIO operation failed, these output
2113 parameters are valid and can be used to
2114 restart the operation.
2115 @param[out] request IO context
2116 @return DB_SUCCESS or error code */
2117 dberr_t poll(fil_node_t **m1, void **m2, IORequest *request);
2118
2119 private:
2120 /** Resubmit an IO request that was only partially successful
2121 @param[in,out] slot Request to resubmit
2122 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2123 dberr_t resubmit(Slot *slot);
2124
2125 /** Check if the AIO succeeded
2126 @param[in,out] slot The slot to check
2127 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2128 DB_IO_ERROR on all other errors */
2129 dberr_t check_state(Slot *slot);
2130
2131 /** @return true if a shutdown was detected */
is_shutdown() const2132 bool is_shutdown() const {
2133 return (srv_shutdown_state.load() == SRV_SHUTDOWN_EXIT_THREADS &&
2134 !buf_flush_page_cleaner_is_active());
2135 }
2136
2137 /** If no slot was found then the m_array->m_mutex will be released.
2138 @param[out] n_pending The number of pending IOs
2139 @return NULL or a slot that has completed IO */
2140 Slot *find_completed_slot(ulint *n_pending);
2141
2142 /** This is called from within the IO-thread. If there are no completed
2143 IO requests in the slot array, the thread calls this function to
2144 collect more requests from the Linux kernel.
2145 The IO-thread waits on io_getevents(), which is a blocking call, with
2146 a timeout value. Unless the system is very heavy loaded, keeping the
2147 IO-thread very busy, the io-thread will spend most of its time waiting
2148 in this function.
2149 The IO-thread also exits in this function. It checks server status at
2150 each wakeup and that is why we use timed wait in io_getevents(). */
2151 void collect();
2152
2153 private:
2154 /** Slot array */
2155 AIO *m_array;
2156
2157 /** Number of slots inthe local segment */
2158 ulint m_n_slots;
2159
2160 /** The local segment to check */
2161 ulint m_segment;
2162
2163 /** The global segment */
2164 ulint m_global_segment;
2165 };
2166
2167 /** Resubmit an IO request that was only partially successful
2168 @param[in,out] slot Request to resubmit
2169 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
resubmit(Slot * slot)2170 dberr_t LinuxAIOHandler::resubmit(Slot *slot) {
2171 #ifdef UNIV_DEBUG
2172 /* Bytes already read/written out */
2173 ulint n_bytes = slot->ptr - slot->buf;
2174
2175 ut_ad(m_array->is_mutex_owned());
2176
2177 ut_ad(n_bytes < slot->original_len);
2178 ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2179 /* Partial read or write scenario */
2180 ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2181 #endif /* UNIV_DEBUG */
2182
2183 slot->len -= slot->n_bytes;
2184 slot->ptr += slot->n_bytes;
2185 slot->offset += slot->n_bytes;
2186
2187 /* Resetting the bytes read/written */
2188 slot->n_bytes = 0;
2189 slot->io_already_done = false;
2190
2191 /* make sure that slot->offset fits in off_t */
2192 ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2193 struct iocb *iocb = &slot->control;
2194 if (slot->type.is_read()) {
2195 io_prep_pread(iocb, slot->file.m_file, slot->ptr, slot->len, slot->offset);
2196
2197 } else {
2198 ut_a(slot->type.is_write());
2199
2200 io_prep_pwrite(iocb, slot->file.m_file, slot->ptr, slot->len, slot->offset);
2201 }
2202 iocb->data = slot;
2203
2204 /* Resubmit an I/O request */
2205 int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2206
2207 if (ret < -1) {
2208 errno = -ret;
2209 }
2210
2211 return (ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2212 }
2213
2214 /** Check if the AIO succeeded
2215 @param[in,out] slot The slot to check
2216 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2217 DB_IO_ERROR on all other errors */
check_state(Slot * slot)2218 dberr_t LinuxAIOHandler::check_state(Slot *slot) {
2219 ut_ad(m_array->is_mutex_owned());
2220
2221 /* Note that it may be that there is more then one completed
2222 IO requests. We process them one at a time. We may have a case
2223 here to improve the performance slightly by dealing with all
2224 requests in one sweep. */
2225
2226 srv_set_io_thread_op_info(m_global_segment,
2227 "processing completed aio requests");
2228
2229 ut_ad(slot->io_already_done);
2230
2231 dberr_t err;
2232
2233 if (slot->ret == 0) {
2234 err = AIOHandler::post_io_processing(slot);
2235
2236 } else {
2237 errno = -slot->ret;
2238
2239 /* os_file_handle_error does tell us if we should retry
2240 this IO. As it stands now, we don't do this retry when
2241 reaping requests from a different context than
2242 the dispatcher. This non-retry logic is the same for
2243 Windows and Linux native AIO.
2244 We should probably look into this to transparently
2245 re-submit the IO. */
2246 os_file_handle_error(slot->name, "Linux aio");
2247
2248 err = DB_IO_ERROR;
2249 }
2250
2251 return (err);
2252 }
2253
2254 /** If no slot was found then the m_array->m_mutex will be released.
2255 @param[out] n_pending The number of pending IOs
2256 @return NULL or a slot that has completed IO */
find_completed_slot(ulint * n_pending)2257 Slot *LinuxAIOHandler::find_completed_slot(ulint *n_pending) {
2258 ulint offset = m_n_slots * m_segment;
2259
2260 *n_pending = 0;
2261
2262 m_array->acquire();
2263
2264 Slot *slot = m_array->at(offset);
2265
2266 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2267 if (slot->is_reserved) {
2268 ++*n_pending;
2269
2270 if (slot->io_already_done) {
2271 /* Something for us to work on.
2272 Note: We don't release the mutex. */
2273 return (slot);
2274 }
2275 }
2276 }
2277
2278 m_array->release();
2279
2280 return (nullptr);
2281 }
2282
2283 /** This function is only used in Linux native asynchronous i/o. This is
2284 called from within the io-thread. If there are no completed IO requests
2285 in the slot array, the thread calls this function to collect more
2286 requests from the kernel.
2287 The io-thread waits on io_getevents(), which is a blocking call, with
2288 a timeout value. Unless the system is very heavy loaded, keeping the
2289 io-thread very busy, the io-thread will spend most of its time waiting
2290 in this function.
2291 The io-thread also exits in this function. It checks server status at
2292 each wakeup and that is why we use timed wait in io_getevents(). */
collect()2293 void LinuxAIOHandler::collect() {
2294 ut_ad(m_n_slots > 0);
2295 ut_ad(m_segment < m_array->get_n_segments());
2296
2297 /* Which io_context we are going to use. */
2298 io_context *io_ctx = m_array->io_ctx(m_segment);
2299
2300 /* Starting point of the m_segment we will be working on. */
2301 ulint start_pos = m_segment * m_n_slots;
2302
2303 /* End point. */
2304 ulint end_pos = start_pos + m_n_slots;
2305
2306 for (;;) {
2307 struct io_event *events;
2308
2309 /* Which part of event array we are going to work on. */
2310 events = m_array->io_events(m_segment * m_n_slots);
2311
2312 /* Initialize the events. */
2313 memset(events, 0, sizeof(*events) * m_n_slots);
2314
2315 /* The timeout value is arbitrary. We probably need
2316 to experiment with it a little. */
2317 struct timespec timeout;
2318
2319 timeout.tv_sec = 0;
2320 timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
2321
2322 auto ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
2323
2324 for (int i = 0; i < ret; ++i) {
2325 auto iocb = reinterpret_cast<struct iocb *>(events[i].obj);
2326 ut_a(iocb != nullptr);
2327
2328 auto slot = reinterpret_cast<Slot *>(iocb->data);
2329
2330 /* Some sanity checks. */
2331 ut_a(slot != nullptr);
2332 ut_a(slot->is_reserved);
2333
2334 /* We are not scribbling previous segment. */
2335 ut_a(slot->pos >= start_pos);
2336
2337 /* We have not overstepped to next segment. */
2338 ut_a(slot->pos < end_pos);
2339
2340 if (slot->offset > 0 && !slot->skip_punch_hole &&
2341 slot->type.is_compression_enabled() && !slot->type.is_log() &&
2342 slot->type.is_write() && slot->type.is_compressed() &&
2343 slot->type.punch_hole() && !slot->type.is_dblwr()) {
2344 slot->err = AIOHandler::io_complete(slot);
2345 } else {
2346 slot->err = DB_SUCCESS;
2347 }
2348
2349 /* Mark this request as completed. The error handling
2350 will be done in the calling function. */
2351 m_array->acquire();
2352
2353 /* events[i].res2 should always be ZERO */
2354 ut_ad(events[i].res2 == 0);
2355 slot->io_already_done = true;
2356
2357 /* Even though events[i].res is an unsigned number in libaio, it is
2358 used to return a negative value (negated errno value) to indicate
2359 error and a positive value to indicate number of bytes read or
2360 written. */
2361
2362 if (events[i].res > slot->len) {
2363 /* failure */
2364 slot->n_bytes = 0;
2365 slot->ret = events[i].res;
2366 } else {
2367 /* success */
2368 slot->n_bytes = events[i].res;
2369 slot->ret = 0;
2370 }
2371 m_array->release();
2372 }
2373
2374 if (srv_shutdown_state.load() == SRV_SHUTDOWN_EXIT_THREADS ||
2375 !buf_flush_page_cleaner_is_active() || ret > 0) {
2376 break;
2377 }
2378
2379 /* This error handling is for any error in collecting the
2380 IO requests. The errors, if any, for any particular IO
2381 request are simply passed on to the calling routine. */
2382
2383 switch (ret) {
2384 case -EAGAIN:
2385 /* Not enough resources! Try again. */
2386
2387 case -EINTR:
2388 /* Interrupted! The behaviour in case of an interrupt.
2389 If we have some completed IOs available then the
2390 return code will be the number of IOs. We get EINTR
2391 only if there are no completed IOs and we have been
2392 interrupted. */
2393
2394 case 0:
2395 /* No pending request! Go back and check again. */
2396
2397 continue;
2398 }
2399
2400 /* All other errors should cause a trap for now. */
2401 ib::fatal(ER_IB_MSG_755)
2402 << "Unexpected ret_code[" << ret << "] from io_getevents()!";
2403
2404 break;
2405 }
2406 }
2407
2408 /** Process a Linux AIO request
2409 @param[out] m1 the messages passed with the
2410 @param[out] m2 AIO request; note that in case the
2411 AIO operation failed, these output
2412 parameters are valid and can be used to
2413 restart the operation.
2414 @param[out] request IO context
2415 @return DB_SUCCESS or error code */
poll(fil_node_t ** m1,void ** m2,IORequest * request)2416 dberr_t LinuxAIOHandler::poll(fil_node_t **m1, void **m2, IORequest *request) {
2417 dberr_t err;
2418 Slot *slot;
2419
2420 /* Loop until we have found a completed request. */
2421 for (;;) {
2422 ulint n_pending;
2423
2424 slot = find_completed_slot(&n_pending);
2425
2426 if (slot != nullptr) {
2427 ut_ad(m_array->is_mutex_owned());
2428
2429 err = check_state(slot);
2430
2431 /* DB_FAIL is not a hard error, we should retry */
2432 if (err != DB_FAIL) {
2433 break;
2434 }
2435
2436 /* Partial IO, resubmit request for
2437 remaining bytes to read/write */
2438 err = resubmit(slot);
2439
2440 if (err != DB_SUCCESS) {
2441 break;
2442 }
2443
2444 m_array->release();
2445
2446 } else if (is_shutdown() && n_pending == 0) {
2447 /* There is no completed request. If there is
2448 no pending request at all, and the system is
2449 being shut down, exit. */
2450
2451 *m1 = nullptr;
2452 *m2 = nullptr;
2453
2454 return (DB_SUCCESS);
2455
2456 } else {
2457 /* Wait for some request. Note that we return
2458 from wait if we have found a request. */
2459
2460 srv_set_io_thread_op_info(m_global_segment,
2461 "waiting for completed aio requests");
2462
2463 collect();
2464 }
2465 }
2466
2467 if (err == DB_IO_PARTIAL_FAILED) {
2468 /* Aborting in case of submit failure */
2469 ib::fatal(ER_IB_MSG_756) << "Native Linux AIO interface. "
2470 "io_submit() call failed when "
2471 "resubmitting a partial I/O "
2472 "request on the file "
2473 << slot->name << ".";
2474 }
2475
2476 *m1 = slot->m1;
2477 *m2 = slot->m2;
2478
2479 *request = slot->type;
2480
2481 m_array->release(slot);
2482
2483 m_array->release();
2484
2485 return (err);
2486 }
2487
2488 /** This function is only used in Linux native asynchronous i/o.
2489 Waits for an aio operation to complete. This function is used to wait for
2490 the completed requests. The aio array of pending requests is divided
2491 into segments. The thread specifies which segment or slot it wants to wait
2492 for. NOTE: this function will also take care of freeing the aio slot,
2493 therefore no other thread is allowed to do the freeing!
2494
2495 @param[in] global_segment segment number in the aio array
2496 to wait for; segment 0 is the ibuf
2497 i/o thread, segment 1 is log i/o thread,
2498 then follow the non-ibuf read threads,
2499 and the last are the non-ibuf write
2500 threads.
2501 @param[out] m1 the messages passed with the
2502 @param[out] m2 AIO request; note that in case the
2503 AIO operation failed, these output
2504 parameters are valid and can be used to
2505 restart the operation.
2506 @param[out] request IO context
2507 @return DB_SUCCESS if the IO was successful */
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2508 static dberr_t os_aio_linux_handler(ulint global_segment, fil_node_t **m1,
2509 void **m2, IORequest *request) {
2510 LinuxAIOHandler handler(global_segment);
2511
2512 dberr_t err = handler.poll(m1, m2, request);
2513
2514 if (err == DB_IO_NO_PUNCH_HOLE) {
2515 if (!request->is_dblwr()) {
2516 fil_no_punch_hole(*m1);
2517 err = DB_SUCCESS;
2518 }
2519 }
2520
2521 return (err);
2522 }
2523
2524 /** Dispatch an AIO request to the kernel.
2525 @param[in,out] slot an already reserved slot
2526 @return true on success. */
linux_dispatch(Slot * slot)2527 bool AIO::linux_dispatch(Slot *slot) {
2528 ut_a(slot->is_reserved);
2529 ut_ad(slot->type.validate());
2530
2531 /* Find out what we are going to work with.
2532 The iocb struct is directly in the slot.
2533 The io_context is one per segment. */
2534
2535 ulint io_ctx_index;
2536 struct iocb *iocb = &slot->control;
2537
2538 io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2539
2540 int ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
2541
2542 /* io_submit() returns number of successfully queued requests
2543 or -errno. */
2544
2545 if (ret != 1) {
2546 errno = -ret;
2547 }
2548
2549 return (ret == 1);
2550 }
2551
2552 /** Creates an io_context for native linux AIO.
2553 @param[in] max_events number of events
2554 @param[out] io_ctx io_ctx to initialize.
2555 @return true on success. */
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)2556 bool AIO::linux_create_io_ctx(ulint max_events, io_context_t *io_ctx) {
2557 ssize_t n_retries = 0;
2558
2559 for (;;) {
2560 memset(io_ctx, 0x0, sizeof(*io_ctx));
2561
2562 /* Initialize the io_ctx. Tell it how many pending
2563 IO requests this context will handle. */
2564
2565 int ret = io_setup(max_events, io_ctx);
2566
2567 if (ret == 0) {
2568 /* Success. Return now. */
2569 return (true);
2570 }
2571
2572 /* If we hit EAGAIN we'll make a few attempts before failing. */
2573
2574 switch (ret) {
2575 case -EAGAIN:
2576 if (n_retries == 0) {
2577 /* First time around. */
2578 ib::warn(ER_IB_MSG_757) << "io_setup() failed with EAGAIN."
2579 " Will make "
2580 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2581 << " attempts before giving up.";
2582 }
2583
2584 if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2585 ++n_retries;
2586
2587 ib::warn(ER_IB_MSG_758) << "io_setup() attempt " << n_retries << ".";
2588
2589 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2590
2591 continue;
2592 }
2593
2594 /* Have tried enough. Better call it a day. */
2595 ib::error(ER_IB_MSG_759)
2596 << "io_setup() failed with EAGAIN after "
2597 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS << " attempts.";
2598 break;
2599
2600 case -ENOSYS:
2601 ib::error(ER_IB_MSG_760) << "Linux Native AIO interface"
2602 " is not supported on this platform. Please"
2603 " check your OS documentation and install"
2604 " appropriate binary of InnoDB.";
2605
2606 break;
2607
2608 default:
2609 ib::error(ER_IB_MSG_761) << "Linux Native AIO setup"
2610 << " returned following error[" << ret << "]";
2611 break;
2612 }
2613
2614 ib::info(ER_IB_MSG_762) << "You can disable Linux Native AIO by"
2615 " setting innodb_use_native_aio = 0 in my.cnf";
2616
2617 break;
2618 }
2619
2620 return (false);
2621 }
2622
2623 /** Checks if the system supports native linux aio. On some kernel
2624 versions where native aio is supported it won't work on tmpfs. In such
2625 cases we can't use native aio as it is not possible to mix simulated
2626 and native aio.
2627 @return: true if supported, false otherwise. */
is_linux_native_aio_supported()2628 bool AIO::is_linux_native_aio_supported() {
2629 int fd;
2630 io_context_t io_ctx;
2631 char name[1000];
2632
2633 if (!linux_create_io_ctx(1, &io_ctx)) {
2634 /* The platform does not support native aio. */
2635
2636 return (false);
2637
2638 } else if (!srv_read_only_mode) {
2639 /* Now check if tmpdir supports native aio ops. */
2640 fd = innobase_mysql_tmpfile(nullptr);
2641
2642 if (fd < 0) {
2643 ib::warn(ER_IB_MSG_763) << "Unable to create temp file to check"
2644 " native AIO support.";
2645
2646 return (false);
2647 }
2648 } else {
2649 ulint dirnamelen = strlen(srv_log_group_home_dir);
2650
2651 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2652
2653 memcpy(name, srv_log_group_home_dir, dirnamelen);
2654
2655 /* Add a path separator if needed. */
2656 if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2657 name[dirnamelen++] = OS_PATH_SEPARATOR;
2658 }
2659
2660 strcpy(name + dirnamelen, "ib_logfile0");
2661
2662 fd = ::open(name, O_RDONLY);
2663
2664 if (fd == -1) {
2665 ib::warn(ER_IB_MSG_764) << "Unable to open"
2666 << " \"" << name << "\" to check native"
2667 << " AIO read support.";
2668
2669 return (false);
2670 }
2671 }
2672
2673 struct io_event io_event;
2674
2675 memset(&io_event, 0x0, sizeof(io_event));
2676
2677 byte *buf = static_cast<byte *>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
2678 byte *ptr = static_cast<byte *>(ut_align(buf, UNIV_PAGE_SIZE));
2679
2680 struct iocb iocb;
2681
2682 /* Suppress valgrind warning. */
2683 memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
2684 memset(&iocb, 0x0, sizeof(iocb));
2685
2686 struct iocb *p_iocb = &iocb;
2687
2688 if (!srv_read_only_mode) {
2689 io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
2690
2691 } else {
2692 ut_a(UNIV_PAGE_SIZE >= 512);
2693 io_prep_pread(p_iocb, fd, ptr, 512, 0);
2694 }
2695
2696 int err = io_submit(io_ctx, 1, &p_iocb);
2697
2698 if (err >= 1) {
2699 /* Now collect the submitted IO request. */
2700 err = io_getevents(io_ctx, 1, 1, &io_event, nullptr);
2701 }
2702
2703 ut_free(buf);
2704 close(fd);
2705
2706 switch (err) {
2707 case 1:
2708 return (true);
2709
2710 case -EINVAL:
2711 case -ENOSYS:
2712 ib::error(ER_IB_MSG_765)
2713 << "Linux Native AIO not supported. You can either"
2714 " move "
2715 << (srv_read_only_mode ? name : "tmpdir")
2716 << " to a file system that supports native"
2717 " AIO or you can set innodb_use_native_aio to"
2718 " FALSE to avoid this message.";
2719
2720 /* fall through. */
2721 default:
2722 ib::error(ER_IB_MSG_766) << "Linux Native AIO check on "
2723 << (srv_read_only_mode ? name : "tmpdir")
2724 << "returned error[" << -err << "]";
2725 }
2726
2727 return (false);
2728 }
2729
2730 #endif /* LINUX_NATIVE_AIO */
2731
2732 /** Retrieves the last error number if an error occurs in a file io function.
2733 The number should be retrieved before any other OS calls (because they may
2734 overwrite the error number). If the number is not known to this program,
2735 the OS error number + 100 is returned.
2736 @param[in] report_all_errors true if we want an error message
2737 printed of all errors
2738 @param[in] on_error_silent true then don't print any diagnostic
2739 to the log
2740 @return error number, or OS error number + 100 */
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2741 static ulint os_file_get_last_error_low(bool report_all_errors,
2742 bool on_error_silent) {
2743 int err = errno;
2744
2745 if (err == 0) {
2746 return (0);
2747 }
2748
2749 if (report_all_errors ||
2750 (err != ENOSPC && err != EEXIST && !on_error_silent)) {
2751 ib::error(ER_IB_MSG_767)
2752 << "Operating system error number " << err << " in a file operation.";
2753
2754 if (err == ENOENT) {
2755 ib::error(ER_IB_MSG_768) << "The error means the system"
2756 " cannot find the path specified.";
2757
2758 #ifndef UNIV_HOTBACKUP
2759 if (srv_is_being_started) {
2760 ib::error(ER_IB_MSG_769) << "If you are installing InnoDB,"
2761 " remember that you must create"
2762 " directories yourself, InnoDB"
2763 " does not create them.";
2764 }
2765 #endif /* !UNIV_HOTBACKUP */
2766 } else if (err == EACCES) {
2767 ib::error(ER_IB_MSG_770) << "The error means mysqld does not have"
2768 " the access rights to the directory.";
2769
2770 } else {
2771 if (strerror(err) != nullptr) {
2772 ib::error(ER_IB_MSG_771)
2773 << "Error number " << err << " means '" << strerror(err) << "'";
2774 }
2775
2776 ib::info(ER_IB_MSG_772) << OPERATING_SYSTEM_ERROR_MSG;
2777 }
2778 }
2779
2780 switch (err) {
2781 case ENOSPC:
2782 return (OS_FILE_DISK_FULL);
2783 case ENOENT:
2784 return (OS_FILE_NOT_FOUND);
2785 case EEXIST:
2786 return (OS_FILE_ALREADY_EXISTS);
2787 case EXDEV:
2788 case ENOTDIR:
2789 case EISDIR:
2790 return (OS_FILE_PATH_ERROR);
2791 case EAGAIN:
2792 if (srv_use_native_aio) {
2793 return (OS_FILE_AIO_RESOURCES_RESERVED);
2794 }
2795 break;
2796 case EINTR:
2797 if (srv_use_native_aio) {
2798 return (OS_FILE_AIO_INTERRUPTED);
2799 }
2800 break;
2801 case EACCES:
2802 return (OS_FILE_ACCESS_VIOLATION);
2803 case ENAMETOOLONG:
2804 return (OS_FILE_NAME_TOO_LONG);
2805 }
2806 return (OS_FILE_ERROR_MAX + err);
2807 }
2808
2809 /** Wrapper to fsync(2) that retries the call on some errors.
2810 Returns the value 0 if successful; otherwise the value -1 is returned and
2811 the global variable errno is set to indicate the error.
2812 @param[in] file open file handle
2813 @return 0 if success, -1 otherwise */
os_file_fsync_posix(os_file_t file)2814 static int os_file_fsync_posix(os_file_t file) {
2815 ulint failures = 0;
2816 #ifdef UNIV_HOTBACKUP
2817 static meb::Mutex meb_mutex;
2818 #endif /* UNIV_HOTBACKUP */
2819
2820 for (;;) {
2821 #ifdef UNIV_HOTBACKUP
2822 meb_mutex.lock();
2823 #endif /* UNIV_HOTBACKUP */
2824 ++os_n_fsyncs;
2825 #ifdef UNIV_HOTBACKUP
2826 meb_mutex.unlock();
2827 #endif /* UNIV_HOTBACKUP */
2828
2829 int ret = fsync(file);
2830
2831 if (ret == 0) {
2832 return (ret);
2833 }
2834
2835 switch (errno) {
2836 case ENOLCK:
2837
2838 ++failures;
2839 ut_a(failures < 1000);
2840
2841 if (!(failures % 100)) {
2842 ib::warn(ER_IB_MSG_773) << "fsync(): "
2843 << "No locks available; retrying";
2844 }
2845
2846 /* 0.2 sec */
2847 os_thread_sleep(200000);
2848 break;
2849
2850 case EIO:
2851
2852 ib::fatal(ER_IB_MSG_1358) << "fsync() returned EIO, aborting.";
2853 break;
2854
2855 case EINTR:
2856
2857 ++failures;
2858 ut_a(failures < 2000);
2859 break;
2860
2861 default:
2862 ut_error;
2863 break;
2864 }
2865 }
2866
2867 ut_error;
2868
2869 return (-1);
2870 }
2871
2872 /** Check the existence and type of the given file.
2873 @param[in] path path name of file
2874 @param[out] exists true if the file exists
2875 @param[out] type Type of the file, if it exists
2876 @return true if call succeeded */
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)2877 static bool os_file_status_posix(const char *path, bool *exists,
2878 os_file_type_t *type) {
2879 struct stat statinfo;
2880
2881 int ret = stat(path, &statinfo);
2882
2883 if (exists != nullptr) {
2884 *exists = !ret;
2885 }
2886
2887 if (ret == 0) {
2888 /* file exists, everything OK */
2889
2890 } else if (errno == ENOENT || errno == ENOTDIR) {
2891 if (exists != nullptr) {
2892 *exists = false;
2893 }
2894
2895 /* file does not exist */
2896 *type = OS_FILE_TYPE_MISSING;
2897 return (true);
2898
2899 } else if (errno == ENAMETOOLONG) {
2900 *type = OS_FILE_TYPE_NAME_TOO_LONG;
2901 return (false);
2902 } else if (errno == EACCES) {
2903 *type = OS_FILE_PERMISSION_ERROR;
2904 return (false);
2905 } else {
2906 *type = OS_FILE_TYPE_FAILED;
2907
2908 /* The stat() call failed with some other error. */
2909 os_file_handle_error_no_exit(path, "file_status_posix_stat", false);
2910 return (false);
2911 }
2912
2913 if (exists != nullptr) {
2914 *exists = true;
2915 }
2916
2917 if (S_ISDIR(statinfo.st_mode)) {
2918 *type = OS_FILE_TYPE_DIR;
2919
2920 } else if (S_ISLNK(statinfo.st_mode)) {
2921 *type = OS_FILE_TYPE_LINK;
2922
2923 } else if (S_ISREG(statinfo.st_mode)) {
2924 *type = OS_FILE_TYPE_FILE;
2925
2926 } else {
2927 *type = OS_FILE_TYPE_UNKNOWN;
2928 }
2929
2930 return (true);
2931 }
2932
2933 /** Check the existence and usefulness of a given path.
2934 @param[in] path path name
2935 @retval true if the path exists and can be used
2936 @retval false if the path does not exist or if the path is
2937 unuseable to get to a possibly existing file or directory. */
os_file_exists_posix(const char * path)2938 static bool os_file_exists_posix(const char *path) {
2939 struct stat statinfo;
2940
2941 int ret = stat(path, &statinfo);
2942
2943 if (ret == 0) {
2944 return (true);
2945 }
2946
2947 if (!(errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG ||
2948 errno == EACCES)) {
2949 os_file_handle_error_no_exit(path, "file_exists_posix_stat", false);
2950 }
2951
2952 return (false);
2953 }
2954
2955 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
2956 function!
2957 Flushes the write buffers of a given file to the disk.
2958 @param[in] file handle to a file
2959 @return true if success */
os_file_flush_func(os_file_t file)2960 bool os_file_flush_func(os_file_t file) {
2961 int ret;
2962
2963 ret = os_file_fsync_posix(file);
2964
2965 if (ret == 0) {
2966 return (true);
2967 }
2968
2969 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2970 we choose to ignore that error if we are using raw disks */
2971
2972 if (srv_start_raw_disk_in_use && errno == EINVAL) {
2973 return (true);
2974 }
2975
2976 ib::error(ER_IB_MSG_775) << "The OS said file flush did not succeed";
2977
2978 os_file_handle_error(nullptr, "flush");
2979
2980 /* It is a fatal error if a file flush does not succeed, because then
2981 the database can get corrupt on disk */
2982 ut_error;
2983
2984 return (false);
2985 }
2986
2987 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
2988 this function!
2989 A simple function to open or create a file.
2990 @param[in] name name of the file or path as a null-terminated
2991 string
2992 @param[in] create_mode create mode
2993 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
2994 @param[in] read_only if true, read only checks are enforced
2995 @param[out] success true if succeed, false if error
2996 @return handle to the file, not defined if error, error number
2997 can be retrieved with os_file_get_last_error */
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)2998 os_file_t os_file_create_simple_func(const char *name, ulint create_mode,
2999 ulint access_type, bool read_only,
3000 bool *success) {
3001 os_file_t file;
3002
3003 *success = false;
3004
3005 int create_flag;
3006
3007 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3008 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3009
3010 if (create_mode == OS_FILE_OPEN) {
3011 if (access_type == OS_FILE_READ_ONLY) {
3012 create_flag = O_RDONLY;
3013
3014 } else if (read_only) {
3015 create_flag = O_RDONLY;
3016
3017 } else {
3018 create_flag = O_RDWR;
3019 }
3020
3021 } else if (read_only) {
3022 create_flag = O_RDONLY;
3023
3024 } else if (create_mode == OS_FILE_CREATE) {
3025 create_flag = O_RDWR | O_CREAT | O_EXCL;
3026
3027 } else if (create_mode == OS_FILE_CREATE_PATH) {
3028 /* Create subdirs along the path if needed. */
3029 dberr_t err;
3030
3031 err = os_file_create_subdirs_if_needed(name);
3032
3033 if (err != DB_SUCCESS) {
3034 *success = false;
3035 ib::error(ER_IB_MSG_776)
3036 << "Unable to create subdirectories '" << name << "'";
3037
3038 return (OS_FILE_CLOSED);
3039 }
3040
3041 create_flag = O_RDWR | O_CREAT | O_EXCL;
3042 create_mode = OS_FILE_CREATE;
3043 } else {
3044 ib::error(ER_IB_MSG_777) << "Unknown file create mode (" << create_mode
3045 << " for file '" << name << "'";
3046
3047 return (OS_FILE_CLOSED);
3048 }
3049
3050 bool retry;
3051
3052 do {
3053 file = ::open(name, create_flag, os_innodb_umask);
3054
3055 if (file == -1) {
3056 *success = false;
3057
3058 retry = os_file_handle_error(
3059 name, create_mode == OS_FILE_OPEN ? "open" : "create");
3060 } else {
3061 *success = true;
3062 retry = false;
3063 }
3064
3065 } while (retry);
3066
3067 #ifdef USE_FILE_LOCK
3068 if (!read_only && *success && access_type == OS_FILE_READ_WRITE &&
3069 os_file_lock(file, name)) {
3070 *success = false;
3071 close(file);
3072 file = -1;
3073 }
3074 #endif /* USE_FILE_LOCK */
3075
3076 return (file);
3077 }
3078
3079 /** This function attempts to create a directory named pathname. The new
3080 directory gets default permissions. On Unix the permissions are
3081 (0770 & ~umask). If the directory exists already, nothing is done and
3082 the call succeeds, unless the fail_if_exists arguments is true.
3083 If another error occurs, such as a permission error, this does not crash,
3084 but reports the error and returns false.
3085 @param[in] pathname directory name as null-terminated string
3086 @param[in] fail_if_exists if true, pre-existing directory is treated as
3087 an error.
3088 @return true if call succeeds, false on error */
os_file_create_directory(const char * pathname,bool fail_if_exists)3089 bool os_file_create_directory(const char *pathname, bool fail_if_exists) {
3090 int rcode = mkdir(pathname, 0770);
3091
3092 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3093 /* failure */
3094 os_file_handle_error_no_exit(pathname, "mkdir", false);
3095
3096 return (false);
3097 }
3098
3099 return (true);
3100 }
3101
3102 /** This function scans the contents of a directory and invokes the callback
3103 for each entry.
3104 @param[in] path directory name as null-terminated string
3105 @param[in] scan_cbk use callback to be called for each entry
3106 @param[in] is_drop attempt to drop the directory after scan
3107 @return true if call succeeds, false on error */
os_file_scan_directory(const char * path,os_dir_cbk_t scan_cbk,bool is_drop)3108 bool os_file_scan_directory(const char *path, os_dir_cbk_t scan_cbk,
3109 bool is_drop) {
3110 DIR *directory;
3111 dirent *entry;
3112
3113 directory = opendir(path);
3114
3115 if (directory == nullptr) {
3116 os_file_handle_error_no_exit(path, "opendir", false);
3117 return (false);
3118 }
3119
3120 entry = readdir(directory);
3121
3122 while (entry != nullptr) {
3123 scan_cbk(path, entry->d_name);
3124 entry = readdir(directory);
3125 }
3126
3127 closedir(directory);
3128
3129 if (is_drop) {
3130 int err;
3131 err = rmdir(path);
3132
3133 if (err != 0) {
3134 os_file_handle_error_no_exit(path, "rmdir", false);
3135 return (false);
3136 }
3137 }
3138
3139 return (true);
3140 }
3141
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)3142 pfs_os_file_t os_file_create_func(const char *name, ulint create_mode,
3143 ulint purpose, ulint type, bool read_only,
3144 bool *success) {
3145 bool on_error_no_exit;
3146 bool on_error_silent;
3147 pfs_os_file_t file;
3148
3149 *success = false;
3150
3151 DBUG_EXECUTE_IF("ib_create_table_fail_disk_full", *success = false;
3152 errno = ENOSPC; file.m_file = OS_FILE_CLOSED; return (file););
3153
3154 int create_flag;
3155 const char *mode_str = nullptr;
3156
3157 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? true : false;
3158 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT ? true : false;
3159
3160 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
3161 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
3162
3163 if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW ||
3164 create_mode == OS_FILE_OPEN_RETRY) {
3165 mode_str = "OPEN";
3166
3167 create_flag = read_only ? O_RDONLY : O_RDWR;
3168
3169 } else if (read_only) {
3170 mode_str = "OPEN";
3171
3172 create_flag = O_RDONLY;
3173
3174 } else if (create_mode == OS_FILE_CREATE) {
3175 mode_str = "CREATE";
3176 create_flag = O_RDWR | O_CREAT | O_EXCL;
3177
3178 } else if (create_mode == OS_FILE_CREATE_PATH) {
3179 /* Create subdirs along the path if needed. */
3180 dberr_t err;
3181
3182 err = os_file_create_subdirs_if_needed(name);
3183
3184 if (err != DB_SUCCESS) {
3185 *success = false;
3186 ib::error(ER_IB_MSG_778)
3187 << "Unable to create subdirectories '" << name << "'";
3188
3189 file.m_file = OS_FILE_CLOSED;
3190 return (file);
3191 }
3192
3193 create_flag = O_RDWR | O_CREAT | O_EXCL;
3194 create_mode = OS_FILE_CREATE;
3195
3196 } else {
3197 ib::error(ER_IB_MSG_779)
3198 << "Unknown file create mode (" << create_mode << ")"
3199 << " for file '" << name << "'";
3200
3201 file.m_file = OS_FILE_CLOSED;
3202 return (file);
3203 }
3204
3205 ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE || type == OS_DBLWR_FILE ||
3206 type == OS_CLONE_DATA_FILE || type == OS_CLONE_LOG_FILE ||
3207 type == OS_BUFFERED_FILE || type == OS_REDO_LOG_ARCHIVE_FILE);
3208
3209 ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
3210
3211 #ifdef O_SYNC
3212 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
3213 O_SYNC because the datasync options seemed to corrupt files in 2001
3214 in both Linux and Solaris */
3215
3216 if (!read_only && type == OS_LOG_FILE &&
3217 srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
3218 create_flag |= O_SYNC;
3219 }
3220 #endif /* O_SYNC */
3221
3222 bool retry;
3223
3224 do {
3225 file.m_file = ::open(name, create_flag, os_innodb_umask);
3226
3227 if (file.m_file == -1) {
3228 const char *operation;
3229
3230 operation =
3231 (create_mode == OS_FILE_CREATE && !read_only) ? "create" : "open";
3232
3233 *success = false;
3234
3235 if (on_error_no_exit) {
3236 retry = os_file_handle_error_no_exit(name, operation, on_error_silent);
3237 } else {
3238 retry = os_file_handle_error(name, operation);
3239 }
3240 } else {
3241 *success = true;
3242 retry = false;
3243 }
3244
3245 } while (retry);
3246
3247 /* We disable OS caching (O_DIRECT) only on data files. For clone we
3248 need to set O_DIRECT even for read_only mode. */
3249
3250 if ((!read_only || type == OS_CLONE_DATA_FILE) && *success &&
3251 (type == OS_DATA_FILE || type == OS_CLONE_DATA_FILE ||
3252 type == OS_DBLWR_FILE) &&
3253 (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT ||
3254 srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
3255 os_file_set_nocache(file.m_file, name, mode_str);
3256 }
3257
3258 #ifdef USE_FILE_LOCK
3259 if (!read_only && *success && create_mode != OS_FILE_OPEN_RAW &&
3260 /* Don't acquire file lock while cloning files. */
3261 type != OS_CLONE_DATA_FILE && type != OS_CLONE_LOG_FILE &&
3262 os_file_lock(file.m_file, name)) {
3263 if (create_mode == OS_FILE_OPEN_RETRY) {
3264 ib::info(ER_IB_MSG_780) << "Retrying to lock the first data file";
3265
3266 for (int i = 0; i < 100; i++) {
3267 os_thread_sleep(1000000);
3268
3269 if (!os_file_lock(file.m_file, name)) {
3270 *success = true;
3271 return (file);
3272 }
3273 }
3274
3275 ib::info(ER_IB_MSG_781) << "Unable to open the first data file";
3276 }
3277
3278 *success = false;
3279 close(file.m_file);
3280 file.m_file = -1;
3281 }
3282 #endif /* USE_FILE_LOCK */
3283
3284 return (file);
3285 }
3286
3287 /** NOTE! Use the corresponding macro
3288 os_file_create_simple_no_error_handling(), not directly this function!
3289 A simple function to open or create a file.
3290 @param[in] name name of the file or path as a null-terminated
3291 string
3292 @param[in] create_mode create mode
3293 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3294 OS_FILE_READ_ALLOW_DELETE; the last option
3295 is used by a backup program reading the file
3296 @param[in] read_only if true read only mode checks are enforced
3297 @param[out] success true if succeeded
3298 @return own: handle to the file, not defined if error, error number
3299 can be retrieved with os_file_get_last_error */
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3300 pfs_os_file_t os_file_create_simple_no_error_handling_func(const char *name,
3301 ulint create_mode,
3302 ulint access_type,
3303 bool read_only,
3304 bool *success) {
3305 pfs_os_file_t file;
3306 int create_flag;
3307
3308 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3309 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3310
3311 *success = false;
3312
3313 if (create_mode == OS_FILE_OPEN) {
3314 if (access_type == OS_FILE_READ_ONLY) {
3315 create_flag = O_RDONLY;
3316
3317 } else if (read_only) {
3318 create_flag = O_RDONLY;
3319
3320 } else {
3321 ut_a(access_type == OS_FILE_READ_WRITE ||
3322 access_type == OS_FILE_READ_ALLOW_DELETE);
3323
3324 create_flag = O_RDWR;
3325 }
3326
3327 } else if (read_only) {
3328 create_flag = O_RDONLY;
3329
3330 } else if (create_mode == OS_FILE_CREATE) {
3331 create_flag = O_RDWR | O_CREAT | O_EXCL;
3332
3333 } else {
3334 ib::error(ER_IB_MSG_782) << "Unknown file create mode " << create_mode
3335 << " for file '" << name << "'";
3336 file.m_file = OS_FILE_CLOSED;
3337 return (file);
3338 }
3339
3340 file.m_file = ::open(name, create_flag, os_innodb_umask);
3341
3342 *success = (file.m_file != -1);
3343
3344 #ifdef USE_FILE_LOCK
3345 if (!read_only && *success && access_type == OS_FILE_READ_WRITE &&
3346 os_file_lock(file.m_file, name)) {
3347 *success = false;
3348 close(file.m_file);
3349 file.m_file = -1;
3350 }
3351 #endif /* USE_FILE_LOCK */
3352
3353 return (file);
3354 }
3355
3356 /** Deletes a file if it exists. The file has to be closed before calling this.
3357 @param[in] name file path as a null-terminated string
3358 @param[out] exist indicate if file pre-exist
3359 @return true if success */
os_file_delete_if_exists_func(const char * name,bool * exist)3360 bool os_file_delete_if_exists_func(const char *name, bool *exist) {
3361 if (!os_file_can_delete(name)) {
3362 return (false);
3363 }
3364
3365 if (exist != nullptr) {
3366 *exist = true;
3367 }
3368
3369 int ret = unlink(name);
3370
3371 if (ret != 0 && errno == ENOENT) {
3372 if (exist != nullptr) {
3373 *exist = false;
3374 }
3375
3376 } else if (ret != 0 && errno != ENOENT) {
3377 os_file_handle_error_no_exit(name, "delete", false);
3378
3379 return (false);
3380 }
3381
3382 return (true);
3383 }
3384
3385 /** Deletes a file. The file has to be closed before calling this.
3386 @param[in] name file path as a null-terminated string
3387 @return true if success */
os_file_delete_func(const char * name)3388 bool os_file_delete_func(const char *name) {
3389 int ret = unlink(name);
3390
3391 if (ret != 0) {
3392 os_file_handle_error_no_exit(name, "delete", false);
3393
3394 return (false);
3395 }
3396
3397 return (true);
3398 }
3399
3400 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3401 function!
3402 Renames a file (can also move it to another directory). It is safest that the
3403 file is closed before calling this function.
3404 @param[in] oldpath old file path as a null-terminated string
3405 @param[in] newpath new file path
3406 @return true if success */
os_file_rename_func(const char * oldpath,const char * newpath)3407 bool os_file_rename_func(const char *oldpath, const char *newpath) {
3408 #ifdef UNIV_DEBUG
3409 /* New path must be valid but not exist. */
3410 os_file_type_t type;
3411 bool exists;
3412 ut_ad(os_file_status(newpath, &exists, &type));
3413 ut_ad(!exists);
3414
3415 /* Old path must exist. */
3416 ut_ad(os_file_exists(oldpath));
3417 #endif /* UNIV_DEBUG */
3418
3419 int ret = rename(oldpath, newpath);
3420
3421 if (ret != 0) {
3422 os_file_handle_error_no_exit(oldpath, "rename", false);
3423
3424 return (false);
3425 }
3426
3427 return (true);
3428 }
3429
3430 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3431 function!
3432 Closes a file handle. In case of error, error number can be retrieved with
3433 os_file_get_last_error.
3434 @param[in] file Handle to close
3435 @return true if success */
os_file_close_func(os_file_t file)3436 bool os_file_close_func(os_file_t file) {
3437 int ret = close(file);
3438
3439 if (ret == -1) {
3440 os_file_handle_error(nullptr, "close");
3441
3442 return (false);
3443 }
3444
3445 return (true);
3446 }
3447
3448 /** Gets a file size.
3449 @param[in] file handle to an open file
3450 @return file size, or (os_offset_t) -1 on failure */
os_file_get_size(pfs_os_file_t file)3451 os_offset_t os_file_get_size(pfs_os_file_t file) {
3452 /* Store current position */
3453 os_offset_t pos = lseek(file.m_file, 0, SEEK_CUR);
3454 os_offset_t file_size = lseek(file.m_file, 0, SEEK_END);
3455 /* Restore current position as the function should not change it */
3456 lseek(file.m_file, pos, SEEK_SET);
3457 return (file_size);
3458 }
3459
3460 /** Gets a file size.
3461 @param[in] filename Full path to the filename to check
3462 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3463 errno */
os_file_get_size(const char * filename)3464 os_file_size_t os_file_get_size(const char *filename) {
3465 struct stat s;
3466 os_file_size_t file_size;
3467
3468 int ret = stat(filename, &s);
3469
3470 if (ret == 0) {
3471 file_size.m_total_size = s.st_size;
3472 /* st_blocks is in 512 byte sized blocks */
3473 file_size.m_alloc_size = s.st_blocks * 512;
3474 } else {
3475 file_size.m_total_size = ~0;
3476 file_size.m_alloc_size = (os_offset_t)errno;
3477 }
3478
3479 return (file_size);
3480 }
3481
3482 /** Get available free space on disk
3483 @param[in] path pathname of a directory or file in disk
3484 @param[out] free_space free space available in bytes
3485 @return DB_SUCCESS if all OK */
os_get_free_space_posix(const char * path,uint64_t & free_space)3486 static dberr_t os_get_free_space_posix(const char *path, uint64_t &free_space) {
3487 struct statvfs stat;
3488 auto ret = statvfs(path, &stat);
3489
3490 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3491 /* file or directory does not exist */
3492 return (DB_NOT_FOUND);
3493
3494 } else if (ret) {
3495 /* file exists, but stat call failed */
3496 os_file_handle_error_no_exit(path, "statvfs", false);
3497 return (DB_FAIL);
3498 }
3499
3500 free_space = stat.f_bsize;
3501 free_space *= stat.f_bavail;
3502 return (DB_SUCCESS);
3503 }
3504
3505 /** This function returns information about the specified file
3506 @param[in] path pathname of the file
3507 @param[out] stat_info information of a file in a directory
3508 @param[in,out] statinfo information of a file in a directory
3509 @param[in] check_rw_perm for testing whether the file can be opened
3510 in RW mode
3511 @param[in] read_only if true read only mode checks are enforced
3512 @return DB_SUCCESS if all OK */
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3513 static dberr_t os_file_get_status_posix(const char *path,
3514 os_file_stat_t *stat_info,
3515 struct stat *statinfo,
3516 bool check_rw_perm, bool read_only) {
3517 int ret = stat(path, statinfo);
3518
3519 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3520 /* file does not exist */
3521
3522 return (DB_NOT_FOUND);
3523
3524 } else if (ret) {
3525 /* file exists, but stat call failed */
3526
3527 os_file_handle_error_no_exit(path, "stat", false);
3528
3529 return (DB_FAIL);
3530 }
3531
3532 switch (statinfo->st_mode & S_IFMT) {
3533 case S_IFDIR:
3534 stat_info->type = OS_FILE_TYPE_DIR;
3535 break;
3536 case S_IFLNK:
3537 stat_info->type = OS_FILE_TYPE_LINK;
3538 break;
3539 case S_IFBLK:
3540 /* Handle block device as regular file. */
3541 case S_IFCHR:
3542 /* Handle character device as regular file. */
3543 case S_IFREG:
3544 stat_info->type = OS_FILE_TYPE_FILE;
3545 break;
3546 default:
3547 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3548 }
3549
3550 stat_info->size = statinfo->st_size;
3551 stat_info->block_size = statinfo->st_blksize;
3552 stat_info->alloc_size = statinfo->st_blocks * 512;
3553
3554 if (check_rw_perm && (stat_info->type == OS_FILE_TYPE_FILE ||
3555 stat_info->type == OS_FILE_TYPE_BLOCK)) {
3556 int access = !read_only ? O_RDWR : O_RDONLY;
3557 int fh = ::open(path, access, os_innodb_umask);
3558
3559 if (fh == -1) {
3560 stat_info->rw_perm = false;
3561 } else {
3562 stat_info->rw_perm = true;
3563 close(fh);
3564 }
3565 }
3566
3567 return (DB_SUCCESS);
3568 }
3569
3570 /** Truncates a file to a specified size in bytes.
3571 Do nothing if the size to preserve is greater or equal to the current
3572 size of the file.
3573 @param[in] pathname file path
3574 @param[in] file file to be truncated
3575 @param[in] size size to preserve in bytes
3576 @return true if success */
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)3577 static bool os_file_truncate_posix(const char *pathname, pfs_os_file_t file,
3578 os_offset_t size) {
3579 int res = ftruncate(file.m_file, size);
3580 if (res == -1) {
3581 bool retry;
3582
3583 retry = os_file_handle_error_no_exit(pathname, "truncate", false);
3584
3585 if (retry) {
3586 ib::warn(ER_IB_MSG_783) << "Truncate failed for '" << pathname << "'";
3587 }
3588 }
3589
3590 return (res == 0);
3591 }
3592
/** Truncates a file at its current position, i.e. sets the file length to
the offset ftell() reports for the stream.
@param[in]  file  file to be truncated
@return true if success */
bool os_file_set_eof(FILE *file) {
  const int fd = fileno(file);
  const auto cur_pos = ftell(file);

  return (ftruncate(fd, cur_pos) == 0);
}
3599
3600 #ifdef UNIV_HOTBACKUP
3601 /** Closes a file handle.
3602 @param[in] file Handle to a file
3603 @return true if success */
os_file_close_no_error_handling(os_file_t file)3604 bool os_file_close_no_error_handling(os_file_t file) {
3605 return (close(file) != -1);
3606 }
3607 #endif /* UNIV_HOTBACKUP */
3608
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
This non-Windows variant has an empty body. */
void os_aio_simulated_put_read_threads_to_sleep() {
  /* No op on non Windows */
  return;
}
3615
3616 /** Depth first traversal of the directory starting from basedir
3617 @param[in] basedir Start scanning from this directory
3618 @param[in] recursive `true` if scan should be recursive
3619 @param[in] f Function to call for each entry */
walk_posix(const Path & basedir,bool recursive,Function && f)3620 void Dir_Walker::walk_posix(const Path &basedir, bool recursive, Function &&f) {
3621 using Stack = std::stack<Entry>;
3622
3623 Stack directories;
3624
3625 directories.push(Entry(basedir, 0));
3626
3627 while (!directories.empty()) {
3628 Entry current = directories.top();
3629
3630 directories.pop();
3631
3632 /* Ignore hidden directories and files. */
3633 if (Fil_path::is_hidden(current.m_path)) {
3634 ib::info(ER_IB_MSG_SKIP_HIDDEN_DIR, current.m_path.c_str());
3635 continue;
3636 }
3637
3638 DIR *parent = opendir(current.m_path.c_str());
3639
3640 if (parent == nullptr) {
3641 ib::info(ER_IB_MSG_784) << "Failed to walk directory"
3642 << " '" << current.m_path << "'";
3643
3644 continue;
3645 }
3646
3647 if (!is_directory(current.m_path)) {
3648 f(current.m_path, current.m_depth);
3649 }
3650
3651 struct dirent *dirent = nullptr;
3652
3653 for (;;) {
3654 dirent = readdir(parent);
3655
3656 if (dirent == nullptr) {
3657 break;
3658 }
3659
3660 if (strcmp(dirent->d_name, ".") == 0 ||
3661 strcmp(dirent->d_name, "..") == 0) {
3662 continue;
3663 }
3664
3665 Path path(current.m_path);
3666
3667 if (path.back() != '/' && path.back() != '\\') {
3668 path += OS_PATH_SEPARATOR;
3669 }
3670
3671 path.append(dirent->d_name);
3672
3673 /* Ignore hidden subdirectories and files. */
3674 if (Fil_path::is_hidden(path)) {
3675 ib::info(ER_IB_MSG_SKIP_HIDDEN_DIR, path.c_str());
3676 continue;
3677 }
3678
3679 if (is_directory(path) && recursive) {
3680 directories.push(Entry(path, current.m_depth + 1));
3681 } else {
3682 f(path, current.m_depth + 1);
3683 }
3684 }
3685
3686 closedir(parent);
3687 }
3688 }
3689
3690 #else /* !_WIN32 */
3691
3692 #include <WinIoCtl.h>
3693
3694 /** Do the read/write
3695 @param[in] request The IO context and type
3696 @return the number of bytes read/written or negative value on error */
execute(const IORequest & request)3697 ssize_t SyncFileIO::execute(const IORequest &request) {
3698 OVERLAPPED seek;
3699
3700 memset(&seek, 0x0, sizeof(seek));
3701
3702 seek.Offset = (DWORD)m_offset & 0xFFFFFFFF;
3703 seek.OffsetHigh = (DWORD)(m_offset >> 32);
3704
3705 BOOL ret;
3706 DWORD n_bytes;
3707
3708 if (request.is_read()) {
3709 ret = ReadFile(m_fh, m_buf, static_cast<DWORD>(m_n), &n_bytes, &seek);
3710
3711 } else {
3712 ut_ad(request.is_write());
3713 ret = WriteFile(m_fh, m_buf, static_cast<DWORD>(m_n), &n_bytes, &seek);
3714 }
3715
3716 /* Sync IO can't be done on a file opened in AIO mode. */
3717 // ut_a(GetLastError() != ERROR_IO_PENDING);
3718
3719 return (ret ? static_cast<ssize_t>(n_bytes) : -1);
3720 }
3721
3722 /** Do the read/write
3723 @param[in,out] slot The IO slot, it has the IO context
3724 @return the number of bytes read/written or negative value on error */
execute(Slot * slot)3725 ssize_t SyncFileIO::execute(Slot *slot) {
3726 BOOL ret;
3727
3728 if (slot->type.is_read()) {
3729 ret = ReadFile(slot->file.m_file, slot->ptr, slot->len, &slot->n_bytes,
3730 &slot->control);
3731 } else {
3732 ut_ad(slot->type.is_write());
3733 ret = WriteFile(slot->file.m_file, slot->ptr, slot->len, &slot->n_bytes,
3734 &slot->control);
3735 }
3736
3737 /* Sync IO can't be done on a file opened in AIO mode. */
3738 // ut_a(GetLastError() != ERROR_IO_PENDING);
3739
3740 return (ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
3741 }
3742
3743 /** Check if the file system supports sparse files.
3744 @param[in] name File name
3745 @return true if the file system supports sparse files */
os_is_sparse_file_supported_win32(const char * filename)3746 static bool os_is_sparse_file_supported_win32(const char *filename) {
3747 char volname[MAX_PATH];
3748 BOOL result = GetVolumePathName(filename, volname, MAX_PATH);
3749
3750 if (!result) {
3751 ib::error(ER_IB_MSG_785)
3752 << "os_is_sparse_file_supported: "
3753 << "Failed to get the volume path name for: " << filename
3754 << "- OS error number " << GetLastError();
3755
3756 return (false);
3757 }
3758
3759 DWORD flags;
3760
3761 GetVolumeInformation(volname, NULL, MAX_PATH, NULL, NULL, &flags, NULL,
3762 MAX_PATH);
3763
3764 return (flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
3765 }
3766
3767 /** Free storage space associated with a section of the file.
3768 @param[in] fh Open file handle
3769 @param[in] page_size Tablespace page size
3770 @param[in] block_size File system block size
3771 @param[in] off Starting offset (SEEK_SET)
3772 @param[in] len Size of the hole
3773 @return 0 on success or errno */
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)3774 static dberr_t os_file_punch_hole_win32(os_file_t fh, os_offset_t off,
3775 os_offset_t len) {
3776 FILE_ZERO_DATA_INFORMATION punch;
3777
3778 punch.FileOffset.QuadPart = off;
3779 punch.BeyondFinalZero.QuadPart = off + len;
3780
3781 /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
3782 therefore we pass a dummy parameter. */
3783 DWORD temp;
3784
3785 BOOL result = DeviceIoControl(fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
3786 NULL, 0, &temp, NULL);
3787
3788 return (!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
3789 }
3790
3791 /** Check the existence and type of a given path.
3792 @param[in] path pathname of the file
3793 @param[out] exists true if file exists
3794 @param[out] type type of the file (if it exists)
3795 @return true if call succeeded */
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)3796 static bool os_file_status_win32(const char *path, bool *exists,
3797 os_file_type_t *type) {
3798 struct _stat64 statinfo;
3799
3800 int ret = _stat64(path, &statinfo);
3801
3802 if (exists != nullptr) {
3803 *exists = !ret;
3804 }
3805
3806 if (ret == 0) {
3807 /* file exists, everything OK */
3808
3809 } else if (errno == ENOENT || errno == ENOTDIR) {
3810 *type = OS_FILE_TYPE_MISSING;
3811
3812 /* file does not exist */
3813
3814 if (exists != nullptr) {
3815 *exists = false;
3816 }
3817
3818 return (true);
3819
3820 } else if (errno == EACCES) {
3821 *type = OS_FILE_PERMISSION_ERROR;
3822 return (false);
3823
3824 } else {
3825 *type = OS_FILE_TYPE_FAILED;
3826
3827 /* The _stat64() call failed with some other error */
3828 os_file_handle_error_no_exit(path, "file_status_win_stat64", false);
3829 return (false);
3830 }
3831
3832 if (exists != nullptr) {
3833 *exists = true;
3834 }
3835
3836 if (_S_IFDIR & statinfo.st_mode) {
3837 *type = OS_FILE_TYPE_DIR;
3838
3839 } else if (_S_IFREG & statinfo.st_mode) {
3840 *type = OS_FILE_TYPE_FILE;
3841
3842 } else {
3843 *type = OS_FILE_TYPE_UNKNOWN;
3844 }
3845
3846 return (true);
3847 }
3848
3849 /** Check the existence and usefulness of a given path.
3850 @param[in] path path name
3851 @retval true if the path exists and can be used
3852 @retval false if the path does not exist or if the path is
3853 unuseable to get to a possibly existing file or directory. */
os_file_exists_win32(const char * path)3854 static bool os_file_exists_win32(const char *path) {
3855 struct _stat64 statinfo;
3856
3857 int ret = _stat64(path, &statinfo);
3858
3859 if (ret == 0) {
3860 return (true);
3861 }
3862
3863 if (!(errno == ENOENT || errno == EINVAL || errno == EACCES)) {
3864 /* The _stat64() call failed with an unknown error */
3865 os_file_handle_error_no_exit(path, "file_exists_win_stat64", false);
3866 }
3867
3868 return (false);
3869 }
3870
3871 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3872 function!
3873 Flushes the write buffers of a given file to the disk.
3874 @param[in] file handle to a file
3875 @return true if success */
os_file_flush_func(os_file_t file)3876 bool os_file_flush_func(os_file_t file) {
3877 ++os_n_fsyncs;
3878
3879 BOOL ret = FlushFileBuffers(file);
3880
3881 if (ret) {
3882 return (true);
3883 }
3884
3885 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
3886 actually a raw device, we choose to ignore that error if we are using
3887 raw disks */
3888
3889 if (srv_start_raw_disk_in_use && GetLastError() == ERROR_INVALID_FUNCTION) {
3890 return (true);
3891 }
3892
3893 os_file_handle_error(NULL, "flush");
3894
3895 /* It is a fatal error if a file flush does not succeed, because then
3896 the database can get corrupt on disk */
3897 ut_error;
3898 }
3899
3900 /** Retrieves the last error number if an error occurs in a file io function.
3901 The number should be retrieved before any other OS calls (because they may
3902 overwrite the error number). If the number is not known to this program,
3903 the OS error number + 100 is returned.
3904 @param[in] report_all_errors true if we want an error message printed
3905 of all errors
3906 @param[in] on_error_silent true then don't print any diagnostic
3907 to the log
3908 @return error number, or OS error number + 100 */
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3909 static ulint os_file_get_last_error_low(bool report_all_errors,
3910 bool on_error_silent) {
3911 ulint err = (ulint)GetLastError();
3912
3913 if (err == ERROR_SUCCESS) {
3914 return (0);
3915 }
3916
3917 if (report_all_errors || (!on_error_silent && err != ERROR_DISK_FULL &&
3918 err != ERROR_FILE_EXISTS)) {
3919 ib::error(ER_IB_MSG_786)
3920 << "Operating system error number " << err << " in a file operation.";
3921
3922 if (err == ERROR_PATH_NOT_FOUND) {
3923 ib::error(ER_IB_MSG_787) << "The error means the system cannot find"
3924 " the path specified. It might be too long"
3925 " or it might not exist.";
3926
3927 #ifndef UNIV_HOTBACKUP
3928 if (srv_is_being_started) {
3929 ib::error(ER_IB_MSG_788) << "If you are installing InnoDB,"
3930 " remember that you must create"
3931 " directories yourself, InnoDB"
3932 " does not create them.";
3933 }
3934 #endif /* !UNIV_HOTBACKUP */
3935
3936 } else if (err == ERROR_ACCESS_DENIED) {
3937 ib::error(ER_IB_MSG_789) << "The error means mysqld does not have"
3938 " the access rights to"
3939 " the directory. It may also be"
3940 " you have created a subdirectory"
3941 " of the same name as a data file.";
3942
3943 } else if (err == ERROR_SHARING_VIOLATION || err == ERROR_LOCK_VIOLATION) {
3944 ib::error(ER_IB_MSG_790) << "The error means that another program"
3945 " is using InnoDB's files."
3946 " This might be a backup or antivirus"
3947 " software or another instance"
3948 " of MySQL."
3949 " Please close it to get rid of this error.";
3950
3951 } else if (err == ERROR_WORKING_SET_QUOTA ||
3952 err == ERROR_NO_SYSTEM_RESOURCES) {
3953 ib::error(ER_IB_MSG_791) << "The error means that there are no"
3954 " sufficient system resources or quota to"
3955 " complete the operation.";
3956
3957 } else if (err == ERROR_OPERATION_ABORTED) {
3958 ib::error(ER_IB_MSG_792) << "The error means that the I/O"
3959 " operation has been aborted"
3960 " because of either a thread exit"
3961 " or an application request."
3962 " Retry attempt is made.";
3963 } else {
3964 ib::info(ER_IB_MSG_793) << OPERATING_SYSTEM_ERROR_MSG;
3965 }
3966 }
3967
3968 if (err == ERROR_FILE_NOT_FOUND) {
3969 return (OS_FILE_NOT_FOUND);
3970 } else if (err == ERROR_PATH_NOT_FOUND) {
3971 return (OS_FILE_NAME_TOO_LONG);
3972 } else if (err == ERROR_DISK_FULL) {
3973 return (OS_FILE_DISK_FULL);
3974 } else if (err == ERROR_FILE_EXISTS) {
3975 return (OS_FILE_ALREADY_EXISTS);
3976 } else if (err == ERROR_SHARING_VIOLATION || err == ERROR_LOCK_VIOLATION) {
3977 return (OS_FILE_SHARING_VIOLATION);
3978 } else if (err == ERROR_WORKING_SET_QUOTA ||
3979 err == ERROR_NO_SYSTEM_RESOURCES) {
3980 return (OS_FILE_INSUFFICIENT_RESOURCE);
3981 } else if (err == ERROR_OPERATION_ABORTED) {
3982 return (OS_FILE_OPERATION_ABORTED);
3983 } else if (err == ERROR_ACCESS_DENIED) {
3984 return (OS_FILE_ACCESS_VIOLATION);
3985 }
3986
3987 return (OS_FILE_ERROR_MAX + err);
3988 }
3989
3990 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3991 this function!
3992 A simple function to open or create a file.
3993 @param[in] name name of the file or path as a null-terminated
3994 string
3995 @param[in] create_mode create mode
3996 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3997 @param[in] read_only if true read only mode checks are enforced
3998 @param[out] success true if succeed, false if error
3999 @return handle to the file, not defined if error, error number
4000 can be retrieved with os_file_get_last_error */
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4001 os_file_t os_file_create_simple_func(const char *name, ulint create_mode,
4002 ulint access_type, bool read_only,
4003 bool *success) {
4004 os_file_t file;
4005
4006 *success = false;
4007
4008 DWORD access;
4009 DWORD create_flag;
4010 DWORD attributes = 0;
4011 #ifdef UNIV_HOTBACKUP
4012 DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
4013 #else
4014 DWORD share_mode = FILE_SHARE_READ;
4015 #endif /* UNIV_HOTBACKUP */
4016
4017 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4018 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4019
4020 if (create_mode == OS_FILE_OPEN) {
4021 create_flag = OPEN_EXISTING;
4022
4023 } else if (read_only) {
4024 create_flag = OPEN_EXISTING;
4025
4026 } else if (create_mode == OS_FILE_CREATE) {
4027 create_flag = CREATE_NEW;
4028
4029 } else if (create_mode == OS_FILE_CREATE_PATH) {
4030 /* Create subdirs along the path if needed. */
4031 dberr_t err;
4032
4033 err = os_file_create_subdirs_if_needed(name);
4034
4035 if (err != DB_SUCCESS) {
4036 *success = false;
4037 ib::error(ER_IB_MSG_794)
4038 << "Unable to create subdirectories '" << name << "'";
4039
4040 return (OS_FILE_CLOSED);
4041 }
4042
4043 create_flag = CREATE_NEW;
4044 create_mode = OS_FILE_CREATE;
4045
4046 } else {
4047 ib::error(ER_IB_MSG_795) << "Unknown file create mode (" << create_mode
4048 << ") for file '" << name << "'";
4049
4050 return (OS_FILE_CLOSED);
4051 }
4052
4053 if (access_type == OS_FILE_READ_ONLY) {
4054 access = GENERIC_READ;
4055
4056 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4057 ut_ad(read_only);
4058
4059 access = GENERIC_READ;
4060 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4061
4062 } else if (read_only) {
4063 ib::info(ER_IB_MSG_796) << "Read only mode set. Unable to"
4064 " open file '"
4065 << name << "' in RW mode, "
4066 << "trying RO mode",
4067 name;
4068
4069 access = GENERIC_READ;
4070
4071 } else if (access_type == OS_FILE_READ_WRITE) {
4072 access = GENERIC_READ | GENERIC_WRITE;
4073
4074 } else {
4075 ib::error(ER_IB_MSG_797) << "Unknown file access type (" << access_type
4076 << ") "
4077 "for file '"
4078 << name << "'";
4079
4080 return (OS_FILE_CLOSED);
4081 }
4082
4083 bool retry;
4084
4085 do {
4086 /* Use default security attributes and no template file. */
4087
4088 file = CreateFile((LPCTSTR)name, access, share_mode, NULL, create_flag,
4089 attributes, NULL);
4090
4091 if (file == INVALID_HANDLE_VALUE) {
4092 *success = false;
4093
4094 retry = os_file_handle_error(
4095 name, create_mode == OS_FILE_OPEN ? "open" : "create");
4096
4097 } else {
4098 retry = false;
4099
4100 *success = true;
4101
4102 DWORD temp;
4103
4104 /* This is a best effort use case, if it fails then
4105 we will find out when we try and punch the hole. */
4106
4107 DeviceIoControl(file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0, &temp, NULL);
4108 }
4109
4110 } while (retry);
4111
4112 return (file);
4113 }
4114
4115 /** This function attempts to create a directory named pathname. The new
4116 directory gets default permissions. On Unix the permissions are
4117 (0770 & ~umask). If the directory exists already, nothing is done and
4118 the call succeeds, unless the fail_if_exists arguments is true.
4119 If another error occurs, such as a permission error, this does not crash,
4120 but reports the error and returns false.
4121 @param[in] pathname directory name as null-terminated string
4122 @param[in] fail_if_exists if true, pre-existing directory is treated
4123 as an error.
4124 @return true if call succeeds, false on error */
os_file_create_directory(const char * pathname,bool fail_if_exists)4125 bool os_file_create_directory(const char *pathname, bool fail_if_exists) {
4126 BOOL rcode;
4127
4128 rcode = CreateDirectory((LPCTSTR)pathname, NULL);
4129 if (!(rcode != 0 ||
4130 (GetLastError() == ERROR_ALREADY_EXISTS && !fail_if_exists))) {
4131 os_file_handle_error_no_exit(pathname, "CreateDirectory", false);
4132
4133 return (false);
4134 }
4135
4136 return (true);
4137 }
4138
4139 /** This function scans the contents of a directory and invokes the callback
4140 for each entry.
4141 @param[in] path directory name as null-terminated string
4142 @param[in] scan_cbk use callback to be called for each entry
4143 @param[in] is_drop attempt to drop the directory after scan
4144 @return true if call succeeds, false on error */
os_file_scan_directory(const char * path,os_dir_cbk_t scan_cbk,bool is_drop)4145 bool os_file_scan_directory(const char *path, os_dir_cbk_t scan_cbk,
4146 bool is_drop) {
4147 bool file_found;
4148 HANDLE find_hdl;
4149 WIN32_FIND_DATA find_data;
4150 char wild_card_path[MAX_PATH];
4151
4152 snprintf(wild_card_path, MAX_PATH, "%s\\*", path);
4153
4154 find_hdl = FindFirstFile((LPCTSTR)wild_card_path, &find_data);
4155
4156 if (find_hdl == INVALID_HANDLE_VALUE) {
4157 os_file_handle_error_no_exit(path, "FindFirstFile", false);
4158 return (false);
4159 }
4160
4161 do {
4162 scan_cbk(path, find_data.cFileName);
4163 file_found = FindNextFile(find_hdl, &find_data);
4164
4165 } while (file_found);
4166
4167 FindClose(find_hdl);
4168
4169 if (is_drop) {
4170 bool ret;
4171
4172 ret = RemoveDirectory((LPCSTR)path);
4173
4174 if (!ret) {
4175 os_file_handle_error_no_exit(path, "RemoveDirectory", false);
4176 return (false);
4177 }
4178 }
4179
4180 return (true);
4181 }
4182
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)4183 pfs_os_file_t os_file_create_func(const char *name, ulint create_mode,
4184 ulint purpose, ulint type, bool read_only,
4185 bool *success) {
4186 pfs_os_file_t file;
4187 bool retry;
4188 bool on_error_no_exit;
4189 bool on_error_silent;
4190
4191 *success = false;
4192
4193 DBUG_EXECUTE_IF("ib_create_table_fail_disk_full", *success = false;
4194 SetLastError(ERROR_DISK_FULL); file.m_file = OS_FILE_CLOSED;
4195 return (file););
4196
4197 DWORD create_flag;
4198 DWORD share_mode = FILE_SHARE_READ;
4199
4200 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? true : false;
4201
4202 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT ? true : false;
4203
4204 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
4205 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
4206
4207 if (create_mode == OS_FILE_OPEN_RAW) {
4208 ut_a(!read_only);
4209
4210 create_flag = OPEN_EXISTING;
4211
4212 /* On Windows Physical devices require admin privileges and
4213 have to have the write-share mode set. See the remarks
4214 section for the CreateFile() function documentation in MSDN. */
4215
4216 share_mode |= FILE_SHARE_WRITE;
4217
4218 } else if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RETRY) {
4219 create_flag = OPEN_EXISTING;
4220
4221 } else if (read_only) {
4222 create_flag = OPEN_EXISTING;
4223
4224 } else if (create_mode == OS_FILE_CREATE) {
4225 create_flag = CREATE_NEW;
4226
4227 } else if (create_mode == OS_FILE_CREATE_PATH) {
4228 /* Create subdirs along the path if needed. */
4229 dberr_t err;
4230
4231 err = os_file_create_subdirs_if_needed(name);
4232
4233 if (err != DB_SUCCESS) {
4234 *success = false;
4235 ib::error(ER_IB_MSG_798)
4236 << "Unable to create subdirectories '" << name << "'";
4237
4238 file.m_file = OS_FILE_CLOSED;
4239 return (file);
4240 }
4241
4242 create_flag = CREATE_NEW;
4243 create_mode = OS_FILE_CREATE;
4244
4245 } else {
4246 ib::error(ER_IB_MSG_799)
4247 << "Unknown file create mode (" << create_mode << ") "
4248 << " for file '" << name << "'";
4249
4250 file.m_file = OS_FILE_CLOSED;
4251 return (file);
4252 }
4253
4254 DWORD attributes = 0;
4255
4256 #ifdef UNIV_HOTBACKUP
4257 attributes |= FILE_FLAG_NO_BUFFERING;
4258 #else /* UNIV_HOTBACKUP */
4259
4260 if (purpose == OS_FILE_AIO) {
4261 #ifdef WIN_ASYNC_IO
4262 /* If specified, use asynchronous (overlapped) io and no
4263 buffering of writes in the OS */
4264
4265 if (srv_use_native_aio) {
4266 attributes |= FILE_FLAG_OVERLAPPED;
4267 }
4268 #endif /* WIN_ASYNC_IO */
4269
4270 } else if (purpose == OS_FILE_NORMAL) {
4271 /* Use default setting. */
4272
4273 } else {
4274 ib::error(ER_IB_MSG_800) << "Unknown purpose flag (" << purpose << ") "
4275 << "while opening file '" << name << "'";
4276
4277 file.m_file = OS_FILE_CLOSED;
4278 return (file);
4279 }
4280
4281 #ifdef UNIV_NON_BUFFERED_IO
4282 // TODO: Create a bug, this looks wrong. The flush log
4283 // parameter is dynamic.
4284 if (type == OS_BUFFERED_FILE || type == OS_CLONE_LOG_FILE ||
4285 type == OS_LOG_FILE) {
4286 /* Do not use unbuffered i/o for the log files because
4287 we write really a lot and we have log flusher for fsyncs. */
4288
4289 } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
4290 attributes |= FILE_FLAG_NO_BUFFERING;
4291 }
4292 #endif /* UNIV_NON_BUFFERED_IO */
4293
4294 #endif /* UNIV_HOTBACKUP */
4295 DWORD access = GENERIC_READ;
4296
4297 if (!read_only) {
4298 access |= GENERIC_WRITE;
4299 }
4300
4301 /* Clone must allow concurrent write to file. */
4302 if (type == OS_CLONE_LOG_FILE || type == OS_CLONE_DATA_FILE) {
4303 share_mode |= FILE_SHARE_WRITE;
4304 }
4305
4306 do {
4307 /* Use default security attributes and no template file. */
4308 file.m_file = CreateFile((LPCTSTR)name, access, share_mode, NULL,
4309 create_flag, attributes, NULL);
4310
4311 if (file.m_file == INVALID_HANDLE_VALUE) {
4312 const char *operation;
4313
4314 operation =
4315 (create_mode == OS_FILE_CREATE && !read_only) ? "create" : "open";
4316
4317 *success = false;
4318
4319 if (on_error_no_exit) {
4320 retry = os_file_handle_error_no_exit(name, operation, on_error_silent);
4321 } else {
4322 retry = os_file_handle_error(name, operation);
4323 }
4324 } else {
4325 retry = false;
4326
4327 *success = true;
4328
4329 DWORD temp;
4330
4331 /* This is a best effort use case, if it fails then
4332 we will find out when we try and punch the hole. */
4333 DeviceIoControl(file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0, &temp,
4334 NULL);
4335 }
4336
4337 } while (retry);
4338
4339 return (file);
4340 }
4341
4342 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4343 not directly this function!
4344 A simple function to open or create a file.
4345 @param[in] name name of the file or path as a null-terminated
4346 string
4347 @param[in] create_mode create mode
4348 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4349 OS_FILE_READ_ALLOW_DELETE; the last option is
4350 used by a backup program reading the file
4351 @param[out] success true if succeeded
4352 @return own: handle to the file, not defined if error, error number
4353 can be retrieved with os_file_get_last_error */
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4354 pfs_os_file_t os_file_create_simple_no_error_handling_func(const char *name,
4355 ulint create_mode,
4356 ulint access_type,
4357 bool read_only,
4358 bool *success) {
4359 pfs_os_file_t file;
4360
4361 *success = false;
4362
4363 DWORD access;
4364 DWORD create_flag;
4365 DWORD attributes = 0;
4366 DWORD share_mode = FILE_SHARE_READ;
4367
4368 #ifdef UNIV_HOTBACKUP
4369 share_mode |= FILE_SHARE_WRITE;
4370 #endif /* UNIV_HOTBACKUP */
4371
4372 ut_a(name);
4373
4374 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4375 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4376
4377 if (create_mode == OS_FILE_OPEN) {
4378 create_flag = OPEN_EXISTING;
4379
4380 } else if (read_only) {
4381 create_flag = OPEN_EXISTING;
4382
4383 } else if (create_mode == OS_FILE_CREATE) {
4384 create_flag = CREATE_NEW;
4385
4386 } else {
4387 ib::error(ER_IB_MSG_801)
4388 << "Unknown file create mode (" << create_mode << ") "
4389 << " for file '" << name << "'";
4390
4391 file.m_file = OS_FILE_CLOSED;
4392 return (file);
4393 }
4394
4395 if (access_type == OS_FILE_READ_ONLY) {
4396 access = GENERIC_READ;
4397
4398 } else if (read_only) {
4399 access = GENERIC_READ;
4400
4401 } else if (access_type == OS_FILE_READ_WRITE) {
4402 access = GENERIC_READ | GENERIC_WRITE;
4403
4404 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4405 ut_a(!read_only);
4406
4407 access = GENERIC_READ;
4408
4409 /* A backup program has to give mysqld the maximum
4410 freedom to do what it likes with the file */
4411
4412 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4413 } else {
4414 ib::error(ER_IB_MSG_802)
4415 << "Unknown file access type (" << access_type << ") "
4416 << "for file '" << name << "'";
4417
4418 file.m_file = OS_FILE_CLOSED;
4419 return (file);
4420 }
4421
4422 file.m_file = CreateFile((LPCTSTR)name, access, share_mode,
4423 NULL, // Security attributes
4424 create_flag, attributes,
4425 NULL); // No template file
4426
4427 *success = (file.m_file != INVALID_HANDLE_VALUE);
4428
4429 return (file);
4430 }
4431
4432 /** Deletes a file if it exists. The file has to be closed before calling this.
4433 @param[in] name file path as a null-terminated string
4434 @param[out] exist indicate if file pre-exist
4435 @return true if success */
os_file_delete_if_exists_func(const char * name,bool * exist)4436 bool os_file_delete_if_exists_func(const char *name, bool *exist) {
4437 if (!os_file_can_delete(name)) {
4438 return (false);
4439 }
4440
4441 if (exist != nullptr) {
4442 *exist = true;
4443 }
4444
4445 ulint count = 0;
4446
4447 for (;;) {
4448 /* In Windows, deleting an .ibd file may fail if mysqlbackup
4449 is copying it */
4450
4451 bool ret = DeleteFile((LPCTSTR)name);
4452
4453 if (ret) {
4454 return (true);
4455 }
4456
4457 DWORD lasterr = GetLastError();
4458
4459 if (lasterr == ERROR_FILE_NOT_FOUND || lasterr == ERROR_PATH_NOT_FOUND) {
4460 /* The file does not exist, this not an error */
4461 if (exist != NULL) {
4462 *exist = false;
4463 }
4464
4465 return (true);
4466 }
4467
4468 ++count;
4469
4470 if (count > 100 && 0 == (count % 10)) {
4471 /* Print error information */
4472 os_file_get_last_error(true);
4473
4474 ib::warn(ER_IB_MSG_803) << "Delete of file '" << name << "' failed.";
4475 }
4476
4477 /* Sleep for a second */
4478 os_thread_sleep(1000000);
4479
4480 if (count > 2000) {
4481 return (false);
4482 }
4483 }
4484 }
4485
4486 /** Deletes a file. The file has to be closed before calling this.
4487 @param[in] name File path as NUL terminated string
4488 @return true if success */
os_file_delete_func(const char * name)4489 bool os_file_delete_func(const char *name) {
4490 ulint count = 0;
4491
4492 for (;;) {
4493 /* In Windows, deleting an .ibd file may fail if mysqlbackup
4494 is copying it */
4495
4496 BOOL ret = DeleteFile((LPCTSTR)name);
4497
4498 if (ret) {
4499 return (true);
4500 }
4501
4502 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
4503 /* If the file does not exist, we classify this as
4504 a 'mild' error and return */
4505
4506 return (false);
4507 }
4508
4509 ++count;
4510
4511 if (count > 100 && 0 == (count % 10)) {
4512 /* print error information */
4513 os_file_get_last_error(true);
4514
4515 ib::warn(ER_IB_MSG_804)
4516 << "Cannot delete file '" << name << "'. Are you running mysqlbackup"
4517 << " to back up the file?";
4518 }
4519
4520 /* sleep for a second */
4521 os_thread_sleep(1000000);
4522
4523 if (count > 2000) {
4524 return (false);
4525 }
4526 }
4527
4528 ut_error;
4529 return (false);
4530 }
4531
4532 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
4533 function!
4534 Renames a file (can also move it to another directory). It is safest that the
4535 file is closed before calling this function.
4536 @param[in] oldpath old file path as a null-terminated string
4537 @param[in] newpath new file path
4538 @return true if success */
os_file_rename_func(const char * oldpath,const char * newpath)4539 bool os_file_rename_func(const char *oldpath, const char *newpath) {
4540 #ifdef UNIV_DEBUG
4541 /* New path must be valid but not exist. */
4542 os_file_type_t type;
4543 bool exists;
4544 ut_ad(os_file_status(newpath, &exists, &type));
4545 ut_ad(!exists);
4546
4547 /* Old path must exist. */
4548 ut_ad(os_file_exists(oldpath));
4549 #endif /* UNIV_DEBUG */
4550
4551 if (MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath)) {
4552 return (true);
4553 }
4554
4555 os_file_handle_error_no_exit(oldpath, "rename", false);
4556
4557 return (false);
4558 }
4559
4560 /** NOTE! Use the corresponding macro os_file_close(), not directly
4561 this function!
4562 Closes a file handle. In case of error, error number can be retrieved with
4563 os_file_get_last_error.
4564 @param[in,own] file Handle to a file
4565 @return true if success */
os_file_close_func(os_file_t file)4566 bool os_file_close_func(os_file_t file) {
4567 ut_a(file != INVALID_HANDLE_VALUE);
4568
4569 if (CloseHandle(file)) {
4570 return (true);
4571 }
4572
4573 os_file_handle_error(NULL, "close");
4574
4575 return (false);
4576 }
4577
4578 /** Gets a file size.
4579 @param[in] file Handle to a file
4580 @return file size, or (os_offset_t) -1 on failure */
os_file_get_size(pfs_os_file_t file)4581 os_offset_t os_file_get_size(pfs_os_file_t file) {
4582 DWORD high;
4583 DWORD low;
4584
4585 low = GetFileSize(file.m_file, &high);
4586 if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
4587 return ((os_offset_t)-1);
4588 }
4589
4590 return (os_offset_t(low | (os_offset_t(high) << 32)));
4591 }
4592
4593 /** Gets a file size.
4594 @param[in] filename Full path to the filename to check
4595 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
4596 errno */
os_file_get_size(const char * filename)4597 os_file_size_t os_file_get_size(const char *filename) {
4598 struct __stat64 s;
4599 os_file_size_t file_size;
4600
4601 int ret = _stat64(filename, &s);
4602
4603 if (ret == 0) {
4604 file_size.m_total_size = s.st_size;
4605
4606 DWORD low_size;
4607 DWORD high_size;
4608
4609 low_size = GetCompressedFileSize(filename, &high_size);
4610
4611 if (low_size != INVALID_FILE_SIZE) {
4612 file_size.m_alloc_size = high_size;
4613 file_size.m_alloc_size <<= 32;
4614 file_size.m_alloc_size |= low_size;
4615
4616 } else {
4617 ib::error(ER_IB_MSG_805)
4618 << "GetCompressedFileSize(" << filename << ", ..) failed.";
4619
4620 file_size.m_alloc_size = (os_offset_t)-1;
4621 }
4622 } else {
4623 file_size.m_total_size = ~0;
4624 file_size.m_alloc_size = (os_offset_t)ret;
4625 }
4626
4627 return (file_size);
4628 }
4629
4630 /** Get available free space on disk
4631 @param[in] path pathname of a directory or file in disk
4632 @param[out] block_size Block size to use for IO in bytes
4633 @param[out] free_space free space available in bytes
4634 @return DB_SUCCESS if all OK */
os_get_free_space_win32(const char * path,uint32_t & block_size,uint64_t & free_space)4635 static dberr_t os_get_free_space_win32(const char *path, uint32_t &block_size,
4636 uint64_t &free_space) {
4637 char volname[MAX_PATH];
4638 BOOL result = GetVolumePathName(path, volname, MAX_PATH);
4639
4640 if (!result) {
4641 ib::error(ER_IB_MSG_806)
4642 << "os_file_get_status_win32: "
4643 << "Failed to get the volume path name for: " << path
4644 << "- OS error number " << GetLastError();
4645
4646 return (DB_FAIL);
4647 }
4648
4649 DWORD sectorsPerCluster;
4650 DWORD bytesPerSector;
4651 DWORD numberOfFreeClusters;
4652 DWORD totalNumberOfClusters;
4653
4654 result =
4655 GetDiskFreeSpace((LPCSTR)volname, §orsPerCluster, &bytesPerSector,
4656 &numberOfFreeClusters, &totalNumberOfClusters);
4657
4658 if (!result) {
4659 ib::error(ER_IB_MSG_807) << "GetDiskFreeSpace(" << volname << ",...) "
4660 << "failed "
4661 << "- OS error number " << GetLastError();
4662
4663 return (DB_FAIL);
4664 }
4665
4666 block_size = bytesPerSector * sectorsPerCluster;
4667
4668 free_space = static_cast<uint64_t>(block_size);
4669 free_space *= numberOfFreeClusters;
4670
4671 return (DB_SUCCESS);
4672 }
4673
4674 /** This function returns information about the specified file
4675 @param[in] path pathname of the file
4676 @param[out] stat_info information of a file in a directory
4677 @param[in,out] statinfo information of a file in a directory
4678 @param[in] check_rw_perm for testing whether the file can be opened
4679 in RW mode
4680 @param[in] read_only true if the file is opened in read-only mode
4681 @return DB_SUCCESS if all OK */
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)4682 static dberr_t os_file_get_status_win32(const char *path,
4683 os_file_stat_t *stat_info,
4684 struct _stat64 *statinfo,
4685 bool check_rw_perm, bool read_only) {
4686 int ret = _stat64(path, statinfo);
4687
4688 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
4689 /* file does not exist */
4690
4691 return (DB_NOT_FOUND);
4692
4693 } else if (ret) {
4694 /* file exists, but stat call failed */
4695
4696 os_file_handle_error_no_exit(path, "stat", false);
4697
4698 return (DB_FAIL);
4699
4700 } else if (_S_IFDIR & statinfo->st_mode) {
4701 stat_info->type = OS_FILE_TYPE_DIR;
4702
4703 } else if (_S_IFREG & statinfo->st_mode) {
4704 DWORD access = GENERIC_READ;
4705
4706 if (!read_only) {
4707 access |= GENERIC_WRITE;
4708 }
4709
4710 stat_info->type = OS_FILE_TYPE_FILE;
4711
4712 /* Check if we can open it in read-only mode. */
4713
4714 if (check_rw_perm) {
4715 HANDLE fh;
4716
4717 fh = CreateFile((LPCTSTR)path, // File to open
4718 access, FILE_SHARE_READ,
4719 NULL, // Default security
4720 OPEN_EXISTING, // Existing file only
4721 FILE_ATTRIBUTE_NORMAL, // Normal file
4722 NULL); // No attr. template
4723
4724 if (fh == INVALID_HANDLE_VALUE) {
4725 stat_info->rw_perm = false;
4726 } else {
4727 stat_info->rw_perm = true;
4728 CloseHandle(fh);
4729 }
4730 }
4731
4732 uint64_t free_space;
4733 auto err = os_get_free_space_win32(path, stat_info->block_size, free_space);
4734
4735 if (err != DB_SUCCESS) {
4736 return (err);
4737 }
4738 /* On Windows the block size is not used as the allocation
4739 unit for sparse files. The underlying infra-structure for
4740 sparse files is based on NTFS compression. The punch hole
4741 is done on a "compression unit". This compression unit
4742 is based on the cluster size. You cannot punch a hole if
4743 the cluster size >= 8K. For smaller sizes the table is
4744 as follows:
4745
4746 Cluster Size Compression Unit
4747 512 Bytes 8 KB
4748 1 KB 16 KB
4749 2 KB 32 KB
4750 4 KB 64 KB
4751
4752 Default NTFS cluster size is 4K, compression unit size of 64K.
4753 Therefore unless the user has created the file system with
4754 a smaller cluster size and used larger page sizes there is
4755 little benefit from compression out of the box. */
4756
4757 stat_info->block_size = (stat_info->block_size <= 4096)
4758 ? stat_info->block_size * 16
4759 : UINT32_UNDEFINED;
4760 } else {
4761 stat_info->type = OS_FILE_TYPE_UNKNOWN;
4762 }
4763
4764 return (DB_SUCCESS);
4765 }
4766
4767 /** Truncates a file to a specified size in bytes.
4768 Do nothing if the size to preserve is greater or equal to the current
4769 size of the file.
4770 @param[in] pathname file path
4771 @param[in] file file to be truncated
4772 @param[in] size size to preserve in bytes
4773 @return true if success */
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)4774 static bool os_file_truncate_win32(const char *pathname, pfs_os_file_t file,
4775 os_offset_t size) {
4776 LARGE_INTEGER length;
4777
4778 length.QuadPart = size;
4779
4780 BOOL success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
4781
4782 if (!success) {
4783 os_file_handle_error_no_exit(pathname, "SetFilePointerEx", false);
4784 } else {
4785 success = SetEndOfFile(file.m_file);
4786 if (!success) {
4787 os_file_handle_error_no_exit(pathname, "SetEndOfFile", false);
4788 }
4789 }
4790 return (success);
4791 }
4792
4793 /** Truncates a file at its current position.
4794 @param[in] file Handle to be truncated
4795 @return true if success */
os_file_set_eof(FILE * file)4796 bool os_file_set_eof(FILE *file) {
4797 HANDLE h = (HANDLE)_get_osfhandle(fileno(file));
4798
4799 return (SetEndOfFile(h));
4800 }
4801
#ifdef UNIV_HOTBACKUP
/** Closes a file handle without invoking the error handler on failure.
@param[in]	file		Handle to close
@return true if success */
bool os_file_close_no_error_handling(os_file_t file) {
  return (CloseHandle(file) != 0);
}
#endif /* UNIV_HOTBACKUP */
4810
4811 /** This function can be called if one wants to post a batch of reads and
4812 prefers an i/o-handler thread to handle them all at once later. You must
4813 call os_aio_simulated_wake_handler_threads later to ensure the threads
4814 are not left sleeping! */
os_aio_simulated_put_read_threads_to_sleep()4815 void os_aio_simulated_put_read_threads_to_sleep() {
4816 AIO::simulated_put_read_threads_to_sleep();
4817 }
4818
4819 /** This function can be called if one wants to post a batch of reads and
4820 prefers an i/o-handler thread to handle them all at once later. You must
4821 call os_aio_simulated_wake_handler_threads later to ensure the threads
4822 are not left sleeping! */
simulated_put_read_threads_to_sleep()4823 void AIO::simulated_put_read_threads_to_sleep() {
4824 /* The idea of putting background IO threads to sleep is only for
4825 Windows when using simulated AIO. Windows XP seems to schedule
4826 background threads too eagerly to allow for coalescing during
4827 readahead requests. */
4828
4829 if (srv_use_native_aio) {
4830 /* We do not use simulated AIO: do nothing */
4831
4832 return;
4833 }
4834
4835 os_aio_recommend_sleep_for_read_threads = true;
4836
4837 for (ulint i = 0; i < os_aio_n_segments; i++) {
4838 AIO *array{};
4839
4840 get_array_and_local_segment(array, i);
4841
4842 if (array == s_reads) {
4843 os_event_reset(os_aio_segment_wait_events[i]);
4844 }
4845 }
4846 }
4847
4848 /** Depth first traversal of the directory starting from basedir
4849 @param[in] basedir Start scanning from this directory
4850 @param[in] recursive `true` if scan should be recursive
4851 @param[in] f Callback for each entry found */
walk_win32(const Path & basedir,bool recursive,Function && f)4852 void Dir_Walker::walk_win32(const Path &basedir, bool recursive, Function &&f) {
4853 using Stack = std::stack<Entry>;
4854
4855 HRESULT res;
4856 size_t length;
4857 Stack directories;
4858 TCHAR directory[MAX_PATH];
4859
4860 res = StringCchLength(basedir.c_str(), MAX_PATH, &length);
4861
4862 /* Check if the name is too long. */
4863 if (!SUCCEEDED(res)) {
4864 ib::warn(ER_IB_MSG_808) << "StringCchLength() call failed!";
4865 return;
4866
4867 } else if (length > (MAX_PATH - 3)) {
4868 ib::warn(ER_IB_MSG_809) << "Directory name too long: '" << basedir << "'";
4869 return;
4870 }
4871
4872 StringCchCopy(directory, MAX_PATH, basedir.c_str());
4873
4874 if (directory[_tcslen(directory) - 1] != TEXT('\\')) {
4875 StringCchCat(directory, MAX_PATH, TEXT("\\*"));
4876 } else {
4877 StringCchCat(directory, MAX_PATH, TEXT("*"));
4878 }
4879
4880 directories.push(Entry(directory, 0));
4881
4882 using Type = std::codecvt_utf8<wchar_t>;
4883 using Converter = std::wstring_convert<Type, wchar_t>;
4884
4885 Converter converter;
4886
4887 while (!directories.empty()) {
4888 Entry current = directories.top();
4889
4890 directories.pop();
4891
4892 if (Fil_path::is_hidden(current.m_path)) {
4893 ib::info(ER_IB_MSG_SKIP_HIDDEN_DIR, current.m_path.c_str());
4894 continue;
4895 }
4896
4897 HANDLE h;
4898 WIN32_FIND_DATA dirent;
4899
4900 h = FindFirstFile(current.m_path.c_str(), &dirent);
4901
4902 if (h == INVALID_HANDLE_VALUE) {
4903 ib::info(ER_IB_MSG_810) << "Directory read failed:"
4904 << " '" << current.m_path << "' during scan";
4905
4906 continue;
4907 }
4908
4909 do {
4910 /* dirent.cFileName is a TCHAR. */
4911 if (_tcscmp(dirent.cFileName, _T(".")) == 0 ||
4912 _tcscmp(dirent.cFileName, _T("..")) == 0) {
4913 continue;
4914 }
4915
4916 Path path(current.m_path);
4917
4918 /* Shorten the path to remove the trailing '*'. */
4919 ut_ad(path.substr(path.size() - 2).compare("\\*") == 0);
4920
4921 path.resize(path.size() - 1);
4922 path.append(dirent.cFileName);
4923
4924 /* Ignore hidden files and directories. */
4925 if (Fil_path::is_hidden(dirent) || Fil_path::is_hidden(path)) {
4926 ib::info(ER_IB_MSG_SKIP_HIDDEN_DIR, path.c_str());
4927 continue;
4928 }
4929
4930 if ((dirent.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) && recursive) {
4931 path.append("\\*");
4932
4933 using value_type = Stack::value_type;
4934
4935 value_type dir(path, current.m_depth + 1);
4936
4937 directories.push(dir);
4938
4939 } else {
4940 f(path, current.m_depth + 1);
4941 }
4942
4943 } while (FindNextFile(h, &dirent) != 0);
4944
4945 if (GetLastError() != ERROR_NO_MORE_FILES) {
4946 ib::error(ER_IB_MSG_811) << "Scanning '" << directory << "'"
4947 << " - FindNextFile(): returned error";
4948 }
4949
4950 FindClose(h);
4951 }
4952 }
4953 #endif /* !_WIN32*/
4954
4955 /** Does a syncronous read or write depending upon the type specified
4956 In case of partial reads/writes the function tries
4957 NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
4958 @param[in] in_type IO flags
4959 @param[in] file handle to an open file
4960 @param[out] buf buffer where to read
4961 @param[in] offset file offset from the start where to read
4962 @param[in] n number of bytes to read, starting from offset
4963 @param[out] err DB_SUCCESS or error code
4964 @return number of bytes read/written, -1 if error */
4965 static MY_ATTRIBUTE((warn_unused_result)) ssize_t
os_file_io(const IORequest & in_type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)4966 os_file_io(const IORequest &in_type, os_file_t file, void *buf, ulint n,
4967 os_offset_t offset, dberr_t *err) {
4968 ulint original_n = n;
4969 file::Block *block{};
4970 IORequest type = in_type;
4971 ssize_t bytes_returned = 0;
4972 byte *encrypt_log_buf = nullptr;
4973
4974 if (type.is_compressed()) {
4975 /* We don't compress the first page of any file. */
4976 ut_ad(offset > 0);
4977 block = os_file_compress_page(type, buf, &n);
4978 } else {
4979 block = nullptr;
4980 }
4981
4982 /* We do encryption after compression, since if we do encryption
4983 before compression, the encrypted data will cause compression fail
4984 or low compression rate. */
4985 if (type.is_encrypted() && type.is_write()) {
4986 if (!type.is_log()) {
4987 /* We don't encrypt the first page of any file. */
4988 auto compressed_block = block;
4989 ut_ad(offset > 0);
4990
4991 block = os_file_encrypt_page(type, buf, &n);
4992
4993 if (compressed_block != nullptr) {
4994 os_free_block(compressed_block);
4995 }
4996 } else {
4997 /* Skip encrypt log file header */
4998 if (offset >= LOG_FILE_HDR_SIZE) {
4999 block = os_file_encrypt_log(type, buf, encrypt_log_buf, &n);
5000 }
5001 }
5002 }
5003
5004 SyncFileIO sync_file_io(file, buf, n, offset);
5005
5006 for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
5007 ssize_t n_bytes = sync_file_io.execute(type);
5008
5009 /* Check for a hard error. Not much we can do now. */
5010 if (n_bytes < 0) {
5011 break;
5012
5013 } else if ((ulint)n_bytes + bytes_returned == n) {
5014 bytes_returned += n_bytes;
5015
5016 if (offset > 0 && (type.is_compressed() || type.is_read())) {
5017 *err = os_file_io_complete(type, file, reinterpret_cast<byte *>(buf),
5018 nullptr, original_n, offset, n);
5019 } else {
5020 *err = DB_SUCCESS;
5021 }
5022
5023 if (block != nullptr) {
5024 os_free_block(block);
5025 }
5026
5027 if (encrypt_log_buf != nullptr) {
5028 ut_free(encrypt_log_buf);
5029 }
5030
5031 return (original_n);
5032 }
5033
5034 /* Handle partial read/write. */
5035
5036 ut_ad((ulint)n_bytes + bytes_returned < n);
5037
5038 bytes_returned += (ulint)n_bytes;
5039
5040 if (!type.is_partial_io_warning_disabled()) {
5041 const char *op = type.is_read() ? "read" : "written";
5042
5043 ib::warn(ER_IB_MSG_812)
5044 << n << " bytes should have been " << op << ". Only "
5045 << bytes_returned << " bytes " << op << ". Retrying"
5046 << " for the remaining bytes.";
5047 }
5048
5049 /* Advance the offset and buffer by n_bytes */
5050 sync_file_io.advance(n_bytes);
5051 }
5052
5053 if (block != nullptr) {
5054 os_free_block(block);
5055 }
5056
5057 if (encrypt_log_buf != nullptr) {
5058 ut_free(encrypt_log_buf);
5059 }
5060
5061 *err = DB_IO_ERROR;
5062
5063 if (!type.is_partial_io_warning_disabled()) {
5064 ib::warn(ER_IB_MSG_813)
5065 << "Retry attempts for " << (type.is_read() ? "reading" : "writing")
5066 << " partial data failed.";
5067 }
5068
5069 return (bytes_returned);
5070 }
5071
5072 /** Does a synchronous write operation in Posix.
5073 @param[in] type IO context
5074 @param[in] file handle to an open file
5075 @param[out] buf buffer from which to write
5076 @param[in] n number of bytes to read, starting from offset
5077 @param[in] offset file offset from the start where to read
5078 @param[out] err DB_SUCCESS or error code
5079 @return number of bytes written, -1 if error */
5080 static MY_ATTRIBUTE((warn_unused_result)) ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5081 os_file_pwrite(IORequest &type, os_file_t file, const byte *buf, ulint n,
5082 os_offset_t offset, dberr_t *err) {
5083 #ifdef UNIV_HOTBACKUP
5084 static meb::Mutex meb_mutex;
5085 #endif /* UNIV_HOTBACKUP */
5086
5087 ut_ad(type.validate());
5088
5089 #ifdef UNIV_HOTBACKUP
5090 meb_mutex.lock();
5091 #endif /* UNIV_HOTBACKUP */
5092 ++os_n_file_writes;
5093 #ifdef UNIV_HOTBACKUP
5094 meb_mutex.unlock();
5095 #endif /* UNIV_HOTBACKUP */
5096
5097 (void)os_atomic_increment_ulint(&os_n_pending_writes, 1);
5098 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
5099
5100 ssize_t n_bytes = os_file_io(type, file, (void *)buf, n, offset, err);
5101
5102 (void)os_atomic_decrement_ulint(&os_n_pending_writes, 1);
5103 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
5104
5105 return (n_bytes);
5106 }
5107
5108 /** Requests a synchronous write operation.
5109 @param[in] type IO flags
5110 @param[in] name name of the file or path as a null-terminated
5111 string
5112 @param[in] file handle to an open file
5113 @param[out] buf buffer from which to write
5114 @param[in] offset file offset from the start where to read
5115 @param[in] n number of bytes to read, starting from offset
5116 @return DB_SUCCESS if request was successful, false if fail */
5117 static MY_ATTRIBUTE((warn_unused_result)) dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)5118 os_file_write_page(IORequest &type, const char *name, os_file_t file,
5119 const byte *buf, os_offset_t offset, ulint n) {
5120 dberr_t err;
5121
5122 ut_ad(type.validate());
5123 ut_ad(n > 0);
5124
5125 ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
5126
5127 if ((ulint)n_bytes != n && !os_has_said_disk_full) {
5128 ib::error(ER_IB_MSG_814) << "Write to file " << name << " failed at offset "
5129 << offset << ", " << n
5130 << " bytes should have been written,"
5131 " only "
5132 << n_bytes
5133 << " were written."
5134 " Operating system error number "
5135 << errno
5136 << "."
5137 " Check that your OS and file system"
5138 " support files of this size."
5139 " Check also that the disk is not full"
5140 " or a disk quota exceeded.";
5141
5142 if (strerror(errno) != nullptr) {
5143 ib::error(ER_IB_MSG_815)
5144 << "Error number " << errno << " means '" << strerror(errno) << "'";
5145 }
5146
5147 ib::info(ER_IB_MSG_816) << OPERATING_SYSTEM_ERROR_MSG;
5148
5149 os_has_said_disk_full = true;
5150 }
5151
5152 return (err);
5153 }
5154
5155 /** Does a synchronous read operation in Posix.
5156 @param[in] type IO flags
5157 @param[in] file handle to an open file
5158 @param[out] buf buffer where to read
5159 @param[in] offset file offset from the start where to read
5160 @param[in] n number of bytes to read, starting from offset
5161 @param[out] err DB_SUCCESS or error code
5162 @return number of bytes read, -1 if error */
5163 static MY_ATTRIBUTE((warn_unused_result)) ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5164 os_file_pread(IORequest &type, os_file_t file, void *buf, ulint n,
5165 os_offset_t offset, dberr_t *err) {
5166 #ifdef UNIV_HOTBACKUP
5167 static meb::Mutex meb_mutex;
5168
5169 meb_mutex.lock();
5170 #endif /* UNIV_HOTBACKUP */
5171 ++os_n_file_reads;
5172 #ifdef UNIV_HOTBACKUP
5173 meb_mutex.unlock();
5174 #endif /* UNIV_HOTBACKUP */
5175
5176 (void)os_atomic_increment_ulint(&os_n_pending_reads, 1);
5177 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
5178
5179 ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
5180
5181 (void)os_atomic_decrement_ulint(&os_n_pending_reads, 1);
5182 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
5183
5184 return (n_bytes);
5185 }
5186
5187 /** Requests a synchronous positioned read operation.
5188 @return DB_SUCCESS if request was successful, false if fail
5189 @param[in] type IO flags
5190 @param[in] file_name file name
5191 @param[in] file handle to an open file
5192 @param[out] buf buffer where to read
5193 @param[in] offset file offset from the start where to read
5194 @param[in] n number of bytes to read, starting from offset
5195 @param[out] o number of bytes actually read
5196 @param[in] exit_on_err if true then exit on error
5197 @return DB_SUCCESS or error code */
5198 static MY_ATTRIBUTE((warn_unused_result)) dberr_t
os_file_read_page(IORequest & type,const char * file_name,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err)5199 os_file_read_page(IORequest &type, const char *file_name, os_file_t file,
5200 void *buf, os_offset_t offset, ulint n, ulint *o,
5201 bool exit_on_err) {
5202 dberr_t err;
5203
5204 #ifdef UNIV_HOTBACKUP
5205 static meb::Mutex meb_mutex;
5206
5207 meb_mutex.lock();
5208 #endif /* UNIV_HOTBACKUP */
5209 os_bytes_read_since_printout += n;
5210 #ifdef UNIV_HOTBACKUP
5211 meb_mutex.unlock();
5212 #endif /* UNIV_HOTBACKUP */
5213
5214 ut_ad(type.validate());
5215 ut_ad(n > 0);
5216
5217 for (;;) {
5218 ssize_t n_bytes;
5219
5220 n_bytes = os_file_pread(type, file, buf, n, offset, &err);
5221
5222 if (o != nullptr) {
5223 *o = n_bytes;
5224 }
5225
5226 if (err != DB_SUCCESS && !exit_on_err) {
5227 return (err);
5228
5229 } else if ((ulint)n_bytes == n) {
5230 /** The read will succeed but decompress can fail
5231 for various reasons. */
5232
5233 if (type.is_compression_enabled() &&
5234 !Compression::is_compressed_page(static_cast<byte *>(buf))) {
5235 return (DB_SUCCESS);
5236
5237 } else {
5238 return (err);
5239 }
5240 }
5241
5242 ib::error(ER_IB_MSG_817)
5243 << "Tried to read " << n << " bytes at offset " << offset
5244 << ", but was only able to read " << n_bytes;
5245
5246 if (exit_on_err) {
5247 if (!os_file_handle_error(file_name, "read")) {
5248 /* Hard error */
5249 break;
5250 }
5251
5252 } else if (!os_file_handle_error_no_exit(file_name, "read", false)) {
5253 /* Hard error */
5254 break;
5255 }
5256
5257 if (n_bytes > 0 && (ulint)n_bytes < n) {
5258 n -= (ulint)n_bytes;
5259 offset += (ulint)n_bytes;
5260 buf = reinterpret_cast<uchar *>(buf) + (ulint)n_bytes;
5261 }
5262 }
5263
5264 ib::fatal(ER_IB_MSG_818) << "Cannot read from file. OS error number " << errno
5265 << ".";
5266
5267 return (err);
5268 }
5269
5270 /** Retrieves the last error number if an error occurs in a file io function.
5271 The number should be retrieved before any other OS calls (because they may
5272 overwrite the error number). If the number is not known to this program,
5273 the OS error number + 100 is returned.
5274 @param[in] report_all_errors true if we want an error printed
5275 for all errors
5276 @return error number, or OS error number + 100 */
os_file_get_last_error(bool report_all_errors)5277 ulint os_file_get_last_error(bool report_all_errors) {
5278 return (os_file_get_last_error_low(report_all_errors, false));
5279 }
5280
5281 /** Does error handling when a file operation fails.
5282 Conditionally exits (calling srv_fatal_error()) based on should_exit value
5283 and the error type, if should_exit is true then on_error_silent is ignored.
5284 @param[in] name name of a file or NULL
5285 @param[in] operation operation
5286 @param[in] should_exit call srv_fatal_error() on an unknown error,
5287 if this parameter is true
5288 @param[in] on_error_silent if true then don't print any message to the log
5289 iff it is an unknown non-fatal error
5290 @return true if we should retry the operation */
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)5291 static MY_ATTRIBUTE((warn_unused_result)) bool os_file_handle_error_cond_exit(
5292 const char *name, const char *operation, bool should_exit,
5293 bool on_error_silent) {
5294 ulint err;
5295
5296 err = os_file_get_last_error_low(false, on_error_silent);
5297
5298 switch (err) {
5299 case OS_FILE_DISK_FULL:
5300 /* We only print a warning about disk full once */
5301
5302 if (os_has_said_disk_full) {
5303 return (false);
5304 }
5305
5306 /* Disk full error is reported irrespective of the
5307 on_error_silent setting. */
5308
5309 if (name) {
5310 ib::error(ER_IB_MSG_819)
5311 << "Encountered a problem with file '" << name << "'";
5312 }
5313
5314 ib::error(ER_IB_MSG_820)
5315 << "Disk is full. Try to clean the disk to free space.";
5316
5317 os_has_said_disk_full = true;
5318
5319 return (false);
5320
5321 case OS_FILE_AIO_RESOURCES_RESERVED:
5322 case OS_FILE_AIO_INTERRUPTED:
5323
5324 return (true);
5325
5326 case OS_FILE_PATH_ERROR:
5327 case OS_FILE_ALREADY_EXISTS:
5328 case OS_FILE_ACCESS_VIOLATION:
5329
5330 return (false);
5331
5332 case OS_FILE_SHARING_VIOLATION:
5333
5334 os_thread_sleep(10000000); /* 10 sec */
5335 return (true);
5336
5337 case OS_FILE_OPERATION_ABORTED:
5338 case OS_FILE_INSUFFICIENT_RESOURCE:
5339
5340 os_thread_sleep(100000); /* 100 ms */
5341 return (true);
5342
5343 case OS_FILE_NAME_TOO_LONG:
5344 return (false);
5345
5346 default:
5347
5348 /* If it is an operation that can crash on error then it
5349 is better to ignore on_error_silent and print an error message
5350 to the log. */
5351
5352 if (should_exit || !on_error_silent) {
5353 ib::error(ER_IB_MSG_821)
5354 << "File " << (name != nullptr ? name : "(unknown)") << ": '"
5355 << operation
5356 << "'"
5357 " returned OS error "
5358 << err << "." << (should_exit ? " Cannot continue operation" : "");
5359 }
5360
5361 if (should_exit) {
5362 #ifndef UNIV_HOTBACKUP
5363 srv_fatal_error();
5364 #else /* !UNIV_HOTBACKUP */
5365 ib::fatal(ER_IB_MSG_822) << "Internal error,"
5366 << " cannot continue operation.";
5367 #endif /* !UNIV_HOTBACKUP */
5368 }
5369 }
5370
5371 return (false);
5372 }
5373
5374 /** Does error handling when a file operation fails.
5375 @param[in] name name of a file or NULL
5376 @param[in] operation operation name that failed
5377 @return true if we should retry the operation */
os_file_handle_error(const char * name,const char * operation)5378 static bool os_file_handle_error(const char *name, const char *operation) {
5379 /* Exit in case of unknown error */
5380 return (os_file_handle_error_cond_exit(name, operation, true, false));
5381 }
5382
5383 /** Does error handling when a file operation fails.
5384 @param[in] name name of a file or NULL
5385 @param[in] operation operation name that failed
5386 @param[in] on_error_silent if true then don't print any message to the log.
5387 @return true if we should retry the operation */
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)5388 static bool os_file_handle_error_no_exit(const char *name,
5389 const char *operation,
5390 bool on_error_silent) {
5391 /* Don't exit in case of unknown error */
5392 return (
5393 os_file_handle_error_cond_exit(name, operation, false, on_error_silent));
5394 }
5395
5396 /** Tries to disable OS caching on an opened file descriptor.
5397 @param[in] fd file descriptor to alter
5398 @param[in] file_name file name, used in the diagnostic message
5399 @param[in] operation_name "open" or "create"; used in the diagnostic
5400 message */
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5401 void os_file_set_nocache(int fd MY_ATTRIBUTE((unused)),
5402 const char *file_name MY_ATTRIBUTE((unused)),
5403 const char *operation_name MY_ATTRIBUTE((unused))) {
5404 /* some versions of Solaris may not have DIRECTIO_ON */
5405 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5406 if (directio(fd, DIRECTIO_ON) == -1) {
5407 int errno_save = errno;
5408
5409 ib::error(ER_IB_MSG_823)
5410 << "Failed to set DIRECTIO_ON on file " << file_name << "; "
5411 << operation_name << ": " << strerror(errno_save)
5412 << ","
5413 " continuing anyway.";
5414 }
5415 #elif defined(O_DIRECT)
5416 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5417 int errno_save = errno;
5418 static bool warning_message_printed = false;
5419 if (errno_save == EINVAL) {
5420 if (!warning_message_printed) {
5421 warning_message_printed = true;
5422 #ifdef UNIV_LINUX
5423 ib::warn(ER_IB_MSG_824)
5424 << "Failed to set O_DIRECT on file" << file_name << "; "
5425 << operation_name << ": " << strerror(errno_save)
5426 << ", "
5427 "continuing anyway. O_DIRECT is "
5428 "known to result in 'Invalid argument' "
5429 "on Linux on tmpfs, "
5430 "see MySQL Bug#26662.";
5431 #else /* UNIV_LINUX */
5432 goto short_warning;
5433 #endif /* UNIV_LINUX */
5434 }
5435 } else {
5436 #ifndef UNIV_LINUX
5437 short_warning:
5438 #endif
5439 ib::warn(ER_IB_MSG_825) << "Failed to set O_DIRECT on file " << file_name
5440 << "; " << operation_name << " : "
5441 << strerror(errno_save) << ", continuing anyway.";
5442 }
5443 }
5444 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5445 }
5446
os_file_set_size_fast(const char * name,pfs_os_file_t pfs_file,os_offset_t offset,os_offset_t size,bool read_only,bool flush)5447 bool os_file_set_size_fast(const char *name, pfs_os_file_t pfs_file,
5448 os_offset_t offset, os_offset_t size, bool read_only,
5449 bool flush) {
5450 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX) && \
5451 defined(HAVE_FALLOC_FL_ZERO_RANGE)
5452 ut_a(size >= offset);
5453
5454 static bool print_message = true;
5455
5456 int ret =
5457 fallocate(pfs_file.m_file, FALLOC_FL_ZERO_RANGE, offset, size - offset);
5458
5459 if (ret == 0) {
5460 if (flush) {
5461 return os_file_flush(pfs_file);
5462 }
5463
5464 return true;
5465 }
5466
5467 ut_a(ret == -1);
5468
5469 /* Print the failure message only once for all the redo log files. */
5470 if (print_message) {
5471 ib::info(ER_IB_MSG_1359) << "fallocate() failed with errno " << errno
5472 << " - falling back to writing NULLs.";
5473 print_message = false;
5474 }
5475 #endif /* !NO_FALLOCATE && UNIV_LINUX && HAVE_FALLOC_FL_ZERO_RANGE */
5476
5477 return os_file_set_size(name, pfs_file, offset, size, read_only, flush);
5478 }
5479
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t offset,os_offset_t size,bool read_only,bool flush)5480 bool os_file_set_size(const char *name, pfs_os_file_t file, os_offset_t offset,
5481 os_offset_t size, bool read_only, bool flush) {
5482 /* Write up to FSP_EXTENT_SIZE bytes at a time. */
5483 ulint buf_size = 0;
5484
5485 if (size <= UNIV_PAGE_SIZE) {
5486 buf_size = 1;
5487 } else {
5488 buf_size = ut_min(static_cast<ulint>(64),
5489 static_cast<ulint>(size / UNIV_PAGE_SIZE));
5490 }
5491
5492 ut_ad(buf_size != 0);
5493
5494 buf_size *= UNIV_PAGE_SIZE;
5495
5496 /* Align the buffer for possible raw i/o */
5497 byte *buf2;
5498
5499 buf2 = static_cast<byte *>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
5500
5501 byte *buf = static_cast<byte *>(ut_align(buf2, UNIV_PAGE_SIZE));
5502
5503 /* Write buffer full of zeros */
5504 memset(buf, 0, buf_size);
5505
5506 os_offset_t current_size = offset;
5507
5508 /* Count to check and print progress of file write for file_size > 100 MB. */
5509 uint percentage_count = 10;
5510
5511 while (current_size < size) {
5512 ulint n_bytes;
5513
5514 if (size - current_size < (os_offset_t)buf_size) {
5515 n_bytes = (ulint)(size - current_size);
5516 } else {
5517 n_bytes = buf_size;
5518 }
5519
5520 dberr_t err;
5521 IORequest request(IORequest::WRITE);
5522
5523 #ifdef UNIV_HOTBACKUP
5524
5525 err = os_file_write(request, name, file, buf, current_size, n_bytes);
5526 #else
5527 /* Using AIO_mode::SYNC mode on POSIX systems will result in
5528 fall back to os_file_write/read. On Windows it will use
5529 special mechanism to wait before it returns back. */
5530
5531 err = os_aio(request, AIO_mode::SYNC, name, file, buf, current_size,
5532 n_bytes, read_only, nullptr, nullptr);
5533 #endif /* UNIV_HOTBACKUP */
5534
5535 if (err != DB_SUCCESS) {
5536 ut_free(buf2);
5537 return (false);
5538 }
5539
5540 /* Flush after each os_fsync_threhold bytes */
5541 if (flush && os_fsync_threshold != 0) {
5542 if ((current_size + n_bytes) / os_fsync_threshold !=
5543 current_size / os_fsync_threshold) {
5544 DBUG_EXECUTE_IF("flush_after_reaching_threshold",
5545 std::cerr << os_fsync_threshold
5546 << " bytes being flushed at once"
5547 << std::endl;);
5548
5549 bool ret = os_file_flush(file);
5550
5551 if (!ret) {
5552 ut_free(buf2);
5553 return (false);
5554 }
5555 }
5556 }
5557
5558 /* Print percentage of progress if the size is more than 100MB */
5559 if ((size >> 20) > 100) {
5560 float progress_percentage =
5561 ((float)(current_size + n_bytes) / (float)size) * 100;
5562
5563 if (progress_percentage >= percentage_count) {
5564 ib::info(ER_IB_MSG_1062, name, ulonglong{size >> 20}, percentage_count);
5565 percentage_count += 10;
5566 }
5567 }
5568
5569 current_size += n_bytes;
5570 }
5571
5572 ut_free(buf2);
5573
5574 if (flush) {
5575 return (os_file_flush(file));
5576 }
5577
5578 return (true);
5579 }
5580
5581 /** Truncates a file to a specified size in bytes.
5582 Do nothing if the size to preserve is greater or equal to the current
5583 size of the file.
5584 @param[in] pathname file path
5585 @param[in] file file to be truncated
5586 @param[in] size size to preserve in bytes
5587 @return true if success */
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)5588 bool os_file_truncate(const char *pathname, pfs_os_file_t file,
5589 os_offset_t size) {
5590 /* Do nothing if the size preserved is larger than or equal to the
5591 current size of file */
5592 os_offset_t size_bytes = os_file_get_size(file);
5593
5594 if (size >= size_bytes) {
5595 return (true);
5596 }
5597
5598 #ifdef _WIN32
5599 return (os_file_truncate_win32(pathname, file, size));
5600 #else /* _WIN32 */
5601 return (os_file_truncate_posix(pathname, file, size));
5602 #endif /* _WIN32 */
5603 }
5604
5605 /** Set read/write position of a file handle to specific offset.
5606 @param[in] pathname file path
5607 @param[in] file file handle
5608 @param[in] offset read/write offset
5609 @return true if success */
os_file_seek(const char * pathname,os_file_t file,os_offset_t offset)5610 bool os_file_seek(const char *pathname, os_file_t file, os_offset_t offset) {
5611 bool success = true;
5612
5613 #ifdef _WIN32
5614 LARGE_INTEGER length;
5615
5616 length.QuadPart = offset;
5617
5618 success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);
5619
5620 #else /* _WIN32 */
5621 off_t ret;
5622
5623 ret = lseek(file, offset, SEEK_SET);
5624
5625 if (ret == -1) {
5626 success = false;
5627 }
5628 #endif /* _WIN32 */
5629
5630 if (!success) {
5631 os_file_handle_error_no_exit(pathname, "os_file_set", false);
5632 }
5633
5634 return (success);
5635 }
5636
5637 /** NOTE! Use the corresponding macro os_file_read(), not directly this
5638 function!
5639 Requests a synchronous positioned read operation.
5640 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5641 @param[in] type IO flags
5642 @param[in] file_name file name
5643 @param[in] file handle to an open file
5644 @param[out] buf buffer where to read
5645 @param[in] offset file offset from the start where to read
5646 @param[in] n number of bytes to read, starting from offset
5647 @return DB_SUCCESS or error code */
os_file_read_func(IORequest & type,const char * file_name,os_file_t file,void * buf,os_offset_t offset,ulint n)5648 dberr_t os_file_read_func(IORequest &type, const char *file_name,
5649 os_file_t file, void *buf, os_offset_t offset,
5650 ulint n) {
5651 ut_ad(type.is_read());
5652
5653 return (
5654 os_file_read_page(type, file_name, file, buf, offset, n, nullptr, true));
5655 }
5656
5657 /** NOTE! Use the corresponding macro os_file_read_first_page(), not
5658 directly this function!
5659 Requests a synchronous positioned read operation of page 0 of IBD file
5660 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5661 @param[in] type IO flags
5662 @param[in] file_name file name
5663 @param[in] file handle to an open file
5664 @param[out] buf buffer where to read
5665 @param[in] n number of bytes to read, starting from offset
5666 @return DB_SUCCESS or error code */
os_file_read_first_page_func(IORequest & type,const char * file_name,os_file_t file,void * buf,ulint n)5667 dberr_t os_file_read_first_page_func(IORequest &type, const char *file_name,
5668 os_file_t file, void *buf, ulint n) {
5669 ut_ad(type.is_read());
5670
5671 dberr_t err = os_file_read_page(type, file_name, file, buf, 0,
5672 UNIV_ZIP_SIZE_MIN, nullptr, true);
5673
5674 if (err == DB_SUCCESS) {
5675 uint32_t flags = fsp_header_get_flags(static_cast<byte *>(buf));
5676 const page_size_t page_size(flags);
5677 ut_ad(page_size.physical() <= n);
5678 err = os_file_read_page(type, file_name, file, buf, 0, page_size.physical(),
5679 nullptr, true);
5680 }
5681 return (err);
5682 }
5683
5684 /** copy data from one file to another file using read, write.
5685 @param[in] src_file file handle to copy from
5686 @param[in] src_offset offset to copy from
5687 @param[in] dest_file file handle to copy to
5688 @param[in] dest_offset offset to copy to
5689 @param[in] size number of bytes to copy
5690 @return DB_SUCCESS if successful */
os_file_copy_read_write(os_file_t src_file,os_offset_t src_offset,os_file_t dest_file,os_offset_t dest_offset,uint size)5691 static dberr_t os_file_copy_read_write(os_file_t src_file,
5692 os_offset_t src_offset,
5693 os_file_t dest_file,
5694 os_offset_t dest_offset, uint size) {
5695 dberr_t err;
5696 uint request_size;
5697 const uint BUF_SIZE = 4 * UNIV_SECTOR_SIZE;
5698
5699 char buf[BUF_SIZE + UNIV_SECTOR_SIZE];
5700 char *buf_ptr;
5701
5702 buf_ptr = static_cast<char *>(ut_align(buf, UNIV_SECTOR_SIZE));
5703
5704 IORequest read_request(IORequest::READ);
5705 read_request.disable_compression();
5706 read_request.clear_encrypted();
5707
5708 IORequest write_request(IORequest::WRITE);
5709 write_request.disable_compression();
5710 write_request.clear_encrypted();
5711
5712 while (size > 0) {
5713 if (size > BUF_SIZE) {
5714 request_size = BUF_SIZE;
5715 } else {
5716 request_size = size;
5717 }
5718
5719 err = os_file_read_func(read_request, nullptr, src_file, buf_ptr,
5720 src_offset, request_size);
5721
5722 if (err != DB_SUCCESS) {
5723 return (err);
5724 }
5725 src_offset += request_size;
5726
5727 err = os_file_write_func(write_request, "file copy", dest_file, buf_ptr,
5728 dest_offset, request_size);
5729
5730 if (err != DB_SUCCESS) {
5731 return (err);
5732 }
5733 dest_offset += request_size;
5734 size -= request_size;
5735 }
5736
5737 return (DB_SUCCESS);
5738 }
5739
5740 /** copy data from one file to another file.
5741 @param[in] src_file file handle to copy from
5742 @param[in] src_offset offset to copy from
5743 @param[in] dest_file file handle to copy to
5744 @param[in] dest_offset offset to copy to
5745 @param[in] size number of bytes to copy
5746 @return DB_SUCCESS if successful */
5747 #ifdef __linux__
os_file_copy_func(os_file_t src_file,os_offset_t src_offset,os_file_t dest_file,os_offset_t dest_offset,uint size)5748 dberr_t os_file_copy_func(os_file_t src_file, os_offset_t src_offset,
5749 os_file_t dest_file, os_offset_t dest_offset,
5750 uint size) {
5751 dberr_t err;
5752 static bool use_sendfile = true;
5753
5754 uint actual_size;
5755 int ret_size;
5756
5757 int src_fd;
5758 int dest_fd;
5759
5760 if (!os_file_seek(nullptr, src_file, src_offset)) {
5761 return (DB_IO_ERROR);
5762 }
5763
5764 if (!os_file_seek(nullptr, dest_file, dest_offset)) {
5765 return (DB_IO_ERROR);
5766 }
5767
5768 src_fd = OS_FD_FROM_FILE(src_file);
5769 dest_fd = OS_FD_FROM_FILE(dest_file);
5770
5771 while (use_sendfile && size > 0) {
5772 ret_size = sendfile(dest_fd, src_fd, nullptr, size);
5773
5774 if (ret_size == -1) {
5775 /* Fall through read/write path. */
5776 ib::info(ER_IB_MSG_827) << "sendfile failed to copy data"
5777 " : trying read/write ";
5778
5779 use_sendfile = false;
5780 break;
5781 }
5782
5783 actual_size = static_cast<uint>(ret_size);
5784
5785 ut_ad(size >= actual_size);
5786 size -= actual_size;
5787 }
5788
5789 if (size == 0) {
5790 return (DB_SUCCESS);
5791 }
5792
5793 err = os_file_copy_read_write(src_file, src_offset, dest_file, dest_offset,
5794 size);
5795
5796 return (err);
5797 }
5798 #else
os_file_copy_func(os_file_t src_file,os_offset_t src_offset,os_file_t dest_file,os_offset_t dest_offset,uint size)5799 dberr_t os_file_copy_func(os_file_t src_file, os_offset_t src_offset,
5800 os_file_t dest_file, os_offset_t dest_offset,
5801 uint size) {
5802 dberr_t err;
5803
5804 err = os_file_copy_read_write(src_file, src_offset, dest_file, dest_offset,
5805 size);
5806 return (err);
5807 }
5808 #endif
5809
5810 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
5811 not directly this function!
5812 Requests a synchronous positioned read operation.
5813 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5814 @param[in] type IO flags
5815 @param[in] file_name file name
5816 @param[in] file handle to an open file
5817 @param[out] buf buffer where to read
5818 @param[in] offset file offset from the start where to read
5819 @param[in] n number of bytes to read, starting from offset
5820 @param[out] o number of bytes actually read
5821 @return DB_SUCCESS or error code */
os_file_read_no_error_handling_func(IORequest & type,const char * file_name,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)5822 dberr_t os_file_read_no_error_handling_func(IORequest &type,
5823 const char *file_name,
5824 os_file_t file, void *buf,
5825 os_offset_t offset, ulint n,
5826 ulint *o) {
5827 ut_ad(type.is_read());
5828
5829 return (os_file_read_page(type, file_name, file, buf, offset, n, o, false));
5830 }
5831
5832 /** NOTE! Use the corresponding macro os_file_write(), not directly
5833 Requests a synchronous write operation.
5834 @param[in] type IO flags
5835 @param[in] name name of the file or path as a null-terminated
5836 string
5837 @param[in] file handle to an open file
5838 @param[out] buf buffer from which to write
5839 @param[in] offset file offset from the start where to read
5840 @param[in] n number of bytes to read, starting from offset
5841 @return DB_SUCCESS if request was successful, false if fail */
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)5842 dberr_t os_file_write_func(IORequest &type, const char *name, os_file_t file,
5843 const void *buf, os_offset_t offset, ulint n) {
5844 ut_ad(type.validate());
5845 ut_ad(type.is_write());
5846
5847 /* We never compress the first page.
5848 Note: This assumes we always do block IO. */
5849 if (offset == 0) {
5850 type.clear_compressed();
5851 }
5852
5853 const byte *ptr = reinterpret_cast<const byte *>(buf);
5854
5855 return (os_file_write_page(type, name, file, ptr, offset, n));
5856 }
5857
os_file_status(const char * path,bool * exists,os_file_type_t * type)5858 bool os_file_status(const char *path, bool *exists, os_file_type_t *type) {
5859 #ifdef _WIN32
5860 return (os_file_status_win32(path, exists, type));
5861 #else
5862 return (os_file_status_posix(path, exists, type));
5863 #endif /* _WIN32 */
5864 }
5865
os_file_exists(const char * path)5866 bool os_file_exists(const char *path) {
5867 #ifdef _WIN32
5868 return (os_file_exists_win32(path));
5869 #else
5870 return (os_file_exists_posix(path));
5871 #endif /* _WIN32 */
5872 }
5873
5874 /** Free storage space associated with a section of the file.
5875 @param[in] fh Open file handle
5876 @param[in] off Starting offset (SEEK_SET)
5877 @param[in] len Size of the hole
5878 @return DB_SUCCESS or error code */
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)5879 dberr_t os_file_punch_hole(os_file_t fh, os_offset_t off, os_offset_t len) {
5880 /* In this debugging mode, we act as if punch hole is supported,
5881 and then skip any calls to actually punch a hole here.
5882 In this way, Transparent Page Compression is still being tested. */
5883 DBUG_EXECUTE_IF("ignore_punch_hole", return (DB_SUCCESS););
5884
5885 #ifdef _WIN32
5886 return (os_file_punch_hole_win32(fh, off, len));
5887 #else
5888 return (os_file_punch_hole_posix(fh, off, len));
5889 #endif /* _WIN32 */
5890 }
5891
5892 /** Check if the file system supports sparse files.
5893
5894 Warning: On POSIX systems we try and punch a hole from offset 0 to
5895 the system configured page size. This should only be called on an empty
5896 file.
5897
5898 Note: On Windows we use the name and on Unices we use the file handle.
5899
5900 @param[in] path File name
5901 @param[in] fh File handle for the file - if opened
5902 @return true if the file system supports sparse files */
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)5903 bool os_is_sparse_file_supported(const char *path, pfs_os_file_t fh) {
5904 /* In this debugging mode, we act as if punch hole is supported,
5905 then we skip any calls to actually punch a hole. In this way,
5906 Transparent Page Compression is still being tested. */
5907 DBUG_EXECUTE_IF("ignore_punch_hole", return (true););
5908
5909 #ifdef _WIN32
5910 return (os_is_sparse_file_supported_win32(path));
5911 #else
5912 dberr_t err;
5913
5914 /* We don't know the FS block size, use the sector size. The FS
5915 will do the magic. */
5916 err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
5917
5918 return (err == DB_SUCCESS);
5919 #endif /* _WIN32 */
5920 }
5921
os_get_free_space(const char * path,uint64_t & free_space)5922 dberr_t os_get_free_space(const char *path, uint64_t &free_space) {
5923 #ifdef _WIN32
5924 uint32_t block_size;
5925 auto err = os_get_free_space_win32(path, block_size, free_space);
5926
5927 #else
5928 auto err = os_get_free_space_posix(path, free_space);
5929
5930 #endif /* _WIN32 */
5931 return (err);
5932 }
5933
5934 /** This function returns information about the specified file
5935 @param[in] path pathname of the file
5936 @param[out] stat_info information of a file in a directory
5937 @param[in] check_rw_perm for testing whether the file can be opened
5938 in RW mode
5939 @param[in] read_only true if file is opened in read-only mode
5940 @return DB_SUCCESS if all OK */
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)5941 dberr_t os_file_get_status(const char *path, os_file_stat_t *stat_info,
5942 bool check_rw_perm, bool read_only) {
5943 dberr_t ret;
5944
5945 #ifdef _WIN32
5946 struct _stat64 info;
5947
5948 ret = os_file_get_status_win32(path, stat_info, &info, check_rw_perm,
5949 read_only);
5950
5951 #else
5952 struct stat info;
5953
5954 ret = os_file_get_status_posix(path, stat_info, &info, check_rw_perm,
5955 read_only);
5956
5957 #endif /* _WIN32 */
5958
5959 if (ret == DB_SUCCESS) {
5960 stat_info->ctime = info.st_ctime;
5961 stat_info->atime = info.st_atime;
5962 stat_info->mtime = info.st_mtime;
5963 stat_info->size = info.st_size;
5964 }
5965
5966 return (ret);
5967 }
5968
5969 /** Fill the pages with NULs
5970 @param[in] file File handle
5971 @param[in] name File name
5972 @param[in] page_size physical page size
5973 @param[in] start Offset from the start of the file in bytes
5974 @param[in] len Length in bytes
5975 @param[in] read_only_mode
5976 if true, then read only mode checks are enforced.
5977 @return DB_SUCCESS or error code */
os_file_write_zeros(pfs_os_file_t file,const char * name,ulint page_size,os_offset_t start,ulint len,bool read_only_mode)5978 dberr_t os_file_write_zeros(pfs_os_file_t file, const char *name,
5979 ulint page_size, os_offset_t start, ulint len,
5980 bool read_only_mode) {
5981 ut_a(len > 0);
5982
5983 /* Extend at most 1M at a time */
5984 ulint n_bytes = ut_min(static_cast<ulint>(1024 * 1024), len);
5985
5986 byte *ptr = reinterpret_cast<byte *>(ut_zalloc_nokey(n_bytes + page_size));
5987
5988 byte *buf = reinterpret_cast<byte *>(ut_align(ptr, page_size));
5989
5990 os_offset_t offset = start;
5991 dberr_t err = DB_SUCCESS;
5992 const os_offset_t end = start + len;
5993 IORequest request(IORequest::WRITE);
5994
5995 while (offset < end) {
5996 #ifdef UNIV_HOTBACKUP
5997 err = os_file_write(request, name, file, buf, offset, n_bytes);
5998 #else
5999 err = os_aio(request, AIO_mode::SYNC, name, file, buf, offset, n_bytes,
6000 read_only_mode, NULL, NULL);
6001 #endif /* UNIV_HOTBACKUP */
6002
6003 if (err != DB_SUCCESS) {
6004 break;
6005 }
6006
6007 offset += n_bytes;
6008
6009 n_bytes = ut_min(n_bytes, static_cast<ulint>(end - offset));
6010
6011 DBUG_EXECUTE_IF("ib_crash_during_tablespace_extension", DBUG_SUICIDE(););
6012 }
6013
6014 ut_free(ptr);
6015
6016 return (err);
6017 }
6018
6019 /** Waits for an AIO operation to complete. This function is used to wait the
6020 for completed requests. The aio array of pending requests is divided
6021 into segments. The thread specifies which segment or slot it wants to wait
6022 for. NOTE: this function will also take care of freeing the aio slot,
6023 therefore no other thread is allowed to do the freeing!
6024 @param[in] segment The number of the segment in the aio arrays to
6025 wait for; segment 0 is the ibuf I/O thread,
6026 segment 1 the log I/O thread, then follow the
6027 non-ibuf read threads, and as the last are the
6028 non-ibuf write threads; if this is
6029 ULINT_UNDEFINED, then it means that sync AIO
6030 is used, and this parameter is ignored
6031 @param[out] m1 the messages passed with the AIO request; note
6032 that also in the case where the AIO operation
6033 failed, these output parameters are valid and
6034 can be used to restart the operation,
6035 for example
6036 @param[out] m2 callback message
6037 @param[out] request OS_FILE_WRITE or ..._READ
6038 @return DB_SUCCESS or error code */
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6039 dberr_t os_aio_handler(ulint segment, fil_node_t **m1, void **m2,
6040 IORequest *request) {
6041 dberr_t err;
6042
6043 if (srv_use_native_aio) {
6044 srv_set_io_thread_op_info(segment, "native aio handle");
6045
6046 #ifdef WIN_ASYNC_IO
6047
6048 err = os_aio_windows_handler(segment, 0, m1, m2, request);
6049
6050 #elif defined(LINUX_NATIVE_AIO)
6051
6052 err = os_aio_linux_handler(segment, m1, m2, request);
6053 #else
6054 ut_error;
6055
6056 err = DB_ERROR; /* Eliminate compiler warning */
6057
6058 #endif /* WIN_ASYNC_IO */
6059
6060 } else {
6061 srv_set_io_thread_op_info(segment, "simulated aio handle");
6062
6063 err = os_aio_simulated_handler(segment, m1, m2, request);
6064 }
6065
6066 return (err);
6067 }
6068
6069 /** Constructor
6070 @param[in] id The latch ID
6071 @param[in] n Number of AIO slots
6072 @param[in] segments Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)6073 AIO::AIO(latch_id_t id, ulint n, ulint segments)
6074 : m_slots(n),
6075 m_n_segments(segments),
6076 m_n_reserved()
6077 #ifdef LINUX_NATIVE_AIO
6078 ,
6079 m_aio_ctx(),
6080 m_events(m_slots.size())
6081 #elif defined(_WIN32)
6082 ,
6083 m_handles()
6084 #endif /* LINUX_NATIVE_AIO */
6085 {
6086 ut_a(n > 0);
6087 ut_a(m_n_segments > 0);
6088
6089 mutex_create(id, &m_mutex);
6090
6091 m_not_full = os_event_create();
6092 m_is_empty = os_event_create();
6093
6094 #ifdef LINUX_NATIVE_AIO
6095 memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
6096 #endif /* LINUX_NATIVE_AIO */
6097
6098 os_event_set(m_is_empty);
6099 }
6100
6101 /** Initialise the slots */
init_slots()6102 dberr_t AIO::init_slots() {
6103 for (ulint i = 0; i < m_slots.size(); ++i) {
6104 Slot &slot = m_slots[i];
6105
6106 slot.pos = static_cast<uint16_t>(i);
6107
6108 slot.is_reserved = false;
6109
6110 #ifdef WIN_ASYNC_IO
6111
6112 slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6113
6114 OVERLAPPED *over = &slot.control;
6115
6116 over->hEvent = slot.handle;
6117
6118 (*m_handles)[i] = over->hEvent;
6119
6120 #elif defined(LINUX_NATIVE_AIO)
6121
6122 slot.ret = 0;
6123
6124 slot.n_bytes = 0;
6125
6126 memset(&slot.control, 0x0, sizeof(slot.control));
6127
6128 #endif /* WIN_ASYNC_IO */
6129 }
6130
6131 return (DB_SUCCESS);
6132 }
6133
6134 #ifdef LINUX_NATIVE_AIO
6135 /** Initialise the Linux Native AIO interface */
init_linux_native_aio()6136 dberr_t AIO::init_linux_native_aio() {
6137 /* Initialize the io_context array. One io_context
6138 per segment in the array. */
6139
6140 ut_a(m_aio_ctx == nullptr);
6141
6142 m_aio_ctx = static_cast<io_context **>(
6143 ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));
6144
6145 if (m_aio_ctx == nullptr) {
6146 return (DB_OUT_OF_MEMORY);
6147 }
6148
6149 io_context **ctx = m_aio_ctx;
6150 ulint max_events = slots_per_segment();
6151
6152 for (ulint i = 0; i < m_n_segments; ++i, ++ctx) {
6153 if (!linux_create_io_ctx(max_events, ctx)) {
6154 /* If something bad happened during aio setup
6155 we should call it a day and return right away.
6156 We don't care about any leaks because a failure
6157 to initialize the io subsystem means that the
6158 server (or atleast the innodb storage engine)
6159 is not going to startup. */
6160 return (DB_IO_ERROR);
6161 }
6162 }
6163
6164 return (DB_SUCCESS);
6165 }
6166 #endif /* LINUX_NATIVE_AIO */
6167
6168 /** Initialise the array */
init()6169 dberr_t AIO::init() {
6170 ut_a(!m_slots.empty());
6171
6172 #ifdef _WIN32
6173 ut_a(m_handles == NULL);
6174
6175 m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
6176 #endif /* _WIN32 */
6177
6178 if (srv_use_native_aio) {
6179 #ifdef LINUX_NATIVE_AIO
6180 dberr_t err = init_linux_native_aio();
6181
6182 if (err != DB_SUCCESS) {
6183 return (err);
6184 }
6185
6186 #endif /* LINUX_NATIVE_AIO */
6187 }
6188
6189 return (init_slots());
6190 }
6191
6192 /** Creates an aio wait array. Note that we return NULL in case of failure.
6193 We don't care about freeing memory here because we assume that a
6194 failure will result in server refusing to start up.
6195 @param[in] id Latch ID
6196 @param[in] n maximum number of pending AIO operations
6197 allowed; n must be divisible by m_n_segments
6198 @param[in] n_segments number of segments in the AIO array
6199 @return own: AIO array, NULL on failure */
create(latch_id_t id,ulint n,ulint n_segments)6200 AIO *AIO::create(latch_id_t id, ulint n, ulint n_segments) {
6201 ut_a(n_segments > 0);
6202
6203 if ((n % n_segments)) {
6204 ib::error(ER_IB_MSG_828) << "Maximum number of AIO operations must be "
6205 << "divisible by number of segments";
6206
6207 return (nullptr);
6208 }
6209
6210 AIO *array = UT_NEW_NOKEY(AIO(id, n, n_segments));
6211
6212 if (array != nullptr && array->init() != DB_SUCCESS) {
6213 UT_DELETE(array);
6214
6215 array = nullptr;
6216 }
6217
6218 return (array);
6219 }
6220
6221 /** AIO destructor */
~AIO()6222 AIO::~AIO() {
6223 #ifdef WIN_ASYNC_IO
6224 for (ulint i = 0; i < m_slots.size(); ++i) {
6225 CloseHandle(m_slots[i].handle);
6226 }
6227 #endif /* WIN_ASYNC_IO */
6228
6229 #ifdef _WIN32
6230 UT_DELETE(m_handles);
6231 #endif /* _WIN32 */
6232
6233 mutex_destroy(&m_mutex);
6234
6235 os_event_destroy(m_not_full);
6236 os_event_destroy(m_is_empty);
6237
6238 #if defined(LINUX_NATIVE_AIO)
6239 if (srv_use_native_aio) {
6240 m_events.clear();
6241 ut_free(m_aio_ctx);
6242 }
6243 #endif /* LINUX_NATIVE_AIO */
6244
6245 m_slots.clear();
6246 }
6247
6248 /** Initializes the asynchronous io system. Creates one array each for ibuf
6249 and log i/o. Also creates one array each for read and write where each
6250 array is divided logically into n_readers and n_writers
6251 respectively. The caller must create an i/o handler thread for each
6252 segment in these arrays. This function also creates the sync array.
6253 No i/o handler thread needs to be created for that
6254 @param[in] n_per_seg maximum number of pending aio
6255 operations allowed per segment
6256 @param[in] n_readers number of reader threads
6257 @param[in] n_writers number of writer threads
6258 @param[in] n_slots_sync number of slots in the sync aio array
6259 @return true if the AIO sub-system was started successfully */
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)6260 bool AIO::start(ulint n_per_seg, ulint n_readers, ulint n_writers,
6261 ulint n_slots_sync) {
6262 #if defined(LINUX_NATIVE_AIO)
6263 /* Check if native aio is supported on this system and tmpfs */
6264 if (srv_use_native_aio && !is_linux_native_aio_supported()) {
6265 ib::warn(ER_IB_MSG_829) << "Linux Native AIO disabled.";
6266
6267 srv_use_native_aio = FALSE;
6268 }
6269 #endif /* LINUX_NATIVE_AIO */
6270
6271 srv_reset_io_thread_op_info();
6272
6273 s_reads =
6274 create(LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
6275
6276 if (s_reads == nullptr) {
6277 return false;
6278 }
6279
6280 ulint start = srv_read_only_mode ? 0 : 2;
6281 ulint n_segs = n_readers + start;
6282
6283 #ifndef UNIV_HOTBACKUP
6284 /* 0 is the ibuf segment and 1 is the redo log segment. */
6285 for (ulint i = start; i < n_segs; ++i) {
6286 ut_a(i < SRV_MAX_N_IO_THREADS);
6287 srv_io_thread_function[i] = "read thread";
6288 }
6289 #endif /* !UNIV_HOTBACKUP */
6290
6291 ulint n_segments = n_readers;
6292
6293 if (!srv_read_only_mode) {
6294 s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
6295
6296 if (s_ibuf == nullptr) {
6297 return false;
6298 }
6299
6300 ++n_segments;
6301
6302 #ifndef UNIV_HOTBACKUP
6303 srv_io_thread_function[0] = "insert buffer thread";
6304 #endif /* !UNIV_HOTBACKUP */
6305
6306 s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
6307
6308 if (s_log == nullptr) {
6309 return false;
6310 }
6311
6312 ++n_segments;
6313
6314 #ifndef UNIV_HOTBACKUP
6315 srv_io_thread_function[1] = "log thread";
6316 #endif /* !UNIV_HOTBAKUP */
6317
6318 } else {
6319 s_ibuf = s_log = nullptr;
6320 }
6321
6322 s_writes =
6323 create(LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
6324
6325 if (s_writes == nullptr) {
6326 return false;
6327 }
6328
6329 n_segments += n_writers;
6330
6331 #ifndef UNIV_HOTBACKUP
6332 for (ulint i = start + n_readers; i < n_segments; ++i) {
6333 ut_a(i < SRV_MAX_N_IO_THREADS);
6334 srv_io_thread_function[i] = "write thread";
6335 }
6336 #endif /* !UNIV_HOTBACKUP */
6337
6338 ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
6339
6340 s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
6341
6342 if (s_sync == nullptr) {
6343 return false;
6344 }
6345
6346 os_aio_n_segments = n_segments;
6347
6348 os_aio_validate();
6349
6350 os_aio_segment_wait_events = static_cast<os_event_t *>(
6351 ut_zalloc_nokey(n_segments * sizeof *os_aio_segment_wait_events));
6352
6353 if (os_aio_segment_wait_events == nullptr) {
6354 return false;
6355 }
6356
6357 for (ulint i = 0; i < n_segments; ++i) {
6358 os_aio_segment_wait_events[i] = os_event_create();
6359 }
6360
6361 os_last_printout = ut_time_monotonic();
6362
6363 return true;
6364 }
6365
6366 /** Free the AIO arrays */
shutdown()6367 void AIO::shutdown() {
6368 UT_DELETE(s_ibuf);
6369 s_ibuf = nullptr;
6370
6371 UT_DELETE(s_log);
6372 s_log = nullptr;
6373
6374 UT_DELETE(s_writes);
6375 s_writes = nullptr;
6376
6377 UT_DELETE(s_sync);
6378 s_sync = nullptr;
6379
6380 UT_DELETE(s_reads);
6381 s_reads = nullptr;
6382 }
6383
6384 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6385
6386 /** Max disk sector size */
6387 static const ulint MAX_SECTOR_SIZE = 4096;
6388
6389 /**
6390 Try and get the FusionIO sector size. */
os_fusionio_get_sector_size()6391 void os_fusionio_get_sector_size() {
6392 if (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT ||
6393 srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC) {
6394 ulint sector_size = UNIV_SECTOR_SIZE;
6395 char *path = srv_data_home;
6396 os_file_t check_file;
6397 byte *ptr;
6398 byte *block_ptr;
6399 char current_dir[3];
6400 char *dir_end;
6401 ulint dir_len;
6402 ulint check_path_len;
6403 char *check_file_name;
6404 ssize_t ret;
6405
6406 /* If the srv_data_home is empty, set the path to
6407 current dir. */
6408 if (*path == 0) {
6409 current_dir[0] = FN_CURLIB;
6410 current_dir[1] = FN_LIBCHAR;
6411 current_dir[2] = 0;
6412 path = current_dir;
6413 }
6414
6415 /* Get the path of data file */
6416 dir_end = strrchr(path, OS_PATH_SEPARATOR);
6417 dir_len = dir_end ? dir_end - path : strlen(path);
6418
6419 /* allocate a new path and move the directory path to it. */
6420 check_path_len = dir_len + sizeof "/check_sector_size";
6421 check_file_name = static_cast<char *>(ut_zalloc_nokey(check_path_len));
6422 memcpy(check_file_name, path, dir_len);
6423
6424 /* Construct a check file name. */
6425 strcat(check_file_name + dir_len, "/check_sector_size");
6426
6427 /* Create a tmp file for checking sector size. */
6428 check_file = ::open(check_file_name,
6429 O_CREAT | O_TRUNC | O_WRONLY | O_DIRECT, S_IRWXU);
6430
6431 if (check_file == -1) {
6432 ib::error(ER_IB_MSG_830)
6433 << "Failed to create check sector file, errno:" << errno
6434 << " Please confirm O_DIRECT is"
6435 << " supported and remove the file " << check_file_name
6436 << " if it exists.";
6437 ut_free(check_file_name);
6438 errno = 0;
6439 return;
6440 }
6441
6442 /* Try to write the file with different sector size
6443 alignment. */
6444 ptr = static_cast<byte *>(ut_zalloc_nokey(2 * MAX_SECTOR_SIZE));
6445
6446 while (sector_size <= MAX_SECTOR_SIZE) {
6447 block_ptr = static_cast<byte *>(ut_align(ptr, sector_size));
6448 ret = pwrite(check_file, block_ptr, sector_size, 0);
6449 if (ret > 0 && (ulint)ret == sector_size) {
6450 break;
6451 }
6452 sector_size *= 2;
6453 }
6454
6455 /* The sector size should <= MAX_SECTOR_SIZE. */
6456 ut_ad(sector_size <= MAX_SECTOR_SIZE);
6457
6458 close(check_file);
6459 unlink(check_file_name);
6460
6461 ut_free(check_file_name);
6462 ut_free(ptr);
6463 errno = 0;
6464
6465 os_io_ptr_align = sector_size;
6466 }
6467 }
6468 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6469
6470 /** Creates and initializes block_cache. Creates array of MAX_BLOCKS
6471 and allocates the memory in each block to hold BUFFER_BLOCK_SIZE
6472 of data.
6473
6474 This function is called by InnoDB during srv_start().
6475 It is also called by MEB while applying the redo logs on TDE tablespaces,
6476 the "Blocks" allocated in this block_cache are used to hold the decrypted
6477 page data. */
os_create_block_cache()6478 void os_create_block_cache() {
6479 ut_a(block_cache == nullptr);
6480
6481 block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
6482
6483 for (Blocks::iterator it = block_cache->begin(); it != block_cache->end();
6484 ++it) {
6485 ut_a(it->m_in_use == 0);
6486 ut_a(it->m_ptr == nullptr);
6487
6488 /* Allocate double of max page size memory, since
6489 compress could generate more bytes than orgininal
6490 data. */
6491 it->m_ptr = static_cast<byte *>(ut_malloc_nokey(BUFFER_BLOCK_SIZE));
6492
6493 ut_a(it->m_ptr != nullptr);
6494 }
6495 }
6496
6497 #ifdef UNIV_HOTBACKUP
6498 /** De-allocates block cache at InnoDB shutdown. */
meb_free_block_cache()6499 void meb_free_block_cache() {
6500 if (block_cache == nullptr) {
6501 return;
6502 }
6503
6504 for (Blocks::iterator it = block_cache->begin(); it != block_cache->end();
6505 ++it) {
6506 ut_a(it->m_in_use == 0);
6507 ut_free(it->m_ptr);
6508 }
6509
6510 UT_DELETE(block_cache);
6511
6512 block_cache = nullptr;
6513 }
6514 #endif /* UNIV_HOTBACKUP */
6515
6516 /** Initializes the asynchronous io system. Creates one array each for ibuf
6517 and log i/o. Also creates one array each for read and write where each
6518 array is divided logically into n_readers and n_writers
6519 respectively. The caller must create an i/o handler thread for each
6520 segment in these arrays. This function also creates the sync array.
6521 No i/o handler thread needs to be created for that
6522 @param[in] n_readers number of reader threads
6523 @param[in] n_writers number of writer threads
6524 @param[in] n_slots_sync number of dblwr slots in the sync aio array */
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6525 bool os_aio_init(ulint n_readers, ulint n_writers, ulint n_slots_sync) {
6526 /* Maximum number of pending aio operations allowed per segment */
6527 ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6528
6529 #ifdef _WIN32
6530 if (srv_use_native_aio) {
6531 limit = SRV_N_PENDING_IOS_PER_THREAD;
6532 }
6533 #endif /* _WIN32 */
6534
6535 /* Get sector size for DIRECT_IO. In this case, we need to
6536 know the sector size for aligning the write buffer. */
6537 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6538 os_fusionio_get_sector_size();
6539 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6540
6541 return (AIO::start(limit, n_readers, n_writers, n_slots_sync));
6542 }
6543
6544 /** Frees the asynchronous io system. */
os_aio_free()6545 void os_aio_free() {
6546 AIO::shutdown();
6547
6548 for (ulint i = 0; i < os_aio_n_segments; i++) {
6549 os_event_destroy(os_aio_segment_wait_events[i]);
6550 }
6551
6552 ut_free(os_aio_segment_wait_events);
6553 os_aio_segment_wait_events = nullptr;
6554 os_aio_n_segments = 0;
6555
6556 for (Blocks::iterator it = block_cache->begin(); it != block_cache->end();
6557 ++it) {
6558 ut_a(it->m_in_use == 0);
6559 ut_free(it->m_ptr);
6560 }
6561
6562 UT_DELETE(block_cache);
6563
6564 block_cache = nullptr;
6565 }
6566
6567 /** Wakes up all async i/o threads so that they know to exit themselves in
6568 shutdown. */
os_aio_wake_all_threads_at_shutdown()6569 void os_aio_wake_all_threads_at_shutdown() {
6570 #ifdef WIN_ASYNC_IO
6571
6572 AIO::wake_at_shutdown();
6573
6574 #elif defined(LINUX_NATIVE_AIO)
6575
6576 /* When using native AIO interface the io helper threads
6577 wait on io_getevents with a timeout value of 500ms. At
6578 each wake up these threads check the server status.
6579 No need to do anything to wake them up. */
6580
6581 if (srv_use_native_aio) {
6582 return;
6583 }
6584
6585 #endif /* !WIN_ASYNC_AIO */
6586
6587 /* Fall through to simulated AIO handler wakeup if we are
6588 not using native AIO. */
6589
6590 /* This loop wakes up all simulated ai/o threads */
6591
6592 for (ulint i = 0; i < os_aio_n_segments; ++i) {
6593 os_event_set(os_aio_segment_wait_events[i]);
6594 }
6595 }
6596
6597 /** Waits until there are no pending writes in AIO::s_writes. There can
6598 be other, synchronous, pending writes. */
os_aio_wait_until_no_pending_writes()6599 void os_aio_wait_until_no_pending_writes() {
6600 AIO::wait_until_no_pending_writes();
6601 }
6602
6603 /** Calculates segment number for a slot.
6604 @param[in] array AIO wait array
6605 @param[in] slot slot in this array
6606 @return segment number (which is the number used by, for example,
6607 I/O-handler threads) */
get_segment_no_from_slot(const AIO * array,const Slot * slot)6608 ulint AIO::get_segment_no_from_slot(const AIO *array, const Slot *slot) {
6609 ulint segment;
6610 ulint seg_len;
6611
6612 if (array == s_ibuf) {
6613 ut_ad(!srv_read_only_mode);
6614
6615 segment = IO_IBUF_SEGMENT;
6616
6617 } else if (array == s_log) {
6618 ut_ad(!srv_read_only_mode);
6619
6620 segment = IO_LOG_SEGMENT;
6621
6622 } else if (array == s_reads) {
6623 seg_len = s_reads->slots_per_segment();
6624
6625 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6626 } else {
6627 ut_a(array == s_writes);
6628
6629 seg_len = s_writes->slots_per_segment();
6630
6631 segment = s_reads->m_n_segments + (srv_read_only_mode ? 0 : 2) +
6632 slot->pos / seg_len;
6633 }
6634 return (segment);
6635 }
6636
/** Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled.

The slot is filled in with the request parameters. When native AIO is in
use, the offset is non-zero and the request is a write, the data is
compressed and/or encrypted first (in that order) and the slot is pointed
at the resulting buffer and length. Finally the platform specific control
block of the slot is prepared.

@param[in,out]	type		IO context
@param[in,out]	m1		message to be passed along with the AIO
                                operation
@param[in,out]	m2		message to be passed along with the AIO
                                operation
@param[in]	file		file handle
@param[in]	name		name of the file or path as a NUL-terminated
                                string
@param[in,out]	buf		buffer where to read or from which to write
@param[in]	offset		file offset, where to read from or start writing
@param[in]	len		length of the block to read or write
@return pointer to slot */
Slot *AIO::reserve_slot(IORequest &type, fil_node_t *m1, void *m2,
                        pfs_os_file_t file, const char *name, void *buf,
                        os_offset_t offset, ulint len) {
#ifdef WIN_ASYNC_IO
  /* The length is stored as a DWORD below, so it must fit in 32 bits. */
  ut_a((len & 0xFFFFFFFFUL) == len);
#endif /* WIN_ASYNC_IO */

  /* No need of a mutex. Only reading constant fields */
  ut_ad(type.validate());

  const auto slots_per_seg = slots_per_segment();

  /* We attempt to keep adjacent blocks in the same local
  segment. This can help in merging IO requests when we are
  doing simulated AIO */
  ulint local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;

  /* Loop until the array is observed, with acquire() in effect, to have
  at least one unreserved slot. The loop is left with acquire() still in
  effect. */
  for (;;) {
    acquire();

    if (m_n_reserved != m_slots.size()) {
      break;
    }

    release();

    if (!srv_use_native_aio) {
      /* If the handler threads are suspended,
      wake them so that we get more slots */

      os_aio_simulated_wake_handler_threads();
    }

    os_event_wait(m_not_full);
  }

  ulint counter = 0;
  Slot *slot = nullptr;

  /* We start our search for an available slot from our preferred
  local segment and do a full scan of the array. We are
  guaranteed to find a slot in full scan. */
  for (ulint i = local_seg * slots_per_seg; counter < m_slots.size();
       ++i, ++counter) {
    /* Wrap around at the end of the array. */
    i %= m_slots.size();

    slot = at(i);

    if (slot->is_reserved == false) {
      break;
    }
  }

  /* We MUST always be able to get hold of a free slot. */
  ut_a(counter < m_slots.size());

  ut_a(slot->is_reserved == false);

  ++m_n_reserved;

  /* First reservation: the array is no longer empty. */
  if (m_n_reserved == 1) {
    os_event_reset(m_is_empty);
  }

  /* Last free slot taken: later callers have to wait on m_not_full. */
  if (m_n_reserved == m_slots.size()) {
    os_event_reset(m_not_full);
  }

  slot->is_reserved = true;
  slot->reservation_time = ut_time_monotonic();
  slot->m1 = m1;
  slot->m2 = m2;
  slot->file = file;
  slot->name = name;
#ifdef _WIN32
  slot->len = static_cast<DWORD>(len);
#else
  slot->len = static_cast<ulint>(len);
#endif /* _WIN32 */
  slot->type = type;
  slot->buf = static_cast<byte *>(buf);
  slot->ptr = slot->buf;
  slot->offset = offset;
  slot->err = DB_SUCCESS;
  slot->original_len = static_cast<uint32>(len);
  slot->io_already_done = false;
  slot->buf_block = nullptr;
  slot->encrypt_log_buf = nullptr;
  /* NOTE(review): within this function slot->skip_punch_hole is only
  assigned in the compression branch below; for other requests the slot
  keeps whatever value it had before - confirm that is intended. */

  if (srv_use_native_aio && offset > 0 && type.is_write() &&
      type.is_compressed()) {
    ulint compressed_len = len;

    ut_ad(!type.is_log());

    /* The slot is already marked as reserved, so release() is called
    for the duration of the compression and acquire() again after. */
    release();

    void *src_buf = slot->buf;
    slot->buf_block = os_file_compress_page(type, src_buf, &compressed_len);

    /* src_buf and compressed_len are handed to the compression routine by
    reference/pointer; the slot is pointed at whatever they hold now. */
    slot->buf = static_cast<byte *>(src_buf);
    slot->ptr = slot->buf;
#ifdef _WIN32
    slot->len = static_cast<DWORD>(compressed_len);
#else
    slot->len = static_cast<ulint>(compressed_len);
#endif /* _WIN32 */
    slot->skip_punch_hole = !type.punch_hole();

    acquire();
  }

  /* We do encryption after compression, since if we do encryption
  before compression, the encrypted data will cause compression fail
  or low compression rate. */
  if (srv_use_native_aio && offset > 0 && type.is_write() &&
      type.is_encrypted()) {
    ulint encrypted_len = slot->len;
    file::Block *encrypted_block;
    /* NOTE(review): only given a value by os_file_encrypt_log() below
    (presumably an output parameter - confirm); it is not read on any
    other path. */
    byte *encrypt_log_buf;

    release();

    void *src_buf = slot->buf;
    if (!type.is_log()) {
      encrypted_block = os_file_encrypt_page(type, src_buf, &encrypted_len);

      /* A block left by the compression step is no longer needed once
      the encrypted block replaces it. */
      if (slot->buf_block != nullptr) {
        os_free_block(slot->buf_block);
      }

      slot->buf_block = encrypted_block;
    } else {
      /* Skip encrypt log file header */
      if (offset >= LOG_FILE_HDR_SIZE) {
        encrypted_block =
            os_file_encrypt_log(type, src_buf, encrypt_log_buf, &encrypted_len);

        if (slot->buf_block != nullptr) {
          os_free_block(slot->buf_block);
        }

        slot->buf_block = encrypted_block;

        if (slot->encrypt_log_buf != nullptr) {
          ut_free(slot->encrypt_log_buf);
        }

        slot->encrypt_log_buf = encrypt_log_buf;
      }
    }

    slot->buf = static_cast<byte *>(src_buf);

    slot->ptr = slot->buf;

#ifdef _WIN32
    slot->len = static_cast<DWORD>(encrypted_len);
#else
    slot->len = static_cast<ulint>(encrypted_len);
#endif /* _WIN32 */

    acquire();
  }

#ifdef WIN_ASYNC_IO
  {
    OVERLAPPED *control;

    /* Split the 64-bit file offset into the two 32-bit OVERLAPPED
    fields and reset the slot's event. */
    control = &slot->control;
    control->Offset = (DWORD)offset & 0xFFFFFFFF;
    control->OffsetHigh = (DWORD)(offset >> 32);

    ResetEvent(slot->handle);
  }
#elif defined(LINUX_NATIVE_AIO)

  /* If we are not using native AIO skip this part. */
  if (srv_use_native_aio) {
    off_t aio_offset;

    /* Check if we are dealing with 64 bit arch.
    If not then make sure that offset fits in 32 bits. */
    aio_offset = (off_t)offset;

    ut_a(sizeof(aio_offset) >= sizeof(offset) ||
         ((os_offset_t)aio_offset) == offset);

    auto iocb = &slot->control;

    if (type.is_read()) {
      io_prep_pread(iocb, file.m_file, slot->ptr, slot->len, aio_offset);
    } else {
      ut_ad(type.is_write());
      io_prep_pwrite(iocb, file.m_file, slot->ptr, slot->len, aio_offset);
    }

    /* Store the slot in the control block's user data field. */
    iocb->data = slot;

    slot->n_bytes = 0;
    slot->ret = 0;
  }
#endif /* LINUX_NATIVE_AIO */

  release();

  return (slot);
}
6860
6861 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
6862 @param[in] global_segment The number of the segment in the AIO arrays */
wake_simulated_handler_thread(ulint global_segment)6863 void AIO::wake_simulated_handler_thread(ulint global_segment) {
6864 ut_ad(!srv_use_native_aio);
6865
6866 AIO *array{};
6867
6868 auto segment = get_array_and_local_segment(array, global_segment);
6869
6870 array->wake_simulated_handler_thread(global_segment, segment);
6871 }
6872
6873 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
6874 for a local segment in the AIO array.
6875 @param[in] global_segment The number of the segment in the AIO arrays
6876 @param[in] segment The local segment in the AIO array */
wake_simulated_handler_thread(ulint global_segment,ulint segment)6877 void AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment) {
6878 ut_ad(!srv_use_native_aio);
6879
6880 ulint n = slots_per_segment();
6881 ulint offset = segment * n;
6882
6883 /* Look through n slots after the segment * n'th slot */
6884
6885 acquire();
6886
6887 const Slot *slot = at(offset);
6888
6889 for (ulint i = 0; i < n; ++i, ++slot) {
6890 if (slot->is_reserved) {
6891 /* Found an i/o request */
6892
6893 release();
6894
6895 os_event_set(os_aio_segment_wait_events[global_segment]);
6896
6897 return;
6898 }
6899 }
6900
6901 release();
6902 }
6903
6904 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
os_aio_simulated_wake_handler_threads()6905 void os_aio_simulated_wake_handler_threads() {
6906 if (srv_use_native_aio) {
6907 /* We do not use simulated aio: do nothing */
6908
6909 return;
6910 }
6911
6912 os_aio_recommend_sleep_for_read_threads = false;
6913
6914 for (ulint i = 0; i < os_aio_n_segments; ++i) {
6915 AIO::wake_simulated_handler_thread(i);
6916 }
6917 }
6918
6919 /** Select the IO slot array
6920 @param[in,out] type Type of IO, READ or WRITE
6921 @param[in] read_only true if running in read-only mode
6922 @param[in] aio_mode IO mode
6923 @return slot array or NULL if invalid mode specified */
select_slot_array(IORequest & type,bool read_only,AIO_mode aio_mode)6924 AIO *AIO::select_slot_array(IORequest &type, bool read_only,
6925 AIO_mode aio_mode) {
6926 AIO *array;
6927
6928 ut_ad(type.validate());
6929
6930 switch (aio_mode) {
6931 case AIO_mode::NORMAL:
6932 array = type.is_read() ? AIO::s_reads : AIO::s_writes;
6933 break;
6934
6935 case AIO_mode::IBUF:
6936 ut_ad(type.is_read());
6937
6938 /* Reduce probability of deadlock bugs in connection with ibuf:
6939 do not let the ibuf i/o handler sleep */
6940
6941 type.clear_do_not_wake();
6942
6943 array = read_only ? AIO::s_reads : AIO::s_ibuf;
6944 break;
6945
6946 case AIO_mode::LOG:
6947 array = read_only ? AIO::s_reads : AIO::s_log;
6948 break;
6949
6950 case AIO_mode::SYNC:
6951
6952 array = AIO::s_sync;
6953 #if defined(LINUX_NATIVE_AIO)
6954 /* In Linux native AIO we don't use sync IO array. */
6955 ut_a(!srv_use_native_aio);
6956 #endif /* LINUX_NATIVE_AIO */
6957 break;
6958
6959 default:
6960 ut_error;
6961 }
6962
6963 return (array);
6964 }
6965
6966 #ifdef WIN_ASYNC_IO
6967 /** This function is only used in Windows asynchronous i/o.
6968 Waits for an aio operation to complete. This function is used to wait the
6969 for completed requests. The aio array of pending requests is divided
6970 into segments. The thread specifies which segment or slot it wants to wait
6971 for. NOTE: this function will also take care of freeing the aio slot,
6972 therefore no other thread is allowed to do the freeing!
6973 @param[in] segment The number of the segment in the aio arrays to
6974 wait for; segment 0 is the ibuf I/O thread,
6975 segment 1 the log I/O thread, then follow the
6976 non-ibuf read threads, and as the last are the
6977 non-ibuf write threads; if this is
6978 ULINT_UNDEFINED, then it means that sync AIO
6979 is used, and this parameter is ignored
6980 @param[in] pos this parameter is used only in sync AIO:
6981 wait for the aio slot at this position
6982 @param[out] m1 the messages passed with the AIO request; note
6983 that also in the case where the AIO operation
6984 failed, these output parameters are valid and
6985 can be used to restart the operation,
6986 for example
6987 @param[out] m2 callback message
6988 @param[out] type OS_FILE_WRITE or ..._READ
6989 @return DB_SUCCESS or error code */
os_aio_windows_handler(ulint segment,ulint pos,fil_node_t ** m1,void ** m2,IORequest * type)6990 static dberr_t os_aio_windows_handler(ulint segment, ulint pos, fil_node_t **m1,
6991 void **m2, IORequest *type) {
6992 Slot *slot;
6993 dberr_t err;
6994 AIO *array{};
6995 ulint orig_seg = segment;
6996
6997 if (segment == ULINT_UNDEFINED) {
6998 segment = 0;
6999 array = AIO::sync_array();
7000 } else {
7001 segment = AIO::get_array_and_local_segment(array, segment);
7002 }
7003
7004 /* NOTE! We only access constant fields in os_aio_array. Therefore
7005 we do not have to acquire the protecting mutex yet */
7006
7007 #ifndef UNIV_HOTBACKUP
7008 ut_ad(os_aio_validate_skip());
7009 #endif /* !UNIV_HOTBACKUP */
7010
7011 if (array == AIO::sync_array()) {
7012 WaitForSingleObject(array->at(pos)->handle, INFINITE);
7013
7014 } else {
7015 if (orig_seg != ULINT_UNDEFINED) {
7016 srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
7017 }
7018
7019 pos = WaitForMultipleObjects((DWORD)array->slots_per_segment(),
7020 array->handles(segment), FALSE, INFINITE);
7021 }
7022
7023 array->acquire();
7024
7025 if (
7026 #ifndef UNIV_HOTBACKUP
7027 srv_shutdown_state.load() == SRV_SHUTDOWN_EXIT_THREADS
7028 #else /* !UNIV_HOTBACKUP */
7029 true
7030 #endif /* !UNIV_HOTBACKUP */
7031 && array->is_empty() && !buf_flush_page_cleaner_is_active()) {
7032
7033 *m1 = NULL;
7034 *m2 = NULL;
7035
7036 array->release();
7037
7038 return (DB_SUCCESS);
7039 }
7040
7041 ulint n = array->slots_per_segment();
7042
7043 ut_a(pos >= WAIT_OBJECT_0 && pos <= WAIT_OBJECT_0 + n);
7044
7045 slot = array->at(pos + segment * n);
7046
7047 ut_a(slot->is_reserved);
7048
7049 if (orig_seg != ULINT_UNDEFINED) {
7050 srv_set_io_thread_op_info(orig_seg, "get windows aio return value");
7051 }
7052
7053 BOOL ret;
7054 ret = GetOverlappedResult(slot->file.m_file, &slot->control, &slot->n_bytes,
7055 TRUE);
7056
7057 *m1 = slot->m1;
7058 *m2 = slot->m2;
7059
7060 *type = slot->type;
7061
7062 BOOL retry = FALSE;
7063
7064 if (ret && slot->n_bytes == slot->len) {
7065 err = DB_SUCCESS;
7066
7067 } else if (os_file_handle_error(slot->name, "Windows aio")) {
7068 retry = true;
7069
7070 } else {
7071 err = DB_IO_ERROR;
7072 }
7073
7074 array->release();
7075
7076 if (retry) {
7077 /* Retry failed read/write operation synchronously.
7078 No need to hold array->m_mutex. */
7079
7080 #ifdef UNIV_PFS_IO
7081 /* This read/write does not go through os_file_read
7082 and os_file_write APIs, need to register with
7083 performance schema explicitly here. */
7084 struct PSI_file_locker *locker = NULL;
7085 PSI_file_locker_state state;
7086 register_pfs_file_io_begin(
7087 &state, locker, slot->file, slot->len,
7088 slot->type.is_write() ? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__,
7089 __LINE__);
7090 #endif /* UNIV_PFS_IO */
7091
7092 ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
7093
7094 ssize_t n_bytes = SyncFileIO::execute(slot);
7095
7096 #ifdef UNIV_PFS_IO
7097 register_pfs_file_io_end(locker, slot->len);
7098 #endif /* UNIV_PFS_IO */
7099
7100 if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
7101 /* AIO was queued successfully!
7102 We want a synchronous I/O operation on a
7103 file where we also use async I/O: in Windows
7104 we must use the same wait mechanism as for
7105 async I/O */
7106
7107 BOOL ret;
7108 ret = GetOverlappedResult(slot->file.m_file, &slot->control,
7109 &slot->n_bytes, TRUE);
7110
7111 n_bytes = ret ? slot->n_bytes : -1;
7112 }
7113
7114 err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
7115 }
7116
7117 if (err == DB_SUCCESS) {
7118 err = AIOHandler::post_io_processing(slot);
7119 }
7120
7121 array->release_with_mutex(slot);
7122
7123 return (err);
7124 }
7125 #endif /* WIN_ASYNC_IO */
7126
os_aio_func(IORequest & type,AIO_mode aio_mode,const char * name,pfs_os_file_t file,void * buf,os_offset_t offset,ulint n,bool read_only,fil_node_t * m1,void * m2)7127 dberr_t os_aio_func(IORequest &type, AIO_mode aio_mode, const char *name,
7128 pfs_os_file_t file, void *buf, os_offset_t offset, ulint n,
7129 bool read_only, fil_node_t *m1, void *m2) {
7130 #ifdef WIN_ASYNC_IO
7131 BOOL ret = TRUE;
7132 #endif /* WIN_ASYNC_IO */
7133
7134 ut_ad(n > 0);
7135 ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
7136 ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
7137 #ifndef UNIV_HOTBACKUP
7138 ut_ad(os_aio_validate_skip());
7139 #endif /* !UNIV_HOTBACKUP */
7140
7141 #ifdef WIN_ASYNC_IO
7142 ut_ad((n & 0xFFFFFFFFUL) == n);
7143 #endif /* WIN_ASYNC_IO */
7144
7145 if (aio_mode == AIO_mode::SYNC
7146 #ifdef WIN_ASYNC_IO
7147 && !srv_use_native_aio
7148 #endif /* WIN_ASYNC_IO */
7149 ) {
7150 /* This is actually an ordinary synchronous read or write:
7151 no need to use an i/o-handler thread. NOTE that if we use
7152 Windows async i/o, Windows does not allow us to use
7153 ordinary synchronous os_file_read etc. on the same file,
7154 therefore we have built a special mechanism for synchronous
7155 wait in the Windows case.
7156 Also note that the Performance Schema instrumentation has
7157 been performed by current os_aio_func()'s wrapper function
7158 pfs_os_aio_func(). So we would no longer need to call
7159 Performance Schema instrumented os_file_read() and
7160 os_file_write(). Instead, we should use os_file_read_func()
7161 and os_file_write_func() */
7162
7163 if (type.is_read()) {
7164 return (os_file_read_func(type, name, file.m_file, buf, offset, n));
7165 }
7166
7167 ut_ad(type.is_write());
7168 return (os_file_write_func(type, name, file.m_file, buf, offset, n));
7169 }
7170
7171 try_again:
7172
7173 auto array = AIO::select_slot_array(type, read_only, aio_mode);
7174
7175 auto slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);
7176
7177 if (type.is_read()) {
7178 if (srv_use_native_aio) {
7179 ++os_n_file_reads;
7180
7181 os_bytes_read_since_printout += n;
7182 #ifdef WIN_ASYNC_IO
7183 ret = ReadFile(file.m_file, slot->ptr, slot->len, &slot->n_bytes,
7184 &slot->control);
7185 #elif defined(LINUX_NATIVE_AIO)
7186 if (!array->linux_dispatch(slot)) {
7187 goto err_exit;
7188 }
7189 #endif /* WIN_ASYNC_IO */
7190 } else if (type.is_wake()) {
7191 AIO::wake_simulated_handler_thread(
7192 AIO::get_segment_no_from_slot(array, slot));
7193 }
7194 } else if (type.is_write()) {
7195 if (srv_use_native_aio) {
7196 ++os_n_file_writes;
7197
7198 #ifdef WIN_ASYNC_IO
7199 ret = WriteFile(file.m_file, slot->ptr, slot->len, &slot->n_bytes,
7200 &slot->control);
7201 #elif defined(LINUX_NATIVE_AIO)
7202 if (!array->linux_dispatch(slot)) {
7203 goto err_exit;
7204 }
7205 #endif /* WIN_ASYNC_IO */
7206
7207 } else if (type.is_wake()) {
7208 AIO::wake_simulated_handler_thread(
7209 AIO::get_segment_no_from_slot(array, slot));
7210 }
7211 } else {
7212 ut_error;
7213 }
7214
7215 #ifdef WIN_ASYNC_IO
7216 if (srv_use_native_aio) {
7217 if ((ret && slot->len == slot->n_bytes) ||
7218 (!ret && GetLastError() == ERROR_IO_PENDING)) {
7219 /* AIO was queued successfully! */
7220
7221 if (aio_mode == AIO_mode::SYNC) {
7222 void *dummy_mess2;
7223 IORequest dummy_type;
7224 fil_node_t *dummy_mess1;
7225
7226 /* We want a synchronous i/o operation on a
7227 file where we also use async i/o: in Windows
7228 we must use the same wait mechanism as for
7229 async i/o */
7230
7231 return (os_aio_windows_handler(ULINT_UNDEFINED, slot->pos, &dummy_mess1,
7232 &dummy_mess2, &dummy_type));
7233 }
7234
7235 return (DB_SUCCESS);
7236 }
7237
7238 goto err_exit;
7239 }
7240 #endif /* WIN_ASYNC_IO */
7241
7242 /* AIO request was queued successfully! */
7243 return (DB_SUCCESS);
7244
7245 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
7246 err_exit:
7247 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
7248
7249 array->release_with_mutex(slot);
7250
7251 if (os_file_handle_error(name, type.is_read() ? "aio read" : "aio write")) {
7252 goto try_again;
7253 }
7254
7255 return (DB_IO_ERROR);
7256 }
7257
7258 /** Simulated AIO handler for reaping IO requests */
7259 class SimulatedAIOHandler {
7260 public:
7261 /** Constructor
7262 @param[in,out] array The AIO array
7263 @param[in] segment Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)7264 SimulatedAIOHandler(AIO *array, ulint segment)
7265 : m_oldest(),
7266 m_n_elems(),
7267 m_lowest_offset(IB_UINT64_MAX),
7268 m_array(array),
7269 m_n_slots(),
7270 m_segment(segment),
7271 m_ptr(),
7272 m_buf() {
7273 ut_ad(m_segment < 100);
7274
7275 m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
7276 }
7277
7278 /** Destructor */
~SimulatedAIOHandler()7279 ~SimulatedAIOHandler() {
7280 if (m_ptr != nullptr) {
7281 ut_free(m_ptr);
7282 }
7283 }
7284
7285 /** Reset the state of the handler
7286 @param[in] n_slots Number of pending AIO operations supported */
init(ulint n_slots)7287 void init(ulint n_slots) {
7288 m_oldest = 0;
7289 m_n_elems = 0;
7290 m_n_slots = n_slots;
7291 m_lowest_offset = IB_UINT64_MAX;
7292
7293 if (m_ptr != nullptr) {
7294 ut_free(m_ptr);
7295 m_ptr = m_buf = nullptr;
7296 }
7297
7298 m_slots[0] = nullptr;
7299 }
7300
7301 /** Check if there is a slot for which the i/o has already been done
7302 @param[out] n_reserved Number of reserved slots
7303 @return the first completed slot that is found. */
check_completed(ulint * n_reserved)7304 Slot *check_completed(ulint *n_reserved) {
7305 ulint offset = m_segment * m_n_slots;
7306
7307 *n_reserved = 0;
7308
7309 Slot *slot;
7310
7311 slot = m_array->at(offset);
7312
7313 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7314 if (slot->is_reserved) {
7315 if (slot->io_already_done) {
7316 ut_a(slot->is_reserved);
7317
7318 return (slot);
7319 }
7320
7321 ++*n_reserved;
7322 }
7323 }
7324
7325 return (nullptr);
7326 }
7327
7328 /** If there are at least 2 seconds old requests, then pick the
7329 oldest one to prevent starvation. If several requests have the
7330 same age, then pick the one at the lowest offset.
7331 @return true if request was selected */
select()7332 bool select() {
7333 if (!select_oldest()) {
7334 return (select_lowest_offset());
7335 }
7336
7337 return (true);
7338 }
7339
7340 /** Check if there are several consecutive blocks
7341 to read or write. Merge them if found. */
merge()7342 void merge() {
7343 /* if m_n_elems != 0, then we have assigned
7344 something valid to consecutive_ios[0] */
7345 ut_ad(m_n_elems != 0);
7346 ut_ad(first_slot() != nullptr);
7347
7348 Slot *slot = first_slot();
7349
7350 while (!merge_adjacent(slot)) {
7351 /* No op */
7352 }
7353 }
7354
7355 /** We have now collected n_consecutive I/O requests
7356 in the array; allocate a single buffer which can hold
7357 all data, and perform the I/O
7358 @return the length of the buffer */
allocate_buffer()7359 ulint allocate_buffer() MY_ATTRIBUTE((warn_unused_result)) {
7360 ulint len;
7361 Slot *slot = first_slot();
7362
7363 ut_ad(m_ptr == nullptr);
7364
7365 if (slot->type.is_read() && m_n_elems > 1) {
7366 len = 0;
7367
7368 for (ulint i = 0; i < m_n_elems; ++i) {
7369 len += m_slots[i]->len;
7370 }
7371
7372 m_ptr = static_cast<byte *>(ut_malloc_nokey(len + UNIV_PAGE_SIZE));
7373
7374 m_buf = static_cast<byte *>(ut_align(m_ptr, UNIV_PAGE_SIZE));
7375
7376 } else {
7377 len = first_slot()->len;
7378 m_buf = first_slot()->buf;
7379 }
7380
7381 return (len);
7382 }
7383
7384 /** We have to compress the individual pages and punch
7385 holes in them on a page by page basis when writing to
7386 tables that can be compresed at the IO level.
7387 @param[in] len Value returned by allocate_buffer */
copy_to_buffer(ulint len)7388 void copy_to_buffer(ulint len) {
7389 Slot *slot = first_slot();
7390
7391 if (len > slot->len && slot->type.is_write()) {
7392 byte *ptr = m_buf;
7393
7394 ut_ad(ptr != slot->buf);
7395
7396 /* Copy the buffers to the combined buffer */
7397 for (ulint i = 0; i < m_n_elems; ++i) {
7398 slot = m_slots[i];
7399
7400 memmove(ptr, slot->buf, slot->len);
7401
7402 ptr += slot->len;
7403 }
7404 }
7405 }
7406
7407 /** Do the I/O with ordinary, synchronous i/o functions: */
io()7408 void io() {
7409 if (first_slot()->type.is_write()) {
7410 for (ulint i = 0; i < m_n_elems; ++i) {
7411 write(m_slots[i]);
7412 }
7413
7414 } else {
7415 for (ulint i = 0; i < m_n_elems; ++i) {
7416 read(m_slots[i]);
7417 }
7418 }
7419 }
7420
7421 /** Do the decompression of the pages read in */
io_complete()7422 void io_complete() {
7423 // Note: For non-compressed tables. Not required
7424 // for correctness.
7425 }
7426
7427 /** Mark the i/os done in slots */
done()7428 void done() {
7429 for (ulint i = 0; i < m_n_elems; ++i) {
7430 m_slots[i]->io_already_done = true;
7431 }
7432 }
7433
7434 /** @return the first slot in the consecutive array */
first_slot()7435 Slot *first_slot() MY_ATTRIBUTE((warn_unused_result)) {
7436 ut_a(m_n_elems > 0);
7437
7438 return (m_slots[0]);
7439 }
7440
7441 /** Wait for I/O requests
7442 @param[in] global_segment The global segment
7443 @param[in,out] event Wait on event if no active requests
7444 @return the number of slots */
7445 ulint check_pending(ulint global_segment, os_event_t event)
7446 MY_ATTRIBUTE((warn_unused_result));
7447
7448 private:
7449 /** Do the file read
7450 @param[in,out] slot Slot that has the IO context */
read(Slot * slot)7451 void read(Slot *slot) {
7452 dberr_t err = os_file_read_func(slot->type, slot->name, slot->file.m_file,
7453 slot->ptr, slot->offset, slot->len);
7454 ut_a(err == DB_SUCCESS);
7455 }
7456
7457 /** Do the file write
7458 @param[in,out] slot Slot that has the IO context */
write(Slot * slot)7459 void write(Slot *slot) {
7460 dberr_t err = os_file_write_func(slot->type, slot->name, slot->file.m_file,
7461 slot->ptr, slot->offset, slot->len);
7462 ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
7463 }
7464
7465 /** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7466 bool adjacent(const Slot *s1, const Slot *s2) const {
7467 return (s1 != s2 && s1->file.m_file == s2->file.m_file &&
7468 s2->offset == s1->offset + s1->len && s1->type == s2->type);
7469 }
7470
7471 /** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7472 bool merge_adjacent(Slot *¤t) {
7473 Slot *slot;
7474 ulint offset = m_segment * m_n_slots;
7475
7476 slot = m_array->at(offset);
7477
7478 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7479 if (slot->is_reserved && adjacent(current, slot)) {
7480 current = slot;
7481
7482 /* Found a consecutive i/o request */
7483
7484 m_slots[m_n_elems] = slot;
7485
7486 ++m_n_elems;
7487
7488 return (m_n_elems >= m_slots.capacity());
7489 }
7490 }
7491
7492 return (true);
7493 }
7494
7495 /** There were no old requests. Look for an I/O request at the lowest
7496 offset in the array (we ignore the high 32 bits of the offset in these
7497 heuristics) */
select_lowest_offset()7498 bool select_lowest_offset() {
7499 ut_ad(m_n_elems == 0);
7500
7501 ulint offset = m_segment * m_n_slots;
7502
7503 m_lowest_offset = IB_UINT64_MAX;
7504
7505 for (ulint i = 0; i < m_n_slots; ++i) {
7506 Slot *slot;
7507
7508 slot = m_array->at(i + offset);
7509
7510 if (slot->is_reserved && slot->offset < m_lowest_offset) {
7511 /* Found an i/o request */
7512 m_slots[0] = slot;
7513
7514 m_n_elems = 1;
7515
7516 m_lowest_offset = slot->offset;
7517 }
7518 }
7519
7520 return (m_n_elems > 0);
7521 }
7522
7523 /** Select the slot if it is older than the current oldest slot.
7524 @param[in] slot The slot to check */
select_if_older(Slot * slot)7525 void select_if_older(Slot *slot) {
7526 const auto time_diff = ut_time_monotonic() - slot->reservation_time;
7527
7528 const uint64_t age = time_diff > 0 ? (uint64_t)time_diff : 0;
7529
7530 if ((age >= 2 && age > m_oldest) ||
7531 (age >= 2 && age == m_oldest && slot->offset < m_lowest_offset)) {
7532 /* Found an i/o request */
7533 m_slots[0] = slot;
7534
7535 m_n_elems = 1;
7536
7537 m_oldest = age;
7538
7539 m_lowest_offset = slot->offset;
7540 }
7541 }
7542
7543 /** Select th oldest slot in the array
7544 @return true if oldest slot found */
select_oldest()7545 bool select_oldest() {
7546 ut_ad(m_n_elems == 0);
7547
7548 Slot *slot;
7549 ulint offset = m_n_slots * m_segment;
7550
7551 slot = m_array->at(offset);
7552
7553 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7554 if (slot->is_reserved) {
7555 select_if_older(slot);
7556 }
7557 }
7558
7559 return (m_n_elems > 0);
7560 }
7561
7562 typedef std::vector<Slot *> slots_t;
7563
7564 private:
7565 ulint m_oldest;
7566 ulint m_n_elems;
7567 os_offset_t m_lowest_offset;
7568
7569 AIO *m_array;
7570 ulint m_n_slots;
7571 ulint m_segment;
7572
7573 slots_t m_slots;
7574
7575 byte *m_ptr;
7576 byte *m_buf;
7577 };
7578
7579 /** Wait for I/O requests
7580 @return the number of slots */
check_pending(ulint global_segment,os_event_t event)7581 ulint SimulatedAIOHandler::check_pending(ulint global_segment,
7582 os_event_t event) {
7583 /* NOTE! We only access constant fields in os_aio_array.
7584 Therefore we do not have to acquire the protecting mutex yet */
7585
7586 #ifndef UNIV_HOTBACKUP
7587 ut_ad(os_aio_validate_skip());
7588 #endif /* !UNIV_HOTBACKUP */
7589
7590 ut_ad(m_segment < m_array->get_n_segments());
7591
7592 /* Look through n slots after the segment * n'th slot */
7593
7594 if (AIO::is_read(m_array) && os_aio_recommend_sleep_for_read_threads) {
7595 /* Give other threads chance to add several
7596 I/Os to the array at once. */
7597
7598 srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
7599
7600 os_event_wait(event);
7601
7602 return (0);
7603 }
7604
7605 return (m_array->slots_per_segment());
7606 }
7607
7608 /** Does simulated AIO. This function should be called by an i/o-handler
7609 thread.
7610
7611 @param[in] global_segment The number of the segment in the aio arrays to
7612 wait for; segment 0 is the ibuf i/o thread,
7613 segment 1 the log i/o thread, then follow the
7614 non-ibuf read threads, and as the last are the
7615 non-ibuf write threads
7616 @param[out] m1 the messages passed with the AIO request; note
7617 that also in the case where the AIO operation
7618 failed, these output parameters are valid and
7619 can be used to restart
7620 the operation, for example
7621 @param[out] m2 Callback argument
7622 @param[in] type IO context
7623 @return DB_SUCCESS or error code */
os_aio_simulated_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * type)7624 static dberr_t os_aio_simulated_handler(ulint global_segment, fil_node_t **m1,
7625 void **m2, IORequest *type) {
7626 Slot *slot;
7627 AIO *array{};
7628 os_event_t event = os_aio_segment_wait_events[global_segment];
7629
7630 auto segment = AIO::get_array_and_local_segment(array, global_segment);
7631
7632 SimulatedAIOHandler handler(array, segment);
7633
7634 for (;;) {
7635 srv_set_io_thread_op_info(global_segment, "looking for i/o requests (a)");
7636
7637 ulint n_slots = handler.check_pending(global_segment, event);
7638
7639 if (n_slots == 0) {
7640 continue;
7641 }
7642
7643 handler.init(n_slots);
7644
7645 srv_set_io_thread_op_info(global_segment, "looking for i/o requests (b)");
7646
7647 array->acquire();
7648
7649 ulint n_reserved;
7650
7651 slot = handler.check_completed(&n_reserved);
7652
7653 if (slot != nullptr) {
7654 break;
7655
7656 } else if (n_reserved == 0
7657 #ifndef UNIV_HOTBACKUP
7658 && !buf_flush_page_cleaner_is_active() &&
7659 srv_shutdown_state.load() == SRV_SHUTDOWN_EXIT_THREADS
7660 #endif /* !UNIV_HOTBACKUP */
7661 ) {
7662
7663 /* There is no completed request. If there
7664 are no pending request at all, and the system
7665 is being shut down, exit. */
7666
7667 array->release();
7668
7669 *m1 = nullptr;
7670
7671 *m2 = nullptr;
7672
7673 return (DB_SUCCESS);
7674
7675 } else if (handler.select()) {
7676 break;
7677 }
7678
7679 /* No I/O requested at the moment */
7680
7681 srv_set_io_thread_op_info(global_segment, "resetting wait event");
7682
7683 /* We wait here until tbere are more IO requests
7684 for this segment. */
7685
7686 os_event_reset(event);
7687
7688 array->release();
7689
7690 srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
7691
7692 os_event_wait(event);
7693 }
7694
7695 /** Found a slot that has already completed its IO */
7696
7697 if (slot == nullptr) {
7698 /* Merge adjacent requests */
7699 handler.merge();
7700
7701 /* Check if there are several consecutive blocks
7702 to read or write */
7703
7704 srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
7705
7706 // Note: We don't support write combining for simulated AIO.
7707 // ulint total_len = handler.allocate_buffer();
7708
7709 /* We release the array mutex for the time of the I/O: NOTE that
7710 this assumes that there is just one i/o-handler thread serving
7711 a single segment of slots! */
7712
7713 array->release();
7714
7715 // Note: We don't support write combining for simulated AIO.
7716 // handler.copy_to_buffer(total_len);
7717
7718 srv_set_io_thread_op_info(global_segment, "doing file i/o");
7719
7720 handler.io();
7721
7722 srv_set_io_thread_op_info(global_segment, "file i/o done");
7723
7724 handler.io_complete();
7725
7726 array->acquire();
7727
7728 handler.done();
7729
7730 /* We return the messages for the first slot now, and if there
7731 were several slots, the messages will be returned with
7732 subsequent calls of this function */
7733
7734 slot = handler.first_slot();
7735 }
7736
7737 ut_ad(slot->is_reserved);
7738
7739 *m1 = slot->m1;
7740 *m2 = slot->m2;
7741
7742 *type = slot->type;
7743
7744 array->release(slot);
7745
7746 array->release();
7747
7748 return (DB_SUCCESS);
7749 }
7750
7751 /** Get the total number of pending IOs
7752 @return the total number of pending IOs */
total_pending_io_count()7753 ulint AIO::total_pending_io_count() {
7754 ulint count = s_reads->pending_io_count();
7755
7756 if (s_writes != nullptr) {
7757 count += s_writes->pending_io_count();
7758 }
7759
7760 if (s_ibuf != nullptr) {
7761 count += s_ibuf->pending_io_count();
7762 }
7763
7764 if (s_log != nullptr) {
7765 count += s_log->pending_io_count();
7766 }
7767
7768 if (s_sync != nullptr) {
7769 count += s_sync->pending_io_count();
7770 }
7771
7772 return (count);
7773 }
7774
7775 /** Validates the consistency the aio system.
7776 @return true if ok */
os_aio_validate()7777 static bool os_aio_validate() {
7778 /* The methods countds and validates, we ignore the count. */
7779 AIO::total_pending_io_count();
7780
7781 return (true);
7782 }
7783
7784 /** Prints pending IO requests per segment of an aio array.
7785 We probably don't need per segment statistics but they can help us
7786 during development phase to see if the IO requests are being
7787 distributed as expected.
7788 @param[in,out] file File where to print
7789 @param[in] segments Pending IO array */
print_segment_info(FILE * file,const ulint * segments)7790 void AIO::print_segment_info(FILE *file, const ulint *segments) {
7791 ut_ad(m_n_segments > 0);
7792
7793 if (m_n_segments > 1) {
7794 fprintf(file, " [");
7795
7796 for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
7797 if (i != 0) {
7798 fprintf(file, ", ");
7799 }
7800
7801 fprintf(file, ULINTPF, *segments);
7802 }
7803
7804 fprintf(file, "] ");
7805 }
7806 }
7807
7808 /** Prints info about the aio array.
7809 @param[in,out] file Where to print */
print(FILE * file)7810 void AIO::print(FILE *file) {
7811 ulint count = 0;
7812 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
7813
7814 mutex_enter(&m_mutex);
7815
7816 ut_a(!m_slots.empty());
7817 ut_a(m_n_segments > 0);
7818
7819 memset(n_res_seg, 0x0, sizeof(n_res_seg));
7820
7821 for (ulint i = 0; i < m_slots.size(); ++i) {
7822 Slot &slot = m_slots[i];
7823 ulint segment = (i * m_n_segments) / m_slots.size();
7824
7825 if (slot.is_reserved) {
7826 ++count;
7827
7828 ++n_res_seg[segment];
7829
7830 ut_a(slot.len > 0);
7831 }
7832 }
7833
7834 ut_a(m_n_reserved == count);
7835
7836 print_segment_info(file, n_res_seg);
7837
7838 mutex_exit(&m_mutex);
7839 }
7840
7841 /** Print all the AIO segments
7842 @param[in,out] file Where to print */
print_all(FILE * file)7843 void AIO::print_all(FILE *file) {
7844 s_reads->print(file);
7845
7846 if (s_writes != nullptr) {
7847 fputs(", aio writes:", file);
7848 s_writes->print(file);
7849 }
7850
7851 if (s_ibuf != nullptr) {
7852 fputs(",\n ibuf aio reads:", file);
7853 s_ibuf->print(file);
7854 }
7855
7856 if (s_log != nullptr) {
7857 fputs(", log i/o's:", file);
7858 s_log->print(file);
7859 }
7860
7861 if (s_sync != nullptr) {
7862 fputs(", sync i/o's:", file);
7863 s_sync->print(file);
7864 }
7865 }
7866
7867 /** Prints info of the aio arrays.
7868 @param[in,out] file file where to print */
os_aio_print(FILE * file)7869 void os_aio_print(FILE *file) {
7870 double avg_bytes_read;
7871
7872 #ifndef UNIV_HOTBACKUP
7873 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
7874 fprintf(file, "I/O thread %lu state: %s (%s)", (ulong)i,
7875 srv_io_thread_op_info[i], srv_io_thread_function[i]);
7876
7877 #ifndef _WIN32
7878 if (os_event_is_set(os_aio_segment_wait_events[i])) {
7879 fprintf(file, " ev set");
7880 }
7881 #endif /* _WIN32 */
7882
7883 fprintf(file, "\n");
7884 }
7885 #endif /* !UNIV_HOTBACKUP */
7886
7887 fputs("Pending normal aio reads:", file);
7888
7889 AIO::print_all(file);
7890
7891 putc('\n', file);
7892 const auto current_time = ut_time_monotonic();
7893 const auto time_elapsed = 0.001 + (current_time - os_last_printout);
7894
7895 fprintf(file,
7896 "Pending flushes (fsync) log: " ULINTPF
7897 "; "
7898 "buffer pool: " ULINTPF "\n" ULINTPF " OS file reads, " ULINTPF
7899 " OS file writes, " ULINTPF " OS fsyncs\n",
7900 fil_n_pending_log_flushes, fil_n_pending_tablespace_flushes,
7901 os_n_file_reads, os_n_file_writes, os_n_fsyncs);
7902
7903 if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
7904 fprintf(file, ULINTPF " pending preads, " ULINTPF " pending pwrites\n",
7905 os_n_pending_reads, os_n_pending_writes);
7906 }
7907
7908 if (os_n_file_reads == os_n_file_reads_old) {
7909 avg_bytes_read = 0.0;
7910 } else {
7911 avg_bytes_read = (double)os_bytes_read_since_printout /
7912 (os_n_file_reads - os_n_file_reads_old);
7913 }
7914
7915 fprintf(file,
7916 "%.2f reads/s, %lu avg bytes/read,"
7917 " %.2f writes/s, %.2f fsyncs/s\n",
7918 (os_n_file_reads - os_n_file_reads_old) / time_elapsed,
7919 (ulong)avg_bytes_read,
7920 (os_n_file_writes - os_n_file_writes_old) / time_elapsed,
7921 (os_n_fsyncs - os_n_fsyncs_old) / time_elapsed);
7922
7923 os_n_file_reads_old = os_n_file_reads;
7924 os_n_file_writes_old = os_n_file_writes;
7925 os_n_fsyncs_old = os_n_fsyncs;
7926 os_bytes_read_since_printout = 0;
7927
7928 os_last_printout = current_time;
7929 }
7930
7931 /** Refreshes the statistics used to print per-second averages. */
os_aio_refresh_stats()7932 void os_aio_refresh_stats() {
7933 os_n_fsyncs_old = os_n_fsyncs;
7934
7935 os_bytes_read_since_printout = 0;
7936
7937 os_n_file_reads_old = os_n_file_reads;
7938
7939 os_n_file_writes_old = os_n_file_writes;
7940
7941 os_n_fsyncs_old = os_n_fsyncs;
7942
7943 os_bytes_read_since_printout = 0;
7944
7945 os_last_printout = ut_time_monotonic();
7946 }
7947
7948 /** Checks that all slots in the system have been freed, that is, there are
7949 no pending io operations.
7950 @return true if all free */
os_aio_all_slots_free()7951 bool os_aio_all_slots_free() { return (AIO::total_pending_io_count() == 0); }
7952
7953 #ifdef UNIV_DEBUG
7954 /** Prints all pending IO for the array
7955 @param[in] file file where to print */
to_file(FILE * file) const7956 void AIO::to_file(FILE *file) const {
7957 acquire();
7958
7959 fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
7960
7961 for (ulint i = 0; i < m_slots.size(); ++i) {
7962 const Slot &slot = m_slots[i];
7963
7964 if (slot.is_reserved) {
7965 fprintf(file, "%s IO for %s (offset=" UINT64PF ", size=%lu)\n",
7966 slot.type.is_read() ? "read" : "write", slot.name, slot.offset,
7967 slot.len);
7968 }
7969 }
7970
7971 release();
7972 }
7973
7974 /** Print pending IOs for all arrays */
print_to_file(FILE * file)7975 void AIO::print_to_file(FILE *file) {
7976 fprintf(file, "Pending normal aio reads:");
7977
7978 s_reads->to_file(file);
7979
7980 if (s_writes != nullptr) {
7981 fprintf(file, "Pending normal aio writes:");
7982 s_writes->to_file(file);
7983 }
7984
7985 if (s_ibuf != nullptr) {
7986 fprintf(file, "Pending ibuf aio reads:");
7987 s_ibuf->to_file(file);
7988 }
7989
7990 if (s_log != nullptr) {
7991 fprintf(file, "Pending log i/o's:");
7992 s_log->to_file(file);
7993 }
7994
7995 if (s_sync != nullptr) {
7996 fprintf(file, "Pending sync i/o's:");
7997 s_sync->to_file(file);
7998 }
7999 }
8000
8001 /** Prints all pending IO
8002 @param[in] file File where to print */
os_aio_print_pending_io(FILE * file)8003 void os_aio_print_pending_io(FILE *file) { AIO::print_to_file(file); }
8004
8005 #endif /* UNIV_DEBUG */
8006
8007 /**
8008 Set the file create umask
8009 @param[in] umask The umask to use for file creation. */
os_file_set_umask(ulint umask)8010 void os_file_set_umask(ulint umask) { os_innodb_umask = umask; }
8011
8012 /** Get the file create umask
8013 @return the umask to use for file creation. */
os_file_get_umask()8014 ulint os_file_get_umask() { return (os_innodb_umask); }
8015
8016 /** Check if the path is a directory. The file/directory must exist.
8017 @param[in] path The path to check
8018 @return true if it is a directory */
is_directory(const Path & path)8019 bool Dir_Walker::is_directory(const Path &path) {
8020 os_file_type_t type;
8021 bool exists;
8022
8023 if (os_file_status(path.c_str(), &exists, &type)) {
8024 ut_ad(exists);
8025 ut_ad(type != OS_FILE_TYPE_MISSING);
8026
8027 return (type == OS_FILE_TYPE_DIR);
8028 }
8029
8030 ut_ad(exists || type == OS_FILE_TYPE_FAILED);
8031 ut_ad(type != OS_FILE_TYPE_MISSING);
8032
8033 return (false);
8034 }
8035