1 /***********************************************************************
2
3 Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Percona Inc.
5 Copyright (c) 2013, 2021, MariaDB Corporation.
6
7 Portions of this file contain modifications contributed and copyrighted
8 by Percona Inc.. Those modifications are
9 gratefully acknowledged and are described briefly in the InnoDB
10 documentation. The contributions by Percona Inc. are incorporated with
11 their permission, and subject to the conditions contained in the file
12 COPYING.Percona.
13
14 This program is free software; you can redistribute it and/or modify it
15 under the terms of the GNU General Public License as published by the
16 Free Software Foundation; version 2 of the License.
17
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
21 Public License for more details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
26
27 ***********************************************************************/
28
29 /**************************************************//**
30 @file os/os0file.cc
31 The interface to the operating system file i/o primitives
32
33 Created 10/21/1995 Heikki Tuuri
34 *******************************************************/
35
36 #ifndef UNIV_INNOCHECKSUM
37 #include "os0file.h"
38 #include "sql_const.h"
39
40 #ifdef UNIV_LINUX
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #endif
44
45 #include "srv0srv.h"
46 #include "srv0start.h"
47 #include "fil0fil.h"
48 #include "srv0srv.h"
49 #ifdef HAVE_LINUX_UNISTD_H
50 #include "unistd.h"
51 #endif
52 #include "os0event.h"
53 #include "os0thread.h"
54
55 #include <vector>
56
57 #ifdef LINUX_NATIVE_AIO
58 #include <libaio.h>
59 #endif /* LINUX_NATIVE_AIO */
60
61 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
62 # include <fcntl.h>
63 # include <linux/falloc.h>
64 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
65
66 #if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
67 # include <sys/ioctl.h>
68 # ifndef DFS_IOCTL_ATOMIC_WRITE_SET
69 # define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
70 # endif
71 #endif
72
73 #if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H)
74 #include <sys/statvfs.h>
75 #endif
76
77 #if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H)
78 #include <linux/falloc.h>
79 #endif
80
81 #ifdef _WIN32
82 #include <winioctl.h>
83 #endif
84
85 /** Insert buffer segment id */
86 static const ulint IO_IBUF_SEGMENT = 0;
87
88 /** Log segment id */
89 static const ulint IO_LOG_SEGMENT = 1;
90
91 /** Number of retries for partial I/O's */
92 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
93
94 /* This specifies the file permissions InnoDB uses when it creates files in
95 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
96 my_umask */
97
98 #ifndef _WIN32
99 /** Umask for creating files */
100 static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
101 #else
102 /** Umask for creating files */
103 static ulint os_innodb_umask = 0;
104 static HANDLE data_completion_port;
105 static HANDLE log_completion_port;
106
107 static DWORD fls_sync_io = FLS_OUT_OF_INDEXES;
108 #define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1
109 #endif /* _WIN32 */
110
111 /** In simulated aio, merge at most this many consecutive i/os */
112 static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;
113
114 /** Flag indicating if the page_cleaner is in active state. */
115 extern bool buf_page_cleaner_is_active;
116
117 #ifdef WITH_INNODB_DISALLOW_WRITES
118 #define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
119 #else
120 #define WAIT_ALLOW_WRITES() do { } while (0)
121 #endif /* WITH_INNODB_DISALLOW_WRITES */
122
123 /**********************************************************************
124
125 InnoDB AIO Implementation:
126 =========================
127
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO by special IO-threads servicing the IO-requests.
130
131 Simulated AIO:
132 ==============
133
134 On platforms where we 'simulate' AIO, the following is a rough explanation
135 of the high level design.
136 There are four io-threads (for ibuf, log, read, write).
137 All synchronous IO requests are serviced by the calling thread using
138 os_file_write/os_file_read. The Asynchronous requests are queued up
139 in an array (there are four such arrays) by the calling thread.
140 Later these requests are picked up by the IO-thread and are serviced
141 synchronously.
142
143 Windows native AIO:
144 ==================
145
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
151 There are innodb_file_io_threads helper threads. These threads work
152 on the four arrays mentioned above in Simulated AIO. No thread is
153 required for the sync array.
154 If a synchronous IO request is made, it is first queued in the sync
155 array. Then the calling thread itself waits on the request, thus
156 making the call synchronous.
157 If an AIO request is made the calling thread not only queues it in the
158 array but also submits the requests. The helper thread then collects
159 the completed IO request and calls completion routine on it.
160
161 Linux native AIO:
162 =================
163
164 If we have libaio installed on the system and innodb_use_native_aio
165 is set to true we follow the code path of native AIO, otherwise we
166 do simulated AIO.
167 There are innodb_file_io_threads helper threads. These threads work
168 on the four arrays mentioned above in Simulated AIO.
169 If a synchronous IO request is made, it is handled by calling
170 os_file_write/os_file_read.
171 If an AIO request is made the calling thread not only queues it in the
172 array but also submits the requests. The helper thread then collects
173 the completed IO request and calls completion routine on it.
174
175 **********************************************************************/
176
177
#ifdef UNIV_PFS_IO
/* Performance schema keys under which InnoDB registers its file I/O:
one each for data files, log files and temporary files. */
mysql_pfs_key_t	innodb_data_file_key;
mysql_pfs_key_t	innodb_log_file_key;
mysql_pfs_key_t	innodb_temp_file_key;
#endif /* UNIV_PFS_IO */
184
185 class AIO;
186
187 /** The asynchronous I/O context */
188 struct Slot {
189
190 #ifdef WIN_ASYNC_IO
191 /** Windows control block for the aio request
192 must be at the very start of Slot, so we can
193 cast Slot* to OVERLAPPED*
194 */
195 OVERLAPPED control;
196 #endif
197
198 /** index of the slot in the aio array */
199 uint16_t pos;
200
201 /** true if this slot is reserved */
202 bool is_reserved;
203
204 /** time when reserved */
205 time_t reservation_time;
206
207 /** buffer used in i/o */
208 byte* buf;
209
210 /** Buffer pointer used for actual IO. We advance this
211 when partial IO is required and not buf */
212 byte* ptr;
213
214 /** OS_FILE_READ or OS_FILE_WRITE */
215 IORequest type;
216
217 /** file offset in bytes */
218 os_offset_t offset;
219
220 /** file where to read or write */
221 pfs_os_file_t file;
222
223 /** file name or path */
224 const char* name;
225
226 /** used only in simulated aio: true if the physical i/o
227 already made and only the slot message needs to be passed
228 to the caller of os_aio_simulated_handle */
229 bool io_already_done;
230
231 /*!< file block size */
232 ulint file_block_size;
233
234 /** The file node for which the IO is requested. */
235 fil_node_t* m1;
236
237 /** the requester of an aio operation and which can be used
238 to identify which pending aio operation was completed */
239 void* m2;
240
241 /** AIO completion status */
242 dberr_t err;
243
244 #ifdef WIN_ASYNC_IO
245
246 /** bytes written/read */
247 DWORD n_bytes;
248
249 /** length of the block to read or write */
250 DWORD len;
251
252 /** aio array containing this slot */
253 AIO *array;
254 #elif defined(LINUX_NATIVE_AIO)
255 /** Linux control block for aio */
256 struct iocb control;
257
258 /** AIO return code */
259 int ret;
260
261 /** bytes written/read. */
262 ssize_t n_bytes;
263
264 /** length of the block to read or write */
265 ulint len;
266 #else
267 /** length of the block to read or write */
268 ulint len;
269
270 /** bytes written/read. */
271 ulint n_bytes;
272 #endif /* WIN_ASYNC_IO */
273
274 /** Length of the block before it was compressed */
275 uint32 original_len;
276
277 };
278
279 /** The asynchronous i/o array structure */
280 class AIO {
281 public:
282 /** Constructor
283 @param[in] id Latch ID
284 @param[in] n_slots Number of slots to configure
285 @param[in] segments Number of segments to configure */
286 AIO(latch_id_t id, ulint n_slots, ulint segments);
287
288 /** Destructor */
289 ~AIO();
290
291 /** Initialize the instance
292 @return DB_SUCCESS or error code */
293 dberr_t init();
294
295 /** Requests for a slot in the aio array. If no slot is available, waits
296 until not_full-event becomes signaled.
297
298 @param[in] type IO context
299 @param[in,out] m1 message to be passed along with the AIO
300 operation
301 @param[in,out] m2 message to be passed along with the AIO
302 operation
303 @param[in] file file handle
304 @param[in] name name of the file or path as a null-terminated
305 string
306 @param[in,out] buf buffer where to read or from which to write
307 @param[in] offset file offset, where to read from or start writing
308 @param[in] len length of the block to read or write
309 @return pointer to slot */
310 Slot* reserve_slot(
311 const IORequest& type,
312 fil_node_t* m1,
313 void* m2,
314 pfs_os_file_t file,
315 const char* name,
316 void* buf,
317 os_offset_t offset,
318 ulint len)
319 MY_ATTRIBUTE((warn_unused_result));
320
321 /** @return number of reserved slots */
322 ulint pending_io_count() const;
323
324 /** Returns a pointer to the nth slot in the aio array.
325 @param[in] index Index of the slot in the array
326 @return pointer to slot */
at(ulint i) const327 const Slot* at(ulint i) const
328 MY_ATTRIBUTE((warn_unused_result))
329 {
330 ut_a(i < m_slots.size());
331
332 return(&m_slots[i]);
333 }
334
335 /** Non const version */
at(ulint i)336 Slot* at(ulint i)
337 MY_ATTRIBUTE((warn_unused_result))
338 {
339 ut_a(i < m_slots.size());
340
341 return(&m_slots[i]);
342 }
343
344 /** Frees a slot in the AIO array, assumes caller owns the mutex.
345 @param[in,out] slot Slot to release */
346 void release(Slot* slot);
347
348 /** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
349 @param[in,out] slot Slot to release */
350 void release_with_mutex(Slot* slot);
351
352 /** Prints info about the aio array.
353 @param[in,out] file Where to print */
354 void print(FILE* file);
355
356 /** @return the number of slots per segment */
slots_per_segment() const357 ulint slots_per_segment() const
358 MY_ATTRIBUTE((warn_unused_result))
359 {
360 return(m_slots.size() / m_n_segments);
361 }
362
363 /** @return accessor for n_segments */
get_n_segments() const364 ulint get_n_segments() const
365 MY_ATTRIBUTE((warn_unused_result))
366 {
367 return(m_n_segments);
368 }
369
370 #ifdef UNIV_DEBUG
371 /** @return true if the thread owns the mutex */
is_mutex_owned() const372 bool is_mutex_owned() const
373 MY_ATTRIBUTE((warn_unused_result))
374 {
375 return(mutex_own(&m_mutex));
376 }
377 #endif /* UNIV_DEBUG */
378
379 /** Acquire the mutex */
acquire() const380 void acquire() const
381 {
382 mutex_enter(&m_mutex);
383 }
384
385 /** Release the mutex */
release() const386 void release() const
387 {
388 mutex_exit(&m_mutex);
389 }
390
391 /** Write out the state to the file/stream
392 @param[in, out] file File to write to */
393 void to_file(FILE* file) const;
394
395 #ifdef LINUX_NATIVE_AIO
396 /** Dispatch an AIO request to the kernel.
397 @param[in,out] slot an already reserved slot
398 @return true on success. */
399 bool linux_dispatch(Slot* slot)
400 MY_ATTRIBUTE((warn_unused_result));
401
402 /** Accessor for an AIO event
403 @param[in] index Index into the array
404 @return the event at the index */
io_events(ulint index)405 io_event* io_events(ulint index)
406 MY_ATTRIBUTE((warn_unused_result))
407 {
408 ut_a(index < m_events.size());
409
410 return(&m_events[index]);
411 }
412
413 /** Accessor for the AIO context
414 @param[in] segment Segment for which to get the context
415 @return the AIO context for the segment */
io_ctx(ulint segment)416 io_context_t io_ctx(ulint segment)
417 MY_ATTRIBUTE((warn_unused_result))
418 {
419 ut_ad(segment < get_n_segments());
420
421 return(m_aio_ctx[segment]);
422 }
423
424 /** Creates an io_context_t for native linux AIO.
425 @param[in] max_events number of events
426 @param[out] io_ctx io_ctx to initialize.
427 @return true on success. */
428 static bool linux_create_io_ctx(unsigned max_events, io_context_t& io_ctx)
429 MY_ATTRIBUTE((warn_unused_result));
430
431 /** Checks if the system supports native linux aio. On some kernel
432 versions where native aio is supported it won't work on tmpfs. In such
433 cases we can't use native aio as it is not possible to mix simulated
434 and native aio.
435 @return true if supported, false otherwise. */
436 static bool is_linux_native_aio_supported()
437 MY_ATTRIBUTE((warn_unused_result));
438 #endif /* LINUX_NATIVE_AIO */
439
440 #ifdef WIN_ASYNC_IO
441 HANDLE m_completion_port;
442 /** Wake up all AIO threads in Windows native aio */
wake_at_shutdown()443 static void wake_at_shutdown() {
444 AIO *all_arrays[] = {s_reads, s_writes, s_log, s_ibuf };
445 for (size_t i = 0; i < array_elements(all_arrays); i++) {
446 AIO *a = all_arrays[i];
447 if (a) {
448 PostQueuedCompletionStatus(a->m_completion_port, 0,
449 IOCP_SHUTDOWN_KEY, 0);
450 }
451 }
452 }
453 #endif /* WIN_ASYNC_IO */
454
455 #ifdef _WIN32
456 /** This function can be called if one wants to post a batch of reads
457 and prefers an I/O - handler thread to handle them all at once later.You
458 must call os_aio_simulated_wake_handler_threads later to ensure the
459 threads are not left sleeping! */
460 static void simulated_put_read_threads_to_sleep();
461 #endif /* _WIN32 */
462
463 /** Create an instance using new(std::nothrow)
464 @param[in] id Latch ID
465 @param[in] n_slots The number of AIO request slots
466 @param[in] segments The number of segments
467 @return a new AIO instance */
468 static AIO* create(
469 latch_id_t id,
470 ulint n_slots,
471 ulint segments)
472 MY_ATTRIBUTE((warn_unused_result));
473
474 /** Initializes the asynchronous io system. Creates one array each
475 for ibuf and log I/O. Also creates one array each for read and write
476 where each array is divided logically into n_readers and n_writers
477 respectively. The caller must create an i/o handler thread for each
478 segment in these arrays. This function also creates the sync array.
479 No I/O handler thread needs to be created for that
480 @param[in] n_per_seg maximum number of pending aio
481 operations allowed per segment
482 @param[in] n_readers number of reader threads
483 @param[in] n_writers number of writer threads
484 @param[in] n_slots_sync number of slots in the sync aio array
485 @return true if AIO sub-system was started successfully */
486 static bool start(
487 ulint n_per_seg,
488 ulint n_readers,
489 ulint n_writers,
490 ulint n_slots_sync)
491 MY_ATTRIBUTE((warn_unused_result));
492
493 /** Free the AIO arrays */
494 static void shutdown();
495
496 /** Print all the AIO segments
497 @param[in,out] file Where to print */
498 static void print_all(FILE* file);
499
500 /** Calculates local segment number and aio array from global
501 segment number.
502 @param[out] array AIO wait array
503 @param[in] segment global segment number
504 @return local segment number within the aio array */
505 static ulint get_array_and_local_segment(
506 AIO** array,
507 ulint segment)
508 MY_ATTRIBUTE((warn_unused_result));
509
510 /** Select the IO slot array
511 @param[in,out] type Type of IO, READ or WRITE
512 @param[in] read_only true if running in read-only mode
513 @param[in] mode IO mode
514 @return slot array or NULL if invalid mode specified */
515 static AIO* select_slot_array(
516 IORequest& type,
517 bool read_only,
518 ulint mode)
519 MY_ATTRIBUTE((warn_unused_result));
520
521 /** Calculates segment number for a slot.
522 @param[in] array AIO wait array
523 @param[in] slot slot in this array
524 @return segment number (which is the number used by, for example,
525 I/O handler threads) */
526 static ulint get_segment_no_from_slot(
527 const AIO* array,
528 const Slot* slot)
529 MY_ATTRIBUTE((warn_unused_result));
530
531 /** Wakes up a simulated AIO I/O-handler thread if it has something
532 to do.
533 @param[in] global_segment the number of the segment in the
534 AIO arrays */
535 static void wake_simulated_handler_thread(ulint global_segment);
536
537 /** Check if it is a read request
538 @param[in] aio The AIO instance to check
539 @return true if the AIO instance is for reading. */
is_read(const AIO * aio)540 static bool is_read(const AIO* aio)
541 MY_ATTRIBUTE((warn_unused_result))
542 {
543 return(s_reads == aio);
544 }
545
546 /** Wait on an event until no pending writes */
wait_until_no_pending_writes()547 static void wait_until_no_pending_writes()
548 {
549 os_event_wait(AIO::s_writes->m_is_empty);
550 }
551
552 /** Print to file
553 @param[in] file File to write to */
554 static void print_to_file(FILE* file);
555
556 /** Check for pending IO. Gets the count and also validates the
557 data structures.
558 @return count of pending IO requests */
559 static ulint total_pending_io_count();
560
561 private:
562 /** Initialise the slots
563 @return DB_SUCCESS or error code */
564 dberr_t init_slots()
565 MY_ATTRIBUTE((warn_unused_result));
566
567 /** Wakes up a simulated AIO I/O-handler thread if it has something
568 to do for a local segment in the AIO array.
569 @param[in] global_segment the number of the segment in the
570 AIO arrays
571 @param[in] segment the local segment in the AIO array */
572 void wake_simulated_handler_thread(ulint global_segment, ulint segment);
573
574 /** Prints pending IO requests per segment of an aio array.
575 We probably don't need per segment statistics but they can help us
576 during development phase to see if the IO requests are being
577 distributed as expected.
578 @param[in,out] file file where to print
579 @param[in] segments pending IO array */
580 void print_segment_info(
581 FILE* file,
582 const ulint* segments);
583
584 #ifdef LINUX_NATIVE_AIO
585 /** Initialise the Linux native AIO data structures
586 @return DB_SUCCESS or error code */
587 dberr_t init_linux_native_aio()
588 MY_ATTRIBUTE((warn_unused_result));
589 #endif /* LINUX_NATIVE_AIO */
590
591 private:
592 typedef std::vector<Slot> Slots;
593
594 /** the mutex protecting the aio array */
595 mutable SysMutex m_mutex;
596
597 /** Pointer to the slots in the array.
598 Number of elements must be divisible by n_threads. */
599 Slots m_slots;
600
601 /** Number of segments in the aio array of pending aio requests.
602 A thread can wait separately for any one of the segments. */
603 ulint m_n_segments;
604
605 /** The event which is set to the signaled state when
606 there is space in the aio outside the ibuf segment;
607 os_event_set() and os_event_reset() are protected by AIO::m_mutex */
608 os_event_t m_not_full;
609
610 /** The event which is set to the signaled state when
611 there are no pending i/os in this array;
612 os_event_set() and os_event_reset() are protected by AIO::m_mutex */
613 os_event_t m_is_empty;
614
615 /** Number of reserved slots in the AIO array outside
616 the ibuf segment */
617 ulint m_n_reserved;
618
619
620 #if defined(LINUX_NATIVE_AIO)
621 typedef std::vector<io_event> IOEvents;
622
623 /** completion queue for IO. There is one such queue per
624 segment. Each thread will work on one ctx exclusively. */
625 std::vector<io_context_t> m_aio_ctx;
626
627 /** The array to collect completed IOs. There is one such
628 event for each possible pending IO. The size of the array
629 is equal to m_slots.size(). */
630 IOEvents m_events;
631 #endif /* LINUX_NATIV_AIO */
632
633 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
634 sync AIO. These are NULL when the module has not yet been
635 initialized. */
636
637 /** Insert buffer */
638 static AIO* s_ibuf;
639
640 /** Redo log */
641 static AIO* s_log;
642
643 /** Reads */
644 static AIO* s_reads;
645
646 /** Writes */
647 static AIO* s_writes;
648
649 /** Synchronous I/O */
650 static AIO* s_sync;
651 };
652
653 /** Static declarations */
654 AIO* AIO::s_reads;
655 AIO* AIO::s_writes;
656 AIO* AIO::s_ibuf;
657 AIO* AIO::s_log;
658 AIO* AIO::s_sync;
659
660 #if defined(LINUX_NATIVE_AIO)
661 /** timeout for each io_getevents() call = 500ms. */
662 static const ulint OS_AIO_REAP_TIMEOUT = 500000000UL;
663
664 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
665 static const ulint OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
666
667 /** number of attempts before giving up on io_setup(). */
668 static const int OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
669 #endif /* LINUX_NATIVE_AIO */
670
671 /** Array of events used in simulated AIO */
672 static os_event_t* os_aio_segment_wait_events;
673
674 /** Number of asynchronous I/O segments. Set by os_aio_init(). */
675 static ulint os_aio_n_segments = ULINT_UNDEFINED;
676
677 /** If the following is true, read i/o handler threads try to
678 wait until a batch of new read requests have been posted */
679 static bool os_aio_recommend_sleep_for_read_threads;
680
681 ulint os_n_file_reads;
682 static ulint os_bytes_read_since_printout;
683 ulint os_n_file_writes;
684 ulint os_n_fsyncs;
685 static ulint os_n_file_reads_old;
686 static ulint os_n_file_writes_old;
687 static ulint os_n_fsyncs_old;
688
689 static time_t os_last_printout;
690 bool os_has_said_disk_full;
691
692 /** Default Zip compression level */
693 extern uint page_zip_level;
694
695 /** Validates the consistency of the aio system.
696 @return true if ok */
697 static
698 bool
699 os_aio_validate();
700
701 /** Handle errors for file operations.
702 @param[in] name name of a file or NULL
703 @param[in] operation operation
704 @param[in] should_abort whether to abort on an unknown error
705 @param[in] on_error_silent whether to suppress reports of non-fatal errors
706 @return true if we should retry the operation */
707 static MY_ATTRIBUTE((warn_unused_result))
708 bool
709 os_file_handle_error_cond_exit(
710 const char* name,
711 const char* operation,
712 bool should_abort,
713 bool on_error_silent);
714
715 /** Does error handling when a file operation fails.
716 @param[in] name name of a file or NULL
717 @param[in] operation operation name that failed
718 @return true if we should retry the operation */
719 static
720 bool
os_file_handle_error(const char * name,const char * operation)721 os_file_handle_error(
722 const char* name,
723 const char* operation)
724 {
725 /* Exit in case of unknown error */
726 return(os_file_handle_error_cond_exit(name, operation, true, false));
727 }
728
729 /** Does error handling when a file operation fails.
730 @param[in] name name of a file or NULL
731 @param[in] operation operation name that failed
732 @param[in] on_error_silent if true then don't print any message to the log.
733 @return true if we should retry the operation */
734 static
735 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)736 os_file_handle_error_no_exit(
737 const char* name,
738 const char* operation,
739 bool on_error_silent)
740 {
741 /* Don't exit in case of unknown error */
742 return(os_file_handle_error_cond_exit(
743 name, operation, false, on_error_silent));
744 }
745
746 /** Handle RENAME error.
747 @param name old name of the file
748 @param new_name new name of the file */
os_file_handle_rename_error(const char * name,const char * new_name)749 static void os_file_handle_rename_error(const char* name, const char* new_name)
750 {
751 if (os_file_get_last_error(true) != OS_FILE_DISK_FULL) {
752 ib::error() << "Cannot rename file '" << name << "' to '"
753 << new_name << "'";
754 } else if (!os_has_said_disk_full) {
755 os_has_said_disk_full = true;
756 /* Disk full error is reported irrespective of the
757 on_error_silent setting. */
758 ib::error() << "Full disk prevents renaming file '"
759 << name << "' to '" << new_name << "'";
760 }
761 }
762
763 /** Does simulated AIO. This function should be called by an i/o-handler
764 thread.
765
766 @param[in] segment The number of the segment in the aio arrays to wait
767 for; segment 0 is the ibuf i/o thread, segment 1 the
768 log i/o thread, then follow the non-ibuf read threads,
769 and as the last are the non-ibuf write threads
770 @param[out] m1 the messages passed with the AIO request; note that
771 also in the case where the AIO operation failed, these
772 output parameters are valid and can be used to restart
773 the operation, for example
774 @param[out] m2 Callback argument
775 @param[in] type IO context
776 @return DB_SUCCESS or error code */
777 static
778 dberr_t
779 os_aio_simulated_handler(
780 ulint global_segment,
781 fil_node_t** m1,
782 void** m2,
783 IORequest* type);
784
#ifdef _WIN32
static HANDLE win_get_syncio_event();

/**
Wrapper around Windows DeviceIoControl() function.

Works synchronously, also in case for handle opened
for async access (i.e with FILE_FLAG_OVERLAPPED).

Accepts the same parameters as DeviceIoControl(), except
last parameter (OVERLAPPED): the wrapper supplies its own OVERLAPPED
whose event comes from win_get_syncio_event().

@return the result of GetOverlappedResult() when the request was
started (or completed immediately), otherwise the failed
DeviceIoControl() result
*/
static
BOOL
os_win32_device_io_control(
	HANDLE handle,
	DWORD code,
	LPVOID inbuf,
	DWORD inbuf_size,
	LPVOID outbuf,
	DWORD outbuf_size,
	LPDWORD bytes_returned
)
{
	OVERLAPPED	ov = { 0 };

	ov.hEvent = win_get_syncio_event();

	const BOOL	started = DeviceIoControl(
		handle, code, inbuf, inbuf_size, outbuf,
		outbuf_size, NULL, &ov);

	if (!started && GetLastError() != ERROR_IO_PENDING) {
		/* The request could not even be queued. */
		return started;
	}

	/* Wait for async io to complete */
	return GetOverlappedResult(handle, &ov, bytes_returned, TRUE);
}

#endif
823
824 /***********************************************************************//**
825 Try to get number of bytes per sector from file system.
826 @return file block size */
827 UNIV_INTERN
828 ulint
os_file_get_block_size(os_file_t file,const char * name)829 os_file_get_block_size(
830 /*===================*/
831 os_file_t file, /*!< in: handle to a file */
832 const char* name) /*!< in: file name */
833 {
834 ulint fblock_size = 512;
835
836 #if defined(UNIV_LINUX)
837 struct stat local_stat;
838 int err;
839
840 err = fstat((int)file, &local_stat);
841
842 if (err != 0) {
843 os_file_handle_error_no_exit(name, "fstat()", FALSE);
844 } else {
845 fblock_size = local_stat.st_blksize;
846 }
847 #endif /* UNIV_LINUX */
848 #ifdef _WIN32
849
850 fblock_size = 0;
851 BOOL result = false;
852 size_t len = 0;
853 // Open volume for this file, find out it "physical bytes per sector"
854
855 HANDLE volume_handle = INVALID_HANDLE_VALUE;
856 char volume[MAX_PATH + 4]="\\\\.\\"; // Special prefix required for volume names.
857 if (!GetVolumePathName(name , volume + 4, MAX_PATH)) {
858 os_file_handle_error_no_exit(name,
859 "GetVolumePathName()", FALSE);
860 goto end;
861 }
862
863 len = strlen(volume);
864 if (volume[len - 1] == '\\') {
865 // Trim trailing backslash from volume name.
866 volume[len - 1] = 0;
867 }
868
869 volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES,
870 FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
871 0, OPEN_EXISTING, 0, 0);
872
873 if (volume_handle == INVALID_HANDLE_VALUE) {
874 if (GetLastError() != ERROR_ACCESS_DENIED) {
875 os_file_handle_error_no_exit(volume,
876 "CreateFile()", FALSE);
877 }
878 goto end;
879 }
880
881 DWORD tmp;
882 STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment;
883
884 STORAGE_PROPERTY_QUERY storage_query;
885 memset(&storage_query, 0, sizeof(storage_query));
886 storage_query.PropertyId = StorageAccessAlignmentProperty;
887 storage_query.QueryType = PropertyStandardQuery;
888
889 result = os_win32_device_io_control(volume_handle,
890 IOCTL_STORAGE_QUERY_PROPERTY,
891 &storage_query,
892 sizeof(storage_query),
893 &disk_alignment,
894 sizeof(disk_alignment),
895 &tmp);
896
897 if (!result) {
898 DWORD err = GetLastError();
899 if (err != ERROR_INVALID_FUNCTION && err != ERROR_NOT_SUPPORTED) {
900 os_file_handle_error_no_exit(volume,
901 "DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)", FALSE);
902 }
903 goto end;
904 }
905
906 fblock_size = disk_alignment.BytesPerPhysicalSector;
907
908 end:
909 if (volume_handle != INVALID_HANDLE_VALUE) {
910 CloseHandle(volume_handle);
911 }
912 #endif /* _WIN32 */
913
914 /* Currently we support file block size up to 4Kb */
915 if (fblock_size > 4096 || fblock_size < 512) {
916 if (fblock_size < 512) {
917 fblock_size = 512;
918 } else {
919 fblock_size = 4096;
920 }
921 }
922
923 return fblock_size;
924 }
925
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete, i.e. it is used to wait for
completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to
wait for. NOTE: this function will also take care of freeing the aio
slot, therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays
				to wait for; segment 0 is the ibuf I/O
				thread, segment 1 the log I/O thread, then
				follow the non-ibuf read threads, and as the
				last are the non-ibuf write threads; if this
				is ULINT_UNDEFINED, then it means that sync
				AIO is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request;
				note that also in the case where the AIO
				operation failed, these output parameters
				are valid and can be used to restart the
				operation, for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
#endif /* WIN_ASYNC_IO */
959
960 /** Generic AIO Handler methods. Currently handles IO post processing. */
961 class AIOHandler {
962 public:
963 /** Do any post processing after a read/write
964 @return DB_SUCCESS or error code. */
965 static dberr_t post_io_processing(Slot* slot);
966 };
967
968 /** Helper class for doing synchronous file IO. Currently, the objective
969 is to hide the OS specific code, so that the higher level functions aren't
970 peppered with #ifdef. Makes the code flow difficult to follow. */
971 class SyncFileIO {
972 public:
973 /** Constructor
974 @param[in] fh File handle
975 @param[in,out] buf Buffer to read/write
976 @param[in] n Number of bytes to read/write
977 @param[in] offset Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)978 SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
979 :
980 m_fh(fh),
981 m_buf(buf),
982 m_n(static_cast<ssize_t>(n)),
983 m_offset(offset)
984 {
985 ut_ad(m_n > 0);
986 }
987
988 /** Destructor */
~SyncFileIO()989 ~SyncFileIO()
990 {
991 /* No op */
992 }
993
994 /** Do the read/write
995 @param[in] request The IO context and type
996 @return the number of bytes read/written or negative value on error */
997 ssize_t execute(const IORequest& request);
998
999 /** Do the read/write
1000 @param[in,out] slot The IO slot, it has the IO context
1001 @return the number of bytes read/written or negative value on error */
1002 static ssize_t execute(Slot* slot);
1003
1004 /** Move the read/write offset up to where the partial IO succeeded.
1005 @param[in] n_bytes The number of bytes to advance */
advance(ssize_t n_bytes)1006 void advance(ssize_t n_bytes)
1007 {
1008 m_offset += n_bytes;
1009
1010 ut_ad(m_n >= n_bytes);
1011
1012 m_n -= n_bytes;
1013
1014 m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1015 }
1016
1017 private:
1018 /** Open file handle */
1019 os_file_t m_fh;
1020
1021 /** Buffer to read/write */
1022 void* m_buf;
1023
1024 /** Number of bytes to read/write */
1025 ssize_t m_n;
1026
1027 /** Offset from where to read/write */
1028 os_offset_t m_offset;
1029 };
1030
1031 /** Do any post processing after a read/write
1032 @return DB_SUCCESS or error code. */
1033 dberr_t
post_io_processing(Slot * slot)1034 AIOHandler::post_io_processing(Slot* slot)
1035 {
1036 ut_ad(slot->is_reserved);
1037
1038 /* Total bytes read so far */
1039 ulint n_bytes = ulint(slot->ptr - slot->buf) + slot->n_bytes;
1040
1041 return(n_bytes == slot->original_len ? DB_SUCCESS : DB_FAIL);
1042 }
1043
1044 /** Count the number of free slots
1045 @return number of reserved slots */
1046 ulint
pending_io_count() const1047 AIO::pending_io_count() const
1048 {
1049 acquire();
1050
1051 #ifdef UNIV_DEBUG
1052 ut_a(m_n_segments > 0);
1053 ut_a(!m_slots.empty());
1054
1055 ulint count = 0;
1056
1057 for (ulint i = 0; i < m_slots.size(); ++i) {
1058
1059 const Slot& slot = m_slots[i];
1060
1061 if (slot.is_reserved) {
1062 ++count;
1063 ut_a(slot.len > 0);
1064 }
1065 }
1066
1067 ut_a(m_n_reserved == count);
1068 #endif /* UNIV_DEBUG */
1069
1070 ulint reserved = m_n_reserved;
1071
1072 release();
1073
1074 return(reserved);
1075 }
1076
1077 #ifdef UNIV_DEBUG
1078 /** Validates the consistency the aio system some of the time.
1079 @return true if ok or the check was skipped */
1080 static
1081 bool
os_aio_validate_skip()1082 os_aio_validate_skip()
1083 {
1084 /** Try os_aio_validate() every this many times */
1085 # define OS_AIO_VALIDATE_SKIP 13
1086
1087 static int os_aio_validate_count;
1088
1089 if (my_atomic_add32_explicit(&os_aio_validate_count, -1,
1090 MY_MEMORY_ORDER_RELAXED)
1091 % OS_AIO_VALIDATE_SKIP) {
1092 return true;
1093 }
1094
1095 return(os_aio_validate());
1096 }
1097 #endif /* UNIV_DEBUG */
1098
1099 #undef USE_FILE_LOCK
1100 #ifndef _WIN32
1101 /* On Windows, mandatory locking is used */
1102 # define USE_FILE_LOCK
1103 #endif
1104 #ifdef USE_FILE_LOCK
1105 /** Obtain an exclusive lock on a file.
1106 @param[in] fd file descriptor
1107 @param[in] name file name
1108 @return 0 on success */
1109 static
1110 int
os_file_lock(int fd,const char * name)1111 os_file_lock(
1112 int fd,
1113 const char* name)
1114 {
1115 if (my_disable_locking) {
1116 return 0;
1117 }
1118
1119 struct flock lk;
1120
1121 lk.l_type = F_WRLCK;
1122 lk.l_whence = SEEK_SET;
1123 lk.l_start = lk.l_len = 0;
1124
1125 if (fcntl(fd, F_SETLK, &lk) == -1) {
1126
1127 ib::error()
1128 << "Unable to lock " << name
1129 << " error: " << errno;
1130
1131 if (errno == EAGAIN || errno == EACCES) {
1132
1133 ib::info()
1134 << "Check that you do not already have"
1135 " another mysqld process using the"
1136 " same InnoDB data or log files.";
1137 }
1138
1139 return(-1);
1140 }
1141
1142 return(0);
1143 }
1144 #endif /* USE_FILE_LOCK */
1145
1146 /** Calculates local segment number and aio array from global segment number.
1147 @param[out] array aio wait array
1148 @param[in] segment global segment number
1149 @return local segment number within the aio array */
1150 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1151 AIO::get_array_and_local_segment(
1152 AIO** array,
1153 ulint segment)
1154 {
1155 ulint local_segment;
1156 ulint n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1157
1158 ut_a(segment < os_aio_n_segments);
1159
1160 if (!srv_read_only_mode && segment < n_extra_segs) {
1161
1162 /* We don't support ibuf/log IO during read only mode. */
1163
1164 if (segment == IO_IBUF_SEGMENT) {
1165
1166 *array = s_ibuf;
1167
1168 } else if (segment == IO_LOG_SEGMENT) {
1169
1170 *array = s_log;
1171
1172 } else {
1173 *array = NULL;
1174 }
1175
1176 local_segment = 0;
1177
1178 } else if (segment < s_reads->m_n_segments + n_extra_segs) {
1179
1180 *array = s_reads;
1181 local_segment = segment - n_extra_segs;
1182
1183 } else {
1184 *array = s_writes;
1185
1186 local_segment = segment
1187 - (s_reads->m_n_segments + n_extra_segs);
1188 }
1189
1190 return(local_segment);
1191 }
1192
1193 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1194 @param[in,out] slot Slot to release */
1195 void
release(Slot * slot)1196 AIO::release(Slot* slot)
1197 {
1198 ut_ad(is_mutex_owned());
1199
1200 ut_ad(slot->is_reserved);
1201
1202 slot->is_reserved = false;
1203
1204 --m_n_reserved;
1205
1206 if (m_n_reserved == m_slots.size() - 1) {
1207 os_event_set(m_not_full);
1208 }
1209
1210 if (m_n_reserved == 0) {
1211 os_event_set(m_is_empty);
1212 }
1213
1214 #if defined(LINUX_NATIVE_AIO)
1215
1216 if (srv_use_native_aio) {
1217 memset(&slot->control, 0x0, sizeof(slot->control));
1218 slot->ret = 0;
1219 slot->n_bytes = 0;
1220 } else {
1221 /* These fields should not be used if we are not
1222 using native AIO. */
1223 ut_ad(slot->n_bytes == 0);
1224 ut_ad(slot->ret == 0);
1225 }
1226
1227 #endif /* WIN_ASYNC_IO */
1228 }
1229
1230 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1231 @param[in,out] slot Slot to release */
1232 void
release_with_mutex(Slot * slot)1233 AIO::release_with_mutex(Slot* slot)
1234 {
1235 acquire();
1236
1237 release(slot);
1238
1239 release();
1240 }
1241
1242 /** Create a temporary file. This function is like tmpfile(3), but
1243 the temporary file is created in the in the mysql server configuration
1244 parameter (--tmpdir).
1245 @return temporary file handle, or NULL on error */
1246 FILE*
os_file_create_tmpfile()1247 os_file_create_tmpfile()
1248 {
1249 FILE* file = NULL;
1250 WAIT_ALLOW_WRITES();
1251 os_file_t fd = innobase_mysql_tmpfile(NULL);
1252
1253 if (fd != OS_FILE_CLOSED) {
1254 #ifdef _WIN32
1255 int crt_fd = _open_osfhandle((intptr_t)HANDLE(fd), 0);
1256 if (crt_fd != -1) {
1257 file = fdopen(crt_fd, "w+b");
1258 if (!file) {
1259 close(crt_fd);
1260 }
1261 }
1262 #else
1263 file = fdopen(fd, "w+b");
1264 if (!file) {
1265 close(fd);
1266 }
1267 #endif
1268 }
1269
1270 if (file == NULL) {
1271
1272 ib::error()
1273 << "Unable to create temporary file; errno: "
1274 << errno;
1275 }
1276
1277 return(file);
1278 }
1279
1280 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1281 NUL-terminate str. All errors are silently ignored. This function is
1282 mostly meant to be used with temporary files.
1283 @param[in,out] file File to read from
1284 @param[in,out] str Buffer where to read
1285 @param[in] size Size of buffer */
1286 void
os_file_read_string(FILE * file,char * str,ulint size)1287 os_file_read_string(
1288 FILE* file,
1289 char* str,
1290 ulint size)
1291 {
1292 if (size != 0) {
1293 rewind(file);
1294
1295 size_t flen = fread(str, 1, size - 1, file);
1296
1297 str[flen] = '\0';
1298 }
1299 }
1300
1301 /** This function returns a new path name after replacing the basename
1302 in an old path with a new basename. The old_path is a full path
1303 name including the extension. The tablename is in the normal
1304 form "databasename/tablename". The new base name is found after
1305 the forward slash. Both input strings are null terminated.
1306
1307 This function allocates memory to be returned. It is the callers
1308 responsibility to free the return value after it is no longer needed.
1309
1310 @param[in] old_path Pathname
1311 @param[in] tablename Contains new base name
1312 @return own: new full pathname */
1313 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1314 os_file_make_new_pathname(
1315 const char* old_path,
1316 const char* tablename)
1317 {
1318 ulint dir_len;
1319 char* last_slash;
1320 char* base_name;
1321 char* new_path;
1322 ulint new_path_len;
1323
1324 /* Split the tablename into its database and table name components.
1325 They are separated by a '/'. */
1326 last_slash = strrchr((char*) tablename, '/');
1327 base_name = last_slash ? last_slash + 1 : (char*) tablename;
1328
1329 /* Find the offset of the last slash. We will strip off the
1330 old basename.ibd which starts after that slash. */
1331 last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1332 dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path);
1333
1334 /* allocate a new path and move the old directory path to it. */
1335 new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1336 new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1337 memcpy(new_path, old_path, dir_len);
1338
1339 snprintf(new_path + dir_len, new_path_len - dir_len,
1340 "%c%s.ibd", OS_PATH_SEPARATOR, base_name);
1341
1342 return(new_path);
1343 }
1344
1345 /** This function reduces a null-terminated full remote path name into
1346 the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
1347 the 'databasename/tablename.ibd' found at the end of the path with just
1348 'tablename'.
1349
1350 Since the result is always smaller than the path sent in, no new memory
1351 is allocated. The caller should allocate memory for the path sent in.
1352 This function manipulates that path in place.
1353
1354 If the path format is not as expected, just return. The result is used
1355 to inform a SHOW CREATE TABLE command.
1356 @param[in,out] data_dir_path Full path/data_dir_path */
1357 void
os_file_make_data_dir_path(char * data_dir_path)1358 os_file_make_data_dir_path(
1359 char* data_dir_path)
1360 {
1361 /* Replace the period before the extension with a null byte. */
1362 char* ptr = strrchr((char*) data_dir_path, '.');
1363
1364 if (ptr == NULL) {
1365 return;
1366 }
1367
1368 ptr[0] = '\0';
1369
1370 /* The tablename starts after the last slash. */
1371 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1372
1373 if (ptr == NULL) {
1374 return;
1375 }
1376
1377 ptr[0] = '\0';
1378
1379 char* tablename = ptr + 1;
1380
1381 /* The databasename starts after the next to last slash. */
1382 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1383
1384 if (ptr == NULL) {
1385 return;
1386 }
1387
1388 ulint tablename_len = ut_strlen(tablename);
1389
1390 ut_memmove(++ptr, tablename, tablename_len);
1391
1392 ptr[tablename_len] = '\0';
1393 }
1394
1395 /** Check if the path refers to the root of a drive using a pointer
1396 to the last directory separator that the caller has fixed.
1397 @param[in] path path name
1398 @param[in] path last directory separator in the path
1399 @return true if this path is a drive root, false if not */
1400 UNIV_INLINE
1401 bool
os_file_is_root(const char * path,const char * last_slash)1402 os_file_is_root(
1403 const char* path,
1404 const char* last_slash)
1405 {
1406 return(
1407 #ifdef _WIN32
1408 (last_slash == path + 2 && path[1] == ':') ||
1409 #endif /* _WIN32 */
1410 last_slash == path);
1411 }
1412
1413 /** Return the parent directory component of a null-terminated path.
1414 Return a new buffer containing the string up to, but not including,
1415 the final component of the path.
1416 The path returned will not contain a trailing separator.
1417 Do not return a root path, return NULL instead.
1418 The final component trimmed off may be a filename or a directory name.
1419 If the final component is the only component of the path, return NULL.
1420 It is the caller's responsibility to free the returned string after it
1421 is no longer needed.
1422 @param[in] path Path name
1423 @return own: parent directory of the path */
1424 static
1425 char*
os_file_get_parent_dir(const char * path)1426 os_file_get_parent_dir(
1427 const char* path)
1428 {
1429 bool has_trailing_slash = false;
1430
1431 /* Find the offset of the last slash */
1432 const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
1433
1434 if (!last_slash) {
1435 /* No slash in the path, return NULL */
1436 return(NULL);
1437 }
1438
1439 /* Ok, there is a slash. Is there anything after it? */
1440 if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1441 has_trailing_slash = true;
1442 }
1443
1444 /* Reduce repetative slashes. */
1445 while (last_slash > path
1446 && last_slash[-1] == OS_PATH_SEPARATOR) {
1447 last_slash--;
1448 }
1449
1450 /* Check for the root of a drive. */
1451 if (os_file_is_root(path, last_slash)) {
1452 return(NULL);
1453 }
1454
1455 /* If a trailing slash prevented the first strrchr() from trimming
1456 the last component of the path, trim that component now. */
1457 if (has_trailing_slash) {
1458 /* Back up to the previous slash. */
1459 last_slash--;
1460 while (last_slash > path
1461 && last_slash[0] != OS_PATH_SEPARATOR) {
1462 last_slash--;
1463 }
1464
1465 /* Reduce repetative slashes. */
1466 while (last_slash > path
1467 && last_slash[-1] == OS_PATH_SEPARATOR) {
1468 last_slash--;
1469 }
1470 }
1471
1472 /* Check for the root of a drive. */
1473 if (os_file_is_root(path, last_slash)) {
1474 return(NULL);
1475 }
1476
1477 if (last_slash - path < 0) {
1478 /* Sanity check, it prevents gcc from trying to handle this case which
1479 * results in warnings for some optimized builds */
1480 return (NULL);
1481 }
1482
1483 /* Non-trivial directory component */
1484
1485 return(mem_strdupl(path, ulint(last_slash - path)));
1486 }
1487 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1488
1489 /* Test the function os_file_get_parent_dir. */
1490 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1491 test_os_file_get_parent_dir(
1492 const char* child_dir,
1493 const char* expected_dir)
1494 {
1495 char* child = mem_strdup(child_dir);
1496 char* expected = expected_dir == NULL ? NULL
1497 : mem_strdup(expected_dir);
1498
1499 /* os_file_get_parent_dir() assumes that separators are
1500 converted to OS_PATH_SEPARATOR. */
1501 os_normalize_path(child);
1502 os_normalize_path(expected);
1503
1504 char* parent = os_file_get_parent_dir(child);
1505
1506 bool unexpected = (expected == NULL
1507 ? (parent != NULL)
1508 : (0 != strcmp(parent, expected)));
1509 if (unexpected) {
1510 ib::fatal() << "os_file_get_parent_dir('" << child
1511 << "') returned '" << parent
1512 << "', instead of '" << expected << "'.";
1513 }
1514 ut_free(parent);
1515 ut_free(child);
1516 ut_free(expected);
1517 }
1518
1519 /* Test the function os_file_get_parent_dir. */
1520 void
unit_test_os_file_get_parent_dir()1521 unit_test_os_file_get_parent_dir()
1522 {
1523 test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1524 test_os_file_get_parent_dir("/usr/", NULL);
1525 test_os_file_get_parent_dir("//usr//", NULL);
1526 test_os_file_get_parent_dir("usr", NULL);
1527 test_os_file_get_parent_dir("usr//", NULL);
1528 test_os_file_get_parent_dir("/", NULL);
1529 test_os_file_get_parent_dir("//", NULL);
1530 test_os_file_get_parent_dir(".", NULL);
1531 test_os_file_get_parent_dir("..", NULL);
1532 # ifdef _WIN32
1533 test_os_file_get_parent_dir("D:", NULL);
1534 test_os_file_get_parent_dir("D:/", NULL);
1535 test_os_file_get_parent_dir("D:\\", NULL);
1536 test_os_file_get_parent_dir("D:/data", NULL);
1537 test_os_file_get_parent_dir("D:/data/", NULL);
1538 test_os_file_get_parent_dir("D:\\data\\", NULL);
1539 test_os_file_get_parent_dir("D:///data/////", NULL);
1540 test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
1541 test_os_file_get_parent_dir("D:/data//a", "D:/data");
1542 test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
1543 test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
1544 test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
1545 #endif /* _WIN32 */
1546 }
1547 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
1548
1549
1550 /** Creates all missing subdirectories along the given path.
1551 @param[in] path Path name
1552 @return DB_SUCCESS if OK, otherwise error code. */
1553 dberr_t
os_file_create_subdirs_if_needed(const char * path)1554 os_file_create_subdirs_if_needed(
1555 const char* path)
1556 {
1557 if (srv_read_only_mode) {
1558
1559 ib::error()
1560 << "read only mode set. Can't create "
1561 << "subdirectories '" << path << "'";
1562
1563 return(DB_READ_ONLY);
1564
1565 }
1566
1567 char* subdir = os_file_get_parent_dir(path);
1568
1569 if (subdir == NULL) {
1570 /* subdir is root or cwd, nothing to do */
1571 return(DB_SUCCESS);
1572 }
1573
1574 /* Test if subdir exists */
1575 os_file_type_t type;
1576 bool subdir_exists;
1577 bool success = os_file_status(subdir, &subdir_exists, &type);
1578
1579 if (success && !subdir_exists) {
1580
1581 /* Subdir does not exist, create it */
1582 dberr_t err = os_file_create_subdirs_if_needed(subdir);
1583
1584 if (err != DB_SUCCESS) {
1585
1586 ut_free(subdir);
1587
1588 return(err);
1589 }
1590
1591 success = os_file_create_directory(subdir, false);
1592 }
1593
1594 ut_free(subdir);
1595
1596 return(success ? DB_SUCCESS : DB_ERROR);
1597 }
1598
1599 #ifndef _WIN32
1600
1601 /** Do the read/write
1602 @param[in] request The IO context and type
1603 @return the number of bytes read/written or negative value on error */
1604 ssize_t
execute(const IORequest & request)1605 SyncFileIO::execute(const IORequest& request)
1606 {
1607 ssize_t n_bytes;
1608
1609 if (request.is_read()) {
1610 n_bytes = pread(m_fh, m_buf, m_n, m_offset);
1611 } else {
1612 ut_ad(request.is_write());
1613 n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
1614 }
1615
1616 return(n_bytes);
1617 }
1618 /** Free storage space associated with a section of the file.
1619 @param[in] fh Open file handle
1620 @param[in] off Starting offset (SEEK_SET)
1621 @param[in] len Size of the hole
1622 @return DB_SUCCESS or error code */
1623 static
1624 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)1625 os_file_punch_hole_posix(
1626 os_file_t fh,
1627 os_offset_t off,
1628 os_offset_t len)
1629 {
1630
1631 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
1632 const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
1633
1634 int ret = fallocate(fh, mode, off, len);
1635
1636 if (ret == 0) {
1637 return(DB_SUCCESS);
1638 }
1639
1640 if (errno == ENOTSUP) {
1641 return(DB_IO_NO_PUNCH_HOLE);
1642 }
1643
1644 ib::warn()
1645 << "fallocate("
1646 <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
1647 << off << ", " << len << ") returned errno: "
1648 << errno;
1649
1650 return(DB_IO_ERROR);
1651
1652 #elif defined(UNIV_SOLARIS)
1653
1654 // Use F_FREESP
1655
1656 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
1657
1658 return(DB_IO_NO_PUNCH_HOLE);
1659 }
1660
1661 #if defined(LINUX_NATIVE_AIO)
1662
1663 /** Linux native AIO handler */
1664 class LinuxAIOHandler {
1665 public:
1666 /**
1667 @param[in] global_segment The global segment*/
LinuxAIOHandler(ulint global_segment)1668 LinuxAIOHandler(ulint global_segment)
1669 :
1670 m_global_segment(global_segment)
1671 {
1672 /* Should never be doing Sync IO here. */
1673 ut_a(m_global_segment != ULINT_UNDEFINED);
1674
1675 /* Find the array and the local segment. */
1676
1677 m_segment = AIO::get_array_and_local_segment(
1678 &m_array, m_global_segment);
1679
1680 m_n_slots = m_array->slots_per_segment();
1681 }
1682
1683 /** Destructor */
~LinuxAIOHandler()1684 ~LinuxAIOHandler()
1685 {
1686 // No op
1687 }
1688
1689 /**
1690 Process a Linux AIO request
1691 @param[out] m1 the messages passed with the
1692 @param[out] m2 AIO request; note that in case the
1693 AIO operation failed, these output
1694 parameters are valid and can be used to
1695 restart the operation.
1696 @param[out] request IO context
1697 @return DB_SUCCESS or error code */
1698 dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
1699
1700 private:
1701 /** Resubmit an IO request that was only partially successful
1702 @param[in,out] slot Request to resubmit
1703 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
1704 dberr_t resubmit(Slot* slot);
1705
1706 /** Check if the AIO succeeded
1707 @param[in,out] slot The slot to check
1708 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
1709 DB_IO_ERROR on all other errors */
1710 dberr_t check_state(Slot* slot);
1711
1712 /** @return true if a shutdown was detected */
is_shutdown() const1713 bool is_shutdown() const
1714 {
1715 return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
1716 && !buf_page_cleaner_is_active);
1717 }
1718
1719 /** If no slot was found then the m_array->m_mutex will be released.
1720 @param[out] n_pending The number of pending IOs
1721 @return NULL or a slot that has completed IO */
1722 Slot* find_completed_slot(ulint* n_pending);
1723
1724 /** This is called from within the IO-thread. If there are no completed
1725 IO requests in the slot array, the thread calls this function to
1726 collect more requests from the Linux kernel.
1727 The IO-thread waits on io_getevents(), which is a blocking call, with
1728 a timeout value. Unless the system is very heavy loaded, keeping the
1729 IO-thread very busy, the io-thread will spend most of its time waiting
1730 in this function.
1731 The IO-thread also exits in this function. It checks server status at
1732 each wakeup and that is why we use timed wait in io_getevents(). */
1733 void collect();
1734
1735 private:
1736 /** Slot array */
1737 AIO* m_array;
1738
1739 /** Number of slots inthe local segment */
1740 ulint m_n_slots;
1741
1742 /** The local segment to check */
1743 ulint m_segment;
1744
1745 /** The global segment */
1746 ulint m_global_segment;
1747 };
1748
1749 /** Resubmit an IO request that was only partially successful
1750 @param[in,out] slot Request to resubmit
1751 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
1752 dberr_t
resubmit(Slot * slot)1753 LinuxAIOHandler::resubmit(Slot* slot)
1754 {
1755 #ifdef UNIV_DEBUG
1756 /* Bytes already read/written out */
1757 ulint n_bytes = slot->ptr - slot->buf;
1758
1759 ut_ad(m_array->is_mutex_owned());
1760
1761 ut_ad(n_bytes < slot->original_len);
1762 ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
1763 /* Partial read or write scenario */
1764 ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
1765 #endif /* UNIV_DEBUG */
1766
1767 slot->len -= slot->n_bytes;
1768 slot->ptr += slot->n_bytes;
1769 slot->offset += slot->n_bytes;
1770
1771 /* Resetting the bytes read/written */
1772 slot->n_bytes = 0;
1773 slot->io_already_done = false;
1774
1775 compile_time_assert(sizeof(off_t) >= sizeof(os_offset_t));
1776
1777 struct iocb* iocb = &slot->control;
1778
1779 if (slot->type.is_read()) {
1780
1781 io_prep_pread(
1782 iocb,
1783 slot->file,
1784 slot->ptr,
1785 slot->len,
1786 slot->offset);
1787 } else {
1788
1789 ut_a(slot->type.is_write());
1790
1791 io_prep_pwrite(
1792 iocb,
1793 slot->file,
1794 slot->ptr,
1795 slot->len,
1796 slot->offset);
1797 }
1798
1799 iocb->data = slot;
1800
1801 ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
1802 == 0);
1803
1804 /* Resubmit an I/O request */
1805 int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
1806 ut_a(ret != -EINVAL);
1807
1808 if (ret < 0) {
1809 errno = -ret;
1810 }
1811
1812 return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
1813 }
1814
1815 /** Check if the AIO succeeded
1816 @param[in,out] slot The slot to check
1817 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
1818 DB_IO_ERROR on all other errors */
1819 dberr_t
check_state(Slot * slot)1820 LinuxAIOHandler::check_state(Slot* slot)
1821 {
1822 ut_ad(m_array->is_mutex_owned());
1823
1824 /* Note that it may be that there is more then one completed
1825 IO requests. We process them one at a time. We may have a case
1826 here to improve the performance slightly by dealing with all
1827 requests in one sweep. */
1828
1829 srv_set_io_thread_op_info(
1830 m_global_segment, "processing completed aio requests");
1831
1832 ut_ad(slot->io_already_done);
1833
1834 dberr_t err = DB_SUCCESS;
1835
1836 if (slot->ret == 0) {
1837
1838 err = AIOHandler::post_io_processing(slot);
1839
1840 } else {
1841 errno = -slot->ret;
1842
1843 /* os_file_handle_error does tell us if we should retry
1844 this IO. As it stands now, we don't do this retry when
1845 reaping requests from a different context than
1846 the dispatcher. This non-retry logic is the same for
1847 Windows and Linux native AIO.
1848 We should probably look into this to transparently
1849 re-submit the IO. */
1850 os_file_handle_error(slot->name, "Linux aio");
1851
1852 err = DB_IO_ERROR;
1853 }
1854
1855 return(err);
1856 }
1857
1858 /** If no slot was found then the m_array->m_mutex will be released.
1859 @param[out] n_pending The number of pending IOs
1860 @return NULL or a slot that has completed IO */
1861 Slot*
find_completed_slot(ulint * n_pending)1862 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
1863 {
1864 ulint offset = m_n_slots * m_segment;
1865
1866 *n_pending = 0;
1867
1868 m_array->acquire();
1869
1870 Slot* slot = m_array->at(offset);
1871
1872 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
1873
1874 if (slot->is_reserved) {
1875
1876 ++*n_pending;
1877
1878 if (slot->io_already_done) {
1879
1880 /* Something for us to work on.
1881 Note: We don't release the mutex. */
1882 return(slot);
1883 }
1884 }
1885 }
1886
1887 m_array->release();
1888
1889 return(NULL);
1890 }
1891
1892 /** This function is only used in Linux native asynchronous i/o. This is
1893 called from within the io-thread. If there are no completed IO requests
1894 in the slot array, the thread calls this function to collect more
1895 requests from the kernel.
1896 The io-thread waits on io_getevents(), which is a blocking call, with
1897 a timeout value. Unless the system is very heavy loaded, keeping the
1898 io-thread very busy, the io-thread will spend most of its time waiting
1899 in this function.
1900 The io-thread also exits in this function. It checks server status at
1901 each wakeup and that is why we use timed wait in io_getevents(). */
1902 void
collect()1903 LinuxAIOHandler::collect()
1904 {
1905 ut_ad(m_n_slots > 0);
1906 ut_ad(m_array != NULL);
1907 ut_ad(m_segment < m_array->get_n_segments());
1908
1909 /* Which io_context_t we are going to use. */
1910 io_context_t io_ctx = m_array->io_ctx(m_segment);
1911
1912 /* Starting point of the m_segment we will be working on. */
1913 ulint start_pos = m_segment * m_n_slots;
1914
1915 /* End point. */
1916 ulint end_pos = start_pos + m_n_slots;
1917
1918 for (;;) {
1919 struct io_event* events;
1920
1921 /* Which part of event array we are going to work on. */
1922 events = m_array->io_events(m_segment * m_n_slots);
1923
1924 /* Initialize the events. */
1925 memset(events, 0, sizeof(*events) * m_n_slots);
1926
1927 /* The timeout value is arbitrary. We probably need
1928 to experiment with it a little. */
1929 struct timespec timeout;
1930
1931 timeout.tv_sec = 0;
1932 timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
1933
1934 int ret;
1935
1936 ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
1937 ut_a(ret != -EINVAL);
1938 ut_ad(ret != -EFAULT);
1939
1940 for (int i = 0; i < ret; ++i) {
1941
1942 struct iocb* iocb;
1943
1944 iocb = reinterpret_cast<struct iocb*>(events[i].obj);
1945 ut_a(iocb != NULL);
1946
1947 Slot* slot = reinterpret_cast<Slot*>(iocb->data);
1948
1949 /* Some sanity checks. */
1950 ut_a(slot != NULL);
1951 ut_a(slot->is_reserved);
1952
1953 /* We are not scribbling previous segment. */
1954 ut_a(slot->pos >= start_pos);
1955
1956 /* We have not overstepped to next segment. */
1957 ut_a(slot->pos < end_pos);
1958
1959 /* Deallocate unused blocks from file system.
1960 This is newer done to page 0 or to log files.*/
1961 if (slot->offset > 0
1962 && !slot->type.is_log()
1963 && slot->type.is_write()
1964 && slot->type.punch_hole()) {
1965
1966 slot->err = slot->type.punch_hole(
1967 slot->file,
1968 slot->offset, slot->len);
1969 } else {
1970 slot->err = DB_SUCCESS;
1971 }
1972
1973 /* Mark this request as completed. The error handling
1974 will be done in the calling function. */
1975 m_array->acquire();
1976
1977 /* events[i].res2 should always be ZERO */
1978 ut_ad(events[i].res2 == 0);
1979 slot->io_already_done = true;
1980
1981 /*Even though events[i].res is an unsigned number
1982 in libaio, it is used to return a negative value
1983 (negated errno value) to indicate error and a positive
1984 value to indicate number of bytes read or written. */
1985
1986 if (events[i].res > slot->len) {
1987 /* failure */
1988 slot->n_bytes = 0;
1989 slot->ret = events[i].res;
1990 } else {
1991 /* success */
1992 slot->n_bytes = events[i].res;
1993 slot->ret = 0;
1994 }
1995 m_array->release();
1996 }
1997
1998 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
1999 || !buf_page_cleaner_is_active
2000 || ret > 0) {
2001
2002 break;
2003 }
2004
2005 /* This error handling is for any error in collecting the
2006 IO requests. The errors, if any, for any particular IO
2007 request are simply passed on to the calling routine. */
2008
2009 switch (ret) {
2010 case -EAGAIN:
2011 /* Not enough resources! Try again. */
2012
2013 case -EINTR:
2014 /* Interrupted! The behaviour in case of an interrupt.
2015 If we have some completed IOs available then the
2016 return code will be the number of IOs. We get EINTR
2017 only if there are no completed IOs and we have been
2018 interrupted. */
2019
2020 case 0:
2021 /* No pending request! Go back and check again. */
2022
2023 continue;
2024 }
2025
2026 /* All other errors should cause a trap for now. */
2027 ib::fatal()
2028 << "Unexpected ret_code[" << ret
2029 << "] from io_getevents()!";
2030
2031 break;
2032 }
2033 }
2034
2035 /** Process a Linux AIO request
2036 @param[out] m1 the messages passed with the
2037 @param[out] m2 AIO request; note that in case the
2038 AIO operation failed, these output
2039 parameters are valid and can be used to
2040 restart the operation.
2041 @param[out] request IO context
2042 @return DB_SUCCESS or error code */
2043 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)2044 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
2045 {
2046 dberr_t err = DB_SUCCESS;
2047 Slot* slot;
2048
2049 /* Loop until we have found a completed request. */
2050 for (;;) {
2051
2052 ulint n_pending;
2053
2054 slot = find_completed_slot(&n_pending);
2055
2056 if (slot != NULL) {
2057
2058 ut_ad(m_array->is_mutex_owned());
2059
2060 err = check_state(slot);
2061
2062 /* DB_FAIL is not a hard error, we should retry */
2063 if (err != DB_FAIL) {
2064 break;
2065 }
2066
2067 /* Partial IO, resubmit request for
2068 remaining bytes to read/write */
2069 err = resubmit(slot);
2070
2071 if (err != DB_SUCCESS) {
2072 break;
2073 }
2074
2075 m_array->release();
2076
2077 } else if (is_shutdown() && n_pending == 0) {
2078
2079 /* There is no completed request. If there is
2080 no pending request at all, and the system is
2081 being shut down, exit. */
2082
2083 *m1 = NULL;
2084 *m2 = NULL;
2085
2086 return(DB_SUCCESS);
2087
2088 } else {
2089
2090 /* Wait for some request. Note that we return
2091 from wait if we have found a request. */
2092
2093 srv_set_io_thread_op_info(
2094 m_global_segment,
2095 "waiting for completed aio requests");
2096
2097 collect();
2098 }
2099 }
2100
2101 if (err == DB_IO_PARTIAL_FAILED) {
2102 /* Aborting in case of submit failure */
2103 ib::fatal()
2104 << "Native Linux AIO interface. "
2105 "io_submit() call failed when "
2106 "resubmitting a partial I/O "
2107 "request on the file " << slot->name
2108 << ".";
2109 }
2110
2111 *m1 = slot->m1;
2112 *m2 = slot->m2;
2113
2114 *request = slot->type;
2115
2116 m_array->release(slot);
2117
2118 m_array->release();
2119
2120 return(err);
2121 }
2122
2123 /** This function is only used in Linux native asynchronous i/o.
2124 Waits for an aio operation to complete. This function is used to wait for
2125 the completed requests. The aio array of pending requests is divided
2126 into segments. The thread specifies which segment or slot it wants to wait
2127 for. NOTE: this function will also take care of freeing the aio slot,
2128 therefore no other thread is allowed to do the freeing!
2129
2130 @param[in] global_seg segment number in the aio array
2131 to wait for; segment 0 is the ibuf
2132 i/o thread, segment 1 is log i/o thread,
2133 then follow the non-ibuf read threads,
2134 and the last are the non-ibuf write
2135 threads.
2136 @param[out] m1 the messages passed with the
2137 @param[out] m2 AIO request; note that in case the
2138 AIO operation failed, these output
2139 parameters are valid and can be used to
2140 restart the operation.
2141 @param[out]xi request IO context
2142 @return DB_SUCCESS if the IO was successful */
2143 static
2144 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2145 os_aio_linux_handler(
2146 ulint global_segment,
2147 fil_node_t** m1,
2148 void** m2,
2149 IORequest* request)
2150 {
2151 return LinuxAIOHandler(global_segment).poll(m1, m2, request);
2152 }
2153
2154 /** Dispatch an AIO request to the kernel.
2155 @param[in,out] slot an already reserved slot
2156 @return true on success. */
2157 bool
linux_dispatch(Slot * slot)2158 AIO::linux_dispatch(Slot* slot)
2159 {
2160 ut_a(slot->is_reserved);
2161 ut_ad(slot->type.validate());
2162
2163 /* Find out what we are going to work with.
2164 The iocb struct is directly in the slot.
2165 The io_context_t is one per segment. */
2166
2167 ulint io_ctx_index;
2168 struct iocb* iocb = &slot->control;
2169
2170 io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2171
2172 ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
2173 == 0);
2174
2175 int ret = io_submit(io_ctx(io_ctx_index), 1, &iocb);
2176 ut_a(ret != -EINVAL);
2177
2178 /* io_submit() returns number of successfully queued requests
2179 or -errno. */
2180
2181 if (ret != 1) {
2182 errno = -ret;
2183 }
2184
2185 return(ret == 1);
2186 }
2187
2188 /** Creates an io_context_t for native linux AIO.
2189 @param[in] max_events number of events
2190 @param[out] io_ctx io_ctx to initialize.
2191 @return true on success. */
2192 bool
linux_create_io_ctx(unsigned max_events,io_context_t & io_ctx)2193 AIO::linux_create_io_ctx(
2194 unsigned max_events,
2195 io_context_t& io_ctx)
2196 {
2197 ssize_t n_retries = 0;
2198
2199 for (;;) {
2200
2201 memset(&io_ctx, 0x0, sizeof(io_ctx));
2202
2203 /* Initialize the io_ctx. Tell it how many pending
2204 IO requests this context will handle. */
2205
2206 int ret = io_setup(max_events, &io_ctx);
2207 ut_a(ret != -EINVAL);
2208
2209 if (ret == 0) {
2210 /* Success. Return now. */
2211 return(true);
2212 }
2213
2214 /* If we hit EAGAIN we'll make a few attempts before failing. */
2215
2216 switch (ret) {
2217 case -EAGAIN:
2218 if (n_retries == 0) {
2219 /* First time around. */
2220 ib::warn()
2221 << "io_setup() failed with EAGAIN."
2222 " Will make "
2223 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2224 << " attempts before giving up.";
2225 }
2226
2227 if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2228
2229 ++n_retries;
2230
2231 ib::warn()
2232 << "io_setup() attempt "
2233 << n_retries << ".";
2234
2235 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2236
2237 continue;
2238 }
2239
2240 /* Have tried enough. Better call it a day. */
2241 ib::warn()
2242 << "io_setup() failed with EAGAIN after "
2243 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2244 << " attempts.";
2245 break;
2246
2247 case -ENOSYS:
2248 ib::warn()
2249 << "Linux Native AIO interface"
2250 " is not supported on this platform. Please"
2251 " check your OS documentation and install"
2252 " appropriate binary of InnoDB.";
2253
2254 break;
2255
2256 default:
2257 ib::warn()
2258 << "Linux Native AIO setup"
2259 << " returned following error["
2260 << ret << "]";
2261 break;
2262 }
2263
2264 ib::info()
2265 << "You can disable Linux Native AIO by"
2266 " setting innodb_use_native_aio = 0 in my.cnf";
2267
2268 break;
2269 }
2270
2271 return(false);
2272 }
2273
2274 /** Checks if the system supports native linux aio. On some kernel
2275 versions where native aio is supported it won't work on tmpfs. In such
2276 cases we can't use native aio as it is not possible to mix simulated
2277 and native aio.
2278 @return: true if supported, false otherwise. */
2279 bool
is_linux_native_aio_supported()2280 AIO::is_linux_native_aio_supported()
2281 {
2282 int fd;
2283 io_context_t io_ctx;
2284 char name[1000];
2285
2286 if (!linux_create_io_ctx(1, io_ctx)) {
2287
2288 /* The platform does not support native aio. */
2289
2290 return(false);
2291
2292 } else if (!srv_read_only_mode) {
2293
2294 /* Now check if tmpdir supports native aio ops. */
2295 fd = innobase_mysql_tmpfile(NULL);
2296
2297 if (fd < 0) {
2298 ib::warn()
2299 << "Unable to create temp file to check"
2300 " native AIO support.";
2301
2302 int ret = io_destroy(io_ctx);
2303 ut_a(ret != -EINVAL);
2304 ut_ad(ret != -EFAULT);
2305
2306 return(false);
2307 }
2308 } else {
2309
2310 os_normalize_path(srv_log_group_home_dir);
2311
2312 ulint dirnamelen = strlen(srv_log_group_home_dir);
2313
2314 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2315
2316 memcpy(name, srv_log_group_home_dir, dirnamelen);
2317
2318 /* Add a path separator if needed. */
2319 if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2320
2321 name[dirnamelen++] = OS_PATH_SEPARATOR;
2322 }
2323
2324 strcpy(name + dirnamelen, "ib_logfile0");
2325
2326 fd = open(name, O_RDONLY | O_CLOEXEC);
2327
2328 if (fd == -1) {
2329
2330 ib::warn()
2331 << "Unable to open"
2332 << " \"" << name << "\" to check native"
2333 << " AIO read support.";
2334
2335 int ret = io_destroy(io_ctx);
2336 ut_a(ret != EINVAL);
2337 ut_ad(ret != EFAULT);
2338
2339 return(false);
2340 }
2341 }
2342
2343 struct io_event io_event;
2344
2345 memset(&io_event, 0x0, sizeof(io_event));
2346
2347 byte* buf = static_cast<byte*>(ut_malloc_nokey(srv_page_size * 2));
2348 byte* ptr = static_cast<byte*>(ut_align(buf, srv_page_size));
2349
2350 struct iocb iocb;
2351
2352 /* Suppress valgrind warning. */
2353 memset(buf, 0x00, srv_page_size * 2);
2354 memset(&iocb, 0x0, sizeof(iocb));
2355
2356 struct iocb* p_iocb = &iocb;
2357
2358 if (!srv_read_only_mode) {
2359
2360 io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
2361
2362 } else {
2363 ut_a(srv_page_size >= 4096);
2364 io_prep_pread(p_iocb, fd, ptr, srv_page_size, 0);
2365 }
2366
2367 ut_a(reinterpret_cast<size_t>(p_iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
2368 == 0);
2369 int err = io_submit(io_ctx, 1, &p_iocb);
2370 ut_a(err != -EINVAL);
2371
2372 if (err >= 1) {
2373 /* Now collect the submitted IO request. */
2374 err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2375 ut_a(err != -EINVAL);
2376 }
2377
2378 ut_free(buf);
2379 close(fd);
2380
2381 switch (err) {
2382 case 1:
2383 {
2384 int ret = io_destroy(io_ctx);
2385 ut_a(ret != -EINVAL);
2386 ut_ad(ret != -EFAULT);
2387
2388 return(true);
2389 }
2390
2391 case -EINVAL:
2392 case -ENOSYS:
2393 ib::error()
2394 << "Linux Native AIO not supported. You can either"
2395 " move "
2396 << (srv_read_only_mode ? name : "tmpdir")
2397 << " to a file system that supports native"
2398 " AIO or you can set innodb_use_native_aio to"
2399 " FALSE to avoid this message.";
2400
2401 /* fall through. */
2402 default:
2403 ib::error()
2404 << "Linux Native AIO check on "
2405 << (srv_read_only_mode ? name : "tmpdir")
2406 << "returned error[" << -err << "]";
2407 }
2408
2409 int ret = io_destroy(io_ctx);
2410 ut_a(ret != -EINVAL);
2411 ut_ad(ret != -EFAULT);
2412
2413 return(false);
2414 }
2415
2416 #endif /* LINUX_NATIVE_AIO */
2417
2418 /** Retrieves the last error number if an error occurs in a file io function.
2419 The number should be retrieved before any other OS calls (because they may
2420 overwrite the error number). If the number is not known to this program,
2421 the OS error number + OS_FILE_ERROR_MAX is returned.
2422 @param[in] report_all_errors true if we want an error message
2423 printed of all errors
2424 @param[in] on_error_silent true then don't print any diagnostic
2425 to the log
2426 @return error number, or OS error number + OS_FILE_ERROR_MAX */
2427 static
2428 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2429 os_file_get_last_error_low(
2430 bool report_all_errors,
2431 bool on_error_silent)
2432 {
2433 int err = errno;
2434
2435 if (err == 0) {
2436 return(0);
2437 }
2438
2439 if (report_all_errors
2440 || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
2441
2442 ib::error()
2443 << "Operating system error number "
2444 << err
2445 << " in a file operation.";
2446
2447 if (err == ENOENT) {
2448
2449 ib::error()
2450 << "The error means the system"
2451 " cannot find the path specified.";
2452
2453 if (srv_is_being_started) {
2454
2455 ib::error()
2456 << "If you are installing InnoDB,"
2457 " remember that you must create"
2458 " directories yourself, InnoDB"
2459 " does not create them.";
2460 }
2461 } else if (err == EACCES) {
2462
2463 ib::error()
2464 << "The error means mysqld does not have"
2465 " the access rights to the directory.";
2466
2467 } else {
2468 if (strerror(err) != NULL) {
2469
2470 ib::error()
2471 << "Error number " << err << " means '"
2472 << strerror(err) << "'";
2473 }
2474
2475 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
2476 }
2477 }
2478
2479 switch (err) {
2480 case ENOSPC:
2481 return(OS_FILE_DISK_FULL);
2482 case ENOENT:
2483 return(OS_FILE_NOT_FOUND);
2484 case EEXIST:
2485 return(OS_FILE_ALREADY_EXISTS);
2486 case EXDEV:
2487 case ENOTDIR:
2488 case EISDIR:
2489 return(OS_FILE_PATH_ERROR);
2490 case EAGAIN:
2491 if (srv_use_native_aio) {
2492 return(OS_FILE_AIO_RESOURCES_RESERVED);
2493 }
2494 break;
2495 case EINTR:
2496 if (srv_use_native_aio) {
2497 return(OS_FILE_AIO_INTERRUPTED);
2498 }
2499 break;
2500 case EACCES:
2501 return(OS_FILE_ACCESS_VIOLATION);
2502 }
2503 return(OS_FILE_ERROR_MAX + err);
2504 }
2505
2506 /** Wrapper to fsync(2) that retries the call on some errors.
2507 Returns the value 0 if successful; otherwise the value -1 is returned and
2508 the global variable errno is set to indicate the error.
2509 @param[in] file open file handle
2510 @return 0 if success, -1 otherwise */
2511 static
2512 int
os_file_fsync_posix(os_file_t file)2513 os_file_fsync_posix(
2514 os_file_t file)
2515 {
2516 ulint failures = 0;
2517
2518 for (;;) {
2519
2520 ++os_n_fsyncs;
2521
2522 int ret = fsync(file);
2523
2524 if (ret == 0) {
2525 return(ret);
2526 }
2527
2528 switch(errno) {
2529 case ENOLCK:
2530
2531 ++failures;
2532 ut_a(failures < 1000);
2533
2534 if (!(failures % 100)) {
2535
2536 ib::warn()
2537 << "fsync(): "
2538 << "No locks available; retrying";
2539 }
2540
2541 /* 0.2 sec */
2542 os_thread_sleep(200000);
2543 break;
2544
2545 case EINTR:
2546
2547 ++failures;
2548 ut_a(failures < 2000);
2549 break;
2550
2551 default:
2552 ib::fatal() << "fsync() returned " << errno;
2553 }
2554 }
2555 }
2556
2557 /** Check the existence and type of the given file.
2558 @param[in] path path name of file
2559 @param[out] exists true if the file exists
2560 @param[out] type Type of the file, if it exists
2561 @return true if call succeeded */
2562 static
2563 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)2564 os_file_status_posix(
2565 const char* path,
2566 bool* exists,
2567 os_file_type_t* type)
2568 {
2569 struct stat statinfo;
2570
2571 int ret = stat(path, &statinfo);
2572
2573 *exists = !ret;
2574
2575 if (!ret) {
2576 /* file exists, everything OK */
2577
2578 } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
2579 /* file does not exist */
2580 return(true);
2581
2582 } else {
2583 /* file exists, but stat call failed */
2584 os_file_handle_error_no_exit(path, "stat", false);
2585 return(false);
2586 }
2587
2588 if (S_ISDIR(statinfo.st_mode)) {
2589 *type = OS_FILE_TYPE_DIR;
2590
2591 } else if (S_ISLNK(statinfo.st_mode)) {
2592 *type = OS_FILE_TYPE_LINK;
2593
2594 } else if (S_ISREG(statinfo.st_mode)) {
2595 *type = OS_FILE_TYPE_FILE;
2596 } else {
2597 *type = OS_FILE_TYPE_UNKNOWN;
2598 }
2599
2600 return(true);
2601 }
2602
2603 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
2604 function!
2605 Flushes the write buffers of a given file to the disk.
2606 @param[in] file handle to a file
2607 @return true if success */
2608 bool
os_file_flush_func(os_file_t file)2609 os_file_flush_func(
2610 os_file_t file)
2611 {
2612 int ret;
2613
2614 WAIT_ALLOW_WRITES();
2615 ret = os_file_fsync_posix(file);
2616
2617 if (ret == 0) {
2618 return(true);
2619 }
2620
2621 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2622 we choose to ignore that error if we are using raw disks */
2623
2624 if (srv_start_raw_disk_in_use && errno == EINVAL) {
2625
2626 return(true);
2627 }
2628
2629 ib::error() << "The OS said file flush did not succeed";
2630
2631 os_file_handle_error(NULL, "flush");
2632
2633 /* It is a fatal error if a file flush does not succeed, because then
2634 the database can get corrupt on disk */
2635 ut_error;
2636
2637 return(false);
2638 }
2639
2640 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
2641 this function!
2642 A simple function to open or create a file.
2643 @param[in] name name of the file or path as a null-terminated
2644 string
2645 @param[in] create_mode create mode
2646 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
2647 @param[in] read_only if true, read only checks are enforced
2648 @param[out] success true if succeed, false if error
2649 @return handle to the file, not defined if error, error number
2650 can be retrieved with os_file_get_last_error */
2651 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)2652 os_file_create_simple_func(
2653 const char* name,
2654 ulint create_mode,
2655 ulint access_type,
2656 bool read_only,
2657 bool* success)
2658 {
2659 pfs_os_file_t file;
2660
2661 *success = false;
2662
2663 int create_flag;
2664 const char* mode_str = NULL;
2665
2666 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
2667 WAIT_ALLOW_WRITES();
2668 }
2669
2670 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
2671 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
2672
2673 if (create_mode == OS_FILE_OPEN) {
2674 mode_str = "OPEN";
2675
2676 if (access_type == OS_FILE_READ_ONLY) {
2677
2678 create_flag = O_RDONLY;
2679
2680 } else if (read_only) {
2681
2682 create_flag = O_RDONLY;
2683
2684 } else {
2685 create_flag = O_RDWR;
2686 }
2687
2688 } else if (read_only) {
2689
2690 mode_str = "OPEN";
2691 create_flag = O_RDONLY;
2692
2693 } else if (create_mode == OS_FILE_CREATE) {
2694
2695 mode_str = "CREATE";
2696 create_flag = O_RDWR | O_CREAT | O_EXCL;
2697
2698 } else if (create_mode == OS_FILE_CREATE_PATH) {
2699
2700 mode_str = "CREATE PATH";
2701 /* Create subdirs along the path if needed. */
2702
2703 *success = os_file_create_subdirs_if_needed(name);
2704
2705 if (!*success) {
2706
2707 ib::error()
2708 << "Unable to create subdirectories '"
2709 << name << "'";
2710
2711 return(OS_FILE_CLOSED);
2712 }
2713
2714 create_flag = O_RDWR | O_CREAT | O_EXCL;
2715 create_mode = OS_FILE_CREATE;
2716 } else {
2717
2718 ib::error()
2719 << "Unknown file create mode ("
2720 << create_mode
2721 << " for file '" << name << "'";
2722
2723 return(OS_FILE_CLOSED);
2724 }
2725
2726 bool retry;
2727
2728 do {
2729 file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
2730
2731 if (file == -1) {
2732 *success = false;
2733 retry = os_file_handle_error(
2734 name,
2735 create_mode == OS_FILE_OPEN
2736 ? "open" : "create");
2737 } else {
2738 *success = true;
2739 retry = false;
2740 }
2741
2742 } while (retry);
2743
2744 /* This function is always called for data files, we should disable
2745 OS caching (O_DIRECT) here as we do in os_file_create_func(), so
2746 we open the same file in the same mode, see man page of open(2). */
2747 if (!srv_read_only_mode
2748 && *success
2749 && (srv_file_flush_method == SRV_O_DIRECT
2750 || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
2751
2752 os_file_set_nocache(file, name, mode_str);
2753 }
2754
2755 #ifdef USE_FILE_LOCK
2756 if (!read_only
2757 && *success
2758 && (access_type == OS_FILE_READ_WRITE)
2759 && os_file_lock(file, name)) {
2760
2761 *success = false;
2762 close(file);
2763 file = -1;
2764 }
2765 #endif /* USE_FILE_LOCK */
2766
2767 return(file);
2768 }
2769
2770 /** This function attempts to create a directory named pathname. The new
2771 directory gets default permissions. On Unix the permissions are
2772 (0770 & ~umask). If the directory exists already, nothing is done and
2773 the call succeeds, unless the fail_if_exists arguments is true.
2774 If another error occurs, such as a permission error, this does not crash,
2775 but reports the error and returns false.
2776 @param[in] pathname directory name as null-terminated string
2777 @param[in] fail_if_exists if true, pre-existing directory is treated as
2778 an error.
2779 @return true if call succeeds, false on error */
2780 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)2781 os_file_create_directory(
2782 const char* pathname,
2783 bool fail_if_exists)
2784 {
2785 int rcode;
2786
2787 WAIT_ALLOW_WRITES();
2788 rcode = mkdir(pathname, 0770);
2789
2790 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
2791 /* failure */
2792 os_file_handle_error_no_exit(pathname, "mkdir", false);
2793
2794 return(false);
2795 }
2796
2797 return(true);
2798 }
2799
2800 /**
2801 The os_file_opendir() function opens a directory stream corresponding to the
2802 directory named by the dirname argument. The directory stream is positioned
2803 at the first entry. In both Unix and Windows we automatically skip the '.'
2804 and '..' items at the start of the directory listing.
2805 @param[in] dirname directory name; it must not contain a trailing
2806 '\' or '/'
2807 @param[in] is_fatal true if we should treat an error as a fatal
2808 error; if we try to open symlinks then we do
2809 not wish a fatal error if it happens not to be
2810 a directory
2811 @return directory stream, NULL if error */
2812 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)2813 os_file_opendir(
2814 const char* dirname,
2815 bool error_is_fatal)
2816 {
2817 os_file_dir_t dir;
2818 dir = opendir(dirname);
2819
2820 if (dir == NULL && error_is_fatal) {
2821 os_file_handle_error(dirname, "opendir");
2822 }
2823
2824 return(dir);
2825 }
2826
2827 /** Closes a directory stream.
2828 @param[in] dir directory stream
2829 @return 0 if success, -1 if failure */
2830 int
os_file_closedir(os_file_dir_t dir)2831 os_file_closedir(
2832 os_file_dir_t dir)
2833 {
2834 int ret = closedir(dir);
2835
2836 if (ret != 0) {
2837 os_file_handle_error_no_exit(NULL, "closedir", false);
2838 }
2839
2840 return(ret);
2841 }
2842
2843 /** This function returns information of the next file in the directory. We jump
2844 over the '.' and '..' entries in the directory.
2845 @param[in] dirname directory name or path
2846 @param[in] dir directory stream
2847 @param[out] info buffer where the info is returned
2848 @return 0 if ok, -1 if error, 1 if at the end of the directory */
2849 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)2850 os_file_readdir_next_file(
2851 const char* dirname,
2852 os_file_dir_t dir,
2853 os_file_stat_t* info)
2854 {
2855 struct dirent* ent;
2856 char* full_path;
2857 int ret;
2858 struct stat statinfo;
2859
2860 next_file:
2861
2862 ent = readdir(dir);
2863
2864 if (ent == NULL) {
2865
2866 return(1);
2867 }
2868
2869 ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
2870
2871 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
2872
2873 goto next_file;
2874 }
2875
2876 strcpy(info->name, ent->d_name);
2877
2878 full_path = static_cast<char*>(
2879 ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
2880
2881 sprintf(full_path, "%s/%s", dirname, ent->d_name);
2882
2883 ret = stat(full_path, &statinfo);
2884
2885 if (ret) {
2886
2887 if (errno == ENOENT) {
2888 /* readdir() returned a file that does not exist,
2889 it must have been deleted in the meantime. Do what
2890 would have happened if the file was deleted before
2891 readdir() - ignore and go to the next entry.
2892 If this is the last entry then info->name will still
2893 contain the name of the deleted file when this
2894 function returns, but this is not an issue since the
2895 caller shouldn't be looking at info when end of
2896 directory is returned. */
2897
2898 ut_free(full_path);
2899
2900 goto next_file;
2901 }
2902
2903 os_file_handle_error_no_exit(full_path, "stat", false);
2904
2905 ut_free(full_path);
2906
2907 return(-1);
2908 }
2909
2910 info->size = statinfo.st_size;
2911
2912 if (S_ISDIR(statinfo.st_mode)) {
2913 info->type = OS_FILE_TYPE_DIR;
2914 } else if (S_ISLNK(statinfo.st_mode)) {
2915 info->type = OS_FILE_TYPE_LINK;
2916 } else if (S_ISREG(statinfo.st_mode)) {
2917 info->type = OS_FILE_TYPE_FILE;
2918 } else {
2919 info->type = OS_FILE_TYPE_UNKNOWN;
2920 }
2921
2922 ut_free(full_path);
2923
2924 return(0);
2925 }
2926
2927 /** NOTE! Use the corresponding macro os_file_create(), not directly
2928 this function!
2929 Opens an existing file or creates a new.
2930 @param[in] name name of the file or path as a null-terminated
2931 string
2932 @param[in] create_mode create mode
2933 @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
2934 is desired, OS_FILE_NORMAL, if any normal file;
2935 NOTE that it also depends on type, os_aio_..
2936 and srv_.. variables whether we really use async
2937 I/O or unbuffered I/O: look in the function
2938 source code for the exact rules
2939 @param[in] type OS_DATA_FILE or OS_LOG_FILE
2940 @param[in] read_only true, if read only checks should be enforcedm
2941 @param[in] success true if succeeded
2942 @return handle to the file, not defined if error, error number
2943 can be retrieved with os_file_get_last_error */
2944 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)2945 os_file_create_func(
2946 const char* name,
2947 ulint create_mode,
2948 ulint purpose,
2949 ulint type,
2950 bool read_only,
2951 bool* success)
2952 {
2953 bool on_error_no_exit;
2954 bool on_error_silent;
2955
2956 *success = false;
2957
2958 DBUG_EXECUTE_IF(
2959 "ib_create_table_fail_disk_full",
2960 *success = false;
2961 errno = ENOSPC;
2962 return(OS_FILE_CLOSED);
2963 );
2964
2965 int create_flag;
2966 const char* mode_str = NULL;
2967
2968 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
2969 ? true : false;
2970 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
2971 ? true : false;
2972
2973 create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
2974 | OS_FILE_ON_ERROR_SILENT));
2975
2976 if (create_mode == OS_FILE_OPEN
2977 || create_mode == OS_FILE_OPEN_RAW
2978 || create_mode == OS_FILE_OPEN_RETRY) {
2979
2980 mode_str = "OPEN";
2981
2982 create_flag = read_only ? O_RDONLY : O_RDWR;
2983
2984 } else if (read_only) {
2985
2986 mode_str = "OPEN";
2987
2988 create_flag = O_RDONLY;
2989
2990 } else if (create_mode == OS_FILE_CREATE) {
2991
2992 mode_str = "CREATE";
2993 create_flag = O_RDWR | O_CREAT | O_EXCL;
2994
2995 } else if (create_mode == OS_FILE_OVERWRITE) {
2996
2997 mode_str = "OVERWRITE";
2998 create_flag = O_RDWR | O_CREAT | O_TRUNC;
2999
3000 } else {
3001 ib::error()
3002 << "Unknown file create mode (" << create_mode << ")"
3003 << " for file '" << name << "'";
3004
3005 return(OS_FILE_CLOSED);
3006 }
3007
3008 ut_a(type == OS_LOG_FILE
3009 || type == OS_DATA_FILE
3010 || type == OS_DATA_FILE_NO_O_DIRECT);
3011
3012 ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
3013
3014 #ifdef O_SYNC
3015 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
3016 O_SYNC because the datasync options seemed to corrupt files in 2001
3017 in both Linux and Solaris */
3018
3019 if (!read_only
3020 && type == OS_LOG_FILE
3021 && srv_file_flush_method == SRV_O_DSYNC) {
3022
3023 create_flag |= O_SYNC;
3024 }
3025 #endif /* O_SYNC */
3026
3027 os_file_t file;
3028 bool retry;
3029
3030 do {
3031 file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
3032
3033 if (file == -1) {
3034 const char* operation;
3035
3036 operation = (create_mode == OS_FILE_CREATE
3037 && !read_only) ? "create" : "open";
3038
3039 *success = false;
3040
3041 if (on_error_no_exit) {
3042 retry = os_file_handle_error_no_exit(
3043 name, operation, on_error_silent);
3044 } else {
3045 retry = os_file_handle_error(name, operation);
3046 }
3047 } else {
3048 *success = true;
3049 retry = false;
3050 }
3051
3052 } while (retry);
3053
3054 /* We disable OS caching (O_DIRECT) only on data files */
3055 if (!read_only
3056 && *success
3057 && (type != OS_LOG_FILE
3058 && type != OS_DATA_FILE_NO_O_DIRECT)
3059 && (srv_file_flush_method == SRV_O_DIRECT
3060 || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
3061
3062 os_file_set_nocache(file, name, mode_str);
3063 }
3064
3065 #ifdef USE_FILE_LOCK
3066 if (!read_only
3067 && *success
3068 && create_mode != OS_FILE_OPEN_RAW
3069 && os_file_lock(file, name)) {
3070
3071 if (create_mode == OS_FILE_OPEN_RETRY) {
3072
3073 ib::info()
3074 << "Retrying to lock the first data file";
3075
3076 for (int i = 0; i < 100; i++) {
3077 os_thread_sleep(1000000);
3078
3079 if (!os_file_lock(file, name)) {
3080 *success = true;
3081 return(file);
3082 }
3083 }
3084
3085 ib::info()
3086 << "Unable to open the first data file";
3087 }
3088
3089 *success = false;
3090 close(file);
3091 file = -1;
3092 }
3093 #endif /* USE_FILE_LOCK */
3094
3095 return(file);
3096 }
3097
3098 /** NOTE! Use the corresponding macro
3099 os_file_create_simple_no_error_handling(), not directly this function!
3100 A simple function to open or create a file.
3101 @param[in] name name of the file or path as a null-terminated
3102 string
3103 @param[in] create_mode create mode
3104 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3105 OS_FILE_READ_ALLOW_DELETE; the last option
3106 is used by a backup program reading the file
3107 @param[in] read_only if true read only mode checks are enforced
3108 @param[out] success true if succeeded
3109 @return own: handle to the file, not defined if error, error number
3110 can be retrieved with os_file_get_last_error */
3111 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3112 os_file_create_simple_no_error_handling_func(
3113 const char* name,
3114 ulint create_mode,
3115 ulint access_type,
3116 bool read_only,
3117 bool* success)
3118 {
3119 os_file_t file;
3120 int create_flag;
3121
3122 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
3123 WAIT_ALLOW_WRITES();
3124 }
3125
3126 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3127 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3128
3129 *success = false;
3130
3131 if (create_mode == OS_FILE_OPEN) {
3132
3133 if (access_type == OS_FILE_READ_ONLY) {
3134
3135 create_flag = O_RDONLY;
3136
3137 } else if (read_only) {
3138
3139 create_flag = O_RDONLY;
3140
3141 } else {
3142
3143 ut_a(access_type == OS_FILE_READ_WRITE
3144 || access_type == OS_FILE_READ_ALLOW_DELETE);
3145
3146 create_flag = O_RDWR;
3147 }
3148
3149 } else if (read_only) {
3150
3151 create_flag = O_RDONLY;
3152
3153 } else if (create_mode == OS_FILE_CREATE) {
3154
3155 create_flag = O_RDWR | O_CREAT | O_EXCL;
3156
3157 } else {
3158
3159 ib::error()
3160 << "Unknown file create mode "
3161 << create_mode << " for file '" << name << "'";
3162
3163 return(OS_FILE_CLOSED);
3164 }
3165
3166 file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
3167
3168 *success = (file != -1);
3169
3170 #ifdef USE_FILE_LOCK
3171 if (!read_only
3172 && *success
3173 && access_type == OS_FILE_READ_WRITE
3174 && os_file_lock(file, name)) {
3175
3176 *success = false;
3177 close(file);
3178 file = -1;
3179
3180 }
3181 #endif /* USE_FILE_LOCK */
3182
3183 return(file);
3184 }
3185
3186 /** Deletes a file if it exists. The file has to be closed before calling this.
3187 @param[in] name file path as a null-terminated string
3188 @param[out] exist indicate if file pre-exist
3189 @return true if success */
3190 bool
os_file_delete_if_exists_func(const char * name,bool * exist)3191 os_file_delete_if_exists_func(
3192 const char* name,
3193 bool* exist)
3194 {
3195 if (exist != NULL) {
3196 *exist = true;
3197 }
3198
3199 int ret;
3200 WAIT_ALLOW_WRITES();
3201
3202 ret = unlink(name);
3203
3204 if (ret != 0 && errno == ENOENT) {
3205 if (exist != NULL) {
3206 *exist = false;
3207 }
3208 } else if (ret != 0 && errno != ENOENT) {
3209 os_file_handle_error_no_exit(name, "delete", false);
3210
3211 return(false);
3212 }
3213
3214 return(true);
3215 }
3216
3217 /** Deletes a file. The file has to be closed before calling this.
3218 @param[in] name file path as a null-terminated string
3219 @return true if success */
3220 bool
os_file_delete_func(const char * name)3221 os_file_delete_func(
3222 const char* name)
3223 {
3224 int ret;
3225 WAIT_ALLOW_WRITES();
3226
3227 ret = unlink(name);
3228
3229 if (ret != 0) {
3230 os_file_handle_error_no_exit(name, "delete", FALSE);
3231
3232 return(false);
3233 }
3234
3235 return(true);
3236 }
3237
3238 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3239 function!
3240 Renames a file (can also move it to another directory). It is safest that the
3241 file is closed before calling this function.
3242 @param[in] oldpath old file path as a null-terminated string
3243 @param[in] newpath new file path
3244 @return true if success */
3245 bool
os_file_rename_func(const char * oldpath,const char * newpath)3246 os_file_rename_func(
3247 const char* oldpath,
3248 const char* newpath)
3249 {
3250 #ifdef UNIV_DEBUG
3251 os_file_type_t type;
3252 bool exists;
3253
3254 /* New path must not exist. */
3255 ut_ad(os_file_status(newpath, &exists, &type));
3256 ut_ad(!exists);
3257
3258 /* Old path must exist. */
3259 ut_ad(os_file_status(oldpath, &exists, &type));
3260 ut_ad(exists);
3261 #endif /* UNIV_DEBUG */
3262
3263 int ret;
3264 WAIT_ALLOW_WRITES();
3265
3266 ret = rename(oldpath, newpath);
3267
3268 if (ret != 0) {
3269 os_file_handle_rename_error(oldpath, newpath);
3270
3271 return(false);
3272 }
3273
3274 return(true);
3275 }
3276
3277 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3278 function!
3279 Closes a file handle. In case of error, error number can be retrieved with
3280 os_file_get_last_error.
3281 @param[in] file Handle to close
3282 @return true if success */
3283 bool
os_file_close_func(os_file_t file)3284 os_file_close_func(
3285 os_file_t file)
3286 {
3287 int ret = close(file);
3288
3289 if (ret == -1) {
3290 os_file_handle_error(NULL, "close");
3291
3292 return(false);
3293 }
3294
3295 return(true);
3296 }
3297
3298 /** Gets a file size.
3299 @param[in] file handle to an open file
3300 @return file size, or (os_offset_t) -1 on failure */
3301 os_offset_t
os_file_get_size(os_file_t file)3302 os_file_get_size(os_file_t file)
3303 {
3304 struct stat statbuf;
3305 return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size;
3306 }
3307
3308 /** Gets a file size.
3309 @param[in] filename Full path to the filename to check
3310 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3311 errno */
3312 os_file_size_t
os_file_get_size(const char * filename)3313 os_file_get_size(
3314 const char* filename)
3315 {
3316 struct stat s;
3317 os_file_size_t file_size;
3318
3319 int ret = stat(filename, &s);
3320
3321 if (ret == 0) {
3322 file_size.m_total_size = s.st_size;
3323 /* st_blocks is in 512 byte sized blocks */
3324 file_size.m_alloc_size = s.st_blocks * 512;
3325 } else {
3326 file_size.m_total_size = ~0U;
3327 file_size.m_alloc_size = (os_offset_t) errno;
3328 }
3329
3330 return(file_size);
3331 }
3332
3333 /** This function returns information about the specified file
3334 @param[in] path pathname of the file
3335 @param[out] stat_info information of a file in a directory
3336 @param[in,out] statinfo information of a file in a directory
3337 @param[in] check_rw_perm for testing whether the file can be opened
3338 in RW mode
3339 @param[in] read_only if true read only mode checks are enforced
3340 @return DB_SUCCESS if all OK */
3341 static
3342 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3343 os_file_get_status_posix(
3344 const char* path,
3345 os_file_stat_t* stat_info,
3346 struct stat* statinfo,
3347 bool check_rw_perm,
3348 bool read_only)
3349 {
3350 int ret = stat(path, statinfo);
3351
3352 if (ret && (errno == ENOENT || errno == ENOTDIR
3353 || errno == ENAMETOOLONG)) {
3354 /* file does not exist */
3355
3356 return(DB_NOT_FOUND);
3357
3358 } else if (ret) {
3359 /* file exists, but stat call failed */
3360
3361 os_file_handle_error_no_exit(path, "stat", false);
3362
3363 return(DB_FAIL);
3364 }
3365
3366 switch (statinfo->st_mode & S_IFMT) {
3367 case S_IFDIR:
3368 stat_info->type = OS_FILE_TYPE_DIR;
3369 break;
3370 case S_IFLNK:
3371 stat_info->type = OS_FILE_TYPE_LINK;
3372 break;
3373 case S_IFBLK:
3374 /* Handle block device as regular file. */
3375 case S_IFCHR:
3376 /* Handle character device as regular file. */
3377 case S_IFREG:
3378 stat_info->type = OS_FILE_TYPE_FILE;
3379 break;
3380 default:
3381 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3382 }
3383
3384 stat_info->size = statinfo->st_size;
3385 stat_info->block_size = statinfo->st_blksize;
3386 stat_info->alloc_size = statinfo->st_blocks * 512;
3387
3388 if (check_rw_perm
3389 && (stat_info->type == OS_FILE_TYPE_FILE
3390 || stat_info->type == OS_FILE_TYPE_BLOCK)) {
3391
3392 stat_info->rw_perm = !access(path, read_only
3393 ? R_OK : R_OK | W_OK);
3394 }
3395
3396 return(DB_SUCCESS);
3397 }
3398
3399 /** Truncates a file to a specified size in bytes.
3400 Do nothing if the size to preserve is greater or equal to the current
3401 size of the file.
3402 @param[in] pathname file path
3403 @param[in] file file to be truncated
3404 @param[in] size size to preserve in bytes
3405 @return true if success */
3406 static
3407 bool
os_file_truncate_posix(const char * pathname,os_file_t file,os_offset_t size)3408 os_file_truncate_posix(
3409 const char* pathname,
3410 os_file_t file,
3411 os_offset_t size)
3412 {
3413 int res = ftruncate(file, size);
3414
3415 if (res == -1) {
3416
3417 bool retry;
3418
3419 retry = os_file_handle_error_no_exit(
3420 pathname, "truncate", false);
3421
3422 if (retry) {
3423 ib::warn()
3424 << "Truncate failed for '"
3425 << pathname << "'";
3426 }
3427 }
3428
3429 return(res == 0);
3430 }
3431
3432 /** Truncates a file at its current position.
3433 @return true if success */
3434 bool
os_file_set_eof(FILE * file)3435 os_file_set_eof(
3436 FILE* file) /*!< in: file to be truncated */
3437 {
3438 WAIT_ALLOW_WRITES();
3439 return(!ftruncate(fileno(file), ftell(file)));
3440 }
3441
3442 #else /* !_WIN32 */
3443
3444 #include <WinIoCtl.h>
3445
3446 /*
3447 Windows : Handling synchronous IO on files opened asynchronously.
3448
3449 If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to
3450 a completion port, then every IO on this file would normally be enqueued to the
3451 completion port. Sometimes however we would like to do a synchronous IO. This is
possible if we initialize overlapped.hEvent with a valid event and set its
3453 lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info)
3454
3455 We'll create this special event once for each thread and store in thread local
3456 storage.
3457 */
3458
3459
win_free_syncio_event(void * data)3460 static void __stdcall win_free_syncio_event(void *data) {
3461 if (data) {
3462 CloseHandle((HANDLE)data);
3463 }
3464 }
3465
3466
3467 /*
3468 Retrieve per-thread event for doing synchronous io on asyncronously opened files
3469 */
win_get_syncio_event()3470 static HANDLE win_get_syncio_event()
3471 {
3472 HANDLE h;
3473
3474 h = (HANDLE)FlsGetValue(fls_sync_io);
3475 if (h) {
3476 return h;
3477 }
3478 h = CreateEventA(NULL, FALSE, FALSE, NULL);
3479 ut_a(h);
3480 /* Set low-order bit to keeps I/O completion from being queued */
3481 h = (HANDLE)((uintptr_t)h | 1);
3482 FlsSetValue(fls_sync_io, h);
3483 return h;
3484 }
3485
3486
3487 /** Do the read/write
3488 @param[in] request The IO context and type
3489 @return the number of bytes read/written or negative value on error */
3490 ssize_t
execute(const IORequest & request)3491 SyncFileIO::execute(const IORequest& request)
3492 {
3493 OVERLAPPED seek;
3494
3495 memset(&seek, 0x0, sizeof(seek));
3496
3497 seek.hEvent = win_get_syncio_event();
3498 seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
3499 seek.OffsetHigh = (DWORD) (m_offset >> 32);
3500
3501 BOOL ret;
3502 DWORD n_bytes;
3503
3504 if (request.is_read()) {
3505 ret = ReadFile(m_fh, m_buf,
3506 static_cast<DWORD>(m_n), NULL, &seek);
3507
3508 } else {
3509 ut_ad(request.is_write());
3510 ret = WriteFile(m_fh, m_buf,
3511 static_cast<DWORD>(m_n), NULL, &seek);
3512 }
3513 if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3514 /* Wait for async io to complete */
3515 ret = GetOverlappedResult(m_fh, &seek, &n_bytes, TRUE);
3516 }
3517
3518 return(ret ? static_cast<ssize_t>(n_bytes) : -1);
3519 }
3520
3521 /** Do the read/write
3522 @param[in,out] slot The IO slot, it has the IO context
3523 @return the number of bytes read/written or negative value on error */
3524 ssize_t
execute(Slot * slot)3525 SyncFileIO::execute(Slot* slot)
3526 {
3527 BOOL ret;
3528 slot->control.hEvent = win_get_syncio_event();
3529 if (slot->type.is_read()) {
3530
3531 ret = ReadFile(
3532 slot->file, slot->ptr, slot->len,
3533 NULL, &slot->control);
3534
3535 } else {
3536 ut_ad(slot->type.is_write());
3537
3538 ret = WriteFile(
3539 slot->file, slot->ptr, slot->len,
3540 NULL, &slot->control);
3541
3542 }
3543 if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3544 /* Wait for async io to complete */
3545 ret = GetOverlappedResult(slot->file, &slot->control, &slot->n_bytes, TRUE);
3546 }
3547
3548 return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
3549 }
3550
3551 /* Startup/shutdown */
3552
3553 struct WinIoInit
3554 {
WinIoInitWinIoInit3555 WinIoInit() {
3556 fls_sync_io = FlsAlloc(win_free_syncio_event);
3557 ut_a(fls_sync_io != FLS_OUT_OF_INDEXES);
3558 }
3559
~WinIoInitWinIoInit3560 ~WinIoInit() {
3561 FlsFree(fls_sync_io);
3562 }
3563 };
3564
3565 /* Ensures proper initialization and shutdown */
3566 static WinIoInit win_io_init;
3567
3568
3569 /** Free storage space associated with a section of the file.
3570 @param[in] fh Open file handle
3571 @param[in] page_size Tablespace page size
3572 @param[in] block_size File system block size
3573 @param[in] off Starting offset (SEEK_SET)
3574 @param[in] len Size of the hole
3575 @return 0 on success or errno */
3576 static
3577 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)3578 os_file_punch_hole_win32(
3579 os_file_t fh,
3580 os_offset_t off,
3581 os_offset_t len)
3582 {
3583 FILE_ZERO_DATA_INFORMATION punch;
3584
3585 punch.FileOffset.QuadPart = off;
3586 punch.BeyondFinalZero.QuadPart = off + len;
3587
3588 /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
3589 therefore we pass a dummy parameter. */
3590 DWORD temp;
3591 BOOL success = os_win32_device_io_control(
3592 fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
3593 NULL, 0, &temp);
3594
3595 return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE);
3596 }
3597
3598 /** Check the existence and type of the given file.
3599 @param[in] path path name of file
3600 @param[out] exists true if the file exists
3601 @param[out] type Type of the file, if it exists
3602 @return true if call succeeded */
3603 static
3604 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)3605 os_file_status_win32(
3606 const char* path,
3607 bool* exists,
3608 os_file_type_t* type)
3609 {
3610 int ret;
3611 struct _stat64 statinfo;
3612
3613 ret = _stat64(path, &statinfo);
3614
3615 *exists = !ret;
3616
3617 if (!ret) {
3618 /* file exists, everything OK */
3619
3620 } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
3621 /* file does not exist */
3622 return(true);
3623
3624 } else {
3625 /* file exists, but stat call failed */
3626 os_file_handle_error_no_exit(path, "stat", false);
3627 return(false);
3628 }
3629
3630 if (_S_IFDIR & statinfo.st_mode) {
3631 *type = OS_FILE_TYPE_DIR;
3632
3633 } else if (_S_IFREG & statinfo.st_mode) {
3634 *type = OS_FILE_TYPE_FILE;
3635
3636 } else {
3637 *type = OS_FILE_TYPE_UNKNOWN;
3638 }
3639
3640 return(true);
3641 }
3642
3643 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3644 function!
3645 Flushes the write buffers of a given file to the disk.
3646 @param[in] file handle to a file
3647 @return true if success */
3648 bool
os_file_flush_func(os_file_t file)3649 os_file_flush_func(
3650 os_file_t file)
3651 {
3652 ++os_n_fsyncs;
3653
3654 BOOL ret = FlushFileBuffers(file);
3655
3656 if (ret) {
3657 return(true);
3658 }
3659
3660 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
3661 actually a raw device, we choose to ignore that error if we are using
3662 raw disks */
3663
3664 if (srv_start_raw_disk_in_use && GetLastError()
3665 == ERROR_INVALID_FUNCTION) {
3666 return(true);
3667 }
3668
3669 os_file_handle_error(NULL, "flush");
3670
3671 /* It is a fatal error if a file flush does not succeed, because then
3672 the database can get corrupt on disk */
3673 ut_error;
3674
3675 return(false);
3676 }
3677
3678 /** Retrieves the last error number if an error occurs in a file io function.
3679 The number should be retrieved before any other OS calls (because they may
3680 overwrite the error number). If the number is not known to this program,
3681 the OS error number + 100 is returned.
3682 @param[in] report_all_errors true if we want an error message printed
3683 of all errors
3684 @param[in] on_error_silent true then don't print any diagnostic
3685 to the log
3686 @return error number, or OS error number + 100 */
3687 static
3688 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3689 os_file_get_last_error_low(
3690 bool report_all_errors,
3691 bool on_error_silent)
3692 {
3693 ulint err = (ulint) GetLastError();
3694
3695 if (err == ERROR_SUCCESS) {
3696 return(0);
3697 }
3698
3699 if (report_all_errors
3700 || (!on_error_silent
3701 && err != ERROR_DISK_FULL
3702 && err != ERROR_FILE_EXISTS)) {
3703
3704 ib::error()
3705 << "Operating system error number " << err
3706 << " in a file operation.";
3707
3708 if (err == ERROR_PATH_NOT_FOUND) {
3709 ib::error()
3710 << "The error means the system"
3711 " cannot find the path specified.";
3712
3713 if (srv_is_being_started) {
3714 ib::error()
3715 << "If you are installing InnoDB,"
3716 " remember that you must create"
3717 " directories yourself, InnoDB"
3718 " does not create them.";
3719 }
3720
3721 } else if (err == ERROR_ACCESS_DENIED) {
3722
3723 ib::error()
3724 << "The error means mysqld does not have"
3725 " the access rights to"
3726 " the directory. It may also be"
3727 " you have created a subdirectory"
3728 " of the same name as a data file.";
3729
3730 } else if (err == ERROR_SHARING_VIOLATION
3731 || err == ERROR_LOCK_VIOLATION) {
3732
3733 ib::error()
3734 << "The error means that another program"
3735 " is using InnoDB's files."
3736 " This might be a backup or antivirus"
3737 " software or another instance"
3738 " of MySQL."
3739 " Please close it to get rid of this error.";
3740
3741 } else if (err == ERROR_WORKING_SET_QUOTA
3742 || err == ERROR_NO_SYSTEM_RESOURCES) {
3743
3744 ib::error()
3745 << "The error means that there are no"
3746 " sufficient system resources or quota to"
3747 " complete the operation.";
3748
3749 } else if (err == ERROR_OPERATION_ABORTED) {
3750
3751 ib::error()
3752 << "The error means that the I/O"
3753 " operation has been aborted"
3754 " because of either a thread exit"
3755 " or an application request."
3756 " Retry attempt is made.";
3757 } else {
3758
3759 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3760 }
3761 }
3762
3763 if (err == ERROR_FILE_NOT_FOUND) {
3764 return(OS_FILE_NOT_FOUND);
3765 } else if (err == ERROR_DISK_FULL) {
3766 return(OS_FILE_DISK_FULL);
3767 } else if (err == ERROR_FILE_EXISTS) {
3768 return(OS_FILE_ALREADY_EXISTS);
3769 } else if (err == ERROR_SHARING_VIOLATION
3770 || err == ERROR_LOCK_VIOLATION) {
3771 return(OS_FILE_SHARING_VIOLATION);
3772 } else if (err == ERROR_WORKING_SET_QUOTA
3773 || err == ERROR_NO_SYSTEM_RESOURCES) {
3774 return(OS_FILE_INSUFFICIENT_RESOURCE);
3775 } else if (err == ERROR_OPERATION_ABORTED) {
3776 return(OS_FILE_OPERATION_ABORTED);
3777 } else if (err == ERROR_ACCESS_DENIED) {
3778 return(OS_FILE_ACCESS_VIOLATION);
3779 }
3780
3781 return(OS_FILE_ERROR_MAX + err);
3782 }
3783
3784
3785 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3786 this function!
3787 A simple function to open or create a file.
3788 @param[in] name name of the file or path as a null-terminated
3789 string
3790 @param[in] create_mode create mode
3791 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3792 @param[in] read_only if true read only mode checks are enforced
3793 @param[out] success true if succeed, false if error
3794 @return handle to the file, not defined if error, error number
3795 can be retrieved with os_file_get_last_error */
3796 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3797 os_file_create_simple_func(
3798 const char* name,
3799 ulint create_mode,
3800 ulint access_type,
3801 bool read_only,
3802 bool* success)
3803 {
3804 os_file_t file;
3805
3806 *success = false;
3807
3808 DWORD access;
3809 DWORD create_flag;
3810 DWORD attributes = 0;
3811
3812 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3813 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3814 ut_ad(srv_operation == SRV_OPERATION_NORMAL);
3815
3816 if (create_mode == OS_FILE_OPEN) {
3817
3818 create_flag = OPEN_EXISTING;
3819
3820 } else if (read_only) {
3821
3822 create_flag = OPEN_EXISTING;
3823
3824 } else if (create_mode == OS_FILE_CREATE) {
3825
3826 create_flag = CREATE_NEW;
3827
3828 } else if (create_mode == OS_FILE_CREATE_PATH) {
3829
3830 /* Create subdirs along the path if needed. */
3831 *success = os_file_create_subdirs_if_needed(name);
3832
3833 if (!*success) {
3834
3835 ib::error()
3836 << "Unable to create subdirectories '"
3837 << name << "'";
3838
3839 return(OS_FILE_CLOSED);
3840 }
3841
3842 create_flag = CREATE_NEW;
3843 create_mode = OS_FILE_CREATE;
3844
3845 } else {
3846
3847 ib::error()
3848 << "Unknown file create mode ("
3849 << create_mode << ") for file '"
3850 << name << "'";
3851
3852 return(OS_FILE_CLOSED);
3853 }
3854
3855 if (access_type == OS_FILE_READ_ONLY) {
3856
3857 access = GENERIC_READ;
3858
3859 } else if (read_only) {
3860
3861 ib::info()
3862 << "Read only mode set. Unable to"
3863 " open file '" << name << "' in RW mode, "
3864 << "trying RO mode";
3865
3866 access = GENERIC_READ;
3867
3868 } else if (access_type == OS_FILE_READ_WRITE) {
3869
3870 access = GENERIC_READ | GENERIC_WRITE;
3871
3872 } else {
3873
3874 ib::error()
3875 << "Unknown file access type (" << access_type << ") "
3876 "for file '" << name << "'";
3877
3878 return(OS_FILE_CLOSED);
3879 }
3880
3881 bool retry;
3882
3883 do {
3884 /* Use default security attributes and no template file. */
3885
3886 file = CreateFile(
3887 (LPCTSTR) name, access,
3888 FILE_SHARE_READ | FILE_SHARE_DELETE, NULL,
3889 create_flag, attributes, NULL);
3890
3891 if (file == INVALID_HANDLE_VALUE) {
3892
3893 *success = false;
3894
3895 retry = os_file_handle_error(
3896 name, create_mode == OS_FILE_OPEN ?
3897 "open" : "create");
3898
3899 } else {
3900
3901 retry = false;
3902
3903 *success = true;
3904 }
3905
3906 } while (retry);
3907
3908 return(file);
3909 }
3910
3911 /** This function attempts to create a directory named pathname. The new
3912 directory gets default permissions. On Unix the permissions are
3913 (0770 & ~umask). If the directory exists already, nothing is done and
3914 the call succeeds, unless the fail_if_exists arguments is true.
3915 If another error occurs, such as a permission error, this does not crash,
3916 but reports the error and returns false.
3917 @param[in] pathname directory name as null-terminated string
3918 @param[in] fail_if_exists if true, pre-existing directory is treated
3919 as an error.
3920 @return true if call succeeds, false on error */
3921 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3922 os_file_create_directory(
3923 const char* pathname,
3924 bool fail_if_exists)
3925 {
3926 BOOL rcode;
3927
3928 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
3929 if (!(rcode != 0
3930 || (GetLastError() == ERROR_ALREADY_EXISTS
3931 && !fail_if_exists))) {
3932
3933 os_file_handle_error_no_exit(
3934 pathname, "CreateDirectory", false);
3935
3936 return(false);
3937 }
3938
3939 return(true);
3940 }
3941
3942 /** The os_file_opendir() function opens a directory stream corresponding to the
3943 directory named by the dirname argument. The directory stream is positioned
3944 at the first entry. In both Unix and Windows we automatically skip the '.'
3945 and '..' items at the start of the directory listing.
3946 @param[in] dirname directory name; it must not contain a trailing
3947 '\' or '/'
3948 @param[in] is_fatal true if we should treat an error as a fatal
3949 error; if we try to open symlinks then we do
3950 not wish a fatal error if it happens not to
3951 be a directory
3952 @return directory stream, NULL if error */
3953 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3954 os_file_opendir(
3955 const char* dirname,
3956 bool error_is_fatal)
3957 {
3958 os_file_dir_t dir;
3959 LPWIN32_FIND_DATA lpFindFileData;
3960 char path[OS_FILE_MAX_PATH + 3];
3961
3962 ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
3963
3964 strcpy(path, dirname);
3965 strcpy(path + strlen(path), "\\*");
3966
3967 /* Note that in Windows opening the 'directory stream' also retrieves
3968 the first entry in the directory. Since it is '.', that is no problem,
3969 as we will skip over the '.' and '..' entries anyway. */
3970
3971 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
3972 ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
3973
3974 dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
3975
3976 ut_free(lpFindFileData);
3977
3978 if (dir == INVALID_HANDLE_VALUE) {
3979
3980 if (error_is_fatal) {
3981 os_file_handle_error(dirname, "opendir");
3982 }
3983
3984 return(NULL);
3985 }
3986
3987 return(dir);
3988 }
3989
3990 /** Closes a directory stream.
3991 @param[in] dir directory stream
3992 @return 0 if success, -1 if failure */
3993 int
os_file_closedir(os_file_dir_t dir)3994 os_file_closedir(
3995 os_file_dir_t dir)
3996 {
3997 BOOL ret;
3998
3999 ret = FindClose(dir);
4000
4001 if (!ret) {
4002 os_file_handle_error_no_exit(NULL, "closedir", false);
4003
4004 return(-1);
4005 }
4006
4007 return(0);
4008 }
4009
4010 /** This function returns information of the next file in the directory. We
4011 jump over the '.' and '..' entries in the directory.
4012 @param[in] dirname directory name or path
4013 @param[in] dir directory stream
4014 @param[out] info buffer where the info is returned
4015 @return 0 if ok, -1 if error, 1 if at the end of the directory */
4016 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4017 os_file_readdir_next_file(
4018 const char* dirname,
4019 os_file_dir_t dir,
4020 os_file_stat_t* info)
4021 {
4022 BOOL ret;
4023 int status;
4024 WIN32_FIND_DATA find_data;
4025
4026 next_file:
4027
4028 ret = FindNextFile(dir, &find_data);
4029
4030 if (ret > 0) {
4031
4032 const char* name;
4033
4034 name = static_cast<const char*>(find_data.cFileName);
4035
4036 ut_a(strlen(name) < OS_FILE_MAX_PATH);
4037
4038 if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
4039
4040 goto next_file;
4041 }
4042
4043 strcpy(info->name, name);
4044
4045 info->size = find_data.nFileSizeHigh;
4046 info->size <<= 32;
4047 info->size |= find_data.nFileSizeLow;
4048
4049 if (find_data.dwFileAttributes
4050 & FILE_ATTRIBUTE_REPARSE_POINT) {
4051
4052 /* TODO: test Windows symlinks */
4053 /* TODO: MySQL has apparently its own symlink
4054 implementation in Windows, dbname.sym can
4055 redirect a database directory:
4056 REFMAN "windows-symbolic-links.html" */
4057
4058 info->type = OS_FILE_TYPE_LINK;
4059
4060 } else if (find_data.dwFileAttributes
4061 & FILE_ATTRIBUTE_DIRECTORY) {
4062
4063 info->type = OS_FILE_TYPE_DIR;
4064
4065 } else {
4066
4067 /* It is probably safest to assume that all other
4068 file types are normal. Better to check them rather
4069 than blindly skip them. */
4070
4071 info->type = OS_FILE_TYPE_FILE;
4072 }
4073
4074 status = 0;
4075
4076 } else if (GetLastError() == ERROR_NO_MORE_FILES) {
4077
4078 status = 1;
4079
4080 } else {
4081
4082 os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
4083
4084 status = -1;
4085 }
4086
4087 return(status);
4088 }
4089
4090 /** Check that IO of specific size is possible for the file
4091 opened with FILE_FLAG_NO_BUFFERING.
4092
4093 The requirement is that IO is multiple of the disk sector size.
4094
4095 @param[in] file file handle
4096 @param[in] io_size expected io size
4097 @return true - unbuffered io of requested size is possible, false otherwise.
4098
4099 @note: this function only works correctly with Windows 8 or later,
4100 (GetFileInformationByHandleEx with FileStorageInfo is only supported there).
4101 It will return true on earlier Windows version.
4102 */
unbuffered_io_possible(HANDLE file,size_t io_size)4103 static bool unbuffered_io_possible(HANDLE file, size_t io_size)
4104 {
4105 FILE_STORAGE_INFO info;
4106 if (GetFileInformationByHandleEx(
4107 file, FileStorageInfo, &info, sizeof(info))) {
4108 ULONG sector_size = info.LogicalBytesPerSector;
4109 if (sector_size)
4110 return io_size % sector_size == 0;
4111 }
4112 return true;
4113 }
4114
4115
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode (OS_FILE_OPEN, OS_FILE_OPEN_RAW,
				OS_FILE_OPEN_RETRY, OS_FILE_CREATE or
				OS_FILE_OVERWRITE), optionally OR-ed with
				OS_FILE_ON_ERROR_NO_EXIT and/or
				OS_FILE_ON_ERROR_SILENT
@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
				is desired, OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use async
				I/O or unbuffered I/O: look in the function
				source code for the exact rules
@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
				(OS_DATA_FILE_NO_O_DIRECT is also tested for
				below)
@param[in]	read_only	if true, the file is opened without
				GENERIC_WRITE access and the create modes
				fall back to OPEN_EXISTING; must be false
				for OS_FILE_OPEN_RAW
@param[out]	success		true if succeeded
@return handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
pfs_os_file_t
os_file_create_func(
	const char*	name,
	ulint		create_mode,
	ulint		purpose,
	ulint		type,
	bool		read_only,
	bool*		success)
{
	os_file_t	file;
	bool		retry;
	bool		on_error_no_exit;
	bool		on_error_silent;

	*success = false;

	/* Debug injection point: simulate a disk-full failure. */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = false;
		SetLastError(ERROR_DISK_FULL);
		return(OS_FILE_CLOSED);
	);

	DWORD		create_flag;
	/* Outside normal server operation, other processes are also
	allowed to write to the file. */
	DWORD		share_mode = srv_operation != SRV_OPERATION_NORMAL
		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
		: FILE_SHARE_READ | FILE_SHARE_DELETE;

	/* NOTE: this test runs before the error-handling flags are masked
	out of create_mode below. */
	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
		WAIT_ALLOW_WRITES();
	}

	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? true : false;

	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? true : false;

	/* Strip the error-handling flags, leaving the plain create mode. */
	create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);

	if (create_mode == OS_FILE_OPEN_RAW) {

		ut_a(!read_only);

		create_flag = OPEN_EXISTING;

		/* On Windows Physical devices require admin privileges and
		have to have the write-share mode set. See the remarks
		section for the CreateFile() function documentation in MSDN. */

		share_mode |= FILE_SHARE_WRITE;

	} else if (create_mode == OS_FILE_OPEN
		   || create_mode == OS_FILE_OPEN_RETRY) {

		create_flag = OPEN_EXISTING;

	} else if (read_only) {

		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		create_flag = CREATE_ALWAYS;

	} else {
		ib::error()
			<< "Unknown file create mode (" << create_mode << ") "
			<< " for file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	DWORD		attributes = 0;

	if (purpose == OS_FILE_AIO) {

#ifdef WIN_ASYNC_IO
		/* If specified, use asynchronous (overlapped) io and no
		buffering of writes in the OS */

		if (srv_use_native_aio) {
			attributes |= FILE_FLAG_OVERLAPPED;
		}
#endif /* WIN_ASYNC_IO */

	} else if (purpose == OS_FILE_NORMAL) {

		/* Use default setting. */

	} else {

		ib::error()
			<< "Unknown purpose flag (" << purpose << ") "
			<< "while opening file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	if (type == OS_LOG_FILE) {
		/* There is no reason to use buffered writes to logs. */
		attributes |= FILE_FLAG_NO_BUFFERING;
	}

	/* Adjust the buffering/write-through flags according to the
	configured flush method. */
	switch (srv_file_flush_method)
	{
	case SRV_O_DSYNC:
		if (type == OS_LOG_FILE) {
			/* Map O_SYNC to FILE_WRITE_THROUGH */
			attributes |= FILE_FLAG_WRITE_THROUGH;
		}
		break;

	case SRV_O_DIRECT_NO_FSYNC:
	case SRV_O_DIRECT:
		if (type == OS_DATA_FILE) {
			attributes |= FILE_FLAG_NO_BUFFERING;
		}
		break;

	case SRV_ALL_O_DIRECT_FSYNC:
		/*Traditional Windows behavior, no buffering for any files.*/
		if (type != OS_DATA_FILE_NO_O_DIRECT) {
			attributes |= FILE_FLAG_NO_BUFFERING;
		}
		break;

	case SRV_FSYNC:
	case SRV_LITTLESYNC:
		break;

	case SRV_NOSYNC:
		/* Let Windows cache manager handle all writes.*/
		attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
		break;

	default:
		ut_a(false); /* unknown flush mode.*/
	}


	// TODO: Create a bug, this looks wrong. The flush log
	// parameter is dynamic.
	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
		/* Do not use unbuffered i/o for the log files because
		value 2 denotes that we do not flush the log at every
		commit, but only once per second */
		attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
	}


	DWORD	access = GENERIC_READ;

	if (!read_only) {
		access |= GENERIC_WRITE;
	}

	/* Open/create loop: repeated when the file has to be reopened
	without FILE_FLAG_NO_BUFFERING, or when the error handler asks
	for a retry. */
	for (;;) {
		const  char *operation;

		/* Use default security attributes and no template file. */
		file = CreateFile(
			name, access, share_mode, NULL,
			create_flag, attributes, NULL);

		/* If FILE_FLAG_NO_BUFFERING was set, check whether it can
		work at all for the expected IO sizes. Reopen without the
		unbuffered flag if it won't work. */
		if ((file != INVALID_HANDLE_VALUE)
			&& (attributes & FILE_FLAG_NO_BUFFERING)
			&& (type == OS_LOG_FILE)
			&& !unbuffered_io_possible(file, OS_FILE_LOG_BLOCK_SIZE)) {
				ut_a(CloseHandle(file));
				attributes &= ~FILE_FLAG_NO_BUFFERING;
				/* The first CreateFile() call above succeeded,
				so the retry uses OPEN_ALWAYS. */
				create_flag = OPEN_ALWAYS;
				continue;
		}

		*success = (file != INVALID_HANDLE_VALUE);
		if (*success) {
			break;
		}

		operation = (create_mode == OS_FILE_CREATE && !read_only) ?
			"create" : "open";

		if (on_error_no_exit) {
			retry = os_file_handle_error_no_exit(
				name, operation, on_error_silent);
		}
		else {
			retry = os_file_handle_error(name, operation);
		}

		if (!retry) {
			break;
		}
	}

	if (*success && srv_use_native_aio && (attributes & FILE_FLAG_OVERLAPPED)) {
		/* Bind the file handle to completion port. Completion port
		might not be created yet, in some stages of backup, but
		must always be there for the server.*/
		HANDLE port = (type == OS_LOG_FILE) ?
			log_completion_port : data_completion_port;
		ut_a(port || srv_operation != SRV_OPERATION_NORMAL);
		if (port) {
			ut_a(CreateIoCompletionPort(file, port, 0, 0));
		}
	}

	return(file);
}
4348
4349 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4350 not directly this function!
4351 A simple function to open or create a file.
4352 @param[in] name name of the file or path as a null-terminated
4353 string
4354 @param[in] create_mode create mode
4355 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4356 OS_FILE_READ_ALLOW_DELETE; the last option is
4357 used by a backup program reading the file
4358 @param[out] success true if succeeded
4359 @return own: handle to the file, not defined if error, error number
4360 can be retrieved with os_file_get_last_error */
4361 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4362 os_file_create_simple_no_error_handling_func(
4363 const char* name,
4364 ulint create_mode,
4365 ulint access_type,
4366 bool read_only,
4367 bool* success)
4368 {
4369 os_file_t file;
4370
4371 *success = false;
4372
4373 DWORD access;
4374 DWORD create_flag;
4375 DWORD attributes = 0;
4376 DWORD share_mode = srv_operation != SRV_OPERATION_NORMAL
4377 ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
4378 : FILE_SHARE_READ | FILE_SHARE_DELETE;
4379
4380 ut_a(name);
4381
4382 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4383 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4384
4385 if (create_mode == OS_FILE_OPEN) {
4386
4387 create_flag = OPEN_EXISTING;
4388
4389 } else if (read_only) {
4390
4391 create_flag = OPEN_EXISTING;
4392
4393 } else if (create_mode == OS_FILE_CREATE) {
4394
4395 create_flag = CREATE_NEW;
4396
4397 } else {
4398
4399 ib::error()
4400 << "Unknown file create mode (" << create_mode << ") "
4401 << " for file '" << name << "'";
4402
4403 return(OS_FILE_CLOSED);
4404 }
4405
4406 if (access_type == OS_FILE_READ_ONLY) {
4407
4408 access = GENERIC_READ;
4409
4410 } else if (read_only) {
4411
4412 access = GENERIC_READ;
4413
4414 } else if (access_type == OS_FILE_READ_WRITE) {
4415
4416 access = GENERIC_READ | GENERIC_WRITE;
4417
4418 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4419
4420 ut_a(!read_only);
4421
4422 access = GENERIC_READ;
4423
4424 /*!< A backup program has to give mysqld the maximum
4425 freedom to do what it likes with the file */
4426
4427 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
4428 | FILE_SHARE_READ;
4429 } else {
4430
4431 ib::error()
4432 << "Unknown file access type (" << access_type << ") "
4433 << "for file '" << name << "'";
4434
4435 return(OS_FILE_CLOSED);
4436 }
4437
4438 file = CreateFile((LPCTSTR) name,
4439 access,
4440 share_mode,
4441 NULL, // Security attributes
4442 create_flag,
4443 attributes,
4444 NULL); // No template file
4445
4446 *success = (file != INVALID_HANDLE_VALUE);
4447
4448 return(file);
4449 }
4450
4451 /** Deletes a file if it exists. The file has to be closed before calling this.
4452 @param[in] name file path as a null-terminated string
4453 @param[out] exist indicate if file pre-exist
4454 @return true if success */
4455 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4456 os_file_delete_if_exists_func(
4457 const char* name,
4458 bool* exist)
4459 {
4460 ulint count = 0;
4461
4462 if (exist != NULL) {
4463 *exist = true;
4464 }
4465
4466 for (;;) {
4467 /* In Windows, deleting an .ibd file may fail if
4468 the file is being accessed by an external program,
4469 such as a backup tool. */
4470
4471 bool ret = DeleteFile((LPCTSTR) name);
4472
4473 if (ret) {
4474 return(true);
4475 }
4476
4477 DWORD lasterr = GetLastError();
4478
4479 if (lasterr == ERROR_FILE_NOT_FOUND
4480 || lasterr == ERROR_PATH_NOT_FOUND) {
4481
4482 /* the file does not exist, this not an error */
4483 if (exist != NULL) {
4484 *exist = false;
4485 }
4486
4487 return(true);
4488 }
4489
4490 ++count;
4491
4492 if (count > 100 && 0 == (count % 10)) {
4493
4494 /* Print error information */
4495 os_file_get_last_error(true);
4496
4497 ib::warn() << "Delete of file '" << name << "' failed.";
4498 }
4499
4500 /* Sleep for a second */
4501 os_thread_sleep(1000000);
4502
4503 if (count > 2000) {
4504
4505 return(false);
4506 }
4507 }
4508 }
4509
4510 /** Deletes a file. The file has to be closed before calling this.
4511 @param[in] name File path as NUL terminated string
4512 @return true if success */
4513 bool
os_file_delete_func(const char * name)4514 os_file_delete_func(
4515 const char* name)
4516 {
4517 ulint count = 0;
4518
4519 for (;;) {
4520 /* In Windows, deleting an .ibd file may fail if
4521 the file is being accessed by an external program,
4522 such as a backup tool. */
4523
4524 BOOL ret = DeleteFile((LPCTSTR) name);
4525
4526 if (ret) {
4527 return(true);
4528 }
4529
4530 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
4531 /* If the file does not exist, we classify this as
4532 a 'mild' error and return */
4533
4534 return(false);
4535 }
4536
4537 ++count;
4538
4539 if (count > 100 && 0 == (count % 10)) {
4540
4541 /* print error information */
4542 os_file_get_last_error(true);
4543
4544 ib::warn()
4545 << "Cannot delete file '" << name << "'. Is "
4546 << "another program accessing it?";
4547 }
4548
4549 /* sleep for a second */
4550 os_thread_sleep(1000000);
4551
4552 if (count > 2000) {
4553
4554 return(false);
4555 }
4556 }
4557
4558 ut_error;
4559 return(false);
4560 }
4561
4562 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
4563 function!
4564 Renames a file (can also move it to another directory). It is safest that the
4565 file is closed before calling this function.
4566 @param[in] oldpath old file path as a null-terminated string
4567 @param[in] newpath new file path
4568 @return true if success */
4569 bool
os_file_rename_func(const char * oldpath,const char * newpath)4570 os_file_rename_func(
4571 const char* oldpath,
4572 const char* newpath)
4573 {
4574 #ifdef UNIV_DEBUG
4575 os_file_type_t type;
4576 bool exists;
4577
4578 /* New path must not exist. */
4579 ut_ad(os_file_status(newpath, &exists, &type));
4580 ut_ad(!exists);
4581
4582 /* Old path must exist. */
4583 ut_ad(os_file_status(oldpath, &exists, &type));
4584 ut_ad(exists);
4585 #endif /* UNIV_DEBUG */
4586
4587 if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
4588 return(true);
4589 }
4590
4591 os_file_handle_rename_error(oldpath, newpath);
4592 return(false);
4593 }
4594
4595 /** NOTE! Use the corresponding macro os_file_close(), not directly
4596 this function!
4597 Closes a file handle. In case of error, error number can be retrieved with
4598 os_file_get_last_error.
4599 @param[in,own] file Handle to a file
4600 @return true if success */
4601 bool
os_file_close_func(os_file_t file)4602 os_file_close_func(
4603 os_file_t file)
4604 {
4605 ut_a(file);
4606
4607 if (CloseHandle(file)) {
4608 return(true);
4609 }
4610
4611 os_file_handle_error(NULL, "close");
4612
4613 return(false);
4614 }
4615
4616 /** Gets a file size.
4617 @param[in] file Handle to a file
4618 @return file size, or (os_offset_t) -1 on failure */
4619 os_offset_t
os_file_get_size(os_file_t file)4620 os_file_get_size(
4621 os_file_t file)
4622 {
4623 DWORD high;
4624 DWORD low = GetFileSize(file, &high);
4625
4626 if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
4627 return((os_offset_t) -1);
4628 }
4629
4630 return(os_offset_t(low | (os_offset_t(high) << 32)));
4631 }
4632
4633 /** Gets a file size.
4634 @param[in] filename Full path to the filename to check
4635 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
4636 errno */
4637 os_file_size_t
os_file_get_size(const char * filename)4638 os_file_get_size(
4639 const char* filename)
4640 {
4641 struct __stat64 s;
4642 os_file_size_t file_size;
4643
4644 int ret = _stat64(filename, &s);
4645
4646 if (ret == 0) {
4647
4648 file_size.m_total_size = s.st_size;
4649
4650 DWORD low_size;
4651 DWORD high_size;
4652
4653 low_size = GetCompressedFileSize(filename, &high_size);
4654
4655 if (low_size != INVALID_FILE_SIZE) {
4656
4657 file_size.m_alloc_size = high_size;
4658 file_size.m_alloc_size <<= 32;
4659 file_size.m_alloc_size |= low_size;
4660
4661 } else {
4662 ib::error()
4663 << "GetCompressedFileSize("
4664 << filename << ", ..) failed.";
4665
4666 file_size.m_alloc_size = (os_offset_t) -1;
4667 }
4668 } else {
4669 file_size.m_total_size = ~0;
4670 file_size.m_alloc_size = (os_offset_t) ret;
4671 }
4672
4673 return(file_size);
4674 }
4675
4676 /** This function returns information about the specified file
4677 @param[in] path pathname of the file
4678 @param[out] stat_info information of a file in a directory
4679 @param[in,out] statinfo information of a file in a directory
4680 @param[in] check_rw_perm for testing whether the file can be opened
4681 in RW mode
4682 @param[in] read_only true if the file is opened in read-only mode
4683 @return DB_SUCCESS if all OK */
4684 static
4685 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)4686 os_file_get_status_win32(
4687 const char* path,
4688 os_file_stat_t* stat_info,
4689 struct _stat64* statinfo,
4690 bool check_rw_perm,
4691 bool read_only)
4692 {
4693 int ret = _stat64(path, statinfo);
4694
4695 if (ret && (errno == ENOENT || errno == ENOTDIR
4696 || errno == ENAMETOOLONG)) {
4697 /* file does not exist */
4698
4699 return(DB_NOT_FOUND);
4700
4701 } else if (ret) {
4702 /* file exists, but stat call failed */
4703
4704 os_file_handle_error_no_exit(path, "STAT", false);
4705
4706 return(DB_FAIL);
4707
4708 } else if (_S_IFDIR & statinfo->st_mode) {
4709
4710 stat_info->type = OS_FILE_TYPE_DIR;
4711
4712 } else if (_S_IFREG & statinfo->st_mode) {
4713
4714 DWORD access = GENERIC_READ;
4715
4716 if (!read_only) {
4717 access |= GENERIC_WRITE;
4718 }
4719
4720 stat_info->type = OS_FILE_TYPE_FILE;
4721
4722 /* Check if we can open it in read-only mode. */
4723
4724 if (check_rw_perm) {
4725 HANDLE fh;
4726
4727 fh = CreateFile(
4728 (LPCTSTR) path, // File to open
4729 access,
4730 FILE_SHARE_READ | FILE_SHARE_WRITE
4731 | FILE_SHARE_DELETE, // Full sharing
4732 NULL, // Default security
4733 OPEN_EXISTING, // Existing file only
4734 FILE_ATTRIBUTE_NORMAL, // Normal file
4735 NULL); // No attr. template
4736
4737 if (fh == INVALID_HANDLE_VALUE) {
4738 stat_info->rw_perm = false;
4739 } else {
4740 stat_info->rw_perm = true;
4741 CloseHandle(fh);
4742 }
4743 }
4744 stat_info->block_size = 0;
4745
4746 /* What follows, is calculation of FS block size, which is not important
4747 (it is just shown in I_S innodb tables). The error to calculate it will be ignored.*/
4748 char volname[MAX_PATH];
4749 BOOL result = GetVolumePathName(path, volname, MAX_PATH);
4750 static bool warned_once = false;
4751 if (!result) {
4752 if (!warned_once) {
4753 ib::warn()
4754 << "os_file_get_status_win32: "
4755 << "Failed to get the volume path name for: "
4756 << path
4757 << "- OS error number " << GetLastError();
4758 warned_once = true;
4759 }
4760 return(DB_SUCCESS);
4761 }
4762
4763 DWORD sectorsPerCluster;
4764 DWORD bytesPerSector;
4765 DWORD numberOfFreeClusters;
4766 DWORD totalNumberOfClusters;
4767
4768 result = GetDiskFreeSpace(
4769 (LPCSTR) volname,
4770 §orsPerCluster,
4771 &bytesPerSector,
4772 &numberOfFreeClusters,
4773 &totalNumberOfClusters);
4774
4775 if (!result) {
4776 if (!warned_once) {
4777 ib::warn()
4778 << "GetDiskFreeSpace(" << volname << ",...) "
4779 << "failed "
4780 << "- OS error number " << GetLastError();
4781 warned_once = true;
4782 }
4783 return(DB_SUCCESS);
4784 }
4785 stat_info->block_size = bytesPerSector * sectorsPerCluster;
4786 } else {
4787 stat_info->type = OS_FILE_TYPE_UNKNOWN;
4788 }
4789
4790 return(DB_SUCCESS);
4791 }
4792
4793 /**
4794 Sets a sparse flag on Windows file.
4795 @param[in] file file handle
4796 @return true on success, false on error
4797 */
4798 #include <versionhelpers.h>
os_file_set_sparse_win32(os_file_t file,bool is_sparse)4799 bool os_file_set_sparse_win32(os_file_t file, bool is_sparse)
4800 {
4801 if (!is_sparse && !IsWindows8OrGreater()) {
4802 /* Cannot unset sparse flag on older Windows.
4803 Until Windows8 it is documented to produce unpredictable results,
4804 if there are unallocated ranges in file.*/
4805 return false;
4806 }
4807 DWORD temp;
4808 FILE_SET_SPARSE_BUFFER sparse_buffer;
4809 sparse_buffer.SetSparse = is_sparse;
4810 return os_win32_device_io_control(file,
4811 FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp);
4812 }
4813
4814
4815 /**
4816 Change file size on Windows.
4817
4818 If file is extended, the bytes between old and new EOF
4819 are zeros.
4820
4821 If file is sparse, "virtual" block is added at the end of
4822 allocated area.
4823
4824 If file is normal, file system allocates storage.
4825
4826 @param[in] pathname file path
4827 @param[in] file file handle
4828 @param[in] size size to preserve in bytes
4829 @return true if success */
4830 bool
os_file_change_size_win32(const char * pathname,os_file_t file,os_offset_t size)4831 os_file_change_size_win32(
4832 const char* pathname,
4833 os_file_t file,
4834 os_offset_t size)
4835 {
4836 LARGE_INTEGER length;
4837
4838 length.QuadPart = size;
4839
4840 BOOL success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);
4841
4842 if (!success) {
4843 os_file_handle_error_no_exit(
4844 pathname, "SetFilePointerEx", false);
4845 } else {
4846 success = SetEndOfFile(file);
4847 if (!success) {
4848 os_file_handle_error_no_exit(
4849 pathname, "SetEndOfFile", false);
4850 }
4851 }
4852 return(success);
4853 }
4854
4855 /** Truncates a file at its current position.
4856 @param[in] file Handle to be truncated
4857 @return true if success */
4858 bool
os_file_set_eof(FILE * file)4859 os_file_set_eof(
4860 FILE* file)
4861 {
4862 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
4863
4864 return(SetEndOfFile(h));
4865 }
4866
4867 /** This function can be called if one wants to post a batch of reads and
4868 prefers an i/o-handler thread to handle them all at once later. You must
4869 call os_aio_simulated_wake_handler_threads later to ensure the threads
4870 are not left sleeping! */
4871 void
os_aio_simulated_put_read_threads_to_sleep()4872 os_aio_simulated_put_read_threads_to_sleep()
4873 {
4874 AIO::simulated_put_read_threads_to_sleep();
4875 }
4876
4877 /** This function can be called if one wants to post a batch of reads and
4878 prefers an i/o-handler thread to handle them all at once later. You must
4879 call os_aio_simulated_wake_handler_threads later to ensure the threads
4880 are not left sleeping! */
4881 void
simulated_put_read_threads_to_sleep()4882 AIO::simulated_put_read_threads_to_sleep()
4883 {
4884 /* The idea of putting background IO threads to sleep is only for
4885 Windows when using simulated AIO. Windows XP seems to schedule
4886 background threads too eagerly to allow for coalescing during
4887 readahead requests. */
4888
4889 if (srv_use_native_aio) {
4890 /* We do not use simulated AIO: do nothing */
4891
4892 return;
4893 }
4894
4895 os_aio_recommend_sleep_for_read_threads = true;
4896
4897 for (ulint i = 0; i < os_aio_n_segments; i++) {
4898 AIO* array;
4899
4900 get_array_and_local_segment(&array, i);
4901
4902 if (array == s_reads) {
4903
4904 os_event_reset(os_aio_segment_wait_events[i]);
4905 }
4906 }
4907 }
4908
4909 #endif /* !_WIN32*/
4910
4911 /** Does a syncronous read or write depending upon the type specified
4912 In case of partial reads/writes the function tries
4913 NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
4914 @param[in] type, IO flags
4915 @param[in] file handle to an open file
4916 @param[out] buf buffer where to read
4917 @param[in] offset file offset from the start where to read
4918 @param[in] n number of bytes to read, starting from offset
4919 @param[out] err DB_SUCCESS or error code
4920 @return number of bytes read/written, -1 if error */
4921 static MY_ATTRIBUTE((warn_unused_result))
4922 ssize_t
os_file_io(const IORequest & in_type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)4923 os_file_io(
4924 const IORequest&in_type,
4925 os_file_t file,
4926 void* buf,
4927 ulint n,
4928 os_offset_t offset,
4929 dberr_t* err)
4930 {
4931 ssize_t original_n = ssize_t(n);
4932 IORequest type = in_type;
4933 ssize_t bytes_returned = 0;
4934
4935 SyncFileIO sync_file_io(file, buf, n, offset);
4936
4937 for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
4938
4939 ssize_t n_bytes = sync_file_io.execute(type);
4940
4941 /* Check for a hard error. Not much we can do now. */
4942 if (n_bytes < 0) {
4943
4944 break;
4945
4946 } else if (n_bytes + bytes_returned == ssize_t(n)) {
4947
4948 bytes_returned += n_bytes;
4949
4950 if (offset > 0
4951 && !type.is_log()
4952 && type.is_write()
4953 && type.punch_hole()) {
4954 *err = type.punch_hole(file, offset, n);
4955
4956 } else {
4957 *err = DB_SUCCESS;
4958 }
4959
4960 return(original_n);
4961 }
4962
4963 /* Handle partial read/write. */
4964
4965 ut_ad(ulint(n_bytes + bytes_returned) < n);
4966
4967 bytes_returned += n_bytes;
4968
4969 if (!type.is_partial_io_warning_disabled()) {
4970
4971 const char* op = type.is_read()
4972 ? "read" : "written";
4973
4974 ib::warn()
4975 << n
4976 << " bytes should have been " << op << ". Only "
4977 << bytes_returned
4978 << " bytes " << op << ". Retrying"
4979 << " for the remaining bytes.";
4980 }
4981
4982 /* Advance the offset and buffer by n_bytes */
4983 sync_file_io.advance(n_bytes);
4984 }
4985
4986 *err = DB_IO_ERROR;
4987
4988 if (!type.is_partial_io_warning_disabled()) {
4989 ib::warn()
4990 << "Retry attempts for "
4991 << (type.is_read() ? "reading" : "writing")
4992 << " partial data failed.";
4993 }
4994
4995 return(bytes_returned);
4996 }
4997
4998 /** Does a synchronous write operation in Posix.
4999 @param[in] type IO context
5000 @param[in] file handle to an open file
5001 @param[out] buf buffer from which to write
5002 @param[in] n number of bytes to read, starting from offset
5003 @param[in] offset file offset from the start where to read
5004 @param[out] err DB_SUCCESS or error code
5005 @return number of bytes written, -1 if error */
5006 static MY_ATTRIBUTE((warn_unused_result))
5007 ssize_t
os_file_pwrite(const IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5008 os_file_pwrite(
5009 const IORequest& type,
5010 os_file_t file,
5011 const byte* buf,
5012 ulint n,
5013 os_offset_t offset,
5014 dberr_t* err)
5015 {
5016 ut_ad(type.validate());
5017 ut_ad(type.is_write());
5018
5019 ++os_n_file_writes;
5020
5021 const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
5022 MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
5023 ssize_t n_bytes = os_file_io(type, file, const_cast<byte*>(buf),
5024 n, offset, err);
5025 MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
5026
5027 return(n_bytes);
5028 }
5029
5030 /** NOTE! Use the corresponding macro os_file_write(), not directly
5031 Requests a synchronous write operation.
5032 @param[in] type IO flags
5033 @param[in] file handle to an open file
5034 @param[out] buf buffer from which to write
5035 @param[in] offset file offset from the start where to read
5036 @param[in] n number of bytes to read, starting from offset
5037 @return error code
5038 @retval DB_SUCCESS if the operation succeeded */
5039 dberr_t
os_file_write_func(const IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)5040 os_file_write_func(
5041 const IORequest& type,
5042 const char* name,
5043 os_file_t file,
5044 const void* buf,
5045 os_offset_t offset,
5046 ulint n)
5047 {
5048 dberr_t err;
5049
5050 ut_ad(type.validate());
5051 ut_ad(n > 0);
5052
5053 WAIT_ALLOW_WRITES();
5054
5055 ssize_t n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err);
5056
5057 if ((ulint) n_bytes != n && !os_has_said_disk_full) {
5058
5059 ib::error()
5060 << "Write to file " << name << " failed at offset "
5061 << offset << ", " << n
5062 << " bytes should have been written,"
5063 " only " << n_bytes << " were written."
5064 " Operating system error number " << IF_WIN(GetLastError(),errno) << "."
5065 " Check that your OS and file system"
5066 " support files of this size."
5067 " Check also that the disk is not full"
5068 " or a disk quota exceeded.";
5069 #ifndef _WIN32
5070 if (strerror(errno) != NULL) {
5071
5072 ib::error()
5073 << "Error number " << errno
5074 << " means '" << strerror(errno) << "'";
5075 }
5076
5077 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
5078 #endif
5079 os_has_said_disk_full = true;
5080 }
5081
5082 return(err);
5083 }
5084
5085 /** Does a synchronous read operation in Posix.
5086 @param[in] type IO flags
5087 @param[in] file handle to an open file
5088 @param[out] buf buffer where to read
5089 @param[in] offset file offset from the start where to read
5090 @param[in] n number of bytes to read, starting from offset
5091 @param[out] err DB_SUCCESS or error code
5092 @return number of bytes read, -1 if error */
5093 static MY_ATTRIBUTE((warn_unused_result))
5094 ssize_t
os_file_pread(const IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5095 os_file_pread(
5096 const IORequest& type,
5097 os_file_t file,
5098 void* buf,
5099 ulint n,
5100 os_offset_t offset,
5101 dberr_t* err)
5102 {
5103 ut_ad(type.is_read());
5104
5105 ++os_n_file_reads;
5106
5107 const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
5108 MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
5109 ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
5110 MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
5111
5112 return(n_bytes);
5113 }
5114
5115 /** Requests a synchronous positioned read operation.
5116 @return DB_SUCCESS if request was successful, false if fail
5117 @param[in] type IO flags
5118 @param[in] file handle to an open file
5119 @param[out] buf buffer where to read
5120 @param[in] offset file offset from the start where to read
5121 @param[in] n number of bytes to read, starting from offset
5122 @param[out] o number of bytes actually read
5123 @param[in] exit_on_err if true then exit on error
5124 @return DB_SUCCESS or error code */
5125 static MY_ATTRIBUTE((warn_unused_result))
5126 dberr_t
os_file_read_page(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err)5127 os_file_read_page(
5128 const IORequest& type,
5129 os_file_t file,
5130 void* buf,
5131 os_offset_t offset,
5132 ulint n,
5133 ulint* o,
5134 bool exit_on_err)
5135 {
5136 dberr_t err;
5137
5138 os_bytes_read_since_printout += n;
5139
5140 ut_ad(type.validate());
5141 ut_ad(n > 0);
5142
5143 ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err);
5144
5145 if (o) {
5146 *o = n_bytes;
5147 }
5148
5149 if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) {
5150 return err;
5151 }
5152
5153 ib::error() << "Tried to read " << n << " bytes at offset "
5154 << offset << ", but was only able to read " << n_bytes;
5155
5156 if (!os_file_handle_error_cond_exit(
5157 NULL, "read", exit_on_err, false)) {
5158 ib::fatal()
5159 << "Cannot read from file. OS error number "
5160 << errno << ".";
5161 }
5162
5163 if (err == DB_SUCCESS) {
5164 err = DB_IO_ERROR;
5165 }
5166
5167 return err;
5168 }
5169
5170 /** Retrieves the last error number if an error occurs in a file io function.
5171 The number should be retrieved before any other OS calls (because they may
5172 overwrite the error number). If the number is not known to this program,
5173 the OS error number + 100 is returned.
5174 @param[in] report_all_errors true if we want an error printed
5175 for all errors
5176 @return error number, or OS error number + 100 */
5177 ulint
os_file_get_last_error(bool report_all_errors)5178 os_file_get_last_error(
5179 bool report_all_errors)
5180 {
5181 return(os_file_get_last_error_low(report_all_errors, false));
5182 }
5183
5184 /** Handle errors for file operations.
5185 @param[in] name name of a file or NULL
5186 @param[in] operation operation
5187 @param[in] should_abort whether to abort on an unknown error
5188 @param[in] on_error_silent whether to suppress reports of non-fatal errors
5189 @return true if we should retry the operation */
5190 static MY_ATTRIBUTE((warn_unused_result))
5191 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_abort,bool on_error_silent)5192 os_file_handle_error_cond_exit(
5193 const char* name,
5194 const char* operation,
5195 bool should_abort,
5196 bool on_error_silent)
5197 {
5198 ulint err;
5199
5200 err = os_file_get_last_error_low(false, on_error_silent);
5201
5202 switch (err) {
5203 case OS_FILE_DISK_FULL:
5204 /* We only print a warning about disk full once */
5205
5206 if (os_has_said_disk_full) {
5207
5208 return(false);
5209 }
5210
5211 /* Disk full error is reported irrespective of the
5212 on_error_silent setting. */
5213
5214 if (name) {
5215
5216 ib::error()
5217 << "Encountered a problem with file '"
5218 << name << "'";
5219 }
5220
5221 ib::error()
5222 << "Disk is full. Try to clean the disk to free space.";
5223
5224 os_has_said_disk_full = true;
5225
5226 return(false);
5227
5228 case OS_FILE_AIO_RESOURCES_RESERVED:
5229 case OS_FILE_AIO_INTERRUPTED:
5230
5231 return(true);
5232
5233 case OS_FILE_PATH_ERROR:
5234 case OS_FILE_ALREADY_EXISTS:
5235 case OS_FILE_ACCESS_VIOLATION:
5236
5237 return(false);
5238
5239 case OS_FILE_SHARING_VIOLATION:
5240
5241 os_thread_sleep(10000000); /* 10 sec */
5242 return(true);
5243
5244 case OS_FILE_OPERATION_ABORTED:
5245 case OS_FILE_INSUFFICIENT_RESOURCE:
5246
5247 os_thread_sleep(100000); /* 100 ms */
5248 return(true);
5249
5250 default:
5251
5252 /* If it is an operation that can crash on error then it
5253 is better to ignore on_error_silent and print an error message
5254 to the log. */
5255
5256 if (should_abort || !on_error_silent) {
5257 ib::error() << "File "
5258 << (name != NULL ? name : "(unknown)")
5259 << ": '" << operation << "'"
5260 " returned OS error " << err << "."
5261 << (should_abort
5262 ? " Cannot continue operation" : "");
5263 }
5264
5265 if (should_abort) {
5266 abort();
5267 }
5268 }
5269
5270 return(false);
5271 }
5272
5273 #ifndef _WIN32
5274 /** Tries to disable OS caching on an opened file descriptor.
5275 @param[in] fd file descriptor to alter
5276 @param[in] file_name file name, used in the diagnostic message
5277 @param[in] name "open" or "create"; used in the diagnostic
5278 message */
5279 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5280 os_file_set_nocache(
5281 int fd MY_ATTRIBUTE((unused)),
5282 const char* file_name MY_ATTRIBUTE((unused)),
5283 const char* operation_name MY_ATTRIBUTE((unused)))
5284 {
5285 /* some versions of Solaris may not have DIRECTIO_ON */
5286 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5287 if (directio(fd, DIRECTIO_ON) == -1) {
5288 int errno_save = errno;
5289
5290 ib::error()
5291 << "Failed to set DIRECTIO_ON on file "
5292 << file_name << "; " << operation_name << ": "
5293 << strerror(errno_save) << ","
5294 " continuing anyway.";
5295 }
5296 #elif defined(O_DIRECT)
5297 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5298 int errno_save = errno;
5299 static bool warning_message_printed = false;
5300 if (errno_save == EINVAL) {
5301 if (!warning_message_printed) {
5302 warning_message_printed = true;
5303 # ifdef UNIV_LINUX
5304 ib::warn()
5305 << "Failed to set O_DIRECT on file"
5306 << file_name << "; " << operation_name
5307 << ": " << strerror(errno_save) << ", "
5308 "continuing anyway. O_DIRECT is "
5309 "known to result in 'Invalid argument' "
5310 "on Linux on tmpfs, "
5311 "see MySQL Bug#26662.";
5312 # else /* UNIV_LINUX */
5313 goto short_warning;
5314 # endif /* UNIV_LINUX */
5315 }
5316 } else {
5317 # ifndef UNIV_LINUX
5318 short_warning:
5319 # endif
5320 ib::warn()
5321 << "Failed to set O_DIRECT on file "
5322 << file_name << "; " << operation_name
5323 << " : " << strerror(errno_save)
5324 << ", continuing anyway.";
5325 }
5326 }
5327 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5328 }
5329
5330 #endif /* _WIN32 */
5331
5332 /** Extend a file.
5333
5334 On Windows, extending a file allocates blocks for the file,
5335 unless the file is sparse.
5336
5337 On Unix, we will extend the file with ftruncate(), if
5338 file needs to be sparse. Otherwise posix_fallocate() is used
5339 when available, and if not, binary zeroes are added to the end
5340 of file.
5341
5342 @param[in] name file name
5343 @param[in] file file handle
5344 @param[in] size desired file size
5345 @param[in] sparse whether to create a sparse file (no preallocating)
5346 @return whether the operation succeeded */
5347 bool
os_file_set_size(const char * name,os_file_t file,os_offset_t size,bool is_sparse)5348 os_file_set_size(
5349 const char* name,
5350 os_file_t file,
5351 os_offset_t size,
5352 bool is_sparse)
5353 {
5354 ut_ad(!(size & 4095));
5355
5356 #ifdef _WIN32
5357 /* On Windows, changing file size works well and as expected for both
5358 sparse and normal files.
5359
5360 However, 10.2 up until 10.2.9 made every file sparse in innodb,
5361 causing NTFS fragmentation issues(MDEV-13941). We try to undo
5362 the damage, and unsparse the file.*/
5363
5364 if (!is_sparse && os_is_sparse_file_supported(file)) {
5365 if (!os_file_set_sparse_win32(file, false))
5366 /* Unsparsing file failed. Fallback to writing binary
5367 zeros, to avoid even higher fragmentation.*/
5368 goto fallback;
5369 }
5370
5371 return os_file_change_size_win32(name, file, size);
5372
5373 fallback:
5374 #else
5375 struct stat statbuf;
5376
5377 if (is_sparse) {
5378 bool success = !ftruncate(file, size);
5379 if (!success) {
5380 ib::error() << "ftruncate of file " << name << " to "
5381 << size << " bytes failed with error "
5382 << errno;
5383 }
5384 return(success);
5385 }
5386
5387 # ifdef HAVE_POSIX_FALLOCATE
5388 int err;
5389 do {
5390 if (fstat(file, &statbuf)) {
5391 err = errno;
5392 } else {
5393 os_offset_t current_size = statbuf.st_size;
5394 if (current_size >= size) {
5395 return true;
5396 }
5397 current_size &= ~4095ULL;
5398 err = posix_fallocate(file, current_size,
5399 size - current_size);
5400 }
5401 } while (err == EINTR
5402 && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
5403
5404 switch (err) {
5405 case 0:
5406 return true;
5407 default:
5408 ib::error() << "preallocating "
5409 << size << " bytes for file " << name
5410 << " failed with error " << err;
5411 /* fall through */
5412 case EINTR:
5413 errno = err;
5414 return false;
5415 case EINVAL:
5416 case EOPNOTSUPP:
5417 /* fall back to the code below */
5418 break;
5419 }
5420 # endif /* HAVE_POSIX_ALLOCATE */
5421 #endif /* _WIN32*/
5422
5423 #ifdef _WIN32
5424 os_offset_t current_size = os_file_get_size(file);
5425 FILE_STORAGE_INFO info;
5426 if (GetFileInformationByHandleEx(file, FileStorageInfo, &info,
5427 sizeof info)) {
5428 if (info.LogicalBytesPerSector) {
5429 current_size &= ~os_offset_t(info.LogicalBytesPerSector
5430 - 1);
5431 }
5432 }
5433 #else
5434 if (fstat(file, &statbuf)) {
5435 return false;
5436 }
5437 os_offset_t current_size = statbuf.st_size & ~4095ULL;
5438 #endif
5439 if (current_size >= size) {
5440 return true;
5441 }
5442
5443 /* Write up to 1 megabyte at a time. */
5444 ulint buf_size = ut_min(ulint(64),
5445 ulint(size >> srv_page_size_shift))
5446 << srv_page_size_shift;
5447
5448 /* Align the buffer for possible raw i/o */
5449 byte* buf2;
5450
5451 buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + srv_page_size));
5452
5453 byte* buf = static_cast<byte*>(ut_align(buf2, srv_page_size));
5454
5455 /* Write buffer full of zeros */
5456 memset(buf, 0, buf_size);
5457
5458 while (current_size < size
5459 && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
5460 ulint n_bytes;
5461
5462 if (size - current_size < (os_offset_t) buf_size) {
5463 n_bytes = (ulint) (size - current_size);
5464 } else {
5465 n_bytes = buf_size;
5466 }
5467
5468 dberr_t err;
5469 IORequest request(IORequest::WRITE);
5470
5471 err = os_file_write(
5472 request, name, file, buf, current_size, n_bytes);
5473
5474 if (err != DB_SUCCESS) {
5475 break;
5476 }
5477
5478 current_size += n_bytes;
5479 }
5480
5481 ut_free(buf2);
5482
5483 return(current_size >= size && os_file_flush(file));
5484 }
5485
5486 /** Truncate a file to a specified size in bytes.
5487 @param[in] pathname file path
5488 @param[in] file file to be truncated
5489 @param[in] size size preserved in bytes
5490 @param[in] allow_shrink whether to allow the file to become smaller
5491 @return true if success */
5492 bool
os_file_truncate(const char * pathname,os_file_t file,os_offset_t size,bool allow_shrink)5493 os_file_truncate(
5494 const char* pathname,
5495 os_file_t file,
5496 os_offset_t size,
5497 bool allow_shrink)
5498 {
5499 if (!allow_shrink) {
5500 /* Do nothing if the size preserved is larger than or
5501 equal to the current size of file */
5502 os_offset_t size_bytes = os_file_get_size(file);
5503
5504 if (size >= size_bytes) {
5505 return(true);
5506 }
5507 }
5508
5509 #ifdef _WIN32
5510 return(os_file_change_size_win32(pathname, file, size));
5511 #else /* _WIN32 */
5512 return(os_file_truncate_posix(pathname, file, size));
5513 #endif /* _WIN32 */
5514 }
5515
5516 /** NOTE! Use the corresponding macro os_file_read(), not directly this
5517 function!
5518 Requests a synchronous positioned read operation.
5519 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5520 @param[in] type IO flags
5521 @param[in] file handle to an open file
5522 @param[out] buf buffer where to read
5523 @param[in] offset file offset from the start where to read
5524 @param[in] n number of bytes to read, starting from offset
5525 @return error code
5526 @retval DB_SUCCESS if the operation succeeded */
5527 dberr_t
os_file_read_func(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)5528 os_file_read_func(
5529 const IORequest& type,
5530 os_file_t file,
5531 void* buf,
5532 os_offset_t offset,
5533 ulint n)
5534 {
5535 return(os_file_read_page(type, file, buf, offset, n, NULL, true));
5536 }
5537
5538 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
5539 not directly this function!
5540 Requests a synchronous positioned read operation.
5541 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5542 @param[in] type IO flags
5543 @param[in] file handle to an open file
5544 @param[out] buf buffer where to read
5545 @param[in] offset file offset from the start where to read
5546 @param[in] n number of bytes to read, starting from offset
5547 @param[out] o number of bytes actually read
5548 @return DB_SUCCESS or error code */
5549 dberr_t
os_file_read_no_error_handling_func(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)5550 os_file_read_no_error_handling_func(
5551 const IORequest& type,
5552 os_file_t file,
5553 void* buf,
5554 os_offset_t offset,
5555 ulint n,
5556 ulint* o)
5557 {
5558 return(os_file_read_page(type, file, buf, offset, n, o, false));
5559 }
5560
5561 /** Check the existence and type of the given file.
5562 @param[in] path path name of file
5563 @param[out] exists true if the file exists
5564 @param[out] type Type of the file, if it exists
5565 @return true if call succeeded */
5566 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)5567 os_file_status(
5568 const char* path,
5569 bool* exists,
5570 os_file_type_t* type)
5571 {
5572 #ifdef _WIN32
5573 return(os_file_status_win32(path, exists, type));
5574 #else
5575 return(os_file_status_posix(path, exists, type));
5576 #endif /* _WIN32 */
5577 }
5578
5579 /** Free storage space associated with a section of the file.
5580 @param[in] fh Open file handle
5581 @param[in] off Starting offset (SEEK_SET)
5582 @param[in] len Size of the hole
5583 @return DB_SUCCESS or error code */
5584 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)5585 os_file_punch_hole(
5586 os_file_t fh,
5587 os_offset_t off,
5588 os_offset_t len)
5589 {
5590 dberr_t err;
5591
5592 #ifdef _WIN32
5593 err = os_file_punch_hole_win32(fh, off, len);
5594 #else
5595 err = os_file_punch_hole_posix(fh, off, len);
5596 #endif /* _WIN32 */
5597
5598 return (err);
5599 }
5600
5601 /** Free storage space associated with a section of the file.
5602 @param[in] fh Open file handle
5603 @param[in] off Starting offset (SEEK_SET)
5604 @param[in] len Size of the hole
5605 @return DB_SUCCESS or error code */
5606 dberr_t
punch_hole(os_file_t fh,os_offset_t off,ulint len)5607 IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
5608 {
5609 /* In this debugging mode, we act as if punch hole is supported,
5610 and then skip any calls to actually punch a hole here.
5611 In this way, Transparent Page Compression is still being tested. */
5612 DBUG_EXECUTE_IF("ignore_punch_hole",
5613 return(DB_SUCCESS);
5614 );
5615
5616 ulint trim_len = get_trim_length(len);
5617
5618 if (trim_len == 0) {
5619 return(DB_SUCCESS);
5620 }
5621
5622 off += len;
5623
5624 /* Check does file system support punching holes for this
5625 tablespace. */
5626 if (!should_punch_hole()) {
5627 return DB_IO_NO_PUNCH_HOLE;
5628 }
5629
5630 dberr_t err = os_file_punch_hole(fh, off, trim_len);
5631
5632 if (err == DB_SUCCESS) {
5633 srv_stats.page_compressed_trim_op.inc();
5634 } else {
5635 /* If punch hole is not supported,
5636 set space so that it is not used. */
5637 if (err == DB_IO_NO_PUNCH_HOLE) {
5638 space_no_punch_hole();
5639 err = DB_SUCCESS;
5640 }
5641 }
5642
5643 return (err);
5644 }
5645
5646 /** Check if the file system supports sparse files.
5647
5648 Warning: On POSIX systems we try and punch a hole from offset 0 to
5649 the system configured page size. This should only be called on an empty
5650 file.
5651 @param[in] fh File handle for the file - if opened
5652 @return true if the file system supports sparse files */
5653 bool
os_is_sparse_file_supported(os_file_t fh)5654 os_is_sparse_file_supported(os_file_t fh)
5655 {
5656 /* In this debugging mode, we act as if punch hole is supported,
5657 then we skip any calls to actually punch a hole. In this way,
5658 Transparent Page Compression is still being tested. */
5659 DBUG_EXECUTE_IF("ignore_punch_hole",
5660 return(true);
5661 );
5662
5663 #ifdef _WIN32
5664 FILE_ATTRIBUTE_TAG_INFO info;
5665 if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
5666 &info, (DWORD)sizeof(info))) {
5667 if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) {
5668 return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
5669 }
5670 }
5671 return false;
5672 #else
5673 dberr_t err;
5674
5675 /* We don't know the FS block size, use the sector size. The FS
5676 will do the magic. */
5677 err = os_file_punch_hole_posix(fh, 0, srv_page_size);
5678
5679 return(err == DB_SUCCESS);
5680 #endif /* _WIN32 */
5681 }
5682
5683 /** This function returns information about the specified file
5684 @param[in] path pathname of the file
5685 @param[out] stat_info information of a file in a directory
5686 @param[in] check_rw_perm for testing whether the file can be opened
5687 in RW mode
5688 @param[in] read_only true if file is opened in read-only mode
5689 @return DB_SUCCESS if all OK */
5690 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)5691 os_file_get_status(
5692 const char* path,
5693 os_file_stat_t* stat_info,
5694 bool check_rw_perm,
5695 bool read_only)
5696 {
5697 dberr_t ret;
5698
5699 #ifdef _WIN32
5700 struct _stat64 info;
5701
5702 ret = os_file_get_status_win32(
5703 path, stat_info, &info, check_rw_perm, read_only);
5704
5705 #else
5706 struct stat info;
5707
5708 ret = os_file_get_status_posix(
5709 path, stat_info, &info, check_rw_perm, read_only);
5710
5711 #endif /* _WIN32 */
5712
5713 if (ret == DB_SUCCESS) {
5714 stat_info->ctime = info.st_ctime;
5715 stat_info->atime = info.st_atime;
5716 stat_info->mtime = info.st_mtime;
5717 stat_info->size = info.st_size;
5718 }
5719
5720 return(ret);
5721 }
5722
5723 /**
5724 Waits for an AIO operation to complete. This function is used to wait the
5725 for completed requests. The aio array of pending requests is divided
5726 into segments. The thread specifies which segment or slot it wants to wait
5727 for. NOTE: this function will also take care of freeing the aio slot,
5728 therefore no other thread is allowed to do the freeing!
5729 @param[in] segment The number of the segment in the aio arrays to
5730 wait for; segment 0 is the ibuf I/O thread,
5731 segment 1 the log I/O thread, then follow the
5732 non-ibuf read threads, and as the last are the
5733 non-ibuf write threads; if this is
5734 ULINT_UNDEFINED, then it means that sync AIO
5735 is used, and this parameter is ignored
5736 @param[out] m1 the messages passed with the AIO request; note
5737 that also in the case where the AIO operation
5738 failed, these output parameters are valid and
5739 can be used to restart the operation,
5740 for example
5741 @param[out] m2 callback message
5742 @param[out] type OS_FILE_WRITE or ..._READ
5743 @return DB_SUCCESS or error code */
5744 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)5745 os_aio_handler(
5746 ulint segment,
5747 fil_node_t** m1,
5748 void** m2,
5749 IORequest* request)
5750 {
5751 dberr_t err;
5752
5753 if (srv_use_native_aio) {
5754 srv_set_io_thread_op_info(segment, "native aio handle");
5755
5756 #ifdef WIN_ASYNC_IO
5757
5758 err = os_aio_windows_handler(segment, 0, m1, m2, request);
5759
5760 #elif defined(LINUX_NATIVE_AIO)
5761
5762 err = os_aio_linux_handler(segment, m1, m2, request);
5763
5764 #else
5765 ut_error;
5766
5767 err = DB_ERROR; /* Eliminate compiler warning */
5768
5769 #endif /* WIN_ASYNC_IO */
5770
5771 } else {
5772 srv_set_io_thread_op_info(segment, "simulated aio handle");
5773
5774 err = os_aio_simulated_handler(segment, m1, m2, request);
5775 }
5776
5777 return(err);
5778 }
5779
#ifdef WIN_ASYNC_IO
/** Create a new I/O completion port that is not associated with a file
handle (INVALID_HANDLE_VALUE is passed as the file handle).
Asserts that the port could be created.
@return handle of the new completion port */
static HANDLE new_completion_port()
{
	HANDLE	port = CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0);

	ut_a(port != NULL);

	return port;
}
#endif
5788
5789 /** Constructor
5790 @param[in] id The latch ID
5791 @param[in] n Number of AIO slots
5792 @param[in] segments Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)5793 AIO::AIO(
5794 latch_id_t id,
5795 ulint n,
5796 ulint segments)
5797 :
5798 m_slots(n),
5799 m_n_segments(segments),
5800 m_n_reserved()
5801 # ifdef LINUX_NATIVE_AIO
5802 ,m_events(m_slots.size())
5803 # endif /* LINUX_NATIVE_AIO */
5804 #ifdef WIN_ASYNC_IO
5805 ,m_completion_port(new_completion_port())
5806 #endif
5807 {
5808 ut_a(n > 0);
5809 ut_a(m_n_segments > 0);
5810
5811 mutex_create(id, &m_mutex);
5812
5813 m_not_full = os_event_create("aio_not_full");
5814 m_is_empty = os_event_create("aio_is_empty");
5815
5816 memset((void*)&m_slots[0], 0x0, sizeof(m_slots[0]) * m_slots.size());
5817 #ifdef LINUX_NATIVE_AIO
5818 memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
5819 #endif /* LINUX_NATIVE_AIO */
5820
5821 os_event_set(m_is_empty);
5822 }
5823
5824 /** Initialise the slots */
5825 dberr_t
init_slots()5826 AIO::init_slots()
5827 {
5828 for (ulint i = 0; i < m_slots.size(); ++i) {
5829 Slot& slot = m_slots[i];
5830
5831 slot.pos = static_cast<uint16_t>(i);
5832
5833 slot.is_reserved = false;
5834
5835 #ifdef WIN_ASYNC_IO
5836
5837 slot.array = this;
5838
5839 #elif defined(LINUX_NATIVE_AIO)
5840
5841 slot.ret = 0;
5842
5843 slot.n_bytes = 0;
5844
5845 memset(&slot.control, 0x0, sizeof(slot.control));
5846
5847 #endif /* WIN_ASYNC_IO */
5848 }
5849
5850 return(DB_SUCCESS);
5851 }
5852
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface.
Creates one io_context_t per segment of this array. If creating any of
them fails, a warning is logged, the contexts created so far for this
array are destroyed, m_aio_ctx is emptied and native AIO is switched off
globally (srv_use_native_aio = FALSE); this is not reported as an error.
@return DB_SUCCESS */
dberr_t
AIO::init_linux_native_aio()
{
	/* Initialize the io_context_t array. One io_context_t
	per segment in the array. */
	m_aio_ctx.resize(get_n_segments());

	const ulint	max_events = slots_per_segment();

	for (size_t i = 0; i < m_aio_ctx.size(); i++) {

		if (linux_create_io_ctx(max_events, m_aio_ctx[i])) {
			continue;
		}

		/* If something bad happened during aio setup
		we disable linux native aio.
		This frequently happens when running the test suite
		with many threads on a system with low fs.aio-max-nr!
		*/

		ib::warn()
			<< "Warning: Linux Native AIO disabled "
			<< "because _linux_create_io_ctx() "
			<< "failed. To get rid of this warning you can "
			<< "try increasing system "
			<< "fs.aio-max-nr to 1048576 or larger or "
			<< "setting innodb_use_native_aio = 0 in my.cnf";

		/* Roll back the contexts that were created before the
		failing one. */
		for (size_t j = 0; j < i; j++) {
			int	ret = io_destroy(m_aio_ctx[j]);
			ut_a(ret != -EINVAL);
		}

		m_aio_ctx.clear();
		srv_use_native_aio = FALSE;

		return(DB_SUCCESS);
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
5900
5901 /** Initialise the array */
5902 dberr_t
init()5903 AIO::init()
5904 {
5905 ut_a(!m_slots.empty());
5906
5907
5908 if (srv_use_native_aio) {
5909 #ifdef LINUX_NATIVE_AIO
5910 dberr_t err = init_linux_native_aio();
5911
5912 if (err != DB_SUCCESS) {
5913 return(err);
5914 }
5915
5916 #endif /* LINUX_NATIVE_AIO */
5917 }
5918
5919 return(init_slots());
5920 }
5921
5922 /** Creates an aio wait array. Note that we return NULL in case of failure.
5923 We don't care about freeing memory here because we assume that a
5924 failure will result in server refusing to start up.
5925 @param[in] id Latch ID
5926 @param[in] n maximum number of pending AIO operations
5927 allowed; n must be divisible by m_n_segments
5928 @param[in] n_segments number of segments in the AIO array
5929 @return own: AIO array, NULL on failure */
5930 AIO*
create(latch_id_t id,ulint n,ulint n_segments)5931 AIO::create(
5932 latch_id_t id,
5933 ulint n,
5934 ulint n_segments)
5935 {
5936 if ((n % n_segments)) {
5937
5938 ib::error()
5939 << "Maximum number of AIO operations must be "
5940 << "divisible by number of segments";
5941
5942 return(NULL);
5943 }
5944
5945 AIO* array = UT_NEW_NOKEY(AIO(id, n, n_segments));
5946
5947 if (array != NULL && array->init() != DB_SUCCESS) {
5948
5949 UT_DELETE(array);
5950
5951 array = NULL;
5952 }
5953
5954 return(array);
5955 }
5956
5957 /** AIO destructor */
~AIO()5958 AIO::~AIO()
5959 {
5960 mutex_destroy(&m_mutex);
5961
5962 os_event_destroy(m_not_full);
5963 os_event_destroy(m_is_empty);
5964
5965 #if defined(LINUX_NATIVE_AIO)
5966 if (srv_use_native_aio) {
5967 for (ulint i = 0; i < m_aio_ctx.size(); i++) {
5968 int ret = io_destroy(m_aio_ctx[i]);
5969 ut_a(ret != -EINVAL);
5970 }
5971 }
5972 #endif /* LINUX_NATIVE_AIO */
5973 #if defined(WIN_ASYNC_IO)
5974 CloseHandle(m_completion_port);
5975 #endif
5976 }
5977
5978 /** Initializes the asynchronous io system. Creates one array each for ibuf
5979 and log i/o. Also creates one array each for read and write where each
5980 array is divided logically into n_readers and n_writers
5981 respectively. The caller must create an i/o handler thread for each
5982 segment in these arrays. This function also creates the sync array.
5983 No i/o handler thread needs to be created for that
5984 @param[in] n_per_seg maximum number of pending aio
5985 operations allowed per segment
5986 @param[in] n_readers number of reader threads
5987 @param[in] n_writers number of writer threads
5988 @param[in] n_slots_sync number of slots in the sync aio array
5989 @return true if the AIO sub-system was started successfully */
5990 bool
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)5991 AIO::start(
5992 ulint n_per_seg,
5993 ulint n_readers,
5994 ulint n_writers,
5995 ulint n_slots_sync)
5996 {
5997 #if defined(LINUX_NATIVE_AIO)
5998 /* Check if native aio is supported on this system and tmpfs */
5999 if (srv_use_native_aio && !is_linux_native_aio_supported()) {
6000
6001 ib::warn() << "Linux Native AIO disabled.";
6002
6003 srv_use_native_aio = FALSE;
6004 }
6005 #endif /* LINUX_NATIVE_AIO */
6006
6007 srv_reset_io_thread_op_info();
6008
6009 s_reads = create(
6010 LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
6011
6012 if (s_reads == NULL) {
6013 return(false);
6014 }
6015
6016 ulint start = srv_read_only_mode ? 0 : 2;
6017 ulint n_segs = n_readers + start;
6018
6019 /* 0 is the ibuf segment and 1 is the redo log segment. */
6020 for (ulint i = start; i < n_segs; ++i) {
6021 ut_a(i < SRV_MAX_N_IO_THREADS);
6022 srv_io_thread_function[i] = "read thread";
6023 }
6024
6025 ulint n_segments = n_readers;
6026
6027 if (!srv_read_only_mode) {
6028
6029 s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
6030
6031 if (s_ibuf == NULL) {
6032 return(false);
6033 }
6034
6035 ++n_segments;
6036
6037 srv_io_thread_function[0] = "insert buffer thread";
6038
6039 s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
6040
6041 if (s_log == NULL) {
6042 return(false);
6043 }
6044
6045 ++n_segments;
6046
6047 srv_io_thread_function[1] = "log thread";
6048
6049 } else {
6050 s_ibuf = s_log = NULL;
6051 }
6052
6053 s_writes = create(
6054 LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
6055
6056 if (s_writes == NULL) {
6057 return(false);
6058 }
6059
6060 #ifdef WIN_ASYNC_IO
6061 data_completion_port = s_writes->m_completion_port;
6062 log_completion_port =
6063 s_log ? s_log->m_completion_port : data_completion_port;
6064 #endif
6065
6066 n_segments += n_writers;
6067
6068 for (ulint i = start + n_readers; i < n_segments; ++i) {
6069 ut_a(i < SRV_MAX_N_IO_THREADS);
6070 srv_io_thread_function[i] = "write thread";
6071 }
6072
6073 ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
6074
6075 s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
6076
6077 if (s_sync == NULL) {
6078
6079 return(false);
6080 }
6081
6082 os_aio_n_segments = n_segments;
6083
6084 os_aio_validate();
6085
6086 os_last_printout = time(NULL);
6087
6088 if (srv_use_native_aio) {
6089 return(true);
6090 }
6091
6092 os_aio_segment_wait_events = static_cast<os_event_t*>(
6093 ut_zalloc_nokey(
6094 n_segments * sizeof *os_aio_segment_wait_events));
6095
6096 if (os_aio_segment_wait_events == NULL) {
6097
6098 return(false);
6099 }
6100
6101 for (ulint i = 0; i < n_segments; ++i) {
6102 os_aio_segment_wait_events[i] = os_event_create(0);
6103 }
6104
6105 return(true);
6106 }
6107
6108 /** Free the AIO arrays */
6109 void
shutdown()6110 AIO::shutdown()
6111 {
6112 UT_DELETE(s_ibuf);
6113 s_ibuf = NULL;
6114
6115 UT_DELETE(s_log);
6116 s_log = NULL;
6117
6118 UT_DELETE(s_writes);
6119 s_writes = NULL;
6120
6121 UT_DELETE(s_sync);
6122 s_sync = NULL;
6123
6124 UT_DELETE(s_reads);
6125 s_reads = NULL;
6126 }
6127
6128 /** Initializes the asynchronous io system. Creates one array each for ibuf
6129 and log i/o. Also creates one array each for read and write where each
6130 array is divided logically into n_readers and n_writers
6131 respectively. The caller must create an i/o handler thread for each
6132 segment in these arrays. This function also creates the sync array.
6133 No i/o handler thread needs to be created for that
6134 @param[in] n_readers number of reader threads
6135 @param[in] n_writers number of writer threads
6136 @param[in] n_slots_sync number of slots in the sync aio array */
6137 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6138 os_aio_init(
6139 ulint n_readers,
6140 ulint n_writers,
6141 ulint n_slots_sync)
6142 {
6143 /* Maximum number of pending aio operations allowed per segment */
6144 ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6145
6146 return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
6147 }
6148
6149 /** Frees the asynchronous io system. */
6150 void
os_aio_free()6151 os_aio_free()
6152 {
6153 AIO::shutdown();
6154
6155 ut_ad(!os_aio_segment_wait_events || !srv_use_native_aio);
6156 ut_ad(srv_use_native_aio || os_aio_segment_wait_events
6157 || !srv_was_started);
6158
6159 if (!srv_use_native_aio && os_aio_segment_wait_events) {
6160 for (ulint i = 0; i < os_aio_n_segments; i++) {
6161 os_event_destroy(os_aio_segment_wait_events[i]);
6162 }
6163
6164 ut_free(os_aio_segment_wait_events);
6165 os_aio_segment_wait_events = 0;
6166 }
6167 os_aio_n_segments = 0;
6168 }
6169
6170 /** Wakes up all async i/o threads so that they know to exit themselves in
6171 shutdown. */
6172 void
os_aio_wake_all_threads_at_shutdown()6173 os_aio_wake_all_threads_at_shutdown()
6174 {
6175 #ifdef WIN_ASYNC_IO
6176 AIO::wake_at_shutdown();
6177 #elif defined(LINUX_NATIVE_AIO)
6178 /* When using native AIO interface the io helper threads
6179 wait on io_getevents with a timeout value of 500ms. At
6180 each wake up these threads check the server status.
6181 No need to do anything to wake them up. */
6182 #endif /* !WIN_ASYNC_AIO */
6183
6184 if (srv_use_native_aio) {
6185 return;
6186 }
6187
6188 /* This loop wakes up all simulated ai/o threads */
6189
6190 for (ulint i = 0; i < os_aio_n_segments; ++i) {
6191
6192 os_event_set(os_aio_segment_wait_events[i]);
6193 }
6194 }
6195
6196 /** Waits until there are no pending writes in AIO::s_writes. There can
6197 be other, synchronous, pending writes. */
6198 void
os_aio_wait_until_no_pending_writes()6199 os_aio_wait_until_no_pending_writes()
6200 {
6201 AIO::wait_until_no_pending_writes();
6202 }
6203
6204 /** Calculates segment number for a slot.
6205 @param[in] array AIO wait array
6206 @param[in] slot slot in this array
6207 @return segment number (which is the number used by, for example,
6208 I/O-handler threads) */
6209 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)6210 AIO::get_segment_no_from_slot(
6211 const AIO* array,
6212 const Slot* slot)
6213 {
6214 ulint segment;
6215 ulint seg_len;
6216
6217 if (array == s_ibuf) {
6218 ut_ad(!srv_read_only_mode);
6219
6220 segment = IO_IBUF_SEGMENT;
6221
6222 } else if (array == s_log) {
6223 ut_ad(!srv_read_only_mode);
6224
6225 segment = IO_LOG_SEGMENT;
6226
6227 } else if (array == s_reads) {
6228 seg_len = s_reads->slots_per_segment();
6229
6230 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6231 } else {
6232 ut_a(array == s_writes);
6233
6234 seg_len = s_writes->slots_per_segment();
6235
6236 segment = s_reads->m_n_segments
6237 + (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6238 }
6239
6240 return(segment);
6241 }
6242
6243 /** Requests for a slot in the aio array. If no slot is available, waits until
6244 not_full-event becomes signaled.
6245
6246 @param[in] type IO context
6247 @param[in,out] m1 message to be passed along with the AIO
6248 operation
6249 @param[in,out] m2 message to be passed along with the AIO
6250 operation
6251 @param[in] file file handle
6252 @param[in] name name of the file or path as a NUL-terminated
6253 string
6254 @param[in,out] buf buffer where to read or from which to write
6255 @param[in] offset file offset, where to read from or start writing
6256 @param[in] len length of the block to read or write
6257 @return pointer to slot */
6258 Slot*
reserve_slot(const IORequest & type,fil_node_t * m1,void * m2,pfs_os_file_t file,const char * name,void * buf,os_offset_t offset,ulint len)6259 AIO::reserve_slot(
6260 const IORequest& type,
6261 fil_node_t* m1,
6262 void* m2,
6263 pfs_os_file_t file,
6264 const char* name,
6265 void* buf,
6266 os_offset_t offset,
6267 ulint len)
6268 {
6269 ut_ad(reinterpret_cast<size_t>(buf) % OS_FILE_LOG_BLOCK_SIZE == 0);
6270 ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
6271 ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
6272
6273 #ifdef WIN_ASYNC_IO
6274 ut_a((len & 0xFFFFFFFFUL) == len);
6275 #endif /* WIN_ASYNC_IO */
6276
6277 /* No need of a mutex. Only reading constant fields */
6278 ulint slots_per_seg;
6279
6280 ut_ad(type.validate());
6281
6282 slots_per_seg = slots_per_segment();
6283
6284 /* We attempt to keep adjacent blocks in the same local
6285 segment. This can help in merging IO requests when we are
6286 doing simulated AIO */
6287 ulint local_seg;
6288
6289 local_seg = (offset >> (srv_page_size_shift + 6)) % m_n_segments;
6290
6291 for (;;) {
6292
6293 acquire();
6294
6295 if (m_n_reserved != m_slots.size()) {
6296 break;
6297 }
6298
6299 release();
6300
6301 if (!srv_use_native_aio) {
6302 /* If the handler threads are suspended,
6303 wake them so that we get more slots */
6304
6305 os_aio_simulated_wake_handler_threads();
6306 }
6307
6308 os_event_wait(m_not_full);
6309 }
6310
6311 ulint counter = 0;
6312 Slot* slot = NULL;
6313
6314 /* We start our search for an available slot from our preferred
6315 local segment and do a full scan of the array. We are
6316 guaranteed to find a slot in full scan. */
6317 for (ulint i = local_seg * slots_per_seg;
6318 counter < m_slots.size();
6319 ++i, ++counter) {
6320
6321 i %= m_slots.size();
6322
6323 slot = at(i);
6324
6325 if (slot->is_reserved == false) {
6326 break;
6327 }
6328 }
6329
6330 /* We MUST always be able to get hold of a reserved slot. */
6331 ut_a(counter < m_slots.size());
6332
6333 ut_a(slot->is_reserved == false);
6334
6335 ++m_n_reserved;
6336
6337 if (m_n_reserved == 1) {
6338 os_event_reset(m_is_empty);
6339 }
6340
6341 if (m_n_reserved == m_slots.size()) {
6342 os_event_reset(m_not_full);
6343 }
6344
6345 slot->is_reserved = true;
6346 slot->reservation_time = time(NULL);
6347 slot->m1 = m1;
6348 slot->m2 = m2;
6349 slot->file = file;
6350 slot->name = name;
6351 #ifdef _WIN32
6352 slot->len = static_cast<DWORD>(len);
6353 #else
6354 slot->len = len;
6355 #endif /* _WIN32 */
6356 slot->type = type;
6357 slot->buf = static_cast<byte*>(buf);
6358 slot->ptr = slot->buf;
6359 slot->offset = offset;
6360 slot->err = DB_SUCCESS;
6361 slot->original_len = static_cast<uint32>(len);
6362 slot->io_already_done = false;
6363 slot->buf = static_cast<byte*>(buf);
6364
6365 #ifdef WIN_ASYNC_IO
6366 {
6367 OVERLAPPED* control;
6368
6369 control = &slot->control;
6370 control->Offset = (DWORD) offset & 0xFFFFFFFF;
6371 control->OffsetHigh = (DWORD) (offset >> 32);
6372 }
6373 #elif defined(LINUX_NATIVE_AIO)
6374
6375 /* If we are not using native AIO skip this part. */
6376 if (srv_use_native_aio) {
6377
6378 off_t aio_offset;
6379
6380 /* Check if we are dealing with 64 bit arch.
6381 If not then make sure that offset fits in 32 bits. */
6382 aio_offset = (off_t) offset;
6383
6384 ut_a(sizeof(aio_offset) >= sizeof(offset)
6385 || ((os_offset_t) aio_offset) == offset);
6386
6387 struct iocb* iocb = &slot->control;
6388
6389 if (type.is_read()) {
6390
6391 io_prep_pread(
6392 iocb, file, slot->ptr, slot->len, aio_offset);
6393 } else {
6394 ut_ad(type.is_write());
6395
6396 io_prep_pwrite(
6397 iocb, file, slot->ptr, slot->len, aio_offset);
6398 }
6399
6400 iocb->data = slot;
6401
6402 slot->n_bytes = 0;
6403 slot->ret = 0;
6404 }
6405 #endif /* LINUX_NATIVE_AIO */
6406
6407 release();
6408
6409 return(slot);
6410 }
6411
6412 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
6413 @param[in] global_segment The number of the segment in the AIO arrays */
6414 void
wake_simulated_handler_thread(ulint global_segment)6415 AIO::wake_simulated_handler_thread(ulint global_segment)
6416 {
6417 ut_ad(!srv_use_native_aio);
6418
6419 AIO* array;
6420 ulint segment = get_array_and_local_segment(&array, global_segment);
6421
6422 array->wake_simulated_handler_thread(global_segment, segment);
6423 }
6424
6425 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
6426 for a local segment in the AIO array.
6427 @param[in] global_segment The number of the segment in the AIO arrays
6428 @param[in] segment The local segment in the AIO array */
6429 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)6430 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
6431 {
6432 ut_ad(!srv_use_native_aio);
6433
6434 ulint n = slots_per_segment();
6435 ulint offset = segment * n;
6436
6437 /* Look through n slots after the segment * n'th slot */
6438
6439 acquire();
6440
6441 const Slot* slot = at(offset);
6442
6443 for (ulint i = 0; i < n; ++i, ++slot) {
6444
6445 if (slot->is_reserved) {
6446
6447 /* Found an i/o request */
6448
6449 release();
6450
6451 os_event_t event;
6452
6453 event = os_aio_segment_wait_events[global_segment];
6454
6455 os_event_set(event);
6456
6457 return;
6458 }
6459 }
6460
6461 release();
6462 }
6463
6464 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
6465 void
os_aio_simulated_wake_handler_threads()6466 os_aio_simulated_wake_handler_threads()
6467 {
6468 if (srv_use_native_aio) {
6469 /* We do not use simulated aio: do nothing */
6470
6471 return;
6472 }
6473
6474 os_aio_recommend_sleep_for_read_threads = false;
6475
6476 for (ulint i = 0; i < os_aio_n_segments; i++) {
6477 AIO::wake_simulated_handler_thread(i);
6478 }
6479 }
6480
6481 /** Select the IO slot array
6482 @param[in,out] type Type of IO, READ or WRITE
6483 @param[in] read_only true if running in read-only mode
6484 @param[in] mode IO mode
6485 @return slot array or NULL if invalid mode specified */
6486 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)6487 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
6488 {
6489 AIO* array;
6490
6491 ut_ad(type.validate());
6492
6493 switch (mode) {
6494 case OS_AIO_NORMAL:
6495
6496 array = type.is_read() ? AIO::s_reads : AIO::s_writes;
6497 break;
6498
6499 case OS_AIO_IBUF:
6500 ut_ad(type.is_read());
6501
6502 /* Reduce probability of deadlock bugs in connection with ibuf:
6503 do not let the ibuf i/o handler sleep */
6504
6505 type.clear_do_not_wake();
6506
6507 array = read_only ? AIO::s_reads : AIO::s_ibuf;
6508 break;
6509
6510 case OS_AIO_LOG:
6511
6512 array = read_only ? AIO::s_reads : AIO::s_log;
6513 break;
6514
6515 case OS_AIO_SYNC:
6516
6517 array = AIO::s_sync;
6518 #if defined(LINUX_NATIVE_AIO)
6519 /* In Linux native AIO we don't use sync IO array. */
6520 ut_a(!srv_use_native_aio);
6521 #endif /* LINUX_NATIVE_AIO */
6522 break;
6523
6524 default:
6525 ut_error;
6526 array = NULL; /* Eliminate compiler warning */
6527 }
6528
6529 return(array);
6530 }
6531
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!

Completions are dequeued from the completion port of the array that owns
the given segment. A completion that belongs to a slot of another array is
forwarded to that array's port and the wait continues. A failed or short
operation is retried synchronously if os_file_handle_error() asks for it.
@param[in]	segment	The number of the segment in the aio arrays to
			wait for; segment 0 is the ibuf I/O thread,
			segment 1 the log I/O thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads; must not be
			ULINT_UNDEFINED (asserted)
@param[in]	pos	not referenced in the body of this function
@param[out]	m1	the messages passed with the AIO request; note
			that also in the case where the AIO operation
			failed, these output parameters are valid and
			can be used to restart the operation,
			for example; set to NULL when the shutdown key
			was received
@param[out]	m2	callback message; set to NULL when the shutdown
			key was received
@param[out]	type	OS_FILE_WRITE or ..._READ; copied from the slot
@return DB_SUCCESS or error code */



static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot= 0;
	dberr_t		err;

	BOOL		ret;
	ULONG_PTR	key;

	ut_a(segment != ULINT_UNDEFINED);

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());
	AIO *my_array;
	AIO::get_array_and_local_segment(&my_array, segment);

	/* This thread waits on the completion port of the array that owns
	the segment. */
	HANDLE port = my_array->m_completion_port;
	ut_ad(port);
	for (;;) {
		DWORD len;
		/* Block (no timeout) until a completion packet arrives.
		NOTE(review): the dequeued OVERLAPPED pointer is stored
		directly into slot, which presumably relies on
		Slot::control being the first member of Slot - confirm
		against the Slot definition. */
		ret = GetQueuedCompletionStatus(port, &len, &key,
			(OVERLAPPED **)&slot, INFINITE);

		/* If shutdown key was received, repost the shutdown message and exit */
		if (ret && key == IOCP_SHUTDOWN_KEY) {
			PostQueuedCompletionStatus(port, 0, key, NULL);
			*m1 = NULL;
			*m2 = NULL;
			return (DB_SUCCESS);
		}

		ut_a(slot);

		if (!ret) {
			/* IO failed */
			break;
		}

		/* Record the number of bytes transferred. */
		slot->n_bytes= len;
		ut_a(slot->array);
		HANDLE slot_port = slot->array->m_completion_port;
		if (slot_port != port) {
			/* there are no redirections between data and log */
			ut_ad(port == data_completion_port);
			ut_ad(slot_port != log_completion_port);

			/*
			Redirect completions to the dedicated completion port
			and threads.

			"Write array" threads receive write,read and ibuf
			notifications, read and ibuf completions are redirected.

			Forwarding IO completion this way costs a context switch,
			and this seems tolerable since asynchronous reads are by
			far less frequent.
			*/
			ut_a(PostQueuedCompletionStatus(slot_port,
				len, key, &slot->control));
		}
		else {
			/* The completion belongs to our own array:
			handle it below. */
			break;
		}
	}

	ut_a(slot->is_reserved);

	/* Hand the request's messages and type back to the caller, also
	when the operation failed. */
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	bool retry = false;

	if (ret && slot->n_bytes == slot->len) {

		/* The whole block was transferred. */
		err = DB_SUCCESS;

	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		retry = true;

	} else {

		err = DB_IO_ERROR;
	}


	if (retry) {
		/* Retry failed read/write operation synchronously. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		PSI_file_locker_state	state;
		struct PSI_file_locker* locker = NULL;

		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		/* The retry succeeds only if the full length was
		transferred. */
		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	/* Free the slot in the array that owns it. */
	slot->array->release_with_mutex(slot);

	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
		&& !buf_page_cleaner_is_active
		&& os_aio_all_slots_free()) {
		/* Last IO, wakeup other io threads */
		AIO::wake_at_shutdown();
	}
	return(err);
}
#endif /* WIN_ASYNC_IO */
6697
/**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an i/o operation. With mode OS_AIO_SYNC the request is executed
immediately through os_file_read_func() / os_file_write_func(); otherwise
a slot is reserved in the AIO array selected for the request and the
operation is dispatched from that slot.
@param[in,out]	type		IO request context; must be a read or a
				write request
@param[in]	mode		IO mode
@param[in]	name		Name of the file or path as NUL terminated
				string
@param[in]	file		Open file handle
@param[in,out]	buf		buffer to read into (read request) or to
				write from (write request)
@param[in]	offset		file offset of the operation; debug builds
				assert it is a multiple of
				OS_FILE_LOG_BLOCK_SIZE
@param[in]	n		number of bytes to transfer; debug builds
				assert it is non-zero and a multiple of
				OS_FILE_LOG_BLOCK_SIZE
@param[in]	read_only	passed on to AIO::select_slot_array() when
				choosing the slot array
@param[in,out]	m1		Message for the AIO handler, (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in,out]	m2		message for the AIO handler (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC

@return DB_SUCCESS if the request was queued (or, in OS_AIO_SYNC mode,
the result of the synchronous read/write); DB_IO_ERROR if dispatching
failed and os_file_handle_error() did not ask for a retry */
dberr_t
os_aio_func(
	IORequest&	type,
	ulint		mode,
	const char*	name,
	pfs_os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	bool		read_only,
	fil_node_t*	m1,
	void*		m2)
{
#ifdef WIN_ASYNC_IO
	/* Stays TRUE when no ReadFile()/WriteFile() call is issued below,
	so that the result check after the dispatch reports success. */
	BOOL		ret = TRUE;
#endif /* WIN_ASYNC_IO */

	ut_ad(n > 0);
	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad(os_aio_validate_skip());

#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits on this code path. */
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */

	/* Debug injection point: force the synchronous path and clear
	the "disk full" flag. */
	DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
			mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);

	if (mode == OS_AIO_SYNC) {
		/* Synchronous request: no slot is reserved and m1/m2
		are not used. */
		if (type.is_read()) {
			return(os_file_read_func(type, file, buf, offset, n));
		}

		ut_ad(type.is_write());

		return(os_file_write_func(type, name, file, buf, offset, n));
	}

	/* Re-entered from err_exit when os_file_handle_error() asks for
	a retry; a fresh slot is reserved on every attempt. */
try_again:

	AIO*	array;

	array = AIO::select_slot_array(type, read_only, mode);

	Slot*	slot;

	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);

	if (type.is_read()) {


		if (srv_use_native_aio) {

			++os_n_file_reads;

			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(
				file, slot->ptr, slot->len,
				NULL, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else if (type.is_wake()) {
			/* Simulated AIO: the slot stays reserved and the
			handler thread of its segment is woken up. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else if (type.is_write()) {

		if (srv_use_native_aio) {
			++os_n_file_writes;

#ifdef WIN_ASYNC_IO
			ret = WriteFile(
				file, slot->ptr, slot->len,
				NULL, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */

		} else if (type.is_wake()) {
			/* Simulated AIO: the slot stays reserved and the
			handler thread of its segment is woken up. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else {
		/* Only read and write requests are supported here. */
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
		/* aio completed or was queued successfully! */
		return(DB_SUCCESS);
	}

	goto err_exit;

#endif /* WIN_ASYNC_IO */

	/* AIO request was queued successfully! */
	return(DB_SUCCESS);

	/* The label only exists in builds that contain a goto to it. */
#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */

	/* Dispatch failed: give the slot back before deciding whether
	to retry. */
	array->release_with_mutex(slot);

	if (os_file_handle_error(
		name, type.is_read() ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(DB_IO_ERROR);
}
6838
6839 /** Simulated AIO handler for reaping IO requests */
6840 class SimulatedAIOHandler {
6841
6842 public:
6843
6844 /** Constructor
6845 @param[in,out] array The AIO array
6846 @param[in] segment Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)6847 SimulatedAIOHandler(AIO* array, ulint segment)
6848 :
6849 m_oldest(),
6850 m_n_elems(),
6851 m_lowest_offset(IB_UINT64_MAX),
6852 m_array(array),
6853 m_n_slots(),
6854 m_segment(segment),
6855 m_ptr(),
6856 m_buf()
6857 {
6858 ut_ad(m_segment < 100);
6859
6860 m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
6861 }
6862
6863 /** Destructor */
~SimulatedAIOHandler()6864 ~SimulatedAIOHandler()
6865 {
6866 if (m_ptr != NULL) {
6867 ut_free(m_ptr);
6868 }
6869 }
6870
6871 /** Reset the state of the handler
6872 @param[in] n_slots Number of pending AIO operations supported */
init(ulint n_slots)6873 void init(ulint n_slots)
6874 {
6875 m_oldest = 0;
6876 m_n_elems = 0;
6877 m_n_slots = n_slots;
6878 m_lowest_offset = IB_UINT64_MAX;
6879
6880 if (m_ptr != NULL) {
6881 ut_free(m_ptr);
6882 m_ptr = m_buf = NULL;
6883 }
6884
6885 m_slots[0] = NULL;
6886 }
6887
6888 /** Check if there is a slot for which the i/o has already been done
6889 @param[out] n_reserved Number of reserved slots
6890 @return the first completed slot that is found. */
check_completed(ulint * n_reserved)6891 Slot* check_completed(ulint* n_reserved)
6892 {
6893 ulint offset = m_segment * m_n_slots;
6894
6895 *n_reserved = 0;
6896
6897 Slot* slot;
6898
6899 slot = m_array->at(offset);
6900
6901 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6902
6903 if (slot->is_reserved) {
6904
6905 if (slot->io_already_done) {
6906
6907 ut_a(slot->is_reserved);
6908
6909 return(slot);
6910 }
6911
6912 ++*n_reserved;
6913 }
6914 }
6915
6916 return(NULL);
6917 }
6918
6919 /** If there are at least 2 seconds old requests, then pick the
6920 oldest one to prevent starvation. If several requests have the
6921 same age, then pick the one at the lowest offset.
6922 @return true if request was selected */
select()6923 bool select()
6924 {
6925 if (!select_oldest()) {
6926
6927 return(select_lowest_offset());
6928 }
6929
6930 return(true);
6931 }
6932
6933 /** Check if there are several consecutive blocks
6934 to read or write. Merge them if found. */
merge()6935 void merge()
6936 {
6937 /* if m_n_elems != 0, then we have assigned
6938 something valid to consecutive_ios[0] */
6939 ut_ad(m_n_elems != 0);
6940 ut_ad(first_slot() != NULL);
6941
6942 Slot* slot = first_slot();
6943
6944 while (!merge_adjacent(slot)) {
6945 /* No op */
6946 }
6947 }
6948
6949 /** We have now collected n_consecutive I/O requests
6950 in the array; allocate a single buffer which can hold
6951 all data, and perform the I/O
6952 @return the length of the buffer */
allocate_buffer()6953 ulint allocate_buffer()
6954 MY_ATTRIBUTE((warn_unused_result))
6955 {
6956 ulint len;
6957 Slot* slot = first_slot();
6958
6959 ut_ad(m_ptr == NULL);
6960
6961 if (slot->type.is_read() && m_n_elems > 1) {
6962
6963 len = 0;
6964
6965 for (ulint i = 0; i < m_n_elems; ++i) {
6966 len += m_slots[i]->len;
6967 }
6968
6969 m_ptr = static_cast<byte*>(
6970 ut_malloc_nokey(len + srv_page_size));
6971
6972 m_buf = static_cast<byte*>(
6973 ut_align(m_ptr, srv_page_size));
6974
6975 } else {
6976 len = first_slot()->len;
6977 m_buf = first_slot()->buf;
6978 }
6979
6980 return(len);
6981 }
6982
6983 /** We have to compress the individual pages and punch
6984 holes in them on a page by page basis when writing to
6985 tables that can be compresed at the IO level.
6986 @param[in] len Value returned by allocate_buffer */
copy_to_buffer(ulint len)6987 void copy_to_buffer(ulint len)
6988 {
6989 Slot* slot = first_slot();
6990
6991 if (len > slot->len && slot->type.is_write()) {
6992
6993 byte* ptr = m_buf;
6994
6995 ut_ad(ptr != slot->buf);
6996
6997 /* Copy the buffers to the combined buffer */
6998 for (ulint i = 0; i < m_n_elems; ++i) {
6999
7000 slot = m_slots[i];
7001
7002 memmove(ptr, slot->buf, slot->len);
7003
7004 ptr += slot->len;
7005 }
7006 }
7007 }
7008
7009 /** Do the I/O with ordinary, synchronous i/o functions:
7010 @param[in] len Length of buffer for IO */
io()7011 void io()
7012 {
7013 if (first_slot()->type.is_write()) {
7014
7015 for (ulint i = 0; i < m_n_elems; ++i) {
7016 write(m_slots[i]);
7017 }
7018
7019 } else {
7020
7021 for (ulint i = 0; i < m_n_elems; ++i) {
7022 read(m_slots[i]);
7023 }
7024 }
7025 }
7026
7027 /** Mark the i/os done in slots */
done()7028 void done()
7029 {
7030 for (ulint i = 0; i < m_n_elems; ++i) {
7031 m_slots[i]->io_already_done = true;
7032 }
7033 }
7034
7035 /** @return the first slot in the consecutive array */
first_slot()7036 Slot* first_slot()
7037 MY_ATTRIBUTE((warn_unused_result))
7038 {
7039 ut_a(m_n_elems > 0);
7040
7041 return(m_slots[0]);
7042 }
7043
7044 /** Wait for I/O requests
7045 @param[in] global_segment The global segment
7046 @param[in,out] event Wait on event if no active requests
7047 @return the number of slots */
7048 ulint check_pending(
7049 ulint global_segment,
7050 os_event_t event)
7051 MY_ATTRIBUTE((warn_unused_result));
7052 private:
7053
7054 /** Do the file read
7055 @param[in,out] slot Slot that has the IO context */
read(Slot * slot)7056 void read(Slot* slot)
7057 {
7058 dberr_t err = os_file_read(
7059 slot->type,
7060 slot->file,
7061 slot->ptr,
7062 slot->offset,
7063 slot->len);
7064
7065 ut_a(err == DB_SUCCESS);
7066 }
7067
7068 /** Do the file read
7069 @param[in,out] slot Slot that has the IO context */
write(Slot * slot)7070 void write(Slot* slot)
7071 {
7072 dberr_t err = os_file_write(
7073 slot->type,
7074 slot->name,
7075 slot->file,
7076 slot->ptr,
7077 slot->offset,
7078 slot->len);
7079
7080 ut_a(err == DB_SUCCESS);
7081 }
7082
7083 /** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7084 bool adjacent(const Slot* s1, const Slot* s2) const
7085 {
7086 return(s1 != s2
7087 && s1->file == s2->file
7088 && s2->offset == s1->offset + s1->len
7089 && s1->type == s2->type);
7090 }
7091
7092 /** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7093 bool merge_adjacent(Slot*& current)
7094 {
7095 Slot* slot;
7096 ulint offset = m_segment * m_n_slots;
7097
7098 slot = m_array->at(offset);
7099
7100 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7101
7102 if (slot->is_reserved && adjacent(current, slot)) {
7103
7104 current = slot;
7105
7106 /* Found a consecutive i/o request */
7107
7108 m_slots[m_n_elems] = slot;
7109
7110 ++m_n_elems;
7111
7112 return(m_n_elems >= m_slots.capacity());
7113 }
7114 }
7115
7116 return(true);
7117 }
7118
7119 /** There were no old requests. Look for an I/O request at the lowest
7120 offset in the array (we ignore the high 32 bits of the offset in these
7121 heuristics) */
select_lowest_offset()7122 bool select_lowest_offset()
7123 {
7124 ut_ad(m_n_elems == 0);
7125
7126 ulint offset = m_segment * m_n_slots;
7127
7128 m_lowest_offset = IB_UINT64_MAX;
7129
7130 for (ulint i = 0; i < m_n_slots; ++i) {
7131 Slot* slot;
7132
7133 slot = m_array->at(i + offset);
7134
7135 if (slot->is_reserved
7136 && slot->offset < m_lowest_offset) {
7137
7138 /* Found an i/o request */
7139 m_slots[0] = slot;
7140
7141 m_n_elems = 1;
7142
7143 m_lowest_offset = slot->offset;
7144 }
7145 }
7146
7147 return(m_n_elems > 0);
7148 }
7149
7150 /** Select the slot if it is older than the current oldest slot.
7151 @param[in] slot The slot to check */
select_if_older(Slot * slot)7152 void select_if_older(Slot* slot)
7153 {
7154 ulint age;
7155
7156 age = (ulint) difftime(time(NULL), slot->reservation_time);
7157
7158 if ((age >= 2 && age > m_oldest)
7159 || (age >= 2
7160 && age == m_oldest
7161 && slot->offset < m_lowest_offset)) {
7162
7163 /* Found an i/o request */
7164 m_slots[0] = slot;
7165
7166 m_n_elems = 1;
7167
7168 m_oldest = age;
7169
7170 m_lowest_offset = slot->offset;
7171 }
7172 }
7173
7174 /** Select th oldest slot in the array
7175 @return true if oldest slot found */
select_oldest()7176 bool select_oldest()
7177 {
7178 ut_ad(m_n_elems == 0);
7179
7180 Slot* slot;
7181 ulint offset = m_n_slots * m_segment;
7182
7183 slot = m_array->at(offset);
7184
7185 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7186
7187 if (slot->is_reserved) {
7188 select_if_older(slot);
7189 }
7190 }
7191
7192 return(m_n_elems > 0);
7193 }
7194
7195 typedef std::vector<Slot*> slots_t;
7196
7197 private:
7198 ulint m_oldest;
7199 ulint m_n_elems;
7200 os_offset_t m_lowest_offset;
7201
7202 AIO* m_array;
7203 ulint m_n_slots;
7204 ulint m_segment;
7205
7206 slots_t m_slots;
7207
7208 byte* m_ptr;
7209 byte* m_buf;
7210 };
7211
7212 /** Wait for I/O requests
7213 @return the number of slots */
7214 ulint
check_pending(ulint global_segment,os_event_t event)7215 SimulatedAIOHandler::check_pending(
7216 ulint global_segment,
7217 os_event_t event)
7218 {
7219 /* NOTE! We only access constant fields in os_aio_array.
7220 Therefore we do not have to acquire the protecting mutex yet */
7221
7222 ut_ad(os_aio_validate_skip());
7223
7224 ut_ad(m_segment < m_array->get_n_segments());
7225
7226 /* Look through n slots after the segment * n'th slot */
7227
7228 if (AIO::is_read(m_array)
7229 && os_aio_recommend_sleep_for_read_threads) {
7230
7231 /* Give other threads chance to add several
7232 I/Os to the array at once. */
7233
7234 srv_set_io_thread_op_info(
7235 global_segment, "waiting for i/o request");
7236
7237 os_event_wait(event);
7238
7239 return(0);
7240 }
7241
7242 return(m_array->slots_per_segment());
7243 }
7244
/** Does simulated AIO. This function should be called by an i/o-handler
thread. It blocks until a request of the segment has been completed
(either found already done, or performed here with synchronous i/o),
or until shutdown is detected with no reserved slots left.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; set to
			NULL when the function returns because of shutdown
@param[out]	m2	Callback argument; set to NULL when the function
			returns because of shutdown
@param[out]	type	IO context of the completed request; not written
			when the function returns because of shutdown
@return DB_SUCCESS (i/o failures are fatal inside the handler) */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		/* Returns 0 after having waited on the event; in that
		case start over. */
		ulint	n_slots = handler.check_pending(global_segment, event);

		if (n_slots == 0) {
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* A request of this segment is already done;
			leave the loop holding the array mutex. */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* A pending request was selected; slot is still
			NULL here. Leave the loop holding the array mutex. */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. The event is reset before the mutex
		is released. */

		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* If slot is non-NULL, it has already completed its IO and only
	needs to be reported. Otherwise the request chosen by
	handler.select() has to be performed first. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		array->acquire();

		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	/* Copy the results out before the slot is given back. */
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	array->release(slot);

	array->release();

	return(DB_SUCCESS);
}
7395
7396 /** Get the total number of pending IOs
7397 @return the total number of pending IOs */
7398 ulint
total_pending_io_count()7399 AIO::total_pending_io_count()
7400 {
7401 ulint count = s_reads->pending_io_count();
7402
7403 if (s_writes != NULL) {
7404 count += s_writes->pending_io_count();
7405 }
7406
7407 if (s_ibuf != NULL) {
7408 count += s_ibuf->pending_io_count();
7409 }
7410
7411 if (s_log != NULL) {
7412 count += s_log->pending_io_count();
7413 }
7414
7415 if (s_sync != NULL) {
7416 count += s_sync->pending_io_count();
7417 }
7418
7419 return(count);
7420 }
7421
7422 /** Validates the consistency the aio system.
7423 @return true if ok */
7424 static
7425 bool
os_aio_validate()7426 os_aio_validate()
7427 {
7428 /* The methods countds and validates, we ignore the count. */
7429 AIO::total_pending_io_count();
7430
7431 return(true);
7432 }
7433
7434 /** Prints pending IO requests per segment of an aio array.
7435 We probably don't need per segment statistics but they can help us
7436 during development phase to see if the IO requests are being
7437 distributed as expected.
7438 @param[in,out] file File where to print
7439 @param[in] segments Pending IO array */
7440 void
print_segment_info(FILE * file,const ulint * segments)7441 AIO::print_segment_info(
7442 FILE* file,
7443 const ulint* segments)
7444 {
7445 ut_ad(m_n_segments > 0);
7446
7447 if (m_n_segments > 1) {
7448
7449 fprintf(file, " [");
7450
7451 for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
7452
7453 if (i != 0) {
7454 fprintf(file, ", ");
7455 }
7456
7457 fprintf(file, ULINTPF, *segments);
7458 }
7459
7460 fprintf(file, "] ");
7461 }
7462 }
7463
7464 /** Prints info about the aio array.
7465 @param[in,out] file Where to print */
7466 void
print(FILE * file)7467 AIO::print(FILE* file)
7468 {
7469 ulint count = 0;
7470 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
7471
7472 mutex_enter(&m_mutex);
7473
7474 ut_a(!m_slots.empty());
7475 ut_a(m_n_segments > 0);
7476
7477 memset(n_res_seg, 0x0, sizeof(n_res_seg));
7478
7479 for (ulint i = 0; i < m_slots.size(); ++i) {
7480 Slot& slot = m_slots[i];
7481 ulint segment = (i * m_n_segments) / m_slots.size();
7482
7483 if (slot.is_reserved) {
7484
7485 ++count;
7486
7487 ++n_res_seg[segment];
7488
7489 ut_a(slot.len > 0);
7490 }
7491 }
7492
7493 ut_a(m_n_reserved == count);
7494
7495 print_segment_info(file, n_res_seg);
7496
7497 mutex_exit(&m_mutex);
7498 }
7499
7500 /** Print all the AIO segments
7501 @param[in,out] file Where to print */
7502 void
print_all(FILE * file)7503 AIO::print_all(FILE* file)
7504 {
7505 s_reads->print(file);
7506
7507 if (s_writes != NULL) {
7508 fputs(", aio writes:", file);
7509 s_writes->print(file);
7510 }
7511
7512 if (s_ibuf != NULL) {
7513 fputs(",\n ibuf aio reads:", file);
7514 s_ibuf->print(file);
7515 }
7516
7517 if (s_log != NULL) {
7518 fputs(", log i/o's:", file);
7519 s_log->print(file);
7520 }
7521
7522 if (s_sync != NULL) {
7523 fputs(", sync i/o's:", file);
7524 s_sync->print(file);
7525 }
7526 }
7527
7528 /** Prints info of the aio arrays.
7529 @param[in,out] file file where to print */
7530 void
os_aio_print(FILE * file)7531 os_aio_print(FILE* file)
7532 {
7533 time_t current_time;
7534 double time_elapsed;
7535 double avg_bytes_read;
7536
7537 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
7538 fprintf(file, "I/O thread " ULINTPF " state: %s (%s)",
7539 i,
7540 srv_io_thread_op_info[i],
7541 srv_io_thread_function[i]);
7542
7543 #ifndef _WIN32
7544 if (!srv_use_native_aio
7545 && os_event_is_set(os_aio_segment_wait_events[i])) {
7546 fprintf(file, " ev set");
7547 }
7548 #endif /* _WIN32 */
7549
7550 fprintf(file, "\n");
7551 }
7552
7553 fputs("Pending normal aio reads:", file);
7554
7555 AIO::print_all(file);
7556
7557 putc('\n', file);
7558 current_time = time(NULL);
7559 time_elapsed = 0.001 + difftime(current_time, os_last_printout);
7560
7561 fprintf(file,
7562 "Pending flushes (fsync) log: " ULINTPF
7563 "; buffer pool: " ULINTPF "\n"
7564 ULINTPF " OS file reads, "
7565 ULINTPF " OS file writes, "
7566 ULINTPF " OS fsyncs\n",
7567 fil_n_pending_log_flushes,
7568 fil_n_pending_tablespace_flushes,
7569 os_n_file_reads,
7570 os_n_file_writes,
7571 os_n_fsyncs);
7572
7573 const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
7574 const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
7575
7576 if (n_reads != 0 || n_writes != 0) {
7577 fprintf(file,
7578 ULINTPF " pending reads, " ULINTPF " pending writes\n",
7579 n_reads, n_writes);
7580 }
7581
7582 if (os_n_file_reads == os_n_file_reads_old) {
7583 avg_bytes_read = 0.0;
7584 } else {
7585 avg_bytes_read = (double) os_bytes_read_since_printout
7586 / (os_n_file_reads - os_n_file_reads_old);
7587 }
7588
7589 fprintf(file,
7590 "%.2f reads/s, " ULINTPF " avg bytes/read,"
7591 " %.2f writes/s, %.2f fsyncs/s\n",
7592 (os_n_file_reads - os_n_file_reads_old)
7593 / time_elapsed,
7594 (ulint) avg_bytes_read,
7595 (os_n_file_writes - os_n_file_writes_old)
7596 / time_elapsed,
7597 (os_n_fsyncs - os_n_fsyncs_old)
7598 / time_elapsed);
7599
7600 os_n_file_reads_old = os_n_file_reads;
7601 os_n_file_writes_old = os_n_file_writes;
7602 os_n_fsyncs_old = os_n_fsyncs;
7603 os_bytes_read_since_printout = 0;
7604
7605 os_last_printout = current_time;
7606 }
7607
7608 /** Refreshes the statistics used to print per-second averages. */
7609 void
os_aio_refresh_stats()7610 os_aio_refresh_stats()
7611 {
7612 os_n_fsyncs_old = os_n_fsyncs;
7613
7614 os_bytes_read_since_printout = 0;
7615
7616 os_n_file_reads_old = os_n_file_reads;
7617
7618 os_n_file_writes_old = os_n_file_writes;
7619
7620 os_n_fsyncs_old = os_n_fsyncs;
7621
7622 os_bytes_read_since_printout = 0;
7623
7624 os_last_printout = time(NULL);
7625 }
7626
7627 /** Checks that all slots in the system have been freed, that is, there are
7628 no pending io operations.
7629 @return true if all free */
7630 bool
os_aio_all_slots_free()7631 os_aio_all_slots_free()
7632 {
7633 return(AIO::total_pending_io_count() == 0);
7634 }
7635
7636 #ifdef UNIV_DEBUG
7637 /** Prints all pending IO for the array
7638 @param[in] file file where to print
7639 @param[in] array array to process */
7640 void
to_file(FILE * file) const7641 AIO::to_file(FILE* file) const
7642 {
7643 acquire();
7644
7645 fprintf(file, " " ULINTPF "\n", m_n_reserved);
7646
7647 for (ulint i = 0; i < m_slots.size(); ++i) {
7648
7649 const Slot& slot = m_slots[i];
7650
7651 if (slot.is_reserved) {
7652
7653 fprintf(file,
7654 "%s IO for %s (offset=" UINT64PF
7655 ", size=%lu)\n",
7656 slot.type.is_read() ? "read" : "write",
7657 slot.name, slot.offset, (unsigned long)(slot.len));
7658 }
7659 }
7660
7661 release();
7662 }
7663
7664 /** Print pending IOs for all arrays */
7665 void
print_to_file(FILE * file)7666 AIO::print_to_file(FILE* file)
7667 {
7668 fprintf(file, "Pending normal aio reads:");
7669
7670 s_reads->to_file(file);
7671
7672 if (s_writes != NULL) {
7673 fprintf(file, "Pending normal aio writes:");
7674 s_writes->to_file(file);
7675 }
7676
7677 if (s_ibuf != NULL) {
7678 fprintf(file, "Pending ibuf aio reads:");
7679 s_ibuf->to_file(file);
7680 }
7681
7682 if (s_log != NULL) {
7683 fprintf(file, "Pending log i/o's:");
7684 s_log->to_file(file);
7685 }
7686
7687 if (s_sync != NULL) {
7688 fprintf(file, "Pending sync i/o's:");
7689 s_sync->to_file(file);
7690 }
7691 }
7692
7693 /** Prints all pending IO
7694 @param[in] file File where to print */
7695 void
os_aio_print_pending_io(FILE * file)7696 os_aio_print_pending_io(
7697 FILE* file)
7698 {
7699 AIO::print_to_file(file);
7700 }
7701
7702 #endif /* UNIV_DEBUG */
7703
7704 /**
7705 Set the file create umask
7706 @param[in] umask The umask to use for file creation. */
7707 void
os_file_set_umask(ulint umask)7708 os_file_set_umask(ulint umask)
7709 {
7710 os_innodb_umask = umask;
7711 }
7712
7713 #else
7714 #include "univ.i"
7715 #endif /* !UNIV_INNOCHECKSUM */
7716
7717 /** Normalizes a directory path for the current OS:
7718 On Windows, we convert '/' to '\', else we convert '\' to '/'.
7719 @param[in,out] str A null-terminated directory and file path */
7720 void
os_normalize_path(char * str)7721 os_normalize_path(
7722 char* str)
7723 {
7724 if (str != NULL) {
7725 for (; *str; str++) {
7726 if (*str == OS_PATH_SEPARATOR_ALT) {
7727 *str = OS_PATH_SEPARATOR;
7728 }
7729 }
7730 }
7731 }
7732