1 /***********************************************************************
2
3 Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2009, Percona Inc.
5 Copyright (c) 2013, 2021, MariaDB Corporation.
6
7 Portions of this file contain modifications contributed and copyrighted
8 by Percona Inc.. Those modifications are
9 gratefully acknowledged and are described briefly in the InnoDB
10 documentation. The contributions by Percona Inc. are incorporated with
11 their permission, and subject to the conditions contained in the file
12 COPYING.Percona.
13
14 This program is free software; you can redistribute it and/or modify it
15 under the terms of the GNU General Public License as published by the
16 Free Software Foundation; version 2 of the License.
17
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
21 Public License for more details.
22
23 You should have received a copy of the GNU General Public License along with
24 this program; if not, write to the Free Software Foundation, Inc.,
25 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
26
27 ***********************************************************************/
28
29 /**************************************************//**
30 @file os/os0file.cc
31 The interface to the operating system file i/o primitives
32
33 Created 10/21/1995 Heikki Tuuri
34 *******************************************************/
35
36 #ifndef UNIV_INNOCHECKSUM
37 #include "os0file.h"
38 #include "sql_const.h"
39
40 #ifdef UNIV_LINUX
41 # include <sys/types.h>
42 # include <sys/stat.h>
43 #endif
44
45 #include "srv0srv.h"
46 #include "srv0start.h"
47 #include "fil0fil.h"
48 #include "fsp0fsp.h"
49 #ifdef HAVE_LINUX_UNISTD_H
50 #include "unistd.h"
51 #endif
52 #include "os0event.h"
53 #include "os0thread.h"
54
55 #include <vector>
56
57 #ifdef LINUX_NATIVE_AIO
58 #include <libaio.h>
59 #endif /* LINUX_NATIVE_AIO */
60
61 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
62 # include <fcntl.h>
63 # include <linux/falloc.h>
64 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
65
66 #if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
67 # include <sys/ioctl.h>
68 # ifndef DFS_IOCTL_ATOMIC_WRITE_SET
69 # define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
70 # endif
71 #endif
72
73 #ifdef _WIN32
74 #include <winioctl.h>
75 #else
76 // my_test_if_atomic_write()
77 #include <my_sys.h>
78 #endif
79
80
/** Global segment number of the insert buffer segment */
static const ulint IO_IBUF_SEGMENT = 0;

/** Global segment number of the log segment */
static const ulint IO_LOG_SEGMENT = 1;

/** Number of retries for partial I/O's */
static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;

/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask */

#ifndef _WIN32
/** Umask for creating files.
NOTE(review): despite the name, the initial value is a set of permission
bits (read/write for user and group), not a mask of bits to clear. */
static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
#else
/** Umask for creating files */
static ulint os_innodb_umask = 0;
/* NOTE(review): presumably the I/O completion ports used for data file
and log file I/O respectively -- confirm against their users. */
static HANDLE data_completion_port;
static HANDLE log_completion_port;

/* Initialized to FLS_OUT_OF_INDEXES, i.e. no index has been allocated yet.
NOTE(review): presumably the fiber-local-storage index backing
win_get_syncio_event() -- confirm. */
static DWORD fls_sync_io = FLS_OUT_OF_INDEXES;
/** Completion key posted by AIO::wake_at_shutdown() */
#define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1
#endif /* _WIN32 */

/** In simulated aio, merge at most this many consecutive i/os */
static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;

/** Flag indicating if the page_cleaner is in active state. */
extern bool buf_page_cleaner_is_active;

/** WAIT_ALLOW_WRITES() waits on srv_allow_writes_event when the server is
built WITH_INNODB_DISALLOW_WRITES; otherwise it expands to a no-op. */
#ifdef WITH_INNODB_DISALLOW_WRITES
#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
#else
#define WAIT_ALLOW_WRITES() do { } while (0)
#endif /* WITH_INNODB_DISALLOW_WRITES */
118
119 /**********************************************************************
120
121 InnoDB AIO Implementation:
122 =========================
123
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO by special IO-threads servicing the IO-requests.
126
127 Simulated AIO:
128 ==============
129
130 On platforms where we 'simulate' AIO, the following is a rough explanation
131 of the high level design.
132 There are four io-threads (for ibuf, log, read, write).
133 All synchronous IO requests are serviced by the calling thread using
134 os_file_write/os_file_read. The Asynchronous requests are queued up
135 in an array (there are four such arrays) by the calling thread.
136 Later these requests are picked up by the IO-thread and are serviced
137 synchronously.
138
139 Windows native AIO:
140 ==================
141
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file is
opened for AIO no synchronous IO can be done on it. Therefore we have an
extra fifth array to queue up synchronous IO requests.
147 There are innodb_file_io_threads helper threads. These threads work
148 on the four arrays mentioned above in Simulated AIO. No thread is
149 required for the sync array.
150 If a synchronous IO request is made, it is first queued in the sync
151 array. Then the calling thread itself waits on the request, thus
152 making the call synchronous.
153 If an AIO request is made the calling thread not only queues it in the
154 array but also submits the requests. The helper thread then collects
155 the completed IO request and calls completion routine on it.
156
157 Linux native AIO:
158 =================
159
160 If we have libaio installed on the system and innodb_use_native_aio
161 is set to true we follow the code path of native AIO, otherwise we
162 do simulated AIO.
163 There are innodb_file_io_threads helper threads. These threads work
164 on the four arrays mentioned above in Simulated AIO.
165 If a synchronous IO request is made, it is handled by calling
166 os_file_write/os_file_read.
167 If an AIO request is made the calling thread not only queues it in the
168 array but also submits the requests. The helper thread then collects
169 the completed IO request and calls completion routine on it.
170
171 **********************************************************************/
172
173
#ifdef UNIV_PFS_IO
/* Keys to register InnoDB I/O with performance schema: one key each for
data files, log files and temporary files. */
mysql_pfs_key_t	innodb_data_file_key;
mysql_pfs_key_t	innodb_log_file_key;
mysql_pfs_key_t	innodb_temp_file_key;
#endif /* UNIV_PFS_IO */

/* Forward declaration: Slot refers to AIO before AIO is defined. */
class AIO;
182
/** The asynchronous I/O context: one entry of an AIO array, describing a
single pending read or write request. */
struct Slot {

#ifdef WIN_ASYNC_IO
	/** Windows control block for the aio request
	must be at the very start of Slot, so we can
	cast Slot* to OVERLAPPED*
	*/
	OVERLAPPED		control;
#endif

	/** index of the slot in the aio array */
	uint16_t		pos;

	/** true if this slot is reserved */
	bool			is_reserved;

	/** time when reserved */
	time_t			reservation_time;

	/** buffer used in i/o */
	byte*			buf;

	/** Buffer pointer used for actual IO. We advance this
	when partial IO is required and not buf */
	byte*			ptr;

	/** The I/O request context (OS_FILE_READ or OS_FILE_WRITE) */
	IORequest		type;

	/** file offset in bytes */
	os_offset_t		offset;

	/** file where to read or write */
	pfs_os_file_t		file;

	/** file name or path */
	const char*		name;

	/** used only in simulated aio: true if the physical i/o
	already made and only the slot message needs to be passed
	to the caller of os_aio_simulated_handle */
	bool			io_already_done;

	/** file block size */
	ulint			file_block_size;

	/** The file node for which the IO is requested. */
	fil_node_t*		m1;

	/** the requester of an aio operation and which can be used
	to identify which pending aio operation was completed */
	void*			m2;

	/** AIO completion status */
	dberr_t			err;

	/* The members below depend on the AIO flavour; note that the
	type of n_bytes and len differs between the three variants. */
#ifdef WIN_ASYNC_IO

	/** bytes written/read */
	DWORD			n_bytes;

	/** length of the block to read or write */
	DWORD			len;

	/** aio array containing this slot */
	AIO			*array;
#elif defined(LINUX_NATIVE_AIO)
	/** Linux control block for aio */
	struct iocb		control;

	/** AIO return code */
	int			ret;

	/** bytes written/read. */
	ssize_t			n_bytes;

	/** length of the block to read or write */
	ulint			len;
#else
	/** length of the block to read or write */
	ulint			len;

	/** bytes written/read. */
	ulint			n_bytes;
#endif /* WIN_ASYNC_IO */

	/** Length of the block before it was compressed */
	uint32			original_len;

};
274
/** The asynchronous i/o array structure: a mutex-protected array of Slot
objects, logically divided into m_n_segments segments. The static members
hold the per-purpose arrays (ibuf, log, reads, writes, sync). */
class AIO {
public:
	/** Constructor
	@param[in]	id		Latch ID
	@param[in]	n_slots		Number of slots to configure
	@param[in]	segments	Number of segments to configure */
	AIO(latch_id_t id, ulint n_slots, ulint segments);

	/** Destructor */
	~AIO();

	/** Initialize the instance
	@return DB_SUCCESS or error code */
	dberr_t init();

	/** Requests for a slot in the aio array. If no slot is available, waits
	until not_full-event becomes signaled.

	@param[in]	type	IO context
	@param[in,out]	m1	message to be passed along with the AIO
				operation
	@param[in,out]	m2	message to be passed along with the AIO
				operation
	@param[in]	file	file handle
	@param[in]	name	name of the file or path as a null-terminated
				string
	@param[in,out]	buf	buffer where to read or from which to write
	@param[in]	offset	file offset, where to read from or start writing
	@param[in]	len	length of the block to read or write
	@return pointer to slot */
	Slot* reserve_slot(
		const IORequest&	type,
		fil_node_t*		m1,
		void*			m2,
		pfs_os_file_t		file,
		const char*		name,
		void*			buf,
		os_offset_t		offset,
		ulint			len)
		MY_ATTRIBUTE((warn_unused_result));

	/** @return number of reserved slots */
	ulint pending_io_count() const;

	/** Returns a pointer to the nth slot in the aio array.
	Asserts that the index is within the array.
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	const Slot* at(ulint i) const
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Non const version of at().
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	Slot* at(ulint i)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Frees a slot in the AIO array, assumes caller owns the mutex.
	@param[in,out]	slot	Slot to release */
	void release(Slot* slot);

	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
	@param[in,out]	slot	Slot to release */
	void release_with_mutex(Slot* slot);

	/** Prints info about the aio array.
	@param[in,out]	file	Where to print */
	void print(FILE* file);

	/** @return the number of slots per segment, i.e. the total number
	of slots divided by the number of segments */
	ulint slots_per_segment() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_slots.size() / m_n_segments);
	}

	/** @return accessor for n_segments */
	ulint get_n_segments() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_n_segments);
	}

#ifdef UNIV_DEBUG
	/** @return true if the thread owns the mutex */
	bool is_mutex_owned() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(mutex_own(&m_mutex));
	}
#endif /* UNIV_DEBUG */

	/** Acquire the mutex */
	void acquire() const
	{
		mutex_enter(&m_mutex);
	}

	/** Release the mutex. Not to be confused with the release(Slot*)
	overload, which frees a slot. */
	void release() const
	{
		mutex_exit(&m_mutex);
	}

	/** Write out the state to the file/stream
	@param[in, out]	file	File to write to */
	void to_file(FILE* file) const;

#ifdef LINUX_NATIVE_AIO
	/** Dispatch an AIO request to the kernel.
	@param[in,out]	slot	an already reserved slot
	@return true on success. */
	bool linux_dispatch(Slot* slot)
		MY_ATTRIBUTE((warn_unused_result));

	/** Accessor for an AIO event
	@param[in]	index	Index into the array
	@return the event at the index */
	io_event* io_events(ulint index)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(index < m_events.size());

		return(&m_events[index]);
	}

	/** Accessor for the AIO context
	@param[in]	segment	Segment for which to get the context
	@return the AIO context for the segment */
	io_context_t io_ctx(ulint segment)
		MY_ATTRIBUTE((warn_unused_result))
	{
		/* Only a debug assertion: the index is not checked in
		non-debug builds. */
		ut_ad(segment < get_n_segments());

		return(m_aio_ctx[segment]);
	}

	/** Creates an io_context_t for native linux AIO.
	@param[in]	max_events	number of events
	@param[out]	io_ctx		io_ctx to initialize.
	@return true on success. */
	static bool linux_create_io_ctx(unsigned max_events, io_context_t& io_ctx)
		MY_ATTRIBUTE((warn_unused_result));

	/** Checks if the system supports native linux aio. On some kernel
	versions where native aio is supported it won't work on tmpfs. In such
	cases we can't use native aio as it is not possible to mix simulated
	and native aio.
	@return true if supported, false otherwise. */
	static bool is_linux_native_aio_supported()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

#ifdef WIN_ASYNC_IO
	/** I/O completion port of this array */
	HANDLE m_completion_port;
	/** Wake up all AIO threads in Windows native aio: posts
	IOCP_SHUTDOWN_KEY to the completion port of each of the read,
	write, log and ibuf arrays that exists (is not NULL). */
	static void wake_at_shutdown() {
		AIO *all_arrays[] = {s_reads, s_writes, s_log, s_ibuf };
		for (size_t i = 0; i < array_elements(all_arrays); i++) {
			AIO *a = all_arrays[i];
			if (a) {
				PostQueuedCompletionStatus(a->m_completion_port, 0,
					IOCP_SHUTDOWN_KEY, 0);
			}
		}
	}
#endif /* WIN_ASYNC_IO */

#ifdef _WIN32
	/** This function can be called if one wants to post a batch of reads
	and prefers an I/O - handler thread to handle them all at once later. You
	must call os_aio_simulated_wake_handler_threads later to ensure the
	threads are not left sleeping! */
	static void simulated_put_read_threads_to_sleep();
#endif /* _WIN32 */

	/** Create an instance using new(std::nothrow)
	@param[in]	id		Latch ID
	@param[in]	n_slots		The number of AIO request slots
	@param[in]	segments	The number of segments
	@return a new AIO instance */
	static AIO* create(
		latch_id_t	id,
		ulint		n_slots,
		ulint		segments)
		MY_ATTRIBUTE((warn_unused_result));

	/** Initializes the asynchronous io system. Creates one array each
	for ibuf and log I/O. Also creates one array each for read and write
	where each array is divided logically into n_readers and n_writers
	respectively. The caller must create an i/o handler thread for each
	segment in these arrays. This function also creates the sync array.
	No I/O handler thread needs to be created for that.
	@param[in]	n_per_seg	maximum number of pending aio
					operations allowed per segment
	@param[in]	n_readers	number of reader threads
	@param[in]	n_writers	number of writer threads
	@param[in]	n_slots_sync	number of slots in the sync aio array
	@return true if AIO sub-system was started successfully */
	static bool start(
		ulint		n_per_seg,
		ulint		n_readers,
		ulint		n_writers,
		ulint		n_slots_sync)
		MY_ATTRIBUTE((warn_unused_result));

	/** Free the AIO arrays */
	static void shutdown();

	/** Print all the AIO segments
	@param[in,out]	file		Where to print */
	static void print_all(FILE* file);

	/** Calculates local segment number and aio array from global
	segment number.
	@param[out]	array		AIO wait array
	@param[in]	segment		global segment number
	@return local segment number within the aio array */
	static ulint get_array_and_local_segment(
		AIO**		array,
		ulint		segment)
		MY_ATTRIBUTE((warn_unused_result));

	/** Select the IO slot array
	@param[in,out]	type		Type of IO, READ or WRITE
	@param[in]	read_only	true if running in read-only mode
	@param[in]	mode		IO mode
	@return slot array or NULL if invalid mode specified */
	static AIO* select_slot_array(
		IORequest&	type,
		bool		read_only,
		ulint		mode)
		MY_ATTRIBUTE((warn_unused_result));

	/** Calculates segment number for a slot.
	@param[in]	array		AIO wait array
	@param[in]	slot		slot in this array
	@return segment number (which is the number used by, for example,
		I/O handler threads) */
	static ulint get_segment_no_from_slot(
		const AIO*	array,
		const Slot*	slot)
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays */
	static void wake_simulated_handler_thread(ulint global_segment);

	/** Check if it is a read request
	@param[in]	aio		The AIO instance to check
	@return true if the AIO instance is for reading, i.e. it is the
	read array. */
	static bool is_read(const AIO* aio)
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(s_reads == aio);
	}

	/** Wait on the m_is_empty event of the write array, i.e. until
	there are no pending writes */
	static void wait_until_no_pending_writes()
	{
		os_event_wait(AIO::s_writes->m_is_empty);
	}

	/** Print to file
	@param[in]	file		File to write to */
	static void print_to_file(FILE* file);

	/** Check for pending IO. Gets the count and also validates the
	data structures.
	@return count of pending IO requests */
	static ulint total_pending_io_count();

private:
	/** Initialise the slots
	@return DB_SUCCESS or error code */
	dberr_t init_slots()
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do for a local segment in the AIO array.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays
	@param[in]	segment		the local segment in the AIO array */
	void wake_simulated_handler_thread(ulint global_segment, ulint segment);

	/** Prints pending IO requests per segment of an aio array.
	We probably don't need per segment statistics but they can help us
	during development phase to see if the IO requests are being
	distributed as expected.
	@param[in,out]	file		file where to print
	@param[in]	segments	pending IO array */
	void print_segment_info(
		FILE*		file,
		const ulint*	segments);

#ifdef LINUX_NATIVE_AIO
	/** Initialise the Linux native AIO data structures
	@return DB_SUCCESS or error code */
	dberr_t init_linux_native_aio()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

private:
	typedef std::vector<Slot> Slots;

	/** the mutex protecting the aio array */
	mutable SysMutex	m_mutex;

	/** The slots of the array.
	NOTE(review): the original comment required the number of elements
	to be divisible by "n_threads"; slots_per_segment() divides by
	m_n_segments, so presumably the number of segments is meant --
	confirm. */
	Slots			m_slots;

	/** Number of segments in the aio array of pending aio requests.
	A thread can wait separately for any one of the segments. */
	ulint			m_n_segments;

	/** The event which is set to the signaled state when
	there is space in the aio outside the ibuf segment;
	os_event_set() and os_event_reset() are protected by AIO::m_mutex */
	os_event_t		m_not_full;

	/** The event which is set to the signaled state when
	there are no pending i/os in this array;
	os_event_set() and os_event_reset() are protected by AIO::m_mutex */
	os_event_t		m_is_empty;

	/** Number of reserved slots in the AIO array outside
	the ibuf segment */
	ulint			m_n_reserved;


#if defined(LINUX_NATIVE_AIO)
	typedef std::vector<io_event> IOEvents;

	/** completion queue for IO. There is one such queue per
	segment. Each thread will work on one ctx exclusively. */
	std::vector<io_context_t>	m_aio_ctx;

	/** The array to collect completed IOs. There is one such
	event for each possible pending IO. The size of the array
	is equal to m_slots.size(). */
	IOEvents		m_events;
#endif /* LINUX_NATIVE_AIO */

	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
	sync AIO. These are NULL when the module has not yet been
	initialized. */

	/** Insert buffer */
	static AIO*		s_ibuf;

	/** Redo log */
	static AIO*		s_log;

	/** Reads */
	static AIO*		s_reads;

	/** Writes */
	static AIO*		s_writes;

	/** Synchronous I/O */
	static AIO*		s_sync;
};
648
/** Definitions of the static members of AIO */
AIO*	AIO::s_reads;
AIO*	AIO::s_writes;
AIO*	AIO::s_ibuf;
AIO*	AIO::s_log;
AIO*	AIO::s_sync;

#if defined(LINUX_NATIVE_AIO)
/** timeout for each io_getevents() call = 500ms (the value is
expressed in nanoseconds). */
static const ulint	OS_AIO_REAP_TIMEOUT = 500000000UL;

/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
static const ulint	OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;

/** number of attempts before giving up on io_setup(). */
static const int	OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
#endif /* LINUX_NATIVE_AIO */

/** Array of events used in simulated AIO */
static os_event_t*	os_aio_segment_wait_events;

/** Number of asynchronous I/O segments.  Set by os_aio_init().
ULINT_UNDEFINED until then. */
static ulint		os_aio_n_segments = ULINT_UNDEFINED;

/** If the following is true, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static bool		os_aio_recommend_sleep_for_read_threads;

/* I/O statistics counters.
NOTE(review): the *_old variables presumably hold the counter values at
the time of the previous printout (os_last_printout) -- confirm against
the printing code. */
Atomic_counter<ulint>	os_n_file_reads;
static ulint		os_bytes_read_since_printout;
ulint			os_n_file_writes;
ulint			os_n_fsyncs;
static ulint		os_n_file_reads_old;
static ulint		os_n_file_writes_old;
static ulint		os_n_fsyncs_old;

static time_t		os_last_printout;
/** Set once a disk-full error has been reported, so that the report is
not repeated (see os_file_handle_rename_error()). */
bool			os_has_said_disk_full;

/** Default Zip compression level */
extern uint page_zip_level;

/** Validates the consistency of the aio system.
@return true if ok */
static
bool
os_aio_validate();

/** Handle errors for file operations.
@param[in]	name		name of a file or NULL
@param[in]	operation	operation
@param[in]	should_abort	whether to abort on an unknown error
@param[in]	on_error_silent	whether to suppress reports of non-fatal errors
@return true if we should retry the operation */
static MY_ATTRIBUTE((warn_unused_result))
bool
os_file_handle_error_cond_exit(
	const char*	name,
	const char*	operation,
	bool		should_abort,
	bool		on_error_silent);
710
711 /** Does error handling when a file operation fails.
712 @param[in] name name of a file or NULL
713 @param[in] operation operation name that failed
714 @return true if we should retry the operation */
715 static
716 bool
os_file_handle_error(const char * name,const char * operation)717 os_file_handle_error(
718 const char* name,
719 const char* operation)
720 {
721 /* Exit in case of unknown error */
722 return(os_file_handle_error_cond_exit(name, operation, true, false));
723 }
724
725 /** Does error handling when a file operation fails.
726 @param[in] name name of a file or NULL
727 @param[in] operation operation name that failed
728 @param[in] on_error_silent if true then don't print any message to the log.
729 @return true if we should retry the operation */
730 static
731 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)732 os_file_handle_error_no_exit(
733 const char* name,
734 const char* operation,
735 bool on_error_silent)
736 {
737 /* Don't exit in case of unknown error */
738 return(os_file_handle_error_cond_exit(
739 name, operation, false, on_error_silent));
740 }
741
742 /** Handle RENAME error.
743 @param name old name of the file
744 @param new_name new name of the file */
os_file_handle_rename_error(const char * name,const char * new_name)745 static void os_file_handle_rename_error(const char* name, const char* new_name)
746 {
747 if (os_file_get_last_error(true) != OS_FILE_DISK_FULL) {
748 ib::error() << "Cannot rename file '" << name << "' to '"
749 << new_name << "'";
750 } else if (!os_has_said_disk_full) {
751 os_has_said_disk_full = true;
752 /* Disk full error is reported irrespective of the
753 on_error_silent setting. */
754 ib::error() << "Full disk prevents renaming file '"
755 << name << "' to '" << new_name << "'";
756 }
757 }
758
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[in]	type	IO context
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
780
#ifdef _WIN32
static HANDLE win_get_syncio_event();

/**
Synchronous wrapper around the Windows DeviceIoControl() function.

The call is issued with an OVERLAPPED structure whose event comes from
win_get_syncio_event(), and the result is then waited for, so the wrapper
also works for handles opened for async access (i.e with
FILE_FLAG_OVERLAPPED).

Accepts the same parameters as DeviceIoControl(), except for the
last parameter (OVERLAPPED).
@return the result of GetOverlappedResult() if the request completed or
was pending, otherwise the failed result of DeviceIoControl() */
static
BOOL
os_win32_device_io_control(
	HANDLE handle,
	DWORD code,
	LPVOID inbuf,
	DWORD inbuf_size,
	LPVOID outbuf,
	DWORD outbuf_size,
	LPDWORD bytes_returned
)
{
	OVERLAPPED	ov = { 0 };
	ov.hEvent = win_get_syncio_event();

	const BOOL	started = DeviceIoControl(
		handle, code, inbuf, inbuf_size, outbuf, outbuf_size,
		NULL, &ov);

	if (!started && GetLastError() != ERROR_IO_PENDING) {
		/* The request failed outright; there is nothing to wait for. */
		return started;
	}

	/* Wait for async io to complete */
	return GetOverlappedResult(handle, &ov, bytes_returned, TRUE);
}

#endif
819
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
#endif /* WIN_ASYNC_IO */
853
/** Generic AIO Handler methods. Currently handles IO post processing. */
class AIOHandler {
public:
	/** Do any post processing after a read/write
	@param[in]	slot	the reserved slot whose I/O was performed
	@return DB_SUCCESS or error code. */
	static dberr_t post_io_processing(Slot* slot);
};
861
862 /** Helper class for doing synchronous file IO. Currently, the objective
863 is to hide the OS specific code, so that the higher level functions aren't
864 peppered with #ifdef. Makes the code flow difficult to follow. */
865 class SyncFileIO {
866 public:
867 /** Constructor
868 @param[in] fh File handle
869 @param[in,out] buf Buffer to read/write
870 @param[in] n Number of bytes to read/write
871 @param[in] offset Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)872 SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
873 :
874 m_fh(fh),
875 m_buf(buf),
876 m_n(static_cast<ssize_t>(n)),
877 m_offset(offset)
878 {
879 ut_ad(m_n > 0);
880 }
881
882 /** Destructor */
~SyncFileIO()883 ~SyncFileIO()
884 {
885 /* No op */
886 }
887
888 /** Do the read/write
889 @param[in] request The IO context and type
890 @return the number of bytes read/written or negative value on error */
891 ssize_t execute(const IORequest& request);
892
893 /** Do the read/write
894 @param[in,out] slot The IO slot, it has the IO context
895 @return the number of bytes read/written or negative value on error */
896 static ssize_t execute(Slot* slot);
897
898 /** Move the read/write offset up to where the partial IO succeeded.
899 @param[in] n_bytes The number of bytes to advance */
advance(ssize_t n_bytes)900 void advance(ssize_t n_bytes)
901 {
902 m_offset += n_bytes;
903
904 ut_ad(m_n >= n_bytes);
905
906 m_n -= n_bytes;
907
908 m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
909 }
910
911 private:
912 /** Open file handle */
913 os_file_t m_fh;
914
915 /** Buffer to read/write */
916 void* m_buf;
917
918 /** Number of bytes to read/write */
919 ssize_t m_n;
920
921 /** Offset from where to read/write */
922 os_offset_t m_offset;
923 };
924
925 /** Do any post processing after a read/write
926 @return DB_SUCCESS or error code. */
927 dberr_t
post_io_processing(Slot * slot)928 AIOHandler::post_io_processing(Slot* slot)
929 {
930 ut_ad(slot->is_reserved);
931
932 /* Total bytes read so far */
933 ulint n_bytes = ulint(slot->ptr - slot->buf) + slot->n_bytes;
934
935 return(n_bytes == slot->original_len ? DB_SUCCESS : DB_FAIL);
936 }
937
938 /** Count the number of free slots
939 @return number of reserved slots */
940 ulint
pending_io_count() const941 AIO::pending_io_count() const
942 {
943 acquire();
944
945 #ifdef UNIV_DEBUG
946 ut_a(m_n_segments > 0);
947 ut_a(!m_slots.empty());
948
949 ulint count = 0;
950
951 for (ulint i = 0; i < m_slots.size(); ++i) {
952
953 const Slot& slot = m_slots[i];
954
955 if (slot.is_reserved) {
956 ++count;
957 ut_a(slot.len > 0);
958 }
959 }
960
961 ut_a(m_n_reserved == count);
962 #endif /* UNIV_DEBUG */
963
964 ulint reserved = m_n_reserved;
965
966 release();
967
968 return(reserved);
969 }
970
#ifdef UNIV_DEBUG
/** Try os_aio_validate() every this many times */
# define OS_AIO_VALIDATE_SKIP	13

/** Validates the consistency of the aio system some of the time: the
real check runs only on every OS_AIO_VALIDATE_SKIP-th call.
@return true if ok or the check was skipped */
static
bool
os_aio_validate_skip()
{
	static Atomic_counter<uint32_t>	n_calls;

	if (n_calls++ % OS_AIO_VALIDATE_SKIP) {
		/* Not this call's turn: skip the validation. */
		return(true);
	}

	return(os_aio_validate());
}
#endif /* UNIV_DEBUG */
985
986 #undef USE_FILE_LOCK
987 #ifndef _WIN32
988 /* On Windows, mandatory locking is used */
989 # define USE_FILE_LOCK
990 #endif
991 #ifdef USE_FILE_LOCK
992 /** Obtain an exclusive lock on a file.
993 @param[in] fd file descriptor
994 @param[in] name file name
995 @return 0 on success */
996 static
997 int
os_file_lock(int fd,const char * name)998 os_file_lock(
999 int fd,
1000 const char* name)
1001 {
1002 if (my_disable_locking) {
1003 return 0;
1004 }
1005
1006 struct flock lk;
1007
1008 lk.l_type = F_WRLCK;
1009 lk.l_whence = SEEK_SET;
1010 lk.l_start = lk.l_len = 0;
1011
1012 if (fcntl(fd, F_SETLK, &lk) == -1) {
1013
1014 ib::error()
1015 << "Unable to lock " << name
1016 << " error: " << errno;
1017
1018 if (errno == EAGAIN || errno == EACCES) {
1019
1020 ib::info()
1021 << "Check that you do not already have"
1022 " another mysqld process using the"
1023 " same InnoDB data or log files.";
1024 }
1025
1026 return(-1);
1027 }
1028
1029 return(0);
1030 }
1031 #endif /* USE_FILE_LOCK */
1032
1033 /** Calculates local segment number and aio array from global segment number.
1034 @param[out] array aio wait array
1035 @param[in] segment global segment number
1036 @return local segment number within the aio array */
1037 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1038 AIO::get_array_and_local_segment(
1039 AIO** array,
1040 ulint segment)
1041 {
1042 ulint local_segment;
1043 ulint n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1044
1045 ut_a(segment < os_aio_n_segments);
1046
1047 if (!srv_read_only_mode && segment < n_extra_segs) {
1048
1049 /* We don't support ibuf/log IO during read only mode. */
1050
1051 if (segment == IO_IBUF_SEGMENT) {
1052
1053 *array = s_ibuf;
1054
1055 } else if (segment == IO_LOG_SEGMENT) {
1056
1057 *array = s_log;
1058
1059 } else {
1060 *array = NULL;
1061 }
1062
1063 local_segment = 0;
1064
1065 } else if (segment < s_reads->m_n_segments + n_extra_segs) {
1066
1067 *array = s_reads;
1068 local_segment = segment - n_extra_segs;
1069
1070 } else {
1071 *array = s_writes;
1072
1073 local_segment = segment
1074 - (s_reads->m_n_segments + n_extra_segs);
1075 }
1076
1077 return(local_segment);
1078 }
1079
1080 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1081 @param[in,out] slot Slot to release */
1082 void
release(Slot * slot)1083 AIO::release(Slot* slot)
1084 {
1085 ut_ad(is_mutex_owned());
1086
1087 ut_ad(slot->is_reserved);
1088
1089 slot->is_reserved = false;
1090
1091 --m_n_reserved;
1092
1093 if (m_n_reserved == m_slots.size() - 1) {
1094 os_event_set(m_not_full);
1095 }
1096
1097 if (m_n_reserved == 0) {
1098 os_event_set(m_is_empty);
1099 }
1100
1101 #if defined(LINUX_NATIVE_AIO)
1102
1103 if (srv_use_native_aio) {
1104 memset(&slot->control, 0x0, sizeof(slot->control));
1105 slot->ret = 0;
1106 slot->n_bytes = 0;
1107 } else {
1108 /* These fields should not be used if we are not
1109 using native AIO. */
1110 ut_ad(slot->n_bytes == 0);
1111 ut_ad(slot->ret == 0);
1112 }
1113
1114 #endif /* WIN_ASYNC_IO */
1115 }
1116
1117 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1118 @param[in,out] slot Slot to release */
1119 void
release_with_mutex(Slot * slot)1120 AIO::release_with_mutex(Slot* slot)
1121 {
1122 acquire();
1123
1124 release(slot);
1125
1126 release();
1127 }
1128
1129 /** Create a temporary file. This function is like tmpfile(3), but
1130 the temporary file is created in the in the mysql server configuration
1131 parameter (--tmpdir).
1132 @return temporary file handle, or NULL on error */
1133 FILE*
os_file_create_tmpfile()1134 os_file_create_tmpfile()
1135 {
1136 FILE* file = NULL;
1137 WAIT_ALLOW_WRITES();
1138 os_file_t fd = innobase_mysql_tmpfile(NULL);
1139
1140 if (fd != OS_FILE_CLOSED) {
1141 #ifdef _WIN32
1142 int crt_fd = _open_osfhandle((intptr_t)HANDLE(fd), 0);
1143 if (crt_fd != -1) {
1144 file = fdopen(crt_fd, "w+b");
1145 if (!file) {
1146 close(crt_fd);
1147 }
1148 }
1149 #else
1150 file = fdopen(fd, "w+b");
1151 if (!file) {
1152 close(fd);
1153 }
1154 #endif
1155 }
1156
1157 if (file == NULL) {
1158
1159 ib::error()
1160 << "Unable to create temporary file; errno: "
1161 << errno;
1162 }
1163
1164 return(file);
1165 }
1166
1167 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1168 NUL-terminate str. All errors are silently ignored. This function is
1169 mostly meant to be used with temporary files.
1170 @param[in,out] file File to read from
1171 @param[in,out] str Buffer where to read
1172 @param[in] size Size of buffer */
1173 void
os_file_read_string(FILE * file,char * str,ulint size)1174 os_file_read_string(
1175 FILE* file,
1176 char* str,
1177 ulint size)
1178 {
1179 if (size != 0) {
1180 rewind(file);
1181
1182 size_t flen = fread(str, 1, size - 1, file);
1183
1184 str[flen] = '\0';
1185 }
1186 }
1187
1188 /** This function returns a new path name after replacing the basename
1189 in an old path with a new basename. The old_path is a full path
1190 name including the extension. The tablename is in the normal
1191 form "databasename/tablename". The new base name is found after
1192 the forward slash. Both input strings are null terminated.
1193
1194 This function allocates memory to be returned. It is the callers
1195 responsibility to free the return value after it is no longer needed.
1196
1197 @param[in] old_path Pathname
1198 @param[in] tablename Contains new base name
1199 @return own: new full pathname */
1200 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1201 os_file_make_new_pathname(
1202 const char* old_path,
1203 const char* tablename)
1204 {
1205 ulint dir_len;
1206 char* last_slash;
1207 char* base_name;
1208 char* new_path;
1209 ulint new_path_len;
1210
1211 /* Split the tablename into its database and table name components.
1212 They are separated by a '/'. */
1213 last_slash = strrchr((char*) tablename, '/');
1214 base_name = last_slash ? last_slash + 1 : (char*) tablename;
1215
1216 /* Find the offset of the last slash. We will strip off the
1217 old basename.ibd which starts after that slash. */
1218 last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1219 dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path);
1220
1221 /* allocate a new path and move the old directory path to it. */
1222 new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1223 new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1224 memcpy(new_path, old_path, dir_len);
1225
1226 snprintf(new_path + dir_len, new_path_len - dir_len,
1227 "%c%s.ibd", OS_PATH_SEPARATOR, base_name);
1228
1229 return(new_path);
1230 }
1231
1232 /** This function reduces a null-terminated full remote path name into
1233 the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
1234 the 'databasename/tablename.ibd' found at the end of the path with just
1235 'tablename'.
1236
1237 Since the result is always smaller than the path sent in, no new memory
1238 is allocated. The caller should allocate memory for the path sent in.
1239 This function manipulates that path in place.
1240
1241 If the path format is not as expected, just return. The result is used
1242 to inform a SHOW CREATE TABLE command.
1243 @param[in,out] data_dir_path Full path/data_dir_path */
1244 void
os_file_make_data_dir_path(char * data_dir_path)1245 os_file_make_data_dir_path(
1246 char* data_dir_path)
1247 {
1248 /* Replace the period before the extension with a null byte. */
1249 char* ptr = strrchr((char*) data_dir_path, '.');
1250
1251 if (ptr == NULL) {
1252 return;
1253 }
1254
1255 ptr[0] = '\0';
1256
1257 /* The tablename starts after the last slash. */
1258 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1259
1260 if (ptr == NULL) {
1261 return;
1262 }
1263
1264 ptr[0] = '\0';
1265
1266 char* tablename = ptr + 1;
1267
1268 /* The databasename starts after the next to last slash. */
1269 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1270
1271 if (ptr == NULL) {
1272 return;
1273 }
1274
1275 ulint tablename_len = ut_strlen(tablename);
1276
1277 ut_memmove(++ptr, tablename, tablename_len);
1278
1279 ptr[tablename_len] = '\0';
1280 }
1281
1282 /** Check if the path refers to the root of a drive using a pointer
1283 to the last directory separator that the caller has fixed.
1284 @param[in] path path name
1285 @param[in] path last directory separator in the path
1286 @return true if this path is a drive root, false if not */
1287 UNIV_INLINE
1288 bool
os_file_is_root(const char * path,const char * last_slash)1289 os_file_is_root(
1290 const char* path,
1291 const char* last_slash)
1292 {
1293 return(
1294 #ifdef _WIN32
1295 (last_slash == path + 2 && path[1] == ':') ||
1296 #endif /* _WIN32 */
1297 last_slash == path);
1298 }
1299
1300 /** Return the parent directory component of a null-terminated path.
1301 Return a new buffer containing the string up to, but not including,
1302 the final component of the path.
1303 The path returned will not contain a trailing separator.
1304 Do not return a root path, return NULL instead.
1305 The final component trimmed off may be a filename or a directory name.
1306 If the final component is the only component of the path, return NULL.
1307 It is the caller's responsibility to free the returned string after it
1308 is no longer needed.
1309 @param[in] path Path name
1310 @return own: parent directory of the path */
1311 static
1312 char*
os_file_get_parent_dir(const char * path)1313 os_file_get_parent_dir(
1314 const char* path)
1315 {
1316 bool has_trailing_slash = false;
1317
1318 /* Find the offset of the last slash */
1319 const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
1320
1321 if (!last_slash) {
1322 /* No slash in the path, return NULL */
1323 return(NULL);
1324 }
1325
1326 /* Ok, there is a slash. Is there anything after it? */
1327 if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1328 has_trailing_slash = true;
1329 }
1330
1331 /* Reduce repetative slashes. */
1332 while (last_slash > path
1333 && last_slash[-1] == OS_PATH_SEPARATOR) {
1334 last_slash--;
1335 }
1336
1337 /* Check for the root of a drive. */
1338 if (os_file_is_root(path, last_slash)) {
1339 return(NULL);
1340 }
1341
1342 /* If a trailing slash prevented the first strrchr() from trimming
1343 the last component of the path, trim that component now. */
1344 if (has_trailing_slash) {
1345 /* Back up to the previous slash. */
1346 last_slash--;
1347 while (last_slash > path
1348 && last_slash[0] != OS_PATH_SEPARATOR) {
1349 last_slash--;
1350 }
1351
1352 /* Reduce repetative slashes. */
1353 while (last_slash > path
1354 && last_slash[-1] == OS_PATH_SEPARATOR) {
1355 last_slash--;
1356 }
1357 }
1358
1359 /* Check for the root of a drive. */
1360 if (os_file_is_root(path, last_slash)) {
1361 return(NULL);
1362 }
1363
1364 if (last_slash - path < 0) {
1365 /* Sanity check, it prevents gcc from trying to handle this case which
1366 * results in warnings for some optimized builds */
1367 return (NULL);
1368 }
1369
1370 /* Non-trivial directory component */
1371
1372 return(mem_strdupl(path, ulint(last_slash - path)));
1373 }
1374 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1375
1376 /* Test the function os_file_get_parent_dir. */
1377 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1378 test_os_file_get_parent_dir(
1379 const char* child_dir,
1380 const char* expected_dir)
1381 {
1382 char* child = mem_strdup(child_dir);
1383 char* expected = expected_dir == NULL ? NULL
1384 : mem_strdup(expected_dir);
1385
1386 /* os_file_get_parent_dir() assumes that separators are
1387 converted to OS_PATH_SEPARATOR. */
1388 os_normalize_path(child);
1389 os_normalize_path(expected);
1390
1391 char* parent = os_file_get_parent_dir(child);
1392
1393 bool unexpected = (expected == NULL
1394 ? (parent != NULL)
1395 : (0 != strcmp(parent, expected)));
1396 if (unexpected) {
1397 ib::fatal() << "os_file_get_parent_dir('" << child
1398 << "') returned '" << parent
1399 << "', instead of '" << expected << "'.";
1400 }
1401 ut_free(parent);
1402 ut_free(child);
1403 ut_free(expected);
1404 }
1405
1406 /* Test the function os_file_get_parent_dir. */
1407 void
unit_test_os_file_get_parent_dir()1408 unit_test_os_file_get_parent_dir()
1409 {
1410 test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1411 test_os_file_get_parent_dir("/usr/", NULL);
1412 test_os_file_get_parent_dir("//usr//", NULL);
1413 test_os_file_get_parent_dir("usr", NULL);
1414 test_os_file_get_parent_dir("usr//", NULL);
1415 test_os_file_get_parent_dir("/", NULL);
1416 test_os_file_get_parent_dir("//", NULL);
1417 test_os_file_get_parent_dir(".", NULL);
1418 test_os_file_get_parent_dir("..", NULL);
1419 # ifdef _WIN32
1420 test_os_file_get_parent_dir("D:", NULL);
1421 test_os_file_get_parent_dir("D:/", NULL);
1422 test_os_file_get_parent_dir("D:\\", NULL);
1423 test_os_file_get_parent_dir("D:/data", NULL);
1424 test_os_file_get_parent_dir("D:/data/", NULL);
1425 test_os_file_get_parent_dir("D:\\data\\", NULL);
1426 test_os_file_get_parent_dir("D:///data/////", NULL);
1427 test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
1428 test_os_file_get_parent_dir("D:/data//a", "D:/data");
1429 test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
1430 test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
1431 test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
1432 #endif /* _WIN32 */
1433 }
1434 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
1435
1436
1437 /** Creates all missing subdirectories along the given path.
1438 @param[in] path Path name
1439 @return DB_SUCCESS if OK, otherwise error code. */
1440 dberr_t
os_file_create_subdirs_if_needed(const char * path)1441 os_file_create_subdirs_if_needed(
1442 const char* path)
1443 {
1444 if (srv_read_only_mode) {
1445
1446 ib::error()
1447 << "read only mode set. Can't create "
1448 << "subdirectories '" << path << "'";
1449
1450 return(DB_READ_ONLY);
1451
1452 }
1453
1454 char* subdir = os_file_get_parent_dir(path);
1455
1456 if (subdir == NULL) {
1457 /* subdir is root or cwd, nothing to do */
1458 return(DB_SUCCESS);
1459 }
1460
1461 /* Test if subdir exists */
1462 os_file_type_t type;
1463 bool subdir_exists;
1464 bool success = os_file_status(subdir, &subdir_exists, &type);
1465
1466 if (success && !subdir_exists) {
1467
1468 /* Subdir does not exist, create it */
1469 dberr_t err = os_file_create_subdirs_if_needed(subdir);
1470
1471 if (err != DB_SUCCESS) {
1472
1473 ut_free(subdir);
1474
1475 return(err);
1476 }
1477
1478 success = os_file_create_directory(subdir, false);
1479 }
1480
1481 ut_free(subdir);
1482
1483 return(success ? DB_SUCCESS : DB_ERROR);
1484 }
1485
1486 #ifndef _WIN32
1487
1488 /** Do the read/write
1489 @param[in] request The IO context and type
1490 @return the number of bytes read/written or negative value on error */
1491 ssize_t
execute(const IORequest & request)1492 SyncFileIO::execute(const IORequest& request)
1493 {
1494 ssize_t n_bytes;
1495
1496 if (request.is_read()) {
1497 n_bytes = pread(m_fh, m_buf, m_n, m_offset);
1498 } else {
1499 ut_ad(request.is_write());
1500 n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
1501 }
1502
1503 return(n_bytes);
1504 }
1505 /** Free storage space associated with a section of the file.
1506 @param[in] fh Open file handle
1507 @param[in] off Starting offset (SEEK_SET)
1508 @param[in] len Size of the hole
1509 @return DB_SUCCESS or error code */
1510 static
1511 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)1512 os_file_punch_hole_posix(
1513 os_file_t fh,
1514 os_offset_t off,
1515 os_offset_t len)
1516 {
1517
1518 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
1519 const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
1520
1521 int ret = fallocate(fh, mode, off, len);
1522
1523 if (ret == 0) {
1524 return(DB_SUCCESS);
1525 }
1526
1527 if (errno == ENOTSUP) {
1528 return(DB_IO_NO_PUNCH_HOLE);
1529 }
1530
1531 ib::warn()
1532 << "fallocate("
1533 <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
1534 << off << ", " << len << ") returned errno: "
1535 << errno;
1536
1537 return(DB_IO_ERROR);
1538
1539 #elif defined(UNIV_SOLARIS)
1540
1541 // Use F_FREESP
1542
1543 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
1544
1545 return(DB_IO_NO_PUNCH_HOLE);
1546 }
1547
1548 #if defined(LINUX_NATIVE_AIO)
1549
1550 /** Linux native AIO handler */
1551 class LinuxAIOHandler {
1552 public:
1553 /**
1554 @param[in] global_segment The global segment*/
LinuxAIOHandler(ulint global_segment)1555 LinuxAIOHandler(ulint global_segment)
1556 :
1557 m_global_segment(global_segment)
1558 {
1559 /* Should never be doing Sync IO here. */
1560 ut_a(m_global_segment != ULINT_UNDEFINED);
1561
1562 /* Find the array and the local segment. */
1563
1564 m_segment = AIO::get_array_and_local_segment(
1565 &m_array, m_global_segment);
1566
1567 m_n_slots = m_array->slots_per_segment();
1568 }
1569
1570 /** Destructor */
~LinuxAIOHandler()1571 ~LinuxAIOHandler()
1572 {
1573 // No op
1574 }
1575
1576 /**
1577 Process a Linux AIO request
1578 @param[out] m1 the messages passed with the
1579 @param[out] m2 AIO request; note that in case the
1580 AIO operation failed, these output
1581 parameters are valid and can be used to
1582 restart the operation.
1583 @param[out] request IO context
1584 @return DB_SUCCESS or error code */
1585 dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
1586
1587 private:
1588 /** Resubmit an IO request that was only partially successful
1589 @param[in,out] slot Request to resubmit
1590 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
1591 dberr_t resubmit(Slot* slot);
1592
1593 /** Check if the AIO succeeded
1594 @param[in,out] slot The slot to check
1595 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
1596 DB_IO_ERROR on all other errors */
1597 dberr_t check_state(Slot* slot);
1598
1599 /** @return true if a shutdown was detected */
is_shutdown() const1600 bool is_shutdown() const
1601 {
1602 return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
1603 && !buf_page_cleaner_is_active);
1604 }
1605
1606 /** If no slot was found then the m_array->m_mutex will be released.
1607 @param[out] n_pending The number of pending IOs
1608 @return NULL or a slot that has completed IO */
1609 Slot* find_completed_slot(ulint* n_pending);
1610
1611 /** This is called from within the IO-thread. If there are no completed
1612 IO requests in the slot array, the thread calls this function to
1613 collect more requests from the Linux kernel.
1614 The IO-thread waits on io_getevents(), which is a blocking call, with
1615 a timeout value. Unless the system is very heavy loaded, keeping the
1616 IO-thread very busy, the io-thread will spend most of its time waiting
1617 in this function.
1618 The IO-thread also exits in this function. It checks server status at
1619 each wakeup and that is why we use timed wait in io_getevents(). */
1620 void collect();
1621
1622 private:
1623 /** Slot array */
1624 AIO* m_array;
1625
1626 /** Number of slots inthe local segment */
1627 ulint m_n_slots;
1628
1629 /** The local segment to check */
1630 ulint m_segment;
1631
1632 /** The global segment */
1633 ulint m_global_segment;
1634 };
1635
1636 /** Resubmit an IO request that was only partially successful
1637 @param[in,out] slot Request to resubmit
1638 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
1639 dberr_t
resubmit(Slot * slot)1640 LinuxAIOHandler::resubmit(Slot* slot)
1641 {
1642 #ifdef UNIV_DEBUG
1643 /* Bytes already read/written out */
1644 ulint n_bytes = slot->ptr - slot->buf;
1645
1646 ut_ad(m_array->is_mutex_owned());
1647
1648 ut_ad(n_bytes < slot->original_len);
1649 ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
1650 /* Partial read or write scenario */
1651 ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
1652 #endif /* UNIV_DEBUG */
1653
1654 slot->len -= slot->n_bytes;
1655 slot->ptr += slot->n_bytes;
1656 slot->offset += slot->n_bytes;
1657
1658 /* Resetting the bytes read/written */
1659 slot->n_bytes = 0;
1660 slot->io_already_done = false;
1661
1662 compile_time_assert(sizeof(off_t) >= sizeof(os_offset_t));
1663
1664 struct iocb* iocb = &slot->control;
1665
1666 if (slot->type.is_read()) {
1667
1668 io_prep_pread(
1669 iocb,
1670 slot->file,
1671 slot->ptr,
1672 slot->len,
1673 slot->offset);
1674 } else {
1675
1676 ut_a(slot->type.is_write());
1677
1678 io_prep_pwrite(
1679 iocb,
1680 slot->file,
1681 slot->ptr,
1682 slot->len,
1683 slot->offset);
1684 }
1685
1686 iocb->data = slot;
1687
1688 ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
1689 == 0);
1690
1691 /* Resubmit an I/O request */
1692 int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
1693 ut_a(ret != -EINVAL);
1694
1695 if (ret < 0) {
1696 errno = -ret;
1697 }
1698
1699 return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
1700 }
1701
1702 /** Check if the AIO succeeded
1703 @param[in,out] slot The slot to check
1704 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
1705 DB_IO_ERROR on all other errors */
1706 dberr_t
check_state(Slot * slot)1707 LinuxAIOHandler::check_state(Slot* slot)
1708 {
1709 ut_ad(m_array->is_mutex_owned());
1710
1711 /* Note that it may be that there is more then one completed
1712 IO requests. We process them one at a time. We may have a case
1713 here to improve the performance slightly by dealing with all
1714 requests in one sweep. */
1715
1716 srv_set_io_thread_op_info(
1717 m_global_segment, "processing completed aio requests");
1718
1719 ut_ad(slot->io_already_done);
1720
1721 dberr_t err = DB_SUCCESS;
1722
1723 if (slot->ret == 0) {
1724
1725 err = AIOHandler::post_io_processing(slot);
1726
1727 } else {
1728 errno = -slot->ret;
1729
1730 /* os_file_handle_error does tell us if we should retry
1731 this IO. As it stands now, we don't do this retry when
1732 reaping requests from a different context than
1733 the dispatcher. This non-retry logic is the same for
1734 Windows and Linux native AIO.
1735 We should probably look into this to transparently
1736 re-submit the IO. */
1737 os_file_handle_error(slot->name, "Linux aio");
1738
1739 err = DB_IO_ERROR;
1740 }
1741
1742 return(err);
1743 }
1744
1745 /** If no slot was found then the m_array->m_mutex will be released.
1746 @param[out] n_pending The number of pending IOs
1747 @return NULL or a slot that has completed IO */
1748 Slot*
find_completed_slot(ulint * n_pending)1749 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
1750 {
1751 ulint offset = m_n_slots * m_segment;
1752
1753 *n_pending = 0;
1754
1755 m_array->acquire();
1756
1757 Slot* slot = m_array->at(offset);
1758
1759 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
1760
1761 if (slot->is_reserved) {
1762
1763 ++*n_pending;
1764
1765 if (slot->io_already_done) {
1766
1767 /* Something for us to work on.
1768 Note: We don't release the mutex. */
1769 return(slot);
1770 }
1771 }
1772 }
1773
1774 m_array->release();
1775
1776 return(NULL);
1777 }
1778
1779 /** This function is only used in Linux native asynchronous i/o. This is
1780 called from within the io-thread. If there are no completed IO requests
1781 in the slot array, the thread calls this function to collect more
1782 requests from the kernel.
1783 The io-thread waits on io_getevents(), which is a blocking call, with
1784 a timeout value. Unless the system is very heavy loaded, keeping the
1785 io-thread very busy, the io-thread will spend most of its time waiting
1786 in this function.
1787 The io-thread also exits in this function. It checks server status at
1788 each wakeup and that is why we use timed wait in io_getevents(). */
1789 void
collect()1790 LinuxAIOHandler::collect()
1791 {
1792 ut_ad(m_n_slots > 0);
1793 ut_ad(m_array != NULL);
1794 ut_ad(m_segment < m_array->get_n_segments());
1795
1796 /* Which io_context_t we are going to use. */
1797 io_context_t io_ctx = m_array->io_ctx(m_segment);
1798
1799 /* Starting point of the m_segment we will be working on. */
1800 ulint start_pos = m_segment * m_n_slots;
1801
1802 /* End point. */
1803 ulint end_pos = start_pos + m_n_slots;
1804
1805 for (;;) {
1806 struct io_event* events;
1807
1808 /* Which part of event array we are going to work on. */
1809 events = m_array->io_events(m_segment * m_n_slots);
1810
1811 /* Initialize the events. */
1812 memset(events, 0, sizeof(*events) * m_n_slots);
1813
1814 /* The timeout value is arbitrary. We probably need
1815 to experiment with it a little. */
1816 struct timespec timeout;
1817
1818 timeout.tv_sec = 0;
1819 timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
1820
1821 int ret;
1822
1823 ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
1824 ut_a(ret != -EINVAL);
1825 ut_ad(ret != -EFAULT);
1826
1827 for (int i = 0; i < ret; ++i) {
1828
1829 struct iocb* iocb;
1830
1831 iocb = reinterpret_cast<struct iocb*>(events[i].obj);
1832 ut_a(iocb != NULL);
1833
1834 Slot* slot = reinterpret_cast<Slot*>(iocb->data);
1835
1836 /* Some sanity checks. */
1837 ut_a(slot != NULL);
1838 ut_a(slot->is_reserved);
1839
1840 /* We are not scribbling previous segment. */
1841 ut_a(slot->pos >= start_pos);
1842
1843 /* We have not overstepped to next segment. */
1844 ut_a(slot->pos < end_pos);
1845
1846 /* Deallocate unused blocks from file system.
1847 This is newer done to page 0 or to log files.*/
1848 if (slot->offset > 0
1849 && !slot->type.is_log()
1850 && slot->type.is_write()
1851 && slot->type.punch_hole()) {
1852
1853 slot->err = slot->type.punch_hole(
1854 slot->file,
1855 slot->offset, slot->len);
1856 } else {
1857 slot->err = DB_SUCCESS;
1858 }
1859
1860 /* Mark this request as completed. The error handling
1861 will be done in the calling function. */
1862 m_array->acquire();
1863
1864 /* events[i].res2 should always be ZERO */
1865 ut_ad(events[i].res2 == 0);
1866 slot->io_already_done = true;
1867
1868 /*Even though events[i].res is an unsigned number
1869 in libaio, it is used to return a negative value
1870 (negated errno value) to indicate error and a positive
1871 value to indicate number of bytes read or written. */
1872
1873 if (events[i].res > slot->len) {
1874 /* failure */
1875 slot->n_bytes = 0;
1876 slot->ret = events[i].res;
1877 } else {
1878 /* success */
1879 slot->n_bytes = events[i].res;
1880 slot->ret = 0;
1881 }
1882 m_array->release();
1883 }
1884
1885 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
1886 || !buf_page_cleaner_is_active
1887 || ret > 0) {
1888
1889 break;
1890 }
1891
1892 /* This error handling is for any error in collecting the
1893 IO requests. The errors, if any, for any particular IO
1894 request are simply passed on to the calling routine. */
1895
1896 switch (ret) {
1897 case -EAGAIN:
1898 /* Not enough resources! Try again. */
1899
1900 case -EINTR:
1901 /* Interrupted! The behaviour in case of an interrupt.
1902 If we have some completed IOs available then the
1903 return code will be the number of IOs. We get EINTR
1904 only if there are no completed IOs and we have been
1905 interrupted. */
1906
1907 case 0:
1908 /* No pending request! Go back and check again. */
1909
1910 continue;
1911 }
1912
1913 /* All other errors should cause a trap for now. */
1914 ib::fatal()
1915 << "Unexpected ret_code[" << ret
1916 << "] from io_getevents()!";
1917
1918 break;
1919 }
1920 }
1921
1922 /** Process a Linux AIO request
1923 @param[out] m1 the messages passed with the
1924 @param[out] m2 AIO request; note that in case the
1925 AIO operation failed, these output
1926 parameters are valid and can be used to
1927 restart the operation.
1928 @param[out] request IO context
1929 @return DB_SUCCESS or error code */
1930 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)1931 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
1932 {
1933 dberr_t err = DB_SUCCESS;
1934 Slot* slot;
1935
1936 /* Loop until we have found a completed request. */
1937 for (;;) {
1938
1939 ulint n_pending;
1940
1941 slot = find_completed_slot(&n_pending);
1942
1943 if (slot != NULL) {
1944
1945 ut_ad(m_array->is_mutex_owned());
1946
1947 err = check_state(slot);
1948
1949 /* DB_FAIL is not a hard error, we should retry */
1950 if (err != DB_FAIL) {
1951 break;
1952 }
1953
1954 /* Partial IO, resubmit request for
1955 remaining bytes to read/write */
1956 err = resubmit(slot);
1957
1958 if (err != DB_SUCCESS) {
1959 break;
1960 }
1961
1962 m_array->release();
1963
1964 } else if (is_shutdown() && n_pending == 0) {
1965
1966 /* There is no completed request. If there is
1967 no pending request at all, and the system is
1968 being shut down, exit. */
1969
1970 *m1 = NULL;
1971 *m2 = NULL;
1972
1973 return(DB_SUCCESS);
1974
1975 } else {
1976
1977 /* Wait for some request. Note that we return
1978 from wait if we have found a request. */
1979
1980 srv_set_io_thread_op_info(
1981 m_global_segment,
1982 "waiting for completed aio requests");
1983
1984 collect();
1985 }
1986 }
1987
1988 if (err == DB_IO_PARTIAL_FAILED) {
1989 /* Aborting in case of submit failure */
1990 ib::fatal()
1991 << "Native Linux AIO interface. "
1992 "io_submit() call failed when "
1993 "resubmitting a partial I/O "
1994 "request on the file " << slot->name
1995 << ".";
1996 }
1997
1998 *m1 = slot->m1;
1999 *m2 = slot->m2;
2000
2001 *request = slot->type;
2002
2003 m_array->release(slot);
2004
2005 m_array->release();
2006
2007 return(err);
2008 }
2009
2010 /** This function is only used in Linux native asynchronous i/o.
2011 Waits for an aio operation to complete. This function is used to wait for
2012 the completed requests. The aio array of pending requests is divided
2013 into segments. The thread specifies which segment or slot it wants to wait
2014 for. NOTE: this function will also take care of freeing the aio slot,
2015 therefore no other thread is allowed to do the freeing!
2016
2017 @param[in] global_seg segment number in the aio array
2018 to wait for; segment 0 is the ibuf
2019 i/o thread, segment 1 is log i/o thread,
2020 then follow the non-ibuf read threads,
2021 and the last are the non-ibuf write
2022 threads.
2023 @param[out] m1 the messages passed with the
2024 @param[out] m2 AIO request; note that in case the
2025 AIO operation failed, these output
2026 parameters are valid and can be used to
2027 restart the operation.
2028 @param[out]xi request IO context
2029 @return DB_SUCCESS if the IO was successful */
2030 static
2031 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2032 os_aio_linux_handler(
2033 ulint global_segment,
2034 fil_node_t** m1,
2035 void** m2,
2036 IORequest* request)
2037 {
2038 return LinuxAIOHandler(global_segment).poll(m1, m2, request);
2039 }
2040
2041 /** Dispatch an AIO request to the kernel.
2042 @param[in,out] slot an already reserved slot
2043 @return true on success. */
2044 bool
linux_dispatch(Slot * slot)2045 AIO::linux_dispatch(Slot* slot)
2046 {
2047 ut_a(slot->is_reserved);
2048 ut_ad(slot->type.validate());
2049
2050 /* Find out what we are going to work with.
2051 The iocb struct is directly in the slot.
2052 The io_context_t is one per segment. */
2053
2054 ulint io_ctx_index;
2055 struct iocb* iocb = &slot->control;
2056
2057 io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2058
2059 ut_a(reinterpret_cast<size_t>(iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
2060 == 0);
2061
2062 int ret = io_submit(io_ctx(io_ctx_index), 1, &iocb);
2063 ut_a(ret != -EINVAL);
2064
2065 /* io_submit() returns number of successfully queued requests
2066 or -errno. */
2067
2068 if (ret != 1) {
2069 errno = -ret;
2070 }
2071
2072 return(ret == 1);
2073 }
2074
2075 /** Creates an io_context_t for native linux AIO.
2076 @param[in] max_events number of events
2077 @param[out] io_ctx io_ctx to initialize.
2078 @return true on success. */
2079 bool
linux_create_io_ctx(unsigned max_events,io_context_t & io_ctx)2080 AIO::linux_create_io_ctx(
2081 unsigned max_events,
2082 io_context_t& io_ctx)
2083 {
2084 ssize_t n_retries = 0;
2085
2086 for (;;) {
2087
2088 memset(&io_ctx, 0x0, sizeof(io_ctx));
2089
2090 /* Initialize the io_ctx. Tell it how many pending
2091 IO requests this context will handle. */
2092
2093 int ret = io_setup(max_events, &io_ctx);
2094 ut_a(ret != -EINVAL);
2095
2096 if (ret == 0) {
2097 /* Success. Return now. */
2098 return(true);
2099 }
2100
2101 /* If we hit EAGAIN we'll make a few attempts before failing. */
2102
2103 switch (ret) {
2104 case -EAGAIN:
2105 if (n_retries == 0) {
2106 /* First time around. */
2107 ib::warn()
2108 << "io_setup() failed with EAGAIN."
2109 " Will make "
2110 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2111 << " attempts before giving up.";
2112 }
2113
2114 if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2115
2116 ++n_retries;
2117
2118 ib::warn()
2119 << "io_setup() attempt "
2120 << n_retries << ".";
2121
2122 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2123
2124 continue;
2125 }
2126
2127 /* Have tried enough. Better call it a day. */
2128 ib::warn()
2129 << "io_setup() failed with EAGAIN after "
2130 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2131 << " attempts.";
2132 break;
2133
2134 case -ENOSYS:
2135 ib::warn()
2136 << "Linux Native AIO interface"
2137 " is not supported on this platform. Please"
2138 " check your OS documentation and install"
2139 " appropriate binary of InnoDB.";
2140
2141 break;
2142
2143 default:
2144 ib::warn()
2145 << "Linux Native AIO setup"
2146 << " returned following error["
2147 << ret << "]";
2148 break;
2149 }
2150
2151 ib::info()
2152 << "You can disable Linux Native AIO by"
2153 " setting innodb_use_native_aio = 0 in my.cnf";
2154
2155 break;
2156 }
2157
2158 return(false);
2159 }
2160
2161 /** Checks if the system supports native linux aio. On some kernel
2162 versions where native aio is supported it won't work on tmpfs. In such
2163 cases we can't use native aio as it is not possible to mix simulated
2164 and native aio.
2165 @return: true if supported, false otherwise. */
2166 bool
is_linux_native_aio_supported()2167 AIO::is_linux_native_aio_supported()
2168 {
2169 int fd;
2170 io_context_t io_ctx;
2171 char name[1000];
2172
2173 if (!linux_create_io_ctx(1, io_ctx)) {
2174
2175 /* The platform does not support native aio. */
2176
2177 return(false);
2178
2179 } else if (!srv_read_only_mode) {
2180
2181 /* Now check if tmpdir supports native aio ops. */
2182 fd = innobase_mysql_tmpfile(NULL);
2183
2184 if (fd < 0) {
2185 ib::warn()
2186 << "Unable to create temp file to check"
2187 " native AIO support.";
2188
2189 int ret = io_destroy(io_ctx);
2190 ut_a(ret != -EINVAL);
2191 ut_ad(ret != -EFAULT);
2192
2193 return(false);
2194 }
2195 } else {
2196
2197 os_normalize_path(srv_log_group_home_dir);
2198
2199 ulint dirnamelen = strlen(srv_log_group_home_dir);
2200
2201 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2202
2203 memcpy(name, srv_log_group_home_dir, dirnamelen);
2204
2205 /* Add a path separator if needed. */
2206 if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2207
2208 name[dirnamelen++] = OS_PATH_SEPARATOR;
2209 }
2210
2211 strcpy(name + dirnamelen, "ib_logfile0");
2212
2213 fd = open(name, O_RDONLY | O_CLOEXEC);
2214
2215 if (fd == -1) {
2216
2217 ib::warn()
2218 << "Unable to open"
2219 << " \"" << name << "\" to check native"
2220 << " AIO read support.";
2221
2222 int ret = io_destroy(io_ctx);
2223 ut_a(ret != EINVAL);
2224 ut_ad(ret != EFAULT);
2225
2226 return(false);
2227 }
2228 }
2229
2230 struct io_event io_event;
2231
2232 memset(&io_event, 0x0, sizeof(io_event));
2233
2234 byte* buf = static_cast<byte*>(ut_malloc_nokey(srv_page_size * 2));
2235 byte* ptr = static_cast<byte*>(ut_align(buf, srv_page_size));
2236
2237 struct iocb iocb;
2238
2239 /* Suppress valgrind warning. */
2240 memset(buf, 0x00, srv_page_size * 2);
2241 memset(&iocb, 0x0, sizeof(iocb));
2242
2243 struct iocb* p_iocb = &iocb;
2244
2245 if (!srv_read_only_mode) {
2246
2247 io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
2248
2249 } else {
2250 ut_a(srv_page_size >= 4096);
2251 io_prep_pread(p_iocb, fd, ptr, srv_page_size, 0);
2252 }
2253
2254 ut_a(reinterpret_cast<size_t>(p_iocb->u.c.buf) % OS_FILE_LOG_BLOCK_SIZE
2255 == 0);
2256 int err = io_submit(io_ctx, 1, &p_iocb);
2257 ut_a(err != -EINVAL);
2258
2259 if (err >= 1) {
2260 /* Now collect the submitted IO request. */
2261 err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2262 ut_a(err != -EINVAL);
2263 }
2264
2265 ut_free(buf);
2266 close(fd);
2267
2268 switch (err) {
2269 case 1:
2270 {
2271 int ret = io_destroy(io_ctx);
2272 ut_a(ret != -EINVAL);
2273 ut_ad(ret != -EFAULT);
2274
2275 return(true);
2276 }
2277
2278 case -EINVAL:
2279 case -ENOSYS:
2280 ib::error()
2281 << "Linux Native AIO not supported. You can either"
2282 " move "
2283 << (srv_read_only_mode ? name : "tmpdir")
2284 << " to a file system that supports native"
2285 " AIO or you can set innodb_use_native_aio to"
2286 " FALSE to avoid this message.";
2287
2288 /* fall through. */
2289 default:
2290 ib::error()
2291 << "Linux Native AIO check on "
2292 << (srv_read_only_mode ? name : "tmpdir")
2293 << "returned error[" << -err << "]";
2294 }
2295
2296 int ret = io_destroy(io_ctx);
2297 ut_a(ret != -EINVAL);
2298 ut_ad(ret != -EFAULT);
2299
2300 return(false);
2301 }
2302
2303 #endif /* LINUX_NATIVE_AIO */
2304
2305 /** Retrieves the last error number if an error occurs in a file io function.
2306 The number should be retrieved before any other OS calls (because they may
2307 overwrite the error number). If the number is not known to this program,
2308 the OS error number + OS_FILE_ERROR_MAX is returned.
2309 @param[in] report_all_errors true if we want an error message
2310 printed of all errors
2311 @param[in] on_error_silent true then don't print any diagnostic
2312 to the log
2313 @return error number, or OS error number + OS_FILE_ERROR_MAX */
2314 static
2315 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2316 os_file_get_last_error_low(
2317 bool report_all_errors,
2318 bool on_error_silent)
2319 {
2320 int err = errno;
2321
2322 if (err == 0) {
2323 return(0);
2324 }
2325
2326 if (report_all_errors
2327 || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
2328
2329 ib::error()
2330 << "Operating system error number "
2331 << err
2332 << " in a file operation.";
2333
2334 if (err == ENOENT) {
2335
2336 ib::error()
2337 << "The error means the system"
2338 " cannot find the path specified.";
2339
2340 if (srv_is_being_started) {
2341
2342 ib::error()
2343 << "If you are installing InnoDB,"
2344 " remember that you must create"
2345 " directories yourself, InnoDB"
2346 " does not create them.";
2347 }
2348 } else if (err == EACCES) {
2349
2350 ib::error()
2351 << "The error means mysqld does not have"
2352 " the access rights to the directory.";
2353
2354 } else {
2355 if (strerror(err) != NULL) {
2356
2357 ib::error()
2358 << "Error number " << err << " means '"
2359 << strerror(err) << "'";
2360 }
2361
2362 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
2363 }
2364 }
2365
2366 switch (err) {
2367 case ENOSPC:
2368 return(OS_FILE_DISK_FULL);
2369 case ENOENT:
2370 return(OS_FILE_NOT_FOUND);
2371 case EEXIST:
2372 return(OS_FILE_ALREADY_EXISTS);
2373 case EXDEV:
2374 case ENOTDIR:
2375 case EISDIR:
2376 return(OS_FILE_PATH_ERROR);
2377 case EAGAIN:
2378 if (srv_use_native_aio) {
2379 return(OS_FILE_AIO_RESOURCES_RESERVED);
2380 }
2381 break;
2382 case EINTR:
2383 if (srv_use_native_aio) {
2384 return(OS_FILE_AIO_INTERRUPTED);
2385 }
2386 break;
2387 case EACCES:
2388 return(OS_FILE_ACCESS_VIOLATION);
2389 }
2390 return(OS_FILE_ERROR_MAX + err);
2391 }
2392
2393 /** Wrapper to fsync(2) that retries the call on some errors.
2394 Returns the value 0 if successful; otherwise the value -1 is returned and
2395 the global variable errno is set to indicate the error.
2396 @param[in] file open file handle
2397 @return 0 if success, -1 otherwise */
2398 static
2399 int
os_file_fsync_posix(os_file_t file)2400 os_file_fsync_posix(
2401 os_file_t file)
2402 {
2403 ulint failures = 0;
2404
2405 for (;;) {
2406
2407 ++os_n_fsyncs;
2408
2409 int ret = fsync(file);
2410
2411 if (ret == 0) {
2412 return(ret);
2413 }
2414
2415 switch(errno) {
2416 case ENOLCK:
2417
2418 ++failures;
2419 ut_a(failures < 1000);
2420
2421 if (!(failures % 100)) {
2422
2423 ib::warn()
2424 << "fsync(): "
2425 << "No locks available; retrying";
2426 }
2427
2428 /* 0.2 sec */
2429 os_thread_sleep(200000);
2430 break;
2431
2432 case EINTR:
2433
2434 ++failures;
2435 ut_a(failures < 2000);
2436 break;
2437
2438 default:
2439 ib::fatal() << "fsync() returned " << errno;
2440 }
2441 }
2442 }
2443
2444 /** Check the existence and type of the given file.
2445 @param[in] path path name of file
2446 @param[out] exists true if the file exists
2447 @param[out] type Type of the file, if it exists
2448 @return true if call succeeded */
2449 static
2450 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)2451 os_file_status_posix(
2452 const char* path,
2453 bool* exists,
2454 os_file_type_t* type)
2455 {
2456 struct stat statinfo;
2457
2458 int ret = stat(path, &statinfo);
2459
2460 *exists = !ret;
2461
2462 if (!ret) {
2463 /* file exists, everything OK */
2464
2465 } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
2466 /* file does not exist */
2467 return(true);
2468
2469 } else {
2470 /* file exists, but stat call failed */
2471 os_file_handle_error_no_exit(path, "stat", false);
2472 return(false);
2473 }
2474
2475 if (S_ISDIR(statinfo.st_mode)) {
2476 *type = OS_FILE_TYPE_DIR;
2477
2478 } else if (S_ISLNK(statinfo.st_mode)) {
2479 *type = OS_FILE_TYPE_LINK;
2480
2481 } else if (S_ISREG(statinfo.st_mode)) {
2482 *type = OS_FILE_TYPE_FILE;
2483 } else {
2484 *type = OS_FILE_TYPE_UNKNOWN;
2485 }
2486
2487 return(true);
2488 }
2489
2490 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
2491 function!
2492 Flushes the write buffers of a given file to the disk.
2493 @param[in] file handle to a file
2494 @return true if success */
2495 bool
os_file_flush_func(os_file_t file)2496 os_file_flush_func(
2497 os_file_t file)
2498 {
2499 int ret;
2500
2501 WAIT_ALLOW_WRITES();
2502 ret = os_file_fsync_posix(file);
2503
2504 if (ret == 0) {
2505 return(true);
2506 }
2507
2508 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2509 we choose to ignore that error if we are using raw disks */
2510
2511 if (srv_start_raw_disk_in_use && errno == EINVAL) {
2512
2513 return(true);
2514 }
2515
2516 ib::error() << "The OS said file flush did not succeed";
2517
2518 os_file_handle_error(NULL, "flush");
2519
2520 /* It is a fatal error if a file flush does not succeed, because then
2521 the database can get corrupt on disk */
2522 ut_error;
2523
2524 return(false);
2525 }
2526
2527 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
2528 this function!
2529 A simple function to open or create a file.
2530 @param[in] name name of the file or path as a null-terminated
2531 string
2532 @param[in] create_mode create mode
2533 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
2534 @param[in] read_only if true, read only checks are enforced
2535 @param[out] success true if succeed, false if error
2536 @return handle to the file, not defined if error, error number
2537 can be retrieved with os_file_get_last_error */
2538 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)2539 os_file_create_simple_func(
2540 const char* name,
2541 ulint create_mode,
2542 ulint access_type,
2543 bool read_only,
2544 bool* success)
2545 {
2546 pfs_os_file_t file;
2547
2548 *success = false;
2549
2550 int create_flag;
2551 const char* mode_str = NULL;
2552
2553 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
2554 WAIT_ALLOW_WRITES();
2555 }
2556
2557 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
2558 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
2559
2560 if (create_mode == OS_FILE_OPEN) {
2561 mode_str = "OPEN";
2562
2563 if (access_type == OS_FILE_READ_ONLY) {
2564
2565 create_flag = O_RDONLY;
2566
2567 } else if (read_only) {
2568
2569 create_flag = O_RDONLY;
2570
2571 } else {
2572 create_flag = O_RDWR;
2573 }
2574
2575 } else if (read_only) {
2576
2577 mode_str = "OPEN";
2578 create_flag = O_RDONLY;
2579
2580 } else if (create_mode == OS_FILE_CREATE) {
2581
2582 mode_str = "CREATE";
2583 create_flag = O_RDWR | O_CREAT | O_EXCL;
2584
2585 } else if (create_mode == OS_FILE_CREATE_PATH) {
2586
2587 mode_str = "CREATE PATH";
2588 /* Create subdirs along the path if needed. */
2589
2590 *success = os_file_create_subdirs_if_needed(name);
2591
2592 if (!*success) {
2593
2594 ib::error()
2595 << "Unable to create subdirectories '"
2596 << name << "'";
2597
2598 return(OS_FILE_CLOSED);
2599 }
2600
2601 create_flag = O_RDWR | O_CREAT | O_EXCL;
2602 create_mode = OS_FILE_CREATE;
2603 } else {
2604
2605 ib::error()
2606 << "Unknown file create mode ("
2607 << create_mode
2608 << " for file '" << name << "'";
2609
2610 return(OS_FILE_CLOSED);
2611 }
2612
2613 bool retry;
2614
2615 do {
2616 file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
2617
2618 if (file == -1) {
2619 *success = false;
2620 retry = os_file_handle_error(
2621 name,
2622 create_mode == OS_FILE_OPEN
2623 ? "open" : "create");
2624 } else {
2625 *success = true;
2626 retry = false;
2627 }
2628
2629 } while (retry);
2630
2631 /* This function is always called for data files, we should disable
2632 OS caching (O_DIRECT) here as we do in os_file_create_func(), so
2633 we open the same file in the same mode, see man page of open(2). */
2634 if (!srv_read_only_mode
2635 && *success
2636 && (srv_file_flush_method == SRV_O_DIRECT
2637 || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
2638
2639 os_file_set_nocache(file, name, mode_str);
2640 }
2641
2642 #ifdef USE_FILE_LOCK
2643 if (!read_only
2644 && *success
2645 && (access_type == OS_FILE_READ_WRITE)
2646 && os_file_lock(file, name)) {
2647
2648 *success = false;
2649 close(file);
2650 file = -1;
2651 }
2652 #endif /* USE_FILE_LOCK */
2653
2654 return(file);
2655 }
2656
2657 /** This function attempts to create a directory named pathname. The new
2658 directory gets default permissions. On Unix the permissions are
2659 (0770 & ~umask). If the directory exists already, nothing is done and
2660 the call succeeds, unless the fail_if_exists arguments is true.
2661 If another error occurs, such as a permission error, this does not crash,
2662 but reports the error and returns false.
2663 @param[in] pathname directory name as null-terminated string
2664 @param[in] fail_if_exists if true, pre-existing directory is treated as
2665 an error.
2666 @return true if call succeeds, false on error */
2667 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)2668 os_file_create_directory(
2669 const char* pathname,
2670 bool fail_if_exists)
2671 {
2672 int rcode;
2673
2674 WAIT_ALLOW_WRITES();
2675 rcode = mkdir(pathname, 0770);
2676
2677 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
2678 /* failure */
2679 os_file_handle_error_no_exit(pathname, "mkdir", false);
2680
2681 return(false);
2682 }
2683
2684 return(true);
2685 }
2686
2687 /** NOTE! Use the corresponding macro os_file_create(), not directly
2688 this function!
2689 Opens an existing file or creates a new.
2690 @param[in] name name of the file or path as a null-terminated
2691 string
2692 @param[in] create_mode create mode
2693 @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
2694 is desired, OS_FILE_NORMAL, if any normal file;
2695 NOTE that it also depends on type, os_aio_..
2696 and srv_.. variables whether we really use async
2697 I/O or unbuffered I/O: look in the function
2698 source code for the exact rules
2699 @param[in] type OS_DATA_FILE or OS_LOG_FILE
2700 @param[in] read_only true, if read only checks should be enforcedm
2701 @param[in] success true if succeeded
2702 @return handle to the file, not defined if error, error number
2703 can be retrieved with os_file_get_last_error */
2704 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)2705 os_file_create_func(
2706 const char* name,
2707 ulint create_mode,
2708 ulint purpose,
2709 ulint type,
2710 bool read_only,
2711 bool* success)
2712 {
2713 bool on_error_no_exit;
2714 bool on_error_silent;
2715
2716 *success = false;
2717
2718 DBUG_EXECUTE_IF(
2719 "ib_create_table_fail_disk_full",
2720 *success = false;
2721 errno = ENOSPC;
2722 return(OS_FILE_CLOSED);
2723 );
2724
2725 int create_flag;
2726 const char* mode_str = NULL;
2727
2728 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
2729 ? true : false;
2730 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
2731 ? true : false;
2732
2733 create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
2734 | OS_FILE_ON_ERROR_SILENT));
2735
2736 if (create_mode == OS_FILE_OPEN
2737 || create_mode == OS_FILE_OPEN_RAW
2738 || create_mode == OS_FILE_OPEN_RETRY) {
2739
2740 mode_str = "OPEN";
2741
2742 create_flag = read_only ? O_RDONLY : O_RDWR;
2743
2744 } else if (read_only) {
2745
2746 mode_str = "OPEN";
2747
2748 create_flag = O_RDONLY;
2749
2750 } else if (create_mode == OS_FILE_CREATE) {
2751
2752 mode_str = "CREATE";
2753 create_flag = O_RDWR | O_CREAT | O_EXCL;
2754
2755 } else if (create_mode == OS_FILE_OVERWRITE) {
2756
2757 mode_str = "OVERWRITE";
2758 create_flag = O_RDWR | O_CREAT | O_TRUNC;
2759
2760 } else {
2761 ib::error()
2762 << "Unknown file create mode (" << create_mode << ")"
2763 << " for file '" << name << "'";
2764
2765 return(OS_FILE_CLOSED);
2766 }
2767
2768 ut_a(type == OS_LOG_FILE
2769 || type == OS_DATA_FILE
2770 || type == OS_DATA_FILE_NO_O_DIRECT);
2771
2772 ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
2773
2774 #ifdef O_SYNC
2775 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
2776 O_SYNC because the datasync options seemed to corrupt files in 2001
2777 in both Linux and Solaris */
2778
2779 if (!read_only
2780 && type == OS_LOG_FILE
2781 && srv_file_flush_method == SRV_O_DSYNC) {
2782
2783 create_flag |= O_SYNC;
2784 }
2785 #endif /* O_SYNC */
2786
2787 os_file_t file;
2788 bool retry;
2789
2790 do {
2791 file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
2792
2793 if (file == -1) {
2794 const char* operation;
2795
2796 operation = (create_mode == OS_FILE_CREATE
2797 && !read_only) ? "create" : "open";
2798
2799 *success = false;
2800
2801 if (on_error_no_exit) {
2802 retry = os_file_handle_error_no_exit(
2803 name, operation, on_error_silent);
2804 } else {
2805 retry = os_file_handle_error(name, operation);
2806 }
2807 } else {
2808 *success = true;
2809 retry = false;
2810 }
2811
2812 } while (retry);
2813
2814 /* We disable OS caching (O_DIRECT) only on data files */
2815 if (!read_only
2816 && *success
2817 && (type != OS_LOG_FILE
2818 && type != OS_DATA_FILE_NO_O_DIRECT)
2819 && (srv_file_flush_method == SRV_O_DIRECT
2820 || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
2821
2822 os_file_set_nocache(file, name, mode_str);
2823 }
2824
2825 #ifdef USE_FILE_LOCK
2826 if (!read_only
2827 && *success
2828 && create_mode != OS_FILE_OPEN_RAW
2829 && os_file_lock(file, name)) {
2830
2831 if (create_mode == OS_FILE_OPEN_RETRY) {
2832
2833 ib::info()
2834 << "Retrying to lock the first data file";
2835
2836 for (int i = 0; i < 100; i++) {
2837 os_thread_sleep(1000000);
2838
2839 if (!os_file_lock(file, name)) {
2840 *success = true;
2841 return(file);
2842 }
2843 }
2844
2845 ib::info()
2846 << "Unable to open the first data file";
2847 }
2848
2849 *success = false;
2850 close(file);
2851 file = -1;
2852 }
2853 #endif /* USE_FILE_LOCK */
2854
2855 return(file);
2856 }
2857
2858 /** NOTE! Use the corresponding macro
2859 os_file_create_simple_no_error_handling(), not directly this function!
2860 A simple function to open or create a file.
2861 @param[in] name name of the file or path as a null-terminated
2862 string
2863 @param[in] create_mode create mode
2864 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
2865 OS_FILE_READ_ALLOW_DELETE; the last option
2866 is used by a backup program reading the file
2867 @param[in] read_only if true read only mode checks are enforced
2868 @param[out] success true if succeeded
2869 @return own: handle to the file, not defined if error, error number
2870 can be retrieved with os_file_get_last_error */
2871 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)2872 os_file_create_simple_no_error_handling_func(
2873 const char* name,
2874 ulint create_mode,
2875 ulint access_type,
2876 bool read_only,
2877 bool* success)
2878 {
2879 os_file_t file;
2880 int create_flag;
2881
2882 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
2883 WAIT_ALLOW_WRITES();
2884 }
2885
2886 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
2887 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
2888
2889 *success = false;
2890
2891 if (create_mode == OS_FILE_OPEN) {
2892
2893 if (access_type == OS_FILE_READ_ONLY) {
2894
2895 create_flag = O_RDONLY;
2896
2897 } else if (read_only) {
2898
2899 create_flag = O_RDONLY;
2900
2901 } else {
2902
2903 ut_a(access_type == OS_FILE_READ_WRITE
2904 || access_type == OS_FILE_READ_ALLOW_DELETE);
2905
2906 create_flag = O_RDWR;
2907 }
2908
2909 } else if (read_only) {
2910
2911 create_flag = O_RDONLY;
2912
2913 } else if (create_mode == OS_FILE_CREATE) {
2914
2915 create_flag = O_RDWR | O_CREAT | O_EXCL;
2916
2917 } else {
2918
2919 ib::error()
2920 << "Unknown file create mode "
2921 << create_mode << " for file '" << name << "'";
2922
2923 return(OS_FILE_CLOSED);
2924 }
2925
2926 file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
2927
2928 *success = (file != -1);
2929
2930 #ifdef USE_FILE_LOCK
2931 if (!read_only
2932 && *success
2933 && access_type == OS_FILE_READ_WRITE
2934 && os_file_lock(file, name)) {
2935
2936 *success = false;
2937 close(file);
2938 file = -1;
2939
2940 }
2941 #endif /* USE_FILE_LOCK */
2942
2943 return(file);
2944 }
2945
2946 /** Deletes a file if it exists. The file has to be closed before calling this.
2947 @param[in] name file path as a null-terminated string
2948 @param[out] exist indicate if file pre-exist
2949 @return true if success */
2950 bool
os_file_delete_if_exists_func(const char * name,bool * exist)2951 os_file_delete_if_exists_func(
2952 const char* name,
2953 bool* exist)
2954 {
2955 if (exist != NULL) {
2956 *exist = true;
2957 }
2958
2959 int ret;
2960 WAIT_ALLOW_WRITES();
2961
2962 ret = unlink(name);
2963
2964 if (ret != 0 && errno == ENOENT) {
2965 if (exist != NULL) {
2966 *exist = false;
2967 }
2968 } else if (ret != 0 && errno != ENOENT) {
2969 os_file_handle_error_no_exit(name, "delete", false);
2970
2971 return(false);
2972 }
2973
2974 return(true);
2975 }
2976
2977 /** Deletes a file. The file has to be closed before calling this.
2978 @param[in] name file path as a null-terminated string
2979 @return true if success */
2980 bool
os_file_delete_func(const char * name)2981 os_file_delete_func(
2982 const char* name)
2983 {
2984 int ret;
2985 WAIT_ALLOW_WRITES();
2986
2987 ret = unlink(name);
2988
2989 if (ret != 0) {
2990 os_file_handle_error_no_exit(name, "delete", FALSE);
2991
2992 return(false);
2993 }
2994
2995 return(true);
2996 }
2997
2998 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
2999 function!
3000 Renames a file (can also move it to another directory). It is safest that the
3001 file is closed before calling this function.
3002 @param[in] oldpath old file path as a null-terminated string
3003 @param[in] newpath new file path
3004 @return true if success */
3005 bool
os_file_rename_func(const char * oldpath,const char * newpath)3006 os_file_rename_func(
3007 const char* oldpath,
3008 const char* newpath)
3009 {
3010 #ifdef UNIV_DEBUG
3011 os_file_type_t type;
3012 bool exists;
3013
3014 /* New path must not exist. */
3015 ut_ad(os_file_status(newpath, &exists, &type));
3016 ut_ad(!exists);
3017
3018 /* Old path must exist. */
3019 ut_ad(os_file_status(oldpath, &exists, &type));
3020 ut_ad(exists);
3021 #endif /* UNIV_DEBUG */
3022
3023 int ret;
3024 WAIT_ALLOW_WRITES();
3025
3026 ret = rename(oldpath, newpath);
3027
3028 if (ret != 0) {
3029 os_file_handle_rename_error(oldpath, newpath);
3030
3031 return(false);
3032 }
3033
3034 return(true);
3035 }
3036
3037 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3038 function!
3039 Closes a file handle. In case of error, error number can be retrieved with
3040 os_file_get_last_error.
3041 @param[in] file Handle to close
3042 @return true if success */
3043 bool
os_file_close_func(os_file_t file)3044 os_file_close_func(
3045 os_file_t file)
3046 {
3047 int ret = close(file);
3048
3049 if (ret == -1) {
3050 os_file_handle_error(NULL, "close");
3051
3052 return(false);
3053 }
3054
3055 return(true);
3056 }
3057
3058 /** Gets a file size.
3059 @param[in] file handle to an open file
3060 @return file size, or (os_offset_t) -1 on failure */
3061 os_offset_t
os_file_get_size(os_file_t file)3062 os_file_get_size(os_file_t file)
3063 {
3064 struct stat statbuf;
3065 return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size;
3066 }
3067
3068 /** Gets a file size.
3069 @param[in] filename Full path to the filename to check
3070 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3071 errno */
3072 os_file_size_t
os_file_get_size(const char * filename)3073 os_file_get_size(
3074 const char* filename)
3075 {
3076 struct stat s;
3077 os_file_size_t file_size;
3078
3079 int ret = stat(filename, &s);
3080
3081 if (ret == 0) {
3082 file_size.m_total_size = s.st_size;
3083 /* st_blocks is in 512 byte sized blocks */
3084 file_size.m_alloc_size = s.st_blocks * 512;
3085 } else {
3086 file_size.m_total_size = ~0U;
3087 file_size.m_alloc_size = (os_offset_t) errno;
3088 }
3089
3090 return(file_size);
3091 }
3092
3093 /** This function returns information about the specified file
3094 @param[in] path pathname of the file
3095 @param[out] stat_info information of a file in a directory
3096 @param[in,out] statinfo information of a file in a directory
3097 @param[in] check_rw_perm for testing whether the file can be opened
3098 in RW mode
3099 @param[in] read_only if true read only mode checks are enforced
3100 @return DB_SUCCESS if all OK */
3101 static
3102 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3103 os_file_get_status_posix(
3104 const char* path,
3105 os_file_stat_t* stat_info,
3106 struct stat* statinfo,
3107 bool check_rw_perm,
3108 bool read_only)
3109 {
3110 int ret = stat(path, statinfo);
3111
3112 if (ret && (errno == ENOENT || errno == ENOTDIR
3113 || errno == ENAMETOOLONG)) {
3114 /* file does not exist */
3115
3116 return(DB_NOT_FOUND);
3117
3118 } else if (ret) {
3119 /* file exists, but stat call failed */
3120
3121 os_file_handle_error_no_exit(path, "stat", false);
3122
3123 return(DB_FAIL);
3124 }
3125
3126 switch (statinfo->st_mode & S_IFMT) {
3127 case S_IFDIR:
3128 stat_info->type = OS_FILE_TYPE_DIR;
3129 break;
3130 case S_IFLNK:
3131 stat_info->type = OS_FILE_TYPE_LINK;
3132 break;
3133 case S_IFBLK:
3134 /* Handle block device as regular file. */
3135 case S_IFCHR:
3136 /* Handle character device as regular file. */
3137 case S_IFREG:
3138 stat_info->type = OS_FILE_TYPE_FILE;
3139 break;
3140 default:
3141 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3142 }
3143
3144 stat_info->size = statinfo->st_size;
3145 stat_info->block_size = statinfo->st_blksize;
3146 stat_info->alloc_size = statinfo->st_blocks * 512;
3147
3148 if (check_rw_perm
3149 && (stat_info->type == OS_FILE_TYPE_FILE
3150 || stat_info->type == OS_FILE_TYPE_BLOCK)) {
3151
3152 stat_info->rw_perm = !access(path, read_only
3153 ? R_OK : R_OK | W_OK);
3154 }
3155
3156 return(DB_SUCCESS);
3157 }
3158
3159 /** Truncates a file to a specified size in bytes.
3160 Do nothing if the size to preserve is greater or equal to the current
3161 size of the file.
3162 @param[in] pathname file path
3163 @param[in] file file to be truncated
3164 @param[in] size size to preserve in bytes
3165 @return true if success */
3166 static
3167 bool
os_file_truncate_posix(const char * pathname,os_file_t file,os_offset_t size)3168 os_file_truncate_posix(
3169 const char* pathname,
3170 os_file_t file,
3171 os_offset_t size)
3172 {
3173 int res = ftruncate(file, size);
3174
3175 if (res == -1) {
3176
3177 bool retry;
3178
3179 retry = os_file_handle_error_no_exit(
3180 pathname, "truncate", false);
3181
3182 if (retry) {
3183 ib::warn()
3184 << "Truncate failed for '"
3185 << pathname << "'";
3186 }
3187 }
3188
3189 return(res == 0);
3190 }
3191
3192 /** Truncates a file at its current position.
3193 @return true if success */
3194 bool
os_file_set_eof(FILE * file)3195 os_file_set_eof(
3196 FILE* file) /*!< in: file to be truncated */
3197 {
3198 WAIT_ALLOW_WRITES();
3199 return(!ftruncate(fileno(file), ftell(file)));
3200 }
3201
3202 #else /* !_WIN32 */
3203
3204 #include <WinIoCtl.h>
3205
3206 /*
3207 Windows : Handling synchronous IO on files opened asynchronously.
3208
3209 If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to
3210 a completion port, then every IO on this file would normally be enqueued to the
3211 completion port. Sometimes however we would like to do a synchronous IO. This is
possible if we initialize overlapped.hEvent with a valid event and set its
3213 lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info)
3214
3215 We'll create this special event once for each thread and store in thread local
3216 storage.
3217 */
3218
3219
win_free_syncio_event(void * data)3220 static void __stdcall win_free_syncio_event(void *data) {
3221 if (data) {
3222 CloseHandle((HANDLE)data);
3223 }
3224 }
3225
3226
3227 /*
3228 Retrieve per-thread event for doing synchronous io on asyncronously opened files
3229 */
win_get_syncio_event()3230 static HANDLE win_get_syncio_event()
3231 {
3232 HANDLE h;
3233
3234 h = (HANDLE)FlsGetValue(fls_sync_io);
3235 if (h) {
3236 return h;
3237 }
3238 h = CreateEventA(NULL, FALSE, FALSE, NULL);
3239 ut_a(h);
3240 /* Set low-order bit to keeps I/O completion from being queued */
3241 h = (HANDLE)((uintptr_t)h | 1);
3242 FlsSetValue(fls_sync_io, h);
3243 return h;
3244 }
3245
3246
3247 /** Do the read/write
3248 @param[in] request The IO context and type
3249 @return the number of bytes read/written or negative value on error */
3250 ssize_t
execute(const IORequest & request)3251 SyncFileIO::execute(const IORequest& request)
3252 {
3253 OVERLAPPED seek;
3254
3255 memset(&seek, 0x0, sizeof(seek));
3256
3257 seek.hEvent = win_get_syncio_event();
3258 seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
3259 seek.OffsetHigh = (DWORD) (m_offset >> 32);
3260
3261 BOOL ret;
3262 DWORD n_bytes;
3263
3264 if (request.is_read()) {
3265 ret = ReadFile(m_fh, m_buf,
3266 static_cast<DWORD>(m_n), NULL, &seek);
3267
3268 } else {
3269 ut_ad(request.is_write());
3270 ret = WriteFile(m_fh, m_buf,
3271 static_cast<DWORD>(m_n), NULL, &seek);
3272 }
3273 if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3274 /* Wait for async io to complete */
3275 ret = GetOverlappedResult(m_fh, &seek, &n_bytes, TRUE);
3276 }
3277
3278 return(ret ? static_cast<ssize_t>(n_bytes) : -1);
3279 }
3280
3281 /** Do the read/write
3282 @param[in,out] slot The IO slot, it has the IO context
3283 @return the number of bytes read/written or negative value on error */
3284 ssize_t
execute(Slot * slot)3285 SyncFileIO::execute(Slot* slot)
3286 {
3287 BOOL ret;
3288 slot->control.hEvent = win_get_syncio_event();
3289 if (slot->type.is_read()) {
3290
3291 ret = ReadFile(
3292 slot->file, slot->ptr, slot->len,
3293 NULL, &slot->control);
3294
3295 } else {
3296 ut_ad(slot->type.is_write());
3297
3298 ret = WriteFile(
3299 slot->file, slot->ptr, slot->len,
3300 NULL, &slot->control);
3301
3302 }
3303 if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3304 /* Wait for async io to complete */
3305 ret = GetOverlappedResult(slot->file, &slot->control, &slot->n_bytes, TRUE);
3306 }
3307
3308 return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
3309 }
3310
3311 /* Startup/shutdown */
3312
3313 struct WinIoInit
3314 {
WinIoInitWinIoInit3315 WinIoInit() {
3316 fls_sync_io = FlsAlloc(win_free_syncio_event);
3317 ut_a(fls_sync_io != FLS_OUT_OF_INDEXES);
3318 }
3319
~WinIoInitWinIoInit3320 ~WinIoInit() {
3321 FlsFree(fls_sync_io);
3322 }
3323 };
3324
3325 /* Ensures proper initialization and shutdown */
3326 static WinIoInit win_io_init;
3327
3328
3329 /** Free storage space associated with a section of the file.
3330 @param[in] fh Open file handle
3331 @param[in] off Starting offset (SEEK_SET)
3332 @param[in] len Size of the hole
3333 @return 0 on success or errno */
3334 static
3335 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)3336 os_file_punch_hole_win32(
3337 os_file_t fh,
3338 os_offset_t off,
3339 os_offset_t len)
3340 {
3341 FILE_ZERO_DATA_INFORMATION punch;
3342
3343 punch.FileOffset.QuadPart = off;
3344 punch.BeyondFinalZero.QuadPart = off + len;
3345
3346 /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
3347 therefore we pass a dummy parameter. */
3348 DWORD temp;
3349 BOOL success = os_win32_device_io_control(
3350 fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
3351 NULL, 0, &temp);
3352
3353 return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE);
3354 }
3355
3356 /** Check the existence and type of the given file.
3357 @param[in] path path name of file
3358 @param[out] exists true if the file exists
3359 @param[out] type Type of the file, if it exists
3360 @return true if call succeeded */
3361 static
3362 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)3363 os_file_status_win32(
3364 const char* path,
3365 bool* exists,
3366 os_file_type_t* type)
3367 {
3368 int ret;
3369 struct _stat64 statinfo;
3370
3371 ret = _stat64(path, &statinfo);
3372
3373 *exists = !ret;
3374
3375 if (!ret) {
3376 /* file exists, everything OK */
3377
3378 } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
3379 /* file does not exist */
3380 return(true);
3381
3382 } else {
3383 /* file exists, but stat call failed */
3384 os_file_handle_error_no_exit(path, "stat", false);
3385 return(false);
3386 }
3387
3388 if (_S_IFDIR & statinfo.st_mode) {
3389 *type = OS_FILE_TYPE_DIR;
3390
3391 } else if (_S_IFREG & statinfo.st_mode) {
3392 *type = OS_FILE_TYPE_FILE;
3393
3394 } else {
3395 *type = OS_FILE_TYPE_UNKNOWN;
3396 }
3397
3398 return(true);
3399 }
3400
3401 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3402 function!
3403 Flushes the write buffers of a given file to the disk.
3404 @param[in] file handle to a file
3405 @return true if success */
3406 bool
os_file_flush_func(os_file_t file)3407 os_file_flush_func(
3408 os_file_t file)
3409 {
3410 ++os_n_fsyncs;
3411
3412 BOOL ret = FlushFileBuffers(file);
3413
3414 if (ret) {
3415 return(true);
3416 }
3417
3418 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
3419 actually a raw device, we choose to ignore that error if we are using
3420 raw disks */
3421
3422 if (srv_start_raw_disk_in_use && GetLastError()
3423 == ERROR_INVALID_FUNCTION) {
3424 return(true);
3425 }
3426
3427 os_file_handle_error(NULL, "flush");
3428
3429 /* It is a fatal error if a file flush does not succeed, because then
3430 the database can get corrupt on disk */
3431 ut_error;
3432
3433 return(false);
3434 }
3435
3436 /** Retrieves the last error number if an error occurs in a file io function.
3437 The number should be retrieved before any other OS calls (because they may
3438 overwrite the error number). If the number is not known to this program,
3439 the OS error number + 100 is returned.
3440 @param[in] report_all_errors true if we want an error message printed
3441 of all errors
3442 @param[in] on_error_silent true then don't print any diagnostic
3443 to the log
3444 @return error number, or OS error number + 100 */
3445 static
3446 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3447 os_file_get_last_error_low(
3448 bool report_all_errors,
3449 bool on_error_silent)
3450 {
3451 ulint err = (ulint) GetLastError();
3452
3453 if (err == ERROR_SUCCESS) {
3454 return(0);
3455 }
3456
3457 if (report_all_errors
3458 || (!on_error_silent
3459 && err != ERROR_DISK_FULL
3460 && err != ERROR_FILE_EXISTS)) {
3461
3462 ib::error()
3463 << "Operating system error number " << err
3464 << " in a file operation.";
3465
3466 if (err == ERROR_PATH_NOT_FOUND) {
3467 ib::error()
3468 << "The error means the system"
3469 " cannot find the path specified.";
3470
3471 if (srv_is_being_started) {
3472 ib::error()
3473 << "If you are installing InnoDB,"
3474 " remember that you must create"
3475 " directories yourself, InnoDB"
3476 " does not create them.";
3477 }
3478
3479 } else if (err == ERROR_ACCESS_DENIED) {
3480
3481 ib::error()
3482 << "The error means mysqld does not have"
3483 " the access rights to"
3484 " the directory. It may also be"
3485 " you have created a subdirectory"
3486 " of the same name as a data file.";
3487
3488 } else if (err == ERROR_SHARING_VIOLATION
3489 || err == ERROR_LOCK_VIOLATION) {
3490
3491 ib::error()
3492 << "The error means that another program"
3493 " is using InnoDB's files."
3494 " This might be a backup or antivirus"
3495 " software or another instance"
3496 " of MySQL."
3497 " Please close it to get rid of this error.";
3498
3499 } else if (err == ERROR_WORKING_SET_QUOTA
3500 || err == ERROR_NO_SYSTEM_RESOURCES) {
3501
3502 ib::error()
3503 << "The error means that there are no"
3504 " sufficient system resources or quota to"
3505 " complete the operation.";
3506
3507 } else if (err == ERROR_OPERATION_ABORTED) {
3508
3509 ib::error()
3510 << "The error means that the I/O"
3511 " operation has been aborted"
3512 " because of either a thread exit"
3513 " or an application request."
3514 " Retry attempt is made.";
3515 } else {
3516
3517 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3518 }
3519 }
3520
3521 if (err == ERROR_FILE_NOT_FOUND) {
3522 return(OS_FILE_NOT_FOUND);
3523 } else if (err == ERROR_DISK_FULL) {
3524 return(OS_FILE_DISK_FULL);
3525 } else if (err == ERROR_FILE_EXISTS) {
3526 return(OS_FILE_ALREADY_EXISTS);
3527 } else if (err == ERROR_SHARING_VIOLATION
3528 || err == ERROR_LOCK_VIOLATION) {
3529 return(OS_FILE_SHARING_VIOLATION);
3530 } else if (err == ERROR_WORKING_SET_QUOTA
3531 || err == ERROR_NO_SYSTEM_RESOURCES) {
3532 return(OS_FILE_INSUFFICIENT_RESOURCE);
3533 } else if (err == ERROR_OPERATION_ABORTED) {
3534 return(OS_FILE_OPERATION_ABORTED);
3535 } else if (err == ERROR_ACCESS_DENIED) {
3536 return(OS_FILE_ACCESS_VIOLATION);
3537 }
3538
3539 return(OS_FILE_ERROR_MAX + err);
3540 }
3541
3542
3543 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3544 this function!
3545 A simple function to open or create a file.
3546 @param[in] name name of the file or path as a null-terminated
3547 string
3548 @param[in] create_mode create mode
3549 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3550 @param[in] read_only if true read only mode checks are enforced
3551 @param[out] success true if succeed, false if error
3552 @return handle to the file, not defined if error, error number
3553 can be retrieved with os_file_get_last_error */
3554 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3555 os_file_create_simple_func(
3556 const char* name,
3557 ulint create_mode,
3558 ulint access_type,
3559 bool read_only,
3560 bool* success)
3561 {
3562 os_file_t file;
3563
3564 *success = false;
3565
3566 DWORD access;
3567 DWORD create_flag;
3568 DWORD attributes = 0;
3569
3570 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3571 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3572 ut_ad(srv_operation == SRV_OPERATION_NORMAL);
3573
3574 if (create_mode == OS_FILE_OPEN) {
3575
3576 create_flag = OPEN_EXISTING;
3577
3578 } else if (read_only) {
3579
3580 create_flag = OPEN_EXISTING;
3581
3582 } else if (create_mode == OS_FILE_CREATE) {
3583
3584 create_flag = CREATE_NEW;
3585
3586 } else if (create_mode == OS_FILE_CREATE_PATH) {
3587
3588 /* Create subdirs along the path if needed. */
3589 *success = os_file_create_subdirs_if_needed(name);
3590
3591 if (!*success) {
3592
3593 ib::error()
3594 << "Unable to create subdirectories '"
3595 << name << "'";
3596
3597 return(OS_FILE_CLOSED);
3598 }
3599
3600 create_flag = CREATE_NEW;
3601 create_mode = OS_FILE_CREATE;
3602
3603 } else {
3604
3605 ib::error()
3606 << "Unknown file create mode ("
3607 << create_mode << ") for file '"
3608 << name << "'";
3609
3610 return(OS_FILE_CLOSED);
3611 }
3612
3613 if (access_type == OS_FILE_READ_ONLY) {
3614
3615 access = GENERIC_READ;
3616
3617 } else if (read_only) {
3618
3619 ib::info()
3620 << "Read only mode set. Unable to"
3621 " open file '" << name << "' in RW mode, "
3622 << "trying RO mode";
3623
3624 access = GENERIC_READ;
3625
3626 } else if (access_type == OS_FILE_READ_WRITE) {
3627
3628 access = GENERIC_READ | GENERIC_WRITE;
3629
3630 } else {
3631
3632 ib::error()
3633 << "Unknown file access type (" << access_type << ") "
3634 "for file '" << name << "'";
3635
3636 return(OS_FILE_CLOSED);
3637 }
3638
3639 bool retry;
3640
3641 do {
3642 /* Use default security attributes and no template file. */
3643
3644 file = CreateFile(
3645 (LPCTSTR) name, access,
3646 FILE_SHARE_READ | FILE_SHARE_DELETE, NULL,
3647 create_flag, attributes, NULL);
3648
3649 if (file == INVALID_HANDLE_VALUE) {
3650
3651 *success = false;
3652
3653 retry = os_file_handle_error(
3654 name, create_mode == OS_FILE_OPEN ?
3655 "open" : "create");
3656
3657 } else {
3658
3659 retry = false;
3660
3661 *success = true;
3662 }
3663
3664 } while (retry);
3665
3666 return(file);
3667 }
3668
3669 /** This function attempts to create a directory named pathname. The new
3670 directory gets default permissions. On Unix the permissions are
3671 (0770 & ~umask). If the directory exists already, nothing is done and
3672 the call succeeds, unless the fail_if_exists arguments is true.
3673 If another error occurs, such as a permission error, this does not crash,
3674 but reports the error and returns false.
3675 @param[in] pathname directory name as null-terminated string
3676 @param[in] fail_if_exists if true, pre-existing directory is treated
3677 as an error.
3678 @return true if call succeeds, false on error */
3679 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3680 os_file_create_directory(
3681 const char* pathname,
3682 bool fail_if_exists)
3683 {
3684 BOOL rcode;
3685
3686 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
3687 if (!(rcode != 0
3688 || (GetLastError() == ERROR_ALREADY_EXISTS
3689 && !fail_if_exists))) {
3690
3691 os_file_handle_error_no_exit(
3692 pathname, "CreateDirectory", false);
3693
3694 return(false);
3695 }
3696
3697 return(true);
3698 }
3699
3700 /** Check that IO of specific size is possible for the file
3701 opened with FILE_FLAG_NO_BUFFERING.
3702
3703 The requirement is that IO is multiple of the disk sector size.
3704
3705 @param[in] file file handle
3706 @param[in] io_size expected io size
3707 @return true - unbuffered io of requested size is possible, false otherwise.
3708
3709 @note: this function only works correctly with Windows 8 or later,
3710 (GetFileInformationByHandleEx with FileStorageInfo is only supported there).
3711 It will return true on earlier Windows version.
3712 */
unbuffered_io_possible(HANDLE file,size_t io_size)3713 static bool unbuffered_io_possible(HANDLE file, size_t io_size)
3714 {
3715 FILE_STORAGE_INFO info;
3716 if (GetFileInformationByHandleEx(
3717 file, FileStorageInfo, &info, sizeof(info))) {
3718 ULONG sector_size = info.LogicalBytesPerSector;
3719 if (sector_size)
3720 return io_size % sector_size == 0;
3721 }
3722 return true;
3723 }
3724
3725
3726 /** NOTE! Use the corresponding macro os_file_create(), not directly
3727 this function!
3728 Opens an existing file or creates a new.
3729 @param[in] name name of the file or path as a null-terminated
3730 string
3731 @param[in] create_mode create mode
3732 @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
3733 is desired, OS_FILE_NORMAL, if any normal file;
3734 NOTE that it also depends on type, os_aio_..
3735 and srv_.. variables whether we really use async
3736 I/O or unbuffered I/O: look in the function
3737 source code for the exact rules
3738 @param[in] type OS_DATA_FILE or OS_LOG_FILE
3739 @param[in] success true if succeeded
3740 @return handle to the file, not defined if error, error number
3741 can be retrieved with os_file_get_last_error */
3742 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)3743 os_file_create_func(
3744 const char* name,
3745 ulint create_mode,
3746 ulint purpose,
3747 ulint type,
3748 bool read_only,
3749 bool* success)
3750 {
3751 os_file_t file;
3752 bool retry;
3753 bool on_error_no_exit;
3754 bool on_error_silent;
3755
3756 *success = false;
3757
3758 DBUG_EXECUTE_IF(
3759 "ib_create_table_fail_disk_full",
3760 *success = false;
3761 SetLastError(ERROR_DISK_FULL);
3762 return(OS_FILE_CLOSED);
3763 );
3764
3765 DWORD create_flag;
3766 DWORD share_mode = srv_operation != SRV_OPERATION_NORMAL
3767 ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
3768 : FILE_SHARE_READ | FILE_SHARE_DELETE;
3769
3770 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
3771 WAIT_ALLOW_WRITES();
3772 }
3773
3774 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
3775 ? true : false;
3776
3777 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
3778 ? true : false;
3779
3780 create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);
3781
3782 if (create_mode == OS_FILE_OPEN_RAW) {
3783
3784 ut_a(!read_only);
3785
3786 create_flag = OPEN_EXISTING;
3787
3788 /* On Windows Physical devices require admin privileges and
3789 have to have the write-share mode set. See the remarks
3790 section for the CreateFile() function documentation in MSDN. */
3791
3792 share_mode |= FILE_SHARE_WRITE;
3793
3794 } else if (create_mode == OS_FILE_OPEN
3795 || create_mode == OS_FILE_OPEN_RETRY) {
3796
3797 create_flag = OPEN_EXISTING;
3798
3799 } else if (read_only) {
3800
3801 create_flag = OPEN_EXISTING;
3802
3803 } else if (create_mode == OS_FILE_CREATE) {
3804
3805 create_flag = CREATE_NEW;
3806
3807 } else if (create_mode == OS_FILE_OVERWRITE) {
3808
3809 create_flag = CREATE_ALWAYS;
3810
3811 } else {
3812 ib::error()
3813 << "Unknown file create mode (" << create_mode << ") "
3814 << " for file '" << name << "'";
3815
3816 return(OS_FILE_CLOSED);
3817 }
3818
3819 DWORD attributes = 0;
3820
3821 if (purpose == OS_FILE_AIO) {
3822
3823 #ifdef WIN_ASYNC_IO
3824 /* If specified, use asynchronous (overlapped) io and no
3825 buffering of writes in the OS */
3826
3827 if (srv_use_native_aio) {
3828 attributes |= FILE_FLAG_OVERLAPPED;
3829 }
3830 #endif /* WIN_ASYNC_IO */
3831
3832 } else if (purpose == OS_FILE_NORMAL) {
3833
3834 /* Use default setting. */
3835
3836 } else {
3837
3838 ib::error()
3839 << "Unknown purpose flag (" << purpose << ") "
3840 << "while opening file '" << name << "'";
3841
3842 return(OS_FILE_CLOSED);
3843 }
3844
3845 if (type == OS_LOG_FILE) {
3846 /* There is not reason to use buffered write to logs.*/
3847 attributes |= FILE_FLAG_NO_BUFFERING;
3848 }
3849
3850 switch (srv_file_flush_method)
3851 {
3852 case SRV_O_DSYNC:
3853 if (type == OS_LOG_FILE) {
3854 /* Map O_SYNC to FILE_WRITE_THROUGH */
3855 attributes |= FILE_FLAG_WRITE_THROUGH;
3856 }
3857 break;
3858
3859 case SRV_O_DIRECT_NO_FSYNC:
3860 case SRV_O_DIRECT:
3861 if (type == OS_DATA_FILE) {
3862 attributes |= FILE_FLAG_NO_BUFFERING;
3863 }
3864 break;
3865
3866 case SRV_ALL_O_DIRECT_FSYNC:
3867 /*Traditional Windows behavior, no buffering for any files.*/
3868 if (type != OS_DATA_FILE_NO_O_DIRECT) {
3869 attributes |= FILE_FLAG_NO_BUFFERING;
3870 }
3871 break;
3872
3873 case SRV_FSYNC:
3874 case SRV_LITTLESYNC:
3875 break;
3876
3877 case SRV_NOSYNC:
3878 /* Let Windows cache manager handle all writes.*/
3879 attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
3880 break;
3881
3882 default:
3883 ut_a(false); /* unknown flush mode.*/
3884 }
3885
3886
3887 // TODO: Create a bug, this looks wrong. The flush log
3888 // parameter is dynamic.
3889 if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
3890 /* Do not use unbuffered i/o for the log files because
3891 value 2 denotes that we do not flush the log at every
3892 commit, but only once per second */
3893 attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
3894 }
3895
3896
3897 DWORD access = GENERIC_READ;
3898
3899 if (!read_only) {
3900 access |= GENERIC_WRITE;
3901 }
3902
3903 for (;;) {
3904 const char *operation;
3905
3906 /* Use default security attributes and no template file. */
3907 file = CreateFile(
3908 name, access, share_mode, NULL,
3909 create_flag, attributes, NULL);
3910
3911 /* If FILE_FLAG_NO_BUFFERING was set, check if this can work at all,
3912 for expected IO sizes. Reopen without the unbuffered flag, if it is won't work*/
3913 if ((file != INVALID_HANDLE_VALUE)
3914 && (attributes & FILE_FLAG_NO_BUFFERING)
3915 && (type == OS_LOG_FILE)
3916 && !unbuffered_io_possible(file, OS_FILE_LOG_BLOCK_SIZE)) {
3917 ut_a(CloseHandle(file));
3918 attributes &= ~FILE_FLAG_NO_BUFFERING;
3919 create_flag = OPEN_ALWAYS;
3920 continue;
3921 }
3922
3923 *success = (file != INVALID_HANDLE_VALUE);
3924 if (*success) {
3925 break;
3926 }
3927
3928 operation = (create_mode == OS_FILE_CREATE && !read_only) ?
3929 "create" : "open";
3930
3931 if (on_error_no_exit) {
3932 retry = os_file_handle_error_no_exit(
3933 name, operation, on_error_silent);
3934 }
3935 else {
3936 retry = os_file_handle_error(name, operation);
3937 }
3938
3939 if (!retry) {
3940 break;
3941 }
3942 }
3943
3944 if (*success && srv_use_native_aio && (attributes & FILE_FLAG_OVERLAPPED)) {
3945 /* Bind the file handle to completion port. Completion port
3946 might not be created yet, in some stages of backup, but
3947 must always be there for the server.*/
3948 HANDLE port = (type == OS_LOG_FILE) ?
3949 log_completion_port : data_completion_port;
3950 ut_a(port || srv_operation != SRV_OPERATION_NORMAL);
3951 if (port) {
3952 ut_a(CreateIoCompletionPort(file, port, 0, 0));
3953 }
3954 }
3955
3956 return(file);
3957 }
3958
3959 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
3960 not directly this function!
3961 A simple function to open or create a file.
3962 @param[in] name name of the file or path as a null-terminated
3963 string
3964 @param[in] create_mode create mode
3965 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3966 OS_FILE_READ_ALLOW_DELETE; the last option is
3967 used by a backup program reading the file
3968 @param[out] success true if succeeded
3969 @return own: handle to the file, not defined if error, error number
3970 can be retrieved with os_file_get_last_error */
3971 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3972 os_file_create_simple_no_error_handling_func(
3973 const char* name,
3974 ulint create_mode,
3975 ulint access_type,
3976 bool read_only,
3977 bool* success)
3978 {
3979 os_file_t file;
3980
3981 *success = false;
3982
3983 DWORD access;
3984 DWORD create_flag;
3985 DWORD attributes = 0;
3986 DWORD share_mode = srv_operation != SRV_OPERATION_NORMAL
3987 ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
3988 : FILE_SHARE_READ | FILE_SHARE_DELETE;
3989
3990 ut_a(name);
3991
3992 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3993 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3994
3995 if (create_mode == OS_FILE_OPEN) {
3996
3997 create_flag = OPEN_EXISTING;
3998
3999 } else if (read_only) {
4000
4001 create_flag = OPEN_EXISTING;
4002
4003 } else if (create_mode == OS_FILE_CREATE) {
4004
4005 create_flag = CREATE_NEW;
4006
4007 } else {
4008
4009 ib::error()
4010 << "Unknown file create mode (" << create_mode << ") "
4011 << " for file '" << name << "'";
4012
4013 return(OS_FILE_CLOSED);
4014 }
4015
4016 if (access_type == OS_FILE_READ_ONLY) {
4017
4018 access = GENERIC_READ;
4019
4020 } else if (read_only) {
4021
4022 access = GENERIC_READ;
4023
4024 } else if (access_type == OS_FILE_READ_WRITE) {
4025
4026 access = GENERIC_READ | GENERIC_WRITE;
4027
4028 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4029
4030 ut_a(!read_only);
4031
4032 access = GENERIC_READ;
4033
4034 /*!< A backup program has to give mysqld the maximum
4035 freedom to do what it likes with the file */
4036
4037 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
4038 | FILE_SHARE_READ;
4039 } else {
4040
4041 ib::error()
4042 << "Unknown file access type (" << access_type << ") "
4043 << "for file '" << name << "'";
4044
4045 return(OS_FILE_CLOSED);
4046 }
4047
4048 file = CreateFile((LPCTSTR) name,
4049 access,
4050 share_mode,
4051 NULL, // Security attributes
4052 create_flag,
4053 attributes,
4054 NULL); // No template file
4055
4056 *success = (file != INVALID_HANDLE_VALUE);
4057
4058 return(file);
4059 }
4060
4061 /** Deletes a file if it exists. The file has to be closed before calling this.
4062 @param[in] name file path as a null-terminated string
4063 @param[out] exist indicate if file pre-exist
4064 @return true if success */
4065 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4066 os_file_delete_if_exists_func(
4067 const char* name,
4068 bool* exist)
4069 {
4070 ulint count = 0;
4071
4072 if (exist != NULL) {
4073 *exist = true;
4074 }
4075
4076 for (;;) {
4077 /* In Windows, deleting an .ibd file may fail if
4078 the file is being accessed by an external program,
4079 such as a backup tool. */
4080
4081 bool ret = DeleteFile((LPCTSTR) name);
4082
4083 if (ret) {
4084 return(true);
4085 }
4086
4087 DWORD lasterr = GetLastError();
4088
4089 if (lasterr == ERROR_FILE_NOT_FOUND
4090 || lasterr == ERROR_PATH_NOT_FOUND) {
4091
4092 /* the file does not exist, this not an error */
4093 if (exist != NULL) {
4094 *exist = false;
4095 }
4096
4097 return(true);
4098 }
4099
4100 ++count;
4101
4102 if (count > 100 && 0 == (count % 10)) {
4103
4104 /* Print error information */
4105 os_file_get_last_error(true);
4106
4107 ib::warn() << "Delete of file '" << name << "' failed.";
4108 }
4109
4110 /* Sleep for a second */
4111 os_thread_sleep(1000000);
4112
4113 if (count > 2000) {
4114
4115 return(false);
4116 }
4117 }
4118 }
4119
4120 /** Deletes a file. The file has to be closed before calling this.
4121 @param[in] name File path as NUL terminated string
4122 @return true if success */
4123 bool
os_file_delete_func(const char * name)4124 os_file_delete_func(
4125 const char* name)
4126 {
4127 ulint count = 0;
4128
4129 for (;;) {
4130 /* In Windows, deleting an .ibd file may fail if
4131 the file is being accessed by an external program,
4132 such as a backup tool. */
4133
4134 BOOL ret = DeleteFile((LPCTSTR) name);
4135
4136 if (ret) {
4137 return(true);
4138 }
4139
4140 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
4141 /* If the file does not exist, we classify this as
4142 a 'mild' error and return */
4143
4144 return(false);
4145 }
4146
4147 ++count;
4148
4149 if (count > 100 && 0 == (count % 10)) {
4150
4151 /* print error information */
4152 os_file_get_last_error(true);
4153
4154 ib::warn()
4155 << "Cannot delete file '" << name << "'. Is "
4156 << "another program accessing it?";
4157 }
4158
4159 /* sleep for a second */
4160 os_thread_sleep(1000000);
4161
4162 if (count > 2000) {
4163
4164 return(false);
4165 }
4166 }
4167
4168 ut_error;
4169 return(false);
4170 }
4171
4172 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
4173 function!
4174 Renames a file (can also move it to another directory). It is safest that the
4175 file is closed before calling this function.
4176 @param[in] oldpath old file path as a null-terminated string
4177 @param[in] newpath new file path
4178 @return true if success */
4179 bool
os_file_rename_func(const char * oldpath,const char * newpath)4180 os_file_rename_func(
4181 const char* oldpath,
4182 const char* newpath)
4183 {
4184 #ifdef UNIV_DEBUG
4185 os_file_type_t type;
4186 bool exists;
4187
4188 /* New path must not exist. */
4189 ut_ad(os_file_status(newpath, &exists, &type));
4190 ut_ad(!exists);
4191
4192 /* Old path must exist. */
4193 ut_ad(os_file_status(oldpath, &exists, &type));
4194 ut_ad(exists);
4195 #endif /* UNIV_DEBUG */
4196
4197 if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
4198 return(true);
4199 }
4200
4201 os_file_handle_rename_error(oldpath, newpath);
4202 return(false);
4203 }
4204
4205 /** NOTE! Use the corresponding macro os_file_close(), not directly
4206 this function!
4207 Closes a file handle. In case of error, error number can be retrieved with
4208 os_file_get_last_error.
4209 @param[in,own] file Handle to a file
4210 @return true if success */
4211 bool
os_file_close_func(os_file_t file)4212 os_file_close_func(
4213 os_file_t file)
4214 {
4215 ut_a(file);
4216
4217 if (CloseHandle(file)) {
4218 return(true);
4219 }
4220
4221 os_file_handle_error(NULL, "close");
4222
4223 return(false);
4224 }
4225
4226 /** Gets a file size.
4227 @param[in] file Handle to a file
4228 @return file size, or (os_offset_t) -1 on failure */
4229 os_offset_t
os_file_get_size(os_file_t file)4230 os_file_get_size(
4231 os_file_t file)
4232 {
4233 DWORD high;
4234 DWORD low = GetFileSize(file, &high);
4235
4236 if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
4237 return((os_offset_t) -1);
4238 }
4239
4240 return(os_offset_t(low | (os_offset_t(high) << 32)));
4241 }
4242
4243 /** Gets a file size.
4244 @param[in] filename Full path to the filename to check
4245 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
4246 errno */
4247 os_file_size_t
os_file_get_size(const char * filename)4248 os_file_get_size(
4249 const char* filename)
4250 {
4251 struct __stat64 s;
4252 os_file_size_t file_size;
4253
4254 int ret = _stat64(filename, &s);
4255
4256 if (ret == 0) {
4257
4258 file_size.m_total_size = s.st_size;
4259
4260 DWORD low_size;
4261 DWORD high_size;
4262
4263 low_size = GetCompressedFileSize(filename, &high_size);
4264
4265 if (low_size != INVALID_FILE_SIZE) {
4266
4267 file_size.m_alloc_size = high_size;
4268 file_size.m_alloc_size <<= 32;
4269 file_size.m_alloc_size |= low_size;
4270
4271 } else {
4272 ib::error()
4273 << "GetCompressedFileSize("
4274 << filename << ", ..) failed.";
4275
4276 file_size.m_alloc_size = (os_offset_t) -1;
4277 }
4278 } else {
4279 file_size.m_total_size = ~0;
4280 file_size.m_alloc_size = (os_offset_t) ret;
4281 }
4282
4283 return(file_size);
4284 }
4285
4286 /** This function returns information about the specified file
4287 @param[in] path pathname of the file
4288 @param[out] stat_info information of a file in a directory
4289 @param[in,out] statinfo information of a file in a directory
4290 @param[in] check_rw_perm for testing whether the file can be opened
4291 in RW mode
4292 @param[in] read_only true if the file is opened in read-only mode
4293 @return DB_SUCCESS if all OK */
4294 static
4295 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)4296 os_file_get_status_win32(
4297 const char* path,
4298 os_file_stat_t* stat_info,
4299 struct _stat64* statinfo,
4300 bool check_rw_perm,
4301 bool read_only)
4302 {
4303 int ret = _stat64(path, statinfo);
4304
4305 if (ret && (errno == ENOENT || errno == ENOTDIR
4306 || errno == ENAMETOOLONG)) {
4307 /* file does not exist */
4308
4309 return(DB_NOT_FOUND);
4310
4311 } else if (ret) {
4312 /* file exists, but stat call failed */
4313
4314 os_file_handle_error_no_exit(path, "STAT", false);
4315
4316 return(DB_FAIL);
4317
4318 } else if (_S_IFDIR & statinfo->st_mode) {
4319
4320 stat_info->type = OS_FILE_TYPE_DIR;
4321
4322 } else if (_S_IFREG & statinfo->st_mode) {
4323
4324 DWORD access = GENERIC_READ;
4325
4326 if (!read_only) {
4327 access |= GENERIC_WRITE;
4328 }
4329
4330 stat_info->type = OS_FILE_TYPE_FILE;
4331
4332 /* Check if we can open it in read-only mode. */
4333
4334 if (check_rw_perm) {
4335 HANDLE fh;
4336
4337 fh = CreateFile(
4338 (LPCTSTR) path, // File to open
4339 access,
4340 FILE_SHARE_READ | FILE_SHARE_WRITE
4341 | FILE_SHARE_DELETE, // Full sharing
4342 NULL, // Default security
4343 OPEN_EXISTING, // Existing file only
4344 FILE_ATTRIBUTE_NORMAL, // Normal file
4345 NULL); // No attr. template
4346
4347 if (fh == INVALID_HANDLE_VALUE) {
4348 stat_info->rw_perm = false;
4349 } else {
4350 stat_info->rw_perm = true;
4351 CloseHandle(fh);
4352 }
4353 }
4354 stat_info->block_size = 0;
4355
4356 /* What follows, is calculation of FS block size, which is not important
4357 (it is just shown in I_S innodb tables). The error to calculate it will be ignored.*/
4358 char volname[MAX_PATH];
4359 BOOL result = GetVolumePathName(path, volname, MAX_PATH);
4360 static bool warned_once = false;
4361 if (!result) {
4362 if (!warned_once) {
4363 ib::warn()
4364 << "os_file_get_status_win32: "
4365 << "Failed to get the volume path name for: "
4366 << path
4367 << "- OS error number " << GetLastError();
4368 warned_once = true;
4369 }
4370 return(DB_SUCCESS);
4371 }
4372
4373 DWORD sectorsPerCluster;
4374 DWORD bytesPerSector;
4375 DWORD numberOfFreeClusters;
4376 DWORD totalNumberOfClusters;
4377
4378 result = GetDiskFreeSpace(
4379 (LPCSTR) volname,
4380 §orsPerCluster,
4381 &bytesPerSector,
4382 &numberOfFreeClusters,
4383 &totalNumberOfClusters);
4384
4385 if (!result) {
4386 if (!warned_once) {
4387 ib::warn()
4388 << "GetDiskFreeSpace(" << volname << ",...) "
4389 << "failed "
4390 << "- OS error number " << GetLastError();
4391 warned_once = true;
4392 }
4393 return(DB_SUCCESS);
4394 }
4395 stat_info->block_size = bytesPerSector * sectorsPerCluster;
4396 } else {
4397 stat_info->type = OS_FILE_TYPE_UNKNOWN;
4398 }
4399
4400 return(DB_SUCCESS);
4401 }
4402
4403 /**
4404 Sets a sparse flag on Windows file.
4405 @param[in] file file handle
4406 @return true on success, false on error
4407 */
4408 #include <versionhelpers.h>
os_file_set_sparse_win32(os_file_t file,bool is_sparse)4409 bool os_file_set_sparse_win32(os_file_t file, bool is_sparse)
4410 {
4411 if (!is_sparse && !IsWindows8OrGreater()) {
4412 /* Cannot unset sparse flag on older Windows.
4413 Until Windows8 it is documented to produce unpredictable results,
4414 if there are unallocated ranges in file.*/
4415 return false;
4416 }
4417 DWORD temp;
4418 FILE_SET_SPARSE_BUFFER sparse_buffer;
4419 sparse_buffer.SetSparse = is_sparse;
4420 return os_win32_device_io_control(file,
4421 FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp);
4422 }
4423
4424
4425 /**
4426 Change file size on Windows.
4427
4428 If file is extended, the bytes between old and new EOF
4429 are zeros.
4430
4431 If file is sparse, "virtual" block is added at the end of
4432 allocated area.
4433
4434 If file is normal, file system allocates storage.
4435
4436 @param[in] pathname file path
4437 @param[in] file file handle
4438 @param[in] size size to preserve in bytes
4439 @return true if success */
4440 bool
os_file_change_size_win32(const char * pathname,os_file_t file,os_offset_t size)4441 os_file_change_size_win32(
4442 const char* pathname,
4443 os_file_t file,
4444 os_offset_t size)
4445 {
4446 LARGE_INTEGER length;
4447
4448 length.QuadPart = size;
4449
4450 BOOL success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);
4451
4452 if (!success) {
4453 os_file_handle_error_no_exit(
4454 pathname, "SetFilePointerEx", false);
4455 } else {
4456 success = SetEndOfFile(file);
4457 if (!success) {
4458 os_file_handle_error_no_exit(
4459 pathname, "SetEndOfFile", false);
4460 }
4461 }
4462 return(success);
4463 }
4464
4465 /** Truncates a file at its current position.
4466 @param[in] file Handle to be truncated
4467 @return true if success */
4468 bool
os_file_set_eof(FILE * file)4469 os_file_set_eof(
4470 FILE* file)
4471 {
4472 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
4473
4474 return(SetEndOfFile(h));
4475 }
4476
4477 /** This function can be called if one wants to post a batch of reads and
4478 prefers an i/o-handler thread to handle them all at once later. You must
4479 call os_aio_simulated_wake_handler_threads later to ensure the threads
4480 are not left sleeping! */
4481 void
os_aio_simulated_put_read_threads_to_sleep()4482 os_aio_simulated_put_read_threads_to_sleep()
4483 {
4484 AIO::simulated_put_read_threads_to_sleep();
4485 }
4486
4487 /** This function can be called if one wants to post a batch of reads and
4488 prefers an i/o-handler thread to handle them all at once later. You must
4489 call os_aio_simulated_wake_handler_threads later to ensure the threads
4490 are not left sleeping! */
4491 void
simulated_put_read_threads_to_sleep()4492 AIO::simulated_put_read_threads_to_sleep()
4493 {
4494 /* The idea of putting background IO threads to sleep is only for
4495 Windows when using simulated AIO. Windows XP seems to schedule
4496 background threads too eagerly to allow for coalescing during
4497 readahead requests. */
4498
4499 if (srv_use_native_aio) {
4500 /* We do not use simulated AIO: do nothing */
4501
4502 return;
4503 }
4504
4505 os_aio_recommend_sleep_for_read_threads = true;
4506
4507 for (ulint i = 0; i < os_aio_n_segments; i++) {
4508 AIO* array;
4509
4510 get_array_and_local_segment(&array, i);
4511
4512 if (array == s_reads) {
4513
4514 os_event_reset(os_aio_segment_wait_events[i]);
4515 }
4516 }
4517 }
4518
4519 #endif /* !_WIN32*/
4520
4521 /** Does a syncronous read or write depending upon the type specified
4522 In case of partial reads/writes the function tries
4523 NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
4524 @param[in] type, IO flags
4525 @param[in] file handle to an open file
4526 @param[out] buf buffer where to read
4527 @param[in] offset file offset from the start where to read
4528 @param[in] n number of bytes to read, starting from offset
4529 @param[out] err DB_SUCCESS or error code
4530 @return number of bytes read/written, -1 if error */
4531 static MY_ATTRIBUTE((warn_unused_result))
4532 ssize_t
os_file_io(const IORequest & in_type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)4533 os_file_io(
4534 const IORequest&in_type,
4535 os_file_t file,
4536 void* buf,
4537 ulint n,
4538 os_offset_t offset,
4539 dberr_t* err)
4540 {
4541 ssize_t original_n = ssize_t(n);
4542 IORequest type = in_type;
4543 ssize_t bytes_returned = 0;
4544
4545 SyncFileIO sync_file_io(file, buf, n, offset);
4546
4547 for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
4548
4549 ssize_t n_bytes = sync_file_io.execute(type);
4550
4551 /* Check for a hard error. Not much we can do now. */
4552 if (n_bytes < 0) {
4553
4554 break;
4555
4556 } else if (n_bytes + bytes_returned == ssize_t(n)) {
4557
4558 bytes_returned += n_bytes;
4559
4560 if (offset > 0
4561 && !type.is_log()
4562 && type.is_write()
4563 && type.punch_hole()) {
4564 *err = type.punch_hole(file, offset, n);
4565
4566 } else {
4567 *err = DB_SUCCESS;
4568 }
4569
4570 return(original_n);
4571 }
4572
4573 /* Handle partial read/write. */
4574
4575 ut_ad(ulint(n_bytes + bytes_returned) < n);
4576
4577 bytes_returned += n_bytes;
4578
4579 if (!type.is_partial_io_warning_disabled()) {
4580
4581 const char* op = type.is_read()
4582 ? "read" : "written";
4583
4584 ib::warn()
4585 << n
4586 << " bytes should have been " << op << ". Only "
4587 << bytes_returned
4588 << " bytes " << op << ". Retrying"
4589 << " for the remaining bytes.";
4590 }
4591
4592 /* Advance the offset and buffer by n_bytes */
4593 sync_file_io.advance(n_bytes);
4594 }
4595
4596 *err = DB_IO_ERROR;
4597
4598 if (!type.is_partial_io_warning_disabled()) {
4599 ib::warn()
4600 << "Retry attempts for "
4601 << (type.is_read() ? "reading" : "writing")
4602 << " partial data failed.";
4603 }
4604
4605 return(bytes_returned);
4606 }
4607
4608 /** Does a synchronous write operation in Posix.
4609 @param[in] type IO context
4610 @param[in] file handle to an open file
4611 @param[out] buf buffer from which to write
4612 @param[in] n number of bytes to read, starting from offset
4613 @param[in] offset file offset from the start where to read
4614 @param[out] err DB_SUCCESS or error code
4615 @return number of bytes written, -1 if error */
4616 static MY_ATTRIBUTE((warn_unused_result))
4617 ssize_t
os_file_pwrite(const IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)4618 os_file_pwrite(
4619 const IORequest& type,
4620 os_file_t file,
4621 const byte* buf,
4622 ulint n,
4623 os_offset_t offset,
4624 dberr_t* err)
4625 {
4626 ut_ad(type.validate());
4627 ut_ad(type.is_write());
4628
4629 ++os_n_file_writes;
4630
4631 const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
4632 MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
4633 ssize_t n_bytes = os_file_io(type, file, const_cast<byte*>(buf),
4634 n, offset, err);
4635 MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
4636
4637 return(n_bytes);
4638 }
4639
4640 /** NOTE! Use the corresponding macro os_file_write(), not directly
4641 Requests a synchronous write operation.
4642 @param[in] type IO flags
4643 @param[in] file handle to an open file
4644 @param[out] buf buffer from which to write
4645 @param[in] offset file offset from the start where to read
4646 @param[in] n number of bytes to read, starting from offset
4647 @return error code
4648 @retval DB_SUCCESS if the operation succeeded */
4649 dberr_t
os_file_write_func(const IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)4650 os_file_write_func(
4651 const IORequest& type,
4652 const char* name,
4653 os_file_t file,
4654 const void* buf,
4655 os_offset_t offset,
4656 ulint n)
4657 {
4658 dberr_t err;
4659
4660 ut_ad(type.validate());
4661 ut_ad(n > 0);
4662
4663 WAIT_ALLOW_WRITES();
4664
4665 ssize_t n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err);
4666
4667 if ((ulint) n_bytes != n && !os_has_said_disk_full) {
4668
4669 ib::error()
4670 << "Write to file " << name << " failed at offset "
4671 << offset << ", " << n
4672 << " bytes should have been written,"
4673 " only " << n_bytes << " were written."
4674 " Operating system error number " << IF_WIN(GetLastError(),errno) << "."
4675 " Check that your OS and file system"
4676 " support files of this size."
4677 " Check also that the disk is not full"
4678 " or a disk quota exceeded.";
4679 #ifndef _WIN32
4680 if (strerror(errno) != NULL) {
4681
4682 ib::error()
4683 << "Error number " << errno
4684 << " means '" << strerror(errno) << "'";
4685 }
4686
4687 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4688 #endif
4689 os_has_said_disk_full = true;
4690 }
4691
4692 return(err);
4693 }
4694
4695 /** Does a synchronous read operation in Posix.
4696 @param[in] type IO flags
4697 @param[in] file handle to an open file
4698 @param[out] buf buffer where to read
4699 @param[in] offset file offset from the start where to read
4700 @param[in] n number of bytes to read, starting from offset
4701 @param[out] err DB_SUCCESS or error code
4702 @return number of bytes read, -1 if error */
4703 static MY_ATTRIBUTE((warn_unused_result))
4704 ssize_t
os_file_pread(const IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)4705 os_file_pread(
4706 const IORequest& type,
4707 os_file_t file,
4708 void* buf,
4709 ulint n,
4710 os_offset_t offset,
4711 dberr_t* err)
4712 {
4713 ut_ad(type.is_read());
4714
4715 ++os_n_file_reads;
4716
4717 const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
4718 MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
4719 ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
4720 MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
4721
4722 return(n_bytes);
4723 }
4724
4725 /** Requests a synchronous positioned read operation.
4726 @return DB_SUCCESS if request was successful, false if fail
4727 @param[in] type IO flags
4728 @param[in] file handle to an open file
4729 @param[out] buf buffer where to read
4730 @param[in] offset file offset from the start where to read
4731 @param[in] n number of bytes to read, starting from offset
4732 @param[out] o number of bytes actually read
4733 @param[in] exit_on_err if true then exit on error
4734 @return DB_SUCCESS or error code */
4735 static MY_ATTRIBUTE((warn_unused_result))
4736 dberr_t
os_file_read_page(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err)4737 os_file_read_page(
4738 const IORequest& type,
4739 os_file_t file,
4740 void* buf,
4741 os_offset_t offset,
4742 ulint n,
4743 ulint* o,
4744 bool exit_on_err)
4745 {
4746 dberr_t err;
4747
4748 os_bytes_read_since_printout += n;
4749
4750 ut_ad(type.validate());
4751 ut_ad(n > 0);
4752
4753 ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err);
4754
4755 if (o) {
4756 *o = n_bytes;
4757 }
4758
4759 if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) {
4760 return err;
4761 }
4762
4763 ib::error() << "Tried to read " << n << " bytes at offset "
4764 << offset << ", but was only able to read " << n_bytes;
4765
4766 if (!os_file_handle_error_cond_exit(
4767 NULL, "read", exit_on_err, false)) {
4768 ib::fatal()
4769 << "Cannot read from file. OS error number "
4770 << errno << ".";
4771 }
4772
4773 if (err == DB_SUCCESS) {
4774 err = DB_IO_ERROR;
4775 }
4776
4777 return err;
4778 }
4779
4780 /** Retrieves the last error number if an error occurs in a file io function.
4781 The number should be retrieved before any other OS calls (because they may
4782 overwrite the error number). If the number is not known to this program,
4783 the OS error number + 100 is returned.
4784 @param[in] report_all_errors true if we want an error printed
4785 for all errors
4786 @return error number, or OS error number + 100 */
4787 ulint
os_file_get_last_error(bool report_all_errors)4788 os_file_get_last_error(
4789 bool report_all_errors)
4790 {
4791 return(os_file_get_last_error_low(report_all_errors, false));
4792 }
4793
4794 /** Handle errors for file operations.
4795 @param[in] name name of a file or NULL
4796 @param[in] operation operation
4797 @param[in] should_abort whether to abort on an unknown error
4798 @param[in] on_error_silent whether to suppress reports of non-fatal errors
4799 @return true if we should retry the operation */
4800 static MY_ATTRIBUTE((warn_unused_result))
4801 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_abort,bool on_error_silent)4802 os_file_handle_error_cond_exit(
4803 const char* name,
4804 const char* operation,
4805 bool should_abort,
4806 bool on_error_silent)
4807 {
4808 ulint err;
4809
4810 err = os_file_get_last_error_low(false, on_error_silent);
4811
4812 switch (err) {
4813 case OS_FILE_DISK_FULL:
4814 /* We only print a warning about disk full once */
4815
4816 if (os_has_said_disk_full) {
4817
4818 return(false);
4819 }
4820
4821 /* Disk full error is reported irrespective of the
4822 on_error_silent setting. */
4823
4824 if (name) {
4825
4826 ib::error()
4827 << "Encountered a problem with file '"
4828 << name << "'";
4829 }
4830
4831 ib::error()
4832 << "Disk is full. Try to clean the disk to free space.";
4833
4834 os_has_said_disk_full = true;
4835
4836 return(false);
4837
4838 case OS_FILE_AIO_RESOURCES_RESERVED:
4839 case OS_FILE_AIO_INTERRUPTED:
4840
4841 return(true);
4842
4843 case OS_FILE_PATH_ERROR:
4844 case OS_FILE_ALREADY_EXISTS:
4845 case OS_FILE_ACCESS_VIOLATION:
4846
4847 return(false);
4848
4849 case OS_FILE_SHARING_VIOLATION:
4850
4851 os_thread_sleep(10000000); /* 10 sec */
4852 return(true);
4853
4854 case OS_FILE_OPERATION_ABORTED:
4855 case OS_FILE_INSUFFICIENT_RESOURCE:
4856
4857 os_thread_sleep(100000); /* 100 ms */
4858 return(true);
4859
4860 default:
4861
4862 /* If it is an operation that can crash on error then it
4863 is better to ignore on_error_silent and print an error message
4864 to the log. */
4865
4866 if (should_abort || !on_error_silent) {
4867 ib::error() << "File "
4868 << (name != NULL ? name : "(unknown)")
4869 << ": '" << operation << "'"
4870 " returned OS error " << err << "."
4871 << (should_abort
4872 ? " Cannot continue operation" : "");
4873 }
4874
4875 if (should_abort) {
4876 abort();
4877 }
4878 }
4879
4880 return(false);
4881 }
4882
4883 #ifndef _WIN32
4884 /** Tries to disable OS caching on an opened file descriptor.
4885 @param[in] fd file descriptor to alter
4886 @param[in] file_name file name, used in the diagnostic message
4887 @param[in] name "open" or "create"; used in the diagnostic
4888 message */
4889 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))4890 os_file_set_nocache(
4891 int fd MY_ATTRIBUTE((unused)),
4892 const char* file_name MY_ATTRIBUTE((unused)),
4893 const char* operation_name MY_ATTRIBUTE((unused)))
4894 {
4895 /* some versions of Solaris may not have DIRECTIO_ON */
4896 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
4897 if (directio(fd, DIRECTIO_ON) == -1) {
4898 int errno_save = errno;
4899
4900 ib::error()
4901 << "Failed to set DIRECTIO_ON on file "
4902 << file_name << "; " << operation_name << ": "
4903 << strerror(errno_save) << ","
4904 " continuing anyway.";
4905 }
4906 #elif defined(O_DIRECT)
4907 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
4908 int errno_save = errno;
4909 static bool warning_message_printed = false;
4910 if (errno_save == EINVAL) {
4911 if (!warning_message_printed) {
4912 warning_message_printed = true;
4913 # ifdef UNIV_LINUX
4914 ib::warn()
4915 << "Failed to set O_DIRECT on file"
4916 << file_name << "; " << operation_name
4917 << ": " << strerror(errno_save) << ", "
4918 "continuing anyway. O_DIRECT is "
4919 "known to result in 'Invalid argument' "
4920 "on Linux on tmpfs, "
4921 "see MySQL Bug#26662.";
4922 # else /* UNIV_LINUX */
4923 goto short_warning;
4924 # endif /* UNIV_LINUX */
4925 }
4926 } else {
4927 # ifndef UNIV_LINUX
4928 short_warning:
4929 # endif
4930 ib::warn()
4931 << "Failed to set O_DIRECT on file "
4932 << file_name << "; " << operation_name
4933 << " : " << strerror(errno_save)
4934 << ", continuing anyway.";
4935 }
4936 }
4937 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
4938 }
4939
4940 #endif /* _WIN32 */
4941
4942 /** Check if the file system supports sparse files.
4943 @param fh file handle
4944 @return true if the file system supports sparse files */
os_is_sparse_file_supported(os_file_t fh)4945 IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh)
4946 {
4947 /* In this debugging mode, we act as if punch hole is supported,
4948 then we skip any calls to actually punch a hole. In this way,
4949 Transparent Page Compression is still being tested. */
4950 DBUG_EXECUTE_IF("ignore_punch_hole",
4951 return(true);
4952 );
4953
4954 #ifdef _WIN32
4955 FILE_ATTRIBUTE_TAG_INFO info;
4956 if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
4957 &info, (DWORD)sizeof(info))) {
4958 if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) {
4959 return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
4960 }
4961 }
4962 return false;
4963 #else
4964 /* We don't know the FS block size, use the sector size. The FS
4965 will do the magic. */
4966 return DB_SUCCESS == os_file_punch_hole_posix(fh, 0, srv_page_size);
4967 #endif /* _WIN32 */
4968 }
4969
/** Extend a file.

On Windows, extending a file allocates blocks for the file,
unless the file is sparse.

On Unix, we will extend the file with ftruncate(), if
file needs to be sparse. Otherwise posix_fallocate() is used
when available, and if not, binary zeroes are added to the end
of file.

The zero-writing fallback never shrinks the file: if the file is
already at least as large as requested, true is returned. The
fallback loop also stops early once srv_shutdown_state goes past
SRV_SHUTDOWN_INITIATED, in which case false is returned.

@param[in]	name	file name
@param[in]	file	file handle
@param[in]	size	desired file size; must be a multiple of 4096
			(debug assertion)
@param[in]	is_sparse	whether to create a sparse file (no preallocating)
@return whether the operation succeeded */
bool
os_file_set_size(
	const char*	name,
	os_file_t	file,
	os_offset_t	size,
	bool		is_sparse)
{
	ut_ad(!(size & 4095));

#ifdef _WIN32
	/* On Windows, changing file size works well and as expected for both
	sparse and normal files.

	However, 10.2 up until 10.2.9 made every file sparse in innodb,
	causing NTFS fragmentation issues(MDEV-13941). We try to undo
	the damage, and unsparse the file.*/

	if (!is_sparse && os_is_sparse_file_supported(file)) {
		if (!os_file_set_sparse_win32(file, false))
			/* Unsparsing file failed. Fallback to writing binary
			zeros, to avoid even higher fragmentation.*/
			goto fallback;
	}

	return os_file_change_size_win32(name, file, size);

fallback:
#else
	struct stat statbuf;

	if (is_sparse) {
		/* A sparse file is simply resized; no blocks are
		preallocated. */
		bool success = !ftruncate(file, size);
		if (!success) {
			ib::error() << "ftruncate of file " << name << " to "
				<< size << " bytes failed with error "
				<< errno;
		}
		return(success);
	}

# ifdef HAVE_POSIX_FALLOCATE
	int err;
	/* Retry the preallocation while it is interrupted (EINTR),
	unless shutdown has progressed past SRV_SHUTDOWN_INITIATED. */
	do {
		if (fstat(file, &statbuf)) {
			err = errno;
		} else {
			os_offset_t current_size = statbuf.st_size;
			if (current_size >= size) {
				return true;
			}
			/* Start allocating from the last 4096-byte
			boundary at or below the current size. */
			current_size &= ~4095ULL;
			err = posix_fallocate(file, current_size,
					      size - current_size);
		}
	} while (err == EINTR
		 && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);

	switch (err) {
	case 0:
		return true;
	default:
		ib::error() << "preallocating "
			<< size << " bytes for file " << name
			<< " failed with error " << err;
		/* fall through */
	case EINTR:
		/* Make the failure reason available to the caller
		through errno. */
		errno = err;
		return false;
	case EINVAL:
	case EOPNOTSUPP:
		/* fall back to the code below */
		break;
	}
# endif /* HAVE_POSIX_FALLOCATE */
#endif /* _WIN32*/

	/* Fallback: extend the file by writing zero-filled buffers,
	starting from the current size rounded down to a sector
	(Windows) or 4096-byte (elsewhere) boundary. */
#ifdef _WIN32
	os_offset_t	current_size = os_file_get_size(file);
	FILE_STORAGE_INFO info;
	if (GetFileInformationByHandleEx(file, FileStorageInfo, &info,
					 sizeof info)) {
		if (info.LogicalBytesPerSector) {
			current_size &= ~os_offset_t(info.LogicalBytesPerSector
						     - 1);
		}
	}
#else
	if (fstat(file, &statbuf)) {
		return false;
	}
	os_offset_t current_size = statbuf.st_size & ~4095ULL;
#endif
	if (current_size >= size) {
		return true;
	}

	/* Write up to 64 pages at a time. */
	ulint	buf_size = ut_min(ulint(64),
				  ulint(size >> srv_page_size_shift))
		<< srv_page_size_shift;

	/* Align the buffer for possible raw i/o */
	byte*	buf2;

	/* One extra page is allocated so that an aligned buffer of
	buf_size bytes fits inside the allocation. */
	buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + srv_page_size));

	byte*	buf = static_cast<byte*>(ut_align(buf2, srv_page_size));

	/* Write buffer full of zeros */
	memset(buf, 0, buf_size);

	while (current_size < size
	       && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
		ulint	n_bytes;

		/* The last chunk may be shorter than the buffer. */
		if (size - current_size < (os_offset_t) buf_size) {
			n_bytes = (ulint) (size - current_size);
		} else {
			n_bytes = buf_size;
		}

		dberr_t		err;
		IORequest	request(IORequest::WRITE);

		err = os_file_write(
			request, name, file, buf, current_size, n_bytes);

		if (err != DB_SUCCESS) {
			break;
		}

		current_size += n_bytes;
	}

	ut_free(buf2);

	/* Success requires that the target size was reached and that
	the flush succeeded. */
	return(current_size >= size && os_file_flush(file));
}
5123
5124 /** Truncate a file to a specified size in bytes.
5125 @param[in] pathname file path
5126 @param[in] file file to be truncated
5127 @param[in] size size preserved in bytes
5128 @param[in] allow_shrink whether to allow the file to become smaller
5129 @return true if success */
5130 bool
os_file_truncate(const char * pathname,os_file_t file,os_offset_t size,bool allow_shrink)5131 os_file_truncate(
5132 const char* pathname,
5133 os_file_t file,
5134 os_offset_t size,
5135 bool allow_shrink)
5136 {
5137 if (!allow_shrink) {
5138 /* Do nothing if the size preserved is larger than or
5139 equal to the current size of file */
5140 os_offset_t size_bytes = os_file_get_size(file);
5141
5142 if (size >= size_bytes) {
5143 return(true);
5144 }
5145 }
5146
5147 #ifdef _WIN32
5148 return(os_file_change_size_win32(pathname, file, size));
5149 #else /* _WIN32 */
5150 return(os_file_truncate_posix(pathname, file, size));
5151 #endif /* _WIN32 */
5152 }
5153
5154 /** NOTE! Use the corresponding macro os_file_read(), not directly this
5155 function!
5156 Requests a synchronous positioned read operation.
5157 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5158 @param[in] type IO flags
5159 @param[in] file handle to an open file
5160 @param[out] buf buffer where to read
5161 @param[in] offset file offset from the start where to read
5162 @param[in] n number of bytes to read, starting from offset
5163 @return error code
5164 @retval DB_SUCCESS if the operation succeeded */
5165 dberr_t
os_file_read_func(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)5166 os_file_read_func(
5167 const IORequest& type,
5168 os_file_t file,
5169 void* buf,
5170 os_offset_t offset,
5171 ulint n)
5172 {
5173 return(os_file_read_page(type, file, buf, offset, n, NULL, true));
5174 }
5175
5176 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
5177 not directly this function!
5178 Requests a synchronous positioned read operation.
5179 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5180 @param[in] type IO flags
5181 @param[in] file handle to an open file
5182 @param[out] buf buffer where to read
5183 @param[in] offset file offset from the start where to read
5184 @param[in] n number of bytes to read, starting from offset
5185 @param[out] o number of bytes actually read
5186 @return DB_SUCCESS or error code */
5187 dberr_t
os_file_read_no_error_handling_func(const IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)5188 os_file_read_no_error_handling_func(
5189 const IORequest& type,
5190 os_file_t file,
5191 void* buf,
5192 os_offset_t offset,
5193 ulint n,
5194 ulint* o)
5195 {
5196 return(os_file_read_page(type, file, buf, offset, n, o, false));
5197 }
5198
5199 /** Check the existence and type of the given file.
5200 @param[in] path path name of file
5201 @param[out] exists true if the file exists
5202 @param[out] type Type of the file, if it exists
5203 @return true if call succeeded */
5204 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)5205 os_file_status(
5206 const char* path,
5207 bool* exists,
5208 os_file_type_t* type)
5209 {
5210 #ifdef _WIN32
5211 return(os_file_status_win32(path, exists, type));
5212 #else
5213 return(os_file_status_posix(path, exists, type));
5214 #endif /* _WIN32 */
5215 }
5216
5217 /** Free storage space associated with a section of the file.
5218 @param[in] fh Open file handle
5219 @param[in] off Starting offset (SEEK_SET)
5220 @param[in] len Size of the hole
5221 @return DB_SUCCESS or error code */
5222 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)5223 os_file_punch_hole(
5224 os_file_t fh,
5225 os_offset_t off,
5226 os_offset_t len)
5227 {
5228 #ifdef _WIN32
5229 return os_file_punch_hole_win32(fh, off, len);
5230 #else
5231 return os_file_punch_hole_posix(fh, off, len);
5232 #endif /* _WIN32 */
5233 }
5234
should_punch_hole() const5235 inline bool IORequest::should_punch_hole() const
5236 {
5237 return m_fil_node && m_fil_node->space->punch_hole;
5238 }
5239
5240 /** Free storage space associated with a section of the file.
5241 @param[in] fh Open file handle
5242 @param[in] off Starting offset (SEEK_SET)
5243 @param[in] len Size of the hole
5244 @return DB_SUCCESS or error code */
5245 dberr_t
punch_hole(os_file_t fh,os_offset_t off,ulint len)5246 IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
5247 {
5248 /* In this debugging mode, we act as if punch hole is supported,
5249 and then skip any calls to actually punch a hole here.
5250 In this way, Transparent Page Compression is still being tested. */
5251 DBUG_EXECUTE_IF("ignore_punch_hole",
5252 return(DB_SUCCESS);
5253 );
5254
5255 ulint trim_len = get_trim_length(len);
5256
5257 if (trim_len == 0) {
5258 return(DB_SUCCESS);
5259 }
5260
5261 off += len;
5262
5263 /* Check does file system support punching holes for this
5264 tablespace. */
5265 if (!should_punch_hole()) {
5266 return DB_IO_NO_PUNCH_HOLE;
5267 }
5268
5269 dberr_t err = os_file_punch_hole(fh, off, trim_len);
5270
5271 if (err == DB_SUCCESS) {
5272 srv_stats.page_compressed_trim_op.inc();
5273 } else {
5274 /* If punch hole is not supported,
5275 set space so that it is not used. */
5276 if (err == DB_IO_NO_PUNCH_HOLE) {
5277 if (m_fil_node) {
5278 m_fil_node->space->punch_hole = false;
5279 }
5280 err = DB_SUCCESS;
5281 }
5282 }
5283
5284 return (err);
5285 }
5286
5287 /** This function returns information about the specified file
5288 @param[in] path pathname of the file
5289 @param[out] stat_info information of a file in a directory
5290 @param[in] check_rw_perm for testing whether the file can be opened
5291 in RW mode
5292 @param[in] read_only true if file is opened in read-only mode
5293 @return DB_SUCCESS if all OK */
5294 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)5295 os_file_get_status(
5296 const char* path,
5297 os_file_stat_t* stat_info,
5298 bool check_rw_perm,
5299 bool read_only)
5300 {
5301 dberr_t ret;
5302
5303 #ifdef _WIN32
5304 struct _stat64 info;
5305
5306 ret = os_file_get_status_win32(
5307 path, stat_info, &info, check_rw_perm, read_only);
5308
5309 #else
5310 struct stat info;
5311
5312 ret = os_file_get_status_posix(
5313 path, stat_info, &info, check_rw_perm, read_only);
5314
5315 #endif /* _WIN32 */
5316
5317 if (ret == DB_SUCCESS) {
5318 stat_info->ctime = info.st_ctime;
5319 stat_info->atime = info.st_atime;
5320 stat_info->mtime = info.st_mtime;
5321 stat_info->size = info.st_size;
5322 }
5323
5324 return(ret);
5325 }
5326
5327 /**
5328 Waits for an AIO operation to complete. This function is used to wait the
5329 for completed requests. The aio array of pending requests is divided
5330 into segments. The thread specifies which segment or slot it wants to wait
5331 for. NOTE: this function will also take care of freeing the aio slot,
5332 therefore no other thread is allowed to do the freeing!
5333 @param[in] segment The number of the segment in the aio arrays to
5334 wait for; segment 0 is the ibuf I/O thread,
5335 segment 1 the log I/O thread, then follow the
5336 non-ibuf read threads, and as the last are the
5337 non-ibuf write threads; if this is
5338 ULINT_UNDEFINED, then it means that sync AIO
5339 is used, and this parameter is ignored
5340 @param[out] m1 the messages passed with the AIO request; note
5341 that also in the case where the AIO operation
5342 failed, these output parameters are valid and
5343 can be used to restart the operation,
5344 for example
5345 @param[out] m2 callback message
5346 @param[out] type OS_FILE_WRITE or ..._READ
5347 @return DB_SUCCESS or error code */
5348 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)5349 os_aio_handler(
5350 ulint segment,
5351 fil_node_t** m1,
5352 void** m2,
5353 IORequest* request)
5354 {
5355 dberr_t err;
5356
5357 if (srv_use_native_aio) {
5358 srv_set_io_thread_op_info(segment, "native aio handle");
5359
5360 #ifdef WIN_ASYNC_IO
5361
5362 err = os_aio_windows_handler(segment, 0, m1, m2, request);
5363
5364 #elif defined(LINUX_NATIVE_AIO)
5365
5366 err = os_aio_linux_handler(segment, m1, m2, request);
5367
5368 #else
5369 ut_error;
5370
5371 err = DB_ERROR; /* Eliminate compiler warning */
5372
5373 #endif /* WIN_ASYNC_IO */
5374
5375 } else {
5376 srv_set_io_thread_op_info(segment, "simulated aio handle");
5377
5378 err = os_aio_simulated_handler(segment, m1, m2, request);
5379 }
5380
5381 return(err);
5382 }
5383
#ifdef WIN_ASYNC_IO
/** Creates a new I/O completion port that is not associated with any
file handle. Failure to create the port trips an assertion.
@return handle of the new completion port */
static HANDLE new_completion_port()
{
	const HANDLE	h = CreateIoCompletionPort(
		INVALID_HANDLE_VALUE, 0, 0, 0);

	ut_a(h);

	return h;
}
#endif
5392
5393 /** Constructor
5394 @param[in] id The latch ID
5395 @param[in] n Number of AIO slots
5396 @param[in] segments Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)5397 AIO::AIO(
5398 latch_id_t id,
5399 ulint n,
5400 ulint segments)
5401 :
5402 m_slots(n),
5403 m_n_segments(segments),
5404 m_n_reserved()
5405 # ifdef LINUX_NATIVE_AIO
5406 ,m_events(m_slots.size())
5407 # endif /* LINUX_NATIVE_AIO */
5408 #ifdef WIN_ASYNC_IO
5409 ,m_completion_port(new_completion_port())
5410 #endif
5411 {
5412 ut_a(n > 0);
5413 ut_a(m_n_segments > 0);
5414
5415 mutex_create(id, &m_mutex);
5416
5417 m_not_full = os_event_create("aio_not_full");
5418 m_is_empty = os_event_create("aio_is_empty");
5419
5420 memset((void*)&m_slots[0], 0x0, sizeof(m_slots[0]) * m_slots.size());
5421 #ifdef LINUX_NATIVE_AIO
5422 memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
5423 #endif /* LINUX_NATIVE_AIO */
5424
5425 os_event_set(m_is_empty);
5426 }
5427
5428 /** Initialise the slots */
5429 dberr_t
init_slots()5430 AIO::init_slots()
5431 {
5432 for (ulint i = 0; i < m_slots.size(); ++i) {
5433 Slot& slot = m_slots[i];
5434
5435 slot.pos = static_cast<uint16_t>(i);
5436
5437 slot.is_reserved = false;
5438
5439 #ifdef WIN_ASYNC_IO
5440
5441 slot.array = this;
5442
5443 #elif defined(LINUX_NATIVE_AIO)
5444
5445 slot.ret = 0;
5446
5447 slot.n_bytes = 0;
5448
5449 memset(&slot.control, 0x0, sizeof(slot.control));
5450
5451 #endif /* WIN_ASYNC_IO */
5452 }
5453
5454 return(DB_SUCCESS);
5455 }
5456
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface.
One io_context_t is created per segment. If creating any of them fails,
the contexts created so far are destroyed, native AIO is switched off
(srv_use_native_aio = FALSE) and DB_SUCCESS is still returned.
@return DB_SUCCESS */
dberr_t
AIO::init_linux_native_aio()
{
	/* Initialize the io_context_t array. One io_context_t
	per segment in the array. */
	m_aio_ctx.resize(get_n_segments());

	ulint		max_events = slots_per_segment();
	const ulint	n_ctx = m_aio_ctx.size();

	for (ulint i = 0; i < n_ctx; ++i) {

		if (linux_create_io_ctx(max_events, m_aio_ctx[i])) {
			continue;
		}

		/* If something bad happened during aio setup
		we disable linux native aio.
		This frequently happens when running the test suite
		with many threads on a system with low fs.aio-max-nr!
		*/

		ib::warn()
			<< "Warning: Linux Native AIO disabled "
			<< "because _linux_create_io_ctx() "
			<< "failed. To get rid of this warning you can "
			<< "try increasing system "
			<< "fs.aio-max-nr to 1048576 or larger or "
			<< "setting innodb_use_native_aio = 0 in my.cnf";

		/* Release the contexts that were created before the
		failing one. */
		for (ulint j = 0; j < i; ++j) {
			int	ret = io_destroy(m_aio_ctx[j]);
			ut_a(ret != -EINVAL);
		}

		m_aio_ctx.clear();
		srv_use_native_aio = FALSE;

		return(DB_SUCCESS);
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
5504
5505 /** Initialise the array */
5506 dberr_t
init()5507 AIO::init()
5508 {
5509 ut_a(!m_slots.empty());
5510
5511
5512 if (srv_use_native_aio) {
5513 #ifdef LINUX_NATIVE_AIO
5514 dberr_t err = init_linux_native_aio();
5515
5516 if (err != DB_SUCCESS) {
5517 return(err);
5518 }
5519
5520 #endif /* LINUX_NATIVE_AIO */
5521 }
5522
5523 return(init_slots());
5524 }
5525
5526 /** Creates an aio wait array. Note that we return NULL in case of failure.
5527 We don't care about freeing memory here because we assume that a
5528 failure will result in server refusing to start up.
5529 @param[in] id Latch ID
5530 @param[in] n maximum number of pending AIO operations
5531 allowed; n must be divisible by m_n_segments
5532 @param[in] n_segments number of segments in the AIO array
5533 @return own: AIO array, NULL on failure */
5534 AIO*
create(latch_id_t id,ulint n,ulint n_segments)5535 AIO::create(
5536 latch_id_t id,
5537 ulint n,
5538 ulint n_segments)
5539 {
5540 if ((n % n_segments)) {
5541
5542 ib::error()
5543 << "Maximum number of AIO operations must be "
5544 << "divisible by number of segments";
5545
5546 return(NULL);
5547 }
5548
5549 AIO* array = UT_NEW_NOKEY(AIO(id, n, n_segments));
5550
5551 if (array != NULL && array->init() != DB_SUCCESS) {
5552
5553 UT_DELETE(array);
5554
5555 array = NULL;
5556 }
5557
5558 return(array);
5559 }
5560
5561 /** AIO destructor */
~AIO()5562 AIO::~AIO()
5563 {
5564 mutex_destroy(&m_mutex);
5565
5566 os_event_destroy(m_not_full);
5567 os_event_destroy(m_is_empty);
5568
5569 #if defined(LINUX_NATIVE_AIO)
5570 if (srv_use_native_aio) {
5571 for (ulint i = 0; i < m_aio_ctx.size(); i++) {
5572 int ret = io_destroy(m_aio_ctx[i]);
5573 ut_a(ret != -EINVAL);
5574 }
5575 }
5576 #endif /* LINUX_NATIVE_AIO */
5577 #if defined(WIN_ASYNC_IO)
5578 CloseHandle(m_completion_port);
5579 #endif
5580 }
5581
/** Initializes the asynchronous io system. Creates one array each for ibuf
and log i/o (skipped in read-only mode). Also creates one array each for
read and write where each array is divided logically into n_readers and
n_writers respectively. The caller must create an i/o handler thread for
each segment in these arrays. This function also creates the sync array.
No i/o handler thread needs to be created for that.
When native AIO is not in use, one wait event per segment is created too.
On failure the arrays created so far are left in place.
@param[in]	n_per_seg	maximum number of pending aio
				operations allowed per segment
@param[in]	n_readers	number of reader threads
@param[in]	n_writers	number of writer threads
@param[in]	n_slots_sync	number of slots in the sync aio array
@return true if the AIO sub-system was started successfully */
bool
AIO::start(
	ulint		n_per_seg,
	ulint		n_readers,
	ulint		n_writers,
	ulint		n_slots_sync)
{
#if defined(LINUX_NATIVE_AIO)
	/* Check if native aio is supported on this system and tmpfs */
	if (srv_use_native_aio && !is_linux_native_aio_supported()) {

		ib::warn() << "Linux Native AIO disabled.";

		srv_use_native_aio = FALSE;
	}
#endif /* LINUX_NATIVE_AIO */

	srv_reset_io_thread_op_info();

	s_reads = create(
		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);

	if (s_reads == NULL) {
		return(false);
	}

	/* Global segment number of the first read segment: in
	read-write mode segments 0 and 1 are taken by ibuf and log. */
	ulint	start = srv_read_only_mode ? 0 : 2;
	ulint	n_segs = n_readers + start;

	/* 0 is the ibuf segment and 1 is the redo log segment. */
	for (ulint i = start; i < n_segs; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "read thread";
	}

	/* Running total of global segments created so far. */
	ulint	n_segments = n_readers;

	if (!srv_read_only_mode) {

		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);

		if (s_ibuf == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[0] = "insert buffer thread";

		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);

		if (s_log == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[1] = "log thread";

	} else {
		/* No ibuf or log array in read-only mode. */
		s_ibuf = s_log = NULL;
	}

	s_writes = create(
		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);

	if (s_writes == NULL) {
		return(false);
	}

#ifdef WIN_ASYNC_IO
	/* The data completion port is the one of the write array; the
	log port falls back to it when there is no log array. */
	data_completion_port = s_writes->m_completion_port;
	log_completion_port =
		s_log ? s_log->m_completion_port : data_completion_port;
#endif

	n_segments += n_writers;

	/* The write segments follow directly after the read segments. */
	for (ulint i = start + n_readers; i < n_segments; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "write thread";
	}

	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));

	/* The sync array is a single segment and is not counted in
	n_segments. */
	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);

	if (s_sync == NULL) {

		return(false);
	}

	os_aio_n_segments = n_segments;

	os_aio_validate();

	os_last_printout = time(NULL);

	/* The per-segment wait events below are created only when
	native AIO is not in use. */
	if (srv_use_native_aio) {
		return(true);
	}

	os_aio_segment_wait_events = static_cast<os_event_t*>(
		ut_zalloc_nokey(
			n_segments * sizeof *os_aio_segment_wait_events));

	if (os_aio_segment_wait_events == NULL) {

		return(false);
	}

	for (ulint i = 0; i < n_segments; ++i) {
		os_aio_segment_wait_events[i] = os_event_create(0);
	}

	return(true);
}
5711
5712 /** Free the AIO arrays */
5713 void
shutdown()5714 AIO::shutdown()
5715 {
5716 UT_DELETE(s_ibuf);
5717 s_ibuf = NULL;
5718
5719 UT_DELETE(s_log);
5720 s_log = NULL;
5721
5722 UT_DELETE(s_writes);
5723 s_writes = NULL;
5724
5725 UT_DELETE(s_sync);
5726 s_sync = NULL;
5727
5728 UT_DELETE(s_reads);
5729 s_reads = NULL;
5730 }
5731
5732 /** Initializes the asynchronous io system. Creates one array each for ibuf
5733 and log i/o. Also creates one array each for read and write where each
5734 array is divided logically into n_readers and n_writers
5735 respectively. The caller must create an i/o handler thread for each
5736 segment in these arrays. This function also creates the sync array.
5737 No i/o handler thread needs to be created for that
5738 @param[in] n_readers number of reader threads
5739 @param[in] n_writers number of writer threads
5740 @param[in] n_slots_sync number of slots in the sync aio array */
5741 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)5742 os_aio_init(
5743 ulint n_readers,
5744 ulint n_writers,
5745 ulint n_slots_sync)
5746 {
5747 /* Maximum number of pending aio operations allowed per segment */
5748 ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
5749
5750 return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
5751 }
5752
5753 /** Frees the asynchronous io system. */
5754 void
os_aio_free()5755 os_aio_free()
5756 {
5757 AIO::shutdown();
5758
5759 ut_ad(!os_aio_segment_wait_events || !srv_use_native_aio);
5760 ut_ad(srv_use_native_aio || os_aio_segment_wait_events
5761 || !srv_was_started);
5762
5763 if (!srv_use_native_aio && os_aio_segment_wait_events) {
5764 for (ulint i = 0; i < os_aio_n_segments; i++) {
5765 os_event_destroy(os_aio_segment_wait_events[i]);
5766 }
5767
5768 ut_free(os_aio_segment_wait_events);
5769 os_aio_segment_wait_events = 0;
5770 }
5771 os_aio_n_segments = 0;
5772 }
5773
5774 /** Wakes up all async i/o threads so that they know to exit themselves in
5775 shutdown. */
5776 void
os_aio_wake_all_threads_at_shutdown()5777 os_aio_wake_all_threads_at_shutdown()
5778 {
5779 #ifdef WIN_ASYNC_IO
5780 AIO::wake_at_shutdown();
5781 #elif defined(LINUX_NATIVE_AIO)
5782 /* When using native AIO interface the io helper threads
5783 wait on io_getevents with a timeout value of 500ms. At
5784 each wake up these threads check the server status.
5785 No need to do anything to wake them up. */
5786 #endif /* !WIN_ASYNC_AIO */
5787
5788 if (srv_use_native_aio) {
5789 return;
5790 }
5791
5792 /* This loop wakes up all simulated ai/o threads */
5793
5794 for (ulint i = 0; i < os_aio_n_segments; ++i) {
5795
5796 os_event_set(os_aio_segment_wait_events[i]);
5797 }
5798 }
5799
/** Waits until there are no pending writes in AIO::s_writes. There can
be other, synchronous, pending writes.
This is a free-function wrapper; the waiting itself is done by
AIO::wait_until_no_pending_writes(). */
void
os_aio_wait_until_no_pending_writes()
{
	AIO::wait_until_no_pending_writes();
}
5807
5808 /** Calculates segment number for a slot.
5809 @param[in] array AIO wait array
5810 @param[in] slot slot in this array
5811 @return segment number (which is the number used by, for example,
5812 I/O-handler threads) */
5813 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)5814 AIO::get_segment_no_from_slot(
5815 const AIO* array,
5816 const Slot* slot)
5817 {
5818 ulint segment;
5819 ulint seg_len;
5820
5821 if (array == s_ibuf) {
5822 ut_ad(!srv_read_only_mode);
5823
5824 segment = IO_IBUF_SEGMENT;
5825
5826 } else if (array == s_log) {
5827 ut_ad(!srv_read_only_mode);
5828
5829 segment = IO_LOG_SEGMENT;
5830
5831 } else if (array == s_reads) {
5832 seg_len = s_reads->slots_per_segment();
5833
5834 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
5835 } else {
5836 ut_a(array == s_writes);
5837
5838 seg_len = s_writes->slots_per_segment();
5839
5840 segment = s_reads->m_n_segments
5841 + (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
5842 }
5843
5844 return(segment);
5845 }
5846
5847 /** Requests for a slot in the aio array. If no slot is available, waits until
5848 not_full-event becomes signaled.
5849
5850 @param[in] type IO context
5851 @param[in,out] m1 message to be passed along with the AIO
5852 operation
5853 @param[in,out] m2 message to be passed along with the AIO
5854 operation
5855 @param[in] file file handle
5856 @param[in] name name of the file or path as a NUL-terminated
5857 string
5858 @param[in,out] buf buffer where to read or from which to write
5859 @param[in] offset file offset, where to read from or start writing
5860 @param[in] len length of the block to read or write
5861 @return pointer to slot */
5862 Slot*
reserve_slot(const IORequest & type,fil_node_t * m1,void * m2,pfs_os_file_t file,const char * name,void * buf,os_offset_t offset,ulint len)5863 AIO::reserve_slot(
5864 const IORequest& type,
5865 fil_node_t* m1,
5866 void* m2,
5867 pfs_os_file_t file,
5868 const char* name,
5869 void* buf,
5870 os_offset_t offset,
5871 ulint len)
5872 {
5873 ut_ad(reinterpret_cast<size_t>(buf) % OS_FILE_LOG_BLOCK_SIZE == 0);
5874 ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
5875 ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
5876
5877 #ifdef WIN_ASYNC_IO
5878 ut_a((len & 0xFFFFFFFFUL) == len);
5879 #endif /* WIN_ASYNC_IO */
5880
5881 /* No need of a mutex. Only reading constant fields */
5882 ulint slots_per_seg;
5883
5884 ut_ad(type.validate());
5885
5886 slots_per_seg = slots_per_segment();
5887
5888 /* We attempt to keep adjacent blocks in the same local
5889 segment. This can help in merging IO requests when we are
5890 doing simulated AIO */
5891 ulint local_seg;
5892
5893 local_seg = (offset >> (srv_page_size_shift + 6)) % m_n_segments;
5894
5895 for (;;) {
5896
5897 acquire();
5898
5899 if (m_n_reserved != m_slots.size()) {
5900 break;
5901 }
5902
5903 release();
5904
5905 if (!srv_use_native_aio) {
5906 /* If the handler threads are suspended,
5907 wake them so that we get more slots */
5908
5909 os_aio_simulated_wake_handler_threads();
5910 }
5911
5912 os_event_wait(m_not_full);
5913 }
5914
5915 ulint counter = 0;
5916 Slot* slot = NULL;
5917
5918 /* We start our search for an available slot from our preferred
5919 local segment and do a full scan of the array. We are
5920 guaranteed to find a slot in full scan. */
5921 for (ulint i = local_seg * slots_per_seg;
5922 counter < m_slots.size();
5923 ++i, ++counter) {
5924
5925 i %= m_slots.size();
5926
5927 slot = at(i);
5928
5929 if (slot->is_reserved == false) {
5930 break;
5931 }
5932 }
5933
5934 /* We MUST always be able to get hold of a reserved slot. */
5935 ut_a(counter < m_slots.size());
5936
5937 ut_a(slot->is_reserved == false);
5938
5939 ++m_n_reserved;
5940
5941 if (m_n_reserved == 1) {
5942 os_event_reset(m_is_empty);
5943 }
5944
5945 if (m_n_reserved == m_slots.size()) {
5946 os_event_reset(m_not_full);
5947 }
5948
5949 slot->is_reserved = true;
5950 slot->reservation_time = time(NULL);
5951 slot->m1 = m1;
5952 slot->m2 = m2;
5953 slot->file = file;
5954 slot->name = name;
5955 #ifdef _WIN32
5956 slot->len = static_cast<DWORD>(len);
5957 #else
5958 slot->len = len;
5959 #endif /* _WIN32 */
5960 slot->type = type;
5961 slot->buf = static_cast<byte*>(buf);
5962 slot->ptr = slot->buf;
5963 slot->offset = offset;
5964 slot->err = DB_SUCCESS;
5965 slot->original_len = static_cast<uint32>(len);
5966 slot->io_already_done = false;
5967 slot->buf = static_cast<byte*>(buf);
5968
5969 #ifdef WIN_ASYNC_IO
5970 {
5971 OVERLAPPED* control;
5972
5973 control = &slot->control;
5974 control->Offset = (DWORD) offset & 0xFFFFFFFF;
5975 control->OffsetHigh = (DWORD) (offset >> 32);
5976 }
5977 #elif defined(LINUX_NATIVE_AIO)
5978
5979 /* If we are not using native AIO skip this part. */
5980 if (srv_use_native_aio) {
5981
5982 off_t aio_offset;
5983
5984 /* Check if we are dealing with 64 bit arch.
5985 If not then make sure that offset fits in 32 bits. */
5986 aio_offset = (off_t) offset;
5987
5988 ut_a(sizeof(aio_offset) >= sizeof(offset)
5989 || ((os_offset_t) aio_offset) == offset);
5990
5991 struct iocb* iocb = &slot->control;
5992
5993 if (type.is_read()) {
5994
5995 io_prep_pread(
5996 iocb, file, slot->ptr, slot->len, aio_offset);
5997 } else {
5998 ut_ad(type.is_write());
5999
6000 io_prep_pwrite(
6001 iocb, file, slot->ptr, slot->len, aio_offset);
6002 }
6003
6004 iocb->data = slot;
6005
6006 slot->n_bytes = 0;
6007 slot->ret = 0;
6008 }
6009 #endif /* LINUX_NATIVE_AIO */
6010
6011 release();
6012
6013 return(slot);
6014 }
6015
6016 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
6017 @param[in] global_segment The number of the segment in the AIO arrays */
6018 void
wake_simulated_handler_thread(ulint global_segment)6019 AIO::wake_simulated_handler_thread(ulint global_segment)
6020 {
6021 ut_ad(!srv_use_native_aio);
6022
6023 AIO* array;
6024 ulint segment = get_array_and_local_segment(&array, global_segment);
6025
6026 array->wake_simulated_handler_thread(global_segment, segment);
6027 }
6028
6029 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
6030 for a local segment in the AIO array.
6031 @param[in] global_segment The number of the segment in the AIO arrays
6032 @param[in] segment The local segment in the AIO array */
6033 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)6034 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
6035 {
6036 ut_ad(!srv_use_native_aio);
6037
6038 ulint n = slots_per_segment();
6039 ulint offset = segment * n;
6040
6041 /* Look through n slots after the segment * n'th slot */
6042
6043 acquire();
6044
6045 const Slot* slot = at(offset);
6046
6047 for (ulint i = 0; i < n; ++i, ++slot) {
6048
6049 if (slot->is_reserved) {
6050
6051 /* Found an i/o request */
6052
6053 release();
6054
6055 os_event_t event;
6056
6057 event = os_aio_segment_wait_events[global_segment];
6058
6059 os_event_set(event);
6060
6061 return;
6062 }
6063 }
6064
6065 release();
6066 }
6067
6068 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
6069 void
os_aio_simulated_wake_handler_threads()6070 os_aio_simulated_wake_handler_threads()
6071 {
6072 if (srv_use_native_aio) {
6073 /* We do not use simulated aio: do nothing */
6074
6075 return;
6076 }
6077
6078 os_aio_recommend_sleep_for_read_threads = false;
6079
6080 for (ulint i = 0; i < os_aio_n_segments; i++) {
6081 AIO::wake_simulated_handler_thread(i);
6082 }
6083 }
6084
6085 /** Select the IO slot array
6086 @param[in,out] type Type of IO, READ or WRITE
6087 @param[in] read_only true if running in read-only mode
6088 @param[in] mode IO mode
6089 @return slot array or NULL if invalid mode specified */
6090 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)6091 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
6092 {
6093 AIO* array;
6094
6095 ut_ad(type.validate());
6096
6097 switch (mode) {
6098 case OS_AIO_NORMAL:
6099
6100 array = type.is_read() ? AIO::s_reads : AIO::s_writes;
6101 break;
6102
6103 case OS_AIO_IBUF:
6104 ut_ad(type.is_read());
6105
6106 /* Reduce probability of deadlock bugs in connection with ibuf:
6107 do not let the ibuf i/o handler sleep */
6108
6109 type.clear_do_not_wake();
6110
6111 array = read_only ? AIO::s_reads : AIO::s_ibuf;
6112 break;
6113
6114 case OS_AIO_LOG:
6115
6116 array = read_only ? AIO::s_reads : AIO::s_log;
6117 break;
6118
6119 case OS_AIO_SYNC:
6120
6121 array = AIO::s_sync;
6122 #if defined(LINUX_NATIVE_AIO)
6123 /* In Linux native AIO we don't use sync IO array. */
6124 ut_a(!srv_use_native_aio);
6125 #endif /* LINUX_NATIVE_AIO */
6126 break;
6127
6128 default:
6129 ut_error;
6130 array = NULL; /* Eliminate compiler warning */
6131 }
6132
6133 return(array);
6134 }
6135
6136 #ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests on the completion port of the array that owns
the given segment. NOTE: this function will also take care of freeing
the aio slot, therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; must not be
				ULINT_UNDEFINED (asserted below)
@param[in]	pos		not referenced by this function
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example; set to NULL when the shutdown
				key is received
@param[out]	m2		callback message; set to NULL when the
				shutdown key is received
@param[out]	type		OS_FILE_WRITE or ..._READ; left untouched
				when the shutdown key is received
@return DB_SUCCESS or error code */



static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot= 0;
	dberr_t		err;

	BOOL		ret;
	ULONG_PTR	key;

	ut_a(segment != ULINT_UNDEFINED);

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());
	AIO *my_array;
	AIO::get_array_and_local_segment(&my_array, segment);

	HANDLE port = my_array->m_completion_port;
	ut_ad(port);
	for (;;) {
		DWORD len;
		/* The dequeued OVERLAPPED pointer is stored directly
		into slot. NOTE(review): this presumably relies on the
		OVERLAPPED control block being the first member of
		Slot -- confirm against the Slot definition. */
		ret = GetQueuedCompletionStatus(port, &len, &key,
			(OVERLAPPED **)&slot, INFINITE);

		/* If shutdown key was received, repost the shutdown message and exit */
		if (ret && key == IOCP_SHUTDOWN_KEY) {
			PostQueuedCompletionStatus(port, 0, key, NULL);
			*m1 = NULL;
			*m2 = NULL;
			return (DB_SUCCESS);
		}

		ut_a(slot);

		if (!ret) {
			/* IO failed */
			break;
		}

		slot->n_bytes= len;
		ut_a(slot->array);
		HANDLE slot_port = slot->array->m_completion_port;
		if (slot_port != port) {
			/* there are no redirections between data and log */
			ut_ad(port == data_completion_port);
			ut_ad(slot_port != log_completion_port);

			/*
			Redirect completions to the dedicated completion port
			and threads.

			"Write array" threads receive write,read and ibuf
			notifications, read and ibuf completions are redirected.

			Forwarding IO completion this way costs a context switch,
			and this seems tolerable since asynchronous reads are by
			far less frequent.
			*/
			ut_a(PostQueuedCompletionStatus(slot_port,
				len, key, &slot->control));
		}
		else {
			/* The completion belongs to this port. */
			break;
		}
	}

	ut_a(slot->is_reserved);

	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	bool retry = false;

	/* Success needs both a successful dequeue and a full-length
	transfer. Otherwise os_file_handle_error() decides between a
	synchronous retry and DB_IO_ERROR; err is assigned on every
	path before it is read. */
	if (ret && slot->n_bytes == slot->len) {

		err = DB_SUCCESS;

	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		retry = true;

	} else {

		err = DB_IO_ERROR;
	}


	if (retry) {
		/* Retry failed read/write operation synchronously. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		PSI_file_locker_state	state;
		struct PSI_file_locker* locker = NULL;

		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	/* Release the slot back to the array that owns it. */
	slot->array->release_with_mutex(slot);

	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
		&& !buf_page_cleaner_is_active
		&& os_aio_all_slots_free()) {
		/* Last IO, wakeup other io threads */
		AIO::wake_at_shutdown();
	}
	return(err);
}
6300 #endif /* WIN_ASYNC_IO */
6301
/**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation. With mode OS_AIO_SYNC the
request is executed synchronously through os_file_read_func() or
os_file_write_func() instead. Otherwise a slot is reserved in the array
selected for the mode and the request is either submitted to the
operating system (native AIO) or left in the slot for a simulated AIO
handler thread, which is woken when the request asks for it.
@param[in,out]	type		IO request context
@param[in]	mode		IO mode
@param[in]	name		Name of the file or path as NUL terminated
				string
@param[in]	file		Open file handle
@param[in,out]	buf		buffer where to read, or from which to write
@param[in]	offset		file offset where to read or write
@param[in]	n		number of bytes to read or write
@param[in]	read_only	if true read only mode checks are enforced
@param[in,out]	m1		Message for the AIO handler, (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in,out]	m2		message for the AIO handler (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC

@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
	IORequest&	type,
	ulint		mode,
	const char*	name,
	pfs_os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	bool		read_only,
	fil_node_t*	m1,
	void*		m2)
{
#ifdef WIN_ASYNC_IO
	/* Stays TRUE unless ReadFile()/WriteFile() below reports
	failure. */
	BOOL		ret = TRUE;
#endif /* WIN_ASYNC_IO */

	ut_ad(n > 0);
	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad(os_aio_validate_skip());

#ifdef WIN_ASYNC_IO
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */

	/* Debug injection point: force the synchronous path. */
	DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
			mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);

	if (mode == OS_AIO_SYNC) {
		/* Synchronous request: no slot is reserved and m1/m2
		are not used. */
		if (type.is_read()) {
			return(os_file_read_func(type, file, buf, offset, n));
		}

		ut_ad(type.is_write());

		return(os_file_write_func(type, name, file, buf, offset, n));
	}

try_again:

	AIO*	array;

	array = AIO::select_slot_array(type, read_only, mode);

	Slot*	slot;

	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);

	if (type.is_read()) {


		if (srv_use_native_aio) {

			++os_n_file_reads;

			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(
				file, slot->ptr, slot->len,
				NULL, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else if (type.is_wake()) {
			/* Simulated AIO: the request stays in the slot;
			wake the handler thread of its segment. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else if (type.is_write()) {

		if (srv_use_native_aio) {
			++os_n_file_writes;

#ifdef WIN_ASYNC_IO
			ret = WriteFile(
				file, slot->ptr, slot->len,
				NULL, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */

		} else if (type.is_wake()) {
			/* Simulated AIO: the request stays in the slot;
			wake the handler thread of its segment. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else {
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
		/* aio completed or was queued successfully! */
		return(DB_SUCCESS);
	}

	goto err_exit;

#endif /* WIN_ASYNC_IO */

	/* AIO request was queued successfully! */
	return(DB_SUCCESS);

#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */

	/* Submission failed: give the slot back, then let
	os_file_handle_error() decide whether to start over with a
	freshly reserved slot. */
	array->release_with_mutex(slot);

	if (os_file_handle_error(
		name, type.is_read() ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(DB_IO_ERROR);
}
6442
6443 /** Simulated AIO handler for reaping IO requests */
6444 class SimulatedAIOHandler {
6445
6446 public:
6447
6448 /** Constructor
6449 @param[in,out] array The AIO array
6450 @param[in] segment Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)6451 SimulatedAIOHandler(AIO* array, ulint segment)
6452 :
6453 m_oldest(),
6454 m_n_elems(),
6455 m_lowest_offset(IB_UINT64_MAX),
6456 m_array(array),
6457 m_n_slots(),
6458 m_segment(segment),
6459 m_ptr(),
6460 m_buf()
6461 {
6462 ut_ad(m_segment < 100);
6463
6464 m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
6465 }
6466
6467 /** Destructor */
~SimulatedAIOHandler()6468 ~SimulatedAIOHandler()
6469 {
6470 if (m_ptr != NULL) {
6471 ut_free(m_ptr);
6472 }
6473 }
6474
6475 /** Reset the state of the handler
6476 @param[in] n_slots Number of pending AIO operations supported */
init(ulint n_slots)6477 void init(ulint n_slots)
6478 {
6479 m_oldest = 0;
6480 m_n_elems = 0;
6481 m_n_slots = n_slots;
6482 m_lowest_offset = IB_UINT64_MAX;
6483
6484 if (m_ptr != NULL) {
6485 ut_free(m_ptr);
6486 m_ptr = m_buf = NULL;
6487 }
6488
6489 m_slots[0] = NULL;
6490 }
6491
6492 /** Check if there is a slot for which the i/o has already been done
6493 @param[out] n_reserved Number of reserved slots
6494 @return the first completed slot that is found. */
check_completed(ulint * n_reserved)6495 Slot* check_completed(ulint* n_reserved)
6496 {
6497 ulint offset = m_segment * m_n_slots;
6498
6499 *n_reserved = 0;
6500
6501 Slot* slot;
6502
6503 slot = m_array->at(offset);
6504
6505 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6506
6507 if (slot->is_reserved) {
6508
6509 if (slot->io_already_done) {
6510
6511 ut_a(slot->is_reserved);
6512
6513 return(slot);
6514 }
6515
6516 ++*n_reserved;
6517 }
6518 }
6519
6520 return(NULL);
6521 }
6522
6523 /** If there are at least 2 seconds old requests, then pick the
6524 oldest one to prevent starvation. If several requests have the
6525 same age, then pick the one at the lowest offset.
6526 @return true if request was selected */
select()6527 bool select()
6528 {
6529 if (!select_oldest()) {
6530
6531 return(select_lowest_offset());
6532 }
6533
6534 return(true);
6535 }
6536
6537 /** Check if there are several consecutive blocks
6538 to read or write. Merge them if found. */
merge()6539 void merge()
6540 {
6541 /* if m_n_elems != 0, then we have assigned
6542 something valid to consecutive_ios[0] */
6543 ut_ad(m_n_elems != 0);
6544 ut_ad(first_slot() != NULL);
6545
6546 Slot* slot = first_slot();
6547
6548 while (!merge_adjacent(slot)) {
6549 /* No op */
6550 }
6551 }
6552
6553 /** We have now collected n_consecutive I/O requests
6554 in the array; allocate a single buffer which can hold
6555 all data, and perform the I/O
6556 @return the length of the buffer */
allocate_buffer()6557 ulint allocate_buffer()
6558 MY_ATTRIBUTE((warn_unused_result))
6559 {
6560 ulint len;
6561 Slot* slot = first_slot();
6562
6563 ut_ad(m_ptr == NULL);
6564
6565 if (slot->type.is_read() && m_n_elems > 1) {
6566
6567 len = 0;
6568
6569 for (ulint i = 0; i < m_n_elems; ++i) {
6570 len += m_slots[i]->len;
6571 }
6572
6573 m_ptr = static_cast<byte*>(
6574 ut_malloc_nokey(len + srv_page_size));
6575
6576 m_buf = static_cast<byte*>(
6577 ut_align(m_ptr, srv_page_size));
6578
6579 } else {
6580 len = first_slot()->len;
6581 m_buf = first_slot()->buf;
6582 }
6583
6584 return(len);
6585 }
6586
6587 /** We have to compress the individual pages and punch
6588 holes in them on a page by page basis when writing to
6589 tables that can be compresed at the IO level.
6590 @param[in] len Value returned by allocate_buffer */
copy_to_buffer(ulint len)6591 void copy_to_buffer(ulint len)
6592 {
6593 Slot* slot = first_slot();
6594
6595 if (len > slot->len && slot->type.is_write()) {
6596
6597 byte* ptr = m_buf;
6598
6599 ut_ad(ptr != slot->buf);
6600
6601 /* Copy the buffers to the combined buffer */
6602 for (ulint i = 0; i < m_n_elems; ++i) {
6603
6604 slot = m_slots[i];
6605
6606 memmove(ptr, slot->buf, slot->len);
6607
6608 ptr += slot->len;
6609 }
6610 }
6611 }
6612
6613 /** Do the I/O with ordinary, synchronous i/o functions:
6614 @param[in] len Length of buffer for IO */
io()6615 void io()
6616 {
6617 if (first_slot()->type.is_write()) {
6618
6619 for (ulint i = 0; i < m_n_elems; ++i) {
6620 write(m_slots[i]);
6621 }
6622
6623 } else {
6624
6625 for (ulint i = 0; i < m_n_elems; ++i) {
6626 read(m_slots[i]);
6627 }
6628 }
6629 }
6630
6631 /** Mark the i/os done in slots */
done()6632 void done()
6633 {
6634 for (ulint i = 0; i < m_n_elems; ++i) {
6635 m_slots[i]->io_already_done = true;
6636 }
6637 }
6638
	/** Get the first of the selected slots. At least one slot must
	have been selected (asserted).
	@return the first slot in the consecutive array */
	Slot* first_slot()
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(m_n_elems > 0);

		return(m_slots[0]);
	}
6647
6648 /** Wait for I/O requests
6649 @param[in] global_segment The global segment
6650 @param[in,out] event Wait on event if no active requests
6651 @return the number of slots */
6652 ulint check_pending(
6653 ulint global_segment,
6654 os_event_t event)
6655 MY_ATTRIBUTE((warn_unused_result));
6656 private:
6657
	/** Do the file read described by the slot. A failed read is
	fatal (asserted).
	@param[in,out]	slot	Slot that has the IO context */
	void read(Slot* slot)
	{
		dberr_t	err = os_file_read(
			slot->type,
			slot->file,
			slot->ptr,
			slot->offset,
			slot->len);

		ut_a(err == DB_SUCCESS);
	}
6671
	/** Do the file write described by the slot. A failed write is
	fatal (asserted).
	@param[in,out]	slot	Slot that has the IO context */
	void write(Slot* slot)
	{
		dberr_t	err = os_file_write(
			slot->type,
			slot->name,
			slot->file,
			slot->ptr,
			slot->offset,
			slot->len);

		ut_a(err == DB_SUCCESS);
	}
6686
6687 /** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const6688 bool adjacent(const Slot* s1, const Slot* s2) const
6689 {
6690 return(s1 != s2
6691 && s1->file == s2->file
6692 && s2->offset == s1->offset + s1->len
6693 && s1->type == s2->type);
6694 }
6695
6696 /** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)6697 bool merge_adjacent(Slot*& current)
6698 {
6699 Slot* slot;
6700 ulint offset = m_segment * m_n_slots;
6701
6702 slot = m_array->at(offset);
6703
6704 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6705
6706 if (slot->is_reserved && adjacent(current, slot)) {
6707
6708 current = slot;
6709
6710 /* Found a consecutive i/o request */
6711
6712 m_slots[m_n_elems] = slot;
6713
6714 ++m_n_elems;
6715
6716 return(m_n_elems >= m_slots.capacity());
6717 }
6718 }
6719
6720 return(true);
6721 }
6722
6723 /** There were no old requests. Look for an I/O request at the lowest
6724 offset in the array (we ignore the high 32 bits of the offset in these
6725 heuristics) */
select_lowest_offset()6726 bool select_lowest_offset()
6727 {
6728 ut_ad(m_n_elems == 0);
6729
6730 ulint offset = m_segment * m_n_slots;
6731
6732 m_lowest_offset = IB_UINT64_MAX;
6733
6734 for (ulint i = 0; i < m_n_slots; ++i) {
6735 Slot* slot;
6736
6737 slot = m_array->at(i + offset);
6738
6739 if (slot->is_reserved
6740 && slot->offset < m_lowest_offset) {
6741
6742 /* Found an i/o request */
6743 m_slots[0] = slot;
6744
6745 m_n_elems = 1;
6746
6747 m_lowest_offset = slot->offset;
6748 }
6749 }
6750
6751 return(m_n_elems > 0);
6752 }
6753
6754 /** Select the slot if it is older than the current oldest slot.
6755 @param[in] slot The slot to check */
select_if_older(Slot * slot)6756 void select_if_older(Slot* slot)
6757 {
6758 ulint age;
6759
6760 age = (ulint) difftime(time(NULL), slot->reservation_time);
6761
6762 if ((age >= 2 && age > m_oldest)
6763 || (age >= 2
6764 && age == m_oldest
6765 && slot->offset < m_lowest_offset)) {
6766
6767 /* Found an i/o request */
6768 m_slots[0] = slot;
6769
6770 m_n_elems = 1;
6771
6772 m_oldest = age;
6773
6774 m_lowest_offset = slot->offset;
6775 }
6776 }
6777
6778 /** Select th oldest slot in the array
6779 @return true if oldest slot found */
select_oldest()6780 bool select_oldest()
6781 {
6782 ut_ad(m_n_elems == 0);
6783
6784 Slot* slot;
6785 ulint offset = m_n_slots * m_segment;
6786
6787 slot = m_array->at(offset);
6788
6789 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6790
6791 if (slot->is_reserved) {
6792 select_if_older(slot);
6793 }
6794 }
6795
6796 return(m_n_elems > 0);
6797 }
6798
6799 typedef std::vector<Slot*> slots_t;
6800
6801 private:
6802 ulint m_oldest;
6803 ulint m_n_elems;
6804 os_offset_t m_lowest_offset;
6805
6806 AIO* m_array;
6807 ulint m_n_slots;
6808 ulint m_segment;
6809
6810 slots_t m_slots;
6811
6812 byte* m_ptr;
6813 byte* m_buf;
6814 };
6815
6816 /** Wait for I/O requests
6817 @return the number of slots */
6818 ulint
check_pending(ulint global_segment,os_event_t event)6819 SimulatedAIOHandler::check_pending(
6820 ulint global_segment,
6821 os_event_t event)
6822 {
6823 /* NOTE! We only access constant fields in os_aio_array.
6824 Therefore we do not have to acquire the protecting mutex yet */
6825
6826 ut_ad(os_aio_validate_skip());
6827
6828 ut_ad(m_segment < m_array->get_n_segments());
6829
6830 /* Look through n slots after the segment * n'th slot */
6831
6832 if (AIO::is_read(m_array)
6833 && os_aio_recommend_sleep_for_read_threads) {
6834
6835 /* Give other threads chance to add several
6836 I/Os to the array at once. */
6837
6838 srv_set_io_thread_op_info(
6839 global_segment, "waiting for i/o request");
6840
6841 os_event_wait(event);
6842
6843 return(0);
6844 }
6845
6846 return(m_array->slots_per_segment());
6847 }
6848
6849 /** Does simulated AIO. This function should be called by an i/o-handler
6850 thread.
6851
6852 @param[in] segment The number of the segment in the aio arrays to wait
6853 for; segment 0 is the ibuf i/o thread, segment 1 the
6854 log i/o thread, then follow the non-ibuf read threads,
6855 and as the last are the non-ibuf write threads
6856 @param[out] m1 the messages passed with the AIO request; note that
6857 also in the case where the AIO operation failed, these
6858 output parameters are valid and can be used to restart
6859 the operation, for example
6860 @param[out] m2 Callback argument
6861 @param[in] type IO context
6862 @return DB_SUCCESS or error code */
6863 static
6864 dberr_t
os_aio_simulated_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * type)6865 os_aio_simulated_handler(
6866 ulint global_segment,
6867 fil_node_t** m1,
6868 void** m2,
6869 IORequest* type)
6870 {
6871 Slot* slot;
6872 AIO* array;
6873 ulint segment;
6874 os_event_t event = os_aio_segment_wait_events[global_segment];
6875
6876 segment = AIO::get_array_and_local_segment(&array, global_segment);
6877
6878 SimulatedAIOHandler handler(array, segment);
6879
6880 for (;;) {
6881
6882 srv_set_io_thread_op_info(
6883 global_segment, "looking for i/o requests (a)");
6884
6885 ulint n_slots = handler.check_pending(global_segment, event);
6886
6887 if (n_slots == 0) {
6888 continue;
6889 }
6890
6891 handler.init(n_slots);
6892
6893 srv_set_io_thread_op_info(
6894 global_segment, "looking for i/o requests (b)");
6895
6896 array->acquire();
6897
6898 ulint n_reserved;
6899
6900 slot = handler.check_completed(&n_reserved);
6901
6902 if (slot != NULL) {
6903
6904 break;
6905
6906 } else if (n_reserved == 0
6907 && !buf_page_cleaner_is_active
6908 && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
6909
6910 /* There is no completed request. If there
6911 are no pending request at all, and the system
6912 is being shut down, exit. */
6913
6914 array->release();
6915
6916 *m1 = NULL;
6917
6918 *m2 = NULL;
6919
6920 return(DB_SUCCESS);
6921
6922 } else if (handler.select()) {
6923
6924 break;
6925 }
6926
6927 /* No I/O requested at the moment */
6928
6929 srv_set_io_thread_op_info(
6930 global_segment, "resetting wait event");
6931
6932 /* We wait here until tbere are more IO requests
6933 for this segment. */
6934
6935 os_event_reset(event);
6936
6937 array->release();
6938
6939 srv_set_io_thread_op_info(
6940 global_segment, "waiting for i/o request");
6941
6942 os_event_wait(event);
6943 }
6944
6945 /** Found a slot that has already completed its IO */
6946
6947 if (slot == NULL) {
6948 /* Merge adjacent requests */
6949 handler.merge();
6950
6951 /* Check if there are several consecutive blocks
6952 to read or write */
6953
6954 srv_set_io_thread_op_info(
6955 global_segment, "consecutive i/o requests");
6956
6957 // Note: We don't support write combining for simulated AIO.
6958 //ulint total_len = handler.allocate_buffer();
6959
6960 /* We release the array mutex for the time of the I/O: NOTE that
6961 this assumes that there is just one i/o-handler thread serving
6962 a single segment of slots! */
6963
6964 array->release();
6965
6966 // Note: We don't support write combining for simulated AIO.
6967 //handler.copy_to_buffer(total_len);
6968
6969 srv_set_io_thread_op_info(global_segment, "doing file i/o");
6970
6971 handler.io();
6972
6973 srv_set_io_thread_op_info(global_segment, "file i/o done");
6974
6975 array->acquire();
6976
6977 handler.done();
6978
6979 /* We return the messages for the first slot now, and if there
6980 were several slots, the messages will be returned with
6981 subsequent calls of this function */
6982
6983 slot = handler.first_slot();
6984 }
6985
6986 ut_ad(slot->is_reserved);
6987
6988 *m1 = slot->m1;
6989 *m2 = slot->m2;
6990
6991 *type = slot->type;
6992
6993 array->release(slot);
6994
6995 array->release();
6996
6997 return(DB_SUCCESS);
6998 }
6999
7000 /** Get the total number of pending IOs
7001 @return the total number of pending IOs */
7002 ulint
total_pending_io_count()7003 AIO::total_pending_io_count()
7004 {
7005 ulint count = s_reads->pending_io_count();
7006
7007 if (s_writes != NULL) {
7008 count += s_writes->pending_io_count();
7009 }
7010
7011 if (s_ibuf != NULL) {
7012 count += s_ibuf->pending_io_count();
7013 }
7014
7015 if (s_log != NULL) {
7016 count += s_log->pending_io_count();
7017 }
7018
7019 if (s_sync != NULL) {
7020 count += s_sync->pending_io_count();
7021 }
7022
7023 return(count);
7024 }
7025
7026 /** Validates the consistency the aio system.
7027 @return true if ok */
7028 static
7029 bool
os_aio_validate()7030 os_aio_validate()
7031 {
7032 /* The methods countds and validates, we ignore the count. */
7033 AIO::total_pending_io_count();
7034
7035 return(true);
7036 }
7037
7038 /** Prints pending IO requests per segment of an aio array.
7039 We probably don't need per segment statistics but they can help us
7040 during development phase to see if the IO requests are being
7041 distributed as expected.
7042 @param[in,out] file File where to print
7043 @param[in] segments Pending IO array */
7044 void
print_segment_info(FILE * file,const ulint * segments)7045 AIO::print_segment_info(
7046 FILE* file,
7047 const ulint* segments)
7048 {
7049 ut_ad(m_n_segments > 0);
7050
7051 if (m_n_segments > 1) {
7052
7053 fprintf(file, " [");
7054
7055 for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
7056
7057 if (i != 0) {
7058 fprintf(file, ", ");
7059 }
7060
7061 fprintf(file, ULINTPF, *segments);
7062 }
7063
7064 fprintf(file, "] ");
7065 }
7066 }
7067
7068 /** Prints info about the aio array.
7069 @param[in,out] file Where to print */
7070 void
print(FILE * file)7071 AIO::print(FILE* file)
7072 {
7073 ulint count = 0;
7074 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
7075
7076 mutex_enter(&m_mutex);
7077
7078 ut_a(!m_slots.empty());
7079 ut_a(m_n_segments > 0);
7080
7081 memset(n_res_seg, 0x0, sizeof(n_res_seg));
7082
7083 for (ulint i = 0; i < m_slots.size(); ++i) {
7084 Slot& slot = m_slots[i];
7085 ulint segment = (i * m_n_segments) / m_slots.size();
7086
7087 if (slot.is_reserved) {
7088
7089 ++count;
7090
7091 ++n_res_seg[segment];
7092
7093 ut_a(slot.len > 0);
7094 }
7095 }
7096
7097 ut_a(m_n_reserved == count);
7098
7099 print_segment_info(file, n_res_seg);
7100
7101 mutex_exit(&m_mutex);
7102 }
7103
7104 /** Print all the AIO segments
7105 @param[in,out] file Where to print */
7106 void
print_all(FILE * file)7107 AIO::print_all(FILE* file)
7108 {
7109 s_reads->print(file);
7110
7111 if (s_writes != NULL) {
7112 fputs(", aio writes:", file);
7113 s_writes->print(file);
7114 }
7115
7116 if (s_ibuf != NULL) {
7117 fputs(",\n ibuf aio reads:", file);
7118 s_ibuf->print(file);
7119 }
7120
7121 if (s_log != NULL) {
7122 fputs(", log i/o's:", file);
7123 s_log->print(file);
7124 }
7125
7126 if (s_sync != NULL) {
7127 fputs(", sync i/o's:", file);
7128 s_sync->print(file);
7129 }
7130 }
7131
7132 /** Prints info of the aio arrays.
7133 @param[in,out] file file where to print */
7134 void
os_aio_print(FILE * file)7135 os_aio_print(FILE* file)
7136 {
7137 time_t current_time;
7138 double time_elapsed;
7139 double avg_bytes_read;
7140
7141 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
7142 fprintf(file, "I/O thread " ULINTPF " state: %s (%s)",
7143 i,
7144 srv_io_thread_op_info[i],
7145 srv_io_thread_function[i]);
7146
7147 #ifndef _WIN32
7148 if (!srv_use_native_aio
7149 && os_event_is_set(os_aio_segment_wait_events[i])) {
7150 fprintf(file, " ev set");
7151 }
7152 #endif /* _WIN32 */
7153
7154 fprintf(file, "\n");
7155 }
7156
7157 fputs("Pending normal aio reads:", file);
7158
7159 AIO::print_all(file);
7160
7161 putc('\n', file);
7162 current_time = time(NULL);
7163 time_elapsed = 0.001 + difftime(current_time, os_last_printout);
7164
7165 fprintf(file,
7166 "Pending flushes (fsync) log: " ULINTPF
7167 "; buffer pool: " ULINTPF "\n"
7168 ULINTPF " OS file reads, "
7169 ULINTPF " OS file writes, "
7170 ULINTPF " OS fsyncs\n",
7171 fil_n_pending_log_flushes,
7172 fil_n_pending_tablespace_flushes,
7173 ulint{os_n_file_reads},
7174 os_n_file_writes,
7175 os_n_fsyncs);
7176
7177 const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
7178 const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
7179
7180 if (n_reads != 0 || n_writes != 0) {
7181 fprintf(file,
7182 ULINTPF " pending reads, " ULINTPF " pending writes\n",
7183 n_reads, n_writes);
7184 }
7185
7186 if (os_n_file_reads == os_n_file_reads_old) {
7187 avg_bytes_read = 0.0;
7188 } else {
7189 avg_bytes_read = (double) os_bytes_read_since_printout
7190 / (os_n_file_reads - os_n_file_reads_old);
7191 }
7192
7193 fprintf(file,
7194 "%.2f reads/s, " ULINTPF " avg bytes/read,"
7195 " %.2f writes/s, %.2f fsyncs/s\n",
7196 (os_n_file_reads - os_n_file_reads_old)
7197 / time_elapsed,
7198 (ulint) avg_bytes_read,
7199 (os_n_file_writes - os_n_file_writes_old)
7200 / time_elapsed,
7201 (os_n_fsyncs - os_n_fsyncs_old)
7202 / time_elapsed);
7203
7204 os_n_file_reads_old = os_n_file_reads;
7205 os_n_file_writes_old = os_n_file_writes;
7206 os_n_fsyncs_old = os_n_fsyncs;
7207 os_bytes_read_since_printout = 0;
7208
7209 os_last_printout = current_time;
7210 }
7211
7212 /** Refreshes the statistics used to print per-second averages. */
7213 void
os_aio_refresh_stats()7214 os_aio_refresh_stats()
7215 {
7216 os_n_fsyncs_old = os_n_fsyncs;
7217
7218 os_bytes_read_since_printout = 0;
7219
7220 os_n_file_reads_old = os_n_file_reads;
7221
7222 os_n_file_writes_old = os_n_file_writes;
7223
7224 os_n_fsyncs_old = os_n_fsyncs;
7225
7226 os_bytes_read_since_printout = 0;
7227
7228 os_last_printout = time(NULL);
7229 }
7230
7231 /** Checks that all slots in the system have been freed, that is, there are
7232 no pending io operations.
7233 @return true if all free */
7234 bool
os_aio_all_slots_free()7235 os_aio_all_slots_free()
7236 {
7237 return(AIO::total_pending_io_count() == 0);
7238 }
7239
7240 #ifdef UNIV_DEBUG
7241 /** Prints all pending IO for the array
7242 @param[in] file file where to print
7243 @param[in] array array to process */
7244 void
to_file(FILE * file) const7245 AIO::to_file(FILE* file) const
7246 {
7247 acquire();
7248
7249 fprintf(file, " " ULINTPF "\n", m_n_reserved);
7250
7251 for (ulint i = 0; i < m_slots.size(); ++i) {
7252
7253 const Slot& slot = m_slots[i];
7254
7255 if (slot.is_reserved) {
7256
7257 fprintf(file,
7258 "%s IO for %s (offset=" UINT64PF
7259 ", size=%lu)\n",
7260 slot.type.is_read() ? "read" : "write",
7261 slot.name, slot.offset, (unsigned long)(slot.len));
7262 }
7263 }
7264
7265 release();
7266 }
7267
7268 /** Print pending IOs for all arrays */
7269 void
print_to_file(FILE * file)7270 AIO::print_to_file(FILE* file)
7271 {
7272 fprintf(file, "Pending normal aio reads:");
7273
7274 s_reads->to_file(file);
7275
7276 if (s_writes != NULL) {
7277 fprintf(file, "Pending normal aio writes:");
7278 s_writes->to_file(file);
7279 }
7280
7281 if (s_ibuf != NULL) {
7282 fprintf(file, "Pending ibuf aio reads:");
7283 s_ibuf->to_file(file);
7284 }
7285
7286 if (s_log != NULL) {
7287 fprintf(file, "Pending log i/o's:");
7288 s_log->to_file(file);
7289 }
7290
7291 if (s_sync != NULL) {
7292 fprintf(file, "Pending sync i/o's:");
7293 s_sync->to_file(file);
7294 }
7295 }
7296
7297 /** Prints all pending IO
7298 @param[in] file File where to print */
7299 void
os_aio_print_pending_io(FILE * file)7300 os_aio_print_pending_io(
7301 FILE* file)
7302 {
7303 AIO::print_to_file(file);
7304 }
7305
7306 #endif /* UNIV_DEBUG */
7307
7308 /**
7309 Set the file create umask
7310 @param[in] umask The umask to use for file creation. */
7311 void
os_file_set_umask(ulint umask)7312 os_file_set_umask(ulint umask)
7313 {
7314 os_innodb_umask = umask;
7315 }
7316
7317 #ifdef _WIN32
7318
7319 /* Checks whether physical drive is on SSD.*/
is_drive_on_ssd(DWORD nr)7320 static bool is_drive_on_ssd(DWORD nr)
7321 {
7322 char physical_drive_path[32];
7323 snprintf(physical_drive_path, sizeof(physical_drive_path),
7324 "\\\\.\\PhysicalDrive%lu", nr);
7325
7326 HANDLE h= CreateFile(physical_drive_path, 0,
7327 FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
7328 nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
7329 if (h == INVALID_HANDLE_VALUE)
7330 return false;
7331
7332 DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty;
7333 STORAGE_PROPERTY_QUERY storage_query{};
7334 storage_query.PropertyId= StorageDeviceSeekPenaltyProperty;
7335 storage_query.QueryType= PropertyStandardQuery;
7336
7337 bool on_ssd= false;
7338 DWORD bytes_written;
7339 if (DeviceIoControl(h, IOCTL_STORAGE_QUERY_PROPERTY, &storage_query,
7340 sizeof storage_query, &seek_penalty, sizeof seek_penalty,
7341 &bytes_written, nullptr))
7342 {
7343 on_ssd= seek_penalty.IncursSeekPenalty;
7344 }
7345 else
7346 {
7347 on_ssd= false;
7348 }
7349 CloseHandle(h);
7350 return on_ssd;
7351 }
7352
7353 /*
7354 Checks whether volume is on SSD, by checking all physical drives
7355 in that volume.
7356 */
is_volume_on_ssd(const char * volume_mount_point)7357 static bool is_volume_on_ssd(const char *volume_mount_point)
7358 {
7359 char volume_name[MAX_PATH];
7360
7361 if (!GetVolumeNameForVolumeMountPoint(volume_mount_point, volume_name,
7362 array_elements(volume_name)))
7363 {
7364 /* This can fail, e.g if file is on network share */
7365 return false;
7366 }
7367
7368 /* Chomp last backslash, this is needed to open volume.*/
7369 size_t length= strlen(volume_name);
7370 if (length && volume_name[length - 1] == '\\')
7371 volume_name[length - 1]= 0;
7372
7373 /* Open volume handle */
7374 HANDLE volume_handle= CreateFile(
7375 volume_name, 0, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
7376 nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
7377
7378 if (volume_handle == INVALID_HANDLE_VALUE)
7379 return false;
7380
7381 /*
7382 Enumerate all volume extends, check whether all of them are on SSD
7383 */
7384
7385 /* Anticipate common case where there is only one extent.*/
7386 VOLUME_DISK_EXTENTS single_extent;
7387
7388 /* But also have a place to manage allocated data.*/
7389 std::unique_ptr<BYTE[]> lifetime;
7390
7391 DWORD bytes_written;
7392 VOLUME_DISK_EXTENTS *extents= nullptr;
7393 if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
7394 nullptr, 0, &single_extent, sizeof(single_extent),
7395 &bytes_written, nullptr))
7396 {
7397 /* Worked on the first try. Use the preallocated buffer.*/
7398 extents= &single_extent;
7399 }
7400 else
7401 {
7402 VOLUME_DISK_EXTENTS *last_query= &single_extent;
7403 while (GetLastError() == ERROR_MORE_DATA)
7404 {
7405 DWORD extentCount= last_query->NumberOfDiskExtents;
7406 DWORD allocatedSize=
7407 FIELD_OFFSET(VOLUME_DISK_EXTENTS, Extents[extentCount]);
7408 lifetime.reset(new BYTE[allocatedSize]);
7409 last_query= (VOLUME_DISK_EXTENTS *) lifetime.get();
7410 if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
7411 nullptr, 0, last_query, allocatedSize,
7412 &bytes_written, nullptr))
7413 {
7414 extents= last_query;
7415 break;
7416 }
7417 }
7418 }
7419 CloseHandle(volume_handle);
7420 if (!extents)
7421 return false;
7422
7423 for (DWORD i= 0; i < extents->NumberOfDiskExtents; i++)
7424 if (!is_drive_on_ssd(extents->Extents[i].DiskNumber))
7425 return false;
7426
7427 return true;
7428 }
7429
7430 #include <unordered_map>
is_file_on_ssd(char * file_path)7431 static bool is_file_on_ssd(char *file_path)
7432 {
7433 /* Cache of volume_path => volume_info, protected by rwlock.*/
7434 static std::unordered_map<std::string, bool> cache;
7435 static SRWLOCK lock= SRWLOCK_INIT;
7436
7437 /* Preset result, in case something fails, e.g we're on network drive.*/
7438 char volume_path[MAX_PATH];
7439 if (!GetVolumePathName(file_path, volume_path, array_elements(volume_path)))
7440 return false;
7441
7442 /* Try cached volume info first.*/
7443 std::string volume_path_str(volume_path);
7444 bool found;
7445 bool result;
7446 AcquireSRWLockShared(&lock);
7447 auto e= cache.find(volume_path_str);
7448 if ((found= e != cache.end()))
7449 result= e->second;
7450 ReleaseSRWLockShared(&lock);
7451
7452 if (found)
7453 return result;
7454
7455 result= is_volume_on_ssd(volume_path);
7456
7457 /* Update cache */
7458 AcquireSRWLockExclusive(&lock);
7459 cache[volume_path_str]= result;
7460 ReleaseSRWLockExclusive(&lock);
7461 return result;
7462 }
7463
7464 #endif
7465
7466 /** Determine some file metadata when creating or reading the file.
7467 @param file the file that is being created, or OS_FILE_CLOSED */
find_metadata(os_file_t file,struct stat * statbuf)7468 void fil_node_t::find_metadata(os_file_t file
7469 #ifndef _WIN32
7470 , struct stat* statbuf
7471 #endif
7472 )
7473 {
7474 if (file == OS_FILE_CLOSED) {
7475 file = handle;
7476 ut_ad(is_open());
7477 }
7478
7479 #ifdef _WIN32 /* FIXME: make this unconditional */
7480 if (space->punch_hole) {
7481 space->punch_hole = os_is_sparse_file_supported(file);
7482 }
7483 #endif
7484
7485 /*
7486 For the temporary tablespace and during the
7487 non-redo-logged adjustments in
7488 IMPORT TABLESPACE, we do not care about
7489 the atomicity of writes.
7490
7491 Atomic writes is supported if the file can be used
7492 with atomic_writes (not log file), O_DIRECT is
7493 used (tested in ha_innodb.cc) and the file is
7494 device and file system that supports atomic writes
7495 for the given block size.
7496 */
7497 space->atomic_write_supported = space->purpose == FIL_TYPE_TEMPORARY
7498 || space->purpose == FIL_TYPE_IMPORT;
7499 #ifdef _WIN32
7500 on_ssd = is_file_on_ssd(name);
7501 FILE_STORAGE_INFO info;
7502 if (GetFileInformationByHandleEx(
7503 file, FileStorageInfo, &info, sizeof(info))) {
7504 block_size = info.PhysicalBytesPerSectorForAtomicity;
7505 } else {
7506 block_size = 512;
7507 }
7508 #else
7509 struct stat sbuf;
7510 if (!statbuf && !fstat(file, &sbuf)) {
7511 statbuf = &sbuf;
7512 }
7513 if (statbuf) {
7514 block_size = statbuf->st_blksize;
7515 }
7516 on_ssd = space->atomic_write_supported
7517 # ifdef UNIV_LINUX
7518 || (statbuf && fil_system.is_ssd(statbuf->st_dev))
7519 # endif
7520 ;
7521 #endif
7522 if (!space->atomic_write_supported) {
7523 space->atomic_write_supported = atomic_write
7524 && srv_use_atomic_writes
7525 #ifndef _WIN32
7526 && my_test_if_atomic_write(file,
7527 space->physical_size())
7528 #else
7529 /* On Windows, all single sector writes are atomic,
7530 as per WriteFile() documentation on MSDN.
7531 We also require SSD for atomic writes, eventhough
7532 technically it is not necessary- the reason is that
7533 on hard disks, we still want the benefit from
7534 (non-atomic) neighbor page flushing in the buffer
7535 pool code. */
7536 && srv_page_size == block_size
7537 && on_ssd
7538 #endif
7539 ;
7540 }
7541 }
7542
7543 /** Read the first page of a data file.
7544 @param[in] first whether this is the very first read
7545 @return whether the page was found valid */
read_page0(bool first)7546 bool fil_node_t::read_page0(bool first)
7547 {
7548 ut_ad(mutex_own(&fil_system.mutex));
7549 ut_a(space->purpose != FIL_TYPE_LOG);
7550 const ulint psize = space->physical_size();
7551 #ifndef _WIN32
7552 struct stat statbuf;
7553 if (fstat(handle, &statbuf)) {
7554 return false;
7555 }
7556 os_offset_t size_bytes = statbuf.st_size;
7557 #else
7558 os_offset_t size_bytes = os_file_get_size(handle);
7559 ut_a(size_bytes != (os_offset_t) -1);
7560 #endif
7561 const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
7562
7563 if (size_bytes < min_size) {
7564 ib::error() << "The size of the file " << name
7565 << " is only " << size_bytes
7566 << " bytes, should be at least " << min_size;
7567 return false;
7568 }
7569
7570 byte* buf2 = static_cast<byte*>(ut_malloc_nokey(2 * psize));
7571
7572 /* Align the memory for file i/o if we might have O_DIRECT set */
7573 byte* page = static_cast<byte*>(ut_align(buf2, psize));
7574 IORequest request(IORequest::READ);
7575 if (os_file_read(request, handle, page, 0, psize) != DB_SUCCESS) {
7576 ib::error() << "Unable to read first page of file " << name;
7577 ut_free(buf2);
7578 return false;
7579 }
7580 const ulint space_id = fsp_header_get_space_id(page);
7581 ulint flags = fsp_header_get_flags(page);
7582 const ulint size = fsp_header_get_field(page, FSP_SIZE);
7583 const ulint free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT);
7584 const ulint free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
7585 + page);
7586 if (!fil_space_t::is_valid_flags(flags, space->id)) {
7587 ulint cflags = fsp_flags_convert_from_101(flags);
7588 if (cflags == ULINT_UNDEFINED) {
7589 invalid:
7590 ib::error()
7591 << "Expected tablespace flags "
7592 << ib::hex(space->flags)
7593 << " but found " << ib::hex(flags)
7594 << " in the file " << name;
7595 ut_free(buf2);
7596 return false;
7597 }
7598
7599 ulint cf = cflags & ~FSP_FLAGS_MEM_MASK;
7600 ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK;
7601
7602 if (!fil_space_t::is_flags_equal(cf, sf)
7603 && !fil_space_t::is_flags_equal(sf, cf)) {
7604 goto invalid;
7605 }
7606
7607 flags = cflags;
7608 }
7609
7610 ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
7611
7612 /* Try to read crypt_data from page 0 if it is not yet read. */
7613 if (!space->crypt_data) {
7614 space->crypt_data = fil_space_read_crypt_data(
7615 fil_space_t::zip_size(flags), page);
7616 }
7617 ut_free(buf2);
7618
7619 if (UNIV_UNLIKELY(space_id != space->id)) {
7620 ib::error() << "Expected tablespace id " << space->id
7621 << " but found " << space_id
7622 << " in the file " << name;
7623 return false;
7624 }
7625
7626 if (first) {
7627 ut_ad(space->id != TRX_SYS_SPACE);
7628 #ifdef UNIV_LINUX
7629 find_metadata(handle, &statbuf);
7630 #else
7631 find_metadata();
7632 #endif
7633
7634 /* Truncate the size to a multiple of extent size. */
7635 ulint mask = psize * FSP_EXTENT_SIZE - 1;
7636
7637 if (size_bytes <= mask) {
7638 /* .ibd files start smaller than an
7639 extent size. Do not truncate valid data. */
7640 } else {
7641 size_bytes &= ~os_offset_t(mask);
7642 }
7643
7644 space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags;
7645
7646 this->size = ulint(size_bytes / psize);
7647 space->committed_size = space->size += this->size;
7648 } else if (space->id != TRX_SYS_SPACE || space->size_in_header) {
7649 /* If this is not the first-time open, do nothing.
7650 For the system tablespace, we always get invoked as
7651 first=false, so we detect the true first-time-open based
7652 on size_in_header and proceed to initialize the data. */
7653 return true;
7654 } else {
7655 /* Initialize the size of predefined tablespaces
7656 to FSP_SIZE. */
7657 space->committed_size = size;
7658 }
7659
7660 ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
7661 ut_ad(space->free_len == 0 || space->free_len == free_len);
7662 space->size_in_header = size;
7663 space->free_limit = free_limit;
7664 space->free_len = free_len;
7665 return true;
7666 }
7667
7668 #else
7669 #include "univ.i"
7670 #endif /* !UNIV_INNOCHECKSUM */
7671
7672 /** Normalizes a directory path for the current OS:
7673 On Windows, we convert '/' to '\', else we convert '\' to '/'.
7674 @param[in,out] str A null-terminated directory and file path */
7675 void
os_normalize_path(char * str)7676 os_normalize_path(
7677 char* str)
7678 {
7679 if (str != NULL) {
7680 for (; *str; str++) {
7681 if (*str == OS_PATH_SEPARATOR_ALT) {
7682 *str = OS_PATH_SEPARATOR;
7683 }
7684 }
7685 }
7686 }
7687