1 /***********************************************************************
2
3 Copyright (c) 1995, 2020, Oracle and/or its affiliates.
4 Copyright (c) 2009, Percona Inc.
5
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation. The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License, version 2.0, for more details.
28
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32
33 ***********************************************************************/
34
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41
42 #ifndef UNIV_INNOCHECKSUM
43
44 #include "ha_prototypes.h"
45 #include "sql_const.h"
46
47 #include "os0file.h"
48
49 #ifdef UNIV_NONINL
50 #include "os0file.ic"
51 #endif
52
53 #include "page0page.h"
54 #include "srv0srv.h"
55 #include "srv0start.h"
56 #include "fil0fil.h"
57 #ifndef UNIV_HOTBACKUP
58 # include "os0event.h"
59 # include "os0thread.h"
60 #else /* !UNIV_HOTBACKUP */
61 # ifdef _WIN32
62 /* Add includes for the _stat() call to compile on Windows */
63 # include <sys/types.h>
64 # include <sys/stat.h>
65 # include <errno.h>
66 # endif /* _WIN32 */
67 #endif /* !UNIV_HOTBACKUP */
68
69 #include <vector>
70 #include <functional>
71
72 #ifdef LINUX_NATIVE_AIO
73 #include <libaio.h>
74 #endif /* LINUX_NATIVE_AIO */
75
76 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
77 # include <fcntl.h>
78 # include <linux/falloc.h>
79 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
80
81 #include <lz4.h>
82 #include <zlib.h>
83
84 #ifdef UNIV_DEBUG
85 /** Set when InnoDB has invoked exit(). */
86 bool innodb_calling_exit;
87 #endif /* UNIV_DEBUG */
88
89 #include <my_aes.h>
90 #include <my_rnd.h>
91 #include <mysqld.h>
92 #include <mysql/service_mysql_keyring.h>
93
94 /** Insert buffer segment id */
95 static const ulint IO_IBUF_SEGMENT = 0;
96
97 /** Log segment id */
98 static const ulint IO_LOG_SEGMENT = 1;
99
100 /** Number of retries for partial I/O's */
101 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
102
103 /** Blocks for doing IO, used in the transparent compression
104 and encryption code. */
105 struct Block {
106 /** Default constructor */
BlockBlock107 Block() : m_ptr(), m_in_use() { }
108
109 byte* m_ptr;
110
111 byte pad[CACHE_LINE_SIZE - sizeof(ulint)];
112 lock_word_t m_in_use;
113 };
114
115 /** For storing the allocated blocks */
116 typedef std::vector<Block> Blocks;
117
118 /** Block collection */
119 static Blocks* block_cache;
120
121 /** Number of blocks to allocate for sync read/writes */
122 static const size_t MAX_BLOCKS = 128;
123
124 /** Block buffer size */
125 #define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))
126
127 /** Disk sector size of aligning write buffer for DIRECT_IO */
128 static ulint os_io_ptr_align = UNIV_SECTOR_SIZE;
129
130 /* This specifies the file permissions InnoDB uses when it creates files in
131 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
132 my_umask */
133
134 #ifndef _WIN32
135 /** Umask for creating files */
136 ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
137 #else
138 /** Umask for creating files */
139 ulint os_innodb_umask = 0;
140
141 /* On Windows when using native AIO the number of AIO requests
142 that a thread can handle at a given time is limited to 32
143 i.e.: SRV_N_PENDING_IOS_PER_THREAD */
144 #define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD
145
146 #endif /* _WIN32 */
147
148 #ifndef UNIV_HOTBACKUP
149
150 /** In simulated aio, merge at most this many consecutive i/os */
151 static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;
152
153 /** Flag indicating if the page_cleaner is in active state. */
154 extern bool buf_page_cleaner_is_active;
155
156 /**********************************************************************
157
158 InnoDB AIO Implementation:
159 =========================
160
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO by special IO-threads servicing the IO-requests.
163
164 Simulated AIO:
165 ==============
166
167 On platforms where we 'simulate' AIO, the following is a rough explanation
168 of the high level design.
169 There are four io-threads (for ibuf, log, read, write).
170 All synchronous IO requests are serviced by the calling thread using
171 os_file_write/os_file_read. The Asynchronous requests are queued up
172 in an array (there are four such arrays) by the calling thread.
173 Later these requests are picked up by the IO-thread and are serviced
174 synchronously.
175
176 Windows native AIO:
177 ==================
178
If srv_use_native_aio is not set then Windows follows the same
code as simulated AIO. If the flag is set then the native AIO interface
is used. On Windows, one of the limitations is that if a file is opened
for AIO no synchronous IO can be done on it. Therefore we have an
extra fifth array to queue up synchronous IO requests.
184 There are innodb_file_io_threads helper threads. These threads work
185 on the four arrays mentioned above in Simulated AIO. No thread is
186 required for the sync array.
187 If a synchronous IO request is made, it is first queued in the sync
188 array. Then the calling thread itself waits on the request, thus
189 making the call synchronous.
190 If an AIO request is made the calling thread not only queues it in the
191 array but also submits the requests. The helper thread then collects
192 the completed IO request and calls completion routine on it.
193
194 Linux native AIO:
195 =================
196
197 If we have libaio installed on the system and innodb_use_native_aio
198 is set to true we follow the code path of native AIO, otherwise we
199 do simulated AIO.
200 There are innodb_file_io_threads helper threads. These threads work
201 on the four arrays mentioned above in Simulated AIO.
202 If a synchronous IO request is made, it is handled by calling
203 os_file_write/os_file_read.
204 If an AIO request is made the calling thread not only queues it in the
205 array but also submits the requests. The helper thread then collects
206 the completed IO request and calls completion routine on it.
207
208 **********************************************************************/
209
210
#if defined(UNIV_PFS_IO)
/* Performance schema keys under which InnoDB I/O is registered */
mysql_pfs_key_t	innodb_data_file_key;
mysql_pfs_key_t	innodb_log_file_key;
mysql_pfs_key_t	innodb_temp_file_key;
#endif /* UNIV_PFS_IO */
217
218 /** The asynchronous I/O context */
219 struct Slot {
SlotSlot220 Slot() { memset(this, 0, sizeof(*this)); }
221
222 /** index of the slot in the aio array */
223 uint16_t pos;
224
225 /** true if this slot is reserved */
226 bool is_reserved;
227
228 /** time when reserved */
229 ib_time_monotonic_t reservation_time;
230
231 /** buffer used in i/o */
232 byte* buf;
233
234 /** Buffer pointer used for actual IO. We advance this
235 when partial IO is required and not buf */
236 byte* ptr;
237
238 /** OS_FILE_READ or OS_FILE_WRITE */
239 IORequest type;
240
241 /** file offset in bytes */
242 os_offset_t offset;
243
244 /** file where to read or write */
245 pfs_os_file_t file;
246
247 /** file name or path */
248 const char* name;
249
250 /** used only in simulated aio: true if the physical i/o
251 already made and only the slot message needs to be passed
252 to the caller of os_aio_simulated_handle */
253 bool io_already_done;
254
255 /** The file node for which the IO is requested. */
256 fil_node_t* m1;
257
258 /** the requester of an aio operation and which can be used
259 to identify which pending aio operation was completed */
260 void* m2;
261
262 /** AIO completion status */
263 dberr_t err;
264
265 #ifdef WIN_ASYNC_IO
266 /** handle object we need in the OVERLAPPED struct */
267 HANDLE handle;
268
269 /** Windows control block for the aio request */
270 OVERLAPPED control;
271
272 /** bytes written/read */
273 DWORD n_bytes;
274
275 /** length of the block to read or write */
276 DWORD len;
277
278 #elif defined(LINUX_NATIVE_AIO)
279 /** Linux control block for aio */
280 struct iocb control;
281
282 /** AIO return code */
283 int ret;
284
285 /** bytes written/read. */
286 ssize_t n_bytes;
287
288 /** length of the block to read or write */
289 ulint len;
290 #else
291 /** length of the block to read or write */
292 ulint len;
293
294 /** bytes written/read. */
295 ulint n_bytes;
296 #endif /* WIN_ASYNC_IO */
297
298 /** Length of the block before it was compressed */
299 uint32 original_len;
300
301 /** Buffer block for compressed pages or encrypted pages */
302 Block* buf_block;
303
304 /** true, if we shouldn't punch a hole after writing the page */
305 bool skip_punch_hole;
306 };
307
308 /** The asynchronous i/o array structure */
309 class AIO {
310 public:
311 /** Constructor
312 @param[in] id Latch ID
313 @param[in] n_slots Number of slots to configure
314 @param[in] segments Number of segments to configure */
315 AIO(latch_id_t id, ulint n_slots, ulint segments);
316
317 /** Destructor */
318 ~AIO();
319
320 /** Initialize the instance
321 @return DB_SUCCESS or error code */
322 dberr_t init();
323
324 /** Requests for a slot in the aio array. If no slot is available, waits
325 until not_full-event becomes signaled.
326
327 @param[in,out] type IO context
328 @param[in,out] m1 message to be passed along with the AIO
329 operation
330 @param[in,out] m2 message to be passed along with the AIO
331 operation
332 @param[in] file file handle
333 @param[in] name name of the file or path as a null-terminated
334 string
335 @param[in,out] buf buffer where to read or from which to write
336 @param[in] offset file offset, where to read from or start writing
337 @param[in] len length of the block to read or write
338 @return pointer to slot */
339 Slot* reserve_slot(
340 IORequest& type,
341 fil_node_t* m1,
342 void* m2,
343 pfs_os_file_t file,
344 const char* name,
345 void* buf,
346 os_offset_t offset,
347 ulint len)
348 MY_ATTRIBUTE((warn_unused_result));
349
350 /** @return number of reserved slots */
351 ulint pending_io_count() const;
352
353 /** Returns a pointer to the nth slot in the aio array.
354 @param[in] index Index of the slot in the array
355 @return pointer to slot */
at(ulint i) const356 const Slot* at(ulint i) const
357 MY_ATTRIBUTE((warn_unused_result))
358 {
359 ut_a(i < m_slots.size());
360
361 return(&m_slots[i]);
362 }
363
364 /** Non const version */
at(ulint i)365 Slot* at(ulint i)
366 MY_ATTRIBUTE((warn_unused_result))
367 {
368 ut_a(i < m_slots.size());
369
370 return(&m_slots[i]);
371 }
372
373 /** Frees a slot in the AIO array, assumes caller owns the mutex.
374 @param[in,out] slot Slot to release */
375 void release(Slot* slot);
376
377 /** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
378 @param[in,out] slot Slot to release */
379 void release_with_mutex(Slot* slot);
380
381 /** Prints info about the aio array.
382 @param[in,out] file Where to print */
383 void print(FILE* file);
384
385 /** @return the number of slots per segment */
slots_per_segment() const386 ulint slots_per_segment() const
387 MY_ATTRIBUTE((warn_unused_result))
388 {
389 return(m_slots.size() / m_n_segments);
390 }
391
392 /** @return accessor for n_segments */
get_n_segments() const393 ulint get_n_segments() const
394 MY_ATTRIBUTE((warn_unused_result))
395 {
396 return(m_n_segments);
397 }
398
399 #ifdef UNIV_DEBUG
400 /** @return true if the thread owns the mutex */
is_mutex_owned() const401 bool is_mutex_owned() const
402 MY_ATTRIBUTE((warn_unused_result))
403 {
404 return(mutex_own(&m_mutex));
405 }
406 #endif /* UNIV_DEBUG */
407
408 /** Acquire the mutex */
acquire() const409 void acquire() const
410 {
411 mutex_enter(&m_mutex);
412 }
413
414 /** Release the mutex */
release() const415 void release() const
416 {
417 mutex_exit(&m_mutex);
418 }
419
420 /** Write out the state to the file/stream
421 @param[in, out] file File to write to */
422 void to_file(FILE* file) const;
423
424 #ifdef LINUX_NATIVE_AIO
425 /** Dispatch an AIO request to the kernel.
426 @param[in,out] slot an already reserved slot
427 @return true on success. */
428 bool linux_dispatch(Slot* slot)
429 MY_ATTRIBUTE((warn_unused_result));
430
431 /** Accessor for an AIO event
432 @param[in] index Index into the array
433 @return the event at the index */
io_events(ulint index)434 io_event* io_events(ulint index)
435 MY_ATTRIBUTE((warn_unused_result))
436 {
437 ut_a(index < m_events.size());
438
439 return(&m_events[index]);
440 }
441
442 /** Accessor for the AIO context
443 @param[in] segment Segment for which to get the context
444 @return the AIO context for the segment */
io_ctx(ulint segment)445 io_context* io_ctx(ulint segment)
446 MY_ATTRIBUTE((warn_unused_result))
447 {
448 ut_ad(segment < get_n_segments());
449
450 return(m_aio_ctx[segment]);
451 }
452
453 /** Creates an io_context for native linux AIO.
454 @param[in] max_events number of events
455 @param[out] io_ctx io_ctx to initialize.
456 @return true on success. */
457 static bool linux_create_io_ctx(ulint max_events, io_context_t* io_ctx)
458 MY_ATTRIBUTE((warn_unused_result));
459
460 /** Checks if the system supports native linux aio. On some kernel
461 versions where native aio is supported it won't work on tmpfs. In such
462 cases we can't use native aio as it is not possible to mix simulated
463 and native aio.
464 @return true if supported, false otherwise. */
465 static bool is_linux_native_aio_supported()
466 MY_ATTRIBUTE((warn_unused_result));
467 #endif /* LINUX_NATIVE_AIO */
468
469 #ifdef WIN_ASYNC_IO
470 /** Wakes up all async i/o threads in the array in Windows async I/O at
471 shutdown. */
signal()472 void signal()
473 {
474 for (ulint i = 0; i < m_slots.size(); ++i) {
475 SetEvent(m_slots[i].handle);
476 }
477 }
478
479 /** Wake up all AIO threads in Windows native aio */
wake_at_shutdown()480 static void wake_at_shutdown()
481 {
482 s_reads->signal();
483
484 if (s_writes != NULL) {
485 s_writes->signal();
486 }
487
488 if (s_ibuf != NULL) {
489 s_ibuf->signal();
490 }
491
492 if (s_log != NULL) {
493 s_log->signal();
494 }
495 }
496 #endif /* WIN_ASYNC_IO */
497
498 #ifdef _WIN32
499 /** This function can be called if one wants to post a batch of reads
500 and prefers an I/O - handler thread to handle them all at once later.You
501 must call os_aio_simulated_wake_handler_threads later to ensure the
502 threads are not left sleeping! */
503 static void simulated_put_read_threads_to_sleep();
504
505 /** The non asynchronous IO array.
506 @return the synchronous AIO array instance. */
sync_array()507 static AIO* sync_array()
508 MY_ATTRIBUTE((warn_unused_result))
509 {
510 return(s_sync);
511 }
512
513 /**
514 Get the AIO handles for a segment.
515 @param[in] segment The local segment.
516 @return the handles for the segment. */
handles(ulint segment)517 HANDLE* handles(ulint segment)
518 MY_ATTRIBUTE((warn_unused_result))
519 {
520 ut_ad(segment < m_handles->size() / slots_per_segment());
521
522 return(&(*m_handles)[segment * slots_per_segment()]);
523 }
524
525 /** @return true if no slots are reserved */
is_empty() const526 bool is_empty() const
527 MY_ATTRIBUTE((warn_unused_result))
528 {
529 ut_ad(is_mutex_owned());
530 return(m_n_reserved == 0);
531 }
532 #endif /* _WIN32 */
533
534 /** Create an instance using new(std::nothrow)
535 @param[in] id Latch ID
536 @param[in] n_slots The number of AIO request slots
537 @param[in] segments The number of segments
538 @return a new AIO instance */
539 static AIO* create(
540 latch_id_t id,
541 ulint n_slots,
542 ulint segments)
543 MY_ATTRIBUTE((warn_unused_result));
544
545 /** Initializes the asynchronous io system. Creates one array each
546 for ibuf and log I/O. Also creates one array each for read and write
547 where each array is divided logically into n_readers and n_writers
548 respectively. The caller must create an i/o handler thread for each
549 segment in these arrays. This function also creates the sync array.
550 No I/O handler thread needs to be created for that
551 @param[in] n_per_seg maximum number of pending aio
552 operations allowed per segment
553 @param[in] n_readers number of reader threads
554 @param[in] n_writers number of writer threads
555 @param[in] n_slots_sync number of slots in the sync aio array
556 @return true if AIO sub-system was started successfully */
557 static bool start(
558 ulint n_per_seg,
559 ulint n_readers,
560 ulint n_writers,
561 ulint n_slots_sync)
562 MY_ATTRIBUTE((warn_unused_result));
563
564 /** Free the AIO arrays */
565 static void shutdown();
566
567 /** Print all the AIO segments
568 @param[in,out] file Where to print */
569 static void print_all(FILE* file);
570
571 /** Calculates local segment number and aio array from global
572 segment number.
573 @param[out] array AIO wait array
574 @param[in] segment global segment number
575 @return local segment number within the aio array */
576 static ulint get_array_and_local_segment(
577 AIO** array,
578 ulint segment)
579 MY_ATTRIBUTE((warn_unused_result));
580
581 /** Select the IO slot array
582 @param[in] type Type of IO, READ or WRITE
583 @param[in] read_only true if running in read-only mode
584 @param[in] mode IO mode
585 @return slot array or NULL if invalid mode specified */
586 static AIO* select_slot_array(
587 IORequest& type,
588 bool read_only,
589 ulint mode)
590 MY_ATTRIBUTE((warn_unused_result));
591
592 /** Calculates segment number for a slot.
593 @param[in] array AIO wait array
594 @param[in] slot slot in this array
595 @return segment number (which is the number used by, for example,
596 I/O handler threads) */
597 static ulint get_segment_no_from_slot(
598 const AIO* array,
599 const Slot* slot)
600 MY_ATTRIBUTE((warn_unused_result));
601
602 /** Wakes up a simulated AIO I/O-handler thread if it has something
603 to do.
604 @param[in] global_segment the number of the segment in the
605 AIO arrays */
606 static void wake_simulated_handler_thread(ulint global_segment);
607
608 /** Check if it is a read request
609 @param[in] aio The AIO instance to check
610 @return true if the AIO instance is for reading. */
is_read(const AIO * aio)611 static bool is_read(const AIO* aio)
612 MY_ATTRIBUTE((warn_unused_result))
613 {
614 return(s_reads == aio);
615 }
616
617 /** Wait on an event until no pending writes */
wait_until_no_pending_writes()618 static void wait_until_no_pending_writes()
619 {
620 os_event_wait(AIO::s_writes->m_is_empty);
621 }
622
623 /** Print to file
624 @param[in] file File to write to */
625 static void print_to_file(FILE* file);
626
627 /** Check for pending IO. Gets the count and also validates the
628 data structures.
629 @return count of pending IO requests */
630 static ulint total_pending_io_count();
631
632 private:
633 /** Initialise the slots
634 @return DB_SUCCESS or error code */
635 dberr_t init_slots()
636 MY_ATTRIBUTE((warn_unused_result));
637
638 /** Wakes up a simulated AIO I/O-handler thread if it has something
639 to do for a local segment in the AIO array.
640 @param[in] global_segment the number of the segment in the
641 AIO arrays
642 @param[in] segment the local segment in the AIO array */
643 void wake_simulated_handler_thread(ulint global_segment, ulint segment);
644
645 /** Prints pending IO requests per segment of an aio array.
646 We probably don't need per segment statistics but they can help us
647 during development phase to see if the IO requests are being
648 distributed as expected.
649 @param[in,out] file file where to print
650 @param[in] segments pending IO array */
651 void print_segment_info(
652 FILE* file,
653 const ulint* segments);
654
655 #ifdef LINUX_NATIVE_AIO
656 /** Initialise the Linux native AIO data structures
657 @return DB_SUCCESS or error code */
658 dberr_t init_linux_native_aio()
659 MY_ATTRIBUTE((warn_unused_result));
660 #endif /* LINUX_NATIVE_AIO */
661
662 private:
663 typedef std::vector<Slot> Slots;
664
665 /** the mutex protecting the aio array */
666 mutable SysMutex m_mutex;
667
668 /** Pointer to the slots in the array.
669 Number of elements must be divisible by n_threads. */
670 Slots m_slots;
671
672 /** Number of segments in the aio array of pending aio requests.
673 A thread can wait separately for any one of the segments. */
674 ulint m_n_segments;
675
676 /** The event which is set to the signaled state when
677 there is space in the aio outside the ibuf segment */
678 os_event_t m_not_full;
679
680 /** The event which is set to the signaled state when
681 there are no pending i/os in this array */
682 os_event_t m_is_empty;
683
684 /** Number of reserved slots in the AIO array outside
685 the ibuf segment */
686 ulint m_n_reserved;
687
688 #ifdef _WIN32
689 typedef std::vector<HANDLE, ut_allocator<HANDLE> > Handles;
690
691 /** Pointer to an array of OS native event handles where
692 we copied the handles from slots, in the same order. This
693 can be used in WaitForMultipleObjects; used only in Windows */
694 Handles* m_handles;
695 #endif /* _WIN32 */
696
697 #if defined(LINUX_NATIVE_AIO)
698 typedef std::vector<io_event> IOEvents;
699
700 /** completion queue for IO. There is one such queue per
701 segment. Each thread will work on one ctx exclusively. */
702 io_context_t* m_aio_ctx;
703
704 /** The array to collect completed IOs. There is one such
705 event for each possible pending IO. The size of the array
706 is equal to m_slots.size(). */
707 IOEvents m_events;
708 #endif /* LINUX_NATIV_AIO */
709
710 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
711 sync AIO. These are NULL when the module has not yet been
712 initialized. */
713
714 /** Insert buffer */
715 static AIO* s_ibuf;
716
717 /** Redo log */
718 static AIO* s_log;
719
720 /** Reads */
721 static AIO* s_reads;
722
723 /** Writes */
724 static AIO* s_writes;
725
726 /** Synchronous I/O */
727 static AIO* s_sync;
728 };
729
730 /** Static declarations */
731 AIO* AIO::s_reads;
732 AIO* AIO::s_writes;
733 AIO* AIO::s_ibuf;
734 AIO* AIO::s_log;
735 AIO* AIO::s_sync;
736
737 #if defined(LINUX_NATIVE_AIO)
738 /** timeout for each io_getevents() call = 500ms. */
739 static const ulint OS_AIO_REAP_TIMEOUT = 500000000UL;
740
741 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
742 static const ulint OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
743
744 /** number of attempts before giving up on io_setup(). */
745 static const int OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
746 #endif /* LINUX_NATIVE_AIO */
747
748 /** Array of events used in simulated AIO */
749 static os_event_t* os_aio_segment_wait_events = NULL;
750
751 /** Number of asynchronous I/O segments. Set by os_aio_init(). */
752 static ulint os_aio_n_segments = ULINT_UNDEFINED;
753
754 /** If the following is true, read i/o handler threads try to
755 wait until a batch of new read requests have been posted */
756 static bool os_aio_recommend_sleep_for_read_threads = false;
757 #endif /* !UNIV_HOTBACKUP */
758
759 ulint os_n_file_reads = 0;
760 ulint os_bytes_read_since_printout = 0;
761 ulint os_n_file_writes = 0;
762 ulint os_n_fsyncs = 0;
763 ulint os_n_file_reads_old = 0;
764 ulint os_n_file_writes_old = 0;
765 ulint os_n_fsyncs_old = 0;
766 /** Number of pending write operations */
767 ulint os_n_pending_writes = 0;
768 /** Number of pending read operations */
769 ulint os_n_pending_reads = 0;
770
771 ib_time_monotonic_t os_last_printout;
772 bool os_has_said_disk_full = false;
773
774 /** Default Zip compression level */
775 extern uint page_zip_level;
776
777 #if DATA_TRX_ID_LEN > 6
778 #error "COMPRESSION_ALGORITHM will not fit"
779 #endif /* DATA_TRX_ID_LEN */
780
/** Checks that the aio system is in a consistent state.
@return true if ok */
static
bool
os_aio_validate();
786
/** Handles the error after a file operation has failed.
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@return true if we should retry the operation */
static
bool
os_file_handle_error(
	const char*	name,
	const char*	operation);
796
797 /** Free storage space associated with a section of the file.
798 @param[in] fh Open file handle
799 @param[in] off Starting offset (SEEK_SET)
800 @param[in] len Size of the hole
801 @return DB_SUCCESS or error code */
802 dberr_t
803 os_file_punch_hole(
804 os_file_t fh,
805 os_offset_t off,
806 os_offset_t len);
807
/** Handles the error after a file operation has failed.
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@param[in]	silent		if true then don't print any message to
				the log.
@return true if we should retry the operation */
static
bool
os_file_handle_error_no_exit(
	const char*	name,
	const char*	operation,
	bool		silent);
820
821 /** Decompress after a read and punch a hole in the file if it was a write
822 @param[in] type IO context
823 @param[in] fh Open file handle
824 @param[in,out] buf Buffer to transform
825 @param[in,out] scratch Scratch area for read decompression
826 @param[in] src_len Length of the buffer before compression
827 @param[in] len Compressed buffer length for write and size
828 of buf len for read
829 @return DB_SUCCESS or error code */
830 static
831 dberr_t
832 os_file_io_complete(
833 const IORequest&type,
834 os_file_t fh,
835 byte* buf,
836 byte* scratch,
837 ulint src_len,
838 os_offset_t offset,
839 ulint len);
840
841 /** Does simulated AIO. This function should be called by an i/o-handler
842 thread.
843
844 @param[in] segment The number of the segment in the aio arrays to wait
845 for; segment 0 is the ibuf i/o thread, segment 1 the
846 log i/o thread, then follow the non-ibuf read threads,
847 and as the last are the non-ibuf write threads
848 @param[out] m1 the messages passed with the AIO request; note that
849 also in the case where the AIO operation failed, these
850 output parameters are valid and can be used to restart
851 the operation, for example
852 @param[out] m2 Callback argument
853 @param[in] type IO context
854 @return DB_SUCCESS or error code */
855 static
856 dberr_t
857 os_aio_simulated_handler(
858 ulint global_segment,
859 fil_node_t** m1,
860 void** m2,
861 IORequest* type);
862
#if defined(WIN_ASYNC_IO)
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays
				to wait for; segment 0 is the ibuf I/O
				thread, segment 1 the log I/O thread, then
				follow the non-ibuf read threads, and as
				the last are the non-ibuf write threads; if
				this is ULINT_UNDEFINED, then it means that
				sync AIO is used, and this parameter is
				ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request;
				note that also in the case where the AIO
				operation failed, these output parameters
				are valid and can be used to restart the
				operation, for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
#endif /* WIN_ASYNC_IO */
896
897 /** Allocate a page for sync IO
898 @return pointer to page */
899 static
900 Block*
os_alloc_block()901 os_alloc_block()
902 {
903 size_t pos;
904 Blocks& blocks = *block_cache;
905 size_t i = static_cast<size_t>(my_timer_cycles());
906 const size_t size = blocks.size();
907 ulint retry = 0;
908 Block* block;
909
910 DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
911
912 for (;;) {
913
914 /* After go through the block cache for 3 times,
915 allocate a new temporary block. */
916 if (retry == MAX_BLOCKS * 3) {
917 byte* ptr;
918
919 ptr = static_cast<byte*>(
920 ut_malloc_nokey(sizeof(*block)
921 + BUFFER_BLOCK_SIZE));
922
923 block = new (ptr) Block();
924 block->m_ptr = static_cast<byte*>(
925 ptr + sizeof(*block));
926 block->m_in_use = 1;
927
928 break;
929 }
930
931 pos = i++ % size;
932
933 if (TAS(&blocks[pos].m_in_use, 1) == 0) {
934 block = &blocks[pos];
935 break;
936 }
937
938 os_thread_yield();
939
940 ++retry;
941 }
942
943 ut_a(block->m_in_use != 0);
944
945 return(block);
946 }
947
948 /** Free a page after sync IO
949 @param[in,own] block The block to free/release */
950 static
951 void
os_free_block(Block * block)952 os_free_block(Block* block)
953 {
954 ut_ad(block->m_in_use == 1);
955
956 TAS(&block->m_in_use, 0);
957
958 /* When this block is not in the block cache, and it's
959 a temporary block, we need to free it directly. */
960 if (std::less<Block*>()(block, &block_cache->front())
961 || std::greater<Block*>()(block, &block_cache->back())) {
962 ut_free(block);
963 }
964 }
965
966 /** Generic AIO Handler methods. Currently handles IO post processing. */
967 class AIOHandler {
968 public:
969 /** Do any post processing after a read/write
970 @return DB_SUCCESS or error code. */
971 static dberr_t post_io_processing(Slot* slot);
972
973 /** Decompress after a read and punch a hole in the file if
974 it was a write */
io_complete(const Slot * slot)975 static dberr_t io_complete(const Slot* slot)
976 {
977 ut_a(slot->offset > 0);
978 ut_a(slot->type.is_read() || !slot->skip_punch_hole);
979 return(os_file_io_complete(
980 slot->type, slot->file.m_file, slot->buf,
981 NULL, slot->original_len,
982 slot->offset, slot->len));
983 }
984
985 private:
986 /** Check whether the page was encrypted.
987 @param[in] slot The slot that contains the IO request
988 @return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)989 static bool is_encrypted_page(const Slot* slot)
990 {
991 return(Encryption::is_encrypted_page(slot->buf));
992 }
993
994 /** Check whether the page was compressed.
995 @param[in] slot The slot that contains the IO request
996 @return true if it was a compressed page */
is_compressed_page(const Slot * slot)997 static bool is_compressed_page(const Slot* slot)
998 {
999 const byte* src = slot->buf;
1000
1001 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1002
1003 return(page_type == FIL_PAGE_COMPRESSED);
1004 }
1005
1006 /** Get the compressed page size.
1007 @param[in] slot The slot that contains the IO request
1008 @return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1009 static ulint compressed_page_size(const Slot* slot)
1010 {
1011 ut_ad(slot->type.is_read());
1012 ut_ad(is_compressed_page(slot));
1013
1014 ulint size;
1015 const byte* src = slot->buf;
1016
1017 size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1018
1019 return(size + FIL_PAGE_DATA);
1020 }
1021
1022 /** Check if the page contents can be decompressed.
1023 @param[in] slot The slot that contains the IO request
1024 @return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1025 static bool can_decompress(const Slot* slot)
1026 {
1027 ut_ad(slot->type.is_read());
1028 ut_ad(is_compressed_page(slot));
1029
1030 ulint version;
1031 const byte* src = slot->buf;
1032
1033 version = mach_read_from_1(src + FIL_PAGE_VERSION);
1034
1035 ut_a(Compression::is_valid_page_version(version));
1036
1037 /* Includes the page header size too */
1038 ulint size = compressed_page_size(slot);
1039
1040 return(size <= (slot->ptr - slot->buf) + (ulint) slot->n_bytes);
1041 }
1042
1043 /** Check if we need to read some more data.
1044 @param[in] slot The slot that contains the IO request
1045 @param[in] n_bytes Total bytes read so far
1046 @return DB_SUCCESS or error code */
1047 static dberr_t check_read(Slot* slot, ulint n_bytes);
1048 };
1049
1050 /** Helper class for doing synchronous file IO. Currently, the objective
1051 is to hide the OS specific code, so that the higher level functions aren't
1052 peppered with #ifdef. Makes the code flow difficult to follow. */
1053 class SyncFileIO {
1054 public:
1055 /** Constructor
1056 @param[in] fh File handle
1057 @param[in,out] buf Buffer to read/write
1058 @param[in] n Number of bytes to read/write
1059 @param[in] offset Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1060 SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
1061 :
1062 m_fh(fh),
1063 m_buf(buf),
1064 m_n(static_cast<ssize_t>(n)),
1065 m_offset(offset)
1066 {
1067 ut_ad(m_n > 0);
1068 }
1069
1070 /** Destructor */
~SyncFileIO()1071 ~SyncFileIO()
1072 {
1073 /* No op */
1074 }
1075
1076 /** Do the read/write
1077 @param[in] request The IO context and type
1078 @return the number of bytes read/written or negative value on error */
1079 ssize_t execute(const IORequest& request);
1080
1081 /** Do the read/write
1082 @param[in,out] slot The IO slot, it has the IO context
1083 @return the number of bytes read/written or negative value on error */
1084 static ssize_t execute(Slot* slot);
1085
1086 /** Move the read/write offset up to where the partial IO succeeded.
1087 @param[in] n_bytes The number of bytes to advance */
advance(ssize_t n_bytes)1088 void advance(ssize_t n_bytes)
1089 {
1090 m_offset += n_bytes;
1091
1092 ut_ad(m_n >= n_bytes);
1093
1094 m_n -= n_bytes;
1095
1096 m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1097 }
1098
1099 private:
1100 /** Open file handle */
1101 os_file_t m_fh;
1102
1103 /** Buffer to read/write */
1104 void* m_buf;
1105
1106 /** Number of bytes to read/write */
1107 ssize_t m_n;
1108
1109 /** Offset from where to read/write */
1110 os_offset_t m_offset;
1111 };
1112
1113 /** If it is a compressed page return the compressed page data + footer size
1114 @param[in] buf Buffer to check, must include header + 10 bytes
1115 @return ULINT_UNDEFINED if the page is not a compressed page or length
1116 of the compressed data (including footer) if it is a compressed page */
1117 ulint
os_file_compressed_page_size(const byte * buf)1118 os_file_compressed_page_size(const byte* buf)
1119 {
1120 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1121
1122 if (type == FIL_PAGE_COMPRESSED) {
1123 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1124 ut_a(Compression::is_valid_page_version(version));
1125 return(mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1126 }
1127
1128 return(ULINT_UNDEFINED);
1129 }
1130
1131 /** If it is a compressed page return the original page data + footer size
1132 @param[in] buf Buffer to check, must include header + 10 bytes
1133 @return ULINT_UNDEFINED if the page is not a compressed page or length
1134 of the original data + footer if it is a compressed page */
1135 ulint
os_file_original_page_size(const byte * buf)1136 os_file_original_page_size(const byte* buf)
1137 {
1138 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1139
1140 if (type == FIL_PAGE_COMPRESSED) {
1141
1142 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1143 ut_a(Compression::is_valid_page_version(version));
1144
1145 return(mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1146 }
1147
1148 return(ULINT_UNDEFINED);
1149 }
1150
1151 /** Check if we need to read some more data.
1152 @param[in] slot The slot that contains the IO request
1153 @param[in] n_bytes Total bytes read so far
1154 @return DB_SUCCESS or error code */
1155 dberr_t
check_read(Slot * slot,ulint n_bytes)1156 AIOHandler::check_read(Slot* slot, ulint n_bytes)
1157 {
1158 dberr_t err;
1159
1160 ut_ad(slot->type.is_read());
1161 ut_ad(slot->original_len > slot->len);
1162
1163 if (is_compressed_page(slot)) {
1164
1165 if (can_decompress(slot)) {
1166
1167 ut_a(slot->offset > 0);
1168
1169 slot->len = slot->original_len;
1170 #ifdef _WIN32
1171 slot->n_bytes = static_cast<DWORD>(n_bytes);
1172 #else
1173 slot->n_bytes = static_cast<ulint>(n_bytes);
1174 #endif /* _WIN32 */
1175
1176 err = io_complete(slot);
1177 ut_a(err == DB_SUCCESS);
1178 } else {
1179 /* Read the next block in */
1180 ut_ad(compressed_page_size(slot) >= n_bytes);
1181
1182 err = DB_FAIL;
1183 }
1184 } else if (is_encrypted_page(slot)) {
1185 ut_a(slot->offset > 0);
1186
1187 slot->len = slot->original_len;
1188 #ifdef _WIN32
1189 slot->n_bytes = static_cast<DWORD>(n_bytes);
1190 #else
1191 slot->n_bytes = static_cast<ulint>(n_bytes);
1192 #endif /* _WIN32 */
1193
1194 err = io_complete(slot);
1195 ut_a(err == DB_SUCCESS);
1196
1197 } else {
1198 err = DB_FAIL;
1199 }
1200
1201 if (slot->buf_block != NULL) {
1202 os_free_block(slot->buf_block);
1203 slot->buf_block = NULL;
1204 }
1205
1206 return(err);
1207 }
1208
1209 /** Do any post processing after a read/write
1210 @return DB_SUCCESS or error code. */
1211 dberr_t
post_io_processing(Slot * slot)1212 AIOHandler::post_io_processing(Slot* slot)
1213 {
1214 dberr_t err;
1215
1216 ut_ad(slot->is_reserved);
1217
1218 /* Total bytes read so far */
1219 ulint n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1220
1221 /* Compressed writes can be smaller than the original length.
1222 Therefore they can be processed without further IO. */
1223 if (n_bytes == slot->original_len
1224 || (slot->type.is_write()
1225 && slot->type.is_compressed()
1226 && slot->len == static_cast<ulint>(slot->n_bytes))) {
1227
1228 if (!slot->type.is_log()
1229 && (is_compressed_page(slot)
1230 || is_encrypted_page(slot))) {
1231
1232 ut_a(slot->offset > 0);
1233
1234 if (slot->type.is_read()) {
1235 slot->len = slot->original_len;
1236 }
1237
1238 /* The punch hole has been done on collect() */
1239
1240 if (slot->type.is_read()) {
1241 err = io_complete(slot);
1242 } else {
1243 err = DB_SUCCESS;
1244 }
1245
1246 ut_ad(err == DB_SUCCESS
1247 || err == DB_UNSUPPORTED
1248 || err == DB_CORRUPTION
1249 || err == DB_IO_DECOMPRESS_FAIL);
1250 } else {
1251
1252 err = DB_SUCCESS;
1253 }
1254
1255 if (slot->buf_block != NULL) {
1256 os_free_block(slot->buf_block);
1257 slot->buf_block = NULL;
1258 }
1259
1260 } else if ((ulint) slot->n_bytes == (ulint) slot->len) {
1261
1262 /* It *must* be a partial read. */
1263 ut_ad(slot->len < slot->original_len);
1264
1265 /* Has to be a read request, if it is less than
1266 the original length. */
1267 ut_ad(slot->type.is_read());
1268 err = check_read(slot, n_bytes);
1269
1270 } else {
1271 err = DB_FAIL;
1272 }
1273
1274 return(err);
1275 }
1276
1277 /** Count the number of free slots
1278 @return number of reserved slots */
1279 ulint
pending_io_count() const1280 AIO::pending_io_count() const
1281 {
1282 acquire();
1283
1284 #ifdef UNIV_DEBUG
1285 ut_a(m_n_segments > 0);
1286 ut_a(!m_slots.empty());
1287
1288 ulint count = 0;
1289
1290 for (ulint i = 0; i < m_slots.size(); ++i) {
1291
1292 const Slot& slot = m_slots[i];
1293
1294 if (slot.is_reserved) {
1295 ++count;
1296 ut_a(slot.len > 0);
1297 }
1298 }
1299
1300 ut_a(m_n_reserved == count);
1301 #endif /* UNIV_DEBUG */
1302
1303 ulint reserved = m_n_reserved;
1304
1305 release();
1306
1307 return(reserved);
1308 }
1309
1310 /** Compress a data page
1311 #param[in] block_size File system block size
1312 @param[in] src Source contents to compress
1313 @param[in] src_len Length in bytes of the source
1314 @param[out] dst Compressed page contents
1315 @param[out] dst_len Length in bytes of dst contents
1316 @return buffer data, dst_len will have the length of the data */
1317 static
1318 byte*
os_file_compress_page(Compression compression,ulint block_size,byte * src,ulint src_len,byte * dst,ulint * dst_len)1319 os_file_compress_page(
1320 Compression compression,
1321 ulint block_size,
1322 byte* src,
1323 ulint src_len,
1324 byte* dst,
1325 ulint* dst_len)
1326 {
1327 ulint len = 0;
1328 ulint compression_level = page_zip_level;
1329 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1330
1331 /* The page size must be a multiple of the OS punch hole size. */
1332 ut_ad(!(src_len % block_size));
1333
1334 /* Shouldn't compress an already compressed page. */
1335 ut_ad(page_type != FIL_PAGE_COMPRESSED);
1336
1337 /* The page must be at least twice as large as the file system
1338 block size if we are to save any space. Ignore R-Tree pages for now,
1339 they repurpose the same 8 bytes in the page header. No point in
1340 compressing if the file system block size >= our page size. */
1341
1342 if (page_type == FIL_PAGE_RTREE
1343 || block_size == ULINT_UNDEFINED
1344 || compression.m_type == Compression::NONE
1345 || src_len < block_size * 2) {
1346
1347 *dst_len = src_len;
1348
1349 return(src);
1350 }
1351
1352 /* Leave the header alone when compressing. */
1353 ut_ad(block_size >= FIL_PAGE_DATA * 2);
1354
1355 ut_ad(src_len > FIL_PAGE_DATA + block_size);
1356
1357 /* Must compress to <= N-1 FS blocks. */
1358 ulint out_len = src_len - (FIL_PAGE_DATA + block_size);
1359
1360 /* This is the original data page size - the page header. */
1361 ulint content_len = src_len - FIL_PAGE_DATA;
1362
1363 ut_ad(out_len >= block_size - FIL_PAGE_DATA);
1364 ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA));
1365
1366 /* Only compress the data + trailer, leave the header alone */
1367
1368 switch (compression.m_type) {
1369 case Compression::NONE:
1370 ut_error;
1371
1372 case Compression::ZLIB: {
1373
1374 uLongf zlen = static_cast<uLongf>(out_len);
1375
1376 if (compress2(
1377 dst + FIL_PAGE_DATA,
1378 &zlen,
1379 src + FIL_PAGE_DATA,
1380 static_cast<uLong>(content_len),
1381 static_cast<int>(compression_level)) != Z_OK) {
1382
1383 *dst_len = src_len;
1384
1385 return(src);
1386 }
1387
1388 len = static_cast<ulint>(zlen);
1389
1390 break;
1391 }
1392
1393 case Compression::LZ4:
1394
1395 len = LZ4_compress_default(
1396 reinterpret_cast<char*>(src) + FIL_PAGE_DATA,
1397 reinterpret_cast<char*>(dst) + FIL_PAGE_DATA,
1398 static_cast<int>(content_len),
1399 static_cast<int>(out_len));
1400
1401 ut_a(len <= src_len - FIL_PAGE_DATA);
1402
1403 if (len == 0 || len >= out_len) {
1404
1405 *dst_len = src_len;
1406
1407 return(src);
1408 }
1409
1410 break;
1411
1412 default:
1413 *dst_len = src_len;
1414 return(src);
1415 }
1416
1417 ut_a(len <= out_len);
1418
1419 ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
1420 src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)
1421 == 0);
1422
1423 /* Copy the header as is. */
1424 memmove(dst, src, FIL_PAGE_DATA);
1425
1426 /* Add compression control information. Required for decompressing. */
1427 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
1428
1429 mach_write_to_1(dst + FIL_PAGE_VERSION, Compression::FIL_PAGE_VERSION_2);
1430
1431 mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);
1432
1433 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
1434
1435 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);
1436
1437 mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);
1438
1439 /* Round to the next full block size */
1440
1441 len += FIL_PAGE_DATA;
1442
1443 *dst_len = ut_calc_align(len, block_size);
1444
1445 ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA);
1446
1447 /* Clear out the unused portion of the page. */
1448 if (len % block_size) {
1449 memset(dst + len, 0x0, block_size - (len % block_size));
1450 }
1451
1452 return(dst);
1453 }
1454
1455 #ifdef UNIV_DEBUG
1456 # ifndef UNIV_HOTBACKUP
1457 /** Validates the consistency the aio system some of the time.
1458 @return true if ok or the check was skipped */
1459 bool
os_aio_validate_skip()1460 os_aio_validate_skip()
1461 {
1462 /** Try os_aio_validate() every this many times */
1463 # define OS_AIO_VALIDATE_SKIP 13
1464
1465 /** The os_aio_validate() call skip counter.
1466 Use a signed type because of the race condition below. */
1467 static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1468
1469 /* There is a race condition below, but it does not matter,
1470 because this call is only for heuristic purposes. We want to
1471 reduce the call frequency of the costly os_aio_validate()
1472 check in debug builds. */
1473 --os_aio_validate_count;
1474
1475 if (os_aio_validate_count > 0) {
1476 return(true);
1477 }
1478
1479 os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1480 return(os_aio_validate());
1481 }
1482 # endif /* !UNIV_HOTBACKUP */
1483 #endif /* UNIV_DEBUG */
1484
1485 #undef USE_FILE_LOCK
1486 //#define USE_FILE_LOCK
1487 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1488 /* InnoDB Hot Backup does not lock the data files.
1489 * On Windows, mandatory locking is used.
1490 */
1491 # undef USE_FILE_LOCK
1492 #endif
1493 #ifdef USE_FILE_LOCK
1494 /** Obtain an exclusive lock on a file.
1495 @param[in] fd file descriptor
1496 @param[in] name file name
1497 @return 0 on success */
1498 static
1499 int
os_file_lock(int fd,const char * name)1500 os_file_lock(
1501 int fd,
1502 const char* name)
1503 {
1504 struct flock lk;
1505
1506 lk.l_type = F_WRLCK;
1507 lk.l_whence = SEEK_SET;
1508 lk.l_start = lk.l_len = 0;
1509
1510 if (fcntl(fd, F_SETLK, &lk) == -1) {
1511
1512 ib::error()
1513 << "Unable to lock " << name
1514 << " error: " << errno;
1515
1516 if (errno == EAGAIN || errno == EACCES) {
1517
1518 ib::info()
1519 << "Check that you do not already have"
1520 " another mysqld process using the"
1521 " same InnoDB data or log files.";
1522 }
1523
1524 return(-1);
1525 }
1526
1527 return(0);
1528 }
1529 #endif /* USE_FILE_LOCK */
1530
1531 #ifndef UNIV_HOTBACKUP
1532
1533 /** Calculates local segment number and aio array from global segment number.
1534 @param[out] array aio wait array
1535 @param[in] segment global segment number
1536 @return local segment number within the aio array */
1537 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1538 AIO::get_array_and_local_segment(
1539 AIO** array,
1540 ulint segment)
1541 {
1542 ulint local_segment;
1543 ulint n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1544
1545 ut_a(segment < os_aio_n_segments);
1546
1547 if (!srv_read_only_mode && segment < n_extra_segs) {
1548
1549 /* We don't support ibuf/log IO during read only mode. */
1550
1551 if (segment == IO_IBUF_SEGMENT) {
1552
1553 *array = s_ibuf;
1554
1555 } else if (segment == IO_LOG_SEGMENT) {
1556
1557 *array = s_log;
1558
1559 } else {
1560 *array = NULL;
1561 }
1562
1563 local_segment = 0;
1564
1565 } else if (segment < s_reads->m_n_segments + n_extra_segs) {
1566
1567 *array = s_reads;
1568 local_segment = segment - n_extra_segs;
1569
1570 } else {
1571 *array = s_writes;
1572
1573 local_segment = segment
1574 - (s_reads->m_n_segments + n_extra_segs);
1575 }
1576
1577 return(local_segment);
1578 }
1579
1580 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1581 @param[in,out] slot Slot to release */
1582 void
release(Slot * slot)1583 AIO::release(Slot* slot)
1584 {
1585 ut_ad(is_mutex_owned());
1586
1587 ut_ad(slot->is_reserved);
1588
1589 slot->is_reserved = false;
1590
1591 --m_n_reserved;
1592
1593 if (m_n_reserved == m_slots.size() - 1) {
1594 os_event_set(m_not_full);
1595 }
1596
1597 if (m_n_reserved == 0) {
1598 os_event_set(m_is_empty);
1599 }
1600
1601 #ifdef WIN_ASYNC_IO
1602
1603 ResetEvent(slot->handle);
1604
1605 #elif defined(LINUX_NATIVE_AIO)
1606
1607 if (srv_use_native_aio) {
1608 memset(&slot->control, 0x0, sizeof(slot->control));
1609 slot->ret = 0;
1610 slot->n_bytes = 0;
1611 } else {
1612 /* These fields should not be used if we are not
1613 using native AIO. */
1614 ut_ad(slot->n_bytes == 0);
1615 ut_ad(slot->ret == 0);
1616 }
1617
1618 #endif /* WIN_ASYNC_IO */
1619 }
1620
1621 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1622 @param[in,out] slot Slot to release */
1623 void
release_with_mutex(Slot * slot)1624 AIO::release_with_mutex(Slot* slot)
1625 {
1626 acquire();
1627
1628 release(slot);
1629
1630 release();
1631 }
1632
1633 /** Creates a temporary file. This function is like tmpfile(3), but
1634 the temporary file is created in the given parameter path. If the path
1635 is NULL then it will create the file in the MySQL server configuration
1636 parameter (--tmpdir).
1637 @param[in] path location for creating temporary file
1638 @return temporary file handle, or NULL on error */
1639 FILE*
os_file_create_tmpfile(const char * path)1640 os_file_create_tmpfile(
1641 const char* path)
1642 {
1643 FILE* file = NULL;
1644 int fd = innobase_mysql_tmpfile(path);
1645
1646 if (fd >= 0) {
1647 file = fdopen(fd, "w+b");
1648 }
1649
1650 if (file == NULL) {
1651
1652 ib::error()
1653 << "Unable to create temporary file; errno: "
1654 << errno;
1655
1656 if (fd >= 0) {
1657 close(fd);
1658 }
1659 }
1660
1661 return(file);
1662 }
1663
1664 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1665 NUL-terminate str. All errors are silently ignored. This function is
1666 mostly meant to be used with temporary files.
1667 @param[in,out] file File to read from
1668 @param[in,out] str Buffer where to read
1669 @param[in] size Size of buffer */
1670 void
os_file_read_string(FILE * file,char * str,ulint size)1671 os_file_read_string(
1672 FILE* file,
1673 char* str,
1674 ulint size)
1675 {
1676 if (size != 0) {
1677 rewind(file);
1678
1679 size_t flen = fread(str, 1, size - 1, file);
1680
1681 str[flen] = '\0';
1682 }
1683 }
1684
1685 /** Decompress after a read and punch a hole in the file if it was a write
1686 @param[in] type IO context
1687 @param[in] fh Open file handle
1688 @param[in,out] buf Buffer to transform
1689 @param[in,out] scratch Scratch area for read decompression
1690 @param[in] src_len Length of the buffer before compression
1691 @param[in] len Used buffer length for write and output
1692 buf len for read
1693 @return DB_SUCCESS or error code */
1694 static
1695 dberr_t
os_file_io_complete(const IORequest & type,os_file_t fh,byte * buf,byte * scratch,ulint src_len,os_offset_t offset,ulint len)1696 os_file_io_complete(
1697 const IORequest&type,
1698 os_file_t fh,
1699 byte* buf,
1700 byte* scratch,
1701 ulint src_len,
1702 os_offset_t offset,
1703 ulint len)
1704 {
1705 /* We never compress/decompress the first page */
1706 ut_a(offset > 0);
1707 ut_ad(type.validate());
1708
1709 if (!type.is_compression_enabled()) {
1710
1711 return(DB_SUCCESS);
1712
1713 } else if (type.is_read() && !srv_backup_mode) {
1714 /* Do not decrypt / decompress when taking a backup.
1715 We actually decompress the pages in fil_cur.
1716 We want encrypted pages to remain encrypted. */
1717 dberr_t ret;
1718 Encryption encryption(type.encryption_algorithm());
1719
1720 ut_ad(!type.is_log());
1721
1722 ret = encryption.decrypt(type, buf, src_len, scratch, len);
1723 if (ret == DB_SUCCESS) {
1724 return(os_file_decompress_page(
1725 type.is_dblwr_recover(),
1726 buf, scratch, len));
1727 } else {
1728 return(ret);
1729 }
1730
1731 } else if (type.punch_hole()) {
1732
1733 ut_ad(len <= src_len);
1734 ut_ad(!type.is_log());
1735 ut_ad(type.is_write());
1736 ut_ad(type.is_compressed());
1737
1738 /* Nothing to do. */
1739 if (len == src_len) {
1740 return(DB_SUCCESS);
1741 }
1742
1743 #ifdef UNIV_DEBUG
1744 const ulint block_size = type.block_size();
1745 #endif /* UNIV_DEBUG */
1746
1747 /* We don't support multiple page sizes in the server
1748 at the moment. */
1749 ut_ad(src_len == srv_page_size);
1750
1751 /* Must be a multiple of the compression unit size. */
1752 ut_ad((len % block_size) == 0);
1753 ut_ad((offset % block_size) == 0);
1754
1755 ut_ad(len + block_size <= src_len);
1756
1757 offset += len;
1758
1759 return(os_file_punch_hole(fh, offset, src_len - len));
1760 }
1761
1762 ut_ad(!type.is_log());
1763
1764 return(DB_SUCCESS);
1765 }
1766
1767 #endif /* !UNIV_HOTBACKUP */
1768
1769 /** This function returns a new path name after replacing the basename
1770 in an old path with a new basename. The old_path is a full path
1771 name including the extension. The tablename is in the normal
1772 form "databasename/tablename". The new base name is found after
1773 the forward slash. Both input strings are null terminated.
1774
1775 This function allocates memory to be returned. It is the callers
1776 responsibility to free the return value after it is no longer needed.
1777
1778 @param[in] old_path Pathname
1779 @param[in] tablename Contains new base name
1780 @return own: new full pathname */
1781 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1782 os_file_make_new_pathname(
1783 const char* old_path,
1784 const char* tablename)
1785 {
1786 ulint dir_len;
1787 char* last_slash;
1788 char* base_name;
1789 char* new_path;
1790 ulint new_path_len;
1791
1792 /* Split the tablename into its database and table name components.
1793 They are separated by a '/'. */
1794 last_slash = strrchr((char*) tablename, '/');
1795 base_name = last_slash ? last_slash + 1 : (char*) tablename;
1796
1797 /* Find the offset of the last slash. We will strip off the
1798 old basename.ibd which starts after that slash. */
1799 last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1800 dir_len = last_slash ? last_slash - old_path : strlen(old_path);
1801
1802 /* allocate a new path and move the old directory path to it. */
1803 new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1804 new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1805 memcpy(new_path, old_path, dir_len);
1806
1807 ut_snprintf(new_path + dir_len,
1808 new_path_len - dir_len,
1809 "%c%s.ibd",
1810 OS_PATH_SEPARATOR,
1811 base_name);
1812
1813 return(new_path);
1814 }
1815
1816 /** This function reduces a null-terminated full remote path name into
1817 the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
1818 the 'databasename/tablename.ibd' found at the end of the path with just
1819 'tablename'.
1820
1821 Since the result is always smaller than the path sent in, no new memory
1822 is allocated. The caller should allocate memory for the path sent in.
1823 This function manipulates that path in place.
1824
1825 If the path format is not as expected, just return. The result is used
1826 to inform a SHOW CREATE TABLE command.
1827 @param[in,out] data_dir_path Full path/data_dir_path */
1828 void
os_file_make_data_dir_path(char * data_dir_path)1829 os_file_make_data_dir_path(
1830 char* data_dir_path)
1831 {
1832 /* Replace the period before the extension with a null byte. */
1833 char* ptr = strrchr((char*) data_dir_path, '.');
1834
1835 if (ptr == NULL) {
1836 return;
1837 }
1838
1839 ptr[0] = '\0';
1840
1841 /* The tablename starts after the last slash. */
1842 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1843
1844 if (ptr == NULL) {
1845 return;
1846 }
1847
1848 ptr[0] = '\0';
1849
1850 char* tablename = ptr + 1;
1851
1852 /* The databasename starts after the next to last slash. */
1853 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1854
1855 if (ptr == NULL) {
1856 return;
1857 }
1858
1859 ulint tablename_len = ut_strlen(tablename);
1860
1861 ut_memmove(++ptr, tablename, tablename_len);
1862
1863 ptr[tablename_len] = '\0';
1864 }
1865
1866 /** Check if the path refers to the root of a drive using a pointer
1867 to the last directory separator that the caller has fixed.
1868 @param[in] path path name
1869 @param[in] path last directory separator in the path
1870 @return true if this path is a drive root, false if not */
1871 UNIV_INLINE
1872 bool
os_file_is_root(const char * path,const char * last_slash)1873 os_file_is_root(
1874 const char* path,
1875 const char* last_slash)
1876 {
1877 return(
1878 #ifdef _WIN32
1879 (last_slash == path + 2 && path[1] == ':') ||
1880 #endif /* _WIN32 */
1881 last_slash == path);
1882 }
1883
1884 /** Return the parent directory component of a null-terminated path.
1885 Return a new buffer containing the string up to, but not including,
1886 the final component of the path.
1887 The path returned will not contain a trailing separator.
1888 Do not return a root path, return NULL instead.
1889 The final component trimmed off may be a filename or a directory name.
1890 If the final component is the only component of the path, return NULL.
1891 It is the caller's responsibility to free the returned string after it
1892 is no longer needed.
1893 @param[in] path Path name
1894 @return own: parent directory of the path */
1895 static
1896 char*
os_file_get_parent_dir(const char * path)1897 os_file_get_parent_dir(
1898 const char* path)
1899 {
1900 bool has_trailing_slash = false;
1901
1902 /* Find the offset of the last slash */
1903 const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
1904
1905 if (!last_slash) {
1906 /* No slash in the path, return NULL */
1907 return(NULL);
1908 }
1909
1910 /* Ok, there is a slash. Is there anything after it? */
1911 if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1912 has_trailing_slash = true;
1913 }
1914
1915 /* Reduce repetative slashes. */
1916 while (last_slash > path
1917 && last_slash[-1] == OS_PATH_SEPARATOR) {
1918 last_slash--;
1919 }
1920
1921 /* Check for the root of a drive. */
1922 if (os_file_is_root(path, last_slash)) {
1923 return(NULL);
1924 }
1925
1926 /* If a trailing slash prevented the first strrchr() from trimming
1927 the last component of the path, trim that component now. */
1928 if (has_trailing_slash) {
1929 /* Back up to the previous slash. */
1930 last_slash--;
1931 while (last_slash > path
1932 && last_slash[0] != OS_PATH_SEPARATOR) {
1933 last_slash--;
1934 }
1935
1936 /* Reduce repetative slashes. */
1937 while (last_slash > path
1938 && last_slash[-1] == OS_PATH_SEPARATOR) {
1939 last_slash--;
1940 }
1941 }
1942
1943 /* Check for the root of a drive. */
1944 if (os_file_is_root(path, last_slash)) {
1945 return(NULL);
1946 }
1947
1948 if (last_slash - path < 0) {
1949 /* Sanity check, it prevents gcc from trying to handle this case which
1950 * results in warnings for some optimized builds */
1951 return (NULL);
1952 }
1953
1954 /* Non-trivial directory component */
1955
1956 return(mem_strdupl(path, last_slash - path));
1957 }
1958 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1959
1960 /* Test the function os_file_get_parent_dir. */
1961 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1962 test_os_file_get_parent_dir(
1963 const char* child_dir,
1964 const char* expected_dir)
1965 {
1966 char* child = mem_strdup(child_dir);
1967 char* expected = expected_dir == NULL ? NULL
1968 : mem_strdup(expected_dir);
1969
1970 /* os_file_get_parent_dir() assumes that separators are
1971 converted to OS_PATH_SEPARATOR. */
1972 os_normalize_path(child);
1973 os_normalize_path(expected);
1974
1975 char* parent = os_file_get_parent_dir(child);
1976
1977 bool unexpected = (expected == NULL
1978 ? (parent != NULL)
1979 : (0 != strcmp(parent, expected)));
1980 if (unexpected) {
1981 ib::fatal() << "os_file_get_parent_dir('" << child
1982 << "') returned '" << parent
1983 << "', instead of '" << expected << "'.";
1984 }
1985 ut_free(parent);
1986 ut_free(child);
1987 ut_free(expected);
1988 }
1989
1990 /* Test the function os_file_get_parent_dir. */
1991 void
unit_test_os_file_get_parent_dir()1992 unit_test_os_file_get_parent_dir()
1993 {
1994 test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1995 test_os_file_get_parent_dir("/usr/", NULL);
1996 test_os_file_get_parent_dir("//usr//", NULL);
1997 test_os_file_get_parent_dir("usr", NULL);
1998 test_os_file_get_parent_dir("usr//", NULL);
1999 test_os_file_get_parent_dir("/", NULL);
2000 test_os_file_get_parent_dir("//", NULL);
2001 test_os_file_get_parent_dir(".", NULL);
2002 test_os_file_get_parent_dir("..", NULL);
2003 # ifdef _WIN32
2004 test_os_file_get_parent_dir("D:", NULL);
2005 test_os_file_get_parent_dir("D:/", NULL);
2006 test_os_file_get_parent_dir("D:\\", NULL);
2007 test_os_file_get_parent_dir("D:/data", NULL);
2008 test_os_file_get_parent_dir("D:/data/", NULL);
2009 test_os_file_get_parent_dir("D:\\data\\", NULL);
2010 test_os_file_get_parent_dir("D:///data/////", NULL);
2011 test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
2012 test_os_file_get_parent_dir("D:/data//a", "D:/data");
2013 test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
2014 test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
2015 test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
2016 #endif /* _WIN32 */
2017 }
2018 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
2019
2020
2021 /** Creates all missing subdirectories along the given path.
2022 @param[in] path Path name
2023 @return DB_SUCCESS if OK, otherwise error code. */
2024 dberr_t
os_file_create_subdirs_if_needed(const char * path)2025 os_file_create_subdirs_if_needed(
2026 const char* path)
2027 {
2028 if (srv_read_only_mode) {
2029
2030 ib::error()
2031 << "read only mode set. Can't create "
2032 << "subdirectories '" << path << "'";
2033
2034 return(DB_READ_ONLY);
2035
2036 }
2037
2038 char* subdir = os_file_get_parent_dir(path);
2039
2040 if (subdir == NULL) {
2041 /* subdir is root or cwd, nothing to do */
2042 return(DB_SUCCESS);
2043 }
2044
2045 /* Test if subdir exists */
2046 os_file_type_t type;
2047 bool subdir_exists;
2048 bool success = os_file_status(subdir, &subdir_exists, &type);
2049
2050 if (success && !subdir_exists) {
2051
2052 /* Subdir does not exist, create it */
2053 dberr_t err = os_file_create_subdirs_if_needed(subdir);
2054
2055 if (err != DB_SUCCESS) {
2056
2057 ut_free(subdir);
2058
2059 return(err);
2060 }
2061
2062 success = os_file_create_directory(subdir, false);
2063 }
2064
2065 ut_free(subdir);
2066
2067 return(success ? DB_SUCCESS : DB_ERROR);
2068 }
2069
2070 /** Allocate the buffer for IO on a transparently compressed table.
2071 @param[in] type IO flags
2072 @param[out] buf buffer to read or write
2073 @param[in,out] n number of bytes to read/write, starting from
2074 offset
2075 @return pointer to allocated page, compressed data is written to the offset
2076 that is aligned on the disk sector size */
2077 static
2078 Block*
os_file_compress_page(IORequest & type,void * & buf,ulint * n)2079 os_file_compress_page(
2080 IORequest& type,
2081 void*& buf,
2082 ulint* n)
2083 {
2084 ut_ad(!type.is_log());
2085 ut_ad(type.is_write());
2086 ut_ad(type.is_compressed());
2087
2088 ulint n_alloc = *n * 2;
2089
2090 ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
2091 ut_a(type.compression_algorithm().m_type != Compression::LZ4
2092 || static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
2093
2094 Block* block = os_alloc_block();
2095
2096 ulint old_compressed_len;
2097 ulint compressed_len = *n;
2098
2099 old_compressed_len = mach_read_from_2(
2100 reinterpret_cast<byte*>(buf)
2101 + FIL_PAGE_COMPRESS_SIZE_V1);
2102
2103 if (old_compressed_len > 0) {
2104 old_compressed_len = ut_calc_align(
2105 old_compressed_len + FIL_PAGE_DATA,
2106 type.block_size());
2107 } else {
2108 old_compressed_len = *n;
2109 }
2110
2111 byte* compressed_page;
2112
2113 compressed_page = static_cast<byte*>(
2114 ut_align(block->m_ptr, os_io_ptr_align));
2115
2116 byte* buf_ptr;
2117
2118 buf_ptr = os_file_compress_page(
2119 type.compression_algorithm(),
2120 type.block_size(),
2121 reinterpret_cast<byte*>(buf),
2122 *n,
2123 compressed_page,
2124 &compressed_len);
2125
2126 if (buf_ptr != buf) {
2127 /* Set new compressed size to uncompressed page. */
2128 memcpy(reinterpret_cast<byte*>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
2129 buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
2130
2131 buf = buf_ptr;
2132 *n = compressed_len;
2133
2134 if (compressed_len >= old_compressed_len) {
2135
2136 ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
2137
2138 type.clear_punch_hole();
2139 }
2140 }
2141
2142 return(block);
2143 }
2144
2145 /** Encrypt a page content when write it to disk.
2146 @param[in] type IO flags
2147 @param[out] buf buffer to read or write
2148 @param[in,out] n number of bytes to read/write, starting from
2149 offset
2150 @return pointer to the encrypted page */
2151 static
2152 Block*
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)2153 os_file_encrypt_page(
2154 const IORequest& type,
2155 void*& buf,
2156 ulint* n)
2157 {
2158
2159 byte* encrypted_page;
2160 ulint encrypted_len = *n;
2161 byte* buf_ptr;
2162 Encryption encryption(type.encryption_algorithm());
2163
2164 ut_ad(!type.is_log());
2165 ut_ad(type.is_write());
2166 ut_ad(type.is_encrypted());
2167
2168 Block* block = os_alloc_block();
2169
2170 encrypted_page = static_cast<byte*>(
2171 ut_align(block->m_ptr, os_io_ptr_align));
2172
2173 buf_ptr = encryption.encrypt(type,
2174 reinterpret_cast<byte*>(buf), *n,
2175 encrypted_page, &encrypted_len);
2176
2177 bool encrypted = buf_ptr != buf;
2178
2179 if (encrypted) {
2180
2181 buf = buf_ptr;
2182 *n = encrypted_len;
2183 }
2184
2185 return(block);
2186 }
2187
2188 #ifndef _WIN32
2189
2190 /** Do the read/write
2191 @param[in] request The IO context and type
2192 @return the number of bytes read/written or negative value on error */
2193 ssize_t
execute(const IORequest & request)2194 SyncFileIO::execute(const IORequest& request)
2195 {
2196 ssize_t n_bytes;
2197
2198 if (request.is_read()) {
2199 n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2200 } else {
2201 ut_ad(request.is_write());
2202 n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2203 }
2204
2205 return(n_bytes);
2206 }
2207
2208 /** Free storage space associated with a section of the file.
2209 @param[in] fh Open file handle
2210 @param[in] off Starting offset (SEEK_SET)
2211 @param[in] len Size of the hole
2212 @return DB_SUCCESS or error code */
2213 static
2214 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2215 os_file_punch_hole_posix(
2216 os_file_t fh,
2217 os_offset_t off,
2218 os_offset_t len)
2219 {
2220 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2221 const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2222
2223 int ret = fallocate(fh, mode, off, len);
2224
2225 if (ret == 0) {
2226 return(DB_SUCCESS);
2227 }
2228
2229 ut_a(ret == -1);
2230
2231 if (errno == ENOTSUP) {
2232 return(DB_IO_NO_PUNCH_HOLE);
2233 }
2234
2235 ib::warn()
2236 << "fallocate(" << fh
2237 <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2238 << off << ", " << len << ") returned errno: "
2239 << errno;
2240
2241 return(DB_IO_ERROR);
2242
2243 #elif defined(UNIV_SOLARIS)
2244
2245 // Use F_FREESP
2246
2247 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2248
2249 return(DB_IO_NO_PUNCH_HOLE);
2250 }
2251
2252 #if defined(LINUX_NATIVE_AIO)
2253
/** Linux native AIO handler. An instance is bound to one global segment;
the constructor resolves it to the owning AIO array and the local segment
inside that array, and poll() then reaps completed requests from it. */
class LinuxAIOHandler {
public:
	/** Constructor. Looks up the AIO array and local segment that
	correspond to the global segment and caches the number of slots per
	segment.
	@param[in] global_segment	The global segment; must not be
					ULINT_UNDEFINED */
	LinuxAIOHandler(ulint global_segment)
		:
		m_global_segment(global_segment)
	{
		/* Should never be doing Sync IO here. */
		ut_a(m_global_segment != ULINT_UNDEFINED);

		/* Find the array and the local segment. */

		m_segment = AIO::get_array_and_local_segment(
			&m_array, m_global_segment);

		m_n_slots = m_array->slots_per_segment();
	}

	/** Destructor. Nothing to release: the handler only holds a
	pointer to the array and plain integers. */
	~LinuxAIOHandler()
	{
		// No op
	}

	/**
	Process a Linux AIO request
	@param[out]	m1		the messages passed with the
	@param[out]	m2		AIO request; note that in case the
					AIO operation failed, these output
					parameters are valid and can be used to
					restart the operation.
	@param[out]	request		IO context
	@return DB_SUCCESS or error code */
	dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);

private:
	/** Resubmit an IO request that was only partially successful
	@param[in,out]	slot		Request to resubmit
	@return DB_SUCCESS or DB_IO_PARTIAL_FAILED if the IO resubmit
	request failed */
	dberr_t	resubmit(Slot* slot);

	/** Check if the AIO succeeded
	@param[in,out]	slot		The slot to check
	@return DB_SUCCESS, DB_FAIL if the operation should be retried or
	DB_IO_ERROR on all other errors */
	dberr_t	check_state(Slot* slot);

	/** @return true if a shutdown was detected, i.e. the server has
	reached SRV_SHUTDOWN_EXIT_THREADS and the page cleaner is no longer
	active */
	bool is_shutdown() const
	{
		return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
		       && !buf_page_cleaner_is_active);
	}

	/** Look for a reserved slot of this segment whose IO has completed.
	When a slot is returned the array is still acquired by the caller;
	if no slot was found the array is released before returning.
	@param[out]	n_pending	The number of pending IOs
	@return NULL or a slot that has completed IO */
	Slot* find_completed_slot(ulint* n_pending);

	/** This is called from within the IO-thread. If there are no completed
	IO requests in the slot array, the thread calls this function to
	collect more requests from the Linux kernel.
	The IO-thread waits on io_getevents(), which is a blocking call, with
	a timeout value. Unless the system is very heavy loaded, keeping the
	IO-thread very busy, the io-thread will spend most of its time waiting
	in this function.
	The IO-thread also exits in this function. It checks server status at
	each wakeup and that is why we use timed wait in io_getevents(). */
	void collect();

private:
	/** Slot array */
	AIO*			m_array;

	/** Number of slots in the local segment */
	ulint			m_n_slots;

	/** The local segment to check */
	ulint			m_segment;

	/** The global segment */
	ulint			m_global_segment;
};
2339
2340 /** Resubmit an IO request that was only partially successful
2341 @param[in,out] slot Request to resubmit
2342 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2343 dberr_t
resubmit(Slot * slot)2344 LinuxAIOHandler::resubmit(Slot* slot)
2345 {
2346 #ifdef UNIV_DEBUG
2347 /* Bytes already read/written out */
2348 ulint n_bytes = slot->ptr - slot->buf;
2349
2350 ut_ad(m_array->is_mutex_owned());
2351
2352 ut_ad(n_bytes < slot->original_len);
2353 ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2354 /* Partial read or write scenario */
2355 ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2356 #endif /* UNIV_DEBUG */
2357
2358 slot->len -= slot->n_bytes;
2359 slot->ptr += slot->n_bytes;
2360 slot->offset += slot->n_bytes;
2361
2362 /* Resetting the bytes read/written */
2363 slot->n_bytes = 0;
2364 slot->io_already_done = false;
2365
2366 /* make sure that slot->offset fits in off_t */
2367 ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2368
2369 struct iocb* iocb = &slot->control;
2370 if (slot->type.is_read()) {
2371 io_prep_pread(
2372 iocb,
2373 slot->file.m_file,
2374 slot->ptr,
2375 slot->len,
2376 slot->offset);
2377
2378 } else {
2379
2380 ut_a(slot->type.is_write());
2381
2382 io_prep_pwrite(
2383 iocb,
2384 slot->file.m_file,
2385 slot->ptr,
2386 slot->len,
2387 slot->offset);
2388 }
2389
2390 iocb->data = slot;
2391
2392 /* Resubmit an I/O request */
2393 int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2394
2395 if (ret < -1) {
2396 errno = -ret;
2397 }
2398
2399 return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2400 }
2401
2402 /** Check if the AIO succeeded
2403 @param[in,out] slot The slot to check
2404 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2405 DB_IO_ERROR on all other errors */
2406 dberr_t
check_state(Slot * slot)2407 LinuxAIOHandler::check_state(Slot* slot)
2408 {
2409 ut_ad(m_array->is_mutex_owned());
2410
2411 /* Note that it may be that there is more then one completed
2412 IO requests. We process them one at a time. We may have a case
2413 here to improve the performance slightly by dealing with all
2414 requests in one sweep. */
2415
2416 srv_set_io_thread_op_info(
2417 m_global_segment, "processing completed aio requests");
2418
2419 ut_ad(slot->io_already_done);
2420
2421 dberr_t err;
2422
2423 if (slot->ret == 0) {
2424
2425 err = AIOHandler::post_io_processing(slot);
2426
2427 } else {
2428 errno = -slot->ret;
2429
2430 /* os_file_handle_error does tell us if we should retry
2431 this IO. As it stands now, we don't do this retry when
2432 reaping requests from a different context than
2433 the dispatcher. This non-retry logic is the same for
2434 Windows and Linux native AIO.
2435 We should probably look into this to transparently
2436 re-submit the IO. */
2437 os_file_handle_error(slot->name, "Linux aio");
2438
2439 err = DB_IO_ERROR;
2440 }
2441
2442 return(err);
2443 }
2444
2445 /** If no slot was found then the m_array->m_mutex will be released.
2446 @param[out] n_pending The number of pending IOs
2447 @return NULL or a slot that has completed IO */
2448 Slot*
find_completed_slot(ulint * n_pending)2449 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
2450 {
2451 ulint offset = m_n_slots * m_segment;
2452
2453 *n_pending = 0;
2454
2455 m_array->acquire();
2456
2457 Slot* slot = m_array->at(offset);
2458
2459 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2460
2461 if (slot->is_reserved) {
2462
2463 ++*n_pending;
2464
2465 if (slot->io_already_done) {
2466
2467 /* Something for us to work on.
2468 Note: We don't release the mutex. */
2469 return(slot);
2470 }
2471 }
2472 }
2473
2474 m_array->release();
2475
2476 return(NULL);
2477 }
2478
/** This function is only used in Linux native asynchronous i/o. This is
called from within the io-thread. If there are no completed IO requests
in the slot array, the thread calls this function to collect more
requests from the kernel.
The io-thread waits on io_getevents(), which is a blocking call, with
a timeout value. Unless the system is very heavy loaded, keeping the
io-thread very busy, the io-thread will spend most of its time waiting
in this function.
The io-thread also exits in this function. It checks server status at
each wakeup and that is why we use timed wait in io_getevents().
The function returns once at least one event was reaped, or when the
shutdown / page cleaner condition tested after each wakeup holds. */
void
LinuxAIOHandler::collect()
{
	ut_ad(m_n_slots > 0);
	ut_ad(m_array != NULL);
	ut_ad(m_segment < m_array->get_n_segments());

	/* Which io_context we are going to use. */
	io_context*	io_ctx = m_array->io_ctx(m_segment);

	/* Starting point of the m_segment we will be working on. */
	ulint	start_pos = m_segment * m_n_slots;

	/* End point. */
	ulint	end_pos = start_pos + m_n_slots;

	for (;;) {
		struct io_event*	events;

		/* Which part of event array we are going to work on. */
		events = m_array->io_events(m_segment * m_n_slots);

		/* Initialize the events. */
		memset(events, 0, sizeof(*events) * m_n_slots);

		/* The timeout value is arbitrary. We probably need
		to experiment with it a little. */
		struct timespec		timeout;

		timeout.tv_sec = 0;
		timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;

		int	ret;

		/* Wait for at least one and at most m_n_slots events.
		A positive ret is the number of events filled in; zero or
		negative values are handled after the loop below. */
		ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);

		for (int i = 0; i < ret; ++i) {

			struct iocb*	iocb;

			iocb = reinterpret_cast<struct iocb*>(events[i].obj);
			ut_a(iocb != NULL);

			/* The slot pointer was stored in iocb->data when
			the request was prepared. */
			Slot*	slot = reinterpret_cast<Slot*>(iocb->data);

			/* Some sanity checks. */
			ut_a(slot != NULL);
			ut_a(slot->is_reserved);

			/* We are not scribbling previous segment. */
			ut_a(slot->pos >= start_pos);

			/* We have not overstepped to next segment. */
			ut_a(slot->pos < end_pos);

			/* We never compress/decompress the first page */

			/* AIOHandler::io_complete() is invoked only for
			compressed, non-log writes that requested a punch
			hole; all other requests are recorded as
			DB_SUCCESS here. Note that this happens before the
			array is acquired. */
			if (slot->offset > 0
			    && !slot->skip_punch_hole
			    && slot->type.is_compression_enabled()
			    && !slot->type.is_log()
			    && slot->type.is_write()
			    && slot->type.is_compressed()
			    && slot->type.punch_hole()) {

				slot->err = AIOHandler::io_complete(slot);
			} else {
				slot->err = DB_SUCCESS;
			}

			/* Mark this request as completed. The error handling
			will be done in the calling function. */
			m_array->acquire();

			/* events[i].res2 should always be ZERO */
			ut_ad(events[i].res2 == 0);
			slot->io_already_done = true;

			/*Even though events[i].res is an unsigned number
			in libaio, it is used to return a negative value
			(negated errno value) to indicate error and a positive
			value to indicate number of bytes read or written. */

			/* A negated errno stored in the unsigned field
			compares greater than any valid request length,
			which is how failures are told apart below. */
			if (events[i].res > slot->len) {
				/* failure */
				slot->n_bytes = 0;
				slot->ret = events[i].res;
			} else {
				/* success */
				slot->n_bytes = events[i].res;
				slot->ret = 0;
			}
			m_array->release();
		}

		/* Stop waiting when threads are told to exit, when the
		page cleaner is no longer active, or when something was
		reaped. */
		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
		    || !buf_page_cleaner_is_active
		    || ret > 0) {

			break;
		}

		/* This error handling is for any error in collecting the
		IO requests. The errors, if any, for any particular IO
		request are simply passed on to the calling routine. */

		/* The three cases below intentionally fall through to the
		shared "continue". */
		switch (ret) {
		case -EAGAIN:
			/* Not enough resources! Try again. */

		case -EINTR:
			/* Interrupted! The behaviour in case of an interrupt.
			If we have some completed IOs available then the
			return code will be the number of IOs. We get EINTR
			only if there are no completed IOs and we have been
			interrupted. */

		case 0:
			/* No pending request! Go back and check again. */

			continue;
		}

		/* All other errors should cause a trap for now. */
		ib::fatal()
			<< "Unexpected ret_code[" << ret
			<< "] from io_getevents()!";

		break;
	}
}
2620
/** Process a Linux AIO request. Loops until a completed request of this
segment has been found and evaluated, resubmitting requests that need a
retry and calling collect() while nothing has completed; on shutdown
with nothing pending it returns with NULL messages instead.
@param[out]	m1		the messages passed with the
@param[out]	m2		AIO request; note that in case the
				AIO operation failed, these output
				parameters are valid and can be used to
				restart the operation.
@param[out]	request		IO context
@return DB_SUCCESS or error code */
dberr_t
LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
{
	dberr_t		err;
	Slot*		slot;

	/* Loop until we have found a completed request. */
	for (;;) {

		ulint	n_pending;

		/* On a non-NULL return the array is still acquired. */
		slot = find_completed_slot(&n_pending);

		if (slot != NULL) {

			ut_ad(m_array->is_mutex_owned());

			err = check_state(slot);

			/* DB_FAIL is not a hard error, we should retry */
			if (err != DB_FAIL) {
				break;
			}

			/* Partial IO, resubmit request for
			remaining bytes to read/write */
			err = resubmit(slot);

			if (err != DB_SUCCESS) {
				break;
			}

			/* The request is in flight again: release the
			array and go back to looking for completions. */
			m_array->release();

		} else if (is_shutdown() && n_pending == 0) {

			/* There is no completed request. If there is
			no pending request at all, and the system is
			being shut down, exit. */

			*m1 = NULL;
			*m2 = NULL;

			return(DB_SUCCESS);

		} else {

			/* Wait for some request. Note that we return
			from wait if we have found a request. */

			srv_set_io_thread_op_info(
				m_global_segment,
				"waiting for completed aio requests");

			collect();
		}
	}

	/* Every break above leaves the loop with the array acquired and
	slot pointing at the request that was evaluated. */

	if (err == DB_IO_PARTIAL_FAILED) {
		/* Aborting in case of submit failure */
		ib::fatal()
			<< "Native Linux AIO interface. "
			"io_submit() call failed when "
			"resubmitting a partial I/O "
			"request on the file " << slot->name
			<< ".";
	}

	/* Copy the results out before the slot is given back. */
	*m1 = slot->m1;
	*m2 = slot->m2;

	*request = slot->type;

	m_array->release(slot);

	m_array->release();

	return(err);
}
2708
2709 /** This function is only used in Linux native asynchronous i/o.
2710 Waits for an aio operation to complete. This function is used to wait for
2711 the completed requests. The aio array of pending requests is divided
2712 into segments. The thread specifies which segment or slot it wants to wait
2713 for. NOTE: this function will also take care of freeing the aio slot,
2714 therefore no other thread is allowed to do the freeing!
2715
2716 @param[in] global_seg segment number in the aio array
2717 to wait for; segment 0 is the ibuf
2718 i/o thread, segment 1 is log i/o thread,
2719 then follow the non-ibuf read threads,
2720 and the last are the non-ibuf write
2721 threads.
2722 @param[out] m1 the messages passed with the
2723 @param[out] m2 AIO request; note that in case the
2724 AIO operation failed, these output
2725 parameters are valid and can be used to
2726 restart the operation.
2727 @param[out]xi request IO context
2728 @return DB_SUCCESS if the IO was successful */
2729 static
2730 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2731 os_aio_linux_handler(
2732 ulint global_segment,
2733 fil_node_t** m1,
2734 void** m2,
2735 IORequest* request)
2736 {
2737 LinuxAIOHandler handler(global_segment);
2738
2739 dberr_t err = handler.poll(m1, m2, request);
2740
2741 if (err == DB_IO_NO_PUNCH_HOLE) {
2742 fil_no_punch_hole(*m1);
2743 err = DB_SUCCESS;
2744 }
2745
2746 return(err);
2747 }
2748
2749 /** Dispatch an AIO request to the kernel.
2750 @param[in,out] slot an already reserved slot
2751 @return true on success. */
2752 bool
linux_dispatch(Slot * slot)2753 AIO::linux_dispatch(Slot* slot)
2754 {
2755 ut_a(slot->is_reserved);
2756 ut_ad(slot->type.validate());
2757
2758 /* Find out what we are going to work with.
2759 The iocb struct is directly in the slot.
2760 The io_context is one per segment. */
2761
2762 ulint io_ctx_index;
2763 struct iocb* iocb = &slot->control;
2764
2765 io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2766
2767 int ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
2768
2769 /* io_submit() returns number of successfully queued requests
2770 or -errno. */
2771
2772 if (ret != 1) {
2773 errno = -ret;
2774 }
2775
2776 return(ret == 1);
2777 }
2778
2779 /** Creates an io_context for native linux AIO.
2780 @param[in] max_events number of events
2781 @param[out] io_ctx io_ctx to initialize.
2782 @return true on success. */
2783 bool
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)2784 AIO::linux_create_io_ctx(
2785 ulint max_events,
2786 io_context_t* io_ctx)
2787 {
2788 ssize_t n_retries = 0;
2789
2790 for (;;) {
2791
2792 memset(io_ctx, 0x0, sizeof(*io_ctx));
2793
2794 /* Initialize the io_ctx. Tell it how many pending
2795 IO requests this context will handle. */
2796
2797 int ret = io_setup(max_events, io_ctx);
2798
2799 if (ret == 0) {
2800 /* Success. Return now. */
2801 return(true);
2802 }
2803
2804 /* If we hit EAGAIN we'll make a few attempts before failing. */
2805
2806 switch (ret) {
2807 case -EAGAIN:
2808 if (n_retries == 0) {
2809 /* First time around. */
2810 ib::warn()
2811 << "io_setup() failed with EAGAIN."
2812 " Will make "
2813 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2814 << " attempts before giving up.";
2815 }
2816
2817 if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2818
2819 ++n_retries;
2820
2821 ib::warn()
2822 << "io_setup() attempt "
2823 << n_retries << ".";
2824
2825 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2826
2827 continue;
2828 }
2829
2830 /* Have tried enough. Better call it a day. */
2831 ib::error()
2832 << "io_setup() failed with EAGAIN after "
2833 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2834 << " attempts.";
2835 break;
2836
2837 case -ENOSYS:
2838 ib::error()
2839 << "Linux Native AIO interface"
2840 " is not supported on this platform. Please"
2841 " check your OS documentation and install"
2842 " appropriate binary of InnoDB.";
2843
2844 break;
2845
2846 default:
2847 ib::error()
2848 << "Linux Native AIO setup"
2849 << " returned following error["
2850 << ret << "]";
2851 break;
2852 }
2853
2854 ib::info()
2855 << "You can disable Linux Native AIO by"
2856 " setting innodb_use_native_aio = 0 in my.cnf";
2857
2858 break;
2859 }
2860
2861 return(false);
2862 }
2863
2864 /** Checks if the system supports native linux aio. On some kernel
2865 versions where native aio is supported it won't work on tmpfs. In such
2866 cases we can't use native aio as it is not possible to mix simulated
2867 and native aio.
2868 @return: true if supported, false otherwise. */
2869 bool
is_linux_native_aio_supported()2870 AIO::is_linux_native_aio_supported()
2871 {
2872 int fd;
2873 io_context_t io_ctx;
2874 char name[1000];
2875
2876 if (!linux_create_io_ctx(1, &io_ctx)) {
2877
2878 /* The platform does not support native aio. */
2879
2880 return(false);
2881
2882 } else if (!srv_read_only_mode) {
2883
2884 /* Now check if tmpdir supports native aio ops. */
2885 fd = innobase_mysql_tmpfile(NULL);
2886
2887 if (fd < 0) {
2888 ib::warn()
2889 << "Unable to create temp file to check"
2890 " native AIO support.";
2891
2892 return(false);
2893 }
2894 } else {
2895
2896 os_normalize_path(srv_log_group_home_dir);
2897
2898 ulint dirnamelen = strlen(srv_log_group_home_dir);
2899
2900 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2901
2902 memcpy(name, srv_log_group_home_dir, dirnamelen);
2903
2904 /* Add a path separator if needed. */
2905 if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2906
2907 name[dirnamelen++] = OS_PATH_SEPARATOR;
2908 }
2909
2910 strcpy(name + dirnamelen, "ib_logfile0");
2911
2912 fd = ::open(name, O_RDONLY);
2913
2914 if (fd == -1) {
2915
2916 ib::warn()
2917 << "Unable to open"
2918 << " \"" << name << "\" to check native"
2919 << " AIO read support.";
2920
2921 return(false);
2922 }
2923 }
2924
2925 struct io_event io_event;
2926
2927 memset(&io_event, 0x0, sizeof(io_event));
2928
2929 byte* buf = static_cast<byte*>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
2930 byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
2931
2932 struct iocb iocb;
2933
2934 /* Suppress valgrind warning. */
2935 memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
2936 memset(&iocb, 0x0, sizeof(iocb));
2937
2938 struct iocb* p_iocb = &iocb;
2939
2940 if (!srv_read_only_mode) {
2941
2942 io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
2943
2944 } else {
2945 ut_a(UNIV_PAGE_SIZE >= 512);
2946 io_prep_pread(p_iocb, fd, ptr, 512, 0);
2947 }
2948
2949 int err = io_submit(io_ctx, 1, &p_iocb);
2950
2951 if (err >= 1) {
2952 /* Now collect the submitted IO request. */
2953 err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2954 }
2955
2956 ut_free(buf);
2957 close(fd);
2958
2959 switch (err) {
2960 case 1:
2961 return(true);
2962
2963 case -EINVAL:
2964 case -ENOSYS:
2965 ib::error()
2966 << "Linux Native AIO not supported. You can either"
2967 " move "
2968 << (srv_read_only_mode ? name : "tmpdir")
2969 << " to a file system that supports native"
2970 " AIO or you can set innodb_use_native_aio to"
2971 " FALSE to avoid this message.";
2972
2973 /* fall through. */
2974 default:
2975 ib::error()
2976 << "Linux Native AIO check on "
2977 << (srv_read_only_mode ? name : "tmpdir")
2978 << "returned error[" << -err << "]";
2979 }
2980
2981 return(false);
2982 }
2983
2984 #endif /* LINUX_NATIVE_AIO */
2985
2986 /** Retrieves the last error number if an error occurs in a file io function.
2987 The number should be retrieved before any other OS calls (because they may
2988 overwrite the error number). If the number is not known to this program,
2989 the OS error number + 100 is returned.
2990 @param[in] report_all_errors true if we want an error message
2991 printed of all errors
2992 @param[in] on_error_silent true then don't print any diagnostic
2993 to the log
2994 @return error number, or OS error number + 100 */
2995 static
2996 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2997 os_file_get_last_error_low(
2998 bool report_all_errors,
2999 bool on_error_silent)
3000 {
3001 int err = errno;
3002
3003 if (err == 0) {
3004 return(0);
3005 }
3006
3007 if (report_all_errors
3008 || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
3009
3010 ib::error()
3011 << "Operating system error number "
3012 << err
3013 << " in a file operation.";
3014
3015 if (err == ENOENT) {
3016
3017 ib::error()
3018 << "The error means the system"
3019 " cannot find the path specified.";
3020
3021 if (srv_is_being_started) {
3022
3023 ib::error()
3024 << "If you are installing InnoDB,"
3025 " remember that you must create"
3026 " directories yourself, InnoDB"
3027 " does not create them.";
3028 }
3029 } else if (err == EACCES) {
3030
3031 ib::error()
3032 << "The error means mysqld does not have"
3033 " the access rights to the directory.";
3034
3035 } else {
3036 if (strerror(err) != NULL) {
3037
3038 ib::error()
3039 << "Error number " << err << " means '"
3040 << strerror(err) << "'";
3041 }
3042
3043 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3044 }
3045 }
3046
3047 switch (err) {
3048 case ENOSPC:
3049 return(OS_FILE_DISK_FULL);
3050 case ENOENT:
3051 return(OS_FILE_NOT_FOUND);
3052 case EEXIST:
3053 return(OS_FILE_ALREADY_EXISTS);
3054 case EXDEV:
3055 case ENOTDIR:
3056 case EISDIR:
3057 return(OS_FILE_PATH_ERROR);
3058 case EAGAIN:
3059 if (srv_use_native_aio) {
3060 return(OS_FILE_AIO_RESOURCES_RESERVED);
3061 }
3062 break;
3063 case EINTR:
3064 if (srv_use_native_aio) {
3065 return(OS_FILE_AIO_INTERRUPTED);
3066 }
3067 break;
3068 case EACCES:
3069 return(OS_FILE_ACCESS_VIOLATION);
3070 }
3071 return(OS_FILE_ERROR_MAX + err);
3072 }
3073
3074 /** Wrapper to fsync(2) that retries the call on some errors.
3075 Returns the value 0 if successful; otherwise the value -1 is returned and
3076 the global variable errno is set to indicate the error.
3077 @param[in] file open file handle
3078 @return 0 if success, -1 otherwise */
3079 static
3080 int
os_file_fsync_posix(os_file_t file)3081 os_file_fsync_posix(
3082 os_file_t file)
3083 {
3084 ulint failures = 0;
3085
3086 for (;;) {
3087
3088 ++os_n_fsyncs;
3089
3090 int ret = fsync(file);
3091
3092 if (ret == 0) {
3093 return(ret);
3094 }
3095
3096 switch(errno) {
3097 case ENOLCK:
3098
3099 ++failures;
3100 ut_a(failures < 1000);
3101
3102 if (!(failures % 100)) {
3103
3104 ib::warn()
3105 << "fsync(): "
3106 << "No locks available; retrying";
3107 }
3108
3109 /* 0.2 sec */
3110 os_thread_sleep(200000);
3111 break;
3112
3113 case EIO:
3114
3115 ib::fatal()
3116 << "fsync() returned EIO, aborting.";
3117 break;
3118
3119 case EINTR:
3120
3121 ++failures;
3122 ut_a(failures < 2000);
3123 break;
3124
3125 default:
3126 ut_error;
3127 break;
3128 }
3129 }
3130
3131 ut_error;
3132
3133 return(-1);
3134 }
3135
3136 /** Check the existence and type of the given file.
3137 @param[in] path path name of file
3138 @param[out] exists true if the file exists
3139 @param[out] type Type of the file, if it exists
3140 @return true if call succeeded */
3141 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)3142 os_file_status_posix(
3143 const char* path,
3144 bool* exists,
3145 os_file_type_t* type)
3146 {
3147 struct stat statinfo;
3148
3149 int ret = stat(path, &statinfo);
3150
3151 *exists = !ret;
3152
3153 if (!ret) {
3154 /* file exists, everything OK */
3155
3156 } else if (errno == ENOENT || errno == ENOTDIR
3157 || errno == ENAMETOOLONG) {
3158 /* file does not exist */
3159 return(true);
3160
3161 } else {
3162 /* file exists, but stat call failed */
3163 os_file_handle_error_no_exit(path, "stat", false);
3164 return(false);
3165 }
3166
3167 if (S_ISDIR(statinfo.st_mode)) {
3168 *type = OS_FILE_TYPE_DIR;
3169
3170 } else if (S_ISLNK(statinfo.st_mode)) {
3171 *type = OS_FILE_TYPE_LINK;
3172
3173 } else if (S_ISREG(statinfo.st_mode)) {
3174 *type = OS_FILE_TYPE_FILE;
3175
3176 } else {
3177 *type = OS_FILE_TYPE_UNKNOWN;
3178 }
3179
3180 return(true);
3181 }
3182
3183 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3184 function!
3185 Flushes the write buffers of a given file to the disk.
3186 @param[in] file handle to a file
3187 @return true if success */
3188 bool
os_file_flush_func(os_file_t file)3189 os_file_flush_func(
3190 os_file_t file)
3191 {
3192 int ret;
3193
3194 ret = os_file_fsync_posix(file);
3195
3196 if (ret == 0) {
3197 return(true);
3198 }
3199
3200 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
3201 we choose to ignore that error if we are using raw disks */
3202
3203 if (srv_start_raw_disk_in_use && errno == EINVAL) {
3204
3205 return(true);
3206 }
3207
3208 ib::error() << "The OS said file flush did not succeed";
3209
3210 os_file_handle_error(NULL, "flush");
3211
3212 /* It is a fatal error if a file flush does not succeed, because then
3213 the database can get corrupt on disk */
3214 ut_error;
3215
3216 return(false);
3217 }
3218
3219 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3220 this function!
3221 A simple function to open or create a file.
3222 @param[in] name name of the file or path as a null-terminated
3223 string
3224 @param[in] create_mode create mode
3225 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3226 @param[in] read_only if true, read only checks are enforced
3227 @param[out] success true if succeed, false if error
3228 @return handle to the file, not defined if error, error number
3229 can be retrieved with os_file_get_last_error */
3230 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3231 os_file_create_simple_func(
3232 const char* name,
3233 ulint create_mode,
3234 ulint access_type,
3235 bool read_only,
3236 bool* success)
3237 {
3238 pfs_os_file_t file;
3239
3240 *success = false;
3241
3242 int create_flag;
3243
3244 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3245 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3246
3247 if (create_mode == OS_FILE_OPEN) {
3248
3249 if (access_type == OS_FILE_READ_ONLY) {
3250
3251 create_flag = O_RDONLY;
3252
3253 } else if (read_only) {
3254
3255 create_flag = O_RDONLY;
3256
3257 } else {
3258 create_flag = O_RDWR;
3259 }
3260
3261 } else if (read_only) {
3262
3263 create_flag = O_RDONLY;
3264
3265 } else if (create_mode == OS_FILE_CREATE) {
3266
3267 create_flag = O_RDWR | O_CREAT | O_EXCL;
3268
3269 } else if (create_mode == OS_FILE_CREATE_PATH) {
3270
3271 /* Create subdirs along the path if needed. */
3272
3273 *success = os_file_create_subdirs_if_needed(name);
3274
3275 if (!*success) {
3276
3277 ib::error()
3278 << "Unable to create subdirectories '"
3279 << name << "'";
3280
3281 file.m_file = OS_FILE_CLOSED;
3282 return(file);
3283 }
3284
3285 create_flag = O_RDWR | O_CREAT | O_EXCL;
3286 create_mode = OS_FILE_CREATE;
3287 } else {
3288
3289 ib::error()
3290 << "Unknown file create mode ("
3291 << create_mode
3292 << " for file '" << name << "'";
3293
3294 file.m_file = OS_FILE_CLOSED;
3295 return(file);
3296 }
3297
3298 bool retry;
3299
3300 do {
3301 file.m_file = ::open(name, create_flag, os_innodb_umask);
3302
3303 if (file.m_file == -1) {
3304 *success = false;
3305
3306 retry = os_file_handle_error(
3307 name,
3308 create_mode == OS_FILE_OPEN
3309 ? "open" : "create");
3310 } else {
3311 *success = true;
3312 retry = false;
3313 }
3314
3315 } while (retry);
3316
3317 #ifdef USE_FILE_LOCK
3318 if (!read_only
3319 && *success
3320 && access_type == OS_FILE_READ_WRITE
3321 && os_file_lock(file.m_file, name)) {
3322
3323 *success = false;
3324 close(file.m_file);
3325 file.m_file = -1;
3326 }
3327 #endif /* USE_FILE_LOCK */
3328
3329 return(file);
3330 }
3331
3332 /** This function attempts to create a directory named pathname. The new
3333 directory gets default permissions. On Unix the permissions are
3334 (0770 & ~umask). If the directory exists already, nothing is done and
3335 the call succeeds, unless the fail_if_exists arguments is true.
3336 If another error occurs, such as a permission error, this does not crash,
3337 but reports the error and returns false.
3338 @param[in] pathname directory name as null-terminated string
3339 @param[in] fail_if_exists if true, pre-existing directory is treated as
3340 an error.
3341 @return true if call succeeds, false on error */
3342 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3343 os_file_create_directory(
3344 const char* pathname,
3345 bool fail_if_exists)
3346 {
3347 int rcode = mkdir(pathname, 0770);
3348
3349 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3350 /* failure */
3351 os_file_handle_error_no_exit(pathname, "mkdir", false);
3352
3353 return(false);
3354 }
3355
3356 return(true);
3357 }
3358
3359 /**
3360 The os_file_opendir() function opens a directory stream corresponding to the
3361 directory named by the dirname argument. The directory stream is positioned
3362 at the first entry. In both Unix and Windows we automatically skip the '.'
3363 and '..' items at the start of the directory listing.
3364 @param[in] dirname directory name; it must not contain a trailing
3365 '\' or '/'
3366 @param[in] is_fatal true if we should treat an error as a fatal
3367 error; if we try to open symlinks then we do
3368 not wish a fatal error if it happens not to be
3369 a directory
3370 @return directory stream, NULL if error */
3371 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3372 os_file_opendir(
3373 const char* dirname,
3374 bool error_is_fatal)
3375 {
3376 os_file_dir_t dir;
3377 dir = opendir(dirname);
3378
3379 if (dir == NULL && error_is_fatal) {
3380 os_file_handle_error(dirname, "opendir");
3381 }
3382
3383 return(dir);
3384 }
3385
3386 /** Closes a directory stream.
3387 @param[in] dir directory stream
3388 @return 0 if success, -1 if failure */
3389 int
os_file_closedir(os_file_dir_t dir)3390 os_file_closedir(
3391 os_file_dir_t dir)
3392 {
3393 int ret = closedir(dir);
3394
3395 if (ret != 0) {
3396 os_file_handle_error_no_exit(NULL, "closedir", false);
3397 }
3398
3399 return(ret);
3400 }
3401
3402 /** This function returns information of the next file in the directory. We jump
3403 over the '.' and '..' entries in the directory.
3404 @param[in] dirname directory name or path
3405 @param[in] dir directory stream
3406 @param[out] info buffer where the info is returned
3407 @return 0 if ok, -1 if error, 1 if at the end of the directory */
3408 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)3409 os_file_readdir_next_file(
3410 const char* dirname,
3411 os_file_dir_t dir,
3412 os_file_stat_t* info)
3413 {
3414 struct dirent* ent;
3415 char* full_path;
3416 int ret;
3417 struct stat statinfo;
3418
3419 #ifdef HAVE_READDIR_R
3420 char dirent_buf[sizeof(struct dirent)
3421 + _POSIX_PATH_MAX + 100];
3422 /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
3423 the max file name len; but in most standards, the
3424 length is NAME_MAX; we add 100 to be even safer */
3425 #endif /* HAVE_READDIR_R */
3426
3427 next_file:
3428
3429 #ifdef HAVE_READDIR_R
3430 ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
3431
3432 if (ret != 0) {
3433
3434 ib::error()
3435 << "Cannot read directory " << dirname
3436 << " error: " << ret;
3437
3438 return(-1);
3439 }
3440
3441 if (ent == NULL) {
3442 /* End of directory */
3443
3444 return(1);
3445 }
3446
3447 ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
3448 #else
3449 ent = readdir(dir);
3450
3451 if (ent == NULL) {
3452
3453 return(1);
3454 }
3455 #endif /* HAVE_READDIR_R */
3456 ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
3457
3458 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
3459
3460 goto next_file;
3461 }
3462
3463 strcpy(info->name, ent->d_name);
3464
3465 full_path = static_cast<char*>(
3466 ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
3467
3468 sprintf(full_path, "%s/%s", dirname, ent->d_name);
3469
3470 ret = stat(full_path, &statinfo);
3471
3472 if (ret) {
3473
3474 if (errno == ENOENT) {
3475 /* readdir() returned a file that does not exist,
3476 it must have been deleted in the meantime. Do what
3477 would have happened if the file was deleted before
3478 readdir() - ignore and go to the next entry.
3479 If this is the last entry then info->name will still
3480 contain the name of the deleted file when this
3481 function returns, but this is not an issue since the
3482 caller shouldn't be looking at info when end of
3483 directory is returned. */
3484
3485 ut_free(full_path);
3486
3487 goto next_file;
3488 }
3489
3490 os_file_handle_error_no_exit(full_path, "stat", false);
3491
3492 ut_free(full_path);
3493
3494 return(-1);
3495 }
3496
3497 info->size = statinfo.st_size;
3498
3499 if (S_ISDIR(statinfo.st_mode)) {
3500 info->type = OS_FILE_TYPE_DIR;
3501 } else if (S_ISLNK(statinfo.st_mode)) {
3502 info->type = OS_FILE_TYPE_LINK;
3503 } else if (S_ISREG(statinfo.st_mode)) {
3504 info->type = OS_FILE_TYPE_FILE;
3505 } else {
3506 info->type = OS_FILE_TYPE_UNKNOWN;
3507 }
3508
3509 ut_free(full_path);
3510
3511 return(0);
3512 }
3513
3514 /** NOTE! Use the corresponding macro os_file_create(), not directly
3515 this function!
3516 Opens an existing file or creates a new.
3517 @param[in] name name of the file or path as a null-terminated
3518 string
3519 @param[in] create_mode create mode
3520 @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
3521 is desired, OS_FILE_NORMAL, if any normal file;
3522 NOTE that it also depends on type, os_aio_..
3523 and srv_.. variables whether we really use async
3524 I/O or unbuffered I/O: look in the function
3525 source code for the exact rules
3526 @param[in] type OS_DATA_FILE or OS_LOG_FILE
3527 @param[in] read_only true, if read only checks should be enforcedm
3528 @param[in] success true if succeeded
3529 @return handle to the file, not defined if error, error number
3530 can be retrieved with os_file_get_last_error */
3531 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)3532 os_file_create_func(
3533 const char* name,
3534 ulint create_mode,
3535 ulint purpose,
3536 ulint type,
3537 bool read_only,
3538 bool* success)
3539 {
3540 bool on_error_no_exit;
3541 bool on_error_silent;
3542 pfs_os_file_t file;
3543
3544 *success = false;
3545
3546 DBUG_EXECUTE_IF(
3547 "ib_create_table_fail_disk_full",
3548 *success = false;
3549 errno = ENOSPC;
3550 file.m_file = OS_FILE_CLOSED;
3551 return(file);
3552 );
3553
3554 int create_flag;
3555 const char* mode_str = NULL;
3556
3557 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
3558 ? true : false;
3559 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
3560 ? true : false;
3561
3562 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
3563 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
3564
3565 if (create_mode == OS_FILE_OPEN
3566 || create_mode == OS_FILE_OPEN_RAW
3567 || create_mode == OS_FILE_OPEN_RETRY) {
3568
3569 mode_str = "OPEN";
3570
3571 create_flag = read_only ? O_RDONLY : O_RDWR;
3572
3573 } else if (read_only) {
3574
3575 mode_str = "OPEN";
3576
3577 create_flag = O_RDONLY;
3578
3579 } else if (create_mode == OS_FILE_CREATE) {
3580
3581 mode_str = "CREATE";
3582 create_flag = O_RDWR | O_CREAT | O_EXCL;
3583
3584 } else if (create_mode == OS_FILE_OVERWRITE) {
3585
3586 mode_str = "OVERWRITE";
3587 create_flag = O_RDWR | O_CREAT | O_TRUNC;
3588
3589 } else {
3590 ib::error()
3591 << "Unknown file create mode (" << create_mode << ")"
3592 << " for file '" << name << "'";
3593
3594 file.m_file = OS_FILE_CLOSED;
3595 return(file);
3596 }
3597
3598 ut_a(type == OS_LOG_FILE
3599 || type == OS_DATA_FILE
3600 || type == OS_DATA_TEMP_FILE);
3601
3602 ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
3603
3604 #ifdef O_SYNC
3605 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
3606 O_SYNC because the datasync options seemed to corrupt files in 2001
3607 in both Linux and Solaris */
3608
3609 if (!read_only
3610 && type == OS_LOG_FILE
3611 && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
3612
3613 create_flag |= O_SYNC;
3614 }
3615 #endif /* O_SYNC */
3616
3617 bool retry;
3618
3619 do {
3620 file.m_file = ::open(name, create_flag, os_innodb_umask);
3621
3622 if (file.m_file == -1) {
3623 const char* operation;
3624
3625 operation = (create_mode == OS_FILE_CREATE
3626 && !read_only) ? "create" : "open";
3627
3628 *success = false;
3629
3630 if (on_error_no_exit) {
3631 retry = os_file_handle_error_no_exit(
3632 name, operation, on_error_silent);
3633 } else {
3634 retry = os_file_handle_error(name, operation);
3635 }
3636 } else {
3637 *success = true;
3638 retry = false;
3639 }
3640
3641 } while (retry);
3642
3643 /* We disable OS caching (O_DIRECT) only on data files */
3644
3645 if (!read_only
3646 && *success
3647 && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE)
3648 && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
3649 || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
3650
3651 os_file_set_nocache(file.m_file, name, mode_str);
3652 } else if (!read_only
3653 && *success
3654 && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
3655
3656 os_file_set_nocache(file.m_file, name, mode_str);
3657 }
3658
3659 #ifdef USE_FILE_LOCK
3660 if (!read_only
3661 && *success
3662 && create_mode != OS_FILE_OPEN_RAW
3663 && os_file_lock(file.m_file, name)) {
3664
3665 if (create_mode == OS_FILE_OPEN_RETRY) {
3666
3667 ib::info()
3668 << "Retrying to lock the first data file";
3669
3670 for (int i = 0; i < 100; i++) {
3671 os_thread_sleep(1000000);
3672
3673 if (!os_file_lock(file.m_file, name)) {
3674 *success = true;
3675 return(file);
3676 }
3677 }
3678
3679 ib::info()
3680 << "Unable to open the first data file";
3681 }
3682
3683 *success = false;
3684 close(file.m_file);
3685 file.m_file = -1;
3686 }
3687 #endif /* USE_FILE_LOCK */
3688
3689 return(file);
3690 }
3691
3692 /** NOTE! Use the corresponding macro
3693 os_file_create_simple_no_error_handling(), not directly this function!
3694 A simple function to open or create a file.
3695 @param[in] name name of the file or path as a null-terminated
3696 string
3697 @param[in] create_mode create mode
3698 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3699 OS_FILE_READ_ALLOW_DELETE; the last option
3700 is used by a backup program reading the file
3701 @param[in] read_only if true read only mode checks are enforced
3702 @param[out] success true if succeeded
3703 @return own: handle to the file, not defined if error, error number
3704 can be retrieved with os_file_get_last_error */
3705 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3706 os_file_create_simple_no_error_handling_func(
3707 const char* name,
3708 ulint create_mode,
3709 ulint access_type,
3710 bool read_only,
3711 bool* success)
3712 {
3713 pfs_os_file_t file;
3714 int create_flag;
3715
3716 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3717 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3718
3719 *success = false;
3720
3721 if (create_mode == OS_FILE_OPEN) {
3722
3723 if (access_type == OS_FILE_READ_ONLY) {
3724
3725 create_flag = O_RDONLY;
3726
3727 } else if (read_only) {
3728
3729 create_flag = O_RDONLY;
3730
3731 } else {
3732
3733 ut_a(access_type == OS_FILE_READ_WRITE
3734 || access_type == OS_FILE_READ_ALLOW_DELETE);
3735
3736 create_flag = O_RDWR;
3737 }
3738
3739 } else if (read_only) {
3740
3741 create_flag = O_RDONLY;
3742
3743 } else if (create_mode == OS_FILE_CREATE) {
3744
3745 create_flag = O_RDWR | O_CREAT | O_EXCL;
3746
3747 } else {
3748
3749 ib::error()
3750 << "Unknown file create mode "
3751 << create_mode << " for file '" << name << "'";
3752 file.m_file = OS_FILE_CLOSED;
3753 return(file);
3754 }
3755
3756 file.m_file = ::open(name, create_flag, os_innodb_umask);
3757
3758 *success = (file.m_file != -1);
3759
3760 #ifdef USE_FILE_LOCK
3761 if (!read_only
3762 && *success
3763 && access_type == OS_FILE_READ_WRITE
3764 && os_file_lock(file.m_file, name)) {
3765
3766 *success = false;
3767 close(file.m_file);
3768 file.m_file = -1;
3769
3770 }
3771 #endif /* USE_FILE_LOCK */
3772
3773 return(file);
3774 }
3775
3776 /** Deletes a file if it exists. The file has to be closed before calling this.
3777 @param[in] name file path as a null-terminated string
3778 @param[out] exist indicate if file pre-exist
3779 @return true if success */
3780 bool
os_file_delete_if_exists_func(const char * name,bool * exist)3781 os_file_delete_if_exists_func(
3782 const char* name,
3783 bool* exist)
3784 {
3785 if (exist != NULL) {
3786 *exist = true;
3787 }
3788
3789 int ret = unlink(name);
3790
3791 if (ret != 0 && errno == ENOENT) {
3792 if (exist != NULL) {
3793 *exist = false;
3794 }
3795 } else if (ret != 0 && errno != ENOENT) {
3796 os_file_handle_error_no_exit(name, "delete", false);
3797
3798 return(false);
3799 }
3800
3801 return(true);
3802 }
3803
3804 /** Deletes a file. The file has to be closed before calling this.
3805 @param[in] name file path as a null-terminated string
3806 @return true if success */
3807 bool
os_file_delete_func(const char * name)3808 os_file_delete_func(
3809 const char* name)
3810 {
3811 int ret = unlink(name);
3812
3813 if (ret != 0) {
3814 os_file_handle_error_no_exit(name, "delete", false);
3815
3816 return(false);
3817 }
3818
3819 return(true);
3820 }
3821
3822 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3823 function!
3824 Renames a file (can also move it to another directory). It is safest that the
3825 file is closed before calling this function.
3826 @param[in] oldpath old file path as a null-terminated string
3827 @param[in] newpath new file path
3828 @return true if success */
3829 bool
os_file_rename_func(const char * oldpath,const char * newpath)3830 os_file_rename_func(
3831 const char* oldpath,
3832 const char* newpath)
3833 {
3834 #ifdef UNIV_DEBUG
3835 os_file_type_t type;
3836 bool exists;
3837
3838 /* New path must not exist. */
3839 ut_ad(os_file_status(newpath, &exists, &type));
3840 ut_ad(!exists);
3841
3842 /* Old path must exist. */
3843 ut_ad(os_file_status(oldpath, &exists, &type));
3844 ut_ad(exists);
3845 #endif /* UNIV_DEBUG */
3846
3847 int ret = rename(oldpath, newpath);
3848
3849 if (ret != 0) {
3850 os_file_handle_error_no_exit(oldpath, "rename", false);
3851
3852 return(false);
3853 }
3854
3855 return(true);
3856 }
3857
3858 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3859 function!
3860 Closes a file handle. In case of error, error number can be retrieved with
3861 os_file_get_last_error.
3862 @param[in] file Handle to close
3863 @return true if success */
3864 bool
os_file_close_func(os_file_t file)3865 os_file_close_func(
3866 os_file_t file)
3867 {
3868 int ret = close(file);
3869
3870 if (ret == -1) {
3871 os_file_handle_error(NULL, "close");
3872
3873 return(false);
3874 }
3875
3876 return(true);
3877 }
3878
3879 /** Gets a file size.
3880 @param[in] file handle to an open file
3881 @return file size, or (os_offset_t) -1 on failure */
3882 os_offset_t
os_file_get_size(pfs_os_file_t file)3883 os_file_get_size(
3884 pfs_os_file_t file)
3885 {
3886 /* Store current position */
3887 os_offset_t pos = lseek(file.m_file, 0, SEEK_CUR);
3888 os_offset_t file_size = lseek(file.m_file, 0, SEEK_END);
3889
3890 /* Restore current position as the function should not change it */
3891 lseek(file.m_file, pos, SEEK_SET);
3892
3893 return(file_size);
3894 }
3895
3896 /** Gets a file size.
3897 @param[in] filename Full path to the filename to check
3898 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3899 errno */
3900 os_file_size_t
os_file_get_size(const char * filename)3901 os_file_get_size(
3902 const char* filename)
3903 {
3904 struct stat s;
3905 os_file_size_t file_size;
3906
3907 int ret = stat(filename, &s);
3908
3909 if (ret == 0) {
3910 file_size.m_total_size = s.st_size;
3911 /* st_blocks is in 512 byte sized blocks */
3912 file_size.m_alloc_size = s.st_blocks * 512;
3913 } else {
3914 file_size.m_total_size = ~0;
3915 file_size.m_alloc_size = (os_offset_t) errno;
3916 }
3917
3918 return(file_size);
3919 }
3920
3921 /** This function returns information about the specified file
3922 @param[in] path pathname of the file
3923 @param[out] stat_info information of a file in a directory
3924 @param[in,out] statinfo information of a file in a directory
3925 @param[in] check_rw_perm for testing whether the file can be opened
3926 in RW mode
3927 @param[in] read_only if true read only mode checks are enforced
3928 @return DB_SUCCESS if all OK */
3929 static
3930 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3931 os_file_get_status_posix(
3932 const char* path,
3933 os_file_stat_t* stat_info,
3934 struct stat* statinfo,
3935 bool check_rw_perm,
3936 bool read_only)
3937 {
3938 int ret = stat(path, statinfo);
3939
3940 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3941 /* file does not exist */
3942
3943 return(DB_NOT_FOUND);
3944
3945 } else if (ret) {
3946 /* file exists, but stat call failed */
3947
3948 os_file_handle_error_no_exit(path, "stat", false);
3949
3950 return(DB_FAIL);
3951 }
3952
3953 switch (statinfo->st_mode & S_IFMT) {
3954 case S_IFDIR:
3955 stat_info->type = OS_FILE_TYPE_DIR;
3956 break;
3957 case S_IFLNK:
3958 stat_info->type = OS_FILE_TYPE_LINK;
3959 break;
3960 case S_IFBLK:
3961 /* Handle block device as regular file. */
3962 case S_IFCHR:
3963 /* Handle character device as regular file. */
3964 case S_IFREG:
3965 stat_info->type = OS_FILE_TYPE_FILE;
3966 break;
3967 default:
3968 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3969 }
3970
3971 stat_info->size = statinfo->st_size;
3972 stat_info->block_size = statinfo->st_blksize;
3973 stat_info->alloc_size = statinfo->st_blocks * 512;
3974
3975 if (check_rw_perm
3976 && (stat_info->type == OS_FILE_TYPE_FILE
3977 || stat_info->type == OS_FILE_TYPE_BLOCK)) {
3978
3979 int access = !read_only ? O_RDWR : O_RDONLY;
3980 int fh = ::open(path, access, os_innodb_umask);
3981
3982 if (fh == -1) {
3983 stat_info->rw_perm = false;
3984 } else {
3985 stat_info->rw_perm = true;
3986 close(fh);
3987 }
3988 }
3989
3990 return(DB_SUCCESS);
3991 }
3992
3993 /** Truncates a file to a specified size in bytes.
3994 Do nothing if the size to preserve is greater or equal to the current
3995 size of the file.
3996 @param[in] pathname file path
3997 @param[in] file file to be truncated
3998 @param[in] size size to preserve in bytes
3999 @return true if success */
4000 static
4001 bool
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)4002 os_file_truncate_posix(
4003 const char* pathname,
4004 pfs_os_file_t file,
4005 os_offset_t size)
4006 {
4007 int res = ftruncate(file.m_file, size);
4008 if (res == -1) {
4009
4010 bool retry;
4011
4012 retry = os_file_handle_error_no_exit(
4013 pathname, "truncate", false);
4014
4015 if (retry) {
4016 ib::warn()
4017 << "Truncate failed for '"
4018 << pathname << "'";
4019 }
4020 }
4021
4022 return(res == 0);
4023 }
4024
/** Cuts a stdio stream's underlying file off at the stream's current
position, as reported by ftell().
@param[in]	file	file to be truncated
@return true if success */
bool
os_file_set_eof(
	FILE*		file)
{
	const int	fd = fileno(file);
	const long	cur_pos = ftell(file);

	return(ftruncate(fd, cur_pos) == 0);
}
4033
4034 #ifdef UNIV_HOTBACKUP
4035 /** Closes a file handle.
4036 @param[in] file Handle to a file
4037 @return true if success */
4038 bool
os_file_close_no_error_handling(os_file_t file)4039 os_file_close_no_error_handling(
4040 os_file_t file)
4041 {
4042 return(close(file) != -1);
4043 }
4044 #endif /* UNIV_HOTBACKUP */
4045
/** Hook for callers that want to post a batch of reads and have an
i/o-handler thread process them all at once later; such callers must call
os_aio_simulated_wake_handler_threads afterwards so that the threads are
not left sleeping. This non-Windows implementation does nothing. */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	/* Intentionally empty: no op on non Windows */
}
4055
4056 #else /* !_WIN32 */
4057
4058 #include <WinIoCtl.h>
4059
4060 /** Do the read/write
4061 @param[in] request The IO context and type
4062 @return the number of bytes read/written or negative value on error */
4063 ssize_t
execute(const IORequest & request)4064 SyncFileIO::execute(const IORequest& request)
4065 {
4066 OVERLAPPED seek;
4067
4068 memset(&seek, 0x0, sizeof(seek));
4069
4070 seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
4071 seek.OffsetHigh = (DWORD) (m_offset >> 32);
4072
4073 BOOL ret;
4074 DWORD n_bytes;
4075
4076 if (request.is_read()) {
4077 ret = ReadFile(m_fh, m_buf,
4078 static_cast<DWORD>(m_n), &n_bytes, &seek);
4079
4080 } else {
4081 ut_ad(request.is_write());
4082 ret = WriteFile(m_fh, m_buf,
4083 static_cast<DWORD>(m_n), &n_bytes, &seek);
4084 }
4085
4086 return(ret ? static_cast<ssize_t>(n_bytes) : -1);
4087 }
4088
4089 /** Do the read/write
4090 @param[in,out] slot The IO slot, it has the IO context
4091 @return the number of bytes read/written or negative value on error */
4092 ssize_t
execute(Slot * slot)4093 SyncFileIO::execute(Slot* slot)
4094 {
4095 BOOL ret;
4096
4097 if (slot->type.is_read()) {
4098 ret = ReadFile(
4099 slot->file.m_file, slot->ptr, slot->len,
4100 &slot->n_bytes, &slot->control);
4101 } else {
4102 ut_ad(slot->type.is_write());
4103 ret = WriteFile(
4104 slot->file.m_file, slot->ptr, slot->len,
4105 &slot->n_bytes, &slot->control);
4106 }
4107
4108 return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
4109 }
4110
4111 /** Check if the file system supports sparse files.
4112 @param[in] name File name
4113 @return true if the file system supports sparse files */
4114 static
4115 bool
os_is_sparse_file_supported_win32(const char * filename)4116 os_is_sparse_file_supported_win32(const char* filename)
4117 {
4118 char volname[MAX_PATH];
4119 BOOL result = GetVolumePathName(filename, volname, MAX_PATH);
4120
4121 if (!result) {
4122
4123 ib::error()
4124 << "os_is_sparse_file_supported: "
4125 << "Failed to get the volume path name for: "
4126 << filename
4127 << "- OS error number " << GetLastError();
4128
4129 return(false);
4130 }
4131
4132 DWORD flags;
4133
4134 GetVolumeInformation(
4135 volname, NULL, MAX_PATH, NULL, NULL,
4136 &flags, NULL, MAX_PATH);
4137
4138 return(flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
4139 }
4140
4141 /** Free storage space associated with a section of the file.
4142 @param[in] fh Open file handle
4143 @param[in] page_size Tablespace page size
4144 @param[in] block_size File system block size
4145 @param[in] off Starting offset (SEEK_SET)
4146 @param[in] len Size of the hole
4147 @return 0 on success or errno */
4148 static
4149 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)4150 os_file_punch_hole_win32(
4151 os_file_t fh,
4152 os_offset_t off,
4153 os_offset_t len)
4154 {
4155 FILE_ZERO_DATA_INFORMATION punch;
4156
4157 punch.FileOffset.QuadPart = off;
4158 punch.BeyondFinalZero.QuadPart = off + len;
4159
4160 /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
4161 therefore we pass a dummy parameter. */
4162 DWORD temp;
4163
4164 BOOL result = DeviceIoControl(
4165 fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
4166 NULL, 0, &temp, NULL);
4167
4168 return(!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
4169 }
4170
4171 /** Check the existence and type of the given file.
4172 @param[in] path path name of file
4173 @param[out] exists true if the file exists
4174 @param[out] type Type of the file, if it exists
4175 @return true if call succeeded */
4176 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)4177 os_file_status_win32(
4178 const char* path,
4179 bool* exists,
4180 os_file_type_t* type)
4181 {
4182 int ret;
4183 struct _stat64 statinfo;
4184
4185 ret = _stat64(path, &statinfo);
4186
4187 *exists = !ret;
4188
4189 if (!ret) {
4190 /* file exists, everything OK */
4191
4192 } else if (errno == ENOENT || errno == ENOTDIR
4193 || errno == ENAMETOOLONG) {
4194 /* file does not exist */
4195 return(true);
4196
4197 } else {
4198 /* file exists, but stat call failed */
4199 os_file_handle_error_no_exit(path, "stat", false);
4200 return(false);
4201 }
4202
4203 if (_S_IFDIR & statinfo.st_mode) {
4204 *type = OS_FILE_TYPE_DIR;
4205
4206 } else if (_S_IFREG & statinfo.st_mode) {
4207 *type = OS_FILE_TYPE_FILE;
4208
4209 } else {
4210 *type = OS_FILE_TYPE_UNKNOWN;
4211 }
4212
4213 return(true);
4214 }
4215
4216 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
4217 function!
4218 Flushes the write buffers of a given file to the disk.
4219 @param[in] file handle to a file
4220 @return true if success */
4221 bool
os_file_flush_func(os_file_t file)4222 os_file_flush_func(
4223 os_file_t file)
4224 {
4225 ++os_n_fsyncs;
4226
4227 BOOL ret = FlushFileBuffers(file);
4228
4229 if (ret) {
4230 return(true);
4231 }
4232
4233 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
4234 actually a raw device, we choose to ignore that error if we are using
4235 raw disks */
4236
4237 if (srv_start_raw_disk_in_use && GetLastError()
4238 == ERROR_INVALID_FUNCTION) {
4239 return(true);
4240 }
4241
4242 os_file_handle_error(NULL, "flush");
4243
4244 /* It is a fatal error if a file flush does not succeed, because then
4245 the database can get corrupt on disk */
4246 ut_error;
4247
4248 return(false);
4249 }
4250
4251 /** Retrieves the last error number if an error occurs in a file io function.
4252 The number should be retrieved before any other OS calls (because they may
4253 overwrite the error number). If the number is not known to this program,
4254 the OS error number + 100 is returned.
4255 @param[in] report_all_errors true if we want an error message printed
4256 of all errors
4257 @param[in] on_error_silent true then don't print any diagnostic
4258 to the log
4259 @return error number, or OS error number + 100 */
4260 static
4261 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)4262 os_file_get_last_error_low(
4263 bool report_all_errors,
4264 bool on_error_silent)
4265 {
4266 ulint err = (ulint) GetLastError();
4267
4268 if (err == ERROR_SUCCESS) {
4269 return(0);
4270 }
4271
4272 if (report_all_errors
4273 || (!on_error_silent
4274 && err != ERROR_DISK_FULL
4275 && err != ERROR_FILE_EXISTS)) {
4276
4277 ib::error()
4278 << "Operating system error number " << err
4279 << " in a file operation.";
4280
4281 if (err == ERROR_PATH_NOT_FOUND) {
4282 ib::error()
4283 << "The error means the system"
4284 " cannot find the path specified.";
4285
4286 if (srv_is_being_started) {
4287 ib::error()
4288 << "If you are installing InnoDB,"
4289 " remember that you must create"
4290 " directories yourself, InnoDB"
4291 " does not create them.";
4292 }
4293
4294 } else if (err == ERROR_ACCESS_DENIED) {
4295
4296 ib::error()
4297 << "The error means mysqld does not have"
4298 " the access rights to"
4299 " the directory. It may also be"
4300 " you have created a subdirectory"
4301 " of the same name as a data file.";
4302
4303 } else if (err == ERROR_SHARING_VIOLATION
4304 || err == ERROR_LOCK_VIOLATION) {
4305
4306 ib::error()
4307 << "The error means that another program"
4308 " is using InnoDB's files."
4309 " This might be a backup or antivirus"
4310 " software or another instance"
4311 " of MySQL."
4312 " Please close it to get rid of this error.";
4313
4314 } else if (err == ERROR_WORKING_SET_QUOTA
4315 || err == ERROR_NO_SYSTEM_RESOURCES) {
4316
4317 ib::error()
4318 << "The error means that there are no"
4319 " sufficient system resources or quota to"
4320 " complete the operation.";
4321
4322 } else if (err == ERROR_OPERATION_ABORTED) {
4323
4324 ib::error()
4325 << "The error means that the I/O"
4326 " operation has been aborted"
4327 " because of either a thread exit"
4328 " or an application request."
4329 " Retry attempt is made.";
4330 } else {
4331
4332 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4333 }
4334 }
4335
4336 if (err == ERROR_FILE_NOT_FOUND) {
4337 return(OS_FILE_NOT_FOUND);
4338 } else if (err == ERROR_DISK_FULL) {
4339 return(OS_FILE_DISK_FULL);
4340 } else if (err == ERROR_FILE_EXISTS) {
4341 return(OS_FILE_ALREADY_EXISTS);
4342 } else if (err == ERROR_SHARING_VIOLATION
4343 || err == ERROR_LOCK_VIOLATION) {
4344 return(OS_FILE_SHARING_VIOLATION);
4345 } else if (err == ERROR_WORKING_SET_QUOTA
4346 || err == ERROR_NO_SYSTEM_RESOURCES) {
4347 return(OS_FILE_INSUFFICIENT_RESOURCE);
4348 } else if (err == ERROR_OPERATION_ABORTED) {
4349 return(OS_FILE_OPERATION_ABORTED);
4350 } else if (err == ERROR_ACCESS_DENIED) {
4351 return(OS_FILE_ACCESS_VIOLATION);
4352 }
4353
4354 return(OS_FILE_ERROR_MAX + err);
4355 }
4356
4357 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
4358 this function!
4359 A simple function to open or create a file.
4360 @param[in] name name of the file or path as a null-terminated
4361 string
4362 @param[in] create_mode create mode
4363 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
4364 @param[in] read_only if true read only mode checks are enforced
4365 @param[out] success true if succeed, false if error
4366 @return handle to the file, not defined if error, error number
4367 can be retrieved with os_file_get_last_error */
4368 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4369 os_file_create_simple_func(
4370 const char* name,
4371 ulint create_mode,
4372 ulint access_type,
4373 bool read_only,
4374 bool* success)
4375 {
4376 pfs_os_file_t file;
4377
4378 *success = false;
4379
4380 DWORD access;
4381 DWORD create_flag;
4382 DWORD attributes = 0;
4383
4384 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4385 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4386
4387 if (create_mode == OS_FILE_OPEN) {
4388
4389 create_flag = OPEN_EXISTING;
4390
4391 } else if (read_only) {
4392
4393 create_flag = OPEN_EXISTING;
4394
4395 } else if (create_mode == OS_FILE_CREATE) {
4396
4397 create_flag = CREATE_NEW;
4398
4399 } else if (create_mode == OS_FILE_CREATE_PATH) {
4400
4401 /* Create subdirs along the path if needed. */
4402 *success = os_file_create_subdirs_if_needed(name);
4403
4404 if (!*success) {
4405
4406 ib::error()
4407 << "Unable to create subdirectories '"
4408 << name << "'";
4409 file.m_file = OS_FILE_CLOSED;
4410 return(file);
4411 }
4412
4413 create_flag = CREATE_NEW;
4414 create_mode = OS_FILE_CREATE;
4415
4416 } else {
4417
4418 ib::error()
4419 << "Unknown file create mode ("
4420 << create_mode << ") for file '"
4421 << name << "'";
4422
4423 file.m_file = OS_FILE_CLOSED;
4424 return(file);
4425 }
4426
4427 if (access_type == OS_FILE_READ_ONLY) {
4428
4429 access = GENERIC_READ;
4430
4431 } else if (read_only) {
4432
4433 ib::info()
4434 << "Read only mode set. Unable to"
4435 " open file '" << name << "' in RW mode, "
4436 << "trying RO mode", name;
4437
4438 access = GENERIC_READ;
4439
4440 } else if (access_type == OS_FILE_READ_WRITE) {
4441
4442 access = GENERIC_READ | GENERIC_WRITE;
4443
4444 } else {
4445
4446 ib::error()
4447 << "Unknown file access type (" << access_type << ") "
4448 "for file '" << name << "'";
4449
4450 file.m_file = OS_FILE_CLOSED;
4451 return(file);
4452 }
4453
4454 bool retry;
4455
4456 do {
4457 /* Use default security attributes and no template file. */
4458
4459 file.m_file = CreateFile(
4460 (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
4461 create_flag, attributes, NULL);
4462
4463 if (file.m_file == INVALID_HANDLE_VALUE) {
4464
4465 *success = false;
4466
4467 retry = os_file_handle_error(
4468 name, create_mode == OS_FILE_OPEN ?
4469 "open" : "create");
4470
4471 } else {
4472
4473 retry = false;
4474
4475 *success = true;
4476
4477 DWORD temp;
4478
4479 /* This is a best effort use case, if it fails then
4480 we will find out when we try and punch the hole. */
4481
4482 DeviceIoControl(
4483 file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
4484 &temp, NULL);
4485 }
4486
4487 } while (retry);
4488
4489 return(file);
4490 }
4491
4492 /** This function attempts to create a directory named pathname. The new
4493 directory gets default permissions. On Unix the permissions are
4494 (0770 & ~umask). If the directory exists already, nothing is done and
4495 the call succeeds, unless the fail_if_exists arguments is true.
4496 If another error occurs, such as a permission error, this does not crash,
4497 but reports the error and returns false.
4498 @param[in] pathname directory name as null-terminated string
4499 @param[in] fail_if_exists if true, pre-existing directory is treated
4500 as an error.
4501 @return true if call succeeds, false on error */
4502 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)4503 os_file_create_directory(
4504 const char* pathname,
4505 bool fail_if_exists)
4506 {
4507 BOOL rcode;
4508
4509 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
4510 if (!(rcode != 0
4511 || (GetLastError() == ERROR_ALREADY_EXISTS
4512 && !fail_if_exists))) {
4513
4514 os_file_handle_error_no_exit(
4515 pathname, "CreateDirectory", false);
4516
4517 return(false);
4518 }
4519
4520 return(true);
4521 }
4522
4523 /** The os_file_opendir() function opens a directory stream corresponding to the
4524 directory named by the dirname argument. The directory stream is positioned
4525 at the first entry. In both Unix and Windows we automatically skip the '.'
4526 and '..' items at the start of the directory listing.
4527 @param[in] dirname directory name; it must not contain a trailing
4528 '\' or '/'
4529 @param[in] is_fatal true if we should treat an error as a fatal
4530 error; if we try to open symlinks then we do
4531 not wish a fatal error if it happens not to
4532 be a directory
4533 @return directory stream, NULL if error */
4534 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)4535 os_file_opendir(
4536 const char* dirname,
4537 bool error_is_fatal)
4538 {
4539 os_file_dir_t dir;
4540 LPWIN32_FIND_DATA lpFindFileData;
4541 char path[OS_FILE_MAX_PATH + 3];
4542
4543 ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
4544
4545 strcpy(path, dirname);
4546 strcpy(path + strlen(path), "\\*");
4547
4548 /* Note that in Windows opening the 'directory stream' also retrieves
4549 the first entry in the directory. Since it is '.', that is no problem,
4550 as we will skip over the '.' and '..' entries anyway. */
4551
4552 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
4553 ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
4554
4555 dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
4556
4557 ut_free(lpFindFileData);
4558
4559 if (dir == INVALID_HANDLE_VALUE) {
4560
4561 if (error_is_fatal) {
4562 os_file_handle_error(dirname, "opendir");
4563 }
4564
4565 return(NULL);
4566 }
4567
4568 return(dir);
4569 }
4570
4571 /** Closes a directory stream.
4572 @param[in] dir directory stream
4573 @return 0 if success, -1 if failure */
4574 int
os_file_closedir(os_file_dir_t dir)4575 os_file_closedir(
4576 os_file_dir_t dir)
4577 {
4578 BOOL ret;
4579
4580 ret = FindClose(dir);
4581
4582 if (!ret) {
4583 os_file_handle_error_no_exit(NULL, "closedir", false);
4584
4585 return(-1);
4586 }
4587
4588 return(0);
4589 }
4590
4591 /** This function returns information of the next file in the directory. We
4592 jump over the '.' and '..' entries in the directory.
4593 @param[in] dirname directory name or path
4594 @param[in] dir directory stream
4595 @param[out] info buffer where the info is returned
4596 @return 0 if ok, -1 if error, 1 if at the end of the directory */
4597 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4598 os_file_readdir_next_file(
4599 const char* dirname,
4600 os_file_dir_t dir,
4601 os_file_stat_t* info)
4602 {
4603 BOOL ret;
4604 int status;
4605 WIN32_FIND_DATA find_data;
4606
4607 next_file:
4608
4609 ret = FindNextFile(dir, &find_data);
4610
4611 if (ret > 0) {
4612
4613 const char* name;
4614
4615 name = static_cast<const char*>(find_data.cFileName);
4616
4617 ut_a(strlen(name) < OS_FILE_MAX_PATH);
4618
4619 if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
4620
4621 goto next_file;
4622 }
4623
4624 strcpy(info->name, name);
4625
4626 info->size = find_data.nFileSizeHigh;
4627 info->size <<= 32;
4628 info->size |= find_data.nFileSizeLow;
4629
4630 if (find_data.dwFileAttributes
4631 & FILE_ATTRIBUTE_REPARSE_POINT) {
4632
4633 /* TODO: test Windows symlinks */
4634 /* TODO: MySQL has apparently its own symlink
4635 implementation in Windows, dbname.sym can
4636 redirect a database directory:
4637 REFMAN "windows-symbolic-links.html" */
4638
4639 info->type = OS_FILE_TYPE_LINK;
4640
4641 } else if (find_data.dwFileAttributes
4642 & FILE_ATTRIBUTE_DIRECTORY) {
4643
4644 info->type = OS_FILE_TYPE_DIR;
4645
4646 } else {
4647
4648 /* It is probably safest to assume that all other
4649 file types are normal. Better to check them rather
4650 than blindly skip them. */
4651
4652 info->type = OS_FILE_TYPE_FILE;
4653 }
4654
4655 status = 0;
4656
4657 } else if (GetLastError() == ERROR_NO_MORE_FILES) {
4658
4659 status = 1;
4660
4661 } else {
4662
4663 os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
4664
4665 status = -1;
4666 }
4667
4668 return(status);
4669 }
4670
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode: OS_FILE_OPEN_RAW, OS_FILE_OPEN,
				OS_FILE_OPEN_RETRY, OS_FILE_CREATE or
				OS_FILE_OVERWRITE, optionally combined with
				the OS_FILE_ON_ERROR_NO_EXIT and
				OS_FILE_ON_ERROR_SILENT flags
@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
				is desired, OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use async
				I/O or unbuffered I/O: look in the function
				source code for the exact rules
@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
@param[in]	read_only	if true, the file is opened for reading only
				and is never created
@param[out]	success		true if succeeded
@return handle to the file, not defined if error, error number
	can be retrieved with os_file_get_last_error */
pfs_os_file_t
os_file_create_func(
	const char*	name,
	ulint		create_mode,
	ulint		purpose,
	ulint		type,
	bool		read_only,
	bool*		success)
{
	pfs_os_file_t	file;
	bool		retry;
	bool		on_error_no_exit;
	bool		on_error_silent;

	/* Every early return below leaves the failure indication set. */
	*success = false;

	/* Debug-only fault injection: pretend that the disk is full. */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = false;
		SetLastError(ERROR_DISK_FULL);
		file.m_file = OS_FILE_CLOSED;
		return(file);
	);

	DWORD		create_flag;
	DWORD		share_mode = FILE_SHARE_READ;

	/* Remember the error handling flags, then strip them so that
	create_mode can be compared against the plain mode values. */
	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? true : false;

	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? true : false;

	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
	create_mode &= ~OS_FILE_ON_ERROR_SILENT;

	if (create_mode == OS_FILE_OPEN_RAW) {

		ut_a(!read_only);

		create_flag = OPEN_EXISTING;

		/* On Windows Physical devices require admin privileges and
		have to have the write-share mode set. See the remarks
		section for the CreateFile() function documentation in MSDN. */

		share_mode |= FILE_SHARE_WRITE;

	} else if (create_mode == OS_FILE_OPEN
		   || create_mode == OS_FILE_OPEN_RETRY) {

		create_flag = OPEN_EXISTING;

	} else if (read_only) {

		/* In read only mode nothing is created or overwritten. Note
		that this branch also catches unknown create modes. */
		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		create_flag = CREATE_ALWAYS;

	} else {
		ib::error()
			<< "Unknown file create mode (" << create_mode << ") "
			<< " for file '" << name << "'";

		file.m_file = OS_FILE_CLOSED;
		return(file);
	}

	DWORD		attributes = 0;

#ifdef UNIV_HOTBACKUP
	attributes |= FILE_FLAG_NO_BUFFERING;
#else
	if (purpose == OS_FILE_AIO) {

#ifdef WIN_ASYNC_IO
		/* If specified, use asynchronous (overlapped) io and no
		buffering of writes in the OS */

		if (srv_use_native_aio) {
			attributes |= FILE_FLAG_OVERLAPPED;
		}
#endif /* WIN_ASYNC_IO */

	} else if (purpose == OS_FILE_NORMAL) {

		/* Use default setting. */

	} else {

		ib::error()
			<< "Unknown purpose flag (" << purpose << ") "
			<< "while opening file '" << name << "'";

		file.m_file = OS_FILE_CLOSED;
		return(file);
	}

#ifdef UNIV_NON_BUFFERED_IO
	// TODO: Create a bug, this looks wrong. The flush log
	// parameter is dynamic.
	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {

		/* Do not use unbuffered i/o for the log files because
		value 2 denotes that we do not flush the log at every
		commit, but only once per second */

	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {

		attributes |= FILE_FLAG_NO_BUFFERING;
	}
#endif /* UNIV_NON_BUFFERED_IO */

#endif /* UNIV_HOTBACKUP */
	DWORD	access = GENERIC_READ;

	if (!read_only) {
		access |= GENERIC_WRITE;
	}

	do {
		/* Use default security attributes and no template file. */
		file.m_file = CreateFile(
			(LPCTSTR) name, access, share_mode, NULL,
			create_flag, attributes, NULL);

		if (file.m_file == INVALID_HANDLE_VALUE) {
			const char*	operation;

			/* Name of the failed operation, passed on to the
			error handler. */
			operation = (create_mode == OS_FILE_CREATE
				     && !read_only)
				? "create" : "open";

			*success = false;

			/* The return value of the error handler decides
			whether CreateFile() is attempted again. */
			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {

			retry = false;

			*success = true;

			DWORD	temp;

			/* This is a best effort use case, if it fails then
			we will find out when we try and punch the hole. */
			DeviceIoControl(
				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
				&temp, NULL);
		}

	} while (retry);

	return(file);
}
4853
4854 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4855 not directly this function!
4856 A simple function to open or create a file.
4857 @param[in] name name of the file or path as a null-terminated
4858 string
4859 @param[in] create_mode create mode
4860 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4861 OS_FILE_READ_ALLOW_DELETE; the last option is
4862 used by a backup program reading the file
4863 @param[out] success true if succeeded
4864 @return own: handle to the file, not defined if error, error number
4865 can be retrieved with os_file_get_last_error */
4866 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4867 os_file_create_simple_no_error_handling_func(
4868 const char* name,
4869 ulint create_mode,
4870 ulint access_type,
4871 bool read_only,
4872 bool* success)
4873 {
4874 pfs_os_file_t file;
4875
4876 *success = false;
4877
4878 DWORD access;
4879 DWORD create_flag;
4880 DWORD attributes = 0;
4881 DWORD share_mode = FILE_SHARE_READ;
4882
4883 ut_a(name);
4884
4885 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4886 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4887
4888 if (create_mode == OS_FILE_OPEN) {
4889
4890 create_flag = OPEN_EXISTING;
4891
4892 } else if (read_only) {
4893
4894 create_flag = OPEN_EXISTING;
4895
4896 } else if (create_mode == OS_FILE_CREATE) {
4897
4898 create_flag = CREATE_NEW;
4899
4900 } else {
4901
4902 ib::error()
4903 << "Unknown file create mode (" << create_mode << ") "
4904 << " for file '" << name << "'";
4905
4906 file.m_file = OS_FILE_CLOSED;
4907 return(file);
4908 }
4909
4910 if (access_type == OS_FILE_READ_ONLY) {
4911
4912 access = GENERIC_READ;
4913
4914 } else if (read_only) {
4915
4916 access = GENERIC_READ;
4917
4918 } else if (access_type == OS_FILE_READ_WRITE) {
4919
4920 access = GENERIC_READ | GENERIC_WRITE;
4921
4922 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4923
4924 ut_a(!read_only);
4925
4926 access = GENERIC_READ;
4927
4928 /*!< A backup program has to give mysqld the maximum
4929 freedom to do what it likes with the file */
4930
4931 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4932 } else {
4933
4934 ib::error()
4935 << "Unknown file access type (" << access_type << ") "
4936 << "for file '" << name << "'";
4937
4938 file.m_file = OS_FILE_CLOSED;
4939 return(file);
4940 }
4941
4942 file.m_file = CreateFile((LPCTSTR) name,
4943 access,
4944 share_mode,
4945 NULL, // Security attributes
4946 create_flag,
4947 attributes,
4948 NULL); // No template file
4949
4950 *success = (file.m_file != INVALID_HANDLE_VALUE);
4951
4952 return(file);
4953 }
4954
4955 /** Deletes a file if it exists. The file has to be closed before calling this.
4956 @param[in] name file path as a null-terminated string
4957 @param[out] exist indicate if file pre-exist
4958 @return true if success */
4959 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4960 os_file_delete_if_exists_func(
4961 const char* name,
4962 bool* exist)
4963 {
4964 ulint count = 0;
4965
4966 if (exist != NULL) {
4967 *exist = true;
4968 }
4969
4970 for (;;) {
4971 /* In Windows, deleting an .ibd file may fail if ibbackup
4972 is copying it */
4973
4974 bool ret = DeleteFile((LPCTSTR) name);
4975
4976 if (ret) {
4977 return(true);
4978 }
4979
4980 DWORD lasterr = GetLastError();
4981
4982 if (lasterr == ERROR_FILE_NOT_FOUND
4983 || lasterr == ERROR_PATH_NOT_FOUND) {
4984
4985 /* the file does not exist, this not an error */
4986 if (exist != NULL) {
4987 *exist = false;
4988 }
4989
4990 return(true);
4991 }
4992
4993 ++count;
4994
4995 if (count > 100 && 0 == (count % 10)) {
4996
4997 /* Print error information */
4998 os_file_get_last_error(true);
4999
5000 ib::warn() << "Delete of file '" << name << "' failed.";
5001 }
5002
5003 /* Sleep for a second */
5004 os_thread_sleep(1000000);
5005
5006 if (count > 2000) {
5007
5008 return(false);
5009 }
5010 }
5011 }
5012
5013 /** Deletes a file. The file has to be closed before calling this.
5014 @param[in] name File path as NUL terminated string
5015 @return true if success */
5016 bool
os_file_delete_func(const char * name)5017 os_file_delete_func(
5018 const char* name)
5019 {
5020 ulint count = 0;
5021
5022 for (;;) {
5023 /* In Windows, deleting an .ibd file may fail if ibbackup
5024 is copying it */
5025
5026 BOOL ret = DeleteFile((LPCTSTR) name);
5027
5028 if (ret) {
5029 return(true);
5030 }
5031
5032 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
5033 /* If the file does not exist, we classify this as
5034 a 'mild' error and return */
5035
5036 return(false);
5037 }
5038
5039 ++count;
5040
5041 if (count > 100 && 0 == (count % 10)) {
5042
5043 /* print error information */
5044 os_file_get_last_error(true);
5045
5046 ib::warn()
5047 << "Cannot delete file '" << name << "'. Are "
5048 << "you running ibbackup to back up the file?";
5049 }
5050
5051 /* sleep for a second */
5052 os_thread_sleep(1000000);
5053
5054 if (count > 2000) {
5055
5056 return(false);
5057 }
5058 }
5059
5060 ut_error;
5061 return(false);
5062 }
5063
5064 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
5065 function!
5066 Renames a file (can also move it to another directory). It is safest that the
5067 file is closed before calling this function.
5068 @param[in] oldpath old file path as a null-terminated string
5069 @param[in] newpath new file path
5070 @return true if success */
5071 bool
os_file_rename_func(const char * oldpath,const char * newpath)5072 os_file_rename_func(
5073 const char* oldpath,
5074 const char* newpath)
5075 {
5076 #ifdef UNIV_DEBUG
5077 os_file_type_t type;
5078 bool exists;
5079
5080 /* New path must not exist. */
5081 ut_ad(os_file_status(newpath, &exists, &type));
5082 ut_ad(!exists);
5083
5084 /* Old path must exist. */
5085 ut_ad(os_file_status(oldpath, &exists, &type));
5086 ut_ad(exists);
5087 #endif /* UNIV_DEBUG */
5088
5089 if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
5090 return(true);
5091 }
5092
5093 os_file_handle_error_no_exit(oldpath, "rename", false);
5094
5095 return(false);
5096 }
5097
5098 /** NOTE! Use the corresponding macro os_file_close(), not directly
5099 this function!
5100 Closes a file handle. In case of error, error number can be retrieved with
5101 os_file_get_last_error.
5102 @param[in,own] file Handle to a file
5103 @return true if success */
5104 bool
os_file_close_func(os_file_t file)5105 os_file_close_func(
5106 os_file_t file)
5107 {
5108 ut_a(file > 0);
5109
5110 if (CloseHandle(file)) {
5111 return(true);
5112 }
5113
5114 os_file_handle_error(NULL, "close");
5115
5116 return(false);
5117 }
5118
5119 /** Gets a file size.
5120 @param[in] file Handle to a file
5121 @return file size, or (os_offset_t) -1 on failure */
5122 os_offset_t
os_file_get_size(pfs_os_file_t file)5123 os_file_get_size(
5124 pfs_os_file_t file)
5125 {
5126 DWORD high;
5127 DWORD low;
5128
5129 low = GetFileSize(file.m_file, &high);
5130
5131 if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
5132 return((os_offset_t) -1);
5133 }
5134
5135 return(os_offset_t(low | (os_offset_t(high) << 32)));
5136 }
5137
5138 /** Gets a file size.
5139 @param[in] filename Full path to the filename to check
5140 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
5141 errno */
5142 os_file_size_t
os_file_get_size(const char * filename)5143 os_file_get_size(
5144 const char* filename)
5145 {
5146 struct __stat64 s;
5147 os_file_size_t file_size;
5148
5149 int ret = _stat64(filename, &s);
5150
5151 if (ret == 0) {
5152
5153 file_size.m_total_size = s.st_size;
5154
5155 DWORD low_size;
5156 DWORD high_size;
5157
5158 low_size = GetCompressedFileSize(filename, &high_size);
5159
5160 if (low_size != INVALID_FILE_SIZE) {
5161
5162 file_size.m_alloc_size = high_size;
5163 file_size.m_alloc_size <<= 32;
5164 file_size.m_alloc_size |= low_size;
5165
5166 } else {
5167 ib::error()
5168 << "GetCompressedFileSize("
5169 << filename << ", ..) failed.";
5170
5171 file_size.m_alloc_size = (os_offset_t) -1;
5172 }
5173 } else {
5174 file_size.m_total_size = ~0;
5175 file_size.m_alloc_size = (os_offset_t) ret;
5176 }
5177
5178 return(file_size);
5179 }
5180
5181 /** This function returns information about the specified file
5182 @param[in] path pathname of the file
5183 @param[out] stat_info information of a file in a directory
5184 @param[in,out] statinfo information of a file in a directory
5185 @param[in] check_rw_perm for testing whether the file can be opened
5186 in RW mode
5187 @param[in] read_only true if the file is opened in read-only mode
5188 @return DB_SUCCESS if all OK */
5189 static
5190 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)5191 os_file_get_status_win32(
5192 const char* path,
5193 os_file_stat_t* stat_info,
5194 struct _stat64* statinfo,
5195 bool check_rw_perm,
5196 bool read_only)
5197 {
5198 int ret = _stat64(path, statinfo);
5199
5200 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
5201 /* file does not exist */
5202
5203 return(DB_NOT_FOUND);
5204
5205 } else if (ret) {
5206 /* file exists, but stat call failed */
5207
5208 os_file_handle_error_no_exit(path, "stat", false);
5209
5210 return(DB_FAIL);
5211
5212 } else if (_S_IFDIR & statinfo->st_mode) {
5213
5214 stat_info->type = OS_FILE_TYPE_DIR;
5215
5216 } else if (_S_IFREG & statinfo->st_mode) {
5217
5218 DWORD access = GENERIC_READ;
5219
5220 if (!read_only) {
5221 access |= GENERIC_WRITE;
5222 }
5223
5224 stat_info->type = OS_FILE_TYPE_FILE;
5225
5226 /* Check if we can open it in read-only mode. */
5227
5228 if (check_rw_perm) {
5229 HANDLE fh;
5230
5231 fh = CreateFile(
5232 (LPCTSTR) path, // File to open
5233 access,
5234 0, // No sharing
5235 NULL, // Default security
5236 OPEN_EXISTING, // Existing file only
5237 FILE_ATTRIBUTE_NORMAL, // Normal file
5238 NULL); // No attr. template
5239
5240 if (fh == INVALID_HANDLE_VALUE) {
5241 stat_info->rw_perm = false;
5242 } else {
5243 stat_info->rw_perm = true;
5244 CloseHandle(fh);
5245 }
5246 }
5247
5248 char volname[MAX_PATH];
5249 BOOL result = GetVolumePathName(path, volname, MAX_PATH);
5250
5251 if (!result) {
5252
5253 ib::error()
5254 << "os_file_get_status_win32: "
5255 << "Failed to get the volume path name for: "
5256 << path
5257 << "- OS error number " << GetLastError();
5258
5259 return(DB_FAIL);
5260 }
5261
5262 DWORD sectorsPerCluster;
5263 DWORD bytesPerSector;
5264 DWORD numberOfFreeClusters;
5265 DWORD totalNumberOfClusters;
5266
5267 result = GetDiskFreeSpace(
5268 (LPCSTR) volname,
5269 §orsPerCluster,
5270 &bytesPerSector,
5271 &numberOfFreeClusters,
5272 &totalNumberOfClusters);
5273
5274 if (!result) {
5275
5276 ib::error()
5277 << "GetDiskFreeSpace(" << volname << ",...) "
5278 << "failed "
5279 << "- OS error number " << GetLastError();
5280
5281 return(DB_FAIL);
5282 }
5283
5284 stat_info->block_size = bytesPerSector * sectorsPerCluster;
5285
5286 /* On Windows the block size is not used as the allocation
5287 unit for sparse files. The underlying infra-structure for
5288 sparse files is based on NTFS compression. The punch hole
5289 is done on a "compression unit". This compression unit
5290 is based on the cluster size. You cannot punch a hole if
5291 the cluster size >= 8K. For smaller sizes the table is
5292 as follows:
5293
5294 Cluster Size Compression Unit
5295 512 Bytes 8 KB
5296 1 KB 16 KB
5297 2 KB 32 KB
5298 4 KB 64 KB
5299
5300 Default NTFS cluster size is 4K, compression unit size of 64K.
5301 Therefore unless the user has created the file system with
5302 a smaller cluster size and used larger page sizes there is
5303 little benefit from compression out of the box. */
5304
5305 stat_info->block_size = (stat_info->block_size <= 4096)
5306 ? stat_info->block_size * 16 : ULINT_UNDEFINED;
5307 } else {
5308 stat_info->type = OS_FILE_TYPE_UNKNOWN;
5309 }
5310
5311 return(DB_SUCCESS);
5312 }
5313
5314 /** Truncates a file to a specified size in bytes.
5315 Do nothing if the size to preserve is greater or equal to the current
5316 size of the file.
5317 @param[in] pathname file path
5318 @param[in] file file to be truncated
5319 @param[in] size size to preserve in bytes
5320 @return true if success */
5321 static
5322 bool
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)5323 os_file_truncate_win32(
5324 const char* pathname,
5325 pfs_os_file_t file,
5326 os_offset_t size)
5327 {
5328 LARGE_INTEGER length;
5329
5330 length.QuadPart = size;
5331 BOOL success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
5332 if (!success) {
5333 os_file_handle_error_no_exit(
5334 pathname, "SetFilePointerEx", false);
5335 } else {
5336 success = SetEndOfFile(file.m_file);
5337 if (!success) {
5338 os_file_handle_error_no_exit(
5339 pathname, "SetEndOfFile", false);
5340 }
5341 }
5342 return(success);
5343 }
5344
5345 /** Truncates a file at its current position.
5346 @param[in] file Handle to be truncated
5347 @return true if success */
5348 bool
os_file_set_eof(FILE * file)5349 os_file_set_eof(
5350 FILE* file)
5351 {
5352 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
5353
5354 return(SetEndOfFile(h));
5355 }
5356
5357 #ifdef UNIV_HOTBACKUP
5358 /** Closes a file handle.
5359 @param[in] file Handle to close
5360 @return true if success */
5361 bool
os_file_close_no_error_handling(os_file_t file)5362 os_file_close_no_error_handling(
5363 os_file_t file)
5364 {
5365 return(CloseHandle(file) ? true : false);
5366 }
5367 #endif /* UNIV_HOTBACKUP */
5368
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
This free function only forwards to
AIO::simulated_put_read_threads_to_sleep(). */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	AIO::simulated_put_read_threads_to_sleep();
}
5378
5379 /** This function can be called if one wants to post a batch of reads and
5380 prefers an i/o-handler thread to handle them all at once later. You must
5381 call os_aio_simulated_wake_handler_threads later to ensure the threads
5382 are not left sleeping! */
5383 void
simulated_put_read_threads_to_sleep()5384 AIO::simulated_put_read_threads_to_sleep()
5385 {
5386 /* The idea of putting background IO threads to sleep is only for
5387 Windows when using simulated AIO. Windows XP seems to schedule
5388 background threads too eagerly to allow for coalescing during
5389 readahead requests. */
5390
5391 if (srv_use_native_aio) {
5392 /* We do not use simulated AIO: do nothing */
5393
5394 return;
5395 }
5396
5397 os_aio_recommend_sleep_for_read_threads = true;
5398
5399 for (ulint i = 0; i < os_aio_n_segments; i++) {
5400 AIO* array;
5401
5402 get_array_and_local_segment(&array, i);
5403
5404 if (array == s_reads) {
5405
5406 os_event_reset(os_aio_segment_wait_events[i]);
5407 }
5408 }
5409 }
5410
5411 #endif /* !_WIN32*/
5412
/** Does a synchronous read or write depending upon the type specified.
In case of partial reads/writes the function tries
NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
@param[in]	in_type		IO flags; the function works on a local copy
@param[in]	file		handle to an open file
@param[in,out]	buf		buffer to read into or to write from
@param[in]	n		number of bytes to read/write, starting from
				offset
@param[in]	offset		file offset from the start where to read/write
@param[out]	err		DB_SUCCESS or error code
@return the originally requested n if the whole request completed;
otherwise the number of bytes transferred so far, with *err set to
DB_IO_ERROR (note that a hard error does not yield -1 here) */
static MY_ATTRIBUTE((warn_unused_result))
ssize_t
os_file_io(
	const IORequest&in_type,
	os_file_t	file,
	void*		buf,
	ulint		n,
	os_offset_t	offset,
	dberr_t*	err)
{
	Block*		block;
	/* n may be changed by compression/encryption below; the caller is
	always told about the size it asked for. */
	ulint		original_n = n;
	IORequest	type = in_type;
	ssize_t		bytes_returned = 0;

	if (type.is_compressed()) {

		/* We don't compress the first page of any file. */
		ut_ad(offset > 0);

		block = os_file_compress_page(type, buf, &n);
	} else {
		block = NULL;
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
	if (type.is_encrypted() && type.is_write()) {
		/* We don't encrypt the first page of any file. */
		Block*	compressed_block = block;
		ut_ad(offset > 0);

		block = os_file_encrypt_page(type, buf, &n);

		/* The block from the compression step is released here;
		from now on only the encryption block is tracked. */
		if (compressed_block != NULL) {
			os_free_block(compressed_block);
		}
	}

	SyncFileIO	sync_file_io(file, buf, n, offset);

	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {

		ssize_t	n_bytes = sync_file_io.execute(type);

		/* Check for a hard error. Not much we can do now. */
		if (n_bytes < 0) {

			break;

		} else if ((ulint) n_bytes + bytes_returned == n) {

			/* The request is complete. */
			bytes_returned += n_bytes;

			if (offset > 0
			    && (type.is_compressed() || type.is_read())) {

				*err = os_file_io_complete(
					type, file,
					reinterpret_cast<byte*>(buf),
					NULL, original_n, offset, n);
			} else {

				*err = DB_SUCCESS;
			}

			if (block != NULL) {
				os_free_block(block);
			}

			return(original_n);
		}

		/* Handle partial read/write. */

		ut_ad((ulint) n_bytes + bytes_returned < n);

		bytes_returned += (ulint) n_bytes;

		if (!type.is_partial_io_warning_disabled()) {

			const char*	op = type.is_read()
				? "read" : "written";

			ib::warn()
				<< n
				<< " bytes should have been " << op << ". Only "
				<< bytes_returned
				<< " bytes " << op << ". Retrying"
				<< " for the remaining bytes.";
		}

		/* Advance the offset and buffer by n_bytes */
		sync_file_io.advance(n_bytes);
	}

	/* Reached after a hard error or when the retries are used up. */
	if (block != NULL) {
		os_free_block(block);
	}

	*err = DB_IO_ERROR;

	if (!type.is_partial_io_warning_disabled()) {
		ib::warn()
			<< "Retry attempts for "
			<< (type.is_read() ? "reading" : "writing")
			<< " partial data failed.";
	}

	return(bytes_returned);
}
5535
5536 /** Does a synchronous write operation in Posix.
5537 @param[in] type IO context
5538 @param[in] file handle to an open file
5539 @param[out] buf buffer from which to write
5540 @param[in] n number of bytes to read, starting from offset
5541 @param[in] offset file offset from the start where to read
5542 @param[out] err DB_SUCCESS or error code
5543 @return number of bytes written, -1 if error */
5544 static MY_ATTRIBUTE((warn_unused_result))
5545 ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5546 os_file_pwrite(
5547 IORequest& type,
5548 os_file_t file,
5549 const byte* buf,
5550 ulint n,
5551 os_offset_t offset,
5552 dberr_t* err)
5553 {
5554 ut_ad(type.validate());
5555
5556 ++os_n_file_writes;
5557
5558 (void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
5559 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
5560
5561 ssize_t n_bytes = os_file_io(type, file, (void*) buf, n, offset, err);
5562
5563 (void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
5564 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
5565
5566 return(n_bytes);
5567 }
5568
5569 /** Requests a synchronous write operation.
5570 @param[in] type IO flags
5571 @param[in] file handle to an open file
5572 @param[out] buf buffer from which to write
5573 @param[in] offset file offset from the start where to read
5574 @param[in] n number of bytes to read, starting from offset
5575 @return DB_SUCCESS if request was successful, false if fail */
5576 static MY_ATTRIBUTE((warn_unused_result))
5577 dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)5578 os_file_write_page(
5579 IORequest& type,
5580 const char* name,
5581 os_file_t file,
5582 const byte* buf,
5583 os_offset_t offset,
5584 ulint n)
5585 {
5586 dberr_t err;
5587 ut_ad(type.validate());
5588 ut_ad(n > 0);
5589
5590 ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
5591
5592 if ((ulint) n_bytes != n && !os_has_said_disk_full) {
5593
5594 ib::error()
5595 << "Write to file " << name << "failed at offset "
5596 << offset << ", " << n
5597 << " bytes should have been written,"
5598 " only " << n_bytes << " were written."
5599 " Operating system error number " << errno << "."
5600 " Check that your OS and file system"
5601 " support files of this size."
5602 " Check also that the disk is not full"
5603 " or a disk quota exceeded.";
5604
5605 if (strerror(errno) != NULL) {
5606
5607 ib::error()
5608 << "Error number " << errno
5609 << " means '" << strerror(errno) << "'";
5610 }
5611
5612 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
5613
5614 os_has_said_disk_full = true;
5615 }
5616
5617 return(err);
5618 }
5619
5620 /** Does a synchronous read operation in Posix.
5621 @param[in] type IO flags
5622 @param[in] file handle to an open file
5623 @param[out] buf buffer where to read
5624 @param[in] offset file offset from the start where to read
5625 @param[in] n number of bytes to read, starting from offset
5626 @param[out] err DB_SUCCESS or error code
5627 @return number of bytes read, -1 if error */
5628 static MY_ATTRIBUTE((warn_unused_result))
5629 ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5630 os_file_pread(
5631 IORequest& type,
5632 os_file_t file,
5633 void* buf,
5634 ulint n,
5635 os_offset_t offset,
5636 dberr_t* err)
5637 {
5638 ++os_n_file_reads;
5639
5640 (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
5641 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
5642
5643 ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
5644
5645 (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
5646 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
5647
5648 return(n_bytes);
5649 }
5650
5651 /** Requests a synchronous positioned read operation.
5652 @return DB_SUCCESS if request was successful, false if fail
5653 @param[in] type IO flags
5654 @param[in] file handle to an open file
5655 @param[out] buf buffer where to read
5656 @param[in] offset file offset from the start where to read
5657 @param[in] n number of bytes to read, starting from offset
5658 @param[out] o number of bytes actually read
5659 @param[in] exit_on_err if true then exit on error
5660 @return DB_SUCCESS or error code */
5661 static MY_ATTRIBUTE((warn_unused_result))
5662 dberr_t
os_file_read_page(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err)5663 os_file_read_page(
5664 IORequest& type,
5665 os_file_t file,
5666 void* buf,
5667 os_offset_t offset,
5668 ulint n,
5669 ulint* o,
5670 bool exit_on_err)
5671 {
5672 dberr_t err;
5673
5674 os_bytes_read_since_printout += n;
5675
5676 ut_ad(type.validate());
5677 ut_ad(n > 0);
5678
5679 for (;;) {
5680 ssize_t n_bytes;
5681
5682 n_bytes = os_file_pread(type, file, buf, n, offset, &err);
5683
5684 if (o != NULL) {
5685 *o = n_bytes;
5686 }
5687
5688 if (err != DB_SUCCESS && !exit_on_err) {
5689
5690 return(err);
5691
5692 } else if ((ulint) n_bytes == n) {
5693
5694 /** The read will succeed but decompress can fail
5695 for various reasons. */
5696
5697 if (type.is_compression_enabled()
5698 && !Compression::is_compressed_page(
5699 static_cast<byte*>(buf))) {
5700
5701 return(DB_SUCCESS);
5702
5703 } else {
5704 return(err);
5705 }
5706 }
5707
5708 ib::error() << "Tried to read " << n
5709 << " bytes at offset " << offset
5710 << ", but was only able to read " << n_bytes;
5711
5712 if (exit_on_err) {
5713
5714 if (!os_file_handle_error(NULL, "read")) {
5715 /* Hard error */
5716 break;
5717 }
5718
5719 } else if (!os_file_handle_error_no_exit(NULL, "read", false)) {
5720
5721 /* Hard error */
5722 break;
5723 }
5724
5725 if (n_bytes > 0 && (ulint) n_bytes < n) {
5726 n -= (ulint) n_bytes;
5727 offset += (ulint) n_bytes;
5728 buf = reinterpret_cast<uchar*>(buf) + (ulint) n_bytes;
5729 }
5730 }
5731
5732 ib::fatal()
5733 << "Cannot read from file. OS error number "
5734 << errno << ".";
5735
5736 return(err);
5737 }
5738
5739 /** Retrieves the last error number if an error occurs in a file io function.
5740 The number should be retrieved before any other OS calls (because they may
5741 overwrite the error number). If the number is not known to this program,
5742 the OS error number + 100 is returned.
5743 @param[in] report_all_errors true if we want an error printed
5744 for all errors
5745 @return error number, or OS error number + 100 */
5746 ulint
os_file_get_last_error(bool report_all_errors)5747 os_file_get_last_error(
5748 bool report_all_errors)
5749 {
5750 return(os_file_get_last_error_low(report_all_errors, false));
5751 }
5752
5753 /** Does error handling when a file operation fails.
5754 Conditionally exits (calling srv_fatal_error()) based on should_exit value
5755 and the error type, if should_exit is true then on_error_silent is ignored.
5756 @param[in] name name of a file or NULL
5757 @param[in] operation operation
5758 @param[in] should_exit call srv_fatal_error() on an unknown error,
5759 if this parameter is true
5760 @param[in] on_error_silent if true then don't print any message to the log
5761 iff it is an unknown non-fatal error
5762 @return true if we should retry the operation */
5763 static MY_ATTRIBUTE((warn_unused_result))
5764 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)5765 os_file_handle_error_cond_exit(
5766 const char* name,
5767 const char* operation,
5768 bool should_exit,
5769 bool on_error_silent)
5770 {
5771 ulint err;
5772
5773 err = os_file_get_last_error_low(false, on_error_silent);
5774
5775 switch (err) {
5776 case OS_FILE_DISK_FULL:
5777 /* We only print a warning about disk full once */
5778
5779 if (os_has_said_disk_full) {
5780
5781 return(false);
5782 }
5783
5784 /* Disk full error is reported irrespective of the
5785 on_error_silent setting. */
5786
5787 if (name) {
5788
5789 ib::error()
5790 << "Encountered a problem with file '"
5791 << name << "'";
5792 }
5793
5794 ib::error()
5795 << "Disk is full. Try to clean the disk to free space.";
5796
5797 os_has_said_disk_full = true;
5798
5799 return(false);
5800
5801 case OS_FILE_AIO_RESOURCES_RESERVED:
5802 case OS_FILE_AIO_INTERRUPTED:
5803
5804 return(true);
5805
5806 case OS_FILE_PATH_ERROR:
5807 case OS_FILE_ALREADY_EXISTS:
5808 case OS_FILE_ACCESS_VIOLATION:
5809
5810 return(false);
5811
5812 case OS_FILE_SHARING_VIOLATION:
5813
5814 os_thread_sleep(10000000); /* 10 sec */
5815 return(true);
5816
5817 case OS_FILE_OPERATION_ABORTED:
5818 case OS_FILE_INSUFFICIENT_RESOURCE:
5819
5820 os_thread_sleep(100000); /* 100 ms */
5821 return(true);
5822
5823 default:
5824
5825 /* If it is an operation that can crash on error then it
5826 is better to ignore on_error_silent and print an error message
5827 to the log. */
5828
5829 if (should_exit || !on_error_silent) {
5830 ib::error() << "File "
5831 << (name != NULL ? name : "(unknown)")
5832 << ": '" << operation << "'"
5833 " returned OS error " << err << "."
5834 << (should_exit
5835 ? " Cannot continue operation" : "");
5836 }
5837
5838 if (should_exit) {
5839 srv_fatal_error();
5840 }
5841 }
5842
5843 return(false);
5844 }
5845
5846 /** Does error handling when a file operation fails.
5847 @param[in] name name of a file or NULL
5848 @param[in] operation operation name that failed
5849 @return true if we should retry the operation */
5850 static
5851 bool
os_file_handle_error(const char * name,const char * operation)5852 os_file_handle_error(
5853 const char* name,
5854 const char* operation)
5855 {
5856 /* Exit in case of unknown error */
5857 return(os_file_handle_error_cond_exit(name, operation, true, false));
5858 }
5859
5860 /** Does error handling when a file operation fails.
5861 @param[in] name name of a file or NULL
5862 @param[in] operation operation name that failed
5863 @param[in] on_error_silent if true then don't print any message to the log.
5864 @return true if we should retry the operation */
5865 static
5866 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)5867 os_file_handle_error_no_exit(
5868 const char* name,
5869 const char* operation,
5870 bool on_error_silent)
5871 {
5872 /* Don't exit in case of unknown error */
5873 return(os_file_handle_error_cond_exit(
5874 name, operation, false, on_error_silent));
5875 }
5876
5877 /** Tries to disable OS caching on an opened file descriptor.
5878 @param[in] fd file descriptor to alter
5879 @param[in] file_name file name, used in the diagnostic message
5880 @param[in] name "open" or "create"; used in the diagnostic
5881 message */
5882 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5883 os_file_set_nocache(
5884 int fd MY_ATTRIBUTE((unused)),
5885 const char* file_name MY_ATTRIBUTE((unused)),
5886 const char* operation_name MY_ATTRIBUTE((unused)))
5887 {
5888 /* some versions of Solaris may not have DIRECTIO_ON */
5889 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5890 if (directio(fd, DIRECTIO_ON) == -1) {
5891 int errno_save = errno;
5892
5893 ib::error()
5894 << "Failed to set DIRECTIO_ON on file "
5895 << file_name << ": " << operation_name
5896 << strerror(errno_save) << ","
5897 " continuing anyway.";
5898 }
5899 #elif defined(O_DIRECT)
5900 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5901 int errno_save = errno;
5902 static bool warning_message_printed = false;
5903 if (errno_save == EINVAL) {
5904 if (!warning_message_printed) {
5905 warning_message_printed = true;
5906 # ifdef UNIV_LINUX
5907 ib::warn()
5908 << "Failed to set O_DIRECT on file"
5909 << file_name << ";" << operation_name
5910 << ": " << strerror(errno_save) << ", "
5911 << "continuing anyway. O_DIRECT is "
5912 "known to result in 'Invalid argument' "
5913 "on Linux on tmpfs, "
5914 "see MySQL Bug#26662.";
5915 # else /* UNIV_LINUX */
5916 goto short_warning;
5917 # endif /* UNIV_LINUX */
5918 }
5919 } else {
5920 # ifndef UNIV_LINUX
5921 short_warning:
5922 # endif
5923 ib::warn()
5924 << "Failed to set O_DIRECT on file "
5925 << file_name << "; " << operation_name
5926 << " : " << strerror(errno_save)
5927 << " continuing anyway.";
5928 }
5929 }
5930 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5931 }
5932
5933 /** Write the specified number of zeros to a newly created file.
5934 @param[in] name name of the file or path as a null-terminated
5935 string
5936 @param[in] file handle to a file
5937 @param[in] size file size
5938 @param[in] read_only Enable read-only checks if true
5939 @return true if success */
5940 bool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size,bool read_only)5941 os_file_set_size(
5942 const char* name,
5943 pfs_os_file_t file,
5944 os_offset_t size,
5945 bool read_only)
5946 {
5947 /* Write up to 1 megabyte at a time. */
5948 ulint buf_size = ut_min(
5949 static_cast<ulint>(64),
5950 static_cast<ulint>(size / UNIV_PAGE_SIZE));
5951
5952 buf_size *= UNIV_PAGE_SIZE;
5953
5954 /* Align the buffer for possible raw i/o */
5955 byte* buf2;
5956
5957 buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
5958
5959 byte* buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
5960
5961 /* Write buffer full of zeros */
5962 memset(buf, 0, buf_size);
5963
5964 if (size >= (os_offset_t) 100 << 20) {
5965
5966 ib::info() << "Progress in MB:";
5967 }
5968
5969 os_offset_t current_size = 0;
5970
5971 while (current_size < size) {
5972 ulint n_bytes;
5973
5974 if (size - current_size < (os_offset_t) buf_size) {
5975 n_bytes = (ulint) (size - current_size);
5976 } else {
5977 n_bytes = buf_size;
5978 }
5979
5980 dberr_t err;
5981 IORequest request(IORequest::WRITE);
5982
5983 #ifdef UNIV_HOTBACKUP
5984
5985 err = os_file_write(
5986 request, name, file, buf, current_size, n_bytes);
5987 #else
5988 /* Using OS_AIO_SYNC mode on POSIX systems will result in
5989 fall back to os_file_write/read. On Windows it will use
5990 special mechanism to wait before it returns back. */
5991
5992 err = os_aio(
5993 request,
5994 OS_AIO_SYNC, name,
5995 file, buf, current_size, n_bytes,
5996 read_only, NULL, NULL);
5997 #endif /* UNIV_HOTBACKUP */
5998
5999 if (err != DB_SUCCESS) {
6000
6001 ut_free(buf2);
6002 return(false);
6003 }
6004
6005 /* Print about progress for each 100 MB written */
6006 if ((current_size + n_bytes) / (100 << 20)
6007 != current_size / (100 << 20)) {
6008
6009 fprintf(stderr, " %lu00",
6010 (ulong) ((current_size + n_bytes)
6011 / (100 << 20)));
6012 }
6013
6014 current_size += n_bytes;
6015 }
6016
6017 if (size >= (os_offset_t) 100 << 20) {
6018
6019 fprintf(stderr, "\n");
6020 }
6021
6022 ut_free(buf2);
6023
6024 return(os_file_flush(file));
6025 }
6026
6027 /** Truncates a file to a specified size in bytes.
6028 Do nothing if the size to preserve is greater or equal to the current
6029 size of the file.
6030 @param[in] pathname file path
6031 @param[in] file file to be truncated
6032 @param[in] size size to preserve in bytes
6033 @return true if success */
6034 bool
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)6035 os_file_truncate(
6036 const char* pathname,
6037 pfs_os_file_t file,
6038 os_offset_t size)
6039 {
6040 /* Do nothing if the size preserved is larger than or equal to the
6041 current size of file */
6042 os_offset_t size_bytes = os_file_get_size(file);
6043
6044 if (size >= size_bytes) {
6045 return(true);
6046 }
6047
6048 #ifdef _WIN32
6049 return(os_file_truncate_win32(pathname, file, size));
6050 #else /* _WIN32 */
6051 return(os_file_truncate_posix(pathname, file, size));
6052 #endif /* _WIN32 */
6053 }
6054
6055 /** NOTE! Use the corresponding macro os_file_read(), not directly this
6056 function!
6057 Requests a synchronous positioned read operation.
6058 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6059 @param[in] type IO flags
6060 @param[in] file handle to an open file
6061 @param[out] buf buffer where to read
6062 @param[in] offset file offset from the start where to read
6063 @param[in] n number of bytes to read, starting from offset
6064 @return DB_SUCCESS or error code */
6065 dberr_t
os_file_read_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)6066 os_file_read_func(
6067 IORequest& type,
6068 os_file_t file,
6069 void* buf,
6070 os_offset_t offset,
6071 ulint n)
6072 {
6073 ut_ad(type.is_read());
6074
6075 return(os_file_read_page(type, file, buf, offset, n, NULL, true));
6076 }
6077
6078 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
6079 not directly this function!
6080 Requests a synchronous positioned read operation.
6081 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6082 @param[in] type IO flags
6083 @param[in] file handle to an open file
6084 @param[out] buf buffer where to read
6085 @param[in] offset file offset from the start where to read
6086 @param[in] n number of bytes to read, starting from offset
6087 @param[out] o number of bytes actually read
6088 @return DB_SUCCESS or error code */
6089 dberr_t
os_file_read_no_error_handling_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)6090 os_file_read_no_error_handling_func(
6091 IORequest& type,
6092 os_file_t file,
6093 void* buf,
6094 os_offset_t offset,
6095 ulint n,
6096 ulint* o)
6097 {
6098 ut_ad(type.is_read());
6099
6100 return(os_file_read_page(type, file, buf, offset, n, o, false));
6101 }
6102
6103 /** NOTE! Use the corresponding macro os_file_write(), not directly
6104 Requests a synchronous write operation.
6105 @param[in] type IO flags
6106 @param[in] file handle to an open file
6107 @param[out] buf buffer from which to write
6108 @param[in] offset file offset from the start where to read
6109 @param[in] n number of bytes to read, starting from offset
6110 @return DB_SUCCESS if request was successful, false if fail */
6111 dberr_t
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)6112 os_file_write_func(
6113 IORequest& type,
6114 const char* name,
6115 os_file_t file,
6116 const void* buf,
6117 os_offset_t offset,
6118 ulint n)
6119 {
6120 ut_ad(type.validate());
6121 ut_ad(type.is_write());
6122
6123 /* We never compress the first page.
6124 Note: This assumes we always do block IO. */
6125 if (offset == 0) {
6126 type.clear_compressed();
6127 }
6128
6129 const byte* ptr = reinterpret_cast<const byte*>(buf);
6130
6131 return(os_file_write_page(type, name, file, ptr, offset, n));
6132 }
6133
6134 /** Check the existence and type of the given file.
6135 @param[in] path path name of file
6136 @param[out] exists true if the file exists
6137 @param[out] type Type of the file, if it exists
6138 @return true if call succeeded */
6139 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)6140 os_file_status(
6141 const char* path,
6142 bool* exists,
6143 os_file_type_t* type)
6144 {
6145 #ifdef _WIN32
6146 return(os_file_status_win32(path, exists, type));
6147 #else
6148 return(os_file_status_posix(path, exists, type));
6149 #endif /* _WIN32 */
6150 }
6151
6152 /** Free storage space associated with a section of the file.
6153 @param[in] fh Open file handle
6154 @param[in] off Starting offset (SEEK_SET)
6155 @param[in] len Size of the hole
6156 @return DB_SUCCESS or error code */
6157 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)6158 os_file_punch_hole(
6159 os_file_t fh,
6160 os_offset_t off,
6161 os_offset_t len)
6162 {
6163 /* In this debugging mode, we act as if punch hole is supported,
6164 and then skip any calls to actually punch a hole here.
6165 In this way, Transparent Page Compression is still being tested. */
6166 DBUG_EXECUTE_IF("ignore_punch_hole",
6167 return(DB_SUCCESS);
6168 );
6169
6170 #ifdef _WIN32
6171 return(os_file_punch_hole_win32(fh, off, len));
6172 #else
6173 return(os_file_punch_hole_posix(fh, off, len));
6174 #endif /* _WIN32 */
6175 }
6176
6177 /** Check if the file system supports sparse files.
6178
6179 Warning: On POSIX systems we try and punch a hole from offset 0 to
6180 the system configured page size. This should only be called on an empty
6181 file.
6182
6183 Note: On Windows we use the name and on Unices we use the file handle.
6184
6185 @param[in] name File name
6186 @param[in] fh File handle for the file - if opened
6187 @return true if the file system supports sparse files */
6188 bool
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)6189 os_is_sparse_file_supported(const char* path, pfs_os_file_t fh)
6190 {
6191 /* In this debugging mode, we act as if punch hole is supported,
6192 then we skip any calls to actually punch a hole. In this way,
6193 Transparent Page Compression is still being tested. */
6194 DBUG_EXECUTE_IF("ignore_punch_hole",
6195 return(true);
6196 );
6197
6198 #ifdef _WIN32
6199 return(os_is_sparse_file_supported_win32(path));
6200 #else
6201 dberr_t err;
6202
6203 /* We don't know the FS block size, use the sector size. The FS
6204 will do the magic. */
6205 err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
6206
6207 return(err == DB_SUCCESS);
6208 #endif /* _WIN32 */
6209 }
6210
6211 /** This function returns information about the specified file
6212 @param[in] path pathname of the file
6213 @param[out] stat_info information of a file in a directory
6214 @param[in] check_rw_perm for testing whether the file can be opened
6215 in RW mode
6216 @param[in] read_only true if file is opened in read-only mode
6217 @return DB_SUCCESS if all OK */
6218 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)6219 os_file_get_status(
6220 const char* path,
6221 os_file_stat_t* stat_info,
6222 bool check_rw_perm,
6223 bool read_only)
6224 {
6225 dberr_t ret;
6226
6227 #ifdef _WIN32
6228 struct _stat64 info;
6229
6230 ret = os_file_get_status_win32(
6231 path, stat_info, &info, check_rw_perm, read_only);
6232
6233 #else
6234 struct stat info;
6235
6236 ret = os_file_get_status_posix(
6237 path, stat_info, &info, check_rw_perm, read_only);
6238
6239 #endif /* _WIN32 */
6240
6241 if (ret == DB_SUCCESS) {
6242 stat_info->ctime = info.st_ctime;
6243 stat_info->atime = info.st_atime;
6244 stat_info->mtime = info.st_mtime;
6245 stat_info->size = info.st_size;
6246 }
6247
6248 return(ret);
6249 }
6250
6251 /**
6252 Waits for an AIO operation to complete. This function is used to wait the
6253 for completed requests. The aio array of pending requests is divided
6254 into segments. The thread specifies which segment or slot it wants to wait
6255 for. NOTE: this function will also take care of freeing the aio slot,
6256 therefore no other thread is allowed to do the freeing!
6257 @param[in] segment The number of the segment in the aio arrays to
6258 wait for; segment 0 is the ibuf I/O thread,
6259 segment 1 the log I/O thread, then follow the
6260 non-ibuf read threads, and as the last are the
6261 non-ibuf write threads; if this is
6262 ULINT_UNDEFINED, then it means that sync AIO
6263 is used, and this parameter is ignored
6264 @param[out] m1 the messages passed with the AIO request; note
6265 that also in the case where the AIO operation
6266 failed, these output parameters are valid and
6267 can be used to restart the operation,
6268 for example
6269 @param[out] m2 callback message
6270 @param[out] type OS_FILE_WRITE or ..._READ
6271 @return DB_SUCCESS or error code */
6272 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6273 os_aio_handler(
6274 ulint segment,
6275 fil_node_t** m1,
6276 void** m2,
6277 IORequest* request)
6278 {
6279 dberr_t err;
6280
6281 if (srv_use_native_aio) {
6282 srv_set_io_thread_op_info(segment, "native aio handle");
6283
6284 #ifdef WIN_ASYNC_IO
6285
6286 err = os_aio_windows_handler(segment, 0, m1, m2, request);
6287
6288 #elif defined(LINUX_NATIVE_AIO)
6289
6290 err = os_aio_linux_handler(segment, m1, m2, request);
6291
6292 #else
6293 ut_error;
6294
6295 err = DB_ERROR; /* Eliminate compiler warning */
6296
6297 #endif /* WIN_ASYNC_IO */
6298
6299 } else {
6300 srv_set_io_thread_op_info(segment, "simulated aio handle");
6301
6302 err = os_aio_simulated_handler(segment, m1, m2, request);
6303 }
6304
6305 return(err);
6306 }
6307
6308 /** Constructor
6309 @param[in] id The latch ID
6310 @param[in] n Number of AIO slots
6311 @param[in] segments Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)6312 AIO::AIO(
6313 latch_id_t id,
6314 ulint n,
6315 ulint segments)
6316 :
6317 m_slots(n),
6318 m_n_segments(segments),
6319 m_n_reserved()
6320 # ifdef LINUX_NATIVE_AIO
6321 ,m_aio_ctx(),
6322 m_events(m_slots.size())
6323 # elif defined(_WIN32)
6324 ,m_handles()
6325 # endif /* LINUX_NATIVE_AIO */
6326 {
6327 ut_a(n > 0);
6328 ut_a(m_n_segments > 0);
6329
6330 mutex_create(id, &m_mutex);
6331
6332 m_not_full = os_event_create("aio_not_full");
6333 m_is_empty = os_event_create("aio_is_empty");
6334
6335 std::uninitialized_fill(m_slots.begin(), m_slots.end(), Slot());
6336 #ifdef LINUX_NATIVE_AIO
6337 memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
6338 #endif /* LINUX_NATIVE_AIO */
6339
6340 os_event_set(m_is_empty);
6341 }
6342
6343 /** Initialise the slots */
6344 dberr_t
init_slots()6345 AIO::init_slots()
6346 {
6347 for (ulint i = 0; i < m_slots.size(); ++i) {
6348 Slot& slot = m_slots[i];
6349
6350 slot.pos = static_cast<uint16_t>(i);
6351
6352 slot.is_reserved = false;
6353
6354 #ifdef WIN_ASYNC_IO
6355
6356 slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6357
6358 OVERLAPPED* over = &slot.control;
6359
6360 over->hEvent = slot.handle;
6361
6362 (*m_handles)[i] = over->hEvent;
6363
6364 #elif defined(LINUX_NATIVE_AIO)
6365
6366 slot.ret = 0;
6367
6368 slot.n_bytes = 0;
6369
6370 memset(&slot.control, 0x0, sizeof(slot.control));
6371
6372 #endif /* WIN_ASYNC_IO */
6373 }
6374
6375 return(DB_SUCCESS);
6376 }
6377
6378 #ifdef LINUX_NATIVE_AIO
6379 /** Initialise the Linux Native AIO interface */
6380 dberr_t
init_linux_native_aio()6381 AIO::init_linux_native_aio()
6382 {
6383 /* Initialize the io_context array. One io_context
6384 per segment in the array. */
6385
6386 ut_a(m_aio_ctx == NULL);
6387
6388 m_aio_ctx = static_cast<io_context**>(
6389 ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));
6390
6391 if (m_aio_ctx == NULL) {
6392 return(DB_OUT_OF_MEMORY);
6393 }
6394
6395 io_context** ctx = m_aio_ctx;
6396 ulint max_events = slots_per_segment();
6397
6398 for (ulint i = 0; i < m_n_segments; ++i, ++ctx) {
6399
6400 if (!linux_create_io_ctx(max_events, ctx)) {
6401 /* If something bad happened during aio setup
6402 we should call it a day and return right away.
6403 We don't care about any leaks because a failure
6404 to initialize the io subsystem means that the
6405 server (or atleast the innodb storage engine)
6406 is not going to startup. */
6407 return(DB_IO_ERROR);
6408 }
6409 }
6410
6411 return(DB_SUCCESS);
6412 }
6413 #endif /* LINUX_NATIVE_AIO */
6414
6415 /** Initialise the array */
6416 dberr_t
init()6417 AIO::init()
6418 {
6419 ut_a(!m_slots.empty());
6420
6421 #ifdef _WIN32
6422 ut_a(m_handles == NULL);
6423
6424 m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
6425 #endif /* _WIN32 */
6426
6427 if (srv_use_native_aio) {
6428 #ifdef LINUX_NATIVE_AIO
6429 dberr_t err = init_linux_native_aio();
6430
6431 if (err != DB_SUCCESS) {
6432 return(err);
6433 }
6434
6435 #endif /* LINUX_NATIVE_AIO */
6436 }
6437
6438 return(init_slots());
6439 }
6440
6441 /** Creates an aio wait array. Note that we return NULL in case of failure.
6442 We don't care about freeing memory here because we assume that a
6443 failure will result in server refusing to start up.
6444 @param[in] id Latch ID
6445 @param[in] n maximum number of pending AIO operations
6446 allowed; n must be divisible by m_n_segments
6447 @param[in] n_segments number of segments in the AIO array
6448 @return own: AIO array, NULL on failure */
6449 AIO*
create(latch_id_t id,ulint n,ulint n_segments)6450 AIO::create(
6451 latch_id_t id,
6452 ulint n,
6453 ulint n_segments)
6454 {
6455 if ((n % n_segments)) {
6456
6457 ib::error()
6458 << "Maximum number of AIO operations must be "
6459 << "divisible by number of segments";
6460
6461 return(NULL);
6462 }
6463
6464 AIO* array = UT_NEW_NOKEY(AIO(id, n, n_segments));
6465
6466 if (array != NULL && array->init() != DB_SUCCESS) {
6467
6468 UT_DELETE(array);
6469
6470 array = NULL;
6471 }
6472
6473 return(array);
6474 }
6475
6476 /** AIO destructor */
~AIO()6477 AIO::~AIO()
6478 {
6479 #ifdef WIN_ASYNC_IO
6480 for (ulint i = 0; i < m_slots.size(); ++i) {
6481 CloseHandle(m_slots[i].handle);
6482 }
6483 #endif /* WIN_ASYNC_IO */
6484
6485 #ifdef _WIN32
6486 UT_DELETE(m_handles);
6487 #endif /* _WIN32 */
6488
6489 mutex_destroy(&m_mutex);
6490
6491 os_event_destroy(m_not_full);
6492 os_event_destroy(m_is_empty);
6493
6494 #if defined(LINUX_NATIVE_AIO)
6495 if (srv_use_native_aio) {
6496 m_events.clear();
6497 ut_free(m_aio_ctx);
6498 }
6499 #endif /* LINUX_NATIVE_AIO */
6500
6501 m_slots.clear();
6502 }
6503
6504 /** Initializes the asynchronous io system. Creates one array each for ibuf
6505 and log i/o. Also creates one array each for read and write where each
6506 array is divided logically into n_readers and n_writers
6507 respectively. The caller must create an i/o handler thread for each
6508 segment in these arrays. This function also creates the sync array.
6509 No i/o handler thread needs to be created for that
6510 @param[in] n_per_seg maximum number of pending aio
6511 operations allowed per segment
6512 @param[in] n_readers number of reader threads
6513 @param[in] n_writers number of writer threads
6514 @param[in] n_slots_sync number of slots in the sync aio array
6515 @return true if the AIO sub-system was started successfully */
6516 bool
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)6517 AIO::start(
6518 ulint n_per_seg,
6519 ulint n_readers,
6520 ulint n_writers,
6521 ulint n_slots_sync)
6522 {
6523 #if defined(LINUX_NATIVE_AIO)
6524 /* Check if native aio is supported on this system and tmpfs */
6525 if (srv_use_native_aio && !is_linux_native_aio_supported()) {
6526
6527 ib::warn() << "Linux Native AIO disabled.";
6528
6529 srv_use_native_aio = FALSE;
6530 }
6531 #endif /* LINUX_NATIVE_AIO */
6532
6533 srv_reset_io_thread_op_info();
6534
6535 s_reads = create(
6536 LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
6537
6538 if (s_reads == NULL) {
6539 return(false);
6540 }
6541
6542 ulint start = srv_read_only_mode ? 0 : 2;
6543 ulint n_segs = n_readers + start;
6544
6545 /* 0 is the ibuf segment and 1 is the redo log segment. */
6546 for (ulint i = start; i < n_segs; ++i) {
6547 ut_a(i < SRV_MAX_N_IO_THREADS);
6548 srv_io_thread_function[i] = "read thread";
6549 }
6550
6551 ulint n_segments = n_readers;
6552
6553 if (!srv_read_only_mode) {
6554
6555 s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
6556
6557 if (s_ibuf == NULL) {
6558 return(false);
6559 }
6560
6561 ++n_segments;
6562
6563 srv_io_thread_function[0] = "insert buffer thread";
6564
6565 s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
6566
6567 if (s_log == NULL) {
6568 return(false);
6569 }
6570
6571 ++n_segments;
6572
6573 srv_io_thread_function[1] = "log thread";
6574
6575 } else {
6576 s_ibuf = s_log = NULL;
6577 }
6578
6579 s_writes = create(
6580 LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
6581
6582 if (s_writes == NULL) {
6583 return(false);
6584 }
6585
6586 n_segments += n_writers;
6587
6588 for (ulint i = start + n_readers; i < n_segments; ++i) {
6589 ut_a(i < SRV_MAX_N_IO_THREADS);
6590 srv_io_thread_function[i] = "write thread";
6591 }
6592
6593 ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
6594
6595 s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
6596
6597 if (s_sync == NULL) {
6598
6599 return(false);
6600 }
6601
6602 os_aio_n_segments = n_segments;
6603
6604 os_aio_validate();
6605
6606 os_aio_segment_wait_events = static_cast<os_event_t*>(
6607 ut_zalloc_nokey(
6608 n_segments * sizeof *os_aio_segment_wait_events));
6609
6610 if (os_aio_segment_wait_events == NULL) {
6611
6612 return(false);
6613 }
6614
6615 for (ulint i = 0; i < n_segments; ++i) {
6616 os_aio_segment_wait_events[i] = os_event_create(0);
6617 }
6618
6619 os_last_printout = ut_time_monotonic();
6620
6621 return(true);
6622 }
6623
6624 /** Free the AIO arrays */
6625 void
shutdown()6626 AIO::shutdown()
6627 {
6628 UT_DELETE(s_ibuf);
6629 s_ibuf = NULL;
6630
6631 UT_DELETE(s_log);
6632 s_log = NULL;
6633
6634 UT_DELETE(s_writes);
6635 s_writes = NULL;
6636
6637 UT_DELETE(s_sync);
6638 s_sync = NULL;
6639
6640 UT_DELETE(s_reads);
6641 s_reads = NULL;
6642 }
6643
6644 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6645
6646 /** Max disk sector size */
6647 static const ulint MAX_SECTOR_SIZE = 4096;
6648
6649 /**
6650 Try and get the FusionIO sector size. */
6651 void
os_fusionio_get_sector_size()6652 os_fusionio_get_sector_size()
6653 {
6654 if (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
6655 || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC) {
6656 ulint sector_size = UNIV_SECTOR_SIZE;
6657 char* path = srv_data_home;
6658 os_file_t check_file;
6659 byte* ptr;
6660 byte* block_ptr;
6661 char current_dir[3];
6662 char* dir_end;
6663 ulint dir_len;
6664 ulint check_path_len;
6665 char* check_file_name;
6666 ssize_t ret;
6667
6668 /* If the srv_data_home is empty, set the path to
6669 current dir. */
6670 if (*path == 0) {
6671 current_dir[0] = FN_CURLIB;
6672 current_dir[1] = FN_LIBCHAR;
6673 current_dir[2] = 0;
6674 path = current_dir;
6675 }
6676
6677 /* Get the path of data file */
6678 dir_end = strrchr(path, OS_PATH_SEPARATOR);
6679 dir_len = dir_end? dir_end - path : strlen(path);
6680
6681 /* allocate a new path and move the directory path to it. */
6682 check_path_len = dir_len + sizeof "/check_sector_size";
6683 check_file_name = static_cast<char*>(
6684 ut_zalloc_nokey(check_path_len));
6685 memcpy(check_file_name, path, dir_len);
6686
6687 /* Construct a check file name. */
6688 strcat(check_file_name + dir_len, "/check_sector_size");
6689
6690 /* Create a tmp file for checking sector size. */
6691 check_file = ::open(check_file_name,
6692 O_CREAT|O_TRUNC|O_WRONLY|O_DIRECT,
6693 S_IRWXU);
6694
6695 if (check_file == -1) {
6696 ib::error()
6697 << "Failed to create check sector file, errno:"
6698 << errno << " Please confirm O_DIRECT is"
6699 << " supported and remove the file "
6700 << check_file_name << " if it exists.";
6701 ut_free(check_file_name);
6702 errno = 0;
6703 return;
6704 }
6705
6706 /* Try to write the file with different sector size
6707 alignment. */
6708 ptr = static_cast<byte*>(ut_malloc_nokey(2 * MAX_SECTOR_SIZE));
6709
6710 while (sector_size <= MAX_SECTOR_SIZE) {
6711 block_ptr = static_cast<byte*>(
6712 ut_align(ptr, sector_size));
6713 ret = pwrite(check_file, block_ptr,
6714 sector_size, 0);
6715 if (ret > 0 && (ulint) ret == sector_size) {
6716 break;
6717 }
6718 sector_size *= 2;
6719 }
6720
6721 /* The sector size should <= MAX_SECTOR_SIZE. */
6722 ut_ad(sector_size <= MAX_SECTOR_SIZE);
6723
6724 close(check_file);
6725 unlink(check_file_name);
6726
6727 ut_free(check_file_name);
6728 ut_free(ptr);
6729 errno = 0;
6730
6731 os_io_ptr_align = sector_size;
6732 }
6733 }
6734 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6735
6736 /** Initializes the asynchronous io system. Creates one array each for ibuf
6737 and log i/o. Also creates one array each for read and write where each
6738 array is divided logically into n_readers and n_writers
6739 respectively. The caller must create an i/o handler thread for each
6740 segment in these arrays. This function also creates the sync array.
6741 No i/o handler thread needs to be created for that
6742 @param[in] n_readers number of reader threads
6743 @param[in] n_writers number of writer threads
6744 @param[in] n_slots_sync number of slots in the sync aio array */
6745 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6746 os_aio_init(
6747 ulint n_readers,
6748 ulint n_writers,
6749 ulint n_slots_sync)
6750 {
6751 /* Maximum number of pending aio operations allowed per segment */
6752 ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6753
6754 #ifdef _WIN32
6755 if (srv_use_native_aio) {
6756 limit = SRV_N_PENDING_IOS_PER_THREAD;
6757 }
6758 #endif /* _WIN32 */
6759
6760 ut_a(block_cache == NULL);
6761
6762 block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
6763
6764 for (Blocks::iterator it = block_cache->begin();
6765 it != block_cache->end();
6766 ++it) {
6767
6768 ut_a(it->m_in_use == 0);
6769 ut_a(it->m_ptr == NULL);
6770
6771 /* Allocate double of max page size memory, since
6772 compress could generate more bytes than orgininal
6773 data. */
6774 it->m_ptr = static_cast<byte*>(
6775 ut_malloc_nokey(BUFFER_BLOCK_SIZE));
6776
6777 ut_a(it->m_ptr != NULL);
6778 }
6779
6780 /* Get sector size for DIRECT_IO. In this case, we need to
6781 know the sector size for aligning the write buffer. */
6782 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6783 os_fusionio_get_sector_size();
6784 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6785
6786 return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
6787 }
6788
6789 /** Frees the asynchronous io system. */
6790 void
os_aio_free()6791 os_aio_free()
6792 {
6793 AIO::shutdown();
6794
6795 for (ulint i = 0; i < os_aio_n_segments; i++) {
6796 os_event_destroy(os_aio_segment_wait_events[i]);
6797 }
6798
6799 ut_free(os_aio_segment_wait_events);
6800 os_aio_segment_wait_events = 0;
6801 os_aio_n_segments = 0;
6802
6803 for (Blocks::iterator it = block_cache->begin();
6804 it != block_cache->end();
6805 ++it) {
6806
6807 ut_a(it->m_in_use == 0);
6808 ut_free(it->m_ptr);
6809 }
6810
6811 UT_DELETE(block_cache);
6812
6813 block_cache = NULL;
6814 }
6815
6816 /** Wakes up all async i/o threads so that they know to exit themselves in
6817 shutdown. */
6818 void
os_aio_wake_all_threads_at_shutdown()6819 os_aio_wake_all_threads_at_shutdown()
6820 {
6821 #ifdef WIN_ASYNC_IO
6822
6823 AIO::wake_at_shutdown();
6824
6825 #elif defined(LINUX_NATIVE_AIO)
6826
6827 /* When using native AIO interface the io helper threads
6828 wait on io_getevents with a timeout value of 500ms. At
6829 each wake up these threads check the server status.
6830 No need to do anything to wake them up. */
6831
6832 if (srv_use_native_aio) {
6833 return;
6834 }
6835
6836 #endif /* !WIN_ASYNC_AIO */
6837
6838 /* Fall through to simulated AIO handler wakeup if we are
6839 not using native AIO. */
6840
6841 /* This loop wakes up all simulated ai/o threads */
6842
6843 for (ulint i = 0; i < os_aio_n_segments; ++i) {
6844
6845 os_event_set(os_aio_segment_wait_events[i]);
6846 }
6847 }
6848
/** Waits until there are no pending writes in AIO::s_writes. There can
be other, synchronous, pending writes.
This is a free-function wrapper that forwards to
AIO::wait_until_no_pending_writes(). */
void
os_aio_wait_until_no_pending_writes()
{
	AIO::wait_until_no_pending_writes();
}
6856
6857 /** Calculates segment number for a slot.
6858 @param[in] array AIO wait array
6859 @param[in] slot slot in this array
6860 @return segment number (which is the number used by, for example,
6861 I/O-handler threads) */
6862 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)6863 AIO::get_segment_no_from_slot(
6864 const AIO* array,
6865 const Slot* slot)
6866 {
6867 ulint segment;
6868 ulint seg_len;
6869
6870 if (array == s_ibuf) {
6871 ut_ad(!srv_read_only_mode);
6872
6873 segment = IO_IBUF_SEGMENT;
6874
6875 } else if (array == s_log) {
6876 ut_ad(!srv_read_only_mode);
6877
6878 segment = IO_LOG_SEGMENT;
6879
6880 } else if (array == s_reads) {
6881 seg_len = s_reads->slots_per_segment();
6882
6883 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6884 } else {
6885 ut_a(array == s_writes);
6886
6887 seg_len = s_writes->slots_per_segment();
6888
6889 segment = s_reads->m_n_segments
6890 + (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6891 }
6892
6893 return(segment);
6894 }
6895
/** Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled.

The slot is filled in with the request parameters. When native AIO is in
use and the request is a write at a non-zero offset, the page is
compressed and/or encrypted here (compression first), and the slot is
pointed at the resulting buffer. On Windows async i/o the OVERLAPPED
control block is prepared; on Linux native AIO the iocb is prepared.

@param[in,out]	type	IO context
@param[in,out]	m1	message to be passed along with the AIO
			operation
@param[in,out]	m2	message to be passed along with the AIO
			operation
@param[in]	file	file handle
@param[in]	name	name of the file or path as a NUL-terminated
			string
@param[in,out]	buf	buffer where to read or from which to write
@param[in]	offset	file offset, where to read from or start writing
@param[in]	len	length of the block to read or write
@return pointer to slot */
Slot*
AIO::reserve_slot(
	IORequest&	type,
	fil_node_t*	m1,
	void*		m2,
	pfs_os_file_t	file,
	const char*	name,
	void*		buf,
	os_offset_t	offset,
	ulint		len)
{
#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits: it is stored as a DWORD. */
	ut_a((len & 0xFFFFFFFFUL) == len);
#endif /* WIN_ASYNC_IO */

	/* No need of a mutex. Only reading constant fields */
	ulint		slots_per_seg;

	ut_ad(type.validate());

	slots_per_seg = slots_per_segment();

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO. The shift by UNIV_PAGE_SIZE_SHIFT + 6 means
	that offsets within the same run of 64 pages pick the same
	segment. */
	ulint		local_seg;

	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;

	/* Loop until the array has a free slot. The loop is left with
	the array latched (acquire() without a matching release()). */
	for (;;) {

		acquire();

		if (m_n_reserved != m_slots.size()) {
			break;
		}

		release();

		if (!srv_use_native_aio) {
			/* If the handler threads are suspended,
			wake them so that we get more slots */

			os_aio_simulated_wake_handler_threads();
		}

		os_event_wait(m_not_full);
	}

	ulint		counter = 0;
	Slot*		slot = NULL;

	/* We start our search for an available slot from our preferred
	local segment and do a full scan of the array. We are
	guaranteed to find a slot in full scan. */
	for (ulint i = local_seg * slots_per_seg;
	     counter < m_slots.size();
	     ++i, ++counter) {

		/* Wrap around to the start of the array. */
		i %= m_slots.size();

		slot = at(i);

		if (slot->is_reserved == false) {
			break;
		}
	}

	/* We MUST always be able to get hold of a free slot. */
	ut_a(counter < m_slots.size());

	ut_a(slot->is_reserved == false);

	++m_n_reserved;

	/* The array just became non-empty. */
	if (m_n_reserved == 1) {
		os_event_reset(m_is_empty);
	}

	/* The array just became full. */
	if (m_n_reserved == m_slots.size()) {
		os_event_reset(m_not_full);
	}

	slot->is_reserved = true;
	slot->reservation_time = ut_time_monotonic();
	slot->m1       = m1;
	slot->m2       = m2;
	slot->file     = file;
	slot->name     = name;
#ifdef _WIN32
	slot->len      = static_cast<DWORD>(len);
#else
	slot->len      = static_cast<ulint>(len);
#endif /* _WIN32 */
	slot->type     = type;
	slot->buf      = static_cast<byte*>(buf);
	slot->ptr      = slot->buf;
	slot->offset   = offset;
	slot->err      = DB_SUCCESS;
	slot->original_len = static_cast<uint32>(len);
	slot->io_already_done = false;
	slot->buf_block = NULL;

	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_compressed()) {
		ulint	compressed_len = len;

		ut_ad(!type.is_log());

		/* The array latch is not held while compressing; the
		slot is already marked reserved. */
		release();

		void* src_buf = slot->buf;
		slot->buf_block = os_file_compress_page(
			type,
			src_buf,
			&compressed_len);

		/* NOTE(review): src_buf is re-read after the call, so
		os_file_compress_page() presumably updates it to the
		buffer that should be written — confirm against its
		declaration. */
		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;
#ifdef _WIN32
		slot->len = static_cast<DWORD>(compressed_len);
#else
		slot->len = static_cast<ulint>(compressed_len);
#endif /* _WIN32 */
		slot->skip_punch_hole = !type.punch_hole();

		acquire();
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_encrypted()) {
		ulint		encrypted_len = slot->len;
		Block*		encrypted_block;

		ut_ad(!type.is_log());

		/* As above, encrypt without holding the array latch. */
		release();

		void* src_buf = slot->buf;
		encrypted_block = os_file_encrypt_page(
			type,
			src_buf,
			&encrypted_len);

		/* A block obtained for compression is no longer needed
		once the encrypted block replaces it. */
		if (slot->buf_block != NULL) {
			os_free_block(slot->buf_block);
		}

		slot->buf_block = encrypted_block;
		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;

#ifdef _WIN32
		slot->len = static_cast<DWORD>(encrypted_len);
#else
		slot->len = static_cast<ulint>(encrypted_len);
#endif /* _WIN32 */

		acquire();
	}

#ifdef WIN_ASYNC_IO
	{
		OVERLAPPED*	control;

		/* Split the 64-bit offset into the two 32-bit halves
		of the OVERLAPPED structure. */
		control = &slot->control;
		control->Offset = (DWORD) offset & 0xFFFFFFFF;
		control->OffsetHigh = (DWORD) (offset >> 32);

		ResetEvent(slot->handle);
	}
#elif defined(LINUX_NATIVE_AIO)

	/* If we are not using native AIO skip this part. */
	if (srv_use_native_aio) {

		off_t		aio_offset;

		/* Check if we are dealing with 64 bit arch.
		If not then make sure that offset fits in 32 bits. */
		aio_offset = (off_t) offset;

		ut_a(sizeof(aio_offset) >= sizeof(offset)
		     || ((os_offset_t) aio_offset) == offset);

		struct iocb*	iocb = &slot->control;

		if (type.is_read()) {
			io_prep_pread(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		} else {
			ut_ad(type.is_write());
			io_prep_pwrite(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		}

		/* Lets the completion path find the slot from the iocb. */
		iocb->data = slot;

		slot->n_bytes = 0;
		slot->ret = 0;
	}
#endif /* LINUX_NATIVE_AIO */

	release();

	return(slot);
}
7125
7126 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
7127 @param[in] global_segment The number of the segment in the AIO arrays */
7128 void
wake_simulated_handler_thread(ulint global_segment)7129 AIO::wake_simulated_handler_thread(ulint global_segment)
7130 {
7131 ut_ad(!srv_use_native_aio);
7132
7133 AIO* array;
7134 ulint segment = get_array_and_local_segment(&array, global_segment);
7135
7136 array->wake_simulated_handler_thread(global_segment, segment);
7137 }
7138
7139 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
7140 for a local segment in the AIO array.
7141 @param[in] global_segment The number of the segment in the AIO arrays
7142 @param[in] segment The local segment in the AIO array */
7143 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)7144 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
7145 {
7146 ut_ad(!srv_use_native_aio);
7147
7148 ulint n = slots_per_segment();
7149 ulint offset = segment * n;
7150
7151 /* Look through n slots after the segment * n'th slot */
7152
7153 acquire();
7154
7155 const Slot* slot = at(offset);
7156
7157 for (ulint i = 0; i < n; ++i, ++slot) {
7158
7159 if (slot->is_reserved) {
7160
7161 /* Found an i/o request */
7162
7163 release();
7164
7165 os_event_t event;
7166
7167 event = os_aio_segment_wait_events[global_segment];
7168
7169 os_event_set(event);
7170
7171 return;
7172 }
7173 }
7174
7175 release();
7176 }
7177
7178 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
7179 void
os_aio_simulated_wake_handler_threads()7180 os_aio_simulated_wake_handler_threads()
7181 {
7182 if (srv_use_native_aio) {
7183 /* We do not use simulated aio: do nothing */
7184
7185 return;
7186 }
7187
7188 os_aio_recommend_sleep_for_read_threads = false;
7189
7190 for (ulint i = 0; i < os_aio_n_segments; i++) {
7191 AIO::wake_simulated_handler_thread(i);
7192 }
7193 }
7194
7195 /** Select the IO slot array
7196 @param[in] type Type of IO, READ or WRITE
7197 @param[in] read_only true if running in read-only mode
7198 @param[in] mode IO mode
7199 @return slot array or NULL if invalid mode specified */
7200 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)7201 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
7202 {
7203 AIO* array;
7204
7205 ut_ad(type.validate());
7206
7207 switch (mode) {
7208 case OS_AIO_NORMAL:
7209
7210 array = type.is_read() ? AIO::s_reads : AIO::s_writes;
7211 break;
7212
7213 case OS_AIO_IBUF:
7214 ut_ad(type.is_read());
7215
7216 /* Reduce probability of deadlock bugs in connection with ibuf:
7217 do not let the ibuf i/o handler sleep */
7218
7219 type.clear_do_not_wake();
7220
7221 array = read_only ? AIO::s_reads : AIO::s_ibuf;
7222 break;
7223
7224 case OS_AIO_LOG:
7225
7226 array = read_only ? AIO::s_reads : AIO::s_log;
7227 break;
7228
7229 case OS_AIO_SYNC:
7230
7231 array = AIO::s_sync;
7232 #if defined(LINUX_NATIVE_AIO)
7233 /* In Linux native AIO we don't use sync IO array. */
7234 ut_a(!srv_use_native_aio);
7235 #endif /* LINUX_NATIVE_AIO */
7236 break;
7237
7238 default:
7239 ut_error;
7240 array = NULL; /* Eliminate compiler warning */
7241 }
7242
7243 return(array);
7244 }
7245
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!

If the operation did not transfer the full length and
os_file_handle_error() asks for a retry, the request is re-executed
synchronously before the result is reported.

@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		the IO request context stored in the slot
				(read or write); not set when returning
				early during shutdown
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	dberr_t		err;
	AIO*		array;
	ulint		orig_seg = segment;

	if (segment == ULINT_UNDEFINED) {
		segment = 0;
		array = AIO::sync_array();
	} else {
		segment = AIO::get_array_and_local_segment(&array, segment);
	}

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());

	if (array == AIO::sync_array()) {

		/* Sync AIO: wait for exactly the slot the caller named. */
		WaitForSingleObject(array->at(pos)->handle, INFINITE);

	} else {
		if (orig_seg != ULINT_UNDEFINED) {
			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
		}

		/* Wait for any slot of the local segment; the return
		value identifies the signalled handle. */
		pos = WaitForMultipleObjects(
			(DWORD) array->slots_per_segment(),
			array->handles(segment),
			FALSE, INFINITE);
	}

	array->acquire();

	/* Woken up for shutdown with nothing left to do: report
	success without a slot. */
	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && array->is_empty()
	    && !buf_page_cleaner_is_active) {

		*m1 = NULL;
		*m2 = NULL;

		array->release();

		return(DB_SUCCESS);
	}

	ulint	n = array->slots_per_segment();

	/* A valid wait result indexes one of the n handles of the
	segment, i.e. lies in [WAIT_OBJECT_0, WAIT_OBJECT_0 + n - 1].
	A value of WAIT_OBJECT_0 + n would address a slot outside the
	segment, so the upper bound is exclusive. */
	ut_a(pos >= WAIT_OBJECT_0 && pos < WAIT_OBJECT_0 + n);

	slot = array->at(pos + segment * n);

	ut_a(slot->is_reserved);

	if (orig_seg != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(
			orig_seg, "get windows aio return value");
	}

	BOOL	ret;
	ret = GetOverlappedResult(
		slot->file.m_file, &slot->control, &slot->n_bytes, TRUE);
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	BOOL	retry = FALSE;

	if (ret && slot->n_bytes == slot->len) {

		err = DB_SUCCESS;

	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		/* err is assigned by the retry below. */
		retry = true;

	} else {

		err = DB_IO_ERROR;
	}

	array->release();

	if (retry) {
		/* Retry failed read/write operation synchronously.
		No need to hold array->m_mutex. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		struct PSI_file_locker* locker = NULL;
		PSI_file_locker_state state;
		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
			/* AIO was queued successfully!
			We want a synchronous I/O operation on a
			file where we also use async I/O: in Windows
			we must use the same wait mechanism as for
			async I/O */

			BOOL	ret;
			ret = GetOverlappedResult(
				slot->file.m_file, &slot->control, &slot->n_bytes,
				TRUE);
			n_bytes = ret ? slot->n_bytes : -1;
		}

		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	/* Frees the slot; see the NOTE in the function comment. */
	array->release_with_mutex(slot);

	return(err);
}
#endif /* WIN_ASYNC_IO */
7413
/**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation.

A request with mode OS_AIO_SYNC is executed right away with the
synchronous read/write functions (except when Windows native async i/o
is in use). Otherwise a slot is reserved in the array selected for the
mode, and the request is either dispatched to the native AIO interface
or left for a simulated AIO handler thread, which is woken up if the
request asks for it. When dispatching fails and os_file_handle_error()
asks for a retry, the whole request is attempted again.

@param[in,out]	type		IO request context
@param[in]	mode		IO mode
@param[in]	name		Name of the file or path as NUL terminated
				string
@param[in]	file		Open file handle
@param[in,out]	buf		buffer where to read, or from which to write
@param[in]	offset		file offset where to read or write
@param[in]	n		number of bytes to read or write
@param[in]	read_only	if true read only mode checks are enforced
@param[in,out]	m1		Message for the AIO handler, (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in,out]	m2		message for the AIO handler (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
	IORequest&	type,
	ulint		mode,
	const char*	name,
	pfs_os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	bool		read_only,
	fil_node_t*	m1,
	void*		m2)
{
#ifdef WIN_ASYNC_IO
	BOOL		ret = TRUE;
#endif /* WIN_ASYNC_IO */

	/* Length and offset must be non-zero resp. aligned multiples of
	the minimum log block size. */
	ut_ad(n > 0);
	ut_ad((n % OS_MIN_LOG_BLOCK_SIZE) == 0);
	ut_ad((offset % OS_MIN_LOG_BLOCK_SIZE) == 0);
	ut_ad(os_aio_validate_skip());

#ifdef WIN_ASYNC_IO
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */

	if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
	    && !srv_use_native_aio
#endif /* WIN_ASYNC_IO */
	    ) {
		/* This is actually an ordinary synchronous read or write:
		no need to use an i/o-handler thread. NOTE that if we use
		Windows async i/o, Windows does not allow us to use
		ordinary synchronous os_file_read etc. on the same file,
		therefore we have built a special mechanism for synchronous
		wait in the Windows case.
		Also note that the Performance Schema instrumentation has
		been performed by current os_aio_func()'s wrapper function
		pfs_os_aio_func(). So we would no longer need to call
		Performance Schema instrumented os_file_read() and
		os_file_write(). Instead, we should use os_file_read_func()
		and os_file_write_func() */

		if (type.is_read()) {
			return(os_file_read_func(type, file.m_file, buf, offset, n));
		}

		ut_ad(type.is_write());
		return(os_file_write_func(type, name, file.m_file, buf, offset, n));
	}

	/* Re-entered from the error path below when the error handler
	asks for a retry: a new slot is reserved each time. */
try_again:

	AIO*	array;

	array = AIO::select_slot_array(type, read_only, mode);

	Slot*	slot;

	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);

	if (type.is_read()) {

		if (srv_use_native_aio) {

			++os_n_file_reads;

			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else if (type.is_wake()) {
			/* Simulated AIO: wake the handler thread of the
			slot's segment. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else if (type.is_write()) {

		if (srv_use_native_aio) {
			++os_n_file_writes;

#ifdef WIN_ASYNC_IO
			ret = WriteFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */

		} else if (type.is_wake()) {
			/* Simulated AIO: wake the handler thread of the
			slot's segment. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else {
		/* A request is either a read or a write. */
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (srv_use_native_aio) {
		/* Either the call completed at once with the full
		length, or it is pending. */
		if ((ret && slot->len == slot->n_bytes)
		     || (!ret && GetLastError() == ERROR_IO_PENDING)) {
			/* aio was queued successfully! */

			if (mode == OS_AIO_SYNC) {
				IORequest	dummy_type;
				void*		dummy_mess2;
				struct fil_node_t* dummy_mess1;

				/* We want a synchronous i/o operation on a
				file where we also use async i/o: in Windows
				we must use the same wait mechanism as for
				async i/o */

				return(os_aio_windows_handler(
					ULINT_UNDEFINED, slot->pos,
					&dummy_mess1, &dummy_mess2,
					&dummy_type));
			}

			return(DB_SUCCESS);
		}

		goto err_exit;
	}
#endif /* WIN_ASYNC_IO */

	/* AIO request was queued successfully! */
	return(DB_SUCCESS);

#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */

	/* Dispatch failed: give the slot back before deciding whether
	to retry. */
	array->release_with_mutex(slot);

	if (os_file_handle_error(
		name, type.is_read() ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(DB_IO_ERROR);
}
7584
7585 /** Simulated AIO handler for reaping IO requests */
7586 class SimulatedAIOHandler {
7587
7588 public:
7589
/** Constructor.
Value-initialises the selection state, sets the lowest offset seen to
IB_UINT64_MAX, remembers the array and local segment to work on, and
sizes the list of slots to merge to OS_AIO_MERGE_N_CONSECUTIVE entries.
@param[in,out]	array	The AIO array
@param[in]	segment	Local segment in the array */
SimulatedAIOHandler(AIO* array, ulint segment)
	:
	m_oldest(),
	m_n_elems(),
	m_lowest_offset(IB_UINT64_MAX),
	m_array(array),
	m_n_slots(),
	m_segment(segment),
	m_ptr(),
	m_buf()
{
	/* Debug-only sanity bound on the local segment number. */
	ut_ad(m_segment < 100);

	m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
}
7608
/** Destructor. Frees the combined buffer if one is still allocated. */
~SimulatedAIOHandler()
{
	if (m_ptr != NULL) {
		ut_free(m_ptr);
	}
}
7616
7617 /** Reset the state of the handler
7618 @param[in] n_slots Number of pending AIO operations supported */
init(ulint n_slots)7619 void init(ulint n_slots)
7620 {
7621 m_oldest = 0;
7622 m_n_elems = 0;
7623 m_n_slots = n_slots;
7624 m_lowest_offset = IB_UINT64_MAX;
7625
7626 if (m_ptr != NULL) {
7627 ut_free(m_ptr);
7628 m_ptr = m_buf = NULL;
7629 }
7630
7631 m_slots[0] = NULL;
7632 }
7633
7634 /** Check if there is a slot for which the i/o has already been done
7635 @param[out] n_reserved Number of reserved slots
7636 @return the first completed slot that is found. */
check_completed(ulint * n_reserved)7637 Slot* check_completed(ulint* n_reserved)
7638 {
7639 ulint offset = m_segment * m_n_slots;
7640
7641 *n_reserved = 0;
7642
7643 Slot* slot;
7644
7645 slot = m_array->at(offset);
7646
7647 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7648
7649 if (slot->is_reserved) {
7650
7651 if (slot->io_already_done) {
7652
7653 ut_a(slot->is_reserved);
7654
7655 return(slot);
7656 }
7657
7658 ++*n_reserved;
7659 }
7660 }
7661
7662 return(NULL);
7663 }
7664
7665 /** If there are at least 2 seconds old requests, then pick the
7666 oldest one to prevent starvation. If several requests have the
7667 same age, then pick the one at the lowest offset.
7668 @return true if request was selected */
select()7669 bool select()
7670 {
7671 if (!select_oldest()) {
7672
7673 return(select_lowest_offset());
7674 }
7675
7676 return(true);
7677 }
7678
7679 /** Check if there are several consecutive blocks
7680 to read or write. Merge them if found. */
merge()7681 void merge()
7682 {
7683 /* if m_n_elems != 0, then we have assigned
7684 something valid to consecutive_ios[0] */
7685 ut_ad(m_n_elems != 0);
7686 ut_ad(first_slot() != NULL);
7687
7688 Slot* slot = first_slot();
7689
7690 while (!merge_adjacent(slot)) {
7691 /* No op */
7692 }
7693 }
7694
7695 /** We have now collected n_consecutive I/O requests
7696 in the array; allocate a single buffer which can hold
7697 all data, and perform the I/O
7698 @return the length of the buffer */
allocate_buffer()7699 ulint allocate_buffer()
7700 MY_ATTRIBUTE((warn_unused_result))
7701 {
7702 ulint len;
7703 Slot* slot = first_slot();
7704
7705 ut_ad(m_ptr == NULL);
7706
7707 if (slot->type.is_read() && m_n_elems > 1) {
7708
7709 len = 0;
7710
7711 for (ulint i = 0; i < m_n_elems; ++i) {
7712 len += m_slots[i]->len;
7713 }
7714
7715 m_ptr = static_cast<byte*>(
7716 ut_malloc_nokey(len + UNIV_PAGE_SIZE));
7717
7718 m_buf = static_cast<byte*>(
7719 ut_align(m_ptr, UNIV_PAGE_SIZE));
7720
7721 } else {
7722 len = first_slot()->len;
7723 m_buf = first_slot()->buf;
7724 }
7725
7726 return(len);
7727 }
7728
7729 /** We have to compress the individual pages and punch
7730 holes in them on a page by page basis when writing to
7731 tables that can be compresed at the IO level.
7732 @param[in] len Value returned by allocate_buffer */
copy_to_buffer(ulint len)7733 void copy_to_buffer(ulint len)
7734 {
7735 Slot* slot = first_slot();
7736
7737 if (len > slot->len && slot->type.is_write()) {
7738
7739 byte* ptr = m_buf;
7740
7741 ut_ad(ptr != slot->buf);
7742
7743 /* Copy the buffers to the combined buffer */
7744 for (ulint i = 0; i < m_n_elems; ++i) {
7745
7746 slot = m_slots[i];
7747
7748 memmove(ptr, slot->buf, slot->len);
7749
7750 ptr += slot->len;
7751 }
7752 }
7753 }
7754
7755 /** Do the I/O with ordinary, synchronous i/o functions:
7756 @param[in] len Length of buffer for IO */
io()7757 void io()
7758 {
7759 if (first_slot()->type.is_write()) {
7760
7761 for (ulint i = 0; i < m_n_elems; ++i) {
7762 write(m_slots[i]);
7763 }
7764
7765 } else {
7766
7767 for (ulint i = 0; i < m_n_elems; ++i) {
7768 read(m_slots[i]);
7769 }
7770 }
7771 }
7772
/** Hook for work to be done once the I/O has been performed.
The body is currently empty: nothing (in particular, no decompression)
is done here. */
void io_complete()
{
	// Note: For non-compressed tables. Not required
	// for correctness.
}
7779
7780 /** Mark the i/os done in slots */
done()7781 void done()
7782 {
7783 for (ulint i = 0; i < m_n_elems; ++i) {
7784 m_slots[i]->io_already_done = true;
7785 }
7786 }
7787
/** Access the first collected slot. At least one slot must have been
selected; this is asserted.
@return the first slot in the consecutive array */
Slot* first_slot()
	MY_ATTRIBUTE((warn_unused_result))
{
	ut_a(m_n_elems > 0);

	return(m_slots[0]);
}
7796
/** Wait for I/O requests. Declared here; the definition is outside the
class body.
@param[in]	global_segment	The global segment
@param[in,out]	event		Wait on event if no active requests
@return the number of slots */
ulint check_pending(
	ulint		global_segment,
	os_event_t	event)
	MY_ATTRIBUTE((warn_unused_result));
7805 private:
7806
7807 /** Do the file read
7808 @param[in,out] slot Slot that has the IO context */
read(Slot * slot)7809 void read(Slot* slot)
7810 {
7811 dberr_t err = os_file_read_func(
7812 slot->type,
7813 slot->file.m_file,
7814 slot->ptr,
7815 slot->offset,
7816 slot->len);
7817 ut_a(err == DB_SUCCESS);
7818 }
7819
7820 /** Do the file read
7821 @param[in,out] slot Slot that has the IO context */
write(Slot * slot)7822 void write(Slot* slot)
7823 {
7824 dberr_t err = os_file_write_func(
7825 slot->type,
7826 slot->name,
7827 slot->file.m_file,
7828 slot->ptr,
7829 slot->offset,
7830 slot->len);
7831 ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
7832 }
7833
7834 /** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7835 bool adjacent(const Slot* s1, const Slot* s2) const
7836 {
7837 return(s1 != s2
7838 && s1->file.m_file == s2->file.m_file
7839 && s2->offset == s1->offset + s1->len
7840 && s1->type == s2->type);
7841 }
7842
7843 /** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7844 bool merge_adjacent(Slot*& current)
7845 {
7846 Slot* slot;
7847 ulint offset = m_segment * m_n_slots;
7848
7849 slot = m_array->at(offset);
7850
7851 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7852
7853 if (slot->is_reserved && adjacent(current, slot)) {
7854
7855 current = slot;
7856
7857 /* Found a consecutive i/o request */
7858
7859 m_slots[m_n_elems] = slot;
7860
7861 ++m_n_elems;
7862
7863 return(m_n_elems >= m_slots.capacity());
7864 }
7865 }
7866
7867 return(true);
7868 }
7869
7870 /** There were no old requests. Look for an I/O request at the lowest
7871 offset in the array (we ignore the high 32 bits of the offset in these
7872 heuristics) */
select_lowest_offset()7873 bool select_lowest_offset()
7874 {
7875 ut_ad(m_n_elems == 0);
7876
7877 ulint offset = m_segment * m_n_slots;
7878
7879 m_lowest_offset = IB_UINT64_MAX;
7880
7881 for (ulint i = 0; i < m_n_slots; ++i) {
7882 Slot* slot;
7883
7884 slot = m_array->at(i + offset);
7885
7886 if (slot->is_reserved
7887 && slot->offset < m_lowest_offset) {
7888
7889 /* Found an i/o request */
7890 m_slots[0] = slot;
7891
7892 m_n_elems = 1;
7893
7894 m_lowest_offset = slot->offset;
7895 }
7896 }
7897
7898 return(m_n_elems > 0);
7899 }
7900
7901 /** Select the slot if it is older than the current oldest slot.
7902 @param[in] slot The slot to check */
select_if_older(Slot * slot)7903 void select_if_older(Slot* slot)
7904 {
7905 int64_t time_diff = ut_time_monotonic() -
7906 slot->reservation_time;
7907
7908 const uint64_t age = time_diff > 0 ? (uint64_t) time_diff : 0;
7909
7910 if ((age >= 2 && age > m_oldest)
7911 || (age >= 2
7912 && age == m_oldest
7913 && slot->offset < m_lowest_offset)) {
7914
7915 /* Found an i/o request */
7916 m_slots[0] = slot;
7917
7918 m_n_elems = 1;
7919
7920 m_oldest = age;
7921
7922 m_lowest_offset = slot->offset;
7923 }
7924 }
7925
7926 /** Select th oldest slot in the array
7927 @return true if oldest slot found */
select_oldest()7928 bool select_oldest()
7929 {
7930 ut_ad(m_n_elems == 0);
7931
7932 Slot* slot;
7933 ulint offset = m_n_slots * m_segment;
7934
7935 slot = m_array->at(offset);
7936
7937 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7938
7939 if (slot->is_reserved) {
7940 select_if_older(slot);
7941 }
7942 }
7943
7944 return(m_n_elems > 0);
7945 }
7946
/** Container type for the batch of selected slots */
typedef std::vector<Slot*>	slots_t;

private:
/** Age of the oldest slot chosen so far by select_if_older() */
ulint		m_oldest;

/** Number of valid entries at the front of m_slots */
ulint		m_n_elems;

/** Offset of the slot most recently stored in m_slots[0] by the
select_*() methods */
os_offset_t	m_lowest_offset;

/** The AIO array whose slots are scanned */
AIO*		m_array;

/** Number of slots scanned per segment */
ulint		m_n_slots;

/** Local segment index; the scan starts at slot m_segment * m_n_slots */
ulint		m_segment;

/** The batch of selected slots; m_slots[0] is the first one */
slots_t		m_slots;

/* NOTE(review): m_ptr and m_buf are not referenced by the methods visible
in this part of the class; confirm their use before relying on them. */
byte*		m_ptr;
byte*		m_buf;
7962 };
7963
7964 /** Wait for I/O requests
7965 @return the number of slots */
7966 ulint
check_pending(ulint global_segment,os_event_t event)7967 SimulatedAIOHandler::check_pending(
7968 ulint global_segment,
7969 os_event_t event)
7970 {
7971 /* NOTE! We only access constant fields in os_aio_array.
7972 Therefore we do not have to acquire the protecting mutex yet */
7973
7974 ut_ad(os_aio_validate_skip());
7975
7976 ut_ad(m_segment < m_array->get_n_segments());
7977
7978 /* Look through n slots after the segment * n'th slot */
7979
7980 if (AIO::is_read(m_array)
7981 && os_aio_recommend_sleep_for_read_threads) {
7982
7983 /* Give other threads chance to add several
7984 I/Os to the array at once. */
7985
7986 srv_set_io_thread_op_info(
7987 global_segment, "waiting for i/o request");
7988
7989 os_event_wait(event);
7990
7991 return(0);
7992 }
7993
7994 return(m_array->slots_per_segment());
7995 }
7996
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

The function loops until it either finds a slot whose I/O has already been
completed, or selects a batch of pending requests, performs their I/O itself
and returns the first slot of that batch. The returned slot is released
back to the array before the function returns.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[out]	type	IO context, copied from the slot; not written when
			the function returns because of shutdown
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	/* Translate the global segment into its array and the segment
	number local to that array. */
	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		ulint	n_slots = handler.check_pending(global_segment, event);

		if (n_slots == 0) {
			/* check_pending() waited on the event; retry. */
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* Leave the loop with the array mutex still held. */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* A request was selected; slot is still NULL and
			the array mutex is still held. */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. The event is reset while the array
		mutex is still held and waited on after it is released. */

		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* A non-NULL slot means check_completed() found a slot that has
	already completed its IO. A NULL slot means select() picked
	requests whose I/O still has to be done here. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		handler.io_complete();

		array->acquire();

		/* Mark every slot of the batch as done. */
		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	/* Free the slot itself, then drop the array mutex. */
	array->release(slot);

	array->release();

	return(DB_SUCCESS);
}
8149
8150 /** Get the total number of pending IOs
8151 @return the total number of pending IOs */
8152 ulint
total_pending_io_count()8153 AIO::total_pending_io_count()
8154 {
8155 ulint count = s_reads->pending_io_count();
8156
8157 if (s_writes != NULL) {
8158 count += s_writes->pending_io_count();
8159 }
8160
8161 if (s_ibuf != NULL) {
8162 count += s_ibuf->pending_io_count();
8163 }
8164
8165 if (s_log != NULL) {
8166 count += s_log->pending_io_count();
8167 }
8168
8169 if (s_sync != NULL) {
8170 count += s_sync->pending_io_count();
8171 }
8172
8173 return(count);
8174 }
8175
8176 /** Validates the consistency the aio system.
8177 @return true if ok */
8178 static
8179 bool
os_aio_validate()8180 os_aio_validate()
8181 {
8182 /* The methods countds and validates, we ignore the count. */
8183 AIO::total_pending_io_count();
8184
8185 return(true);
8186 }
8187
8188 /** Prints pending IO requests per segment of an aio array.
8189 We probably don't need per segment statistics but they can help us
8190 during development phase to see if the IO requests are being
8191 distributed as expected.
8192 @param[in,out] file File where to print
8193 @param[in] segments Pending IO array */
8194 void
print_segment_info(FILE * file,const ulint * segments)8195 AIO::print_segment_info(
8196 FILE* file,
8197 const ulint* segments)
8198 {
8199 ut_ad(m_n_segments > 0);
8200
8201 if (m_n_segments > 1) {
8202
8203 fprintf(file, " [");
8204
8205 for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
8206
8207 if (i != 0) {
8208 fprintf(file, ", ");
8209 }
8210
8211 fprintf(file, ULINTPF, *segments);
8212 }
8213
8214 fprintf(file, "] ");
8215 }
8216 }
8217
8218 /** Prints info about the aio array.
8219 @param[in,out] file Where to print */
8220 void
print(FILE * file)8221 AIO::print(FILE* file)
8222 {
8223 ulint count = 0;
8224 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
8225
8226 mutex_enter(&m_mutex);
8227
8228 ut_a(!m_slots.empty());
8229 ut_a(m_n_segments > 0);
8230
8231 memset(n_res_seg, 0x0, sizeof(n_res_seg));
8232
8233 for (ulint i = 0; i < m_slots.size(); ++i) {
8234 Slot& slot = m_slots[i];
8235 ulint segment = (i * m_n_segments) / m_slots.size();
8236
8237 if (slot.is_reserved) {
8238
8239 ++count;
8240
8241 ++n_res_seg[segment];
8242
8243 ut_a(slot.len > 0);
8244 }
8245 }
8246
8247 ut_a(m_n_reserved == count);
8248
8249 print_segment_info(file, n_res_seg);
8250
8251 mutex_exit(&m_mutex);
8252 }
8253
8254 /** Print all the AIO segments
8255 @param[in,out] file Where to print */
8256 void
print_all(FILE * file)8257 AIO::print_all(FILE* file)
8258 {
8259 s_reads->print(file);
8260
8261 if (s_writes != NULL) {
8262 fputs(", aio writes:", file);
8263 s_writes->print(file);
8264 }
8265
8266 if (s_ibuf != NULL) {
8267 fputs(",\n ibuf aio reads:", file);
8268 s_ibuf->print(file);
8269 }
8270
8271 if (s_log != NULL) {
8272 fputs(", log i/o's:", file);
8273 s_log->print(file);
8274 }
8275
8276 if (s_sync != NULL) {
8277 fputs(", sync i/o's:", file);
8278 s_sync->print(file);
8279 }
8280 }
8281
8282 /** Prints info of the aio arrays.
8283 @param[in,out] file file where to print */
8284 void
os_aio_print(FILE * file)8285 os_aio_print(FILE* file)
8286 {
8287 ib_time_monotonic_t current_time;
8288 double time_elapsed;
8289 double avg_bytes_read;
8290
8291 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
8292 fprintf(file, "I/O thread %lu state: %s (%s)",
8293 (ulong) i,
8294 srv_io_thread_op_info[i],
8295 srv_io_thread_function[i]);
8296
8297 #ifndef _WIN32
8298 if (os_event_is_set(os_aio_segment_wait_events[i])) {
8299 fprintf(file, " ev set");
8300 }
8301 #endif /* _WIN32 */
8302
8303 fprintf(file, "\n");
8304 }
8305
8306 fputs("Pending normal aio reads:", file);
8307
8308 AIO::print_all(file);
8309
8310 putc('\n', file);
8311 current_time = ut_time_monotonic();
8312 time_elapsed = 0.001 + (current_time - os_last_printout);
8313
8314 fprintf(file,
8315 "Pending flushes (fsync) log: " ULINTPF "; "
8316 "buffer pool: " ULINTPF "\n"
8317 ULINTPF " OS file reads, "
8318 ULINTPF " OS file writes, "
8319 ULINTPF " OS fsyncs\n",
8320 fil_n_pending_log_flushes,
8321 fil_n_pending_tablespace_flushes,
8322 os_n_file_reads,
8323 os_n_file_writes,
8324 os_n_fsyncs);
8325
8326 if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
8327 fprintf(file,
8328 ULINTPF " pending preads, "
8329 ULINTPF " pending pwrites\n",
8330 os_n_pending_reads,
8331 os_n_pending_writes);
8332 }
8333
8334 if (os_n_file_reads == os_n_file_reads_old) {
8335 avg_bytes_read = 0.0;
8336 } else {
8337 avg_bytes_read = (double) os_bytes_read_since_printout
8338 / (os_n_file_reads - os_n_file_reads_old);
8339 }
8340
8341 fprintf(file,
8342 "%.2f reads/s, %lu avg bytes/read,"
8343 " %.2f writes/s, %.2f fsyncs/s\n",
8344 (os_n_file_reads - os_n_file_reads_old)
8345 / time_elapsed,
8346 (ulong) avg_bytes_read,
8347 (os_n_file_writes - os_n_file_writes_old)
8348 / time_elapsed,
8349 (os_n_fsyncs - os_n_fsyncs_old)
8350 / time_elapsed);
8351
8352 os_n_file_reads_old = os_n_file_reads;
8353 os_n_file_writes_old = os_n_file_writes;
8354 os_n_fsyncs_old = os_n_fsyncs;
8355 os_bytes_read_since_printout = 0;
8356
8357 os_last_printout = current_time;
8358 }
8359
8360 /** Refreshes the statistics used to print per-second averages. */
8361 void
os_aio_refresh_stats()8362 os_aio_refresh_stats()
8363 {
8364 os_n_fsyncs_old = os_n_fsyncs;
8365
8366 os_bytes_read_since_printout = 0;
8367
8368 os_n_file_reads_old = os_n_file_reads;
8369
8370 os_n_file_writes_old = os_n_file_writes;
8371
8372 os_n_fsyncs_old = os_n_fsyncs;
8373
8374 os_bytes_read_since_printout = 0;
8375
8376 os_last_printout = ut_time_monotonic();
8377 }
8378
8379 /** Checks that all slots in the system have been freed, that is, there are
8380 no pending io operations.
8381 @return true if all free */
8382 bool
os_aio_all_slots_free()8383 os_aio_all_slots_free()
8384 {
8385 return(AIO::total_pending_io_count() == 0);
8386 }
8387
8388 #ifdef UNIV_DEBUG
8389 /** Prints all pending IO for the array
8390 @param[in] file file where to print
8391 @param[in] array array to process */
8392 void
to_file(FILE * file) const8393 AIO::to_file(FILE* file) const
8394 {
8395 acquire();
8396
8397 fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
8398
8399 for (ulint i = 0; i < m_slots.size(); ++i) {
8400
8401 const Slot& slot = m_slots[i];
8402
8403 if (slot.is_reserved) {
8404
8405 fprintf(file,
8406 "%s IO for %s (offset=" UINT64PF
8407 ", size=%lu)\n",
8408 slot.type.is_read() ? "read" : "write",
8409 slot.name, slot.offset, slot.len);
8410 }
8411 }
8412
8413 release();
8414 }
8415
8416 /** Print pending IOs for all arrays */
8417 void
print_to_file(FILE * file)8418 AIO::print_to_file(FILE* file)
8419 {
8420 fprintf(file, "Pending normal aio reads:");
8421
8422 s_reads->to_file(file);
8423
8424 if (s_writes != NULL) {
8425 fprintf(file, "Pending normal aio writes:");
8426 s_writes->to_file(file);
8427 }
8428
8429 if (s_ibuf != NULL) {
8430 fprintf(file, "Pending ibuf aio reads:");
8431 s_ibuf->to_file(file);
8432 }
8433
8434 if (s_log != NULL) {
8435 fprintf(file, "Pending log i/o's:");
8436 s_log->to_file(file);
8437 }
8438
8439 if (s_sync != NULL) {
8440 fprintf(file, "Pending sync i/o's:");
8441 s_sync->to_file(file);
8442 }
8443 }
8444
8445 /** Prints all pending IO
8446 @param[in] file File where to print */
8447 void
os_aio_print_pending_io(FILE * file)8448 os_aio_print_pending_io(
8449 FILE* file)
8450 {
8451 AIO::print_to_file(file);
8452 }
8453
8454 #endif /* UNIV_DEBUG */
8455
8456 /**
8457 Set the file create umask
8458 @param[in] umask The umask to use for file creation. */
8459 void
os_file_set_umask(ulint umask)8460 os_file_set_umask(ulint umask)
8461 {
8462 os_innodb_umask = umask;
8463 }
8464 #else
8465
8466 #include "univ.i"
8467 #include "db0err.h"
8468 #include "mach0data.h"
8469 #include "fil0fil.h"
8470 #include "os0file.h"
8471
8472 #include <lz4.h>
8473 #include <zlib.h>
8474
8475 #include <my_aes.h>
8476 #include <my_rnd.h>
8477 #include <mysqld.h>
8478 #include <mysql/service_mysql_keyring.h>
8479
/** In this (UNIV_INNOCHECKSUM) build a Block is a plain byte buffer. */
typedef byte	Block;
8481
8482 /** Allocate a page for sync IO
8483 @return pointer to page */
8484 static
8485 Block*
os_alloc_block()8486 os_alloc_block()
8487 {
8488 return(reinterpret_cast<byte*>(malloc(UNIV_PAGE_SIZE_MAX * 2)));
8489 }
8490
8491 /** Free a page after sync IO
8492 @param[in,own] block The block to free/release */
8493 static
8494 void
os_free_block(Block * block)8495 os_free_block(Block* block)
8496 {
8497 ut_free(block);
8498 }
8499
8500 #endif /* !UNIV_INNOCHECKSUM */
8501
/** Minimum length needed for encryption: two AES blocks plus
FIL_PAGE_DATA bytes. */
const unsigned int MIN_ENCRYPTION_LEN = 2 * MY_AES_BLOCK_SIZE + FIL_PAGE_DATA;
8504
8505 /**
8506 @param[in] type The compression type
8507 @return the string representation */
8508 const char*
to_string(Type type)8509 Compression::to_string(Type type)
8510 {
8511 switch(type) {
8512 case NONE:
8513 return("None");
8514 case ZLIB:
8515 return("Zlib");
8516 case LZ4:
8517 return("LZ4");
8518 }
8519
8520 ut_ad(0);
8521
8522 return("<UNKNOWN>");
8523 }
8524
8525 /**
8526 @param[in] meta Page Meta data
8527 @return the string representation */
to_string(const Compression::meta_t & meta)8528 std::string Compression::to_string(const Compression::meta_t& meta)
8529 {
8530 std::ostringstream stream;
8531
8532 stream << "version: " << int(meta.m_version) << " "
8533 << "algorithm: " << meta.m_algorithm << " "
8534 << "(" << to_string(meta.m_algorithm) << ") "
8535 << "orginal_type: " << meta.m_original_type << " "
8536 << "original_size: " << meta.m_original_size << " "
8537 << "compressed_size: " << meta.m_compressed_size;
8538
8539 return(stream.str());
8540 }
8541
8542 /** @return true if it is a compressed page */
8543 bool
is_compressed_page(const byte * page)8544 Compression::is_compressed_page(const byte* page)
8545 {
8546 return(mach_read_from_2(page + FIL_PAGE_TYPE) == FIL_PAGE_COMPRESSED);
8547 }
8548
8549 bool
is_compressed_encrypted_page(const byte * page)8550 Compression::is_compressed_encrypted_page(const byte *page) {
8551 return (mach_read_from_2(page + FIL_PAGE_TYPE) ==
8552 FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
8553 }
8554
8555 bool
is_valid_page_version(uint8_t version)8556 Compression::is_valid_page_version(uint8_t version) {
8557 return (version == FIL_PAGE_VERSION_1 || version == FIL_PAGE_VERSION_2);
8558 }
8559
8560 /** Deserizlise the page header compression meta-data
8561 @param[in] page Pointer to the page header
8562 @param[out] control Deserialised data */
8563 void
deserialize_header(const byte * page,Compression::meta_t * control)8564 Compression::deserialize_header(
8565 const byte* page,
8566 Compression::meta_t* control)
8567 {
8568 ut_ad(is_compressed_page(page) || is_compressed_encrypted_page(page));
8569
8570 control->m_version = static_cast<uint8_t>(
8571 mach_read_from_1(page + FIL_PAGE_VERSION));
8572
8573 control->m_original_type = static_cast<uint16_t>(
8574 mach_read_from_2(page + FIL_PAGE_ORIGINAL_TYPE_V1));
8575
8576 control->m_compressed_size = static_cast<uint16_t>(
8577 mach_read_from_2(page + FIL_PAGE_COMPRESS_SIZE_V1));
8578
8579 control->m_original_size = static_cast<uint16_t>(
8580 mach_read_from_2(page + FIL_PAGE_ORIGINAL_SIZE_V1));
8581
8582 control->m_algorithm = static_cast<Type>(
8583 mach_read_from_1(page + FIL_PAGE_ALGORITHM_V1));
8584 }
8585
/** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
not then the source contents are left unchanged and DB_SUCCESS is returned.
On success the decompressed data replaces the payload of src (the bytes
after the first FIL_PAGE_DATA bytes) and the page type in the header is set
back to the original type stored in the compression meta-data.
@param[in]	dblwr_recover	true if double write recovery is in progress
@param[in,out]	src		Data read from disk, decompressed data will be
				copied to this page
@param[in,out]	dst		Scratch area to use for decompression; if NULL
				a temporary block is allocated and freed here
@param[in]	dst_len		Size of the scratch area in bytes; it is
				validated even when dst is NULL
@return DB_SUCCESS, DB_CORRUPTION if the header fails validation,
DB_IO_DECOMPRESS_FAIL if the decompressor reports an error, or
DB_UNSUPPORTED for an unknown algorithm */
dberr_t
Compression::deserialize(
	bool		dblwr_recover,
	byte*		src,
	byte*		dst,
	ulint		dst_len)
{
	if (!is_compressed_page(src)) {
		/* There is nothing we can do. */
		return(DB_SUCCESS);
	}

	meta_t	header;

	deserialize_header(src, &header);

	/* The compressed payload follows the first FIL_PAGE_DATA bytes. */
	byte*	ptr = src + FIL_PAGE_DATA;

	ut_ad(is_valid_page_version(header.m_version));

	/* Reject an unknown version, an original size outside the accepted
	range, and a scratch area that is too small. */
	if (!is_valid_page_version(header.m_version)
	    || header.m_original_size < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8)
	    || header.m_original_size > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA
	    || dst_len < header.m_original_size + FIL_PAGE_DATA) {

		/* The last check could potentially return DB_OVERFLOW,
		the caller should be able to retry with a larger buffer. */

		return(DB_CORRUPTION);
	}

	/* Non-NULL only if the scratch area was allocated here; it is then
	freed on every exit path below. */
	Block*	block;

	/* The caller doesn't know what to expect */
	if (dst == NULL) {

		/* NOTE(review): the result of os_alloc_block() is used
		without a NULL check. */
		block = os_alloc_block();

#ifdef UNIV_INNOCHECKSUM
		dst = block;
#else
		dst = block->m_ptr;
#endif /* UNIV_INNOCHECKSUM */

	} else {
		block = NULL;
	}

	int		ret;
	Compression	compression;
	ulint		len = header.m_original_size;

	compression.m_type = static_cast<Compression::Type>(header.m_algorithm);

	switch(compression.m_type) {
	case Compression::ZLIB: {

		/* In: capacity of dst that zlib may use; out: number of
		bytes actually produced. */
		uLongf	zlen = header.m_original_size;

		if (uncompress(dst, &zlen, ptr, header.m_compressed_size)
		    != Z_OK) {

			if (block != NULL) {
				os_free_block(block);
			}

			return(DB_IO_DECOMPRESS_FAIL);
		}

		len = static_cast<ulint>(zlen);

		break;
	}

	case Compression::LZ4:

		if (dblwr_recover) {

			ret = LZ4_decompress_safe(
				reinterpret_cast<char*>(ptr),
				reinterpret_cast<char*>(dst),
				header.m_compressed_size,
				header.m_original_size);

		} else {

			/* This can potentially read beyond the input
			buffer if the data is malformed. According to
			the LZ4 documentation it is a little faster
			than the above function. When recovering from
			the double write buffer we can afford to use the
			slower function above. */

			ret = LZ4_decompress_fast(
				reinterpret_cast<char*>(ptr),
				reinterpret_cast<char*>(dst),
				header.m_original_size);
		}

		if (ret < 0) {

			if (block != NULL) {
				os_free_block(block);
			}

			return(DB_IO_DECOMPRESS_FAIL);
		}

		/* len keeps its initial value, header.m_original_size. */
		break;

	default:
#if !defined(UNIV_INNOCHECKSUM)
		ib::error()
			<< "Compression algorithm support missing: "
			<< Compression::to_string(compression.m_type);
#else
		fprintf(stderr, "Compression algorithm support missing: %s\n",
			Compression::to_string(compression.m_type));
#endif /* !UNIV_INNOCHECKSUM */

		if (block != NULL) {
			os_free_block(block);
		}

		return(DB_UNSUPPORTED);
	}

	/* Leave the header alone */
	memmove(src + FIL_PAGE_DATA, dst, len);

	/* Restore the page type the page had before compression. */
	mach_write_to_2(src + FIL_PAGE_TYPE, header.m_original_type);

	/* Outside of double write recovery, the 4 bytes at
	FIL_PAGE_LSN + 4 in the header must equal the 4 bytes stored
	FIL_PAGE_END_LSN_OLD_CHKSUM - 4 bytes before the end of the
	decompressed page. */
	ut_ad(dblwr_recover
	      || memcmp(src + FIL_PAGE_LSN + 4,
			src + (header.m_original_size + FIL_PAGE_DATA)
			- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);

	if (block != NULL) {
		os_free_block(block);
	}

	return(DB_SUCCESS);
}
8737
8738 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
8739 not then the source contents are left unchanged and DB_SUCCESS is returned.
8740 @param[in] dblwr_recover true of double write recovery in progress
8741 @param[in,out] src Data read from disk, decompressed data will be
8742 copied to this page
8743 @param[in,out] dst Scratch area to use for decompression
8744 @param[in] dst_len Size of the scratch area in bytes
8745 @return DB_SUCCESS or error code */
8746 dberr_t
os_file_decompress_page(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)8747 os_file_decompress_page(
8748 bool dblwr_recover,
8749 byte* src,
8750 byte* dst,
8751 ulint dst_len)
8752 {
8753 return(Compression::deserialize(dblwr_recover, src, dst, dst_len));
8754 }
8755
8756 /**
8757 @param[in] type The encryption type
8758 @return the string representation */
8759 const char*
to_string(Type type)8760 Encryption::to_string(Type type)
8761 {
8762 switch(type) {
8763 case NONE:
8764 return("N");
8765 case AES:
8766 return("Y");
8767 }
8768
8769 ut_ad(0);
8770
8771 return("<UNKNOWN>");
8772 }
8773
8774 /** Generate random encryption value for key and iv.
8775 @param[in,out] value Encryption value */
random_value(byte * value)8776 void Encryption::random_value(byte* value)
8777 {
8778 ut_ad(value != NULL);
8779
8780 my_rand_buffer(value, ENCRYPTION_KEY_LEN);
8781 }
8782
8783 /** Create new master key
8784 @param[in,out] master_key master key */
8785 void
create_master_key_v0(byte ** master_key)8786 Encryption::create_master_key_v0(byte** master_key)
8787 {
8788 #ifndef UNIV_INNOCHECKSUM
8789 char* key_type = NULL;
8790 size_t key_len;
8791 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8792 int ret;
8793
8794 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8795
8796 /* Generate new master key */
8797 sprintf(key_name, "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8798 server_id, master_key_id + 1);
8799
8800 /* We call key ring API to generate master key here. */
8801 ret = my_key_generate(key_name, "AES",
8802 NULL, ENCRYPTION_KEY_LEN);
8803
8804 /* We call key ring API to get master key here. */
8805 ret = my_key_fetch(key_name, &key_type, NULL,
8806 reinterpret_cast<void**>(master_key),
8807 &key_len);
8808
8809 if (ret) {
8810 ib::error() << "Encryption can't find master key, please check"
8811 " the keyring plugin is loaded.";
8812 *master_key = NULL;
8813 }
8814
8815 master_key_id++;
8816
8817 if (key_type) {
8818 my_free(key_type);
8819 }
8820 #endif
8821 }
8822
8823 /** Create new master key for key rotation.
8824 @param[in,out] master_key master key */
8825 void
create_master_key(byte ** master_key)8826 Encryption::create_master_key(byte** master_key)
8827 {
8828 #ifndef UNIV_INNOCHECKSUM
8829 char* key_type = NULL;
8830 size_t key_len;
8831 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8832 int ret;
8833
8834 /* If uuid does not match with current server uuid,
8835 set uuid as current server uuid. */
8836 if (strcmp(uuid, server_uuid) != 0) {
8837 memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8838 }
8839 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8840
8841 /* Generate new master key */
8842 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8843 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8844 uuid, master_key_id + 1);
8845
8846 /* We call key ring API to generate master key here. */
8847 ret = my_key_generate(key_name, "AES",
8848 NULL, ENCRYPTION_KEY_LEN);
8849
8850 /* We call key ring API to get master key here. */
8851 ret = my_key_fetch(key_name, &key_type, NULL,
8852 reinterpret_cast<void**>(master_key),
8853 &key_len);
8854
8855 if (ret || *master_key == NULL) {
8856 ib::error() << "Encryption can't find master key, please check"
8857 " the keyring plugin is loaded.";
8858 *master_key = NULL;
8859 } else {
8860 master_key_id++;
8861 }
8862
8863 if (key_type) {
8864 my_free(key_type);
8865 }
8866 #endif
8867 }
8868
8869 /** Get master key by key id.
8870 @param[in] master_key_id master key id
8871 @param[in] srv_uuid uuid of server instance
8872 @param[in,out] master_key master key */
8873 void
get_master_key(ulint master_key_id,char * srv_uuid,byte ** master_key)8874 Encryption::get_master_key(ulint master_key_id,
8875 char* srv_uuid,
8876 byte** master_key)
8877 {
8878 #ifndef UNIV_INNOCHECKSUM
8879 char* key_type = NULL;
8880 size_t key_len;
8881 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8882 int ret;
8883
8884 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8885
8886 if (srv_uuid != NULL) {
8887 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8888 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8889 srv_uuid, master_key_id);
8890 } else {
8891 /* For compitable with 5.7.11, we need to get master key with
8892 server id. */
8893 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8894 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8895 "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8896 server_id, master_key_id);
8897 }
8898
8899 /* We call key ring API to get master key here. */
8900 ret = my_key_fetch(key_name, &key_type, NULL,
8901 reinterpret_cast<void**>(master_key), &key_len);
8902
8903 if (key_type) {
8904 my_free(key_type);
8905 }
8906
8907 if (ret) {
8908 *master_key = NULL;
8909 ib::error() << "Encryption can't find master key, please check"
8910 " the keyring plugin is loaded.";
8911 }
8912
8913 #ifdef UNIV_ENCRYPT_DEBUG
8914 if (!ret && *master_key) {
8915 fprintf(stderr, "Fetched master key:%lu ", master_key_id);
8916 ut_print_buf(stderr, *master_key, key_len);
8917 fprintf(stderr, "\n");
8918 }
8919 #endif /* DEBUG_TDE */
8920
8921 #endif
8922 }
8923
/** Current master key id; 0 until the first master key has been created */
ulint	Encryption::master_key_id = 0;

/** Current uuid of server instance, kept as a NUL terminated string
(the buffer has one byte more than ENCRYPTION_SERVER_UUID_LEN) */
char	Encryption::uuid[ENCRYPTION_SERVER_UUID_LEN + 1] = {0};

/** Default max_version: the encryption information version reported by
get_master_key() unless it falls back to the server id based key name */
Encryption::Version Encryption::max_version = Encryption::ENCRYPTION_VERSION_2;
8932
8933 /** Get current master key and master key id
8934 @param[in,out] master_key_id master key id
8935 @param[in,out] master_key master key
8936 @param[in,out] version encryption information version */
8937 void
get_master_key(ulint * master_key_id,byte ** master_key,Encryption::Version * version)8938 Encryption::get_master_key(ulint* master_key_id,
8939 byte** master_key,
8940 Encryption::Version* version)
8941 {
8942 #ifndef UNIV_INNOCHECKSUM
8943 char* key_type = NULL;
8944 size_t key_len;
8945 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8946 int ret;
8947
8948 memset(key_name, 0, ENCRYPTION_KEY_LEN);
8949 *version = Encryption::max_version;
8950
8951 if (Encryption::master_key_id == 0) {
8952 /* If m_master_key is 0, means there's no encrypted
8953 tablespace, we need to generate the first master key,
8954 and store it to key ring. */
8955 memset(uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
8956 memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8957
8958 /* Prepare the server uuid. */
8959 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8960 "%s-%s-1", ENCRYPTION_MASTER_KEY_PRIFIX,
8961 uuid);
8962
8963 /* We call key ring API to generate master key here. */
8964 ret = my_key_generate(key_name, "AES",
8965 NULL, ENCRYPTION_KEY_LEN);
8966
8967 /* We call key ring API to get master key here. */
8968 ret = my_key_fetch(key_name, &key_type, NULL,
8969 reinterpret_cast<void**>(master_key),
8970 &key_len);
8971
8972 if (!ret && *master_key != NULL) {
8973 Encryption::master_key_id++;
8974 *master_key_id = Encryption::master_key_id;
8975 }
8976 #ifdef UNIV_ENCRYPT_DEBUG
8977 if (!ret && *master_key) {
8978 fprintf(stderr, "Generated new master key:");
8979 ut_print_buf(stderr, *master_key, key_len);
8980 fprintf(stderr, "\n");
8981 }
8982 #endif
8983 } else {
8984 *master_key_id = Encryption::master_key_id;
8985
8986 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8987 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8988 uuid, *master_key_id);
8989
8990 /* We call key ring API to get master key here. */
8991 ret = my_key_fetch(key_name, &key_type, NULL,
8992 reinterpret_cast<void**>(master_key),
8993 &key_len);
8994
8995 /* For compitable with 5.7.11, we need to try to get master key with
8996 server id when get master key with server uuid failure. */
8997 if (ret || *master_key == NULL) {
8998 if (key_type) {
8999 my_free(key_type);
9000 }
9001
9002 memset(key_name, 0,
9003 ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9004 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9005 "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9006 server_id, *master_key_id);
9007
9008 ret = my_key_fetch(key_name, &key_type, NULL,
9009 reinterpret_cast<void**>(master_key),
9010 &key_len);
9011 *version = Encryption::ENCRYPTION_VERSION_1;
9012 }
9013 #ifdef UNIV_ENCRYPT_DEBUG
9014 if (!ret && *master_key) {
9015 fprintf(stderr, "Fetched master key:%lu ",
9016 *master_key_id);
9017 ut_print_buf(stderr, *master_key, key_len);
9018 fprintf(stderr, "\n");
9019 }
9020 #endif
9021 }
9022
9023 if (ret) {
9024 *master_key = NULL;
9025 ib::error() << "Encryption can't find master key, please check"
9026 " the keyring plugin is loaded.";
9027 }
9028
9029 if (key_type) {
9030 my_free(key_type);
9031 }
9032 #endif
9033 }
9034
9035 /** Check if page is encrypted page or not
9036 @param[in] page page which need to check
9037 @return true if it is a encrypted page */
9038 bool
is_encrypted_page(const byte * page)9039 Encryption::is_encrypted_page(const byte* page)
9040 {
9041 ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
9042
9043 return(page_type == FIL_PAGE_ENCRYPTED
9044 || page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED
9045 || page_type == FIL_PAGE_ENCRYPTED_RTREE);
9046 }
9047
9048 /** Encrypt the page data contents. Page type can't be
9049 FIL_PAGE_ENCRYPTED, FIL_PAGE_COMPRESSED_AND_ENCRYPTED,
9050 FIL_PAGE_ENCRYPTED_RTREE.
9051 @param[in] type IORequest
9052 @param[in,out] src page data which need to encrypt
9053 @param[in] src_len Size of the source in bytes
9054 @param[in,out] dst destination area
9055 @param[in,out] dst_len Size of the destination in bytes
9056 @return buffer data, dst_len will have the length of the data */
9057 byte*
encrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint * dst_len)9058 Encryption::encrypt(
9059 const IORequest& type,
9060 byte* src,
9061 ulint src_len,
9062 byte* dst,
9063 ulint* dst_len)
9064 {
9065 ut_ad(m_type != NONE);
9066 ut_ad(!type.is_log());
9067 #ifdef UNIV_ENCRYPT_DEBUG
9068 ulint space_id =
9069 mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9070 ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9071
9072 fprintf(stderr, "Encrypting page:%lu.%lu len:%lu\n",
9073 space_id, page_no, src_len);
9074 #endif
9075
9076 /* Shouldn't encrypte an already encrypted page. */
9077 ut_ad(!is_encrypted_page(src));
9078
9079 const uint16_t page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9080
9081 /* This is data size which need to encrypt. */
9082 ulint src_enc_len = src_len;
9083
9084 /* In FIL_PAGE_VERSION_2, we encrypt the actual compressed data length. */
9085 if (page_type == FIL_PAGE_COMPRESSED) {
9086 src_enc_len = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1) +
9087 FIL_PAGE_DATA;
9088 /* Extend src_enc_len if needed */
9089 if (src_enc_len < MIN_ENCRYPTION_LEN) {
9090 src_enc_len = MIN_ENCRYPTION_LEN;
9091 }
9092 ut_a(src_enc_len <= src_len);
9093 }
9094
9095 /* Only encrypt the data + trailer, leave the header alone */
9096
9097 switch (m_type) {
9098 case Encryption::NONE:
9099 ut_error;
9100
9101 case Encryption::AES: {
9102 ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9103
9104 /* Total length of the data to encrypt. */
9105 const ulint data_len = src_enc_len - FIL_PAGE_DATA;
9106
9107 /* Server encryption functions expect input data to be in
9108 multiples of MY_AES_BLOCK SIZE. Therefore we encrypt the
9109 overlapping data of the chunk_len and trailer_len twice.
9110 First we encrypt the bigger chunk of data then we do the
9111 trailer. The trailer encryption block starts at
9112 2 * MY_AES_BLOCK_SIZE bytes offset from the end of the enc_len.
9113 During decryption we do the reverse of the above process. */
9114 ut_ad(data_len >= 2 * MY_AES_BLOCK_SIZE);
9115
9116 const ulint chunk_len =
9117 (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9118 const ulint remain_len = data_len - chunk_len;
9119
9120 lint elen = my_aes_encrypt(
9121 src + FIL_PAGE_DATA, static_cast<uint32>(chunk_len),
9122 dst + FIL_PAGE_DATA, reinterpret_cast<byte *>(m_key),
9123 static_cast<uint32>(m_klen), my_aes_256_cbc,
9124 reinterpret_cast<byte *>(m_iv), false);
9125
9126 if (elen == MY_AES_BAD_DATA) {
9127 ulint page_no =mach_read_from_4(
9128 src + FIL_PAGE_OFFSET);
9129 ulint space_id = mach_read_from_4(
9130 src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9131 *dst_len = src_len;
9132 #ifndef UNIV_INNOCHECKSUM
9133 ib::warn()
9134 << " Can't encrypt data of page,"
9135 << " page no:" << page_no
9136 << " space id:" << space_id;
9137 #else
9138 fprintf(stderr, " Can't encrypt data of page,"
9139 " page no:" ULINTPF
9140 " space id:" ULINTPF,
9141 page_no, space_id);
9142 #endif /* !UNIV_INNOCHECKSUM */
9143 return(src);
9144 }
9145
9146 const ulint len = static_cast<ulint>(elen);
9147 ut_ad(len == chunk_len);
9148
9149 /* Encrypt the trailing bytes. */
9150 if (remain_len != 0) {
9151 /* Copy remaining bytes and page tailer. */
9152 memcpy(dst + FIL_PAGE_DATA + len,
9153 src + FIL_PAGE_DATA + len,
9154 remain_len);
9155
9156 const ulint trailer_len = MY_AES_BLOCK_SIZE * 2;
9157 byte buf[trailer_len];
9158
9159 elen = my_aes_encrypt(
9160 dst + FIL_PAGE_DATA + data_len - trailer_len,
9161 static_cast<uint32>(trailer_len), buf,
9162 reinterpret_cast<unsigned char*>(m_key),
9163 static_cast<uint32>(m_klen), my_aes_256_cbc,
9164 reinterpret_cast<byte *>(m_iv), false);
9165
9166 if (elen == MY_AES_BAD_DATA) {
9167 ulint page_no =mach_read_from_4(
9168 src + FIL_PAGE_OFFSET);
9169 ulint space_id = mach_read_from_4(
9170 src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9171 #ifndef UNIV_INNOCHECKSUM
9172 ib::warn()
9173 << " Can't encrypt data of page,"
9174 << " page no:" << page_no
9175 << " space id:" << space_id;
9176 #else
9177 fprintf(stderr, " Can't encrypt data of page,"
9178 " page no:" ULINTPF
9179 " space id:" ULINTPF,
9180 page_no, space_id);
9181 #endif /* !UNIV_INNOCHECKSUM */
9182 *dst_len = src_len;
9183 return(src);
9184 }
9185
9186 ut_a(static_cast<ulint>(elen) == trailer_len);
9187
9188 memcpy(dst + FIL_PAGE_DATA + data_len - trailer_len,
9189 buf, trailer_len);
9190 }
9191
9192
9193 break;
9194 }
9195
9196 default:
9197 ut_error;
9198 }
9199
9200 /* Copy the header as is. */
9201 memmove(dst, src, FIL_PAGE_DATA);
9202 ut_ad(memcmp(src, dst, FIL_PAGE_DATA) == 0);
9203
9204 /* Add encryption control information. Required for decrypting. */
9205 if (page_type == FIL_PAGE_COMPRESSED) {
9206 /* If the page is compressed, we don't need to save the
9207 original type, since it is done in compression already. */
9208 mach_write_to_2(dst + FIL_PAGE_TYPE,
9209 FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9210 ut_ad(memcmp(src+FIL_PAGE_TYPE+2,
9211 dst+FIL_PAGE_TYPE+2,
9212 FIL_PAGE_DATA-FIL_PAGE_TYPE-2) == 0);
9213 } else if (page_type == FIL_PAGE_RTREE) {
9214 /* If the page is R-tree page, we need to save original type. */
9215 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED_RTREE);
9216 } else{
9217 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED);
9218 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
9219 }
9220
9221 #ifdef UNIV_ENCRYPT_DEBUG
9222 #ifndef UNIV_INNOCHECKSUM
9223 #if 0
9224 byte* check_buf = static_cast<byte*>(ut_malloc_nokey(src_len));
9225 byte* buf2 = static_cast<byte*>(ut_malloc_nokey(src_len));
9226
9227 memcpy(check_buf, dst, src_len);
9228
9229 dberr_t err = decrypt(type, check_buf, src_len, buf2, src_len);
9230 if (err != DB_SUCCESS || memcmp(src + FIL_PAGE_DATA,
9231 check_buf + FIL_PAGE_DATA,
9232 src_len - FIL_PAGE_DATA) != 0) {
9233 ut_print_buf(stderr, src, src_len);
9234 ut_print_buf(stderr, check_buf, src_len);
9235 ut_ad(0);
9236 }
9237 ut_free(buf2);
9238 ut_free(check_buf);
9239 #endif
9240 fprintf(stderr, "Encrypted page:%lu.%lu\n", space_id, page_no);
9241 #endif
9242 #endif
9243
9244 /* Add padding 0 for unused portion */
9245 if (src_len > src_enc_len) {
9246 memset(dst + src_enc_len, 0, src_len - src_enc_len);
9247 }
9248
9249 *dst_len = src_len;
9250
9251 return(dst);
9252 }
9253
9254 /** Decrypt the page data contents. Page type must be FIL_PAGE_ENCRYPTED,
9255 if not then the source contents are left unchanged and DB_SUCCESS is returned.
9256 @param[in] type IORequest
9257 @param[in,out] src Data read from disk, decrypted data will be
9258 copied to this page
9259 @param[in] src_len source data length
9260 @param[in,out] dst Scratch area to use for decryption
9261 @param[in] dst_len Size of the scratch area in bytes
9262 @return DB_SUCCESS or error code */
9263 dberr_t
decrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)9264 Encryption::decrypt(
9265 const IORequest& type,
9266 byte* src,
9267 ulint src_len,
9268 byte* dst,
9269 ulint dst_len)
9270 {
9271 ulint data_len;
9272 ulint main_len;
9273 ulint remain_len;
9274 ulint original_type;
9275 ulint page_type;
9276 byte remain_buf[MY_AES_BLOCK_SIZE * 2];
9277 Block* block;
9278
9279 /* Do nothing if it's not an encrypted table. */
9280 if (!is_encrypted_page(src)) {
9281 return(DB_SUCCESS);
9282 }
9283
9284 /* For compressed page, we need to get the compressed size
9285 for decryption */
9286 page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9287 if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
9288 src_len = static_cast<uint16_t>(
9289 mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1))
9290 + FIL_PAGE_DATA;
9291 #ifndef UNIV_INNOCHECKSUM
9292 Compression::meta_t header;
9293 Compression::deserialize_header(src, &header);
9294 if (header.m_version == Compression::FIL_PAGE_VERSION_1) {
9295 src_len = ut_calc_align(src_len, type.block_size());
9296 } else {
9297 /* Extend src_len if needed */
9298 if (src_len < MIN_ENCRYPTION_LEN) {
9299 src_len = MIN_ENCRYPTION_LEN;
9300 }
9301 }
9302 #endif
9303 }
9304 #ifdef UNIV_ENCRYPT_DEBUG
9305 ulint space_id =
9306 mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9307 ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9308
9309 fprintf(stderr, "Decrypting page:%lu.%lu len:%lu\n",
9310 space_id, page_no, src_len);
9311 #endif
9312
9313 original_type = static_cast<uint16_t>(
9314 mach_read_from_2(src + FIL_PAGE_ORIGINAL_TYPE_V1));
9315
9316 byte* ptr = src + FIL_PAGE_DATA;
9317
9318 /* The caller doesn't know what to expect */
9319 if (dst == NULL) {
9320
9321 block = os_alloc_block();
9322 #ifdef UNIV_INNOCHECKSUM
9323 dst = block;
9324 #else
9325 dst = block->m_ptr;
9326 #endif /* UNIV_INNOCHECKSUM */
9327
9328 } else {
9329 block = NULL;
9330 }
9331
9332 data_len = src_len - FIL_PAGE_DATA;
9333 main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9334 remain_len = data_len - main_len;
9335
9336 switch(m_type) {
9337 case Encryption::AES: {
9338 lint elen;
9339
9340 /* First decrypt the last 2 blocks data of data, since
9341 data is no block aligned. */
9342 if (remain_len != 0) {
9343 ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9344
9345 remain_len = MY_AES_BLOCK_SIZE * 2;
9346
9347 /* Copy the last 2 blocks. */
9348 memcpy(remain_buf,
9349 ptr + data_len - remain_len,
9350 remain_len);
9351
9352 elen = my_aes_decrypt(
9353 remain_buf,
9354 static_cast<uint32>(remain_len),
9355 dst + data_len - remain_len,
9356 reinterpret_cast<unsigned char*>(m_key),
9357 static_cast<uint32>(m_klen),
9358 my_aes_256_cbc,
9359 reinterpret_cast<unsigned char*>(m_iv),
9360 false);
9361 if (elen == MY_AES_BAD_DATA) {
9362 if (block != NULL) {
9363 os_free_block(block);
9364 }
9365
9366 return(DB_IO_DECRYPT_FAIL);
9367 }
9368
9369 /* Copy the other data bytes to temp area. */
9370 memcpy(dst, ptr, data_len - remain_len);
9371 } else {
9372 ut_ad(data_len == main_len);
9373
9374 /* Copy the data bytes to temp area. */
9375 memcpy(dst, ptr, data_len);
9376 }
9377
9378 /* Then decrypt the main data */
9379 elen = my_aes_decrypt(
9380 dst,
9381 static_cast<uint32>(main_len),
9382 ptr,
9383 reinterpret_cast<unsigned char*>(m_key),
9384 static_cast<uint32>(m_klen),
9385 my_aes_256_cbc,
9386 reinterpret_cast<unsigned char*>(m_iv),
9387 false);
9388 if (elen == MY_AES_BAD_DATA) {
9389
9390 if (block != NULL) {
9391 os_free_block(block);
9392 }
9393
9394 return(DB_IO_DECRYPT_FAIL);
9395 }
9396
9397 ut_ad(static_cast<ulint>(elen) == main_len);
9398
9399 /* Copy the remain bytes. */
9400 memcpy(ptr + main_len, dst + main_len, data_len - main_len);
9401
9402 break;
9403 }
9404
9405 default:
9406 if (!type.is_dblwr_recover()) {
9407 #if !defined(UNIV_INNOCHECKSUM)
9408 ib::error()
9409 << "Encryption algorithm support missing: "
9410 << Encryption::to_string(m_type);
9411 #else
9412 fprintf(stderr, "Encryption algorithm support missing: %s\n",
9413 Encryption::to_string(m_type));
9414 #endif /* !UNIV_INNOCHECKSUM */
9415 }
9416
9417 if (block != NULL) {
9418 os_free_block(block);
9419 }
9420
9421 return(DB_UNSUPPORTED);
9422 }
9423
9424 /* Restore the original page type. If it's a compressed and
9425 encrypted page, just reset it as compressed page type, since
9426 we will do uncompress later. */
9427
9428 if (page_type == FIL_PAGE_ENCRYPTED) {
9429 mach_write_to_2(src + FIL_PAGE_TYPE, original_type);
9430 mach_write_to_2(src + FIL_PAGE_ORIGINAL_TYPE_V1, 0);
9431 } else if (page_type == FIL_PAGE_ENCRYPTED_RTREE) {
9432 mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_RTREE);
9433 } else {
9434 ut_ad(page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9435 mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
9436 }
9437
9438 if (block != NULL) {
9439 os_free_block(block);
9440 }
9441
9442 #ifdef UNIV_ENCRYPT_DEBUG
9443 fprintf(stderr, "Decrypted page:%lu.%lu\n", space_id, page_no);
9444 #endif
9445
9446 DBUG_EXECUTE_IF("ib_crash_during_decrypt_page", DBUG_SUICIDE(););
9447
9448 return(DB_SUCCESS);
9449 }
9450
9451 /** Normalizes a directory path for the current OS:
9452 On Windows, we convert '/' to '\', else we convert '\' to '/'.
9453 @param[in,out] str A null-terminated directory and file path */
9454 void
os_normalize_path(char * str)9455 os_normalize_path(
9456 char* str)
9457 {
9458 if (str != NULL) {
9459 for (; *str; str++) {
9460 if (*str == OS_PATH_SEPARATOR_ALT) {
9461 *str = OS_PATH_SEPARATOR;
9462 }
9463 }
9464 }
9465 }
9466