1 /***********************************************************************
2
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, Percona Inc.
5
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation. The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License, version 2.0, for more details.
28
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32
33 ***********************************************************************/
34
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41
42 #ifndef UNIV_INNOCHECKSUM
43
44 #include "ha_prototypes.h"
45 #include "sql_const.h"
46
47 #include "os0file.h"
48
49 #ifdef UNIV_NONINL
50 #include "os0file.ic"
51 #endif
52
53 #include "page0page.h"
54 #include "srv0srv.h"
55 #include "srv0start.h"
56 #include "fil0fil.h"
57 #ifndef UNIV_HOTBACKUP
58 # include "os0event.h"
59 # include "os0thread.h"
60 #else /* !UNIV_HOTBACKUP */
61 # ifdef _WIN32
62 /* Add includes for the _stat() call to compile on Windows */
63 # include <sys/types.h>
64 # include <sys/stat.h>
65 # include <errno.h>
66 # endif /* _WIN32 */
67 #endif /* !UNIV_HOTBACKUP */
68
69 #include <vector>
70 #include <functional>
71
72 #ifdef LINUX_NATIVE_AIO
73 #include <libaio.h>
74 #endif /* LINUX_NATIVE_AIO */
75
76 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
77 # include <fcntl.h>
78 # include <linux/falloc.h>
79 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
80
81 #include <lz4.h>
82 #include <zlib.h>
83
84 #ifdef UNIV_DEBUG
85 /** Set when InnoDB has invoked exit(). */
86 bool innodb_calling_exit;
87 #endif /* UNIV_DEBUG */
88
89 #include <my_aes.h>
90 #include <my_rnd.h>
91 #include <mysqld.h>
92 #include <mysql/service_mysql_keyring.h>
93
94 /** Insert buffer segment id */
95 static const ulint IO_IBUF_SEGMENT = 0;
96
97 /** Log segment id */
98 static const ulint IO_LOG_SEGMENT = 1;
99
100 /** Number of retries for partial I/O's */
101 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
102
103 /** Blocks for doing IO, used in the transparent compression
104 and encryption code. */
105 struct Block {
106 /** Default constructor */
BlockBlock107 Block() : m_ptr(), m_in_use() { }
108
109 byte* m_ptr;
110
111 byte pad[CACHE_LINE_SIZE - sizeof(ulint)];
112 lock_word_t m_in_use;
113 };
114
115 /** For storing the allocated blocks */
116 typedef std::vector<Block> Blocks;
117
118 /** Block collection */
119 static Blocks* block_cache;
120
121 /** Number of blocks to allocate for sync read/writes */
122 static const size_t MAX_BLOCKS = 128;
123
124 /** Block buffer size */
125 #define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))
126
127 /** Disk sector size of aligning write buffer for DIRECT_IO */
128 static ulint os_io_ptr_align = UNIV_SECTOR_SIZE;
129
130 /* This specifies the file permissions InnoDB uses when it creates files in
131 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
132 my_umask */
133
134 #ifndef _WIN32
135 /** Umask for creating files */
136 static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
137 #else
138 /** Umask for creating files */
139 static ulint os_innodb_umask = 0;
140
141 /* On Windows when using native AIO the number of AIO requests
142 that a thread can handle at a given time is limited to 32
143 i.e.: SRV_N_PENDING_IOS_PER_THREAD */
144 #define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD
145
146 #endif /* _WIN32 */
147
148 #ifndef UNIV_HOTBACKUP
149
150 /** In simulated aio, merge at most this many consecutive i/os */
151 static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;
152
153 /** Flag indicating if the page_cleaner is in active state. */
154 extern bool buf_page_cleaner_is_active;
155
156 /**********************************************************************
157
158 InnoDB AIO Implementation:
159 =========================
160
161 We support native AIO for Windows and Linux. For rest of the platforms
162 we simulate AIO by special IO-threads servicing the IO-requests.
163
164 Simulated AIO:
165 ==============
166
167 On platforms where we 'simulate' AIO, the following is a rough explanation
168 of the high level design.
169 There are four io-threads (for ibuf, log, read, write).
170 All synchronous IO requests are serviced by the calling thread using
171 os_file_write/os_file_read. The Asynchronous requests are queued up
172 in an array (there are four such arrays) by the calling thread.
173 Later these requests are picked up by the IO-thread and are serviced
174 synchronously.
175
176 Windows native AIO:
177 ==================
178
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
184 There are innodb_file_io_threads helper threads. These threads work
185 on the four arrays mentioned above in Simulated AIO. No thread is
186 required for the sync array.
187 If a synchronous IO request is made, it is first queued in the sync
188 array. Then the calling thread itself waits on the request, thus
189 making the call synchronous.
190 If an AIO request is made the calling thread not only queues it in the
191 array but also submits the requests. The helper thread then collects
192 the completed IO request and calls completion routine on it.
193
194 Linux native AIO:
195 =================
196
197 If we have libaio installed on the system and innodb_use_native_aio
198 is set to true we follow the code path of native AIO, otherwise we
199 do simulated AIO.
200 There are innodb_file_io_threads helper threads. These threads work
201 on the four arrays mentioned above in Simulated AIO.
202 If a synchronous IO request is made, it is handled by calling
203 os_file_write/os_file_read.
204 If an AIO request is made the calling thread not only queues it in the
205 array but also submits the requests. The helper thread then collects
206 the completed IO request and calls completion routine on it.
207
208 **********************************************************************/
209
210
211 #ifdef UNIV_PFS_IO
212 /* Keys to register InnoDB I/O with performance schema */
213 mysql_pfs_key_t innodb_data_file_key;
214 mysql_pfs_key_t innodb_log_file_key;
215 mysql_pfs_key_t innodb_temp_file_key;
216 #endif /* UNIV_PFS_IO */
217
218 /** The asynchronous I/O context */
219 struct Slot {
SlotSlot220 Slot() { memset(this, 0, sizeof(*this)); }
221
222 /** index of the slot in the aio array */
223 uint16_t pos;
224
225 /** true if this slot is reserved */
226 bool is_reserved;
227
228 /** time when reserved */
229 ib_time_monotonic_t reservation_time;
230
231 /** buffer used in i/o */
232 byte* buf;
233
234 /** Buffer pointer used for actual IO. We advance this
235 when partial IO is required and not buf */
236 byte* ptr;
237
238 /** OS_FILE_READ or OS_FILE_WRITE */
239 IORequest type;
240
241 /** file offset in bytes */
242 os_offset_t offset;
243
244 /** file where to read or write */
245 pfs_os_file_t file;
246
247 /** file name or path */
248 const char* name;
249
250 /** used only in simulated aio: true if the physical i/o
251 already made and only the slot message needs to be passed
252 to the caller of os_aio_simulated_handle */
253 bool io_already_done;
254
255 /** The file node for which the IO is requested. */
256 fil_node_t* m1;
257
258 /** the requester of an aio operation and which can be used
259 to identify which pending aio operation was completed */
260 void* m2;
261
262 /** AIO completion status */
263 dberr_t err;
264
265 #ifdef WIN_ASYNC_IO
266 /** handle object we need in the OVERLAPPED struct */
267 HANDLE handle;
268
269 /** Windows control block for the aio request */
270 OVERLAPPED control;
271
272 /** bytes written/read */
273 DWORD n_bytes;
274
275 /** length of the block to read or write */
276 DWORD len;
277
278 #elif defined(LINUX_NATIVE_AIO)
279 /** Linux control block for aio */
280 struct iocb control;
281
282 /** AIO return code */
283 int ret;
284
285 /** bytes written/read. */
286 ssize_t n_bytes;
287
288 /** length of the block to read or write */
289 ulint len;
290 #else
291 /** length of the block to read or write */
292 ulint len;
293
294 /** bytes written/read. */
295 ulint n_bytes;
296 #endif /* WIN_ASYNC_IO */
297
298 /** Length of the block before it was compressed */
299 uint32 original_len;
300
301 /** Buffer block for compressed pages or encrypted pages */
302 Block* buf_block;
303
304 /** true, if we shouldn't punch a hole after writing the page */
305 bool skip_punch_hole;
306 };
307
308 /** The asynchronous i/o array structure */
309 class AIO {
310 public:
311 /** Constructor
312 @param[in] id Latch ID
313 @param[in] n_slots Number of slots to configure
314 @param[in] segments Number of segments to configure */
315 AIO(latch_id_t id, ulint n_slots, ulint segments);
316
317 /** Destructor */
318 ~AIO();
319
320 /** Initialize the instance
321 @return DB_SUCCESS or error code */
322 dberr_t init();
323
324 /** Requests for a slot in the aio array. If no slot is available, waits
325 until not_full-event becomes signaled.
326
327 @param[in,out] type IO context
328 @param[in,out] m1 message to be passed along with the AIO
329 operation
330 @param[in,out] m2 message to be passed along with the AIO
331 operation
332 @param[in] file file handle
333 @param[in] name name of the file or path as a null-terminated
334 string
335 @param[in,out] buf buffer where to read or from which to write
336 @param[in] offset file offset, where to read from or start writing
337 @param[in] len length of the block to read or write
338 @return pointer to slot */
339 Slot* reserve_slot(
340 IORequest& type,
341 fil_node_t* m1,
342 void* m2,
343 pfs_os_file_t file,
344 const char* name,
345 void* buf,
346 os_offset_t offset,
347 ulint len)
348 MY_ATTRIBUTE((warn_unused_result));
349
350 /** @return number of reserved slots */
351 ulint pending_io_count() const;
352
353 /** Returns a pointer to the nth slot in the aio array.
354 @param[in] index Index of the slot in the array
355 @return pointer to slot */
at(ulint i) const356 const Slot* at(ulint i) const
357 MY_ATTRIBUTE((warn_unused_result))
358 {
359 ut_a(i < m_slots.size());
360
361 return(&m_slots[i]);
362 }
363
364 /** Non const version */
at(ulint i)365 Slot* at(ulint i)
366 MY_ATTRIBUTE((warn_unused_result))
367 {
368 ut_a(i < m_slots.size());
369
370 return(&m_slots[i]);
371 }
372
373 /** Frees a slot in the AIO array, assumes caller owns the mutex.
374 @param[in,out] slot Slot to release */
375 void release(Slot* slot);
376
377 /** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
378 @param[in,out] slot Slot to release */
379 void release_with_mutex(Slot* slot);
380
381 /** Prints info about the aio array.
382 @param[in,out] file Where to print */
383 void print(FILE* file);
384
385 /** @return the number of slots per segment */
slots_per_segment() const386 ulint slots_per_segment() const
387 MY_ATTRIBUTE((warn_unused_result))
388 {
389 return(m_slots.size() / m_n_segments);
390 }
391
392 /** @return accessor for n_segments */
get_n_segments() const393 ulint get_n_segments() const
394 MY_ATTRIBUTE((warn_unused_result))
395 {
396 return(m_n_segments);
397 }
398
399 #ifdef UNIV_DEBUG
400 /** @return true if the thread owns the mutex */
is_mutex_owned() const401 bool is_mutex_owned() const
402 MY_ATTRIBUTE((warn_unused_result))
403 {
404 return(mutex_own(&m_mutex));
405 }
406 #endif /* UNIV_DEBUG */
407
408 /** Acquire the mutex */
acquire() const409 void acquire() const
410 {
411 mutex_enter(&m_mutex);
412 }
413
414 /** Release the mutex */
release() const415 void release() const
416 {
417 mutex_exit(&m_mutex);
418 }
419
420 /** Write out the state to the file/stream
421 @param[in, out] file File to write to */
422 void to_file(FILE* file) const;
423
424 #ifdef LINUX_NATIVE_AIO
425 /** Dispatch an AIO request to the kernel.
426 @param[in,out] slot an already reserved slot
427 @return true on success. */
428 bool linux_dispatch(Slot* slot)
429 MY_ATTRIBUTE((warn_unused_result));
430
431 /** Accessor for an AIO event
432 @param[in] index Index into the array
433 @return the event at the index */
io_events(ulint index)434 io_event* io_events(ulint index)
435 MY_ATTRIBUTE((warn_unused_result))
436 {
437 ut_a(index < m_events.size());
438
439 return(&m_events[index]);
440 }
441
442 /** Accessor for the AIO context
443 @param[in] segment Segment for which to get the context
444 @return the AIO context for the segment */
io_ctx(ulint segment)445 io_context* io_ctx(ulint segment)
446 MY_ATTRIBUTE((warn_unused_result))
447 {
448 ut_ad(segment < get_n_segments());
449
450 return(m_aio_ctx[segment]);
451 }
452
453 /** Creates an io_context for native linux AIO.
454 @param[in] max_events number of events
455 @param[out] io_ctx io_ctx to initialize.
456 @return true on success. */
457 static bool linux_create_io_ctx(ulint max_events, io_context_t* io_ctx)
458 MY_ATTRIBUTE((warn_unused_result));
459
460 /** Checks if the system supports native linux aio. On some kernel
461 versions where native aio is supported it won't work on tmpfs. In such
462 cases we can't use native aio as it is not possible to mix simulated
463 and native aio.
464 @return true if supported, false otherwise. */
465 static bool is_linux_native_aio_supported()
466 MY_ATTRIBUTE((warn_unused_result));
467 #endif /* LINUX_NATIVE_AIO */
468
469 #ifdef WIN_ASYNC_IO
470 /** Wakes up all async i/o threads in the array in Windows async I/O at
471 shutdown. */
signal()472 void signal()
473 {
474 for (ulint i = 0; i < m_slots.size(); ++i) {
475 SetEvent(m_slots[i].handle);
476 }
477 }
478
479 /** Wake up all AIO threads in Windows native aio */
wake_at_shutdown()480 static void wake_at_shutdown()
481 {
482 s_reads->signal();
483
484 if (s_writes != NULL) {
485 s_writes->signal();
486 }
487
488 if (s_ibuf != NULL) {
489 s_ibuf->signal();
490 }
491
492 if (s_log != NULL) {
493 s_log->signal();
494 }
495 }
496 #endif /* WIN_ASYNC_IO */
497
498 #ifdef _WIN32
499 /** This function can be called if one wants to post a batch of reads
500 and prefers an I/O - handler thread to handle them all at once later.You
501 must call os_aio_simulated_wake_handler_threads later to ensure the
502 threads are not left sleeping! */
503 static void simulated_put_read_threads_to_sleep();
504
505 /** The non asynchronous IO array.
506 @return the synchronous AIO array instance. */
sync_array()507 static AIO* sync_array()
508 MY_ATTRIBUTE((warn_unused_result))
509 {
510 return(s_sync);
511 }
512
513 /**
514 Get the AIO handles for a segment.
515 @param[in] segment The local segment.
516 @return the handles for the segment. */
handles(ulint segment)517 HANDLE* handles(ulint segment)
518 MY_ATTRIBUTE((warn_unused_result))
519 {
520 ut_ad(segment < m_handles->size() / slots_per_segment());
521
522 return(&(*m_handles)[segment * slots_per_segment()]);
523 }
524
525 /** @return true if no slots are reserved */
is_empty() const526 bool is_empty() const
527 MY_ATTRIBUTE((warn_unused_result))
528 {
529 ut_ad(is_mutex_owned());
530 return(m_n_reserved == 0);
531 }
532 #endif /* _WIN32 */
533
534 /** Create an instance using new(std::nothrow)
535 @param[in] id Latch ID
536 @param[in] n_slots The number of AIO request slots
537 @param[in] segments The number of segments
538 @return a new AIO instance */
539 static AIO* create(
540 latch_id_t id,
541 ulint n_slots,
542 ulint segments)
543 MY_ATTRIBUTE((warn_unused_result));
544
545 /** Initializes the asynchronous io system. Creates one array each
546 for ibuf and log I/O. Also creates one array each for read and write
547 where each array is divided logically into n_readers and n_writers
548 respectively. The caller must create an i/o handler thread for each
549 segment in these arrays. This function also creates the sync array.
550 No I/O handler thread needs to be created for that
551 @param[in] n_per_seg maximum number of pending aio
552 operations allowed per segment
553 @param[in] n_readers number of reader threads
554 @param[in] n_writers number of writer threads
555 @param[in] n_slots_sync number of slots in the sync aio array
556 @return true if AIO sub-system was started successfully */
557 static bool start(
558 ulint n_per_seg,
559 ulint n_readers,
560 ulint n_writers,
561 ulint n_slots_sync)
562 MY_ATTRIBUTE((warn_unused_result));
563
564 /** Free the AIO arrays */
565 static void shutdown();
566
567 /** Print all the AIO segments
568 @param[in,out] file Where to print */
569 static void print_all(FILE* file);
570
571 /** Calculates local segment number and aio array from global
572 segment number.
573 @param[out] array AIO wait array
574 @param[in] segment global segment number
575 @return local segment number within the aio array */
576 static ulint get_array_and_local_segment(
577 AIO** array,
578 ulint segment)
579 MY_ATTRIBUTE((warn_unused_result));
580
581 /** Select the IO slot array
582 @param[in] type Type of IO, READ or WRITE
583 @param[in] read_only true if running in read-only mode
584 @param[in] mode IO mode
585 @return slot array or NULL if invalid mode specified */
586 static AIO* select_slot_array(
587 IORequest& type,
588 bool read_only,
589 ulint mode)
590 MY_ATTRIBUTE((warn_unused_result));
591
592 /** Calculates segment number for a slot.
593 @param[in] array AIO wait array
594 @param[in] slot slot in this array
595 @return segment number (which is the number used by, for example,
596 I/O handler threads) */
597 static ulint get_segment_no_from_slot(
598 const AIO* array,
599 const Slot* slot)
600 MY_ATTRIBUTE((warn_unused_result));
601
602 /** Wakes up a simulated AIO I/O-handler thread if it has something
603 to do.
604 @param[in] global_segment the number of the segment in the
605 AIO arrays */
606 static void wake_simulated_handler_thread(ulint global_segment);
607
608 /** Check if it is a read request
609 @param[in] aio The AIO instance to check
610 @return true if the AIO instance is for reading. */
is_read(const AIO * aio)611 static bool is_read(const AIO* aio)
612 MY_ATTRIBUTE((warn_unused_result))
613 {
614 return(s_reads == aio);
615 }
616
617 /** Wait on an event until no pending writes */
wait_until_no_pending_writes()618 static void wait_until_no_pending_writes()
619 {
620 os_event_wait(AIO::s_writes->m_is_empty);
621 }
622
623 /** Print to file
624 @param[in] file File to write to */
625 static void print_to_file(FILE* file);
626
627 /** Check for pending IO. Gets the count and also validates the
628 data structures.
629 @return count of pending IO requests */
630 static ulint total_pending_io_count();
631
632 private:
633 /** Initialise the slots
634 @return DB_SUCCESS or error code */
635 dberr_t init_slots()
636 MY_ATTRIBUTE((warn_unused_result));
637
638 /** Wakes up a simulated AIO I/O-handler thread if it has something
639 to do for a local segment in the AIO array.
640 @param[in] global_segment the number of the segment in the
641 AIO arrays
642 @param[in] segment the local segment in the AIO array */
643 void wake_simulated_handler_thread(ulint global_segment, ulint segment);
644
645 /** Prints pending IO requests per segment of an aio array.
646 We probably don't need per segment statistics but they can help us
647 during development phase to see if the IO requests are being
648 distributed as expected.
649 @param[in,out] file file where to print
650 @param[in] segments pending IO array */
651 void print_segment_info(
652 FILE* file,
653 const ulint* segments);
654
655 #ifdef LINUX_NATIVE_AIO
656 /** Initialise the Linux native AIO data structures
657 @return DB_SUCCESS or error code */
658 dberr_t init_linux_native_aio()
659 MY_ATTRIBUTE((warn_unused_result));
660 #endif /* LINUX_NATIVE_AIO */
661
662 private:
663 typedef std::vector<Slot> Slots;
664
665 /** the mutex protecting the aio array */
666 mutable SysMutex m_mutex;
667
668 /** Pointer to the slots in the array.
669 Number of elements must be divisible by n_threads. */
670 Slots m_slots;
671
672 /** Number of segments in the aio array of pending aio requests.
673 A thread can wait separately for any one of the segments. */
674 ulint m_n_segments;
675
676 /** The event which is set to the signaled state when
677 there is space in the aio outside the ibuf segment */
678 os_event_t m_not_full;
679
680 /** The event which is set to the signaled state when
681 there are no pending i/os in this array */
682 os_event_t m_is_empty;
683
684 /** Number of reserved slots in the AIO array outside
685 the ibuf segment */
686 ulint m_n_reserved;
687
688 #ifdef _WIN32
689 typedef std::vector<HANDLE, ut_allocator<HANDLE> > Handles;
690
691 /** Pointer to an array of OS native event handles where
692 we copied the handles from slots, in the same order. This
693 can be used in WaitForMultipleObjects; used only in Windows */
694 Handles* m_handles;
695 #endif /* _WIN32 */
696
697 #if defined(LINUX_NATIVE_AIO)
698 typedef std::vector<io_event> IOEvents;
699
700 /** completion queue for IO. There is one such queue per
701 segment. Each thread will work on one ctx exclusively. */
702 io_context_t* m_aio_ctx;
703
704 /** The array to collect completed IOs. There is one such
705 event for each possible pending IO. The size of the array
706 is equal to m_slots.size(). */
707 IOEvents m_events;
708 #endif /* LINUX_NATIV_AIO */
709
710 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
711 sync AIO. These are NULL when the module has not yet been
712 initialized. */
713
714 /** Insert buffer */
715 static AIO* s_ibuf;
716
717 /** Redo log */
718 static AIO* s_log;
719
720 /** Reads */
721 static AIO* s_reads;
722
723 /** Writes */
724 static AIO* s_writes;
725
726 /** Synchronous I/O */
727 static AIO* s_sync;
728 };
729
730 /** Static declarations */
731 AIO* AIO::s_reads;
732 AIO* AIO::s_writes;
733 AIO* AIO::s_ibuf;
734 AIO* AIO::s_log;
735 AIO* AIO::s_sync;
736
737 #if defined(LINUX_NATIVE_AIO)
738 /** timeout for each io_getevents() call = 500ms. */
739 static const ulint OS_AIO_REAP_TIMEOUT = 500000000UL;
740
741 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
742 static const ulint OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
743
744 /** number of attempts before giving up on io_setup(). */
745 static const int OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
746 #endif /* LINUX_NATIVE_AIO */
747
748 /** Array of events used in simulated AIO */
749 static os_event_t* os_aio_segment_wait_events = NULL;
750
751 /** Number of asynchronous I/O segments. Set by os_aio_init(). */
752 static ulint os_aio_n_segments = ULINT_UNDEFINED;
753
754 /** If the following is true, read i/o handler threads try to
755 wait until a batch of new read requests have been posted */
756 static bool os_aio_recommend_sleep_for_read_threads = false;
757 #endif /* !UNIV_HOTBACKUP */
758
759 ulint os_n_file_reads = 0;
760 ulint os_bytes_read_since_printout = 0;
761 ulint os_n_file_writes = 0;
762 ulint os_n_fsyncs = 0;
763 ulint os_n_file_reads_old = 0;
764 ulint os_n_file_writes_old = 0;
765 ulint os_n_fsyncs_old = 0;
766 /** Number of pending write operations */
767 ulint os_n_pending_writes = 0;
768 /** Number of pending read operations */
769 ulint os_n_pending_reads = 0;
770
771 ib_time_monotonic_t os_last_printout;
772 bool os_has_said_disk_full = false;
773
774 /** Default Zip compression level */
775 extern uint page_zip_level;
776
777 #if DATA_TRX_ID_LEN > 6
778 #error "COMPRESSION_ALGORITHM will not fit"
779 #endif /* DATA_TRX_ID_LEN */
780
781 /** Validates the consistency of the aio system.
782 @return true if ok */
783 static
784 bool
785 os_aio_validate();
786
787 /** Does error handling when a file operation fails.
788 @param[in] name File name or NULL
789 @param[in] operation Name of operation e.g., "read", "write"
790 @return true if we should retry the operation */
791 static
792 bool
793 os_file_handle_error(
794 const char* name,
795 const char* operation);
796
797 /** Free storage space associated with a section of the file.
798 @param[in] fh Open file handle
799 @param[in] off Starting offset (SEEK_SET)
800 @param[in] len Size of the hole
801 @return DB_SUCCESS or error code */
802 dberr_t
803 os_file_punch_hole(
804 os_file_t fh,
805 os_offset_t off,
806 os_offset_t len);
807
808 /**
809 Does error handling when a file operation fails.
810 @param[in] name File name or NULL
811 @param[in] operation Name of operation e.g., "read", "write"
812 @param[in] silent if true then don't print any message to the log.
813 @return true if we should retry the operation */
814 static
815 bool
816 os_file_handle_error_no_exit(
817 const char* name,
818 const char* operation,
819 bool silent);
820
821 /** Decompress after a read and punch a hole in the file if it was a write
822 @param[in] type IO context
823 @param[in] fh Open file handle
824 @param[in,out] buf Buffer to transform
825 @param[in,out] scratch Scratch area for read decompression
@param[in]	src_len		Length of the buffer before compression
@param[in]	offset		File offset of the IO request
@param[in]	len		Compressed buffer length for write and size
				of buf len for read
829 @return DB_SUCCESS or error code */
830 static
831 dberr_t
832 os_file_io_complete(
833 const IORequest&type,
834 os_file_t fh,
835 byte* buf,
836 byte* scratch,
837 ulint src_len,
838 os_offset_t offset,
839 ulint len);
840
841 /** Does simulated AIO. This function should be called by an i/o-handler
842 thread.
843
@param[in]	global_segment	The number of the segment in the aio arrays to
			wait for; segment 0 is the ibuf i/o thread, segment 1
			the log i/o thread, then follow the non-ibuf read
			threads, and as the last are the non-ibuf write threads
848 @param[out] m1 the messages passed with the AIO request; note that
849 also in the case where the AIO operation failed, these
850 output parameters are valid and can be used to restart
851 the operation, for example
852 @param[out] m2 Callback argument
853 @param[in] type IO context
854 @return DB_SUCCESS or error code */
855 static
856 dberr_t
857 os_aio_simulated_handler(
858 ulint global_segment,
859 fil_node_t** m1,
860 void** m2,
861 IORequest* type);
862
863 #ifdef WIN_ASYNC_IO
864 /** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
completed requests. The aio array of pending requests is divided
867 into segments. The thread specifies which segment or slot it wants to wait
868 for. NOTE: this function will also take care of freeing the aio slot,
869 therefore no other thread is allowed to do the freeing!
870 @param[in] segment The number of the segment in the aio arrays to
871 wait for; segment 0 is the ibuf I/O thread,
872 segment 1 the log I/O thread, then follow the
873 non-ibuf read threads, and as the last are the
874 non-ibuf write threads; if this is
875 ULINT_UNDEFINED, then it means that sync AIO
876 is used, and this parameter is ignored
877 @param[in] pos this parameter is used only in sync AIO:
878 wait for the aio slot at this position
879 @param[out] m1 the messages passed with the AIO request; note
880 that also in the case where the AIO operation
881 failed, these output parameters are valid and
882 can be used to restart the operation,
883 for example
884 @param[out] m2 callback message
885 @param[out] type OS_FILE_WRITE or ..._READ
886 @return DB_SUCCESS or error code */
887 static
888 dberr_t
889 os_aio_windows_handler(
890 ulint segment,
891 ulint pos,
892 fil_node_t** m1,
893 void** m2,
894 IORequest* type);
895 #endif /* WIN_ASYNC_IO */
896
897 /** Allocate a page for sync IO
898 @return pointer to page */
899 static
900 Block*
os_alloc_block()901 os_alloc_block()
902 {
903 size_t pos;
904 Blocks& blocks = *block_cache;
905 size_t i = static_cast<size_t>(my_timer_cycles());
906 const size_t size = blocks.size();
907 ulint retry = 0;
908 Block* block;
909
910 DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
911
912 for (;;) {
913
914 /* After go through the block cache for 3 times,
915 allocate a new temporary block. */
916 if (retry == MAX_BLOCKS * 3) {
917 byte* ptr;
918
919 ptr = static_cast<byte*>(
920 ut_malloc_nokey(sizeof(*block)
921 + BUFFER_BLOCK_SIZE));
922
923 block = new (ptr) Block();
924 block->m_ptr = static_cast<byte*>(
925 ptr + sizeof(*block));
926 block->m_in_use = 1;
927
928 break;
929 }
930
931 pos = i++ % size;
932
933 if (TAS(&blocks[pos].m_in_use, 1) == 0) {
934 block = &blocks[pos];
935 break;
936 }
937
938 os_thread_yield();
939
940 ++retry;
941 }
942
943 ut_a(block->m_in_use != 0);
944
945 return(block);
946 }
947
948 /** Free a page after sync IO
949 @param[in,own] block The block to free/release */
950 static
951 void
os_free_block(Block * block)952 os_free_block(Block* block)
953 {
954 ut_ad(block->m_in_use == 1);
955
956 TAS(&block->m_in_use, 0);
957
958 /* When this block is not in the block cache, and it's
959 a temporary block, we need to free it directly. */
960 if (std::less<Block*>()(block, &block_cache->front())
961 || std::greater<Block*>()(block, &block_cache->back())) {
962 ut_free(block);
963 }
964 }
965
966 /** Generic AIO Handler methods. Currently handles IO post processing. */
967 class AIOHandler {
968 public:
969 /** Do any post processing after a read/write
970 @return DB_SUCCESS or error code. */
971 static dberr_t post_io_processing(Slot* slot);
972
973 /** Decompress after a read and punch a hole in the file if
974 it was a write */
io_complete(const Slot * slot)975 static dberr_t io_complete(const Slot* slot)
976 {
977 ut_a(slot->offset > 0);
978 ut_a(slot->type.is_read() || !slot->skip_punch_hole);
979 return(os_file_io_complete(
980 slot->type, slot->file.m_file, slot->buf,
981 NULL, slot->original_len,
982 slot->offset, slot->len));
983 }
984
985 private:
986 /** Check whether the page was encrypted.
987 @param[in] slot The slot that contains the IO request
988 @return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)989 static bool is_encrypted_page(const Slot* slot)
990 {
991 return(Encryption::is_encrypted_page(slot->buf));
992 }
993
994 /** Check whether the page was compressed.
995 @param[in] slot The slot that contains the IO request
996 @return true if it was a compressed page */
is_compressed_page(const Slot * slot)997 static bool is_compressed_page(const Slot* slot)
998 {
999 const byte* src = slot->buf;
1000
1001 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1002
1003 return(page_type == FIL_PAGE_COMPRESSED);
1004 }
1005
1006 /** Get the compressed page size.
1007 @param[in] slot The slot that contains the IO request
1008 @return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1009 static ulint compressed_page_size(const Slot* slot)
1010 {
1011 ut_ad(slot->type.is_read());
1012 ut_ad(is_compressed_page(slot));
1013
1014 ulint size;
1015 const byte* src = slot->buf;
1016
1017 size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1018
1019 return(size + FIL_PAGE_DATA);
1020 }
1021
1022 /** Check if the page contents can be decompressed.
1023 @param[in] slot The slot that contains the IO request
1024 @return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1025 static bool can_decompress(const Slot* slot)
1026 {
1027 ut_ad(slot->type.is_read());
1028 ut_ad(is_compressed_page(slot));
1029
1030 ulint version;
1031 const byte* src = slot->buf;
1032
1033 version = mach_read_from_1(src + FIL_PAGE_VERSION);
1034
1035 ut_a(Compression::is_valid_page_version(version));
1036
1037 /* Includes the page header size too */
1038 ulint size = compressed_page_size(slot);
1039
1040 return(size <= (slot->ptr - slot->buf) + (ulint) slot->n_bytes);
1041 }
1042
1043 /** Check if we need to read some more data.
1044 @param[in] slot The slot that contains the IO request
1045 @param[in] n_bytes Total bytes read so far
1046 @return DB_SUCCESS or error code */
1047 static dberr_t check_read(Slot* slot, ulint n_bytes);
1048 };
1049
1050 /** Helper class for doing synchronous file IO. Currently, the objective
1051 is to hide the OS specific code, so that the higher level functions aren't
1052 peppered with #ifdef. Makes the code flow difficult to follow. */
1053 class SyncFileIO {
1054 public:
1055 /** Constructor
1056 @param[in] fh File handle
1057 @param[in,out] buf Buffer to read/write
1058 @param[in] n Number of bytes to read/write
1059 @param[in] offset Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1060 SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
1061 :
1062 m_fh(fh),
1063 m_buf(buf),
1064 m_n(static_cast<ssize_t>(n)),
1065 m_offset(offset)
1066 {
1067 ut_ad(m_n > 0);
1068 }
1069
1070 /** Destructor */
~SyncFileIO()1071 ~SyncFileIO()
1072 {
1073 /* No op */
1074 }
1075
1076 /** Do the read/write
1077 @param[in] request The IO context and type
1078 @return the number of bytes read/written or negative value on error */
1079 ssize_t execute(const IORequest& request);
1080
1081 /** Do the read/write
1082 @param[in,out] slot The IO slot, it has the IO context
1083 @return the number of bytes read/written or negative value on error */
1084 static ssize_t execute(Slot* slot);
1085
1086 /** Move the read/write offset up to where the partial IO succeeded.
1087 @param[in] n_bytes The number of bytes to advance */
advance(ssize_t n_bytes)1088 void advance(ssize_t n_bytes)
1089 {
1090 m_offset += n_bytes;
1091
1092 ut_ad(m_n >= n_bytes);
1093
1094 m_n -= n_bytes;
1095
1096 m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1097 }
1098
1099 private:
1100 /** Open file handle */
1101 os_file_t m_fh;
1102
1103 /** Buffer to read/write */
1104 void* m_buf;
1105
1106 /** Number of bytes to read/write */
1107 ssize_t m_n;
1108
1109 /** Offset from where to read/write */
1110 os_offset_t m_offset;
1111 };
1112
1113 /** If it is a compressed page return the compressed page data + footer size
1114 @param[in] buf Buffer to check, must include header + 10 bytes
1115 @return ULINT_UNDEFINED if the page is not a compressed page or length
1116 of the compressed data (including footer) if it is a compressed page */
1117 ulint
os_file_compressed_page_size(const byte * buf)1118 os_file_compressed_page_size(const byte* buf)
1119 {
1120 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1121
1122 if (type == FIL_PAGE_COMPRESSED) {
1123 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1124 ut_a(Compression::is_valid_page_version(version));
1125 return(mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1126 }
1127
1128 return(ULINT_UNDEFINED);
1129 }
1130
1131 /** If it is a compressed page return the original page data + footer size
1132 @param[in] buf Buffer to check, must include header + 10 bytes
1133 @return ULINT_UNDEFINED if the page is not a compressed page or length
1134 of the original data + footer if it is a compressed page */
1135 ulint
os_file_original_page_size(const byte * buf)1136 os_file_original_page_size(const byte* buf)
1137 {
1138 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1139
1140 if (type == FIL_PAGE_COMPRESSED) {
1141
1142 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1143 ut_a(Compression::is_valid_page_version(version));
1144
1145 return(mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1146 }
1147
1148 return(ULINT_UNDEFINED);
1149 }
1150
1151 /** Check if we need to read some more data.
1152 @param[in] slot The slot that contains the IO request
1153 @param[in] n_bytes Total bytes read so far
1154 @return DB_SUCCESS or error code */
1155 dberr_t
check_read(Slot * slot,ulint n_bytes)1156 AIOHandler::check_read(Slot* slot, ulint n_bytes)
1157 {
1158 dberr_t err;
1159
1160 ut_ad(slot->type.is_read());
1161 ut_ad(slot->original_len > slot->len);
1162
1163 if (is_compressed_page(slot)) {
1164
1165 if (can_decompress(slot)) {
1166
1167 ut_a(slot->offset > 0);
1168
1169 slot->len = slot->original_len;
1170 #ifdef _WIN32
1171 slot->n_bytes = static_cast<DWORD>(n_bytes);
1172 #else
1173 slot->n_bytes = static_cast<ulint>(n_bytes);
1174 #endif /* _WIN32 */
1175
1176 err = io_complete(slot);
1177 ut_a(err == DB_SUCCESS);
1178 } else {
1179 /* Read the next block in */
1180 ut_ad(compressed_page_size(slot) >= n_bytes);
1181
1182 err = DB_FAIL;
1183 }
1184 } else if (is_encrypted_page(slot)) {
1185 ut_a(slot->offset > 0);
1186
1187 slot->len = slot->original_len;
1188 #ifdef _WIN32
1189 slot->n_bytes = static_cast<DWORD>(n_bytes);
1190 #else
1191 slot->n_bytes = static_cast<ulint>(n_bytes);
1192 #endif /* _WIN32 */
1193
1194 err = io_complete(slot);
1195 ut_a(err == DB_SUCCESS);
1196
1197 } else {
1198 err = DB_FAIL;
1199 }
1200
1201 if (slot->buf_block != NULL) {
1202 os_free_block(slot->buf_block);
1203 slot->buf_block = NULL;
1204 }
1205
1206 return(err);
1207 }
1208
1209 /** Do any post processing after a read/write
1210 @return DB_SUCCESS or error code. */
1211 dberr_t
post_io_processing(Slot * slot)1212 AIOHandler::post_io_processing(Slot* slot)
1213 {
1214 dberr_t err;
1215
1216 ut_ad(slot->is_reserved);
1217
1218 /* Total bytes read so far */
1219 ulint n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1220
1221 /* Compressed writes can be smaller than the original length.
1222 Therefore they can be processed without further IO. */
1223 if (n_bytes == slot->original_len
1224 || (slot->type.is_write()
1225 && slot->type.is_compressed()
1226 && slot->len == static_cast<ulint>(slot->n_bytes))) {
1227
1228 if (!slot->type.is_log()
1229 && (is_compressed_page(slot)
1230 || is_encrypted_page(slot))) {
1231
1232 ut_a(slot->offset > 0);
1233
1234 if (slot->type.is_read()) {
1235 slot->len = slot->original_len;
1236 }
1237
1238 /* The punch hole has been done on collect() */
1239
1240 if (slot->type.is_read()) {
1241 err = io_complete(slot);
1242 } else {
1243 err = DB_SUCCESS;
1244 }
1245
1246 ut_ad(err == DB_SUCCESS
1247 || err == DB_UNSUPPORTED
1248 || err == DB_CORRUPTION
1249 || err == DB_IO_DECOMPRESS_FAIL);
1250 } else {
1251
1252 err = DB_SUCCESS;
1253 }
1254
1255 if (slot->buf_block != NULL) {
1256 os_free_block(slot->buf_block);
1257 slot->buf_block = NULL;
1258 }
1259
1260 } else if ((ulint) slot->n_bytes == (ulint) slot->len) {
1261
1262 /* It *must* be a partial read. */
1263 ut_ad(slot->len < slot->original_len);
1264
1265 /* Has to be a read request, if it is less than
1266 the original length. */
1267 ut_ad(slot->type.is_read());
1268 err = check_read(slot, n_bytes);
1269
1270 } else {
1271 err = DB_FAIL;
1272 }
1273
1274 return(err);
1275 }
1276
1277 /** Count the number of free slots
1278 @return number of reserved slots */
1279 ulint
pending_io_count() const1280 AIO::pending_io_count() const
1281 {
1282 acquire();
1283
1284 #ifdef UNIV_DEBUG
1285 ut_a(m_n_segments > 0);
1286 ut_a(!m_slots.empty());
1287
1288 ulint count = 0;
1289
1290 for (ulint i = 0; i < m_slots.size(); ++i) {
1291
1292 const Slot& slot = m_slots[i];
1293
1294 if (slot.is_reserved) {
1295 ++count;
1296 ut_a(slot.len > 0);
1297 }
1298 }
1299
1300 ut_a(m_n_reserved == count);
1301 #endif /* UNIV_DEBUG */
1302
1303 ulint reserved = m_n_reserved;
1304
1305 release();
1306
1307 return(reserved);
1308 }
1309
1310 /** Compress a data page
1311 #param[in] block_size File system block size
1312 @param[in] src Source contents to compress
1313 @param[in] src_len Length in bytes of the source
1314 @param[out] dst Compressed page contents
1315 @param[out] dst_len Length in bytes of dst contents
1316 @return buffer data, dst_len will have the length of the data */
1317 static
1318 byte*
os_file_compress_page(Compression compression,ulint block_size,byte * src,ulint src_len,byte * dst,ulint * dst_len)1319 os_file_compress_page(
1320 Compression compression,
1321 ulint block_size,
1322 byte* src,
1323 ulint src_len,
1324 byte* dst,
1325 ulint* dst_len)
1326 {
1327 ulint len = 0;
1328 ulint compression_level = page_zip_level;
1329 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1330
1331 /* The page size must be a multiple of the OS punch hole size. */
1332 ut_ad(!(src_len % block_size));
1333
1334 /* Shouldn't compress an already compressed page. */
1335 ut_ad(page_type != FIL_PAGE_COMPRESSED);
1336
1337 /* The page must be at least twice as large as the file system
1338 block size if we are to save any space. Ignore R-Tree pages for now,
1339 they repurpose the same 8 bytes in the page header. No point in
1340 compressing if the file system block size >= our page size. */
1341
1342 if (page_type == FIL_PAGE_RTREE
1343 || block_size == ULINT_UNDEFINED
1344 || compression.m_type == Compression::NONE
1345 || src_len < block_size * 2) {
1346
1347 *dst_len = src_len;
1348
1349 return(src);
1350 }
1351
1352 /* Leave the header alone when compressing. */
1353 ut_ad(block_size >= FIL_PAGE_DATA * 2);
1354
1355 ut_ad(src_len > FIL_PAGE_DATA + block_size);
1356
1357 /* Must compress to <= N-1 FS blocks. */
1358 ulint out_len = src_len - (FIL_PAGE_DATA + block_size);
1359
1360 /* This is the original data page size - the page header. */
1361 ulint content_len = src_len - FIL_PAGE_DATA;
1362
1363 ut_ad(out_len >= block_size - FIL_PAGE_DATA);
1364 ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA));
1365
1366 /* Only compress the data + trailer, leave the header alone */
1367
1368 switch (compression.m_type) {
1369 case Compression::NONE:
1370 ut_error;
1371
1372 case Compression::ZLIB: {
1373
1374 uLongf zlen = static_cast<uLongf>(out_len);
1375
1376 if (compress2(
1377 dst + FIL_PAGE_DATA,
1378 &zlen,
1379 src + FIL_PAGE_DATA,
1380 static_cast<uLong>(content_len),
1381 static_cast<int>(compression_level)) != Z_OK) {
1382
1383 *dst_len = src_len;
1384
1385 return(src);
1386 }
1387
1388 len = static_cast<ulint>(zlen);
1389
1390 break;
1391 }
1392
1393 case Compression::LZ4:
1394
1395 len = LZ4_compress_default(
1396 reinterpret_cast<char*>(src) + FIL_PAGE_DATA,
1397 reinterpret_cast<char*>(dst) + FIL_PAGE_DATA,
1398 static_cast<int>(content_len),
1399 static_cast<int>(out_len));
1400
1401 ut_a(len <= src_len - FIL_PAGE_DATA);
1402
1403 if (len == 0 || len >= out_len) {
1404
1405 *dst_len = src_len;
1406
1407 return(src);
1408 }
1409
1410 break;
1411
1412 default:
1413 *dst_len = src_len;
1414 return(src);
1415 }
1416
1417 ut_a(len <= out_len);
1418
1419 ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
1420 src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)
1421 == 0);
1422
1423 /* Copy the header as is. */
1424 memmove(dst, src, FIL_PAGE_DATA);
1425
1426 /* Add compression control information. Required for decompressing. */
1427 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
1428
1429 mach_write_to_1(dst + FIL_PAGE_VERSION, Compression::FIL_PAGE_VERSION_2);
1430
1431 mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);
1432
1433 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
1434
1435 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);
1436
1437 mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);
1438
1439 /* Round to the next full block size */
1440
1441 len += FIL_PAGE_DATA;
1442
1443 *dst_len = ut_calc_align(len, block_size);
1444
1445 ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA);
1446
1447 /* Clear out the unused portion of the page. */
1448 if (len % block_size) {
1449 memset(dst + len, 0x0, block_size - (len % block_size));
1450 }
1451
1452 return(dst);
1453 }
1454
#ifdef UNIV_DEBUG
# ifndef UNIV_HOTBACKUP
/** Validates the consistency of the aio system some of the time: only
every OS_AIO_VALIDATE_SKIP-th call runs os_aio_validate(), the other
calls return immediately.
@return true if ok or the check was skipped */
bool
os_aio_validate_skip()
{
/** Try os_aio_validate() every this many times */
# define OS_AIO_VALIDATE_SKIP	13

	/** Number of calls left before the next real validation.
	Use a signed type because of the race condition below. */
	static int	calls_left = OS_AIO_VALIDATE_SKIP;

	/* There is a race condition below, but it does not matter,
	because this call is only for heuristic purposes. We want to
	reduce the call frequency of the costly os_aio_validate()
	check in debug builds. */
	calls_left -= 1;

	if (calls_left <= 0) {
		/* Time for a real check; start a new countdown first. */
		calls_left = OS_AIO_VALIDATE_SKIP;

		return(os_aio_validate());
	}

	return(true);
}
# endif /* !UNIV_HOTBACKUP */
#endif /* UNIV_DEBUG */
1484
1485 #undef USE_FILE_LOCK
1486 #define USE_FILE_LOCK
1487 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1488 /* InnoDB Hot Backup does not lock the data files.
1489 * On Windows, mandatory locking is used.
1490 */
1491 # undef USE_FILE_LOCK
1492 #endif
#ifdef USE_FILE_LOCK
/** Obtain an exclusive lock on a file. A write lock covering the whole
file (l_start = 0, l_len = 0) is requested with fcntl(F_SETLK), which
does not block. Failures are logged.
@param[in]	fd		file descriptor
@param[in]	name		file name, used only in the log messages
@return 0 on success, -1 if the lock could not be obtained */
static
int
os_file_lock(
	int		fd,
	const char*	name)
{
	struct flock lk;

	lk.l_type = F_WRLCK;
	lk.l_whence = SEEK_SET;
	lk.l_start = lk.l_len = 0;

	if (fcntl(fd, F_SETLK, &lk) == -1) {

		/* Save errno right away: writing the log message below
		may itself change errno, and the value is needed again
		afterwards to decide whether to print the hint. */
		const int	lock_errno = errno;

		ib::error()
			<< "Unable to lock " << name
			<< " error: " << lock_errno;

		if (lock_errno == EAGAIN || lock_errno == EACCES) {

			ib::info()
				<< "Check that you do not already have"
				" another mysqld process using the"
				" same InnoDB data or log files.";
		}

		return(-1);
	}

	return(0);
}
#endif /* USE_FILE_LOCK */
1530
1531 #ifndef UNIV_HOTBACKUP
1532
1533 /** Calculates local segment number and aio array from global segment number.
1534 @param[out] array aio wait array
1535 @param[in] segment global segment number
1536 @return local segment number within the aio array */
1537 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1538 AIO::get_array_and_local_segment(
1539 AIO** array,
1540 ulint segment)
1541 {
1542 ulint local_segment;
1543 ulint n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1544
1545 ut_a(segment < os_aio_n_segments);
1546
1547 if (!srv_read_only_mode && segment < n_extra_segs) {
1548
1549 /* We don't support ibuf/log IO during read only mode. */
1550
1551 if (segment == IO_IBUF_SEGMENT) {
1552
1553 *array = s_ibuf;
1554
1555 } else if (segment == IO_LOG_SEGMENT) {
1556
1557 *array = s_log;
1558
1559 } else {
1560 *array = NULL;
1561 }
1562
1563 local_segment = 0;
1564
1565 } else if (segment < s_reads->m_n_segments + n_extra_segs) {
1566
1567 *array = s_reads;
1568 local_segment = segment - n_extra_segs;
1569
1570 } else {
1571 *array = s_writes;
1572
1573 local_segment = segment
1574 - (s_reads->m_n_segments + n_extra_segs);
1575 }
1576
1577 return(local_segment);
1578 }
1579
1580 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1581 @param[in,out] slot Slot to release */
1582 void
release(Slot * slot)1583 AIO::release(Slot* slot)
1584 {
1585 ut_ad(is_mutex_owned());
1586
1587 ut_ad(slot->is_reserved);
1588
1589 slot->is_reserved = false;
1590
1591 --m_n_reserved;
1592
1593 if (m_n_reserved == m_slots.size() - 1) {
1594 os_event_set(m_not_full);
1595 }
1596
1597 if (m_n_reserved == 0) {
1598 os_event_set(m_is_empty);
1599 }
1600
1601 #ifdef WIN_ASYNC_IO
1602
1603 ResetEvent(slot->handle);
1604
1605 #elif defined(LINUX_NATIVE_AIO)
1606
1607 if (srv_use_native_aio) {
1608 memset(&slot->control, 0x0, sizeof(slot->control));
1609 slot->ret = 0;
1610 slot->n_bytes = 0;
1611 } else {
1612 /* These fields should not be used if we are not
1613 using native AIO. */
1614 ut_ad(slot->n_bytes == 0);
1615 ut_ad(slot->ret == 0);
1616 }
1617
1618 #endif /* WIN_ASYNC_IO */
1619 }
1620
/** Frees a slot in the AIO array. Assumes caller doesn't own the mutex:
the call to release(Slot*) is wrapped in acquire() and release().
@param[in,out]	slot		Slot to release */
void
AIO::release_with_mutex(Slot* slot)
{
	/* release(Slot*) asserts is_mutex_owned(); acquire() is
	presumably what takes that mutex. */
	acquire();

	release(slot);

	/* The overload without arguments pairs with acquire() above. */
	release();
}
1632
1633 /** Creates a temporary file. This function is like tmpfile(3), but
1634 the temporary file is created in the given parameter path. If the path
1635 is NULL then it will create the file in the MySQL server configuration
1636 parameter (--tmpdir).
1637 @param[in] path location for creating temporary file
1638 @return temporary file handle, or NULL on error */
1639 FILE*
os_file_create_tmpfile(const char * path)1640 os_file_create_tmpfile(
1641 const char* path)
1642 {
1643 FILE* file = NULL;
1644 int fd = innobase_mysql_tmpfile(path);
1645
1646 if (fd >= 0) {
1647 file = fdopen(fd, "w+b");
1648 }
1649
1650 if (file == NULL) {
1651
1652 ib::error()
1653 << "Unable to create temporary file; errno: "
1654 << errno;
1655
1656 if (fd >= 0) {
1657 close(fd);
1658 }
1659 }
1660
1661 return(file);
1662 }
1663
1664 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1665 NUL-terminate str. All errors are silently ignored. This function is
1666 mostly meant to be used with temporary files.
1667 @param[in,out] file File to read from
1668 @param[in,out] str Buffer where to read
1669 @param[in] size Size of buffer */
1670 void
os_file_read_string(FILE * file,char * str,ulint size)1671 os_file_read_string(
1672 FILE* file,
1673 char* str,
1674 ulint size)
1675 {
1676 if (size != 0) {
1677 rewind(file);
1678
1679 size_t flen = fread(str, 1, size - 1, file);
1680
1681 str[flen] = '\0';
1682 }
1683 }
1684
1685 /** Decompress after a read and punch a hole in the file if it was a write
1686 @param[in] type IO context
1687 @param[in] fh Open file handle
1688 @param[in,out] buf Buffer to transform
1689 @param[in,out] scratch Scratch area for read decompression
1690 @param[in] src_len Length of the buffer before compression
1691 @param[in] len Used buffer length for write and output
1692 buf len for read
1693 @return DB_SUCCESS or error code */
1694 static
1695 dberr_t
os_file_io_complete(const IORequest & type,os_file_t fh,byte * buf,byte * scratch,ulint src_len,os_offset_t offset,ulint len)1696 os_file_io_complete(
1697 const IORequest&type,
1698 os_file_t fh,
1699 byte* buf,
1700 byte* scratch,
1701 ulint src_len,
1702 os_offset_t offset,
1703 ulint len)
1704 {
1705 /* We never compress/decompress the first page */
1706 ut_a(offset > 0);
1707 ut_ad(type.validate());
1708
1709 if (!type.is_compression_enabled()) {
1710
1711 return(DB_SUCCESS);
1712
1713 } else if (type.is_read()) {
1714 dberr_t ret;
1715 Encryption encryption(type.encryption_algorithm());
1716
1717 ut_ad(!type.is_log());
1718 ut_ad(!type.is_row_log());
1719
1720 ret = encryption.decrypt(type, buf, src_len, scratch, len);
1721 if (ret == DB_SUCCESS) {
1722 return(os_file_decompress_page(
1723 type.is_dblwr_recover(),
1724 buf, scratch, len));
1725 } else {
1726 return(ret);
1727 }
1728
1729 } else if (type.punch_hole()) {
1730
1731 ut_ad(len <= src_len);
1732 ut_ad(!type.is_log());
1733 ut_ad(type.is_write());
1734 ut_ad(type.is_compressed());
1735
1736 /* Nothing to do. */
1737 if (len == src_len) {
1738 return(DB_SUCCESS);
1739 }
1740
1741 #ifdef UNIV_DEBUG
1742 const ulint block_size = type.block_size();
1743 #endif /* UNIV_DEBUG */
1744
1745 /* We don't support multiple page sizes in the server
1746 at the moment. */
1747 ut_ad(src_len == srv_page_size);
1748
1749 /* Must be a multiple of the compression unit size. */
1750 ut_ad((len % block_size) == 0);
1751 ut_ad((offset % block_size) == 0);
1752
1753 ut_ad(len + block_size <= src_len);
1754
1755 offset += len;
1756
1757 return(os_file_punch_hole(fh, offset, src_len - len));
1758 }
1759
1760 ut_ad(!type.is_log());
1761
1762 return(DB_SUCCESS);
1763 }
1764
1765 #endif /* !UNIV_HOTBACKUP */
1766
1767 /** This function returns a new path name after replacing the basename
1768 in an old path with a new basename. The old_path is a full path
1769 name including the extension. The tablename is in the normal
1770 form "databasename/tablename". The new base name is found after
1771 the forward slash. Both input strings are null terminated.
1772
1773 This function allocates memory to be returned. It is the callers
1774 responsibility to free the return value after it is no longer needed.
1775
1776 @param[in] old_path Pathname
1777 @param[in] tablename Contains new base name
1778 @return own: new full pathname */
1779 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1780 os_file_make_new_pathname(
1781 const char* old_path,
1782 const char* tablename)
1783 {
1784 ulint dir_len;
1785 char* last_slash;
1786 char* base_name;
1787 char* new_path;
1788 ulint new_path_len;
1789
1790 /* Split the tablename into its database and table name components.
1791 They are separated by a '/'. */
1792 last_slash = strrchr((char*) tablename, '/');
1793 base_name = last_slash ? last_slash + 1 : (char*) tablename;
1794
1795 /* Find the offset of the last slash. We will strip off the
1796 old basename.ibd which starts after that slash. */
1797 last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1798 dir_len = last_slash ? last_slash - old_path : strlen(old_path);
1799
1800 /* allocate a new path and move the old directory path to it. */
1801 new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1802 new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1803 memcpy(new_path, old_path, dir_len);
1804
1805 ut_snprintf(new_path + dir_len,
1806 new_path_len - dir_len,
1807 "%c%s.ibd",
1808 OS_PATH_SEPARATOR,
1809 base_name);
1810
1811 return(new_path);
1812 }
1813
1814 /** This function reduces a null-terminated full remote path name into
1815 the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
1816 the 'databasename/tablename.ibd' found at the end of the path with just
1817 'tablename'.
1818
1819 Since the result is always smaller than the path sent in, no new memory
1820 is allocated. The caller should allocate memory for the path sent in.
1821 This function manipulates that path in place.
1822
1823 If the path format is not as expected, just return. The result is used
1824 to inform a SHOW CREATE TABLE command.
1825 @param[in,out] data_dir_path Full path/data_dir_path */
1826 void
os_file_make_data_dir_path(char * data_dir_path)1827 os_file_make_data_dir_path(
1828 char* data_dir_path)
1829 {
1830 /* Replace the period before the extension with a null byte. */
1831 char* ptr = strrchr((char*) data_dir_path, '.');
1832
1833 if (ptr == NULL) {
1834 return;
1835 }
1836
1837 ptr[0] = '\0';
1838
1839 /* The tablename starts after the last slash. */
1840 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1841
1842 if (ptr == NULL) {
1843 return;
1844 }
1845
1846 ptr[0] = '\0';
1847
1848 char* tablename = ptr + 1;
1849
1850 /* The databasename starts after the next to last slash. */
1851 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1852
1853 if (ptr == NULL) {
1854 return;
1855 }
1856
1857 ulint tablename_len = ut_strlen(tablename);
1858
1859 ut_memmove(++ptr, tablename, tablename_len);
1860
1861 ptr[tablename_len] = '\0';
1862 }
1863
1864 /** Check if the path refers to the root of a drive using a pointer
1865 to the last directory separator that the caller has fixed.
1866 @param[in] path path name
1867 @param[in] path last directory separator in the path
1868 @return true if this path is a drive root, false if not */
1869 UNIV_INLINE
1870 bool
os_file_is_root(const char * path,const char * last_slash)1871 os_file_is_root(
1872 const char* path,
1873 const char* last_slash)
1874 {
1875 return(
1876 #ifdef _WIN32
1877 (last_slash == path + 2 && path[1] == ':') ||
1878 #endif /* _WIN32 */
1879 last_slash == path);
1880 }
1881
1882 /** Return the parent directory component of a null-terminated path.
1883 Return a new buffer containing the string up to, but not including,
1884 the final component of the path.
1885 The path returned will not contain a trailing separator.
1886 Do not return a root path, return NULL instead.
1887 The final component trimmed off may be a filename or a directory name.
1888 If the final component is the only component of the path, return NULL.
1889 It is the caller's responsibility to free the returned string after it
1890 is no longer needed.
1891 @param[in] path Path name
1892 @return own: parent directory of the path */
1893 static
1894 char*
os_file_get_parent_dir(const char * path)1895 os_file_get_parent_dir(
1896 const char* path)
1897 {
1898 bool has_trailing_slash = false;
1899
1900 /* Find the offset of the last slash */
1901 const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
1902
1903 if (!last_slash) {
1904 /* No slash in the path, return NULL */
1905 return(NULL);
1906 }
1907
1908 /* Ok, there is a slash. Is there anything after it? */
1909 if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1910 has_trailing_slash = true;
1911 }
1912
1913 /* Reduce repetative slashes. */
1914 while (last_slash > path
1915 && last_slash[-1] == OS_PATH_SEPARATOR) {
1916 last_slash--;
1917 }
1918
1919 /* Check for the root of a drive. */
1920 if (os_file_is_root(path, last_slash)) {
1921 return(NULL);
1922 }
1923
1924 /* If a trailing slash prevented the first strrchr() from trimming
1925 the last component of the path, trim that component now. */
1926 if (has_trailing_slash) {
1927 /* Back up to the previous slash. */
1928 last_slash--;
1929 while (last_slash > path
1930 && last_slash[0] != OS_PATH_SEPARATOR) {
1931 last_slash--;
1932 }
1933
1934 /* Reduce repetative slashes. */
1935 while (last_slash > path
1936 && last_slash[-1] == OS_PATH_SEPARATOR) {
1937 last_slash--;
1938 }
1939 }
1940
1941 /* Check for the root of a drive. */
1942 if (os_file_is_root(path, last_slash)) {
1943 return(NULL);
1944 }
1945
1946 if (last_slash - path < 0) {
1947 /* Sanity check, it prevents gcc from trying to handle this case which
1948 * results in warnings for some optimized builds */
1949 return (NULL);
1950 }
1951
1952 /* Non-trivial directory component */
1953
1954 return(mem_strdupl(path, last_slash - path));
1955 }
1956 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1957
1958 /* Test the function os_file_get_parent_dir. */
1959 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1960 test_os_file_get_parent_dir(
1961 const char* child_dir,
1962 const char* expected_dir)
1963 {
1964 char* child = mem_strdup(child_dir);
1965 char* expected = expected_dir == NULL ? NULL
1966 : mem_strdup(expected_dir);
1967
1968 /* os_file_get_parent_dir() assumes that separators are
1969 converted to OS_PATH_SEPARATOR. */
1970 os_normalize_path(child);
1971 os_normalize_path(expected);
1972
1973 char* parent = os_file_get_parent_dir(child);
1974
1975 bool unexpected = (expected == NULL
1976 ? (parent != NULL)
1977 : (0 != strcmp(parent, expected)));
1978 if (unexpected) {
1979 ib::fatal() << "os_file_get_parent_dir('" << child
1980 << "') returned '" << parent
1981 << "', instead of '" << expected << "'.";
1982 }
1983 ut_free(parent);
1984 ut_free(child);
1985 ut_free(expected);
1986 }
1987
1988 /* Test the function os_file_get_parent_dir. */
1989 void
unit_test_os_file_get_parent_dir()1990 unit_test_os_file_get_parent_dir()
1991 {
1992 test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1993 test_os_file_get_parent_dir("/usr/", NULL);
1994 test_os_file_get_parent_dir("//usr//", NULL);
1995 test_os_file_get_parent_dir("usr", NULL);
1996 test_os_file_get_parent_dir("usr//", NULL);
1997 test_os_file_get_parent_dir("/", NULL);
1998 test_os_file_get_parent_dir("//", NULL);
1999 test_os_file_get_parent_dir(".", NULL);
2000 test_os_file_get_parent_dir("..", NULL);
2001 # ifdef _WIN32
2002 test_os_file_get_parent_dir("D:", NULL);
2003 test_os_file_get_parent_dir("D:/", NULL);
2004 test_os_file_get_parent_dir("D:\\", NULL);
2005 test_os_file_get_parent_dir("D:/data", NULL);
2006 test_os_file_get_parent_dir("D:/data/", NULL);
2007 test_os_file_get_parent_dir("D:\\data\\", NULL);
2008 test_os_file_get_parent_dir("D:///data/////", NULL);
2009 test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
2010 test_os_file_get_parent_dir("D:/data//a", "D:/data");
2011 test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
2012 test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
2013 test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
2014 #endif /* _WIN32 */
2015 }
2016 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
2017
2018
2019 /** Creates all missing subdirectories along the given path.
2020 @param[in] path Path name
2021 @return DB_SUCCESS if OK, otherwise error code. */
2022 dberr_t
os_file_create_subdirs_if_needed(const char * path)2023 os_file_create_subdirs_if_needed(
2024 const char* path)
2025 {
2026 if (srv_read_only_mode) {
2027
2028 ib::error()
2029 << "read only mode set. Can't create "
2030 << "subdirectories '" << path << "'";
2031
2032 return(DB_READ_ONLY);
2033
2034 }
2035
2036 char* subdir = os_file_get_parent_dir(path);
2037
2038 if (subdir == NULL) {
2039 /* subdir is root or cwd, nothing to do */
2040 return(DB_SUCCESS);
2041 }
2042
2043 /* Test if subdir exists */
2044 os_file_type_t type;
2045 bool subdir_exists;
2046 bool success = os_file_status(subdir, &subdir_exists, &type);
2047
2048 if (success && !subdir_exists) {
2049
2050 /* Subdir does not exist, create it */
2051 dberr_t err = os_file_create_subdirs_if_needed(subdir);
2052
2053 if (err != DB_SUCCESS) {
2054
2055 ut_free(subdir);
2056
2057 return(err);
2058 }
2059
2060 success = os_file_create_directory(subdir, false);
2061 }
2062
2063 ut_free(subdir);
2064
2065 return(success ? DB_SUCCESS : DB_ERROR);
2066 }
2067
2068 /** Allocate the buffer for IO on a transparently compressed table.
2069 @param[in] type IO flags
2070 @param[out] buf buffer to read or write
2071 @param[in,out] n number of bytes to read/write, starting from
2072 offset
2073 @return pointer to allocated page, compressed data is written to the offset
2074 that is aligned on the disk sector size */
2075 static
2076 Block*
os_file_compress_page(IORequest & type,void * & buf,ulint * n)2077 os_file_compress_page(
2078 IORequest& type,
2079 void*& buf,
2080 ulint* n)
2081 {
2082 ut_ad(!type.is_log());
2083 ut_ad(type.is_write());
2084 ut_ad(type.is_compressed());
2085
2086 ulint n_alloc = *n * 2;
2087
2088 ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
2089 ut_a(type.compression_algorithm().m_type != Compression::LZ4
2090 || static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
2091
2092 Block* block = os_alloc_block();
2093
2094 ulint old_compressed_len;
2095 ulint compressed_len = *n;
2096
2097 old_compressed_len = mach_read_from_2(
2098 reinterpret_cast<byte*>(buf)
2099 + FIL_PAGE_COMPRESS_SIZE_V1);
2100
2101 if (old_compressed_len > 0) {
2102 old_compressed_len = ut_calc_align(
2103 old_compressed_len + FIL_PAGE_DATA,
2104 type.block_size());
2105 } else {
2106 old_compressed_len = *n;
2107 }
2108
2109 byte* compressed_page;
2110
2111 compressed_page = static_cast<byte*>(
2112 ut_align(block->m_ptr, os_io_ptr_align));
2113
2114 byte* buf_ptr;
2115
2116 buf_ptr = os_file_compress_page(
2117 type.compression_algorithm(),
2118 type.block_size(),
2119 reinterpret_cast<byte*>(buf),
2120 *n,
2121 compressed_page,
2122 &compressed_len);
2123
2124 if (buf_ptr != buf) {
2125 /* Set new compressed size to uncompressed page. */
2126 memcpy(reinterpret_cast<byte*>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
2127 buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
2128
2129 buf = buf_ptr;
2130 *n = compressed_len;
2131
2132 if (compressed_len >= old_compressed_len) {
2133
2134 ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
2135
2136 type.clear_punch_hole();
2137 }
2138 }
2139
2140 return(block);
2141 }
2142
2143 /** Encrypt a page content when write it to disk.
2144 @param[in] type IO flags
2145 @param[out] buf buffer to read or write
2146 @param[in,out] n number of bytes to read/write, starting from
2147 offset
2148 @return pointer to the encrypted page */
2149 static
2150 Block*
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)2151 os_file_encrypt_page(
2152 const IORequest& type,
2153 void*& buf,
2154 ulint* n)
2155 {
2156
2157 byte* encrypted_page;
2158 ulint encrypted_len = *n;
2159 byte* buf_ptr;
2160 Encryption encryption(type.encryption_algorithm());
2161
2162 ut_ad(!type.is_log());
2163 ut_ad(type.is_write());
2164 ut_ad(type.is_encrypted());
2165
2166 Block* block = os_alloc_block();
2167
2168 encrypted_page = static_cast<byte*>(
2169 ut_align(block->m_ptr, os_io_ptr_align));
2170
2171 buf_ptr = encryption.encrypt(type,
2172 reinterpret_cast<byte*>(buf), *n,
2173 encrypted_page, &encrypted_len);
2174
2175 bool encrypted = buf_ptr != buf;
2176
2177 if (encrypted) {
2178
2179 buf = buf_ptr;
2180 *n = encrypted_len;
2181 }
2182
2183 return(block);
2184 }
2185
2186 #ifndef _WIN32
2187
2188 /** Do the read/write
2189 @param[in] request The IO context and type
2190 @return the number of bytes read/written or negative value on error */
2191 ssize_t
execute(const IORequest & request)2192 SyncFileIO::execute(const IORequest& request)
2193 {
2194 ssize_t n_bytes;
2195
2196 if (request.is_read()) {
2197 n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2198 } else {
2199 ut_ad(request.is_write());
2200 n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2201 }
2202
2203 return(n_bytes);
2204 }
2205
2206 /** Free storage space associated with a section of the file.
2207 @param[in] fh Open file handle
2208 @param[in] off Starting offset (SEEK_SET)
2209 @param[in] len Size of the hole
2210 @return DB_SUCCESS or error code */
2211 static
2212 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2213 os_file_punch_hole_posix(
2214 os_file_t fh,
2215 os_offset_t off,
2216 os_offset_t len)
2217 {
2218 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2219 const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2220
2221 int ret = fallocate(fh, mode, off, len);
2222
2223 if (ret == 0) {
2224 return(DB_SUCCESS);
2225 }
2226
2227 ut_a(ret == -1);
2228
2229 if (errno == ENOTSUP) {
2230 return(DB_IO_NO_PUNCH_HOLE);
2231 }
2232
2233 ib::warn()
2234 << "fallocate(" << fh
2235 <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2236 << off << ", " << len << ") returned errno: "
2237 << errno;
2238
2239 return(DB_IO_ERROR);
2240
2241 #elif defined(UNIV_SOLARIS)
2242
2243 // Use F_FREESP
2244
2245 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2246
2247 return(DB_IO_NO_PUNCH_HOLE);
2248 }
2249
2250 #if defined(LINUX_NATIVE_AIO)
2251
2252 /** Linux native AIO handler */
2253 class LinuxAIOHandler {
2254 public:
2255 /**
2256 @param[in] global_segment The global segment*/
LinuxAIOHandler(ulint global_segment)2257 LinuxAIOHandler(ulint global_segment)
2258 :
2259 m_global_segment(global_segment)
2260 {
2261 /* Should never be doing Sync IO here. */
2262 ut_a(m_global_segment != ULINT_UNDEFINED);
2263
2264 /* Find the array and the local segment. */
2265
2266 m_segment = AIO::get_array_and_local_segment(
2267 &m_array, m_global_segment);
2268
2269 m_n_slots = m_array->slots_per_segment();
2270 }
2271
2272 /** Destructor */
~LinuxAIOHandler()2273 ~LinuxAIOHandler()
2274 {
2275 // No op
2276 }
2277
2278 /**
2279 Process a Linux AIO request
2280 @param[out] m1 the messages passed with the
2281 @param[out] m2 AIO request; note that in case the
2282 AIO operation failed, these output
2283 parameters are valid and can be used to
2284 restart the operation.
2285 @param[out] request IO context
2286 @return DB_SUCCESS or error code */
2287 dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
2288
2289 private:
2290 /** Resubmit an IO request that was only partially successful
2291 @param[in,out] slot Request to resubmit
2292 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2293 dberr_t resubmit(Slot* slot);
2294
2295 /** Check if the AIO succeeded
2296 @param[in,out] slot The slot to check
2297 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2298 DB_IO_ERROR on all other errors */
2299 dberr_t check_state(Slot* slot);
2300
2301 /** @return true if a shutdown was detected */
is_shutdown() const2302 bool is_shutdown() const
2303 {
2304 return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2305 && !buf_page_cleaner_is_active);
2306 }
2307
2308 /** If no slot was found then the m_array->m_mutex will be released.
2309 @param[out] n_pending The number of pending IOs
2310 @return NULL or a slot that has completed IO */
2311 Slot* find_completed_slot(ulint* n_pending);
2312
2313 /** This is called from within the IO-thread. If there are no completed
2314 IO requests in the slot array, the thread calls this function to
2315 collect more requests from the Linux kernel.
2316 The IO-thread waits on io_getevents(), which is a blocking call, with
2317 a timeout value. Unless the system is very heavy loaded, keeping the
2318 IO-thread very busy, the io-thread will spend most of its time waiting
2319 in this function.
2320 The IO-thread also exits in this function. It checks server status at
2321 each wakeup and that is why we use timed wait in io_getevents(). */
2322 void collect();
2323
2324 private:
2325 /** Slot array */
2326 AIO* m_array;
2327
2328 /** Number of slots inthe local segment */
2329 ulint m_n_slots;
2330
2331 /** The local segment to check */
2332 ulint m_segment;
2333
2334 /** The global segment */
2335 ulint m_global_segment;
2336 };
2337
2338 /** Resubmit an IO request that was only partially successful
2339 @param[in,out] slot Request to resubmit
2340 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2341 dberr_t
resubmit(Slot * slot)2342 LinuxAIOHandler::resubmit(Slot* slot)
2343 {
2344 #ifdef UNIV_DEBUG
2345 /* Bytes already read/written out */
2346 ulint n_bytes = slot->ptr - slot->buf;
2347
2348 ut_ad(m_array->is_mutex_owned());
2349
2350 ut_ad(n_bytes < slot->original_len);
2351 ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2352 /* Partial read or write scenario */
2353 ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2354 #endif /* UNIV_DEBUG */
2355
2356 slot->len -= slot->n_bytes;
2357 slot->ptr += slot->n_bytes;
2358 slot->offset += slot->n_bytes;
2359
2360 /* Resetting the bytes read/written */
2361 slot->n_bytes = 0;
2362 slot->io_already_done = false;
2363
2364 /* make sure that slot->offset fits in off_t */
2365 ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2366
2367 struct iocb* iocb = &slot->control;
2368 if (slot->type.is_read()) {
2369 io_prep_pread(
2370 iocb,
2371 slot->file.m_file,
2372 slot->ptr,
2373 slot->len,
2374 slot->offset);
2375
2376 } else {
2377
2378 ut_a(slot->type.is_write());
2379
2380 io_prep_pwrite(
2381 iocb,
2382 slot->file.m_file,
2383 slot->ptr,
2384 slot->len,
2385 slot->offset);
2386 }
2387
2388 iocb->data = slot;
2389
2390 /* Resubmit an I/O request */
2391 int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2392
2393 if (ret < -1) {
2394 errno = -ret;
2395 }
2396
2397 return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2398 }
2399
2400 /** Check if the AIO succeeded
2401 @param[in,out] slot The slot to check
2402 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2403 DB_IO_ERROR on all other errors */
2404 dberr_t
check_state(Slot * slot)2405 LinuxAIOHandler::check_state(Slot* slot)
2406 {
2407 ut_ad(m_array->is_mutex_owned());
2408
2409 /* Note that it may be that there is more then one completed
2410 IO requests. We process them one at a time. We may have a case
2411 here to improve the performance slightly by dealing with all
2412 requests in one sweep. */
2413
2414 srv_set_io_thread_op_info(
2415 m_global_segment, "processing completed aio requests");
2416
2417 ut_ad(slot->io_already_done);
2418
2419 dberr_t err;
2420
2421 if (slot->ret == 0) {
2422
2423 err = AIOHandler::post_io_processing(slot);
2424
2425 } else {
2426 errno = -slot->ret;
2427
2428 /* os_file_handle_error does tell us if we should retry
2429 this IO. As it stands now, we don't do this retry when
2430 reaping requests from a different context than
2431 the dispatcher. This non-retry logic is the same for
2432 Windows and Linux native AIO.
2433 We should probably look into this to transparently
2434 re-submit the IO. */
2435 os_file_handle_error(slot->name, "Linux aio");
2436
2437 err = DB_IO_ERROR;
2438 }
2439
2440 return(err);
2441 }
2442
2443 /** If no slot was found then the m_array->m_mutex will be released.
2444 @param[out] n_pending The number of pending IOs
2445 @return NULL or a slot that has completed IO */
2446 Slot*
find_completed_slot(ulint * n_pending)2447 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
2448 {
2449 ulint offset = m_n_slots * m_segment;
2450
2451 *n_pending = 0;
2452
2453 m_array->acquire();
2454
2455 Slot* slot = m_array->at(offset);
2456
2457 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2458
2459 if (slot->is_reserved) {
2460
2461 ++*n_pending;
2462
2463 if (slot->io_already_done) {
2464
2465 /* Something for us to work on.
2466 Note: We don't release the mutex. */
2467 return(slot);
2468 }
2469 }
2470 }
2471
2472 m_array->release();
2473
2474 return(NULL);
2475 }
2476
2477 /** This function is only used in Linux native asynchronous i/o. This is
2478 called from within the io-thread. If there are no completed IO requests
2479 in the slot array, the thread calls this function to collect more
2480 requests from the kernel.
2481 The io-thread waits on io_getevents(), which is a blocking call, with
2482 a timeout value. Unless the system is very heavy loaded, keeping the
2483 io-thread very busy, the io-thread will spend most of its time waiting
2484 in this function.
2485 The io-thread also exits in this function. It checks server status at
2486 each wakeup and that is why we use timed wait in io_getevents(). */
2487 void
collect()2488 LinuxAIOHandler::collect()
2489 {
2490 ut_ad(m_n_slots > 0);
2491 ut_ad(m_array != NULL);
2492 ut_ad(m_segment < m_array->get_n_segments());
2493
2494 /* Which io_context we are going to use. */
2495 io_context* io_ctx = m_array->io_ctx(m_segment);
2496
2497 /* Starting point of the m_segment we will be working on. */
2498 ulint start_pos = m_segment * m_n_slots;
2499
2500 /* End point. */
2501 ulint end_pos = start_pos + m_n_slots;
2502
2503 for (;;) {
2504 struct io_event* events;
2505
2506 /* Which part of event array we are going to work on. */
2507 events = m_array->io_events(m_segment * m_n_slots);
2508
2509 /* Initialize the events. */
2510 memset(events, 0, sizeof(*events) * m_n_slots);
2511
2512 /* The timeout value is arbitrary. We probably need
2513 to experiment with it a little. */
2514 struct timespec timeout;
2515
2516 timeout.tv_sec = 0;
2517 timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
2518
2519 int ret;
2520
2521 ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
2522
2523 for (int i = 0; i < ret; ++i) {
2524
2525 struct iocb* iocb;
2526
2527 iocb = reinterpret_cast<struct iocb*>(events[i].obj);
2528 ut_a(iocb != NULL);
2529
2530 Slot* slot = reinterpret_cast<Slot*>(iocb->data);
2531
2532 /* Some sanity checks. */
2533 ut_a(slot != NULL);
2534 ut_a(slot->is_reserved);
2535
2536 /* We are not scribbling previous segment. */
2537 ut_a(slot->pos >= start_pos);
2538
2539 /* We have not overstepped to next segment. */
2540 ut_a(slot->pos < end_pos);
2541
2542 /* We never compress/decompress the first page */
2543
2544 if (slot->offset > 0
2545 && !slot->skip_punch_hole
2546 && slot->type.is_compression_enabled()
2547 && !slot->type.is_log()
2548 && slot->type.is_write()
2549 && slot->type.is_compressed()
2550 && slot->type.punch_hole()) {
2551
2552 slot->err = AIOHandler::io_complete(slot);
2553 } else {
2554 slot->err = DB_SUCCESS;
2555 }
2556
2557 /* Mark this request as completed. The error handling
2558 will be done in the calling function. */
2559 m_array->acquire();
2560
2561 /* events[i].res2 should always be ZERO */
2562 ut_ad(events[i].res2 == 0);
2563 slot->io_already_done = true;
2564
2565 /*Even though events[i].res is an unsigned number
2566 in libaio, it is used to return a negative value
2567 (negated errno value) to indicate error and a positive
2568 value to indicate number of bytes read or written. */
2569
2570 if (events[i].res > slot->len) {
2571 /* failure */
2572 slot->n_bytes = 0;
2573 slot->ret = events[i].res;
2574 } else {
2575 /* success */
2576 slot->n_bytes = events[i].res;
2577 slot->ret = 0;
2578 }
2579 m_array->release();
2580 }
2581
2582 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2583 || !buf_page_cleaner_is_active
2584 || ret > 0) {
2585
2586 break;
2587 }
2588
2589 /* This error handling is for any error in collecting the
2590 IO requests. The errors, if any, for any particular IO
2591 request are simply passed on to the calling routine. */
2592
2593 switch (ret) {
2594 case -EAGAIN:
2595 /* Not enough resources! Try again. */
2596
2597 case -EINTR:
2598 /* Interrupted! The behaviour in case of an interrupt.
2599 If we have some completed IOs available then the
2600 return code will be the number of IOs. We get EINTR
2601 only if there are no completed IOs and we have been
2602 interrupted. */
2603
2604 case 0:
2605 /* No pending request! Go back and check again. */
2606
2607 continue;
2608 }
2609
2610 /* All other errors should cause a trap for now. */
2611 ib::fatal()
2612 << "Unexpected ret_code[" << ret
2613 << "] from io_getevents()!";
2614
2615 break;
2616 }
2617 }
2618
2619 /** Process a Linux AIO request
2620 @param[out] m1 the messages passed with the
2621 @param[out] m2 AIO request; note that in case the
2622 AIO operation failed, these output
2623 parameters are valid and can be used to
2624 restart the operation.
2625 @param[out] request IO context
2626 @return DB_SUCCESS or error code */
2627 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)2628 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
2629 {
2630 dberr_t err;
2631 Slot* slot;
2632
2633 /* Loop until we have found a completed request. */
2634 for (;;) {
2635
2636 ulint n_pending;
2637
2638 slot = find_completed_slot(&n_pending);
2639
2640 if (slot != NULL) {
2641
2642 ut_ad(m_array->is_mutex_owned());
2643
2644 err = check_state(slot);
2645
2646 /* DB_FAIL is not a hard error, we should retry */
2647 if (err != DB_FAIL) {
2648 break;
2649 }
2650
2651 /* Partial IO, resubmit request for
2652 remaining bytes to read/write */
2653 err = resubmit(slot);
2654
2655 if (err != DB_SUCCESS) {
2656 break;
2657 }
2658
2659 m_array->release();
2660
2661 } else if (is_shutdown() && n_pending == 0) {
2662
2663 /* There is no completed request. If there is
2664 no pending request at all, and the system is
2665 being shut down, exit. */
2666
2667 *m1 = NULL;
2668 *m2 = NULL;
2669
2670 return(DB_SUCCESS);
2671
2672 } else {
2673
2674 /* Wait for some request. Note that we return
2675 from wait if we have found a request. */
2676
2677 srv_set_io_thread_op_info(
2678 m_global_segment,
2679 "waiting for completed aio requests");
2680
2681 collect();
2682 }
2683 }
2684
2685 if (err == DB_IO_PARTIAL_FAILED) {
2686 /* Aborting in case of submit failure */
2687 ib::fatal()
2688 << "Native Linux AIO interface. "
2689 "io_submit() call failed when "
2690 "resubmitting a partial I/O "
2691 "request on the file " << slot->name
2692 << ".";
2693 }
2694
2695 *m1 = slot->m1;
2696 *m2 = slot->m2;
2697
2698 *request = slot->type;
2699
2700 m_array->release(slot);
2701
2702 m_array->release();
2703
2704 return(err);
2705 }
2706
2707 /** This function is only used in Linux native asynchronous i/o.
2708 Waits for an aio operation to complete. This function is used to wait for
2709 the completed requests. The aio array of pending requests is divided
2710 into segments. The thread specifies which segment or slot it wants to wait
2711 for. NOTE: this function will also take care of freeing the aio slot,
2712 therefore no other thread is allowed to do the freeing!
2713
2714 @param[in] global_seg segment number in the aio array
2715 to wait for; segment 0 is the ibuf
2716 i/o thread, segment 1 is log i/o thread,
2717 then follow the non-ibuf read threads,
2718 and the last are the non-ibuf write
2719 threads.
2720 @param[out] m1 the messages passed with the
2721 @param[out] m2 AIO request; note that in case the
2722 AIO operation failed, these output
2723 parameters are valid and can be used to
2724 restart the operation.
2725 @param[out]xi request IO context
2726 @return DB_SUCCESS if the IO was successful */
2727 static
2728 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2729 os_aio_linux_handler(
2730 ulint global_segment,
2731 fil_node_t** m1,
2732 void** m2,
2733 IORequest* request)
2734 {
2735 LinuxAIOHandler handler(global_segment);
2736
2737 dberr_t err = handler.poll(m1, m2, request);
2738
2739 if (err == DB_IO_NO_PUNCH_HOLE) {
2740 fil_no_punch_hole(*m1);
2741 err = DB_SUCCESS;
2742 }
2743
2744 return(err);
2745 }
2746
2747 /** Dispatch an AIO request to the kernel.
2748 @param[in,out] slot an already reserved slot
2749 @return true on success. */
2750 bool
linux_dispatch(Slot * slot)2751 AIO::linux_dispatch(Slot* slot)
2752 {
2753 ut_a(slot->is_reserved);
2754 ut_ad(slot->type.validate());
2755
2756 /* Find out what we are going to work with.
2757 The iocb struct is directly in the slot.
2758 The io_context is one per segment. */
2759
2760 ulint io_ctx_index;
2761 struct iocb* iocb = &slot->control;
2762
2763 io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2764
2765 int ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
2766
2767 /* io_submit() returns number of successfully queued requests
2768 or -errno. */
2769
2770 if (ret != 1) {
2771 errno = -ret;
2772 }
2773
2774 return(ret == 1);
2775 }
2776
2777 /** Creates an io_context for native linux AIO.
2778 @param[in] max_events number of events
2779 @param[out] io_ctx io_ctx to initialize.
2780 @return true on success. */
2781 bool
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)2782 AIO::linux_create_io_ctx(
2783 ulint max_events,
2784 io_context_t* io_ctx)
2785 {
2786 ssize_t n_retries = 0;
2787
2788 for (;;) {
2789
2790 memset(io_ctx, 0x0, sizeof(*io_ctx));
2791
2792 /* Initialize the io_ctx. Tell it how many pending
2793 IO requests this context will handle. */
2794
2795 int ret = io_setup(max_events, io_ctx);
2796
2797 if (ret == 0) {
2798 /* Success. Return now. */
2799 return(true);
2800 }
2801
2802 /* If we hit EAGAIN we'll make a few attempts before failing. */
2803
2804 switch (ret) {
2805 case -EAGAIN:
2806 if (n_retries == 0) {
2807 /* First time around. */
2808 ib::warn()
2809 << "io_setup() failed with EAGAIN."
2810 " Will make "
2811 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2812 << " attempts before giving up.";
2813 }
2814
2815 if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2816
2817 ++n_retries;
2818
2819 ib::warn()
2820 << "io_setup() attempt "
2821 << n_retries << ".";
2822
2823 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2824
2825 continue;
2826 }
2827
2828 /* Have tried enough. Better call it a day. */
2829 ib::error()
2830 << "io_setup() failed with EAGAIN after "
2831 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2832 << " attempts.";
2833 break;
2834
2835 case -ENOSYS:
2836 ib::error()
2837 << "Linux Native AIO interface"
2838 " is not supported on this platform. Please"
2839 " check your OS documentation and install"
2840 " appropriate binary of InnoDB.";
2841
2842 break;
2843
2844 default:
2845 ib::error()
2846 << "Linux Native AIO setup"
2847 << " returned following error["
2848 << ret << "]";
2849 break;
2850 }
2851
2852 ib::info()
2853 << "You can disable Linux Native AIO by"
2854 " setting innodb_use_native_aio = 0 in my.cnf";
2855
2856 break;
2857 }
2858
2859 return(false);
2860 }
2861
2862 /** Checks if the system supports native linux aio. On some kernel
2863 versions where native aio is supported it won't work on tmpfs. In such
2864 cases we can't use native aio as it is not possible to mix simulated
2865 and native aio.
2866 @return: true if supported, false otherwise. */
2867 bool
is_linux_native_aio_supported()2868 AIO::is_linux_native_aio_supported()
2869 {
2870 int fd;
2871 io_context_t io_ctx;
2872 char name[1000];
2873
2874 if (!linux_create_io_ctx(1, &io_ctx)) {
2875
2876 /* The platform does not support native aio. */
2877
2878 return(false);
2879
2880 } else if (!srv_read_only_mode) {
2881
2882 /* Now check if tmpdir supports native aio ops. */
2883 fd = innobase_mysql_tmpfile(NULL);
2884
2885 if (fd < 0) {
2886 ib::warn()
2887 << "Unable to create temp file to check"
2888 " native AIO support.";
2889
2890 return(false);
2891 }
2892 } else {
2893
2894 os_normalize_path(srv_log_group_home_dir);
2895
2896 ulint dirnamelen = strlen(srv_log_group_home_dir);
2897
2898 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2899
2900 memcpy(name, srv_log_group_home_dir, dirnamelen);
2901
2902 /* Add a path separator if needed. */
2903 if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2904
2905 name[dirnamelen++] = OS_PATH_SEPARATOR;
2906 }
2907
2908 strcpy(name + dirnamelen, "ib_logfile0");
2909
2910 fd = ::open(name, O_RDONLY);
2911
2912 if (fd == -1) {
2913
2914 ib::warn()
2915 << "Unable to open"
2916 << " \"" << name << "\" to check native"
2917 << " AIO read support.";
2918
2919 return(false);
2920 }
2921 }
2922
2923 struct io_event io_event;
2924
2925 memset(&io_event, 0x0, sizeof(io_event));
2926
2927 byte* buf = static_cast<byte*>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
2928 byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
2929
2930 struct iocb iocb;
2931
2932 /* Suppress valgrind warning. */
2933 memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
2934 memset(&iocb, 0x0, sizeof(iocb));
2935
2936 struct iocb* p_iocb = &iocb;
2937
2938 if (!srv_read_only_mode) {
2939
2940 io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
2941
2942 } else {
2943 ut_a(UNIV_PAGE_SIZE >= 512);
2944 io_prep_pread(p_iocb, fd, ptr, 512, 0);
2945 }
2946
2947 int err = io_submit(io_ctx, 1, &p_iocb);
2948
2949 if (err >= 1) {
2950 /* Now collect the submitted IO request. */
2951 err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2952 }
2953
2954 ut_free(buf);
2955 close(fd);
2956
2957 switch (err) {
2958 case 1:
2959 return(true);
2960
2961 case -EINVAL:
2962 case -ENOSYS:
2963 ib::error()
2964 << "Linux Native AIO not supported. You can either"
2965 " move "
2966 << (srv_read_only_mode ? name : "tmpdir")
2967 << " to a file system that supports native"
2968 " AIO or you can set innodb_use_native_aio to"
2969 " FALSE to avoid this message.";
2970
2971 /* fall through. */
2972 default:
2973 ib::error()
2974 << "Linux Native AIO check on "
2975 << (srv_read_only_mode ? name : "tmpdir")
2976 << "returned error[" << -err << "]";
2977 }
2978
2979 return(false);
2980 }
2981
2982 #endif /* LINUX_NATIVE_AIO */
2983
2984 /** Retrieves the last error number if an error occurs in a file io function.
2985 The number should be retrieved before any other OS calls (because they may
2986 overwrite the error number). If the number is not known to this program,
2987 the OS error number + 100 is returned.
2988 @param[in] report_all_errors true if we want an error message
2989 printed of all errors
2990 @param[in] on_error_silent true then don't print any diagnostic
2991 to the log
2992 @return error number, or OS error number + 100 */
2993 static
2994 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)2995 os_file_get_last_error_low(
2996 bool report_all_errors,
2997 bool on_error_silent)
2998 {
2999 int err = errno;
3000
3001 if (err == 0) {
3002 return(0);
3003 }
3004
3005 if (report_all_errors
3006 || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
3007
3008 ib::error()
3009 << "Operating system error number "
3010 << err
3011 << " in a file operation.";
3012
3013 if (err == ENOENT) {
3014
3015 ib::error()
3016 << "The error means the system"
3017 " cannot find the path specified.";
3018
3019 if (srv_is_being_started) {
3020
3021 ib::error()
3022 << "If you are installing InnoDB,"
3023 " remember that you must create"
3024 " directories yourself, InnoDB"
3025 " does not create them.";
3026 }
3027 } else if (err == EACCES) {
3028
3029 ib::error()
3030 << "The error means mysqld does not have"
3031 " the access rights to the directory.";
3032
3033 } else {
3034 if (strerror(err) != NULL) {
3035
3036 ib::error()
3037 << "Error number " << err << " means '"
3038 << strerror(err) << "'";
3039 }
3040
3041 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3042 }
3043 }
3044
3045 switch (err) {
3046 case ENOSPC:
3047 return(OS_FILE_DISK_FULL);
3048 case ENOENT:
3049 return(OS_FILE_NOT_FOUND);
3050 case EEXIST:
3051 return(OS_FILE_ALREADY_EXISTS);
3052 case EXDEV:
3053 case ENOTDIR:
3054 case EISDIR:
3055 return(OS_FILE_PATH_ERROR);
3056 case EAGAIN:
3057 if (srv_use_native_aio) {
3058 return(OS_FILE_AIO_RESOURCES_RESERVED);
3059 }
3060 break;
3061 case EINTR:
3062 if (srv_use_native_aio) {
3063 return(OS_FILE_AIO_INTERRUPTED);
3064 }
3065 break;
3066 case EACCES:
3067 return(OS_FILE_ACCESS_VIOLATION);
3068 }
3069 return(OS_FILE_ERROR_MAX + err);
3070 }
3071
3072 /** Wrapper to fsync(2) that retries the call on some errors.
3073 Returns the value 0 if successful; otherwise the value -1 is returned and
3074 the global variable errno is set to indicate the error.
3075 @param[in] file open file handle
3076 @return 0 if success, -1 otherwise */
3077 static
3078 int
os_file_fsync_posix(os_file_t file)3079 os_file_fsync_posix(
3080 os_file_t file)
3081 {
3082 ulint failures = 0;
3083
3084 for (;;) {
3085
3086 ++os_n_fsyncs;
3087
3088 int ret = fsync(file);
3089
3090 if (ret == 0) {
3091 return(ret);
3092 }
3093
3094 switch(errno) {
3095 case ENOLCK:
3096
3097 ++failures;
3098 ut_a(failures < 1000);
3099
3100 if (!(failures % 100)) {
3101
3102 ib::warn()
3103 << "fsync(): "
3104 << "No locks available; retrying";
3105 }
3106
3107 /* 0.2 sec */
3108 os_thread_sleep(200000);
3109 break;
3110
3111 case EIO:
3112
3113 ib::fatal()
3114 << "fsync() returned EIO, aborting.";
3115 break;
3116
3117 case EINTR:
3118
3119 ++failures;
3120 ut_a(failures < 2000);
3121 break;
3122
3123 default:
3124 ut_error;
3125 break;
3126 }
3127 }
3128
3129 ut_error;
3130
3131 return(-1);
3132 }
3133
3134 /** Check the existence and type of the given file.
3135 @param[in] path path name of file
3136 @param[out] exists true if the file exists
3137 @param[out] type Type of the file, if it exists
3138 @return true if call succeeded */
3139 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)3140 os_file_status_posix(
3141 const char* path,
3142 bool* exists,
3143 os_file_type_t* type)
3144 {
3145 struct stat statinfo;
3146
3147 int ret = stat(path, &statinfo);
3148
3149 *exists = !ret;
3150
3151 if (!ret) {
3152 /* file exists, everything OK */
3153
3154 } else if (errno == ENOENT || errno == ENOTDIR
3155 || errno == ENAMETOOLONG) {
3156 /* file does not exist */
3157 return(true);
3158
3159 } else {
3160 /* file exists, but stat call failed */
3161 os_file_handle_error_no_exit(path, "stat", false);
3162 return(false);
3163 }
3164
3165 if (S_ISDIR(statinfo.st_mode)) {
3166 *type = OS_FILE_TYPE_DIR;
3167
3168 } else if (S_ISLNK(statinfo.st_mode)) {
3169 *type = OS_FILE_TYPE_LINK;
3170
3171 } else if (S_ISREG(statinfo.st_mode)) {
3172 *type = OS_FILE_TYPE_FILE;
3173
3174 } else {
3175 *type = OS_FILE_TYPE_UNKNOWN;
3176 }
3177
3178 return(true);
3179 }
3180
3181 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3182 function!
3183 Flushes the write buffers of a given file to the disk.
3184 @param[in] file handle to a file
3185 @return true if success */
3186 bool
os_file_flush_func(os_file_t file)3187 os_file_flush_func(
3188 os_file_t file)
3189 {
3190 int ret;
3191
3192 ret = os_file_fsync_posix(file);
3193
3194 if (ret == 0) {
3195 return(true);
3196 }
3197
3198 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
3199 we choose to ignore that error if we are using raw disks */
3200
3201 if (srv_start_raw_disk_in_use && errno == EINVAL) {
3202
3203 return(true);
3204 }
3205
3206 ib::error() << "The OS said file flush did not succeed";
3207
3208 os_file_handle_error(NULL, "flush");
3209
3210 /* It is a fatal error if a file flush does not succeed, because then
3211 the database can get corrupt on disk */
3212 ut_error;
3213
3214 return(false);
3215 }
3216
3217 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3218 this function!
3219 A simple function to open or create a file.
3220 @param[in] name name of the file or path as a null-terminated
3221 string
3222 @param[in] create_mode create mode
3223 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3224 @param[in] read_only if true, read only checks are enforced
3225 @param[out] success true if succeed, false if error
3226 @return handle to the file, not defined if error, error number
3227 can be retrieved with os_file_get_last_error */
3228 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3229 os_file_create_simple_func(
3230 const char* name,
3231 ulint create_mode,
3232 ulint access_type,
3233 bool read_only,
3234 bool* success)
3235 {
3236 pfs_os_file_t file;
3237
3238 *success = false;
3239
3240 int create_flag;
3241
3242 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3243 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3244
3245 if (create_mode == OS_FILE_OPEN) {
3246
3247 if (access_type == OS_FILE_READ_ONLY) {
3248
3249 create_flag = O_RDONLY;
3250
3251 } else if (read_only) {
3252
3253 create_flag = O_RDONLY;
3254
3255 } else {
3256 create_flag = O_RDWR;
3257 }
3258
3259 } else if (read_only) {
3260
3261 create_flag = O_RDONLY;
3262
3263 } else if (create_mode == OS_FILE_CREATE) {
3264
3265 create_flag = O_RDWR | O_CREAT | O_EXCL;
3266
3267 } else if (create_mode == OS_FILE_CREATE_PATH) {
3268
3269 /* Create subdirs along the path if needed. */
3270
3271 *success = os_file_create_subdirs_if_needed(name);
3272
3273 if (!*success) {
3274
3275 ib::error()
3276 << "Unable to create subdirectories '"
3277 << name << "'";
3278
3279 file.m_file = OS_FILE_CLOSED;
3280 return(file);
3281 }
3282
3283 create_flag = O_RDWR | O_CREAT | O_EXCL;
3284 create_mode = OS_FILE_CREATE;
3285 } else {
3286
3287 ib::error()
3288 << "Unknown file create mode ("
3289 << create_mode
3290 << " for file '" << name << "'";
3291
3292 file.m_file = OS_FILE_CLOSED;
3293 return(file);
3294 }
3295
3296 bool retry;
3297
3298 do {
3299 file.m_file = ::open(name, create_flag, os_innodb_umask);
3300
3301 if (file.m_file == -1) {
3302 *success = false;
3303
3304 retry = os_file_handle_error(
3305 name,
3306 create_mode == OS_FILE_OPEN
3307 ? "open" : "create");
3308 } else {
3309 *success = true;
3310 retry = false;
3311 }
3312
3313 } while (retry);
3314
3315 #ifdef USE_FILE_LOCK
3316 if (!read_only
3317 && *success
3318 && access_type == OS_FILE_READ_WRITE
3319 && os_file_lock(file.m_file, name)) {
3320
3321 *success = false;
3322 close(file.m_file);
3323 file.m_file = -1;
3324 }
3325 #endif /* USE_FILE_LOCK */
3326
3327 return(file);
3328 }
3329
3330 /** This function attempts to create a directory named pathname. The new
3331 directory gets default permissions. On Unix the permissions are
3332 (0770 & ~umask). If the directory exists already, nothing is done and
the call succeeds, unless the fail_if_exists argument is true.
3334 If another error occurs, such as a permission error, this does not crash,
3335 but reports the error and returns false.
3336 @param[in] pathname directory name as null-terminated string
3337 @param[in] fail_if_exists if true, pre-existing directory is treated as
3338 an error.
3339 @return true if call succeeds, false on error */
3340 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3341 os_file_create_directory(
3342 const char* pathname,
3343 bool fail_if_exists)
3344 {
3345 int rcode = mkdir(pathname, 0770);
3346
3347 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3348 /* failure */
3349 os_file_handle_error_no_exit(pathname, "mkdir", false);
3350
3351 return(false);
3352 }
3353
3354 return(true);
3355 }
3356
3357 /**
3358 The os_file_opendir() function opens a directory stream corresponding to the
3359 directory named by the dirname argument. The directory stream is positioned
3360 at the first entry. In both Unix and Windows we automatically skip the '.'
3361 and '..' items at the start of the directory listing.
3362 @param[in] dirname directory name; it must not contain a trailing
3363 '\' or '/'
@param[in]	error_is_fatal	true if we should treat an error as a fatal
3365 error; if we try to open symlinks then we do
3366 not wish a fatal error if it happens not to be
3367 a directory
3368 @return directory stream, NULL if error */
3369 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3370 os_file_opendir(
3371 const char* dirname,
3372 bool error_is_fatal)
3373 {
3374 os_file_dir_t dir;
3375 dir = opendir(dirname);
3376
3377 if (dir == NULL && error_is_fatal) {
3378 os_file_handle_error(dirname, "opendir");
3379 }
3380
3381 return(dir);
3382 }
3383
3384 /** Closes a directory stream.
3385 @param[in] dir directory stream
3386 @return 0 if success, -1 if failure */
3387 int
os_file_closedir(os_file_dir_t dir)3388 os_file_closedir(
3389 os_file_dir_t dir)
3390 {
3391 int ret = closedir(dir);
3392
3393 if (ret != 0) {
3394 os_file_handle_error_no_exit(NULL, "closedir", false);
3395 }
3396
3397 return(ret);
3398 }
3399
3400 /** This function returns information of the next file in the directory. We jump
3401 over the '.' and '..' entries in the directory.
3402 @param[in] dirname directory name or path
3403 @param[in] dir directory stream
3404 @param[out] info buffer where the info is returned
3405 @return 0 if ok, -1 if error, 1 if at the end of the directory */
3406 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)3407 os_file_readdir_next_file(
3408 const char* dirname,
3409 os_file_dir_t dir,
3410 os_file_stat_t* info)
3411 {
3412 struct dirent* ent;
3413 char* full_path;
3414 int ret;
3415 struct stat statinfo;
3416
3417 #ifdef HAVE_READDIR_R
3418 char dirent_buf[sizeof(struct dirent)
3419 + _POSIX_PATH_MAX + 100];
3420 /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
3421 the max file name len; but in most standards, the
3422 length is NAME_MAX; we add 100 to be even safer */
3423 #endif /* HAVE_READDIR_R */
3424
3425 next_file:
3426
3427 #ifdef HAVE_READDIR_R
3428 ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
3429
3430 if (ret != 0) {
3431
3432 ib::error()
3433 << "Cannot read directory " << dirname
3434 << " error: " << ret;
3435
3436 return(-1);
3437 }
3438
3439 if (ent == NULL) {
3440 /* End of directory */
3441
3442 return(1);
3443 }
3444
3445 ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
3446 #else
3447 ent = readdir(dir);
3448
3449 if (ent == NULL) {
3450
3451 return(1);
3452 }
3453 #endif /* HAVE_READDIR_R */
3454 ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
3455
3456 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
3457
3458 goto next_file;
3459 }
3460
3461 strcpy(info->name, ent->d_name);
3462
3463 full_path = static_cast<char*>(
3464 ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
3465
3466 sprintf(full_path, "%s/%s", dirname, ent->d_name);
3467
3468 ret = stat(full_path, &statinfo);
3469
3470 if (ret) {
3471
3472 if (errno == ENOENT) {
3473 /* readdir() returned a file that does not exist,
3474 it must have been deleted in the meantime. Do what
3475 would have happened if the file was deleted before
3476 readdir() - ignore and go to the next entry.
3477 If this is the last entry then info->name will still
3478 contain the name of the deleted file when this
3479 function returns, but this is not an issue since the
3480 caller shouldn't be looking at info when end of
3481 directory is returned. */
3482
3483 ut_free(full_path);
3484
3485 goto next_file;
3486 }
3487
3488 os_file_handle_error_no_exit(full_path, "stat", false);
3489
3490 ut_free(full_path);
3491
3492 return(-1);
3493 }
3494
3495 info->size = statinfo.st_size;
3496
3497 if (S_ISDIR(statinfo.st_mode)) {
3498 info->type = OS_FILE_TYPE_DIR;
3499 } else if (S_ISLNK(statinfo.st_mode)) {
3500 info->type = OS_FILE_TYPE_LINK;
3501 } else if (S_ISREG(statinfo.st_mode)) {
3502 info->type = OS_FILE_TYPE_FILE;
3503 } else {
3504 info->type = OS_FILE_TYPE_UNKNOWN;
3505 }
3506
3507 ut_free(full_path);
3508
3509 return(0);
3510 }
3511
3512 /** NOTE! Use the corresponding macro os_file_create(), not directly
3513 this function!
3514 Opens an existing file or creates a new.
3515 @param[in] name name of the file or path as a null-terminated
3516 string
3517 @param[in] create_mode create mode
3518 @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
3519 is desired, OS_FILE_NORMAL, if any normal file;
3520 NOTE that it also depends on type, os_aio_..
3521 and srv_.. variables whether we really use async
3522 I/O or unbuffered I/O: look in the function
3523 source code for the exact rules
3524 @param[in] type OS_DATA_FILE or OS_LOG_FILE
@param[in]	read_only	true, if read only checks should be enforced
@param[out]	success		true if succeeded
3527 @return handle to the file, not defined if error, error number
3528 can be retrieved with os_file_get_last_error */
3529 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)3530 os_file_create_func(
3531 const char* name,
3532 ulint create_mode,
3533 ulint purpose,
3534 ulint type,
3535 bool read_only,
3536 bool* success)
3537 {
3538 bool on_error_no_exit;
3539 bool on_error_silent;
3540 pfs_os_file_t file;
3541
3542 *success = false;
3543
3544 DBUG_EXECUTE_IF(
3545 "ib_create_table_fail_disk_full",
3546 *success = false;
3547 errno = ENOSPC;
3548 file.m_file = OS_FILE_CLOSED;
3549 return(file);
3550 );
3551
3552 int create_flag;
3553 const char* mode_str = NULL;
3554
3555 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
3556 ? true : false;
3557 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
3558 ? true : false;
3559
3560 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
3561 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
3562
3563 if (create_mode == OS_FILE_OPEN
3564 || create_mode == OS_FILE_OPEN_RAW
3565 || create_mode == OS_FILE_OPEN_RETRY) {
3566
3567 mode_str = "OPEN";
3568
3569 create_flag = read_only ? O_RDONLY : O_RDWR;
3570
3571 } else if (read_only) {
3572
3573 mode_str = "OPEN";
3574
3575 create_flag = O_RDONLY;
3576
3577 } else if (create_mode == OS_FILE_CREATE) {
3578
3579 mode_str = "CREATE";
3580 create_flag = O_RDWR | O_CREAT | O_EXCL;
3581
3582 } else if (create_mode == OS_FILE_OVERWRITE) {
3583
3584 mode_str = "OVERWRITE";
3585 create_flag = O_RDWR | O_CREAT | O_TRUNC;
3586
3587 } else {
3588 ib::error()
3589 << "Unknown file create mode (" << create_mode << ")"
3590 << " for file '" << name << "'";
3591
3592 file.m_file = OS_FILE_CLOSED;
3593 return(file);
3594 }
3595
3596 ut_a(type == OS_LOG_FILE
3597 || type == OS_DATA_FILE
3598 || type == OS_DATA_TEMP_FILE);
3599
3600 ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
3601
3602 #ifdef O_SYNC
3603 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
3604 O_SYNC because the datasync options seemed to corrupt files in 2001
3605 in both Linux and Solaris */
3606
3607 if (!read_only
3608 && type == OS_LOG_FILE
3609 && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
3610
3611 create_flag |= O_SYNC;
3612 }
3613 #endif /* O_SYNC */
3614
3615 bool retry;
3616
3617 do {
3618 file.m_file = ::open(name, create_flag, os_innodb_umask);
3619
3620 if (file.m_file == -1) {
3621 const char* operation;
3622
3623 operation = (create_mode == OS_FILE_CREATE
3624 && !read_only) ? "create" : "open";
3625
3626 *success = false;
3627
3628 if (on_error_no_exit) {
3629 retry = os_file_handle_error_no_exit(
3630 name, operation, on_error_silent);
3631 } else {
3632 retry = os_file_handle_error(name, operation);
3633 }
3634 } else {
3635 *success = true;
3636 retry = false;
3637 }
3638
3639 } while (retry);
3640
3641 /* We disable OS caching (O_DIRECT) only on data files */
3642
3643 if (!read_only
3644 && *success
3645 && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE)
3646 && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
3647 || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
3648
3649 os_file_set_nocache(file.m_file, name, mode_str);
3650 }
3651
3652 #ifdef USE_FILE_LOCK
3653 if (!read_only
3654 && *success
3655 && create_mode != OS_FILE_OPEN_RAW
3656 && os_file_lock(file.m_file, name)) {
3657
3658 if (create_mode == OS_FILE_OPEN_RETRY) {
3659
3660 ib::info()
3661 << "Retrying to lock the first data file";
3662
3663 for (int i = 0; i < 100; i++) {
3664 os_thread_sleep(1000000);
3665
3666 if (!os_file_lock(file.m_file, name)) {
3667 *success = true;
3668 return(file);
3669 }
3670 }
3671
3672 ib::info()
3673 << "Unable to open the first data file";
3674 }
3675
3676 *success = false;
3677 close(file.m_file);
3678 file.m_file = -1;
3679 }
3680 #endif /* USE_FILE_LOCK */
3681
3682 return(file);
3683 }
3684
3685 /** NOTE! Use the corresponding macro
3686 os_file_create_simple_no_error_handling(), not directly this function!
3687 A simple function to open or create a file.
3688 @param[in] name name of the file or path as a null-terminated
3689 string
3690 @param[in] create_mode create mode
3691 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3692 OS_FILE_READ_ALLOW_DELETE; the last option
3693 is used by a backup program reading the file
3694 @param[in] read_only if true read only mode checks are enforced
3695 @param[out] success true if succeeded
3696 @return own: handle to the file, not defined if error, error number
3697 can be retrieved with os_file_get_last_error */
3698 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3699 os_file_create_simple_no_error_handling_func(
3700 const char* name,
3701 ulint create_mode,
3702 ulint access_type,
3703 bool read_only,
3704 bool* success)
3705 {
3706 pfs_os_file_t file;
3707 int create_flag;
3708
3709 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3710 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3711
3712 *success = false;
3713
3714 if (create_mode == OS_FILE_OPEN) {
3715
3716 if (access_type == OS_FILE_READ_ONLY) {
3717
3718 create_flag = O_RDONLY;
3719
3720 } else if (read_only) {
3721
3722 create_flag = O_RDONLY;
3723
3724 } else {
3725
3726 ut_a(access_type == OS_FILE_READ_WRITE
3727 || access_type == OS_FILE_READ_ALLOW_DELETE);
3728
3729 create_flag = O_RDWR;
3730 }
3731
3732 } else if (read_only) {
3733
3734 create_flag = O_RDONLY;
3735
3736 } else if (create_mode == OS_FILE_CREATE) {
3737
3738 create_flag = O_RDWR | O_CREAT | O_EXCL;
3739
3740 } else {
3741
3742 ib::error()
3743 << "Unknown file create mode "
3744 << create_mode << " for file '" << name << "'";
3745 file.m_file = OS_FILE_CLOSED;
3746 return(file);
3747 }
3748
3749 file.m_file = ::open(name, create_flag, os_innodb_umask);
3750
3751 *success = (file.m_file != -1);
3752
3753 #ifdef USE_FILE_LOCK
3754 if (!read_only
3755 && *success
3756 && access_type == OS_FILE_READ_WRITE
3757 && os_file_lock(file.m_file, name)) {
3758
3759 *success = false;
3760 close(file.m_file);
3761 file.m_file = -1;
3762
3763 }
3764 #endif /* USE_FILE_LOCK */
3765
3766 return(file);
3767 }
3768
3769 /** Deletes a file if it exists. The file has to be closed before calling this.
3770 @param[in] name file path as a null-terminated string
3771 @param[out] exist indicate if file pre-exist
3772 @return true if success */
3773 bool
os_file_delete_if_exists_func(const char * name,bool * exist)3774 os_file_delete_if_exists_func(
3775 const char* name,
3776 bool* exist)
3777 {
3778 if (exist != NULL) {
3779 *exist = true;
3780 }
3781
3782 int ret = unlink(name);
3783
3784 if (ret != 0 && errno == ENOENT) {
3785 if (exist != NULL) {
3786 *exist = false;
3787 }
3788 } else if (ret != 0 && errno != ENOENT) {
3789 os_file_handle_error_no_exit(name, "delete", false);
3790
3791 return(false);
3792 }
3793
3794 return(true);
3795 }
3796
3797 /** Deletes a file. The file has to be closed before calling this.
3798 @param[in] name file path as a null-terminated string
3799 @return true if success */
3800 bool
os_file_delete_func(const char * name)3801 os_file_delete_func(
3802 const char* name)
3803 {
3804 int ret = unlink(name);
3805
3806 if (ret != 0) {
3807 os_file_handle_error_no_exit(name, "delete", false);
3808
3809 return(false);
3810 }
3811
3812 return(true);
3813 }
3814
3815 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3816 function!
3817 Renames a file (can also move it to another directory). It is safest that the
3818 file is closed before calling this function.
3819 @param[in] oldpath old file path as a null-terminated string
3820 @param[in] newpath new file path
3821 @return true if success */
3822 bool
os_file_rename_func(const char * oldpath,const char * newpath)3823 os_file_rename_func(
3824 const char* oldpath,
3825 const char* newpath)
3826 {
3827 #ifdef UNIV_DEBUG
3828 os_file_type_t type;
3829 bool exists;
3830
3831 /* New path must not exist. */
3832 ut_ad(os_file_status(newpath, &exists, &type));
3833 ut_ad(!exists);
3834
3835 /* Old path must exist. */
3836 ut_ad(os_file_status(oldpath, &exists, &type));
3837 ut_ad(exists);
3838 #endif /* UNIV_DEBUG */
3839
3840 int ret = rename(oldpath, newpath);
3841
3842 if (ret != 0) {
3843 os_file_handle_error_no_exit(oldpath, "rename", false);
3844
3845 return(false);
3846 }
3847
3848 return(true);
3849 }
3850
3851 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3852 function!
3853 Closes a file handle. In case of error, error number can be retrieved with
3854 os_file_get_last_error.
3855 @param[in] file Handle to close
3856 @return true if success */
3857 bool
os_file_close_func(os_file_t file)3858 os_file_close_func(
3859 os_file_t file)
3860 {
3861 int ret = close(file);
3862
3863 if (ret == -1) {
3864 os_file_handle_error(NULL, "close");
3865
3866 return(false);
3867 }
3868
3869 return(true);
3870 }
3871
3872 /** Gets a file size.
3873 @param[in] file handle to an open file
3874 @return file size, or (os_offset_t) -1 on failure */
3875 os_offset_t
os_file_get_size(pfs_os_file_t file)3876 os_file_get_size(
3877 pfs_os_file_t file)
3878 {
3879 /* Store current position */
3880 os_offset_t pos = lseek(file.m_file, 0, SEEK_CUR);
3881 os_offset_t file_size = lseek(file.m_file, 0, SEEK_END);
3882
3883 /* Restore current position as the function should not change it */
3884 lseek(file.m_file, pos, SEEK_SET);
3885
3886 return(file_size);
3887 }
3888
3889 /** Gets a file size.
3890 @param[in] filename Full path to the filename to check
3891 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3892 errno */
3893 os_file_size_t
os_file_get_size(const char * filename)3894 os_file_get_size(
3895 const char* filename)
3896 {
3897 struct stat s;
3898 os_file_size_t file_size;
3899
3900 int ret = stat(filename, &s);
3901
3902 if (ret == 0) {
3903 file_size.m_total_size = s.st_size;
3904 /* st_blocks is in 512 byte sized blocks */
3905 file_size.m_alloc_size = s.st_blocks * 512;
3906 } else {
3907 file_size.m_total_size = ~0;
3908 file_size.m_alloc_size = (os_offset_t) errno;
3909 }
3910
3911 return(file_size);
3912 }
3913
3914 /** This function returns information about the specified file
3915 @param[in] path pathname of the file
3916 @param[out] stat_info information of a file in a directory
3917 @param[in,out] statinfo information of a file in a directory
3918 @param[in] check_rw_perm for testing whether the file can be opened
3919 in RW mode
3920 @param[in] read_only if true read only mode checks are enforced
3921 @return DB_SUCCESS if all OK */
3922 static
3923 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3924 os_file_get_status_posix(
3925 const char* path,
3926 os_file_stat_t* stat_info,
3927 struct stat* statinfo,
3928 bool check_rw_perm,
3929 bool read_only)
3930 {
3931 int ret = stat(path, statinfo);
3932
3933 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3934 /* file does not exist */
3935
3936 return(DB_NOT_FOUND);
3937
3938 } else if (ret) {
3939 /* file exists, but stat call failed */
3940
3941 os_file_handle_error_no_exit(path, "stat", false);
3942
3943 return(DB_FAIL);
3944 }
3945
3946 switch (statinfo->st_mode & S_IFMT) {
3947 case S_IFDIR:
3948 stat_info->type = OS_FILE_TYPE_DIR;
3949 break;
3950 case S_IFLNK:
3951 stat_info->type = OS_FILE_TYPE_LINK;
3952 break;
3953 case S_IFBLK:
3954 /* Handle block device as regular file. */
3955 case S_IFCHR:
3956 /* Handle character device as regular file. */
3957 case S_IFREG:
3958 stat_info->type = OS_FILE_TYPE_FILE;
3959 break;
3960 default:
3961 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3962 }
3963
3964 stat_info->size = statinfo->st_size;
3965 stat_info->block_size = statinfo->st_blksize;
3966 stat_info->alloc_size = statinfo->st_blocks * 512;
3967
3968 if (check_rw_perm
3969 && (stat_info->type == OS_FILE_TYPE_FILE
3970 || stat_info->type == OS_FILE_TYPE_BLOCK)) {
3971
3972 int access = !read_only ? O_RDWR : O_RDONLY;
3973 int fh = ::open(path, access, os_innodb_umask);
3974
3975 if (fh == -1) {
3976 stat_info->rw_perm = false;
3977 } else {
3978 stat_info->rw_perm = true;
3979 close(fh);
3980 }
3981 }
3982
3983 return(DB_SUCCESS);
3984 }
3985
3986 /** Truncates a file to a specified size in bytes.
3987 Do nothing if the size to preserve is greater or equal to the current
3988 size of the file.
3989 @param[in] pathname file path
3990 @param[in] file file to be truncated
3991 @param[in] size size to preserve in bytes
3992 @return true if success */
3993 static
3994 bool
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)3995 os_file_truncate_posix(
3996 const char* pathname,
3997 pfs_os_file_t file,
3998 os_offset_t size)
3999 {
4000 int res = ftruncate(file.m_file, size);
4001 if (res == -1) {
4002
4003 bool retry;
4004
4005 retry = os_file_handle_error_no_exit(
4006 pathname, "truncate", false);
4007
4008 if (retry) {
4009 ib::warn()
4010 << "Truncate failed for '"
4011 << pathname << "'";
4012 }
4013 }
4014
4015 return(res == 0);
4016 }
4017
4018 /** Truncates a file at its current position.
4019 @return true if success */
bool
os_file_set_eof(
	FILE*	file)	/*!< in: file to be truncated */
{
	/* Cut the underlying descriptor at the stream's current offset;
	ftruncate() returns 0 on success. */
	const int	rc = ftruncate(fileno(file), ftell(file));

	return(rc == 0);
}
4026
4027 #ifdef UNIV_HOTBACKUP
4028 /** Closes a file handle.
4029 @param[in] file Handle to a file
4030 @return true if success */
4031 bool
os_file_close_no_error_handling(os_file_t file)4032 os_file_close_no_error_handling(
4033 os_file_t file)
4034 {
4035 return(close(file) != -1);
4036 }
4037 #endif /* UNIV_HOTBACKUP */
4038
4039 /** This function can be called if one wants to post a batch of reads and
4040 prefers an i/o-handler thread to handle them all at once later. You must
4041 call os_aio_simulated_wake_handler_threads later to ensure the threads
4042 are not left sleeping! */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	/* Nothing to do on platforms other than Windows. */
	return;
}
4048
4049 #else /* !_WIN32 */
4050
4051 #include <WinIoCtl.h>
4052
4053 /** Do the read/write
4054 @param[in] request The IO context and type
4055 @return the number of bytes read/written or negative value on error */
4056 ssize_t
execute(const IORequest & request)4057 SyncFileIO::execute(const IORequest& request)
4058 {
4059 OVERLAPPED seek;
4060
4061 memset(&seek, 0x0, sizeof(seek));
4062
4063 seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
4064 seek.OffsetHigh = (DWORD) (m_offset >> 32);
4065
4066 BOOL ret;
4067 DWORD n_bytes;
4068
4069 if (request.is_read()) {
4070 ret = ReadFile(m_fh, m_buf,
4071 static_cast<DWORD>(m_n), &n_bytes, &seek);
4072
4073 } else {
4074 ut_ad(request.is_write());
4075 ret = WriteFile(m_fh, m_buf,
4076 static_cast<DWORD>(m_n), &n_bytes, &seek);
4077 }
4078
4079 return(ret ? static_cast<ssize_t>(n_bytes) : -1);
4080 }
4081
4082 /** Do the read/write
4083 @param[in,out] slot The IO slot, it has the IO context
4084 @return the number of bytes read/written or negative value on error */
4085 ssize_t
execute(Slot * slot)4086 SyncFileIO::execute(Slot* slot)
4087 {
4088 BOOL ret;
4089
4090 if (slot->type.is_read()) {
4091 ret = ReadFile(
4092 slot->file.m_file, slot->ptr, slot->len,
4093 &slot->n_bytes, &slot->control);
4094 } else {
4095 ut_ad(slot->type.is_write());
4096 ret = WriteFile(
4097 slot->file.m_file, slot->ptr, slot->len,
4098 &slot->n_bytes, &slot->control);
4099 }
4100
4101 return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
4102 }
4103
4104 /** Check if the file system supports sparse files.
@param[in]	filename	File name
4106 @return true if the file system supports sparse files */
4107 static
4108 bool
os_is_sparse_file_supported_win32(const char * filename)4109 os_is_sparse_file_supported_win32(const char* filename)
4110 {
4111 char volname[MAX_PATH];
4112 BOOL result = GetVolumePathName(filename, volname, MAX_PATH);
4113
4114 if (!result) {
4115
4116 ib::error()
4117 << "os_is_sparse_file_supported: "
4118 << "Failed to get the volume path name for: "
4119 << filename
4120 << "- OS error number " << GetLastError();
4121
4122 return(false);
4123 }
4124
4125 DWORD flags;
4126
4127 GetVolumeInformation(
4128 volname, NULL, MAX_PATH, NULL, NULL,
4129 &flags, NULL, MAX_PATH);
4130
4131 return(flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
4132 }
4133
4134 /** Free storage space associated with a section of the file.
4135 @param[in] fh Open file handle
@param[in]	off		Starting offset (SEEK_SET)
@param[in]	len		Size of the hole
@return DB_SUCCESS on success, DB_IO_NO_PUNCH_HOLE on failure */
4141 static
4142 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)4143 os_file_punch_hole_win32(
4144 os_file_t fh,
4145 os_offset_t off,
4146 os_offset_t len)
4147 {
4148 FILE_ZERO_DATA_INFORMATION punch;
4149
4150 punch.FileOffset.QuadPart = off;
4151 punch.BeyondFinalZero.QuadPart = off + len;
4152
4153 /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
4154 therefore we pass a dummy parameter. */
4155 DWORD temp;
4156
4157 BOOL result = DeviceIoControl(
4158 fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
4159 NULL, 0, &temp, NULL);
4160
4161 return(!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
4162 }
4163
4164 /** Check the existence and type of the given file.
4165 @param[in] path path name of file
4166 @param[out] exists true if the file exists
4167 @param[out] type Type of the file, if it exists
4168 @return true if call succeeded */
4169 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)4170 os_file_status_win32(
4171 const char* path,
4172 bool* exists,
4173 os_file_type_t* type)
4174 {
4175 int ret;
4176 struct _stat64 statinfo;
4177
4178 ret = _stat64(path, &statinfo);
4179
4180 *exists = !ret;
4181
4182 if (!ret) {
4183 /* file exists, everything OK */
4184
4185 } else if (errno == ENOENT || errno == ENOTDIR
4186 || errno == ENAMETOOLONG) {
4187 /* file does not exist */
4188 return(true);
4189
4190 } else {
4191 /* file exists, but stat call failed */
4192 os_file_handle_error_no_exit(path, "stat", false);
4193 return(false);
4194 }
4195
4196 if (_S_IFDIR & statinfo.st_mode) {
4197 *type = OS_FILE_TYPE_DIR;
4198
4199 } else if (_S_IFREG & statinfo.st_mode) {
4200 *type = OS_FILE_TYPE_FILE;
4201
4202 } else {
4203 *type = OS_FILE_TYPE_UNKNOWN;
4204 }
4205
4206 return(true);
4207 }
4208
4209 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
4210 function!
4211 Flushes the write buffers of a given file to the disk.
4212 @param[in] file handle to a file
4213 @return true if success */
4214 bool
os_file_flush_func(os_file_t file)4215 os_file_flush_func(
4216 os_file_t file)
4217 {
4218 ++os_n_fsyncs;
4219
4220 BOOL ret = FlushFileBuffers(file);
4221
4222 if (ret) {
4223 return(true);
4224 }
4225
4226 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
4227 actually a raw device, we choose to ignore that error if we are using
4228 raw disks */
4229
4230 if (srv_start_raw_disk_in_use && GetLastError()
4231 == ERROR_INVALID_FUNCTION) {
4232 return(true);
4233 }
4234
4235 os_file_handle_error(NULL, "flush");
4236
4237 /* It is a fatal error if a file flush does not succeed, because then
4238 the database can get corrupt on disk */
4239 ut_error;
4240
4241 return(false);
4242 }
4243
4244 /** Retrieves the last error number if an error occurs in a file io function.
4245 The number should be retrieved before any other OS calls (because they may
4246 overwrite the error number). If the number is not known to this program,
4247 the OS error number + 100 is returned.
4248 @param[in] report_all_errors true if we want an error message printed
4249 of all errors
4250 @param[in] on_error_silent true then don't print any diagnostic
4251 to the log
4252 @return error number, or OS error number + 100 */
4253 static
4254 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)4255 os_file_get_last_error_low(
4256 bool report_all_errors,
4257 bool on_error_silent)
4258 {
4259 ulint err = (ulint) GetLastError();
4260
4261 if (err == ERROR_SUCCESS) {
4262 return(0);
4263 }
4264
4265 if (report_all_errors
4266 || (!on_error_silent
4267 && err != ERROR_DISK_FULL
4268 && err != ERROR_FILE_EXISTS)) {
4269
4270 ib::error()
4271 << "Operating system error number " << err
4272 << " in a file operation.";
4273
4274 if (err == ERROR_PATH_NOT_FOUND) {
4275 ib::error()
4276 << "The error means the system"
4277 " cannot find the path specified.";
4278
4279 if (srv_is_being_started) {
4280 ib::error()
4281 << "If you are installing InnoDB,"
4282 " remember that you must create"
4283 " directories yourself, InnoDB"
4284 " does not create them.";
4285 }
4286
4287 } else if (err == ERROR_ACCESS_DENIED) {
4288
4289 ib::error()
4290 << "The error means mysqld does not have"
4291 " the access rights to"
4292 " the directory. It may also be"
4293 " you have created a subdirectory"
4294 " of the same name as a data file.";
4295
4296 } else if (err == ERROR_SHARING_VIOLATION
4297 || err == ERROR_LOCK_VIOLATION) {
4298
4299 ib::error()
4300 << "The error means that another program"
4301 " is using InnoDB's files."
4302 " This might be a backup or antivirus"
4303 " software or another instance"
4304 " of MySQL."
4305 " Please close it to get rid of this error.";
4306
4307 } else if (err == ERROR_WORKING_SET_QUOTA
4308 || err == ERROR_NO_SYSTEM_RESOURCES) {
4309
4310 ib::error()
4311 << "The error means that there are no"
4312 " sufficient system resources or quota to"
4313 " complete the operation.";
4314
4315 } else if (err == ERROR_OPERATION_ABORTED) {
4316
4317 ib::error()
4318 << "The error means that the I/O"
4319 " operation has been aborted"
4320 " because of either a thread exit"
4321 " or an application request."
4322 " Retry attempt is made.";
4323 } else {
4324
4325 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4326 }
4327 }
4328
4329 if (err == ERROR_FILE_NOT_FOUND) {
4330 return(OS_FILE_NOT_FOUND);
4331 } else if (err == ERROR_DISK_FULL) {
4332 return(OS_FILE_DISK_FULL);
4333 } else if (err == ERROR_FILE_EXISTS) {
4334 return(OS_FILE_ALREADY_EXISTS);
4335 } else if (err == ERROR_SHARING_VIOLATION
4336 || err == ERROR_LOCK_VIOLATION) {
4337 return(OS_FILE_SHARING_VIOLATION);
4338 } else if (err == ERROR_WORKING_SET_QUOTA
4339 || err == ERROR_NO_SYSTEM_RESOURCES) {
4340 return(OS_FILE_INSUFFICIENT_RESOURCE);
4341 } else if (err == ERROR_OPERATION_ABORTED) {
4342 return(OS_FILE_OPERATION_ABORTED);
4343 } else if (err == ERROR_ACCESS_DENIED) {
4344 return(OS_FILE_ACCESS_VIOLATION);
4345 }
4346
4347 return(OS_FILE_ERROR_MAX + err);
4348 }
4349
4350 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
4351 this function!
4352 A simple function to open or create a file.
4353 @param[in] name name of the file or path as a null-terminated
4354 string
4355 @param[in] create_mode create mode
4356 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
4357 @param[in] read_only if true read only mode checks are enforced
4358 @param[out] success true if succeed, false if error
4359 @return handle to the file, not defined if error, error number
4360 can be retrieved with os_file_get_last_error */
4361 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4362 os_file_create_simple_func(
4363 const char* name,
4364 ulint create_mode,
4365 ulint access_type,
4366 bool read_only,
4367 bool* success)
4368 {
4369 pfs_os_file_t file;
4370
4371 *success = false;
4372
4373 DWORD access;
4374 DWORD create_flag;
4375 DWORD attributes = 0;
4376
4377 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4378 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4379
4380 if (create_mode == OS_FILE_OPEN) {
4381
4382 create_flag = OPEN_EXISTING;
4383
4384 } else if (read_only) {
4385
4386 create_flag = OPEN_EXISTING;
4387
4388 } else if (create_mode == OS_FILE_CREATE) {
4389
4390 create_flag = CREATE_NEW;
4391
4392 } else if (create_mode == OS_FILE_CREATE_PATH) {
4393
4394 /* Create subdirs along the path if needed. */
4395 *success = os_file_create_subdirs_if_needed(name);
4396
4397 if (!*success) {
4398
4399 ib::error()
4400 << "Unable to create subdirectories '"
4401 << name << "'";
4402 file.m_file = OS_FILE_CLOSED;
4403 return(file);
4404 }
4405
4406 create_flag = CREATE_NEW;
4407 create_mode = OS_FILE_CREATE;
4408
4409 } else {
4410
4411 ib::error()
4412 << "Unknown file create mode ("
4413 << create_mode << ") for file '"
4414 << name << "'";
4415
4416 file.m_file = OS_FILE_CLOSED;
4417 return(file);
4418 }
4419
4420 if (access_type == OS_FILE_READ_ONLY) {
4421
4422 access = GENERIC_READ;
4423
4424 } else if (read_only) {
4425
4426 ib::info()
4427 << "Read only mode set. Unable to"
4428 " open file '" << name << "' in RW mode, "
4429 << "trying RO mode", name;
4430
4431 access = GENERIC_READ;
4432
4433 } else if (access_type == OS_FILE_READ_WRITE) {
4434
4435 access = GENERIC_READ | GENERIC_WRITE;
4436
4437 } else {
4438
4439 ib::error()
4440 << "Unknown file access type (" << access_type << ") "
4441 "for file '" << name << "'";
4442
4443 file.m_file = OS_FILE_CLOSED;
4444 return(file);
4445 }
4446
4447 bool retry;
4448
4449 do {
4450 /* Use default security attributes and no template file. */
4451
4452 file.m_file = CreateFile(
4453 (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
4454 create_flag, attributes, NULL);
4455
4456 if (file.m_file == INVALID_HANDLE_VALUE) {
4457
4458 *success = false;
4459
4460 retry = os_file_handle_error(
4461 name, create_mode == OS_FILE_OPEN ?
4462 "open" : "create");
4463
4464 } else {
4465
4466 retry = false;
4467
4468 *success = true;
4469
4470 DWORD temp;
4471
4472 /* This is a best effort use case, if it fails then
4473 we will find out when we try and punch the hole. */
4474
4475 DeviceIoControl(
4476 file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
4477 &temp, NULL);
4478 }
4479
4480 } while (retry);
4481
4482 return(file);
4483 }
4484
4485 /** This function attempts to create a directory named pathname. The new
4486 directory gets default permissions. On Unix the permissions are
4487 (0770 & ~umask). If the directory exists already, nothing is done and
4488 the call succeeds, unless the fail_if_exists arguments is true.
4489 If another error occurs, such as a permission error, this does not crash,
4490 but reports the error and returns false.
4491 @param[in] pathname directory name as null-terminated string
4492 @param[in] fail_if_exists if true, pre-existing directory is treated
4493 as an error.
4494 @return true if call succeeds, false on error */
4495 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)4496 os_file_create_directory(
4497 const char* pathname,
4498 bool fail_if_exists)
4499 {
4500 BOOL rcode;
4501
4502 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
4503 if (!(rcode != 0
4504 || (GetLastError() == ERROR_ALREADY_EXISTS
4505 && !fail_if_exists))) {
4506
4507 os_file_handle_error_no_exit(
4508 pathname, "CreateDirectory", false);
4509
4510 return(false);
4511 }
4512
4513 return(true);
4514 }
4515
4516 /** The os_file_opendir() function opens a directory stream corresponding to the
4517 directory named by the dirname argument. The directory stream is positioned
4518 at the first entry. In both Unix and Windows we automatically skip the '.'
4519 and '..' items at the start of the directory listing.
4520 @param[in] dirname directory name; it must not contain a trailing
4521 '\' or '/'
4522 @param[in] is_fatal true if we should treat an error as a fatal
4523 error; if we try to open symlinks then we do
4524 not wish a fatal error if it happens not to
4525 be a directory
4526 @return directory stream, NULL if error */
4527 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)4528 os_file_opendir(
4529 const char* dirname,
4530 bool error_is_fatal)
4531 {
4532 os_file_dir_t dir;
4533 LPWIN32_FIND_DATA lpFindFileData;
4534 char path[OS_FILE_MAX_PATH + 3];
4535
4536 ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
4537
4538 strcpy(path, dirname);
4539 strcpy(path + strlen(path), "\\*");
4540
4541 /* Note that in Windows opening the 'directory stream' also retrieves
4542 the first entry in the directory. Since it is '.', that is no problem,
4543 as we will skip over the '.' and '..' entries anyway. */
4544
4545 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
4546 ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
4547
4548 dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
4549
4550 ut_free(lpFindFileData);
4551
4552 if (dir == INVALID_HANDLE_VALUE) {
4553
4554 if (error_is_fatal) {
4555 os_file_handle_error(dirname, "opendir");
4556 }
4557
4558 return(NULL);
4559 }
4560
4561 return(dir);
4562 }
4563
4564 /** Closes a directory stream.
4565 @param[in] dir directory stream
4566 @return 0 if success, -1 if failure */
4567 int
os_file_closedir(os_file_dir_t dir)4568 os_file_closedir(
4569 os_file_dir_t dir)
4570 {
4571 BOOL ret;
4572
4573 ret = FindClose(dir);
4574
4575 if (!ret) {
4576 os_file_handle_error_no_exit(NULL, "closedir", false);
4577
4578 return(-1);
4579 }
4580
4581 return(0);
4582 }
4583
4584 /** This function returns information of the next file in the directory. We
4585 jump over the '.' and '..' entries in the directory.
4586 @param[in] dirname directory name or path
4587 @param[in] dir directory stream
4588 @param[out] info buffer where the info is returned
4589 @return 0 if ok, -1 if error, 1 if at the end of the directory */
4590 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4591 os_file_readdir_next_file(
4592 const char* dirname,
4593 os_file_dir_t dir,
4594 os_file_stat_t* info)
4595 {
4596 BOOL ret;
4597 int status;
4598 WIN32_FIND_DATA find_data;
4599
4600 next_file:
4601
4602 ret = FindNextFile(dir, &find_data);
4603
4604 if (ret > 0) {
4605
4606 const char* name;
4607
4608 name = static_cast<const char*>(find_data.cFileName);
4609
4610 ut_a(strlen(name) < OS_FILE_MAX_PATH);
4611
4612 if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
4613
4614 goto next_file;
4615 }
4616
4617 strcpy(info->name, name);
4618
4619 info->size = find_data.nFileSizeHigh;
4620 info->size <<= 32;
4621 info->size |= find_data.nFileSizeLow;
4622
4623 if (find_data.dwFileAttributes
4624 & FILE_ATTRIBUTE_REPARSE_POINT) {
4625
4626 /* TODO: test Windows symlinks */
4627 /* TODO: MySQL has apparently its own symlink
4628 implementation in Windows, dbname.sym can
4629 redirect a database directory:
4630 REFMAN "windows-symbolic-links.html" */
4631
4632 info->type = OS_FILE_TYPE_LINK;
4633
4634 } else if (find_data.dwFileAttributes
4635 & FILE_ATTRIBUTE_DIRECTORY) {
4636
4637 info->type = OS_FILE_TYPE_DIR;
4638
4639 } else {
4640
4641 /* It is probably safest to assume that all other
4642 file types are normal. Better to check them rather
4643 than blindly skip them. */
4644
4645 info->type = OS_FILE_TYPE_FILE;
4646 }
4647
4648 status = 0;
4649
4650 } else if (GetLastError() == ERROR_NO_MORE_FILES) {
4651
4652 status = 1;
4653
4654 } else {
4655
4656 os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
4657
4658 status = -1;
4659 }
4660
4661 return(status);
4662 }
4663
4664 /** NOTE! Use the corresponding macro os_file_create(), not directly
4665 this function!
4666 Opens an existing file or creates a new.
4667 @param[in] name name of the file or path as a null-terminated
4668 string
4669 @param[in] create_mode create mode
4670 @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
4671 is desired, OS_FILE_NORMAL, if any normal file;
4672 NOTE that it also depends on type, os_aio_..
4673 and srv_.. variables whether we really use async
4674 I/O or unbuffered I/O: look in the function
4675 source code for the exact rules
4676 @param[in] type OS_DATA_FILE or OS_LOG_FILE
4677 @param[in] success true if succeeded
4678 @return handle to the file, not defined if error, error number
4679 can be retrieved with os_file_get_last_error */
4680 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)4681 os_file_create_func(
4682 const char* name,
4683 ulint create_mode,
4684 ulint purpose,
4685 ulint type,
4686 bool read_only,
4687 bool* success)
4688 {
4689 pfs_os_file_t file;
4690 bool retry;
4691 bool on_error_no_exit;
4692 bool on_error_silent;
4693
4694 *success = false;
4695
4696 DBUG_EXECUTE_IF(
4697 "ib_create_table_fail_disk_full",
4698 *success = false;
4699 SetLastError(ERROR_DISK_FULL);
4700 file.m_file = OS_FILE_CLOSED;
4701 return(file);
4702 );
4703
4704 DWORD create_flag;
4705 DWORD share_mode = FILE_SHARE_READ;
4706
4707 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
4708 ? true : false;
4709
4710 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
4711 ? true : false;
4712
4713 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
4714 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
4715
4716 if (create_mode == OS_FILE_OPEN_RAW) {
4717
4718 ut_a(!read_only);
4719
4720 create_flag = OPEN_EXISTING;
4721
4722 /* On Windows Physical devices require admin privileges and
4723 have to have the write-share mode set. See the remarks
4724 section for the CreateFile() function documentation in MSDN. */
4725
4726 share_mode |= FILE_SHARE_WRITE;
4727
4728 } else if (create_mode == OS_FILE_OPEN
4729 || create_mode == OS_FILE_OPEN_RETRY) {
4730
4731 create_flag = OPEN_EXISTING;
4732
4733 } else if (read_only) {
4734
4735 create_flag = OPEN_EXISTING;
4736
4737 } else if (create_mode == OS_FILE_CREATE) {
4738
4739 create_flag = CREATE_NEW;
4740
4741 } else if (create_mode == OS_FILE_OVERWRITE) {
4742
4743 create_flag = CREATE_ALWAYS;
4744
4745 } else {
4746 ib::error()
4747 << "Unknown file create mode (" << create_mode << ") "
4748 << " for file '" << name << "'";
4749
4750 file.m_file = OS_FILE_CLOSED;
4751 return(file);
4752 }
4753
4754 DWORD attributes = 0;
4755
4756 #ifdef UNIV_HOTBACKUP
4757 attributes |= FILE_FLAG_NO_BUFFERING;
4758 #else
4759 if (purpose == OS_FILE_AIO) {
4760
4761 #ifdef WIN_ASYNC_IO
4762 /* If specified, use asynchronous (overlapped) io and no
4763 buffering of writes in the OS */
4764
4765 if (srv_use_native_aio) {
4766 attributes |= FILE_FLAG_OVERLAPPED;
4767 }
4768 #endif /* WIN_ASYNC_IO */
4769
4770 } else if (purpose == OS_FILE_NORMAL) {
4771
4772 /* Use default setting. */
4773
4774 } else {
4775
4776 ib::error()
4777 << "Unknown purpose flag (" << purpose << ") "
4778 << "while opening file '" << name << "'";
4779
4780 file.m_file = OS_FILE_CLOSED;
4781 return(file);
4782 }
4783
4784 #ifdef UNIV_NON_BUFFERED_IO
4785 // TODO: Create a bug, this looks wrong. The flush log
4786 // parameter is dynamic.
4787 if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
4788
4789 /* Do not use unbuffered i/o for the log files because
4790 value 2 denotes that we do not flush the log at every
4791 commit, but only once per second */
4792
4793 } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
4794
4795 attributes |= FILE_FLAG_NO_BUFFERING;
4796 }
4797 #endif /* UNIV_NON_BUFFERED_IO */
4798
4799 #endif /* UNIV_HOTBACKUP */
4800 DWORD access = GENERIC_READ;
4801
4802 if (!read_only) {
4803 access |= GENERIC_WRITE;
4804 }
4805
4806 do {
4807 /* Use default security attributes and no template file. */
4808 file.m_file = CreateFile(
4809 (LPCTSTR) name, access, share_mode, NULL,
4810 create_flag, attributes, NULL);
4811
4812 if (file.m_file == INVALID_HANDLE_VALUE) {
4813 const char* operation;
4814
4815 operation = (create_mode == OS_FILE_CREATE
4816 && !read_only)
4817 ? "create" : "open";
4818
4819 *success = false;
4820
4821 if (on_error_no_exit) {
4822 retry = os_file_handle_error_no_exit(
4823 name, operation, on_error_silent);
4824 } else {
4825 retry = os_file_handle_error(name, operation);
4826 }
4827 } else {
4828
4829 retry = false;
4830
4831 *success = true;
4832
4833 DWORD temp;
4834
4835 /* This is a best effort use case, if it fails then
4836 we will find out when we try and punch the hole. */
4837 DeviceIoControl(
4838 file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
4839 &temp, NULL);
4840 }
4841
4842 } while (retry);
4843
4844 return(file);
4845 }
4846
4847 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4848 not directly this function!
4849 A simple function to open or create a file.
4850 @param[in] name name of the file or path as a null-terminated
4851 string
4852 @param[in] create_mode create mode
4853 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4854 OS_FILE_READ_ALLOW_DELETE; the last option is
4855 used by a backup program reading the file
4856 @param[out] success true if succeeded
4857 @return own: handle to the file, not defined if error, error number
4858 can be retrieved with os_file_get_last_error */
4859 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4860 os_file_create_simple_no_error_handling_func(
4861 const char* name,
4862 ulint create_mode,
4863 ulint access_type,
4864 bool read_only,
4865 bool* success)
4866 {
4867 pfs_os_file_t file;
4868
4869 *success = false;
4870
4871 DWORD access;
4872 DWORD create_flag;
4873 DWORD attributes = 0;
4874 DWORD share_mode = FILE_SHARE_READ;
4875
4876 ut_a(name);
4877
4878 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4879 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4880
4881 if (create_mode == OS_FILE_OPEN) {
4882
4883 create_flag = OPEN_EXISTING;
4884
4885 } else if (read_only) {
4886
4887 create_flag = OPEN_EXISTING;
4888
4889 } else if (create_mode == OS_FILE_CREATE) {
4890
4891 create_flag = CREATE_NEW;
4892
4893 } else {
4894
4895 ib::error()
4896 << "Unknown file create mode (" << create_mode << ") "
4897 << " for file '" << name << "'";
4898
4899 file.m_file = OS_FILE_CLOSED;
4900 return(file);
4901 }
4902
4903 if (access_type == OS_FILE_READ_ONLY) {
4904
4905 access = GENERIC_READ;
4906
4907 } else if (read_only) {
4908
4909 access = GENERIC_READ;
4910
4911 } else if (access_type == OS_FILE_READ_WRITE) {
4912
4913 access = GENERIC_READ | GENERIC_WRITE;
4914
4915 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4916
4917 ut_a(!read_only);
4918
4919 access = GENERIC_READ;
4920
4921 /*!< A backup program has to give mysqld the maximum
4922 freedom to do what it likes with the file */
4923
4924 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4925 } else {
4926
4927 ib::error()
4928 << "Unknown file access type (" << access_type << ") "
4929 << "for file '" << name << "'";
4930
4931 file.m_file = OS_FILE_CLOSED;
4932 return(file);
4933 }
4934
4935 file.m_file = CreateFile((LPCTSTR) name,
4936 access,
4937 share_mode,
4938 NULL, // Security attributes
4939 create_flag,
4940 attributes,
4941 NULL); // No template file
4942
4943 *success = (file.m_file != INVALID_HANDLE_VALUE);
4944
4945 return(file);
4946 }
4947
4948 /** Deletes a file if it exists. The file has to be closed before calling this.
4949 @param[in] name file path as a null-terminated string
4950 @param[out] exist indicate if file pre-exist
4951 @return true if success */
4952 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4953 os_file_delete_if_exists_func(
4954 const char* name,
4955 bool* exist)
4956 {
4957 ulint count = 0;
4958
4959 if (exist != NULL) {
4960 *exist = true;
4961 }
4962
4963 for (;;) {
4964 /* In Windows, deleting an .ibd file may fail if ibbackup
4965 is copying it */
4966
4967 bool ret = DeleteFile((LPCTSTR) name);
4968
4969 if (ret) {
4970 return(true);
4971 }
4972
4973 DWORD lasterr = GetLastError();
4974
4975 if (lasterr == ERROR_FILE_NOT_FOUND
4976 || lasterr == ERROR_PATH_NOT_FOUND) {
4977
4978 /* the file does not exist, this not an error */
4979 if (exist != NULL) {
4980 *exist = false;
4981 }
4982
4983 return(true);
4984 }
4985
4986 ++count;
4987
4988 if (count > 100 && 0 == (count % 10)) {
4989
4990 /* Print error information */
4991 os_file_get_last_error(true);
4992
4993 ib::warn() << "Delete of file '" << name << "' failed.";
4994 }
4995
4996 /* Sleep for a second */
4997 os_thread_sleep(1000000);
4998
4999 if (count > 2000) {
5000
5001 return(false);
5002 }
5003 }
5004 }
5005
5006 /** Deletes a file. The file has to be closed before calling this.
5007 @param[in] name File path as NUL terminated string
5008 @return true if success */
5009 bool
os_file_delete_func(const char * name)5010 os_file_delete_func(
5011 const char* name)
5012 {
5013 ulint count = 0;
5014
5015 for (;;) {
5016 /* In Windows, deleting an .ibd file may fail if ibbackup
5017 is copying it */
5018
5019 BOOL ret = DeleteFile((LPCTSTR) name);
5020
5021 if (ret) {
5022 return(true);
5023 }
5024
5025 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
5026 /* If the file does not exist, we classify this as
5027 a 'mild' error and return */
5028
5029 return(false);
5030 }
5031
5032 ++count;
5033
5034 if (count > 100 && 0 == (count % 10)) {
5035
5036 /* print error information */
5037 os_file_get_last_error(true);
5038
5039 ib::warn()
5040 << "Cannot delete file '" << name << "'. Are "
5041 << "you running ibbackup to back up the file?";
5042 }
5043
5044 /* sleep for a second */
5045 os_thread_sleep(1000000);
5046
5047 if (count > 2000) {
5048
5049 return(false);
5050 }
5051 }
5052
5053 ut_error;
5054 return(false);
5055 }
5056
5057 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
5058 function!
5059 Renames a file (can also move it to another directory). It is safest that the
5060 file is closed before calling this function.
5061 @param[in] oldpath old file path as a null-terminated string
5062 @param[in] newpath new file path
5063 @return true if success */
5064 bool
os_file_rename_func(const char * oldpath,const char * newpath)5065 os_file_rename_func(
5066 const char* oldpath,
5067 const char* newpath)
5068 {
5069 #ifdef UNIV_DEBUG
5070 os_file_type_t type;
5071 bool exists;
5072
5073 /* New path must not exist. */
5074 ut_ad(os_file_status(newpath, &exists, &type));
5075 ut_ad(!exists);
5076
5077 /* Old path must exist. */
5078 ut_ad(os_file_status(oldpath, &exists, &type));
5079 ut_ad(exists);
5080 #endif /* UNIV_DEBUG */
5081
5082 if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
5083 return(true);
5084 }
5085
5086 os_file_handle_error_no_exit(oldpath, "rename", false);
5087
5088 return(false);
5089 }
5090
5091 /** NOTE! Use the corresponding macro os_file_close(), not directly
5092 this function!
5093 Closes a file handle. In case of error, error number can be retrieved with
5094 os_file_get_last_error.
5095 @param[in,own] file Handle to a file
5096 @return true if success */
5097 bool
os_file_close_func(os_file_t file)5098 os_file_close_func(
5099 os_file_t file)
5100 {
5101 ut_a(file > 0);
5102
5103 if (CloseHandle(file)) {
5104 return(true);
5105 }
5106
5107 os_file_handle_error(NULL, "close");
5108
5109 return(false);
5110 }
5111
5112 /** Gets a file size.
5113 @param[in] file Handle to a file
5114 @return file size, or (os_offset_t) -1 on failure */
5115 os_offset_t
os_file_get_size(pfs_os_file_t file)5116 os_file_get_size(
5117 pfs_os_file_t file)
5118 {
5119 DWORD high;
5120 DWORD low;
5121
5122 low = GetFileSize(file.m_file, &high);
5123
5124 if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
5125 return((os_offset_t) -1);
5126 }
5127
5128 return(os_offset_t(low | (os_offset_t(high) << 32)));
5129 }
5130
5131 /** Gets a file size.
5132 @param[in] filename Full path to the filename to check
5133 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
5134 errno */
5135 os_file_size_t
os_file_get_size(const char * filename)5136 os_file_get_size(
5137 const char* filename)
5138 {
5139 struct __stat64 s;
5140 os_file_size_t file_size;
5141
5142 int ret = _stat64(filename, &s);
5143
5144 if (ret == 0) {
5145
5146 file_size.m_total_size = s.st_size;
5147
5148 DWORD low_size;
5149 DWORD high_size;
5150
5151 low_size = GetCompressedFileSize(filename, &high_size);
5152
5153 if (low_size != INVALID_FILE_SIZE) {
5154
5155 file_size.m_alloc_size = high_size;
5156 file_size.m_alloc_size <<= 32;
5157 file_size.m_alloc_size |= low_size;
5158
5159 } else {
5160 ib::error()
5161 << "GetCompressedFileSize("
5162 << filename << ", ..) failed.";
5163
5164 file_size.m_alloc_size = (os_offset_t) -1;
5165 }
5166 } else {
5167 file_size.m_total_size = ~0;
5168 file_size.m_alloc_size = (os_offset_t) ret;
5169 }
5170
5171 return(file_size);
5172 }
5173
5174 /** This function returns information about the specified file
5175 @param[in] path pathname of the file
5176 @param[out] stat_info information of a file in a directory
5177 @param[in,out] statinfo information of a file in a directory
5178 @param[in] check_rw_perm for testing whether the file can be opened
5179 in RW mode
5180 @param[in] read_only true if the file is opened in read-only mode
5181 @return DB_SUCCESS if all OK */
5182 static
5183 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)5184 os_file_get_status_win32(
5185 const char* path,
5186 os_file_stat_t* stat_info,
5187 struct _stat64* statinfo,
5188 bool check_rw_perm,
5189 bool read_only)
5190 {
5191 int ret = _stat64(path, statinfo);
5192
5193 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
5194 /* file does not exist */
5195
5196 return(DB_NOT_FOUND);
5197
5198 } else if (ret) {
5199 /* file exists, but stat call failed */
5200
5201 os_file_handle_error_no_exit(path, "stat", false);
5202
5203 return(DB_FAIL);
5204
5205 } else if (_S_IFDIR & statinfo->st_mode) {
5206
5207 stat_info->type = OS_FILE_TYPE_DIR;
5208
5209 } else if (_S_IFREG & statinfo->st_mode) {
5210
5211 DWORD access = GENERIC_READ;
5212
5213 if (!read_only) {
5214 access |= GENERIC_WRITE;
5215 }
5216
5217 stat_info->type = OS_FILE_TYPE_FILE;
5218
5219 /* Check if we can open it in read-only mode. */
5220
5221 if (check_rw_perm) {
5222 HANDLE fh;
5223
5224 fh = CreateFile(
5225 (LPCTSTR) path, // File to open
5226 access,
5227 0, // No sharing
5228 NULL, // Default security
5229 OPEN_EXISTING, // Existing file only
5230 FILE_ATTRIBUTE_NORMAL, // Normal file
5231 NULL); // No attr. template
5232
5233 if (fh == INVALID_HANDLE_VALUE) {
5234 stat_info->rw_perm = false;
5235 } else {
5236 stat_info->rw_perm = true;
5237 CloseHandle(fh);
5238 }
5239 }
5240
5241 char volname[MAX_PATH];
5242 BOOL result = GetVolumePathName(path, volname, MAX_PATH);
5243
5244 if (!result) {
5245
5246 ib::error()
5247 << "os_file_get_status_win32: "
5248 << "Failed to get the volume path name for: "
5249 << path
5250 << "- OS error number " << GetLastError();
5251
5252 return(DB_FAIL);
5253 }
5254
5255 DWORD sectorsPerCluster;
5256 DWORD bytesPerSector;
5257 DWORD numberOfFreeClusters;
5258 DWORD totalNumberOfClusters;
5259
5260 result = GetDiskFreeSpace(
5261 (LPCSTR) volname,
5262 §orsPerCluster,
5263 &bytesPerSector,
5264 &numberOfFreeClusters,
5265 &totalNumberOfClusters);
5266
5267 if (!result) {
5268
5269 ib::error()
5270 << "GetDiskFreeSpace(" << volname << ",...) "
5271 << "failed "
5272 << "- OS error number " << GetLastError();
5273
5274 return(DB_FAIL);
5275 }
5276
5277 stat_info->block_size = bytesPerSector * sectorsPerCluster;
5278
5279 /* On Windows the block size is not used as the allocation
5280 unit for sparse files. The underlying infra-structure for
5281 sparse files is based on NTFS compression. The punch hole
5282 is done on a "compression unit". This compression unit
5283 is based on the cluster size. You cannot punch a hole if
5284 the cluster size >= 8K. For smaller sizes the table is
5285 as follows:
5286
5287 Cluster Size Compression Unit
5288 512 Bytes 8 KB
5289 1 KB 16 KB
5290 2 KB 32 KB
5291 4 KB 64 KB
5292
5293 Default NTFS cluster size is 4K, compression unit size of 64K.
5294 Therefore unless the user has created the file system with
5295 a smaller cluster size and used larger page sizes there is
5296 little benefit from compression out of the box. */
5297
5298 stat_info->block_size = (stat_info->block_size <= 4096)
5299 ? stat_info->block_size * 16 : ULINT_UNDEFINED;
5300 } else {
5301 stat_info->type = OS_FILE_TYPE_UNKNOWN;
5302 }
5303
5304 return(DB_SUCCESS);
5305 }
5306
5307 /** Truncates a file to a specified size in bytes.
5308 Do nothing if the size to preserve is greater or equal to the current
5309 size of the file.
5310 @param[in] pathname file path
5311 @param[in] file file to be truncated
5312 @param[in] size size to preserve in bytes
5313 @return true if success */
5314 static
5315 bool
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)5316 os_file_truncate_win32(
5317 const char* pathname,
5318 pfs_os_file_t file,
5319 os_offset_t size)
5320 {
5321 LARGE_INTEGER length;
5322
5323 length.QuadPart = size;
5324 BOOL success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
5325 if (!success) {
5326 os_file_handle_error_no_exit(
5327 pathname, "SetFilePointerEx", false);
5328 } else {
5329 success = SetEndOfFile(file.m_file);
5330 if (!success) {
5331 os_file_handle_error_no_exit(
5332 pathname, "SetEndOfFile", false);
5333 }
5334 }
5335 return(success);
5336 }
5337
5338 /** Truncates a file at its current position.
5339 @param[in] file Handle to be truncated
5340 @return true if success */
5341 bool
os_file_set_eof(FILE * file)5342 os_file_set_eof(
5343 FILE* file)
5344 {
5345 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
5346
5347 return(SetEndOfFile(h));
5348 }
5349
5350 #ifdef UNIV_HOTBACKUP
5351 /** Closes a file handle.
5352 @param[in] file Handle to close
5353 @return true if success */
5354 bool
os_file_close_no_error_handling(os_file_t file)5355 os_file_close_no_error_handling(
5356 os_file_t file)
5357 {
5358 return(CloseHandle(file) ? true : false);
5359 }
5360 #endif /* UNIV_HOTBACKUP */
5361
5362 /** This function can be called if one wants to post a batch of reads and
5363 prefers an i/o-handler thread to handle them all at once later. You must
5364 call os_aio_simulated_wake_handler_threads later to ensure the threads
5365 are not left sleeping! */
5366 void
os_aio_simulated_put_read_threads_to_sleep()5367 os_aio_simulated_put_read_threads_to_sleep()
5368 {
5369 AIO::simulated_put_read_threads_to_sleep();
5370 }
5371
5372 /** This function can be called if one wants to post a batch of reads and
5373 prefers an i/o-handler thread to handle them all at once later. You must
5374 call os_aio_simulated_wake_handler_threads later to ensure the threads
5375 are not left sleeping! */
5376 void
simulated_put_read_threads_to_sleep()5377 AIO::simulated_put_read_threads_to_sleep()
5378 {
5379 /* The idea of putting background IO threads to sleep is only for
5380 Windows when using simulated AIO. Windows XP seems to schedule
5381 background threads too eagerly to allow for coalescing during
5382 readahead requests. */
5383
5384 if (srv_use_native_aio) {
5385 /* We do not use simulated AIO: do nothing */
5386
5387 return;
5388 }
5389
5390 os_aio_recommend_sleep_for_read_threads = true;
5391
5392 for (ulint i = 0; i < os_aio_n_segments; i++) {
5393 AIO* array;
5394
5395 get_array_and_local_segment(&array, i);
5396
5397 if (array == s_reads) {
5398
5399 os_event_reset(os_aio_segment_wait_events[i]);
5400 }
5401 }
5402 }
5403
5404 #endif /* !_WIN32*/
5405
5406 /** Does a syncronous read or write depending upon the type specified
5407 In case of partial reads/writes the function tries
5408 NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
5409 @param[in] type, IO flags
5410 @param[in] file handle to an open file
5411 @param[out] buf buffer where to read
5412 @param[in] offset file offset from the start where to read
5413 @param[in] n number of bytes to read, starting from offset
5414 @param[out] err DB_SUCCESS or error code
5415 @return number of bytes read/written, -1 if error */
5416 static MY_ATTRIBUTE((warn_unused_result))
5417 ssize_t
os_file_io(const IORequest & in_type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5418 os_file_io(
5419 const IORequest&in_type,
5420 os_file_t file,
5421 void* buf,
5422 ulint n,
5423 os_offset_t offset,
5424 dberr_t* err)
5425 {
5426 Block* block;
5427 ulint original_n = n;
5428 IORequest type = in_type;
5429 ssize_t bytes_returned = 0;
5430
5431 if (type.is_compressed()) {
5432
5433 /* We don't compress the first page of any file. */
5434 ut_ad(offset > 0);
5435
5436 block = os_file_compress_page(type, buf, &n);
5437 } else {
5438 block = NULL;
5439 }
5440
5441 /* We do encryption after compression, since if we do encryption
5442 before compression, the encrypted data will cause compression fail
5443 or low compression rate. */
5444 if (type.is_encrypted() && type.is_write()) {
5445 /* We don't encrypt the first page of any file. */
5446 Block* compressed_block = block;
5447 ut_ad(offset > 0);
5448
5449 block = os_file_encrypt_page(type, buf, &n);
5450
5451 if (compressed_block != NULL) {
5452 os_free_block(compressed_block);
5453 }
5454 }
5455
5456 SyncFileIO sync_file_io(file, buf, n, offset);
5457
5458 for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
5459
5460 ssize_t n_bytes = sync_file_io.execute(type);
5461
5462 /* Check for a hard error. Not much we can do now. */
5463 if (n_bytes < 0) {
5464
5465 break;
5466
5467 } else if ((ulint) n_bytes + bytes_returned == n) {
5468
5469 bytes_returned += n_bytes;
5470
5471 if (offset > 0
5472 && (type.is_compressed() || type.is_read())) {
5473
5474 *err = os_file_io_complete(
5475 type, file,
5476 reinterpret_cast<byte*>(buf),
5477 NULL, original_n, offset, n);
5478 } else {
5479
5480 *err = DB_SUCCESS;
5481 }
5482
5483 if (block != NULL) {
5484 os_free_block(block);
5485 }
5486
5487 return(original_n);
5488 }
5489
5490 /* Handle partial read/write. */
5491
5492 ut_ad((ulint) n_bytes + bytes_returned < n);
5493
5494 bytes_returned += (ulint) n_bytes;
5495
5496 if (!type.is_partial_io_warning_disabled()) {
5497
5498 const char* op = type.is_read()
5499 ? "read" : "written";
5500
5501 ib::warn()
5502 << n
5503 << " bytes should have been " << op << ". Only "
5504 << bytes_returned
5505 << " bytes " << op << ". Retrying"
5506 << " for the remaining bytes.";
5507 }
5508
5509 /* Advance the offset and buffer by n_bytes */
5510 sync_file_io.advance(n_bytes);
5511 }
5512
5513 if (block != NULL) {
5514 os_free_block(block);
5515 }
5516
5517 *err = DB_IO_ERROR;
5518
5519 if (!type.is_partial_io_warning_disabled()) {
5520 ib::warn()
5521 << "Retry attempts for "
5522 << (type.is_read() ? "reading" : "writing")
5523 << " partial data failed.";
5524 }
5525
5526 return(bytes_returned);
5527 }
5528
5529 /** Does a synchronous write operation in Posix.
5530 @param[in] type IO context
5531 @param[in] file handle to an open file
5532 @param[out] buf buffer from which to write
5533 @param[in] n number of bytes to read, starting from offset
5534 @param[in] offset file offset from the start where to read
5535 @param[out] err DB_SUCCESS or error code
5536 @return number of bytes written, -1 if error */
5537 static MY_ATTRIBUTE((warn_unused_result))
5538 ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5539 os_file_pwrite(
5540 IORequest& type,
5541 os_file_t file,
5542 const byte* buf,
5543 ulint n,
5544 os_offset_t offset,
5545 dberr_t* err)
5546 {
5547 ut_ad(type.validate());
5548
5549 ++os_n_file_writes;
5550
5551 (void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
5552 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
5553
5554 ssize_t n_bytes = os_file_io(type, file, (void*) buf, n, offset, err);
5555
5556 (void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
5557 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
5558
5559 return(n_bytes);
5560 }
5561
5562 /** Requests a synchronous write operation.
5563 @param[in] type IO flags
5564 @param[in] file handle to an open file
5565 @param[out] buf buffer from which to write
5566 @param[in] offset file offset from the start where to read
5567 @param[in] n number of bytes to read, starting from offset
5568 @return DB_SUCCESS if request was successful, false if fail */
5569 static MY_ATTRIBUTE((warn_unused_result))
5570 dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)5571 os_file_write_page(
5572 IORequest& type,
5573 const char* name,
5574 os_file_t file,
5575 const byte* buf,
5576 os_offset_t offset,
5577 ulint n)
5578 {
5579 dberr_t err;
5580 ut_ad(type.validate());
5581 ut_ad(n > 0);
5582
5583 ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
5584
5585 if ((ulint) n_bytes != n && !os_has_said_disk_full) {
5586
5587 ib::error()
5588 << "Write to file " << name << "failed at offset "
5589 << offset << ", " << n
5590 << " bytes should have been written,"
5591 " only " << n_bytes << " were written."
5592 " Operating system error number " << errno << "."
5593 " Check that your OS and file system"
5594 " support files of this size."
5595 " Check also that the disk is not full"
5596 " or a disk quota exceeded.";
5597
5598 if (strerror(errno) != NULL) {
5599
5600 ib::error()
5601 << "Error number " << errno
5602 << " means '" << strerror(errno) << "'";
5603 }
5604
5605 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
5606
5607 os_has_said_disk_full = true;
5608 }
5609
5610 return(err);
5611 }
5612
5613 /** Does a synchronous read operation in Posix.
5614 @param[in] type IO flags
5615 @param[in] file handle to an open file
5616 @param[out] buf buffer where to read
5617 @param[in] offset file offset from the start where to read
5618 @param[in] n number of bytes to read, starting from offset
5619 @param[out] err DB_SUCCESS or error code
5620 @return number of bytes read, -1 if error */
5621 static MY_ATTRIBUTE((warn_unused_result))
5622 ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5623 os_file_pread(
5624 IORequest& type,
5625 os_file_t file,
5626 void* buf,
5627 ulint n,
5628 os_offset_t offset,
5629 dberr_t* err)
5630 {
5631 ++os_n_file_reads;
5632
5633 (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
5634 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
5635
5636 ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
5637
5638 (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
5639 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
5640
5641 return(n_bytes);
5642 }
5643
5644 /** Requests a synchronous positioned read operation.
5645 @return DB_SUCCESS if request was successful, false if fail
5646 @param[in] type IO flags
5647 @param[in] file handle to an open file
5648 @param[out] buf buffer where to read
5649 @param[in] offset file offset from the start where to read
5650 @param[in] n number of bytes to read, starting from offset
5651 @param[out] o number of bytes actually read
5652 @param[in] exit_on_err if true then exit on error
5653 @return DB_SUCCESS or error code */
5654 static MY_ATTRIBUTE((warn_unused_result))
5655 dberr_t
os_file_read_page(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err)5656 os_file_read_page(
5657 IORequest& type,
5658 os_file_t file,
5659 void* buf,
5660 os_offset_t offset,
5661 ulint n,
5662 ulint* o,
5663 bool exit_on_err)
5664 {
5665 dberr_t err;
5666
5667 os_bytes_read_since_printout += n;
5668
5669 ut_ad(type.validate());
5670 ut_ad(n > 0);
5671
5672 for (;;) {
5673 ssize_t n_bytes;
5674
5675 n_bytes = os_file_pread(type, file, buf, n, offset, &err);
5676
5677 if (o != NULL) {
5678 *o = n_bytes;
5679 }
5680
5681 if (err != DB_SUCCESS && !exit_on_err) {
5682
5683 return(err);
5684
5685 } else if ((ulint) n_bytes == n) {
5686
5687 /** The read will succeed but decompress can fail
5688 for various reasons. */
5689
5690 if (type.is_compression_enabled()
5691 && !Compression::is_compressed_page(
5692 static_cast<byte*>(buf))) {
5693
5694 return(DB_SUCCESS);
5695
5696 } else {
5697 return(err);
5698 }
5699 }
5700
5701 ib::error() << "Tried to read " << n
5702 << " bytes at offset " << offset
5703 << ", but was only able to read " << n_bytes;
5704
5705 if (exit_on_err) {
5706
5707 if (!os_file_handle_error(NULL, "read")) {
5708 /* Hard error */
5709 break;
5710 }
5711
5712 } else if (!os_file_handle_error_no_exit(NULL, "read", false)) {
5713
5714 /* Hard error */
5715 break;
5716 }
5717
5718 if (n_bytes > 0 && (ulint) n_bytes < n) {
5719 n -= (ulint) n_bytes;
5720 offset += (ulint) n_bytes;
5721 buf = reinterpret_cast<uchar*>(buf) + (ulint) n_bytes;
5722 }
5723 }
5724
5725 ib::fatal()
5726 << "Cannot read from file. OS error number "
5727 << errno << ".";
5728
5729 return(err);
5730 }
5731
5732 /** Retrieves the last error number if an error occurs in a file io function.
5733 The number should be retrieved before any other OS calls (because they may
5734 overwrite the error number). If the number is not known to this program,
5735 the OS error number + 100 is returned.
5736 @param[in] report_all_errors true if we want an error printed
5737 for all errors
5738 @return error number, or OS error number + 100 */
5739 ulint
os_file_get_last_error(bool report_all_errors)5740 os_file_get_last_error(
5741 bool report_all_errors)
5742 {
5743 return(os_file_get_last_error_low(report_all_errors, false));
5744 }
5745
5746 /** Does error handling when a file operation fails.
5747 Conditionally exits (calling srv_fatal_error()) based on should_exit value
5748 and the error type, if should_exit is true then on_error_silent is ignored.
5749 @param[in] name name of a file or NULL
5750 @param[in] operation operation
5751 @param[in] should_exit call srv_fatal_error() on an unknown error,
5752 if this parameter is true
5753 @param[in] on_error_silent if true then don't print any message to the log
5754 iff it is an unknown non-fatal error
5755 @return true if we should retry the operation */
5756 static MY_ATTRIBUTE((warn_unused_result))
5757 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)5758 os_file_handle_error_cond_exit(
5759 const char* name,
5760 const char* operation,
5761 bool should_exit,
5762 bool on_error_silent)
5763 {
5764 ulint err;
5765
5766 err = os_file_get_last_error_low(false, on_error_silent);
5767
5768 switch (err) {
5769 case OS_FILE_DISK_FULL:
5770 /* We only print a warning about disk full once */
5771
5772 if (os_has_said_disk_full) {
5773
5774 return(false);
5775 }
5776
5777 /* Disk full error is reported irrespective of the
5778 on_error_silent setting. */
5779
5780 if (name) {
5781
5782 ib::error()
5783 << "Encountered a problem with file '"
5784 << name << "'";
5785 }
5786
5787 ib::error()
5788 << "Disk is full. Try to clean the disk to free space.";
5789
5790 os_has_said_disk_full = true;
5791
5792 return(false);
5793
5794 case OS_FILE_AIO_RESOURCES_RESERVED:
5795 case OS_FILE_AIO_INTERRUPTED:
5796
5797 return(true);
5798
5799 case OS_FILE_PATH_ERROR:
5800 case OS_FILE_ALREADY_EXISTS:
5801 case OS_FILE_ACCESS_VIOLATION:
5802
5803 return(false);
5804
5805 case OS_FILE_SHARING_VIOLATION:
5806
5807 os_thread_sleep(10000000); /* 10 sec */
5808 return(true);
5809
5810 case OS_FILE_OPERATION_ABORTED:
5811 case OS_FILE_INSUFFICIENT_RESOURCE:
5812
5813 os_thread_sleep(100000); /* 100 ms */
5814 return(true);
5815
5816 default:
5817
5818 /* If it is an operation that can crash on error then it
5819 is better to ignore on_error_silent and print an error message
5820 to the log. */
5821
5822 if (should_exit || !on_error_silent) {
5823 ib::error() << "File "
5824 << (name != NULL ? name : "(unknown)")
5825 << ": '" << operation << "'"
5826 " returned OS error " << err << "."
5827 << (should_exit
5828 ? " Cannot continue operation" : "");
5829 }
5830
5831 if (should_exit) {
5832 srv_fatal_error();
5833 }
5834 }
5835
5836 return(false);
5837 }
5838
5839 /** Does error handling when a file operation fails.
5840 @param[in] name name of a file or NULL
5841 @param[in] operation operation name that failed
5842 @return true if we should retry the operation */
5843 static
5844 bool
os_file_handle_error(const char * name,const char * operation)5845 os_file_handle_error(
5846 const char* name,
5847 const char* operation)
5848 {
5849 /* Exit in case of unknown error */
5850 return(os_file_handle_error_cond_exit(name, operation, true, false));
5851 }
5852
5853 /** Does error handling when a file operation fails.
5854 @param[in] name name of a file or NULL
5855 @param[in] operation operation name that failed
5856 @param[in] on_error_silent if true then don't print any message to the log.
5857 @return true if we should retry the operation */
5858 static
5859 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)5860 os_file_handle_error_no_exit(
5861 const char* name,
5862 const char* operation,
5863 bool on_error_silent)
5864 {
5865 /* Don't exit in case of unknown error */
5866 return(os_file_handle_error_cond_exit(
5867 name, operation, false, on_error_silent));
5868 }
5869
5870 /** Tries to disable OS caching on an opened file descriptor.
5871 @param[in] fd file descriptor to alter
5872 @param[in] file_name file name, used in the diagnostic message
5873 @param[in] name "open" or "create"; used in the diagnostic
5874 message */
5875 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5876 os_file_set_nocache(
5877 int fd MY_ATTRIBUTE((unused)),
5878 const char* file_name MY_ATTRIBUTE((unused)),
5879 const char* operation_name MY_ATTRIBUTE((unused)))
5880 {
5881 /* some versions of Solaris may not have DIRECTIO_ON */
5882 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5883 if (directio(fd, DIRECTIO_ON) == -1) {
5884 int errno_save = errno;
5885
5886 ib::error()
5887 << "Failed to set DIRECTIO_ON on file "
5888 << file_name << ": " << operation_name
5889 << strerror(errno_save) << ","
5890 " continuing anyway.";
5891 }
5892 #elif defined(O_DIRECT)
5893 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5894 int errno_save = errno;
5895 static bool warning_message_printed = false;
5896 if (errno_save == EINVAL) {
5897 if (!warning_message_printed) {
5898 warning_message_printed = true;
5899 # ifdef UNIV_LINUX
5900 ib::warn()
5901 << "Failed to set O_DIRECT on file"
5902 << file_name << ";" << operation_name
5903 << ": " << strerror(errno_save) << ", "
5904 << "continuing anyway. O_DIRECT is "
5905 "known to result in 'Invalid argument' "
5906 "on Linux on tmpfs, "
5907 "see MySQL Bug#26662.";
5908 # else /* UNIV_LINUX */
5909 goto short_warning;
5910 # endif /* UNIV_LINUX */
5911 }
5912 } else {
5913 # ifndef UNIV_LINUX
5914 short_warning:
5915 # endif
5916 ib::warn()
5917 << "Failed to set O_DIRECT on file "
5918 << file_name << "; " << operation_name
5919 << " : " << strerror(errno_save)
5920 << " continuing anyway.";
5921 }
5922 }
5923 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5924 }
5925
5926 /** Write the specified number of zeros to a newly created file.
5927 @param[in] name name of the file or path as a null-terminated
5928 string
5929 @param[in] file handle to a file
5930 @param[in] size file size
5931 @param[in] read_only Enable read-only checks if true
5932 @return true if success */
5933 bool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size,bool read_only)5934 os_file_set_size(
5935 const char* name,
5936 pfs_os_file_t file,
5937 os_offset_t size,
5938 bool read_only)
5939 {
5940 /* Write up to 1 megabyte at a time. */
5941 ulint buf_size = ut_min(
5942 static_cast<ulint>(64),
5943 static_cast<ulint>(size / UNIV_PAGE_SIZE));
5944
5945 buf_size *= UNIV_PAGE_SIZE;
5946
5947 /* Align the buffer for possible raw i/o */
5948 byte* buf2;
5949
5950 buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
5951
5952 byte* buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
5953
5954 /* Write buffer full of zeros */
5955 memset(buf, 0, buf_size);
5956
5957 if (size >= (os_offset_t) 100 << 20) {
5958
5959 ib::info() << "Progress in MB:";
5960 }
5961
5962 os_offset_t current_size = 0;
5963
5964 while (current_size < size) {
5965 ulint n_bytes;
5966
5967 if (size - current_size < (os_offset_t) buf_size) {
5968 n_bytes = (ulint) (size - current_size);
5969 } else {
5970 n_bytes = buf_size;
5971 }
5972
5973 dberr_t err;
5974 IORequest request(IORequest::WRITE);
5975
5976 #ifdef UNIV_HOTBACKUP
5977
5978 err = os_file_write(
5979 request, name, file, buf, current_size, n_bytes);
5980 #else
5981 /* Using OS_AIO_SYNC mode on POSIX systems will result in
5982 fall back to os_file_write/read. On Windows it will use
5983 special mechanism to wait before it returns back. */
5984
5985 err = os_aio(
5986 request,
5987 OS_AIO_SYNC, name,
5988 file, buf, current_size, n_bytes,
5989 read_only, NULL, NULL);
5990 #endif /* UNIV_HOTBACKUP */
5991
5992 if (err != DB_SUCCESS) {
5993
5994 ut_free(buf2);
5995 return(false);
5996 }
5997
5998 /* Print about progress for each 100 MB written */
5999 if ((current_size + n_bytes) / (100 << 20)
6000 != current_size / (100 << 20)) {
6001
6002 fprintf(stderr, " %lu00",
6003 (ulong) ((current_size + n_bytes)
6004 / (100 << 20)));
6005 }
6006
6007 current_size += n_bytes;
6008 }
6009
6010 if (size >= (os_offset_t) 100 << 20) {
6011
6012 fprintf(stderr, "\n");
6013 }
6014
6015 ut_free(buf2);
6016
6017 return(os_file_flush(file));
6018 }
6019
6020 /** Truncates a file to a specified size in bytes.
6021 Do nothing if the size to preserve is greater or equal to the current
6022 size of the file.
6023 @param[in] pathname file path
6024 @param[in] file file to be truncated
6025 @param[in] size size to preserve in bytes
6026 @return true if success */
6027 bool
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)6028 os_file_truncate(
6029 const char* pathname,
6030 pfs_os_file_t file,
6031 os_offset_t size)
6032 {
6033 /* Do nothing if the size preserved is larger than or equal to the
6034 current size of file */
6035 os_offset_t size_bytes = os_file_get_size(file);
6036
6037 if (size >= size_bytes) {
6038 return(true);
6039 }
6040
6041 #ifdef _WIN32
6042 return(os_file_truncate_win32(pathname, file, size));
6043 #else /* _WIN32 */
6044 return(os_file_truncate_posix(pathname, file, size));
6045 #endif /* _WIN32 */
6046 }
6047
6048 /** NOTE! Use the corresponding macro os_file_read(), not directly this
6049 function!
6050 Requests a synchronous positioned read operation.
6051 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6052 @param[in] type IO flags
6053 @param[in] file handle to an open file
6054 @param[out] buf buffer where to read
6055 @param[in] offset file offset from the start where to read
6056 @param[in] n number of bytes to read, starting from offset
6057 @return DB_SUCCESS or error code */
6058 dberr_t
os_file_read_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)6059 os_file_read_func(
6060 IORequest& type,
6061 os_file_t file,
6062 void* buf,
6063 os_offset_t offset,
6064 ulint n)
6065 {
6066 ut_ad(type.is_read());
6067
6068 return(os_file_read_page(type, file, buf, offset, n, NULL, true));
6069 }
6070
6071 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
6072 not directly this function!
6073 Requests a synchronous positioned read operation.
6074 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6075 @param[in] type IO flags
6076 @param[in] file handle to an open file
6077 @param[out] buf buffer where to read
6078 @param[in] offset file offset from the start where to read
6079 @param[in] n number of bytes to read, starting from offset
6080 @param[out] o number of bytes actually read
6081 @return DB_SUCCESS or error code */
6082 dberr_t
os_file_read_no_error_handling_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)6083 os_file_read_no_error_handling_func(
6084 IORequest& type,
6085 os_file_t file,
6086 void* buf,
6087 os_offset_t offset,
6088 ulint n,
6089 ulint* o)
6090 {
6091 ut_ad(type.is_read());
6092
6093 return(os_file_read_page(type, file, buf, offset, n, o, false));
6094 }
6095
6096 /** NOTE! Use the corresponding macro os_file_write(), not directly
6097 Requests a synchronous write operation.
6098 @param[in] type IO flags
6099 @param[in] file handle to an open file
6100 @param[out] buf buffer from which to write
6101 @param[in] offset file offset from the start where to read
6102 @param[in] n number of bytes to read, starting from offset
6103 @return DB_SUCCESS if request was successful, false if fail */
6104 dberr_t
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)6105 os_file_write_func(
6106 IORequest& type,
6107 const char* name,
6108 os_file_t file,
6109 const void* buf,
6110 os_offset_t offset,
6111 ulint n)
6112 {
6113 ut_ad(type.validate());
6114 ut_ad(type.is_write());
6115
6116 /* We never compress the first page.
6117 Note: This assumes we always do block IO. */
6118 if (offset == 0) {
6119 type.clear_compressed();
6120 }
6121
6122 const byte* ptr = reinterpret_cast<const byte*>(buf);
6123
6124 return(os_file_write_page(type, name, file, ptr, offset, n));
6125 }
6126
6127 /** Check the existence and type of the given file.
6128 @param[in] path path name of file
6129 @param[out] exists true if the file exists
6130 @param[out] type Type of the file, if it exists
6131 @return true if call succeeded */
6132 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)6133 os_file_status(
6134 const char* path,
6135 bool* exists,
6136 os_file_type_t* type)
6137 {
6138 #ifdef _WIN32
6139 return(os_file_status_win32(path, exists, type));
6140 #else
6141 return(os_file_status_posix(path, exists, type));
6142 #endif /* _WIN32 */
6143 }
6144
6145 /** Free storage space associated with a section of the file.
6146 @param[in] fh Open file handle
6147 @param[in] off Starting offset (SEEK_SET)
6148 @param[in] len Size of the hole
6149 @return DB_SUCCESS or error code */
6150 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)6151 os_file_punch_hole(
6152 os_file_t fh,
6153 os_offset_t off,
6154 os_offset_t len)
6155 {
6156 /* In this debugging mode, we act as if punch hole is supported,
6157 and then skip any calls to actually punch a hole here.
6158 In this way, Transparent Page Compression is still being tested. */
6159 DBUG_EXECUTE_IF("ignore_punch_hole",
6160 return(DB_SUCCESS);
6161 );
6162
6163 #ifdef _WIN32
6164 return(os_file_punch_hole_win32(fh, off, len));
6165 #else
6166 return(os_file_punch_hole_posix(fh, off, len));
6167 #endif /* _WIN32 */
6168 }
6169
6170 /** Check if the file system supports sparse files.
6171
6172 Warning: On POSIX systems we try and punch a hole from offset 0 to
6173 the system configured page size. This should only be called on an empty
6174 file.
6175
6176 Note: On Windows we use the name and on Unices we use the file handle.
6177
6178 @param[in] name File name
6179 @param[in] fh File handle for the file - if opened
6180 @return true if the file system supports sparse files */
6181 bool
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)6182 os_is_sparse_file_supported(const char* path, pfs_os_file_t fh)
6183 {
6184 /* In this debugging mode, we act as if punch hole is supported,
6185 then we skip any calls to actually punch a hole. In this way,
6186 Transparent Page Compression is still being tested. */
6187 DBUG_EXECUTE_IF("ignore_punch_hole",
6188 return(true);
6189 );
6190
6191 #ifdef _WIN32
6192 return(os_is_sparse_file_supported_win32(path));
6193 #else
6194 dberr_t err;
6195
6196 /* We don't know the FS block size, use the sector size. The FS
6197 will do the magic. */
6198 err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
6199
6200 return(err == DB_SUCCESS);
6201 #endif /* _WIN32 */
6202 }
6203
6204 /** This function returns information about the specified file
6205 @param[in] path pathname of the file
6206 @param[out] stat_info information of a file in a directory
6207 @param[in] check_rw_perm for testing whether the file can be opened
6208 in RW mode
6209 @param[in] read_only true if file is opened in read-only mode
6210 @return DB_SUCCESS if all OK */
6211 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)6212 os_file_get_status(
6213 const char* path,
6214 os_file_stat_t* stat_info,
6215 bool check_rw_perm,
6216 bool read_only)
6217 {
6218 dberr_t ret;
6219
6220 #ifdef _WIN32
6221 struct _stat64 info;
6222
6223 ret = os_file_get_status_win32(
6224 path, stat_info, &info, check_rw_perm, read_only);
6225
6226 #else
6227 struct stat info;
6228
6229 ret = os_file_get_status_posix(
6230 path, stat_info, &info, check_rw_perm, read_only);
6231
6232 #endif /* _WIN32 */
6233
6234 if (ret == DB_SUCCESS) {
6235 stat_info->ctime = info.st_ctime;
6236 stat_info->atime = info.st_atime;
6237 stat_info->mtime = info.st_mtime;
6238 stat_info->size = info.st_size;
6239 }
6240
6241 return(ret);
6242 }
6243
6244 /**
6245 Waits for an AIO operation to complete. This function is used to wait the
6246 for completed requests. The aio array of pending requests is divided
6247 into segments. The thread specifies which segment or slot it wants to wait
6248 for. NOTE: this function will also take care of freeing the aio slot,
6249 therefore no other thread is allowed to do the freeing!
6250 @param[in] segment The number of the segment in the aio arrays to
6251 wait for; segment 0 is the ibuf I/O thread,
6252 segment 1 the log I/O thread, then follow the
6253 non-ibuf read threads, and as the last are the
6254 non-ibuf write threads; if this is
6255 ULINT_UNDEFINED, then it means that sync AIO
6256 is used, and this parameter is ignored
6257 @param[out] m1 the messages passed with the AIO request; note
6258 that also in the case where the AIO operation
6259 failed, these output parameters are valid and
6260 can be used to restart the operation,
6261 for example
6262 @param[out] m2 callback message
6263 @param[out] type OS_FILE_WRITE or ..._READ
6264 @return DB_SUCCESS or error code */
6265 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6266 os_aio_handler(
6267 ulint segment,
6268 fil_node_t** m1,
6269 void** m2,
6270 IORequest* request)
6271 {
6272 dberr_t err;
6273
6274 if (srv_use_native_aio) {
6275 srv_set_io_thread_op_info(segment, "native aio handle");
6276
6277 #ifdef WIN_ASYNC_IO
6278
6279 err = os_aio_windows_handler(segment, 0, m1, m2, request);
6280
6281 #elif defined(LINUX_NATIVE_AIO)
6282
6283 err = os_aio_linux_handler(segment, m1, m2, request);
6284
6285 #else
6286 ut_error;
6287
6288 err = DB_ERROR; /* Eliminate compiler warning */
6289
6290 #endif /* WIN_ASYNC_IO */
6291
6292 } else {
6293 srv_set_io_thread_op_info(segment, "simulated aio handle");
6294
6295 err = os_aio_simulated_handler(segment, m1, m2, request);
6296 }
6297
6298 return(err);
6299 }
6300
6301 /** Constructor
6302 @param[in] id The latch ID
6303 @param[in] n Number of AIO slots
6304 @param[in] segments Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)6305 AIO::AIO(
6306 latch_id_t id,
6307 ulint n,
6308 ulint segments)
6309 :
6310 m_slots(n),
6311 m_n_segments(segments),
6312 m_n_reserved()
6313 # ifdef LINUX_NATIVE_AIO
6314 ,m_aio_ctx(),
6315 m_events(m_slots.size())
6316 # elif defined(_WIN32)
6317 ,m_handles()
6318 # endif /* LINUX_NATIVE_AIO */
6319 {
6320 ut_a(n > 0);
6321 ut_a(m_n_segments > 0);
6322
6323 mutex_create(id, &m_mutex);
6324
6325 m_not_full = os_event_create("aio_not_full");
6326 m_is_empty = os_event_create("aio_is_empty");
6327
6328 std::uninitialized_fill(m_slots.begin(), m_slots.end(), Slot());
6329 #ifdef LINUX_NATIVE_AIO
6330 memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
6331 #endif /* LINUX_NATIVE_AIO */
6332
6333 os_event_set(m_is_empty);
6334 }
6335
6336 /** Initialise the slots */
6337 dberr_t
init_slots()6338 AIO::init_slots()
6339 {
6340 for (ulint i = 0; i < m_slots.size(); ++i) {
6341 Slot& slot = m_slots[i];
6342
6343 slot.pos = static_cast<uint16_t>(i);
6344
6345 slot.is_reserved = false;
6346
6347 #ifdef WIN_ASYNC_IO
6348
6349 slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6350
6351 OVERLAPPED* over = &slot.control;
6352
6353 over->hEvent = slot.handle;
6354
6355 (*m_handles)[i] = over->hEvent;
6356
6357 #elif defined(LINUX_NATIVE_AIO)
6358
6359 slot.ret = 0;
6360
6361 slot.n_bytes = 0;
6362
6363 memset(&slot.control, 0x0, sizeof(slot.control));
6364
6365 #endif /* WIN_ASYNC_IO */
6366 }
6367
6368 return(DB_SUCCESS);
6369 }
6370
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface: allocates the io_context
array and creates one io_context per segment.
@return DB_SUCCESS, DB_OUT_OF_MEMORY if the array cannot be allocated,
or DB_IO_ERROR if an io_context cannot be created */
dberr_t
AIO::init_linux_native_aio()
{
	/* Initialize the io_context array. One io_context
	per segment in the array. */

	ut_a(m_aio_ctx == NULL);

	m_aio_ctx = static_cast<io_context**>(
		ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));

	if (m_aio_ctx == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	const ulint	max_events = slots_per_segment();

	for (ulint seg = 0; seg < m_n_segments; ++seg) {

		if (linux_create_io_ctx(max_events, &m_aio_ctx[seg])) {
			continue;
		}

		/* If something bad happened during aio setup
		we should call it a day and return right away.
		We don't care about any leaks because a failure
		to initialize the io subsystem means that the
		server (or atleast the innodb storage engine)
		is not going to startup. */
		return(DB_IO_ERROR);
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
6407
6408 /** Initialise the array */
6409 dberr_t
init()6410 AIO::init()
6411 {
6412 ut_a(!m_slots.empty());
6413
6414 #ifdef _WIN32
6415 ut_a(m_handles == NULL);
6416
6417 m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
6418 #endif /* _WIN32 */
6419
6420 if (srv_use_native_aio) {
6421 #ifdef LINUX_NATIVE_AIO
6422 dberr_t err = init_linux_native_aio();
6423
6424 if (err != DB_SUCCESS) {
6425 return(err);
6426 }
6427
6428 #endif /* LINUX_NATIVE_AIO */
6429 }
6430
6431 return(init_slots());
6432 }
6433
6434 /** Creates an aio wait array. Note that we return NULL in case of failure.
6435 We don't care about freeing memory here because we assume that a
6436 failure will result in server refusing to start up.
6437 @param[in] id Latch ID
6438 @param[in] n maximum number of pending AIO operations
6439 allowed; n must be divisible by m_n_segments
6440 @param[in] n_segments number of segments in the AIO array
6441 @return own: AIO array, NULL on failure */
6442 AIO*
create(latch_id_t id,ulint n,ulint n_segments)6443 AIO::create(
6444 latch_id_t id,
6445 ulint n,
6446 ulint n_segments)
6447 {
6448 if ((n % n_segments)) {
6449
6450 ib::error()
6451 << "Maximum number of AIO operations must be "
6452 << "divisible by number of segments";
6453
6454 return(NULL);
6455 }
6456
6457 AIO* array = UT_NEW_NOKEY(AIO(id, n, n_segments));
6458
6459 if (array != NULL && array->init() != DB_SUCCESS) {
6460
6461 UT_DELETE(array);
6462
6463 array = NULL;
6464 }
6465
6466 return(array);
6467 }
6468
6469 /** AIO destructor */
~AIO()6470 AIO::~AIO()
6471 {
6472 #ifdef WIN_ASYNC_IO
6473 for (ulint i = 0; i < m_slots.size(); ++i) {
6474 CloseHandle(m_slots[i].handle);
6475 }
6476 #endif /* WIN_ASYNC_IO */
6477
6478 #ifdef _WIN32
6479 UT_DELETE(m_handles);
6480 #endif /* _WIN32 */
6481
6482 mutex_destroy(&m_mutex);
6483
6484 os_event_destroy(m_not_full);
6485 os_event_destroy(m_is_empty);
6486
6487 #if defined(LINUX_NATIVE_AIO)
6488 if (srv_use_native_aio) {
6489 m_events.clear();
6490 ut_free(m_aio_ctx);
6491 }
6492 #endif /* LINUX_NATIVE_AIO */
6493
6494 m_slots.clear();
6495 }
6496
6497 /** Initializes the asynchronous io system. Creates one array each for ibuf
6498 and log i/o. Also creates one array each for read and write where each
6499 array is divided logically into n_readers and n_writers
6500 respectively. The caller must create an i/o handler thread for each
6501 segment in these arrays. This function also creates the sync array.
6502 No i/o handler thread needs to be created for that
6503 @param[in] n_per_seg maximum number of pending aio
6504 operations allowed per segment
6505 @param[in] n_readers number of reader threads
6506 @param[in] n_writers number of writer threads
6507 @param[in] n_slots_sync number of slots in the sync aio array
6508 @return true if the AIO sub-system was started successfully */
6509 bool
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)6510 AIO::start(
6511 ulint n_per_seg,
6512 ulint n_readers,
6513 ulint n_writers,
6514 ulint n_slots_sync)
6515 {
6516 #if defined(LINUX_NATIVE_AIO)
6517 /* Check if native aio is supported on this system and tmpfs */
6518 if (srv_use_native_aio && !is_linux_native_aio_supported()) {
6519
6520 ib::warn() << "Linux Native AIO disabled.";
6521
6522 srv_use_native_aio = FALSE;
6523 }
6524 #endif /* LINUX_NATIVE_AIO */
6525
6526 srv_reset_io_thread_op_info();
6527
6528 s_reads = create(
6529 LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
6530
6531 if (s_reads == NULL) {
6532 return(false);
6533 }
6534
6535 ulint start = srv_read_only_mode ? 0 : 2;
6536 ulint n_segs = n_readers + start;
6537
6538 /* 0 is the ibuf segment and 1 is the redo log segment. */
6539 for (ulint i = start; i < n_segs; ++i) {
6540 ut_a(i < SRV_MAX_N_IO_THREADS);
6541 srv_io_thread_function[i] = "read thread";
6542 }
6543
6544 ulint n_segments = n_readers;
6545
6546 if (!srv_read_only_mode) {
6547
6548 s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
6549
6550 if (s_ibuf == NULL) {
6551 return(false);
6552 }
6553
6554 ++n_segments;
6555
6556 srv_io_thread_function[0] = "insert buffer thread";
6557
6558 s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
6559
6560 if (s_log == NULL) {
6561 return(false);
6562 }
6563
6564 ++n_segments;
6565
6566 srv_io_thread_function[1] = "log thread";
6567
6568 } else {
6569 s_ibuf = s_log = NULL;
6570 }
6571
6572 s_writes = create(
6573 LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
6574
6575 if (s_writes == NULL) {
6576 return(false);
6577 }
6578
6579 n_segments += n_writers;
6580
6581 for (ulint i = start + n_readers; i < n_segments; ++i) {
6582 ut_a(i < SRV_MAX_N_IO_THREADS);
6583 srv_io_thread_function[i] = "write thread";
6584 }
6585
6586 ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
6587
6588 s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
6589
6590 if (s_sync == NULL) {
6591
6592 return(false);
6593 }
6594
6595 os_aio_n_segments = n_segments;
6596
6597 os_aio_validate();
6598
6599 os_aio_segment_wait_events = static_cast<os_event_t*>(
6600 ut_zalloc_nokey(
6601 n_segments * sizeof *os_aio_segment_wait_events));
6602
6603 if (os_aio_segment_wait_events == NULL) {
6604
6605 return(false);
6606 }
6607
6608 for (ulint i = 0; i < n_segments; ++i) {
6609 os_aio_segment_wait_events[i] = os_event_create(0);
6610 }
6611
6612 os_last_printout = ut_time_monotonic();
6613
6614 return(true);
6615 }
6616
6617 /** Free the AIO arrays */
6618 void
shutdown()6619 AIO::shutdown()
6620 {
6621 UT_DELETE(s_ibuf);
6622 s_ibuf = NULL;
6623
6624 UT_DELETE(s_log);
6625 s_log = NULL;
6626
6627 UT_DELETE(s_writes);
6628 s_writes = NULL;
6629
6630 UT_DELETE(s_sync);
6631 s_sync = NULL;
6632
6633 UT_DELETE(s_reads);
6634 s_reads = NULL;
6635 }
6636
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)

/** Max disk sector size */
static const ulint	MAX_SECTOR_SIZE = 4096;

/**
Try and get the FusionIO sector size.
Only acts when the flush method is O_DIRECT or O_DIRECT_NO_FSYNC. It
creates a temporary file "check_sector_size" opened with O_DIRECT and
writes to it with growing sizes and alignments, starting at
UNIV_SECTOR_SIZE and doubling, until a write goes through in full. The
size reached is stored in os_io_ptr_align. The temporary file is removed
again and errno is reset to 0 before returning. */
void
os_fusionio_get_sector_size()
{
	if (srv_unix_file_flush_method != SRV_UNIX_O_DIRECT
	    && srv_unix_file_flush_method != SRV_UNIX_O_DIRECT_NO_FSYNC) {

		return;
	}

	char	current_dir[3];
	char*	path = srv_data_home;

	/* If the srv_data_home is empty, set the path to
	current dir. */
	if (*path == 0) {
		current_dir[0] = FN_CURLIB;
		current_dir[1] = FN_LIBCHAR;
		current_dir[2] = 0;
		path = current_dir;
	}

	/* Get the path of data file */
	const char*	dir_end = strrchr(path, OS_PATH_SEPARATOR);
	const ulint	dir_len = dir_end ? dir_end - path : strlen(path);

	/* Allocate a zero filled buffer for the new path and copy the
	directory part into it; sizeof of the suffix accounts for the
	terminating NUL. */
	static const char	check_file_suffix[] = "/check_sector_size";

	const ulint	check_path_len = dir_len + sizeof check_file_suffix;

	char*	check_file_name = static_cast<char*>(
		ut_zalloc_nokey(check_path_len));

	memcpy(check_file_name, path, dir_len);

	/* Construct a check file name. */
	strcat(check_file_name + dir_len, check_file_suffix);

	/* Create a tmp file for checking sector size. */
	const os_file_t	check_file = ::open(
		check_file_name,
		O_CREAT|O_TRUNC|O_WRONLY|O_DIRECT,
		S_IRWXU);

	if (check_file == -1) {
		ib::error()
			<< "Failed to create check sector file, errno:"
			<< errno << " Please confirm O_DIRECT is"
			<< " supported and remove the file "
			<< check_file_name << " if it exists.";
		ut_free(check_file_name);
		errno = 0;
		return;
	}

	/* Try to write the file with different sector size
	alignment. The buffer is twice the maximum sector size so that
	an aligned block of any probed size fits into it. */
	byte*	ptr = static_cast<byte*>(
		ut_malloc_nokey(2 * MAX_SECTOR_SIZE));

	ulint	sector_size = UNIV_SECTOR_SIZE;

	for (; sector_size <= MAX_SECTOR_SIZE; sector_size *= 2) {
		byte*	block_ptr = static_cast<byte*>(
			ut_align(ptr, sector_size));

		const ssize_t	ret = pwrite(
			check_file, block_ptr, sector_size, 0);

		if (ret > 0 && (ulint) ret == sector_size) {
			break;
		}
	}

	/* The sector size should <= MAX_SECTOR_SIZE. */
	ut_ad(sector_size <= MAX_SECTOR_SIZE);

	close(check_file);
	unlink(check_file_name);

	ut_free(check_file_name);
	ut_free(ptr);
	errno = 0;

	os_io_ptr_align = sector_size;
}
#endif /* !NO_FALLOCATE && UNIV_LINUX */
6728
6729 /** Initializes the asynchronous io system. Creates one array each for ibuf
6730 and log i/o. Also creates one array each for read and write where each
6731 array is divided logically into n_readers and n_writers
6732 respectively. The caller must create an i/o handler thread for each
6733 segment in these arrays. This function also creates the sync array.
6734 No i/o handler thread needs to be created for that
6735 @param[in] n_readers number of reader threads
6736 @param[in] n_writers number of writer threads
6737 @param[in] n_slots_sync number of slots in the sync aio array */
6738 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6739 os_aio_init(
6740 ulint n_readers,
6741 ulint n_writers,
6742 ulint n_slots_sync)
6743 {
6744 /* Maximum number of pending aio operations allowed per segment */
6745 ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6746
6747 #ifdef _WIN32
6748 if (srv_use_native_aio) {
6749 limit = SRV_N_PENDING_IOS_PER_THREAD;
6750 }
6751 #endif /* _WIN32 */
6752
6753 ut_a(block_cache == NULL);
6754
6755 block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
6756
6757 for (Blocks::iterator it = block_cache->begin();
6758 it != block_cache->end();
6759 ++it) {
6760
6761 ut_a(it->m_in_use == 0);
6762 ut_a(it->m_ptr == NULL);
6763
6764 /* Allocate double of max page size memory, since
6765 compress could generate more bytes than orgininal
6766 data. */
6767 it->m_ptr = static_cast<byte*>(
6768 ut_malloc_nokey(BUFFER_BLOCK_SIZE));
6769
6770 ut_a(it->m_ptr != NULL);
6771 }
6772
6773 /* Get sector size for DIRECT_IO. In this case, we need to
6774 know the sector size for aligning the write buffer. */
6775 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6776 os_fusionio_get_sector_size();
6777 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6778
6779 return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
6780 }
6781
6782 /** Frees the asynchronous io system. */
6783 void
os_aio_free()6784 os_aio_free()
6785 {
6786 AIO::shutdown();
6787
6788 for (ulint i = 0; i < os_aio_n_segments; i++) {
6789 os_event_destroy(os_aio_segment_wait_events[i]);
6790 }
6791
6792 ut_free(os_aio_segment_wait_events);
6793 os_aio_segment_wait_events = 0;
6794 os_aio_n_segments = 0;
6795
6796 for (Blocks::iterator it = block_cache->begin();
6797 it != block_cache->end();
6798 ++it) {
6799
6800 ut_a(it->m_in_use == 0);
6801 ut_free(it->m_ptr);
6802 }
6803
6804 UT_DELETE(block_cache);
6805
6806 block_cache = NULL;
6807 }
6808
6809 /** Wakes up all async i/o threads so that they know to exit themselves in
6810 shutdown. */
6811 void
os_aio_wake_all_threads_at_shutdown()6812 os_aio_wake_all_threads_at_shutdown()
6813 {
6814 #ifdef WIN_ASYNC_IO
6815
6816 AIO::wake_at_shutdown();
6817
6818 #elif defined(LINUX_NATIVE_AIO)
6819
6820 /* When using native AIO interface the io helper threads
6821 wait on io_getevents with a timeout value of 500ms. At
6822 each wake up these threads check the server status.
6823 No need to do anything to wake them up. */
6824
6825 if (srv_use_native_aio) {
6826 return;
6827 }
6828
6829 #endif /* !WIN_ASYNC_AIO */
6830
6831 /* Fall through to simulated AIO handler wakeup if we are
6832 not using native AIO. */
6833
6834 /* This loop wakes up all simulated ai/o threads */
6835
6836 for (ulint i = 0; i < os_aio_n_segments; ++i) {
6837
6838 os_event_set(os_aio_segment_wait_events[i]);
6839 }
6840 }
6841
6842 /** Waits until there are no pending writes in AIO::s_writes. There can
6843 be other, synchronous, pending writes. */
6844 void
os_aio_wait_until_no_pending_writes()6845 os_aio_wait_until_no_pending_writes()
6846 {
6847 AIO::wait_until_no_pending_writes();
6848 }
6849
6850 /** Calculates segment number for a slot.
6851 @param[in] array AIO wait array
6852 @param[in] slot slot in this array
6853 @return segment number (which is the number used by, for example,
6854 I/O-handler threads) */
6855 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)6856 AIO::get_segment_no_from_slot(
6857 const AIO* array,
6858 const Slot* slot)
6859 {
6860 ulint segment;
6861 ulint seg_len;
6862
6863 if (array == s_ibuf) {
6864 ut_ad(!srv_read_only_mode);
6865
6866 segment = IO_IBUF_SEGMENT;
6867
6868 } else if (array == s_log) {
6869 ut_ad(!srv_read_only_mode);
6870
6871 segment = IO_LOG_SEGMENT;
6872
6873 } else if (array == s_reads) {
6874 seg_len = s_reads->slots_per_segment();
6875
6876 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6877 } else {
6878 ut_a(array == s_writes);
6879
6880 seg_len = s_writes->slots_per_segment();
6881
6882 segment = s_reads->m_n_segments
6883 + (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6884 }
6885
6886 return(segment);
6887 }
6888
6889 /** Requests for a slot in the aio array. If no slot is available, waits until
6890 not_full-event becomes signaled.
6891
6892 @param[in,out] type IO context
6893 @param[in,out] m1 message to be passed along with the AIO
6894 operation
6895 @param[in,out] m2 message to be passed along with the AIO
6896 operation
6897 @param[in] file file handle
6898 @param[in] name name of the file or path as a NUL-terminated
6899 string
6900 @param[in,out] buf buffer where to read or from which to write
6901 @param[in] offset file offset, where to read from or start writing
6902 @param[in] len length of the block to read or write
6903 @return pointer to slot */
6904 Slot*
reserve_slot(IORequest & type,fil_node_t * m1,void * m2,pfs_os_file_t file,const char * name,void * buf,os_offset_t offset,ulint len)6905 AIO::reserve_slot(
6906 IORequest& type,
6907 fil_node_t* m1,
6908 void* m2,
6909 pfs_os_file_t file,
6910 const char* name,
6911 void* buf,
6912 os_offset_t offset,
6913 ulint len)
6914 {
6915 #ifdef WIN_ASYNC_IO
6916 ut_a((len & 0xFFFFFFFFUL) == len);
6917 #endif /* WIN_ASYNC_IO */
6918
6919 /* No need of a mutex. Only reading constant fields */
6920 ulint slots_per_seg;
6921
6922 ut_ad(type.validate());
6923
6924 slots_per_seg = slots_per_segment();
6925
6926 /* We attempt to keep adjacent blocks in the same local
6927 segment. This can help in merging IO requests when we are
6928 doing simulated AIO */
6929 ulint local_seg;
6930
6931 local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;
6932
6933 for (;;) {
6934
6935 acquire();
6936
6937 if (m_n_reserved != m_slots.size()) {
6938 break;
6939 }
6940
6941 release();
6942
6943 if (!srv_use_native_aio) {
6944 /* If the handler threads are suspended,
6945 wake them so that we get more slots */
6946
6947 os_aio_simulated_wake_handler_threads();
6948 }
6949
6950 os_event_wait(m_not_full);
6951 }
6952
6953 ulint counter = 0;
6954 Slot* slot = NULL;
6955
6956 /* We start our search for an available slot from our preferred
6957 local segment and do a full scan of the array. We are
6958 guaranteed to find a slot in full scan. */
6959 for (ulint i = local_seg * slots_per_seg;
6960 counter < m_slots.size();
6961 ++i, ++counter) {
6962
6963 i %= m_slots.size();
6964
6965 slot = at(i);
6966
6967 if (slot->is_reserved == false) {
6968 break;
6969 }
6970 }
6971
6972 /* We MUST always be able to get hold of a reserved slot. */
6973 ut_a(counter < m_slots.size());
6974
6975 ut_a(slot->is_reserved == false);
6976
6977 ++m_n_reserved;
6978
6979 if (m_n_reserved == 1) {
6980 os_event_reset(m_is_empty);
6981 }
6982
6983 if (m_n_reserved == m_slots.size()) {
6984 os_event_reset(m_not_full);
6985 }
6986
6987 slot->is_reserved = true;
6988 slot->reservation_time = ut_time_monotonic();
6989 slot->m1 = m1;
6990 slot->m2 = m2;
6991 slot->file = file;
6992 slot->name = name;
6993 #ifdef _WIN32
6994 slot->len = static_cast<DWORD>(len);
6995 #else
6996 slot->len = static_cast<ulint>(len);
6997 #endif /* _WIN32 */
6998 slot->type = type;
6999 slot->buf = static_cast<byte*>(buf);
7000 slot->ptr = slot->buf;
7001 slot->offset = offset;
7002 slot->err = DB_SUCCESS;
7003 slot->original_len = static_cast<uint32>(len);
7004 slot->io_already_done = false;
7005 slot->buf_block = NULL;
7006
7007 if (srv_use_native_aio
7008 && offset > 0
7009 && type.is_write()
7010 && type.is_compressed()) {
7011 ulint compressed_len = len;
7012
7013 ut_ad(!type.is_log());
7014
7015 release();
7016
7017 void* src_buf = slot->buf;
7018 slot->buf_block = os_file_compress_page(
7019 type,
7020 src_buf,
7021 &compressed_len);
7022
7023 slot->buf = static_cast<byte*>(src_buf);
7024 slot->ptr = slot->buf;
7025 #ifdef _WIN32
7026 slot->len = static_cast<DWORD>(compressed_len);
7027 #else
7028 slot->len = static_cast<ulint>(compressed_len);
7029 #endif /* _WIN32 */
7030 slot->skip_punch_hole = !type.punch_hole();
7031
7032 acquire();
7033 }
7034
7035 /* We do encryption after compression, since if we do encryption
7036 before compression, the encrypted data will cause compression fail
7037 or low compression rate. */
7038 if (srv_use_native_aio
7039 && offset > 0
7040 && type.is_write()
7041 && type.is_encrypted()) {
7042 ulint encrypted_len = slot->len;
7043 Block* encrypted_block;
7044
7045 ut_ad(!type.is_log());
7046
7047 release();
7048
7049 void* src_buf = slot->buf;
7050 encrypted_block = os_file_encrypt_page(
7051 type,
7052 src_buf,
7053 &encrypted_len);
7054
7055 if (slot->buf_block != NULL) {
7056 os_free_block(slot->buf_block);
7057 }
7058
7059 slot->buf_block = encrypted_block;
7060 slot->buf = static_cast<byte*>(src_buf);
7061 slot->ptr = slot->buf;
7062
7063 #ifdef _WIN32
7064 slot->len = static_cast<DWORD>(encrypted_len);
7065 #else
7066 slot->len = static_cast<ulint>(encrypted_len);
7067 #endif /* _WIN32 */
7068
7069 acquire();
7070 }
7071
7072 #ifdef WIN_ASYNC_IO
7073 {
7074 OVERLAPPED* control;
7075
7076 control = &slot->control;
7077 control->Offset = (DWORD) offset & 0xFFFFFFFF;
7078 control->OffsetHigh = (DWORD) (offset >> 32);
7079
7080 ResetEvent(slot->handle);
7081 }
7082 #elif defined(LINUX_NATIVE_AIO)
7083
7084 /* If we are not using native AIO skip this part. */
7085 if (srv_use_native_aio) {
7086
7087 off_t aio_offset;
7088
7089 /* Check if we are dealing with 64 bit arch.
7090 If not then make sure that offset fits in 32 bits. */
7091 aio_offset = (off_t) offset;
7092
7093 ut_a(sizeof(aio_offset) >= sizeof(offset)
7094 || ((os_offset_t) aio_offset) == offset);
7095
7096 struct iocb* iocb = &slot->control;
7097
7098 if (type.is_read()) {
7099 io_prep_pread(
7100 iocb, file.m_file, slot->ptr, slot->len, aio_offset);
7101 } else {
7102 ut_ad(type.is_write());
7103 io_prep_pwrite(
7104 iocb, file.m_file, slot->ptr, slot->len, aio_offset);
7105 }
7106
7107 iocb->data = slot;
7108
7109 slot->n_bytes = 0;
7110 slot->ret = 0;
7111 }
7112 #endif /* LINUX_NATIVE_AIO */
7113
7114 release();
7115
7116 return(slot);
7117 }
7118
7119 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
7120 @param[in] global_segment The number of the segment in the AIO arrays */
7121 void
wake_simulated_handler_thread(ulint global_segment)7122 AIO::wake_simulated_handler_thread(ulint global_segment)
7123 {
7124 ut_ad(!srv_use_native_aio);
7125
7126 AIO* array;
7127 ulint segment = get_array_and_local_segment(&array, global_segment);
7128
7129 array->wake_simulated_handler_thread(global_segment, segment);
7130 }
7131
7132 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
7133 for a local segment in the AIO array.
7134 @param[in] global_segment The number of the segment in the AIO arrays
7135 @param[in] segment The local segment in the AIO array */
7136 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)7137 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
7138 {
7139 ut_ad(!srv_use_native_aio);
7140
7141 ulint n = slots_per_segment();
7142 ulint offset = segment * n;
7143
7144 /* Look through n slots after the segment * n'th slot */
7145
7146 acquire();
7147
7148 const Slot* slot = at(offset);
7149
7150 for (ulint i = 0; i < n; ++i, ++slot) {
7151
7152 if (slot->is_reserved) {
7153
7154 /* Found an i/o request */
7155
7156 release();
7157
7158 os_event_t event;
7159
7160 event = os_aio_segment_wait_events[global_segment];
7161
7162 os_event_set(event);
7163
7164 return;
7165 }
7166 }
7167
7168 release();
7169 }
7170
7171 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
7172 void
os_aio_simulated_wake_handler_threads()7173 os_aio_simulated_wake_handler_threads()
7174 {
7175 if (srv_use_native_aio) {
7176 /* We do not use simulated aio: do nothing */
7177
7178 return;
7179 }
7180
7181 os_aio_recommend_sleep_for_read_threads = false;
7182
7183 for (ulint i = 0; i < os_aio_n_segments; i++) {
7184 AIO::wake_simulated_handler_thread(i);
7185 }
7186 }
7187
7188 /** Select the IO slot array
7189 @param[in] type Type of IO, READ or WRITE
7190 @param[in] read_only true if running in read-only mode
7191 @param[in] mode IO mode
7192 @return slot array or NULL if invalid mode specified */
7193 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)7194 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
7195 {
7196 AIO* array;
7197
7198 ut_ad(type.validate());
7199
7200 switch (mode) {
7201 case OS_AIO_NORMAL:
7202
7203 array = type.is_read() ? AIO::s_reads : AIO::s_writes;
7204 break;
7205
7206 case OS_AIO_IBUF:
7207 ut_ad(type.is_read());
7208
7209 /* Reduce probability of deadlock bugs in connection with ibuf:
7210 do not let the ibuf i/o handler sleep */
7211
7212 type.clear_do_not_wake();
7213
7214 array = read_only ? AIO::s_reads : AIO::s_ibuf;
7215 break;
7216
7217 case OS_AIO_LOG:
7218
7219 array = read_only ? AIO::s_reads : AIO::s_log;
7220 break;
7221
7222 case OS_AIO_SYNC:
7223
7224 array = AIO::s_sync;
7225 #if defined(LINUX_NATIVE_AIO)
7226 /* In Linux native AIO we don't use sync IO array. */
7227 ut_a(!srv_use_native_aio);
7228 #endif /* LINUX_NATIVE_AIO */
7229 break;
7230
7231 default:
7232 ut_error;
7233 array = NULL; /* Eliminate compiler warning */
7234 }
7235
7236 return(array);
7237 }
7238
7239 #ifdef WIN_ASYNC_IO
7240 /** This function is only used in Windows asynchronous i/o.
7241 Waits for an aio operation to complete. This function is used to wait the
7242 for completed requests. The aio array of pending requests is divided
7243 into segments. The thread specifies which segment or slot it wants to wait
7244 for. NOTE: this function will also take care of freeing the aio slot,
7245 therefore no other thread is allowed to do the freeing!
7246 @param[in] segment The number of the segment in the aio arrays to
7247 wait for; segment 0 is the ibuf I/O thread,
7248 segment 1 the log I/O thread, then follow the
7249 non-ibuf read threads, and as the last are the
7250 non-ibuf write threads; if this is
7251 ULINT_UNDEFINED, then it means that sync AIO
7252 is used, and this parameter is ignored
7253 @param[in] pos this parameter is used only in sync AIO:
7254 wait for the aio slot at this position
7255 @param[out] m1 the messages passed with the AIO request; note
7256 that also in the case where the AIO operation
7257 failed, these output parameters are valid and
7258 can be used to restart the operation,
7259 for example
7260 @param[out] m2 callback message
7261 @param[out] type OS_FILE_WRITE or ..._READ
7262 @return DB_SUCCESS or error code */
7263 static
7264 dberr_t
os_aio_windows_handler(ulint segment,ulint pos,fil_node_t ** m1,void ** m2,IORequest * type)7265 os_aio_windows_handler(
7266 ulint segment,
7267 ulint pos,
7268 fil_node_t** m1,
7269 void** m2,
7270 IORequest* type)
7271 {
7272 Slot* slot;
7273 dberr_t err;
7274 AIO* array;
7275 ulint orig_seg = segment;
7276
7277 if (segment == ULINT_UNDEFINED) {
7278 segment = 0;
7279 array = AIO::sync_array();
7280 } else {
7281 segment = AIO::get_array_and_local_segment(&array, segment);
7282 }
7283
7284 /* NOTE! We only access constant fields in os_aio_array. Therefore
7285 we do not have to acquire the protecting mutex yet */
7286
7287 ut_ad(os_aio_validate_skip());
7288
7289 if (array == AIO::sync_array()) {
7290
7291 WaitForSingleObject(array->at(pos)->handle, INFINITE);
7292
7293 } else {
7294 if (orig_seg != ULINT_UNDEFINED) {
7295 srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
7296 }
7297
7298 pos = WaitForMultipleObjects(
7299 (DWORD) array->slots_per_segment(),
7300 array->handles(segment),
7301 FALSE, INFINITE);
7302 }
7303
7304 array->acquire();
7305
7306 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
7307 && array->is_empty()
7308 && !buf_page_cleaner_is_active) {
7309
7310 *m1 = NULL;
7311 *m2 = NULL;
7312
7313 array->release();
7314
7315 return(DB_SUCCESS);
7316 }
7317
7318 ulint n = array->slots_per_segment();
7319
7320 ut_a(pos >= WAIT_OBJECT_0 && pos <= WAIT_OBJECT_0 + n);
7321
7322 slot = array->at(pos + segment * n);
7323
7324 ut_a(slot->is_reserved);
7325
7326 if (orig_seg != ULINT_UNDEFINED) {
7327 srv_set_io_thread_op_info(
7328 orig_seg, "get windows aio return value");
7329 }
7330
7331 BOOL ret;
7332 ret = GetOverlappedResult(
7333 slot->file.m_file, &slot->control, &slot->n_bytes, TRUE);
7334 *m1 = slot->m1;
7335 *m2 = slot->m2;
7336
7337 *type = slot->type;
7338
7339 BOOL retry = FALSE;
7340
7341 if (ret && slot->n_bytes == slot->len) {
7342
7343 err = DB_SUCCESS;
7344
7345 } else if (os_file_handle_error(slot->name, "Windows aio")) {
7346
7347 retry = true;
7348
7349 } else {
7350
7351 err = DB_IO_ERROR;
7352 }
7353
7354 array->release();
7355
7356 if (retry) {
7357 /* Retry failed read/write operation synchronously.
7358 No need to hold array->m_mutex. */
7359
7360 #ifdef UNIV_PFS_IO
7361 /* This read/write does not go through os_file_read
7362 and os_file_write APIs, need to register with
7363 performance schema explicitly here. */
7364 struct PSI_file_locker* locker = NULL;
7365 PSI_file_locker_state state;
7366 register_pfs_file_io_begin(
7367 &state, locker, slot->file, slot->len,
7368 slot->type.is_write()
7369 ? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
7370 #endif /* UNIV_PFS_IO */
7371
7372 ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
7373
7374 ssize_t n_bytes = SyncFileIO::execute(slot);
7375
7376 #ifdef UNIV_PFS_IO
7377 register_pfs_file_io_end(locker, slot->len);
7378 #endif /* UNIV_PFS_IO */
7379
7380 if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
7381 /* AIO was queued successfully!
7382 We want a synchronous I/O operation on a
7383 file where we also use async I/O: in Windows
7384 we must use the same wait mechanism as for
7385 async I/O */
7386
7387 BOOL ret;
7388 ret = GetOverlappedResult(
7389 slot->file.m_file, &slot->control, &slot->n_bytes,
7390 TRUE);
7391 n_bytes = ret ? slot->n_bytes : -1;
7392 }
7393
7394 err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
7395 }
7396
7397 if (err == DB_SUCCESS) {
7398 err = AIOHandler::post_io_processing(slot);
7399 }
7400
7401 array->release_with_mutex(slot);
7402
7403 return(err);
7404 }
7405 #endif /* WIN_ASYNC_IO */
7406
7407 /**
7408 NOTE! Use the corresponding macro os_aio(), not directly this function!
7409 Requests an asynchronous i/o operation.
7410 @param[in] type IO request context
7411 @param[in] mode IO mode
7412 @param[in] name Name of the file or path as NUL terminated
7413 string
7414 @param[in] file Open file handle
7415 @param[out] buf buffer where to read
7416 @param[in] offset file offset where to read
7417 @param[in] n number of bytes to read
7418 @param[in] read_only if true read only mode checks are enforced
7419 @param[in,out] m1 Message for the AIO handler, (can be used to
7420 identify a completed AIO operation); ignored
7421 if mode is OS_AIO_SYNC
7422 @param[in,out] m2 message for the AIO handler (can be used to
7423 identify a completed AIO operation); ignored
7424 if mode is OS_AIO_SYNC
7425 @return DB_SUCCESS or error code */
7426 dberr_t
os_aio_func(IORequest & type,ulint mode,const char * name,pfs_os_file_t file,void * buf,os_offset_t offset,ulint n,bool read_only,fil_node_t * m1,void * m2)7427 os_aio_func(
7428 IORequest& type,
7429 ulint mode,
7430 const char* name,
7431 pfs_os_file_t file,
7432 void* buf,
7433 os_offset_t offset,
7434 ulint n,
7435 bool read_only,
7436 fil_node_t* m1,
7437 void* m2)
7438 {
7439 #ifdef WIN_ASYNC_IO
7440 BOOL ret = TRUE;
7441 #endif /* WIN_ASYNC_IO */
7442
7443 ut_ad(n > 0);
7444 ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
7445 ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
7446 ut_ad(os_aio_validate_skip());
7447
7448 #ifdef WIN_ASYNC_IO
7449 ut_ad((n & 0xFFFFFFFFUL) == n);
7450 #endif /* WIN_ASYNC_IO */
7451
7452 if (mode == OS_AIO_SYNC
7453 #ifdef WIN_ASYNC_IO
7454 && !srv_use_native_aio
7455 #endif /* WIN_ASYNC_IO */
7456 ) {
7457 /* This is actually an ordinary synchronous read or write:
7458 no need to use an i/o-handler thread. NOTE that if we use
7459 Windows async i/o, Windows does not allow us to use
7460 ordinary synchronous os_file_read etc. on the same file,
7461 therefore we have built a special mechanism for synchronous
7462 wait in the Windows case.
7463 Also note that the Performance Schema instrumentation has
7464 been performed by current os_aio_func()'s wrapper function
7465 pfs_os_aio_func(). So we would no longer need to call
7466 Performance Schema instrumented os_file_read() and
7467 os_file_write(). Instead, we should use os_file_read_func()
7468 and os_file_write_func() */
7469
7470 if (type.is_read()) {
7471 return(os_file_read_func(type, file.m_file, buf, offset, n));
7472 }
7473
7474 ut_ad(type.is_write());
7475 return(os_file_write_func(type, name, file.m_file, buf, offset, n));
7476 }
7477
7478 try_again:
7479
7480 AIO* array;
7481
7482 array = AIO::select_slot_array(type, read_only, mode);
7483
7484 Slot* slot;
7485
7486 slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);
7487
7488 if (type.is_read()) {
7489
7490 if (srv_use_native_aio) {
7491
7492 ++os_n_file_reads;
7493
7494 os_bytes_read_since_printout += n;
7495 #ifdef WIN_ASYNC_IO
7496 ret = ReadFile(
7497 file.m_file, slot->ptr, slot->len,
7498 &slot->n_bytes, &slot->control);
7499 #elif defined(LINUX_NATIVE_AIO)
7500 if (!array->linux_dispatch(slot)) {
7501 goto err_exit;
7502 }
7503 #endif /* WIN_ASYNC_IO */
7504 } else if (type.is_wake()) {
7505 AIO::wake_simulated_handler_thread(
7506 AIO::get_segment_no_from_slot(array, slot));
7507 }
7508 } else if (type.is_write()) {
7509
7510 if (srv_use_native_aio) {
7511 ++os_n_file_writes;
7512
7513 #ifdef WIN_ASYNC_IO
7514 ret = WriteFile(
7515 file.m_file, slot->ptr, slot->len,
7516 &slot->n_bytes, &slot->control);
7517 #elif defined(LINUX_NATIVE_AIO)
7518 if (!array->linux_dispatch(slot)) {
7519 goto err_exit;
7520 }
7521 #endif /* WIN_ASYNC_IO */
7522
7523 } else if (type.is_wake()) {
7524 AIO::wake_simulated_handler_thread(
7525 AIO::get_segment_no_from_slot(array, slot));
7526 }
7527 } else {
7528 ut_error;
7529 }
7530
7531 #ifdef WIN_ASYNC_IO
7532 if (srv_use_native_aio) {
7533 if ((ret && slot->len == slot->n_bytes)
7534 || (!ret && GetLastError() == ERROR_IO_PENDING)) {
7535 /* aio was queued successfully! */
7536
7537 if (mode == OS_AIO_SYNC) {
7538 IORequest dummy_type;
7539 void* dummy_mess2;
7540 struct fil_node_t* dummy_mess1;
7541
7542 /* We want a synchronous i/o operation on a
7543 file where we also use async i/o: in Windows
7544 we must use the same wait mechanism as for
7545 async i/o */
7546
7547 return(os_aio_windows_handler(
7548 ULINT_UNDEFINED, slot->pos,
7549 &dummy_mess1, &dummy_mess2,
7550 &dummy_type));
7551 }
7552
7553 return(DB_SUCCESS);
7554 }
7555
7556 goto err_exit;
7557 }
7558 #endif /* WIN_ASYNC_IO */
7559
7560 /* AIO request was queued successfully! */
7561 return(DB_SUCCESS);
7562
7563 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
7564 err_exit:
7565 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
7566
7567 array->release_with_mutex(slot);
7568
7569 if (os_file_handle_error(
7570 name, type.is_read() ? "aio read" : "aio write")) {
7571
7572 goto try_again;
7573 }
7574
7575 return(DB_IO_ERROR);
7576 }
7577
7578 /** Simulated AIO handler for reaping IO requests */
7579 class SimulatedAIOHandler {
7580
7581 public:
7582
7583 /** Constructor
7584 @param[in,out] array The AIO array
7585 @param[in] segment Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)7586 SimulatedAIOHandler(AIO* array, ulint segment)
7587 :
7588 m_oldest(),
7589 m_n_elems(),
7590 m_lowest_offset(IB_UINT64_MAX),
7591 m_array(array),
7592 m_n_slots(),
7593 m_segment(segment),
7594 m_ptr(),
7595 m_buf()
7596 {
7597 ut_ad(m_segment < 100);
7598
7599 m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
7600 }
7601
7602 /** Destructor */
~SimulatedAIOHandler()7603 ~SimulatedAIOHandler()
7604 {
7605 if (m_ptr != NULL) {
7606 ut_free(m_ptr);
7607 }
7608 }
7609
7610 /** Reset the state of the handler
7611 @param[in] n_slots Number of pending AIO operations supported */
init(ulint n_slots)7612 void init(ulint n_slots)
7613 {
7614 m_oldest = 0;
7615 m_n_elems = 0;
7616 m_n_slots = n_slots;
7617 m_lowest_offset = IB_UINT64_MAX;
7618
7619 if (m_ptr != NULL) {
7620 ut_free(m_ptr);
7621 m_ptr = m_buf = NULL;
7622 }
7623
7624 m_slots[0] = NULL;
7625 }
7626
7627 /** Check if there is a slot for which the i/o has already been done
7628 @param[out] n_reserved Number of reserved slots
7629 @return the first completed slot that is found. */
check_completed(ulint * n_reserved)7630 Slot* check_completed(ulint* n_reserved)
7631 {
7632 ulint offset = m_segment * m_n_slots;
7633
7634 *n_reserved = 0;
7635
7636 Slot* slot;
7637
7638 slot = m_array->at(offset);
7639
7640 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7641
7642 if (slot->is_reserved) {
7643
7644 if (slot->io_already_done) {
7645
7646 ut_a(slot->is_reserved);
7647
7648 return(slot);
7649 }
7650
7651 ++*n_reserved;
7652 }
7653 }
7654
7655 return(NULL);
7656 }
7657
7658 /** If there are at least 2 seconds old requests, then pick the
7659 oldest one to prevent starvation. If several requests have the
7660 same age, then pick the one at the lowest offset.
7661 @return true if request was selected */
select()7662 bool select()
7663 {
7664 if (!select_oldest()) {
7665
7666 return(select_lowest_offset());
7667 }
7668
7669 return(true);
7670 }
7671
7672 /** Check if there are several consecutive blocks
7673 to read or write. Merge them if found. */
merge()7674 void merge()
7675 {
7676 /* if m_n_elems != 0, then we have assigned
7677 something valid to consecutive_ios[0] */
7678 ut_ad(m_n_elems != 0);
7679 ut_ad(first_slot() != NULL);
7680
7681 Slot* slot = first_slot();
7682
7683 while (!merge_adjacent(slot)) {
7684 /* No op */
7685 }
7686 }
7687
7688 /** We have now collected n_consecutive I/O requests
7689 in the array; allocate a single buffer which can hold
7690 all data, and perform the I/O
7691 @return the length of the buffer */
allocate_buffer()7692 ulint allocate_buffer()
7693 MY_ATTRIBUTE((warn_unused_result))
7694 {
7695 ulint len;
7696 Slot* slot = first_slot();
7697
7698 ut_ad(m_ptr == NULL);
7699
7700 if (slot->type.is_read() && m_n_elems > 1) {
7701
7702 len = 0;
7703
7704 for (ulint i = 0; i < m_n_elems; ++i) {
7705 len += m_slots[i]->len;
7706 }
7707
7708 m_ptr = static_cast<byte*>(
7709 ut_malloc_nokey(len + UNIV_PAGE_SIZE));
7710
7711 m_buf = static_cast<byte*>(
7712 ut_align(m_ptr, UNIV_PAGE_SIZE));
7713
7714 } else {
7715 len = first_slot()->len;
7716 m_buf = first_slot()->buf;
7717 }
7718
7719 return(len);
7720 }
7721
7722 /** We have to compress the individual pages and punch
7723 holes in them on a page by page basis when writing to
7724 tables that can be compresed at the IO level.
7725 @param[in] len Value returned by allocate_buffer */
copy_to_buffer(ulint len)7726 void copy_to_buffer(ulint len)
7727 {
7728 Slot* slot = first_slot();
7729
7730 if (len > slot->len && slot->type.is_write()) {
7731
7732 byte* ptr = m_buf;
7733
7734 ut_ad(ptr != slot->buf);
7735
7736 /* Copy the buffers to the combined buffer */
7737 for (ulint i = 0; i < m_n_elems; ++i) {
7738
7739 slot = m_slots[i];
7740
7741 memmove(ptr, slot->buf, slot->len);
7742
7743 ptr += slot->len;
7744 }
7745 }
7746 }
7747
7748 /** Do the I/O with ordinary, synchronous i/o functions:
7749 @param[in] len Length of buffer for IO */
io()7750 void io()
7751 {
7752 if (first_slot()->type.is_write()) {
7753
7754 for (ulint i = 0; i < m_n_elems; ++i) {
7755 write(m_slots[i]);
7756 }
7757
7758 } else {
7759
7760 for (ulint i = 0; i < m_n_elems; ++i) {
7761 read(m_slots[i]);
7762 }
7763 }
7764 }
7765
/** Hook called when the I/O has completed; described originally as doing
the decompression of the pages read in. It currently does nothing. */
void io_complete()
{
	/* Intentionally empty. Original note: for non-compressed
	tables; not required for correctness. */
}
7772
7773 /** Mark the i/os done in slots */
done()7774 void done()
7775 {
7776 for (ulint i = 0; i < m_n_elems; ++i) {
7777 m_slots[i]->io_already_done = true;
7778 }
7779 }
7780
7781 /** @return the first slot in the consecutive array */
first_slot()7782 Slot* first_slot()
7783 MY_ATTRIBUTE((warn_unused_result))
7784 {
7785 ut_a(m_n_elems > 0);
7786
7787 return(m_slots[0]);
7788 }
7789
7790 /** Wait for I/O requests
7791 @param[in] global_segment The global segment
7792 @param[in,out] event Wait on event if no active requests
7793 @return the number of slots */
7794 ulint check_pending(
7795 ulint global_segment,
7796 os_event_t event)
7797 MY_ATTRIBUTE((warn_unused_result));
7798 private:
7799
7800 /** Do the file read
7801 @param[in,out] slot Slot that has the IO context */
read(Slot * slot)7802 void read(Slot* slot)
7803 {
7804 dberr_t err = os_file_read_func(
7805 slot->type,
7806 slot->file.m_file,
7807 slot->ptr,
7808 slot->offset,
7809 slot->len);
7810 ut_a(err == DB_SUCCESS);
7811 }
7812
7813 /** Do the file read
7814 @param[in,out] slot Slot that has the IO context */
write(Slot * slot)7815 void write(Slot* slot)
7816 {
7817 dberr_t err = os_file_write_func(
7818 slot->type,
7819 slot->name,
7820 slot->file.m_file,
7821 slot->ptr,
7822 slot->offset,
7823 slot->len);
7824 ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
7825 }
7826
7827 /** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7828 bool adjacent(const Slot* s1, const Slot* s2) const
7829 {
7830 return(s1 != s2
7831 && s1->file.m_file == s2->file.m_file
7832 && s2->offset == s1->offset + s1->len
7833 && s1->type == s2->type);
7834 }
7835
7836 /** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7837 bool merge_adjacent(Slot*& current)
7838 {
7839 Slot* slot;
7840 ulint offset = m_segment * m_n_slots;
7841
7842 slot = m_array->at(offset);
7843
7844 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7845
7846 if (slot->is_reserved && adjacent(current, slot)) {
7847
7848 current = slot;
7849
7850 /* Found a consecutive i/o request */
7851
7852 m_slots[m_n_elems] = slot;
7853
7854 ++m_n_elems;
7855
7856 return(m_n_elems >= m_slots.capacity());
7857 }
7858 }
7859
7860 return(true);
7861 }
7862
7863 /** There were no old requests. Look for an I/O request at the lowest
7864 offset in the array (we ignore the high 32 bits of the offset in these
7865 heuristics) */
select_lowest_offset()7866 bool select_lowest_offset()
7867 {
7868 ut_ad(m_n_elems == 0);
7869
7870 ulint offset = m_segment * m_n_slots;
7871
7872 m_lowest_offset = IB_UINT64_MAX;
7873
7874 for (ulint i = 0; i < m_n_slots; ++i) {
7875 Slot* slot;
7876
7877 slot = m_array->at(i + offset);
7878
7879 if (slot->is_reserved
7880 && slot->offset < m_lowest_offset) {
7881
7882 /* Found an i/o request */
7883 m_slots[0] = slot;
7884
7885 m_n_elems = 1;
7886
7887 m_lowest_offset = slot->offset;
7888 }
7889 }
7890
7891 return(m_n_elems > 0);
7892 }
7893
7894 /** Select the slot if it is older than the current oldest slot.
7895 @param[in] slot The slot to check */
select_if_older(Slot * slot)7896 void select_if_older(Slot* slot)
7897 {
7898 int64_t time_diff = ut_time_monotonic() -
7899 slot->reservation_time;
7900
7901 const uint64_t age = time_diff > 0 ? (uint64_t) time_diff : 0;
7902
7903 if ((age >= 2 && age > m_oldest)
7904 || (age >= 2
7905 && age == m_oldest
7906 && slot->offset < m_lowest_offset)) {
7907
7908 /* Found an i/o request */
7909 m_slots[0] = slot;
7910
7911 m_n_elems = 1;
7912
7913 m_oldest = age;
7914
7915 m_lowest_offset = slot->offset;
7916 }
7917 }
7918
7919 /** Select th oldest slot in the array
7920 @return true if oldest slot found */
select_oldest()7921 bool select_oldest()
7922 {
7923 ut_ad(m_n_elems == 0);
7924
7925 Slot* slot;
7926 ulint offset = m_n_slots * m_segment;
7927
7928 slot = m_array->at(offset);
7929
7930 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7931
7932 if (slot->is_reserved) {
7933 select_if_older(slot);
7934 }
7935 }
7936
7937 return(m_n_elems > 0);
7938 }
7939
/** Container type for the batch of selected slots */
typedef std::vector<Slot*>	slots_t;

private:
/** Age of the slot chosen by select_if_older() */
ulint		m_oldest;

/** Number of entries of m_slots that are in use */
ulint		m_n_elems;

/** Offset of the slot chosen by select_lowest_offset() or
select_if_older() */
os_offset_t	m_lowest_offset;

/** AIO array whose slots are scanned */
AIO*		m_array;

/** Number of slots scanned per segment */
ulint		m_n_slots;

/** Local segment index; the scan starts at slot
m_segment * m_n_slots */
ulint		m_segment;

/** Selected slots; m_slots[0] is the first slot of the batch */
slots_t		m_slots;

/* NOTE(review): m_ptr and m_buf are not referenced by the selection
and merge helpers; presumably they belong to the combined i/o buffer -
confirm against the rest of the class. */
byte*		m_ptr;
byte*		m_buf;
7955 };
7956
7957 /** Wait for I/O requests
7958 @return the number of slots */
7959 ulint
check_pending(ulint global_segment,os_event_t event)7960 SimulatedAIOHandler::check_pending(
7961 ulint global_segment,
7962 os_event_t event)
7963 {
7964 /* NOTE! We only access constant fields in os_aio_array.
7965 Therefore we do not have to acquire the protecting mutex yet */
7966
7967 ut_ad(os_aio_validate_skip());
7968
7969 ut_ad(m_segment < m_array->get_n_segments());
7970
7971 /* Look through n slots after the segment * n'th slot */
7972
7973 if (AIO::is_read(m_array)
7974 && os_aio_recommend_sleep_for_read_threads) {
7975
7976 /* Give other threads chance to add several
7977 I/Os to the array at once. */
7978
7979 srv_set_io_thread_op_info(
7980 global_segment, "waiting for i/o request");
7981
7982 os_event_wait(event);
7983
7984 return(0);
7985 }
7986
7987 return(m_array->slots_per_segment());
7988 }
7989
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

The function loops until it has a slot to report: either one whose i/o
has already completed, or a batch it selects, merges and performs
itself. The messages of one slot are returned per call and that slot is
released back to the array.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example. Set to NULL on the
			shutdown exit path
@param[out]	m2	Callback argument. Set to NULL on the shutdown
			exit path
@param[out]	type	IO context of the returned slot; not written on the
			shutdown exit path
@return DB_SUCCESS or error code (every return statement in this
function returns DB_SUCCESS) */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	/* Translate the global segment number into the owning array
	and the segment index local to that array. */
	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		ulint	n_slots = handler.check_pending(global_segment, event);

		if (n_slots == 0) {
			/* check_pending() waited on the event instead of
			reporting slots; try again. */
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* Leave the loop with the array mutex held. */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* Leave the loop with the array mutex held and
			slot == NULL: the selected i/o still has to be
			performed below. */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. */

		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* slot != NULL means a slot that has already completed its IO was
	found; slot == NULL means a batch was selected and the i/o must
	be done here. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		handler.io_complete();

		array->acquire();

		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	/* Release the slot back to the array, then release the array
	mutex. */
	array->release(slot);

	array->release();

	return(DB_SUCCESS);
}
8142
8143 /** Get the total number of pending IOs
8144 @return the total number of pending IOs */
8145 ulint
total_pending_io_count()8146 AIO::total_pending_io_count()
8147 {
8148 ulint count = s_reads->pending_io_count();
8149
8150 if (s_writes != NULL) {
8151 count += s_writes->pending_io_count();
8152 }
8153
8154 if (s_ibuf != NULL) {
8155 count += s_ibuf->pending_io_count();
8156 }
8157
8158 if (s_log != NULL) {
8159 count += s_log->pending_io_count();
8160 }
8161
8162 if (s_sync != NULL) {
8163 count += s_sync->pending_io_count();
8164 }
8165
8166 return(count);
8167 }
8168
8169 /** Validates the consistency the aio system.
8170 @return true if ok */
8171 static
8172 bool
os_aio_validate()8173 os_aio_validate()
8174 {
8175 /* The methods countds and validates, we ignore the count. */
8176 AIO::total_pending_io_count();
8177
8178 return(true);
8179 }
8180
8181 /** Prints pending IO requests per segment of an aio array.
8182 We probably don't need per segment statistics but they can help us
8183 during development phase to see if the IO requests are being
8184 distributed as expected.
8185 @param[in,out] file File where to print
8186 @param[in] segments Pending IO array */
8187 void
print_segment_info(FILE * file,const ulint * segments)8188 AIO::print_segment_info(
8189 FILE* file,
8190 const ulint* segments)
8191 {
8192 ut_ad(m_n_segments > 0);
8193
8194 if (m_n_segments > 1) {
8195
8196 fprintf(file, " [");
8197
8198 for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
8199
8200 if (i != 0) {
8201 fprintf(file, ", ");
8202 }
8203
8204 fprintf(file, ULINTPF, *segments);
8205 }
8206
8207 fprintf(file, "] ");
8208 }
8209 }
8210
8211 /** Prints info about the aio array.
8212 @param[in,out] file Where to print */
8213 void
print(FILE * file)8214 AIO::print(FILE* file)
8215 {
8216 ulint count = 0;
8217 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
8218
8219 mutex_enter(&m_mutex);
8220
8221 ut_a(!m_slots.empty());
8222 ut_a(m_n_segments > 0);
8223
8224 memset(n_res_seg, 0x0, sizeof(n_res_seg));
8225
8226 for (ulint i = 0; i < m_slots.size(); ++i) {
8227 Slot& slot = m_slots[i];
8228 ulint segment = (i * m_n_segments) / m_slots.size();
8229
8230 if (slot.is_reserved) {
8231
8232 ++count;
8233
8234 ++n_res_seg[segment];
8235
8236 ut_a(slot.len > 0);
8237 }
8238 }
8239
8240 ut_a(m_n_reserved == count);
8241
8242 print_segment_info(file, n_res_seg);
8243
8244 mutex_exit(&m_mutex);
8245 }
8246
8247 /** Print all the AIO segments
8248 @param[in,out] file Where to print */
8249 void
print_all(FILE * file)8250 AIO::print_all(FILE* file)
8251 {
8252 s_reads->print(file);
8253
8254 if (s_writes != NULL) {
8255 fputs(", aio writes:", file);
8256 s_writes->print(file);
8257 }
8258
8259 if (s_ibuf != NULL) {
8260 fputs(",\n ibuf aio reads:", file);
8261 s_ibuf->print(file);
8262 }
8263
8264 if (s_log != NULL) {
8265 fputs(", log i/o's:", file);
8266 s_log->print(file);
8267 }
8268
8269 if (s_sync != NULL) {
8270 fputs(", sync i/o's:", file);
8271 s_sync->print(file);
8272 }
8273 }
8274
8275 /** Prints info of the aio arrays.
8276 @param[in,out] file file where to print */
8277 void
os_aio_print(FILE * file)8278 os_aio_print(FILE* file)
8279 {
8280 ib_time_monotonic_t current_time;
8281 double time_elapsed;
8282 double avg_bytes_read;
8283
8284 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
8285 fprintf(file, "I/O thread %lu state: %s (%s)",
8286 (ulong) i,
8287 srv_io_thread_op_info[i],
8288 srv_io_thread_function[i]);
8289
8290 #ifndef _WIN32
8291 if (os_event_is_set(os_aio_segment_wait_events[i])) {
8292 fprintf(file, " ev set");
8293 }
8294 #endif /* _WIN32 */
8295
8296 fprintf(file, "\n");
8297 }
8298
8299 fputs("Pending normal aio reads:", file);
8300
8301 AIO::print_all(file);
8302
8303 putc('\n', file);
8304 current_time = ut_time_monotonic();
8305 time_elapsed = 0.001 + (current_time - os_last_printout);
8306
8307 fprintf(file,
8308 "Pending flushes (fsync) log: " ULINTPF "; "
8309 "buffer pool: " ULINTPF "\n"
8310 ULINTPF " OS file reads, "
8311 ULINTPF " OS file writes, "
8312 ULINTPF " OS fsyncs\n",
8313 fil_n_pending_log_flushes,
8314 fil_n_pending_tablespace_flushes,
8315 os_n_file_reads,
8316 os_n_file_writes,
8317 os_n_fsyncs);
8318
8319 if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
8320 fprintf(file,
8321 ULINTPF " pending preads, "
8322 ULINTPF " pending pwrites\n",
8323 os_n_pending_reads,
8324 os_n_pending_writes);
8325 }
8326
8327 if (os_n_file_reads == os_n_file_reads_old) {
8328 avg_bytes_read = 0.0;
8329 } else {
8330 avg_bytes_read = (double) os_bytes_read_since_printout
8331 / (os_n_file_reads - os_n_file_reads_old);
8332 }
8333
8334 fprintf(file,
8335 "%.2f reads/s, %lu avg bytes/read,"
8336 " %.2f writes/s, %.2f fsyncs/s\n",
8337 (os_n_file_reads - os_n_file_reads_old)
8338 / time_elapsed,
8339 (ulong) avg_bytes_read,
8340 (os_n_file_writes - os_n_file_writes_old)
8341 / time_elapsed,
8342 (os_n_fsyncs - os_n_fsyncs_old)
8343 / time_elapsed);
8344
8345 os_n_file_reads_old = os_n_file_reads;
8346 os_n_file_writes_old = os_n_file_writes;
8347 os_n_fsyncs_old = os_n_fsyncs;
8348 os_bytes_read_since_printout = 0;
8349
8350 os_last_printout = current_time;
8351 }
8352
8353 /** Refreshes the statistics used to print per-second averages. */
8354 void
os_aio_refresh_stats()8355 os_aio_refresh_stats()
8356 {
8357 os_n_fsyncs_old = os_n_fsyncs;
8358
8359 os_bytes_read_since_printout = 0;
8360
8361 os_n_file_reads_old = os_n_file_reads;
8362
8363 os_n_file_writes_old = os_n_file_writes;
8364
8365 os_n_fsyncs_old = os_n_fsyncs;
8366
8367 os_bytes_read_since_printout = 0;
8368
8369 os_last_printout = ut_time_monotonic();
8370 }
8371
8372 /** Checks that all slots in the system have been freed, that is, there are
8373 no pending io operations.
8374 @return true if all free */
8375 bool
os_aio_all_slots_free()8376 os_aio_all_slots_free()
8377 {
8378 return(AIO::total_pending_io_count() == 0);
8379 }
8380
8381 #ifdef UNIV_DEBUG
8382 /** Prints all pending IO for the array
8383 @param[in] file file where to print
8384 @param[in] array array to process */
8385 void
to_file(FILE * file) const8386 AIO::to_file(FILE* file) const
8387 {
8388 acquire();
8389
8390 fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
8391
8392 for (ulint i = 0; i < m_slots.size(); ++i) {
8393
8394 const Slot& slot = m_slots[i];
8395
8396 if (slot.is_reserved) {
8397
8398 fprintf(file,
8399 "%s IO for %s (offset=" UINT64PF
8400 ", size=%lu)\n",
8401 slot.type.is_read() ? "read" : "write",
8402 slot.name, slot.offset, slot.len);
8403 }
8404 }
8405
8406 release();
8407 }
8408
8409 /** Print pending IOs for all arrays */
8410 void
print_to_file(FILE * file)8411 AIO::print_to_file(FILE* file)
8412 {
8413 fprintf(file, "Pending normal aio reads:");
8414
8415 s_reads->to_file(file);
8416
8417 if (s_writes != NULL) {
8418 fprintf(file, "Pending normal aio writes:");
8419 s_writes->to_file(file);
8420 }
8421
8422 if (s_ibuf != NULL) {
8423 fprintf(file, "Pending ibuf aio reads:");
8424 s_ibuf->to_file(file);
8425 }
8426
8427 if (s_log != NULL) {
8428 fprintf(file, "Pending log i/o's:");
8429 s_log->to_file(file);
8430 }
8431
8432 if (s_sync != NULL) {
8433 fprintf(file, "Pending sync i/o's:");
8434 s_sync->to_file(file);
8435 }
8436 }
8437
/** Prints all pending IO. Thin wrapper that forwards to
AIO::print_to_file().
@param[in]	file	File where to print */
void
os_aio_print_pending_io(
	FILE*	file)
{
	AIO::print_to_file(file);
}
8446
8447 #endif /* UNIV_DEBUG */
8448
/**
Set the file create umask. The value is stored in os_innodb_umask.
@param[in]	umask		The umask to use for file creation. */
void
os_file_set_umask(ulint umask)
{
	os_innodb_umask = umask;
}
8457 #else
8458
8459 #include "univ.i"
8460 #include "db0err.h"
8461 #include "mach0data.h"
8462 #include "fil0fil.h"
8463 #include "os0file.h"
8464
8465 #include <lz4.h>
8466 #include <zlib.h>
8467
8468 #include <my_aes.h>
8469 #include <my_rnd.h>
8470 #include <mysqld.h>
8471 #include <mysql/service_mysql_keyring.h>
8472
8473 typedef byte Block;
8474
8475 /** Allocate a page for sync IO
8476 @return pointer to page */
8477 static
8478 Block*
os_alloc_block()8479 os_alloc_block()
8480 {
8481 return(reinterpret_cast<byte*>(malloc(UNIV_PAGE_SIZE_MAX * 2)));
8482 }
8483
/** Free a page after sync IO. The block is handed to ut_free().
NOTE(review): os_alloc_block() in this build obtains the block with
malloc(); confirm ut_free() is the matching deallocator here.
@param[in,own]	block		The block to free/release */
static
void
os_free_block(Block* block)
{
	ut_free(block);
}
8492
8493 #endif /* !UNIV_INNOCHECKSUM */
8494
/** Minimum length needed for encryption: the page header
(FIL_PAGE_DATA bytes) plus two AES blocks of data. Encryption::encrypt()
extends the length of a compressed page to this value when the
compressed data is shorter. */
const unsigned int MIN_ENCRYPTION_LEN = 2 * MY_AES_BLOCK_SIZE + FIL_PAGE_DATA;
8497
8498 /**
8499 @param[in] type The compression type
8500 @return the string representation */
8501 const char*
to_string(Type type)8502 Compression::to_string(Type type)
8503 {
8504 switch(type) {
8505 case NONE:
8506 return("None");
8507 case ZLIB:
8508 return("Zlib");
8509 case LZ4:
8510 return("LZ4");
8511 }
8512
8513 ut_ad(0);
8514
8515 return("<UNKNOWN>");
8516 }
8517
8518 /**
8519 @param[in] meta Page Meta data
8520 @return the string representation */
to_string(const Compression::meta_t & meta)8521 std::string Compression::to_string(const Compression::meta_t& meta)
8522 {
8523 std::ostringstream stream;
8524
8525 stream << "version: " << int(meta.m_version) << " "
8526 << "algorithm: " << meta.m_algorithm << " "
8527 << "(" << to_string(meta.m_algorithm) << ") "
8528 << "orginal_type: " << meta.m_original_type << " "
8529 << "original_size: " << meta.m_original_size << " "
8530 << "compressed_size: " << meta.m_compressed_size;
8531
8532 return(stream.str());
8533 }
8534
8535 /** @return true if it is a compressed page */
8536 bool
is_compressed_page(const byte * page)8537 Compression::is_compressed_page(const byte* page)
8538 {
8539 return(mach_read_from_2(page + FIL_PAGE_TYPE) == FIL_PAGE_COMPRESSED);
8540 }
8541
8542 bool
is_compressed_encrypted_page(const byte * page)8543 Compression::is_compressed_encrypted_page(const byte *page) {
8544 return (mach_read_from_2(page + FIL_PAGE_TYPE) ==
8545 FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
8546 }
8547
8548 bool
is_valid_page_version(uint8_t version)8549 Compression::is_valid_page_version(uint8_t version) {
8550 return (version == FIL_PAGE_VERSION_1 || version == FIL_PAGE_VERSION_2);
8551 }
8552
8553 /** Deserizlise the page header compression meta-data
8554 @param[in] page Pointer to the page header
8555 @param[out] control Deserialised data */
8556 void
deserialize_header(const byte * page,Compression::meta_t * control)8557 Compression::deserialize_header(
8558 const byte* page,
8559 Compression::meta_t* control)
8560 {
8561 ut_ad(is_compressed_page(page) || is_compressed_encrypted_page(page));
8562
8563 control->m_version = static_cast<uint8_t>(
8564 mach_read_from_1(page + FIL_PAGE_VERSION));
8565
8566 control->m_original_type = static_cast<uint16_t>(
8567 mach_read_from_2(page + FIL_PAGE_ORIGINAL_TYPE_V1));
8568
8569 control->m_compressed_size = static_cast<uint16_t>(
8570 mach_read_from_2(page + FIL_PAGE_COMPRESS_SIZE_V1));
8571
8572 control->m_original_size = static_cast<uint16_t>(
8573 mach_read_from_2(page + FIL_PAGE_ORIGINAL_SIZE_V1));
8574
8575 control->m_algorithm = static_cast<Type>(
8576 mach_read_from_1(page + FIL_PAGE_ALGORITHM_V1));
8577 }
8578
/** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
not then the source contents are left unchanged and DB_SUCCESS is returned.
On success the decompressed data replaces the page body in src (the
header is kept) and the page type is restored to the original type.
@param[in]	dblwr_recover	true if double write recovery in progress
@param[in,out]	src		Data read from disk, decompressed data will be
				copied to this page
@param[in,out]	dst		Scratch area to use for decompression; if NULL
				a temporary block is allocated and freed here
@param[in]	dst_len		Size of the scratch area in bytes
@return DB_SUCCESS, DB_CORRUPTION if the header fails validation,
DB_IO_DECOMPRESS_FAIL if the decompressor reports an error, or
DB_UNSUPPORTED for an algorithm other than ZLIB and LZ4 */
dberr_t
Compression::deserialize(
	bool		dblwr_recover,
	byte*		src,
	byte*		dst,
	ulint		dst_len)
{
	if (!is_compressed_page(src)) {
		/* There is nothing we can do. */
		return(DB_SUCCESS);
	}

	meta_t	header;

	deserialize_header(src, &header);

	/* The compressed payload starts right after the page header. */
	byte*	ptr = src + FIL_PAGE_DATA;

	ut_ad(is_valid_page_version(header.m_version));

	/* Validate the header before trusting any size taken from it. */
	if (!is_valid_page_version(header.m_version)
	    || header.m_original_size < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8)
	    || header.m_original_size > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA
	    || dst_len < header.m_original_size + FIL_PAGE_DATA) {

		/* The last check could potentially return DB_OVERFLOW,
		the caller should be able to retry with a larger buffer. */

		return(DB_CORRUPTION);
	}

	/* Non-NULL only when the scratch area is allocated here; it is
	then freed on every exit path below. */
	Block*	block;

	/* The caller doesn't know what to expect */
	if (dst == NULL) {

		block = os_alloc_block();

#ifdef UNIV_INNOCHECKSUM
		dst = block;
#else
		dst = block->m_ptr;
#endif /* UNIV_INNOCHECKSUM */

	} else {
		block = NULL;
	}

	int		ret;
	Compression	compression;
	ulint		len = header.m_original_size;

	compression.m_type = static_cast<Compression::Type>(header.m_algorithm);

	switch(compression.m_type) {
	case Compression::ZLIB: {

		/* In: capacity of dst; out: number of bytes produced. */
		uLongf	zlen = header.m_original_size;

		if (uncompress(dst, &zlen, ptr, header.m_compressed_size)
		    != Z_OK) {

			if (block != NULL) {
				os_free_block(block);
			}

			return(DB_IO_DECOMPRESS_FAIL);
		}

		len = static_cast<ulint>(zlen);

		break;
	}

	case Compression::LZ4:

		ret = LZ4_decompress_safe(
			reinterpret_cast<char*>(ptr),
			reinterpret_cast<char*>(dst),
			header.m_compressed_size,
			header.m_original_size);
		/* A negative result is treated as failure. Unlike the ZLIB
		case, len stays at header.m_original_size here. */
		if (ret < 0) {

			if (block != NULL) {
				os_free_block(block);
			}

			return(DB_IO_DECOMPRESS_FAIL);
		}

		break;

	default:
#if !defined(UNIV_INNOCHECKSUM)
		ib::error()
			<< "Compression algorithm support missing: "
			<< Compression::to_string(compression.m_type);
#else
		fprintf(stderr, "Compression algorithm support missing: %s\n",
			Compression::to_string(compression.m_type));
#endif /* !UNIV_INNOCHECKSUM */

		if (block != NULL) {
			os_free_block(block);
		}

		return(DB_UNSUPPORTED);
	}

	/* Leave the header alone */
	memmove(src + FIL_PAGE_DATA, dst, len);

	/* Restore the page type the page had before compression. */
	mach_write_to_2(src + FIL_PAGE_TYPE, header.m_original_type);

	/* Debug check, skipped during double write recovery: 4 bytes of
	the LSN field in the header must equal the corresponding 4 bytes
	in the trailer of the decompressed page. */
	ut_ad(dblwr_recover
	      || memcmp(src + FIL_PAGE_LSN + 4,
			src + (header.m_original_size + FIL_PAGE_DATA)
			- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);

	if (block != NULL) {
		os_free_block(block);
	}

	return(DB_SUCCESS);
}
8712
/** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
not then the source contents are left unchanged and DB_SUCCESS is returned.
Thin wrapper that forwards all arguments to Compression::deserialize().
@param[in]	dblwr_recover	true if double write recovery in progress
@param[in,out]	src		Data read from disk, decompressed data will be
				copied to this page
@param[in,out]	dst		Scratch area to use for decompression
@param[in]	dst_len		Size of the scratch area in bytes
@return DB_SUCCESS or error code */
dberr_t
os_file_decompress_page(
	bool		dblwr_recover,
	byte*		src,
	byte*		dst,
	ulint		dst_len)
{
	return(Compression::deserialize(dblwr_recover, src, dst, dst_len));
}
8730
8731 /**
8732 @param[in] type The encryption type
8733 @return the string representation */
8734 const char*
to_string(Type type)8735 Encryption::to_string(Type type)
8736 {
8737 switch(type) {
8738 case NONE:
8739 return("N");
8740 case AES:
8741 return("Y");
8742 }
8743
8744 ut_ad(0);
8745
8746 return("<UNKNOWN>");
8747 }
8748
/** Generate random encryption value for key and iv. Exactly
ENCRYPTION_KEY_LEN bytes are requested from my_rand_buffer(), so the
buffer must be at least that large.
@param[in,out]	value	Encryption value; must not be NULL */
void Encryption::random_value(byte* value)
{
	ut_ad(value != NULL);

	my_rand_buffer(value, ENCRYPTION_KEY_LEN);
}
8757
/** Create new master key for key rotation. The key is named
"<prefix>-<uuid>-<master_key_id + 1>", generated through the keyring
and then fetched back. On success master_key_id is incremented; on
failure an error is logged and *master_key is set to NULL. Does nothing
when built with UNIV_INNOCHECKSUM.
@param[in,out]	master_key	master key */
void
Encryption::create_master_key(byte** master_key)
{
#ifndef UNIV_INNOCHECKSUM
	char*	key_type = NULL;
	size_t	key_len;
	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
	int	ret;

	/* If uuid does not match with current server uuid,
	set uuid as current server uuid. */
	if (strcmp(uuid, server_uuid) != 0) {
		memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
	}
	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);

	/* Generate new master key */
	ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
		    "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
		    uuid, master_key_id + 1);

	/* We call key ring API to generate master key here.
	NOTE(review): this result is overwritten by the fetch below, so a
	failed generation is only noticed through the fetch - confirm that
	this is intended. */
	ret = my_key_generate(key_name, "AES",
			      NULL, ENCRYPTION_KEY_LEN);

	/* We call key ring API to get master key here. */
	ret = my_key_fetch(key_name, &key_type, NULL,
			   reinterpret_cast<void**>(master_key),
			   &key_len);

	if (ret || *master_key == NULL) {
		ib::error() << "Encryption can't find master key, please check"
			" the keyring plugin is loaded.";
		*master_key = NULL;
	} else {
		/* The new key is usable: make it the current one. */
		master_key_id++;
	}

	/* The key type string returned by the fetch is freed here. */
	if (key_type) {
		my_free(key_type);
	}
#endif
}
8803
8804 /** Get master key by key id.
8805 @param[in] master_key_id master key id
8806 @param[in] srv_uuid uuid of server instance
8807 @param[in,out] master_key master key */
8808 void
get_master_key(ulint master_key_id,char * srv_uuid,byte ** master_key)8809 Encryption::get_master_key(ulint master_key_id,
8810 char* srv_uuid,
8811 byte** master_key)
8812 {
8813 #ifndef UNIV_INNOCHECKSUM
8814 char* key_type = NULL;
8815 size_t key_len;
8816 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8817 int ret;
8818
8819 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8820
8821 if (srv_uuid != NULL) {
8822 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8823 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8824 srv_uuid, master_key_id);
8825 } else {
8826 /* For compitable with 5.7.11, we need to get master key with
8827 server id. */
8828 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8829 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8830 "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8831 server_id, master_key_id);
8832 }
8833
8834 /* We call key ring API to get master key here. */
8835 ret = my_key_fetch(key_name, &key_type, NULL,
8836 reinterpret_cast<void**>(master_key), &key_len);
8837
8838 if (key_type) {
8839 my_free(key_type);
8840 }
8841
8842 if (ret) {
8843 *master_key = NULL;
8844 ib::error() << "Encryption can't find master key, please check"
8845 " the keyring plugin is loaded.";
8846 }
8847
8848 #ifdef UNIV_ENCRYPT_DEBUG
8849 if (!ret && *master_key) {
8850 fprintf(stderr, "Fetched master key:%lu ", master_key_id);
8851 ut_print_buf(stderr, *master_key, key_len);
8852 fprintf(stderr, "\n");
8853 }
8854 #endif /* DEBUG_TDE */
8855
8856 #endif
8857 }
8858
/** Current master key id. Zero means no master key has been generated
yet; the value is incremented each time a new master key is created. */
ulint	Encryption::master_key_id = 0;

/** Current uuid of server instance, copied from server_uuid; the
extra byte leaves room for a terminating NUL. */
char	Encryption::uuid[ENCRYPTION_SERVER_UUID_LEN + 1] = {0};
8864
8865 /** Get current master key and master key id
8866 @param[in,out] master_key_id master key id
8867 @param[in,out] master_key master key
8868 @param[in,out] version encryption information version */
8869 void
get_master_key(ulint * master_key_id,byte ** master_key,Encryption::Version * version)8870 Encryption::get_master_key(ulint* master_key_id,
8871 byte** master_key,
8872 Encryption::Version* version)
8873 {
8874 #ifndef UNIV_INNOCHECKSUM
8875 char* key_type = NULL;
8876 size_t key_len;
8877 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8878 int ret;
8879
8880 memset(key_name, 0, ENCRYPTION_KEY_LEN);
8881 *version = Encryption::ENCRYPTION_VERSION_2;
8882
8883 if (Encryption::master_key_id == 0) {
8884 /* If m_master_key is 0, means there's no encrypted
8885 tablespace, we need to generate the first master key,
8886 and store it to key ring. */
8887 memset(uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
8888 memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8889
8890 /* Prepare the server uuid. */
8891 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8892 "%s-%s-1", ENCRYPTION_MASTER_KEY_PRIFIX,
8893 uuid);
8894
8895 /* We call key ring API to generate master key here. */
8896 ret = my_key_generate(key_name, "AES",
8897 NULL, ENCRYPTION_KEY_LEN);
8898
8899 /* We call key ring API to get master key here. */
8900 ret = my_key_fetch(key_name, &key_type, NULL,
8901 reinterpret_cast<void**>(master_key),
8902 &key_len);
8903
8904 if (!ret && *master_key != NULL) {
8905 Encryption::master_key_id++;
8906 *master_key_id = Encryption::master_key_id;
8907 }
8908 #ifdef UNIV_ENCRYPT_DEBUG
8909 if (!ret && *master_key) {
8910 fprintf(stderr, "Generated new master key:");
8911 ut_print_buf(stderr, *master_key, key_len);
8912 fprintf(stderr, "\n");
8913 }
8914 #endif
8915 } else {
8916 *master_key_id = Encryption::master_key_id;
8917
8918 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8919 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8920 uuid, *master_key_id);
8921
8922 /* We call key ring API to get master key here. */
8923 ret = my_key_fetch(key_name, &key_type, NULL,
8924 reinterpret_cast<void**>(master_key),
8925 &key_len);
8926
8927 /* For compitable with 5.7.11, we need to try to get master key with
8928 server id when get master key with server uuid failure. */
8929 if (ret || *master_key == NULL) {
8930 if (key_type) {
8931 my_free(key_type);
8932 }
8933
8934 memset(key_name, 0,
8935 ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8936 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8937 "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8938 server_id, *master_key_id);
8939
8940 ret = my_key_fetch(key_name, &key_type, NULL,
8941 reinterpret_cast<void**>(master_key),
8942 &key_len);
8943 *version = Encryption::ENCRYPTION_VERSION_1;
8944 }
8945 #ifdef UNIV_ENCRYPT_DEBUG
8946 if (!ret && *master_key) {
8947 fprintf(stderr, "Fetched master key:%lu ",
8948 *master_key_id);
8949 ut_print_buf(stderr, *master_key, key_len);
8950 fprintf(stderr, "\n");
8951 }
8952 #endif
8953 }
8954
8955 if (ret) {
8956 *master_key = NULL;
8957 ib::error() << "Encryption can't find master key, please check"
8958 " the keyring plugin is loaded.";
8959 }
8960
8961 if (key_type) {
8962 my_free(key_type);
8963 }
8964 #endif
8965 }
8966
8967 /** Check if page is encrypted page or not
8968 @param[in] page page which need to check
8969 @return true if it is a encrypted page */
8970 bool
is_encrypted_page(const byte * page)8971 Encryption::is_encrypted_page(const byte* page)
8972 {
8973 ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
8974
8975 return(page_type == FIL_PAGE_ENCRYPTED
8976 || page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED
8977 || page_type == FIL_PAGE_ENCRYPTED_RTREE);
8978 }
8979
8980 /** Encrypt the page data contents. Page type can't be
8981 FIL_PAGE_ENCRYPTED, FIL_PAGE_COMPRESSED_AND_ENCRYPTED,
8982 FIL_PAGE_ENCRYPTED_RTREE.
8983 @param[in] type IORequest
8984 @param[in,out] src page data which need to encrypt
8985 @param[in] src_len Size of the source in bytes
8986 @param[in,out] dst destination area
8987 @param[in,out] dst_len Size of the destination in bytes
8988 @return buffer data, dst_len will have the length of the data */
8989 byte*
encrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint * dst_len)8990 Encryption::encrypt(
8991 const IORequest& type,
8992 byte* src,
8993 ulint src_len,
8994 byte* dst,
8995 ulint* dst_len)
8996 {
8997 ut_ad(m_type != NONE);
8998 ut_ad(!type.is_log());
8999 #ifdef UNIV_ENCRYPT_DEBUG
9000 ulint space_id =
9001 mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9002 ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9003
9004 fprintf(stderr, "Encrypting page:%lu.%lu len:%lu\n",
9005 space_id, page_no, src_len);
9006 #endif
9007
9008 /* Shouldn't encrypte an already encrypted page. */
9009 ut_ad(!is_encrypted_page(src));
9010
9011 const uint16_t page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9012
9013 /* This is data size which need to encrypt. */
9014 ulint src_enc_len = src_len;
9015
9016 /* In FIL_PAGE_VERSION_2, we encrypt the actual compressed data length. */
9017 if (page_type == FIL_PAGE_COMPRESSED) {
9018 src_enc_len = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1) +
9019 FIL_PAGE_DATA;
9020 /* Extend src_enc_len if needed */
9021 if (src_enc_len < MIN_ENCRYPTION_LEN) {
9022 src_enc_len = MIN_ENCRYPTION_LEN;
9023 }
9024 ut_a(src_enc_len <= src_len);
9025 }
9026
9027 /* Only encrypt the data + trailer, leave the header alone */
9028
9029 switch (m_type) {
9030 case Encryption::NONE:
9031 ut_error;
9032
9033 case Encryption::AES: {
9034 ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9035
9036 /* Total length of the data to encrypt. */
9037 const ulint data_len = src_enc_len - FIL_PAGE_DATA;
9038
9039 /* Server encryption functions expect input data to be in
9040 multiples of MY_AES_BLOCK SIZE. Therefore we encrypt the
9041 overlapping data of the chunk_len and trailer_len twice.
9042 First we encrypt the bigger chunk of data then we do the
9043 trailer. The trailer encryption block starts at
9044 2 * MY_AES_BLOCK_SIZE bytes offset from the end of the enc_len.
9045 During decryption we do the reverse of the above process. */
9046 ut_ad(data_len >= 2 * MY_AES_BLOCK_SIZE);
9047
9048 const ulint chunk_len =
9049 (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9050 const ulint remain_len = data_len - chunk_len;
9051
9052 lint elen = my_aes_encrypt(
9053 src + FIL_PAGE_DATA, static_cast<uint32>(chunk_len),
9054 dst + FIL_PAGE_DATA, reinterpret_cast<byte *>(m_key),
9055 static_cast<uint32>(m_klen), my_aes_256_cbc,
9056 reinterpret_cast<byte *>(m_iv), false);
9057
9058 if (elen == MY_AES_BAD_DATA) {
9059 ulint page_no =mach_read_from_4(
9060 src + FIL_PAGE_OFFSET);
9061 ulint space_id = mach_read_from_4(
9062 src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9063 *dst_len = src_len;
9064 #ifndef UNIV_INNOCHECKSUM
9065 ib::warn()
9066 << " Can't encrypt data of page,"
9067 << " page no:" << page_no
9068 << " space id:" << space_id;
9069 #else
9070 fprintf(stderr, " Can't encrypt data of page,"
9071 " page no:" ULINTPF
9072 " space id:" ULINTPF,
9073 page_no, space_id);
9074 #endif /* !UNIV_INNOCHECKSUM */
9075 return(src);
9076 }
9077
9078 const ulint len = static_cast<ulint>(elen);
9079 ut_ad(len == chunk_len);
9080
9081 /* Encrypt the trailing bytes. */
9082 if (remain_len != 0) {
9083 /* Copy remaining bytes and page tailer. */
9084 memcpy(dst + FIL_PAGE_DATA + len,
9085 src + FIL_PAGE_DATA + len,
9086 remain_len);
9087
9088 const ulint trailer_len = MY_AES_BLOCK_SIZE * 2;
9089 byte buf[trailer_len];
9090
9091 elen = my_aes_encrypt(
9092 dst + FIL_PAGE_DATA + data_len - trailer_len,
9093 static_cast<uint32>(trailer_len), buf,
9094 reinterpret_cast<unsigned char*>(m_key),
9095 static_cast<uint32>(m_klen), my_aes_256_cbc,
9096 reinterpret_cast<byte *>(m_iv), false);
9097
9098 if (elen == MY_AES_BAD_DATA) {
9099 ulint page_no =mach_read_from_4(
9100 src + FIL_PAGE_OFFSET);
9101 ulint space_id = mach_read_from_4(
9102 src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9103 #ifndef UNIV_INNOCHECKSUM
9104 ib::warn()
9105 << " Can't encrypt data of page,"
9106 << " page no:" << page_no
9107 << " space id:" << space_id;
9108 #else
9109 fprintf(stderr, " Can't encrypt data of page,"
9110 " page no:" ULINTPF
9111 " space id:" ULINTPF,
9112 page_no, space_id);
9113 #endif /* !UNIV_INNOCHECKSUM */
9114 *dst_len = src_len;
9115 return(src);
9116 }
9117
9118 ut_a(static_cast<ulint>(elen) == trailer_len);
9119
9120 memcpy(dst + FIL_PAGE_DATA + data_len - trailer_len,
9121 buf, trailer_len);
9122 }
9123
9124
9125 break;
9126 }
9127
9128 default:
9129 ut_error;
9130 }
9131
9132 /* Copy the header as is. */
9133 memmove(dst, src, FIL_PAGE_DATA);
9134 ut_ad(memcmp(src, dst, FIL_PAGE_DATA) == 0);
9135
9136 /* Add encryption control information. Required for decrypting. */
9137 if (page_type == FIL_PAGE_COMPRESSED) {
9138 /* If the page is compressed, we don't need to save the
9139 original type, since it is done in compression already. */
9140 mach_write_to_2(dst + FIL_PAGE_TYPE,
9141 FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9142 ut_ad(memcmp(src+FIL_PAGE_TYPE+2,
9143 dst+FIL_PAGE_TYPE+2,
9144 FIL_PAGE_DATA-FIL_PAGE_TYPE-2) == 0);
9145 } else if (page_type == FIL_PAGE_RTREE) {
9146 /* If the page is R-tree page, we need to save original type. */
9147 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED_RTREE);
9148 } else{
9149 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED);
9150 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
9151 }
9152
9153 #ifdef UNIV_ENCRYPT_DEBUG
9154 #ifndef UNIV_INNOCHECKSUM
9155 #if 0
9156 byte* check_buf = static_cast<byte*>(ut_malloc_nokey(src_len));
9157 byte* buf2 = static_cast<byte*>(ut_malloc_nokey(src_len));
9158
9159 memcpy(check_buf, dst, src_len);
9160
9161 dberr_t err = decrypt(type, check_buf, src_len, buf2, src_len);
9162 if (err != DB_SUCCESS || memcmp(src + FIL_PAGE_DATA,
9163 check_buf + FIL_PAGE_DATA,
9164 src_len - FIL_PAGE_DATA) != 0) {
9165 ut_print_buf(stderr, src, src_len);
9166 ut_print_buf(stderr, check_buf, src_len);
9167 ut_ad(0);
9168 }
9169 ut_free(buf2);
9170 ut_free(check_buf);
9171 #endif
9172 fprintf(stderr, "Encrypted page:%lu.%lu\n", space_id, page_no);
9173 #endif
9174 #endif
9175
9176 /* Add padding 0 for unused portion */
9177 if (src_len > src_enc_len) {
9178 memset(dst + src_enc_len, 0, src_len - src_enc_len);
9179 }
9180
9181 *dst_len = src_len;
9182
9183 return(dst);
9184 }
9185
9186 /** Decrypt the page data contents. Page type must be FIL_PAGE_ENCRYPTED,
9187 if not then the source contents are left unchanged and DB_SUCCESS is returned.
9188 @param[in] type IORequest
9189 @param[in,out] src Data read from disk, decrypted data will be
9190 copied to this page
9191 @param[in] src_len source data length
9192 @param[in,out] dst Scratch area to use for decryption
9193 @param[in] dst_len Size of the scratch area in bytes
9194 @return DB_SUCCESS or error code */
9195 dberr_t
decrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)9196 Encryption::decrypt(
9197 const IORequest& type,
9198 byte* src,
9199 ulint src_len,
9200 byte* dst,
9201 ulint dst_len)
9202 {
9203 ulint data_len;
9204 ulint main_len;
9205 ulint remain_len;
9206 ulint original_type;
9207 ulint page_type;
9208 byte remain_buf[MY_AES_BLOCK_SIZE * 2];
9209 Block* block;
9210
9211 /* Do nothing if it's not an encrypted table. */
9212 if (!is_encrypted_page(src)) {
9213 return(DB_SUCCESS);
9214 }
9215
9216 /* For compressed page, we need to get the compressed size
9217 for decryption */
9218 page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9219 if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
9220 src_len = static_cast<uint16_t>(
9221 mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1))
9222 + FIL_PAGE_DATA;
9223 #ifndef UNIV_INNOCHECKSUM
9224 Compression::meta_t header;
9225 Compression::deserialize_header(src, &header);
9226 if (header.m_version == Compression::FIL_PAGE_VERSION_1) {
9227 src_len = ut_calc_align(src_len, type.block_size());
9228 } else {
9229 /* Extend src_len if needed */
9230 if (src_len < MIN_ENCRYPTION_LEN) {
9231 src_len = MIN_ENCRYPTION_LEN;
9232 }
9233 }
9234 #endif
9235 }
9236 #ifdef UNIV_ENCRYPT_DEBUG
9237 ulint space_id =
9238 mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9239 ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9240
9241 fprintf(stderr, "Decrypting page:%lu.%lu len:%lu\n",
9242 space_id, page_no, src_len);
9243 #endif
9244
9245 original_type = static_cast<uint16_t>(
9246 mach_read_from_2(src + FIL_PAGE_ORIGINAL_TYPE_V1));
9247
9248 byte* ptr = src + FIL_PAGE_DATA;
9249
9250 /* The caller doesn't know what to expect */
9251 if (dst == NULL) {
9252
9253 block = os_alloc_block();
9254 #ifdef UNIV_INNOCHECKSUM
9255 dst = block;
9256 #else
9257 dst = block->m_ptr;
9258 #endif /* UNIV_INNOCHECKSUM */
9259
9260 } else {
9261 block = NULL;
9262 }
9263
9264 data_len = src_len - FIL_PAGE_DATA;
9265 main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9266 remain_len = data_len - main_len;
9267
9268 switch(m_type) {
9269 case Encryption::AES: {
9270 lint elen;
9271
9272 /* First decrypt the last 2 blocks data of data, since
9273 data is no block aligned. */
9274 if (remain_len != 0) {
9275 ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9276
9277 remain_len = MY_AES_BLOCK_SIZE * 2;
9278
9279 /* Copy the last 2 blocks. */
9280 memcpy(remain_buf,
9281 ptr + data_len - remain_len,
9282 remain_len);
9283
9284 elen = my_aes_decrypt(
9285 remain_buf,
9286 static_cast<uint32>(remain_len),
9287 dst + data_len - remain_len,
9288 reinterpret_cast<unsigned char*>(m_key),
9289 static_cast<uint32>(m_klen),
9290 my_aes_256_cbc,
9291 reinterpret_cast<unsigned char*>(m_iv),
9292 false);
9293 if (elen == MY_AES_BAD_DATA) {
9294 if (block != NULL) {
9295 os_free_block(block);
9296 }
9297
9298 return(DB_IO_DECRYPT_FAIL);
9299 }
9300
9301 /* Copy the other data bytes to temp area. */
9302 memcpy(dst, ptr, data_len - remain_len);
9303 } else {
9304 ut_ad(data_len == main_len);
9305
9306 /* Copy the data bytes to temp area. */
9307 memcpy(dst, ptr, data_len);
9308 }
9309
9310 /* Then decrypt the main data */
9311 elen = my_aes_decrypt(
9312 dst,
9313 static_cast<uint32>(main_len),
9314 ptr,
9315 reinterpret_cast<unsigned char*>(m_key),
9316 static_cast<uint32>(m_klen),
9317 my_aes_256_cbc,
9318 reinterpret_cast<unsigned char*>(m_iv),
9319 false);
9320 if (elen == MY_AES_BAD_DATA) {
9321
9322 if (block != NULL) {
9323 os_free_block(block);
9324 }
9325
9326 return(DB_IO_DECRYPT_FAIL);
9327 }
9328
9329 ut_ad(static_cast<ulint>(elen) == main_len);
9330
9331 /* Copy the remain bytes. */
9332 memcpy(ptr + main_len, dst + main_len, data_len - main_len);
9333
9334 break;
9335 }
9336
9337 default:
9338 if (!type.is_dblwr_recover()) {
9339 #if !defined(UNIV_INNOCHECKSUM)
9340 ib::error()
9341 << "Encryption algorithm support missing: "
9342 << Encryption::to_string(m_type);
9343 #else
9344 fprintf(stderr, "Encryption algorithm support missing: %s\n",
9345 Encryption::to_string(m_type));
9346 #endif /* !UNIV_INNOCHECKSUM */
9347 }
9348
9349 if (block != NULL) {
9350 os_free_block(block);
9351 }
9352
9353 return(DB_UNSUPPORTED);
9354 }
9355
9356 /* Restore the original page type. If it's a compressed and
9357 encrypted page, just reset it as compressed page type, since
9358 we will do uncompress later. */
9359
9360 if (page_type == FIL_PAGE_ENCRYPTED) {
9361 mach_write_to_2(src + FIL_PAGE_TYPE, original_type);
9362 mach_write_to_2(src + FIL_PAGE_ORIGINAL_TYPE_V1, 0);
9363 } else if (page_type == FIL_PAGE_ENCRYPTED_RTREE) {
9364 mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_RTREE);
9365 } else {
9366 ut_ad(page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9367 mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
9368 }
9369
9370 if (block != NULL) {
9371 os_free_block(block);
9372 }
9373
9374 #ifdef UNIV_ENCRYPT_DEBUG
9375 fprintf(stderr, "Decrypted page:%lu.%lu\n", space_id, page_no);
9376 #endif
9377
9378 DBUG_EXECUTE_IF("ib_crash_during_decrypt_page", DBUG_SUICIDE(););
9379
9380 return(DB_SUCCESS);
9381 }
9382
9383 /** Normalizes a directory path for the current OS:
9384 On Windows, we convert '/' to '\', else we convert '\' to '/'.
9385 @param[in,out] str A null-terminated directory and file path */
9386 void
os_normalize_path(char * str)9387 os_normalize_path(
9388 char* str)
9389 {
9390 if (str != NULL) {
9391 for (; *str; str++) {
9392 if (*str == OS_PATH_SEPARATOR_ALT) {
9393 *str = OS_PATH_SEPARATOR;
9394 }
9395 }
9396 }
9397 }
9398