1 /***********************************************************************
2
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, Percona Inc.
5
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation. The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License, version 2.0, for more details.
28
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32
33 ***********************************************************************/
34
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41
42 #ifndef UNIV_INNOCHECKSUM
43
44 #include "ha_prototypes.h"
45 #include "sql_const.h"
46
47 #include "os0file.h"
48
49 #ifdef UNIV_NONINL
50 #include "os0file.ic"
51 #endif
52
53 #include "page0page.h"
54 #include "srv0srv.h"
55 #include "srv0start.h"
56 #include "fil0fil.h"
57 #ifndef UNIV_HOTBACKUP
58 # include "os0event.h"
59 # include "os0thread.h"
60 #else /* !UNIV_HOTBACKUP */
61 # ifdef _WIN32
62 /* Add includes for the _stat() call to compile on Windows */
63 # include <sys/types.h>
64 # include <sys/stat.h>
65 # include <errno.h>
66 # endif /* _WIN32 */
67 #endif /* !UNIV_HOTBACKUP */
68
69 #include <vector>
70 #include <functional>
71
72 #ifdef LINUX_NATIVE_AIO
73 #include <libaio.h>
74 #endif /* LINUX_NATIVE_AIO */
75
76 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
77 # include <fcntl.h>
78 # include <linux/falloc.h>
79 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
80
81 #include <lz4.h>
82 #include <zlib.h>
83
#ifdef UNIV_DEBUG
/** Set when InnoDB has invoked exit().
Only defined in UNIV_DEBUG builds; it has external linkage, so other
translation units may read or write it. */
bool innodb_calling_exit;
#endif /* UNIV_DEBUG */
88
89 #include <my_aes.h>
90 #include <my_rnd.h>
91 #include <mysqld.h>
92 #include <mysql/service_mysql_keyring.h>
93
/** Insert buffer segment id (global segment 0 is the ibuf i/o thread) */
static const ulint IO_IBUF_SEGMENT = 0;

/** Log segment id (global segment 1 is the log i/o thread) */
static const ulint IO_LOG_SEGMENT = 1;

/** Number of retries for partial I/O's */
static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
102
103 /** Blocks for doing IO, used in the transparent compression
104 and encryption code. */
105 struct Block {
106 /** Default constructor */
BlockBlock107 Block() : m_ptr(), m_in_use() { }
108
109 byte* m_ptr;
110
111 byte pad[CACHE_LINE_SIZE - sizeof(ulint)];
112 lock_word_t m_in_use;
113 };
114
/** For storing the allocated blocks */
typedef std::vector<Block> Blocks;

/** Block collection. os_alloc_block() hands out elements of this vector
and os_free_block() uses its first/last element addresses to tell cached
blocks from temporary ones. */
static Blocks* block_cache;

/** Number of blocks to allocate for sync read/writes. Also bounds the
number of probes in os_alloc_block() (MAX_BLOCKS * 3). */
static const size_t MAX_BLOCKS = 128;

/** Block buffer size: 1.3 times UNIV_PAGE_SIZE, truncated to ulint */
#define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))

/** Disk sector size of aligning write buffer for DIRECT_IO */
static ulint os_io_ptr_align = UNIV_SECTOR_SIZE;
129
/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask.
NOTE(review): the variable is file-static, so the assignment from
ha_innodb.cc presumably goes through a setter defined in this file -
confirm. */

#ifndef _WIN32
/** Umask for creating files; defaults to read/write for user and group */
static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
#else
/** Umask for creating files; zero on Windows */
static ulint os_innodb_umask = 0;

/* On Windows when using native AIO the number of AIO requests
that a thread can handle at a given time is limited to 32
i.e.: SRV_N_PENDING_IOS_PER_THREAD */
#define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD

#endif /* _WIN32 */
147
148 #ifndef UNIV_HOTBACKUP
149
/** In simulated aio, merge at most this many consecutive i/os */
static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;

/** Flag indicating if the page_cleaner is in active state.
Defined in another translation unit. */
extern bool buf_page_cleaner_is_active;

/* WAIT_ALLOW_WRITES() blocks on srv_allow_writes_event when the server is
built WITH_INNODB_DISALLOW_WRITES; otherwise it expands to an empty
statement. */
#ifdef WITH_INNODB_DISALLOW_WRITES
#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
#else
#define WAIT_ALLOW_WRITES() do { } while (0)
#endif /* WITH_INNODB_DISALLOW_WRITES */
161
162 /**********************************************************************
163
164 InnoDB AIO Implementation:
165 =========================
166
167 We support native AIO for Windows and Linux. For rest of the platforms
168 we simulate AIO by special IO-threads servicing the IO-requests.
169
170 Simulated AIO:
171 ==============
172
173 On platforms where we 'simulate' AIO, the following is a rough explanation
174 of the high level design.
175 There are four io-threads (for ibuf, log, read, write).
176 All synchronous IO requests are serviced by the calling thread using
177 os_file_write/os_file_read. The Asynchronous requests are queued up
178 in an array (there are four such arrays) by the calling thread.
179 Later these requests are picked up by the IO-thread and are serviced
180 synchronously.
181
182 Windows native AIO:
183 ==================
184
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
190 There are innodb_file_io_threads helper threads. These threads work
191 on the four arrays mentioned above in Simulated AIO. No thread is
192 required for the sync array.
193 If a synchronous IO request is made, it is first queued in the sync
194 array. Then the calling thread itself waits on the request, thus
195 making the call synchronous.
196 If an AIO request is made the calling thread not only queues it in the
197 array but also submits the requests. The helper thread then collects
198 the completed IO request and calls completion routine on it.
199
200 Linux native AIO:
201 =================
202
203 If we have libaio installed on the system and innodb_use_native_aio
204 is set to true we follow the code path of native AIO, otherwise we
205 do simulated AIO.
206 There are innodb_file_io_threads helper threads. These threads work
207 on the four arrays mentioned above in Simulated AIO.
208 If a synchronous IO request is made, it is handled by calling
209 os_file_write/os_file_read.
210 If an AIO request is made the calling thread not only queues it in the
211 array but also submits the requests. The helper thread then collects
212 the completed IO request and calls completion routine on it.
213
214 **********************************************************************/
215
216
#ifdef UNIV_PFS_IO
/* Keys to register InnoDB I/O with performance schema; one key each for
data files, log files and temporary files. Only defined when the server is
built with UNIV_PFS_IO. */
mysql_pfs_key_t innodb_data_file_key;
mysql_pfs_key_t innodb_log_file_key;
mysql_pfs_key_t innodb_temp_file_key;
#endif /* UNIV_PFS_IO */
223
/** The asynchronous I/O context: one pending (or free) request in an
AIO array. The platform specific members are selected at compile time
by WIN_ASYNC_IO / LINUX_NATIVE_AIO. */
struct Slot {
	/** Zero-fill the whole object, so that every member starts out
	as 0 / NULL / false.
	NOTE(review): this also overwrites the IORequest member with
	zero bytes - confirm that is a valid state for that type. */
	Slot() { memset(this, 0, sizeof(*this)); }

	/** index of the slot in the aio array */
	uint16_t		pos;

	/** true if this slot is reserved */
	bool			is_reserved;

	/** time when reserved */
	ib_time_monotonic_t	reservation_time;

	/** buffer used in i/o */
	byte*			buf;

	/** Buffer pointer used for actual IO. We advance this
	when partial IO is required and not buf */
	byte*			ptr;

	/** IO request context; tells whether this is a read or a
	write (see IORequest::is_read()) */
	IORequest		type;

	/** file offset in bytes */
	os_offset_t		offset;

	/** file where to read or write */
	pfs_os_file_t		file;

	/** file name or path */
	const char*		name;

	/** used only in simulated aio: true if the physical i/o
	already made and only the slot message needs to be passed
	to the caller of os_aio_simulated_handle */
	bool			io_already_done;

	/** The file node for which the IO is requested. */
	fil_node_t*		m1;

	/** the requester of an aio operation and which can be used
	to identify which pending aio operation was completed */
	void*			m2;

	/** AIO completion status */
	dberr_t			err;

#ifdef WIN_ASYNC_IO
	/** handle object we need in the OVERLAPPED struct */
	HANDLE			handle;

	/** Windows control block for the aio request */
	OVERLAPPED		control;

	/** bytes written/read */
	DWORD			n_bytes;

	/** length of the block to read or write */
	DWORD			len;

#elif defined(LINUX_NATIVE_AIO)
	/** Linux control block for aio */
	struct iocb		control;

	/** AIO return code */
	int			ret;

	/** bytes written/read. */
	ssize_t			n_bytes;

	/** length of the block to read or write */
	ulint			len;
#else
	/** length of the block to read or write */
	ulint			len;

	/** bytes written/read. */
	ulint			n_bytes;
#endif /* WIN_ASYNC_IO */

	/** Length of the block before it was compressed */
	uint32			original_len;

	/** Buffer block for compressed pages or encrypted pages */
	Block*			buf_block;

	/** true, if we shouldn't punch a hole after writing the page */
	bool			skip_punch_hole;
};
313
/** The asynchronous i/o array structure.
An instance owns a vector of Slot objects that is logically divided into
m_n_segments equally sized segments, plus the mutex and events that guard
and signal it. The static members hold the per-purpose arrays (ibuf, log,
reads, writes, sync) and the static functions operate on all of them. */
class AIO {
public:
	/** Constructor
	@param[in]	id		Latch ID
	@param[in]	n_slots		Number of slots to configure
	@param[in]	segments	Number of segments to configure */
	AIO(latch_id_t id, ulint n_slots, ulint segments);

	/** Destructor */
	~AIO();

	/** Initialize the instance
	@return DB_SUCCESS or error code */
	dberr_t init();

	/** Requests for a slot in the aio array. If no slot is available, waits
	until not_full-event becomes signaled.

	@param[in,out]	type	IO context
	@param[in,out]	m1	message to be passed along with the AIO
				operation
	@param[in,out]	m2	message to be passed along with the AIO
				operation
	@param[in]	file	file handle
	@param[in]	name	name of the file or path as a null-terminated
				string
	@param[in,out]	buf	buffer where to read or from which to write
	@param[in]	offset	file offset, where to read from or start writing
	@param[in]	len	length of the block to read or write
	@return pointer to slot */
	Slot* reserve_slot(
		IORequest&	type,
		fil_node_t*	m1,
		void*		m2,
		pfs_os_file_t	file,
		const char*	name,
		void*		buf,
		os_offset_t	offset,
		ulint		len)
		MY_ATTRIBUTE((warn_unused_result));

	/** @return number of reserved slots */
	ulint pending_io_count() const;

	/** Returns a pointer to the nth slot in the aio array.
	Asserts (ut_a) that the index is within the array.
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	const Slot* at(ulint i) const
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Non const version of at(); same bounds assertion.
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	Slot* at(ulint i)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Frees a slot in the AIO array, assumes caller owns the mutex.
	@param[in,out]	slot	Slot to release */
	void release(Slot* slot);

	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
	@param[in,out]	slot	Slot to release */
	void release_with_mutex(Slot* slot);

	/** Prints info about the aio array.
	@param[in,out]	file	Where to print */
	void print(FILE* file);

	/** @return the number of slots per segment, i.e. the total number
	of slots divided by the number of segments */
	ulint slots_per_segment() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_slots.size() / m_n_segments);
	}

	/** @return accessor for n_segments */
	ulint get_n_segments() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_n_segments);
	}

#ifdef UNIV_DEBUG
	/** @return true if the thread owns the mutex */
	bool is_mutex_owned() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(mutex_own(&m_mutex));
	}
#endif /* UNIV_DEBUG */

	/** Acquire the mutex */
	void acquire() const
	{
		mutex_enter(&m_mutex);
	}

	/** Release the mutex. This is the parameterless overload; the
	overload taking a Slot* frees a slot instead. */
	void release() const
	{
		mutex_exit(&m_mutex);
	}

	/** Write out the state to the file/stream
	@param[in, out]	file	File to write to */
	void to_file(FILE* file) const;

#ifdef LINUX_NATIVE_AIO
	/** Dispatch an AIO request to the kernel.
	@param[in,out]	slot	an already reserved slot
	@return true on success. */
	bool linux_dispatch(Slot* slot)
		MY_ATTRIBUTE((warn_unused_result));

	/** Accessor for an AIO event.
	Asserts (ut_a) that the index is within the event array.
	@param[in]	index	Index into the array
	@return the event at the index */
	io_event* io_events(ulint index)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(index < m_events.size());

		return(&m_events[index]);
	}

	/** Accessor for the AIO context.
	The segment range is only checked in debug builds (ut_ad).
	@param[in]	segment	Segment for which to get the context
	@return the AIO context for the segment */
	io_context* io_ctx(ulint segment)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(segment < get_n_segments());

		return(m_aio_ctx[segment]);
	}

	/** Creates an io_context for native linux AIO.
	@param[in]	max_events	number of events
	@param[out]	io_ctx		io_ctx to initialize.
	@return true on success. */
	static bool linux_create_io_ctx(ulint max_events, io_context_t* io_ctx)
		MY_ATTRIBUTE((warn_unused_result));

	/** Checks if the system supports native linux aio. On some kernel
	versions where native aio is supported it won't work on tmpfs. In such
	cases we can't use native aio as it is not possible to mix simulated
	and native aio.
	@return true if supported, false otherwise. */
	static bool is_linux_native_aio_supported()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

#ifdef WIN_ASYNC_IO
	/** Wakes up all async i/o threads in the array in Windows async I/O at
	shutdown, by setting the event handle of every slot. */
	void signal()
	{
		for (ulint i = 0; i < m_slots.size(); ++i) {
			SetEvent(m_slots[i].handle);
		}
	}

	/** Wake up all AIO threads in Windows native aio.
	s_reads is dereferenced without a NULL check; the write, ibuf and
	log arrays are signalled only when they exist. */
	static void wake_at_shutdown()
	{
		s_reads->signal();

		if (s_writes != NULL) {
			s_writes->signal();
		}

		if (s_ibuf != NULL) {
			s_ibuf->signal();
		}

		if (s_log != NULL) {
			s_log->signal();
		}
	}
#endif /* WIN_ASYNC_IO */

#ifdef _WIN32
	/** This function can be called if one wants to post a batch of reads
	and prefers an I/O - handler thread to handle them all at once later. You
	must call os_aio_simulated_wake_handler_threads later to ensure the
	threads are not left sleeping! */
	static void simulated_put_read_threads_to_sleep();

	/** The non asynchronous IO array.
	@return the synchronous AIO array instance. */
	static AIO* sync_array()
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(s_sync);
	}

	/**
	Get the AIO handles for a segment.
	@param[in]	segment	The local segment.
	@return pointer to the first handle of the segment, i.e. the handle
	at index segment * slots_per_segment() */
	HANDLE* handles(ulint segment)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(segment < m_handles->size() / slots_per_segment());

		return(&(*m_handles)[segment * slots_per_segment()]);
	}

	/** The caller must own the mutex (checked in debug builds).
	@return true if no slots are reserved */
	bool is_empty() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(is_mutex_owned());
		return(m_n_reserved == 0);
	}
#endif /* _WIN32 */

	/** Create an instance using new(std::nothrow)
	@param[in]	id		Latch ID
	@param[in]	n_slots		The number of AIO request slots
	@param[in]	segments	The number of segments
	@return a new AIO instance */
	static AIO* create(
		latch_id_t	id,
		ulint		n_slots,
		ulint		segments)
		MY_ATTRIBUTE((warn_unused_result));

	/** Initializes the asynchronous io system. Creates one array each
	for ibuf and log I/O. Also creates one array each for read and write
	where each array is divided logically into n_readers and n_writers
	respectively. The caller must create an i/o handler thread for each
	segment in these arrays. This function also creates the sync array.
	No I/O handler thread needs to be created for that
	@param[in]	n_per_seg	maximum number of pending aio
					operations allowed per segment
	@param[in]	n_readers	number of reader threads
	@param[in]	n_writers	number of writer threads
	@param[in]	n_slots_sync	number of slots in the sync aio array
	@return true if AIO sub-system was started successfully */
	static bool start(
		ulint		n_per_seg,
		ulint		n_readers,
		ulint		n_writers,
		ulint		n_slots_sync)
		MY_ATTRIBUTE((warn_unused_result));

	/** Free the AIO arrays */
	static void shutdown();

	/** Print all the AIO segments
	@param[in,out]	file		Where to print */
	static void print_all(FILE* file);

	/** Calculates local segment number and aio array from global
	segment number.
	@param[out]	array		AIO wait array
	@param[in]	segment		global segment number
	@return local segment number within the aio array */
	static ulint get_array_and_local_segment(
		AIO**		array,
		ulint		segment)
		MY_ATTRIBUTE((warn_unused_result));

	/** Select the IO slot array
	@param[in]	type		Type of IO, READ or WRITE
	@param[in]	read_only	true if running in read-only mode
	@param[in]	mode		IO mode
	@return slot array or NULL if invalid mode specified */
	static AIO* select_slot_array(
		IORequest&	type,
		bool		read_only,
		ulint		mode)
		MY_ATTRIBUTE((warn_unused_result));

	/** Calculates segment number for a slot.
	@param[in]	array		AIO wait array
	@param[in]	slot		slot in this array
	@return segment number (which is the number used by, for example,
		I/O handler threads) */
	static ulint get_segment_no_from_slot(
		const AIO*	array,
		const Slot*	slot)
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays */
	static void wake_simulated_handler_thread(ulint global_segment);

	/** Check if it is a read request
	@param[in]	aio		The AIO instance to check
	@return true if the AIO instance is the read array (s_reads). */
	static bool is_read(const AIO* aio)
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(s_reads == aio);
	}

	/** Wait on an event until no pending writes.
	Dereferences s_writes without a NULL check, so the write array
	must exist when this is called. */
	static void wait_until_no_pending_writes()
	{
		os_event_wait(AIO::s_writes->m_is_empty);
	}

	/** Print to file
	@param[in]	file		File to write to */
	static void print_to_file(FILE* file);

	/** Check for pending IO. Gets the count and also validates the
	data structures.
	@return count of pending IO requests */
	static ulint total_pending_io_count();

private:
	/** Initialise the slots
	@return DB_SUCCESS or error code */
	dberr_t init_slots()
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do for a local segment in the AIO array.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays
	@param[in]	segment		the local segment in the AIO array */
	void wake_simulated_handler_thread(ulint global_segment, ulint segment);

	/** Prints pending IO requests per segment of an aio array.
	We probably don't need per segment statistics but they can help us
	during development phase to see if the IO requests are being
	distributed as expected.
	@param[in,out]	file		file where to print
	@param[in]	segments	pending IO array */
	void print_segment_info(
		FILE*		file,
		const ulint*	segments);

#ifdef LINUX_NATIVE_AIO
	/** Initialise the Linux native AIO data structures
	@return DB_SUCCESS or error code */
	dberr_t init_linux_native_aio()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

private:
	typedef std::vector<Slot> Slots;

	/** the mutex protecting the aio array; mutable so that the const
	acquire()/release() members can lock it */
	mutable SysMutex	m_mutex;

	/** The slots of the array. slots_per_segment() divides the number
	of elements by m_n_segments, so the size is expected to be a
	multiple of m_n_segments. */
	Slots			m_slots;

	/** Number of segments in the aio array of pending aio requests.
	A thread can wait separately for any one of the segments. */
	ulint			m_n_segments;

	/** The event which is set to the signaled state when
	there is space in the aio outside the ibuf segment */
	os_event_t		m_not_full;

	/** The event which is set to the signaled state when
	there are no pending i/os in this array */
	os_event_t		m_is_empty;

	/** Number of reserved slots in the AIO array outside
	the ibuf segment */
	ulint			m_n_reserved;

#ifdef _WIN32
	typedef std::vector<HANDLE, ut_allocator<HANDLE> > Handles;

	/** Pointer to an array of OS native event handles where
	we copied the handles from slots, in the same order. This
	can be used in WaitForMultipleObjects; used only in Windows */
	Handles*		m_handles;
#endif /* _WIN32 */

#if defined(LINUX_NATIVE_AIO)
	typedef std::vector<io_event> IOEvents;

	/** completion queue for IO. There is one such queue per
	segment. Each thread will work on one ctx exclusively. */
	io_context_t*		m_aio_ctx;

	/** The array to collect completed IOs. There is one such
	event for each possible pending IO. The size of the array
	is equal to m_slots.size(). */
	IOEvents		m_events;
#endif /* LINUX_NATIVE_AIO */

	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
	sync AIO. These are NULL when the module has not yet been
	initialized. */

	/** Insert buffer */
	static AIO*		s_ibuf;

	/** Redo log */
	static AIO*		s_log;

	/** Reads */
	static AIO*		s_reads;

	/** Writes */
	static AIO*		s_writes;

	/** Synchronous I/O */
	static AIO*		s_sync;
};
735
/** Definitions of the static AIO array pointers declared in class AIO.
As objects with static storage duration they start out as NULL. */
AIO* AIO::s_reads;
AIO* AIO::s_writes;
AIO* AIO::s_ibuf;
AIO* AIO::s_log;
AIO* AIO::s_sync;

#if defined(LINUX_NATIVE_AIO)
/** timeout for each io_getevents() call = 500ms
(the value is 500ms when expressed in nanoseconds). */
static const ulint OS_AIO_REAP_TIMEOUT = 500000000UL;

/** time to sleep, in microseconds if io_setup() returns EAGAIN
(500000 microseconds = 0.5 seconds). */
static const ulint OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;

/** number of attempts before giving up on io_setup(). */
static const int OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
#endif /* LINUX_NATIVE_AIO */

/** Array of events used in simulated AIO */
static os_event_t* os_aio_segment_wait_events = NULL;

/** Number of asynchronous I/O segments. Set by os_aio_init().
ULINT_UNDEFINED until then. */
static ulint os_aio_n_segments = ULINT_UNDEFINED;

/** If the following is true, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static bool os_aio_recommend_sleep_for_read_threads = false;
763 #endif /* !UNIV_HOTBACKUP */
764
/* Global I/O statistics, all starting at zero. They have external
linkage. NOTE(review): the names suggest counts of reads, writes and
fsyncs, with the *_old variants holding the values seen at the previous
printout - confirm against the code that updates them. */
ulint os_n_file_reads = 0;
ulint os_bytes_read_since_printout = 0;
ulint os_n_file_writes = 0;
ulint os_n_fsyncs = 0;
ulint os_n_file_reads_old = 0;
ulint os_n_file_writes_old = 0;
ulint os_n_fsyncs_old = 0;
/** Number of pending write operations */
ulint os_n_pending_writes = 0;
/** Number of pending read operations */
ulint os_n_pending_reads = 0;

/* NOTE(review): presumably the time of the last statistics printout and
whether a disk-full condition has already been reported - confirm. */
ib_time_monotonic_t os_last_printout;
bool os_has_said_disk_full = false;

/** Default Zip compression level; defined in another translation unit */
extern uint page_zip_level;

/* Compile-time guard: the build is rejected when DATA_TRX_ID_LEN
exceeds 6. */
#if DATA_TRX_ID_LEN > 6
#error "COMPRESSION_ALGORITHM will not fit"
#endif /* DATA_TRX_ID_LEN */
786
/** Validates the consistency of the aio system.
Forward declaration; the function is local to this file.
@return true if ok */
static
bool
os_aio_validate();
792
/** Does error handling when a file operation fails.
Forward declaration; the function is local to this file.
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@return true if we should retry the operation */
static
bool
os_file_handle_error(
	const char*	name,
	const char*	operation);
802
/** Free storage space associated with a section of the file.
Forward declaration; unlike its neighbours this function is not
file-static.
@param[in]	fh		Open file handle
@param[in]	off		Starting offset (SEEK_SET)
@param[in]	len		Size of the hole
@return DB_SUCCESS or error code */
dberr_t
os_file_punch_hole(
	os_file_t	fh,
	os_offset_t	off,
	os_offset_t	len);
813
/**
Does error handling when a file operation fails.
Forward declaration; the function is local to this file.
NOTE(review): going by the name, this variant presumably never terminates
the server - confirm against the definition.
@param[in]	name		File name or NULL
@param[in]	operation	Name of operation e.g., "read", "write"
@param[in]	silent		if true then don't print any message to the log.
@return true if we should retry the operation */
static
bool
os_file_handle_error_no_exit(
	const char*	name,
	const char*	operation,
	bool		silent);
826
/** Decompress after a read and punch a hole in the file if it was a write
@param[in]	type		IO context
@param[in]	fh		Open file handle
@param[in,out]	buf		Buffer to transform
@param[in,out]	scratch		Scratch area for read decompression
@param[in]	src_len		Length of the buffer before compression
@param[in]	offset		file offset in bytes of the IO (the AIO
				completion path passes the slot's offset)
@param[in]	len		Compressed buffer length for write and size
				of buf len for read
@return DB_SUCCESS or error code */
static
dberr_t
os_file_io_complete(
	const IORequest&type,
	os_file_t	fh,
	byte*		buf,
	byte*		scratch,
	ulint		src_len,
	os_offset_t	offset,
	ulint		len);
846
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[in]	type	IO context
			NOTE(review): passed by non-const pointer and
			documented as [out] for the Windows handler -
			confirm the direction.
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
868
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
#endif /* WIN_ASYNC_IO */
902
903 /** Allocate a page for sync IO
904 @return pointer to page */
905 static
906 Block*
os_alloc_block()907 os_alloc_block()
908 {
909 size_t pos;
910 Blocks& blocks = *block_cache;
911 size_t i = static_cast<size_t>(my_timer_cycles());
912 const size_t size = blocks.size();
913 ulint retry = 0;
914 Block* block;
915
916 DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
917
918 for (;;) {
919
920 /* After go through the block cache for 3 times,
921 allocate a new temporary block. */
922 if (retry == MAX_BLOCKS * 3) {
923 byte* ptr;
924
925 ptr = static_cast<byte*>(
926 ut_malloc_nokey(sizeof(*block)
927 + BUFFER_BLOCK_SIZE));
928
929 block = new (ptr) Block();
930 block->m_ptr = static_cast<byte*>(
931 ptr + sizeof(*block));
932 block->m_in_use = 1;
933
934 break;
935 }
936
937 pos = i++ % size;
938
939 if (TAS(&blocks[pos].m_in_use, 1) == 0) {
940 block = &blocks[pos];
941 break;
942 }
943
944 os_thread_yield();
945
946 ++retry;
947 }
948
949 ut_a(block->m_in_use != 0);
950
951 return(block);
952 }
953
954 /** Free a page after sync IO
955 @param[in,own] block The block to free/release */
956 static
957 void
os_free_block(Block * block)958 os_free_block(Block* block)
959 {
960 ut_ad(block->m_in_use == 1);
961
962 TAS(&block->m_in_use, 0);
963
964 /* When this block is not in the block cache, and it's
965 a temporary block, we need to free it directly. */
966 if (std::less<Block*>()(block, &block_cache->front())
967 || std::greater<Block*>()(block, &block_cache->back())) {
968 ut_free(block);
969 }
970 }
971
972 /** Generic AIO Handler methods. Currently handles IO post processing. */
973 class AIOHandler {
974 public:
975 /** Do any post processing after a read/write
976 @return DB_SUCCESS or error code. */
977 static dberr_t post_io_processing(Slot* slot);
978
979 /** Decompress after a read and punch a hole in the file if
980 it was a write */
io_complete(const Slot * slot)981 static dberr_t io_complete(const Slot* slot)
982 {
983 ut_a(slot->offset > 0);
984 ut_a(slot->type.is_read() || !slot->skip_punch_hole);
985 return(os_file_io_complete(
986 slot->type, slot->file.m_file, slot->buf,
987 NULL, slot->original_len,
988 slot->offset, slot->len));
989 }
990
991 private:
992 /** Check whether the page was encrypted.
993 @param[in] slot The slot that contains the IO request
994 @return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)995 static bool is_encrypted_page(const Slot* slot)
996 {
997 return(Encryption::is_encrypted_page(slot->buf));
998 }
999
1000 /** Check whether the page was compressed.
1001 @param[in] slot The slot that contains the IO request
1002 @return true if it was a compressed page */
is_compressed_page(const Slot * slot)1003 static bool is_compressed_page(const Slot* slot)
1004 {
1005 const byte* src = slot->buf;
1006
1007 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1008
1009 return(page_type == FIL_PAGE_COMPRESSED);
1010 }
1011
1012 /** Get the compressed page size.
1013 @param[in] slot The slot that contains the IO request
1014 @return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1015 static ulint compressed_page_size(const Slot* slot)
1016 {
1017 ut_ad(slot->type.is_read());
1018 ut_ad(is_compressed_page(slot));
1019
1020 ulint size;
1021 const byte* src = slot->buf;
1022
1023 size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1024
1025 return(size + FIL_PAGE_DATA);
1026 }
1027
1028 /** Check if the page contents can be decompressed.
1029 @param[in] slot The slot that contains the IO request
1030 @return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1031 static bool can_decompress(const Slot* slot)
1032 {
1033 ut_ad(slot->type.is_read());
1034 ut_ad(is_compressed_page(slot));
1035
1036 ulint version;
1037 const byte* src = slot->buf;
1038
1039 version = mach_read_from_1(src + FIL_PAGE_VERSION);
1040
1041 ut_a(Compression::is_valid_page_version(version));
1042
1043 /* Includes the page header size too */
1044 ulint size = compressed_page_size(slot);
1045
1046 return(size <= (slot->ptr - slot->buf) + (ulint) slot->n_bytes);
1047 }
1048
1049 /** Check if we need to read some more data.
1050 @param[in] slot The slot that contains the IO request
1051 @param[in] n_bytes Total bytes read so far
1052 @return DB_SUCCESS or error code */
1053 static dberr_t check_read(Slot* slot, ulint n_bytes);
1054 };
1055
1056 /** Helper class for doing synchronous file IO. Currently, the objective
1057 is to hide the OS specific code, so that the higher level functions aren't
1058 peppered with #ifdef. Makes the code flow difficult to follow. */
1059 class SyncFileIO {
1060 public:
1061 /** Constructor
1062 @param[in] fh File handle
1063 @param[in,out] buf Buffer to read/write
1064 @param[in] n Number of bytes to read/write
1065 @param[in] offset Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1066 SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
1067 :
1068 m_fh(fh),
1069 m_buf(buf),
1070 m_n(static_cast<ssize_t>(n)),
1071 m_offset(offset)
1072 {
1073 ut_ad(m_n > 0);
1074 }
1075
1076 /** Destructor */
~SyncFileIO()1077 ~SyncFileIO()
1078 {
1079 /* No op */
1080 }
1081
1082 /** Do the read/write
1083 @param[in] request The IO context and type
1084 @return the number of bytes read/written or negative value on error */
1085 ssize_t execute(const IORequest& request);
1086
1087 /** Do the read/write
1088 @param[in,out] slot The IO slot, it has the IO context
1089 @return the number of bytes read/written or negative value on error */
1090 static ssize_t execute(Slot* slot);
1091
1092 /** Move the read/write offset up to where the partial IO succeeded.
1093 @param[in] n_bytes The number of bytes to advance */
advance(ssize_t n_bytes)1094 void advance(ssize_t n_bytes)
1095 {
1096 m_offset += n_bytes;
1097
1098 ut_ad(m_n >= n_bytes);
1099
1100 m_n -= n_bytes;
1101
1102 m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1103 }
1104
1105 private:
1106 /** Open file handle */
1107 os_file_t m_fh;
1108
1109 /** Buffer to read/write */
1110 void* m_buf;
1111
1112 /** Number of bytes to read/write */
1113 ssize_t m_n;
1114
1115 /** Offset from where to read/write */
1116 os_offset_t m_offset;
1117 };
1118
1119 /** If it is a compressed page return the compressed page data + footer size
1120 @param[in] buf Buffer to check, must include header + 10 bytes
1121 @return ULINT_UNDEFINED if the page is not a compressed page or length
1122 of the compressed data (including footer) if it is a compressed page */
1123 ulint
os_file_compressed_page_size(const byte * buf)1124 os_file_compressed_page_size(const byte* buf)
1125 {
1126 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1127
1128 if (type == FIL_PAGE_COMPRESSED) {
1129 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1130 ut_a(Compression::is_valid_page_version(version));
1131 return(mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1132 }
1133
1134 return(ULINT_UNDEFINED);
1135 }
1136
1137 /** If it is a compressed page return the original page data + footer size
1138 @param[in] buf Buffer to check, must include header + 10 bytes
1139 @return ULINT_UNDEFINED if the page is not a compressed page or length
1140 of the original data + footer if it is a compressed page */
1141 ulint
os_file_original_page_size(const byte * buf)1142 os_file_original_page_size(const byte* buf)
1143 {
1144 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1145
1146 if (type == FIL_PAGE_COMPRESSED) {
1147
1148 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1149 ut_a(Compression::is_valid_page_version(version));
1150
1151 return(mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1152 }
1153
1154 return(ULINT_UNDEFINED);
1155 }
1156
1157 /** Check if we need to read some more data.
1158 @param[in] slot The slot that contains the IO request
1159 @param[in] n_bytes Total bytes read so far
1160 @return DB_SUCCESS or error code */
1161 dberr_t
check_read(Slot * slot,ulint n_bytes)1162 AIOHandler::check_read(Slot* slot, ulint n_bytes)
1163 {
1164 dberr_t err;
1165
1166 ut_ad(slot->type.is_read());
1167 ut_ad(slot->original_len > slot->len);
1168
1169 if (is_compressed_page(slot)) {
1170
1171 if (can_decompress(slot)) {
1172
1173 ut_a(slot->offset > 0);
1174
1175 slot->len = slot->original_len;
1176 #ifdef _WIN32
1177 slot->n_bytes = static_cast<DWORD>(n_bytes);
1178 #else
1179 slot->n_bytes = static_cast<ulint>(n_bytes);
1180 #endif /* _WIN32 */
1181
1182 err = io_complete(slot);
1183 ut_a(err == DB_SUCCESS);
1184 } else {
1185 /* Read the next block in */
1186 ut_ad(compressed_page_size(slot) >= n_bytes);
1187
1188 err = DB_FAIL;
1189 }
1190 } else if (is_encrypted_page(slot)) {
1191 ut_a(slot->offset > 0);
1192
1193 slot->len = slot->original_len;
1194 #ifdef _WIN32
1195 slot->n_bytes = static_cast<DWORD>(n_bytes);
1196 #else
1197 slot->n_bytes = static_cast<ulint>(n_bytes);
1198 #endif /* _WIN32 */
1199
1200 err = io_complete(slot);
1201 ut_a(err == DB_SUCCESS);
1202
1203 } else {
1204 err = DB_FAIL;
1205 }
1206
1207 if (slot->buf_block != NULL) {
1208 os_free_block(slot->buf_block);
1209 slot->buf_block = NULL;
1210 }
1211
1212 return(err);
1213 }
1214
1215 /** Do any post processing after a read/write
1216 @return DB_SUCCESS or error code. */
1217 dberr_t
post_io_processing(Slot * slot)1218 AIOHandler::post_io_processing(Slot* slot)
1219 {
1220 dberr_t err;
1221
1222 ut_ad(slot->is_reserved);
1223
1224 /* Total bytes read so far */
1225 ulint n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1226
1227 /* Compressed writes can be smaller than the original length.
1228 Therefore they can be processed without further IO. */
1229 if (n_bytes == slot->original_len
1230 || (slot->type.is_write()
1231 && slot->type.is_compressed()
1232 && slot->len == static_cast<ulint>(slot->n_bytes))) {
1233
1234 if (!slot->type.is_log()
1235 && (is_compressed_page(slot)
1236 || is_encrypted_page(slot))) {
1237
1238 ut_a(slot->offset > 0);
1239
1240 if (slot->type.is_read()) {
1241 slot->len = slot->original_len;
1242 }
1243
1244 /* The punch hole has been done on collect() */
1245
1246 if (slot->type.is_read()) {
1247 err = io_complete(slot);
1248 } else {
1249 err = DB_SUCCESS;
1250 }
1251
1252 ut_ad(err == DB_SUCCESS
1253 || err == DB_UNSUPPORTED
1254 || err == DB_CORRUPTION
1255 || err == DB_IO_DECOMPRESS_FAIL);
1256 } else {
1257
1258 err = DB_SUCCESS;
1259 }
1260
1261 if (slot->buf_block != NULL) {
1262 os_free_block(slot->buf_block);
1263 slot->buf_block = NULL;
1264 }
1265
1266 } else if ((ulint) slot->n_bytes == (ulint) slot->len) {
1267
1268 /* It *must* be a partial read. */
1269 ut_ad(slot->len < slot->original_len);
1270
1271 /* Has to be a read request, if it is less than
1272 the original length. */
1273 ut_ad(slot->type.is_read());
1274 err = check_read(slot, n_bytes);
1275
1276 } else {
1277 err = DB_FAIL;
1278 }
1279
1280 return(err);
1281 }
1282
1283 /** Count the number of free slots
1284 @return number of reserved slots */
1285 ulint
pending_io_count() const1286 AIO::pending_io_count() const
1287 {
1288 acquire();
1289
1290 #ifdef UNIV_DEBUG
1291 ut_a(m_n_segments > 0);
1292 ut_a(!m_slots.empty());
1293
1294 ulint count = 0;
1295
1296 for (ulint i = 0; i < m_slots.size(); ++i) {
1297
1298 const Slot& slot = m_slots[i];
1299
1300 if (slot.is_reserved) {
1301 ++count;
1302 ut_a(slot.len > 0);
1303 }
1304 }
1305
1306 ut_a(m_n_reserved == count);
1307 #endif /* UNIV_DEBUG */
1308
1309 ulint reserved = m_n_reserved;
1310
1311 release();
1312
1313 return(reserved);
1314 }
1315
1316 /** Compress a data page
1317 #param[in] block_size File system block size
1318 @param[in] src Source contents to compress
1319 @param[in] src_len Length in bytes of the source
1320 @param[out] dst Compressed page contents
1321 @param[out] dst_len Length in bytes of dst contents
1322 @return buffer data, dst_len will have the length of the data */
1323 static
1324 byte*
os_file_compress_page(Compression compression,ulint block_size,byte * src,ulint src_len,byte * dst,ulint * dst_len)1325 os_file_compress_page(
1326 Compression compression,
1327 ulint block_size,
1328 byte* src,
1329 ulint src_len,
1330 byte* dst,
1331 ulint* dst_len)
1332 {
1333 ulint len = 0;
1334 ulint compression_level = page_zip_level;
1335 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1336
1337 /* The page size must be a multiple of the OS punch hole size. */
1338 ut_ad(!(src_len % block_size));
1339
1340 /* Shouldn't compress an already compressed page. */
1341 ut_ad(page_type != FIL_PAGE_COMPRESSED);
1342
1343 /* The page must be at least twice as large as the file system
1344 block size if we are to save any space. Ignore R-Tree pages for now,
1345 they repurpose the same 8 bytes in the page header. No point in
1346 compressing if the file system block size >= our page size. */
1347
1348 if (page_type == FIL_PAGE_RTREE
1349 || block_size == ULINT_UNDEFINED
1350 || compression.m_type == Compression::NONE
1351 || src_len < block_size * 2) {
1352
1353 *dst_len = src_len;
1354
1355 return(src);
1356 }
1357
1358 /* Leave the header alone when compressing. */
1359 ut_ad(block_size >= FIL_PAGE_DATA * 2);
1360
1361 ut_ad(src_len > FIL_PAGE_DATA + block_size);
1362
1363 /* Must compress to <= N-1 FS blocks. */
1364 ulint out_len = src_len - (FIL_PAGE_DATA + block_size);
1365
1366 /* This is the original data page size - the page header. */
1367 ulint content_len = src_len - FIL_PAGE_DATA;
1368
1369 ut_ad(out_len >= block_size - FIL_PAGE_DATA);
1370 ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA));
1371
1372 /* Only compress the data + trailer, leave the header alone */
1373
1374 switch (compression.m_type) {
1375 case Compression::NONE:
1376 ut_error;
1377
1378 case Compression::ZLIB: {
1379
1380 uLongf zlen = static_cast<uLongf>(out_len);
1381
1382 if (compress2(
1383 dst + FIL_PAGE_DATA,
1384 &zlen,
1385 src + FIL_PAGE_DATA,
1386 static_cast<uLong>(content_len),
1387 static_cast<int>(compression_level)) != Z_OK) {
1388
1389 *dst_len = src_len;
1390
1391 return(src);
1392 }
1393
1394 len = static_cast<ulint>(zlen);
1395
1396 break;
1397 }
1398
1399 case Compression::LZ4:
1400
1401 len = LZ4_compress_default(
1402 reinterpret_cast<char*>(src) + FIL_PAGE_DATA,
1403 reinterpret_cast<char*>(dst) + FIL_PAGE_DATA,
1404 static_cast<int>(content_len),
1405 static_cast<int>(out_len));
1406
1407 ut_a(len <= src_len - FIL_PAGE_DATA);
1408
1409 if (len == 0 || len >= out_len) {
1410
1411 *dst_len = src_len;
1412
1413 return(src);
1414 }
1415
1416 break;
1417
1418 default:
1419 *dst_len = src_len;
1420 return(src);
1421 }
1422
1423 ut_a(len <= out_len);
1424
1425 ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
1426 src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)
1427 == 0);
1428
1429 /* Copy the header as is. */
1430 memmove(dst, src, FIL_PAGE_DATA);
1431
1432 /* Add compression control information. Required for decompressing. */
1433 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
1434
1435 mach_write_to_1(dst + FIL_PAGE_VERSION, Compression::FIL_PAGE_VERSION_2);
1436
1437 mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);
1438
1439 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
1440
1441 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);
1442
1443 mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);
1444
1445 /* Round to the next full block size */
1446
1447 len += FIL_PAGE_DATA;
1448
1449 *dst_len = ut_calc_align(len, block_size);
1450
1451 ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA);
1452
1453 /* Clear out the unused portion of the page. */
1454 if (len % block_size) {
1455 memset(dst + len, 0x0, block_size - (len % block_size));
1456 }
1457
1458 return(dst);
1459 }
1460
1461 #ifdef UNIV_DEBUG
1462 # ifndef UNIV_HOTBACKUP
1463 /** Validates the consistency the aio system some of the time.
1464 @return true if ok or the check was skipped */
1465 bool
os_aio_validate_skip()1466 os_aio_validate_skip()
1467 {
1468 /** Try os_aio_validate() every this many times */
1469 # define OS_AIO_VALIDATE_SKIP 13
1470
1471 /** The os_aio_validate() call skip counter.
1472 Use a signed type because of the race condition below. */
1473 static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1474
1475 /* There is a race condition below, but it does not matter,
1476 because this call is only for heuristic purposes. We want to
1477 reduce the call frequency of the costly os_aio_validate()
1478 check in debug builds. */
1479 --os_aio_validate_count;
1480
1481 if (os_aio_validate_count > 0) {
1482 return(true);
1483 }
1484
1485 os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1486 return(os_aio_validate());
1487 }
1488 # endif /* !UNIV_HOTBACKUP */
1489 #endif /* UNIV_DEBUG */
1490
1491 #undef USE_FILE_LOCK
1492 #define USE_FILE_LOCK
1493 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1494 /* InnoDB Hot Backup does not lock the data files.
1495 * On Windows, mandatory locking is used.
1496 */
1497 # undef USE_FILE_LOCK
1498 #endif
1499 #ifdef USE_FILE_LOCK
1500 /** Obtain an exclusive lock on a file.
1501 @param[in] fd file descriptor
1502 @param[in] name file name
1503 @return 0 on success */
1504 static
1505 int
os_file_lock(int fd,const char * name)1506 os_file_lock(
1507 int fd,
1508 const char* name)
1509 {
1510 struct flock lk;
1511
1512 lk.l_type = F_WRLCK;
1513 lk.l_whence = SEEK_SET;
1514 lk.l_start = lk.l_len = 0;
1515
1516 if (fcntl(fd, F_SETLK, &lk) == -1) {
1517
1518 ib::error()
1519 << "Unable to lock " << name
1520 << " error: " << errno;
1521
1522 if (errno == EAGAIN || errno == EACCES) {
1523
1524 ib::info()
1525 << "Check that you do not already have"
1526 " another mysqld process using the"
1527 " same InnoDB data or log files.";
1528 }
1529
1530 return(-1);
1531 }
1532
1533 return(0);
1534 }
1535 #endif /* USE_FILE_LOCK */
1536
1537 #ifndef UNIV_HOTBACKUP
1538
1539 /** Calculates local segment number and aio array from global segment number.
1540 @param[out] array aio wait array
1541 @param[in] segment global segment number
1542 @return local segment number within the aio array */
1543 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1544 AIO::get_array_and_local_segment(
1545 AIO** array,
1546 ulint segment)
1547 {
1548 ulint local_segment;
1549 ulint n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1550
1551 ut_a(segment < os_aio_n_segments);
1552
1553 if (!srv_read_only_mode && segment < n_extra_segs) {
1554
1555 /* We don't support ibuf/log IO during read only mode. */
1556
1557 if (segment == IO_IBUF_SEGMENT) {
1558
1559 *array = s_ibuf;
1560
1561 } else if (segment == IO_LOG_SEGMENT) {
1562
1563 *array = s_log;
1564
1565 } else {
1566 *array = NULL;
1567 }
1568
1569 local_segment = 0;
1570
1571 } else if (segment < s_reads->m_n_segments + n_extra_segs) {
1572
1573 *array = s_reads;
1574 local_segment = segment - n_extra_segs;
1575
1576 } else {
1577 *array = s_writes;
1578
1579 local_segment = segment
1580 - (s_reads->m_n_segments + n_extra_segs);
1581 }
1582
1583 return(local_segment);
1584 }
1585
1586 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1587 @param[in,out] slot Slot to release */
1588 void
release(Slot * slot)1589 AIO::release(Slot* slot)
1590 {
1591 ut_ad(is_mutex_owned());
1592
1593 ut_ad(slot->is_reserved);
1594
1595 slot->is_reserved = false;
1596
1597 --m_n_reserved;
1598
1599 if (m_n_reserved == m_slots.size() - 1) {
1600 os_event_set(m_not_full);
1601 }
1602
1603 if (m_n_reserved == 0) {
1604 os_event_set(m_is_empty);
1605 }
1606
1607 #ifdef WIN_ASYNC_IO
1608
1609 ResetEvent(slot->handle);
1610
1611 #elif defined(LINUX_NATIVE_AIO)
1612
1613 if (srv_use_native_aio) {
1614 memset(&slot->control, 0x0, sizeof(slot->control));
1615 slot->ret = 0;
1616 slot->n_bytes = 0;
1617 } else {
1618 /* These fields should not be used if we are not
1619 using native AIO. */
1620 ut_ad(slot->n_bytes == 0);
1621 ut_ad(slot->ret == 0);
1622 }
1623
1624 #endif /* WIN_ASYNC_IO */
1625 }
1626
1627 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1628 @param[in,out] slot Slot to release */
1629 void
release_with_mutex(Slot * slot)1630 AIO::release_with_mutex(Slot* slot)
1631 {
1632 acquire();
1633
1634 release(slot);
1635
1636 release();
1637 }
1638
1639 /** Creates a temporary file. This function is like tmpfile(3), but
1640 the temporary file is created in the given parameter path. If the path
1641 is NULL then it will create the file in the MySQL server configuration
1642 parameter (--tmpdir).
1643 @param[in] path location for creating temporary file
1644 @return temporary file handle, or NULL on error */
1645 FILE*
os_file_create_tmpfile(const char * path)1646 os_file_create_tmpfile(
1647 const char* path)
1648 {
1649 FILE* file = NULL;
1650 WAIT_ALLOW_WRITES();
1651 int fd = innobase_mysql_tmpfile(path);
1652
1653 if (fd >= 0) {
1654 file = fdopen(fd, "w+b");
1655 }
1656
1657 if (file == NULL) {
1658
1659 ib::error()
1660 << "Unable to create temporary file; errno: "
1661 << errno;
1662
1663 if (fd >= 0) {
1664 close(fd);
1665 }
1666 }
1667
1668 return(file);
1669 }
1670
1671 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1672 NUL-terminate str. All errors are silently ignored. This function is
1673 mostly meant to be used with temporary files.
1674 @param[in,out] file File to read from
1675 @param[in,out] str Buffer where to read
1676 @param[in] size Size of buffer */
1677 void
os_file_read_string(FILE * file,char * str,ulint size)1678 os_file_read_string(
1679 FILE* file,
1680 char* str,
1681 ulint size)
1682 {
1683 if (size != 0) {
1684 rewind(file);
1685
1686 size_t flen = fread(str, 1, size - 1, file);
1687
1688 str[flen] = '\0';
1689 }
1690 }
1691
1692 /** Decompress after a read and punch a hole in the file if it was a write
1693 @param[in] type IO context
1694 @param[in] fh Open file handle
1695 @param[in,out] buf Buffer to transform
1696 @param[in,out] scratch Scratch area for read decompression
1697 @param[in] src_len Length of the buffer before compression
1698 @param[in] len Used buffer length for write and output
1699 buf len for read
1700 @return DB_SUCCESS or error code */
1701 static
1702 dberr_t
os_file_io_complete(const IORequest & type,os_file_t fh,byte * buf,byte * scratch,ulint src_len,os_offset_t offset,ulint len)1703 os_file_io_complete(
1704 const IORequest&type,
1705 os_file_t fh,
1706 byte* buf,
1707 byte* scratch,
1708 ulint src_len,
1709 os_offset_t offset,
1710 ulint len)
1711 {
1712 /* We never compress/decompress the first page */
1713 ut_a(offset > 0);
1714 ut_ad(type.validate());
1715
1716 if (!type.is_compression_enabled()) {
1717
1718 return(DB_SUCCESS);
1719
1720 } else if (type.is_read()) {
1721 dberr_t ret;
1722 Encryption encryption(type.encryption_algorithm());
1723
1724 ut_ad(!type.is_log());
1725 ut_ad(!type.is_row_log());
1726
1727 ret = encryption.decrypt(type, buf, src_len, scratch, len);
1728 if (ret == DB_SUCCESS) {
1729 return(os_file_decompress_page(
1730 type.is_dblwr_recover(),
1731 buf, scratch, len));
1732 } else {
1733 return(ret);
1734 }
1735
1736 } else if (type.punch_hole()) {
1737
1738 ut_ad(len <= src_len);
1739 ut_ad(!type.is_log());
1740 ut_ad(type.is_write());
1741 ut_ad(type.is_compressed());
1742
1743 /* Nothing to do. */
1744 if (len == src_len) {
1745 return(DB_SUCCESS);
1746 }
1747
1748 #ifdef UNIV_DEBUG
1749 const ulint block_size = type.block_size();
1750 #endif /* UNIV_DEBUG */
1751
1752 /* We don't support multiple page sizes in the server
1753 at the moment. */
1754 ut_ad(src_len == srv_page_size);
1755
1756 /* Must be a multiple of the compression unit size. */
1757 ut_ad((len % block_size) == 0);
1758 ut_ad((offset % block_size) == 0);
1759
1760 ut_ad(len + block_size <= src_len);
1761
1762 offset += len;
1763
1764 return(os_file_punch_hole(fh, offset, src_len - len));
1765 }
1766
1767 ut_ad(!type.is_log());
1768
1769 return(DB_SUCCESS);
1770 }
1771
1772 #endif /* !UNIV_HOTBACKUP */
1773
1774 /** This function returns a new path name after replacing the basename
1775 in an old path with a new basename. The old_path is a full path
1776 name including the extension. The tablename is in the normal
1777 form "databasename/tablename". The new base name is found after
1778 the forward slash. Both input strings are null terminated.
1779
1780 This function allocates memory to be returned. It is the callers
1781 responsibility to free the return value after it is no longer needed.
1782
1783 @param[in] old_path Pathname
1784 @param[in] tablename Contains new base name
1785 @return own: new full pathname */
1786 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)1787 os_file_make_new_pathname(
1788 const char* old_path,
1789 const char* tablename)
1790 {
1791 ulint dir_len;
1792 char* last_slash;
1793 char* base_name;
1794 char* new_path;
1795 ulint new_path_len;
1796
1797 /* Split the tablename into its database and table name components.
1798 They are separated by a '/'. */
1799 last_slash = strrchr((char*) tablename, '/');
1800 base_name = last_slash ? last_slash + 1 : (char*) tablename;
1801
1802 /* Find the offset of the last slash. We will strip off the
1803 old basename.ibd which starts after that slash. */
1804 last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1805 dir_len = last_slash ? last_slash - old_path : strlen(old_path);
1806
1807 /* allocate a new path and move the old directory path to it. */
1808 new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1809 new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1810 memcpy(new_path, old_path, dir_len);
1811
1812 ut_snprintf(new_path + dir_len,
1813 new_path_len - dir_len,
1814 "%c%s.ibd",
1815 OS_PATH_SEPARATOR,
1816 base_name);
1817
1818 return(new_path);
1819 }
1820
1821 /** This function reduces a null-terminated full remote path name into
1822 the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
1823 the 'databasename/tablename.ibd' found at the end of the path with just
1824 'tablename'.
1825
1826 Since the result is always smaller than the path sent in, no new memory
1827 is allocated. The caller should allocate memory for the path sent in.
1828 This function manipulates that path in place.
1829
1830 If the path format is not as expected, just return. The result is used
1831 to inform a SHOW CREATE TABLE command.
1832 @param[in,out] data_dir_path Full path/data_dir_path */
1833 void
os_file_make_data_dir_path(char * data_dir_path)1834 os_file_make_data_dir_path(
1835 char* data_dir_path)
1836 {
1837 /* Replace the period before the extension with a null byte. */
1838 char* ptr = strrchr((char*) data_dir_path, '.');
1839
1840 if (ptr == NULL) {
1841 return;
1842 }
1843
1844 ptr[0] = '\0';
1845
1846 /* The tablename starts after the last slash. */
1847 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1848
1849 if (ptr == NULL) {
1850 return;
1851 }
1852
1853 ptr[0] = '\0';
1854
1855 char* tablename = ptr + 1;
1856
1857 /* The databasename starts after the next to last slash. */
1858 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1859
1860 if (ptr == NULL) {
1861 return;
1862 }
1863
1864 ulint tablename_len = ut_strlen(tablename);
1865
1866 ut_memmove(++ptr, tablename, tablename_len);
1867
1868 ptr[tablename_len] = '\0';
1869 }
1870
1871 /** Check if the path refers to the root of a drive using a pointer
1872 to the last directory separator that the caller has fixed.
1873 @param[in] path path name
1874 @param[in] path last directory separator in the path
1875 @return true if this path is a drive root, false if not */
1876 UNIV_INLINE
1877 bool
os_file_is_root(const char * path,const char * last_slash)1878 os_file_is_root(
1879 const char* path,
1880 const char* last_slash)
1881 {
1882 return(
1883 #ifdef _WIN32
1884 (last_slash == path + 2 && path[1] == ':') ||
1885 #endif /* _WIN32 */
1886 last_slash == path);
1887 }
1888
1889 /** Return the parent directory component of a null-terminated path.
1890 Return a new buffer containing the string up to, but not including,
1891 the final component of the path.
1892 The path returned will not contain a trailing separator.
1893 Do not return a root path, return NULL instead.
1894 The final component trimmed off may be a filename or a directory name.
1895 If the final component is the only component of the path, return NULL.
1896 It is the caller's responsibility to free the returned string after it
1897 is no longer needed.
1898 @param[in] path Path name
1899 @return own: parent directory of the path */
1900 static
1901 char*
os_file_get_parent_dir(const char * path)1902 os_file_get_parent_dir(
1903 const char* path)
1904 {
1905 bool has_trailing_slash = false;
1906
1907 /* Find the offset of the last slash */
1908 const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
1909
1910 if (!last_slash) {
1911 /* No slash in the path, return NULL */
1912 return(NULL);
1913 }
1914
1915 /* Ok, there is a slash. Is there anything after it? */
1916 if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1917 has_trailing_slash = true;
1918 }
1919
1920 /* Reduce repetative slashes. */
1921 while (last_slash > path
1922 && last_slash[-1] == OS_PATH_SEPARATOR) {
1923 last_slash--;
1924 }
1925
1926 /* Check for the root of a drive. */
1927 if (os_file_is_root(path, last_slash)) {
1928 return(NULL);
1929 }
1930
1931 /* If a trailing slash prevented the first strrchr() from trimming
1932 the last component of the path, trim that component now. */
1933 if (has_trailing_slash) {
1934 /* Back up to the previous slash. */
1935 last_slash--;
1936 while (last_slash > path
1937 && last_slash[0] != OS_PATH_SEPARATOR) {
1938 last_slash--;
1939 }
1940
1941 /* Reduce repetative slashes. */
1942 while (last_slash > path
1943 && last_slash[-1] == OS_PATH_SEPARATOR) {
1944 last_slash--;
1945 }
1946 }
1947
1948 /* Check for the root of a drive. */
1949 if (os_file_is_root(path, last_slash)) {
1950 return(NULL);
1951 }
1952
1953 if (last_slash - path < 0) {
1954 /* Sanity check, it prevents gcc from trying to handle this case which
1955 * results in warnings for some optimized builds */
1956 return (NULL);
1957 }
1958
1959 /* Non-trivial directory component */
1960
1961 return(mem_strdupl(path, last_slash - path));
1962 }
1963 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1964
1965 /* Test the function os_file_get_parent_dir. */
1966 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)1967 test_os_file_get_parent_dir(
1968 const char* child_dir,
1969 const char* expected_dir)
1970 {
1971 char* child = mem_strdup(child_dir);
1972 char* expected = expected_dir == NULL ? NULL
1973 : mem_strdup(expected_dir);
1974
1975 /* os_file_get_parent_dir() assumes that separators are
1976 converted to OS_PATH_SEPARATOR. */
1977 os_normalize_path(child);
1978 os_normalize_path(expected);
1979
1980 char* parent = os_file_get_parent_dir(child);
1981
1982 bool unexpected = (expected == NULL
1983 ? (parent != NULL)
1984 : (0 != strcmp(parent, expected)));
1985 if (unexpected) {
1986 ib::fatal() << "os_file_get_parent_dir('" << child
1987 << "') returned '" << parent
1988 << "', instead of '" << expected << "'.";
1989 }
1990 ut_free(parent);
1991 ut_free(child);
1992 ut_free(expected);
1993 }
1994
1995 /* Test the function os_file_get_parent_dir. */
1996 void
unit_test_os_file_get_parent_dir()1997 unit_test_os_file_get_parent_dir()
1998 {
1999 test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
2000 test_os_file_get_parent_dir("/usr/", NULL);
2001 test_os_file_get_parent_dir("//usr//", NULL);
2002 test_os_file_get_parent_dir("usr", NULL);
2003 test_os_file_get_parent_dir("usr//", NULL);
2004 test_os_file_get_parent_dir("/", NULL);
2005 test_os_file_get_parent_dir("//", NULL);
2006 test_os_file_get_parent_dir(".", NULL);
2007 test_os_file_get_parent_dir("..", NULL);
2008 # ifdef _WIN32
2009 test_os_file_get_parent_dir("D:", NULL);
2010 test_os_file_get_parent_dir("D:/", NULL);
2011 test_os_file_get_parent_dir("D:\\", NULL);
2012 test_os_file_get_parent_dir("D:/data", NULL);
2013 test_os_file_get_parent_dir("D:/data/", NULL);
2014 test_os_file_get_parent_dir("D:\\data\\", NULL);
2015 test_os_file_get_parent_dir("D:///data/////", NULL);
2016 test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
2017 test_os_file_get_parent_dir("D:/data//a", "D:/data");
2018 test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
2019 test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
2020 test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
2021 #endif /* _WIN32 */
2022 }
2023 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
2024
2025
2026 /** Creates all missing subdirectories along the given path.
2027 @param[in] path Path name
2028 @return DB_SUCCESS if OK, otherwise error code. */
2029 dberr_t
os_file_create_subdirs_if_needed(const char * path)2030 os_file_create_subdirs_if_needed(
2031 const char* path)
2032 {
2033 if (srv_read_only_mode) {
2034
2035 ib::error()
2036 << "read only mode set. Can't create "
2037 << "subdirectories '" << path << "'";
2038
2039 return(DB_READ_ONLY);
2040
2041 }
2042
2043 char* subdir = os_file_get_parent_dir(path);
2044
2045 if (subdir == NULL) {
2046 /* subdir is root or cwd, nothing to do */
2047 return(DB_SUCCESS);
2048 }
2049
2050 /* Test if subdir exists */
2051 os_file_type_t type;
2052 bool subdir_exists;
2053 bool success = os_file_status(subdir, &subdir_exists, &type);
2054
2055 if (success && !subdir_exists) {
2056
2057 /* Subdir does not exist, create it */
2058 dberr_t err = os_file_create_subdirs_if_needed(subdir);
2059
2060 if (err != DB_SUCCESS) {
2061
2062 ut_free(subdir);
2063
2064 return(err);
2065 }
2066
2067 success = os_file_create_directory(subdir, false);
2068 }
2069
2070 ut_free(subdir);
2071
2072 return(success ? DB_SUCCESS : DB_ERROR);
2073 }
2074
2075 /** Allocate the buffer for IO on a transparently compressed table.
2076 @param[in] type IO flags
2077 @param[out] buf buffer to read or write
2078 @param[in,out] n number of bytes to read/write, starting from
2079 offset
2080 @return pointer to allocated page, compressed data is written to the offset
2081 that is aligned on the disk sector size */
2082 static
2083 Block*
os_file_compress_page(IORequest & type,void * & buf,ulint * n)2084 os_file_compress_page(
2085 IORequest& type,
2086 void*& buf,
2087 ulint* n)
2088 {
2089 ut_ad(!type.is_log());
2090 ut_ad(type.is_write());
2091 ut_ad(type.is_compressed());
2092
2093 ulint n_alloc = *n * 2;
2094
2095 ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
2096 ut_a(type.compression_algorithm().m_type != Compression::LZ4
2097 || static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
2098
2099 Block* block = os_alloc_block();
2100
2101 ulint old_compressed_len;
2102 ulint compressed_len = *n;
2103
2104 old_compressed_len = mach_read_from_2(
2105 reinterpret_cast<byte*>(buf)
2106 + FIL_PAGE_COMPRESS_SIZE_V1);
2107
2108 if (old_compressed_len > 0) {
2109 old_compressed_len = ut_calc_align(
2110 old_compressed_len + FIL_PAGE_DATA,
2111 type.block_size());
2112 } else {
2113 old_compressed_len = *n;
2114 }
2115
2116 byte* compressed_page;
2117
2118 compressed_page = static_cast<byte*>(
2119 ut_align(block->m_ptr, os_io_ptr_align));
2120
2121 byte* buf_ptr;
2122
2123 buf_ptr = os_file_compress_page(
2124 type.compression_algorithm(),
2125 type.block_size(),
2126 reinterpret_cast<byte*>(buf),
2127 *n,
2128 compressed_page,
2129 &compressed_len);
2130
2131 if (buf_ptr != buf) {
2132 /* Set new compressed size to uncompressed page. */
2133 memcpy(reinterpret_cast<byte*>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
2134 buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
2135
2136 buf = buf_ptr;
2137 *n = compressed_len;
2138
2139 if (compressed_len >= old_compressed_len) {
2140
2141 ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
2142
2143 type.clear_punch_hole();
2144 }
2145 }
2146
2147 return(block);
2148 }
2149
2150 /** Encrypt a page content when write it to disk.
2151 @param[in] type IO flags
2152 @param[out] buf buffer to read or write
2153 @param[in,out] n number of bytes to read/write, starting from
2154 offset
2155 @return pointer to the encrypted page */
2156 static
2157 Block*
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)2158 os_file_encrypt_page(
2159 const IORequest& type,
2160 void*& buf,
2161 ulint* n)
2162 {
2163
2164 byte* encrypted_page;
2165 ulint encrypted_len = *n;
2166 byte* buf_ptr;
2167 Encryption encryption(type.encryption_algorithm());
2168
2169 ut_ad(!type.is_log());
2170 ut_ad(type.is_write());
2171 ut_ad(type.is_encrypted());
2172
2173 Block* block = os_alloc_block();
2174
2175 encrypted_page = static_cast<byte*>(
2176 ut_align(block->m_ptr, os_io_ptr_align));
2177
2178 buf_ptr = encryption.encrypt(type,
2179 reinterpret_cast<byte*>(buf), *n,
2180 encrypted_page, &encrypted_len);
2181
2182 bool encrypted = buf_ptr != buf;
2183
2184 if (encrypted) {
2185
2186 buf = buf_ptr;
2187 *n = encrypted_len;
2188 }
2189
2190 return(block);
2191 }
2192
2193 #ifndef _WIN32
2194
2195 /** Do the read/write
2196 @param[in] request The IO context and type
2197 @return the number of bytes read/written or negative value on error */
2198 ssize_t
execute(const IORequest & request)2199 SyncFileIO::execute(const IORequest& request)
2200 {
2201 ssize_t n_bytes;
2202
2203 if (request.is_read()) {
2204 n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2205 } else {
2206 ut_ad(request.is_write());
2207 n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2208 }
2209
2210 return(n_bytes);
2211 }
2212
2213 /** Free storage space associated with a section of the file.
2214 @param[in] fh Open file handle
2215 @param[in] off Starting offset (SEEK_SET)
2216 @param[in] len Size of the hole
2217 @return DB_SUCCESS or error code */
2218 static
2219 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2220 os_file_punch_hole_posix(
2221 os_file_t fh,
2222 os_offset_t off,
2223 os_offset_t len)
2224 {
2225 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2226 const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2227
2228 int ret = fallocate(fh, mode, off, len);
2229
2230 if (ret == 0) {
2231 return(DB_SUCCESS);
2232 }
2233
2234 ut_a(ret == -1);
2235
2236 if (errno == ENOTSUP) {
2237 return(DB_IO_NO_PUNCH_HOLE);
2238 }
2239
2240 ib::warn()
2241 << "fallocate(" << fh
2242 <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2243 << off << ", " << len << ") returned errno: "
2244 << errno;
2245
2246 return(DB_IO_ERROR);
2247
2248 #elif defined(UNIV_SOLARIS)
2249
2250 // Use F_FREESP
2251
2252 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2253
2254 return(DB_IO_NO_PUNCH_HOLE);
2255 }
2256
2257 #if defined(LINUX_NATIVE_AIO)
2258
2259 /** Linux native AIO handler */
2260 class LinuxAIOHandler {
2261 public:
2262 /**
2263 @param[in] global_segment The global segment*/
LinuxAIOHandler(ulint global_segment)2264 LinuxAIOHandler(ulint global_segment)
2265 :
2266 m_global_segment(global_segment)
2267 {
2268 /* Should never be doing Sync IO here. */
2269 ut_a(m_global_segment != ULINT_UNDEFINED);
2270
2271 /* Find the array and the local segment. */
2272
2273 m_segment = AIO::get_array_and_local_segment(
2274 &m_array, m_global_segment);
2275
2276 m_n_slots = m_array->slots_per_segment();
2277 }
2278
2279 /** Destructor */
~LinuxAIOHandler()2280 ~LinuxAIOHandler()
2281 {
2282 // No op
2283 }
2284
2285 /**
2286 Process a Linux AIO request
2287 @param[out] m1 the messages passed with the
2288 @param[out] m2 AIO request; note that in case the
2289 AIO operation failed, these output
2290 parameters are valid and can be used to
2291 restart the operation.
2292 @param[out] request IO context
2293 @return DB_SUCCESS or error code */
2294 dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
2295
2296 private:
2297 /** Resubmit an IO request that was only partially successful
2298 @param[in,out] slot Request to resubmit
2299 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2300 dberr_t resubmit(Slot* slot);
2301
2302 /** Check if the AIO succeeded
2303 @param[in,out] slot The slot to check
2304 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2305 DB_IO_ERROR on all other errors */
2306 dberr_t check_state(Slot* slot);
2307
2308 /** @return true if a shutdown was detected */
is_shutdown() const2309 bool is_shutdown() const
2310 {
2311 return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2312 && !buf_page_cleaner_is_active);
2313 }
2314
2315 /** If no slot was found then the m_array->m_mutex will be released.
2316 @param[out] n_pending The number of pending IOs
2317 @return NULL or a slot that has completed IO */
2318 Slot* find_completed_slot(ulint* n_pending);
2319
2320 /** This is called from within the IO-thread. If there are no completed
2321 IO requests in the slot array, the thread calls this function to
2322 collect more requests from the Linux kernel.
2323 The IO-thread waits on io_getevents(), which is a blocking call, with
2324 a timeout value. Unless the system is very heavy loaded, keeping the
2325 IO-thread very busy, the io-thread will spend most of its time waiting
2326 in this function.
2327 The IO-thread also exits in this function. It checks server status at
2328 each wakeup and that is why we use timed wait in io_getevents(). */
2329 void collect();
2330
2331 private:
2332 /** Slot array */
2333 AIO* m_array;
2334
2335 /** Number of slots inthe local segment */
2336 ulint m_n_slots;
2337
2338 /** The local segment to check */
2339 ulint m_segment;
2340
2341 /** The global segment */
2342 ulint m_global_segment;
2343 };
2344
2345 /** Resubmit an IO request that was only partially successful
2346 @param[in,out] slot Request to resubmit
2347 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2348 dberr_t
resubmit(Slot * slot)2349 LinuxAIOHandler::resubmit(Slot* slot)
2350 {
2351 #ifdef UNIV_DEBUG
2352 /* Bytes already read/written out */
2353 ulint n_bytes = slot->ptr - slot->buf;
2354
2355 ut_ad(m_array->is_mutex_owned());
2356
2357 ut_ad(n_bytes < slot->original_len);
2358 ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2359 /* Partial read or write scenario */
2360 ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2361 #endif /* UNIV_DEBUG */
2362
2363 slot->len -= slot->n_bytes;
2364 slot->ptr += slot->n_bytes;
2365 slot->offset += slot->n_bytes;
2366
2367 /* Resetting the bytes read/written */
2368 slot->n_bytes = 0;
2369 slot->io_already_done = false;
2370
2371 /* make sure that slot->offset fits in off_t */
2372 ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2373
2374 struct iocb* iocb = &slot->control;
2375 if (slot->type.is_read()) {
2376 io_prep_pread(
2377 iocb,
2378 slot->file.m_file,
2379 slot->ptr,
2380 slot->len,
2381 slot->offset);
2382
2383 } else {
2384
2385 ut_a(slot->type.is_write());
2386
2387 io_prep_pwrite(
2388 iocb,
2389 slot->file.m_file,
2390 slot->ptr,
2391 slot->len,
2392 slot->offset);
2393 }
2394
2395 iocb->data = slot;
2396
2397 /* Resubmit an I/O request */
2398 int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2399
2400 if (ret < -1) {
2401 errno = -ret;
2402 }
2403
2404 return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2405 }
2406
2407 /** Check if the AIO succeeded
2408 @param[in,out] slot The slot to check
2409 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2410 DB_IO_ERROR on all other errors */
2411 dberr_t
check_state(Slot * slot)2412 LinuxAIOHandler::check_state(Slot* slot)
2413 {
2414 ut_ad(m_array->is_mutex_owned());
2415
2416 /* Note that it may be that there is more then one completed
2417 IO requests. We process them one at a time. We may have a case
2418 here to improve the performance slightly by dealing with all
2419 requests in one sweep. */
2420
2421 srv_set_io_thread_op_info(
2422 m_global_segment, "processing completed aio requests");
2423
2424 ut_ad(slot->io_already_done);
2425
2426 dberr_t err;
2427
2428 if (slot->ret == 0) {
2429
2430 err = AIOHandler::post_io_processing(slot);
2431
2432 } else {
2433 errno = -slot->ret;
2434
2435 /* os_file_handle_error does tell us if we should retry
2436 this IO. As it stands now, we don't do this retry when
2437 reaping requests from a different context than
2438 the dispatcher. This non-retry logic is the same for
2439 Windows and Linux native AIO.
2440 We should probably look into this to transparently
2441 re-submit the IO. */
2442 os_file_handle_error(slot->name, "Linux aio");
2443
2444 err = DB_IO_ERROR;
2445 }
2446
2447 return(err);
2448 }
2449
2450 /** If no slot was found then the m_array->m_mutex will be released.
2451 @param[out] n_pending The number of pending IOs
2452 @return NULL or a slot that has completed IO */
2453 Slot*
find_completed_slot(ulint * n_pending)2454 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
2455 {
2456 ulint offset = m_n_slots * m_segment;
2457
2458 *n_pending = 0;
2459
2460 m_array->acquire();
2461
2462 Slot* slot = m_array->at(offset);
2463
2464 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2465
2466 if (slot->is_reserved) {
2467
2468 ++*n_pending;
2469
2470 if (slot->io_already_done) {
2471
2472 /* Something for us to work on.
2473 Note: We don't release the mutex. */
2474 return(slot);
2475 }
2476 }
2477 }
2478
2479 m_array->release();
2480
2481 return(NULL);
2482 }
2483
2484 /** This function is only used in Linux native asynchronous i/o. This is
2485 called from within the io-thread. If there are no completed IO requests
2486 in the slot array, the thread calls this function to collect more
2487 requests from the kernel.
2488 The io-thread waits on io_getevents(), which is a blocking call, with
2489 a timeout value. Unless the system is very heavy loaded, keeping the
2490 io-thread very busy, the io-thread will spend most of its time waiting
2491 in this function.
2492 The io-thread also exits in this function. It checks server status at
2493 each wakeup and that is why we use timed wait in io_getevents(). */
2494 void
collect()2495 LinuxAIOHandler::collect()
2496 {
2497 ut_ad(m_n_slots > 0);
2498 ut_ad(m_array != NULL);
2499 ut_ad(m_segment < m_array->get_n_segments());
2500
2501 /* Which io_context we are going to use. */
2502 io_context* io_ctx = m_array->io_ctx(m_segment);
2503
2504 /* Starting point of the m_segment we will be working on. */
2505 ulint start_pos = m_segment * m_n_slots;
2506
2507 /* End point. */
2508 ulint end_pos = start_pos + m_n_slots;
2509
2510 for (;;) {
2511 struct io_event* events;
2512
2513 /* Which part of event array we are going to work on. */
2514 events = m_array->io_events(m_segment * m_n_slots);
2515
2516 /* Initialize the events. */
2517 memset(events, 0, sizeof(*events) * m_n_slots);
2518
2519 /* The timeout value is arbitrary. We probably need
2520 to experiment with it a little. */
2521 struct timespec timeout;
2522
2523 timeout.tv_sec = 0;
2524 timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
2525
2526 int ret;
2527
2528 ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);
2529
2530 for (int i = 0; i < ret; ++i) {
2531
2532 struct iocb* iocb;
2533
2534 iocb = reinterpret_cast<struct iocb*>(events[i].obj);
2535 ut_a(iocb != NULL);
2536
2537 Slot* slot = reinterpret_cast<Slot*>(iocb->data);
2538
2539 /* Some sanity checks. */
2540 ut_a(slot != NULL);
2541 ut_a(slot->is_reserved);
2542
2543 /* We are not scribbling previous segment. */
2544 ut_a(slot->pos >= start_pos);
2545
2546 /* We have not overstepped to next segment. */
2547 ut_a(slot->pos < end_pos);
2548
2549 /* We never compress/decompress the first page */
2550
2551 if (slot->offset > 0
2552 && !slot->skip_punch_hole
2553 && slot->type.is_compression_enabled()
2554 && !slot->type.is_log()
2555 && slot->type.is_write()
2556 && slot->type.is_compressed()
2557 && slot->type.punch_hole()) {
2558
2559 slot->err = AIOHandler::io_complete(slot);
2560 } else {
2561 slot->err = DB_SUCCESS;
2562 }
2563
2564 /* Mark this request as completed. The error handling
2565 will be done in the calling function. */
2566 m_array->acquire();
2567
2568 /* events[i].res2 should always be ZERO */
2569 ut_ad(events[i].res2 == 0);
2570 slot->io_already_done = true;
2571
2572 /*Even though events[i].res is an unsigned number
2573 in libaio, it is used to return a negative value
2574 (negated errno value) to indicate error and a positive
2575 value to indicate number of bytes read or written. */
2576
2577 if (events[i].res > slot->len) {
2578 /* failure */
2579 slot->n_bytes = 0;
2580 slot->ret = events[i].res;
2581 } else {
2582 /* success */
2583 slot->n_bytes = events[i].res;
2584 slot->ret = 0;
2585 }
2586 m_array->release();
2587 }
2588
2589 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2590 || !buf_page_cleaner_is_active
2591 || ret > 0) {
2592
2593 break;
2594 }
2595
2596 /* This error handling is for any error in collecting the
2597 IO requests. The errors, if any, for any particular IO
2598 request are simply passed on to the calling routine. */
2599
2600 switch (ret) {
2601 case -EAGAIN:
2602 /* Not enough resources! Try again. */
2603
2604 case -EINTR:
2605 /* Interrupted! The behaviour in case of an interrupt.
2606 If we have some completed IOs available then the
2607 return code will be the number of IOs. We get EINTR
2608 only if there are no completed IOs and we have been
2609 interrupted. */
2610
2611 case 0:
2612 /* No pending request! Go back and check again. */
2613
2614 continue;
2615 }
2616
2617 /* All other errors should cause a trap for now. */
2618 ib::fatal()
2619 << "Unexpected ret_code[" << ret
2620 << "] from io_getevents()!";
2621
2622 break;
2623 }
2624 }
2625
2626 /** Process a Linux AIO request
2627 @param[out] m1 the messages passed with the
2628 @param[out] m2 AIO request; note that in case the
2629 AIO operation failed, these output
2630 parameters are valid and can be used to
2631 restart the operation.
2632 @param[out] request IO context
2633 @return DB_SUCCESS or error code */
2634 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)2635 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
2636 {
2637 dberr_t err;
2638 Slot* slot;
2639
2640 /* Loop until we have found a completed request. */
2641 for (;;) {
2642
2643 ulint n_pending;
2644
2645 slot = find_completed_slot(&n_pending);
2646
2647 if (slot != NULL) {
2648
2649 ut_ad(m_array->is_mutex_owned());
2650
2651 err = check_state(slot);
2652
2653 /* DB_FAIL is not a hard error, we should retry */
2654 if (err != DB_FAIL) {
2655 break;
2656 }
2657
2658 /* Partial IO, resubmit request for
2659 remaining bytes to read/write */
2660 err = resubmit(slot);
2661
2662 if (err != DB_SUCCESS) {
2663 break;
2664 }
2665
2666 m_array->release();
2667
2668 } else if (is_shutdown() && n_pending == 0) {
2669
2670 /* There is no completed request. If there is
2671 no pending request at all, and the system is
2672 being shut down, exit. */
2673
2674 *m1 = NULL;
2675 *m2 = NULL;
2676
2677 return(DB_SUCCESS);
2678
2679 } else {
2680
2681 /* Wait for some request. Note that we return
2682 from wait if we have found a request. */
2683
2684 srv_set_io_thread_op_info(
2685 m_global_segment,
2686 "waiting for completed aio requests");
2687
2688 collect();
2689 }
2690 }
2691
2692 if (err == DB_IO_PARTIAL_FAILED) {
2693 /* Aborting in case of submit failure */
2694 ib::fatal()
2695 << "Native Linux AIO interface. "
2696 "io_submit() call failed when "
2697 "resubmitting a partial I/O "
2698 "request on the file " << slot->name
2699 << ".";
2700 }
2701
2702 *m1 = slot->m1;
2703 *m2 = slot->m2;
2704
2705 *request = slot->type;
2706
2707 m_array->release(slot);
2708
2709 m_array->release();
2710
2711 return(err);
2712 }
2713
2714 /** This function is only used in Linux native asynchronous i/o.
2715 Waits for an aio operation to complete. This function is used to wait for
2716 the completed requests. The aio array of pending requests is divided
2717 into segments. The thread specifies which segment or slot it wants to wait
2718 for. NOTE: this function will also take care of freeing the aio slot,
2719 therefore no other thread is allowed to do the freeing!
2720
2721 @param[in] global_seg segment number in the aio array
2722 to wait for; segment 0 is the ibuf
2723 i/o thread, segment 1 is log i/o thread,
2724 then follow the non-ibuf read threads,
2725 and the last are the non-ibuf write
2726 threads.
2727 @param[out] m1 the messages passed with the
2728 @param[out] m2 AIO request; note that in case the
2729 AIO operation failed, these output
2730 parameters are valid and can be used to
2731 restart the operation.
2732 @param[out]xi request IO context
2733 @return DB_SUCCESS if the IO was successful */
2734 static
2735 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)2736 os_aio_linux_handler(
2737 ulint global_segment,
2738 fil_node_t** m1,
2739 void** m2,
2740 IORequest* request)
2741 {
2742 LinuxAIOHandler handler(global_segment);
2743
2744 dberr_t err = handler.poll(m1, m2, request);
2745
2746 if (err == DB_IO_NO_PUNCH_HOLE) {
2747 fil_no_punch_hole(*m1);
2748 err = DB_SUCCESS;
2749 }
2750
2751 return(err);
2752 }
2753
2754 /** Dispatch an AIO request to the kernel.
2755 @param[in,out] slot an already reserved slot
2756 @return true on success. */
2757 bool
linux_dispatch(Slot * slot)2758 AIO::linux_dispatch(Slot* slot)
2759 {
2760 ut_a(slot->is_reserved);
2761 ut_ad(slot->type.validate());
2762
2763 /* Find out what we are going to work with.
2764 The iocb struct is directly in the slot.
2765 The io_context is one per segment. */
2766
2767 ulint io_ctx_index;
2768 struct iocb* iocb = &slot->control;
2769
2770 io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2771
2772 int ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
2773
2774 /* io_submit() returns number of successfully queued requests
2775 or -errno. */
2776
2777 if (ret != 1) {
2778 errno = -ret;
2779 }
2780
2781 return(ret == 1);
2782 }
2783
2784 /** Creates an io_context for native linux AIO.
2785 @param[in] max_events number of events
2786 @param[out] io_ctx io_ctx to initialize.
2787 @return true on success. */
2788 bool
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)2789 AIO::linux_create_io_ctx(
2790 ulint max_events,
2791 io_context_t* io_ctx)
2792 {
2793 ssize_t n_retries = 0;
2794
2795 for (;;) {
2796
2797 memset(io_ctx, 0x0, sizeof(*io_ctx));
2798
2799 /* Initialize the io_ctx. Tell it how many pending
2800 IO requests this context will handle. */
2801
2802 int ret = io_setup(max_events, io_ctx);
2803
2804 if (ret == 0) {
2805 /* Success. Return now. */
2806 return(true);
2807 }
2808
2809 /* If we hit EAGAIN we'll make a few attempts before failing. */
2810
2811 switch (ret) {
2812 case -EAGAIN:
2813 if (n_retries == 0) {
2814 /* First time around. */
2815 ib::warn()
2816 << "io_setup() failed with EAGAIN."
2817 " Will make "
2818 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2819 << " attempts before giving up.";
2820 }
2821
2822 if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2823
2824 ++n_retries;
2825
2826 ib::warn()
2827 << "io_setup() attempt "
2828 << n_retries << ".";
2829
2830 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2831
2832 continue;
2833 }
2834
2835 /* Have tried enough. Better call it a day. */
2836 ib::error()
2837 << "io_setup() failed with EAGAIN after "
2838 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2839 << " attempts.";
2840 break;
2841
2842 case -ENOSYS:
2843 ib::error()
2844 << "Linux Native AIO interface"
2845 " is not supported on this platform. Please"
2846 " check your OS documentation and install"
2847 " appropriate binary of InnoDB.";
2848
2849 break;
2850
2851 default:
2852 ib::error()
2853 << "Linux Native AIO setup"
2854 << " returned following error["
2855 << ret << "]";
2856 break;
2857 }
2858
2859 ib::info()
2860 << "You can disable Linux Native AIO by"
2861 " setting innodb_use_native_aio = 0 in my.cnf";
2862
2863 break;
2864 }
2865
2866 return(false);
2867 }
2868
2869 /** Checks if the system supports native linux aio. On some kernel
2870 versions where native aio is supported it won't work on tmpfs. In such
2871 cases we can't use native aio as it is not possible to mix simulated
2872 and native aio.
2873 @return: true if supported, false otherwise. */
2874 bool
is_linux_native_aio_supported()2875 AIO::is_linux_native_aio_supported()
2876 {
2877 int fd;
2878 io_context_t io_ctx;
2879 char name[1000];
2880
2881 if (!linux_create_io_ctx(1, &io_ctx)) {
2882
2883 /* The platform does not support native aio. */
2884
2885 return(false);
2886
2887 } else if (!srv_read_only_mode) {
2888
2889 /* Now check if tmpdir supports native aio ops. */
2890 fd = innobase_mysql_tmpfile(NULL);
2891
2892 if (fd < 0) {
2893 ib::warn()
2894 << "Unable to create temp file to check"
2895 " native AIO support.";
2896
2897 return(false);
2898 }
2899 } else {
2900
2901 os_normalize_path(srv_log_group_home_dir);
2902
2903 ulint dirnamelen = strlen(srv_log_group_home_dir);
2904
2905 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2906
2907 memcpy(name, srv_log_group_home_dir, dirnamelen);
2908
2909 /* Add a path separator if needed. */
2910 if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2911
2912 name[dirnamelen++] = OS_PATH_SEPARATOR;
2913 }
2914
2915 strcpy(name + dirnamelen, "ib_logfile0");
2916
2917 fd = ::open(name, O_RDONLY);
2918
2919 if (fd == -1) {
2920
2921 ib::warn()
2922 << "Unable to open"
2923 << " \"" << name << "\" to check native"
2924 << " AIO read support.";
2925
2926 return(false);
2927 }
2928 }
2929
2930 struct io_event io_event;
2931
2932 memset(&io_event, 0x0, sizeof(io_event));
2933
2934 byte* buf = static_cast<byte*>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
2935 byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
2936
2937 struct iocb iocb;
2938
2939 /* Suppress valgrind warning. */
2940 memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
2941 memset(&iocb, 0x0, sizeof(iocb));
2942
2943 struct iocb* p_iocb = &iocb;
2944
2945 if (!srv_read_only_mode) {
2946
2947 io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
2948
2949 } else {
2950 ut_a(UNIV_PAGE_SIZE >= 512);
2951 io_prep_pread(p_iocb, fd, ptr, 512, 0);
2952 }
2953
2954 int err = io_submit(io_ctx, 1, &p_iocb);
2955
2956 if (err >= 1) {
2957 /* Now collect the submitted IO request. */
2958 err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2959 }
2960
2961 ut_free(buf);
2962 close(fd);
2963
2964 switch (err) {
2965 case 1:
2966 return(true);
2967
2968 case -EINVAL:
2969 case -ENOSYS:
2970 ib::error()
2971 << "Linux Native AIO not supported. You can either"
2972 " move "
2973 << (srv_read_only_mode ? name : "tmpdir")
2974 << " to a file system that supports native"
2975 " AIO or you can set innodb_use_native_aio to"
2976 " FALSE to avoid this message.";
2977
2978 /* fall through. */
2979 default:
2980 ib::error()
2981 << "Linux Native AIO check on "
2982 << (srv_read_only_mode ? name : "tmpdir")
2983 << "returned error[" << -err << "]";
2984 }
2985
2986 return(false);
2987 }
2988
2989 #endif /* LINUX_NATIVE_AIO */
2990
2991 /** Retrieves the last error number if an error occurs in a file io function.
2992 The number should be retrieved before any other OS calls (because they may
2993 overwrite the error number). If the number is not known to this program,
2994 the OS error number + 100 is returned.
2995 @param[in] report_all_errors true if we want an error message
2996 printed of all errors
2997 @param[in] on_error_silent true then don't print any diagnostic
2998 to the log
2999 @return error number, or OS error number + 100 */
3000 static
3001 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3002 os_file_get_last_error_low(
3003 bool report_all_errors,
3004 bool on_error_silent)
3005 {
3006 int err = errno;
3007
3008 if (err == 0) {
3009 return(0);
3010 }
3011
3012 if (report_all_errors
3013 || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
3014
3015 ib::error()
3016 << "Operating system error number "
3017 << err
3018 << " in a file operation.";
3019
3020 if (err == ENOENT) {
3021
3022 ib::error()
3023 << "The error means the system"
3024 " cannot find the path specified.";
3025
3026 if (srv_is_being_started) {
3027
3028 ib::error()
3029 << "If you are installing InnoDB,"
3030 " remember that you must create"
3031 " directories yourself, InnoDB"
3032 " does not create them.";
3033 }
3034 } else if (err == EACCES) {
3035
3036 ib::error()
3037 << "The error means mysqld does not have"
3038 " the access rights to the directory.";
3039
3040 } else {
3041 if (strerror(err) != NULL) {
3042
3043 ib::error()
3044 << "Error number " << err << " means '"
3045 << strerror(err) << "'";
3046 }
3047
3048 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3049 }
3050 }
3051
3052 switch (err) {
3053 case ENOSPC:
3054 return(OS_FILE_DISK_FULL);
3055 case ENOENT:
3056 return(OS_FILE_NOT_FOUND);
3057 case EEXIST:
3058 return(OS_FILE_ALREADY_EXISTS);
3059 case EXDEV:
3060 case ENOTDIR:
3061 case EISDIR:
3062 return(OS_FILE_PATH_ERROR);
3063 case EAGAIN:
3064 if (srv_use_native_aio) {
3065 return(OS_FILE_AIO_RESOURCES_RESERVED);
3066 }
3067 break;
3068 case EINTR:
3069 if (srv_use_native_aio) {
3070 return(OS_FILE_AIO_INTERRUPTED);
3071 }
3072 break;
3073 case EACCES:
3074 return(OS_FILE_ACCESS_VIOLATION);
3075 }
3076 return(OS_FILE_ERROR_MAX + err);
3077 }
3078
3079 /** Wrapper to fsync(2) that retries the call on some errors.
3080 Returns the value 0 if successful; otherwise the value -1 is returned and
3081 the global variable errno is set to indicate the error.
3082 @param[in] file open file handle
3083 @return 0 if success, -1 otherwise */
3084 static
3085 int
os_file_fsync_posix(os_file_t file)3086 os_file_fsync_posix(
3087 os_file_t file)
3088 {
3089 ulint failures = 0;
3090
3091 for (;;) {
3092
3093 ++os_n_fsyncs;
3094
3095 int ret = fsync(file);
3096
3097 if (ret == 0) {
3098 return(ret);
3099 }
3100
3101 switch(errno) {
3102 case ENOLCK:
3103
3104 ++failures;
3105 ut_a(failures < 1000);
3106
3107 if (!(failures % 100)) {
3108
3109 ib::warn()
3110 << "fsync(): "
3111 << "No locks available; retrying";
3112 }
3113
3114 /* 0.2 sec */
3115 os_thread_sleep(200000);
3116 break;
3117
3118 case EIO:
3119
3120 ib::fatal()
3121 << "fsync() returned EIO, aborting.";
3122 break;
3123
3124 case EINTR:
3125
3126 ++failures;
3127 ut_a(failures < 2000);
3128 break;
3129
3130 default:
3131 ut_error;
3132 break;
3133 }
3134 }
3135
3136 ut_error;
3137
3138 return(-1);
3139 }
3140
3141 /** Check the existence and type of the given file.
3142 @param[in] path path name of file
3143 @param[out] exists true if the file exists
3144 @param[out] type Type of the file, if it exists
3145 @return true if call succeeded */
3146 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)3147 os_file_status_posix(
3148 const char* path,
3149 bool* exists,
3150 os_file_type_t* type)
3151 {
3152 struct stat statinfo;
3153
3154 int ret = stat(path, &statinfo);
3155
3156 *exists = !ret;
3157
3158 if (!ret) {
3159 /* file exists, everything OK */
3160
3161 } else if (errno == ENOENT || errno == ENOTDIR
3162 || errno == ENAMETOOLONG) {
3163 /* file does not exist */
3164 return(true);
3165
3166 } else {
3167 /* file exists, but stat call failed */
3168 os_file_handle_error_no_exit(path, "stat", false);
3169 return(false);
3170 }
3171
3172 if (S_ISDIR(statinfo.st_mode)) {
3173 *type = OS_FILE_TYPE_DIR;
3174
3175 } else if (S_ISLNK(statinfo.st_mode)) {
3176 *type = OS_FILE_TYPE_LINK;
3177
3178 } else if (S_ISREG(statinfo.st_mode)) {
3179 *type = OS_FILE_TYPE_FILE;
3180
3181 } else {
3182 *type = OS_FILE_TYPE_UNKNOWN;
3183 }
3184
3185 return(true);
3186 }
3187
3188 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3189 function!
3190 Flushes the write buffers of a given file to the disk.
3191 @param[in] file handle to a file
3192 @return true if success */
3193 bool
os_file_flush_func(os_file_t file)3194 os_file_flush_func(
3195 os_file_t file)
3196 {
3197 int ret;
3198
3199 ret = os_file_fsync_posix(file);
3200
3201 if (ret == 0) {
3202 return(true);
3203 }
3204
3205 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
3206 we choose to ignore that error if we are using raw disks */
3207
3208 if (srv_start_raw_disk_in_use && errno == EINVAL) {
3209
3210 return(true);
3211 }
3212
3213 ib::error() << "The OS said file flush did not succeed";
3214
3215 os_file_handle_error(NULL, "flush");
3216
3217 /* It is a fatal error if a file flush does not succeed, because then
3218 the database can get corrupt on disk */
3219 ut_error;
3220
3221 return(false);
3222 }
3223
3224 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3225 this function!
3226 A simple function to open or create a file.
3227 @param[in] name name of the file or path as a null-terminated
3228 string
3229 @param[in] create_mode create mode
3230 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3231 @param[in] read_only if true, read only checks are enforced
3232 @param[out] success true if succeed, false if error
3233 @return handle to the file, not defined if error, error number
3234 can be retrieved with os_file_get_last_error */
3235 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3236 os_file_create_simple_func(
3237 const char* name,
3238 ulint create_mode,
3239 ulint access_type,
3240 bool read_only,
3241 bool* success)
3242 {
3243 pfs_os_file_t file;
3244
3245 *success = false;
3246
3247 int create_flag;
3248
3249 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3250 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3251
3252 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
3253 WAIT_ALLOW_WRITES();
3254 if (create_mode == OS_FILE_OPEN) {
3255
3256 if (access_type == OS_FILE_READ_ONLY) {
3257
3258 create_flag = O_RDONLY;
3259
3260 } else if (read_only) {
3261
3262 create_flag = O_RDONLY;
3263
3264 } else {
3265 create_flag = O_RDWR;
3266 }
3267
3268 } else if (read_only) {
3269
3270 create_flag = O_RDONLY;
3271
3272 } else if (create_mode == OS_FILE_CREATE) {
3273
3274 create_flag = O_RDWR | O_CREAT | O_EXCL;
3275
3276 } else if (create_mode == OS_FILE_CREATE_PATH) {
3277
3278 /* Create subdirs along the path if needed. */
3279
3280 *success = os_file_create_subdirs_if_needed(name);
3281
3282 if (!*success) {
3283
3284 ib::error()
3285 << "Unable to create subdirectories '"
3286 << name << "'";
3287
3288 file.m_file = OS_FILE_CLOSED;
3289 return(file);
3290 }
3291
3292 create_flag = O_RDWR | O_CREAT | O_EXCL;
3293 create_mode = OS_FILE_CREATE;
3294 } else {
3295
3296 ib::error()
3297 << "Unknown file create mode ("
3298 << create_mode
3299 << " for file '" << name << "'";
3300
3301 file.m_file = OS_FILE_CLOSED;
3302 return(file);
3303 }
3304
3305 bool retry;
3306
3307 do {
3308 file.m_file = ::open(name, create_flag, os_innodb_umask);
3309
3310 if (file.m_file == -1) {
3311 *success = false;
3312
3313 retry = os_file_handle_error(
3314 name,
3315 create_mode == OS_FILE_OPEN
3316 ? "open" : "create");
3317 } else {
3318 *success = true;
3319 retry = false;
3320 }
3321
3322 } while (retry);
3323
3324 #ifdef USE_FILE_LOCK
3325 if (!read_only
3326 && *success
3327 && access_type == OS_FILE_READ_WRITE
3328 && os_file_lock(file.m_file, name)) {
3329
3330 *success = false;
3331 close(file.m_file);
3332 file.m_file = -1;
3333 }
3334 #endif /* USE_FILE_LOCK */
3335
3336 return(file);
3337 }
3338
3339 /** This function attempts to create a directory named pathname. The new
3340 directory gets default permissions. On Unix the permissions are
3341 (0770 & ~umask). If the directory exists already, nothing is done and
3342 the call succeeds, unless the fail_if_exists arguments is true.
3343 If another error occurs, such as a permission error, this does not crash,
3344 but reports the error and returns false.
3345 @param[in] pathname directory name as null-terminated string
3346 @param[in] fail_if_exists if true, pre-existing directory is treated as
3347 an error.
3348 @return true if call succeeds, false on error */
3349 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3350 os_file_create_directory(
3351 const char* pathname,
3352 bool fail_if_exists)
3353 {
3354 WAIT_ALLOW_WRITES();
3355 int rcode = mkdir(pathname, 0770);
3356
3357 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3358 /* failure */
3359 os_file_handle_error_no_exit(pathname, "mkdir", false);
3360
3361 return(false);
3362 }
3363
3364 return(true);
3365 }
3366
3367 /**
3368 The os_file_opendir() function opens a directory stream corresponding to the
3369 directory named by the dirname argument. The directory stream is positioned
3370 at the first entry. In both Unix and Windows we automatically skip the '.'
3371 and '..' items at the start of the directory listing.
3372 @param[in] dirname directory name; it must not contain a trailing
3373 '\' or '/'
3374 @param[in] is_fatal true if we should treat an error as a fatal
3375 error; if we try to open symlinks then we do
3376 not wish a fatal error if it happens not to be
3377 a directory
3378 @return directory stream, NULL if error */
3379 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3380 os_file_opendir(
3381 const char* dirname,
3382 bool error_is_fatal)
3383 {
3384 os_file_dir_t dir;
3385 dir = opendir(dirname);
3386
3387 if (dir == NULL && error_is_fatal) {
3388 os_file_handle_error(dirname, "opendir");
3389 }
3390
3391 return(dir);
3392 }
3393
3394 /** Closes a directory stream.
3395 @param[in] dir directory stream
3396 @return 0 if success, -1 if failure */
3397 int
os_file_closedir(os_file_dir_t dir)3398 os_file_closedir(
3399 os_file_dir_t dir)
3400 {
3401 int ret = closedir(dir);
3402
3403 if (ret != 0) {
3404 os_file_handle_error_no_exit(NULL, "closedir", false);
3405 }
3406
3407 return(ret);
3408 }
3409
3410 /** This function returns information of the next file in the directory. We jump
3411 over the '.' and '..' entries in the directory.
3412 @param[in] dirname directory name or path
3413 @param[in] dir directory stream
3414 @param[out] info buffer where the info is returned
3415 @return 0 if ok, -1 if error, 1 if at the end of the directory */
3416 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)3417 os_file_readdir_next_file(
3418 const char* dirname,
3419 os_file_dir_t dir,
3420 os_file_stat_t* info)
3421 {
3422 struct dirent* ent;
3423 char* full_path;
3424 int ret;
3425 struct stat statinfo;
3426
3427 #ifdef HAVE_READDIR_R
3428 char dirent_buf[sizeof(struct dirent)
3429 + _POSIX_PATH_MAX + 100];
3430 /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
3431 the max file name len; but in most standards, the
3432 length is NAME_MAX; we add 100 to be even safer */
3433 #endif /* HAVE_READDIR_R */
3434
3435 next_file:
3436
3437 #ifdef HAVE_READDIR_R
3438 ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
3439
3440 if (ret != 0) {
3441
3442 ib::error()
3443 << "Cannot read directory " << dirname
3444 << " error: " << ret;
3445
3446 return(-1);
3447 }
3448
3449 if (ent == NULL) {
3450 /* End of directory */
3451
3452 return(1);
3453 }
3454
3455 ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
3456 #else
3457 ent = readdir(dir);
3458
3459 if (ent == NULL) {
3460
3461 return(1);
3462 }
3463 #endif /* HAVE_READDIR_R */
3464 ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
3465
3466 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
3467
3468 goto next_file;
3469 }
3470
3471 strcpy(info->name, ent->d_name);
3472
3473 full_path = static_cast<char*>(
3474 ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
3475
3476 sprintf(full_path, "%s/%s", dirname, ent->d_name);
3477
3478 ret = stat(full_path, &statinfo);
3479
3480 if (ret) {
3481
3482 if (errno == ENOENT) {
3483 /* readdir() returned a file that does not exist,
3484 it must have been deleted in the meantime. Do what
3485 would have happened if the file was deleted before
3486 readdir() - ignore and go to the next entry.
3487 If this is the last entry then info->name will still
3488 contain the name of the deleted file when this
3489 function returns, but this is not an issue since the
3490 caller shouldn't be looking at info when end of
3491 directory is returned. */
3492
3493 ut_free(full_path);
3494
3495 goto next_file;
3496 }
3497
3498 os_file_handle_error_no_exit(full_path, "stat", false);
3499
3500 ut_free(full_path);
3501
3502 return(-1);
3503 }
3504
3505 info->size = statinfo.st_size;
3506
3507 if (S_ISDIR(statinfo.st_mode)) {
3508 info->type = OS_FILE_TYPE_DIR;
3509 } else if (S_ISLNK(statinfo.st_mode)) {
3510 info->type = OS_FILE_TYPE_LINK;
3511 } else if (S_ISREG(statinfo.st_mode)) {
3512 info->type = OS_FILE_TYPE_FILE;
3513 } else {
3514 info->type = OS_FILE_TYPE_UNKNOWN;
3515 }
3516
3517 ut_free(full_path);
3518
3519 return(0);
3520 }
3521
3522 /** NOTE! Use the corresponding macro os_file_create(), not directly
3523 this function!
3524 Opens an existing file or creates a new.
3525 @param[in] name name of the file or path as a null-terminated
3526 string
3527 @param[in] create_mode create mode
3528 @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
3529 is desired, OS_FILE_NORMAL, if any normal file;
3530 NOTE that it also depends on type, os_aio_..
3531 and srv_.. variables whether we really use async
3532 I/O or unbuffered I/O: look in the function
3533 source code for the exact rules
3534 @param[in] type OS_DATA_FILE or OS_LOG_FILE
3535 @param[in] read_only true, if read only checks should be enforcedm
3536 @param[in] success true if succeeded
3537 @return handle to the file, not defined if error, error number
3538 can be retrieved with os_file_get_last_error */
3539 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)3540 os_file_create_func(
3541 const char* name,
3542 ulint create_mode,
3543 ulint purpose,
3544 ulint type,
3545 bool read_only,
3546 bool* success)
3547 {
3548 bool on_error_no_exit;
3549 bool on_error_silent;
3550 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
3551 WAIT_ALLOW_WRITES();
3552 pfs_os_file_t file;
3553
3554 *success = false;
3555
3556 DBUG_EXECUTE_IF(
3557 "ib_create_table_fail_disk_full",
3558 *success = false;
3559 errno = ENOSPC;
3560 file.m_file = OS_FILE_CLOSED;
3561 return(file);
3562 );
3563
3564 int create_flag;
3565 const char* mode_str = NULL;
3566
3567 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
3568 ? true : false;
3569 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
3570 ? true : false;
3571
3572 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
3573 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
3574
3575 if (create_mode == OS_FILE_OPEN
3576 || create_mode == OS_FILE_OPEN_RAW
3577 || create_mode == OS_FILE_OPEN_RETRY) {
3578
3579 mode_str = "OPEN";
3580
3581 create_flag = read_only ? O_RDONLY : O_RDWR;
3582
3583 } else if (read_only) {
3584
3585 mode_str = "OPEN";
3586
3587 create_flag = O_RDONLY;
3588
3589 } else if (create_mode == OS_FILE_CREATE) {
3590
3591 mode_str = "CREATE";
3592 create_flag = O_RDWR | O_CREAT | O_EXCL;
3593
3594 } else if (create_mode == OS_FILE_OVERWRITE) {
3595
3596 mode_str = "OVERWRITE";
3597 create_flag = O_RDWR | O_CREAT | O_TRUNC;
3598
3599 } else {
3600 ib::error()
3601 << "Unknown file create mode (" << create_mode << ")"
3602 << " for file '" << name << "'";
3603
3604 file.m_file = OS_FILE_CLOSED;
3605 return(file);
3606 }
3607
3608 ut_a(type == OS_LOG_FILE
3609 || type == OS_DATA_FILE
3610 || type == OS_DATA_TEMP_FILE);
3611
3612 ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
3613
3614 #ifdef O_SYNC
3615 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
3616 O_SYNC because the datasync options seemed to corrupt files in 2001
3617 in both Linux and Solaris */
3618
3619 if (!read_only
3620 && type == OS_LOG_FILE
3621 && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
3622
3623 create_flag |= O_SYNC;
3624 }
3625 #endif /* O_SYNC */
3626
3627 bool retry;
3628
3629 do {
3630 file.m_file = ::open(name, create_flag, os_innodb_umask);
3631
3632 if (file.m_file == -1) {
3633 const char* operation;
3634
3635 operation = (create_mode == OS_FILE_CREATE
3636 && !read_only) ? "create" : "open";
3637
3638 *success = false;
3639
3640 if (on_error_no_exit) {
3641 retry = os_file_handle_error_no_exit(
3642 name, operation, on_error_silent);
3643 } else {
3644 retry = os_file_handle_error(name, operation);
3645 }
3646 } else {
3647 *success = true;
3648 retry = false;
3649 }
3650
3651 } while (retry);
3652
3653 /* We disable OS caching (O_DIRECT) only on data files */
3654
3655 if (!read_only
3656 && *success
3657 && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE)
3658 && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
3659 || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
3660
3661 os_file_set_nocache(file.m_file, name, mode_str);
3662 }
3663
3664 #ifdef USE_FILE_LOCK
3665 if (!read_only
3666 && *success
3667 && create_mode != OS_FILE_OPEN_RAW
3668 && os_file_lock(file.m_file, name)) {
3669
3670 if (create_mode == OS_FILE_OPEN_RETRY) {
3671
3672 ib::info()
3673 << "Retrying to lock the first data file";
3674
3675 for (int i = 0; i < 100; i++) {
3676 os_thread_sleep(1000000);
3677
3678 if (!os_file_lock(file.m_file, name)) {
3679 *success = true;
3680 return(file);
3681 }
3682 }
3683
3684 ib::info()
3685 << "Unable to open the first data file";
3686 }
3687
3688 *success = false;
3689 close(file.m_file);
3690 file.m_file = -1;
3691 }
3692 #endif /* USE_FILE_LOCK */
3693
3694 return(file);
3695 }
3696
3697 /** NOTE! Use the corresponding macro
3698 os_file_create_simple_no_error_handling(), not directly this function!
3699 A simple function to open or create a file.
3700 @param[in] name name of the file or path as a null-terminated
3701 string
3702 @param[in] create_mode create mode
3703 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3704 OS_FILE_READ_ALLOW_DELETE; the last option
3705 is used by a backup program reading the file
3706 @param[in] read_only if true read only mode checks are enforced
3707 @param[out] success true if succeeded
3708 @return own: handle to the file, not defined if error, error number
3709 can be retrieved with os_file_get_last_error */
3710 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3711 os_file_create_simple_no_error_handling_func(
3712 const char* name,
3713 ulint create_mode,
3714 ulint access_type,
3715 bool read_only,
3716 bool* success)
3717 {
3718 pfs_os_file_t file;
3719 int create_flag;
3720
3721 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3722 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3723
3724 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
3725 WAIT_ALLOW_WRITES();
3726 *success = false;
3727
3728 if (create_mode == OS_FILE_OPEN) {
3729
3730 if (access_type == OS_FILE_READ_ONLY) {
3731
3732 create_flag = O_RDONLY;
3733
3734 } else if (read_only) {
3735
3736 create_flag = O_RDONLY;
3737
3738 } else {
3739
3740 ut_a(access_type == OS_FILE_READ_WRITE
3741 || access_type == OS_FILE_READ_ALLOW_DELETE);
3742
3743 create_flag = O_RDWR;
3744 }
3745
3746 } else if (read_only) {
3747
3748 create_flag = O_RDONLY;
3749
3750 } else if (create_mode == OS_FILE_CREATE) {
3751
3752 create_flag = O_RDWR | O_CREAT | O_EXCL;
3753
3754 } else {
3755
3756 ib::error()
3757 << "Unknown file create mode "
3758 << create_mode << " for file '" << name << "'";
3759 file.m_file = OS_FILE_CLOSED;
3760 return(file);
3761 }
3762
3763 file.m_file = ::open(name, create_flag, os_innodb_umask);
3764
3765 *success = (file.m_file != -1);
3766
3767 #ifdef USE_FILE_LOCK
3768 if (!read_only
3769 && *success
3770 && access_type == OS_FILE_READ_WRITE
3771 && os_file_lock(file.m_file, name)) {
3772
3773 *success = false;
3774 close(file.m_file);
3775 file.m_file = -1;
3776
3777 }
3778 #endif /* USE_FILE_LOCK */
3779
3780 return(file);
3781 }
3782
3783 /** Deletes a file if it exists. The file has to be closed before calling this.
3784 @param[in] name file path as a null-terminated string
3785 @param[out] exist indicate if file pre-exist
3786 @return true if success */
3787 bool
os_file_delete_if_exists_func(const char * name,bool * exist)3788 os_file_delete_if_exists_func(
3789 const char* name,
3790 bool* exist)
3791 {
3792 WAIT_ALLOW_WRITES();
3793 if (exist != NULL) {
3794 *exist = true;
3795 }
3796
3797 int ret = unlink(name);
3798
3799 if (ret != 0 && errno == ENOENT) {
3800 if (exist != NULL) {
3801 *exist = false;
3802 }
3803 } else if (ret != 0 && errno != ENOENT) {
3804 os_file_handle_error_no_exit(name, "delete", false);
3805
3806 return(false);
3807 }
3808
3809 return(true);
3810 }
3811
3812 /** Deletes a file. The file has to be closed before calling this.
3813 @param[in] name file path as a null-terminated string
3814 @return true if success */
3815 bool
os_file_delete_func(const char * name)3816 os_file_delete_func(
3817 const char* name)
3818 {
3819 WAIT_ALLOW_WRITES();
3820 int ret = unlink(name);
3821
3822 if (ret != 0) {
3823 os_file_handle_error_no_exit(name, "delete", false);
3824
3825 return(false);
3826 }
3827
3828 return(true);
3829 }
3830
3831 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
3832 function!
3833 Renames a file (can also move it to another directory). It is safest that the
3834 file is closed before calling this function.
3835 @param[in] oldpath old file path as a null-terminated string
3836 @param[in] newpath new file path
3837 @return true if success */
3838 bool
os_file_rename_func(const char * oldpath,const char * newpath)3839 os_file_rename_func(
3840 const char* oldpath,
3841 const char* newpath)
3842 {
3843 #ifdef UNIV_DEBUG
3844 os_file_type_t type;
3845 bool exists;
3846
3847 /* New path must not exist. */
3848 ut_ad(os_file_status(newpath, &exists, &type));
3849 ut_ad(!exists);
3850
3851 /* Old path must exist. */
3852 ut_ad(os_file_status(oldpath, &exists, &type));
3853 ut_ad(exists);
3854 #endif /* UNIV_DEBUG */
3855 WAIT_ALLOW_WRITES();
3856
3857 int ret = rename(oldpath, newpath);
3858
3859 if (ret != 0) {
3860 os_file_handle_error_no_exit(oldpath, "rename", false);
3861
3862 return(false);
3863 }
3864
3865 return(true);
3866 }
3867
3868 /** NOTE! Use the corresponding macro os_file_close(), not directly this
3869 function!
3870 Closes a file handle. In case of error, error number can be retrieved with
3871 os_file_get_last_error.
3872 @param[in] file Handle to close
3873 @return true if success */
3874 bool
os_file_close_func(os_file_t file)3875 os_file_close_func(
3876 os_file_t file)
3877 {
3878 int ret = close(file);
3879
3880 if (ret == -1) {
3881 os_file_handle_error(NULL, "close");
3882
3883 return(false);
3884 }
3885
3886 return(true);
3887 }
3888
3889 /** Gets a file size.
3890 @param[in] file handle to an open file
3891 @return file size, or (os_offset_t) -1 on failure */
3892 os_offset_t
os_file_get_size(pfs_os_file_t file)3893 os_file_get_size(
3894 pfs_os_file_t file)
3895 {
3896 /* Store current position */
3897 os_offset_t pos = lseek(file.m_file, 0, SEEK_CUR);
3898 os_offset_t file_size = lseek(file.m_file, 0, SEEK_END);
3899
3900 /* Restore current position as the function should not change it */
3901 lseek(file.m_file, pos, SEEK_SET);
3902
3903 return(file_size);
3904 }
3905
3906 /** Gets a file size.
3907 @param[in] filename Full path to the filename to check
3908 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3909 errno */
3910 os_file_size_t
os_file_get_size(const char * filename)3911 os_file_get_size(
3912 const char* filename)
3913 {
3914 struct stat s;
3915 os_file_size_t file_size;
3916
3917 int ret = stat(filename, &s);
3918
3919 if (ret == 0) {
3920 file_size.m_total_size = s.st_size;
3921 /* st_blocks is in 512 byte sized blocks */
3922 file_size.m_alloc_size = s.st_blocks * 512;
3923 } else {
3924 file_size.m_total_size = ~0;
3925 file_size.m_alloc_size = (os_offset_t) errno;
3926 }
3927
3928 return(file_size);
3929 }
3930
3931 /** This function returns information about the specified file
3932 @param[in] path pathname of the file
3933 @param[out] stat_info information of a file in a directory
3934 @param[in,out] statinfo information of a file in a directory
3935 @param[in] check_rw_perm for testing whether the file can be opened
3936 in RW mode
3937 @param[in] read_only if true read only mode checks are enforced
3938 @return DB_SUCCESS if all OK */
3939 static
3940 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)3941 os_file_get_status_posix(
3942 const char* path,
3943 os_file_stat_t* stat_info,
3944 struct stat* statinfo,
3945 bool check_rw_perm,
3946 bool read_only)
3947 {
3948 int ret = stat(path, statinfo);
3949
3950 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3951 /* file does not exist */
3952
3953 return(DB_NOT_FOUND);
3954
3955 } else if (ret) {
3956 /* file exists, but stat call failed */
3957
3958 os_file_handle_error_no_exit(path, "stat", false);
3959
3960 return(DB_FAIL);
3961 }
3962
3963 switch (statinfo->st_mode & S_IFMT) {
3964 case S_IFDIR:
3965 stat_info->type = OS_FILE_TYPE_DIR;
3966 break;
3967 case S_IFLNK:
3968 stat_info->type = OS_FILE_TYPE_LINK;
3969 break;
3970 case S_IFBLK:
3971 /* Handle block device as regular file. */
3972 case S_IFCHR:
3973 /* Handle character device as regular file. */
3974 case S_IFREG:
3975 stat_info->type = OS_FILE_TYPE_FILE;
3976 break;
3977 default:
3978 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3979 }
3980
3981 stat_info->size = statinfo->st_size;
3982 stat_info->block_size = statinfo->st_blksize;
3983 stat_info->alloc_size = statinfo->st_blocks * 512;
3984
3985 if (check_rw_perm
3986 && (stat_info->type == OS_FILE_TYPE_FILE
3987 || stat_info->type == OS_FILE_TYPE_BLOCK)) {
3988
3989 int access = !read_only ? O_RDWR : O_RDONLY;
3990 int fh = ::open(path, access, os_innodb_umask);
3991
3992 if (fh == -1) {
3993 stat_info->rw_perm = false;
3994 } else {
3995 stat_info->rw_perm = true;
3996 close(fh);
3997 }
3998 }
3999
4000 return(DB_SUCCESS);
4001 }
4002
4003 /** Truncates a file to a specified size in bytes.
4004 Do nothing if the size to preserve is greater or equal to the current
4005 size of the file.
4006 @param[in] pathname file path
4007 @param[in] file file to be truncated
4008 @param[in] size size to preserve in bytes
4009 @return true if success */
4010 static
4011 bool
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)4012 os_file_truncate_posix(
4013 const char* pathname,
4014 pfs_os_file_t file,
4015 os_offset_t size)
4016 {
4017 WAIT_ALLOW_WRITES();
4018 int res = ftruncate(file.m_file, size);
4019 if (res == -1) {
4020
4021 bool retry;
4022
4023 retry = os_file_handle_error_no_exit(
4024 pathname, "truncate", false);
4025
4026 if (retry) {
4027 ib::warn()
4028 << "Truncate failed for '"
4029 << pathname << "'";
4030 }
4031 }
4032
4033 return(res == 0);
4034 }
4035
4036 /** Truncates a file at its current position.
4037 @return true if success */
4038 bool
os_file_set_eof(FILE * file)4039 os_file_set_eof(
4040 FILE* file) /*!< in: file to be truncated */
4041 {
4042 WAIT_ALLOW_WRITES();
4043 return(!ftruncate(fileno(file), ftell(file)));
4044 }
4045
#ifdef UNIV_HOTBACKUP
/** Closes a file handle without reporting a failure to any error handler.
@param[in]	file		Handle to a file
@return true if success */
bool
os_file_close_no_error_handling(
	os_file_t	file)
{
	const int	rc = close(file);

	return(rc != -1);
}
#endif /* UNIV_HOTBACKUP */
4057
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
This variant of the function does nothing. */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	/* Intentionally empty: no op on non Windows */
	return;
}
4067
4068 #else /* !_WIN32 */
4069
4070 #include <WinIoCtl.h>
4071
4072 /** Do the read/write
4073 @param[in] request The IO context and type
4074 @return the number of bytes read/written or negative value on error */
4075 ssize_t
execute(const IORequest & request)4076 SyncFileIO::execute(const IORequest& request)
4077 {
4078 OVERLAPPED seek;
4079
4080 memset(&seek, 0x0, sizeof(seek));
4081
4082 seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
4083 seek.OffsetHigh = (DWORD) (m_offset >> 32);
4084
4085 BOOL ret;
4086 DWORD n_bytes;
4087
4088 if (request.is_read()) {
4089 ret = ReadFile(m_fh, m_buf,
4090 static_cast<DWORD>(m_n), &n_bytes, &seek);
4091
4092 } else {
4093 ut_ad(request.is_write());
4094 ret = WriteFile(m_fh, m_buf,
4095 static_cast<DWORD>(m_n), &n_bytes, &seek);
4096 }
4097
4098 return(ret ? static_cast<ssize_t>(n_bytes) : -1);
4099 }
4100
4101 /** Do the read/write
4102 @param[in,out] slot The IO slot, it has the IO context
4103 @return the number of bytes read/written or negative value on error */
4104 ssize_t
execute(Slot * slot)4105 SyncFileIO::execute(Slot* slot)
4106 {
4107 BOOL ret;
4108
4109 if (slot->type.is_read()) {
4110 ret = ReadFile(
4111 slot->file.m_file, slot->ptr, slot->len,
4112 &slot->n_bytes, &slot->control);
4113 } else {
4114 ut_ad(slot->type.is_write());
4115 ret = WriteFile(
4116 slot->file.m_file, slot->ptr, slot->len,
4117 &slot->n_bytes, &slot->control);
4118 }
4119
4120 return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
4121 }
4122
4123 /** Check if the file system supports sparse files.
4124 @param[in] name File name
4125 @return true if the file system supports sparse files */
4126 static
4127 bool
os_is_sparse_file_supported_win32(const char * filename)4128 os_is_sparse_file_supported_win32(const char* filename)
4129 {
4130 char volname[MAX_PATH];
4131 BOOL result = GetVolumePathName(filename, volname, MAX_PATH);
4132
4133 if (!result) {
4134
4135 ib::error()
4136 << "os_is_sparse_file_supported: "
4137 << "Failed to get the volume path name for: "
4138 << filename
4139 << "- OS error number " << GetLastError();
4140
4141 return(false);
4142 }
4143
4144 DWORD flags;
4145
4146 GetVolumeInformation(
4147 volname, NULL, MAX_PATH, NULL, NULL,
4148 &flags, NULL, MAX_PATH);
4149
4150 return(flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
4151 }
4152
4153 /** Free storage space associated with a section of the file.
4154 @param[in] fh Open file handle
4155 @param[in] page_size Tablespace page size
4156 @param[in] block_size File system block size
4157 @param[in] off Starting offset (SEEK_SET)
4158 @param[in] len Size of the hole
4159 @return 0 on success or errno */
4160 static
4161 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)4162 os_file_punch_hole_win32(
4163 os_file_t fh,
4164 os_offset_t off,
4165 os_offset_t len)
4166 {
4167 FILE_ZERO_DATA_INFORMATION punch;
4168
4169 punch.FileOffset.QuadPart = off;
4170 punch.BeyondFinalZero.QuadPart = off + len;
4171
4172 /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
4173 therefore we pass a dummy parameter. */
4174 DWORD temp;
4175
4176 BOOL result = DeviceIoControl(
4177 fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
4178 NULL, 0, &temp, NULL);
4179
4180 return(!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
4181 }
4182
4183 /** Check the existence and type of the given file.
4184 @param[in] path path name of file
4185 @param[out] exists true if the file exists
4186 @param[out] type Type of the file, if it exists
4187 @return true if call succeeded */
4188 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)4189 os_file_status_win32(
4190 const char* path,
4191 bool* exists,
4192 os_file_type_t* type)
4193 {
4194 int ret;
4195 struct _stat64 statinfo;
4196
4197 ret = _stat64(path, &statinfo);
4198
4199 *exists = !ret;
4200
4201 if (!ret) {
4202 /* file exists, everything OK */
4203
4204 } else if (errno == ENOENT || errno == ENOTDIR
4205 || errno == ENAMETOOLONG) {
4206 /* file does not exist */
4207 return(true);
4208
4209 } else {
4210 /* file exists, but stat call failed */
4211 os_file_handle_error_no_exit(path, "stat", false);
4212 return(false);
4213 }
4214
4215 if (_S_IFDIR & statinfo.st_mode) {
4216 *type = OS_FILE_TYPE_DIR;
4217
4218 } else if (_S_IFREG & statinfo.st_mode) {
4219 *type = OS_FILE_TYPE_FILE;
4220
4221 } else {
4222 *type = OS_FILE_TYPE_UNKNOWN;
4223 }
4224
4225 return(true);
4226 }
4227
4228 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
4229 function!
4230 Flushes the write buffers of a given file to the disk.
4231 @param[in] file handle to a file
4232 @return true if success */
4233 bool
os_file_flush_func(os_file_t file)4234 os_file_flush_func(
4235 os_file_t file)
4236 {
4237 WAIT_ALLOW_WRITES();
4238 ++os_n_fsyncs;
4239
4240 BOOL ret = FlushFileBuffers(file);
4241
4242 if (ret) {
4243 return(true);
4244 }
4245
4246 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
4247 actually a raw device, we choose to ignore that error if we are using
4248 raw disks */
4249
4250 if (srv_start_raw_disk_in_use && GetLastError()
4251 == ERROR_INVALID_FUNCTION) {
4252 return(true);
4253 }
4254
4255 os_file_handle_error(NULL, "flush");
4256
4257 /* It is a fatal error if a file flush does not succeed, because then
4258 the database can get corrupt on disk */
4259 ut_error;
4260
4261 return(false);
4262 }
4263
4264 /** Retrieves the last error number if an error occurs in a file io function.
4265 The number should be retrieved before any other OS calls (because they may
4266 overwrite the error number). If the number is not known to this program,
4267 the OS error number + 100 is returned.
4268 @param[in] report_all_errors true if we want an error message printed
4269 of all errors
4270 @param[in] on_error_silent true then don't print any diagnostic
4271 to the log
4272 @return error number, or OS error number + 100 */
4273 static
4274 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)4275 os_file_get_last_error_low(
4276 bool report_all_errors,
4277 bool on_error_silent)
4278 {
4279 ulint err = (ulint) GetLastError();
4280
4281 if (err == ERROR_SUCCESS) {
4282 return(0);
4283 }
4284
4285 if (report_all_errors
4286 || (!on_error_silent
4287 && err != ERROR_DISK_FULL
4288 && err != ERROR_FILE_EXISTS)) {
4289
4290 ib::error()
4291 << "Operating system error number " << err
4292 << " in a file operation.";
4293
4294 if (err == ERROR_PATH_NOT_FOUND) {
4295 ib::error()
4296 << "The error means the system"
4297 " cannot find the path specified.";
4298
4299 if (srv_is_being_started) {
4300 ib::error()
4301 << "If you are installing InnoDB,"
4302 " remember that you must create"
4303 " directories yourself, InnoDB"
4304 " does not create them.";
4305 }
4306
4307 } else if (err == ERROR_ACCESS_DENIED) {
4308
4309 ib::error()
4310 << "The error means mysqld does not have"
4311 " the access rights to"
4312 " the directory. It may also be"
4313 " you have created a subdirectory"
4314 " of the same name as a data file.";
4315
4316 } else if (err == ERROR_SHARING_VIOLATION
4317 || err == ERROR_LOCK_VIOLATION) {
4318
4319 ib::error()
4320 << "The error means that another program"
4321 " is using InnoDB's files."
4322 " This might be a backup or antivirus"
4323 " software or another instance"
4324 " of MySQL."
4325 " Please close it to get rid of this error.";
4326
4327 } else if (err == ERROR_WORKING_SET_QUOTA
4328 || err == ERROR_NO_SYSTEM_RESOURCES) {
4329
4330 ib::error()
4331 << "The error means that there are no"
4332 " sufficient system resources or quota to"
4333 " complete the operation.";
4334
4335 } else if (err == ERROR_OPERATION_ABORTED) {
4336
4337 ib::error()
4338 << "The error means that the I/O"
4339 " operation has been aborted"
4340 " because of either a thread exit"
4341 " or an application request."
4342 " Retry attempt is made.";
4343 } else {
4344
4345 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4346 }
4347 }
4348
4349 if (err == ERROR_FILE_NOT_FOUND) {
4350 return(OS_FILE_NOT_FOUND);
4351 } else if (err == ERROR_DISK_FULL) {
4352 return(OS_FILE_DISK_FULL);
4353 } else if (err == ERROR_FILE_EXISTS) {
4354 return(OS_FILE_ALREADY_EXISTS);
4355 } else if (err == ERROR_SHARING_VIOLATION
4356 || err == ERROR_LOCK_VIOLATION) {
4357 return(OS_FILE_SHARING_VIOLATION);
4358 } else if (err == ERROR_WORKING_SET_QUOTA
4359 || err == ERROR_NO_SYSTEM_RESOURCES) {
4360 return(OS_FILE_INSUFFICIENT_RESOURCE);
4361 } else if (err == ERROR_OPERATION_ABORTED) {
4362 return(OS_FILE_OPERATION_ABORTED);
4363 } else if (err == ERROR_ACCESS_DENIED) {
4364 return(OS_FILE_ACCESS_VIOLATION);
4365 }
4366
4367 return(OS_FILE_ERROR_MAX + err);
4368 }
4369
4370 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
4371 this function!
4372 A simple function to open or create a file.
4373 @param[in] name name of the file or path as a null-terminated
4374 string
4375 @param[in] create_mode create mode
4376 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
4377 @param[in] read_only if true read only mode checks are enforced
4378 @param[out] success true if succeed, false if error
4379 @return handle to the file, not defined if error, error number
4380 can be retrieved with os_file_get_last_error */
4381 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4382 os_file_create_simple_func(
4383 const char* name,
4384 ulint create_mode,
4385 ulint access_type,
4386 bool read_only,
4387 bool* success)
4388 {
4389 pfs_os_file_t file;
4390
4391 *success = false;
4392
4393 DWORD access;
4394 DWORD create_flag;
4395 DWORD attributes = 0;
4396
4397 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4398 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4399
4400 if (create_mode == OS_FILE_OPEN) {
4401
4402 create_flag = OPEN_EXISTING;
4403
4404 } else if (read_only) {
4405
4406 create_flag = OPEN_EXISTING;
4407
4408 } else if (create_mode == OS_FILE_CREATE) {
4409
4410 create_flag = CREATE_NEW;
4411
4412 } else if (create_mode == OS_FILE_CREATE_PATH) {
4413
4414 /* Create subdirs along the path if needed. */
4415 *success = os_file_create_subdirs_if_needed(name);
4416
4417 if (!*success) {
4418
4419 ib::error()
4420 << "Unable to create subdirectories '"
4421 << name << "'";
4422 file.m_file = OS_FILE_CLOSED;
4423 return(file);
4424 }
4425
4426 create_flag = CREATE_NEW;
4427 create_mode = OS_FILE_CREATE;
4428
4429 } else {
4430
4431 ib::error()
4432 << "Unknown file create mode ("
4433 << create_mode << ") for file '"
4434 << name << "'";
4435
4436 file.m_file = OS_FILE_CLOSED;
4437 return(file);
4438 }
4439
4440 if (access_type == OS_FILE_READ_ONLY) {
4441
4442 access = GENERIC_READ;
4443
4444 } else if (read_only) {
4445
4446 ib::info()
4447 << "Read only mode set. Unable to"
4448 " open file '" << name << "' in RW mode, "
4449 << "trying RO mode", name;
4450
4451 access = GENERIC_READ;
4452
4453 } else if (access_type == OS_FILE_READ_WRITE) {
4454
4455 access = GENERIC_READ | GENERIC_WRITE;
4456
4457 } else {
4458
4459 ib::error()
4460 << "Unknown file access type (" << access_type << ") "
4461 "for file '" << name << "'";
4462
4463 file.m_file = OS_FILE_CLOSED;
4464 return(file);
4465 }
4466
4467 bool retry;
4468
4469 do {
4470 /* Use default security attributes and no template file. */
4471
4472 file.m_file = CreateFile(
4473 (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
4474 create_flag, attributes, NULL);
4475
4476 if (file.m_file == INVALID_HANDLE_VALUE) {
4477
4478 *success = false;
4479
4480 retry = os_file_handle_error(
4481 name, create_mode == OS_FILE_OPEN ?
4482 "open" : "create");
4483
4484 } else {
4485
4486 retry = false;
4487
4488 *success = true;
4489
4490 DWORD temp;
4491
4492 /* This is a best effort use case, if it fails then
4493 we will find out when we try and punch the hole. */
4494
4495 DeviceIoControl(
4496 file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
4497 &temp, NULL);
4498 }
4499
4500 } while (retry);
4501
4502 return(file);
4503 }
4504
4505 /** This function attempts to create a directory named pathname. The new
4506 directory gets default permissions. On Unix the permissions are
4507 (0770 & ~umask). If the directory exists already, nothing is done and
4508 the call succeeds, unless the fail_if_exists arguments is true.
4509 If another error occurs, such as a permission error, this does not crash,
4510 but reports the error and returns false.
4511 @param[in] pathname directory name as null-terminated string
4512 @param[in] fail_if_exists if true, pre-existing directory is treated
4513 as an error.
4514 @return true if call succeeds, false on error */
4515 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)4516 os_file_create_directory(
4517 const char* pathname,
4518 bool fail_if_exists)
4519 {
4520 BOOL rcode;
4521
4522 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
4523 if (!(rcode != 0
4524 || (GetLastError() == ERROR_ALREADY_EXISTS
4525 && !fail_if_exists))) {
4526
4527 os_file_handle_error_no_exit(
4528 pathname, "CreateDirectory", false);
4529
4530 return(false);
4531 }
4532
4533 return(true);
4534 }
4535
4536 /** The os_file_opendir() function opens a directory stream corresponding to the
4537 directory named by the dirname argument. The directory stream is positioned
4538 at the first entry. In both Unix and Windows we automatically skip the '.'
4539 and '..' items at the start of the directory listing.
4540 @param[in] dirname directory name; it must not contain a trailing
4541 '\' or '/'
4542 @param[in] is_fatal true if we should treat an error as a fatal
4543 error; if we try to open symlinks then we do
4544 not wish a fatal error if it happens not to
4545 be a directory
4546 @return directory stream, NULL if error */
4547 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)4548 os_file_opendir(
4549 const char* dirname,
4550 bool error_is_fatal)
4551 {
4552 os_file_dir_t dir;
4553 LPWIN32_FIND_DATA lpFindFileData;
4554 char path[OS_FILE_MAX_PATH + 3];
4555
4556 ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
4557
4558 strcpy(path, dirname);
4559 strcpy(path + strlen(path), "\\*");
4560
4561 /* Note that in Windows opening the 'directory stream' also retrieves
4562 the first entry in the directory. Since it is '.', that is no problem,
4563 as we will skip over the '.' and '..' entries anyway. */
4564
4565 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
4566 ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
4567
4568 dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
4569
4570 ut_free(lpFindFileData);
4571
4572 if (dir == INVALID_HANDLE_VALUE) {
4573
4574 if (error_is_fatal) {
4575 os_file_handle_error(dirname, "opendir");
4576 }
4577
4578 return(NULL);
4579 }
4580
4581 return(dir);
4582 }
4583
4584 /** Closes a directory stream.
4585 @param[in] dir directory stream
4586 @return 0 if success, -1 if failure */
4587 int
os_file_closedir(os_file_dir_t dir)4588 os_file_closedir(
4589 os_file_dir_t dir)
4590 {
4591 BOOL ret;
4592
4593 ret = FindClose(dir);
4594
4595 if (!ret) {
4596 os_file_handle_error_no_exit(NULL, "closedir", false);
4597
4598 return(-1);
4599 }
4600
4601 return(0);
4602 }
4603
4604 /** This function returns information of the next file in the directory. We
4605 jump over the '.' and '..' entries in the directory.
4606 @param[in] dirname directory name or path
4607 @param[in] dir directory stream
4608 @param[out] info buffer where the info is returned
4609 @return 0 if ok, -1 if error, 1 if at the end of the directory */
4610 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4611 os_file_readdir_next_file(
4612 const char* dirname,
4613 os_file_dir_t dir,
4614 os_file_stat_t* info)
4615 {
4616 BOOL ret;
4617 int status;
4618 WIN32_FIND_DATA find_data;
4619
4620 next_file:
4621
4622 ret = FindNextFile(dir, &find_data);
4623
4624 if (ret > 0) {
4625
4626 const char* name;
4627
4628 name = static_cast<const char*>(find_data.cFileName);
4629
4630 ut_a(strlen(name) < OS_FILE_MAX_PATH);
4631
4632 if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
4633
4634 goto next_file;
4635 }
4636
4637 strcpy(info->name, name);
4638
4639 info->size = find_data.nFileSizeHigh;
4640 info->size <<= 32;
4641 info->size |= find_data.nFileSizeLow;
4642
4643 if (find_data.dwFileAttributes
4644 & FILE_ATTRIBUTE_REPARSE_POINT) {
4645
4646 /* TODO: test Windows symlinks */
4647 /* TODO: MySQL has apparently its own symlink
4648 implementation in Windows, dbname.sym can
4649 redirect a database directory:
4650 REFMAN "windows-symbolic-links.html" */
4651
4652 info->type = OS_FILE_TYPE_LINK;
4653
4654 } else if (find_data.dwFileAttributes
4655 & FILE_ATTRIBUTE_DIRECTORY) {
4656
4657 info->type = OS_FILE_TYPE_DIR;
4658
4659 } else {
4660
4661 /* It is probably safest to assume that all other
4662 file types are normal. Better to check them rather
4663 than blindly skip them. */
4664
4665 info->type = OS_FILE_TYPE_FILE;
4666 }
4667
4668 status = 0;
4669
4670 } else if (GetLastError() == ERROR_NO_MORE_FILES) {
4671
4672 status = 1;
4673
4674 } else {
4675
4676 os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
4677
4678 status = -1;
4679 }
4680
4681 return(status);
4682 }
4683
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode (OS_FILE_OPEN, OS_FILE_OPEN_RAW,
				OS_FILE_OPEN_RETRY, OS_FILE_CREATE or
				OS_FILE_OVERWRITE), optionally combined with
				the OS_FILE_ON_ERROR_NO_EXIT and
				OS_FILE_ON_ERROR_SILENT flags
@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
				is desired, OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use async
				I/O or unbuffered I/O: look in the function
				source code for the exact rules
@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
@param[in]	read_only	if true, no write access is requested and
				the create and overwrite modes open an
				existing file instead; must be false for
				OS_FILE_OPEN_RAW
@param[out]	success		true if succeeded
@return handle to the file, not defined if error, error number
	can be retrieved with os_file_get_last_error */
pfs_os_file_t
os_file_create_func(
	const char*	name,
	ulint		create_mode,
	ulint		purpose,
	ulint		type,
	bool		read_only,
	bool*		success)
{
	pfs_os_file_t	file;
	bool		retry;
	bool		on_error_no_exit;
	bool		on_error_silent;

	*success = false;

	/* Debug injection point: pretend that the disk is full. */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = false;
		SetLastError(ERROR_DISK_FULL);
		file.m_file = OS_FILE_CLOSED;
		return(file);
	);

	DWORD		create_flag;
	DWORD		share_mode = FILE_SHARE_READ;

	/* Split the error handling flags off the create mode proper. */

	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? true : false;

	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? true : false;

	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
	create_mode &= ~OS_FILE_ON_ERROR_SILENT;

	if (create_mode == OS_FILE_OPEN_RAW) {

		ut_a(!read_only);

		create_flag = OPEN_EXISTING;

		/* On Windows Physical devices require admin privileges and
		have to have the write-share mode set. See the remarks
		section for the CreateFile() function documentation in MSDN. */

		share_mode |= FILE_SHARE_WRITE;

	} else if (create_mode == OS_FILE_OPEN
		   || create_mode == OS_FILE_OPEN_RETRY) {

		create_flag = OPEN_EXISTING;

	} else if (read_only) {

		/* Nothing is created or overwritten in read-only mode. */
		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		create_flag = CREATE_ALWAYS;

	} else {
		ib::error()
			<< "Unknown file create mode (" << create_mode << ") "
			<< " for file '" << name << "'";

		file.m_file = OS_FILE_CLOSED;
		return(file);
	}

	DWORD		attributes = 0;

#ifdef UNIV_HOTBACKUP
	attributes |= FILE_FLAG_NO_BUFFERING;
#else
	if (purpose == OS_FILE_AIO) {

#ifdef WIN_ASYNC_IO
		/* If specified, use asynchronous (overlapped) io and no
		buffering of writes in the OS */

		if (srv_use_native_aio) {
			attributes |= FILE_FLAG_OVERLAPPED;
		}
#endif /* WIN_ASYNC_IO */

	} else if (purpose == OS_FILE_NORMAL) {

		/* Use default setting. */

	} else {

		ib::error()
			<< "Unknown purpose flag (" << purpose << ") "
			<< "while opening file '" << name << "'";

		file.m_file = OS_FILE_CLOSED;
		return(file);
	}

#ifdef UNIV_NON_BUFFERED_IO
	// TODO: Create a bug, this looks wrong. The flush log
	// parameter is dynamic.
	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {

		/* Do not use unbuffered i/o for the log files because
		value 2 denotes that we do not flush the log at every
		commit, but only once per second */

	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {

		attributes |= FILE_FLAG_NO_BUFFERING;
	}
#endif /* UNIV_NON_BUFFERED_IO */

#endif /* UNIV_HOTBACKUP */
	DWORD	access = GENERIC_READ;

	if (!read_only) {
		access |= GENERIC_WRITE;
	}

	do {
		/* Use default security attributes and no template file. */
		file.m_file = CreateFile(
			(LPCTSTR) name, access, share_mode, NULL,
			create_flag, attributes, NULL);

		if (file.m_file == INVALID_HANDLE_VALUE) {
			const char*	operation;

			/* Label used by the error handlers below. */
			operation = (create_mode == OS_FILE_CREATE
				     && !read_only)
				? "create" : "open";

			*success = false;

			/* The error handler tells whether to try again. */
			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {

			retry = false;

			*success = true;

			DWORD	temp;

			/* This is a best effort use case, if it fails then
			we will find out when we try and punch the hole. */
			DeviceIoControl(
				file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
				&temp, NULL);
		}

	} while (retry);

	return(file);
}
4866
4867 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4868 not directly this function!
4869 A simple function to open or create a file.
4870 @param[in] name name of the file or path as a null-terminated
4871 string
4872 @param[in] create_mode create mode
4873 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4874 OS_FILE_READ_ALLOW_DELETE; the last option is
4875 used by a backup program reading the file
4876 @param[out] success true if succeeded
4877 @return own: handle to the file, not defined if error, error number
4878 can be retrieved with os_file_get_last_error */
4879 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4880 os_file_create_simple_no_error_handling_func(
4881 const char* name,
4882 ulint create_mode,
4883 ulint access_type,
4884 bool read_only,
4885 bool* success)
4886 {
4887 pfs_os_file_t file;
4888
4889 *success = false;
4890
4891 DWORD access;
4892 DWORD create_flag;
4893 DWORD attributes = 0;
4894 DWORD share_mode = FILE_SHARE_READ;
4895
4896 ut_a(name);
4897
4898 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4899 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4900
4901 if (create_mode == OS_FILE_OPEN) {
4902
4903 create_flag = OPEN_EXISTING;
4904
4905 } else if (read_only) {
4906
4907 create_flag = OPEN_EXISTING;
4908
4909 } else if (create_mode == OS_FILE_CREATE) {
4910
4911 create_flag = CREATE_NEW;
4912
4913 } else {
4914
4915 ib::error()
4916 << "Unknown file create mode (" << create_mode << ") "
4917 << " for file '" << name << "'";
4918
4919 file.m_file = OS_FILE_CLOSED;
4920 return(file);
4921 }
4922
4923 if (access_type == OS_FILE_READ_ONLY) {
4924
4925 access = GENERIC_READ;
4926
4927 } else if (read_only) {
4928
4929 access = GENERIC_READ;
4930
4931 } else if (access_type == OS_FILE_READ_WRITE) {
4932
4933 access = GENERIC_READ | GENERIC_WRITE;
4934
4935 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4936
4937 ut_a(!read_only);
4938
4939 access = GENERIC_READ;
4940
4941 /*!< A backup program has to give mysqld the maximum
4942 freedom to do what it likes with the file */
4943
4944 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
4945 } else {
4946
4947 ib::error()
4948 << "Unknown file access type (" << access_type << ") "
4949 << "for file '" << name << "'";
4950
4951 file.m_file = OS_FILE_CLOSED;
4952 return(file);
4953 }
4954
4955 file.m_file = CreateFile((LPCTSTR) name,
4956 access,
4957 share_mode,
4958 NULL, // Security attributes
4959 create_flag,
4960 attributes,
4961 NULL); // No template file
4962
4963 *success = (file.m_file != INVALID_HANDLE_VALUE);
4964
4965 return(file);
4966 }
4967
4968 /** Deletes a file if it exists. The file has to be closed before calling this.
4969 @param[in] name file path as a null-terminated string
4970 @param[out] exist indicate if file pre-exist
4971 @return true if success */
4972 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4973 os_file_delete_if_exists_func(
4974 const char* name,
4975 bool* exist)
4976 {
4977 ulint count = 0;
4978
4979 if (exist != NULL) {
4980 *exist = true;
4981 }
4982
4983 for (;;) {
4984 /* In Windows, deleting an .ibd file may fail if ibbackup
4985 is copying it */
4986
4987 bool ret = DeleteFile((LPCTSTR) name);
4988
4989 if (ret) {
4990 return(true);
4991 }
4992
4993 DWORD lasterr = GetLastError();
4994
4995 if (lasterr == ERROR_FILE_NOT_FOUND
4996 || lasterr == ERROR_PATH_NOT_FOUND) {
4997
4998 /* the file does not exist, this not an error */
4999 if (exist != NULL) {
5000 *exist = false;
5001 }
5002
5003 return(true);
5004 }
5005
5006 ++count;
5007
5008 if (count > 100 && 0 == (count % 10)) {
5009
5010 /* Print error information */
5011 os_file_get_last_error(true);
5012
5013 ib::warn() << "Delete of file '" << name << "' failed.";
5014 }
5015
5016 /* Sleep for a second */
5017 os_thread_sleep(1000000);
5018
5019 if (count > 2000) {
5020
5021 return(false);
5022 }
5023 }
5024 }
5025
5026 /** Deletes a file. The file has to be closed before calling this.
5027 @param[in] name File path as NUL terminated string
5028 @return true if success */
5029 bool
os_file_delete_func(const char * name)5030 os_file_delete_func(
5031 const char* name)
5032 {
5033 ulint count = 0;
5034
5035 for (;;) {
5036 /* In Windows, deleting an .ibd file may fail if ibbackup
5037 is copying it */
5038
5039 BOOL ret = DeleteFile((LPCTSTR) name);
5040
5041 if (ret) {
5042 return(true);
5043 }
5044
5045 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
5046 /* If the file does not exist, we classify this as
5047 a 'mild' error and return */
5048
5049 return(false);
5050 }
5051
5052 ++count;
5053
5054 if (count > 100 && 0 == (count % 10)) {
5055
5056 /* print error information */
5057 os_file_get_last_error(true);
5058
5059 ib::warn()
5060 << "Cannot delete file '" << name << "'. Are "
5061 << "you running ibbackup to back up the file?";
5062 }
5063
5064 /* sleep for a second */
5065 os_thread_sleep(1000000);
5066
5067 if (count > 2000) {
5068
5069 return(false);
5070 }
5071 }
5072
5073 ut_error;
5074 return(false);
5075 }
5076
5077 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
5078 function!
5079 Renames a file (can also move it to another directory). It is safest that the
5080 file is closed before calling this function.
5081 @param[in] oldpath old file path as a null-terminated string
5082 @param[in] newpath new file path
5083 @return true if success */
5084 bool
os_file_rename_func(const char * oldpath,const char * newpath)5085 os_file_rename_func(
5086 const char* oldpath,
5087 const char* newpath)
5088 {
5089 #ifdef UNIV_DEBUG
5090 os_file_type_t type;
5091 bool exists;
5092
5093 /* New path must not exist. */
5094 ut_ad(os_file_status(newpath, &exists, &type));
5095 ut_ad(!exists);
5096
5097 /* Old path must exist. */
5098 ut_ad(os_file_status(oldpath, &exists, &type));
5099 ut_ad(exists);
5100 #endif /* UNIV_DEBUG */
5101
5102 if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
5103 return(true);
5104 }
5105
5106 os_file_handle_error_no_exit(oldpath, "rename", false);
5107
5108 return(false);
5109 }
5110
5111 /** NOTE! Use the corresponding macro os_file_close(), not directly
5112 this function!
5113 Closes a file handle. In case of error, error number can be retrieved with
5114 os_file_get_last_error.
5115 @param[in,own] file Handle to a file
5116 @return true if success */
5117 bool
os_file_close_func(os_file_t file)5118 os_file_close_func(
5119 os_file_t file)
5120 {
5121 ut_a(file > 0);
5122
5123 if (CloseHandle(file)) {
5124 return(true);
5125 }
5126
5127 os_file_handle_error(NULL, "close");
5128
5129 return(false);
5130 }
5131
5132 /** Gets a file size.
5133 @param[in] file Handle to a file
5134 @return file size, or (os_offset_t) -1 on failure */
5135 os_offset_t
os_file_get_size(pfs_os_file_t file)5136 os_file_get_size(
5137 pfs_os_file_t file)
5138 {
5139 DWORD high;
5140 DWORD low;
5141
5142 low = GetFileSize(file.m_file, &high);
5143
5144 if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
5145 return((os_offset_t) -1);
5146 }
5147
5148 return(os_offset_t(low | (os_offset_t(high) << 32)));
5149 }
5150
5151 /** Gets a file size.
5152 @param[in] filename Full path to the filename to check
5153 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
5154 errno */
5155 os_file_size_t
os_file_get_size(const char * filename)5156 os_file_get_size(
5157 const char* filename)
5158 {
5159 struct __stat64 s;
5160 os_file_size_t file_size;
5161
5162 int ret = _stat64(filename, &s);
5163
5164 if (ret == 0) {
5165
5166 file_size.m_total_size = s.st_size;
5167
5168 DWORD low_size;
5169 DWORD high_size;
5170
5171 low_size = GetCompressedFileSize(filename, &high_size);
5172
5173 if (low_size != INVALID_FILE_SIZE) {
5174
5175 file_size.m_alloc_size = high_size;
5176 file_size.m_alloc_size <<= 32;
5177 file_size.m_alloc_size |= low_size;
5178
5179 } else {
5180 ib::error()
5181 << "GetCompressedFileSize("
5182 << filename << ", ..) failed.";
5183
5184 file_size.m_alloc_size = (os_offset_t) -1;
5185 }
5186 } else {
5187 file_size.m_total_size = ~0;
5188 file_size.m_alloc_size = (os_offset_t) ret;
5189 }
5190
5191 return(file_size);
5192 }
5193
5194 /** This function returns information about the specified file
5195 @param[in] path pathname of the file
5196 @param[out] stat_info information of a file in a directory
5197 @param[in,out] statinfo information of a file in a directory
5198 @param[in] check_rw_perm for testing whether the file can be opened
5199 in RW mode
5200 @param[in] read_only true if the file is opened in read-only mode
5201 @return DB_SUCCESS if all OK */
5202 static
5203 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)5204 os_file_get_status_win32(
5205 const char* path,
5206 os_file_stat_t* stat_info,
5207 struct _stat64* statinfo,
5208 bool check_rw_perm,
5209 bool read_only)
5210 {
5211 int ret = _stat64(path, statinfo);
5212
5213 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
5214 /* file does not exist */
5215
5216 return(DB_NOT_FOUND);
5217
5218 } else if (ret) {
5219 /* file exists, but stat call failed */
5220
5221 os_file_handle_error_no_exit(path, "stat", false);
5222
5223 return(DB_FAIL);
5224
5225 } else if (_S_IFDIR & statinfo->st_mode) {
5226
5227 stat_info->type = OS_FILE_TYPE_DIR;
5228
5229 } else if (_S_IFREG & statinfo->st_mode) {
5230
5231 DWORD access = GENERIC_READ;
5232
5233 if (!read_only) {
5234 access |= GENERIC_WRITE;
5235 }
5236
5237 stat_info->type = OS_FILE_TYPE_FILE;
5238
5239 /* Check if we can open it in read-only mode. */
5240
5241 if (check_rw_perm) {
5242 HANDLE fh;
5243
5244 fh = CreateFile(
5245 (LPCTSTR) path, // File to open
5246 access,
5247 0, // No sharing
5248 NULL, // Default security
5249 OPEN_EXISTING, // Existing file only
5250 FILE_ATTRIBUTE_NORMAL, // Normal file
5251 NULL); // No attr. template
5252
5253 if (fh == INVALID_HANDLE_VALUE) {
5254 stat_info->rw_perm = false;
5255 } else {
5256 stat_info->rw_perm = true;
5257 CloseHandle(fh);
5258 }
5259 }
5260
5261 char volname[MAX_PATH];
5262 BOOL result = GetVolumePathName(path, volname, MAX_PATH);
5263
5264 if (!result) {
5265
5266 ib::error()
5267 << "os_file_get_status_win32: "
5268 << "Failed to get the volume path name for: "
5269 << path
5270 << "- OS error number " << GetLastError();
5271
5272 return(DB_FAIL);
5273 }
5274
5275 DWORD sectorsPerCluster;
5276 DWORD bytesPerSector;
5277 DWORD numberOfFreeClusters;
5278 DWORD totalNumberOfClusters;
5279
5280 result = GetDiskFreeSpace(
5281 (LPCSTR) volname,
5282 §orsPerCluster,
5283 &bytesPerSector,
5284 &numberOfFreeClusters,
5285 &totalNumberOfClusters);
5286
5287 if (!result) {
5288
5289 ib::error()
5290 << "GetDiskFreeSpace(" << volname << ",...) "
5291 << "failed "
5292 << "- OS error number " << GetLastError();
5293
5294 return(DB_FAIL);
5295 }
5296
5297 stat_info->block_size = bytesPerSector * sectorsPerCluster;
5298
5299 /* On Windows the block size is not used as the allocation
5300 unit for sparse files. The underlying infra-structure for
5301 sparse files is based on NTFS compression. The punch hole
5302 is done on a "compression unit". This compression unit
5303 is based on the cluster size. You cannot punch a hole if
5304 the cluster size >= 8K. For smaller sizes the table is
5305 as follows:
5306
5307 Cluster Size Compression Unit
5308 512 Bytes 8 KB
5309 1 KB 16 KB
5310 2 KB 32 KB
5311 4 KB 64 KB
5312
5313 Default NTFS cluster size is 4K, compression unit size of 64K.
5314 Therefore unless the user has created the file system with
5315 a smaller cluster size and used larger page sizes there is
5316 little benefit from compression out of the box. */
5317
5318 stat_info->block_size = (stat_info->block_size <= 4096)
5319 ? stat_info->block_size * 16 : ULINT_UNDEFINED;
5320 } else {
5321 stat_info->type = OS_FILE_TYPE_UNKNOWN;
5322 }
5323
5324 return(DB_SUCCESS);
5325 }
5326
5327 /** Truncates a file to a specified size in bytes.
5328 Do nothing if the size to preserve is greater or equal to the current
5329 size of the file.
5330 @param[in] pathname file path
5331 @param[in] file file to be truncated
5332 @param[in] size size to preserve in bytes
5333 @return true if success */
5334 static
5335 bool
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)5336 os_file_truncate_win32(
5337 const char* pathname,
5338 pfs_os_file_t file,
5339 os_offset_t size)
5340 {
5341 LARGE_INTEGER length;
5342
5343 length.QuadPart = size;
5344 BOOL success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
5345 if (!success) {
5346 os_file_handle_error_no_exit(
5347 pathname, "SetFilePointerEx", false);
5348 } else {
5349 success = SetEndOfFile(file.m_file);
5350 if (!success) {
5351 os_file_handle_error_no_exit(
5352 pathname, "SetEndOfFile", false);
5353 }
5354 }
5355 return(success);
5356 }
5357
5358 /** Truncates a file at its current position.
5359 @param[in] file Handle to be truncated
5360 @return true if success */
5361 bool
os_file_set_eof(FILE * file)5362 os_file_set_eof(
5363 FILE* file)
5364 {
5365 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
5366
5367 return(SetEndOfFile(h));
5368 }
5369
#ifdef UNIV_HOTBACKUP
/** Closes a file handle with CloseHandle(), without invoking any error
handler on failure.
@param[in]	file		Handle to close
@return true if success */
bool
os_file_close_no_error_handling(
	os_file_t	file)
{
	return(CloseHandle(file) != 0);
}
#endif /* UNIV_HOTBACKUP */
5381
5382 /** This function can be called if one wants to post a batch of reads and
5383 prefers an i/o-handler thread to handle them all at once later. You must
5384 call os_aio_simulated_wake_handler_threads later to ensure the threads
5385 are not left sleeping! */
5386 void
os_aio_simulated_put_read_threads_to_sleep()5387 os_aio_simulated_put_read_threads_to_sleep()
5388 {
5389 AIO::simulated_put_read_threads_to_sleep();
5390 }
5391
5392 /** This function can be called if one wants to post a batch of reads and
5393 prefers an i/o-handler thread to handle them all at once later. You must
5394 call os_aio_simulated_wake_handler_threads later to ensure the threads
5395 are not left sleeping! */
5396 void
simulated_put_read_threads_to_sleep()5397 AIO::simulated_put_read_threads_to_sleep()
5398 {
5399 /* The idea of putting background IO threads to sleep is only for
5400 Windows when using simulated AIO. Windows XP seems to schedule
5401 background threads too eagerly to allow for coalescing during
5402 readahead requests. */
5403
5404 if (srv_use_native_aio) {
5405 /* We do not use simulated AIO: do nothing */
5406
5407 return;
5408 }
5409
5410 os_aio_recommend_sleep_for_read_threads = true;
5411
5412 for (ulint i = 0; i < os_aio_n_segments; i++) {
5413 AIO* array;
5414
5415 get_array_and_local_segment(&array, i);
5416
5417 if (array == s_reads) {
5418
5419 os_event_reset(os_aio_segment_wait_events[i]);
5420 }
5421 }
5422 }
5423
5424 #endif /* !_WIN32*/
5425
/** Does a synchronous read or write depending upon the type specified.
In case of partial reads/writes the function tries
NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
A compressed request is compressed first, and an encrypted write is
encrypted after that, before the I/O is issued.
@param[in]	in_type	IO flags; the function works on a local copy
@param[in]	file	handle to an open file
@param[in,out]	buf	buffer to read into or to write from
@param[in]	n	number of bytes to read/write, starting from offset
@param[in]	offset	file offset from the start where to read/write
@param[out]	err	DB_SUCCESS or error code; DB_IO_ERROR when the
			request could not be completed
@return the requested n if the whole request completed, otherwise the
number of bytes transferred before giving up */
static MY_ATTRIBUTE((warn_unused_result))
ssize_t
os_file_io(
	const IORequest&in_type,
	os_file_t	file,
	void*		buf,
	ulint		n,
	os_offset_t	offset,
	dberr_t*	err)
{
	/* Temporary block returned by the compression or encryption
	helper, if any; released before returning. */
	Block*		block;
	/* The caller's byte count; n itself is handed to the helpers by
	address below. */
	ulint		original_n = n;
	IORequest	type = in_type;
	ssize_t		bytes_returned = 0;

	if (type.is_compressed()) {

		/* We don't compress the first page of any file. */
		ut_ad(offset > 0);

		block = os_file_compress_page(type, buf, &n);
	} else {
		block = NULL;
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
	if (type.is_encrypted() && type.is_write()) {
		/* We don't encrypt the first page of any file. */
		Block*	compressed_block = block;
		ut_ad(offset > 0);

		block = os_file_encrypt_page(type, buf, &n);

		/* The block from the compression step is released here;
		from now on only the encryption block is tracked. */
		if (compressed_block != NULL) {
			os_free_block(compressed_block);
		}
	}

	SyncFileIO	sync_file_io(file, buf, n, offset);

	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {

		ssize_t	n_bytes = sync_file_io.execute(type);

		/* Check for a hard error. Not much we can do now. */
		if (n_bytes < 0) {

			break;

		} else if ((ulint) n_bytes + bytes_returned == n) {

			/* Everything has been transferred. */
			bytes_returned += n_bytes;

			/* Post-processing is requested for reads and for
			compressed requests, except at offset 0. */
			if (offset > 0
			    && (type.is_compressed() || type.is_read())) {

				*err = os_file_io_complete(
					type, file,
					reinterpret_cast<byte*>(buf),
					NULL, original_n, offset, n);
			} else {

				*err = DB_SUCCESS;
			}

			if (block != NULL) {
				os_free_block(block);
			}

			/* Report the size that the caller asked for. */
			return(original_n);
		}

		/* Handle partial read/write. */

		ut_ad((ulint) n_bytes + bytes_returned < n);

		bytes_returned += (ulint) n_bytes;

		if (!type.is_partial_io_warning_disabled()) {

			const char*	op = type.is_read()
				? "read" : "written";

			ib::warn()
				<< n
				<< " bytes should have been " << op << ". Only "
				<< bytes_returned
				<< " bytes " << op << ". Retrying"
				<< " for the remaining bytes.";
		}

		/* Advance the offset and buffer by n_bytes */
		sync_file_io.advance(n_bytes);
	}

	/* Reached after a hard error or when the retries are used up. */

	if (block != NULL) {
		os_free_block(block);
	}

	*err = DB_IO_ERROR;

	if (!type.is_partial_io_warning_disabled()) {
		ib::warn()
			<< "Retry attempts for "
			<< (type.is_read() ? "reading" : "writing")
			<< " partial data failed.";
	}

	return(bytes_returned);
}
5548
5549 /** Does a synchronous write operation in Posix.
5550 @param[in] type IO context
5551 @param[in] file handle to an open file
5552 @param[out] buf buffer from which to write
5553 @param[in] n number of bytes to read, starting from offset
5554 @param[in] offset file offset from the start where to read
5555 @param[out] err DB_SUCCESS or error code
5556 @return number of bytes written, -1 if error */
5557 static MY_ATTRIBUTE((warn_unused_result))
5558 ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)5559 os_file_pwrite(
5560 IORequest& type,
5561 os_file_t file,
5562 const byte* buf,
5563 ulint n,
5564 os_offset_t offset,
5565 dberr_t* err)
5566 {
5567 ut_ad(type.validate());
5568
5569 ++os_n_file_writes;
5570
5571 (void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
5572 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
5573
5574 ssize_t n_bytes = os_file_io(type, file, (void*) buf, n, offset, err);
5575
5576 (void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
5577 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
5578
5579 return(n_bytes);
5580 }
5581
5582 /** Requests a synchronous write operation.
5583 @param[in] type IO flags
5584 @param[in] file handle to an open file
5585 @param[out] buf buffer from which to write
5586 @param[in] offset file offset from the start where to read
5587 @param[in] n number of bytes to read, starting from offset
5588 @return DB_SUCCESS if request was successful, false if fail */
5589 static MY_ATTRIBUTE((warn_unused_result))
5590 dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)5591 os_file_write_page(
5592 IORequest& type,
5593 const char* name,
5594 os_file_t file,
5595 const byte* buf,
5596 os_offset_t offset,
5597 ulint n)
5598 {
5599 dberr_t err;
5600 ut_ad(type.validate());
5601 ut_ad(n > 0);
5602
5603 WAIT_ALLOW_WRITES();
5604 ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
5605
5606 if ((ulint) n_bytes != n && !os_has_said_disk_full) {
5607
5608 ib::error()
5609 << "Write to file " << name << "failed at offset "
5610 << offset << ", " << n
5611 << " bytes should have been written,"
5612 " only " << n_bytes << " were written."
5613 " Operating system error number " << errno << "."
5614 " Check that your OS and file system"
5615 " support files of this size."
5616 " Check also that the disk is not full"
5617 " or a disk quota exceeded.";
5618
5619 if (strerror(errno) != NULL) {
5620
5621 ib::error()
5622 << "Error number " << errno
5623 << " means '" << strerror(errno) << "'";
5624 }
5625
5626 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
5627
5628 os_has_said_disk_full = true;
5629 }
5630
5631 return(err);
5632 }
5633
5634 /** Does a synchronous read operation in Posix.
5635 @param[in] type IO flags
5636 @param[in] file handle to an open file
5637 @param[out] buf buffer where to read
5638 @param[in] offset file offset from the start where to read
5639 @param[in] n number of bytes to read, starting from offset
5640 @param[out] err DB_SUCCESS or error code
5641 @return number of bytes read, -1 if error */
5642 static MY_ATTRIBUTE((warn_unused_result))
5643 ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5644 os_file_pread(
5645 IORequest& type,
5646 os_file_t file,
5647 void* buf,
5648 ulint n,
5649 os_offset_t offset,
5650 dberr_t* err)
5651 {
5652 ++os_n_file_reads;
5653
5654 (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
5655 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
5656
5657 ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
5658
5659 (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
5660 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
5661
5662 return(n_bytes);
5663 }
5664
/** Requests a synchronous positioned read operation. After a short read
the error handler is consulted; if it asks for a retry, the request is
re-issued for the part of the range that has not been read yet.
@param[in]	type		IO flags
@param[in]	file		handle to an open file
@param[out]	buf		buffer where to read
@param[in]	offset		file offset from the start where to read
@param[in]	n		number of bytes to read, starting from offset
@param[out]	o		if not NULL, receives the byte count returned
				by the most recent read attempt
@param[in]	exit_on_err	if true, failures are handed to
				os_file_handle_error(); if false, a failed
				read is returned to the caller
@return DB_SUCCESS or error code */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
os_file_read_page(
	IORequest&	type,
	os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	ulint*		o,
	bool		exit_on_err)
{
	dberr_t		err;

	/* The statistics count the requested size, not what was read. */
	os_bytes_read_since_printout += n;

	ut_ad(type.validate());
	ut_ad(n > 0);

	for (;;) {
		ssize_t	n_bytes;

		n_bytes = os_file_pread(type, file, buf, n, offset, &err);

		/* Report the result of this attempt, even if it failed. */
		if (o != NULL) {
			*o = n_bytes;
		}

		if (err != DB_SUCCESS && !exit_on_err) {

			/* The caller handles errors itself. */
			return(err);

		} else if ((ulint) n_bytes == n) {

			/** The read will succeed but decompress can fail
			for various reasons. */

			if (type.is_compression_enabled()
			    && !Compression::is_compressed_page(
				    static_cast<byte*>(buf))) {

				/* Compression is enabled but the page is
				not a compressed one: report success
				regardless of err. */
				return(DB_SUCCESS);

			} else {
				return(err);
			}
		}

		/* Short read (or failed read with exit_on_err set). */
		ib::error() << "Tried to read " << n
			<< " bytes at offset " << offset
			<< ", but was only able to read " << n_bytes;

		/* The handlers return true when the operation should be
		retried; false leaves the loop. */
		if (exit_on_err) {

			if (!os_file_handle_error(NULL, "read")) {
				/* Hard error */
				break;
			}

		} else if (!os_file_handle_error_no_exit(NULL, "read", false)) {

			/* Hard error */
			break;
		}

		/* Partial progress: retry only the part that is still
		missing, continuing right after the bytes already read. */
		if (n_bytes > 0 && (ulint) n_bytes < n) {
			n -= (ulint) n_bytes;
			offset += (ulint) n_bytes;
			buf = reinterpret_cast<uchar*>(buf) + (ulint) n_bytes;
		}
	}

	/* NOTE(review): ib::fatal() presumably terminates the server, which
	would make the return below unreachable - confirm. */
	ib::fatal()
		<< "Cannot read from file. OS error number "
		<< errno << ".";

	return(err);
}
5752
5753 /** Retrieves the last error number if an error occurs in a file io function.
5754 The number should be retrieved before any other OS calls (because they may
5755 overwrite the error number). If the number is not known to this program,
5756 the OS error number + 100 is returned.
5757 @param[in] report_all_errors true if we want an error printed
5758 for all errors
5759 @return error number, or OS error number + 100 */
5760 ulint
os_file_get_last_error(bool report_all_errors)5761 os_file_get_last_error(
5762 bool report_all_errors)
5763 {
5764 return(os_file_get_last_error_low(report_all_errors, false));
5765 }
5766
5767 /** Does error handling when a file operation fails.
5768 Conditionally exits (calling srv_fatal_error()) based on should_exit value
5769 and the error type, if should_exit is true then on_error_silent is ignored.
5770 @param[in] name name of a file or NULL
5771 @param[in] operation operation
5772 @param[in] should_exit call srv_fatal_error() on an unknown error,
5773 if this parameter is true
5774 @param[in] on_error_silent if true then don't print any message to the log
5775 iff it is an unknown non-fatal error
5776 @return true if we should retry the operation */
5777 static MY_ATTRIBUTE((warn_unused_result))
5778 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)5779 os_file_handle_error_cond_exit(
5780 const char* name,
5781 const char* operation,
5782 bool should_exit,
5783 bool on_error_silent)
5784 {
5785 ulint err;
5786
5787 err = os_file_get_last_error_low(false, on_error_silent);
5788
5789 switch (err) {
5790 case OS_FILE_DISK_FULL:
5791 /* We only print a warning about disk full once */
5792
5793 if (os_has_said_disk_full) {
5794
5795 return(false);
5796 }
5797
5798 /* Disk full error is reported irrespective of the
5799 on_error_silent setting. */
5800
5801 if (name) {
5802
5803 ib::error()
5804 << "Encountered a problem with file '"
5805 << name << "'";
5806 }
5807
5808 ib::error()
5809 << "Disk is full. Try to clean the disk to free space.";
5810
5811 os_has_said_disk_full = true;
5812
5813 return(false);
5814
5815 case OS_FILE_AIO_RESOURCES_RESERVED:
5816 case OS_FILE_AIO_INTERRUPTED:
5817
5818 return(true);
5819
5820 case OS_FILE_PATH_ERROR:
5821 case OS_FILE_ALREADY_EXISTS:
5822 case OS_FILE_ACCESS_VIOLATION:
5823
5824 return(false);
5825
5826 case OS_FILE_SHARING_VIOLATION:
5827
5828 os_thread_sleep(10000000); /* 10 sec */
5829 return(true);
5830
5831 case OS_FILE_OPERATION_ABORTED:
5832 case OS_FILE_INSUFFICIENT_RESOURCE:
5833
5834 os_thread_sleep(100000); /* 100 ms */
5835 return(true);
5836
5837 default:
5838
5839 /* If it is an operation that can crash on error then it
5840 is better to ignore on_error_silent and print an error message
5841 to the log. */
5842
5843 if (should_exit || !on_error_silent) {
5844 ib::error() << "File "
5845 << (name != NULL ? name : "(unknown)")
5846 << ": '" << operation << "'"
5847 " returned OS error " << err << "."
5848 << (should_exit
5849 ? " Cannot continue operation" : "");
5850 }
5851
5852 if (should_exit) {
5853 srv_fatal_error();
5854 }
5855 }
5856
5857 return(false);
5858 }
5859
5860 /** Does error handling when a file operation fails.
5861 @param[in] name name of a file or NULL
5862 @param[in] operation operation name that failed
5863 @return true if we should retry the operation */
5864 static
5865 bool
os_file_handle_error(const char * name,const char * operation)5866 os_file_handle_error(
5867 const char* name,
5868 const char* operation)
5869 {
5870 /* Exit in case of unknown error */
5871 return(os_file_handle_error_cond_exit(name, operation, true, false));
5872 }
5873
5874 /** Does error handling when a file operation fails.
5875 @param[in] name name of a file or NULL
5876 @param[in] operation operation name that failed
5877 @param[in] on_error_silent if true then don't print any message to the log.
5878 @return true if we should retry the operation */
5879 static
5880 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)5881 os_file_handle_error_no_exit(
5882 const char* name,
5883 const char* operation,
5884 bool on_error_silent)
5885 {
5886 /* Don't exit in case of unknown error */
5887 return(os_file_handle_error_cond_exit(
5888 name, operation, false, on_error_silent));
5889 }
5890
5891 /** Tries to disable OS caching on an opened file descriptor.
5892 @param[in] fd file descriptor to alter
5893 @param[in] file_name file name, used in the diagnostic message
5894 @param[in] name "open" or "create"; used in the diagnostic
5895 message */
5896 void
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)))5897 os_file_set_nocache(
5898 int fd MY_ATTRIBUTE((unused)),
5899 const char* file_name MY_ATTRIBUTE((unused)),
5900 const char* operation_name MY_ATTRIBUTE((unused)))
5901 {
5902 /* some versions of Solaris may not have DIRECTIO_ON */
5903 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5904 if (directio(fd, DIRECTIO_ON) == -1) {
5905 int errno_save = errno;
5906
5907 ib::error()
5908 << "Failed to set DIRECTIO_ON on file "
5909 << file_name << ": " << operation_name
5910 << strerror(errno_save) << ","
5911 " continuing anyway.";
5912 }
5913 #elif defined(O_DIRECT)
5914 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5915 int errno_save = errno;
5916 static bool warning_message_printed = false;
5917 if (errno_save == EINVAL) {
5918 if (!warning_message_printed) {
5919 warning_message_printed = true;
5920 # ifdef UNIV_LINUX
5921 ib::warn()
5922 << "Failed to set O_DIRECT on file"
5923 << file_name << ";" << operation_name
5924 << ": " << strerror(errno_save) << ", "
5925 << "continuing anyway. O_DIRECT is "
5926 "known to result in 'Invalid argument' "
5927 "on Linux on tmpfs, "
5928 "see MySQL Bug#26662.";
5929 # else /* UNIV_LINUX */
5930 goto short_warning;
5931 # endif /* UNIV_LINUX */
5932 }
5933 } else {
5934 # ifndef UNIV_LINUX
5935 short_warning:
5936 # endif
5937 ib::warn()
5938 << "Failed to set O_DIRECT on file "
5939 << file_name << "; " << operation_name
5940 << " : " << strerror(errno_save)
5941 << " continuing anyway.";
5942 }
5943 }
5944 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5945 }
5946
5947 /** Write the specified number of zeros to a newly created file.
5948 @param[in] name name of the file or path as a null-terminated
5949 string
5950 @param[in] file handle to a file
5951 @param[in] size file size
5952 @param[in] read_only Enable read-only checks if true
5953 @return true if success */
5954 bool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size,bool read_only)5955 os_file_set_size(
5956 const char* name,
5957 pfs_os_file_t file,
5958 os_offset_t size,
5959 bool read_only)
5960 {
5961 /* Write up to 1 megabyte at a time. */
5962 ulint buf_size = ut_min(
5963 static_cast<ulint>(64),
5964 static_cast<ulint>(size / UNIV_PAGE_SIZE));
5965
5966 buf_size *= UNIV_PAGE_SIZE;
5967
5968 /* Align the buffer for possible raw i/o */
5969 byte* buf2;
5970
5971 buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
5972
5973 byte* buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
5974
5975 /* Write buffer full of zeros */
5976 memset(buf, 0, buf_size);
5977
5978 if (size >= (os_offset_t) 100 << 20) {
5979
5980 ib::info() << "Progress in MB:";
5981 }
5982
5983 os_offset_t current_size = 0;
5984
5985 while (current_size < size) {
5986 ulint n_bytes;
5987
5988 if (size - current_size < (os_offset_t) buf_size) {
5989 n_bytes = (ulint) (size - current_size);
5990 } else {
5991 n_bytes = buf_size;
5992 }
5993
5994 dberr_t err;
5995 IORequest request(IORequest::WRITE);
5996
5997 #ifdef UNIV_HOTBACKUP
5998
5999 err = os_file_write(
6000 request, name, file, buf, current_size, n_bytes);
6001 #else
6002 /* Using OS_AIO_SYNC mode on POSIX systems will result in
6003 fall back to os_file_write/read. On Windows it will use
6004 special mechanism to wait before it returns back. */
6005
6006 err = os_aio(
6007 request,
6008 OS_AIO_SYNC, name,
6009 file, buf, current_size, n_bytes,
6010 read_only, NULL, NULL);
6011 #endif /* UNIV_HOTBACKUP */
6012
6013 if (err != DB_SUCCESS) {
6014
6015 ut_free(buf2);
6016 return(false);
6017 }
6018
6019 /* Print about progress for each 100 MB written */
6020 if ((current_size + n_bytes) / (100 << 20)
6021 != current_size / (100 << 20)) {
6022
6023 fprintf(stderr, " %lu00",
6024 (ulong) ((current_size + n_bytes)
6025 / (100 << 20)));
6026 }
6027
6028 current_size += n_bytes;
6029 }
6030
6031 if (size >= (os_offset_t) 100 << 20) {
6032
6033 fprintf(stderr, "\n");
6034 }
6035
6036 ut_free(buf2);
6037
6038 return(os_file_flush(file));
6039 }
6040
6041 /** Truncates a file to a specified size in bytes.
6042 Do nothing if the size to preserve is greater or equal to the current
6043 size of the file.
6044 @param[in] pathname file path
6045 @param[in] file file to be truncated
6046 @param[in] size size to preserve in bytes
6047 @return true if success */
6048 bool
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)6049 os_file_truncate(
6050 const char* pathname,
6051 pfs_os_file_t file,
6052 os_offset_t size)
6053 {
6054 /* Do nothing if the size preserved is larger than or equal to the
6055 current size of file */
6056 os_offset_t size_bytes = os_file_get_size(file);
6057
6058 if (size >= size_bytes) {
6059 return(true);
6060 }
6061
6062 #ifdef _WIN32
6063 return(os_file_truncate_win32(pathname, file, size));
6064 #else /* _WIN32 */
6065 return(os_file_truncate_posix(pathname, file, size));
6066 #endif /* _WIN32 */
6067 }
6068
6069 /** NOTE! Use the corresponding macro os_file_read(), not directly this
6070 function!
6071 Requests a synchronous positioned read operation.
6072 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6073 @param[in] type IO flags
6074 @param[in] file handle to an open file
6075 @param[out] buf buffer where to read
6076 @param[in] offset file offset from the start where to read
6077 @param[in] n number of bytes to read, starting from offset
6078 @return DB_SUCCESS or error code */
6079 dberr_t
os_file_read_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n)6080 os_file_read_func(
6081 IORequest& type,
6082 os_file_t file,
6083 void* buf,
6084 os_offset_t offset,
6085 ulint n)
6086 {
6087 ut_ad(type.is_read());
6088
6089 return(os_file_read_page(type, file, buf, offset, n, NULL, true));
6090 }
6091
6092 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
6093 not directly this function!
6094 Requests a synchronous positioned read operation.
6095 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6096 @param[in] type IO flags
6097 @param[in] file handle to an open file
6098 @param[out] buf buffer where to read
6099 @param[in] offset file offset from the start where to read
6100 @param[in] n number of bytes to read, starting from offset
6101 @param[out] o number of bytes actually read
6102 @return DB_SUCCESS or error code */
6103 dberr_t
os_file_read_no_error_handling_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)6104 os_file_read_no_error_handling_func(
6105 IORequest& type,
6106 os_file_t file,
6107 void* buf,
6108 os_offset_t offset,
6109 ulint n,
6110 ulint* o)
6111 {
6112 ut_ad(type.is_read());
6113
6114 return(os_file_read_page(type, file, buf, offset, n, o, false));
6115 }
6116
6117 /** NOTE! Use the corresponding macro os_file_write(), not directly
6118 Requests a synchronous write operation.
6119 @param[in] type IO flags
6120 @param[in] file handle to an open file
6121 @param[out] buf buffer from which to write
6122 @param[in] offset file offset from the start where to read
6123 @param[in] n number of bytes to read, starting from offset
6124 @return DB_SUCCESS if request was successful, false if fail */
6125 dberr_t
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)6126 os_file_write_func(
6127 IORequest& type,
6128 const char* name,
6129 os_file_t file,
6130 const void* buf,
6131 os_offset_t offset,
6132 ulint n)
6133 {
6134 ut_ad(type.validate());
6135 ut_ad(type.is_write());
6136
6137 /* We never compress the first page.
6138 Note: This assumes we always do block IO. */
6139 if (offset == 0) {
6140 type.clear_compressed();
6141 }
6142
6143 const byte* ptr = reinterpret_cast<const byte*>(buf);
6144
6145 return(os_file_write_page(type, name, file, ptr, offset, n));
6146 }
6147
6148 /** Check the existence and type of the given file.
6149 @param[in] path path name of file
6150 @param[out] exists true if the file exists
6151 @param[out] type Type of the file, if it exists
6152 @return true if call succeeded */
6153 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)6154 os_file_status(
6155 const char* path,
6156 bool* exists,
6157 os_file_type_t* type)
6158 {
6159 #ifdef _WIN32
6160 return(os_file_status_win32(path, exists, type));
6161 #else
6162 return(os_file_status_posix(path, exists, type));
6163 #endif /* _WIN32 */
6164 }
6165
6166 /** Free storage space associated with a section of the file.
6167 @param[in] fh Open file handle
6168 @param[in] off Starting offset (SEEK_SET)
6169 @param[in] len Size of the hole
6170 @return DB_SUCCESS or error code */
6171 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)6172 os_file_punch_hole(
6173 os_file_t fh,
6174 os_offset_t off,
6175 os_offset_t len)
6176 {
6177 /* In this debugging mode, we act as if punch hole is supported,
6178 and then skip any calls to actually punch a hole here.
6179 In this way, Transparent Page Compression is still being tested. */
6180 DBUG_EXECUTE_IF("ignore_punch_hole",
6181 return(DB_SUCCESS);
6182 );
6183
6184 #ifdef _WIN32
6185 return(os_file_punch_hole_win32(fh, off, len));
6186 #else
6187 return(os_file_punch_hole_posix(fh, off, len));
6188 #endif /* _WIN32 */
6189 }
6190
6191 /** Check if the file system supports sparse files.
6192
6193 Warning: On POSIX systems we try and punch a hole from offset 0 to
6194 the system configured page size. This should only be called on an empty
6195 file.
6196
6197 Note: On Windows we use the name and on Unices we use the file handle.
6198
6199 @param[in] name File name
6200 @param[in] fh File handle for the file - if opened
6201 @return true if the file system supports sparse files */
6202 bool
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)6203 os_is_sparse_file_supported(const char* path, pfs_os_file_t fh)
6204 {
6205 /* In this debugging mode, we act as if punch hole is supported,
6206 then we skip any calls to actually punch a hole. In this way,
6207 Transparent Page Compression is still being tested. */
6208 DBUG_EXECUTE_IF("ignore_punch_hole",
6209 return(true);
6210 );
6211
6212 #ifdef _WIN32
6213 return(os_is_sparse_file_supported_win32(path));
6214 #else
6215 dberr_t err;
6216
6217 /* We don't know the FS block size, use the sector size. The FS
6218 will do the magic. */
6219 err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
6220
6221 return(err == DB_SUCCESS);
6222 #endif /* _WIN32 */
6223 }
6224
6225 /** This function returns information about the specified file
6226 @param[in] path pathname of the file
6227 @param[out] stat_info information of a file in a directory
6228 @param[in] check_rw_perm for testing whether the file can be opened
6229 in RW mode
6230 @param[in] read_only true if file is opened in read-only mode
6231 @return DB_SUCCESS if all OK */
6232 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)6233 os_file_get_status(
6234 const char* path,
6235 os_file_stat_t* stat_info,
6236 bool check_rw_perm,
6237 bool read_only)
6238 {
6239 dberr_t ret;
6240
6241 #ifdef _WIN32
6242 struct _stat64 info;
6243
6244 ret = os_file_get_status_win32(
6245 path, stat_info, &info, check_rw_perm, read_only);
6246
6247 #else
6248 struct stat info;
6249
6250 ret = os_file_get_status_posix(
6251 path, stat_info, &info, check_rw_perm, read_only);
6252
6253 #endif /* _WIN32 */
6254
6255 if (ret == DB_SUCCESS) {
6256 stat_info->ctime = info.st_ctime;
6257 stat_info->atime = info.st_atime;
6258 stat_info->mtime = info.st_mtime;
6259 stat_info->size = info.st_size;
6260 }
6261
6262 return(ret);
6263 }
6264
6265 /**
6266 Waits for an AIO operation to complete. This function is used to wait the
6267 for completed requests. The aio array of pending requests is divided
6268 into segments. The thread specifies which segment or slot it wants to wait
6269 for. NOTE: this function will also take care of freeing the aio slot,
6270 therefore no other thread is allowed to do the freeing!
6271 @param[in] segment The number of the segment in the aio arrays to
6272 wait for; segment 0 is the ibuf I/O thread,
6273 segment 1 the log I/O thread, then follow the
6274 non-ibuf read threads, and as the last are the
6275 non-ibuf write threads; if this is
6276 ULINT_UNDEFINED, then it means that sync AIO
6277 is used, and this parameter is ignored
6278 @param[out] m1 the messages passed with the AIO request; note
6279 that also in the case where the AIO operation
6280 failed, these output parameters are valid and
6281 can be used to restart the operation,
6282 for example
6283 @param[out] m2 callback message
6284 @param[out] type OS_FILE_WRITE or ..._READ
6285 @return DB_SUCCESS or error code */
6286 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6287 os_aio_handler(
6288 ulint segment,
6289 fil_node_t** m1,
6290 void** m2,
6291 IORequest* request)
6292 {
6293 dberr_t err;
6294
6295 if (srv_use_native_aio) {
6296 srv_set_io_thread_op_info(segment, "native aio handle");
6297
6298 #ifdef WIN_ASYNC_IO
6299
6300 err = os_aio_windows_handler(segment, 0, m1, m2, request);
6301
6302 #elif defined(LINUX_NATIVE_AIO)
6303
6304 err = os_aio_linux_handler(segment, m1, m2, request);
6305
6306 #else
6307 ut_error;
6308
6309 err = DB_ERROR; /* Eliminate compiler warning */
6310
6311 #endif /* WIN_ASYNC_IO */
6312
6313 } else {
6314 srv_set_io_thread_op_info(segment, "simulated aio handle");
6315
6316 err = os_aio_simulated_handler(segment, m1, m2, request);
6317 }
6318
6319 return(err);
6320 }
6321
/** Constructor. Creates the mutex and the two events of the array and
leaves the array in the "empty" state. Both n and segments must be greater
than zero.
@param[in]	id		The latch ID
@param[in]	n		Number of AIO slots
@param[in]	segments	Number of segments */
AIO::AIO(
	latch_id_t	id,
	ulint		n,
	ulint		segments)
	:
	m_slots(n),
	m_n_segments(segments),
	m_n_reserved()
# ifdef LINUX_NATIVE_AIO
	,m_aio_ctx(),
	m_events(m_slots.size())
# elif defined(_WIN32)
	,m_handles()
# endif /* LINUX_NATIVE_AIO */
{
	ut_a(n > 0);
	ut_a(m_n_segments > 0);

	mutex_create(id, &m_mutex);

	m_not_full = os_event_create("aio_not_full");
	m_is_empty = os_event_create("aio_is_empty");

	/* Put every slot into the default-constructed state. */
	std::uninitialized_fill(m_slots.begin(), m_slots.end(), Slot());
#ifdef LINUX_NATIVE_AIO
	/* Zero the whole event array. */
	memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
#endif /* LINUX_NATIVE_AIO */

	/* Nothing has been reserved yet, so the array starts out empty. */
	os_event_set(m_is_empty);
}
6356
6357 /** Initialise the slots */
6358 dberr_t
init_slots()6359 AIO::init_slots()
6360 {
6361 for (ulint i = 0; i < m_slots.size(); ++i) {
6362 Slot& slot = m_slots[i];
6363
6364 slot.pos = static_cast<uint16_t>(i);
6365
6366 slot.is_reserved = false;
6367
6368 #ifdef WIN_ASYNC_IO
6369
6370 slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6371
6372 OVERLAPPED* over = &slot.control;
6373
6374 over->hEvent = slot.handle;
6375
6376 (*m_handles)[i] = over->hEvent;
6377
6378 #elif defined(LINUX_NATIVE_AIO)
6379
6380 slot.ret = 0;
6381
6382 slot.n_bytes = 0;
6383
6384 memset(&slot.control, 0x0, sizeof(slot.control));
6385
6386 #endif /* WIN_ASYNC_IO */
6387 }
6388
6389 return(DB_SUCCESS);
6390 }
6391
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface: allocate the io_context
array and create one io_context per segment.
@return DB_SUCCESS, DB_OUT_OF_MEMORY if the array cannot be allocated, or
DB_IO_ERROR if an io_context cannot be created */
dberr_t
AIO::init_linux_native_aio()
{
	/* The array must not have been set up before. */
	ut_a(m_aio_ctx == NULL);

	m_aio_ctx = static_cast<io_context**>(
		ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));

	if (m_aio_ctx == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	const ulint	max_events = slots_per_segment();

	for (ulint seg = 0; seg < m_n_segments; ++seg) {

		if (linux_create_io_ctx(max_events, &m_aio_ctx[seg])) {
			continue;
		}

		/* If something bad happened during aio setup
		we should call it a day and return right away.
		We don't care about any leaks because a failure
		to initialize the io subsystem means that the
		server (or atleast the innodb storage engine)
		is not going to startup. */
		return(DB_IO_ERROR);
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
6428
6429 /** Initialise the array */
6430 dberr_t
init()6431 AIO::init()
6432 {
6433 ut_a(!m_slots.empty());
6434
6435 #ifdef _WIN32
6436 ut_a(m_handles == NULL);
6437
6438 m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
6439 #endif /* _WIN32 */
6440
6441 if (srv_use_native_aio) {
6442 #ifdef LINUX_NATIVE_AIO
6443 dberr_t err = init_linux_native_aio();
6444
6445 if (err != DB_SUCCESS) {
6446 return(err);
6447 }
6448
6449 #endif /* LINUX_NATIVE_AIO */
6450 }
6451
6452 return(init_slots());
6453 }
6454
6455 /** Creates an aio wait array. Note that we return NULL in case of failure.
6456 We don't care about freeing memory here because we assume that a
6457 failure will result in server refusing to start up.
6458 @param[in] id Latch ID
6459 @param[in] n maximum number of pending AIO operations
6460 allowed; n must be divisible by m_n_segments
6461 @param[in] n_segments number of segments in the AIO array
6462 @return own: AIO array, NULL on failure */
6463 AIO*
create(latch_id_t id,ulint n,ulint n_segments)6464 AIO::create(
6465 latch_id_t id,
6466 ulint n,
6467 ulint n_segments)
6468 {
6469 if ((n % n_segments)) {
6470
6471 ib::error()
6472 << "Maximum number of AIO operations must be "
6473 << "divisible by number of segments";
6474
6475 return(NULL);
6476 }
6477
6478 AIO* array = UT_NEW_NOKEY(AIO(id, n, n_segments));
6479
6480 if (array != NULL && array->init() != DB_SUCCESS) {
6481
6482 UT_DELETE(array);
6483
6484 array = NULL;
6485 }
6486
6487 return(array);
6488 }
6489
6490 /** AIO destructor */
~AIO()6491 AIO::~AIO()
6492 {
6493 #ifdef WIN_ASYNC_IO
6494 for (ulint i = 0; i < m_slots.size(); ++i) {
6495 CloseHandle(m_slots[i].handle);
6496 }
6497 #endif /* WIN_ASYNC_IO */
6498
6499 #ifdef _WIN32
6500 UT_DELETE(m_handles);
6501 #endif /* _WIN32 */
6502
6503 mutex_destroy(&m_mutex);
6504
6505 os_event_destroy(m_not_full);
6506 os_event_destroy(m_is_empty);
6507
6508 #if defined(LINUX_NATIVE_AIO)
6509 if (srv_use_native_aio) {
6510 m_events.clear();
6511 ut_free(m_aio_ctx);
6512 }
6513 #endif /* LINUX_NATIVE_AIO */
6514
6515 m_slots.clear();
6516 }
6517
/** Initializes the asynchronous io system. Creates one array each for ibuf
and log i/o (unless the server is in read-only mode). Also creates one
array each for read and write where each array is divided logically into
n_readers and n_writers respectively. The caller must create an i/o handler
thread for each segment in these arrays. This function also creates the
sync array. No i/o handler thread needs to be created for that.
Arrays created before a failure are not freed here.
@param[in]	n_per_seg	maximum number of pending aio
				operations allowed per segment
@param[in]	n_readers	number of reader threads
@param[in]	n_writers	number of writer threads
@param[in]	n_slots_sync	number of slots in the sync aio array
@return true if the AIO sub-system was started successfully */
bool
AIO::start(
	ulint		n_per_seg,
	ulint		n_readers,
	ulint		n_writers,
	ulint		n_slots_sync)
{
#if defined(LINUX_NATIVE_AIO)
	/* Check if native aio is supported on this system and tmpfs */
	if (srv_use_native_aio && !is_linux_native_aio_supported()) {

		ib::warn() << "Linux Native AIO disabled.";

		srv_use_native_aio = FALSE;
	}
#endif /* LINUX_NATIVE_AIO */

	srv_reset_io_thread_op_info();

	s_reads = create(
		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);

	if (s_reads == NULL) {
		return(false);
	}

	/* Outside read-only mode the first two global segments are taken by
	the ibuf and log arrays, so the read segments start at 2. */
	ulint	start = srv_read_only_mode ? 0 : 2;
	ulint	n_segs = n_readers + start;

	/* 0 is the ibuf segment and 1 is the redo log segment. */
	for (ulint i = start; i < n_segs; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "read thread";
	}

	/* Running total of segments over all arrays created so far. */
	ulint	n_segments = n_readers;

	if (!srv_read_only_mode) {

		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);

		if (s_ibuf == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[0] = "insert buffer thread";

		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);

		if (s_log == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[1] = "log thread";

	} else {
		/* No ibuf or log array in read-only mode. */
		s_ibuf = s_log = NULL;
	}

	s_writes = create(
		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);

	if (s_writes == NULL) {
		return(false);
	}

	n_segments += n_writers;

	/* The write segments follow the read segments. */
	for (ulint i = start + n_readers; i < n_segments; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "write thread";
	}

	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));

	/* The sync array is a single segment that is not counted in
	n_segments. */
	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);

	if (s_sync == NULL) {

		return(false);
	}

	os_aio_n_segments = n_segments;

	os_aio_validate();

	/* One wait event per segment. */
	os_aio_segment_wait_events = static_cast<os_event_t*>(
		ut_zalloc_nokey(
			n_segments * sizeof *os_aio_segment_wait_events));

	if (os_aio_segment_wait_events == NULL) {

		return(false);
	}

	for (ulint i = 0; i < n_segments; ++i) {
		os_aio_segment_wait_events[i] = os_event_create(0);
	}

	os_last_printout = ut_time_monotonic();

	return(true);
}
6637
6638 /** Free the AIO arrays */
6639 void
shutdown()6640 AIO::shutdown()
6641 {
6642 UT_DELETE(s_ibuf);
6643 s_ibuf = NULL;
6644
6645 UT_DELETE(s_log);
6646 s_log = NULL;
6647
6648 UT_DELETE(s_writes);
6649 s_writes = NULL;
6650
6651 UT_DELETE(s_sync);
6652 s_sync = NULL;
6653
6654 UT_DELETE(s_reads);
6655 s_reads = NULL;
6656 }
6657
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)

/** Max disk sector size */
static const ulint	MAX_SECTOR_SIZE = 4096;

/**
Try and get the FusionIO sector size. When one of the O_DIRECT flush methods
is configured, a temporary file is created in the data home directory and
written with increasing alignments; the first alignment that is accepted is
stored in os_io_ptr_align. */
void
os_fusionio_get_sector_size()
{
	if (srv_unix_file_flush_method != SRV_UNIX_O_DIRECT
	    && srv_unix_file_flush_method != SRV_UNIX_O_DIRECT_NO_FSYNC) {

		/* Nothing to probe without O_DIRECT. */
		return;
	}

	char	current_dir[3];
	char*	path = srv_data_home;

	/* If the srv_data_home is empty, set the path to
	current dir. */
	if (*path == 0) {
		current_dir[0] = FN_CURLIB;
		current_dir[1] = FN_LIBCHAR;
		current_dir[2] = 0;
		path = current_dir;
	}

	/* Get the path of data file */
	const char*	dir_end = strrchr(path, OS_PATH_SEPARATOR);
	ulint		dir_len;

	if (dir_end != NULL) {
		dir_len = static_cast<ulint>(dir_end - path);
	} else {
		dir_len = strlen(path);
	}

	/* allocate a new path and move the directory path to it; sizeof
	of the literal accounts for the terminating NUL. */
	const ulint	check_path_len = dir_len
		+ sizeof "/check_sector_size";

	char*	check_file_name = static_cast<char*>(
		ut_zalloc_nokey(check_path_len));

	memcpy(check_file_name, path, dir_len);

	/* Construct a check file name. */
	strcat(check_file_name + dir_len, "/check_sector_size");

	/* Create a tmp file for checking sector size. */
	const os_file_t	check_file = ::open(
		check_file_name,
		O_CREAT|O_TRUNC|O_WRONLY|O_DIRECT,
		S_IRWXU);

	if (check_file == -1) {
		ib::error()
			<< "Failed to create check sector file, errno:"
			<< errno << " Please confirm O_DIRECT is"
			<< " supported and remove the file "
			<< check_file_name << " if it exists.";
		ut_free(check_file_name);
		errno = 0;
		return;
	}

	/* Try to write the file with different sector size
	alignment. The buffer is twice the maximum so that an aligned
	block of any candidate size fits into it. */
	byte*	ptr = static_cast<byte*>(
		ut_malloc_nokey(2 * MAX_SECTOR_SIZE));

	ulint	sector_size;

	for (sector_size = UNIV_SECTOR_SIZE;
	     sector_size <= MAX_SECTOR_SIZE;
	     sector_size *= 2) {

		byte*	block_ptr = static_cast<byte*>(
			ut_align(ptr, sector_size));

		const ssize_t	ret = pwrite(
			check_file, block_ptr, sector_size, 0);

		if (ret > 0 && (ulint) ret == sector_size) {
			/* This alignment was accepted. */
			break;
		}
	}

	/* The sector size should <= MAX_SECTOR_SIZE. */
	ut_ad(sector_size <= MAX_SECTOR_SIZE);

	close(check_file);
	unlink(check_file_name);

	ut_free(check_file_name);
	ut_free(ptr);
	errno = 0;

	os_io_ptr_align = sector_size;
}
#endif /* !NO_FALLOCATE && UNIV_LINUX */
6749
6750 /** Initializes the asynchronous io system. Creates one array each for ibuf
6751 and log i/o. Also creates one array each for read and write where each
6752 array is divided logically into n_readers and n_writers
6753 respectively. The caller must create an i/o handler thread for each
6754 segment in these arrays. This function also creates the sync array.
6755 No i/o handler thread needs to be created for that
6756 @param[in] n_readers number of reader threads
6757 @param[in] n_writers number of writer threads
6758 @param[in] n_slots_sync number of slots in the sync aio array */
6759 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)6760 os_aio_init(
6761 ulint n_readers,
6762 ulint n_writers,
6763 ulint n_slots_sync)
6764 {
6765 /* Maximum number of pending aio operations allowed per segment */
6766 ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6767
6768 #ifdef _WIN32
6769 if (srv_use_native_aio) {
6770 limit = SRV_N_PENDING_IOS_PER_THREAD;
6771 }
6772 #endif /* _WIN32 */
6773
6774 ut_a(block_cache == NULL);
6775
6776 block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
6777
6778 for (Blocks::iterator it = block_cache->begin();
6779 it != block_cache->end();
6780 ++it) {
6781
6782 ut_a(it->m_in_use == 0);
6783 ut_a(it->m_ptr == NULL);
6784
6785 /* Allocate double of max page size memory, since
6786 compress could generate more bytes than orgininal
6787 data. */
6788 it->m_ptr = static_cast<byte*>(
6789 ut_malloc_nokey(BUFFER_BLOCK_SIZE));
6790
6791 ut_a(it->m_ptr != NULL);
6792 }
6793
6794 /* Get sector size for DIRECT_IO. In this case, we need to
6795 know the sector size for aligning the write buffer. */
6796 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
6797 os_fusionio_get_sector_size();
6798 #endif /* !NO_FALLOCATE && UNIV_LINUX */
6799
6800 return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
6801 }
6802
6803 /** Frees the asynchronous io system. */
6804 void
os_aio_free()6805 os_aio_free()
6806 {
6807 AIO::shutdown();
6808
6809 for (ulint i = 0; i < os_aio_n_segments; i++) {
6810 os_event_destroy(os_aio_segment_wait_events[i]);
6811 }
6812
6813 ut_free(os_aio_segment_wait_events);
6814 os_aio_segment_wait_events = 0;
6815 os_aio_n_segments = 0;
6816
6817 for (Blocks::iterator it = block_cache->begin();
6818 it != block_cache->end();
6819 ++it) {
6820
6821 ut_a(it->m_in_use == 0);
6822 ut_free(it->m_ptr);
6823 }
6824
6825 UT_DELETE(block_cache);
6826
6827 block_cache = NULL;
6828 }
6829
6830 /** Wakes up all async i/o threads so that they know to exit themselves in
6831 shutdown. */
6832 void
os_aio_wake_all_threads_at_shutdown()6833 os_aio_wake_all_threads_at_shutdown()
6834 {
6835 #ifdef WIN_ASYNC_IO
6836
6837 AIO::wake_at_shutdown();
6838
6839 #elif defined(LINUX_NATIVE_AIO)
6840
6841 /* When using native AIO interface the io helper threads
6842 wait on io_getevents with a timeout value of 500ms. At
6843 each wake up these threads check the server status.
6844 No need to do anything to wake them up. */
6845
6846 if (srv_use_native_aio) {
6847 return;
6848 }
6849
6850 #endif /* !WIN_ASYNC_AIO */
6851
6852 /* Fall through to simulated AIO handler wakeup if we are
6853 not using native AIO. */
6854
6855 /* This loop wakes up all simulated ai/o threads */
6856
6857 for (ulint i = 0; i < os_aio_n_segments; ++i) {
6858
6859 os_event_set(os_aio_segment_wait_events[i]);
6860 }
6861 }
6862
/** Waits until there are no pending writes in AIO::s_writes. There can
be other, synchronous, pending writes.
This is a free-function entry point that simply forwards to
AIO::wait_until_no_pending_writes(). */
void
os_aio_wait_until_no_pending_writes()
{
	AIO::wait_until_no_pending_writes();
}
6870
6871 /** Calculates segment number for a slot.
6872 @param[in] array AIO wait array
6873 @param[in] slot slot in this array
6874 @return segment number (which is the number used by, for example,
6875 I/O-handler threads) */
6876 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)6877 AIO::get_segment_no_from_slot(
6878 const AIO* array,
6879 const Slot* slot)
6880 {
6881 ulint segment;
6882 ulint seg_len;
6883
6884 if (array == s_ibuf) {
6885 ut_ad(!srv_read_only_mode);
6886
6887 segment = IO_IBUF_SEGMENT;
6888
6889 } else if (array == s_log) {
6890 ut_ad(!srv_read_only_mode);
6891
6892 segment = IO_LOG_SEGMENT;
6893
6894 } else if (array == s_reads) {
6895 seg_len = s_reads->slots_per_segment();
6896
6897 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6898 } else {
6899 ut_a(array == s_writes);
6900
6901 seg_len = s_writes->slots_per_segment();
6902
6903 segment = s_reads->m_n_segments
6904 + (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6905 }
6906
6907 return(segment);
6908 }
6909
/** Requests for a slot in the aio array. If no slot is available, waits until
not_full-event becomes signaled.

The search for a free slot starts in a local segment derived from the file
offset and wraps around the whole array. Once a slot is found it is marked
reserved and filled in with the request. When native AIO is in use, writes
at a non-zero offset are additionally passed through
os_file_compress_page() and/or os_file_encrypt_page() (in that order)
before the platform specific control block is prepared.

@param[in,out]	type	IO context
@param[in,out]	m1	message to be passed along with the AIO
			operation
@param[in,out]	m2	message to be passed along with the AIO
			operation
@param[in]	file	file handle
@param[in]	name	name of the file or path as a NUL-terminated
			string
@param[in,out]	buf	buffer where to read or from which to write
@param[in]	offset	file offset, where to read from or start writing
@param[in]	len	length of the block to read or write
@return pointer to slot */
Slot*
AIO::reserve_slot(
	IORequest&	type,
	fil_node_t*	m1,
	void*		m2,
	pfs_os_file_t	file,
	const char*	name,
	void*		buf,
	os_offset_t	offset,
	ulint		len)
{
#ifdef WIN_ASYNC_IO
	/* The length is later stored in a DWORD, so it must fit in
	32 bits. */
	ut_a((len & 0xFFFFFFFFUL) == len);
#endif /* WIN_ASYNC_IO */

	/* No need of a mutex. Only reading constant fields */
	ulint		slots_per_seg;

	ut_ad(type.validate());

	slots_per_seg = slots_per_segment();

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO */
	ulint		local_seg;

	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;

	/* Loop until the array has at least one free slot. The loop is
	left with acquire() still in effect. */
	for (;;) {

		acquire();

		if (m_n_reserved != m_slots.size()) {
			break;
		}

		release();

		if (!srv_use_native_aio) {
			/* If the handler threads are suspended,
			wake them so that we get more slots */

			os_aio_simulated_wake_handler_threads();
		}

		os_event_wait(m_not_full);
	}

	ulint		counter = 0;
	Slot*		slot = NULL;

	/* We start our search for an available slot from our preferred
	local segment and do a full scan of the array. We are
	guaranteed to find a slot in full scan. */
	for (ulint i = local_seg * slots_per_seg;
	     counter < m_slots.size();
	     ++i, ++counter) {

		/* Wrap around to the start of the array. */
		i %= m_slots.size();

		slot = at(i);

		if (slot->is_reserved == false) {
			break;
		}
	}

	/* We MUST always be able to get hold of a free slot, because the
	wait loop above only exits when not all slots are reserved. */
	ut_a(counter < m_slots.size());

	ut_a(slot->is_reserved == false);

	++m_n_reserved;

	/* First reservation: the array is no longer empty. */
	if (m_n_reserved == 1) {
		os_event_reset(m_is_empty);
	}

	/* Last free slot taken: later callers have to wait on
	m_not_full. */
	if (m_n_reserved == m_slots.size()) {
		os_event_reset(m_not_full);
	}

	/* Record the request in the slot. */
	slot->is_reserved = true;
	slot->reservation_time = ut_time_monotonic();
	slot->m1       = m1;
	slot->m2       = m2;
	slot->file     = file;
	slot->name     = name;
#ifdef _WIN32
	slot->len      = static_cast<DWORD>(len);
#else
	slot->len      = static_cast<ulint>(len);
#endif /* _WIN32 */
	slot->type     = type;
	slot->buf      = static_cast<byte*>(buf);
	slot->ptr      = slot->buf;
	slot->offset   = offset;
	slot->err      = DB_SUCCESS;
	slot->original_len = static_cast<uint32>(len);
	slot->io_already_done = false;
	slot->buf_block = NULL;

	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_compressed()) {
		ulint	compressed_len = len;

		ut_ad(!type.is_log());

		/* release() is called around the compression call and
		acquire() again afterwards; the slot is already marked
		reserved at this point. */
		release();

		void*	src_buf = slot->buf;
		slot->buf_block = os_file_compress_page(
			type,
			src_buf,
			&compressed_len);

		/* NOTE(review): src_buf is written back to the slot, so
		the call presumably may redirect it to the compressed
		data - confirm against os_file_compress_page(). */
		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;
#ifdef _WIN32
		slot->len = static_cast<DWORD>(compressed_len);
#else
		slot->len = static_cast<ulint>(compressed_len);
#endif /* _WIN32 */
		slot->skip_punch_hole = !type.punch_hole();

		acquire();
	}

	/* We do encryption after compression, since if we do encryption
	before compression, the encrypted data will cause compression fail
	or low compression rate. */
	if (srv_use_native_aio
	    && offset > 0
	    && type.is_write()
	    && type.is_encrypted()) {
		/* Start from the slot length, which is the compressed
		length if the branch above ran. */
		ulint		encrypted_len = slot->len;
		Block*		encrypted_block;

		ut_ad(!type.is_log());

		release();

		void*	src_buf = slot->buf;
		encrypted_block = os_file_encrypt_page(
			type,
			src_buf,
			&encrypted_len);

		/* A block left over from the compression step is freed
		here and replaced by the encryption block. */
		if (slot->buf_block != NULL) {
			os_free_block(slot->buf_block);
		}

		slot->buf_block = encrypted_block;
		slot->buf = static_cast<byte*>(src_buf);
		slot->ptr = slot->buf;

#ifdef _WIN32
		slot->len = static_cast<DWORD>(encrypted_len);
#else
		slot->len = static_cast<ulint>(encrypted_len);
#endif /* _WIN32 */

		acquire();
	}

#ifdef WIN_ASYNC_IO
	{
		OVERLAPPED*	control;

		/* Split the 64-bit file offset into the two 32-bit
		halves of the OVERLAPPED structure. */
		control = &slot->control;
		control->Offset = (DWORD) offset & 0xFFFFFFFF;
		control->OffsetHigh = (DWORD) (offset >> 32);

		ResetEvent(slot->handle);
	}
#elif defined(LINUX_NATIVE_AIO)

	/* If we are not using native AIO skip this part. */
	if (srv_use_native_aio) {

		off_t		aio_offset;

		/* Check if we are dealing with 64 bit arch.
		If not then make sure that offset fits in 32 bits. */
		aio_offset = (off_t) offset;

		ut_a(sizeof(aio_offset) >= sizeof(offset)
		     || ((os_offset_t) aio_offset) == offset);

		struct iocb*	iocb = &slot->control;

		if (type.is_read()) {
			io_prep_pread(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		} else {
			ut_ad(type.is_write());
			io_prep_pwrite(
				iocb, file.m_file, slot->ptr, slot->len, aio_offset);
		}

		/* Store the slot in the control block's data field. */
		iocb->data = slot;

		slot->n_bytes = 0;
		slot->ret = 0;
	}
#endif /* LINUX_NATIVE_AIO */

	release();

	return(slot);
}
7139
7140 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
7141 @param[in] global_segment The number of the segment in the AIO arrays */
7142 void
wake_simulated_handler_thread(ulint global_segment)7143 AIO::wake_simulated_handler_thread(ulint global_segment)
7144 {
7145 ut_ad(!srv_use_native_aio);
7146
7147 AIO* array;
7148 ulint segment = get_array_and_local_segment(&array, global_segment);
7149
7150 array->wake_simulated_handler_thread(global_segment, segment);
7151 }
7152
7153 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
7154 for a local segment in the AIO array.
7155 @param[in] global_segment The number of the segment in the AIO arrays
7156 @param[in] segment The local segment in the AIO array */
7157 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)7158 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
7159 {
7160 ut_ad(!srv_use_native_aio);
7161
7162 ulint n = slots_per_segment();
7163 ulint offset = segment * n;
7164
7165 /* Look through n slots after the segment * n'th slot */
7166
7167 acquire();
7168
7169 const Slot* slot = at(offset);
7170
7171 for (ulint i = 0; i < n; ++i, ++slot) {
7172
7173 if (slot->is_reserved) {
7174
7175 /* Found an i/o request */
7176
7177 release();
7178
7179 os_event_t event;
7180
7181 event = os_aio_segment_wait_events[global_segment];
7182
7183 os_event_set(event);
7184
7185 return;
7186 }
7187 }
7188
7189 release();
7190 }
7191
7192 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
7193 void
os_aio_simulated_wake_handler_threads()7194 os_aio_simulated_wake_handler_threads()
7195 {
7196 if (srv_use_native_aio) {
7197 /* We do not use simulated aio: do nothing */
7198
7199 return;
7200 }
7201
7202 os_aio_recommend_sleep_for_read_threads = false;
7203
7204 for (ulint i = 0; i < os_aio_n_segments; i++) {
7205 AIO::wake_simulated_handler_thread(i);
7206 }
7207 }
7208
7209 /** Select the IO slot array
7210 @param[in] type Type of IO, READ or WRITE
7211 @param[in] read_only true if running in read-only mode
7212 @param[in] mode IO mode
7213 @return slot array or NULL if invalid mode specified */
7214 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)7215 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
7216 {
7217 AIO* array;
7218
7219 ut_ad(type.validate());
7220
7221 switch (mode) {
7222 case OS_AIO_NORMAL:
7223
7224 array = type.is_read() ? AIO::s_reads : AIO::s_writes;
7225 break;
7226
7227 case OS_AIO_IBUF:
7228 ut_ad(type.is_read());
7229
7230 /* Reduce probability of deadlock bugs in connection with ibuf:
7231 do not let the ibuf i/o handler sleep */
7232
7233 type.clear_do_not_wake();
7234
7235 array = read_only ? AIO::s_reads : AIO::s_ibuf;
7236 break;
7237
7238 case OS_AIO_LOG:
7239
7240 array = read_only ? AIO::s_reads : AIO::s_log;
7241 break;
7242
7243 case OS_AIO_SYNC:
7244
7245 array = AIO::s_sync;
7246 #if defined(LINUX_NATIVE_AIO)
7247 /* In Linux native AIO we don't use sync IO array. */
7248 ut_a(!srv_use_native_aio);
7249 #endif /* LINUX_NATIVE_AIO */
7250 break;
7251
7252 default:
7253 ut_error;
7254 array = NULL; /* Eliminate compiler warning */
7255 }
7256
7257 return(array);
7258 }
7259
7260 #ifdef WIN_ASYNC_IO
7261 /** This function is only used in Windows asynchronous i/o.
7262 Waits for an aio operation to complete. This function is used to wait the
7263 for completed requests. The aio array of pending requests is divided
7264 into segments. The thread specifies which segment or slot it wants to wait
7265 for. NOTE: this function will also take care of freeing the aio slot,
7266 therefore no other thread is allowed to do the freeing!
7267 @param[in] segment The number of the segment in the aio arrays to
7268 wait for; segment 0 is the ibuf I/O thread,
7269 segment 1 the log I/O thread, then follow the
7270 non-ibuf read threads, and as the last are the
7271 non-ibuf write threads; if this is
7272 ULINT_UNDEFINED, then it means that sync AIO
7273 is used, and this parameter is ignored
7274 @param[in] pos this parameter is used only in sync AIO:
7275 wait for the aio slot at this position
7276 @param[out] m1 the messages passed with the AIO request; note
7277 that also in the case where the AIO operation
7278 failed, these output parameters are valid and
7279 can be used to restart the operation,
7280 for example
7281 @param[out] m2 callback message
7282 @param[out] type OS_FILE_WRITE or ..._READ
7283 @return DB_SUCCESS or error code */
7284 static
7285 dberr_t
os_aio_windows_handler(ulint segment,ulint pos,fil_node_t ** m1,void ** m2,IORequest * type)7286 os_aio_windows_handler(
7287 ulint segment,
7288 ulint pos,
7289 fil_node_t** m1,
7290 void** m2,
7291 IORequest* type)
7292 {
7293 Slot* slot;
7294 dberr_t err;
7295 AIO* array;
7296 ulint orig_seg = segment;
7297
7298 if (segment == ULINT_UNDEFINED) {
7299 segment = 0;
7300 array = AIO::sync_array();
7301 } else {
7302 segment = AIO::get_array_and_local_segment(&array, segment);
7303 }
7304
7305 /* NOTE! We only access constant fields in os_aio_array. Therefore
7306 we do not have to acquire the protecting mutex yet */
7307
7308 ut_ad(os_aio_validate_skip());
7309
7310 if (array == AIO::sync_array()) {
7311
7312 WaitForSingleObject(array->at(pos)->handle, INFINITE);
7313
7314 } else {
7315 if (orig_seg != ULINT_UNDEFINED) {
7316 srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
7317 }
7318
7319 pos = WaitForMultipleObjects(
7320 (DWORD) array->slots_per_segment(),
7321 array->handles(segment),
7322 FALSE, INFINITE);
7323 }
7324
7325 array->acquire();
7326
7327 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
7328 && array->is_empty()
7329 && !buf_page_cleaner_is_active) {
7330
7331 *m1 = NULL;
7332 *m2 = NULL;
7333
7334 array->release();
7335
7336 return(DB_SUCCESS);
7337 }
7338
7339 ulint n = array->slots_per_segment();
7340
7341 ut_a(pos >= WAIT_OBJECT_0 && pos <= WAIT_OBJECT_0 + n);
7342
7343 slot = array->at(pos + segment * n);
7344
7345 ut_a(slot->is_reserved);
7346
7347 if (orig_seg != ULINT_UNDEFINED) {
7348 srv_set_io_thread_op_info(
7349 orig_seg, "get windows aio return value");
7350 }
7351
7352 BOOL ret;
7353 ret = GetOverlappedResult(
7354 slot->file.m_file, &slot->control, &slot->n_bytes, TRUE);
7355 *m1 = slot->m1;
7356 *m2 = slot->m2;
7357
7358 *type = slot->type;
7359
7360 BOOL retry = FALSE;
7361
7362 if (ret && slot->n_bytes == slot->len) {
7363
7364 err = DB_SUCCESS;
7365
7366 } else if (os_file_handle_error(slot->name, "Windows aio")) {
7367
7368 retry = true;
7369
7370 } else {
7371
7372 err = DB_IO_ERROR;
7373 }
7374
7375 array->release();
7376
7377 if (retry) {
7378 /* Retry failed read/write operation synchronously.
7379 No need to hold array->m_mutex. */
7380
7381 #ifdef UNIV_PFS_IO
7382 /* This read/write does not go through os_file_read
7383 and os_file_write APIs, need to register with
7384 performance schema explicitly here. */
7385 struct PSI_file_locker* locker = NULL;
7386 PSI_file_locker_state state;
7387 register_pfs_file_io_begin(
7388 &state, locker, slot->file, slot->len,
7389 slot->type.is_write()
7390 ? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
7391 #endif /* UNIV_PFS_IO */
7392
7393 ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
7394
7395 ssize_t n_bytes = SyncFileIO::execute(slot);
7396
7397 #ifdef UNIV_PFS_IO
7398 register_pfs_file_io_end(locker, slot->len);
7399 #endif /* UNIV_PFS_IO */
7400
7401 if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
7402 /* AIO was queued successfully!
7403 We want a synchronous I/O operation on a
7404 file where we also use async I/O: in Windows
7405 we must use the same wait mechanism as for
7406 async I/O */
7407
7408 BOOL ret;
7409 ret = GetOverlappedResult(
7410 slot->file.m_file, &slot->control, &slot->n_bytes,
7411 TRUE);
7412 n_bytes = ret ? slot->n_bytes : -1;
7413 }
7414
7415 err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
7416 }
7417
7418 if (err == DB_SUCCESS) {
7419 err = AIOHandler::post_io_processing(slot);
7420 }
7421
7422 array->release_with_mutex(slot);
7423
7424 return(err);
7425 }
7426 #endif /* WIN_ASYNC_IO */
7427
/**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation (read or write, as given by type).

For mode OS_AIO_SYNC the request is executed right away with
os_file_read_func() / os_file_write_func(), except when Windows native
async I/O is in use, where the request is queued and then waited for with
os_aio_windows_handler(). For all other modes a slot is reserved in the
array selected by the mode; with native AIO the request is submitted to the
operating system, with simulated AIO the handler thread of the slot's
segment is woken if type.is_wake() says so.

@param[in,out]	type		IO request context
@param[in]	mode		IO mode
@param[in]	name		Name of the file or path as NUL terminated
				string
@param[in]	file		Open file handle
@param[in,out]	buf		buffer where to read to, or to write from
@param[in]	offset		file offset where to read or write
@param[in]	n		number of bytes to read or write
@param[in]	read_only	if true read only mode checks are enforced
@param[in,out]	m1		Message for the AIO handler, (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in,out]	m2		message for the AIO handler (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
	IORequest&	type,
	ulint		mode,
	const char*	name,
	pfs_os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	bool		read_only,
	fil_node_t*	m1,
	void*		m2)
{
#ifdef WIN_ASYNC_IO
	BOOL		ret = TRUE;
#endif /* WIN_ASYNC_IO */

	/* Length and offset must be multiples of the log block size. */
	ut_ad(n > 0);
	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad(os_aio_validate_skip());

#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits on Windows. */
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */

	if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
	    && !srv_use_native_aio
#endif /* WIN_ASYNC_IO */
	    ) {
		/* This is actually an ordinary synchronous read or write:
		no need to use an i/o-handler thread. NOTE that if we use
		Windows async i/o, Windows does not allow us to use
		ordinary synchronous os_file_read etc. on the same file,
		therefore we have built a special mechanism for synchronous
		wait in the Windows case.
		Also note that the Performance Schema instrumentation has
		been performed by current os_aio_func()'s wrapper function
		pfs_os_aio_func(). So we would no longer need to call
		Performance Schema instrumented os_file_read() and
		os_file_write(). Instead, we should use os_file_read_func()
		and os_file_write_func() */

		if (type.is_read()) {
			return(os_file_read_func(type, file.m_file, buf, offset, n));
		}

		ut_ad(type.is_write());
		return(os_file_write_func(type, name, file.m_file, buf, offset, n));
	}

	/* Re-entered from err_exit when os_file_handle_error() says the
	request should be retried. */
try_again:

	AIO*	array;

	array = AIO::select_slot_array(type, read_only, mode);

	Slot*	slot;

	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);

	if (type.is_read()) {

		if (srv_use_native_aio) {

			++os_n_file_reads;

			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else if (type.is_wake()) {
			/* Simulated AIO: wake the handler thread of the
			segment that owns the slot. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else if (type.is_write()) {

		if (srv_use_native_aio) {
			++os_n_file_writes;

#ifdef WIN_ASYNC_IO
			ret = WriteFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */

		} else if (type.is_wake()) {
			/* Simulated AIO: wake the handler thread of the
			segment that owns the slot. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else {
		/* The request is neither a read nor a write. */
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (srv_use_native_aio) {
		/* Either the call completed right away with the full
		length, or it was queued (ERROR_IO_PENDING). */
		if ((ret && slot->len == slot->n_bytes)
		     || (!ret && GetLastError() == ERROR_IO_PENDING)) {
			/* aio was queued successfully! */

			if (mode == OS_AIO_SYNC) {
				IORequest	dummy_type;
				void*		dummy_mess2;
				struct fil_node_t* dummy_mess1;

				/* We want a synchronous i/o operation on a
				file where we also use async i/o: in Windows
				we must use the same wait mechanism as for
				async i/o */

				return(os_aio_windows_handler(
					ULINT_UNDEFINED, slot->pos,
					&dummy_mess1, &dummy_mess2,
					&dummy_type));
			}

			return(DB_SUCCESS);
		}

		goto err_exit;
	}
#endif /* WIN_ASYNC_IO */

	/* AIO request was queued successfully! */
	return(DB_SUCCESS);

	/* The label exists only in builds that can jump to it; the code
	below it is reached only through those jumps. */
#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */

	/* Give the slot back before deciding whether to retry. */
	array->release_with_mutex(slot);

	if (os_file_handle_error(
		name, type.is_read() ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(DB_IO_ERROR);
}
7598
7599 /** Simulated AIO handler for reaping IO requests */
7600 class SimulatedAIOHandler {
7601
7602 public:
7603
/** Constructor
All counters and pointers are value-initialized, the lowest offset starts
at IB_UINT64_MAX, and the slot vector used for merging is sized to
OS_AIO_MERGE_N_CONSECUTIVE entries.
@param[in,out]	array	The AIO array
@param[in]	segment	Local segment in the array */
SimulatedAIOHandler(AIO* array, ulint segment)
	:
	m_oldest(),
	m_n_elems(),
	m_lowest_offset(IB_UINT64_MAX),
	m_array(array),
	m_n_slots(),
	m_segment(segment),
	m_ptr(),
	m_buf()
{
	/* Debug-only sanity check on the segment number. */
	ut_ad(m_segment < 100);

	m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
}
7622
7623 /** Destructor */
~SimulatedAIOHandler()7624 ~SimulatedAIOHandler()
7625 {
7626 if (m_ptr != NULL) {
7627 ut_free(m_ptr);
7628 }
7629 }
7630
7631 /** Reset the state of the handler
7632 @param[in] n_slots Number of pending AIO operations supported */
init(ulint n_slots)7633 void init(ulint n_slots)
7634 {
7635 m_oldest = 0;
7636 m_n_elems = 0;
7637 m_n_slots = n_slots;
7638 m_lowest_offset = IB_UINT64_MAX;
7639
7640 if (m_ptr != NULL) {
7641 ut_free(m_ptr);
7642 m_ptr = m_buf = NULL;
7643 }
7644
7645 m_slots[0] = NULL;
7646 }
7647
7648 /** Check if there is a slot for which the i/o has already been done
7649 @param[out] n_reserved Number of reserved slots
7650 @return the first completed slot that is found. */
check_completed(ulint * n_reserved)7651 Slot* check_completed(ulint* n_reserved)
7652 {
7653 ulint offset = m_segment * m_n_slots;
7654
7655 *n_reserved = 0;
7656
7657 Slot* slot;
7658
7659 slot = m_array->at(offset);
7660
7661 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7662
7663 if (slot->is_reserved) {
7664
7665 if (slot->io_already_done) {
7666
7667 ut_a(slot->is_reserved);
7668
7669 return(slot);
7670 }
7671
7672 ++*n_reserved;
7673 }
7674 }
7675
7676 return(NULL);
7677 }
7678
7679 /** If there are at least 2 seconds old requests, then pick the
7680 oldest one to prevent starvation. If several requests have the
7681 same age, then pick the one at the lowest offset.
7682 @return true if request was selected */
select()7683 bool select()
7684 {
7685 if (!select_oldest()) {
7686
7687 return(select_lowest_offset());
7688 }
7689
7690 return(true);
7691 }
7692
7693 /** Check if there are several consecutive blocks
7694 to read or write. Merge them if found. */
merge()7695 void merge()
7696 {
7697 /* if m_n_elems != 0, then we have assigned
7698 something valid to consecutive_ios[0] */
7699 ut_ad(m_n_elems != 0);
7700 ut_ad(first_slot() != NULL);
7701
7702 Slot* slot = first_slot();
7703
7704 while (!merge_adjacent(slot)) {
7705 /* No op */
7706 }
7707 }
7708
7709 /** We have now collected n_consecutive I/O requests
7710 in the array; allocate a single buffer which can hold
7711 all data, and perform the I/O
7712 @return the length of the buffer */
allocate_buffer()7713 ulint allocate_buffer()
7714 MY_ATTRIBUTE((warn_unused_result))
7715 {
7716 ulint len;
7717 Slot* slot = first_slot();
7718
7719 ut_ad(m_ptr == NULL);
7720
7721 if (slot->type.is_read() && m_n_elems > 1) {
7722
7723 len = 0;
7724
7725 for (ulint i = 0; i < m_n_elems; ++i) {
7726 len += m_slots[i]->len;
7727 }
7728
7729 m_ptr = static_cast<byte*>(
7730 ut_malloc_nokey(len + UNIV_PAGE_SIZE));
7731
7732 m_buf = static_cast<byte*>(
7733 ut_align(m_ptr, UNIV_PAGE_SIZE));
7734
7735 } else {
7736 len = first_slot()->len;
7737 m_buf = first_slot()->buf;
7738 }
7739
7740 return(len);
7741 }
7742
7743 /** We have to compress the individual pages and punch
7744 holes in them on a page by page basis when writing to
7745 tables that can be compresed at the IO level.
7746 @param[in] len Value returned by allocate_buffer */
copy_to_buffer(ulint len)7747 void copy_to_buffer(ulint len)
7748 {
7749 Slot* slot = first_slot();
7750
7751 if (len > slot->len && slot->type.is_write()) {
7752
7753 byte* ptr = m_buf;
7754
7755 ut_ad(ptr != slot->buf);
7756
7757 /* Copy the buffers to the combined buffer */
7758 for (ulint i = 0; i < m_n_elems; ++i) {
7759
7760 slot = m_slots[i];
7761
7762 memmove(ptr, slot->buf, slot->len);
7763
7764 ptr += slot->len;
7765 }
7766 }
7767 }
7768
7769 /** Do the I/O with ordinary, synchronous i/o functions:
7770 @param[in] len Length of buffer for IO */
io()7771 void io()
7772 {
7773 if (first_slot()->type.is_write()) {
7774
7775 for (ulint i = 0; i < m_n_elems; ++i) {
7776 write(m_slots[i]);
7777 }
7778
7779 } else {
7780
7781 for (ulint i = 0; i < m_n_elems; ++i) {
7782 read(m_slots[i]);
7783 }
7784 }
7785 }
7786
/** Hook called after the I/O has been performed. The body is
intentionally empty: no work is done here. */
void io_complete()
{
	// Note: For non-compressed tables. Not required
	// for correctness.
}
7793
7794 /** Mark the i/os done in slots */
done()7795 void done()
7796 {
7797 for (ulint i = 0; i < m_n_elems; ++i) {
7798 m_slots[i]->io_already_done = true;
7799 }
7800 }
7801
/** Access the first of the collected slots. At least one slot must
have been collected; this is asserted.
@return the first slot in the consecutive array */
Slot* first_slot()
	MY_ATTRIBUTE((warn_unused_result))
{
	ut_a(m_n_elems > 0);

	return(m_slots[0]);
}
7810
/** Wait for I/O requests
(Declaration only; the definition is outside the class body.)
@param[in]	global_segment	The global segment
@param[in,out]	event		Wait on event if no active requests
@return the number of slots */
ulint check_pending(
	ulint		global_segment,
	os_event_t	event)
	MY_ATTRIBUTE((warn_unused_result));
7819 private:
7820
/** Do the file read for one slot with os_file_read_func(), using the
slot's own pointer, offset and length. Any result other than DB_SUCCESS
fails an assertion.
@param[in,out]	slot	Slot that has the IO context */
void read(Slot* slot)
{
	dberr_t	err = os_file_read_func(
		slot->type,
		slot->file.m_file,
		slot->ptr,
		slot->offset,
		slot->len);
	ut_a(err == DB_SUCCESS);
}
7833
/** Do the file write for one slot with os_file_write_func(), using the
slot's own pointer, offset and length. DB_IO_NO_PUNCH_HOLE is tolerated
in addition to DB_SUCCESS; any other result fails an assertion.
@param[in,out]	slot	Slot that has the IO context */
void write(Slot* slot)
{
	dberr_t	err = os_file_write_func(
		slot->type,
		slot->name,
		slot->file.m_file,
		slot->ptr,
		slot->offset,
		slot->len);
	ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
}
7847
7848 /** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const7849 bool adjacent(const Slot* s1, const Slot* s2) const
7850 {
7851 return(s1 != s2
7852 && s1->file.m_file == s2->file.m_file
7853 && s2->offset == s1->offset + s1->len
7854 && s1->type == s2->type);
7855 }
7856
7857 /** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)7858 bool merge_adjacent(Slot*& current)
7859 {
7860 Slot* slot;
7861 ulint offset = m_segment * m_n_slots;
7862
7863 slot = m_array->at(offset);
7864
7865 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7866
7867 if (slot->is_reserved && adjacent(current, slot)) {
7868
7869 current = slot;
7870
7871 /* Found a consecutive i/o request */
7872
7873 m_slots[m_n_elems] = slot;
7874
7875 ++m_n_elems;
7876
7877 return(m_n_elems >= m_slots.capacity());
7878 }
7879 }
7880
7881 return(true);
7882 }
7883
7884 /** There were no old requests. Look for an I/O request at the lowest
7885 offset in the array (we ignore the high 32 bits of the offset in these
7886 heuristics) */
select_lowest_offset()7887 bool select_lowest_offset()
7888 {
7889 ut_ad(m_n_elems == 0);
7890
7891 ulint offset = m_segment * m_n_slots;
7892
7893 m_lowest_offset = IB_UINT64_MAX;
7894
7895 for (ulint i = 0; i < m_n_slots; ++i) {
7896 Slot* slot;
7897
7898 slot = m_array->at(i + offset);
7899
7900 if (slot->is_reserved
7901 && slot->offset < m_lowest_offset) {
7902
7903 /* Found an i/o request */
7904 m_slots[0] = slot;
7905
7906 m_n_elems = 1;
7907
7908 m_lowest_offset = slot->offset;
7909 }
7910 }
7911
7912 return(m_n_elems > 0);
7913 }
7914
7915 /** Select the slot if it is older than the current oldest slot.
7916 @param[in] slot The slot to check */
select_if_older(Slot * slot)7917 void select_if_older(Slot* slot)
7918 {
7919 int64_t time_diff = ut_time_monotonic() -
7920 slot->reservation_time;
7921
7922 const uint64_t age = time_diff > 0 ? (uint64_t) time_diff : 0;
7923
7924 if ((age >= 2 && age > m_oldest)
7925 || (age >= 2
7926 && age == m_oldest
7927 && slot->offset < m_lowest_offset)) {
7928
7929 /* Found an i/o request */
7930 m_slots[0] = slot;
7931
7932 m_n_elems = 1;
7933
7934 m_oldest = age;
7935
7936 m_lowest_offset = slot->offset;
7937 }
7938 }
7939
7940 /** Select th oldest slot in the array
7941 @return true if oldest slot found */
select_oldest()7942 bool select_oldest()
7943 {
7944 ut_ad(m_n_elems == 0);
7945
7946 Slot* slot;
7947 ulint offset = m_n_slots * m_segment;
7948
7949 slot = m_array->at(offset);
7950
7951 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7952
7953 if (slot->is_reserved) {
7954 select_if_older(slot);
7955 }
7956 }
7957
7958 return(m_n_elems > 0);
7959 }
7960
7961 typedef std::vector<Slot*> slots_t;
7962
7963 private:
7964 ulint m_oldest;
7965 ulint m_n_elems;
7966 os_offset_t m_lowest_offset;
7967
7968 AIO* m_array;
7969 ulint m_n_slots;
7970 ulint m_segment;
7971
7972 slots_t m_slots;
7973
7974 byte* m_ptr;
7975 byte* m_buf;
7976 };
7977
7978 /** Wait for I/O requests
7979 @return the number of slots */
7980 ulint
check_pending(ulint global_segment,os_event_t event)7981 SimulatedAIOHandler::check_pending(
7982 ulint global_segment,
7983 os_event_t event)
7984 {
7985 /* NOTE! We only access constant fields in os_aio_array.
7986 Therefore we do not have to acquire the protecting mutex yet */
7987
7988 ut_ad(os_aio_validate_skip());
7989
7990 ut_ad(m_segment < m_array->get_n_segments());
7991
7992 /* Look through n slots after the segment * n'th slot */
7993
7994 if (AIO::is_read(m_array)
7995 && os_aio_recommend_sleep_for_read_threads) {
7996
7997 /* Give other threads chance to add several
7998 I/Os to the array at once. */
7999
8000 srv_set_io_thread_op_info(
8001 global_segment, "waiting for i/o request");
8002
8003 os_event_wait(event);
8004
8005 return(0);
8006 }
8007
8008 return(m_array->slots_per_segment());
8009 }
8010
8011 /** Does simulated AIO. This function should be called by an i/o-handler
8012 thread.
8013
8014 @param[in] segment The number of the segment in the aio arrays to wait
8015 for; segment 0 is the ibuf i/o thread, segment 1 the
8016 log i/o thread, then follow the non-ibuf read threads,
8017 and as the last are the non-ibuf write threads
8018 @param[out] m1 the messages passed with the AIO request; note that
8019 also in the case where the AIO operation failed, these
8020 output parameters are valid and can be used to restart
8021 the operation, for example
8022 @param[out] m2 Callback argument
8023 @param[in] type IO context
8024 @return DB_SUCCESS or error code */
8025 static
8026 dberr_t
os_aio_simulated_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * type)8027 os_aio_simulated_handler(
8028 ulint global_segment,
8029 fil_node_t** m1,
8030 void** m2,
8031 IORequest* type)
8032 {
8033 Slot* slot;
8034 AIO* array;
8035 ulint segment;
8036 os_event_t event = os_aio_segment_wait_events[global_segment];
8037
8038 segment = AIO::get_array_and_local_segment(&array, global_segment);
8039
8040 SimulatedAIOHandler handler(array, segment);
8041
8042 for (;;) {
8043
8044 srv_set_io_thread_op_info(
8045 global_segment, "looking for i/o requests (a)");
8046
8047 ulint n_slots = handler.check_pending(global_segment, event);
8048
8049 if (n_slots == 0) {
8050 continue;
8051 }
8052
8053 handler.init(n_slots);
8054
8055 srv_set_io_thread_op_info(
8056 global_segment, "looking for i/o requests (b)");
8057
8058 array->acquire();
8059
8060 ulint n_reserved;
8061
8062 slot = handler.check_completed(&n_reserved);
8063
8064 if (slot != NULL) {
8065
8066 break;
8067
8068 } else if (n_reserved == 0
8069 && !buf_page_cleaner_is_active
8070 && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
8071
8072 /* There is no completed request. If there
8073 are no pending request at all, and the system
8074 is being shut down, exit. */
8075
8076 array->release();
8077
8078 *m1 = NULL;
8079
8080 *m2 = NULL;
8081
8082 return(DB_SUCCESS);
8083
8084 } else if (handler.select()) {
8085
8086 break;
8087 }
8088
8089 /* No I/O requested at the moment */
8090
8091 srv_set_io_thread_op_info(
8092 global_segment, "resetting wait event");
8093
8094 /* We wait here until tbere are more IO requests
8095 for this segment. */
8096
8097 os_event_reset(event);
8098
8099 array->release();
8100
8101 srv_set_io_thread_op_info(
8102 global_segment, "waiting for i/o request");
8103
8104 os_event_wait(event);
8105 }
8106
8107 /** Found a slot that has already completed its IO */
8108
8109 if (slot == NULL) {
8110 /* Merge adjacent requests */
8111 handler.merge();
8112
8113 /* Check if there are several consecutive blocks
8114 to read or write */
8115
8116 srv_set_io_thread_op_info(
8117 global_segment, "consecutive i/o requests");
8118
8119 // Note: We don't support write combining for simulated AIO.
8120 //ulint total_len = handler.allocate_buffer();
8121
8122 /* We release the array mutex for the time of the I/O: NOTE that
8123 this assumes that there is just one i/o-handler thread serving
8124 a single segment of slots! */
8125
8126 array->release();
8127
8128 // Note: We don't support write combining for simulated AIO.
8129 //handler.copy_to_buffer(total_len);
8130
8131 srv_set_io_thread_op_info(global_segment, "doing file i/o");
8132
8133 handler.io();
8134
8135 srv_set_io_thread_op_info(global_segment, "file i/o done");
8136
8137 handler.io_complete();
8138
8139 array->acquire();
8140
8141 handler.done();
8142
8143 /* We return the messages for the first slot now, and if there
8144 were several slots, the messages will be returned with
8145 subsequent calls of this function */
8146
8147 slot = handler.first_slot();
8148 }
8149
8150 ut_ad(slot->is_reserved);
8151
8152 *m1 = slot->m1;
8153 *m2 = slot->m2;
8154
8155 *type = slot->type;
8156
8157 array->release(slot);
8158
8159 array->release();
8160
8161 return(DB_SUCCESS);
8162 }
8163
8164 /** Get the total number of pending IOs
8165 @return the total number of pending IOs */
8166 ulint
total_pending_io_count()8167 AIO::total_pending_io_count()
8168 {
8169 ulint count = s_reads->pending_io_count();
8170
8171 if (s_writes != NULL) {
8172 count += s_writes->pending_io_count();
8173 }
8174
8175 if (s_ibuf != NULL) {
8176 count += s_ibuf->pending_io_count();
8177 }
8178
8179 if (s_log != NULL) {
8180 count += s_log->pending_io_count();
8181 }
8182
8183 if (s_sync != NULL) {
8184 count += s_sync->pending_io_count();
8185 }
8186
8187 return(count);
8188 }
8189
8190 /** Validates the consistency the aio system.
8191 @return true if ok */
8192 static
8193 bool
os_aio_validate()8194 os_aio_validate()
8195 {
8196 /* The methods countds and validates, we ignore the count. */
8197 AIO::total_pending_io_count();
8198
8199 return(true);
8200 }
8201
8202 /** Prints pending IO requests per segment of an aio array.
8203 We probably don't need per segment statistics but they can help us
8204 during development phase to see if the IO requests are being
8205 distributed as expected.
8206 @param[in,out] file File where to print
8207 @param[in] segments Pending IO array */
8208 void
print_segment_info(FILE * file,const ulint * segments)8209 AIO::print_segment_info(
8210 FILE* file,
8211 const ulint* segments)
8212 {
8213 ut_ad(m_n_segments > 0);
8214
8215 if (m_n_segments > 1) {
8216
8217 fprintf(file, " [");
8218
8219 for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
8220
8221 if (i != 0) {
8222 fprintf(file, ", ");
8223 }
8224
8225 fprintf(file, ULINTPF, *segments);
8226 }
8227
8228 fprintf(file, "] ");
8229 }
8230 }
8231
8232 /** Prints info about the aio array.
8233 @param[in,out] file Where to print */
8234 void
print(FILE * file)8235 AIO::print(FILE* file)
8236 {
8237 ulint count = 0;
8238 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
8239
8240 mutex_enter(&m_mutex);
8241
8242 ut_a(!m_slots.empty());
8243 ut_a(m_n_segments > 0);
8244
8245 memset(n_res_seg, 0x0, sizeof(n_res_seg));
8246
8247 for (ulint i = 0; i < m_slots.size(); ++i) {
8248 Slot& slot = m_slots[i];
8249 ulint segment = (i * m_n_segments) / m_slots.size();
8250
8251 if (slot.is_reserved) {
8252
8253 ++count;
8254
8255 ++n_res_seg[segment];
8256
8257 ut_a(slot.len > 0);
8258 }
8259 }
8260
8261 ut_a(m_n_reserved == count);
8262
8263 print_segment_info(file, n_res_seg);
8264
8265 mutex_exit(&m_mutex);
8266 }
8267
8268 /** Print all the AIO segments
8269 @param[in,out] file Where to print */
8270 void
print_all(FILE * file)8271 AIO::print_all(FILE* file)
8272 {
8273 s_reads->print(file);
8274
8275 if (s_writes != NULL) {
8276 fputs(", aio writes:", file);
8277 s_writes->print(file);
8278 }
8279
8280 if (s_ibuf != NULL) {
8281 fputs(",\n ibuf aio reads:", file);
8282 s_ibuf->print(file);
8283 }
8284
8285 if (s_log != NULL) {
8286 fputs(", log i/o's:", file);
8287 s_log->print(file);
8288 }
8289
8290 if (s_sync != NULL) {
8291 fputs(", sync i/o's:", file);
8292 s_sync->print(file);
8293 }
8294 }
8295
8296 /** Prints info of the aio arrays.
8297 @param[in,out] file file where to print */
8298 void
os_aio_print(FILE * file)8299 os_aio_print(FILE* file)
8300 {
8301 ib_time_monotonic_t current_time;
8302 double time_elapsed;
8303 double avg_bytes_read;
8304
8305 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
8306 fprintf(file, "I/O thread %lu state: %s (%s)",
8307 (ulong) i,
8308 srv_io_thread_op_info[i],
8309 srv_io_thread_function[i]);
8310
8311 #ifndef _WIN32
8312 if (os_event_is_set(os_aio_segment_wait_events[i])) {
8313 fprintf(file, " ev set");
8314 }
8315 #endif /* _WIN32 */
8316
8317 fprintf(file, "\n");
8318 }
8319
8320 fputs("Pending normal aio reads:", file);
8321
8322 AIO::print_all(file);
8323
8324 putc('\n', file);
8325 current_time = ut_time_monotonic();
8326 time_elapsed = 0.001 + (current_time - os_last_printout);
8327
8328 fprintf(file,
8329 "Pending flushes (fsync) log: " ULINTPF "; "
8330 "buffer pool: " ULINTPF "\n"
8331 ULINTPF " OS file reads, "
8332 ULINTPF " OS file writes, "
8333 ULINTPF " OS fsyncs\n",
8334 fil_n_pending_log_flushes,
8335 fil_n_pending_tablespace_flushes,
8336 os_n_file_reads,
8337 os_n_file_writes,
8338 os_n_fsyncs);
8339
8340 if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
8341 fprintf(file,
8342 ULINTPF " pending preads, "
8343 ULINTPF " pending pwrites\n",
8344 os_n_pending_reads,
8345 os_n_pending_writes);
8346 }
8347
8348 if (os_n_file_reads == os_n_file_reads_old) {
8349 avg_bytes_read = 0.0;
8350 } else {
8351 avg_bytes_read = (double) os_bytes_read_since_printout
8352 / (os_n_file_reads - os_n_file_reads_old);
8353 }
8354
8355 fprintf(file,
8356 "%.2f reads/s, %lu avg bytes/read,"
8357 " %.2f writes/s, %.2f fsyncs/s\n",
8358 (os_n_file_reads - os_n_file_reads_old)
8359 / time_elapsed,
8360 (ulong) avg_bytes_read,
8361 (os_n_file_writes - os_n_file_writes_old)
8362 / time_elapsed,
8363 (os_n_fsyncs - os_n_fsyncs_old)
8364 / time_elapsed);
8365
8366 os_n_file_reads_old = os_n_file_reads;
8367 os_n_file_writes_old = os_n_file_writes;
8368 os_n_fsyncs_old = os_n_fsyncs;
8369 os_bytes_read_since_printout = 0;
8370
8371 os_last_printout = current_time;
8372 }
8373
8374 /** Refreshes the statistics used to print per-second averages. */
8375 void
os_aio_refresh_stats()8376 os_aio_refresh_stats()
8377 {
8378 os_n_fsyncs_old = os_n_fsyncs;
8379
8380 os_bytes_read_since_printout = 0;
8381
8382 os_n_file_reads_old = os_n_file_reads;
8383
8384 os_n_file_writes_old = os_n_file_writes;
8385
8386 os_n_fsyncs_old = os_n_fsyncs;
8387
8388 os_bytes_read_since_printout = 0;
8389
8390 os_last_printout = ut_time_monotonic();
8391 }
8392
8393 /** Checks that all slots in the system have been freed, that is, there are
8394 no pending io operations.
8395 @return true if all free */
8396 bool
os_aio_all_slots_free()8397 os_aio_all_slots_free()
8398 {
8399 return(AIO::total_pending_io_count() == 0);
8400 }
8401
8402 #ifdef UNIV_DEBUG
8403 /** Prints all pending IO for the array
8404 @param[in] file file where to print
8405 @param[in] array array to process */
8406 void
to_file(FILE * file) const8407 AIO::to_file(FILE* file) const
8408 {
8409 acquire();
8410
8411 fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
8412
8413 for (ulint i = 0; i < m_slots.size(); ++i) {
8414
8415 const Slot& slot = m_slots[i];
8416
8417 if (slot.is_reserved) {
8418
8419 fprintf(file,
8420 "%s IO for %s (offset=" UINT64PF
8421 ", size=%lu)\n",
8422 slot.type.is_read() ? "read" : "write",
8423 slot.name, slot.offset, slot.len);
8424 }
8425 }
8426
8427 release();
8428 }
8429
8430 /** Print pending IOs for all arrays */
8431 void
print_to_file(FILE * file)8432 AIO::print_to_file(FILE* file)
8433 {
8434 fprintf(file, "Pending normal aio reads:");
8435
8436 s_reads->to_file(file);
8437
8438 if (s_writes != NULL) {
8439 fprintf(file, "Pending normal aio writes:");
8440 s_writes->to_file(file);
8441 }
8442
8443 if (s_ibuf != NULL) {
8444 fprintf(file, "Pending ibuf aio reads:");
8445 s_ibuf->to_file(file);
8446 }
8447
8448 if (s_log != NULL) {
8449 fprintf(file, "Pending log i/o's:");
8450 s_log->to_file(file);
8451 }
8452
8453 if (s_sync != NULL) {
8454 fprintf(file, "Pending sync i/o's:");
8455 s_sync->to_file(file);
8456 }
8457 }
8458
8459 /** Prints all pending IO
8460 @param[in] file File where to print */
8461 void
os_aio_print_pending_io(FILE * file)8462 os_aio_print_pending_io(
8463 FILE* file)
8464 {
8465 AIO::print_to_file(file);
8466 }
8467
8468 #endif /* UNIV_DEBUG */
8469
8470 /**
8471 Set the file create umask
8472 @param[in] umask The umask to use for file creation. */
8473 void
os_file_set_umask(ulint umask)8474 os_file_set_umask(ulint umask)
8475 {
8476 os_innodb_umask = umask;
8477 }
8478 #else
8479
8480 #include "univ.i"
8481 #include "db0err.h"
8482 #include "mach0data.h"
8483 #include "fil0fil.h"
8484 #include "os0file.h"
8485
8486 #include <lz4.h>
8487 #include <zlib.h>
8488
8489 #include <my_aes.h>
8490 #include <my_rnd.h>
8491 #include <mysqld.h>
8492 #include <mysql/service_mysql_keyring.h>
8493
8494 typedef byte Block;
8495
8496 /** Allocate a page for sync IO
8497 @return pointer to page */
8498 static
8499 Block*
os_alloc_block()8500 os_alloc_block()
8501 {
8502 return(reinterpret_cast<byte*>(malloc(UNIV_PAGE_SIZE_MAX * 2)));
8503 }
8504
8505 /** Free a page after sync IO
8506 @param[in,own] block The block to free/release */
8507 static
8508 void
os_free_block(Block * block)8509 os_free_block(Block* block)
8510 {
8511 ut_free(block);
8512 }
8513
8514 #endif /* !UNIV_INNOCHECKSUM */
8515
8516 /** Minimum length needed for encryption */
8517 const unsigned int MIN_ENCRYPTION_LEN = 2 * MY_AES_BLOCK_SIZE + FIL_PAGE_DATA;
8518
8519 /**
8520 @param[in] type The compression type
8521 @return the string representation */
8522 const char*
to_string(Type type)8523 Compression::to_string(Type type)
8524 {
8525 switch(type) {
8526 case NONE:
8527 return("None");
8528 case ZLIB:
8529 return("Zlib");
8530 case LZ4:
8531 return("LZ4");
8532 }
8533
8534 ut_ad(0);
8535
8536 return("<UNKNOWN>");
8537 }
8538
8539 /**
8540 @param[in] meta Page Meta data
8541 @return the string representation */
to_string(const Compression::meta_t & meta)8542 std::string Compression::to_string(const Compression::meta_t& meta)
8543 {
8544 std::ostringstream stream;
8545
8546 stream << "version: " << int(meta.m_version) << " "
8547 << "algorithm: " << meta.m_algorithm << " "
8548 << "(" << to_string(meta.m_algorithm) << ") "
8549 << "orginal_type: " << meta.m_original_type << " "
8550 << "original_size: " << meta.m_original_size << " "
8551 << "compressed_size: " << meta.m_compressed_size;
8552
8553 return(stream.str());
8554 }
8555
8556 /** @return true if it is a compressed page */
8557 bool
is_compressed_page(const byte * page)8558 Compression::is_compressed_page(const byte* page)
8559 {
8560 return(mach_read_from_2(page + FIL_PAGE_TYPE) == FIL_PAGE_COMPRESSED);
8561 }
8562
8563 bool
is_compressed_encrypted_page(const byte * page)8564 Compression::is_compressed_encrypted_page(const byte *page) {
8565 return (mach_read_from_2(page + FIL_PAGE_TYPE) ==
8566 FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
8567 }
8568
8569 bool
is_valid_page_version(uint8_t version)8570 Compression::is_valid_page_version(uint8_t version) {
8571 return (version == FIL_PAGE_VERSION_1 || version == FIL_PAGE_VERSION_2);
8572 }
8573
8574 /** Deserizlise the page header compression meta-data
8575 @param[in] page Pointer to the page header
8576 @param[out] control Deserialised data */
8577 void
deserialize_header(const byte * page,Compression::meta_t * control)8578 Compression::deserialize_header(
8579 const byte* page,
8580 Compression::meta_t* control)
8581 {
8582 ut_ad(is_compressed_page(page) || is_compressed_encrypted_page(page));
8583
8584 control->m_version = static_cast<uint8_t>(
8585 mach_read_from_1(page + FIL_PAGE_VERSION));
8586
8587 control->m_original_type = static_cast<uint16_t>(
8588 mach_read_from_2(page + FIL_PAGE_ORIGINAL_TYPE_V1));
8589
8590 control->m_compressed_size = static_cast<uint16_t>(
8591 mach_read_from_2(page + FIL_PAGE_COMPRESS_SIZE_V1));
8592
8593 control->m_original_size = static_cast<uint16_t>(
8594 mach_read_from_2(page + FIL_PAGE_ORIGINAL_SIZE_V1));
8595
8596 control->m_algorithm = static_cast<Type>(
8597 mach_read_from_1(page + FIL_PAGE_ALGORITHM_V1));
8598 }
8599
8600 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
8601 not then the source contents are left unchanged and DB_SUCCESS is returned.
8602 @param[in] dblwr_recover true of double write recovery in progress
8603 @param[in,out] src Data read from disk, decompressed data will be
8604 copied to this page
8605 @param[in,out] dst Scratch area to use for decompression
8606 @param[in] dst_len Size of the scratch area in bytes
8607 @return DB_SUCCESS or error code */
8608 dberr_t
deserialize(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)8609 Compression::deserialize(
8610 bool dblwr_recover,
8611 byte* src,
8612 byte* dst,
8613 ulint dst_len)
8614 {
8615 if (!is_compressed_page(src)) {
8616 /* There is nothing we can do. */
8617 return(DB_SUCCESS);
8618 }
8619
8620 meta_t header;
8621
8622 deserialize_header(src, &header);
8623
8624 byte* ptr = src + FIL_PAGE_DATA;
8625
8626 ut_ad(is_valid_page_version(header.m_version));
8627
8628 if (!is_valid_page_version(header.m_version)
8629 || header.m_original_size < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8)
8630 || header.m_original_size > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA
8631 || dst_len < header.m_original_size + FIL_PAGE_DATA) {
8632
8633 /* The last check could potentially return DB_OVERFLOW,
8634 the caller should be able to retry with a larger buffer. */
8635
8636 return(DB_CORRUPTION);
8637 }
8638
8639 Block* block;
8640
8641 /* The caller doesn't know what to expect */
8642 if (dst == NULL) {
8643
8644 block = os_alloc_block();
8645
8646 #ifdef UNIV_INNOCHECKSUM
8647 dst = block;
8648 #else
8649 dst = block->m_ptr;
8650 #endif /* UNIV_INNOCHECKSUM */
8651
8652 } else {
8653 block = NULL;
8654 }
8655
8656 int ret;
8657 Compression compression;
8658 ulint len = header.m_original_size;
8659
8660 compression.m_type = static_cast<Compression::Type>(header.m_algorithm);
8661
8662 switch(compression.m_type) {
8663 case Compression::ZLIB: {
8664
8665 uLongf zlen = header.m_original_size;
8666
8667 if (uncompress(dst, &zlen, ptr, header.m_compressed_size)
8668 != Z_OK) {
8669
8670 if (block != NULL) {
8671 os_free_block(block);
8672 }
8673
8674 return(DB_IO_DECOMPRESS_FAIL);
8675 }
8676
8677 len = static_cast<ulint>(zlen);
8678
8679 break;
8680 }
8681
8682 case Compression::LZ4:
8683
8684 ret = LZ4_decompress_safe(
8685 reinterpret_cast<char*>(ptr),
8686 reinterpret_cast<char*>(dst),
8687 header.m_compressed_size,
8688 header.m_original_size);
8689 if (ret < 0) {
8690
8691 if (block != NULL) {
8692 os_free_block(block);
8693 }
8694
8695 return(DB_IO_DECOMPRESS_FAIL);
8696 }
8697
8698 break;
8699
8700 default:
8701 #if !defined(UNIV_INNOCHECKSUM)
8702 ib::error()
8703 << "Compression algorithm support missing: "
8704 << Compression::to_string(compression.m_type);
8705 #else
8706 fprintf(stderr, "Compression algorithm support missing: %s\n",
8707 Compression::to_string(compression.m_type));
8708 #endif /* !UNIV_INNOCHECKSUM */
8709
8710 if (block != NULL) {
8711 os_free_block(block);
8712 }
8713
8714 return(DB_UNSUPPORTED);
8715 }
8716
8717 /* Leave the header alone */
8718 memmove(src + FIL_PAGE_DATA, dst, len);
8719
8720 mach_write_to_2(src + FIL_PAGE_TYPE, header.m_original_type);
8721
8722 ut_ad(dblwr_recover
8723 || memcmp(src + FIL_PAGE_LSN + 4,
8724 src + (header.m_original_size + FIL_PAGE_DATA)
8725 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);
8726
8727 if (block != NULL) {
8728 os_free_block(block);
8729 }
8730
8731 return(DB_SUCCESS);
8732 }
8733
8734 /** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
8735 not then the source contents are left unchanged and DB_SUCCESS is returned.
8736 @param[in] dblwr_recover true of double write recovery in progress
8737 @param[in,out] src Data read from disk, decompressed data will be
8738 copied to this page
8739 @param[in,out] dst Scratch area to use for decompression
8740 @param[in] dst_len Size of the scratch area in bytes
8741 @return DB_SUCCESS or error code */
8742 dberr_t
os_file_decompress_page(bool dblwr_recover,byte * src,byte * dst,ulint dst_len)8743 os_file_decompress_page(
8744 bool dblwr_recover,
8745 byte* src,
8746 byte* dst,
8747 ulint dst_len)
8748 {
8749 return(Compression::deserialize(dblwr_recover, src, dst, dst_len));
8750 }
8751
8752 /**
8753 @param[in] type The encryption type
8754 @return the string representation */
8755 const char*
to_string(Type type)8756 Encryption::to_string(Type type)
8757 {
8758 switch(type) {
8759 case NONE:
8760 return("N");
8761 case AES:
8762 return("Y");
8763 }
8764
8765 ut_ad(0);
8766
8767 return("<UNKNOWN>");
8768 }
8769
8770 /** Generate random encryption value for key and iv.
8771 @param[in,out] value Encryption value */
random_value(byte * value)8772 void Encryption::random_value(byte* value)
8773 {
8774 ut_ad(value != NULL);
8775
8776 my_rand_buffer(value, ENCRYPTION_KEY_LEN);
8777 }
8778
8779 /** Create new master key for key rotation.
8780 @param[in,out] master_key master key */
8781 void
create_master_key(byte ** master_key)8782 Encryption::create_master_key(byte** master_key)
8783 {
8784 #ifndef UNIV_INNOCHECKSUM
8785 char* key_type = NULL;
8786 size_t key_len;
8787 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8788 int ret;
8789
8790 /* If uuid does not match with current server uuid,
8791 set uuid as current server uuid. */
8792 if (strcmp(uuid, server_uuid) != 0) {
8793 memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8794 }
8795 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8796
8797 /* Generate new master key */
8798 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8799 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8800 uuid, master_key_id + 1);
8801
8802 /* We call key ring API to generate master key here. */
8803 ret = my_key_generate(key_name, "AES",
8804 NULL, ENCRYPTION_KEY_LEN);
8805
8806 /* We call key ring API to get master key here. */
8807 ret = my_key_fetch(key_name, &key_type, NULL,
8808 reinterpret_cast<void**>(master_key),
8809 &key_len);
8810
8811 if (ret || *master_key == NULL) {
8812 ib::error() << "Encryption can't find master key, please check"
8813 " the keyring plugin is loaded.";
8814 *master_key = NULL;
8815 } else {
8816 master_key_id++;
8817 }
8818
8819 if (key_type) {
8820 my_free(key_type);
8821 }
8822 #endif
8823 }
8824
8825 /** Get master key by key id.
8826 @param[in] master_key_id master key id
8827 @param[in] srv_uuid uuid of server instance
8828 @param[in,out] master_key master key */
8829 void
get_master_key(ulint master_key_id,char * srv_uuid,byte ** master_key)8830 Encryption::get_master_key(ulint master_key_id,
8831 char* srv_uuid,
8832 byte** master_key)
8833 {
8834 #ifndef UNIV_INNOCHECKSUM
8835 char* key_type = NULL;
8836 size_t key_len;
8837 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8838 int ret;
8839
8840 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8841
8842 if (srv_uuid != NULL) {
8843 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8844 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8845 srv_uuid, master_key_id);
8846 } else {
8847 /* For compitable with 5.7.11, we need to get master key with
8848 server id. */
8849 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8850 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8851 "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8852 server_id, master_key_id);
8853 }
8854
8855 /* We call key ring API to get master key here. */
8856 ret = my_key_fetch(key_name, &key_type, NULL,
8857 reinterpret_cast<void**>(master_key), &key_len);
8858
8859 if (key_type) {
8860 my_free(key_type);
8861 }
8862
8863 if (ret) {
8864 *master_key = NULL;
8865 ib::error() << "Encryption can't find master key, please check"
8866 " the keyring plugin is loaded.";
8867 }
8868
8869 #ifdef UNIV_ENCRYPT_DEBUG
8870 if (!ret && *master_key) {
8871 fprintf(stderr, "Fetched master key:%lu ", master_key_id);
8872 ut_print_buf(stderr, *master_key, key_len);
8873 fprintf(stderr, "\n");
8874 }
8875 #endif /* DEBUG_TDE */
8876
8877 #endif
8878 }
8879
8880 /** Current master key id */
8881 ulint Encryption::master_key_id = 0;
8882
8883 /** Current uuid of server instance */
8884 char Encryption::uuid[ENCRYPTION_SERVER_UUID_LEN + 1] = {0};
8885
8886 /** Get current master key and master key id
8887 @param[in,out] master_key_id master key id
8888 @param[in,out] master_key master key
8889 @param[in,out] version encryption information version */
8890 void
get_master_key(ulint * master_key_id,byte ** master_key,Encryption::Version * version)8891 Encryption::get_master_key(ulint* master_key_id,
8892 byte** master_key,
8893 Encryption::Version* version)
8894 {
8895 #ifndef UNIV_INNOCHECKSUM
8896 char* key_type = NULL;
8897 size_t key_len;
8898 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
8899 int ret;
8900
8901 memset(key_name, 0, ENCRYPTION_KEY_LEN);
8902 *version = Encryption::ENCRYPTION_VERSION_2;
8903
8904 if (Encryption::master_key_id == 0) {
8905 /* If m_master_key is 0, means there's no encrypted
8906 tablespace, we need to generate the first master key,
8907 and store it to key ring. */
8908 memset(uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
8909 memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
8910
8911 /* Prepare the server uuid. */
8912 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8913 "%s-%s-1", ENCRYPTION_MASTER_KEY_PRIFIX,
8914 uuid);
8915
8916 /* We call key ring API to generate master key here. */
8917 ret = my_key_generate(key_name, "AES",
8918 NULL, ENCRYPTION_KEY_LEN);
8919
8920 /* We call key ring API to get master key here. */
8921 ret = my_key_fetch(key_name, &key_type, NULL,
8922 reinterpret_cast<void**>(master_key),
8923 &key_len);
8924
8925 if (!ret && *master_key != NULL) {
8926 Encryption::master_key_id++;
8927 *master_key_id = Encryption::master_key_id;
8928 }
8929 #ifdef UNIV_ENCRYPT_DEBUG
8930 if (!ret && *master_key) {
8931 fprintf(stderr, "Generated new master key:");
8932 ut_print_buf(stderr, *master_key, key_len);
8933 fprintf(stderr, "\n");
8934 }
8935 #endif
8936 } else {
8937 *master_key_id = Encryption::master_key_id;
8938
8939 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8940 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8941 uuid, *master_key_id);
8942
8943 /* We call key ring API to get master key here. */
8944 ret = my_key_fetch(key_name, &key_type, NULL,
8945 reinterpret_cast<void**>(master_key),
8946 &key_len);
8947
8948 /* For compitable with 5.7.11, we need to try to get master key with
8949 server id when get master key with server uuid failure. */
8950 if (ret || *master_key == NULL) {
8951 if (key_type) {
8952 my_free(key_type);
8953 }
8954
8955 memset(key_name, 0,
8956 ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
8957 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
8958 "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
8959 server_id, *master_key_id);
8960
8961 ret = my_key_fetch(key_name, &key_type, NULL,
8962 reinterpret_cast<void**>(master_key),
8963 &key_len);
8964 *version = Encryption::ENCRYPTION_VERSION_1;
8965 }
8966 #ifdef UNIV_ENCRYPT_DEBUG
8967 if (!ret && *master_key) {
8968 fprintf(stderr, "Fetched master key:%lu ",
8969 *master_key_id);
8970 ut_print_buf(stderr, *master_key, key_len);
8971 fprintf(stderr, "\n");
8972 }
8973 #endif
8974 }
8975
8976 if (ret) {
8977 *master_key = NULL;
8978 ib::error() << "Encryption can't find master key, please check"
8979 " the keyring plugin is loaded.";
8980 }
8981
8982 if (key_type) {
8983 my_free(key_type);
8984 }
8985 #endif
8986 }
8987
8988 /** Check if page is encrypted page or not
8989 @param[in] page page which need to check
8990 @return true if it is a encrypted page */
8991 bool
is_encrypted_page(const byte * page)8992 Encryption::is_encrypted_page(const byte* page)
8993 {
8994 ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
8995
8996 return(page_type == FIL_PAGE_ENCRYPTED
8997 || page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED
8998 || page_type == FIL_PAGE_ENCRYPTED_RTREE);
8999 }
9000
9001 /** Encrypt the page data contents. Page type can't be
9002 FIL_PAGE_ENCRYPTED, FIL_PAGE_COMPRESSED_AND_ENCRYPTED,
9003 FIL_PAGE_ENCRYPTED_RTREE.
9004 @param[in] type IORequest
9005 @param[in,out] src page data which need to encrypt
9006 @param[in] src_len Size of the source in bytes
9007 @param[in,out] dst destination area
9008 @param[in,out] dst_len Size of the destination in bytes
9009 @return buffer data, dst_len will have the length of the data */
9010 byte*
encrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint * dst_len)9011 Encryption::encrypt(
9012 const IORequest& type,
9013 byte* src,
9014 ulint src_len,
9015 byte* dst,
9016 ulint* dst_len)
9017 {
9018 ut_ad(m_type != NONE);
9019 ut_ad(!type.is_log());
9020 #ifdef UNIV_ENCRYPT_DEBUG
9021 ulint space_id =
9022 mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9023 ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9024
9025 fprintf(stderr, "Encrypting page:%lu.%lu len:%lu\n",
9026 space_id, page_no, src_len);
9027 #endif
9028
9029 /* Shouldn't encrypte an already encrypted page. */
9030 ut_ad(!is_encrypted_page(src));
9031
9032 const uint16_t page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9033
9034 /* This is data size which need to encrypt. */
9035 ulint src_enc_len = src_len;
9036
9037 /* In FIL_PAGE_VERSION_2, we encrypt the actual compressed data length. */
9038 if (page_type == FIL_PAGE_COMPRESSED) {
9039 src_enc_len = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1) +
9040 FIL_PAGE_DATA;
9041 /* Extend src_enc_len if needed */
9042 if (src_enc_len < MIN_ENCRYPTION_LEN) {
9043 src_enc_len = MIN_ENCRYPTION_LEN;
9044 }
9045 ut_a(src_enc_len <= src_len);
9046 }
9047
9048 /* Only encrypt the data + trailer, leave the header alone */
9049
9050 switch (m_type) {
9051 case Encryption::NONE:
9052 ut_error;
9053
9054 case Encryption::AES: {
9055 ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9056
9057 /* Total length of the data to encrypt. */
9058 const ulint data_len = src_enc_len - FIL_PAGE_DATA;
9059
9060 /* Server encryption functions expect input data to be in
9061 multiples of MY_AES_BLOCK SIZE. Therefore we encrypt the
9062 overlapping data of the chunk_len and trailer_len twice.
9063 First we encrypt the bigger chunk of data then we do the
9064 trailer. The trailer encryption block starts at
9065 2 * MY_AES_BLOCK_SIZE bytes offset from the end of the enc_len.
9066 During decryption we do the reverse of the above process. */
9067 ut_ad(data_len >= 2 * MY_AES_BLOCK_SIZE);
9068
9069 const ulint chunk_len =
9070 (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9071 const ulint remain_len = data_len - chunk_len;
9072
9073 lint elen = my_aes_encrypt(
9074 src + FIL_PAGE_DATA, static_cast<uint32>(chunk_len),
9075 dst + FIL_PAGE_DATA, reinterpret_cast<byte *>(m_key),
9076 static_cast<uint32>(m_klen), my_aes_256_cbc,
9077 reinterpret_cast<byte *>(m_iv), false);
9078
9079 if (elen == MY_AES_BAD_DATA) {
9080 ulint page_no =mach_read_from_4(
9081 src + FIL_PAGE_OFFSET);
9082 ulint space_id = mach_read_from_4(
9083 src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9084 *dst_len = src_len;
9085 #ifndef UNIV_INNOCHECKSUM
9086 ib::warn()
9087 << " Can't encrypt data of page,"
9088 << " page no:" << page_no
9089 << " space id:" << space_id;
9090 #else
9091 fprintf(stderr, " Can't encrypt data of page,"
9092 " page no:" ULINTPF
9093 " space id:" ULINTPF,
9094 page_no, space_id);
9095 #endif /* !UNIV_INNOCHECKSUM */
9096 return(src);
9097 }
9098
9099 const ulint len = static_cast<ulint>(elen);
9100 ut_ad(len == chunk_len);
9101
9102 /* Encrypt the trailing bytes. */
9103 if (remain_len != 0) {
9104 /* Copy remaining bytes and page tailer. */
9105 memcpy(dst + FIL_PAGE_DATA + len,
9106 src + FIL_PAGE_DATA + len,
9107 remain_len);
9108
9109 const ulint trailer_len = MY_AES_BLOCK_SIZE * 2;
9110 byte buf[trailer_len];
9111
9112 elen = my_aes_encrypt(
9113 dst + FIL_PAGE_DATA + data_len - trailer_len,
9114 static_cast<uint32>(trailer_len), buf,
9115 reinterpret_cast<unsigned char*>(m_key),
9116 static_cast<uint32>(m_klen), my_aes_256_cbc,
9117 reinterpret_cast<byte *>(m_iv), false);
9118
9119 if (elen == MY_AES_BAD_DATA) {
9120 ulint page_no =mach_read_from_4(
9121 src + FIL_PAGE_OFFSET);
9122 ulint space_id = mach_read_from_4(
9123 src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9124 #ifndef UNIV_INNOCHECKSUM
9125 ib::warn()
9126 << " Can't encrypt data of page,"
9127 << " page no:" << page_no
9128 << " space id:" << space_id;
9129 #else
9130 fprintf(stderr, " Can't encrypt data of page,"
9131 " page no:" ULINTPF
9132 " space id:" ULINTPF,
9133 page_no, space_id);
9134 #endif /* !UNIV_INNOCHECKSUM */
9135 *dst_len = src_len;
9136 return(src);
9137 }
9138
9139 ut_a(static_cast<ulint>(elen) == trailer_len);
9140
9141 memcpy(dst + FIL_PAGE_DATA + data_len - trailer_len,
9142 buf, trailer_len);
9143 }
9144
9145
9146 break;
9147 }
9148
9149 default:
9150 ut_error;
9151 }
9152
9153 /* Copy the header as is. */
9154 memmove(dst, src, FIL_PAGE_DATA);
9155 ut_ad(memcmp(src, dst, FIL_PAGE_DATA) == 0);
9156
9157 /* Add encryption control information. Required for decrypting. */
9158 if (page_type == FIL_PAGE_COMPRESSED) {
9159 /* If the page is compressed, we don't need to save the
9160 original type, since it is done in compression already. */
9161 mach_write_to_2(dst + FIL_PAGE_TYPE,
9162 FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9163 ut_ad(memcmp(src+FIL_PAGE_TYPE+2,
9164 dst+FIL_PAGE_TYPE+2,
9165 FIL_PAGE_DATA-FIL_PAGE_TYPE-2) == 0);
9166 } else if (page_type == FIL_PAGE_RTREE) {
9167 /* If the page is R-tree page, we need to save original type. */
9168 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED_RTREE);
9169 } else{
9170 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED);
9171 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
9172 }
9173
9174 #ifdef UNIV_ENCRYPT_DEBUG
9175 #ifndef UNIV_INNOCHECKSUM
9176 #if 0
9177 byte* check_buf = static_cast<byte*>(ut_malloc_nokey(src_len));
9178 byte* buf2 = static_cast<byte*>(ut_malloc_nokey(src_len));
9179
9180 memcpy(check_buf, dst, src_len);
9181
9182 dberr_t err = decrypt(type, check_buf, src_len, buf2, src_len);
9183 if (err != DB_SUCCESS || memcmp(src + FIL_PAGE_DATA,
9184 check_buf + FIL_PAGE_DATA,
9185 src_len - FIL_PAGE_DATA) != 0) {
9186 ut_print_buf(stderr, src, src_len);
9187 ut_print_buf(stderr, check_buf, src_len);
9188 ut_ad(0);
9189 }
9190 ut_free(buf2);
9191 ut_free(check_buf);
9192 #endif
9193 fprintf(stderr, "Encrypted page:%lu.%lu\n", space_id, page_no);
9194 #endif
9195 #endif
9196
9197 /* Add padding 0 for unused portion */
9198 if (src_len > src_enc_len) {
9199 memset(dst + src_enc_len, 0, src_len - src_enc_len);
9200 }
9201
9202 *dst_len = src_len;
9203
9204 return(dst);
9205 }
9206
9207 /** Decrypt the page data contents. Page type must be FIL_PAGE_ENCRYPTED,
9208 if not then the source contents are left unchanged and DB_SUCCESS is returned.
9209 @param[in] type IORequest
9210 @param[in,out] src Data read from disk, decrypted data will be
9211 copied to this page
9212 @param[in] src_len source data length
9213 @param[in,out] dst Scratch area to use for decryption
9214 @param[in] dst_len Size of the scratch area in bytes
9215 @return DB_SUCCESS or error code */
9216 dberr_t
decrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)9217 Encryption::decrypt(
9218 const IORequest& type,
9219 byte* src,
9220 ulint src_len,
9221 byte* dst,
9222 ulint dst_len)
9223 {
9224 ulint data_len;
9225 ulint main_len;
9226 ulint remain_len;
9227 ulint original_type;
9228 ulint page_type;
9229 byte remain_buf[MY_AES_BLOCK_SIZE * 2];
9230 Block* block;
9231
9232 /* Do nothing if it's not an encrypted table. */
9233 if (!is_encrypted_page(src)) {
9234 return(DB_SUCCESS);
9235 }
9236
9237 /* For compressed page, we need to get the compressed size
9238 for decryption */
9239 page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
9240 if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
9241 src_len = static_cast<uint16_t>(
9242 mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1))
9243 + FIL_PAGE_DATA;
9244 #ifndef UNIV_INNOCHECKSUM
9245 Compression::meta_t header;
9246 Compression::deserialize_header(src, &header);
9247 if (header.m_version == Compression::FIL_PAGE_VERSION_1) {
9248 src_len = ut_calc_align(src_len, type.block_size());
9249 } else {
9250 /* Extend src_len if needed */
9251 if (src_len < MIN_ENCRYPTION_LEN) {
9252 src_len = MIN_ENCRYPTION_LEN;
9253 }
9254 }
9255 #endif
9256 }
9257 #ifdef UNIV_ENCRYPT_DEBUG
9258 ulint space_id =
9259 mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
9260 ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
9261
9262 fprintf(stderr, "Decrypting page:%lu.%lu len:%lu\n",
9263 space_id, page_no, src_len);
9264 #endif
9265
9266 original_type = static_cast<uint16_t>(
9267 mach_read_from_2(src + FIL_PAGE_ORIGINAL_TYPE_V1));
9268
9269 byte* ptr = src + FIL_PAGE_DATA;
9270
9271 /* The caller doesn't know what to expect */
9272 if (dst == NULL) {
9273
9274 block = os_alloc_block();
9275 #ifdef UNIV_INNOCHECKSUM
9276 dst = block;
9277 #else
9278 dst = block->m_ptr;
9279 #endif /* UNIV_INNOCHECKSUM */
9280
9281 } else {
9282 block = NULL;
9283 }
9284
9285 data_len = src_len - FIL_PAGE_DATA;
9286 main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
9287 remain_len = data_len - main_len;
9288
9289 switch(m_type) {
9290 case Encryption::AES: {
9291 lint elen;
9292
9293 /* First decrypt the last 2 blocks data of data, since
9294 data is no block aligned. */
9295 if (remain_len != 0) {
9296 ut_ad(m_klen == ENCRYPTION_KEY_LEN);
9297
9298 remain_len = MY_AES_BLOCK_SIZE * 2;
9299
9300 /* Copy the last 2 blocks. */
9301 memcpy(remain_buf,
9302 ptr + data_len - remain_len,
9303 remain_len);
9304
9305 elen = my_aes_decrypt(
9306 remain_buf,
9307 static_cast<uint32>(remain_len),
9308 dst + data_len - remain_len,
9309 reinterpret_cast<unsigned char*>(m_key),
9310 static_cast<uint32>(m_klen),
9311 my_aes_256_cbc,
9312 reinterpret_cast<unsigned char*>(m_iv),
9313 false);
9314 if (elen == MY_AES_BAD_DATA) {
9315 if (block != NULL) {
9316 os_free_block(block);
9317 }
9318
9319 return(DB_IO_DECRYPT_FAIL);
9320 }
9321
9322 /* Copy the other data bytes to temp area. */
9323 memcpy(dst, ptr, data_len - remain_len);
9324 } else {
9325 ut_ad(data_len == main_len);
9326
9327 /* Copy the data bytes to temp area. */
9328 memcpy(dst, ptr, data_len);
9329 }
9330
9331 /* Then decrypt the main data */
9332 elen = my_aes_decrypt(
9333 dst,
9334 static_cast<uint32>(main_len),
9335 ptr,
9336 reinterpret_cast<unsigned char*>(m_key),
9337 static_cast<uint32>(m_klen),
9338 my_aes_256_cbc,
9339 reinterpret_cast<unsigned char*>(m_iv),
9340 false);
9341 if (elen == MY_AES_BAD_DATA) {
9342
9343 if (block != NULL) {
9344 os_free_block(block);
9345 }
9346
9347 return(DB_IO_DECRYPT_FAIL);
9348 }
9349
9350 ut_ad(static_cast<ulint>(elen) == main_len);
9351
9352 /* Copy the remain bytes. */
9353 memcpy(ptr + main_len, dst + main_len, data_len - main_len);
9354
9355 break;
9356 }
9357
9358 default:
9359 if (!type.is_dblwr_recover()) {
9360 #if !defined(UNIV_INNOCHECKSUM)
9361 ib::error()
9362 << "Encryption algorithm support missing: "
9363 << Encryption::to_string(m_type);
9364 #else
9365 fprintf(stderr, "Encryption algorithm support missing: %s\n",
9366 Encryption::to_string(m_type));
9367 #endif /* !UNIV_INNOCHECKSUM */
9368 }
9369
9370 if (block != NULL) {
9371 os_free_block(block);
9372 }
9373
9374 return(DB_UNSUPPORTED);
9375 }
9376
9377 /* Restore the original page type. If it's a compressed and
9378 encrypted page, just reset it as compressed page type, since
9379 we will do uncompress later. */
9380
9381 if (page_type == FIL_PAGE_ENCRYPTED) {
9382 mach_write_to_2(src + FIL_PAGE_TYPE, original_type);
9383 mach_write_to_2(src + FIL_PAGE_ORIGINAL_TYPE_V1, 0);
9384 } else if (page_type == FIL_PAGE_ENCRYPTED_RTREE) {
9385 mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_RTREE);
9386 } else {
9387 ut_ad(page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
9388 mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
9389 }
9390
9391 if (block != NULL) {
9392 os_free_block(block);
9393 }
9394
9395 #ifdef UNIV_ENCRYPT_DEBUG
9396 fprintf(stderr, "Decrypted page:%lu.%lu\n", space_id, page_no);
9397 #endif
9398
9399 DBUG_EXECUTE_IF("ib_crash_during_decrypt_page", DBUG_SUICIDE(););
9400
9401 return(DB_SUCCESS);
9402 }
9403
9404 /** Normalizes a directory path for the current OS:
9405 On Windows, we convert '/' to '\', else we convert '\' to '/'.
9406 @param[in,out] str A null-terminated directory and file path */
9407 void
os_normalize_path(char * str)9408 os_normalize_path(
9409 char* str)
9410 {
9411 if (str != NULL) {
9412 for (; *str; str++) {
9413 if (*str == OS_PATH_SEPARATOR_ALT) {
9414 *str = OS_PATH_SEPARATOR;
9415 }
9416 }
9417 }
9418 }
9419